In [8]:
import logging
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from dataloader import load_data
from helpers import get_cat_dims
from models import WGANGP

# Set the root logger's level to INFO so INFO-level records are emitted
# (presumably to surface progress messages from WGANGP — TODO confirm it logs via `logging`).
logging.getLogger().setLevel(logging.INFO)

In [14]:
# Location of the diabetes CSV. The default is the original author's local checkout;
# set the DIABETES_CSV environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    'DIABETES_CSV',
    '/home/crespo99/Desktop/repos/awesome-tab-augmentation/data/diabetes.csv')

# Load the table and split the target column ("Outcome") off from the features.
prima_df = pd.read_csv(DATA_PATH)
prima_label = prima_df[["Outcome"]]  # double brackets: stays a single-column DataFrame
prima_df = prima_df.drop(columns=["Outcome"])

X = prima_df
y = prima_label

# Column groups by dtype: numeric columns vs. object (string) columns.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

# train test split: 25% held out, fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2020)
# Project helper; called with the training split only.
cat_dims = get_cat_dims(X_train, cat_cols)

# preprocess data
# numeric columns: mean imputation, then scaling with MinMaxScaler
num_prep = make_pipeline(SimpleImputer(strategy='mean'),
                         MinMaxScaler())
# categorical columns: most-frequent imputation, then one-hot encoding
# (categories unseen during fit are ignored at transform time)
cat_prep = make_pipeline(SimpleImputer(strategy='most_frequent'),
                         OneHotEncoder(handle_unknown='ignore'))
prep = ColumnTransformer([
    ('num', num_prep, num_cols),
    ('cat', cat_prep, cat_cols)],
    remainder='drop')
# Fit the preprocessing on the training split only, then apply it to the test split.
X_train_trans = prep.fit_transform(X_train)
X_test_trans = prep.transform(X_test)

gan = WGANGP(write_to_disk=True, # whether to create an output folder. Plotting will be suppressed if False
            compute_metrics_every=100, print_every=200, plot_every=1000,
            num_cols = num_cols, cat_dims=cat_dims,
            # pass the one hot encoder to the GAN to enable count plots of categorical variables
            transformer=prep.named_transformers_['cat']['onehotencoder'],
            # names of the categorical columns (what WGANGP uses them for is not
            # visible in this notebook — TODO confirm against models.WGANGP)
            cat_cols=cat_cols,
            use_aux_classifier_loss=True,
            d_updates_per_g=3, gp_weight=15)

In [None]:
# Settings forwarded to WGANGP.fit as `netG_kwargs`
# (presumably the generator network — confirm against models.WGANGP).
netG_settings = dict(
    hidden_layer_sizes=(32, 16),
    n_cross_layers=2,
    cat_activation='gumbel_softmax',
    num_activation='none',
    condition_num_on_cat=False,
    noise_dim=12,
    normal_noise=False,
    activation='leaky_relu',
    reduce_cat_dim=True,
    use_num_hidden_layer=True,
    layer_norm=False,
)

# Settings forwarded to WGANGP.fit as `netD_kwargs`
# (presumably the discriminator/critic network — confirm against models.WGANGP).
netD_settings = dict(
    hidden_layer_sizes=(64, 32, 16),
    n_cross_layers=2,
    embedding_dims='auto',
    activation='leaky_relu',
    sigmoid_activation=False,
    noisy_num_cols=True,
    layer_norm=True,
)

# Train on the preprocessed training split; the labels are passed as the raw
# values of the single-column `y_train` DataFrame.
gan.fit(
    X_train_trans,
    y=y_train.values,
    condition=True,
    epochs=100,
    batch_size=8,
    netG_kwargs=netG_settings,
    netD_kwargs=netD_settings,
)

In [None]:
# Flatten the single-column label frame into a 1-D array.
y_train_arr = np.array(y_train).flatten()

# Resample the training data with the trained GAN
# (presumably adds synthetic rows to balance the classes — confirm against WGANGP.resample).
X_res, y_res = gan.resample(X_train_trans, y=y_train_arr)
print(f'Original imbalance ratio was:{y_train_arr.mean():.2f}\nAfter resampling it is:{y_res.mean():.2f}')

# X_test_trans from the preprocessing cell is reused as-is: `prep` has not been
# refitted since, so transforming X_test again would reproduce the same matrix.

# One shared hyper-parameter set so both forests differ only in their training data.
rf_params = dict(n_estimators=500, min_samples_leaf=1, max_features='sqrt', bootstrap=True,
                 random_state=2020, n_jobs=2)

# Forest trained on the GAN-resampled data; column 1 of predict_proba is kept as the score.
clf_balanced = RandomForestClassifier(**rf_params)
clf_balanced.fit(X_res, y_res)
preds_oversampled = clf_balanced.predict_proba(X_test_trans)[:, 1]

# Forest trained on the original training data. It stays bound to `clf`,
# matching the state the cell left behind before.
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train_trans, y_train_arr)
preds_imbalanced = clf.predict_proba(X_test_trans)[:, 1]

print(f'AUC-ROC Random Forest:\n'
      f'Balanced data:\t\t{roc_auc_score(y_test, preds_oversampled):.4f}\n'
      f'Imbalanced data:\t{roc_auc_score(y_test, preds_imbalanced):.4f}')

In [None]:
# y_train / y_test are single-column DataFrames. scikit-learn estimators expect 1-D
# targets and emit a DataConversionWarning for column vectors, so flatten them once here.
y_train_1d = y_train.to_numpy().ravel()
y_test_1d = y_test.to_numpy().ravel()

# Oversample the training split with SMOTE (fixed seed for repeatability).
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_trans, y_train_1d)

# X_test_trans from the preprocessing cell is reused as-is: `prep` has not been
# refitted since, so transforming X_test again would reproduce the same matrix.
clf = RandomForestClassifier(n_estimators=500, min_samples_leaf=1, max_features='sqrt', bootstrap=True,
                             random_state=2020, n_jobs=2)

# Fit on the SMOTE-resampled data; column 1 of predict_proba is kept as the score.
clf.fit(X_train_res, y_train_res)
preds_oversampled = clf.predict_proba(X_test_trans)[:, 1]

# Refit the same estimator on the original training data for comparison.
clf.fit(X_train_trans, y_train_1d)
preds_imbalanced = clf.predict_proba(X_test_trans)[:, 1]

print(f'AUC-ROC Random Forest:\n'
      f'Balanced data:\t\t{roc_auc_score(y_test_1d, preds_oversampled):.4f}\n'
      f'Imbalanced data:\t{roc_auc_score(y_test_1d, preds_imbalanced):.4f}')

# Accuracy at a 0.5 probability threshold.
print(f'accuracy balanced: {np.mean((preds_oversampled>0.5) == y_test_1d)}')
print(f'accuracy imbalanced: {np.mean((preds_imbalanced>0.5) == y_test_1d)}')

In [None]:
# Derive the 1-D training labels here so this cell does not depend on the
# GAN-resampling cell having been run first.
y_train_arr = np.array(y_train).flatten()

# Baseline: random forest trained on the original training data.
clf = RandomForestClassifier(n_estimators=500, min_samples_leaf=1, max_features='sqrt', bootstrap=True,
                             random_state=2020, n_jobs=2)

clf.fit(X_train_trans, y_train_arr)
# Column 1 of predict_proba is kept as the score.
preds_imbalanced = clf.predict_proba(X_test_trans)[:, 1]

print(f'AUC-ROC Random Forest:\n'
      f'Imbalanced data:\t{roc_auc_score(y_test, preds_imbalanced):.4f}')

# F1 score at a 0.5 probability threshold; left as the last expression so the
# notebook displays it. (`f1_score` is imported in the top import cell.)
f1_score(y_test, preds_imbalanced > 0.5)

In [None]:
# Display the test-set scores (column 1 of predict_proba) of the model trained on resampled data.
# The name is assigned in both the GAN and the SMOTE cells; when cells run top to bottom
# this shows the SMOTE cell's values.
preds_oversampled

In [None]:
# Display the test-set scores (column 1 of predict_proba) of the model trained on the original data.
# The name is reassigned by several cells; when cells run top to bottom this shows the
# values from the baseline random-forest cell.
preds_imbalanced