In [1]:
# Pandas, matplotlib, random
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import pickle

# Tensorflow
import tensorflow as tf

# Sklearn
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer

from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Notebooks
import nbimporter
import os
import sys

# Make the parent of the current working directory importable ('..' is
# resolved relative to the working directory by os.path.abspath).
# Presumably this is where the GAN notebook/module imported just below
# lives -- TODO confirm.
module_path = os.path.abspath(os.path.join('..'))
# Guard against appending the same entry again when this cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)
    
from GAN import GAN


In [6]:
def build_GAN(dataframe, parameters):
    """Train a GAN with 10-fold stratified cross-validation and average its metrics.

    For every fold a fresh ``GAN`` is trained on the training rows, a sample
    of ``dataframe.shape[0]`` rows is generated, and the correlation matrix
    of that sample is compared with the one of the held-out rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. The last column is used as the stratification target
        when the folds are built; the complete rows (all columns) are what
        is handed to ``GAN.train``.
    parameters : dict
        Hyper-parameters of one run. Required keys: ``'use_corr_loss'``,
        ``'BATCH_SIZE'``, ``'n_hidden'``, ``'n_layers'``, ``'EPOCHS'`` and
        ``'lr'``. Optional key: ``'data_dim'`` (defaults to 15, the value
        that used to be hard-coded).

    Returns
    -------
    tuple
        ``(gen_data, loss_g, loss_d, acc_pos, acc_neg, corr)`` where
        ``gen_data`` is the sample generated in the LAST fold only, and the
        remaining entries are ``np.mean`` over the per-fold values returned
        by ``gan.get_losses()``, ``gan.get_accuracies()`` and the summed
        correlation difference.
    """
    # unpack parameters
    use_corr_loss = parameters['use_corr_loss']
    BATCH_SIZE = parameters['BATCH_SIZE']
    # Backward compatible: callers that do not pass 'data_dim' keep 15.
    data_dim = parameters.get('data_dim', 15)
    n_hidden = parameters['n_hidden']
    n_layers = parameters['n_layers']
    EPOCHS = parameters['EPOCHS']
    lr = parameters['lr']

    print("Number of Features: {}".format(data_dim))

    # cross-validation lists (one entry per fold)
    loss_g_cv = []
    loss_d_cv = []
    acc_pos_cv = []
    acc_neg_cv = []
    corr_cv = []

    # cross-validation loop
    cv = StratifiedKFold(n_splits=10, random_state=13, shuffle=True)
    dataframe_array = np.array(dataframe)
    features, target = dataframe_array[:, :-1], dataframe_array[:, -1]
    for train_index, test_index in cv.split(features, target):
        # split() yields POSITIONAL row numbers, so they must be applied with
        # .iloc. The previous .loc lookup treated them as index labels, which
        # fails (or silently picks other rows) for any frame whose index is
        # not the default 0..n-1 range, e.g. after filtering or shuffling.
        df_train = dataframe.iloc[train_index, :]
        df_test = dataframe.iloc[test_index, :]

        # initiate GAN
        gan = GAN(data_dim=data_dim, n_hidden=n_hidden, n_layers=n_layers, lr=lr, display=False)

        # Train and sample
        gan.train(dataframe=df_train,
                  EPOCHS=EPOCHS,
                  use_corr_loss=use_corr_loss,
                  BATCH_SIZE=BATCH_SIZE,
                  SAMPLE_INTERVAL=100)
        gen_data = gan.generate_data(epoch=1, BATCH_SIZE=dataframe.shape[0])

        # NOTE(review): corr_matrix also opens a new heatmap figure on every
        # call (two per fold) -- confirm that this is wanted inside the loop.
        corr = corr_matrix(gen_data) - corr_matrix(df_test)
        loss_g, loss_d = gan.get_losses()
        acc_pos, acc_neg = gan.get_accuracies()

        loss_g_cv.append(loss_g)
        loss_d_cv.append(loss_d)
        acc_pos_cv.append(acc_pos)
        acc_neg_cv.append(acc_neg)
        # NOTE(review): this is a SIGNED sum, so positive and negative
        # differences cancel each other -- confirm this is the intended score.
        corr_cv.append(np.sum(np.array(corr)))

    return (gen_data,
            np.mean(loss_g_cv),
            np.mean(loss_d_cv),
            np.mean(acc_pos_cv),
            np.mean(acc_neg_cv),
            np.mean(corr_cv))


In [1]:
def batch_build_GAN(dataframe, batch_parameters):
    """Run ``build_GAN`` for every combination of a hyper-parameter grid.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data forwarded unchanged to every ``build_GAN`` call.
    batch_parameters : dict
        Maps each of ``'use_corr_loss'``, ``'BATCH_SIZE'``, ``'n_hidden'``,
        ``'n_layers'``, ``'EPOCHS'`` and ``'lr'`` to an iterable of candidate
        values.

    Returns
    -------
    tuple of lists
        ``(gen_df_list, loss_g_list, loss_d_list, acc_pos_list, acc_neg_list,
        parameters_list, corr_diff_list, corr_reduced_list)``. All lists
        except ``corr_diff_list`` hold one entry per combination, in
        iteration order. ``corr_diff_list`` is always empty; it is kept only
        so the shape of the return value stays unchanged for callers.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from itertools import product

    # list of results
    parameters_list = []
    gen_df_list = []
    loss_g_list = []
    loss_d_list = []
    acc_pos_list = []
    acc_neg_list = []
    corr_diff_list = []  # never filled: build_GAN only returns the reduced score
    corr_reduced_list = []

    # product() varies the right-most argument fastest, so this reproduces
    # the order of the former nested loops (BATCH_SIZE outermost, lr
    # innermost). It also snapshots each iterable up front, so one-shot
    # iterables (generators, iterators) are no longer exhausted after the
    # first pass of an inner loop.
    grid = product(
        batch_parameters['BATCH_SIZE'],
        batch_parameters['n_hidden'],
        batch_parameters['n_layers'],
        batch_parameters['use_corr_loss'],
        batch_parameters['EPOCHS'],
        batch_parameters['lr'],
    )

    for batch_size, n_hidden, n_layers, use_corr_loss, epochs, lr in grid:
        parameters = {
            'use_corr_loss': use_corr_loss,
            'BATCH_SIZE': batch_size,
            'n_hidden': n_hidden,
            'n_layers': n_layers,
            'EPOCHS': epochs,
            'lr': lr,
        }

        gen_df, loss_g, loss_d, acc_pos, acc_neg, corr_reduced = build_GAN(dataframe, parameters)

        gen_df_list.append(gen_df)
        loss_g_list.append(loss_g)
        loss_d_list.append(loss_d)
        acc_pos_list.append(acc_pos)
        acc_neg_list.append(acc_neg)
        parameters_list.append(parameters)
        corr_reduced_list.append(corr_reduced)

    return (gen_df_list, loss_g_list, loss_d_list, acc_pos_list, acc_neg_list,
            parameters_list, corr_diff_list, corr_reduced_list)


In [None]:
def corr_matrix(dataframe, get_corr=True, corr=None):
    """Plot a lower-triangle correlation heatmap and return the matrix drawn.

    Parameters
    ----------
    dataframe : object with a ``.corr()`` method (e.g. pandas.DataFrame)
        Source data; only used when ``get_corr`` is true.
    get_corr : bool, default True
        When true the matrix is computed as ``dataframe.corr()``; when false
        the pre-computed ``corr`` argument is plotted instead.
    corr : 2-D array-like, optional
        Pre-computed matrix; required when ``get_corr`` is false.

    Returns
    -------
    The correlation matrix that was plotted (the computed one, or ``corr``
    itself when it was supplied).

    Raises
    ------
    ValueError
        If ``get_corr`` is false and no ``corr`` matrix is given.
    """
    # Compute the correlation matrix
    if get_corr:
        corr = dataframe.corr()
    elif corr is None:
        # Previously this fell through to NumPy and failed there with an
        # unrelated-looking "must be 2-d" ValueError.
        raise ValueError("corr must be provided when get_corr is False")

    # Mask the upper triangle including the diagonal. The builtin bool is
    # used because the np.bool alias was removed in NumPy 1.24.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure
    _, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio, explicitly on
    # the axes created above instead of relying on the "current axes" state.
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    return corr
