# GAN for fraud detection
In this notebook, we train a GAN to generate more positive (fraud) data points in order to improve the accuracy of the fraud detection process.

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later dropped the old names,
# so use whichever name this Matplotlib version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, confusion_matrix

from google.colab import drive
drive.mount('/content/gdrive')

## Load data
The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred in two days, where we have 492 (before duplicates are removed) frauds out of 284,807 (before duplicates are removed) transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

- It contains only numerical input variables which are the result of a PCA transformation. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. 

- Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. 
- The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. 
- Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [None]:
# Read the transactions from the mounted Drive folder and discard exact duplicate rows
csv_path = "/content/gdrive/My Drive/BT4012/Week 8/creditcard.csv"
df = pd.read_csv(csv_path).drop_duplicates()
print(df.shape)
df.head()

In [None]:
# Number of rows per class (0 = normal, 1 = fraud)
df['Class'].value_counts()

This is a rather imbalanced dataset.

# Exploratory Data Analysis

In [None]:
# 'Time' counts the seconds since the first transaction in the set (about 48 hours of data).
# Fold it into an hour-of-day value so both days overlay each other.
seconds_per_hour = 3600
df['Hour'] = (df['Time'].values // seconds_per_hour) % 24

last_time_in_hours = round(df['Time'].max() / seconds_per_hour, 3)
print(f"Last time value: {last_time_in_hours}")

# One-hour wide bins covering the whole day
hour_bins = np.linspace(0, 24, 25)
normal_hours = df.loc[df['Class'] == 0, 'Hour']
fraud_hours = df.loc[df['Class'] == 1, 'Hour']

plt.figure(figsize=(10, 6))
plt.hist([normal_hours, fraud_hours], density=True, label=['normal', 'fraud'], bins=hour_bins)
plt.legend()

Looks like normal transactions have a bias towards 8am to midnight and Fraud has spikes at 2-3am and noon.

In [None]:
# Log-transform the amount values to get a more normal-looking distribution
d0 = np.log10(df['Amount'].values + 1)

# Left panel: raw amounts; right panel: log10(x + 1) of the amounts
panels = [
    (df['Amount'], 'Original Amount Distribution'),
    (d0, 'Log10(x+1) Transformed Amount Distribution'),
]

plt.figure(figsize=(18, 6))
for position, (values, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(values, bins=40)
    plt.title(title)

# Replace the raw amounts with the transformed values.
# Re-running this cell transforms the column a second time.
df['Amount'] = d0

In [None]:
# Every column except the target 'Class' is used as a data column
data_cols = [name for name in df.columns if name != 'Class']

print(data_cols)
print('# of data columns: ', len(data_cols))

In [None]:
# Standardise all data columns (zero mean, unit variance); the scaler is fitted on the full dataset
feature_scaler = StandardScaler()
df[data_cols] = feature_scaler.fit_transform(df[data_cols])

In [None]:
# Heatmap of the pairwise correlations between all columns
corr0 = df.corr()

plt.figure(figsize=(16, 13))
sns.heatmap(corr0)

Note that there are no correlations among the PCA-transformed columns, as expected.

In [None]:
# Plot the data distribution of each feature with respect to target variables

columns = 4
rows = int(np.ceil(len(data_cols) / columns))

# Start from an empty figure: plt.subplots() would add one extra full-size axes underneath the grid
f = plt.figure(figsize=(columns*3.5, rows*2))
f.suptitle('Data Distributions by Feature and Class', size=16)

axarr = []
for i, col in enumerate(data_cols):
    ax = plt.subplot2grid((rows, columns), (i // columns, i % columns))

    # Bins span the 0.1 to 99.9 percentile range of the feature
    bins = np.linspace(np.percentile(df[col], 0.1), np.percentile(df[col], 99.9), 30)
    ax.hist([df.loc[df.Class == 0, col], df.loc[df.Class == 1, col]],
            label=['normal', 'fraud'], bins=bins, density=True)

    ax.set_xlabel(col, size=12)
    ax.set_ylim([0, 0.8])
    ax.tick_params(axis='both', labelsize=10)

    if i == 0:
        legend = ax.legend()
        legend.get_frame().set_facecolor('white')

    if i % columns != 0:
        # tick_params expects booleans; the string 'off' is truthy and would leave the ticks visible
        ax.tick_params(axis='y', left=False, labelleft=False)
    else:
        # Only the first plot of each row carries the y axis label and ticks
        ax.set_ylabel('Fraction', size=12)

    axarr.append(ax)

plt.tight_layout(rect=[0, 0, 1, 0.95])  # xmin, ymin, xmax, ymax

We can observe that certain features (V14, V4, etc.) are more discriminative than other features. 

# xgboost for fraud detection
Let's build an xgboost classifier using these features.

In [None]:
# Set up the test and train sets
# Use stratified train test split
X_train, X_test, y_train, y_test = train_test_split(df.drop('Class', axis=1), 
                                                    df['Class'], test_size=0.3, 
                                                    random_state=31, shuffle=True, 
                                                    stratify=df['Class'])

dtrain = xgb.DMatrix(X_train, y_train, feature_names=data_cols)
dtest = xgb.DMatrix(X_test, y_test, feature_names=data_cols)

# Run the xgboost algorithm; AUC is the evaluation metric that drives early stopping
results_dict = {}

xgb_params = {
    'objective': 'binary:logistic',
    'random_state': 0,
    'eval_metric': 'auc', 
}

# NOTE: xgb.train applies early stopping to the last entry of `evals`, which is the test set here,
# so the scores printed below are somewhat optimistic.
xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                     verbose_eval=False,
                     early_stopping_rounds=20, 
                     evals=[(dtrain,'train'),(dtest,'test')],
                     evals_result=results_dict            
                    )

# iteration_range replaces ntree_limit (deprecated in xgboost 1.4, removed in 2.0):
# predict with the boosting rounds up to and including the best one.
y_pred = xgb_test.predict(dtest, iteration_range=(0, xgb_test.best_iteration + 1))

# Round the predicted probabilities to hard 0/1 labels for recall, precision and the confusion matrix
y_label = np.round(y_pred)

print(f'best iteration: {xgb_test.best_iteration}\n')

print(f'recall: {round(recall_score(y_test, y_label), 3)}')
print(f'precision: {round(precision_score(y_test, y_label), 3)}')
print(f'auc: {round(roc_auc_score(y_test, y_pred), 3)}\n')

print (confusion_matrix(y_test, y_label))

In [None]:
# Learning curves: how the evaluation metric on the train and test sets changed as more trees were added

plt.figure(figsize=(10, 6))

for eval_name, metric_histories in results_dict.items():
    for metric_name, history in metric_histories.items():
        plt.plot(history, label=eval_name + ' ' + metric_name)

# Mark the iteration selected by early stopping
plt.axvline(xgb_test.best_iteration, c='green', label='best iteration')
plt.title('xgboost learning curves')
plt.xlabel('iteration')
plt.legend()
plt.grid();

In [None]:
# Show the ten most important features of the trained booster

fig, ax = plt.subplots(figsize=(8, 8))
xgb.plot_importance(xgb_test, ax=ax, height=0.5, max_num_features=10);

In [None]:
# Effect of the normal:fraud ratio in the dataset on the cross-validated metric.
# Cross validation shows whether the differences are significant.

n_test = np.sum(df.Class == 1)  # 473

# Shuffle both classes once with a fixed seed
shuffle_kwargs = dict(frac=1.0, random_state=11)
normal_samples = df[df.Class == 0].sample(**shuffle_kwargs).reset_index(drop=True)
fraud_samples = df[df.Class == 1].sample(**shuffle_kwargs).reset_index(drop=True)

cv_rows = []

# Ratios of normal to fraud rows: 10%, 26%, 71%, ... 10000%
for ratio in np.logspace(-1, 2, 8):
    print(f'using {round(ratio*100, 3)}% normal data.' )
    n_normal = int(n_test * ratio)
    train_df = pd.concat([normal_samples[:n_normal], fraud_samples], ignore_index=True)
    dtrain = xgb.DMatrix(train_df[data_cols], train_df['Class'], feature_names=data_cols)
    results = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=100, early_stopping_rounds=10, seed=0)

    # Keep the last row of the CV history together with its index (the final boosting round)
    last_round = results.tail(1)
    cv_rows.append([ratio, *last_round.index, *last_round.values[0]])

test_data = pd.DataFrame(cv_rows, columns=['ratio', 'best', *results.columns])
test_data

In [None]:
metric = 'auc'
mean_column = f'test-{metric}-mean'
std_column = f'test-{metric}-std'

xs = np.log10(test_data['ratio'].values)
ys = test_data[mean_column].values
stds = test_data[std_column].values

plt.figure(figsize=(10, 6))
# Solid line: mean CV score; dotted lines: one standard deviation above and below
plt.plot(xs, ys, c='C1')
for band in (ys + stds, ys - stds):
    plt.plot(xs, band, linestyle=':', c='C2')
plt.xlabel('log10 ratio of normal:fraud data')
plt.ylabel(metric)

# Experiment with GAN model to generate additional data


In [None]:
from tensorflow.keras import applications
from tensorflow.keras import backend
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers

def generator_network(x, data_dim, base_n_count): 
    """
    Stack the generator layers on top of an input tensor and return the output tensor.
    x: input layer
    data_dim: dimension of the data to be generated (units of the output layer)
    base_n_count: base number of neurons; the hidden layers use 1x, 2x and 4x this value
    """
    # Three ReLU hidden layers that widen towards the output
    for width_factor in (1, 2, 4):
        x = layers.Dense(base_n_count * width_factor, activation='relu')(x)
    # Output layer without an activation function
    return layers.Dense(data_dim)(x)

def discriminator_network(x, base_n_count):
    """
    Stack the discriminator layers on top of an input tensor and return the output tensor.
    x: input layer
    base_n_count: base number of neurons; the hidden layers use 4x, 2x and 1x this value
    """
    # Three ReLU hidden layers that narrow towards the output
    for width_factor in (4, 2, 1):
        x = layers.Dense(base_n_count * width_factor, activation='relu')(x)
    # Single sigmoid unit
    return layers.Dense(1, activation='sigmoid')(x)

def define_models_GAN(rand_dim, data_dim, base_n_count):
    """
    Build the GAN and return (generator_model, discriminator_model, combined_model).
    The combined model feeds the generator output into the discriminator.
    rand_dim: random input dimension
    data_dim: dimension of the data
    base_n_count: base number of neurons of hidden layer
    """
    # Generator graph: random vector -> generated sample
    noise_input = layers.Input(shape=(rand_dim, ))
    generated_sample = generator_network(noise_input, data_dim, base_n_count)

    # Discriminator graph: real or generated sample -> sigmoid output
    sample_input = layers.Input(shape=(data_dim,))
    discriminator_verdict = discriminator_network(sample_input, base_n_count)

    generator_model = models.Model(inputs=[noise_input],
                                   outputs=[generated_sample],
                                   name='generator')
    discriminator_model = models.Model(inputs=[sample_input],
                                       outputs=[discriminator_verdict],
                                       name='discriminator')

    # Combined graph: random vector -> generator -> discriminator
    stacked_output = discriminator_model(generator_model(noise_input))
    combined_model = models.Model(inputs=[noise_input],
                                  outputs=[stacked_output],
                                  name='combined')

    return generator_model, discriminator_model, combined_model

In [None]:
def PlotData(x, g_z, data_cols):
    """
    Scatter-plot the first two columns of the real data next to those of the generated data.
    x: real data
    g_z: generated data (DataFrame or any array-like, e.g. a NumPy array or a tensor)
    data_cols: list of column names; only the first two are plotted
    """
    real_samples = pd.DataFrame(x, columns=data_cols)

    # Anything that is not already a DataFrame is converted to a NumPy array first,
    # so the output of calling the generator model directly is handled as well
    if not isinstance(g_z, pd.DataFrame):
        g_z = np.asarray(g_z)
    gen_samples = pd.DataFrame(g_z, columns=data_cols)

    x_col, y_col = data_cols[0], data_cols[1]

    f, axarr = plt.subplots(1, 2, figsize=(6,2) )

    # No cmap argument: without per-point colours (`c`) Matplotlib ignores it and emits a warning
    axarr[0].scatter(real_samples[x_col], real_samples[y_col])
    axarr[1].scatter(gen_samples[x_col], gen_samples[y_col])

    axarr[0].set_title('real')
    axarr[1].set_title('generated')   

    # Only add y label to left plot
    axarr[0].set_ylabel(y_col) 

    # Add x label to both plots
    for a in axarr: 
        a.set_xlabel(x_col)

    # Use axes ranges from real data for generated data
    axarr[1].set_xlim(axarr[0].get_xlim())
    axarr[1].set_ylim(axarr[0].get_ylim())

    plt.show()

In [None]:
def CheckAccuracy(x, g_z):    
    """
    Compute the AUC of an xgboost classifier that tries to separate real from fake samples.
    The classifier does the discriminator's job: real rows are labelled 1, generated rows 0.
    x: real data (DataFrame); it is not modified
    g_z: generated data (array-like with the same number of columns as x)
    Returns the ROC AUC on a 30% hold-out split.
    """
    # Label a copy so the caller's DataFrame does not gain a 'label' column
    real = x.copy()
    real['label'] = 1

    fake = pd.DataFrame(np.array(g_z), columns=x.columns)
    fake['label'] = 0

    train_all = pd.concat([real, fake])
    feature_names = list(train_all.columns[:-1])

    X_train, X_test, y_train, y_test = train_test_split(train_all.drop('label', axis=1), 
                                                        train_all['label'], 
                                                        test_size=0.3, 
                                                        random_state=31, 
                                                        shuffle=True)

    dtrain = xgb.DMatrix(X_train, y_train, feature_names=feature_names)
    dtest = xgb.DMatrix(X_test, feature_names=feature_names)

    xgb_params = {
        'max_depth': 4, 
        'objective': 'binary:logistic',
        'random_state': 0,
        'eval_metric': 'auc',
        }

    xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10) 

    # Score with the predicted probabilities: rounding them to 0/1 first would reduce the ROC curve
    # to a single operating point, which is not the AUC
    y_score = xgb_test.predict(dtest)

    return roc_auc_score(y_test, y_score) 
  

In [None]:
# Hyperparameters
batch_size = 128
rand_dim = 100
learning_rate = 5e-4
steps = 1000

# Create the GAN network
my_generator, my_discriminator, my_combined = define_models_GAN(rand_dim, len(data_cols), 128)

# `learning_rate` is the supported keyword (the `lr` alias is deprecated).
# Every compiled model gets its own Adam instance: recent Keras optimizers can only update
# the variables they were first built with, so one instance cannot serve several models.
adam_settings = dict(learning_rate=learning_rate, beta_1=0.5, beta_2=0.9)

# compile models
my_generator.compile(optimizer=optimizers.Adam(**adam_settings), loss='binary_crossentropy')
my_discriminator.compile(optimizer=optimizers.Adam(**adam_settings), loss='binary_crossentropy')

# Set the trainable attribute to False so that the discriminator part won't be updated for us to train the generator
my_discriminator.trainable=False
my_combined.compile(optimizer=optimizers.Adam(**adam_settings), loss='binary_crossentropy')

# Only the fraud rows are used; the class column is dropped as GAN training is kind of self supervised
train = df[df['Class']==1].copy().reset_index(drop=True).drop('Class', axis=1)

# keep track of discriminator loss for positive samples and negative samples, generator loss and xgboost AUC
loss_d_p, loss_d_n, loss_g, xgb_losses = [], [], [], []

for i in range(steps + 1):
  # train the discriminator
  for j in range(2):
    z = np.random.normal(size=(batch_size, rand_dim))
    # 2*i + j gives every discriminator update its own seed; with i + j the second batch of one
    # step and the first batch of the next step were the same rows
    x = train.sample(n=batch_size, random_state=2*i + j)
    g_z = my_generator(z)
    # Targets are drawn very close to 1 (real) and 0 (generated) rather than being exact,
    # which keeps the loss from going to zero
    loss_d_p.append(my_discriminator.train_on_batch(x, np.random.uniform(low=0.999, high=1.0, size=batch_size)))
    loss_d_n.append(my_discriminator.train_on_batch(g_z, np.random.uniform(low=0.0, high=0.001, size=batch_size)))

  # train the generator: the combined model is asked to label generated samples as real
  for j in range(2):
    z = np.random.normal(size=(batch_size, rand_dim))
    loss_g.append(my_combined.train_on_batch(z, np.random.uniform(low=0.999, high=1.0, size=batch_size)))

  # Determine the xgboost AUC every 10 steps, after training generator and discriminator
  if not i % 10: # 2x faster than testing each step...
    x = train.sample(n=len(train), random_state=i)
    z = np.random.normal(size=(len(train), rand_dim))
    g_z = my_generator.predict(z)

    # This xgboost classifier is doing the same job as the discriminator: differentiate real samples from the fake samples
    xgb_losses.append(CheckAccuracy(x, g_z)) 

  # Saving weights and plotting images
  if not i % 100:
    print('Step: {} of {}.'.format(i, steps))         
    my_generator.save_weights(f'generator_step_{i}.h5')
    x = train.sample(n=len(train), random_state=0)
    # predict() returns a NumPy array, which PlotData can turn into a DataFrame directly
    g_z = my_generator.predict(np.random.normal(size=(len(train), rand_dim)))
    PlotData(x, g_z, data_cols)

In [None]:
# Plot the discriminator losses for positive and negative samples, generator loss and AUC of xgboost trying to separate real from fake samples

from scipy.signal import savgol_filter

f, axarr = plt.subplots(2, 2, figsize=(10, 10))

# Each curve is smoothed with a Savitzky-Golay filter (window length, polynomial order 3)
axarr[0][0].plot(savgol_filter(loss_d_p, 71, 3))
axarr[0][1].plot(savgol_filter(loss_d_n, 71, 3))
axarr[1][0].plot(savgol_filter(loss_g, 51, 3))
axarr[1][1].plot(savgol_filter(xgb_losses, 11, 3))

# Titles match the plotted series: loss_d_p / loss_d_n are discriminator losses, loss_g is the generator loss
axarr[0][0].set_title('discriminator losses (positive)')
axarr[0][1].set_title('discriminator losses (negative)')
axarr[1][0].set_title('generator losses')  
axarr[1][1].set_title('xgb AUC') 

## We now use the generators trained before to see the effects of including these generated fraud data on a test dataset

In [None]:
def MakeCrossFolds(n_train_fraud, folds, g_z_df=None):
    """
    Generate `folds` train/test splits of the normal and the fraud rows of the module-level `df`.

    n_train_fraud: number of real fraud rows placed at the start of every training fraud set
    folds: number of (train, test) pairs to generate
    g_z_df: generated fraud DataFrame appended after the real training fraud rows.
            If None or empty, the real fraud rows that are neither among the first
            n_train_fraud rows nor in the test slice of the same fold are appended instead.

    Returns four lists with one DataFrame per fold:
    train_normal_set, test_normal_set, train_fraud_set, test_fraud_set
    """
    train_fraction = 0.7
    np.random.seed(0)

    train_normal_set, test_normal_set = [], []
    train_fraud_set, test_fraud_set = [], []

    normal_samples = df[df.Class==0].copy()
    fraud_samples = df[df.Class==1].copy()

    use_real_extras = g_z_df is None or len(g_z_df) == 0

    # Generate folds sets of train/test splits
    for seed in range(folds):
        #########################################
        # Prepare for fraud data                #
        #########################################
        # Shuffle the data
        fraud_samples = fraud_samples.sample(frac=1.0, random_state=seed).reset_index(drop=True) 

        # Take the first n_train_fraud real fraud samples for training data
        train_fraud_samples = fraud_samples[:n_train_fraud].reset_index(drop=True)

        # Take the last n_test_fraud real fraud samples for testing data
        n_test_fraud = int(len(fraud_samples) * (1-train_fraction)) 
        test_fraud_samples = fraud_samples[-n_test_fraud:].reset_index(drop=True)

        if use_real_extras:
            # Recompute the remaining real fraud rows for every fold. Reusing the slice of the
            # first fold after reshuffling could put rows of the current test slice into training.
            extra_fraud_samples = fraud_samples[n_train_fraud:-n_test_fraud]
        else:
            extra_fraud_samples = g_z_df

        # Append the additional fraud samples to the n_train_fraud real samples to form the fraud dataset for training
        # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
        train_fraud_samples = pd.concat([train_fraud_samples, extra_fraud_samples]).reset_index(drop=True)

        #######################################
        # Prepare for normal data             # 
        #######################################
        # Shuffle the data
        normal_samples = normal_samples.sample(frac=1.0, random_state=seed).reset_index(drop=True)

        n_train_normal = int(len(normal_samples) * train_fraction)

        # use the first n_train_normal normal data as training data and the remaining normal data as testing data
        train_normal_samples = normal_samples[:n_train_normal].reset_index(drop=True)
        test_normal_samples = normal_samples[n_train_normal:].reset_index(drop=True) 

        train_normal_set.append(train_normal_samples)
        test_normal_set.append(test_normal_samples)
        train_fraud_set.append(train_fraud_samples)
        test_fraud_set.append(test_fraud_samples)

    # Summary of the last fold
    print (f'Number of real fraud samples: {n_train_fraud}')
    print (f'Number of generated fraud samples: {len(extra_fraud_samples)}')
    print (f'Number of (real + generated) fraud samples in training data: {len(train_fraud_samples)}')
    print (f'Number of real fraud samples in test data: {len(test_fraud_samples)}')

    return train_normal_set, test_normal_set, train_fraud_set, test_fraud_set



def Run_CV_Xgb(limit, 
               folds, 
               n_train_fraud, 
               train_normal_set, 
               test_normal_set, 
               train_fraud_set, 
               test_fraud_set):
    """
    Run an xgboost classifier on every cross-fold with increasing amounts of additional fraud data.

    Uses the module-level `xgb_params` and `data_cols`.

    limit: largest multiple of n_train_fraud fraud rows used for training (multiples run from 1 to limit)
    folds: number of folds to evaluate
    n_train_fraud: number of real fraud data to be used
    train_normal_set: list of DataFrames (one per fold) of normal data for training
    test_normal_set: list of DataFrames (one per fold) of normal data for testing
    train_fraud_set: list of DataFrames (one per fold) of fraud data for training
    test_fraud_set: list of DataFrames (one per fold) of fraud data for testing

    Returns a DataFrame with one row per (multiple, fold) and the columns
    'k', 'ratio', 'best', 'recall', 'precision', 'auc'.
    """
    list_results = []

    # Five multiples, log-spaced between 1 and limit
    for i in np.logspace(0, np.log10(limit), num=5):
        print(f'# additional generated data tested: {round((i-1)*100, 2)}%, which is {int((i-1)*n_train_fraud)} additional real/fake data')

        # Only the first n_fraud_used rows of each training fraud set are used
        n_fraud_used = int(n_train_fraud*i)

        for k in range(folds):
            train_df = pd.concat(
                [train_normal_set[k], train_fraud_set[k][:n_fraud_used]], 
                ignore_index=True)
            test_df = pd.concat( 
                [test_normal_set[k], test_fraud_set[k]],
                ignore_index=True)

            dtrain = xgb.DMatrix(train_df[data_cols], train_df['Class'], feature_names=data_cols)
            dtest = xgb.DMatrix(test_df[data_cols], test_df['Class'], feature_names=data_cols)

            results_dict = {}
            # NOTE: early stopping monitors the last entry of `evals`, i.e. the test fold
            xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                                 verbose_eval=False, early_stopping_rounds=10, 
                                 evals=[(dtrain,'train'), (dtest,'test')],
                                 evals_result=results_dict 
                                 )

            best_iteration = xgb_test.best_iteration
            # iteration_range replaces ntree_limit (deprecated in xgboost 1.4, removed in 2.0)
            y_pred = xgb_test.predict(dtest, iteration_range=(0, best_iteration + 1))
            y_true = test_df['Class'].values     
            # Hard 0/1 labels for recall and precision; AUC uses the probabilities
            y_label = np.round(y_pred)

            list_results.append([k, i, best_iteration, 
                                 recall_score(y_true, y_label), 
                                 precision_score(y_true, y_label), 
                                 roc_auc_score(y_true, y_pred)])

    return pd.DataFrame(list_results, columns=['k', 'ratio','best','recall','precision','auc'])

In [None]:
np.random.seed(0)

# n_train_fraud is the number of real fraud data used in the training data
n_train_fraud = 100

fold = 5

# limit is the maximum multiple of training data used with respect to n_train_fraud
# limit = (473 * 0.7) / 100 = 3.311
limit = len(df[df.Class == 1]) * 0.7 / n_train_fraud


def _cv_with_generated_fraud(weights_path):
    """Generate len(train) fake fraud rows with saved generator weights and cross-validate xgboost with them added."""
    noise = np.random.normal(size=(len(train), rand_dim))

    # Rebuild the generator architecture and load the previously trained weights
    generator, _, _ = define_models_GAN(rand_dim, len(data_cols), 128)
    generator.load_weights(weights_path)

    # The generated rows are all labelled 1, as they are supposed to be fraud data
    fake_fraud = pd.DataFrame(generator.predict(noise), columns=data_cols)
    fake_fraud['Class'] = 1

    fold_sets = MakeCrossFolds(n_train_fraud, fold, fake_fraud)
    return Run_CV_Xgb(limit, fold, n_train_fraud, *fold_sets)


#########################################################################
# Use an early generator to generate additional data                    #
#########################################################################
t_early = _cv_with_generated_fraud('./generator_step_200.h5')

print ('\n')
#########################################################################
# Use a late generator to generate additional data                      #
#########################################################################
t_late = _cv_with_generated_fraud('./generator_step_1000.h5')

print ('\n')
#########################################################################
# Use all real fraud data to train the model                            #
#########################################################################
train_normal_set, test_normal_set, train_fraud_set, test_fraud_set = MakeCrossFolds(n_train_fraud, fold)
t_real = Run_CV_Xgb(limit, fold, n_train_fraud, train_normal_set, test_normal_set, train_fraud_set, test_fraud_set)

In [None]:
# Plot the results: one figure per metric, one panel per source of additional fraud data
labels = ['trained 200 steps','trained 1000 steps','Actual Fraud Data']
cv_tables = [t_early, t_late, t_real]

for metric in ['recall', 'auc', 'precision']:

    plt.figure(figsize=(15, 6))
    for panel, (label, test_data) in enumerate(zip(labels, cv_tables), start=1):

        # Mean and standard deviation over the folds for each multiple of n_train_fraud
        metric_by_ratio = test_data.groupby('ratio')[metric]
        xs = [n_train_fraud * (group[0] - 1) for group in test_data.groupby('ratio')]
        ys = metric_by_ratio.mean().values
        stds = metric_by_ratio.std().values

        plt.subplot(1, 3, panel)
        # Dashed red line: score without any additional data
        plt.axhline(ys[0], linestyle='--', color='red')
        plt.plot(xs, ys, c='C1', marker='o')
        for band in (ys + stds, ys - stds):
            plt.plot(xs, band, linestyle=':', c='C2')
        if panel == 1:
            plt.ylabel(metric)
        plt.xlabel('# additional data')
        plt.title(label, size=12)
        plt.ylim([0.6, 1.0])

    plt.suptitle('Effects of additional data on fraud detection', size=16)

## Impact of using additional generated fraud data on top of all real fraud data

In [None]:
# Set up the test and train sets
# Use stratified train test split
X_train, X_test, y_train, y_test = train_test_split(df.drop('Class', axis=1), 
                                                    df['Class'], test_size=0.3, 
                                                    random_state=31, shuffle=True, 
                                                    stratify=df['Class'])

dtrain = xgb.DMatrix(X_train, y_train, feature_names=data_cols)
dtest = xgb.DMatrix(X_test, y_test, feature_names=data_cols)

# Run the xgboost algorithm; early stopping monitors AUC on the training set (the only entry in `evals`)
results_dict = {}

xgb_params = {
    'objective': 'binary:logistic',
    'random_state': 0,
    'eval_metric': 'auc', 
}

xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                     verbose_eval=False,
                     early_stopping_rounds=20, 
                     evals=[(dtrain,'train')],
                     evals_result=results_dict            
                    )

# iteration_range replaces ntree_limit (deprecated in xgboost 1.4, removed in 2.0):
# predict with the boosting rounds up to and including the best one.
y_pred = xgb_test.predict(dtest, iteration_range=(0, xgb_test.best_iteration + 1))

# Round the predicted probabilities to hard 0/1 labels for recall, precision and the confusion matrix
y_label = np.round(y_pred)

print(f'best iteration: {xgb_test.best_iteration}\n')

print(f'recall: {round(recall_score(y_test, y_label), 3)}')
print(f'precision: {round(precision_score(y_test, y_label), 3)}')
print(f'auc: {round(roc_auc_score(y_test, y_pred), 3)}\n')

print (confusion_matrix(y_test, y_label))

In [None]:
def impact_with_more_training_data(X_train, 
                                   y_train, 
                                   X_test, 
                                   y_test, 
                                   multiplier, 
                                   rand_dim):
    """
    Train xgboost on the training set plus generated fraud rows and score it on the test set.

    The generator is rebuilt and loaded from './generator_step_800.h5'. Uses the module-level
    `df` (to count the real fraud rows) and `data_cols`.

    X_train: features for training set
    y_train: labels for training set
    X_test: features for testing set
    y_test: labels for testing set
    multiplier: number of generated fraud rows, as a multiple of the number of real fraud rows in df
    rand_dim: dimension of the random input of the generator

    Returns [recall, precision, auc] on the test set.
    """
    n_generated = multiplier * len(df[df.Class==1])

    z = np.random.normal(size=(n_generated, rand_dim))
    my_generator_late, _, _ = define_models_GAN(rand_dim, len(data_cols), 128)
    my_generator_late.load_weights('./generator_step_800.h5')
    g_z = my_generator_late.predict(z)

    # Generated rows are stacked below the real training rows and labelled 1 (fraud)
    dtrain = xgb.DMatrix(np.vstack([X_train, g_z]), 
                         np.concatenate([y_train, np.ones(n_generated)]), 
                         feature_names=data_cols)
    dtest = xgb.DMatrix(X_test, y_test, feature_names=data_cols)

    # Run the xgboost algorithm; early stopping monitors AUC on the training set (the only entry in `evals`)
    results_dict = {}

    xgb_params = {
        'objective': 'binary:logistic',
        'random_state': 0,
        'eval_metric': 'auc', 
    }

    xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                         verbose_eval=False,
                         early_stopping_rounds=20, 
                         evals=[(dtrain,'train')],
                         evals_result=results_dict            
                        )

    # iteration_range replaces ntree_limit (deprecated in xgboost 1.4, removed in 2.0)
    y_pred = xgb_test.predict(dtest, iteration_range=(0, xgb_test.best_iteration + 1))

    # Compute each metric once and reuse it for printing and for the return value
    y_label = np.round(y_pred)
    recall = recall_score(y_test, y_label)
    precision = precision_score(y_test, y_label)
    auc = roc_auc_score(y_test, y_pred)

    print(f'best iteration: {xgb_test.best_iteration}\n')
    print(f'recall: {round(recall, 3)}')
    print(f'precision: {round(precision, 3)}')
    print(f'auc: {round(auc, 3)}\n')

    return [recall, precision, auc]

In [None]:
list_results = []

# Odd multipliers 1, 3, ..., 99 of the positive dataset size
for multiplier in range(1, 101, 2):
    print (f'Using {multiplier}x of positive dataset')
    scores = impact_with_more_training_data(X_train, y_train, X_test, y_test, multiplier, rand_dim)
    list_results.append([multiplier, *scores])

In [None]:
# One row per multiplier with the three test-set scores
result_columns = ['multiplier', 'recall', 'precision', 'auc']
df_results = pd.DataFrame(list_results, columns=result_columns)
df_results.head()

In [None]:
plt.figure(figsize=(10, 6))

# One curve per score against the multiplier of generated data
for score_name in ('recall', 'precision', 'auc'):
    plt.plot(df_results['multiplier'], df_results[score_name], label=score_name)

# Dashed horizontal lines: scores computed from the current y_pred and y_test
reference_labels = np.round(y_pred)
plt.axhline(recall_score(y_test, reference_labels), linestyle='--', color='blue')
plt.axhline(precision_score(y_test, reference_labels), linestyle='--', color='green')
plt.axhline(roc_auc_score(y_test, y_pred), linestyle='--', color='red')

plt.ylim([0.7, 1.0])
plt.title('Effects of additional data')
plt.xlabel('Multiplier')
plt.legend(loc='lower right')