In [None]:
##--  Autoencoder example
##--     Based on Kaggle kernel:
##--     https://www.kaggle.com/shivamb/semi-supervised-classification-using-autoencoders

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import regularizers
from keras.layers import Dense, Input
from keras.models import Model, Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [None]:
# I/O Directories and Files
din = './input'
dout = './output'
fin = 'creditcard.csv'

# Read data
df = pd.read_csv(f'{din}/{fin}')

# Undersample the majority class: keep 1000 randomly drawn Class==0 rows next
# to every Class==1 row. The draws are seeded so that a fresh "Run All"
# reproduces the same subset, in line with the seeded train/test split below.
nfraud = df.loc[df['Class']==0].sample(1000, random_state=0)
fraud = df.loc[df['Class']==1]
# pd.concat is used instead of DataFrame.append, which pandas 2.0 removed.
df_resamp = (pd.concat([nfraud, fraud])          # Stack the two datasets
               .sample(frac=1, random_state=0)   # Shuffle to randomize order
               .reset_index(drop=True))          # Reset the index because it's irrelevant

# Train/test split (features exclude the label and the 'Time' column)
X = df_resamp.drop(['Class', 'Time'], axis=1)
y = df_resamp['Class']
X_train_usamp, X_test_usamp, y_train_usamp, y_test_usamp = train_test_split(X.values, y.values, test_size=0.3, random_state=0)


In [None]:
# Create and train model
# Baseline classifier on the undersampled data: an SVC on min-max scaled
# features. The scaler is fit on the training split only and then applied to
# both splits, so no test-set statistics leak into the scaling.
scaler = MinMaxScaler().fit(X_train_usamp)
X_train_usamp_sc = scaler.transform(X_train_usamp)
X_test_usamp_sc = scaler.transform(X_test_usamp)

# probability=True is required for predict_proba, which feeds the ROC AUC metric
model = SVC(probability=True).fit(X_train_usamp_sc, y_train_usamp)
y_tr_pred_usamp = model.predict(X_train_usamp_sc)
y_te_pred_usamp = model.predict(X_test_usamp_sc)
y_tr_prob_usamp = model.predict_proba(X_train_usamp_sc)
y_te_prob_usamp = model.predict_proba(X_test_usamp_sc)

# Alternative baseline tried here: LogisticRegression(penalty='l1') fit on the
# unscaled X_train_usamp / X_test_usamp. It is recorded as a comment rather
# than a triple-quoted string, which Jupyter would echo as the cell's output.

In [None]:
# Print metrics
# ROC AUC is scored from the second predict_proba column; recall is scored
# from the hard class predictions. Both are reported for each split.
auc_tr_usamp = roc_auc_score(y_train_usamp, y_tr_prob_usamp[:, 1])
auc_te_usamp = roc_auc_score(y_test_usamp, y_te_prob_usamp[:, 1])
print(f'Training ROC_AUC: {auc_tr_usamp:.04f}')
print(f'Test ROC_AUC: {auc_te_usamp:.04f}')
print()
rec_tr_usamp = recall_score(y_train_usamp, y_tr_pred_usamp)
rec_te_usamp = recall_score(y_test_usamp, y_te_pred_usamp)
print(f'Training Recall: {rec_tr_usamp:.04f}')
print(f'Test Recall: {rec_te_usamp:.04f}')

In [None]:
# Perform t-SNE and plot training data
# Reduce the undersampled training features to two dimensions, then draw one
# scatter layer per class so the legend separates fraud from non-fraud.
tsne = TSNE(n_components=2, random_state=0)
X_t = tsne.fit_transform(X_train_usamp)

fh = plt.figure(figsize=(10,10))
for cls_val, cls_color, cls_label in ((0, 'g', 'Non Fraud'), (1, 'r', 'Fraud')):
    cls_mask = y_train_usamp == cls_val
    plt.scatter(X_t[cls_mask, 0], X_t[cls_mask, 1],
                marker='o', color=cls_color, label=cls_label)
plt.title('Fraud/Non-Fraud t-SNE Plot')
plt.legend()

In [None]:
#-- Autoencoder will be built using only non-fraud cases, then it should identify fraud cases
#--  as anomalies.

In [None]:
# Build the autoencoder
# Symmetric dense network: n_features -> 100 -> 50 -> 50 -> 100 -> n_features.
# X_train is only used here to read the number of input features.
X_train = X_train_usamp
n_features = X_train.shape[1]
inp_lyr = Input(shape=(n_features,))

# Encoder half; the first layer carries an L1 activity regularizer
enc_wide = Dense(100, activation='tanh', activity_regularizer=regularizers.l1(10e-5))
enc_narrow = Dense(50, activation='relu')
enc = enc_narrow(enc_wide(inp_lyr))

# Decoder half mirrors the encoder widths and ends at the input width
dec_narrow = Dense(50, activation='relu')
dec_wide = Dense(100, activation='relu')
dec = dec_wide(dec_narrow(enc))
out_lyr = Dense(n_features, activation='relu')(dec)

In [None]:
# Construct model and compile
# Tie the input tensor to the reconstruction tensor and train against
# mean-squared reconstruction error with the Adadelta optimizer.
# NOTE(review): Adadelta's default learning rate differs between Keras
# releases -- confirm the loss actually decreases over the configured epochs.
autoencoder = Model(inputs=inp_lyr, outputs=out_lyr)
autoencoder.compile(loss='mse', optimizer='adadelta')

In [None]:
# Build the model training set: min-max scaled non-fraud records
# The scaler is fit on every row of df (label and 'Time' columns dropped);
# rows are then separated by their Class value.
X_scale = MinMaxScaler().fit_transform( df.drop(['Class', 'Time'], axis=1).values )
cls_all = df['Class'].values
x_sc_norm, x_sc_fraud = X_scale[cls_all==0], X_scale[cls_all==1]

# Random sample for fitting (drawn without replacement from non-fraud rows).
# A dedicated seeded generator keeps the sample identical across fresh runs.
n_fit_samp = 2000
fit_idx = np.random.RandomState(0).choice(x_sc_norm.shape[0], n_fit_samp, replace=False)
fit_samp = x_sc_norm[fit_idx,:]

In [None]:
# Train the model
# The same array is passed as input and target, so the network learns to
# reconstruct it; 20% of the sample is held out as validation data.
autoencoder.fit(
    fit_samp, fit_samp,
    epochs=15,
    batch_size=256,
    validation_split=0.2,
    shuffle=True,
)

In [None]:
# Obtain latent representations
# Reuse the first three layers of the trained autoencoder as a stand-alone
# model (presumably the input layer plus the two encoder Dense layers).
hidden_rep = Sequential(autoencoder.layers[:3])

# Encode a random subset of non-fraud rows (twice the fitting sample size) and
# every fraud row. The draw uses its own seeded generator, with a different
# seed from the fitting sample, so it is reproducible but not a superset of it.
norm_idx = np.random.RandomState(1).choice(x_sc_norm.shape[0], n_fit_samp*2, replace=False)
norm_hid_rep = hidden_rep.predict(x_sc_norm[norm_idx,:])
fraud_hid_rep = hidden_rep.predict(x_sc_fraud)

# Stack both groups; labels are 0 for non-fraud rows and 1 for fraud rows
X_latent = np.append(norm_hid_rep, fraud_hid_rep, axis=0)
y_latent = np.append(np.zeros(norm_hid_rep.shape[0]), np.ones(fraud_hid_rep.shape[0]))

In [None]:
# Visualize latent representations (with t-SNE)
# Project the encoder outputs to two dimensions and draw one semi-transparent
# scatter layer per class.
tsne = TSNE(n_components=2, random_state=0)
X_t = tsne.fit_transform(X_latent)

fh = plt.figure(figsize=(10,10))
for cls_val, cls_color, cls_label in ((0, 'g', 'Non Fraud'), (1, 'r', 'Fraud')):
    cls_mask = y_latent == cls_val
    plt.scatter(X_t[cls_mask, 0], X_t[cls_mask, 1],
                marker='o', color=cls_color, alpha=0.7, label=cls_label)
plt.title('Fraud/Non-Fraud t-SNE Plot')
plt.legend()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, recall_score

# Train/test split
# NOTE(review): the classifier is trained on the 2-D t-SNE coordinates (X_t),
# not on the encoder output X_latent -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_t, y_latent, test_size=0.3, random_state=0)

# Create and train model
# An L1 penalty needs a solver that supports it. scikit-learn's default
# 'lbfgs' solver rejects penalty='l1', so 'liblinear' is requested explicitly.
# Alternative model: SVC(probability=True).fit(X_train, y_train)
model = LogisticRegression(penalty='l1', solver='liblinear').fit(X_train, y_train)
y_tr_pred = model.predict(X_train)
y_te_pred = model.predict(X_test)
y_tr_prob = model.predict_proba(X_train)
y_te_prob = model.predict_proba(X_test)

In [None]:
# Print metrics
# Same report as for the baseline: ROC AUC from the second predict_proba
# column, then recall from the hard predictions, for each split in turn.
for split_name, split_y, split_prob in (('Training', y_train, y_tr_prob),
                                        ('Test', y_test, y_te_prob)):
    split_auc = roc_auc_score(split_y, split_prob[:, 1])
    print(f'{split_name} ROC_AUC: {split_auc:.04f}')
print()
for split_name, split_y, split_pred in (('Training', y_train, y_tr_pred),
                                        ('Test', y_test, y_te_pred)):
    split_recall = recall_score(split_y, split_pred)
    print(f'{split_name} Recall: {split_recall:.04f}')

In [None]:
# Plot correct/incorrect in tSNE space

# The predictions plotted below come from the SVC baseline, so the plot is
# labelled accordingly.
set_name = 'Undersampling SVC Test Set'
# Embed the test features with t-SNE so that columns 0 and 1 of X are the
# embedding axes the plot title refers to. Using X_test_usamp directly would
# plot its first two raw feature columns instead.
X = TSNE(n_components=2, random_state=0).fit_transform(X_test_usamp)
y_true = y_test_usamp
y_pred = y_te_pred_usamp

def gen_pred_res(y_true, y_pred):
    """Encode each (truth, prediction) pair as a single outcome code.

    The code is ``y_true * 10 + y_pred``. For binary 0/1 labels this gives
    0 (true negative), 1 (false positive), 10 (false negative) and
    11 (true positive).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, broadcastable against ``y_true``.

    Returns
    -------
    numpy.ndarray
        Element-wise outcome codes.
    """
    # Coerce to arrays first: with plain lists, ``y_true * 10`` would repeat
    # the list ten times instead of scaling its values.
    return np.asarray(y_true) * 10 + np.asarray(y_pred)


pred_res = gen_pred_res(y_true, y_pred)

# Scatter styling for each outcome code produced by gen_pred_res
cases = {
    0:  dict(label='TN', color='b', alpha=0.5),
    1:  dict(label='FP', color='r', alpha=0.8),
    10: dict(label='FN', color='k', alpha=0.8),
    11: dict(label='TP', color='g', alpha=0.5),
}

fh = plt.figure(figsize=(10,10))

# One scatter layer per outcome, plotted over the first two columns of X
for outcome in cases:
    outcome_mask = pred_res == outcome
    plt.scatter(X[outcome_mask, 0], X[outcome_mask, 1], marker='o', **cases[outcome])

plt.title(f'{set_name} Prediction Errors in t-SNE Space')
plt.legend()


