# Comparative Study of Supervised Learning Methods for Credit Card Fraud Detection

### 0) Set-up the notebook

In [1]:
# hide any warnings
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# install module that contains sampling methods
!pip install imblearn

Collecting imblearn
  Downloading https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn (from imblearn)
[?25l  Downloading https://files.pythonhosted.org/packages/e5/4c/7557e1c2e791bd43878f8c82065bddc5798252084f26ef44527c02262af1/imbalanced_learn-0.4.3-py3-none-any.whl (166kB)
[K    100% |████████████████████████████████| 174kB 7.0MB/s 
Collecting scikit-learn>=0.20 (from imbalanced-learn->imblearn)
[?25l  Downloading https://files.pythonhosted.org/packages/0c/b2/05be9b6da9ae4a4c54f537be22e95833f722742a02b1e355fdc09363877c/scikit_learn-0.20.0-cp36-cp36m-manylinux1_x86_64.whl (5.3MB)
[K    100% |████████████████████████████████| 5.3MB 6.7MB/s 
[?25hInstalling collected packages: scikit-learn, imbalanced-learn, imblearn
  Found existing installation: scikit-learn 0.19.2
    Uninstalling scikit-learn-0.19.2:
      Successfully uninstalled scikit-learn-0.19.2
Successfully

In [0]:
# import libraries
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, fbeta_score

### 1) Load dataset

In [0]:
# load dataset to colab notebook
from google.colab import drive
drive.mount('/content/drive/')

In [4]:
# observe that the dataset is in chronological order
df = pd.read_csv('drive/My Drive/CS3244 Credit Card Fraud Detection/Machine Learning/credit_card.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [6]:
# identify that that there are indeed much more non-fraudulent transactions in the dataset
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [7]:
# ratio is approximately 0.0017 fraudulent : 1 non-fraudulent
df.Class.value_counts()[1] / (df.Class.value_counts()[0] + df.Class.value_counts()[1])

0.001727485630620034

### 2) Pre-process dataset

#### 2a) Obtain validation and test data from temporal (time-series) dataset

In [0]:
# remove time column
df.drop('Time', inplace=True, axis=1)

In [0]:
# split temporal data into training, validation and testing
# use 6:2:2 ratio
X = df.loc[:, df.columns!='Class']
y = df.loc[:, df.columns=='Class']
X_train = X.iloc[:170884, :] 
y_train = y.iloc[:170884, :]
X_val = X.iloc[170884:227844, :] 
y_val = y.iloc[170884:227844, :] 
X_test = X.iloc[227844:, :] 
y_test = y.iloc[227844:, :]

#### 2b) Data normalisation

In [0]:
# scale all features such that they have mean 0 and unit variance
# to prevent data leakage, only use fit_transform for train data
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_val = std_scaler.transform(X_val)
X_test = std_scaler.transform(X_test)

### 3) Preliminary models on imbalanced data

#### 3a) Logistic regression

In [12]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56874    14]
 [   35    40]]
Classification accuracy: 0.9991397924968839
Sensitivity: 0.5333333333333333
Precision: 0.7407407407407407
F2 score: 0.5649717514124293


#### 3b) Naive bayes

In [13]:
# naive bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[55721  1167]
 [   15    60]]
Classification accuracy: 0.9792496883942208
Sensitivity: 0.8
Precision: 0.0488997555012225
F2 score: 0.19646365422396858


#### 3c) Decision tree

In [14]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56837    51]
 [   23    52]]
Classification accuracy: 0.9987009111177431
Sensitivity: 0.6933333333333334
Precision: 0.5048543689320388
F2 score: 0.6451612903225806


#### 3d) Conclusion

The trained models have arbitrarily high classification accuracy. This is a consequence of imbalanced class. 

### 4) Oversample minority class with SMOTE

#### 4a) SMOTE

In [0]:
# apply SMOTE to train data
from imblearn.over_sampling import SMOTE
sm = SMOTE(ratio='minority')
X_smtrain, y_smtrain = sm.fit_sample(X_train, y_train.values.ravel())

In [13]:
# verify that after sampling, the classes are equal
import collections
collections.Counter(y_smtrain)

Counter({0: 170524, 1: 170524})

#### 4b) Logistic regression

In [17]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_smtrain, y_smtrain)
y_pred = logreg.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[55542  1346]
 [    8    67]]
Classification accuracy: 0.9762301845057318
Sensitivity: 0.8933333333333333
Precision: 0.047416843595187545
F2 score: 0.19556333917104496


#### 4c) K nearest neighbours

In [18]:
# knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_smtrain, y_smtrain)
y_pred = knn.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56855    33]
 [   15    60]]
Classification accuracy: 0.9991573477520496
Sensitivity: 0.8
Precision: 0.6451612903225806
F2 score: 0.7633587786259541


In [19]:
# finding the best value of k
print('Results on validation set...')
neighbours = range(1, 10, 1)

for n_neighbour in neighbours:
  knn = KNeighborsClassifier(n_neighbors=n_neighbour)
  knn.fit(X_smtrain, y_smtrain)
  print('\nValue of K is %d...' % n_neighbour)
  y_pred = knn.predict(X_val)
  confusion = confusion_matrix(y_val, y_pred)
  TP = confusion[1][1]
  TN = confusion[0][0]
  FP = confusion[0][1]
  FN = confusion[1][0]
  sensitivity = TP / float(TP + FN)
  precision = TP / float(TP + FP)
  print('Classification accuracy:', accuracy_score(y_val, y_pred))
  print('Sensitivity:', sensitivity)
  print('Precision:', precision)
  print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...

Value of K is 1...
Classification accuracy: 0.9990695224719102
Sensitivity: 0.7543859649122807
Precision: 0.524390243902439
F2 score: 0.6935483870967742

Value of K is 2...
Classification accuracy: 0.9990695224719102
Sensitivity: 0.7543859649122807
Precision: 0.524390243902439
F2 score: 0.6935483870967742

Value of K is 3...
Classification accuracy: 0.9984901685393258
Sensitivity: 0.7543859649122807
Precision: 0.3739130434782609
F2 score: 0.6268221574344024

Value of K is 4...
Classification accuracy: 0.9984901685393258
Sensitivity: 0.7543859649122807
Precision: 0.3739130434782609
F2 score: 0.6268221574344024

Value of K is 5...
Classification accuracy: 0.9979108146067416
Sensitivity: 0.7894736842105263
Precision: 0.29605263157894735
F2 score: 0.5921052631578947

Value of K is 6...
Classification accuracy: 0.9979459269662921
Sensitivity: 0.7894736842105263
Precision: 0.3
F2 score: 0.5952380952380952

Value of K is 7...
Classification accuracy: 0.9974192415

In [20]:
# knn with k=2
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_smtrain, y_smtrain)
y_pred = knn.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56856    32]
 [   15    60]]
Classification accuracy: 0.9991749030072152
Sensitivity: 0.8
Precision: 0.6521739130434783
F2 score: 0.7653061224489798


#### 4d) Naive bayes

In [21]:
# naive bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_smtrain, y_smtrain)
y_pred = nb.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[55625  1263]
 [   14    61]]
Classification accuracy: 0.9775819391534856
Sensitivity: 0.8133333333333334
Precision: 0.04607250755287009
F2 score: 0.187807881773399


#### 4e) Decision tree

In [22]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_smtrain, y_smtrain)
y_pred = dt.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56736   152]
 [   23    52]]
Classification accuracy: 0.9969278303460141
Sensitivity: 0.6933333333333334
Precision: 0.2549019607843137
F2 score: 0.5158730158730158


#### 4f) Random forest

In [23]:
# random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_smtrain, y_smtrain)
y_pred = rf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56884     4]
 [   22    53]]
Classification accuracy: 0.9995435633656935
Sensitivity: 0.7066666666666667
Precision: 0.9298245614035088
F2 score: 0.7422969187675069


In [24]:
# finding the optimal number of trees in random forest
print('Results on validation set...')
num_estimators = range(1, 10, 1)

for num_tree in num_estimators:
  rf = RandomForestClassifier(random_state=0, n_estimators=num_tree)
  rf.fit(X_smtrain, y_smtrain)
  print('\nNumber of decision trees in random forest is %d...' % num_tree)
  y_pred = rf.predict(X_val)
  confusion = confusion_matrix(y_val, y_pred)
  TP = confusion[1][1]
  TN = confusion[0][0]
  FP = confusion[0][1]
  FN = confusion[1][0]
  sensitivity = TP / float(TP + FN)
  precision = TP / float(TP + FP)
  print('Classification accuracy:', accuracy_score(y_val, y_pred))
  print('Sensitivity:', sensitivity)
  print('Precision:', precision)
  print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...

Number of decision trees in random forest is 1...
Classification accuracy: 0.9937675561797753
Sensitivity: 0.631578947368421
Precision: 0.0972972972972973
F2 score: 0.3010033444816054

Number of decision trees in random forest is 2...
Classification accuracy: 0.9994030898876405
Sensitivity: 0.5263157894736842
Precision: 0.8108108108108109
F2 score: 0.5660377358490567

Number of decision trees in random forest is 3...
Classification accuracy: 0.999122191011236
Sensitivity: 0.6140350877192983
Precision: 0.5555555555555556
F2 score: 0.6013745704467354

Number of decision trees in random forest is 4...
Classification accuracy: 0.9995259831460674
Sensitivity: 0.5964912280701754
Precision: 0.8947368421052632
F2 score: 0.6390977443609023

Number of decision trees in random forest is 5...
Classification accuracy: 0.999561095505618
Sensitivity: 0.6491228070175439
Precision: 0.8809523809523809
F2 score: 0.6851851851851853

Number of decision trees in random forest i

In [25]:
# random forest with number of trees set at 8
rf = RandomForestClassifier(random_state=0, n_estimators=8)
rf.fit(X_smtrain, y_smtrain)
y_pred = rf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56882     6]
 [   22    53]]
Classification accuracy: 0.9995084528553623
Sensitivity: 0.7066666666666667
Precision: 0.8983050847457628
F2 score: 0.7381615598885793


#### 4g) Single layer perceptron

In [26]:
# single layer perceptron
from sklearn.linear_model import Perceptron
pct = Perceptron(tol=None, max_iter=1000)
pct.fit(X_smtrain, y_smtrain)
y_pred = pct.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[51639  5249]
 [    6    69]]
Classification accuracy: 0.9077471341045942
Sensitivity: 0.92
Precision: 0.012974802557352389
F2 score: 0.06140975436098255


#### 4h) Multi layer perceptron

In [27]:
# after some trials, mlp with 3 hidden layers appear to perform the best
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='sgd', # stochastic gradient descent
                    hidden_layer_sizes=(20,20,5), verbose=10, # state log
                    random_state=0, max_iter=1000, tol=1e-4) # set tolerance as 1e-4
mlp.fit(X_smtrain, y_smtrain)

Iteration 1, loss = 0.18973181
Iteration 2, loss = 0.10231158
Iteration 3, loss = 0.07750280
Iteration 4, loss = 0.06081228
Iteration 5, loss = 0.04771186
Iteration 6, loss = 0.03734229
Iteration 7, loss = 0.02990838
Iteration 8, loss = 0.02471018
Iteration 9, loss = 0.02118374
Iteration 10, loss = 0.01863738
Iteration 11, loss = 0.01664201
Iteration 12, loss = 0.01497504
Iteration 13, loss = 0.01357816
Iteration 14, loss = 0.01238342
Iteration 15, loss = 0.01139431
Iteration 16, loss = 0.01048671
Iteration 17, loss = 0.00979465
Iteration 18, loss = 0.00910163
Iteration 19, loss = 0.00852811
Iteration 20, loss = 0.00803835
Iteration 21, loss = 0.00763038
Iteration 22, loss = 0.00722934
Iteration 23, loss = 0.00688670
Iteration 24, loss = 0.00659646
Iteration 25, loss = 0.00635062
Iteration 26, loss = 0.00609455
Iteration 27, loss = 0.00589926
Iteration 28, loss = 0.00567958
Iteration 29, loss = 0.00545096
Iteration 30, loss = 0.00530347
Iteration 31, loss = 0.00513776
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=10, warm_start=False)

In [28]:
# performance on validation data
print('Results on validation set...')
y_pred = mlp.predict(X_val)
confusion = confusion_matrix(y_val, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_val, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...
[[56832    71]
 [   17    40]]
Classification accuracy: 0.9984550561797753
Sensitivity: 0.7017543859649122
Precision: 0.36036036036036034
F2 score: 0.5899705014749261


In [31]:
# l2 regularisation
mlp = MLPClassifier(solver='sgd', # stochastic gradient descent
                    alpha=0.0003, # regularisation
                    hidden_layer_sizes=(20,20,5), verbose=10, # state log
                    random_state=0, max_iter=1000, tol=1e-4) # set tolerance as 1e-4
mlp.fit(X_smtrain, y_smtrain)

Iteration 1, loss = 0.18976231
Iteration 2, loss = 0.10234649
Iteration 3, loss = 0.07753975
Iteration 4, loss = 0.06084899
Iteration 5, loss = 0.04774942
Iteration 6, loss = 0.03738147
Iteration 7, loss = 0.02994713
Iteration 8, loss = 0.02475126
Iteration 9, loss = 0.02122413
Iteration 10, loss = 0.01867855
Iteration 11, loss = 0.01668408
Iteration 12, loss = 0.01501937
Iteration 13, loss = 0.01362037
Iteration 14, loss = 0.01242778
Iteration 15, loss = 0.01143727
Iteration 16, loss = 0.01052656
Iteration 17, loss = 0.00983384
Iteration 18, loss = 0.00913644
Iteration 19, loss = 0.00856593
Iteration 20, loss = 0.00807769
Iteration 21, loss = 0.00767520
Iteration 22, loss = 0.00727923
Iteration 23, loss = 0.00695165
Iteration 24, loss = 0.00664877
Iteration 25, loss = 0.00640224
Iteration 26, loss = 0.00616972
Iteration 27, loss = 0.00595950
Iteration 28, loss = 0.00575064
Iteration 29, loss = 0.00551585
Iteration 30, loss = 0.00536478
Iteration 31, loss = 0.00520049
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0003, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=10, warm_start=False)

In [32]:
# performance on validation data
print('Results on validation set...')
y_pred = mlp.predict(X_val)
confusion = confusion_matrix(y_val, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_val, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...
[[56832    71]
 [   17    40]]
Classification accuracy: 0.9984550561797753
Sensitivity: 0.7017543859649122
Precision: 0.36036036036036034
F2 score: 0.5899705014749261


In [33]:
# performance on test data
print('Results on test set...')
y_pred = mlp.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56825    63]
 [   16    59]]
Classification accuracy: 0.9986131348419149
Sensitivity: 0.7866666666666666
Precision: 0.48360655737704916
F2 score: 0.6990521327014219


### 5) Undersample majority class with Near Miss

#### 5a) Near Miss

In [0]:
# apply near miss to train data
from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_nmtrain, y_nmtrain = nm.fit_sample(X_train, y_train.values.ravel())

In [35]:
# verify that after sampling, the classes are equal
import collections
collections.Counter(y_nmtrain)

Counter({0: 360, 1: 360})

#### 5b) Logistic regression

In [36]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_nmtrain, y_nmtrain)
y_pred = logreg.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[36549 20339]
 [    2    73]]
Classification accuracy: 0.6429085546758422
Sensitivity: 0.9733333333333334
Precision: 0.0035763276504017245
F2 score: 0.01762263422170722


#### 5c) K nearest neighbours

In [37]:
# knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_nmtrain, y_nmtrain)
y_pred = knn.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[29798 27090]
 [    4    71]]
Classification accuracy: 0.5243579165423169
Sensitivity: 0.9466666666666667
Precision: 0.0026140421928500424
F2 score: 0.012927424347256107


In [38]:
# finding the best value of k
print('Results on validation set...')
neighbours = range(1, 10, 1)

for n_neighbour in neighbours:
  knn = KNeighborsClassifier(n_neighbors=n_neighbour)
  knn.fit(X_nmtrain, y_nmtrain)
  print('\nValue of K is %d...' % n_neighbour)
  y_pred = knn.predict(X_val)
  confusion = confusion_matrix(y_val, y_pred)
  TP = confusion[1][1]
  TN = confusion[0][0]
  FP = confusion[0][1]
  FN = confusion[1][0]
  sensitivity = TP / float(TP + FN)
  precision = TP / float(TP + FP)
  print('Classification accuracy:', accuracy_score(y_val, y_pred))
  print('Sensitivity:', sensitivity)
  print('Precision:', precision)
  print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...

Value of K is 1...
Classification accuracy: 0.48491924157303373
Sensitivity: 0.9298245614035088
Precision: 0.0018034571934122771
F2 score: 0.008947866018368451

Value of K is 2...
Classification accuracy: 0.7567591292134831
Sensitivity: 0.9122807017543859
Precision: 0.0037404689972665803
F2 score: 0.01840056617126681

Value of K is 3...
Classification accuracy: 0.6857619382022472
Sensitivity: 0.9298245614035088
Precision: 0.0029529752618676175
F2 score: 0.014579665492957748

Value of K is 4...
Classification accuracy: 0.8262816011235955
Sensitivity: 0.9122807017543859
Precision: 0.005230335948501308
F2 score: 0.025565388397246806

Value of K is 5...
Classification accuracy: 0.7912219101123595
Sensitivity: 0.9122807017543859
Precision: 0.0043554736577602815
F2 score: 0.021369277554039615

Value of K is 6...
Classification accuracy: 0.8738939606741573
Sensitivity: 0.8947368421052632
Precision: 0.007055893746541229
F2 score: 0.03420064377682403

Value of K is

In [39]:
# knn with k=8
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_nmtrain, y_nmtrain)
y_pred = knn.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[52529  4359]
 [    8    67]]
Classification accuracy: 0.9233362006916771
Sensitivity: 0.8933333333333333
Precision: 0.015137821961138725
F2 score: 0.07088446889547187


#### 5d) Naive bayes

In [40]:
# naive bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_nmtrain, y_nmtrain)
y_pred = nb.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[25897 30991]
 [    5    70]]
Classification accuracy: 0.4558573108860137
Sensitivity: 0.9333333333333333
Precision: 0.0022536299539615595
F2 score: 0.011160358406938554


#### 5e) Decision tree

In [41]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_nmtrain, y_nmtrain)
y_pred = dt.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[12650 44238]
 [    3    72]]
Classification accuracy: 0.22333795621719363
Sensitivity: 0.96
Precision: 0.0016249153689911983
F2 score: 0.008069939475453933


#### 5f) Random forest

In [42]:
# random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_nmtrain, y_nmtrain)
y_pred = rf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[ 2550 54338]
 [    1    74]]
Classification accuracy: 0.046064989554623174
Sensitivity: 0.9866666666666667
Precision: 0.0013599941189443505
F2 score: 0.0067626846030121365


In [43]:
# finding the optimal number of trees in random forest
print('Results on validation set...')
num_estimators = range(1, 10, 1)

for num_tree in num_estimators:
  rf = RandomForestClassifier(random_state=0, n_estimators=num_tree)
  rf.fit(X_nmtrain, y_nmtrain)
  print('\nNumber of decision trees in random forest is %d...' % num_tree)
  y_pred = rf.predict(X_val)
  confusion = confusion_matrix(y_val, y_pred)
  TP = confusion[1][1]
  TN = confusion[0][0]
  FP = confusion[0][1]
  FN = confusion[1][0]
  sensitivity = TP / float(TP + FN)
  precision = TP / float(TP + FP)
  print('Classification accuracy:', accuracy_score(y_val, y_pred))
  print('Sensitivity:', sensitivity)
  print('Precision:', precision)
  print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...

Number of decision trees in random forest is 1...
Classification accuracy: 0.20180828651685392
Sensitivity: 0.9473684210526315
Precision: 0.0011863959926179805
F2 score: 0.005902413431269675

Number of decision trees in random forest is 2...
Classification accuracy: 0.24074789325842696
Sensitivity: 0.9298245614035088
Precision: 0.001224131559497413
F2 score: 0.0060885947982722175

Number of decision trees in random forest is 3...
Classification accuracy: 0.07550912921348314
Sensitivity: 0.9824561403508771
Precision: 0.0010623363812269984
F2 score: 0.005288806618563711

Number of decision trees in random forest is 4...
Classification accuracy: 0.1180126404494382
Sensitivity: 0.9649122807017544
Precision: 0.0010936350440436658
F2 score: 0.005443496506264968

Number of decision trees in random forest is 5...
Classification accuracy: 0.03297050561797753
Sensitivity: 1.0
Precision: 0.0010337510654890368
F2 score: 0.005147470514927665

Number of decision trees i

In [44]:
# random forest with number of trees set at 7
rf = RandomForestClassifier(random_state=0, n_estimators=7)
rf.fit(X_nmtrain, y_nmtrain)
y_pred = rf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[ 1116 55772]
 [    1    74]]
Classification accuracy: 0.02089075364710426
Sensitivity: 0.9866666666666667
Precision: 0.001325072520860939
F2 score: 0.006589961885085314


#### 5g) Single layer perceptron

In [45]:
# single layer perceptron
from sklearn.linear_model import Perceptron
pct = Perceptron(tol=None, max_iter=1000)
pct.fit(X_nmtrain, y_nmtrain)
y_pred = pct.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print('Results on test set...')
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[25824 31064]
 [    4    71]]
Classification accuracy: 0.45459333251408807
Sensitivity: 0.9466666666666667
Precision: 0.002280391841978481
F2 score: 0.011293144584062352


#### 5h) Multi layer perceptron

In [46]:
# after some trials, mlp with 3 hidden layers appear to perform the best
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='sgd', # stochastic gradient descent
                    hidden_layer_sizes=(20,20,5), verbose=10, # state log
                    random_state=0, max_iter=1000, tol=1e-4) # set tolerance as 1e-4
mlp.fit(X_nmtrain, y_nmtrain)

Iteration 1, loss = 0.77466037
Iteration 2, loss = 0.70151136
Iteration 3, loss = 0.63282393
Iteration 4, loss = 0.57582175
Iteration 5, loss = 0.52598895
Iteration 6, loss = 0.48641368
Iteration 7, loss = 0.45434683
Iteration 8, loss = 0.43206920
Iteration 9, loss = 0.41715298
Iteration 10, loss = 0.40627984
Iteration 11, loss = 0.39800800
Iteration 12, loss = 0.39225328
Iteration 13, loss = 0.38721154
Iteration 14, loss = 0.38317877
Iteration 15, loss = 0.37976953
Iteration 16, loss = 0.37664810
Iteration 17, loss = 0.37383303
Iteration 18, loss = 0.37129605
Iteration 19, loss = 0.36872169
Iteration 20, loss = 0.36639727
Iteration 21, loss = 0.36402682
Iteration 22, loss = 0.36204242
Iteration 23, loss = 0.35994929
Iteration 24, loss = 0.35791559
Iteration 25, loss = 0.35592690
Iteration 26, loss = 0.35402229
Iteration 27, loss = 0.35225155
Iteration 28, loss = 0.35030475
Iteration 29, loss = 0.34846671
Iteration 30, loss = 0.34670838
Iteration 31, loss = 0.34488775
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=10, warm_start=False)

In [47]:
# performance on validation data
print('Results on validation set...')
y_pred = mlp.predict(X_val)
confusion = confusion_matrix(y_val, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_val, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...
[[21880 35023]
 [    2    55]]
Classification accuracy: 0.3850948033707865
Sensitivity: 0.9649122807017544
Precision: 0.0015679343178060324
F2 score: 0.007789044355067128


In [54]:
# l2 regularisation
mlp = MLPClassifier(solver='sgd', # stochastic gradient descent
                    alpha=0.0002, # regularisation
                    hidden_layer_sizes=(20,20,5), verbose=10, # state log
                    random_state=0, max_iter=1000, tol=1e-4) # set tolerance as 1e-4
mlp.fit(X_nmtrain, y_nmtrain)

Iteration 1, loss = 0.77467588
Iteration 2, loss = 0.70152687
Iteration 3, loss = 0.63283945
Iteration 4, loss = 0.57583728
Iteration 5, loss = 0.52600449
Iteration 6, loss = 0.48642923
Iteration 7, loss = 0.45436241
Iteration 8, loss = 0.43208479
Iteration 9, loss = 0.41716858
Iteration 10, loss = 0.40629545
Iteration 11, loss = 0.39802361
Iteration 12, loss = 0.39226891
Iteration 13, loss = 0.38722717
Iteration 14, loss = 0.38319440
Iteration 15, loss = 0.37978517
Iteration 16, loss = 0.37666375
Iteration 17, loss = 0.37384868
Iteration 18, loss = 0.37131171
Iteration 19, loss = 0.36873736
Iteration 20, loss = 0.36641294
Iteration 21, loss = 0.36404250
Iteration 22, loss = 0.36205810
Iteration 23, loss = 0.35996498
Iteration 24, loss = 0.35793128
Iteration 25, loss = 0.35594260
Iteration 26, loss = 0.35403799
Iteration 27, loss = 0.35226726
Iteration 28, loss = 0.35032047
Iteration 29, loss = 0.34848243
Iteration 30, loss = 0.34672411
Iteration 31, loss = 0.34490348
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0002, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=10, warm_start=False)

In [55]:
# performance on validation data
print('Results on validation set...')
y_pred = mlp.predict(X_val)
confusion = confusion_matrix(y_val, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_val, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_val, y_pred, 2))

Results on validation set...
[[21877 35026]
 [    2    55]]
Classification accuracy: 0.38504213483146066
Sensitivity: 0.9649122807017544
Precision: 0.001567800233744762
F2 score: 0.007788382565351611


In [56]:
# performance on test data
print('Results on test set...')
y_pred = mlp.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[23714 33174]
 [    3    72]]
Classification accuracy: 0.41756929936976633
Sensitivity: 0.96
Precision: 0.002165674066053059
F2 score: 0.010731532820604543


### 6) Explore stacked models

#### 6a) K nearest neighbours & Random forest

In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# split data equally into training and validation
X_stacktrain, X_stackval, y_stacktrain, y_stackval = train_test_split(X_smtrain, y_smtrain, test_size=0.5, 
                                                                      shuffle=False, stratify=None)

# train base learners with the same training set
knn = KNeighborsClassifier(n_neighbors=2)
rf = RandomForestClassifier(random_state=0, n_estimators=8)
knn.fit(X_stacktrain, y_stacktrain)
rf.fit(X_stacktrain, y_stacktrain)

# use trained base learners to predict the classes in the validation set
knn_val_preds = knn.predict(X_stackval)
rf_val_preds = rf.predict(X_stackval)
stack_val_preds = pd.DataFrame(X_stackval.copy())
stack_val_preds['knn'] = knn_val_preds
stack_val_preds['rf'] = rf_val_preds

# train meta learner on stacked predictions
meta_model = MLPClassifier(solver='sgd', alpha=0.0003,
                    hidden_layer_sizes=(20,20,5),
                    random_state=0, max_iter=1000)
meta_model.fit(stack_val_preds, y_stackval)

# re-train base learners with full training set
knn.fit(X_smtrain, y_smtrain)
rf.fit(X_smtrain, y_smtrain)

# use trained base leaners to predict the classes in the test set
knn_test_preds = knn.predict(X_test)
rf_test_preds = rf.predict(X_test)
stack_test_preds = pd.DataFrame(X_test.copy())
stack_test_preds['knn'] = knn_test_preds
stack_test_preds['rf'] = rf_test_preds

# use trained meta learner to predict the classes in the test set
print('Results on test set...')
y_pred = meta_model.predict(stack_test_preds)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56873    15]
 [   18    57]]
Classification accuracy: 0.999420676579534
Sensitivity: 0.76
Precision: 0.7916666666666666
F2 score: 0.7661290322580645


#### 6b) K nearest neighbours & Multi layer perceptron

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# split data equally into training and validation
X_stacktrain, X_stackval, y_stacktrain, y_stackval = train_test_split(X_smtrain, y_smtrain, test_size=0.5, 
                                                                     shuffle=False, stratify=None)

# train base learners with the same training set
knn = KNeighborsClassifier(n_neighbors=2)
mlp = MLPClassifier(solver='sgd', alpha=0.0003,
                    hidden_layer_sizes=(20,20,5),
                    random_state=0, max_iter=1000)
knn.fit(X_stacktrain, y_stacktrain)
mlp.fit(X_stacktrain, y_stacktrain)

# use trained base learners to predict the classes in the validation set
knn_val_preds = knn.predict(X_stackval)
mlp_val_preds = rf.predict(X_stackval)
stack_val_preds = pd.DataFrame(X_stackval.copy())
stack_val_preds['knn'] = knn_val_preds
stack_val_preds['mlp'] = mlp_val_preds

# train meta learner on stacked predictions
meta_model = RandomForestClassifier(random_state=0, n_estimators=8)
meta_model.fit(stack_val_preds, y_stackval)

# re-train base learners with full training set
knn.fit(X_smtrain, y_smtrain)
mlp.fit(X_smtrain, y_smtrain)

# use trained base leaners to predict the classes in the test set
knn_test_preds = knn.predict(X_test)
mlp_test_preds = mlp.predict(X_test)
stack_test_preds = pd.DataFrame(X_test.copy())
stack_test_preds['knn'] = knn_test_preds
stack_test_preds['mlp'] = mlp_test_preds

# use trained meta learner to predict the classes in the test set
print('Results on test set...')
y_pred = meta_model.predict(stack_test_preds)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56882     6]
 [   19    56]]
Classification accuracy: 0.9995611186208592
Sensitivity: 0.7466666666666667
Precision: 0.9032258064516129
F2 score: 0.7734806629834254


#### 6c) Random forest & Multi layer perceptron

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# split data equally into training and validation
X_stacktrain, X_stackval, y_stacktrain, y_stackval = train_test_split(X_smtrain, y_smtrain, test_size=0.5, 
                                                                     shuffle=False, stratify=None)

# train base learners with the same training set
rf = RandomForestClassifier(random_state=0, n_estimators=8)
mlp = MLPClassifier(solver='sgd', alpha=0.0003,
                    hidden_layer_sizes=(20,20,5),
                    random_state=0, max_iter=1000)
rf.fit(X_stacktrain, y_stacktrain)
mlp.fit(X_stacktrain, y_stacktrain)

# use trained base learners to predict the classes in the validation set
rf_val_preds = knn.predict(X_stackval)
mlp_val_preds = rf.predict(X_stackval)
stack_val_preds = pd.DataFrame(X_stackval.copy())
stack_val_preds['rf'] = rf_val_preds
stack_val_preds['mlp'] = mlp_val_preds

# train meta learner on stacked predictions
meta_model = KNeighborsClassifier(n_neighbors=2)
meta_model.fit(stack_val_preds, y_stackval)

# re-train base learners with full training set
rf.fit(X_smtrain, y_smtrain)
mlp.fit(X_smtrain, y_smtrain)

# use trained base leaners to predict the classes in the test set
rf_test_preds = rf.predict(X_test)
mlp_test_preds = mlp.predict(X_test)
stack_test_preds = pd.DataFrame(X_test.copy())
stack_test_preds['rf'] = rf_test_preds
stack_test_preds['mlp'] = mlp_test_preds

# use trained meta learner to predict the classes in the test set
print('Results on test set...')
y_pred = meta_model.predict(stack_test_preds)
confusion = confusion_matrix(y_test, y_pred)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
sensitivity = TP / float(TP + FN)
precision = TP / float(TP + FP)
print(confusion)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Sensitivity:', sensitivity)
print('Precision:', precision)
print('F2 score:', fbeta_score(y_test, y_pred, 2))

Results on test set...
[[56865    23]
 [   16    59]]
Classification accuracy: 0.9993153450485402
Sensitivity: 0.7866666666666666
Precision: 0.7195121951219512
F2 score: 0.7722513089005236
