In [65]:
%matplotlib inline
import scipy.stats as stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


In [66]:
df = pd.read_csv('creditcard.csv')

In [67]:
scaler1 = StandardScaler()
scaler2 = StandardScaler()
amount_scaled = scaler1.fit_transform(df[['Amount']])
flat_list_1 = [item for sublist in amount_scaled.tolist() for item in sublist]
amount_scaled = pd.Series(flat_list_1)
time_scaled = scaler2.fit_transform(df[['Time']])
flat_list_2 = [item for sublist in time_scaled.tolist() for item in sublist]
time_scaled = pd.Series(flat_list_2)

In [68]:
df = pd.concat([df, amount_scaled.rename('amount_scaled'), time_scaled.rename('time_scaled')], axis=1)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,amount_scaled,time_scaled
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,0.244964,-1.996583
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0,-0.342475,-1.996583
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,1.160686,-1.996562
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0,0.140534,-1.996562
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0,-0.073403,-1.996541


In [69]:
df.drop(['Amount', 'Time'], axis=1, inplace=True)

In [70]:
mask = np.random.rand(len(df)) < 0.9
train = df[mask]
test = df[~mask]
print(f'Train data shape: {train.shape}')
print(f'Test Shape: {test.shape}')

Train data shape: (256098, 31)
Test Shape: (28709, 31)


In [71]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [72]:
no_of_frauds = train.Class.value_counts()[1]
print('There are {} fraudulent transactions in the train data.'.format(no_of_frauds))

There are 442 fraudulent transactions in the train data.


In [73]:
normal = train[train['Class'] == 0]
fraud = train[train['Class'] == 1]

In [74]:
selected = normal.sample(no_of_frauds)
selected.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,amount_scaled,time_scaled
21590,1.305739,-0.507857,-0.096756,-0.544395,-0.868778,-1.322867,0.016268,-0.340704,-1.073134,0.682378,...,-1.80902,0.188311,0.353292,0.051411,0.70213,-0.097754,0.013258,0,-0.085757,-1.301187
103944,-1.005068,0.282795,1.896848,-1.843361,-1.199319,-0.134268,-0.79758,0.817446,-1.161327,0.022574,...,1.370488,-0.186766,0.223769,-0.008294,-0.177408,0.228345,0.090871,0,-0.313328,-0.43916
246736,-0.79598,0.216408,-0.957177,-1.230082,-1.946009,0.879154,2.043298,-0.053746,-1.594057,0.079245,...,0.455407,0.058784,0.813537,-0.812268,0.221479,0.112985,-0.028279,0,1.451907,1.498906
236924,-0.63957,-0.234346,1.403517,1.055268,1.460704,0.380254,0.330439,-0.164897,0.440367,0.318589,...,-0.892413,0.094685,-0.871738,-0.788101,-1.045324,-0.262034,-0.090915,0,-0.291659,1.393343
35338,-5.394164,-5.196116,0.159648,-0.239677,-2.519266,1.606067,1.515646,0.004459,-0.183988,-0.231655,...,0.080731,-1.687427,-0.027051,-0.071217,-0.303035,1.125186,-1.340662,0,3.519237,-1.158246


In [75]:
selected.reset_index(drop=True, inplace=True)
fraud.reset_index(drop=True, inplace=True)

In [76]:
subsample = pd.concat([selected, fraud])
len(subsample)

884

In [77]:
subsample = subsample.sample(frac=1).reset_index(drop=True)
subsample.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,amount_scaled,time_scaled
0,2.0919,-0.757459,-1.192258,-0.755458,-0.620324,-0.322077,-1.082511,0.1172,-0.140927,0.249311,...,0.831939,0.142007,0.592615,-0.196143,-0.136676,0.020182,-0.01547,1,-0.273468,1.517626
1,0.947344,-0.688031,1.143622,0.482926,-1.313024,-0.367951,-0.61414,-0.059097,2.399565,-0.817553,...,-0.429805,-0.049549,0.407812,0.029241,0.937382,-0.08924,0.034043,0,0.246284,-1.763998
2,-1.404419,1.192014,0.728767,-1.008225,0.000447,-0.233754,0.299133,0.454832,0.546386,0.289724,...,-0.610565,-0.093989,-0.544204,-0.107106,0.306957,0.083838,-0.015608,0,-0.349231,-1.968892
3,1.769945,-0.358679,-1.868018,0.160267,0.733966,0.752475,-0.226617,0.364024,0.899733,-0.807962,...,-0.122457,0.187898,-1.55092,-0.323842,0.009826,0.02784,-0.039578,0,-0.117302,1.041823
4,-12.339603,4.488267,-16.587073,10.107274,-10.420199,0.13067,-15.600323,-1.157696,-5.304631,-12.938929,...,1.745315,1.376816,-0.554271,-1.610741,0.153725,1.212477,-1.86929,1,0.401529,-1.603326
5,-0.469327,1.111453,2.041003,1.731595,0.135147,-0.093625,0.266155,0.082988,0.580255,-0.164563,...,0.59267,-0.053596,0.320748,-0.369121,-0.136605,-0.100845,0.039347,1,-0.337517,-1.633755
6,1.086255,0.385727,1.172723,2.543302,-0.488313,-0.025821,-0.294069,0.110881,-0.471091,0.627513,...,0.564999,0.007218,0.398652,0.315433,0.106458,0.028456,0.030561,0,-0.31077,-0.852885
7,-0.513443,1.108307,1.300984,-0.257165,0.237911,-0.391081,0.664932,0.002833,-0.649458,-0.015944,...,-0.320782,0.003133,0.000859,-0.597031,-0.013136,0.223426,0.20873,0,-0.341275,-1.433347
8,-3.61385,-0.922136,-4.749887,3.373001,-0.545207,-1.171301,-4.172315,1.517016,-1.775833,-3.754054,...,0.893065,1.034907,0.097671,-1.345551,-0.788329,1.055442,0.099971,1,0.225693,1.011331
9,1.202884,0.323593,0.466062,1.120843,-0.346606,-0.874429,0.164562,-0.220293,-0.024821,-0.025602,...,-0.094349,-0.039312,0.394384,0.553782,-0.442428,0.022789,0.029544,0,-0.273308,-0.415512


In [78]:
corr = subsample.corr()
corr = corr[['Class']]

In [79]:
corr[corr.Class > 0.5]

Unnamed: 0,Class
V4,0.705298
V11,0.691612
Class,1.0


In [80]:
corr[corr.Class < -0.5]

Unnamed: 0,Class
V3,-0.568964
V9,-0.572177
V10,-0.632527
V12,-0.685001
V14,-0.749569
V16,-0.59454
V17,-0.556379


In [81]:
Q1 = subsample.quantile(0.25)
Q3 = subsample.quantile(0.75)
IQR = Q3 - Q1

df2 = subsample[~((subsample < (Q1 - 2.5 * IQR)) |(subsample > (Q3 + 2.5 * IQR))).any(axis=1)]

In [82]:
len_after = len(df2)
len_before = len(subsample)
len_difference = len(subsample) - len(df2)
print(f'Reduced size of data from {len_before} transactions by {len_difference} transactions to {len_after} transactions.')

Reduced size of data from 884 transactions by 262 transactions to 622 transactions.


In [83]:
X = df2.drop('Class', axis=1)
y = df2['Class']

In [84]:
X_reduced_tsne = TSNE(n_components=2, random_state=42).fit_transform(X.values)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [86]:
X_train = X_train.values
X_validation = X_test.values
y_train = y_train.values
y_validation = y_test.values

In [87]:
print('X_shapes:', X_train.shape, X_validation.shape)
print('Y_shapes:', y_train.shape, y_validation.shape)

X_shapes: (497, 30) (125, 30)
Y_shapes: (497,) (125,)


In [88]:
models = []
names = []
results = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('SVM', SVC()))

for name, model in models:
    kfold = KFold(n_splits=10, random_state=42)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    print(f'The ROC AUC score for algorithm {name} is {cv_results.mean()}')

The ROC AUC score for algorithm KNN is 0.9558806374354749
The ROC AUC score for algorithm Decision Tree is 0.8998956779603366
The ROC AUC score for algorithm SVM is 0.9641462056368244


