In [None]:
# Shell escape: clone the homework repository from GitHub into the current directory.
! git clone https://github.com/aghk73/Statistical_Learning_HW3.git

In [None]:
cd Statistical_Learning_HW3

In [None]:
# File name of the compressed dataset; passed to pd.read_csv when the data is loaded.
dataset = 'creditcard_short.csv.tar.gz'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras import Model


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to subsequent plots.
plt.style.use('ggplot')
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.4.0


In [None]:
# Load the dataset as float32, discard the first column, and drop every row
# that contains a NaN.
# NOTE(review): the file name ends in `.tar.gz` but it is read with
# compression='gzip'; presumably the dropped first column and the NaN rows are
# what absorb any tar framing -- confirm the archive layout.
data = (
    pd.read_csv(dataset, compression='gzip', dtype=np.float32)
    .iloc[:, 1:]
    .dropna()  # new frame rather than mutating the sliced frame in place
)
# Label column, kept on its own for the class-balance check.
target = data['Class']

In [None]:
# Class balance: distinct label values and how many rows carry each one.
np.unique(target, return_counts=True)

(array([0., 1.], dtype=float32), array([181766,    365]))

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.619995,0.0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.660004,0.0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798279,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.989998,0.0


In [None]:
# Feature matrix: every column except the last. Labels: the 'Class' column.
X = data.iloc[:, :-1].to_numpy()
y = data['Class'].to_numpy()
X.shape, y.shape

((182131, 29), (182131,))

In [None]:
# First hold out 20% of all rows as the test split, then take 10% of what is
# left as the validation split. Both splits are stratified on the labels and
# use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=46,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=46,
)
tuple(split.shape for split in (X_train, X_val, X_test))

((131133, 29), (14571, 29), (36427, 29))

In [None]:
# Label counts per split (train, test, validation) to check the class ratio after splitting.
tuple(np.unique(labels, return_counts=True) for labels in (y_train, y_test, y_val))

((array([0., 1.], dtype=float32), array([130870,    263])),
 (array([0., 1.], dtype=float32), array([36354,    73])),
 (array([0., 1.], dtype=float32), array([14542,    29])))

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = [
    scaler.transform(split) for split in (X_train, X_val, X_test)
]

In [None]:
# Candidate SVM classifiers, listed in the same order as the labels in
# `models_list` used when the scores are printed.
models = [
    LinearSVC(penalty='l1', dual=False, loss='squared_hinge', C=1),
    # The dual solver draws on a pseudo-random generator; pin the seed (same
    # value as the data splits) so repeated runs give the same fit.
    LinearSVC(penalty='l2', dual=True, loss='hinge', C=1, random_state=46),
    SVC(kernel='poly', degree=3),
    SVC(kernel='poly', degree=4),
    SVC(kernel='rbf', C=50),
]

In [None]:
# Train each candidate on the training split and keep whatever fit() returns;
# those objects are used as the fitted models in the cells below.
classifiers = []
for estimator in models:
    classifiers.append(estimator.fit(X_train, y_train))



In [None]:
def _reports(features, labels):
    """Return one classification report per entry of `classifiers`, in order.

    Each report compares `labels` with that classifier's predictions on
    `features`, using the two class names below as row labels.
    """
    class_names = ['non-fraudulent', 'fraudulent']
    return [
        classification_report(labels, clf.predict(features), target_names=class_names)
        for clf in classifiers
    ]


train_scores = _reports(X_train, y_train)
test_scores = _reports(X_test, y_test)

In [None]:
# Human-readable labels, one per entry of `classifiers`, in the same order.
models_list = [
    'linear svm l1-norm squared_hinge loss',
    'linear svm l2-norm hinge loss',
    'kernel poly degree 3',
    'kernel poly degree 4',
    'kernel RBF',
]

# Print the per-model reports: all training-split reports first, then the test-split ones.
for banner, reports in (
    ('***************** train set scores *****************\n', train_scores),
    ('***************** test set scores *****************\n', test_scores),
):
    print(banner)
    for label, report in zip(models_list, reports):
        print(f'******** {label} ********\n')
        print(report)

# Plain accuracy of each fitted model on the test split.
test_accs = [clf.score(X_test, y_test) for clf in classifiers]

print('---------- test accuracies ----------')
for label, acc in zip(models_list, test_accs):
    print(label + ': ', f'{acc:3.5f}')

***************** train set scores *****************

******** linear svm l1-norm squared_hinge loss ********

                precision    recall  f1-score   support

non-fraudulent       1.00      1.00      1.00    130870
    fraudulent       0.88      0.61      0.72       263

      accuracy                           1.00    131133
     macro avg       0.94      0.80      0.86    131133
  weighted avg       1.00      1.00      1.00    131133

******** linear svm l2-norm hinge loss ********

                precision    recall  f1-score   support

non-fraudulent       1.00      1.00      1.00    130870
    fraudulent       0.87      0.83      0.85       263

      accuracy                           1.00    131133
     macro avg       0.94      0.91      0.92    131133
  weighted avg       1.00      1.00      1.00    131133

******** kernel poly degree 3 ********

                precision    recall  f1-score   support

non-fraudulent       1.00      1.00      1.00    130870
    fraud