In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the transactions, then overwrite 'Time' from the second row onward with
# the difference to the previous row's value; the first row keeps its original
# value. Assumes the default 0..n-1 row labels produced by read_csv.
data = pd.read_csv('../data/creditcard.csv')
data.loc[1:, 'Time'] = np.diff(data['Time'].to_numpy())

# Partition the rows by label: Class == 0 -> normal, Class == 1 -> anomaly.
normal = data.loc[data['Class'] == 0]
anomaly = data.loc[data['Class'] == 1]

In [3]:
# Split each class on its own: 60% goes to training, and the remaining 40% is
# halved into validation and test. The same seed is used for every split.
train_normal, test_normal = train_test_split(normal, random_state=42, test_size=0.4)
valid_normal, test_normal = train_test_split(test_normal, random_state=42, test_size=0.5)
train_anomaly, test_anomaly = train_test_split(anomaly, random_state=42, test_size=0.4)
valid_anomaly, test_anomaly = train_test_split(test_anomaly, random_state=42, test_size=0.5)

# Discard the row labels inherited from `data` so every partition is indexed from 0.
(train_normal, valid_normal, test_normal,
 train_anomaly, valid_anomaly, test_anomaly) = (
    part.reset_index(drop=True)
    for part in (train_normal, valid_normal, test_normal,
                 train_anomaly, valid_anomaly, test_anomaly)
)

# Report the (rows, columns) shape of each partition.
print('Normal Train:', train_normal.shape,
      'Normal Valid:', valid_normal.shape,
      'Normal Test:', test_normal.shape)
print('Anomaly Train:', train_anomaly.shape,
      'Anomaly Valid:', valid_anomaly.shape,
      'Anomaly Test:', test_anomaly.shape)

Normal Train: (170589, 31) Normal Valid: (56863, 31) Normal Test: (56863, 31)
Anomaly Train: (295, 31) Anomaly Valid: (98, 31) Anomaly Test: (99, 31)


In [4]:
# Stack the normal and anomaly partitions of each split into a single frame,
# shuffle the rows (sample(frac=1) with a fixed seed) and renumber the index
# from 0. pd.concat is used because DataFrame.append was removed in pandas 2.0;
# concatenating the two frames yields the same rows in the same order.
train = pd.concat([train_normal, train_anomaly]).sample(frac=1, random_state=42).reset_index(drop=True)
valid = pd.concat([valid_normal, valid_anomaly]).sample(frac=1, random_state=42).reset_index(drop=True)
test = pd.concat([test_normal, test_anomaly]).sample(frac=1, random_state=42).reset_index(drop=True)

In [24]:
def print_data(model, datasets=None, beta=1.5):
    """Print recall, precision, F-beta and the confusion matrix of `model` per split.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``.
    datasets
        Optional iterable of ``(label, frame)`` pairs. Each frame must hold the
        true labels in a 'Class' column; every other column is passed to
        ``model.predict``. When omitted, the notebook-level ``train``,
        ``valid`` and ``test`` frames are used, labelled 'Train', 'Valid'
        and 'Test'.
    beta
        ``beta`` argument forwarded to ``fbeta_score`` (1.5 by default).

    Returns
    -------
    None
        Results are written to stdout only.
    """
    if datasets is None:
        # Looked up at call time so the function always sees the current frames.
        datasets = (('Train', train), ('Valid', valid), ('Test', test))

    for label, frame in datasets:
        truth = frame['Class'].values
        predicted = model.predict(frame.drop(columns=['Class']))

        recall = recall_score(y_true=truth, y_pred=predicted)
        precision = precision_score(y_true=truth, y_pred=predicted)
        fbeta = fbeta_score(y_true=truth, y_pred=predicted, beta=beta)

        # The leading '\n' inside the labels keeps one metric per output line.
        print(label + ' Recall:', recall,
              '\n' + label + ' Precision:', precision,
              '\n' + label + ' F-score:', fbeta)

        print(label + ' Confusion Matrix: ')
        print(confusion_matrix(y_true=truth, y_pred=predicted))

In [25]:
# Logistic-regression baseline trained on every column except the 'Class' label.
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version before upgrading.
logistic = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    random_state=0,
).fit(train.drop(columns=['Class']), train['Class'])

print_data(logistic)

Train Recall: 0.5864406779661017 
Train Precision: 0.8522167487684729 
Train F-score: 0.6486876261897894
Train Confusion Matrix: 
[[170559     30]
 [   122    173]]
Valid Recall: 0.6122448979591837 
Valid Precision: 0.8823529411764706 
Valid F-score: 0.6759098786828422
Valid Confusion Matrix: 
[[56855     8]
 [   38    60]]
Test Recall: 0.5959595959595959 
Test Precision: 0.8676470588235294 
Test F-score: 0.6595012897678417
Test Confusion Matrix: 
[[56854     9]
 [   40    59]]


In [26]:
# Gaussian naive Bayes with library defaults, trained on all non-label columns.
gnb = GaussianNB().fit(train.drop(columns=['Class']), train['Class'])

print_data(gnb)

Train Recall: 0.823728813559322 
Train Precision: 0.058766626360338574 
Train F-score: 0.1645741078405835
Train Confusion Matrix: 
[[166697   3892]
 [    52    243]]
Valid Recall: 0.8469387755102041 
Valid Precision: 0.06102941176470588 
Valid F-score: 0.17067383739322997
Valid Confusion Matrix: 
[[55586  1277]
 [   15    83]]
Test Recall: 0.8282828282828283 
Test Precision: 0.05758426966292135 
Test F-score: 0.1618339152876879
Test Confusion Matrix: 
[[55521  1342]
 [   17    82]]


In [27]:
# Decision tree using the Gini criterion, limited to a depth of 6.
# class_weight gives class 0 five times the weight of class 1.
# NOTE(review): that up-weights the class that already has far more rows —
# confirm this is intended.
# random_state is pinned so re-running the cell rebuilds the same tree, in line
# with the seeded splits earlier in the notebook. Scores recorded from earlier
# unseeded runs may differ slightly from what this cell now prints.
tree = DecisionTreeClassifier(criterion='gini', max_depth=6,
                              class_weight={0: 5, 1: 1}, random_state=42)
tree.fit(train.drop(columns=['Class']), train['Class'])

print_data(tree)

Train Recall: 0.7762711864406779 
Train Precision: 1.0 
Train F-score: 0.8336600392047046
Train Confusion Matrix: 
[[170589      0]
 [    66    229]]
Valid Recall: 0.7551020408163265 
Valid Precision: 0.8809523809523809 
Valid F-score: 0.7898193760262725
Valid Confusion Matrix: 
[[56853    10]
 [   24    74]]
Test Recall: 0.7171717171717171 
Test Precision: 0.8987341772151899 
Test F-score: 0.764705882352941
Test Confusion Matrix: 
[[56855     8]
 [   28    71]]


In [28]:
# Linear discriminant analysis with library defaults on all non-label columns.
lda = LinearDiscriminantAnalysis().fit(train.drop(columns=['Class']), train['Class'])

print_data(lda)

Train Recall: 0.7423728813559322 
Train Precision: 0.85546875 
Train F-score: 0.773851590106007
Train Confusion Matrix: 
[[170552     37]
 [    76    219]]
Valid Recall: 0.826530612244898 
Valid Precision: 0.8804347826086957 
Valid F-score: 0.8423999999999999
Valid Confusion Matrix: 
[[56852    11]
 [   17    81]]
Test Recall: 0.7777777777777778 
Test Precision: 0.875 
Test F-score: 0.8053097345132745
Test Confusion Matrix: 
[[56852    11]
 [   22    77]]


In [29]:
# Bagging ensemble of 5 linear-discriminant models.
# random_state is pinned because bagging draws random resamples of the training
# rows; without a seed every run trains on different resamples and the reported
# scores cannot be reproduced. Scores recorded from earlier unseeded runs may
# differ slightly from what this cell now prints.
lda_bagging = BaggingClassifier(LinearDiscriminantAnalysis(), n_estimators=5,
                                random_state=42)
lda_bagging.fit(train.drop(columns=['Class']), train['Class'])

print_data(lda_bagging)

Train Recall: 0.7491525423728813 
Train Precision: 0.8565891472868217 
Train F-score: 0.779224301600217
Train Confusion Matrix: 
[[170552     37]
 [    74    221]]
Valid Recall: 0.826530612244898 
Valid Precision: 0.8804347826086957 
Valid F-score: 0.8423999999999999
Valid Confusion Matrix: 
[[56852    11]
 [   17    81]]
Test Recall: 0.7777777777777778 
Test Precision: 0.875 
Test F-score: 0.8053097345132745
Test Confusion Matrix: 
[[56852    11]
 [   22    77]]


In [30]:
# Quadratic discriminant analysis with library defaults on all non-label columns.
qda = QuadraticDiscriminantAnalysis().fit(train.drop(columns=['Class']), train['Class'])

print_data(qda)

Train Recall: 0.8745762711864407 
Train Precision: 0.05386221294363257 
Train F-score: 0.15374742149896858
Train Confusion Matrix: 
[[166057   4532]
 [    37    258]]
Valid Recall: 0.9081632653061225 
Valid Precision: 0.056400506970849175 
Valid F-score: 0.1608284681679177
Valid Confusion Matrix: 
[[55374  1489]
 [    9    89]]
Test Recall: 0.8484848484848485 
Test Precision: 0.05286343612334802 
Test F-score: 0.15068304125845175
Test Confusion Matrix: 
[[55358  1505]
 [   15    84]]


In [31]:
# Bagging ensemble of 11 quadratic-discriminant models.
# random_state is pinned because bagging draws random resamples of the training
# rows; without a seed every run trains on different resamples and the reported
# scores cannot be reproduced. Scores recorded from earlier unseeded runs may
# differ slightly from what this cell now prints.
qda_bagging = BaggingClassifier(QuadraticDiscriminantAnalysis(), n_estimators=11,
                                random_state=42)
qda_bagging.fit(train.drop(columns=['Class']), train['Class'])

print_data(qda_bagging)

Train Recall: 0.8711864406779661 
Train Precision: 0.057961208840775824 
Train F-score: 0.16384679515472514
Train Confusion Matrix: 
[[166412   4177]
 [    38    257]]
Valid Recall: 0.9081632653061225 
Valid Precision: 0.060792349726775954 
Valid F-score: 0.17171267438409027
Valid Confusion Matrix: 
[[55488  1375]
 [    9    89]]
Test Recall: 0.8484848484848485 
Test Precision: 0.057455540355677154 
Test F-score: 0.16204184597121235
Test Confusion Matrix: 
[[55485  1378]
 [   15    84]]


In [32]:
# Voting ensemble over the LDA, QDA and logistic-regression estimators built in
# earlier cells; the voting mode is left at the library default.
classifier = [
    ('lda', lda),
    ('qda', qda),
    ('log', logistic),
]
vote = VotingClassifier(classifier).fit(train.drop(columns=['Class']), train['Class'])

print_data(vote)

Train Recall: 0.7559322033898305 
Train Precision: 0.8415094339622642 
Train F-score: 0.780349932705249
Train Confusion Matrix: 
[[170547     42]
 [    72    223]]
Valid Recall: 0.8367346938775511 
Valid Precision: 0.8631578947368421 
Valid F-score: 0.8446909667194928
Valid Confusion Matrix: 
[[56850    13]
 [   16    82]]
Test Recall: 0.797979797979798 
Test Precision: 0.8494623655913979 
Test F-score: 0.8131433095803642
Test Confusion Matrix: 
[[56849    14]
 [   20    79]]


In [33]:
# Random forest with library-default hyperparameters.
# random_state is pinned because the forest is built from random resamples and
# random feature subsets; without a seed the reported scores change on every
# run. Scores recorded from earlier unseeded runs may differ slightly from what
# this cell now prints.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(train.drop(columns=['Class']), train['Class'])

print_data(rfc)



Train Recall: 0.9525423728813559 
Train Precision: 0.9964539007092199 
Train F-score: 0.9656357388316151
Train Confusion Matrix: 
[[170588      1]
 [    14    281]]
Valid Recall: 0.8163265306122449 
Valid Precision: 0.9411764705882353 
Valid F-score: 0.8510638297872339
Valid Confusion Matrix: 
[[56858     5]
 [   18    80]]
Test Recall: 0.7676767676767676 
Test Precision: 0.9620253164556962 
Test F-score: 0.8185584092792045
Test Confusion Matrix: 
[[56860     3]
 [   23    76]]


In [34]:
# AdaBoost over Gaussian naive Bayes base learners, 50 boosting rounds.
adb = AdaBoostClassifier(
    GaussianNB(),
    n_estimators=50,
).fit(train.drop(columns=['Class']), train['Class'])

print_data(adb)

Train Recall: 0.7661016949152543 
Train Precision: 0.029639344262295083 
Train F-score: 0.08861408535665813
Train Confusion Matrix: 
[[163190   7399]
 [    69    226]]
Valid Recall: 0.7959183673469388 
Valid Precision: 0.031075697211155377 
Valid F-score: 0.09284013916865042
Valid Confusion Matrix: 
[[54431  2432]
 [   20    78]]
Test Recall: 0.7878787878787878 
Test Precision: 0.031489705288655634 
Test F-score: 0.09389758310954718
Test Confusion Matrix: 
[[54464  2399]
 [   21    78]]


In [38]:
# Multi-layer perceptron with four hidden layers of 5, 4, 3 and 3 units,
# trained with the L-BFGS solver; alpha and the seed are fixed explicitly.
nn = MLPClassifier(
    hidden_layer_sizes=(5, 4, 3, 3),
    solver='lbfgs',
    alpha=1e-4,
    random_state=1,
).fit(train.drop(columns=['Class']), train['Class'])

print_data(nn)

Train Recall: 0.7796610169491526 
Train Precision: 0.8646616541353384 
Train F-score: 0.8039795643990321
Train Confusion Matrix: 
[[170553     36]
 [    65    230]]
Valid Recall: 0.826530612244898 
Valid Precision: 0.8804347826086957 
Valid F-score: 0.8423999999999999
Valid Confusion Matrix: 
[[56852    11]
 [   17    81]]
Test Recall: 0.8080808080808081 
Test Precision: 0.851063829787234 
Test F-score: 0.8208366219415943
Test Confusion Matrix: 
[[56849    14]
 [   19    80]]


In [36]:
# Gradient-boosted trees (XGBoost) capped at depth 4 with reg_lambda set to 0.5.
xg = XGBClassifier(
    max_depth=4,
    reg_lambda=0.5,
).fit(train.drop(columns=['Class']), train['Class'])

print_data(xg)

Train Recall: 0.8745762711864407 
Train Precision: 0.9923076923076923 
Train F-score: 0.9077131258457375
Train Confusion Matrix: 
[[170587      2]
 [    37    258]]
Valid Recall: 0.826530612244898 
Valid Precision: 0.9418604651162791 
Valid F-score: 0.8588907014681892
Valid Confusion Matrix: 
[[56858     5]
 [   17    81]]
Test Recall: 0.8080808080808081 
Test Precision: 0.9411764705882353 
Test F-score: 0.8448415922014622
Test Confusion Matrix: 
[[56858     5]
 [   19    80]]


In [37]:
# Nearest-neighbour classifier using a single neighbour. The recorded training
# scores are all 1.0, so judge this model on the validation and test figures.
knn = KNeighborsClassifier(n_neighbors=1).fit(train.drop(columns=['Class']), train['Class'])

print_data(knn)

Train Recall: 1.0 
Train Precision: 1.0 
Train F-score: 1.0
Train Confusion Matrix: 
[[170589      0]
 [     0    295]]
Valid Recall: 0.7346938775510204 
Valid Precision: 0.8275862068965517 
Valid F-score: 0.7609756097560976
Valid Confusion Matrix: 
[[56848    15]
 [   26    72]]
Test Recall: 0.7474747474747475 
Test Precision: 0.8131868131868132 
Test F-score: 0.7665338645418327
Test Confusion Matrix: 
[[56846    17]
 [   25    74]]
