In [35]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [36]:
data = pd.read_csv('../data/creditcard.csv')
x = np.array(data.loc[1:len(data)-1, 'Time'])
y = np.array(data.loc[0:len(data)-2, 'Time'])
data.loc[1:, 'Time'] = x-y
normal = data[data['Class']==0]
anomaly = data[data['Class']==1]

In [37]:
train_normal, test_normal = train_test_split(normal, test_size=0.02, random_state=42)
valid_normal, test_normal = train_test_split(test_normal, test_size=0.5, random_state=42)
train_anomaly, test_anomaly = train_test_split(anomaly, test_size=0.4, random_state=42)
valid_anomaly, test_anomaly = train_test_split(test_anomaly, test_size=0.5, random_state=42)

for x in [train_normal, valid_normal, test_normal, train_anomaly, valid_anomaly, test_anomaly]:
    x.reset_index(drop=True, inplace=True)

print('Normal Train:', train_normal.shape, 
      'Normal Valid:', valid_normal.shape, 
      'Normal Test:', test_normal.shape)
print('Anomaly Train:', train_anomaly.shape, 
      'Anomaly Valid:', valid_anomaly.shape, 
      'Anomaly Test:', test_anomaly.shape)

Normal Train: (278628, 31) Normal Valid: (2843, 31) Normal Test: (2844, 31)
Anomaly Train: (295, 31) Anomaly Valid: (98, 31) Anomaly Test: (99, 31)


In [38]:
train = train_normal.append(train_anomaly).sample(frac=1, random_state=42).reset_index(drop=True)
valid = valid_normal.append(valid_anomaly).sample(frac=1, random_state=42).reset_index(drop=True)
test = test_normal.append(test_anomaly).sample(frac=1, random_state=42).reset_index(drop=True)

In [39]:
def print_data(model):
    x = train
    predict_model = model.predict(x.drop(columns=['Class']))
    recall_model = recall_score(y_true=x['Class'].values, y_pred=predict_model)
    precision_model = precision_score(y_true=x['Class'].values, y_pred=predict_model)
    fbeta_model = fbeta_score(y_true=x['Class'].values, y_pred=predict_model, beta=1)

    print('Train Recall:', recall_model, 
          '\nTrain Precision:', precision_model, 
          '\nTrain F-score:', fbeta_model)

    x = valid
    predict_model = model.predict(x.drop(columns=['Class']))
    recall_model = recall_score(y_true=x['Class'].values, y_pred=predict_model)
    precision_model = precision_score(y_true=x['Class'].values, y_pred=predict_model)
    fbeta_model = fbeta_score(y_true=x['Class'].values, y_pred=predict_model, beta=1)

    print('Valid Recall:', recall_model, 
          '\nValid Precision:', precision_model, 
          '\nValid F-score:', fbeta_model)

    x = test
    predict_model = model.predict(x.drop(columns=['Class']))
    recall_model = recall_score(y_true=x['Class'].values, y_pred=predict_model)
    precision_model = precision_score(y_true=x['Class'].values, y_pred=predict_model)
    fbeta_model = fbeta_score(y_true=x['Class'].values, y_pred=predict_model, beta=1)

    print('Test Recall:', recall_model, 
          '\nTest Precision:', precision_model, 
          '\nTest F-score:', fbeta_model)

    #cnf_matrix_model = confusion_matrix(y_true=x['Class'].values, y_pred=predict_model)
    #cnf_matrix_model

In [None]:
logistic = LogisticRegression(random_state=0, solver='newton-cg',
                         multi_class='multinomial')
logistic.fit(train.drop(columns=['Class']), train['Class'])

In [23]:
print_data(test, logistic)

Recall: 0.5959595959595959 
Precision: 0.8676470588235294 
F-score: 0.6357758620689654


In [7]:
gnb = GaussianNB()
gnb.fit(train.drop(columns=['Class']), train['Class'])



x = test
recall_gnb = recall_score(y_true=x['Class'].values, y_pred=predict_gnb)
precision_gnb = precision_score(y_true=x['Class'].values, y_pred=predict_gnb)
fbeta_gnb = fbeta_score(y_true=x['Class'].values, y_pred=predict_gnb, beta=2)

print('Recall:', recall_gnb, '\nPrecision:', precision_gnb, '\nF-score:', fbeta_gnb)

cnf_matrix_gnb = confusion_matrix(y_true=x['Class'].values, y_pred=predict_gnb)
cnf_matrix_gnb

Recall: 0.8282828282828283 
Precision: 0.05758426966292135 
F-score: 0.22527472527472528


array([[55521,  1342],
       [   17,    82]])

In [43]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=7, class_weight={0:1, 1:20})
tree.fit(train.drop(columns=['Class']), train['Class'])

print_data(tree)

Train Recall: 0.8711864406779661 
Train Precision: 0.8210862619808307 
Train F-score: 0.8453947368421052
Valid Recall: 0.826530612244898 
Valid Precision: 0.9878048780487805 
Valid F-score: 0.9
Test Recall: 0.8181818181818182 
Test Precision: 1.0 
Test F-score: 0.9


In [13]:
SVM = SVC(kernel='linear',C=0.4)
SVM.fit(train.iloc[0:20000].drop(columns=['Class']), train.iloc[0:20000]['Class'])

print_data(test, SVM)
print_data(train, SVM)
print_data(valid, SVM)

Recall: 0.5050505050505051 
Precision: 0.9259259259259259 
F-score: 0.5555555555555557


array([[56859,     4],
       [   49,    50]])

In [25]:
lda = LinearDiscriminantAnalysis()
lda.fit(train.drop(columns=['Class']), train['Class'])

print_data(test, lda)
print_data(valid, lda)
print_data(train, lda)

Recall: 0.7777777777777778 
Precision: 0.875 
F-score: 0.7954545454545454
Recall: 0.7423728813559322 
Precision: 0.85546875 
F-score: 0.7625348189415042
Recall: 0.826530612244898 
Precision: 0.8804347826086957 
F-score: 0.8367768595041323


In [34]:
lda_bagging = BaggingClassifier(LinearDiscriminantAnalysis(), n_estimators=5)
lda_bagging.fit(train.drop(columns=['Class']), train['Class'])

print_data(train, lda_bagging)
print_data(valid, lda_bagging)
print_data(test, lda_bagging)

Recall: 0.7491525423728813 
Precision: 0.8565891472868217 
F-score: 0.7992766726943942
Recall: 0.826530612244898 
Precision: 0.8804347826086957 
F-score: 0.8526315789473684
Recall: 0.7777777777777778 
Precision: 0.875 
F-score: 0.823529411764706


In [26]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(train.drop(columns=['Class']), train['Class'])

print_data(train, qda)
print_data(valid, qda)
print_data(test, qda)

Recall: 0.8745762711864407 
Precision: 0.05386221294363257 
F-score: 0.21608040201005024
Recall: 0.8484848484848485 
Precision: 0.05286343612334802 
F-score: 0.2115869017632242
Recall: 0.9081632653061225 
Precision: 0.056400506970849175 
F-score: 0.2258883248730964


In [32]:
qda_bagging = BaggingClassifier(QuadraticDiscriminantAnalysis(), n_estimators=11)
qda_bagging.fit(train.drop(columns=['Class']), train['Class'])

print_data(train, qda_bagging)
print_data(valid, qda_bagging)
print_data(test, qda_bagging)

Recall: 0.8745762711864407 
Precision: 0.058265582655826556 
F-score: 0.10925259369045097
Recall: 0.9081632653061225 
Precision: 0.06070941336971351 
F-score: 0.11381074168797956
Recall: 0.8484848484848485 
Precision: 0.05718175629680054 
F-score: 0.10714285714285714


In [40]:
classifier = [('lda', lda), ('qda', qda), ('log', logistic)]
vote = VotingClassifier(classifier)

vote.fit(train.drop(columns=['Class']), train['Class'])

print_data(train, vote)
print_data(valid, vote)
print_data(test, vote)

Recall: 0.7559322033898305 
Precision: 0.8415094339622642 
F-score: 0.7964285714285715
Recall: 0.8367346938775511 
Precision: 0.8631578947368421 
F-score: 0.8497409326424872
Recall: 0.797979797979798 
Precision: 0.8494623655913979 
F-score: 0.8229166666666666


In [27]:
rfc = RandomForestClassifier()
rfc.fit(train.drop(columns=['Class']), train['Class'])

print_data(train, rfc)
print_data(valid, rfc)
print_data(test, rfc)



Recall: 0.9661016949152542 
Precision: 0.9965034965034965 
F-score: 0.9720327421555253
Recall: 0.8061224489795918 
Precision: 0.9186046511627907 
F-score: 0.8263598326359833
Recall: 0.7777777777777778 
Precision: 0.927710843373494 
F-score: 0.8037578288100209


In [12]:
adb = AdaBoostClassifier(GaussianNB(),
                         n_estimators=50)
adb.fit(train.drop(columns=['Class']), train['Class'])

print_data(train, adb)
print_data(valid, adb)
print_data(train, adb)

Recall: 0.7272727272727273 
Precision: 0.029715229054890633 
F-score: 0.12770485987938987


array([[54512,  2351],
       [   27,    72]])

In [8]:
nn = MLPClassifier(solver='lbfgs', alpha=1e-4, 
                   hidden_layer_sizes=(5, 4, 3, 3), random_state=1)
nn.fit(train.drop(columns=['Class']), train['Class'])

predict = nn.predict(test.drop(columns=['Class']))

x = test
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=2)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.8080808080808081 
Precision: 0.851063829787234 
F-score: 0.8163265306122448


array([[56849,    14],
       [   19,    80]])

In [11]:
predict = nn.predict(train.drop(columns=['Class']))

x = train
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=2)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.7796610169491526 
Precision: 0.8646616541353384 
F-score: 0.7952973720608575


array([[170553,     36],
       [    65,    230]])

In [43]:
predict = nn.predict(train.drop(columns=['Class']))

x = train
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=1)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.7796610169491526 
Precision: 0.8646616541353384 
F-score: 0.8199643493761142


array([[170553,     36],
       [    65,    230]])

In [44]:
xg = XGBClassifier(max_depth=4, reg_lambda=0.5)
xg.fit(train.drop(columns=['Class']), train['Class'])

predict = xg.predict(test.drop(columns=['Class']))

x = test
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=1)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.8080808080808081 
Precision: 0.9411764705882353 
F-score: 0.8695652173913043


array([[56858,     5],
       [   19,    80]])

In [47]:
x = train

predict = xg.predict(x.drop(columns=['Class']))
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=1)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.8745762711864407 
Precision: 0.9923076923076923 
F-score: 0.9297297297297297


array([[170587,      2],
       [    37,    258]])

In [69]:
gmm_normal = GaussianMixture(n_components=3, n_init=5, max_iter=500, random_state=42, tol=1e-6)
gmm_normal.fit(train_normal.drop(columns=['Class']))

gmm_anomaly = GaussianMixture(n_components=3, n_init=5, max_iter=500, random_state=42, tol=1e-5)
gmm_anomaly.fit(train_anomaly.drop(columns=['Class']))

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=500,
        means_init=None, n_components=3, n_init=5, precisions_init=None,
        random_state=42, reg_covar=1e-06, tol=1e-05, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [132]:
f_g_valid = gmm_normal.score_samples(valid.drop(columns=['Class'])) \
 - gmm_anomaly.score_samples(valid.drop(columns=['Class']))

In [203]:
gmm_tree = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=3)
gmm_tree.fit(f_g_valid.reshape(-1, 1), valid['Class'])

print_data

Recall: 0.8080808080808081 
Precision: 0.8247422680412371 
F-score: 0.8163265306122448


array([[56846,    17],
       [   19,    80]])

In [26]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train.drop(columns=['Class']), train['Class'])

print_data(knn)

Test Recall: 0.7474747474747475 
Test Precision: 0.9866666666666667 
Test F-score: 0.8505747126436782
