In [96]:
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef
import pickle
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

mcc = metrics.make_scorer(metrics.matthews_corrcoef)

def evaluate_pred(ytest, ypred):
    precision = precision_score(ytest, ypred)
    recall = recall_score(ytest, ypred)
    f1 = f1_score(ytest, ypred)
    mcc = matthews_corrcoef(ytest, ypred)
    metricdict = {"precision" : round(precision, 4),
                 "recall" : round(recall, 4),
                  "mcc" : round(mcc, 4),
                  "f1" : round(f1, 4)}
    print(f"Precision : {precision:.4f}")
    print(f"Recall : {recall:.4f}")
    print(f"MCC : {mcc:.4f}")
    print(f"F1 score : {f1:.4f}")
    return metricdict

In [70]:
data = pd.read_csv("data/creditcard.csv")
x = data.drop("Class", axis = 1)
y = data["Class"]
ycounter = Counter(y)
print("Distribution of entire dataset- ", ycounter)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
ytraincounter = Counter(ytrain)
print("Distribution of test set- ", ytraincounter)
ytestcounter = Counter(ytest)
print("Distribution of test set- ", ytestcounter)
xtrain0 = xtrain[ytrain==0]
xtrain1 = xtrain[ytrain==1]
# output- 
# Distribution of entire dataset-  Counter({0: 284315, 1: 492})
# Distribution of test set-  Counter({0: 227439, 1: 406})
# Distribution of test set-  Counter({0: 56876, 1: 86})

Distribution of entire dataset-  Counter({0: 284315, 1: 492})
Distribution of test set-  Counter({0: 227443, 1: 402})
Distribution of test set-  Counter({0: 56872, 1: 90})


In [3]:
# training a Randomforest classifier
# modelrfc = RandomForestClassifier()
# modelrfc.fit(xtrain, ytrain)
# pickle.dump(modelrfc, open("models/modelrfc.pkl", "wb"))
modelrfc = pickle.load(open("models/modelrfc.pkl", "rb"))
ypredrfc = modelrfc.predict(xtest)
ypredrfccounter = Counter(ypredrfc)
print("Distribution of predicted- ", ypredrfccounter)
evaluate_pred(ytest, ypredrfc);

# output-
# Distribution of predicted-  Counter({0: 56880, 1: 82})
# Precision : 0.9756
# Recall : 0.8247
# MCC : 0.8969
# F1 score : 0.8939

Distribution of predicted-  Counter({0: 56878, 1: 84})
Precision : 1.0000
Recall : 0.9438
MCC : 0.9715
F1 score : 0.9711


In [4]:
# define outlier detection model
# modelsvm1 = OneClassSVM(gamma='auto', nu=0.17)
# fit on minority class
# modelsvm1.fit(xtrain1)
# pickle.dump(modelsvm1, open("models/modelsvm1.pkl", "wb"))
modelsvm1 = pickle.load(open("models/modelsvm1.pkl", "rb"))
# detect outliers in the test set
ypredsvm1 = modelsvm1.predict(xtest)
# mark inliers 1, outliers -1
ypredsvm1[ypredsvm1 == -1] = 0
ypredsvm1[ypredsvm1 == 1] = 1
ypredsvm1counter = Counter(ypredsvm1)
print("Distribution of predicted set- ", ypredsvm1counter)
evaluate_pred(ytest, ypredsvm1);

# output-
# Distribution of predicted set-  Counter({0: 56957, 1: 5})
# Precision : 1.0000
# Recall : 0.0515
# MCC : 0.2269
# F1 score : 0.0980

Distribution of predicted set-  Counter({0: 56941, 1: 21})
Precision : 1.0000
Recall : 0.2360
MCC : 0.4855
F1 score : 0.3818


In [5]:
# define outlier detection model
# modelif1 = IsolationForest(contamination=0.0017)
# fit on minority class
# modelif1.fit(xtrain1)
# pickle.dump(modelif1, open("models/modelif1.pkl", "wb"))
modelif1 = pickle.load(open("models/modelif1.pkl", "rb"))
# detect outliers in the test set
ypredif1 = modelif1.predict(xtest)
# mark inliers 1, outliers -1
ypredif1[ypredif1 == -1] = 0
ypredif1[ypredif1 == 1] = 1
ypredif1counter = Counter(ypredif1)
print("Distribution of predicted- ", ypredif1counter)
evaluate_pred(ytest, ypredif1);

# output- 
# Distribution of predicted-  Counter({1: 56961, 0: 1})
# Precision : 0.0017
# Recall : 1.0000
# MCC : 0.0002
# F1 score : 0.0034

Distribution of predicted-  Counter({1: 56960, 0: 2})
Precision : 0.0015
Recall : 0.9888
MCC : -0.0748
F1 score : 0.0031


In [6]:
# define outlier detection model
# modelee1 = EllipticEnvelope(contamination=0.0017)
# # fit on minority class
# modelee1.fit(xtrain1)
# pickle.dump(modelee1, open("models/modelee1.pkl", "wb"))
modelee1 = pickle.load(open("models/modelee1.pkl", "rb"))
# detect outliers in the test set
ypredee1 = modelee1.predict(xtest)
# mark inliers 1, outliers -1
ypredee1[ypredee1 == -1] = 0
ypredee1[ypredee1 == 1] = 1
ypredee1counter = Counter(ypredee1)
print("Distribution of predicted- ", ypredee1counter)
evaluate_pred(ytest, ypredee1);

# output- 
# Distribution of predicted-  Counter({1: 56150, 0: 812})
# Precision : 0.0017
# Recall : 0.9897
# MCC : 0.0014
# F1 score : 0.0034

Distribution of predicted-  Counter({1: 56113, 0: 849})
Precision : 0.0016
Recall : 0.9888
MCC : 0.0012
F1 score : 0.0031


In [7]:
# define outlier detection model
# modellof1 = LocalOutlierFactor(contamination=0.000017)
# fit on minority class
# modellof1.fit(xtrain1)
# pickle.dump(modellof1, open("models/modellof1.pkl", "wb"))
modellof1 = pickle.load(open("models/modellof1.pkl", "rb"))
# detect outliers in the test set
ypredlof1 = modellof1.fit_predict(xtest)
# mark inliers 1, outliers -1
ypredlof1[ypredlof1 == -1] = 0
ypredlof1[ypredlof1 == 1] = 1
ypredlof1counter = Counter(ypredlof1)
print("Distribution of predicted- ", ypredlof1counter)
evaluate_pred(ytest, ypredlof1);

# output- 
# Distribution of predicted-  Counter({1: 56961, 0: 1})
# Precision : 0.0017
# Recall : 1.0000
# MCC : 0.0002
# F1 score : 0.0034

Distribution of predicted-  Counter({1: 56961, 0: 1})
Precision : 0.0016
Recall : 1.0000
MCC : 0.0002
F1 score : 0.0031


In [112]:
# for kernels in ["linear", "poly", "rbf", "sigmoid"]:
newclf = OneClassSVM()
grid = {"kernel" : ["sigmoid", "linear", "poly", "rbf"],
        "gamma" : ["auto", "scale"],
        "nu" : [0.17, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
clfgscv = GridSearchCV(estimator=newclf, param_grid = grid,
                       cv=2, verbose=2, scoring = mcc
                      )
clfgscv.fit(xtrain1, ytrain[ytrain == 1])
print("Best Params- ", clfgscv.best_params_)

# output- Best Params-  {'gamma': 'auto', 'kernel': 'sigmoid', 'nu': 0.17}

# modelname = "models/modelsvm1-best.pkl"
# pickle.dump(clfgscv, open(modelname, "wb"))
# clfgscv = pickle.load(open(modelname, "rb"))
# ypredsvm = clfgscv.predict(xtest)
# # mark inliers 1, outliers -1
# ypredsvm[ypredsvm == -1] = 0
# ypredsvm[ypredsvm == 1] = 1
# ypredsvmcounter = Counter(ypredsvm)
# print("Distribution of predicted set- ", ypredsvmcounter)
# evaluate_pred(ytest, ypredsvm);

Fitting 2 folds for each of 80 candidates, totalling 160 fits
[CV] gamma=auto, kernel=sigmoid, nu=0.17 .............................
[CV] .............. gamma=auto, kernel=sigmoid, nu=0.17, total=   0.0s
[CV] gamma=auto, kernel=sigmoid, nu=0.17 .............................
[CV] .............. gamma=auto, kernel=sigmoid, nu=0.17, total=   0.0s
[CV] gamma=auto, kernel=sigmoid, nu=0.1 ..............................
[CV] ............... gamma=auto, kernel=sigmoid, nu=0.1, total=   0.0s
[CV] gamma=auto, kernel=sigmoid, nu=0.1 ..............................
[CV] ............... gamma=auto, kernel=sigmoid, nu=0.1, total=   0.0s
[CV] gamma=auto, kernel=sigmoid, nu=0.2 ..............................
[CV] ............... gamma=auto, kernel=sigmoid, nu=0.2, total=   0.0s
[CV] gamma=auto, kernel=sigmoid, nu=0.2 ..............................
[CV] ............... gamma=auto, kernel=sigmoid, nu=0.2, total=   0.0s
[CV] gamma=auto, kernel=sigmoid, nu=0.3 ..............................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.s

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

[CV] .................. gamma=auto, kernel=poly, nu=0.4, total=   0.0s
[CV] gamma=auto, kernel=poly, nu=0.5 .................................
[CV] .................. gamma=auto, kernel=poly, nu=0.5, total=   0.0s
[CV] gamma=auto, kernel=poly, nu=0.5 .................................
[CV] .................. gamma=auto, kernel=poly, nu=0.5, total=   0.0s
[CV] gamma=auto, kernel=poly, nu=0.6 .................................
[CV] .................. gamma=auto, kernel=poly, nu=0.6, total=   0.0s
[CV] gamma=auto, kernel=poly, nu=0.6 .................................
[CV] .................. gamma=auto, kernel=poly, nu=0.6, total=   0.0s
[CV] gamma=auto, kernel=poly, nu=0.7 .................................
[CV] .................. gamma=auto, kernel=poly, nu=0.7, total=   0.0s
[CV] gamma=auto, kernel=poly, nu=0.7 .................................
[CV] .................. gamma=auto, kernel=poly, nu=0.7, total=   0.0s
[CV] gamma=auto, kernel=poly, nu=0.8 .................................
[CV] .

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

[CV] ................... gamma=auto, kernel=rbf, nu=0.9, total=   0.0s
[CV] gamma=scale, kernel=sigmoid, nu=0.17 ............................
[CV] ............. gamma=scale, kernel=sigmoid, nu=0.17, total=   0.0s
[CV] gamma=scale, kernel=sigmoid, nu=0.17 ............................
[CV] ............. gamma=scale, kernel=sigmoid, nu=0.17, total=   0.0s
[CV] gamma=scale, kernel=sigmoid, nu=0.1 .............................
[CV] .............. gamma=scale, kernel=sigmoid, nu=0.1, total=   0.0s
[CV] gamma=scale, kernel=sigmoid, nu=0.1 .............................
[CV] .............. gamma=scale, kernel=sigmoid, nu=0.1, total=   0.0s
[CV] gamma=scale, kernel=sigmoid, nu=0.2 .............................
[CV] .............. gamma=scale, kernel=sigmoid, nu=0.2, total=   0.0s
[CV] gamma=scale, kernel=sigmoid, nu=0.2 .............................
[CV] .............. gamma=scale, kernel=sigmoid, nu=0.2, total=   0.0s
[CV] gamma=scale, kernel=sigmoid, nu=0.3 .............................
[CV] .

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

In [109]:
# define outlier detection model
modelsvm1best = OneClassSVM(gamma='auto', kernel= 'sigmoid', nu=0.17)
# fit on minority class
modelsvm1best.fit(xtrain1)
pickle.dump(modelsvm1best, open("models/modelsvm1best.pkl", "wb"))
modelsvm1best = pickle.load(open("models/modelsvm1best.pkl", "rb"))
# detect outliers in the test set
ypredsvm1best = modelsvm1best.predict(xtest)
# mark inliers 1, outliers -1
ypredsvm1best[ypredsvm1best == -1] = 0
ypredsvm1best[ypredsvm1best == 1] = 1
ypredsvm1bestcounter = Counter(ypredsvm1best)
print("Distribution of predicted set- ", ypredsvm1bestcounter)
evaluate_pred(ytest, ypredsvm1best);

# output-
# Distribution of predicted set-  Counter({0: 56962})
# Precision : 0.0000
# Recall : 0.0000
# MCC : 0.0000
# F1 score : 0.0000

Distribution of predicted set-  Counter({0: 56962})
Precision : 0.0000
Recall : 0.0000
MCC : 0.0000
F1 score : 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
# define outlier detection model
modelsvm0 = OneClassSVM(gamma='auto', nu=0.17)
# fit on majority class
# modelsvm0.fit(xtrain0)
# pickle.dump(modelsvm0, open("models/modelsvm0.pkl", "wb"))
modelsvm0 = pickle.load(open("models/modelsvm0.pkl", "rb"))
# detect outliers in the test set
ypredsvm0 = modelsvm0.predict(xtest)
# mark inliers 1, outliers -1
ypredsvm0[ypredsvm0 == -1] = 1
ypredsvm0[ypredsvm0 == 1] = 0
ypredsvm0counter = Counter(ypredsvm0)
print("Distribution of predicted- ", ypredsvm0counter)
evaluate_pred(ytest, ypredsvm0);

# output- 
# Distribution of predicted-  Counter({0: 56962})
# Precision : 0.0000
# Recall : 0.0000
# MCC : 0.0000
# F1 score : 0.0000

In [None]:
# define outlier detection model
modelif0 = IsolationForest(contamination=0.0017)
# fit on majority class
# modelif0.fit(xtrain0)
# pickle.dump(modelif0, open("models/modelif0.pkl", "wb"))
modelif0 = pickle.load(open("models/modelif0.pkl", "rb"))
# detect outliers in the test set
ypredif0 = modelif0.predict(xtest)
# mark inliers 1, outliers -1
ypredif0[ypredif0 == -1] = 1
ypredif0[ypredif0 == 1] = 0
ypredif0counter = Counter(ypredif0)
print("Distribution of predicted- ", ypredif0counter)
evaluate_pred(ytest, ypredif0);

# output- 
# Distribution of predicted-  Counter({0: 56962})
# Precision : 0.0000
# Recall : 0.0000
# MCC : 0.0000
# F1 score : 0.0000

In [None]:
# define outlier detection model
modelee0 = EllipticEnvelope(contamination=0.0017, support_fraction=1)
# fit on majority class
# modelee0.fit(xtrain0)
# pickle.dump(modelee0, open("models/modelee0.pkl", "wb"))
modelee0 = pickle.load(open("models/modelee0.pkl", "rb"))
# detect outliers in the test set
ypredee0 = modelee0.predict(xtest)
# mark inliers 1, outliers -1
ypredee0[ypredee0 == -1] = 1
ypredee0[ypredee0 == 1] = 0
ypredee0counter = Counter(ypredee0)
print("Distribution of predicted- ", ypredee0counter)
evaluate_pred(ytest, ypredee0);

# output- 
# Distribution of predicted-  Counter({0: 56962})
# Precision : 0.0000
# Recall : 0.0000
# MCC : 0.0000
# F1 score : 0.0000

In [None]:
# define outlier detection model
modellof0 = LocalOutlierFactor(contamination=0.17)
# fit on majority class
# modellof0.fit(xtrain0)
# pickle.dump(modellof0, open("models/modellof0.pkl", "wb"))
modellof0 = pickle.load(open("models/modellof0.pkl", "rb"))
# detect outliers in the test set
ypredlof0 = modellof0.fit_predict(xtest)
# mark inliers 1, outliers -1
ypredlof0[ypredlof0 == -1] = 1
ypredlof0[ypredlof0 == 1] = 0
ypredlof0counter = Counter(ypredlof0)
print("Distribution of predicted- ", ypredlof0counter)
evaluate_pred(ytest, ypredlof0);

# output- 
# Distribution of predicted-  Counter({0: 56962})
# Precision : 0.0000
# Recall : 0.0000
# MCC : 0.0000
# F1 score : 0.0000