In [32]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 

In [33]:
df_credit = pd.read_csv('../data/credit_card.csv')
df_credit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [34]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_credit['Amount'] = sc.fit_transform(pd.DataFrame(df_credit['Amount']))

In [35]:
df_credit = df_credit.drop(['Time'], axis = 1)

In [36]:
df_credit.duplicated ().any()

True

In [37]:
df_credit = df_credit.drop_duplicates()

In [38]:
df_credit['Class'].value_counts()

Class
0    275190
1       473
Name: count, dtype: int64

In [39]:
#separeting the fraud and non fraud data
legit = df_credit[df_credit.Class == 0]
fraud = df_credit[df_credit.Class == 1]


In [40]:
print(legit.shape)
print(fraud.shape)

(275190, 30)
(473, 30)


In [41]:
legit.Amount.describe()

count    275190.000000
mean          0.008682
std           1.012309
min          -0.353229
25%          -0.327682
50%          -0.258275
75%          -0.033782
max         102.362243
Name: Amount, dtype: float64

In [42]:
fraud.Amount.describe()

count    473.000000
mean       0.142021
std        1.040346
min       -0.353229
25%       -0.349231
50%       -0.313968
75%        0.070128
max        8.146182
Name: Amount, dtype: float64

In [43]:
##compare the values for both transactions
df_credit.groupby('Class').mean()


Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.029792,-0.008288,0.037131,-0.012054,-0.005596,-0.011768,0.017497,-0.007346,-0.00805,0.012492,...,0.002717,0.001781,0.005689,-0.001779,-0.006696,-0.00489,-0.000327,0.001557,0.000771,0.008682
1,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,-5.453274,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,0.142021


In [44]:
##undersampling

legit_sample = legit.sample(n=473)

In [45]:
new_dataset = pd.concat([legit_sample,fraud], axis=0)

In [46]:
new_dataset.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
261634,1.877851,-1.433969,-1.557765,-1.116215,-0.221814,0.363664,-0.541355,-0.049787,-0.466511,0.81122,...,0.261947,0.381914,-0.098079,-0.297348,-0.031125,-0.212221,-0.048377,-0.033814,0.422399,0
13948,-0.56578,0.754977,0.737825,0.030154,-1.383391,0.268107,0.687564,0.228757,0.64861,-0.81386,...,-0.217212,-0.546207,0.358494,-0.016505,-0.793856,0.699716,-0.195727,-0.104007,0.574327,0
203100,1.839848,0.196657,0.130777,3.63939,-0.154716,0.651474,-0.607511,0.366301,-0.44322,1.582024,...,-0.182449,-0.622267,0.412464,-0.532808,-0.624712,-0.369368,-0.001936,-0.042402,-0.333279,0
7458,-1.580555,-0.67571,1.493806,0.25772,1.714358,-0.415685,0.59895,-0.406663,1.434418,-0.120991,...,-0.600531,-0.656237,1.145982,-0.876875,0.08072,0.101874,-0.010018,-0.500766,-0.303333,0
263927,0.024852,0.852175,0.193858,-0.782179,0.675796,-0.604544,0.94187,-0.074261,-0.373719,-0.258925,...,-0.223133,-0.468084,0.009383,-0.326244,-0.456387,0.138631,0.247057,0.085271,-0.342475,0


In [47]:
new_dataset['Class'].value_counts()

Class
0    473
1    473
Name: count, dtype: int64

In [48]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.122746,0.06754,0.022417,-0.148762,-0.026354,0.082532,0.053484,-0.018501,-0.008154,-0.07532,...,0.046082,-0.023337,0.023698,-0.008256,-0.02811,-0.02958,0.002783,0.011286,-0.023259,0.018812
1,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,-5.453274,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,0.142021


In [49]:
##Splitting the data into Features and Targets
X = new_dataset.drop(columns='Class',  axis=1)
Y = new_dataset['Class']


In [50]:
##Split the data into Traning data and Testing Data

X_train, X_test, Y_train,Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [51]:
print(X.shape, X_train.shape, X_test.shape)

(946, 29) (756, 29) (190, 29)


In [52]:
classifier = {
    'LogisticRegression' : LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier' : RandomForestClassifier(),
    'XGB classifier': XGBClassifier()
}

for name , clf in classifier . items():
    print(f'\n============{name}===========')
    clf.fit(X_train,Y_train)
    y_pred = clf.predict(X_test)
    print(f'\n Accuracy: {accuracy_score(Y_test, y_pred)}')
    print(f'\n Precision: {precision_score(Y_test, y_pred)}')
    print(f'\n Recall: {recall_score(Y_test, y_pred)}')
    print(f'\n Accuracy: {f1_score(Y_test, y_pred)}')



 Accuracy: 0.9526315789473684

 Precision: 0.9777777777777777

 Recall: 0.9263157894736842

 Accuracy: 0.9513513513513514


 Accuracy: 0.9052631578947369

 Precision: 0.8969072164948454

 Recall: 0.9157894736842105

 Accuracy: 0.90625


 Accuracy: 0.9473684210526315

 Precision: 0.9885057471264368

 Recall: 0.9052631578947369

 Accuracy: 0.945054945054945


 Accuracy: 0.9421052631578948

 Precision: 0.9666666666666667

 Recall: 0.9157894736842105

 Accuracy: 0.9405405405405406


In [53]:
## OVERSAMPLING

In [54]:
##Splitting the data into Features and Targets
X = df_credit.drop(columns='Class',  axis=1)
Y = df_credit['Class']

In [55]:
from imblearn.over_sampling import SMOTE
X_res, Y_res = SMOTE().fit_resample(X,Y)

In [56]:
Y_res.value_counts()

Class
0    275190
1    275190
Name: count, dtype: int64

In [58]:
##Split the data into Traning data and Testing Data
X_train, X_test, Y_train,Y_test = train_test_split(X_res,Y_res, test_size=0.2, random_state=42)

In [59]:
classifier = {
    'LogisticRegression' : LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier' : RandomForestClassifier(),
    'XGB classifier': XGBClassifier()
}
for name , clf in classifier . items():
    print(f'\n============{name}===========')
    clf.fit(X_train,Y_train)
    y_pred = clf.predict(X_test)
    print(f'\n Accuracy: {accuracy_score(Y_test, y_pred)}')
    print(f'\n Precision: {precision_score(Y_test, y_pred)}')
    print(f'\n Recall: {recall_score(Y_test, y_pred)}')
    print(f'\n Accuracy: {f1_score(Y_test, y_pred)}')



 Accuracy: 0.9450288891311458

 Precision: 0.9729844631676586

 Recall: 0.9154046142937657

 Accuracy: 0.9433166902417776


 Accuracy: 0.9983920200588684

 Precision: 0.997856961243689

 Recall: 0.998927331236478

 Accuracy: 0.9983918593558352


 Accuracy: 0.9999364075729495

 Precision: 0.9998727504090166

 Recall: 1.0

 Accuracy: 0.9999363711561361


 Accuracy: 0.9997365456593627

 Precision: 0.9994730338712022

 Recall: 1.0

 Accuracy: 0.9997364474939792


In [61]:
rfc = RandomForestClassifier()
rfc.fit(X_res , Y_res)

In [62]:
import joblib

In [63]:
joblib.dump(rfc, 'credit_card_model.pkl')

['credit_card_model.pkl']

In [64]:
model = joblib.load('credit_card_model.pkl')