In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression


df = pd.read_csv('cleaned_data_final.csv')
df

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,nameOrigDest
0,1,0,9839.64,170136.00,160296.36,0.00,0.00,0,0
1,1,0,1864.28,21249.00,19384.72,0.00,0.00,0,0
2,1,1,181.00,181.00,0.00,0.00,0.00,1,1
3,1,2,181.00,181.00,0.00,21182.00,0.00,1,1
4,1,0,11668.14,41554.00,29885.86,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...
6362599,743,2,339682.13,339682.13,0.00,0.00,339682.13,1,1
6362600,743,1,6311409.28,6311409.28,0.00,0.00,0.00,1,1
6362601,743,2,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,1
6362602,743,1,850002.52,850002.52,0.00,0.00,0.00,1,1


In [2]:
#Train Test Split

from sklearn.model_selection import train_test_split

X1 = df[['step', 'amount', 'oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest', 'type', 'nameOrigDest']]
y1 = df['isFraud']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33, random_state=42)

## Original dataset

In [3]:
#Original
logmodel = LogisticRegression()
logmodel.fit(X1_train,y1_train)

pred = logmodel.predict(X1_test)

from sklearn.metrics import classification_report,confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_auc_score

print("Accuracy:",metrics.accuracy_score(y1_test,pred))
print(confusion_matrix(y1_test,pred))
print(classification_report(y1_test,pred))

ROC_AUC = roc_auc_score(y1_test, pred)
print('ROC AUC : {:.4f}'.format(ROC_AUC))

Accuracy: 0.9983025823228523
[[2094910    2079]
 [   1485    1186]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096989
           1       0.36      0.44      0.40      2671

    accuracy                           1.00   2099660
   macro avg       0.68      0.72      0.70   2099660
weighted avg       1.00      1.00      1.00   2099660

ROC AUC : 0.7215


## Oversampling

In [4]:
#OverSampling
rus=RandomOverSampler(random_state=42)

x_rus, y_rus=rus.fit_resample(X1_train,y1_train)

logmodel = LogisticRegression()
logmodel.fit(x_rus,y_rus)

pred = logmodel.predict(X1_test)

print("Accuracy:",metrics.accuracy_score(y1_test,pred))
print(confusion_matrix(y1_test,pred))
print(classification_report(y1_test,pred))

ROC_AUC = roc_auc_score(y1_test, pred)
print('ROC AUC : {:.4f}'.format(ROC_AUC))

Accuracy: 0.9147766781288399
[[1918339  178650]
 [    290    2381]]
              precision    recall  f1-score   support

           0       1.00      0.91      0.96   2096989
           1       0.01      0.89      0.03      2671

    accuracy                           0.91   2099660
   macro avg       0.51      0.90      0.49   2099660
weighted avg       1.00      0.91      0.95   2099660

ROC AUC : 0.9031


## Undersampling

In [5]:
#UnderSampling
rus=RandomUnderSampler(random_state=42)

x_rus, y_rus=rus.fit_resample(X1_train,y1_train)

logmodel = LogisticRegression()
logmodel.fit(x_rus,y_rus)

pred = logmodel.predict(X1_test)

print("Accuracy:",metrics.accuracy_score(y1_test,pred))
print(confusion_matrix(y1_test,pred))
print(classification_report(y1_test,pred))

ROC_AUC = roc_auc_score(y1_test, pred)
print('ROC AUC : {:.4f}'.format(ROC_AUC))

Accuracy: 0.913977977386815
[[1916661  180328]
 [    289    2382]]
              precision    recall  f1-score   support

           0       1.00      0.91      0.96   2096989
           1       0.01      0.89      0.03      2671

    accuracy                           0.91   2099660
   macro avg       0.51      0.90      0.49   2099660
weighted avg       1.00      0.91      0.95   2099660

ROC AUC : 0.9029
