Build a machine learning model to identify fraudulent credit card
transactions.

Preprocess and normalize the transaction data, handle class
imbalance issues, and split the dataset into training and testing sets.

Train a classification algorithm, such as logistic regression or random
forests, to classify transactions as fraudulent or genuine.

Evaluate the model's performance using metrics like precision, recall,
and F1-score, and consider techniques like oversampling or
undersampling for improving results.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [51]:
df = pd.read_csv('creditcard.csv')

In [52]:
df.tail(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284797,172782.0,-0.241923,0.712247,0.399806,-0.463406,0.244531,-1.343668,0.929369,-0.20621,0.106234,...,-0.228876,-0.514376,0.279598,0.371441,-0.559238,0.113144,0.131507,0.081265,5.49,0
284798,172782.0,0.219529,0.881246,-0.635891,0.960928,-0.152971,-1.014307,0.427126,0.12134,-0.28567,...,0.099936,0.33712,0.251791,0.057688,-1.508368,0.144023,0.181205,0.215243,24.05,0
284799,172783.0,-1.775135,-0.004235,1.189786,0.331096,1.196063,5.51998,-1.518185,2.080825,1.159498,...,0.103302,0.65485,-0.348929,0.745323,0.704545,-0.127579,0.454379,0.130308,79.99,0
284800,172784.0,2.03956,-0.175233,-1.196825,0.23458,-0.008713,-0.726571,0.01705,-0.118228,0.435402,...,-0.268048,-0.717211,0.29793,-0.359769,-0.31561,0.201114,-0.080826,-0.075071,2.68,0
284801,172785.0,0.120316,0.931005,-0.546012,-0.745097,1.130314,-0.235973,0.812722,0.115093,-0.204064,...,-0.314205,-0.80852,0.050343,0.1028,-0.43587,0.124079,0.21794,0.068803,2.69,0
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [53]:
df.shape

(284807, 31)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [55]:
df.isna().sum().sum()

0

In [59]:
df.loc[df['Class'] == 1].shape[0]

492

In [60]:
fraud_count = df['Class'].value_counts().sort_index()
fig = px.bar(x = fraud_count.values, y = fraud_count.index, title = 'Balance of Fraud', orientation='h')
fig.update_layout(height = 300)
fig.show()

# Stratify Splitting for the data

In [61]:
x = df.drop('Class', axis=1)
y = df['Class']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 42)

print('Original Ratio = ', y.value_counts(normalize=True))
print('Train Ratio = ', y_train.value_counts(normalize=True))
print('Test Ratio = ', y_test.value_counts(normalize=True))

Original Ratio =  Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64
Train Ratio =  Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64
Test Ratio =  Class
0    0.99828
1    0.00172
Name: proportion, dtype: float64


# Resampling

In [62]:
smote = SMOTE(sampling_strategy = 0.5, random_state=42)
x_upsampled, y_upsampled = smote.fit_resample(x_train, y_train)

nearmiss = NearMiss(sampling_strategy='auto')
x_resampled, y_resampled = nearmiss.fit_resample(x_upsampled, y_upsampled)

print('Original Distribution: ', y_train.value_counts())
print('Resampled Distribution', y_resampled.value_counts())


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



Original Distribution:  Class
0    227451
1       394
Name: count, dtype: int64
Resampled Distribution Class
0    113725
1    113725
Name: count, dtype: int64


In [63]:
fraud_count = y_resampled.value_counts().sort_index()
fig = px.bar(x = fraud_count.values, y = fraud_count.index, title = 'Balance of Fraud', orientation='h')
fig.update_layout(height = 300)
fig.show()

# Scaling

In [64]:
x_resampled.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,...,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0
mean,92207.078999,-2.374655,2.005514,-3.652728,2.327745,-1.547068,-0.753205,-2.921005,0.363204,-1.347795,...,0.15019,0.321123,0.024979,-0.019451,-0.050445,0.00654,0.024019,0.07873,0.037806,71.22865
std,47252.730813,5.458547,3.455735,6.191598,3.16115,4.132506,1.636388,5.653522,4.609055,2.190267,...,0.947286,2.517321,1.078611,0.991756,0.537143,0.626687,0.435436,0.918736,0.393738,147.86424
min,4462.0,-32.96281,-18.992015,-32.965346,-5.560118,-23.505167,-10.908778,-43.557242,-41.044261,-13.434066,...,-12.425404,-22.797604,-8.887017,-19.254328,-2.822684,-4.781606,-1.697018,-9.895244,-8.478686,0.0
25%,50746.587249,-2.864274,0.012518,-5.225366,-0.055823,-1.723633,-1.538563,-3.210508,-0.181997,-2.314117,...,-0.203664,-0.168003,-0.484979,-0.199721,-0.380214,-0.314716,-0.263125,-0.061764,-0.058364,2.84
50%,85266.670378,-0.74637,1.06237,-1.523102,1.45254,-0.355694,-0.708473,-0.736132,0.184907,-0.808305,...,0.005966,0.147003,0.034894,-0.016462,-0.004187,0.028201,-0.003823,0.060181,0.033212,19.75
75%,138989.977654,0.95075,2.836866,0.24624,4.359749,0.50384,-0.023479,0.171879,0.875784,0.044178,...,0.371213,0.620656,0.539158,0.186939,0.348324,0.35619,0.281298,0.425433,0.210262,73.58954
max,170380.0,2.451888,22.057729,4.187811,12.114672,16.238056,8.933762,6.755575,20.007208,9.234623,...,11.059004,27.202839,8.361985,15.093942,4.014444,4.747061,3.119295,5.417347,15.942151,2125.87


In [65]:
x_resampled.drop(columns='Time', inplace=True)
x_test.drop(columns='Time', inplace=True)

In [66]:
scaler = MinMaxScaler()
x_resampled['Amount'] = scaler.fit_transform(x_resampled[['Amount']])
x_test['Amount'] = scaler.fit_transform(x_test[['Amount']])

In [67]:
x_resampled.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,...,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0,227450.0
mean,-2.374655,2.005514,-3.652728,2.327745,-1.547068,-0.753205,-2.921005,0.363204,-1.347795,-2.875675,...,0.15019,0.321123,0.024979,-0.019451,-0.050445,0.00654,0.024019,0.07873,0.037806,0.033506
std,5.458547,3.455735,6.191598,3.16115,4.132506,1.636388,5.653522,4.609055,2.190267,4.432271,...,0.947286,2.517321,1.078611,0.991756,0.537143,0.626687,0.435436,0.918736,0.393738,0.069555
min,-32.96281,-18.992015,-32.965346,-5.560118,-23.505167,-10.908778,-43.557242,-41.044261,-13.434066,-24.588262,...,-12.425404,-22.797604,-8.887017,-19.254328,-2.822684,-4.781606,-1.697018,-9.895244,-8.478686,0.0
25%,-2.864274,0.012518,-5.225366,-0.055823,-1.723633,-1.538563,-3.210508,-0.181997,-2.314117,-4.574874,...,-0.203664,-0.168003,-0.484979,-0.199721,-0.380214,-0.314716,-0.263125,-0.061764,-0.058364,0.001336
50%,-0.74637,1.06237,-1.523102,1.45254,-0.355694,-0.708473,-0.736132,0.184907,-0.808305,-0.988934,...,0.005966,0.147003,0.034894,-0.016462,-0.004187,0.028201,-0.003823,0.060181,0.033212,0.00929
75%,0.95075,2.836866,0.24624,4.359749,0.50384,-0.023479,0.171879,0.875784,0.044178,-0.02629,...,0.371213,0.620656,0.539158,0.186939,0.348324,0.35619,0.281298,0.425433,0.210262,0.034616
max,2.451888,22.057729,4.187811,12.114672,16.238056,8.933762,6.755575,20.007208,9.234623,15.331742,...,11.059004,27.202839,8.361985,15.093942,4.014444,4.747061,3.119295,5.417347,15.942151,1.0


In [68]:
x_test.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,...,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0,56962.0
mean,-0.003181,0.001924,0.001532,7.9e-05,-0.000698,0.004685,-0.000307,0.001736,-0.002807,0.001558,...,0.003996,-0.001479,-0.001211,-0.000202,0.001109,0.002622,-0.000504,0.000282,-0.000615,0.006897
std,1.957791,1.660157,1.520112,1.415887,1.382965,1.336478,1.231741,1.205777,1.103746,1.099188,...,0.771395,0.732772,0.725571,0.61249,0.606951,0.521444,0.483381,0.398388,0.342373,0.019185
min,-37.558067,-42.172688,-32.454198,-5.263068,-40.427726,-19.996349,-31.197329,-50.688419,-9.481456,-22.187089,...,-28.009635,-22.75754,-8.887017,-26.751119,-2.822384,-7.495741,-1.855355,-9.845808,-8.412267,0.0
25%,-0.922012,-0.597742,-0.885412,-0.855101,-0.689776,-0.767032,-0.556383,-0.211144,-0.65446,-0.534021,...,-0.212152,-0.229432,-0.543363,-0.161485,-0.353306,-0.318244,-0.329162,-0.071348,-0.053144,0.000426
50%,0.014539,0.069051,0.179703,-0.013487,-0.052099,-0.274488,0.039029,0.018431,-0.053372,-0.091538,...,-0.062045,-0.030054,0.007985,-0.011928,0.041618,0.018827,-0.053818,0.000895,0.011321,0.001704
75%,1.315764,0.802856,1.029285,0.747315,0.615923,0.40349,0.570031,0.32559,0.593091,0.451679,...,0.134361,0.186474,0.529293,0.147841,0.442355,0.354786,0.241305,0.090562,0.078955,0.005928
max,2.45493,17.93055,4.226108,12.132323,28.762671,23.917837,44.054461,18.282168,9.125535,13.811758,...,26.237391,27.202839,8.272233,19.002942,4.022866,7.519589,3.1162,10.507884,22.620072,1.0


# Random Forest

In [None]:
model1 = RandomForestClassifier(n_estimators=50)
model1.fit(x_resampled, y_resampled)
y_pred = model1.predict(x_test)

print('confusion_matrix: ', confusion_matrix(y_test, y_pred))
print('classification_report: ', classification_report(y_test, y_pred))


confusion_matrix:  [[56830    34]
 [   13    85]]
classification_report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.71      0.87      0.78        98

    accuracy                           1.00     56962
   macro avg       0.86      0.93      0.89     56962
weighted avg       1.00      1.00      1.00     56962

f1_score:  0.783410138248848


# SVC

In [71]:
model2 = SVC()
model2.fit(x_resampled, y_resampled)
y_pred = model2.predict(x_test)

print('confusion_matrix: ', confusion_matrix(y_test, y_pred))
print('classification_report: ', classification_report(y_test, y_pred))

confusion_matrix:  [[56176   688]
 [    9    89]]
classification_report:                precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.11      0.91      0.20        98

    accuracy                           0.99     56962
   macro avg       0.56      0.95      0.60     56962
weighted avg       1.00      0.99      0.99     56962



# XGBoost

In [73]:
model3 = XGBClassifier()
model3.fit(x_resampled, y_resampled)
y_pred = model3.predict(x_test)

print('confusion_matrix: ', confusion_matrix(y_test, y_pred))
print('classification_report: ', classification_report(y_test, y_pred))

confusion_matrix:  [[56759   105]
 [   12    86]]
classification_report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.45      0.88      0.60        98

    accuracy                           1.00     56962
   macro avg       0.73      0.94      0.80     56962
weighted avg       1.00      1.00      1.00     56962



# KNeighbors

In [75]:
model4 = KNeighborsClassifier()
model4.fit(x_resampled, y_resampled)
y_pred = model4.predict(x_test)

print('confusion_matrix: ', confusion_matrix(y_test, y_pred))
print('classification_report: ', classification_report(y_test, y_pred))

confusion_matrix:  [[56564   300]
 [   10    88]]
classification_report:                precision    recall  f1-score   support

           0       1.00      0.99      1.00     56864
           1       0.23      0.90      0.36        98

    accuracy                           0.99     56962
   macro avg       0.61      0.95      0.68     56962
weighted avg       1.00      0.99      1.00     56962

