In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

from xgboost import XGBClassifier

In [2]:
data = pd.read_csv("creditcard.csv")

In [3]:
data.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [4]:
print(data.shape)
data.isnull().sum()

(13954, 31)


Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [5]:
num_cols = data.columns.drop('Class')

data[num_cols] = data[num_cols].fillna(data[num_cols].median())
data['Class'] = data['Class'].fillna(data['Class'].mode()[0])

In [6]:
X = data.drop('Class',axis=1)
y = data['Class']

In [7]:
X.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

In [8]:
y

Unnamed: 0,Class
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
13949,0.0
13950,0.0
13951,0.0
13952,0.0


In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/3,random_state=42,stratify=y)

In [10]:
scaler = StandardScaler()

X_train[['Time','Amount']] = scaler.fit_transform(X_train[['Time','Amount']])
X_test[['Time','Amount']] = scaler.transform(X_test[['Time','Amount']])

In [12]:
# Handle class imbalance

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

In [13]:
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)


In [14]:
y_pred = model.predict(X_test)

In [15]:
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))

Accuracy : 0.9984952708512468
Precision: 0.7727272727272727
Recall   : 0.8947368421052632
F1 Score : 0.8292682926829268


In [16]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[4628    5]
 [   2   17]]


In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4633
         1.0       0.77      0.89      0.83        19

    accuracy                           1.00      4652
   macro avg       0.89      0.95      0.91      4652
weighted avg       1.00      1.00      1.00      4652

