In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

print("Libraries loaded successfully ðŸš€")

Libraries loaded successfully ðŸš€


In [3]:
df = pd.read_csv("/content/Fraud.csv")

print("Dataset Loaded âœ…")
df.head()

Dataset Loaded âœ…


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
print("Shape of dataset:", df.shape)
df.info()

Shape of dataset: (6362620, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [5]:
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].astype('float32')
    if df[col].dtype == 'int64':
        df[col] = df[col].astype('int32')

print("Memory optimized âœ…")
df.info()

Memory optimized âœ…
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int32  
 1   type            object 
 2   amount          float32
 3   nameOrig        object 
 4   oldbalanceOrg   float32
 5   newbalanceOrig  float32
 6   nameDest        object 
 7   oldbalanceDest  float32
 8   newbalanceDest  float32
 9   isFraud         int32  
 10  isFlaggedFraud  int32  
dtypes: float32(5), int32(3), object(3)
memory usage: 339.8+ MB


In [6]:
df['isFraud'].value_counts()
df['isFraud'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
isFraud,Unnamed: 1_level_1
0,0.998709
1,0.001291


In [7]:
df = df.drop(['nameOrig', 'nameDest'], axis=1)

print("Dropped ID columns âœ…")
df.head()

Dropped ID columns âœ…


Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.639648,170136.0,160296.359375,0.0,0.0,0,0
1,1,PAYMENT,1864.280029,21249.0,19384.720703,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,PAYMENT,11668.139648,41554.0,29885.859375,0.0,0.0,0,0


In [8]:
df = pd.get_dummies(df, columns=['type'], drop_first=True)

print("Encoded 'type' column âœ…")
df.head()

Encoded 'type' column âœ…


Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.639648,170136.0,160296.359375,0.0,0.0,0,0,False,False,True,False
1,1,1864.280029,21249.0,19384.720703,0.0,0.0,0,0,False,False,True,False
2,1,181.0,181.0,0.0,0.0,0.0,1,0,False,False,False,True
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,True,False,False,False
4,1,11668.139648,41554.0,29885.859375,0.0,0.0,0,0,False,False,True,False


In [9]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']

print("Features and target separated âœ…")
print("X shape:", X.shape)
print("y shape:", y.shape)

Features and target separated âœ…
X shape: (6362620, 11)
y shape: (6362620,)


In [10]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train/Test split done âœ…")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

Train/Test split done âœ…
X_train: (5090096, 11)
X_test: (1272524, 11)


In [11]:
# Calculate scale_pos_weight
fraud = y_train.sum()
non_fraud = len(y_train) - fraud

scale_weight = non_fraud / fraud
print("scale_pos_weight:", scale_weight)

scale_pos_weight: 773.7482496194825


In [12]:
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

print("XGBoost model trained âœ…")

XGBoost model trained âœ…


In [13]:
# Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC AUC Score
roc_score = roc_auc_score(y_test, y_prob)
print("\nROC-AUC Score:", roc_score)

Confusion Matrix:
[[1263277    7604]
 [      6    1637]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00   1270881
           1       0.18      1.00      0.30      1643

    accuracy                           0.99   1272524
   macro avg       0.59      1.00      0.65   1272524
weighted avg       1.00      0.99      1.00   1272524


ROC-AUC Score: 0.9997368650027726


In [14]:
threshold = 0.90

y_pred_custom = (y_prob > threshold).astype(int)

print("Confusion Matrix at threshold =", threshold)
print(confusion_matrix(y_test, y_pred_custom))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_custom))

Confusion Matrix at threshold = 0.9
[[1269003    1878]
 [     36    1607]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.46      0.98      0.63      1643

    accuracy                           1.00   1272524
   macro avg       0.73      0.99      0.81   1272524
weighted avg       1.00      1.00      1.00   1272524



In [15]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

f1_scores = 2 * (precision * recall) / (precision + recall)

best_index = np.argmax(f1_scores)
best_threshold = thresholds[best_index]

print("Best Threshold:", best_threshold)
print("Best F1 Score:", f1_scores[best_index])

Best Threshold: 0.98621535
Best F1 Score: 0.8759398496240601


In [16]:
best_threshold = 0.98621535

y_pred_best = (y_prob > best_threshold).astype(int)

print("Confusion Matrix at Best Threshold:")
print(confusion_matrix(y_test, y_pred_best))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

Confusion Matrix at Best Threshold:
[[1270730     151]
 [    246    1397]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.90      0.85      0.88      1643

    accuracy                           1.00   1272524
   macro avg       0.95      0.93      0.94   1272524
weighted avg       1.00      1.00      1.00   1272524

