In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
import joblib
warnings.filterwarnings('ignore')


In [6]:
df = pd.read_csv("creditcard.csv")
print("Dataset shape:", df.shape)
df.head()
'''sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)'''

Dataset shape: (284807, 31)


'sm = SMOTE(random_state=42)\nX_res, y_res = sm.fit_resample(X, y)'

In [7]:
print("Missing values per column:\n", df.isnull().sum())
if df.isnull().sum().sum() == 0:
    print("\n No missing values found.")
else:
    df.fillna(df.median(), inplace=True)
    print("\n Missing values filled with column medians.")

Missing values per column:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

✅ No missing values found.


In [9]:
#Step 4. Scale Numerical Features
# ---------------------------------------------
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])
df['Time'] = scaler.fit_transform(df[['Time']])
print(" 'Amount' and 'Time' scaled successfully.")

 'Amount' and 'Time' scaled successfully.


In [10]:
#Step 5. Define Features (X) and Target (y)
# ---------------------------------------------
X = df.drop('Class', axis=1)
y = df['Class']
print(f"Feature shape: {X.shape}, Target shape: {y.shape}")

Feature shape: (284807, 30), Target shape: (284807,)


In [11]:
#train , test, split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("Class distribution in Train:\n", y_train.value_counts(normalize=True))

Train shape: (227845, 30), Test shape: (56962, 30)
Class distribution in Train:
 Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64


In [14]:
#Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())
print(" Class imbalance handled with SMOTE.")

Before SMOTE: Class
0    227451
1       394
Name: count, dtype: int64
After SMOTE: Class
0    227451
1    227451
Name: count, dtype: int64
✅ Class imbalance handled with SMOTE.


In [15]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_res, y_train_res)
y_pred_lr = lr.predict(X_test)


In [None]:
# Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_res, y_train_res)
y_pred_rf = rf.predict(X_test)

In [None]:
# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_res, y_train_res)
y_pred_xgb = xgb.predict(X_test)

In [None]:
#define evaluation function
def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 Score : {f1:.4f}")
    print(f"ROC-AUC  : {auc:.4f}")
    return {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "ROC-AUC": auc}

In [None]:
#Evaluate All Models
print(" Logistic Regression Performance:")
lr_metrics = evaluate_model(y_test, y_pred_lr)

print("\n Random Forest Performance:")
rf_metrics = evaluate_model(y_test, y_pred_rf)

print("\n XGBoost Performance:")
xgb_metrics = evaluate_model(y_test, y_pred_xgb)

In [None]:
#compare all results
results = pd.DataFrame([lr_metrics, rf_metrics, xgb_metrics],
                       index=['Logistic Regression', 'Random Forest', 'XGBoost'])
print("\n Model Performance Comparison:")
display(results)


In [None]:
plt.figure(figsize=(8,5))
sns.barplot(data=results.reset_index().melt(id_vars='index'),
            x='index', y='value', hue='variable')
plt.title("Model Performance Comparison")
plt.xlabel("Model")
plt.ylabel("Score")
plt.legend(title='Metric')
plt.show()

In [None]:
#confusion matrix
plt.figure(figsize=(6,4))
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap='Blues')
plt.title("Random Forest - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
#feature importance
importance = pd.Series(xgb.feature_importances_, index=X.columns)
top_features = importance.nlargest(10)
plt.figure(figsize=(8,5))
top_features.sort_values().plot(kind='barh', color='teal')
plt.title("Top 10 Important Features (XGBoost)")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()
print("Top Features:\n", top_features)

In [None]:
best_model = xgb  # assuming XGBoost performs best
with open("../models/fraud_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
print(" Best model saved as '../models/fraud_model.pkl'")
