In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score, precision_recall_curve

In [2]:
# Read the credit-card transactions CSV; the path is relative to the working directory
data = pd.read_csv(filepath_or_buffer="data/creditcard.csv")

In [None]:
# Display basic information about the dataset
print("Dataset Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

# Summary statistics of the numeric columns
print("\nDescriptive Statistics:")
print(data.describe())

# Count of missing values per column
print("\nMissing Values in Each Column:")
print(data.isnull().sum())

In [None]:
# Data Exploration

# Global look-and-feel: whitegrid theme with larger, bold axis labels and titles
sns.set(style="whitegrid")
plt.rcParams.update({
    "figure.figsize": (15, 10),
    "axes.labelsize": 14,
    "axes.labelweight": "bold",
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "axes.titlesize": 16,
    "axes.titleweight": "bold",
})

# 2 x 3 grid of axes, flattened for positional access; only five panels are drawn
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.flatten()

# Panel 1: histogram (with KDE) of 'Amount'
sns.histplot(data['Amount'], bins=50, kde=True, color='skyblue', ax=axes[0])
axes[0].set(
    title="Distribution of Transaction Amount",
    xlabel="Amount",
    ylabel="Frequency",
)

# Panel 2: histogram (with KDE) of 'Time'
sns.histplot(data['Time'], bins=50, kde=True, color='salmon', ax=axes[1])
axes[1].set(
    title="Distribution of Transaction Time",
    xlabel="Time (seconds from first transaction)",
    ylabel="Frequency",
)

# Panel 3: number of rows per class (legitimate vs. fraudulent)
sns.countplot(x='Class', data=data, palette='viridis', ax=axes[2])
axes[2].set(
    title="Transaction Class Distribution",
    xlabel="Class (0: Legitimate, 1: Fraud)",
    ylabel="Count",
)

# Panel 4: boxplot of 'Amount'
sns.boxplot(x=data['Amount'], color='lightgreen', ax=axes[3])
axes[3].set(title="Boxplot of Transaction Amount", xlabel="Amount")

# Panel 5: boxplot of 'Time'
sns.boxplot(x=data['Time'], color='lightcoral', ax=axes[4])
axes[4].set(title="Boxplot of Transaction Time", xlabel="Time (seconds)")

# Drop any axes beyond the five panels above (the sixth slot of the 2 x 3 grid)
for unused_ax in axes[5:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Data Cleaning and Feature Engineering

# Scale the 'Amount' and 'Time' features using RobustScaler (since PCA features are already scaled).
# The scaler is fit once on both columns: RobustScaler centers/scales each column
# independently, so the values match two separate fits, but the fitted object now
# keeps the statistics for BOTH columns instead of only the last one fitted.
rob_scaler = RobustScaler()
scaled_values = rob_scaler.fit_transform(data[['Amount', 'Time']])
data['Amount_scaled'] = scaled_values[:, 0]
data['Time_scaled'] = scaled_values[:, 1]
# NOTE(review): this fit uses every row of `data`, including rows that are later
# assigned to the test split, so the scaling statistics here are not train-only.

# Display the first few rows to verify new features
print("\nFirst few rows with new features:")
print(data.head())

In [None]:
# Modeling

# Prepare data
# The raw 'Amount' and 'Time' columns are excluded from the feature set: their scaled
# counterparts already exist in `data`, and keeping both would feed duplicated,
# unscaled columns to the scale-sensitive models (logistic regression, SVM).
raw_cols = ['Amount', 'Time']
scaled_cols = ['Amount_scaled', 'Time_scaled']
features = [col for col in data.columns if col not in ['Class', *raw_cols]]
X = data[features]
y = data['Class']

# Train-Test Split with stratification to preserve class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Re-derive the scaled columns from a scaler fit on the TRAINING rows only, then apply
# that same transform to the test rows. The scaled columns stored in `data` were fit on
# all rows, which would let test-set statistics leak into the training features.
# Copies are taken so the assignments below do not write into views of `data`.
X_train = X_train.copy()
X_test = X_test.copy()
split_scaler = RobustScaler().fit(data.loc[X_train.index, raw_cols])
X_train[scaled_cols] = split_scaler.transform(data.loc[X_train.index, raw_cols])
X_test[scaled_cols] = split_scaler.transform(data.loc[X_test.index, raw_cols])

In [None]:
# Logistic Regression

# Baseline model; 'balanced' class weights compensate for the class imbalance.
# Construction and training are chained (fit returns the estimator itself).
log_reg = LogisticRegression(
    max_iter=10000,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (fraud)
y_scores_log = log_reg.predict_proba(X_test)[:, 1]

# Hard 0/1 labels at a 0.5 probability threshold
y_pred_log = (y_scores_log >= 0.5).astype(int)

# Area under the precision-recall curve, computed from the scores (not the labels)
log_ap_score = average_precision_score(y_test, y_scores_log)
print("Logistic Regression AUPRC:", log_ap_score)

# Confusion matrix and per-class report; sep="\n" puts each heading on its own line
print("Confusion Matrix (Logistic Regression):", confusion_matrix(y_test, y_pred_log), sep="\n")
print("Classification Report (Logistic Regression):", classification_report(y_test, y_pred_log), sep="\n")

In [None]:
# SVM

# Baseline RBF-kernel SVM with 'balanced' class weights for the class imbalance.
# probability=True is needed so that predict_proba is available below.
# Construction and training are chained (fit returns the estimator itself).
svm_clf = SVC(
    class_weight='balanced',
    kernel='rbf',
    random_state=42,
    probability=True,
).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (fraud)
y_scores_svm = svm_clf.predict_proba(X_test)[:, 1]

# Hard 0/1 labels at a 0.5 probability threshold
y_pred_svm = (y_scores_svm >= 0.5).astype(int)

# Area under the precision-recall curve, computed from the scores (not the labels)
svm_ap_score = average_precision_score(y_test, y_scores_svm)
print("SVM AUPRC:", svm_ap_score)

# Confusion matrix and per-class report; sep="\n" puts each heading on its own line
print("Confusion Matrix (SVM):", confusion_matrix(y_test, y_pred_svm), sep="\n")
print("Classification Report (SVM):", classification_report(y_test, y_pred_svm), sep="\n")

In [None]:
# Random Forest

# Baseline forest of 100 trees with 'balanced' class weights for the class imbalance.
# Construction and training are chained (fit returns the estimator itself).
rf_clf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (fraud)
y_scores_rf = rf_clf.predict_proba(X_test)[:, 1]

# Hard 0/1 labels at a 0.5 probability threshold
y_pred_rf = (y_scores_rf >= 0.5).astype(int)

# Area under the precision-recall curve, computed from the scores (not the labels)
rf_ap_score = average_precision_score(y_test, y_scores_rf)
print("Random Forest AUPRC:", rf_ap_score)

# Confusion matrix and per-class report; sep="\n" puts each heading on its own line
print("Confusion Matrix (Random Forest):", confusion_matrix(y_test, y_pred_rf), sep="\n")
print("Classification Report (Random Forest):", classification_report(y_test, y_pred_rf), sep="\n")

In [None]:
# Tuning Models

# Stratified, shuffled, seeded 5-fold splitter shared by the grid searches in this notebook
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Pipeline for Logistic Regression
# random_state is pinned like every other estimator in this notebook: the liblinear
# solver shuffles the data internally, so leaving it unset makes the search results
# vary from run to run.
log_pipe = Pipeline([
    ('clf', LogisticRegression(
        class_weight='balanced',
        solver='liblinear',
        max_iter=1000,
        random_state=42
    ))
])

# Parameter grid ('clf__' prefix routes each parameter to the pipeline's 'clf' step)
param_grid_log = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2']  # 'l1' requires solver='liblinear' or 'saga'
}

# GridSearchCV
log_grid = GridSearchCV(
    estimator=log_pipe,
    param_grid=param_grid_log,
    scoring='average_precision',  # optimizes AUPRC
    cv=cv,
    n_jobs=-1,  # use all available CPU cores
    verbose=1
)

# Fit on training data only; the test split stays untouched for the final evaluation
log_grid.fit(X_train, y_train)

# Best estimator and its performance
print("Best Params for Logistic Regression:", log_grid.best_params_)
print("Best Average Precision (CV) for Logistic Regression:", log_grid.best_score_)

In [None]:
# Pipeline for SVM
svm_pipe = Pipeline([
    ('clf', SVC(class_weight='balanced', probability=True, random_state=42))
])

# Parameter grid, split per kernel.
# 'gamma' only affects the RBF kernel; crossing it with kernel='linear' would fit every
# linear configuration once per gamma value with identical results. Using a list of
# grids searches 6 RBF + 3 linear candidates instead of 12.
param_grid_svm = [
    {
        'clf__kernel': ['rbf'],
        'clf__C': [0.1, 1, 10],
        'clf__gamma': [0.01, 0.001]
    },
    {
        'clf__kernel': ['linear'],
        'clf__C': [0.1, 1, 10]
    }
]

# GridSearchCV (scored on average precision, using the shared `cv` splitter)
svm_grid = GridSearchCV(
    estimator=svm_pipe,
    param_grid=param_grid_svm,
    scoring='average_precision',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

# Fit on training data
svm_grid.fit(X_train, y_train)

# Best estimator and its performance
print("Best Params for SVM:", svm_grid.best_params_)
print("Best Average Precision (CV) for SVM:", svm_grid.best_score_)

In [None]:
# Random Forest grid search (plain estimator; no Pipeline wrapper is used here)
rf = RandomForestClassifier(random_state=42)

# Candidate hyperparameters: 3 * 4 * 3 * 3 * 2 = 216 combinations
param_grid_rf = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced', 'balanced_subsample'],
)

# Exhaustive search scored on average precision, using the shared `cv` splitter
# and all available CPU cores
rf_grid = GridSearchCV(
    rf,
    param_grid_rf,
    cv=cv,
    scoring='average_precision',
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only
rf_grid.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print("Best Params for Random Forest:", rf_grid.best_params_)
print("Best Average Precision (CV) for Random Forest:", rf_grid.best_score_)

In [None]:
# Best models (each refit on the full training split by its grid search)
best_log_model = log_grid.best_estimator_
best_svm_model = svm_grid.best_estimator_
best_rf_model = rf_grid.best_estimator_

# Evaluate each on the test set
models = {
    "Logistic Regression": best_log_model,
    "SVM": best_svm_model,
    "Random Forest": best_rf_model
}

for model_name, model in models.items():
    # Pick the score source AND the matching decision threshold together:
    # probabilities are cut at 0.5, whereas decision_function values are signed
    # margins whose class boundary is 0 (cutting them at 0.5 would mislabel samples).
    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
        decision_threshold = 0.5
    else:
        # Some models (like SVC with probability=False) only have decision_function
        y_scores = model.decision_function(X_test)
        decision_threshold = 0.0

    # Average Precision on test set (threshold-free, computed from the raw scores)
    ap_test = average_precision_score(y_test, y_scores)

    # Convert scores to binary predictions at the threshold chosen above
    y_pred = (y_scores >= decision_threshold).astype(int)

    print(f"\n=== {model_name} ===")
    print(f"Test AUPRC: {ap_test:.4f}")
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
# Fit Model Recommendation
# Hyperparameters of the recommended Random Forest. They are written out literally
# here rather than read from rf_grid.best_params_.
best_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
)

# Rebinds rf_clf (previously the baseline forest) to the recommended model
rf_clf = RandomForestClassifier(**best_params)
rf_clf.fit(X_train, y_train)

In [None]:
# Feature Importances
# Although PCA-transformed features limit direct interpretability, we can still see which components were most influential.
importances = rf_clf.feature_importances_
feature_names = X.columns

# One row per feature, most important first
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("\nTop 10 Important Features:")
print(importance_df.head(10))

# Plot Feature Importances (Top 10 for readability)
plt.figure(figsize=(8, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), color='skyblue')
plt.title("Random Forest Feature Importances (Top 10)")
plt.tight_layout()
plt.show()

# Precision-Recall Curve
# Score the test set with rf_clf itself, so the curve describes the same model whose
# importances are shown above instead of whatever `y_scores` an earlier cell left
# behind, and compute the AUPRC used in the legend (it was previously undefined).
y_scores = rf_clf.predict_proba(X_test)[:, 1]
test_auprc = average_precision_score(y_test, y_scores)

precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)
plt.figure(figsize=(7, 5))
plt.plot(recalls, precisions, label=f'Random Forest (AUPRC = {test_auprc:.4f})', color='blue')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()