In [None]:
# Credit Card Fraud Detection - Modeling with Resampling and AUPRC Evaluation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (precision_recall_curve, auc, f1_score, classification_report,
                             confusion_matrix, average_precision_score)
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Load data
df = pd.read_csv('creditcard.csv')

# Data overview: frame dimensions, then the relative frequency of each 'Class' value
print(f"Shape: {df.shape}")
class_shares = df['Class'].value_counts(normalize=True)
print(class_shares)

# Split features and target: 'Class' is the label, every other column is a feature
y = df['Class']
X = df.drop(columns='Class')

# Train/test split
# Split BEFORE scaling: the scaler's mean/std must be estimated from training rows
# only, otherwise statistics of the held-out rows leak into the training features
# and the test metrics are no longer an honest out-of-sample estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Standardize 'Amount' and 'Time'
# Work on explicit copies so the column assignments below neither touch X (which
# keeps the raw, unscaled values) nor trigger pandas' SettingWithCopyWarning.
SCALED_COLUMNS = ['Time', 'Amount']
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# fit on train only; the test rows are transformed with the train-derived parameters
X_train[SCALED_COLUMNS] = scaler.fit_transform(X_train[SCALED_COLUMNS])
X_test[SCALED_COLUMNS] = scaler.transform(X_test[SCALED_COLUMNS])

# Helper function to evaluate model
def evaluate_model(model, X_test, y_test):
    """Print hold-out metrics for a fitted classifier and show its precision-recall curve.

    Parameters
    ----------
    model
        Fitted estimator; must provide ``predict`` and ``predict_proba``.
    X_test
        Feature rows to score.
    y_test
        True labels aligned with ``X_test``.

    Returns
    -------
    None
        The classification report, confusion matrix, F1 and AUPRC are printed,
        and the precision-recall curve is drawn with matplotlib.
    """
    hard_labels = model.predict(X_test)
    # Column 1 of predict_proba: the probability assigned to the second class
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Threshold-based metric (F1) uses the hard labels; the ranking metric
    # (average precision, reported as AUPRC) uses the continuous scores.
    f1 = f1_score(y_test, hard_labels)
    prc_auc = average_precision_score(y_test, positive_scores)

    report_text = classification_report(y_test, hard_labels)
    print("Classification Report:\n", report_text)
    matrix = confusion_matrix(y_test, hard_labels)
    print("Confusion Matrix:\n", matrix)
    print(f"F1 Score: {f1:.4f}, AUPRC: {prc_auc:.4f}")

    # Precision-recall curve
    pr_precision, pr_recall, _thresholds = precision_recall_curve(y_test, positive_scores)
    curve_label = f'AUPRC = {prc_auc:.4f}'
    plt.plot(pr_recall, pr_precision, label=curve_label)
    plt.title('Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.grid()
    plt.legend()
    plt.show()

# 1. Logistic Regression with Undersampling
# Resample the training split only; the test split keeps its original class balance.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

# fit() returns the estimator itself, so construction and training can be chained
model_lr_rus = LogisticRegression(max_iter=1000).fit(X_rus, y_rus)

print("\n--- Logistic Regression (Random Undersampling) ---")
evaluate_model(model_lr_rus, X_test, y_test)

# 2. Random Forest with SMOTE
# Oversample the training split only; X_smote / y_smote are the resampled training data.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# fit() returns the estimator itself, so construction and training can be chained
model_rf_smote = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_smote, y_smote)

print("\n--- Random Forest (SMOTE Oversampling) ---")
evaluate_model(model_rf_smote, X_test, y_test)

# 3. XGBoost with SMOTE + Grid Search
# Hyperparameters to search, keyed by the XGBClassifier argument name.
grid_xgb = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1]
}

# SMOTE is a pipeline step rather than a one-off preprocessing pass: GridSearchCV
# then re-fits it on the training part of every fold, so no synthetic sample is
# ever interpolated from a row that sits in that fold's validation part.
# Cross-validating on already-resampled data would leak validation information
# into training and inflate the scores used to choose the hyperparameters.
# The imblearn Pipeline applies the sampler during fit only; predict and
# predict_proba on the fitted pipeline score the input rows unchanged.
pipeline_xgb = Pipeline([
    ('smote', SMOTE(random_state=42)),
    # random_state pinned for consistency with the other models in this notebook
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
])

model_xgb = GridSearchCV(pipeline_xgb,
                         # route each grid entry to the 'clf' step of the pipeline
                         {f'clf__{name}': values for name, values in grid_xgb.items()},
                         scoring='average_precision', cv=3, verbose=0)
# Fit on the un-resampled training split; resampling happens inside the pipeline.
model_xgb.fit(X_train, y_train)
print("\n--- XGBoost (SMOTE + GridSearchCV) ---")
# best_estimator_ is the whole pipeline refit on the full training split
evaluate_model(model_xgb.best_estimator_, X_test, y_test)

# 4. LightGBM with SMOTE + Grid Search
# Hyperparameters to search, keyed by the LGBMClassifier argument name.
grid_lgbm = {
    'n_estimators': [50, 100],
    'num_leaves': [31, 64],
    'learning_rate': [0.05, 0.1]
}

# SMOTE is a pipeline step rather than a one-off preprocessing pass: GridSearchCV
# then re-fits it on the training part of every fold, so no synthetic sample is
# ever interpolated from a row that sits in that fold's validation part.
# Cross-validating on already-resampled data would leak validation information
# into training and inflate the scores used to choose the hyperparameters.
# The imblearn Pipeline applies the sampler during fit only; predict and
# predict_proba on the fitted pipeline score the input rows unchanged.
pipeline_lgbm = Pipeline([
    ('smote', SMOTE(random_state=42)),
    # random_state pinned for consistency with the other models in this notebook
    ('clf', LGBMClassifier(random_state=42)),
])

model_lgbm = GridSearchCV(pipeline_lgbm,
                          # route each grid entry to the 'clf' step of the pipeline
                          {f'clf__{name}': values for name, values in grid_lgbm.items()},
                          scoring='average_precision', cv=3, verbose=0)
# Fit on the un-resampled training split; resampling happens inside the pipeline.
model_lgbm.fit(X_train, y_train)
print("\n--- LightGBM (SMOTE + GridSearchCV) ---")
# best_estimator_ is the whole pipeline refit on the full training split
evaluate_model(model_lgbm.best_estimator_, X_test, y_test)

# Save AUPRCs for comparison
# Display label -> fitted estimator, in the order the rows should appear.
fitted_models = {
    'Logistic Regression (Undersampling)': model_lr_rus,
    'Random Forest (SMOTE)': model_rf_smote,
    'XGBoost (SMOTE + GridSearchCV)': model_xgb.best_estimator_,
    'LightGBM (SMOTE + GridSearchCV)': model_lgbm.best_estimator_,
}

# One record per model; F1 comes from hard predictions, AUPRC from the
# column-1 probabilities, both computed on the held-out test split.
results = pd.DataFrame([
    {
        'Model': label,
        'F1 Score': f1_score(y_test, fitted.predict(X_test)),
        'AUPRC': average_precision_score(y_test, fitted.predict_proba(X_test)[:, 1]),
    }
    for label, fitted in fitted_models.items()
])

print("\nModel Comparison:")
print(results)

# Optional: Save results
# results.to_csv("model_performance_comparison.csv", index=False)

# End of notebook
