# 7. Final Model Comparison: ROC Curves and AUC

This notebook performs the definitive comparison of our three candidate models:
1.  **Logistic Regression** (Baseline)
2.  **XGBoost** (Intermediate)
3.  **CatBoost** (Champion)

To prove which model is best, we will not just look at accuracy. We will plot the **ROC (Receiver Operating Characteristic) Curve**.

The ROC curve shows the trade-off between the **True Positive Rate** (recall: the share of bad loans we catch) and the **False Positive Rate** (the share of good loans we wrongly flag) at every possible probability threshold.
* **A perfect model** would produce a curve that hugs the top-left corner (every bad loan caught with no good loans flagged).
* **The AUC (Area Under the Curve)** gives us a single score to rank them. (1.0 = Perfect, 0.5 = Random Guessing).

In [None]:
# Install CatBoost
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, classification_report
import os

# Directory that receives the saved ROC figure and the leaderboard CSV.
# exist_ok=True makes this cell safe to re-run and removes the separate
# "check, then create" step (which is racy if the directory appears in between).
OUTPUT_DIR = "final_model_comparison"
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Step 1: Train & Evaluate Logistic Regression
We load the **Scaled** dataset (`X_train.csv`) for this model.

In [None]:
print("--- 1. Evaluating Logistic Regression ---")
try:
    # Load the scaled train/test splits (features and labels live in separate CSVs)
    X_train_lr = pd.read_csv('X_train.csv')
    y_train_lr = pd.read_csv('y_train.csv')
    X_test_lr = pd.read_csv('X_test.csv')
    y_test_lr = pd.read_csv('y_test.csv')

    # Drop non-numeric columns so the estimator only sees numeric features.
    # The column list is derived from the training frame and applied to both splits
    # so train and test keep the same feature set.
    non_numeric_cols = X_train_lr.select_dtypes(exclude=[np.number]).columns

    if not non_numeric_cols.empty:
        print(f"   Detected non-numeric columns: {list(non_numeric_cols)}")
        print("   Dropping them to prevent errors...")
        X_train_lr = X_train_lr.drop(columns=non_numeric_cols)
        X_test_lr = X_test_lr.drop(columns=non_numeric_cols)

    # Train (labels are flattened from a one-column frame to a 1-D array)
    model_lr = LogisticRegression(max_iter=1000, random_state=42)
    model_lr.fit(X_train_lr, y_train_lr.values.ravel())

    # Probability of the positive class (column 1 of predict_proba), i.e. default
    probs_lr = model_lr.predict_proba(X_test_lr)[:, 1]

    # ROC curve points and the area under them
    fpr_lr, tpr_lr, _ = roc_curve(y_test_lr, probs_lr)
    auc_lr = auc(fpr_lr, tpr_lr)
    print(f"Logistic Regression AUC: {auc_lr:.4f}")

except FileNotFoundError as err:
    # Name the file that was actually missing: any of the four CSVs can trigger
    # this branch, not only 'X_train.csv'.
    # NOTE: the variables above stay undefined after this branch, so the later
    # plotting / leaderboard cells will fail until the data is uploaded.
    print(f"Error: Could not find '{err.filename}'. Please upload the Scaled dataset.")

## Step 2: Train & Evaluate XGBoost
We load the **Tree-Optimized** dataset for this model.

In [None]:
print("\n--- 2. Evaluating XGBoost ---")
try:
    # Load the tree-optimized train/test splits
    X_train_tree = pd.read_csv('X_train_tree.csv')
    y_train_tree = pd.read_csv('y_train_tree.csv')
    X_test_tree = pd.read_csv('X_test_tree.csv')
    y_test_tree = pd.read_csv('y_test_tree.csv')

    # Train
    # NOTE(review): `use_label_encoder` is reported as deprecated in recent XGBoost
    # releases and may only produce a warning there — confirm against the installed version.
    model_xgb = xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, eval_metric='logloss', use_label_encoder=False)
    model_xgb.fit(X_train_tree, y_train_tree.values.ravel())

    # Probability of the positive class (column 1 of predict_proba)
    probs_xgb = model_xgb.predict_proba(X_test_tree)[:, 1]

    # ROC curve points and the area under them
    fpr_xgb, tpr_xgb, _ = roc_curve(y_test_tree, probs_xgb)
    auc_xgb = auc(fpr_xgb, tpr_xgb)
    print(f"XGBoost AUC: {auc_xgb:.4f}")

except FileNotFoundError as err:
    # Name the file that was actually missing: any of the four CSVs can trigger
    # this branch, not only 'X_train_tree.csv'.
    # NOTE: the variables above stay undefined after this branch, so the later
    # plotting / leaderboard cells will fail until the data is uploaded.
    print(f"Error: Could not find '{err.filename}'. Please upload the Tree dataset.")

## Step 3: Train & Evaluate CatBoost
We load the **Full Feature** dataset for this model.

In [None]:
print("\n--- 3. Evaluating CatBoost ---")
try:
    # Load the full-feature train/test splits
    X_train_full = pd.read_csv('X_train_tree_full.csv')
    y_train_full = pd.read_csv('y_train_tree_full.csv')
    X_test_full = pd.read_csv('X_test_tree_full.csv')
    y_test_full = pd.read_csv('y_test_tree_full.csv')

    # Categorical columns we expect; keep only those present in the training frame.
    # Computed once so the integer cast and the index lookup below always agree.
    cat_cols = ['term', 'grade', 'sub_grade', 'home_ownership', 'verification_status', 'purpose']
    present_cat_cols = [c for c in cat_cols if c in X_train_full.columns]

    # Cast the categorical columns to integers before declaring them as cat_features
    int_casts = {c: int for c in present_cat_cols}
    X_train_full = X_train_full.astype(int_casts)
    X_test_full = X_test_full.astype(int_casts)

    # Positional indices of the categorical columns, as passed to CatBoost
    cat_indices = [X_train_full.columns.get_loc(c) for c in present_cat_cols]

    # Train
    model_cb = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=0, random_seed=42)
    model_cb.fit(X_train_full, y_train_full.values.ravel(), cat_features=cat_indices)

    # Probability of the positive class (column 1 of predict_proba)
    probs_cb = model_cb.predict_proba(X_test_full)[:, 1]

    # ROC curve points and the area under them
    fpr_cb, tpr_cb, _ = roc_curve(y_test_full, probs_cb)
    auc_cb = auc(fpr_cb, tpr_cb)
    print(f"CatBoost AUC: {auc_cb:.4f}")

except FileNotFoundError as err:
    # Name the file that was actually missing: any of the four CSVs can trigger
    # this branch, not only 'X_train_tree_full.csv'.
    # NOTE: the variables above stay undefined after this branch, so the later
    # plotting / leaderboard cells will fail until the data is uploaded.
    print(f"Error: Could not find '{err.filename}'. Please upload the Full dataset.")

## Step 4: Combined ROC Plot

We will plot all three curves on a single chart.

In [None]:
# One figure, one set of axes: every model's ROC curve is overlaid for direct comparison.
fig, ax = plt.subplots(figsize=(10, 8))

# (false-positive rates, true-positive rates, legend label, line styling) per model,
# listed in the order they are drawn.
roc_series = [
    (fpr_lr, tpr_lr, f'Logistic Regression (AUC = {auc_lr:.3f})', {'color': 'blue', 'linestyle': '--'}),
    (fpr_xgb, tpr_xgb, f'XGBoost (AUC = {auc_xgb:.3f})', {'color': 'orange'}),
    (fpr_cb, tpr_cb, f'CatBoost (AUC = {auc_cb:.3f})', {'color': 'green', 'linewidth': 3}),
]
for curve_fpr, curve_tpr, legend_text, line_style in roc_series:
    ax.plot(curve_fpr, curve_tpr, label=legend_text, **line_style)

# Diagonal reference line: the expected curve of a random classifier
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess (AUC = 0.500)')

# Labels, title, legend and a light grid
ax.set_xlabel('False Positive Rate (1 - Specificity)', fontsize=12)
ax.set_ylabel('True Positive Rate (Recall)', fontsize=12)
ax.set_title('Final Model Comparison: ROC Curves', fontsize=16)
ax.legend(loc='lower right', fontsize=12)
ax.grid(alpha=0.3)

# Persist the figure to the output directory, then render it inline
fig.savefig(os.path.join(OUTPUT_DIR, "final_roc_comparison.png"))
plt.show()

## Step 5: Performance Comparison Table

Finally, we generate a clean DataFrame summarizing the key metrics for performance evaluation.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score


def get_metrics(y_true, y_pred, probs):
    """Summarize one model's test-set performance as a flat dict.

    Args:
        y_true: True binary labels for the test set.
        y_pred: Hard class predictions from the model's ``predict``.
        probs: Predicted probability of the positive class (label 1), used for AUC.

    Returns:
        dict with keys 'Accuracy', 'AUC', 'Precision (Default)',
        'Recall (Default)' and 'F1 (Default)'; the last three describe
        class 1 (default) only.
    """
    # One pass yields precision/recall/F1 for class 1. This replaces building the
    # full classification_report three times and indexing it by the string key
    # '1', which depends on how the label dtype happens to be formatted.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary', pos_label=1
    )
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'AUC': roc_auc_score(y_true, probs),
        # Class 1 (Default) metrics
        'Precision (Default)': float(precision),
        'Recall (Default)': float(recall),
        'F1 (Default)': float(f1),
    }


# (display name, true labels, fitted model, test features, positive-class probabilities)
model_runs = [
    ('Logistic Regression', y_test_lr, model_lr, X_test_lr, probs_lr),
    ('XGBoost', y_test_tree, model_xgb, X_test_tree, probs_xgb),
    ('CatBoost', y_test_full, model_cb, X_test_full, probs_cb),
]

# Gather all metrics: one row per model, each scored on its own test split
metrics = [
    {'Model': name, **get_metrics(y_true, model.predict(X_test), probs)}
    for name, y_true, model, X_test, probs in model_runs
]

# Create DataFrame, best AUC first
leaderboard = pd.DataFrame(metrics).set_index('Model')
leaderboard = leaderboard.sort_values(by='AUC', ascending=False)

# Display and Save
display(leaderboard)
leaderboard.to_csv(os.path.join(OUTPUT_DIR, "final_model_leaderboard.csv"))