In [None]:
# Setup
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

# Load data
# The pickle holds four objects that are unpacked, in order, into the train/test
# features and the train/test labels.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that come
# from a trusted source.
PROCESSED_DATA_PATH = r"C:\Users\Asus\Documents\GitHub\Credit-Scoring\output\models\processed_data_lgbm_v2.pkl"
X_train, X_test, y_train, y_test = joblib.load(PROCESSED_DATA_PATH)

# Quick sanity summary: training-set size and the mean of the training labels
# expressed as a percentage.
print(f"Training samples: {len(X_train):,}")
print(f"Default rate: {y_train.mean()*100:.2f}%")

In [None]:
# Define Models
# Class-imbalance ratio: number of label-0 rows per label-1 row in the training
# labels. It is passed as `scale_pos_weight` to the boosters that accept it.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
if n_positive == 0:
    # Without this guard the division yields inf (the warning is suppressed by the
    # global filterwarnings call) and the boosters receive a meaningless weight.
    raise ValueError(
        "y_train contains no positive (label 1) samples; cannot compute scale_pos_weight"
    )
original_ratio = n_negative / n_positive

# Candidate models, keyed by display name. Every candidate applies class
# re-weighting so that the comparison is like-for-like: the gradient boosters via
# `scale_pos_weight`, the scikit-learn estimators via `class_weight='balanced'`.
models = {
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        scale_pos_weight=original_ratio,
        random_state=42,
        verbose=-1
    ),

    'XGBoost': xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        scale_pos_weight=original_ratio,
        random_state=42,
        verbosity=0,
        eval_metric='logloss'
    ),

    'CatBoost': CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        scale_pos_weight=original_ratio,
        random_state=42,
        verbose=False
    ),

    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=50,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),

    'HistGradientBoosting': HistGradientBoostingClassifier(
        max_iter=500,
        learning_rate=0.05,
        max_depth=6,
        # Previously the only unweighted candidate; weight it like the others.
        # `class_weight` requires scikit-learn >= 1.2.
        class_weight='balanced',
        random_state=42
    )
}

In [None]:
# Cross-Validation Comparison
# For every candidate in `models`: score it with stratified 5-fold CV on the
# training data, refit it on the full training set, and measure AUC on the
# held-out test set. Models are then ranked and the best one is selected by
# CV AUC only; the test AUC is reported but never used for selection, so it
# remains an unbiased estimate for the chosen model.
print("="*80)
print("5-FOLD CROSS-VALIDATION COMPARISON")
print("="*80)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, model in models.items():
    print(f"\n🔄 Training {name}...")

    # Cross-validation (scored on training data only)
    cv_scores = cross_val_score(
        model, X_train, y_train,
        cv=cv, scoring='roc_auc', n_jobs=-1
    )
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # Train on full training set; cross_val_score works on clones, so `model`
    # itself still has to be fitted before it can predict.
    model.fit(X_train, y_train)

    # Test set evaluation: probability of the positive class (column 1)
    y_prob_test = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_prob_test)

    results.append({
        'Model': name,
        'CV Mean AUC': cv_mean,
        'CV Std': cv_std,
        'Test AUC': test_auc,
        # Gap between CV and held-out performance (positive = CV was optimistic)
        'Overfit': cv_mean - test_auc
    })

    # The printed interval is two standard deviations wide on each side.
    print(f"   CV AUC: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
    print(f"   Test AUC: {test_auc:.4f}")

# Results DataFrame, ranked by cross-validated AUC. Ranking by 'Test AUC' would
# turn the test set into a model-selection set and bias the reported score.
results_df = pd.DataFrame(results).sort_values('CV Mean AUC', ascending=False)
print("\n" + "="*80)
print("📊 FINAL COMPARISON")
print("="*80)
print(results_df.to_string(index=False))

# Best model (selected on CV AUC; its test AUC is the held-out estimate)
best_model_name = results_df.iloc[0]['Model']
print(f"\n🏆 Best Model: {best_model_name}")
print(f"   CV Mean AUC: {results_df.iloc[0]['CV Mean AUC']:.4f}")
print(f"   Test AUC: {results_df.iloc[0]['Test AUC']:.4f}")