In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except:
    HAS_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAS_LGBM = True
except:
    HAS_LGBM = False

print("Libraries imported successfully.")
print("XGBoost available:", HAS_XGB)
print("LightGBM available:", HAS_LGBM)

✅ Libraries imported successfully.
XGBoost available: True
LightGBM available: True


In [3]:
data_path = Path(r'C:\Users\Asus\Documents\GitHub\Credit-Scoring\data\train_feature_engineered.csv')

if data_path.exists():
    df = pd.read_csv(data_path)
    print("✅ Loaded feature-engineered dataset:", df.shape)
else:
    raise FileNotFoundError("train_feature_engineered.csv not found! Please run 02_feature_engineering.ipynb first.")

df.head()

✅ Loaded feature-engineered dataset: (307511, 237)


Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,EMERGENCYSTATE_MODE_Yes,AGE_YEARS,EMPLOYED_YEARS,PAYMENT_RATIO,DEBT_TO_INCOME,CREDIT_UTILIZATION,CREDIT_HISTORY_SCORE,INCOME_PER_PERSON,CREDIT_TO_INCOME_RATIO,IS_CASH_LOAN
0,0.0,1,0.0,0.181506,0.122368,0.090032,0.077441,0.256321,0.85214,0.705433,...,False,0.111161,0.001744,0.080216,0.10357,0.113573,0.00157,0.181506,0.10357,0
1,3e-06,0,0.0,0.250783,0.422505,0.132924,0.271605,0.045016,0.951929,0.959566,...,False,0.522886,0.003253,0.093443,0.337793,0.332261,0.002136,0.238243,0.337793,0
2,6e-06,0,0.0,0.042952,0.030457,0.020025,0.023569,0.134897,0.827335,0.648326,...,False,0.651466,0.000616,0.019433,0.029203,0.029756,0.000373,0.042952,0.029203,0
3,1.1e-05,0,0.0,0.112229,0.090586,0.109477,0.063973,0.107023,0.601451,0.661387,...,False,0.649154,0.00832,0.100384,0.081446,0.08514,0.005045,0.106617,0.081446,0
4,1.4e-05,0,0.0,0.098373,0.158376,0.078975,0.117845,0.39288,0.825268,0.519522,...,False,0.701409,0.008318,0.068177,0.144191,0.141679,0.004889,0.098373,0.144191,0


In [4]:
assert 'TARGET' in df.columns, "❌ TARGET column missing! Ensure the dataset includes it."

X = df.drop(columns=['TARGET'])
y = df['TARGET']

# Split data 80% train / 20% test (stratified by default status)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (246008, 236)
Test shape: (61503, 236)


In [5]:
models = {
    "Logistic Regression (Balanced)": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(max_depth=6, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

if HAS_XGB:
    models["XGBoost"] = XGBClassifier(
        n_estimators=300, max_depth=4, learning_rate=0.1,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric='logloss', random_state=42
    )

if HAS_LGBM:
    models["LightGBM"] = LGBMClassifier(
        n_estimators=500, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        objective='binary', random_state=42
    )

print("Models ready:")
for m in models:
    print(" -", m)

✅ Models ready:
 - Logistic Regression (Balanced)
 - Decision Tree
 - Random Forest
 - Gradient Boosting
 - XGBoost
 - LightGBM


In [9]:
results = []

for name, model in models.items():
    print(f"\n🚀 Training {name} ...")

    # ✅ If model supports epoch-like training progress
    if name in ["Gradient Boosting", "XGBoost", "LightGBM"]:
        # ---- Gradient Boosting ----
        if name == "Gradient Boosting":
            print(f"→ Training {model.n_estimators} boosting stages...")
            model.fit(X_train, y_train)
            # staged_predict() gives predictions at each iteration
            for i, y_pred_stage in enumerate(model.staged_predict(X_test)):
                if i % 20 == 0 or i == model.n_estimators - 1:
                    f1_stage = f1_score(y_test, y_pred_stage)
                    print(f"Epoch {i:03d}: F1={f1_stage:.4f}")

        # ---- XGBoost ----
        elif name == "XGBoost":
            # Set eval metric via set_params to avoid passing eval_metric to fit (compatibility across xgboost versions)
            model.set_params(eval_metric="auc")
            model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=True  # print progress each iteration
            )

        # ---- LightGBM ----
        elif name == "LightGBM":
            model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                eval_metric="auc",
                verbose=20  # print every 20 boosting rounds
            )

    else:
        # Non-iterative models (Logistic Regression, Decision Tree, Random Forest)
        model.fit(X_train, y_train)

    # --- Evaluate final model ---
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        "F1": round(f1, 4),
        "ROC-AUC": round(auc, 4),
        "PR-AUC": round(pr_auc, 4)
    })

    print(f"\n✅ {name} Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC: {auc:.4f}")
    print(f"PR-AUC: {pr_auc:.4f}")

# --- Summarize results ---
results_df = pd.DataFrame(results).sort_values(by="ROC-AUC", ascending=False).reset_index(drop=True)
results_df


🚀 Training Logistic Regression (Balanced) ...

✅ Logistic Regression (Balanced) Results:
Accuracy: 0.6909
F1 Score: 0.2627
ROC-AUC: 0.7492
PR-AUC: 0.2272

🚀 Training Decision Tree ...

✅ Decision Tree Results:
Accuracy: 0.9192
F1 Score: 0.0004
ROC-AUC: 0.7155
PR-AUC: 0.1898

🚀 Training Random Forest ...

✅ Random Forest Results:
Accuracy: 0.9193
F1 Score: 0.0012
ROC-AUC: 0.7157
PR-AUC: 0.2119

🚀 Training Gradient Boosting ...
→ Training 100 boosting stages...
Epoch 000: F1=0.0000
Epoch 020: F1=0.0000
Epoch 040: F1=0.0052
Epoch 060: F1=0.0143
Epoch 080: F1=0.0198
Epoch 099: F1=0.0225

✅ Gradient Boosting Results:
Accuracy: 0.9196
F1 Score: 0.0225
ROC-AUC: 0.7530
PR-AUC: 0.2423

🚀 Training XGBoost ...
[0]	validation_0-auc:0.70282
[1]	validation_0-auc:0.71159
[2]	validation_0-auc:0.71434
[3]	validation_0-auc:0.72010
[4]	validation_0-auc:0.72201
[5]	validation_0-auc:0.72321
[6]	validation_0-auc:0.72686
[7]	validation_0-auc:0.72829
[8]	validation_0-auc:0.73023
[9]	validation_0-auc:0.73131


TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'verbose'