***LOADING MODULES***

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Seed NumPy's legacy global RNG so anything drawing from np.random is
# repeatable. SMOTE, the train/test split and the estimators defined in this
# notebook are each given their own explicit random_state=42 as well.
np.random.seed(42)

***LOADING AND PRE-PROCESSING DATA***

In [6]:
def load_and_preprocess_data(path='GiveMeSomeCredit-training.csv'):
    """Read the training CSV and median-fill its two columns with gaps.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to
        ``'GiveMeSomeCredit-training.csv'`` in the working directory, so
        existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame with the ``'Unnamed: 0'`` and ``'Id'`` columns
        removed (when present) and with missing ``MonthlyIncome`` and
        ``NumberOfDependents`` values replaced by that column's median.
        ``None`` is returned, after printing a message, when the file does
        not exist.

    Raises
    ------
    KeyError
        If the file lacks ``MonthlyIncome`` or ``NumberOfDependents``.

    Notes
    -----
    The medians are computed over every row of the file, i.e. before any
    train/test split the caller performs afterwards.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print("Training file not found!")
        return None

    # Drop the exported index / Id columns if the file has them;
    # errors='ignore' silently skips whichever one is absent.
    df = df.drop(columns=['Unnamed: 0', 'Id'], errors='ignore')

    # Per-column median fill. The float64 cast mirrors the dtype that the
    # previous SimpleImputer-based implementation always produced.
    for column in ('MonthlyIncome', 'NumberOfDependents'):
        values = df[column].astype('float64')
        df[column] = values.fillna(values.median())

    return df


In [7]:
def engineer_features(df):
    """Return a copy of ``df`` with four derived columns added.

    Added columns:

    * ``DebtToIncomeRatio``  - ``RevolvingUtilizationOfUnsecuredLines * MonthlyIncome``
    * ``PaymentToIncomeRatio`` - ``DebtRatio * MonthlyIncome``
    * ``CreditHistoryLength`` - ``NumberOfOpenCreditLinesAndLoans / (age + 1)``
    * ``DelinquencyScore`` - weighted sum of the three past-due counters
      (weights 0.3, 0.5 and 1.0 for 30-59 days, 60-89 days and 90+ days)

    After that, +/-inf is turned into NaN and every NaN in a numeric column
    is replaced by that column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the source columns named above. It is not modified;
        callers must use the returned frame.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the four derived ones.

    Notes
    -----
    The fill medians are taken over all rows passed in, so calling this before
    a train/test split lets test rows influence the fill values.
    """
    # Work on a copy so re-running a cell never stacks changes on the input.
    out = df.copy()

    # NOTE(review): despite their names these two are products, not quotients.
    # The names are kept so the column interface stays unchanged.
    out['DebtToIncomeRatio'] = out['RevolvingUtilizationOfUnsecuredLines'] * out['MonthlyIncome']
    out['PaymentToIncomeRatio'] = out['DebtRatio'] * out['MonthlyIncome']

    # The +1 avoids dividing by zero for age == 0; age == -1 would still give
    # +/-inf, which the replace() below converts to NaN.
    out['CreditHistoryLength'] = out['NumberOfOpenCreditLinesAndLoans'] / (out['age'] + 1)

    out['DelinquencyScore'] = (
        out['NumberOfTime30-59DaysPastDueNotWorse'] * 0.3 +
        out['NumberOfTime60-89DaysPastDueNotWorse'] * 0.5 +
        out['NumberOfTimes90DaysLate'] * 1.0
    )

    out = out.replace([np.inf, -np.inf], np.nan)
    # numeric_only=True keeps this working when a non-numeric column is
    # present; without it pandas 2.x raises while computing the medians.
    out = out.fillna(out.median(numeric_only=True))
    return out


***SMOTE***

In [8]:
def apply_smote(X, y):
    """Oversample the minority class of ``(X, y)`` with SMOTE.

    A fresh ``SMOTE`` sampler seeded with ``random_state=42`` is created on
    every call, and whatever its ``fit_resample`` returns is handed straight
    back to the caller.
    """
    oversampler = SMOTE(random_state=42)
    resampled = oversampler.fit_resample(X, y)
    return resampled


***TRAIN MODELS***

In [9]:
def train_models(X_train, y_train):
    """Fit three tree-ensemble classifiers on the same training data.

    The estimators are built with fixed hyper-parameters and
    ``random_state=42``, then fitted one after another in the order
    XGBoost, Random Forest, Gradient Boosting. A progress line is printed
    before each fit.

    Returns a dict mapping each display name to its fitted estimator.
    """
    xgb_clf = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        eval_metric='auc',
    )
    forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    gbm_clf = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )

    candidates = (
        ("XGBoost", xgb_clf),
        ("Random Forest", forest_clf),
        ("Gradient Boosting", gbm_clf),
    )

    fitted = {label: estimator for label, estimator in candidates}
    for name, estimator in candidates:
        print(f"\n🔧 Training {name}...")
        estimator.fit(X_train, y_train)

    return fitted

***EVALUATE MODELS***

In [10]:
def evaluate_model(name, model, X_test, y_test, feature_names):
    """Print test-set metrics for ``model`` and save its diagnostic plots.

    Parameters
    ----------
    name : str
        Display name. It is used in printed output and plot titles, and
        (lower-cased, spaces replaced by underscores) as the file-name stem.
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.
    X_test, y_test
        Held-out features and labels.
    feature_names : sequence of str
        Labels for the feature-importance bars, in the same order as the
        model's ``feature_importances_``.

    Side effects
    ------------
    Prints the classification report and ROC AUC, and writes
    ``<stem>_confusion_matrix.png`` to the working directory. When the model
    exposes ``feature_importances_`` it also writes
    ``<stem>_feature_importance.png``.
    """
    print(f"\n Evaluation for {name}")
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_test)[:, 1]

    print(classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

    file_stem = name.lower().replace(" ", "_")

    # Confusion matrix: scikit-learn puts true labels on rows and predicted
    # labels on columns, so label the axes accordingly.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.tight_layout()
    fig.savefig(f'{file_stem}_confusion_matrix.png')
    plt.close(fig)

    if hasattr(model, 'feature_importances_'):
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=model.feature_importances_, y=list(feature_names), ax=ax)
        ax.set_title(f'{name} - Feature Importance')
        ax.set_xlabel('Importance')
        ax.set_ylabel('Feature')
        # Long feature names are clipped in the saved PNG without this.
        fig.tight_layout()
        fig.savefig(f'{file_stem}_feature_importance.png')
        plt.close(fig)

In [11]:
def main():
    """Run the whole pipeline: load, engineer, split, scale, SMOTE, train, evaluate, save.

    Steps:

    1. Load the CSV (returns early if the file is missing) and add the
       engineered features.
    2. Hold out a stratified 20% test split.
    3. Standardise features with a scaler fitted on the training split only.
    4. Oversample the training split with SMOTE; the test split is left as is.
    5. Train every model, then print metrics and save plots for each.
    6. Save the XGBoost model to ``credit_risk_model.json``.

    Notes
    -----
    The median fills inside ``load_and_preprocess_data`` and
    ``engineer_features`` run before the split, so their fill values are
    computed from train and test rows together.
    """
    df = load_and_preprocess_data()
    if df is None:
        return

    df = engineer_features(df)

    X = df.drop('SeriousDlqin2yrs', axis=1)
    y = df['SeriousDlqin2yrs']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Keep the scaled data as DataFrames with the original column names for
    # BOTH splits. Previously the models were fitted on bare arrays but scored
    # on a named DataFrame, which makes scikit-learn warn about the feature-name
    # mismatch on every predict call.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X.columns, index=X_test.index
    )

    X_train_resampled, y_train_resampled = apply_smote(X_train_scaled, y_train)

    models = train_models(X_train_resampled, y_train_resampled)

    for name, model in models.items():
        evaluate_model(name, model, X_test_scaled, y_test, X.columns)

    # Only the XGBoost model is persisted, whichever model scored best above.
    models["XGBoost"].save_model('credit_risk_model.json')
    print(" XGBoost model saved as 'credit_risk_model.json'")
    print(" Visuals saved for all models (confusion matrix & feature importance)")

# Entry point: run the full pipeline when this file is executed as a script.
# Inside a notebook kernel __name__ is also "__main__", so running this cell
# triggers the same call.
if __name__ == "__main__":
    main()


🔧 Training XGBoost...

🔧 Training Random Forest...

🔧 Training Gradient Boosting...

📈 Evaluation for XGBoost
              precision    recall  f1-score   support

           0       0.96      0.92      0.94     27995
           1       0.32      0.51      0.39      2005

    accuracy                           0.89     30000
   macro avg       0.64      0.72      0.67     30000
weighted avg       0.92      0.89      0.91     30000

ROC AUC Score: 0.8448832464293811

📈 Evaluation for Random Forest




              precision    recall  f1-score   support

           0       0.95      0.96      0.95     27995
           1       0.36      0.35      0.36      2005

    accuracy                           0.92     30000
   macro avg       0.66      0.65      0.66     30000
weighted avg       0.91      0.92      0.91     30000

ROC AUC Score: 0.8269595345445995

📈 Evaluation for Gradient Boosting
              precision    recall  f1-score   support

           0       0.97      0.88      0.92     27995
           1       0.28      0.63      0.38      2005

    accuracy                           0.87     30000
   macro avg       0.62      0.76      0.65     30000
weighted avg       0.92      0.87      0.89     30000

ROC AUC Score: 0.8533297761133868




 XGBoost model saved as 'credit_risk_model.json'
 Visuals saved for all models (confusion matrix & feature importance)
