### Set-up and Imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from sklearn.metrics import (
    roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

!pip -q install ucimlrepo

# EDA Section

### Data Loading

In [None]:

# 1) Load dataset

def load_credit_default():
    """
    Fetch dataset id=350 from the UCI repository via ``ucimlrepo``.

    Returns
    -------
    X
        Copy of ``dataset.data.features``.
    y
        Copy of ``dataset.data.targets`` cast to ``int``. If the targets come
        back as a DataFrame, only its first column is kept.

    Raises
    ------
    RuntimeError
        If importing ``ucimlrepo``, fetching the dataset, or converting the
        target fails. The underlying exception is chained as ``__cause__``.
    """
    try:
        # Imported lazily so a missing package is reported through the same
        # error path as a failed download.
        from ucimlrepo import fetch_ucirepo

        # Load from UCI repository (id=350)
        dataset = fetch_ucirepo(id=350)
        X = dataset.data.features.copy()
        y = dataset.data.targets.copy()

        # ucimlrepo sometimes returns y as DataFrame
        if isinstance(y, pd.DataFrame):
            y = y.iloc[:, 0]
        y = y.astype(int)
        return X, y

    except Exception as e:
        # Raise instead of print-and-return-None: callers unpack the result
        # (`X, y = ...`), so a silent None would only surface as an unrelated
        # "cannot unpack NoneType" TypeError.
        raise RuntimeError(f"ucimlrepo fetch failed: {e!r}") from e



# Bring the feature table and the target into the notebook namespace.
X, y = load_credit_default()

# One-line sanity report: dimensions of X and the mean of y.
print(
    "X shape:", X.shape,
    "y mean(default rate):", float(y.mean()),
)

# Last expression of the cell, so it is rendered as the cell output.
X.head(n=5)


X shape: (30000, 23) y mean(default rate): 0.2212


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


### Exploratory Data Overview

In [None]:

# 2) EDA

# Missingness summary (top 10): fraction of NaN per column, highest first
missing = X.isna().mean().sort_values(ascending=False).head(10)
# sep="\n" puts the Series on its own line. Passing "...:\n" as a separate
# positional argument makes print() insert its default space separator after
# the newline, which shifts the first row label one column to the right.
print("\nTop missingness:", missing, sep="\n")

# Target balance
print("\nDefault rate:", y.mean())

# Preview of the first 10 numeric column names
num_cols_preview = X.select_dtypes(include=[np.number]).columns[:10]
print("\nNumeric cols preview:", list(num_cols_preview))


Top missingness:
 X1     0.0
X2     0.0
X3     0.0
X4     0.0
X5     0.0
X6     0.0
X7     0.0
X8     0.0
X9     0.0
X10    0.0
dtype: float64

Default rate: 0.2212

Numeric cols preview: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']


### Train–Test Split

In [None]:

# 3) Split (stratified because default is imbalanced)

# Hold out 25% of the rows with a fixed seed; stratify on y as the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)