In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Download UCI ML Repository dataset id=350 (needs network access).
default_of_credit_card_clients = fetch_ucirepo(id=350)

# Split the fetched data into the feature table (X) and the target table (y).
X, y = (
    default_of_credit_card_clients.data.features,
    default_of_credit_card_clients.data.targets,
)

In [None]:
# Month suffixes used in the column names, ordered most recent -> oldest.
_MONTHS = ("sep", "aug", "jul", "jun", "may", "apr")

# Work on a copy so the raw frame `X` keeps its original column labels.
X_renamed = X.copy()

# Labels are assigned by position (X1 ... X23), so the order below matters.
X_renamed.columns = (
    # X1-X5: credit limit and demographic attributes
    ["credit_limit", "gender", "education", "marital_status", "age"]
    # X6-X11: repayment status (most recent -> oldest)
    + [f"pay_{month}" for month in _MONTHS]
    # X12-X17: bill amounts
    + [f"bill_{month}" for month in _MONTHS]
    # X18-X23: payment amounts
    + [f"pay_amt_{month}" for month in _MONTHS]
)



In [None]:
# Monthly repayment-status columns, most recent month first.
pay_cols = ["pay_sep", "pay_aug", "pay_jul", "pay_jun", "pay_may", "pay_apr"]

delay_status = X_renamed[pay_cols]

# Row-wise (axis=1) aggregations give one value per customer.
# `assign` returns a new frame, so `X_renamed` itself is left untouched.
X_feat = X_renamed.assign(
    # Repayment status of the most recent month.
    recent_delay=X_renamed["pay_sep"],
    # Worst repayment status over the six months.
    max_delay=delay_status.max(axis=1),
    # Average status with values below 0 clipped to 0, so only actual
    # delays contribute to the mean.
    mean_delay=delay_status.clip(lower=0).mean(axis=1),
    # Out of 6 months, how many times was the customer late (status > 0)?
    num_delayed_months=delay_status.gt(0).sum(axis=1),
    # 1 when the worst status is 3 or more, else 0.
    any_severe_delay=lambda frame: frame["max_delay"].ge(3).astype(int),
)


In [None]:
# Bill and Payment Features

bill_cols = [
    "bill_sep", "bill_aug", "bill_jul",
    "bill_jun", "bill_may", "bill_apr"
]

pay_amt_cols = [
    "pay_amt_sep", "pay_amt_aug", "pay_amt_jul",
    "pay_amt_jun", "pay_amt_may", "pay_amt_apr"
]


def _capped_ratio(numerator, denominator, upper=2):
    """Return ``numerator / denominator`` with a guarded denominator and an upper cap.

    Parameters
    ----------
    numerator, denominator : pandas.Series
        Row-aligned series to divide.
    upper : float, default 2
        Ratios above this value are clipped to it; anything larger is
        treated the same so extreme values do not dominate the model.

    Returns
    -------
    pandas.Series
        The capped ratio, one value per row.

    Notes
    -----
    Every non-positive denominator is replaced by 1. Replacing only an exact
    zero still lets a negative denominator (e.g. a negative bill total)
    through, which yields an arbitrarily large negative ratio that the
    upper cap does not catch.
    """
    safe_denominator = denominator.mask(denominator <= 0, 1)
    return (numerator / safe_denominator).clip(upper=upper)


# Per-customer averages over the six months.
X_feat["avg_bill"] = X_feat[bill_cols].mean(axis=1)
X_feat["avg_payment"] = X_feat[pay_amt_cols].mean(axis=1)

# Per-customer totals over the six months.
total_bill = X_feat[bill_cols].sum(axis=1)
total_payment = X_feat[pay_amt_cols].sum(axis=1)

# Share of the billed amount that was paid over the whole period.
X_feat["payment_ratio"] = _capped_ratio(total_payment, total_bill)

# Average bill relative to the credit limit.
X_feat["utilization_ratio"] = _capped_ratio(
    X_feat["avg_bill"], X_feat["credit_limit"]
)

# Share of the most recent bill that was paid.
X_feat["recent_payment_ratio"] = _capped_ratio(
    X_feat["pay_amt_sep"], X_feat["bill_sep"]
)

Here we drop the raw columns X6 to X23. We also drop gender, education and marital_status, as they do not add much predictive power and can instead make the model biased.

In [None]:
# Columns kept for modelling, grouped by theme.
features = [
    # Customer profile
    "credit_limit", "age",
    # Repayment-delay behaviour
    "recent_delay", "max_delay", "mean_delay",
    "num_delayed_months", "any_severe_delay",
    # Bill and payment behaviour
    "avg_bill", "avg_payment",
    "payment_ratio", "recent_payment_ratio", "utilization_ratio",
]

# Independent copy restricted to the selected modelling columns.
X_final = X_feat.loc[:, features].copy()

# Quick sanity check: (rows, columns) followed by the first few rows.
print(X_final.shape)
X_final.head()


In [None]:
# Mean of the target expressed as a percentage (about 22 % of customers default).
100 * y.mean()


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Step 1: hold out 20 % of the data as the test set, stratified on the target.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=11, stratify=y
)

# Step 2: split the remainder 75/25 into train and validation, again stratified.
# 25 % of the remaining 80 % is 20 % overall, i.e. a 60/20/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train,
    y_full_train,
    test_size=0.25,
    random_state=11,
    stratify=y_full_train,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to every split.
# NOTE: this overwrites X_train / X_val / X_test with scaled numpy arrays, so
# re-run the split cell first if this cell has to be executed again.
scaler = StandardScaler().fit(X_train)

X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Show the container type of each target split.
tuple(type(target) for target in (y_train, y_val, y_test))

In [None]:
# Convert each target split to a 1-D numpy array, since the estimators
# below expect 1-D target vectors.
# np.asarray(...) accepts both the original pandas objects and arrays that
# were already converted, so this cell can be re-run safely (the previous
# `.values` access fails on a numpy array).

y_train = np.asarray(y_train).ravel()
y_val   = np.asarray(y_val).ravel()
y_test  = np.asarray(y_test).ravel()

In [None]:
# Display the flattened target arrays of the three splits.
y_train, y_val, y_test

In [None]:
# Baseline model: logistic regression on all scaled features.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_train_proba = lr.predict_proba(X_train)[:, 1]
y_val_proba = lr.predict_proba(X_val)[:, 1]

# Compare train and validation ROC AUC.
train_auc = roc_auc_score(y_train, y_train_proba)
val_auc = roc_auc_score(y_val, y_val_proba)

(train_auc, val_auc)

In [None]:
# Pair every feature name with its fitted coefficient, largest first.
feature_names = list(X_final.columns)

coef = (
    pd.DataFrame({"feature": feature_names, "coef": lr.coef_[0]})
    .sort_values("coef", ascending=False)
)

coef


The strongest predictors of default risk are behavioural features such as delays in payment.
Age has a mild positive coefficient, i.e. older customers have a slightly higher chance of defaulting.
Credit limit has a negative coefficient: the higher the credit limit, the more buffer and the more trust between the bank and the customer.
avg_payment is also negative, as customers who pay regularly are safer.
payment_ratio overlaps with avg_payment.
recent_payment_ratio also lowers the risk of default.

Here mean_delay is slightly negative. This is probably because the model is multivariate, so the coefficient of a single feature cannot be interpreted on its own.
any_severe_delay is already captured by max_delay.

Here we will drop some unnecessary features: any_severe_delay, as it is redundant with max_delay, and mean_delay, as it is collinear with the other delay features and so does not add anything new to the model.

In [None]:
# Reduced feature set: the original list without mean_delay and any_severe_delay.
new_features = [
    # Customer profile
    "credit_limit", "age",
    # Repayment-delay behaviour
    "recent_delay", "max_delay", "num_delayed_months",
    # Bill and payment behaviour
    "avg_bill", "avg_payment",
    "payment_ratio", "recent_payment_ratio", "utilization_ratio",
]



In [None]:
# Wrap the scaled arrays back into labelled DataFrames so columns can be
# selected by name.
X_train_df, X_val_df, X_test_df = (
    pd.DataFrame(split, columns=feature_names)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Keep only the reduced feature set in every split.
X_train_df, X_val_df, X_test_df = (
    frame.loc[:, new_features]
    for frame in (X_train_df, X_val_df, X_test_df)
)

In [None]:
# Column names of the reduced training frame, as a plain list.
new_feature_names = list(X_train_df.columns)

In [None]:
# Scaler for the reduced feature set, fitted on the training split only.
# NOTE: X_train_df / X_val_df / X_test_df were built from arrays that were
# already standardised, so this step re-standardises standardised columns.
final_scaler = StandardScaler()

X_train_p = final_scaler.fit_transform(X_train_df)
X_val_p   = final_scaler.transform(X_val_df)
X_test_p  = final_scaler.transform(X_test_df)

# Refit logistic regression on the reduced feature set.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_p, y_train)

# Probability of the positive class (column 1) for each split.
y_train_proba_1 = lr.predict_proba(X_train_p)[:, 1]
y_val_proba_1 = lr.predict_proba(X_val_p)[:, 1]

# Reuse the probabilities computed above instead of predicting a second time.
train_auc = roc_auc_score(y_train, y_train_proba_1)
val_auc   = roc_auc_score(y_val, y_val_proba_1)

train_auc, val_auc


In [None]:
# Creating risk buckets: start from the validation labels and the
# logistic-regression scores.

risk = pd.DataFrame(dict(y_true=y_val, y_score=y_val_proba_1))


In [None]:
# Cut the scores into 5 quantile-based buckets, labelled from lowest to
# highest score.
bucket_labels = ["Very Low", "Low", "Medium", "High", "Very High"]

risk = risk.assign(
    risk_bucket=pd.qcut(risk["y_score"], q=5, labels=bucket_labels)
)

In [None]:
# Display the validation labels, scores and assigned risk buckets.
risk

In [None]:
# Observed default rate (mean of y_true) per risk bucket.
# risk_bucket is categorical; observed=False keeps the current behaviour
# (one row per category) and avoids the pandas FutureWarning about the
# changing default.
risk.groupby("risk_bucket", observed=False)["y_true"].mean()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest with large leaves.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,           # IMPORTANT: keep shallow
    min_samples_leaf=200,  # stabilizes probabilities
    random_state=1,
).fit(X_train_p, y_train)

# ROC AUC of the positive-class probability on train and validation.
train_auc, val_auc = (
    roc_auc_score(target, rf.predict_proba(inputs)[:, 1])
    for inputs, target in ((X_train_p, y_train), (X_val_p, y_val))
)

(train_auc, val_auc)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: many trees combined with a small learning rate.
gb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=6,
    random_state=42,
).fit(X_train_p, y_train)

# ROC AUC of the positive-class probability on train and validation.
train_auc, val_auc = (
    roc_auc_score(target, gb.predict_proba(inputs)[:, 1])
    for inputs, target in ((X_train_p, y_train), (X_val_p, y_val))
)

(train_auc, val_auc)

In [None]:
# Here gradient boosting is capturing non-linear patterns but not generalising; that is why there is a difference between train and val AUC.
# Gradient boosting is overfitting here: even after increasing n_estimators and decreasing the learning rate it still overfits, which means
# the ceiling has been reached. If we push gradient boosting further it will only learn noise/small patterns and overfit.

# We should stop here, as tuning hyperparameters keeps increasing train AUC while val AUC remains constant or drops slightly,
# so random forest will be the final model.

#### Final Model - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Final model: shallow random forest with large leaves.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,          # IMPORTANT: keep shallow
    min_samples_leaf=200, # stabilizes probabilities
    random_state=1,
    n_jobs = -1           # n_jobs = -1 (use all available CPU cores)
)

rf.fit(X_train_p, y_train)

# Probability of the positive class (column 1) for each split.
y_train_proba_2 = rf.predict_proba(X_train_p)[:, 1]
y_val_proba_2 = rf.predict_proba(X_val_p)[:, 1]

# Reuse the probabilities computed above instead of predicting a second time.
train_auc = roc_auc_score(y_train, y_train_proba_2)
val_auc   = roc_auc_score(y_val, y_val_proba_2)

train_auc, val_auc

In [None]:
# Creating risk buckets from the random-forest validation scores:
# 5 quantile-based buckets, labelled from lowest to highest score.

risk = pd.DataFrame(dict(y_true=y_val, y_score=y_val_proba_2)).assign(
    risk_bucket=lambda frame: pd.qcut(
        frame["y_score"],
        q=5,
        labels=["Very Low", "Low", "Medium", "High", "Very High"],
    )
)

risk

In [None]:
# Score the held-out test split with the final random forest
# (column 1 is the probability of the positive class).
y_test_proba = rf.predict_proba(X_test_p)[:, 1]

# Same bucketing as for validation: 5 quantile-based buckets computed on the
# test scores themselves.
risk_test = pd.DataFrame(dict(y_true=y_test, y_score=y_test_proba)).assign(
    risk_bucket=lambda frame: pd.qcut(
        frame["y_score"],
        q=5,
        labels=["Very Low", "Low", "Medium", "High", "Very High"],
    )
)

risk_test

In [None]:
# Observed default rate (mean of y_true) per risk bucket on the test split.
# risk_bucket is categorical; observed=False keeps the current behaviour
# (one row per category) and avoids the pandas FutureWarning about the
# changing default.
risk_test.groupby("risk_bucket", observed=False)["y_true"].mean()