In [2]:
import pandas as pd
# Load the loan book, then print a quick structural overview:
# dimensions, column dtypes, the first rows, and per-column missing-value counts.
df = pd.read_csv("Task 3 and 4_Loan_Data.csv")
for overview in (df.shape, df.dtypes, df.head(), df.isna().sum()):
    print(overview)

(10000, 8)
customer_id                   int64
credit_lines_outstanding      int64
loan_amt_outstanding        float64
total_debt_outstanding      float64
income                      float64
years_employed                int64
fico_score                    int64
default                       int64
dtype: object
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4  

In [8]:
# Column holding the binary outcome that the later cells model.
target_col = 'default'

# Class balance: first as proportions, then as raw counts.
for as_share in (True, False):
    print(df[target_col].value_counts(normalize=as_share))

# Per-class mean of every other column (columns become rows for readability).
print(df.groupby(target_col).mean().T)

# Overall summary statistics, one row per column.
print(df.describe().T)


default
0    0.8149
1    0.1851
Name: proportion, dtype: float64
default
0    8149
1    1851
Name: count, dtype: int64
default                              0             1
customer_id               4.967005e+06  5.007914e+06
credit_lines_outstanding  7.441404e-01  4.618044e+00
loan_amt_outstanding      4.092629e+03  4.454855e+03
total_debt_outstanding    6.322165e+03  1.927058e+04
income                    6.988389e+04  7.072674e+04
years_employed            4.765247e+00  3.617504e+00
fico_score                6.469388e+02  5.962577e+02
                            count          mean           std           min  \
customer_id               10000.0  4.974577e+06  2.293890e+06  1.000324e+06   
credit_lines_outstanding  10000.0  1.461200e+00  1.743846e+00  0.000000e+00   
loan_amt_outstanding      10000.0  4.159677e+03  1.421399e+03  4.678397e+01   
total_debt_outstanding    10000.0  8.718917e+03  6.627165e+03  3.165273e+01   
income                    10000.0  7.003990e+04  2.007221e+04 

In [12]:
def predict_pd(input_dict_or_df):
    """Return the positive-class probability for one record or a batch.

    Relies on three notebook-level names that are not defined in this cell:
    ``feature_order`` (column names, in model order), ``best_scaler`` (an
    object with ``transform``) and ``best_model`` (an object with
    ``predict_proba``). They must exist before this function is called.

    Parameters
    ----------
    input_dict_or_df : dict or DataFrame-compatible object
        A dict is treated as a single record; anything else is handed to
        ``pd.DataFrame`` as-is. Columns beyond ``feature_order`` are dropped.

    Returns
    -------
    float or numpy.ndarray
        A plain float when exactly one row was scored, otherwise the array
        taken from column 1 of ``predict_proba``.
    """
    # A single record arrives as a dict; wrap it so it becomes a one-row frame.
    records = [input_dict_or_df] if isinstance(input_dict_or_df, dict) else input_dict_or_df
    frame = pd.DataFrame(records)[feature_order]

    default_probs = best_model.predict_proba(best_scaler.transform(frame))[:, 1]
    if len(default_probs) > 1:
        return default_probs
    return float(default_probs[0])


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Everything except the identifier and the target is used as a model input.
non_feature_cols = ('customer_id', 'default')
features = [col for col in df.columns if col not in non_feature_cols]
y = df['default'].copy()
X = df[features].copy()

# Two stratified splits with the same seed: 20% is held out as the test set,
# then 25% of the remaining 80% (i.e. 20% of all rows) becomes the validation
# set, leaving 60% for training.
split_seed = 42
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, stratify=y_trainval, random_state=split_seed
)

# Scaling statistics come from the training rows only; the validation and
# test rows are transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_s, X_val_s, X_test_s = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, brier_score_loss, average_precision_score
import pandas as pd

def _calibrate_prefit(fitted_estimator, X_cal, y_cal):
    """Fit a probability calibrator on held-out data without refitting the classifier.

    ``cv='prefit'`` is deprecated in scikit-learn 1.6 in favour of wrapping the
    fitted estimator in ``FrozenEstimator``. The frozen wrapper is used when it
    can be imported; older scikit-learn versions fall back to ``cv='prefit'``.
    """
    try:
        from sklearn.frozen import FrozenEstimator
    except ImportError:
        calibrator = CalibratedClassifierCV(fitted_estimator, cv='prefit')
    else:
        calibrator = CalibratedClassifierCV(FrozenEstimator(fitted_estimator))
    return calibrator.fit(X_cal, y_cal)


# Three candidate classifiers, all re-weighting classes via class_weight='balanced'.
lr = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
dt = DecisionTreeClassifier(max_depth=6, class_weight='balanced', random_state=42)
rf = RandomForestClassifier(n_estimators=200, max_depth=8, class_weight='balanced', random_state=42)

# Fit every candidate on the scaled training rows.
for estimator in (lr, dt, rf):
    estimator.fit(X_train_s, y_train)

# Calibrate each fitted candidate on the validation rows, which it was not trained on.
cal_lr = _calibrate_prefit(lr, X_val_s, y_val)
cal_dt = _calibrate_prefit(dt, X_val_s, y_val)
cal_rf = _calibrate_prefit(rf, X_val_s, y_val)

models = {'Logistic': cal_lr, 'DecisionTree': cal_dt, 'RandomForest': cal_rf}

# Score the calibrated models on the test rows.
# NOTE(review): if this table is also used to pick the final model, these
# test metrics stop being an unbiased estimate for the chosen model.
results = []
for name, model in models.items():
    probs = model.predict_proba(X_test_s)[:, 1]
    auc = roc_auc_score(y_test, probs)
    brier = brier_score_loss(y_test, probs)
    ap = average_precision_score(y_test, probs)
    results.append({'model': name, 'roc_auc': auc, 'brier': brier, 'avg_precision': ap})

# Highest ROC AUC first.
res_df = pd.DataFrame(results).sort_values('roc_auc', ascending=False).reset_index(drop=True)
print(res_df)


          model   roc_auc     brier  avg_precision
0      Logistic  0.999977  0.003564       0.999898
1  RandomForest  0.999751  0.003777       0.998993
2  DecisionTree  0.996619  0.004020       0.989861




In [15]:
import numpy as np

# Objects the scoring helpers below read as globals: the scaler fitted
# earlier and the feature column order used to build the model inputs.
best_scaler, feature_order = scaler, features
# Manual pick informed by res_df; change the key to use another calibrated model.
best_model = models['Logistic']

def predict_pd(input_dict_or_df):
    """Return the positive-class probability for one record or a batch.

    Reads the notebook-level ``feature_order``, ``best_scaler`` and
    ``best_model`` objects.

    Parameters
    ----------
    input_dict_or_df : dict or DataFrame-compatible object
        A dict is treated as a single record; anything else is handed to
        ``pd.DataFrame`` as-is. Columns beyond ``feature_order`` are ignored.

    Returns
    -------
    float or numpy.ndarray
        A plain float when exactly one row was scored, otherwise the array
        taken from column 1 of ``predict_proba``.

    Raises
    ------
    KeyError
        If any column named in ``feature_order`` is absent from the input.
    """
    if isinstance(input_dict_or_df, dict):
        df_in = pd.DataFrame([input_dict_or_df])
    else:
        df_in = pd.DataFrame(input_dict_or_df)

    missing = [col for col in feature_order if col not in df_in.columns]
    if missing:
        raise KeyError(f"predict_pd input is missing feature columns: {missing}")

    # Keep the DataFrame (not .values): the scaler was fitted on a DataFrame,
    # and scikit-learn warns when a transformer fitted with feature names is
    # later given a bare array.
    X_in = df_in[list(feature_order)]
    Xs = best_scaler.transform(X_in)
    probs = best_model.predict_proba(Xs)[:, 1]
    return probs if len(probs) > 1 else float(probs[0])

def expected_loss(input_features, exposure, recovery_rate=0.10):
    """Expected loss = probability of default x exposure x (1 - recovery rate).

    Parameters
    ----------
    input_features : dict or DataFrame-compatible object
        Passed unchanged to ``predict_pd``.
    exposure : number or array-like broadcastable against the predicted PDs
        Amount at risk.
    recovery_rate : float, default 0.10
        Fraction of the exposure assumed to be recovered; must lie in [0, 1].

    Returns
    -------
    float or numpy.ndarray
        Same scalar-or-array shape as the value returned by ``predict_pd``
        (after broadcasting with ``exposure``).

    Raises
    ------
    ValueError
        If ``recovery_rate`` is below 0 or above 1, which would otherwise
        silently produce inflated or negative losses.
    """
    rate = np.asarray(recovery_rate, dtype=float)
    if np.any((rate < 0.0) | (rate > 1.0)):
        raise ValueError(f"recovery_rate must be within [0, 1], got {recovery_rate!r}")

    lgd = 1.0 - recovery_rate  # loss given default
    # The same expression serves scalar and array PDs, so no type branch is needed.
    return predict_pd(input_features) * exposure * lgd

# Worked example: score one hypothetical borrower, then price an exposure of
# 10000 using the default recovery rate.
sample = {
    'credit_lines_outstanding': 1,
    'loan_amt_outstanding': 5000,
    'total_debt_outstanding': 7000,
    'income': 60000,
    'years_employed': 4,
    'fico_score': 660,
}
sample_pd = predict_pd(sample)
print("PD:", sample_pd)
sample_el = expected_loss(sample, exposure=10000)
print("Expected loss for exposure 10000:", sample_el)


PD: 2.6246917568776445e-06
Expected loss for exposure 10000: 0.023622225811898802


