In [13]:
from google.colab import files
uploaded = files.upload()


Saving Loan_default.csv to Loan_default (1).csv


Load and Clean Data

In [15]:
import pandas as pd

# Load dataset
data = pd.read_csv('Loan_default.csv')

# Identify numeric and categorical columns
numeric_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore',
                'MonthsEmployed', 'NumCreditLines', 'InterestRate',
                'LoanTerm', 'DTIRatio']

categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus',
                    'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

# Impute missing numeric columns with median
for col in numeric_cols:
    data[col].fillna(data[col].median(), inplace=True)

# Impute missing categorical columns with mode
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Drop duplicate rows
data = data.drop_duplicates()

print(f"After cleaning, data shape: {data.shape}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


After cleaning, data shape: (255347, 18)


Feature Selection and Encoding

In [16]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

target_col = 'Default'
features = numeric_cols + categorical_cols

X = data[features]
y = data[target_col]

# Map binary categorical to 0/1
binary_map = {'Yes': 1, 'No': 0}
for col in ['HasMortgage', 'HasDependents', 'HasCoSigner']:
    X[col] = X[col].map(binary_map)

# Label encode multi-class categorical features
for col in ['Education', 'EmploymentType', 'MaritalStatus', 'LoanPurpose']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Standard scale numerical features
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].map(binary_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].map(binary_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].map(binary_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

Train Test Split

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)


Handle Imbalance Data Using SMOTE

In [18]:
from imblearn.over_sampling import SMOTE
import pandas as pd

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Class distribution before SMOTE:\n", y_train.value_counts())
print("Class distribution after SMOTE:\n", pd.Series(y_train_res).value_counts())


Class distribution before SMOTE:
 Default
0    180555
1     23722
Name: count, dtype: int64
Class distribution after SMOTE:
 Default
0    180555
1    180555
Name: count, dtype: int64


Feature Engineering

In [19]:
# Create interaction feature Income * CreditScore
X_train_res['Income_CreditScore'] = X_train_res['Income'] * X_train_res['CreditScore']
X_test['Income_CreditScore'] = X_test['Income'] * X_test['CreditScore']

# Scale the new feature
X_train_res['Income_CreditScore'] = scaler.fit_transform(X_train_res[['Income_CreditScore']])
X_test['Income_CreditScore'] = scaler.transform(X_test[['Income_CreditScore']])


Train Baseline Models & Evaluate with Threshold Tuning

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_model(model, X_test, y_test, threshold=0.4):
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:,1]
    else:
        y_prob = model.decision_function(X_test)
    y_pred = (y_prob >= threshold).astype(int)
    print(f"Model: {model.__class__.__name__}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")
    print("="*60)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=150, class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_res, y_train_res)
    evaluate_model(model, X_test, y_test, threshold=0.4)


Training Logistic Regression...
Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.95      0.59      0.73     45139
           1       0.19      0.74      0.30      5931

    accuracy                           0.61     51070
   macro avg       0.57      0.66      0.52     51070
weighted avg       0.86      0.61      0.68     51070

Confusion Matrix:
 [[26615 18524]
 [ 1543  4388]]
ROC AUC: 0.7319
Training Random Forest...
Model: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.92      0.81      0.87     45139
           1       0.25      0.48      0.33      5931

    accuracy                           0.78     51070
   macro avg       0.59      0.65      0.60     51070
weighted avg       0.85      0.78      0.80     51070

Confusion Matrix:
 [[36780  8359]
 [ 3073  2858]]
ROC AUC: 0.7258
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model: XGBClassifier
              precision    recall  f1-score   support

           0       0.91      0.92      0.92     45139
           1       0.34      0.31      0.32      5931

    accuracy                           0.85     51070
   macro avg       0.62      0.61      0.62     51070
weighted avg       0.84      0.85      0.85     51070

Confusion Matrix:
 [[41589  3550]
 [ 4121  1810]]
ROC AUC: 0.7310


Model Comparison And Selection

In [21]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

def model_comparison(models, X_test, y_test, threshold=0.4):
    results = []
    for name, model in models.items():
        y_prob = model.predict_proba(X_test)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
        roc_auc = roc_auc_score(y_test, y_prob)
        results.append({'Model': name, 'Precision': precision, 'Recall': recall, 'F1-Score': f1, 'ROC AUC': roc_auc})
    return pd.DataFrame(results)

comparison_df = model_comparison(models, X_test, y_test)
print(comparison_df)

# Select best model based on business priority, e.g., highest recall or F1-score
best_model_name = comparison_df.sort_values(by='Recall', ascending=False).iloc[0]['Model']
print(f"Selected best model for tuning: {best_model_name}")

best_model = models[best_model_name]


                 Model  Precision    Recall  F1-Score   ROC AUC
0  Logistic Regression   0.191515  0.739842  0.304268  0.731859
1        Random Forest   0.254792  0.481875  0.333333  0.725783
2              XGBoost   0.337687  0.305176  0.320609  0.731021
Selected best model for tuning: Logistic Regression


Hyperparameter Tuning for Logistic Regression

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'penalty': ['l1', 'l2'],      # Regularization norms
    'solver': ['liblinear']       # Supports both l1 and l2 penalties
}

logreg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search.fit(X_train_res, y_train_res)

print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV ROC AUC:", grid_search.best_score_)

best_logreg = grid_search.best_estimator_


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV ROC AUC: 0.7947701967775918


 Evaluate the Tuned Logistic Regression on Test Data

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

best_lr = grid_search.best_estimator_

def evaluate_final(model, X_test, y_test, threshold=0.4):
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)
    print("Test Set Evaluation (Logistic Regression):")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")

evaluate_final(best_lr, X_test, y_test, threshold=0.4)


Test Set Evaluation (Logistic Regression):
              precision    recall  f1-score   support

           0       0.95      0.59      0.73     45139
           1       0.19      0.74      0.30      5931

    accuracy                           0.61     51070
   macro avg       0.57      0.66      0.52     51070
weighted avg       0.86      0.61      0.68     51070

Confusion Matrix:
 [[26616 18523]
 [ 1542  4389]]
ROC AUC: 0.7319


Train, Evaluate, and Compare Deep Learning Model

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

dl_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_res.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

dl_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

dl_model.fit(X_train_res, y_train_res,
             validation_split=0.2,
             epochs=10,
             batch_size=32,
             verbose=2)

# Evaluate DL model on test set
dl_pred_prob = dl_model.predict(X_test).ravel()
dl_pred = (dl_pred_prob >= 0.4).astype(int)
dl_accuracy = (dl_pred == y_test).mean()
dl_auc = roc_auc_score(y_test, dl_pred_prob)
print(f"DL Model - Accuracy: {dl_accuracy:.4f}, ROC AUC: {dl_auc:.4f}")
print(classification_report(y_test, dl_pred))


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


9028/9028 - 15s - 2ms/step - accuracy: 0.7264 - auc: 0.7807 - loss: 0.5402 - val_accuracy: 0.6018 - val_auc: 0.0000e+00 - val_loss: 0.6699
Epoch 2/10
9028/9028 - 14s - 2ms/step - accuracy: 0.7341 - auc: 0.7912 - loss: 0.5303 - val_accuracy: 0.6052 - val_auc: 0.0000e+00 - val_loss: 0.6746
Epoch 3/10
9028/9028 - 21s - 2ms/step - accuracy: 0.7355 - auc: 0.7933 - loss: 0.5279 - val_accuracy: 0.6152 - val_auc: 0.0000e+00 - val_loss: 0.6716
Epoch 4/10
9028/9028 - 14s - 2ms/step - accuracy: 0.7367 - auc: 0.7945 - loss: 0.5269 - val_accuracy: 0.6016 - val_auc: 0.0000e+00 - val_loss: 0.6907
Epoch 5/10
9028/9028 - 14s - 2ms/step - accuracy: 0.7371 - auc: 0.7955 - loss: 0.5256 - val_accuracy: 0.5818 - val_auc: 0.0000e+00 - val_loss: 0.7090
Epoch 6/10
9028/9028 - 14s - 2ms/step - accuracy: 0.7367 - auc: 0.7959 - loss: 0.5254 - val_accuracy: 0.5839 - val_auc: 0.0000e+00 - val_loss: 0.7132
Epoch 7/10
9028/9028 - 14s - 2ms/step - accuracy: 0.7380 - auc: 0.7967 - loss: 0.5245 - val_accuracy: 0.5783 - 

Model Performance Comparison Table

In [28]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, classification_report

# Step 1: Calculate probabilities and binary predictions for logistic regression
y_prob_lr = best_lr.predict_proba(X_test)[:, 1]
threshold = 0.4
y_pred_lr = (y_prob_lr >= threshold).astype(int)

# Step 2: Calculate similarly for deep learning model (assuming defined as dl_model)
y_prob_dl = dl_model.predict(X_test).ravel()
y_pred_dl = (y_prob_dl >= threshold).astype(int)

# Step 3: Compute metrics for logistic regression
lr_recall = recall_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_prob_lr)

# Step 4: Compute metrics for deep learning
dl_recall = recall_score(y_test, y_pred_dl)
dl_precision = precision_score(y_test, y_pred_dl)
dl_auc = roc_auc_score(y_test, y_prob_dl)

# Step 5: Create comparison dataframe
import pandas as pd

comparison_df = pd.DataFrame({
    "Model": ["Logistic Regression", "Deep Learning"],
    "ROC AUC": [lr_auc, dl_auc],
    "Recall (Default)": [lr_recall, dl_recall],
    "Precision (Default)": [lr_precision, dl_precision]
})

print(comparison_df)

# Optional: Print classification reports for detailed insights
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Deep Learning Classification Report:\n", classification_report(y_test, y_pred_dl))


[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 877us/step
                 Model   ROC AUC  Recall (Default)  Precision (Default)
0  Logistic Regression  0.731903           0.74001             0.191559
1        Deep Learning  0.737472           0.62030             0.231078
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.59      0.73     45139
           1       0.19      0.74      0.30      5931

    accuracy                           0.61     51070
   macro avg       0.57      0.66      0.52     51070
weighted avg       0.86      0.61      0.68     51070

Deep Learning Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.73      0.82     45139
           1       0.23      0.62      0.34      5931

    accuracy                           0.72     51070
   macro avg       0.58      0.67      0.58     51070
weighted avg       0.85 