Loading Dataset

In [72]:
import pandas as pd
df=pd.read_csv('test_lending_club.csv')

df.head()

Unnamed: 0,issue_d,sub_grade,term,home_ownership,fico_range_low,total_acc,pub_rec,revol_util,annual_inc,int_rate,...,pub_rec_bankruptcies,addr_state,initial_list_status,fico_range_high,revol_bal,id,open_acc,emp_length,loan_status,time_to_earliest_cr_line
0,2016-07-01,A4,36 months,MORTGAGE,830.0,13.0,0.0,12.0,105682.0,7.99,...,0.0,TX,w,834.0,4266.0,84759443,5.0,11.0,1.0,789004.8
1,2016-07-01,B5,36 months,RENT,660.0,25.0,0.0,59.4,68000.0,11.49,...,0.0,CA,w,664.0,6944.0,84433407,11.0,6.0,1.0,294624.0
2,2016-07-01,D2,36 months,MORTGAGE,660.0,17.0,1.0,40.9,75000.0,17.99,...,1.0,AZ,w,664.0,5528.0,84646007,6.0,3.0,1.0,349747.2
3,2016-07-01,A1,36 months,RENT,740.0,36.0,0.0,27.7,114000.0,5.32,...,0.0,CA,w,744.0,25912.0,84477434,16.0,1.0,1.0,691632.0
4,2016-07-01,C4,60 months,MORTGAGE,680.0,14.0,0.0,44.3,47000.0,14.49,...,0.0,OH,w,684.0,4205.0,84525798,12.0,2.0,0.0,591667.2


Step 1 : Preprocess the data, handling missing values and class imbalance using techniques like SMOTE.

In [73]:
target = 'loan_status'

In [74]:
#seperate features and target

X = df.drop(columns=[target])
y = df[target]

In [75]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

#For numeric columns, fill with median
num_cols = X.select_dtypes(include=np.number).columns
imputer_num = SimpleImputer(strategy='median')
X[num_cols] = imputer_num.fit_transform(X[num_cols])

In [76]:
#For categorical columns, fill with mode
cat_cols = X.select_dtypes(include='object').columns
imputer_cat = SimpleImputer(strategy='most_frequent')
X[cat_cols] = imputer_cat.fit_transform(X[cat_cols])

In [77]:
# Encode categorical variables
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

In [78]:
nan_rows_in_y = y.isnull()
X = X[~nan_rows_in_y]
y = y[~nan_rows_in_y]

In [79]:
#Traintest
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [68]:
#Handle class imbalance with SMOTE
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_res).value_counts())

Before SMOTE: loan_status
1.0    63087
0.0    11798
Name: count, dtype: int64
After SMOTE: loan_status
1.0    63087
0.0    63087
Name: count, dtype: int64


Step 2 : Training LightGBM

In [69]:
from lightgbm import LGBMClassifier
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train_res, y_train_res)
lgb_preds = lgb_model.predict(X_test)

[LightGBM] [Info] Number of positive: 63087, number of negative: 63087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4473
[LightGBM] [Info] Number of data points in the train set: 126174, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Step 3 : Classificaion Report

In [70]:
from sklearn.metrics import classification_report, confusion_matrix

print("=== LightGBM Model ===")
print(classification_report(y_test, lgb_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, lgb_preds))

=== LightGBM Model ===
              precision    recall  f1-score   support

         0.0       0.64      0.40      0.49      2950
         1.0       0.90      0.96      0.93     15772

    accuracy                           0.87     18722
   macro avg       0.77      0.68      0.71     18722
weighted avg       0.85      0.87      0.86     18722

Confusion Matrix:
 [[ 1179  1771]
 [  664 15108]]


Step 4 : Recommendations

In [71]:
print("\n--- Recommendations for Lenders ---")
print("""
1. Focus on applicants flagged as high risk (predicted default) for further manual review.
2. Use financial indicators like debt-to-income ratio, credit history, and loan amount to refine lending decisions.
3. Consider tighter credit limits or higher interest rates for borderline risk applicants.
4. Use the model predictions to prioritize customer support and early intervention to reduce defaults.
""")


--- Recommendations for Lenders ---

1. Focus on applicants flagged as high risk (predicted default) for further manual review.
2. Use financial indicators like debt-to-income ratio, credit history, and loan amount to refine lending decisions.
3. Consider tighter credit limits or higher interest rates for borderline risk applicants.
4. Use the model predictions to prioritize customer support and early intervention to reduce defaults.



Before SMOTE: loan_status
1.0    12090
0.0     1634
Name: count, dtype: int64
After SMOTE: loan_status
0.0    12090
1.0    12090
Name: count, dtype: int64
[LightGBM] [Info] Number of positive: 12090, number of negative: 12090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4448
[LightGBM] [Info] Number of data points in the train set: 24180, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
=== LightGBM Model ===
              precision    recall  f1-score   support

         0.0       0.72      0.61      0.66       409
         1.0       0.95      0.97      0.96      3023

    accuracy                           0.93      3432
   macro avg       0.83      0.79      0.81      3432
weighted avg       0.92      0.93      0.92      3432

Confusion Matrix:
 [[ 250  159]
 [  98 2925]]

--- Recommendations for Lenders ---

1. Focus on applicants flagged as high risk (predicted default) for further manual review.
2. Use financial indicators like debt-to-income ratio, credit history, and loan amount to refine lending decisions.
3. Consider tighter credit limits or higher interest rates for borderline risk applicants.
4. Use the model predictions to prioritize customer support and early intervention to reduce defaults.
