In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, f1_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter


In [2]:
import kagglehub
path = kagglehub.dataset_download("nikhil1e9/loan-default")

df = pd.read_csv(f"{path}/Loan_default.csv")
df.head()

Using Colab cache for faster access to the 'loan-default' dataset.


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [3]:
print("Initial Shape:", df.shape)

Initial Shape: (255347, 18)


In [4]:
print("Duplicate Rows:", df.duplicated().sum())
df = df.drop_duplicates()

Duplicate Rows: 0


In [5]:
print("\nMissing Values:\n", df.isnull().sum())


Missing Values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64


In [6]:
if 'LoanID' in df.columns:
    df = df.drop('LoanID', axis=1)

In [7]:
print("\nCleaned Shape:", df.shape)


Cleaned Shape: (255347, 17)


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Age             255347 non-null  int64  
 1   Income          255347 non-null  int64  
 2   LoanAmount      255347 non-null  int64  
 3   CreditScore     255347 non-null  int64  
 4   MonthsEmployed  255347 non-null  int64  
 5   NumCreditLines  255347 non-null  int64  
 6   InterestRate    255347 non-null  float64
 7   LoanTerm        255347 non-null  int64  
 8   DTIRatio        255347 non-null  float64
 9   Education       255347 non-null  object 
 10  EmploymentType  255347 non-null  object 
 11  MaritalStatus   255347 non-null  object 
 12  HasMortgage     255347 non-null  object 
 13  HasDependents   255347 non-null  object 
 14  LoanPurpose     255347 non-null  object 
 15  HasCoSigner     255347 non-null  object 
 16  Default         255347 non-null  int64  
dtypes: float64

In [9]:
# Separate feature types
# num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols = ['Age','Income','LoanAmount','CreditScore','MonthsEmployed',
            'NumCreditLines','InterestRate','LoanTerm','DTIRatio']

#cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols = ['Education','EmploymentType','MaritalStatus','HasMortgage',
            'HasDependents','LoanPurpose','HasCoSigner']

target = 'Default'


print("Numeric Columns:", num_cols)
print("Categorical Columns:", cat_cols)

Numeric Columns: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
Categorical Columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']


In [10]:
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [11]:
X = df_encoded.drop(target, axis=1)
y = df_encoded[target]

# Scale numerical features
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)

Training shape: (204277, 24)
Testing shape: (51070, 24)


**Apply SMOTE (Synthetic Minority Over-sampling Technique)**

This balance the dataset and helps models learn patterns from both classes equally.

In [13]:
# Before balancing
print("Before SMOTE:", Counter(y_train))

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# After balancing
print("After SMOTE:", Counter(y_train_bal))

Before SMOTE: Counter({0: 180555, 1: 23722})
After SMOTE: Counter({0: 180555, 1: 180555})


In [14]:
log_model = LogisticRegression(random_state=42, max_iter=1000, solver='saga')
log_model.fit(X_train_bal, y_train_bal)
y_pred_log = log_model.predict(X_test)

print("\n Logistic Regression ")
print("\nAccuracy:", accuracy_score(y_test, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


 Logistic Regression 

Accuracy: 0.7034266692774623

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.72      0.81     45139
           1       0.21      0.56      0.31      5931

    accuracy                           0.70     51070
   macro avg       0.57      0.64      0.56     51070
weighted avg       0.84      0.70      0.75     51070


Confusion Matrix:
 [[32580 12559]
 [ 2587  3344]]


In [15]:
# --- Random Forest ---

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_bal, y_train_bal)
y_pred_rf = rf_model.predict(X_test)

print("\n Random Forest ")
print("\nAccuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


 Random Forest 

Accuracy: 0.8530644213824163

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.92     45139
           1       0.33      0.25      0.29      5931

    accuracy                           0.85     51070
   macro avg       0.62      0.59      0.60     51070
weighted avg       0.84      0.85      0.84     51070


Confusion Matrix:
 [[42059  3080]
 [ 4424  1507]]


In [16]:
# --- Naive Bayes ---

# Train model
nb_model = GaussianNB()
nb_model.fit(X_train_bal, y_train_bal)

# Predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluation
print(" Naive Bayes Results ")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

 Naive Bayes Results 
Accuracy: 0.6963383591149402

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.71      0.80     45139
           1       0.22      0.62      0.32      5931

    accuracy                           0.70     51070
   macro avg       0.58      0.66      0.56     51070
weighted avg       0.85      0.70      0.75     51070

Confusion Matrix:
 [[31886 13253]
 [ 2255  3676]]


In [17]:
# --- XGBoost ---

# Train the model
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(X_train_bal, y_train_bal)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate results
print("\n XGBoost Results ")
print("\nAccuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


 XGBoost Results 

Accuracy: 0.873722341883689

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93     45139
           1       0.41      0.19      0.26      5931

    accuracy                           0.87     51070
   macro avg       0.65      0.58      0.59     51070
weighted avg       0.84      0.87      0.85     51070


Confusion Matrix:
 [[43508  1631]
 [ 4818  1113]]


In [18]:

# Dictionary to store model results dynamically
results = {
    'Logistic Regression': {
        'Accuracy': accuracy_score(y_test, y_pred_log),
        'Precision': precision_score(y_test, y_pred_log),
        'Recall': recall_score(y_test, y_pred_log),
        'F1': f1_score(y_test, y_pred_log)
    },
    'Random Forest': {
        'Accuracy': accuracy_score(y_test, y_pred_rf),
        'Precision': precision_score(y_test, y_pred_rf),
        'Recall': recall_score(y_test, y_pred_rf),
        'F1': f1_score(y_test, y_pred_rf)
    },
    'Naive Bayes': {
        'Accuracy': accuracy_score(y_test, y_pred_nb),
        'Precision': precision_score(y_test, y_pred_nb),
        'Recall': recall_score(y_test, y_pred_nb),
        'F1': f1_score(y_test, y_pred_nb)
    },
    'XGBoost': {
        'Accuracy': accuracy_score(y_test, y_pred_xgb),
        'Precision': precision_score(y_test, y_pred_xgb),
        'Recall': recall_score(y_test, y_pred_xgb),
        'F1': f1_score(y_test, y_pred_xgb)
    }
}

# Convert to DataFrame
df_results = pd.DataFrame(results).T.reset_index()
df_results.rename(columns={'index': 'Model'}, inplace=True)

# Convert to percentage and round values
df_results[['Accuracy', 'Precision', 'Recall', 'F1']] = df_results[['Accuracy', 'Precision', 'Recall', 'F1']] * 100
df_results = df_results.round(2)

# Display table
print(" Model Comparison (%):")
display(df_results)


 Model Comparison (%):


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,70.34,21.03,56.38,30.63
1,Random Forest,85.31,32.85,25.41,28.66
2,Naive Bayes,69.63,21.71,61.98,32.16
3,XGBoost,87.37,40.56,18.77,25.66
