In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Read the raw loan records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='loan_data.csv')

In [3]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,75000.5,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,43301.414527,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.75,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [4]:
# Print column names, non-null counts and dtypes to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 11  

# DATA CLEANING

In [5]:
# Remove the leftover 'Unnamed: 0' column (a row counter carried in from the CSV).
# Re-assigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time is a no-op rather than
# a KeyError for the already-removed column.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

##### Handling null values

In [6]:
# Count the missing entries in every column.
data.isna().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [7]:
# Fill missing dependent counts with the column median.
dependents_median = data['NumberOfDependents'].median()
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(dependents_median)

In [8]:
# Fill missing monthly incomes with the column median.
income_median = data['MonthlyIncome'].median()
data['MonthlyIncome'] = data['MonthlyIncome'].fillna(income_median)

In [9]:
# Keep only the rows whose age is strictly positive.
data = data.loc[data['age'].gt(0)]

In [10]:
# Keep only the rows whose revolving utilization is at most 1.0.
data = data.loc[data['RevolvingUtilizationOfUnsecuredLines'].le(1.0)]

In [11]:
# Keep only the rows with a strictly positive monthly income.
data = data.loc[data['MonthlyIncome'].gt(0)]

In [12]:
# Keep only the rows whose debt ratio is at most 10.
data = data.loc[data['DebtRatio'].le(10)]

##### Training model

In [13]:
# Separate the label column from the predictor columns.
X = data.drop(columns='SeriousDlqin2yrs')  # features: every column except the label
y = data.loc[:, 'SeriousDlqin2yrs']  # target

In [14]:
# Hold out 20% of the rows for evaluation. stratify=y keeps the share of the
# rare positive class the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the real training rows only and reuse it for the test set.
# Scaling happens BEFORE SMOTE because SMOTE interpolates between nearest
# neighbours: on raw features the distances would be dominated by
# large-magnitude columns such as MonthlyIncome, and fitting the scaler after
# resampling would base its statistics on synthetic rows.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the minority class because the classes are imbalanced.
# Only the training partition is resampled; the test set stays untouched.
smote = SMOTE(random_state=42)
X_train_scaled, y_train_smote = smote.fit_resample(X_train_std, y_train)

# Resampled training features mapped back to their original units, kept under
# the original variable name for any later cell that still reads it.
X_train_smote = pd.DataFrame(
    scaler.inverse_transform(X_train_scaled), columns=X_train.columns
)



In [15]:
# Logistic regression on the scaled, resampled training data.
# The 'saga' solver shuffles the data while optimising, so random_state is
# pinned (matching the seed used for the split and SMOTE) to make the fitted
# coefficients reproducible across Restart & Run All.
model = LogisticRegression(max_iter=5000, solver='saga', random_state=42)
model.fit(X_train_scaled, y_train_smote)



In [21]:
# Predict on the held-out test set rather than on the SMOTE-resampled training
# matrix: scoring rows the model was fit on (many of them synthetic) says
# nothing about how it generalises.
pred = model.predict(X_test_scaled)

# Confusion table of actual vs. predicted labels on the test set.
pd.crosstab(y_test.to_numpy(), pred, rownames=['actual'], colnames=['predicted'])

In [16]:
# Two hand-written example applicants to score with the fitted model.
# NOTE(review): applicant 2's DebtRatio of 45.0 is above the `DebtRatio <= 10`
# cut-off applied to the training data, so that prediction is an extrapolation
# outside the range the model saw — confirm this is intended.
new_applicants = pd.DataFrame({
    'RevolvingUtilizationOfUnsecuredLines': [0.15, 0.8],
    'age': [45, 33],
    'NumberOfTime30-59DaysPastDueNotWorse': [0, 1],
    'DebtRatio': [0.3, 45.0],
    'MonthlyIncome': [5500, 3000],
    'NumberOfOpenCreditLinesAndLoans': [6, 8],
    'NumberOfTimes90DaysLate': [0, 2],
    'NumberRealEstateLoansOrLines': [1, 0],
    'NumberOfTime60-89DaysPastDueNotWorse': [0, 1],
    'NumberOfDependents': [2, 0]
})

# Put the columns in exactly the order of the training features; a missing
# column raises a KeyError here instead of silently mis-aligning features.
new_applicants = new_applicants[X.columns]

# Apply the scaler fitted on the training data, then classify.
new_applicants_scaled = scaler.transform(new_applicants)

predictions = model.predict(new_applicants_scaled)

# The loop variable is deliberately not called `pred`, so the notebook-level
# `pred` array produced by an earlier cell is not overwritten.
for applicant_no, label in enumerate(predictions, start=1):
    print(f"Applicant {applicant_no} loan default prediction: {'Default' if label == 1 else 'No Default'}")


Applicant 1 loan default prediction: No Default
Applicant 2 loan default prediction: Default
