Loan Approval Prediction System

In [1155]:
import pandas as pd

df = pd.read_csv('loan_prediction.csv')

In [1156]:
#creating a loan status column
import numpy as np

np.random.seed(42)

df['total_income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['income_to_loan_ratio'] = df['total_income'] / df['LoanAmount']

df['loan_status'] = (
    ((df['income_to_loan_ratio'] > 4) & (df['Credit_History'] == 1)) |
    ((df['ApplicantIncome'] > 5000) & (df['Education'] == 1))
).astype(int)

In [1157]:
print(df['loan_status'].value_counts())

loan_status
1    275
0     92
Name: count, dtype: int64


In [1158]:
#splitting data
y = df['loan_status']
X = df.drop(columns=['Loan_ID', 'loan_status'])

In [1159]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [1160]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

Creating Pipelines

In [1161]:
#null values pipeline
cat_columns = ['Gender', 'Dependents', 'Self_Employed', "Property_Area", "Married", "Education"]    #(columns that need to be converted (mode usage))
median_columns = ['LoanAmount', 'total_income', 'income_to_loan_ratio']        #(alr numbers (median usage))
mode_columns = ['Loan_Amount_Term', 'Credit_History']    #(alr numbers (mode usage))

cat_pipeline = Pipeline([       #filling the missing values and immediately encoding them
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

median_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

mode_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

#now combining them
col_transfer = ColumnTransformer([
    ("cat", cat_pipeline, cat_columns),
    ("median", median_pipeline, median_columns),
    ("mode", mode_pipeline, mode_columns),
], remainder="passthrough")

from sklearn.pipeline import make_pipeline

preprocessor = make_pipeline(
    col_transfer
)

In [1162]:
#main ML pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

lr_pipeline = ImbPipeline([
    ("transform", col_transfer),
    ("smote", SMOTE(random_state=42)),
    ("model", LogisticRegression(class_weight="balanced"))
])

dt_pipeline = ImbPipeline([
    ("transform", col_transfer),
    ("smote", SMOTE(random_state=42)),
    ("model", DecisionTreeClassifier(class_weight="balanced"))
])

rf_pipeline = ImbPipeline([
    ("transform", col_transfer),
    ("smote", SMOTE(random_state=42)),
    ("model", RandomForestClassifier(class_weight="balanced"))
])

svm_pipeline = ImbPipeline([
    ("transform", col_transfer),
    ("smote", SMOTE(random_state=42)),
    ("model", SVC(class_weight="balanced"))
])

In [1163]:
print(y_train.value_counts())
print(y_test.value_counts())


loan_status
1    220
0     73
Name: count, dtype: int64
loan_status
1    55
0    19
Name: count, dtype: int64


MODEL TRAINING

1. Logistic Regression

In [1164]:
lr_pipeline.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [1165]:
#ACCURACY

lr_pipeline.score(X_test, y_test)

0.8648648648648649

In [1166]:
#precision
from sklearn.metrics import precision_score

lr_y_pred = lr_pipeline.predict(X_test)
print("Precision: ", precision_score(y_test, lr_y_pred))

Precision:  0.8947368421052632


In [1167]:
#confusion matrix
from sklearn.metrics import confusion_matrix

print("Confusion matrix:\n", confusion_matrix(y_test, lr_y_pred))

Confusion matrix:
 [[13  6]
 [ 4 51]]


In [1168]:
from sklearn.metrics import classification_report

y_pred = lr_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.68      0.72        19
           1       0.89      0.93      0.91        55

    accuracy                           0.86        74
   macro avg       0.83      0.81      0.82        74
weighted avg       0.86      0.86      0.86        74



2. Decision Tree Classifier

In [1169]:
dt_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [1170]:
#ACCURACY

dt_pipeline.score(X_test, y_test)

0.8108108108108109

In [1171]:
#precision
dt_y_pred = dt_pipeline.predict(X_test)
print("Precision: ", precision_score(y_test, dt_y_pred))

Precision:  0.8867924528301887


In [1172]:
#confusion matrix

print("Confusion matrix:\n", confusion_matrix(y_test, dt_y_pred))

Confusion matrix:
 [[13  6]
 [ 8 47]]


In [1173]:
#classification report

dt_pred = dt_pipeline.predict(X_test)
print(classification_report(y_test, dt_pred))


              precision    recall  f1-score   support

           0       0.62      0.68      0.65        19
           1       0.89      0.85      0.87        55

    accuracy                           0.81        74
   macro avg       0.75      0.77      0.76        74
weighted avg       0.82      0.81      0.81        74



3. Random Forest Classifier

In [1174]:
rf_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [1175]:
#ACCURACY

rf_pipeline.score(X_test, y_test)

0.918918918918919

In [1176]:
#precision
rf_y_pred = rf_pipeline.predict(X_test)
print("Precision: ", precision_score(y_test, rf_y_pred))

Precision:  0.9016393442622951


In [1177]:
#confusion matrix

print("Confusion matrix:\n", confusion_matrix(y_test, rf_y_pred))

Confusion matrix:
 [[13  6]
 [ 0 55]]


In [1178]:
#classification report

rf_pred = rf_pipeline.predict(X_test)
print(classification_report(y_test, rf_pred))


              precision    recall  f1-score   support

           0       1.00      0.68      0.81        19
           1       0.90      1.00      0.95        55

    accuracy                           0.92        74
   macro avg       0.95      0.84      0.88        74
weighted avg       0.93      0.92      0.91        74



4. SVM

In [1179]:
svm_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [1180]:
#ACCURACY

svm_pipeline.score(X_test, y_test)

0.6486486486486487

In [1181]:
#precision
svm_y_pred = svm_pipeline.predict(X_test)
print("Precision: ", precision_score(y_test, svm_y_pred))

Precision:  0.7843137254901961


In [1182]:
#confusion matrix

print("Confusion matrix:\n", confusion_matrix(y_test, svm_y_pred))

Confusion matrix:
 [[ 8 11]
 [15 40]]


In [1183]:
#classification report

sv_pred = svm_pipeline.predict(X_test)
print(classification_report(y_test, sv_pred))

              precision    recall  f1-score   support

           0       0.35      0.42      0.38        19
           1       0.78      0.73      0.75        55

    accuracy                           0.65        74
   macro avg       0.57      0.57      0.57        74
weighted avg       0.67      0.65      0.66        74



Model Saving using Joblib: Random Forest Classifier as it has better accuracy and Precision score than the rest of the models

In [1184]:
import joblib

joblib.dump(rf_pipeline, 'random_forest_loan_model.pkl')

['random_forest_loan_model.pkl']

In [1185]:
best_loan_model = joblib.load('random_forest_loan_model.pkl')

In [1186]:
best_loan_model

In [1187]:
print(list(X.columns))

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'total_income', 'income_to_loan_ratio']


In [1188]:
import pandas as pd

sample = pd.DataFrame([[
    1,          # Gender
    1,          # Married
    0,          # Dependents
    1,          # Education
    1,          # Self_Employed
    9000,       # ApplicantIncome
    5000,       # CoapplicantIncome
    100,        # LoanAmount
    200,        # Loan_Amount_Term
    1,          # Credit_History
    0,          # Property_Area (assuming 0=Rural)
    6000,       # total_income
    60          # income_to_loan_ratio
]], columns=[
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 
    'Credit_History', 'Property_Area', 'total_income', 'income_to_loan_ratio'
])

pred = best_loan_model.predict(sample)
print(pred)

[1]
