# Loan Prediction Status Using Machine Learning 

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('loan_prediction.csv')

In [3]:
data.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


# Loan_ID : Unique Loan ID

# Gender : Male/ Female

# Married : Applicant married (Y/N)

# Dependents : Number of dependents

# Education : Applicant Education (Graduate/ Under Graduate)

# Self_Employed : Self employed (Y/N)

# ApplicantIncome : Applicant income

# CoapplicantIncome : Coapplicant income

# LoanAmount : Loan amount in thousands of dollars

# Loan_Amount_Term : Term of loan in months

# Credit_History : Credit history meets guidelines yes or no

# Property_Area : Urban/ Semi Urban/ Rural

# Loan_Status : Loan approved (Y/N) this is the target variable



# Find Shape of Our Dataset (Number of Rows And Number of Columns)


In [4]:
data.shape

(614, 13)

In [5]:
print("Number of Rows",data.shape[0])
print("Number of Columns",data.shape[1])


Number of Rows 614
Number of Columns 13


#  Missing Values Rectification

In [6]:
data = data.drop('Loan_ID',axis=1)

In [7]:
data.head(1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y


In [8]:
columns = ['Gender','Dependents','LoanAmount','Loan_Amount_Term']

In [9]:
columns

['Gender', 'Dependents', 'LoanAmount', 'Loan_Amount_Term']

In [10]:
data = data.dropna(subset=columns)

In [11]:
data.isnull().sum()*100 / len(data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [12]:
data['Self_Employed'].mode()[0]

'No'

In [13]:
data['Self_Employed'] =data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])

In [14]:
data.isnull().sum()*100 / len(data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        0.000000
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [15]:
data['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [16]:
data['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [17]:
data['Credit_History'].mode()[0]



1.0

In [18]:
data['Credit_History'] =data['Credit_History'].fillna(data['Credit_History'].mode()[0])

In [19]:
data.isnull().sum()*100 / len(data)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

# Handling Categorical Columns

In [20]:
data.sample(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
288,Female,No,0,Graduate,No,4124,0.0,115.0,360.0,1.0,Semiurban,Y
533,Male,No,1,Graduate,No,11250,0.0,196.0,360.0,1.0,Semiurban,N
471,Male,Yes,1,Not Graduate,No,2653,1500.0,113.0,180.0,0.0,Rural,N
268,Female,No,0,Graduate,No,3418,0.0,135.0,360.0,1.0,Rural,N
570,Male,Yes,1,Graduate,No,3417,1750.0,186.0,360.0,1.0,Urban,Y


In [21]:
data['Dependents'] =data['Dependents'].replace(to_replace="3+",value='4')

In [22]:
data['Dependents'].unique()

array(['1', '0', '2', '4'], dtype=object)

In [23]:
data['Loan_Status'].unique()

array(['N', 'Y'], dtype=object)

In [24]:
data['Gender'] = data['Gender'].map({'Male':1,'Female':0}).astype('int')
data['Married'] = data['Married'].map({'Yes':1,'No':0}).astype('int')
data['Education'] = data['Education'].map({'Graduate':1,'Not Graduate':0}).astype('int')
data['Self_Employed'] = data['Self_Employed'].map({'Yes':1,'No':0}).astype('int')
data['Property_Area'] = data['Property_Area'].map({'Rural':0,'Semiurban':2,'Urban':1}).astype('int')
data['Loan_Status'] = data['Loan_Status'].map({'Y':1,'N':0}).astype('int')


In [25]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


# Store Feature Matrix in X And Response (Target) in Vector y

In [26]:
X = data.drop('Loan_Status',axis=1)

y = data['Loan_Status']


# working on Feature scaling to streamline data

In [27]:
cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']

In [28]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X[cols]=st.fit_transform(X[cols])


# Splitting The Dataset into (The Training Set) And (Test Set) & Applying K-Fold Cross Validation

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np


In [30]:

model_df={}
def model_val(model,X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,
                                                   test_size=0.20,
                                                   random_state=42)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(f"{model} accuracy is {accuracy_score(y_test,y_pred)}")
    
    score = cross_val_score(model,X,y,cv=5)
    print(f"{model} Avg cross val score is {np.mean(score)}")
    model_df[model]=round(np.mean(score)*100,2)
    



In [31]:
model_df

{}

# Models Evaluation

# LogisticRegression

In [32]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val(model,X,y)


LogisticRegression() accuracy is 0.8018018018018018
LogisticRegression() Avg cross val score is 0.8047829647829647


# SVC

In [33]:
from sklearn import svm
model = svm.SVC()
model_val(model,X,y)


SVC() accuracy is 0.7927927927927928
SVC() Avg cross val score is 0.7938902538902539


# DecisionTreeClassifier

In [34]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_val(model,X,y)


DecisionTreeClassifier() accuracy is 0.7477477477477478
DecisionTreeClassifier() Avg cross val score is 0.7198034398034397


# RandomForestClassifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
model =RandomForestClassifier()
model_val(model,X,y)


RandomForestClassifier() accuracy is 0.7837837837837838
RandomForestClassifier() Avg cross val score is 0.7866830466830467


# GradientBoostingClassifier

In [36]:
from sklearn.ensemble import GradientBoostingClassifier
model =GradientBoostingClassifier()
model_val(model,X,y)


GradientBoostingClassifier() accuracy is 0.7927927927927928
GradientBoostingClassifier() Avg cross val score is 0.7758067158067158


# Evaluate the Used Model  with  Hyperparameter Tuning

In [37]:
from sklearn.model_selection import RandomizedSearchCV

In [38]:
log_reg_grid={"C":np.logspace(-4,4,20),
             "solver":['liblinear']}


In [39]:
rs_log_reg=RandomizedSearchCV(LogisticRegression(),
                   param_distributions=log_reg_grid,
                  n_iter=20,cv=5,verbose=True)


In [40]:
rs_log_reg.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [42]:
rs_log_reg.best_score_

0.8047829647829647

In [43]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 0.23357214690901212}

# Hyperparameter forSVC

In [44]:
svc_grid = {'C':[0.25,0.50,0.75,1],"kernel":["linear"]}

In [46]:
rs_svc=RandomizedSearchCV(svm.SVC(),
                  param_distributions=svc_grid,
                   cv=5,
                   n_iter=20,
                  verbose=True)


In [47]:
rs_svc.fit(X,y)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [48]:
rs_svc.best_score_

0.8066011466011467

In [50]:
rs_svc.best_params_

{'kernel': 'linear', 'C': 0.25}

# Hyperparameter for Random Forest Classifier

In [51]:
RandomForestClassifier()

In [52]:
rf_grid={'n_estimators':np.arange(10,1000,10),
  'max_features':['auto','sqrt'],
 'max_depth':[None,3,5,10,20,30],
 'min_samples_split':[2,5,20,50,100],
 'min_samples_leaf':[1,2,5,10]
 }


In [53]:

rs_rf=RandomizedSearchCV(RandomForestClassifier(),
                  param_distributions=rf_grid,
                   cv=5,
                   n_iter=20,
                  verbose=True)


In [54]:
rs_rf.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


55 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\welcome\AppData\Local\Programs\Python\Pytho

In [55]:

rs_rf.best_score_


0.8066175266175266

In [56]:
rs_rf.best_params_

{'n_estimators': 720,
 'min_samples_split': 50,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 20}

# Save the Model

In [57]:
X = data.drop('Loan_Status',axis=1)
y = data['Loan_Status']


In [58]:
rf = RandomForestClassifier(n_estimators=270,
 min_samples_split=5,
 min_samples_leaf=5,
 max_features='sqrt',
 max_depth=5)


In [59]:
rf.fit(X,y)

In [60]:
import joblib

In [61]:

joblib.dump(rf,'loan_status_predict')


['loan_status_predict']

In [62]:
model = joblib.load('loan_status_predict')

In [63]:
import pandas as pd

In [64]:
df = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':2889,
    'CoapplicantIncome':0.0,
    'LoanAmount':45,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':1
},index=[0])


In [65]:
result = model.predict(df)

In [67]:
if result==1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

Loan Not Approved


# GUI for Loan Prediction

In [None]:
from tkinter import *
import joblib
import pandas as pd

def show_entry():
    # Retrieve user inputs from the entry widgets
    p1 = float(e1.get())
    p2 = float(e2.get())
    p3 = float(e3.get())
    p4 = float(e4.get())
    p5 = float(e5.get())
    p6 = float(e6.get())
    p7 = float(e7.get())
    p8 = float(e8.get())
    p9 = float(e9.get())
    p10 = float(e10.get())
    p11 = float(e11.get())
    
    # Load the pre-trained model
    model = joblib.load('loan_status_predict')
    
    # Create a DataFrame from the input values
    df = pd.DataFrame({
        'Gender': p1,
        'Married': p2,
        'Dependents': p3,
        'Education': p4,
        'Self_Employed': p5,
        'ApplicantIncome': p6,
        'CoapplicantIncome': p7,
        'LoanAmount': p8,
        'Loan_Amount_Term': p9,
        'Credit_History': p10,
        'Property_Area': p11
    }, index=[0])
    
    # Predict the loan status
    result = model.predict(df)
    
    # Display the prediction result in the GUI
    if result == 1:
        Label(master, text="Loan Approved", fg="green").grid(row=31)
    else:
        Label(master, text="Loan Not Approved", fg="red").grid(row=31)
        
# Create the main window
master = Tk()
master.title("Loan Status Prediction Using Machine Learning")

# Create and place the labels and entry widgets
Label(master, text="Loan Status Prediction", bg="black", fg="white").grid(row=0, columnspan=2)

Label(master, text="Gender [1:Male, 0:Female]").grid(row=1)
Label(master, text="Married [1:Yes, 0:No]").grid(row=2)
Label(master, text="Dependents [1, 2, 3, 4]").grid(row=3)
Label(master, text="Education").grid(row=4)
Label(master, text="Self_Employed").grid(row=5)
Label(master, text="ApplicantIncome").grid(row=6)
Label(master, text="CoapplicantIncome").grid(row=7)
Label(master, text="LoanAmount").grid(row=8)
Label(master, text="Loan_Amount_Term").grid(row=9)
Label(master, text="Credit_History").grid(row=10)
Label(master, text="Property_Area").grid(row=11)

# Create and place the entry widgets for user input
e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)
e10 = Entry(master)
e11 = Entry(master)

e1.grid(row=1, column=1)
e2.grid(row=2, column=1)
e3.grid(row=3, column=1)
e4.grid(row=4, column=1)
e5.grid(row=5, column=1)
e6.grid(row=6, column=1)
e7.grid(row=7, column=1)
e8.grid(row=8, column=1)
e9.grid(row=9, column=1)
e10.grid(row=10, column=1)
e11.grid(row=11, column=1)

# Create and place the predict button
Button(master, text="Predict", command=show_entry).grid(row=30, columnspan=2)

# Start the main event loop
mainloop()
