# Loan Approval Prediction

In [46]:
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
data  = pd.read_csv('LoanApprovalPrediction.csv')
data.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    object 
 2   Married            598 non-null    object 
 3   Dependents         586 non-null    float64
 4   Education          598 non-null    object 
 5   Self_Employed      598 non-null    object 
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         577 non-null    float64
 9   Loan_Amount_Term   584 non-null    float64
 10  Credit_History     549 non-null    float64
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 60.9+ KB


***Finding null values***

In [49]:
data.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           12
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       49
Property_Area         0
Loan_Status           0
dtype: int64

In [50]:
data.select_dtypes(include=['float64']).columns

Index(['Dependents', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
       'Credit_History'],
      dtype='object')

In [51]:
for i in data.select_dtypes(include=['float64']).columns:
    data[i] = data[i].mean()

In [52]:
data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    object 
 2   Married            598 non-null    object 
 3   Dependents         598 non-null    float64
 4   Education          598 non-null    object 
 5   Self_Employed      598 non-null    object 
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         598 non-null    float64
 9   Loan_Amount_Term   598 non-null    float64
 10  Credit_History     598 non-null    float64
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 60.9+ KB


In [54]:
data.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.755973,Graduate,No,5849,1631.499866,144.968804,341.917808,0.843352,Urban,Y
1,LP001003,Male,Yes,0.755973,Graduate,No,4583,1631.499866,144.968804,341.917808,0.843352,Rural,N


***Encoding*** 

In [55]:
data.select_dtypes(include=['object']).columns

Index(['Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed',
       'Property_Area', 'Loan_Status'],
      dtype='object')

In [56]:
from  sklearn.preprocessing import LabelEncoder

Encoder = LabelEncoder()
data['Gender'] = Encoder.fit_transform(data['Gender'])
data['Married'] = Encoder.fit_transform(data['Married'])
data['Education'] = Encoder.fit_transform(data['Education'])
data['Self_Employed'] = Encoder.fit_transform(data['Self_Employed'])
data['Property_Area'] = Encoder.fit_transform(data['Property_Area'])
data['Loan_Status'] = Encoder.fit_transform(data['Loan_Status'])


In [57]:
data.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0.755973,0,0,5849,1631.499866,144.968804,341.917808,0.843352,2,1
1,LP001003,1,1,0.755973,0,0,4583,1631.499866,144.968804,341.917808,0.843352,0,0


In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    int32  
 2   Married            598 non-null    int32  
 3   Dependents         598 non-null    float64
 4   Education          598 non-null    int32  
 5   Self_Employed      598 non-null    int32  
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         598 non-null    float64
 9   Loan_Amount_Term   598 non-null    float64
 10  Credit_History     598 non-null    float64
 11  Property_Area      598 non-null    int32  
 12  Loan_Status        598 non-null    int32  
dtypes: float64(5), int32(6), int64(1), object(1)
memory usage: 46.8+ KB


In [59]:
# Assuming data is your DataFrame
X = data.drop(columns=['Loan_ID', 'Loan_Status'])

In [60]:
y = data['Loan_Status']

In [61]:
X , y

(     Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
 0         1        0    0.755973          0              0             5849   
 1         1        1    0.755973          0              0             4583   
 2         1        1    0.755973          0              1             3000   
 3         1        1    0.755973          1              0             2583   
 4         1        0    0.755973          0              0             6000   
 ..      ...      ...         ...        ...            ...              ...   
 593       0        0    0.755973          0              0             2900   
 594       1        1    0.755973          0              0             4106   
 595       1        1    0.755973          0              0             8072   
 596       1        1    0.755973          0              0             7583   
 597       0        0    0.755973          0              1             4583   
 
      CoapplicantIncome  LoanAmount  L

In [62]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X , y , test_size = 0.2 , random_state = 0)

In [63]:
X_train.shape, X_test.shape

((478, 11), (120, 11))

In [64]:
y_train.shape , y_test.shape

((478,), (120,))

In [65]:
from sklearn.tree import DecisionTreeClassifier

Tree_Model = DecisionTreeClassifier()
Tree_Model.fit(X_train, y_train)

In [66]:
y_pred = Tree_Model.predict(X_test)

In [81]:
from sklearn.metrics import  accuracy_score
from sklearn.metrics import classification_report


In [84]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred) *100
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 66.66666666666666
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.38      0.39        34
           1       0.76      0.78      0.77        86

    accuracy                           0.67       120
   macro avg       0.58      0.58      0.58       120
weighted avg       0.66      0.67      0.66       120



# Hyperparameter tuning

In [86]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}

model = DecisionTreeClassifier()

GridSearch = GridSearchCV( model , param_grid= param_grid , cv=5, n_jobs=-1, verbose=2 )

GridSearch.fit(X_train, y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


1080 fits failed out of a total of 4320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mdaza\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mdaza\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\mdaza\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\mdaza\AppData\Local\Programs\Python\Python310

In [92]:
GridSearch.best_params_

{'criterion': 'gini',
 'max_depth': 40,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'splitter': 'random'}

In [94]:
GridSearch.best_estimator_

# Make predictions on the test set with the best estimator
y_pred = GridSearch.predict(X_test)

In [96]:
# Evaluate the best classifier
accuracy = accuracy_score(y_test, y_pred)*100
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 67.5
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.15      0.20        34
           1       0.72      0.88      0.80        86

    accuracy                           0.68       120
   macro avg       0.53      0.52      0.50       120
weighted avg       0.61      0.68      0.63       120

