In [2]:
# Importing libraries
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import tensorflow as tf

from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Load loan_approval_dataset.csv and discard the 'loan_id' column (an identifier,
# not a row) in a single chained step, then preview the first rows.
loan_approval_df = (
    pd.read_csv('Resources/loan_approval_dataset.csv')
    .drop(columns='loan_id')
)
loan_approval_df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# Checking the exact column labels - needed because selecting the X and y
# columns by name was causing issues.
loan_approval_df.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [5]:
# Separate the data: X = features, y = target variable.
# The target label carries a leading space, exactly as it appears in the frame.
## Analysis/code obtained from class activity: 20-Supervised-Learning\2\Activities\08-Stu_Predicting_Bank_Customers
X = loan_approval_df.drop(columns=[' loan_status'])
y = loan_approval_df.loc[:, ' loan_status']
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000


In [6]:
# One-hot encode the categorical feature columns with pandas' get_dummies,
# then preview the encoded feature matrix.
X = pd.get_dummies(data=X)
X.head()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_ Graduate,education_ Not Graduate,self_employed_ No,self_employed_ Yes
0,2,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1,0,1,0
1,0,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0,1,0,1
2,3,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1,0,1,0
3,3,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1,0,1,0
4,5,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0,1,0,1


In [7]:
# Split features and target into training and testing subsets.
# No test_size/train_size is passed, so scikit-learn's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,  # fixed seed so the split is reproducible
)

In [8]:
# Standardise the features with StandardScaler.
# The scaler is fitted on the training subset only; the testing subset is
# transformed with the statistics learned from the training data.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing subsets, in that order.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(subset) for subset in (X_train, X_test)
)


In [9]:
# K-nearest-neighbours classifier with n_neighbors = 3
# (other n_neighbors values are to be checked later).
K_Neighbors = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier on the scaled training features and training labels.
K_Neighbors.fit(X=X_train_scaled, y=y_train)

In [10]:
# Predict labels for the scaled testing features with the fitted KNN model.
y_predictions = K_Neighbors.predict(X=X_test_scaled)

In [11]:
# Classification report for the KNN model: true testing labels vs. model predictions.
print(classification_report(y_true=y_test, y_pred=y_predictions))

              precision    recall  f1-score   support

    Approved       0.91      0.91      0.91       660
    Rejected       0.85      0.86      0.85       408

    accuracy                           0.89      1068
   macro avg       0.88      0.88      0.88      1068
weighted avg       0.89      0.89      0.89      1068



In [13]:
## Analysis/code obtained from class activity: 20-Supervised-Learning\2\Activities\06-Stu_Predicting_Default

## Random Forest Model:
# Create the random forest classifier instance: 100 trees, with a fixed
# random_state so the fitted forest is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=10)

# Fit the model on the scaled training features.
# y_train is a 1-D pandas Series; Series.ravel() is deprecated since pandas 2.2
# (the FutureWarning is hidden by the filter set in the imports cell), so the
# labels are passed as a NumPy array via .to_numpy() instead.
rf_model = rf_model.fit(X_train_scaled, y_train.to_numpy())

In [14]:
# Predict labels for the scaled testing features with the fitted random forest.
predictions = rf_model.predict(X=X_test_scaled)

In [15]:
# Model evaluation: evaluate the random forest results.

# Calculating the confusion matrix.
# The class order is passed explicitly so each row/column can be labelled with
# the real class name instead of a positional "0"/"1" placeholder; it also keeps
# the frame the right shape however many classes the model learned.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Strip surrounding whitespace from the label text for display only.
class_names = [str(label).strip() for label in class_labels]
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {name}" for name in class_names],
    columns=[f"Predicted {name}" for name in class_names],
)

# Calculating the accuracy score (fraction of testing rows predicted correctly).
acc_score = accuracy_score(y_test, predictions)

In [16]:
# Show the random forest evaluation: the confusion-matrix table first, then the
# accuracy score and the classification report, one item per line.
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,650,10
Actual 1,10,398


Accuracy Score : 0.9812734082397003
Classification Report
              precision    recall  f1-score   support

    Approved       0.98      0.98      0.98       660
    Rejected       0.98      0.98      0.98       408

    accuracy                           0.98      1068
   macro avg       0.98      0.98      0.98      1068
weighted avg       0.98      0.98      0.98      1068



In [18]:
# Feature importance array from the fitted random forest.
importances = rf_model.feature_importances_

# Pair each importance with its column name, rank from highest to lowest,
# and list the top 5 most important features.
importances_sorted = list(zip(importances, X.columns))
importances_sorted.sort(reverse=True)
importances_sorted[:5]

[(0.804187775308797, ' cibil_score'),
 (0.05488989556866576, ' loan_term'),
 (0.028911293901394956, ' loan_amount'),
 (0.02047096864406974, ' luxury_assets_value'),
 (0.01910773329460537, ' residential_assets_value')]