In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split #for data splitting to train
from sklearn.pipeline import Pipeline #for processing pipeline model
from sklearn.preprocessing import StandardScaler, OneHotEncoder #for normalization of inputs and encoding categorical column
from sklearn.ensemble import RandomForestClassifier #for modelling
from sklearn.compose import ColumnTransformer #for more than one column transformation
from sklearn.metrics import roc_auc_score #for model performance evaluation
from sklearn.preprocessing import LabelEncoder #for label transformation
from sklearn import metrics #for classfication report

### Data Processing

In [2]:
#Character Encoding Detector > encoding detection to read csv file
file = "loan_approval_dataset_updated.csv"

import chardet
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(4000))
result

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [3]:
df = pd.read_csv(file, encoding='ISO-8859-1', sep=";")
df.head()

Unnamed: 0,loan_id,no_of_dependents,City,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Ankara,Graduate,No,9600000,29900000,12,778.0,2400000.0,17600000,22700000,8000000,Approved
1,2,0,Ankara,Not Graduate,Yes,4100000,12200000,8,417.0,2700000.0,2200000,8800000,3300000,Rejected
2,3,3,Ankara,Graduate,No,9100000,29700000,20,,7100000.0,4500000,33300000,12800000,Rejected
3,4,3,Ankara,Graduate,No,8200000,30700000,8,,18200000.0,3300000,23300000,7900000,Rejected
4,5,5,Ankara,Not Graduate,Yes,9800000,24200000,20,,12400000.0,8200000,29400000,5000000,Rejected


In [4]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,3885.0,4146.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,598.538996,7490207.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.283909,6515407.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,451.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,597.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,745.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [5]:
df.dtypes

loan_id                        int64
 no_of_dependents              int64
City                          object
 education                    object
 self_employed                object
 income_annum                  int64
 loan_amount                   int64
 loan_term                     int64
 cibil_score                 float64
 residential_assets_value    float64
 commercial_assets_value       int64
 luxury_assets_value           int64
 bank_asset_value              int64
 loan_status                  object
dtype: object

In [6]:
#removing whitespace from column names, to remove whitespace is default by function itself
df.rename(columns=lambda x: x.strip(), inplace=True)

In [7]:
df.isnull().sum()

loan_id                       0
no_of_dependents              0
City                        156
education                    85
self_employed                 0
income_annum                  0
loan_amount                   0
loan_term                     0
cibil_score                 384
residential_assets_value    123
commercial_assets_value       0
luxury_assets_value           0
bank_asset_value              0
loan_status                   0
dtype: int64

In [8]:
print(set(df['education']))
print(set(df['City']))
print(set(df['self_employed']))
print(set(df['loan_status']))

{nan, ' Graduate', ' Not Graduate'}
{'Ýstanbul', 'Erzurum', 'Ankara', 'Bursa', 'Ýzmir', nan}
{' Yes', ' No'}
{' Approved', ' Rejected'}


In [9]:
df = df.replace({"Ýzmir": "İzmir", "Ýstanbul": "İstanbul"})

In [10]:
print(set(df['education']))
print(set(df['City']))
print(set(df['self_employed']))
print(set(df['loan_status']))


{nan, ' Graduate', ' Not Graduate'}
{'Erzurum', 'Ankara', 'Bursa', 'İzmir', 'İstanbul', nan}
{' Yes', ' No'}
{' Approved', ' Rejected'}


In [11]:
#missing value handling
df['education'].fillna('Other', inplace= True)
df['City'].fillna('Other', inplace= True)
df['cibil_score'].fillna(df['cibil_score'].mean(), inplace= True)
df['residential_assets_value'].fillna(df['residential_assets_value'].mean(), inplace= True)

In [12]:
print(set(df['education']))
print(set(df['City']))
print(set(df['self_employed']))
print(set(df['loan_status']))

{' Graduate', ' Not Graduate', 'Other'}
{'Erzurum', 'Ankara', 'Other', 'Bursa', 'İzmir', 'İstanbul'}
{' Yes', ' No'}
{' Approved', ' Rejected'}


In [13]:
df.isnull().sum()

loan_id                     0
no_of_dependents            0
City                        0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [14]:
#removing whitespace from colums values, to remove whitespace is default by function itself
df['education'] = df['education'].apply(lambda x: x.strip())
df['self_employed'] = df['self_employed'].apply(lambda x: x.strip())
df['loan_status'] = df['loan_status'].apply(lambda x: x.strip())

In [15]:
print(set(df['education']))
print(set(df['self_employed']))
print(set(df['loan_status']))

{'Not Graduate', 'Other', 'Graduate'}
{'Yes', 'No'}
{'Rejected', 'Approved'}


In [16]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,598.538996,7490207.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,164.350914,6420837.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,465.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,598.538996,5900000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,729.0,11100000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [17]:
df

Unnamed: 0,loan_id,no_of_dependents,City,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Ankara,Graduate,No,9600000,29900000,12,778.000000,2400000.0,17600000,22700000,8000000,Approved
1,2,0,Ankara,Not Graduate,Yes,4100000,12200000,8,417.000000,2700000.0,2200000,8800000,3300000,Rejected
2,3,3,Ankara,Graduate,No,9100000,29700000,20,598.538996,7100000.0,4500000,33300000,12800000,Rejected
3,4,3,Ankara,Graduate,No,8200000,30700000,8,598.538996,18200000.0,3300000,23300000,7900000,Rejected
4,5,5,Ankara,Not Graduate,Yes,9800000,24200000,20,598.538996,12400000.0,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,İzmir,Graduate,Yes,1000000,2300000,12,317.000000,2800000.0,500000,3300000,800000,Rejected
4265,4266,0,İzmir,Not Graduate,Yes,3300000,11300000,20,559.000000,4200000.0,2900000,11000000,1900000,Approved
4266,4267,2,İzmir,Not Graduate,No,6500000,23900000,18,457.000000,1200000.0,12400000,18100000,7300000,Rejected
4267,4268,1,İzmir,Not Graduate,No,4100000,12800000,8,780.000000,8200000.0,700000,14100000,5800000,Approved


In [18]:
#eliminate key value column which is not meaningful for model development
df = df.loc[:, ~df.columns.isin(['loan_id'])]
df.shape

(4269, 13)

In [19]:
df

Unnamed: 0,no_of_dependents,City,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Ankara,Graduate,No,9600000,29900000,12,778.000000,2400000.0,17600000,22700000,8000000,Approved
1,0,Ankara,Not Graduate,Yes,4100000,12200000,8,417.000000,2700000.0,2200000,8800000,3300000,Rejected
2,3,Ankara,Graduate,No,9100000,29700000,20,598.538996,7100000.0,4500000,33300000,12800000,Rejected
3,3,Ankara,Graduate,No,8200000,30700000,8,598.538996,18200000.0,3300000,23300000,7900000,Rejected
4,5,Ankara,Not Graduate,Yes,9800000,24200000,20,598.538996,12400000.0,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,İzmir,Graduate,Yes,1000000,2300000,12,317.000000,2800000.0,500000,3300000,800000,Rejected
4265,0,İzmir,Not Graduate,Yes,3300000,11300000,20,559.000000,4200000.0,2900000,11000000,1900000,Approved
4266,2,İzmir,Not Graduate,No,6500000,23900000,18,457.000000,1200000.0,12400000,18100000,7300000,Rejected
4267,1,İzmir,Not Graduate,No,4100000,12800000,8,780.000000,8200000.0,700000,14100000,5800000,Approved


### Model development

In [20]:
# Features
X = df.iloc[:, :-1].values
print(X.shape)

(4269, 12)


In [21]:
# Output column
y = df.iloc[:, -1]
print(y.shape)
print(y[:11])

(4269,)
0     Approved
1     Rejected
2     Rejected
3     Rejected
4     Rejected
5     Rejected
6     Approved
7     Rejected
8     Approved
9     Rejected
10    Approved
Name: loan_status, dtype: object


In [22]:
# split test train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=265)

In [23]:
# pipeline
pipeline = Pipeline([
    ('ct-ohe', ColumnTransformer(
        [('ct', 
          OneHotEncoder(handle_unknown='ignore', categories='auto'), 
          [1,2,3])], remainder='passthrough') 
    ),
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier(max_depth = 3, n_estimators = 50))
])

In [24]:
#Pipeline fit
pipeline.fit(X_train, y_train)

### Model Evaluation & Classification Report

In [25]:
y_pred = pipeline.predict(X_test)

In [26]:
labelencoder_y = LabelEncoder()
y_pred = labelencoder_y.fit_transform(y_pred)

In [27]:
y_pred

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,

In [28]:
roc_auc_score(y_test, y_pred)

0.9040023043570542

In [29]:
print(metrics.classification_report(labelencoder_y.fit_transform(y_test), y_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       559
           1       0.98      0.82      0.89       295

    accuracy                           0.93       854
   macro avg       0.95      0.90      0.92       854
weighted avg       0.93      0.93      0.93       854



### User Input Prediction Simulation

In [30]:
#Testing before deployment. It is simulation of a users raw data posting. It will be processed within pipeline model
from_user_input = np.array([[4,'Ankara','Not Graduate', 'No', 60000, 1250000, 16, 423, 4500000, 18000000, 45000000, 34000000]])
from_user_input2 = np.array([[4,'Ankara','Graduate', 'Yes', 60000, 10000, 16, 650, 4500000, 70000, 45000000, 6900000]])

In [31]:
print(pipeline.predict_proba(from_user_input))
print(pipeline.predict_proba(from_user_input2))

[[0.3121394 0.6878606]]
[[0.76784763 0.23215237]]


In [32]:
print(pipeline.predict(from_user_input))
print(pipeline.predict(from_user_input2))

['Rejected']
['Approved']


### Model Dump

In [33]:
import joblib
#Model dump
joblib.dump(pipeline, "loan_approval_random_forest_model.pkl")

['loan_approval_random_forest_model.pkl']

In [34]:
#Model load
pipeline_loaded = joblib.load("loan_approval_random_forest_model.pkl")
print(type(pipeline_loaded))

<class 'sklearn.pipeline.Pipeline'>


In [35]:
#Prediction on loaded model
print(pipeline_loaded.predict_proba(from_user_input))
print(pipeline_loaded.predict_proba(from_user_input2))

[[0.3121394 0.6878606]]
[[0.76784763 0.23215237]]
