##### Importing required Packages

In [5]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, accuracy_score
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

import xgboost as xgb # pip install xgboost
from xgboost import XGBClassifier

%matplotlib inline

plt.style.use('fast')
sns.set_style('whitegrid')

##### Reading Dataset

In [6]:
data = pd.read_excel(r'./Bank_Loan.xlsx')

##### Data Preprocess

In [7]:
data.head()

Unnamed: 0,ID,Gender,City,Monthly_Income,DOB,Lead_Creation_Date,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Employer_Name,...,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Filled_Form,Device_Type,Var2,Source,Var4,LoggedIn,Disbursed
0,ID000002C20,Female,Delhi,20000,1978-05-23,2015-05-15,300000.0,5.0,0.0,CYBOSOL,...,,,,N,Web-browser,G,S122,1,0,0
1,ID000004E40,Male,Mumbai,35000,1985-10-07,2015-05-04,200000.0,2.0,0.0,TATA CONSULTANCY SERVICES LTD (TCS),...,13.25,,6762.9,N,Web-browser,G,S122,3,0,0
2,ID000007H20,Male,Panchkula,22500,1981-10-10,2015-05-19,600000.0,4.0,0.0,ALCHEMIST HOSPITALS LTD,...,,,,N,Web-browser,B,S143,1,0,0
3,ID000008I30,Male,Saharsa,35000,1987-11-30,2015-05-09,1000000.0,5.0,0.0,BIHAR GOVERNMENT,...,,,,N,Web-browser,B,S143,3,0,0
4,ID000009J40,Male,Bengaluru,100000,1984-02-17,2015-05-20,500000.0,2.0,25000.0,GLOBAL EDGE SOFTWARE,...,,,,N,Web-browser,B,S134,3,1,0


In [15]:
data.shape

(87020, 26)

In [8]:
# check for missing values

data.isnull().sum()

ID                           0
Gender                       0
City                      1003
Monthly_Income               0
DOB                          0
Lead_Creation_Date           0
Loan_Amount_Applied         71
Loan_Tenure_Applied         71
Existing_EMI                71
Employer_Name               71
Salary_Account           11764
Mobile_Verified              0
Var5                         0
Var1                         0
Loan_Amount_Submitted    34613
Loan_Tenure_Submitted    34613
Interest_Rate            59294
Processing_Fee           59600
EMI_Loan_Submitted       59294
Filled_Form                  0
Device_Type                  0
Var2                         0
Source                       0
Var4                         0
LoggedIn                     0
Disbursed                    0
dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87020 entries, 0 to 87019
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ID                     87020 non-null  object        
 1   Gender                 87020 non-null  object        
 2   City                   86017 non-null  object        
 3   Monthly_Income         87020 non-null  int64         
 4   DOB                    87020 non-null  datetime64[ns]
 5   Lead_Creation_Date     87020 non-null  datetime64[ns]
 6   Loan_Amount_Applied    86949 non-null  float64       
 7   Loan_Tenure_Applied    86949 non-null  float64       
 8   Existing_EMI           86949 non-null  float64       
 9   Employer_Name          86949 non-null  object        
 10  Salary_Account         75256 non-null  object        
 11  Mobile_Verified        87020 non-null  object        
 12  Var5                   87020 non-null  int64         
 13  V

In [13]:
categorical_col = data.columns[np.where(data.dtypes=='object')]
for col in categorical_col:
    print(col)
    print(data[col].nunique())

ID
87020
Gender
2
City
697
Employer_Name
43567
Salary_Account
57
Mobile_Verified
2
Var1
19
Filled_Form
2
Device_Type
2
Var2
7
Source
30


In [9]:
# ID - Uniqie ID can be droped
# City      - too many unique values - DROP
# DOB       - Calc the age of the cutomer and DROP
# EMI_Loan_Submitted  - more than 50% values are missing
# Employer_Name - too many unique values - DROP
# Existing_EMI  - very few (111) missing, check distribution and replace with mean/median
# Interest_Rate - more than 50% values are missing
# Lead_Creation_Date - too many unique values - DROP
# Loan_Amount_Applied  - very few (111) missing, check distribution and replace with mean/median
# Loan_Amount_Submitted - 40% missing
# Loan_Tenure_Applied  - very few (111) missing, check distribution and replace with mean/median
# Loan_Tenure_Submitted - 40% missing
# LoggedIn - not sure, can be excluded
# Processing_Fee - more than 50% values are missing
# Salary_Account - too many unique values - DROP

In [16]:
 #too many unique values - DROP

drop_list = ['ID','City', 'Employer_Name', 'Lead_Creation_Date', 'Salary_Account','Interest_Rate','Processing_Fee','EMI_Loan_Submitted','Loan_Amount_Submitted','Loan_Tenure_Submitted']

data1 = data.drop(drop_list, axis=1)

In [17]:
data1.isnull().sum()

Gender                  0
Monthly_Income          0
DOB                     0
Loan_Amount_Applied    71
Loan_Tenure_Applied    71
Existing_EMI           71
Mobile_Verified         0
Var5                    0
Var1                    0
Filled_Form             0
Device_Type             0
Var2                    0
Source                  0
Var4                    0
LoggedIn                0
Disbursed               0
dtype: int64

In [20]:
pd.set_option('display.float_format', lambda x: '%.3f' %x)
data1.describe()

Unnamed: 0,Monthly_Income,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Var5,Var4,LoggedIn,Disbursed
count,87020.0,86949.0,86949.0,86949.0,87020.0,87020.0,87020.0,87020.0
mean,58849.974,230250.7,2.131,3696.228,4.962,2.95,0.029,0.015
std,2177511.361,354206.759,2.014,39810.212,5.67,1.698,0.169,0.12
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16500.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,25000.0,100000.0,2.0,0.0,2.0,3.0,0.0,0.0
75%,40000.0,300000.0,4.0,3500.0,11.0,5.0,0.0,0.0
max,444554443.0,10000000.0,10.0,10000000.0,18.0,7.0,1.0,1.0


##### Replacing the missing values with mean of the columns

In [21]:
data1['Loan_Amount_Applied'].fillna(data1['Loan_Amount_Applied'].mean(), inplace=True)
data1['Loan_Tenure_Applied'].fillna(data1['Loan_Tenure_Applied'].mean(), inplace=True)
data1['Existing_EMI'].fillna(data1['Existing_EMI'].mean(), inplace=True)

In [22]:
data1.isnull().sum()

Gender                 0
Monthly_Income         0
DOB                    0
Loan_Amount_Applied    0
Loan_Tenure_Applied    0
Existing_EMI           0
Mobile_Verified        0
Var5                   0
Var1                   0
Filled_Form            0
Device_Type            0
Var2                   0
Source                 0
Var4                   0
LoggedIn               0
Disbursed              0
dtype: int64

In [23]:
data1.dtypes

Gender                         object
Monthly_Income                  int64
DOB                    datetime64[ns]
Loan_Amount_Applied           float64
Loan_Tenure_Applied           float64
Existing_EMI                  float64
Mobile_Verified                object
Var5                            int64
Var1                           object
Filled_Form                    object
Device_Type                    object
Var2                           object
Source                         object
Var4                            int64
LoggedIn                        int64
Disbursed                       int64
dtype: object

##### Binning or Grouping of categorical values based on their frequency distribution

In [25]:
categorical_col = data1.columns[np.where(data1.dtypes=='object')]
for col in categorical_col:
    print(col)
    print(100*(data1[col].value_counts(normalize=True)))
    

Gender
Male     57.283
Female   42.717
Name: Gender, dtype: float64
Mobile_Verified
Y   64.906
N   35.094
Name: Mobile_Verified, dtype: float64
Var1
HBXX   68.138
HBXC   10.354
HBXB    5.147
HAXA    3.343
HBXA    2.440
HAXB    2.311
HBXD    2.257
HAXC    1.765
HBXH    1.115
HCXF    0.830
HAYT    0.584
HAVC    0.441
HAXM    0.308
HCXD    0.272
HCYS    0.249
HVYS    0.214
HAZD    0.125
HCXG    0.090
HAXF    0.017
Name: Var1, dtype: float64
Filled_Form
N   77.603
Y   22.397
Name: Filled_Form, dtype: float64
Device_Type
Web-browser   73.909
Mobile        26.091
Name: Device_Type, dtype: float64
Var2
B   42.841
G   37.959
C   16.330
E    1.511
D    0.729
F    0.625
A    0.006
Name: Var2, dtype: float64
Source
S122   44.320
S133   34.343
S159    6.434
S143    4.978
S127    2.219
S137    1.981
S134    1.495
S161    0.884
S151    0.827
S157    0.747
S153    0.568
S156    0.354
S144    0.344
S158    0.239
S123    0.084
S141    0.066
S162    0.041
S124    0.028
S160    0.013
S150    0.011
S155  

In [26]:
data1['Var1'] = data['Var1'].apply(lambda x:x if x in ['HBXX', 'HBXC', 'HBXB', 'HBXA','HAXB','HBXD'] else "Others")
data1['Var2'] = data['Var2'].apply(lambda x:x if x in ['B','G','C'] else "Others")
data1['Source'] = data['Source'].apply(lambda x:x if x in ['S122','S133','S159'] else "Others")

In [27]:
data1['Var1'].value_counts()

HBXX      59294
HBXC       9010
Others     8139
HBXB       4479
HBXA       2123
HAXB       2011
HBXD       1964
Name: Var1, dtype: int64

In [28]:
data1.head()

Unnamed: 0,Gender,Monthly_Income,DOB,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Mobile_Verified,Var5,Var1,Filled_Form,Device_Type,Var2,Source,Var4,LoggedIn,Disbursed
0,Female,20000,1978-05-23,300000.0,5.0,0.0,N,0,HBXX,N,Web-browser,G,S122,1,0,0
1,Male,35000,1985-10-07,200000.0,2.0,0.0,Y,13,HBXA,N,Web-browser,G,S122,3,0,0
2,Male,22500,1981-10-10,600000.0,4.0,0.0,Y,0,HBXX,N,Web-browser,B,Others,1,0,0
3,Male,35000,1987-11-30,1000000.0,5.0,0.0,Y,10,HBXX,N,Web-browser,B,Others,3,0,0
4,Male,100000,1984-02-17,500000.0,2.0,25000.0,Y,17,HBXX,N,Web-browser,B,Others,3,1,0


In [29]:
data1["Age"] = data1.DOB.apply(lambda x: int((datetime.datetime.now()-x).days/365))

In [30]:
del data1['DOB']

In [31]:
data1.head()

Unnamed: 0,Gender,Monthly_Income,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Mobile_Verified,Var5,Var1,Filled_Form,Device_Type,Var2,Source,Var4,LoggedIn,Disbursed,Age
0,Female,20000,300000.0,5.0,0.0,N,0,HBXX,N,Web-browser,G,S122,1,0,0,43
1,Male,35000,200000.0,2.0,0.0,Y,13,HBXA,N,Web-browser,G,S122,3,0,0,35
2,Male,22500,600000.0,4.0,0.0,Y,0,HBXX,N,Web-browser,B,Others,1,0,0,39
3,Male,35000,1000000.0,5.0,0.0,Y,10,HBXX,N,Web-browser,B,Others,3,0,0,33
4,Male,100000,500000.0,2.0,25000.0,Y,17,HBXX,N,Web-browser,B,Others,3,1,0,37


##### One Hot Encoding

##### Label Encoding

In [32]:
le = LabelEncoder()

In [33]:
categorical_cols = data1.columns[np.where(data1.dtypes=='object')]
for i in categorical_cols:
    data1[i] = le.fit_transform(data1[i])

In [34]:
data1.head()

Unnamed: 0,Gender,Monthly_Income,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Mobile_Verified,Var5,Var1,Filled_Form,Device_Type,Var2,Source,Var4,LoggedIn,Disbursed,Age
0,0,20000,300000.0,5.0,0.0,0,0,5,0,1,2,1,1,0,0,43
1,1,35000,200000.0,2.0,0.0,1,13,1,0,1,2,1,3,0,0,35
2,1,22500,600000.0,4.0,0.0,1,0,5,0,1,0,0,1,0,0,39
3,1,35000,1000000.0,5.0,0.0,1,10,5,0,1,0,0,3,0,0,33
4,1,100000,500000.0,2.0,25000.0,1,17,5,0,1,0,0,3,1,0,37


##### Model building 

In [35]:
models = []
models.append(('Logistic Regression', LogisticRegression(random_state=0)))
models.append(("Naive Bayes",GaussianNB()))
models.append(("Random Forest",RandomForestClassifier(random_state=0)))
models.append(('Ada Boost', AdaBoostClassifier(random_state=0,base_estimator=RandomForestClassifier(random_state=0,n_estimators=50))))
models.append(("xgboost",XGBClassifier(random_state=0)))


##### Evaluation Metris function

In [36]:
def eval_metrics(target,prob,threshold):
        """ target =y_yest,prob=logistic.predict(X_test),Threshold =Cutoff Threshold """
        data=pd.DataFrame()
        data['pred_proba'] = pd.Series(prob)
        data['pred'] = data['pred_proba'].map(lambda x: 1.0 if x > threshold else 0.0)
        accuracy=metrics.accuracy_score(target,data['pred'])
        precision= metrics.precision_score(target,data['pred'])
        recall= metrics.recall_score(target,data['pred'])
        f1_score = metrics.f1_score(target,data['pred'])
        roc_auc = metrics.roc_auc_score(target,data['pred'])
        return(accuracy,precision,recall,f1_score,roc_auc)


##### Feature Importance Function

In [37]:
def feature_importance_plot(features,importance,nameofmodel):
    print("Fature importance of" ,nameofmodel)
    sorted_index=np.argsort(importance)
    plt.figure(figsize=(15,5))
    sns.barplot(x = features[sorted_index], y = importance[sorted_index],)
    plt.show()

##### Hyper Parameter Tuning function(Grisearch CV)

In [39]:
    pram_grid = {
        
            'Logistic Regression': {'penalty': ('l1', 'l2')},
        
            'Random Forest': {'n_estimators' : [20,50,100],
            'criterion': ['gini', 'entropy'],
            'max_features':[4,5,6]},
            
            'xgboost':{
   
                'learning_rate':[1,0.5,0.1,0.01,0.001],
                'max_depth': [3,5,10,20],
                'n_estimators':[10,50,100,200]}
            }

* Note: Param grid in hyperparameter tuning  should follow the same order of model List

In [40]:
def Hyper_parameter_tuning(name,model,X_train,y_train):
    grid_results = GridSearchCV(model, pram_grid[name],scoring='f1', cv = 3)
    grid_results.fit(X_train, y_train)
    print("Best Score",grid_results.best_score_)
    print("Best parameters",grid_results.best_params_)

##### Main Model Building Function

In [41]:
def Model_Building_Phase(dataset,target_column,models_list,th,gridsearch=False,cv_validation=False):# defining the fuction to AUtomate modle Building
     #Independent and Dependent Variabls
    X=dataset.drop([target_column],axis=1)
    Y=dataset[[target_column]]
    
    #Test and train split
    X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=1)
   
   
    # fitting Model
    col_names = ['Algorithm', 'Accuracy','Precison','Recall','f1-score','AUC-ROC']
    results_final = pd.DataFrame(columns=col_names)
    i = 0
    for name,model in models_list:
        print("model",name)
        
        model.fit(X_train, y_train)
        # predictions on testing data 
        pred_test = model.predict_proba(X_test)[:,1]
       
        #Evaluation Metrics for the model built
        accuracy,precision,recall,f1_score,roc_auc=eval_metrics(y_test,pred_test,th)
        results_final.loc[i]=[name,accuracy,precision,recall,f1_score,roc_auc]
        i += 1
        #Plotting Feature importance 
#         try:
#             feature_importance_plot(X_train.columns,model.feature_importances_,name)
#         except:
#             print(name +" has no Feature importance")
#             continue
    
        # Grid Search 
        if gridsearch:
            Hyper_parameter_tuning(name,model,X_train,y_train)
            
        
               
        #Applying k-Fold Cross Validation           
        if cv_validation:
            score_cv = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10,scoring='f1')
            print(" K fold Cross validation scores",score_cv)
            print("Mean of K fold cross validation ",score_cv.mean())
       
   
    return(results_final)
        

In [32]:
# Model_Building_Phase(data_dummies,'Disbursed',models,th=0.5,gridsearch=False,cv_validation=False)

In [42]:
Model_Building_Phase(data1,'Disbursed', models, th=0.5, gridsearch=False, cv_validation=False)

model Logistic Regression
model Naive Bayes
model Random Forest
model Ada Boost
model xgboost


Unnamed: 0,Algorithm,Accuracy,Precison,Recall,f1-score,AUC-ROC
0,Logistic Regression,0.986,0.0,0.0,0.0,0.5
1,Naive Bayes,0.501,0.021,0.736,0.041,0.617
2,Random Forest,0.987,0.538,0.504,0.521,0.749
3,Ada Boost,0.987,0.565,0.436,0.492,0.716
4,xgboost,0.986,0.53,0.532,0.531,0.763


In [32]:
# Model_Building_Phase(data1,'Disbursed',models,th=0.5,gridsearch=False,cv_validation=False )

model Logistic Regression
model Naive Bayes
model Random Forest
model Ada Boost
model xgboost


Unnamed: 0,Algorithm,Accuracy,Precison,Recall,f1-score,AUC-ROC
0,Logistic Regression,0.9856,0.0,0.0,0.0,0.5
1,Naive Bayes,0.5011,0.0209,0.736,0.0407,0.6169
2,Random Forest,0.9879,0.5823,0.552,0.5667,0.7731
3,Ada Boost,0.9879,0.5882,0.52,0.552,0.7573
4,xgboost,0.9872,0.5547,0.548,0.5513,0.7708


# !pip install xgboost