In [1]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from miceforest import ImputationKernel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import pickle

#### Loading Training and Testing dataset

In [4]:
train = pd.read_csv("train_ctrUa4K.csv")
train

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [6]:
train.Loan_Status.value_counts()

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [7]:
test = pd.read_csv("test_lAUu6dG.csv")
test.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


#### Size of Train and test dataset

In [8]:
shape = train.shape
shape2 = test.shape

print(f'{shape} - Train')
print(f'{shape2} - Test')     

(614, 13) - Train
(367, 12) - Test


#### Basic Info about dataet

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


#### Data Preprocessing

In [10]:
data1 = pd.concat([train,test])
data1

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777.0,113.0,360.0,1.0,Urban,
363,LP002975,Male,Yes,0,Graduate,No,4158,709.0,115.0,360.0,1.0,Urban,
364,LP002980,Male,No,0,Graduate,No,3250,1993.0,126.0,360.0,,Semiurban,
365,LP002986,Male,Yes,0,Graduate,No,5000,2393.0,158.0,360.0,1.0,Rural,


In [11]:
# Dropping Loan ID

data1 = data1.drop(columns = "Loan_ID")
data1

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
362,Male,Yes,3+,Not Graduate,Yes,4009,1777.0,113.0,360.0,1.0,Urban,
363,Male,Yes,0,Graduate,No,4158,709.0,115.0,360.0,1.0,Urban,
364,Male,No,0,Graduate,No,3250,1993.0,126.0,360.0,,Semiurban,
365,Male,Yes,0,Graduate,No,5000,2393.0,158.0,360.0,1.0,Rural,


In [12]:
# Identify Missing values

data1.isnull().sum()

# WE have missing values in various fields, we can do univariate and bivariate analysis first and then do preprocessing of data

Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64

In [13]:
data1.Gender.value_counts()

Gender
Male      775
Female    182
Name: count, dtype: int64

In [14]:
data1.Gender = data1.Gender.map({"Male":int('0'),"Female":int('1')})
data1.Married = data1.Married.map({"No":int('0'),"Yes":int('1')})
data1.Education = data1.Education.map({"Not Graduate":int('0'),"Graduate":int('1')})
data1.Property_Area = data1.Property_Area.map({"Rural":int('0'),"Semiurban":int('1'),"Urban":int('2')})
data1.Self_Employed = data1.Self_Employed.map({"No":int('0'),"Yes":int('1')})
data1.Loan_Status = data1.Loan_Status.map({"N":int('0'),"Y":int('1')})
data1.Dependents = data1.Dependents.map({"0":int('0'),"1":int('1'),"2":int('2'), "3+":int('3')})

In [15]:
data1.Gender=data1.Gender.astype("category")
data1.Married=data1.Married.astype("category")
data1.Education=data1.Education.astype("category")
data1.Property_Area=data1.Property_Area.astype("category")
data1.Self_Employed=data1.Self_Employed.astype("category")
data1.Loan_Status=data1.Loan_Status.astype("category")
data1.Dependents=data1.Dependents.astype("category")
data1.Credit_History=data1.Credit_History.astype("category")

In [16]:
data1["Loan_Amount_Term"]=data1.Loan_Amount_Term.fillna(data1["Loan_Amount_Term"].mode()[0])

In [17]:
mice_kernel=ImputationKernel(data=data1,save_all_iterations=True,random_state=2023)
mice_kernel.mice(2)
mice_imputation=mice_kernel.complete_data()
mice_imputation.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,0.0,0.0,1,0.0,5849,0.0,133.0,360.0,1.0,2,1.0
1,0.0,1.0,1.0,1,0.0,4583,1508.0,128.0,360.0,1.0,0,0.0
2,0.0,1.0,0.0,1,1.0,3000,0.0,66.0,360.0,1.0,2,1.0
3,0.0,1.0,0.0,0,0.0,2583,2358.0,120.0,360.0,1.0,2,1.0
4,0.0,0.0,0.0,1,0.0,6000,0.0,141.0,360.0,1.0,2,1.0


In [18]:
data1=mice_imputation
mice_imputation

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,0.0,0.0,1,0.0,5849,0.0,133.0,360.0,1.0,2,1.0
1,0.0,1.0,1.0,1,0.0,4583,1508.0,128.0,360.0,1.0,0,0.0
2,0.0,1.0,0.0,1,1.0,3000,0.0,66.0,360.0,1.0,2,1.0
3,0.0,1.0,0.0,0,0.0,2583,2358.0,120.0,360.0,1.0,2,1.0
4,0.0,0.0,0.0,1,0.0,6000,0.0,141.0,360.0,1.0,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
362,0.0,1.0,3.0,0,1.0,4009,1777.0,113.0,360.0,1.0,2,1.0
363,0.0,1.0,0.0,1,0.0,4158,709.0,115.0,360.0,1.0,2,1.0
364,0.0,0.0,0.0,1,0.0,3250,1993.0,126.0,360.0,1.0,1,1.0
365,0.0,1.0,0.0,1,0.0,5000,2393.0,158.0,360.0,1.0,0,1.0


In [19]:
data1.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Feature Engineering

In [20]:
# Combine applicants income and coapplicants income into a single column name Total income

data1["Total_income"]=data1["ApplicantIncome"]+data1["CoapplicantIncome"]
data1 = data1.drop(columns=["ApplicantIncome","CoapplicantIncome"])
data1

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_income
0,0.0,0.0,0.0,1,0.0,133.0,360.0,1.0,2,1.0,5849.0
1,0.0,1.0,1.0,1,0.0,128.0,360.0,1.0,0,0.0,6091.0
2,0.0,1.0,0.0,1,1.0,66.0,360.0,1.0,2,1.0,3000.0
3,0.0,1.0,0.0,0,0.0,120.0,360.0,1.0,2,1.0,4941.0
4,0.0,0.0,0.0,1,0.0,141.0,360.0,1.0,2,1.0,6000.0
...,...,...,...,...,...,...,...,...,...,...,...
362,0.0,1.0,3.0,0,1.0,113.0,360.0,1.0,2,1.0,5786.0
363,0.0,1.0,0.0,1,0.0,115.0,360.0,1.0,2,1.0,4867.0
364,0.0,0.0,0.0,1,0.0,126.0,360.0,1.0,1,1.0,5243.0
365,0.0,1.0,0.0,1,0.0,158.0,360.0,1.0,0,1.0,7393.0


## Modelling

In [21]:
data2 = pd.get_dummies(data1,drop_first=True,dtype=np.uint8)
data2

Unnamed: 0,LoanAmount,Loan_Amount_Term,Total_income,Gender_1.0,Married_1.0,Dependents_1.0,Dependents_2.0,Dependents_3.0,Education_1,Self_Employed_1.0,Credit_History_1.0,Property_Area_1,Property_Area_2,Loan_Status_1.0
0,133.0,360.0,5849.0,0,0,0,0,0,1,0,1,0,1,1
1,128.0,360.0,6091.0,0,1,1,0,0,1,0,1,0,0,0
2,66.0,360.0,3000.0,0,1,0,0,0,1,1,1,0,1,1
3,120.0,360.0,4941.0,0,1,0,0,0,0,0,1,0,1,1
4,141.0,360.0,6000.0,0,0,0,0,0,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,113.0,360.0,5786.0,0,1,0,0,1,0,1,1,0,1,1
363,115.0,360.0,4867.0,0,1,0,0,0,1,0,1,0,1,1
364,126.0,360.0,5243.0,0,0,0,0,0,1,0,1,1,0,1
365,158.0,360.0,7393.0,0,1,0,0,0,1,0,1,0,0,1


In [22]:
test1 = data2.iloc[614:].drop(columns="Loan_Status_1.0")
test1

Unnamed: 0,LoanAmount,Loan_Amount_Term,Total_income,Gender_1.0,Married_1.0,Dependents_1.0,Dependents_2.0,Dependents_3.0,Education_1,Self_Employed_1.0,Credit_History_1.0,Property_Area_1,Property_Area_2
0,110.0,360.0,5720.0,0,1,0,0,0,1,0,1,0,1
1,126.0,360.0,4576.0,0,1,1,0,0,1,0,1,0,1
2,208.0,360.0,6800.0,0,1,0,1,0,1,0,1,0,1
3,100.0,360.0,4886.0,0,1,0,1,0,1,0,0,0,1
4,78.0,360.0,3276.0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,113.0,360.0,5786.0,0,1,0,0,1,0,1,1,0,1
363,115.0,360.0,4867.0,0,1,0,0,0,1,0,1,0,1
364,126.0,360.0,5243.0,0,0,0,0,0,1,0,1,1,0
365,158.0,360.0,7393.0,0,1,0,0,0,1,0,1,0,0


In [23]:
train1=data2.iloc[:614]
train1

Unnamed: 0,LoanAmount,Loan_Amount_Term,Total_income,Gender_1.0,Married_1.0,Dependents_1.0,Dependents_2.0,Dependents_3.0,Education_1,Self_Employed_1.0,Credit_History_1.0,Property_Area_1,Property_Area_2,Loan_Status_1.0
0,133.0,360.0,5849.0,0,0,0,0,0,1,0,1,0,1,1
1,128.0,360.0,6091.0,0,1,1,0,0,1,0,1,0,0,0
2,66.0,360.0,3000.0,0,1,0,0,0,1,1,1,0,1,1
3,120.0,360.0,4941.0,0,1,0,0,0,0,0,1,0,1,1
4,141.0,360.0,6000.0,0,0,0,0,0,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,71.0,360.0,2900.0,1,0,0,0,0,1,0,1,0,0,1
610,40.0,180.0,4106.0,0,1,0,0,1,1,0,1,0,0,1
611,253.0,360.0,8312.0,0,1,1,0,0,1,0,1,0,1,1
612,187.0,360.0,7583.0,0,1,0,1,0,1,0,1,0,1,1


In [24]:

mms = MinMaxScaler()
scaled = mms.fit_transform(train1)
train2 = pd.DataFrame(scaled, columns=train1.columns)
train2

Unnamed: 0,LoanAmount,Loan_Amount_Term,Total_income,Gender_1.0,Married_1.0,Dependents_1.0,Dependents_2.0,Dependents_3.0,Education_1,Self_Employed_1.0,Credit_History_1.0,Property_Area_1,Property_Area_2,Loan_Status_1.0
0,0.179450,0.743590,0.055394,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,0.172214,0.743590,0.058435,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.082489,0.743590,0.019583,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
3,0.160637,0.743590,0.043980,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4,0.191027,0.743590,0.057292,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.089725,0.743590,0.018326,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
610,0.044863,0.358974,0.033485,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
611,0.353111,0.743590,0.086352,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
612,0.257598,0.743590,0.077189,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [25]:
X = train2.drop(columns="Loan_Status_1.0")
y=train2["Loan_Status_1.0"]

### Logistic regression with K fold validation

In [25]:
fold_no=1
f1score = []
total_acc=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = LogisticRegression(random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score.append(score)
    total_acc.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score).round(4))
print("Average Accuracy Score is", np.mean(total_acc).round(4))

Fold 1 F1 score 0.87
Fold 2 F1 score 0.905
Fold 3 F1 score 0.913
Fold 4 F1 score 0.875
Fold 5 F1 score 0.894
Fold 6 F1 score 0.854
Fold 7 F1 score 0.863
Fold 8 F1 score 0.889
Fold 9 F1 score 0.894
Fold 10 F1 score 0.872
Average F1 Score is 0.8829
Average Accuracy Score is 0.8208


### Kneighbor Classifier

In [26]:
fold_no=1
f1score_k = []
total_acc_k=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = KNeighborsClassifier(n_neighbors=10)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score_k.append(score)
    total_acc_k.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score_k).round(4))
print("Average Accuracy Score is", np.mean(total_acc_k).round(4))

Fold 1 F1 score 0.851
Fold 2 F1 score 0.896
Fold 3 F1 score 0.882
Fold 4 F1 score 0.848
Fold 5 F1 score 0.884
Fold 6 F1 score 0.857
Fold 7 F1 score 0.833
Fold 8 F1 score 0.87
Fold 9 F1 score 0.845
Fold 10 F1 score 0.842
Average F1 Score is 0.8608
Average Accuracy Score is 0.7833


### Random Forest

In [27]:
fold_no=1
f1score_rf = []
total_acc_rf=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = RandomForestClassifier(max_depth=10)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score_rf.append(score)
    total_acc_rf.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score_rf).round(4))
print("Average Accuracy Score is", np.mean(total_acc_rf).round(4))

Fold 1 F1 score 0.879
Fold 2 F1 score 0.903
Fold 3 F1 score 0.921
Fold 4 F1 score 0.863
Fold 5 F1 score 0.901
Fold 6 F1 score 0.842
Fold 7 F1 score 0.86
Fold 8 F1 score 0.899
Fold 9 F1 score 0.882
Fold 10 F1 score 0.872
Average F1 Score is 0.8822
Average Accuracy Score is 0.8224


### XGBoost

In [28]:
fold_no=1
f1score_xg = []
total_acc_xg=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = XGBClassifier(n_estimators=50, max_depth=4)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score_xg.append(score)
    total_acc_xg.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score_xg).round(4))
print("Average Accuracy Score is", np.mean(total_acc_xg).round(4))

Fold 1 F1 score 0.857
Fold 2 F1 score 0.874
Fold 3 F1 score 0.892
Fold 4 F1 score 0.839
Fold 5 F1 score 0.901
Fold 6 F1 score 0.863
Fold 7 F1 score 0.839
Fold 8 F1 score 0.874
Fold 9 F1 score 0.891
Fold 10 F1 score 0.835
Average F1 Score is 0.8665
Average Accuracy Score is 0.8029


In [30]:
result_lr =[np.mean(f1score),np.mean(total_acc)]
result_k =[np.mean(f1score_k),np.mean(total_acc_k)]
result_rf =[np.mean(f1score_rf),np.mean(total_acc_rf)]
result_xg =[np.mean(f1score_xg),np.mean(total_acc_xg)]
results = {'Logistic Regression':result_lr,'K Neighbor':result_k,'Random Forest':result_rf,"XGBoost":result_xg}
results = pd.DataFrame(results,index=["F1 Score","Accuracy"])
results


Unnamed: 0,Logistic Regression,K Neighbor,Random Forest,XGBoost
F1 Score,0.8829,0.8608,0.8822,0.8665
Accuracy,0.820756,0.783289,0.822369,0.802909


### Optimization

### Logistic Regression 

In [40]:
lr_grid = {'solver':['newton-cg', 'lbfgs', 'liblinear'],
          'penalty' :['l2'],
          'C':[100, 10, 1.0, 0.1, 0.01]}

In [41]:
lr_opt = GridSearchCV(LogisticRegression(),param_grid=lr_grid,scoring='f1',cv=10)
lr_opt.fit(X_train,y_train)
print(lr_opt.best_params_)

{'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}


In [42]:
fold_no=1
f1score_opt = []
total_acc_opt=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = LogisticRegression(C=100,penalty='l2',solver='newton-cg',random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score_opt.append(score)
    total_acc_opt.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score_opt).round(4))
print("Average Accuracy Score is", np.mean(total_acc_opt).round(4))

Fold 1 F1 score 0.87
Fold 2 F1 score 0.905
Fold 3 F1 score 0.913
Fold 4 F1 score 0.875
Fold 5 F1 score 0.894
Fold 6 F1 score 0.854
Fold 7 F1 score 0.863
Fold 8 F1 score 0.889
Fold 9 F1 score 0.882
Fold 10 F1 score 0.872
Average F1 Score is 0.8817
Average Accuracy Score is 0.8191


### K neighbor Optimization

In [43]:
k_params = {'leaf_size' :list(range(1,50)),
            'n_neighbors' : list(range(1,30)),
            'p':[1,2]}

In [44]:
k_opt = GridSearchCV(KNeighborsClassifier(),param_grid=k_params,scoring='f1',cv=10)
k_opt.fit(X_train,y_train)
print(k_opt.best_params_)

{'leaf_size': 1, 'n_neighbors': 5, 'p': 1}


In [45]:
fold_no=1
f1score_k_opt = []
total_acc_k_opt=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = KNeighborsClassifier(leaf_size=1,n_neighbors=5,p=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score_k_opt.append(score)
    total_acc_k_opt.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score_k_opt).round(4))
print("Average Accuracy Score is", np.mean(total_acc_k_opt).round(4))

Fold 1 F1 score 0.86
Fold 2 F1 score 0.903
Fold 3 F1 score 0.87
Fold 4 F1 score 0.837
Fold 5 F1 score 0.882
Fold 6 F1 score 0.821
Fold 7 F1 score 0.863
Fold 8 F1 score 0.841
Fold 9 F1 score 0.804
Fold 10 F1 score 0.872
Average F1 Score is 0.8553
Average Accuracy Score is 0.78


### Random Forest

In [33]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [34]:
forest_params = {'max_depth':list(range(1,20,2)),
                "n_estimators":list(range(1,200,20)),
                'criterion':['gini',"entropy",'log_loss']}

In [38]:
rfc_opt = GridSearchCV(RandomForestClassifier(),param_grid=forest_params,scoring='f1',cv=10)
rfc_opt.fit(X_train,y_train)
print(rfc_opt.best_params_)

{'criterion': 'gini', 'max_depth': 3, 'n_estimators': 21}


In [100]:
rfc2 = RandomForestClassifier(n_estimators=121,max_depth=3,criterion='log_loss')
rfc2.fit(X_train,y_train)
rfc2.score(X_test,y_test)

0.8455284552845529

In [50]:
fold_no=1
f1score_rf_opt = []
total_acc_rf_opt=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = RandomForestClassifier(n_estimators=21,max_depth=3,criterion='gini',random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score_rf_opt.append(score)
    total_acc_rf_opt.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score_rf_opt).round(4))
print("Average Accuracy Score is", np.mean(total_acc_rf_opt).round(4))

Fold 1 F1 score 0.87
Fold 2 F1 score 0.905
Fold 3 F1 score 0.913
Fold 4 F1 score 0.875
Fold 5 F1 score 0.894
Fold 6 F1 score 0.854
Fold 7 F1 score 0.863
Fold 8 F1 score 0.889
Fold 9 F1 score 0.894
Fold 10 F1 score 0.872
Average F1 Score is 0.8829
Average Accuracy Score is 0.8208


### XGBoost Optimization

In [46]:
xg_params = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

In [47]:
xg_opt = GridSearchCV(XGBClassifier(),param_grid=xg_params,scoring='f1',cv=10)
xg_opt.fit(X_train,y_train)
print(xg_opt.best_params_)

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 60}


In [48]:
fold_no=1
f1score_xg_opt= []
total_acc_xg_opt=[]
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True) 
for train_index,test_index in kf.split(X,y):
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    model = XGBClassifier(learning_rate=0.01,n_estimators=60, max_depth=2)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = f1_score(yvl,pred_test).round(3)
    accuracy_lr=accuracy_score(yvl,pred_test)
    print('Fold',str(fold_no),'F1 score',score)   
    f1score_xg_opt.append(score)
    total_acc_xg_opt.append(accuracy_lr)
    fold_no+=1
print("Average F1 Score is", np.mean(f1score_xg_opt).round(4))
print("Average Accuracy Score is", np.mean(total_acc_xg_opt).round(4))

Fold 1 F1 score 0.87
Fold 2 F1 score 0.905
Fold 3 F1 score 0.913
Fold 4 F1 score 0.875
Fold 5 F1 score 0.894
Fold 6 F1 score 0.854
Fold 7 F1 score 0.863
Fold 8 F1 score 0.876
Fold 9 F1 score 0.894
Fold 10 F1 score 0.86
Average F1 Score is 0.8804
Average Accuracy Score is 0.8175


In [52]:
result_lr_opt=[np.mean(f1score_opt),np.mean(total_acc_opt)]
result_k_opt =[np.mean(f1score_k_opt),np.mean(total_acc_k_opt)]
result_rf_opt =[np.mean(f1score_rf_opt),np.mean(total_acc_rf_opt)]
result_xg_opt =[np.mean(f1score_xg_opt),np.mean(total_acc_xg_opt)]
results_opt = {'Logistic Regression':result_lr_opt,'K Neighbor':result_k_opt,'Random Forest':result_rf_opt,"XGBoost":result_xg_opt}
results_opt = pd.DataFrame(results_opt,index=["F1 Score","Accuracy"])
results_opt

Unnamed: 0,Logistic Regression,K Neighbor,Random Forest,XGBoost
F1 Score,0.8817,0.8553,0.8829,0.8804
Accuracy,0.819117,0.780011,0.820756,0.817478


### Best model is random forest as it has comparatively more f1 score as well as accuracy compared to other 3 models

In [26]:
final_model = RandomForestClassifier(n_estimators=21,max_depth=3,criterion='gini',random_state=1)
final_model.fit(X,y)

In [27]:
final_results = final_model.predict(test1)
final_results

array([1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1.

In [28]:
loan_id=test["Loan_ID"]
final_pred=pd.DataFrame(final_results,columns=['Loan_Status']).set_index(loan_id)
final_pred["Loan_Status"]=final_pred["Loan_Status"].map({1:'Y',0:'N'})
final_pred.to_csv("loan_pred.csv")