In [202]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import sys
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression
from impyute.imputation.cs import mice,fast_knn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest,f_regression,f_classif,chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [203]:
train_data=pd.read_csv('train_ctrUa4K.csv')
test_data=pd.read_csv('test_lAUu6dG.csv')

# Merge train and test data

In [204]:
train_data.shape

(614, 13)

In [205]:
test_data.shape

(367, 12)

In [206]:
y_train=train_data.Loan_Status

In [207]:
y_train.shape

(614,)

In [208]:
y_train.value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [209]:
train_data.drop(['Loan_Status'],axis=1,inplace=True)

In [210]:
train_data.shape

(614, 12)

In [211]:
data=pd.concat([train_data,test_data],axis=0,ignore_index=True)

In [212]:
data.shape

(981, 12)

In [213]:
data.head(9)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban


In [214]:
y_train.replace({'Y':1,'N':0},inplace=True)

# Drop Loan_ID

In [215]:
data.drop(['Loan_ID'],inplace=True,axis=1)

# Encode Dependents Variable

In [216]:
data.corr()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
ApplicantIncome,1.0,-0.114247,0.551811,-0.023089,0.023378
CoapplicantIncome,-0.114247,1.0,0.179228,-0.04386,-0.027253
LoanAmount,0.551811,0.179228,1.0,0.055636,-0.008235
Loan_Amount_Term,-0.023089,-0.04386,0.055636,1.0,-0.020439
Credit_History,0.023378,-0.027253,-0.008235,-0.020439,1.0


In [217]:
#data.Dependents.replace({'0':4,'1':3,'2':2,'3+':1},inplace=True)

In [218]:
data.corr()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
ApplicantIncome,1.0,-0.114247,0.551811,-0.023089,0.023378
CoapplicantIncome,-0.114247,1.0,0.179228,-0.04386,-0.027253
LoanAmount,0.551811,0.179228,1.0,0.055636,-0.008235
Loan_Amount_Term,-0.023089,-0.04386,0.055636,1.0,-0.020439
Credit_History,0.023378,-0.027253,-0.008235,-0.020439,1.0


In [219]:
data.isnull().sum()

Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
dtype: int64

# Missing Value Imputation for Categorical data

In [220]:
cat_data=data[['Gender','Married','Self_Employed']]

In [221]:
cat_data

Unnamed: 0,Gender,Married,Self_Employed
0,Male,No,No
1,Male,Yes,No
2,Male,Yes,Yes
3,Male,Yes,No
4,Male,No,No
...,...,...,...
976,Male,Yes,Yes
977,Male,Yes,No
978,Male,No,No
979,Male,Yes,No


In [222]:
imp=SimpleImputer(strategy='most_frequent')
imp.fit(cat_data)
i_cat_data=imp.transform(cat_data)

In [223]:
i_cat_data=pd.DataFrame(i_cat_data)
i_cat_data.columns=['Gender','Married','Self_Employed']

In [224]:
i_cat_data

Unnamed: 0,Gender,Married,Self_Employed
0,Male,No,No
1,Male,Yes,No
2,Male,Yes,Yes
3,Male,Yes,No
4,Male,No,No
...,...,...,...
976,Male,Yes,Yes
977,Male,Yes,No
978,Male,No,No
979,Male,Yes,No


In [225]:
i_cat_data.isnull().sum()

Gender           0
Married          0
Self_Employed    0
dtype: int64

In [226]:
data.drop(['Gender','Married','Self_Employed'],inplace=True,axis=1)

In [227]:
data.shape

(981, 8)

In [228]:
i_cat_data

Unnamed: 0,Gender,Married,Self_Employed
0,Male,No,No
1,Male,Yes,No
2,Male,Yes,Yes
3,Male,Yes,No
4,Male,No,No
...,...,...,...
976,Male,Yes,Yes
977,Male,Yes,No
978,Male,No,No
979,Male,Yes,No


In [229]:
data=pd.concat([data,i_cat_data],axis=1)

In [230]:
data

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Gender,Married,Self_Employed
0,0,Graduate,5849,0.0,,360.0,1.0,Urban,Male,No,No
1,1,Graduate,4583,1508.0,128.0,360.0,1.0,Rural,Male,Yes,No
2,0,Graduate,3000,0.0,66.0,360.0,1.0,Urban,Male,Yes,Yes
3,0,Not Graduate,2583,2358.0,120.0,360.0,1.0,Urban,Male,Yes,No
4,0,Graduate,6000,0.0,141.0,360.0,1.0,Urban,Male,No,No
...,...,...,...,...,...,...,...,...,...,...,...
976,3+,Not Graduate,4009,1777.0,113.0,360.0,1.0,Urban,Male,Yes,Yes
977,0,Graduate,4158,709.0,115.0,360.0,1.0,Urban,Male,Yes,No
978,0,Graduate,3250,1993.0,126.0,360.0,,Semiurban,Male,No,No
979,0,Graduate,5000,2393.0,158.0,360.0,1.0,Rural,Male,Yes,No


In [231]:
data.isnull().sum()

Dependents           25
Education             0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Gender                0
Married               0
Self_Employed         0
dtype: int64

# Encoding Categorical Variables

In [232]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dependents         956 non-null    object 
 1   Education          981 non-null    object 
 2   ApplicantIncome    981 non-null    int64  
 3   CoapplicantIncome  981 non-null    float64
 4   LoanAmount         954 non-null    float64
 5   Loan_Amount_Term   961 non-null    float64
 6   Credit_History     902 non-null    float64
 7   Property_Area      981 non-null    object 
 8   Gender             981 non-null    object 
 9   Married            981 non-null    object 
 10  Self_Employed      981 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 61.4+ KB


In [233]:
d_Gender=pd.get_dummies(data['Gender'],drop_first=True)
d_Gender.columns=['Gender_Male']
d_Gender

Unnamed: 0,Gender_Male
0,1
1,1
2,1
3,1
4,1
...,...
976,1
977,1
978,1
979,1


In [234]:
d_Married=pd.get_dummies(data.Married,drop_first=True)
d_Married.columns=['Married_yes']
d_Married

Unnamed: 0,Married_yes
0,0
1,1
2,1
3,1
4,0
...,...
976,1
977,1
978,0
979,1


In [235]:
d_Education=pd.get_dummies(data.Education,drop_first=True)
d_Education

Unnamed: 0,Not Graduate
0,0
1,0
2,0
3,1
4,0
...,...
976,1
977,0
978,0
979,0


In [236]:
d_Self_Employed=pd.get_dummies(data.Self_Employed,drop_first=True)
d_Self_Employed.columns=['Self_Employed_yes']
d_Self_Employed

Unnamed: 0,Self_Employed_yes
0,0
1,0
2,1
3,0
4,0
...,...
976,1
977,0
978,0
979,0


In [237]:
d_area=pd.get_dummies(data.Property_Area,drop_first=True)
d_area


Unnamed: 0,Semiurban,Urban
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
976,0,1
977,0,1
978,1,0
979,0,0


In [238]:
pd.crosstab(d_area.Semiurban,d_area.Urban)

Urban,0,1
Semiurban,Unnamed: 1_level_1,Unnamed: 2_level_1
0,290,342
1,349,0


In [239]:
data.drop(['Gender','Married','Education','Self_Employed','Property_Area'],axis=1,inplace=True)

In [240]:
data.columns

Index(['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [241]:
data=pd.concat([data,d_Gender,d_Married,d_Education,d_Self_Employed,d_area],axis=1)

In [242]:
data

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_yes,Not Graduate,Self_Employed_yes,Semiurban,Urban
0,0,5849,0.0,,360.0,1.0,1,0,0,0,0,1
1,1,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0
2,0,3000,0.0,66.0,360.0,1.0,1,1,0,1,0,1
3,0,2583,2358.0,120.0,360.0,1.0,1,1,1,0,0,1
4,0,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
976,3+,4009,1777.0,113.0,360.0,1.0,1,1,1,1,0,1
977,0,4158,709.0,115.0,360.0,1.0,1,1,0,0,0,1
978,0,3250,1993.0,126.0,360.0,,1,0,0,0,1,0
979,0,5000,2393.0,158.0,360.0,1.0,1,1,0,0,0,0


In [243]:
data.isnull().sum()

Dependents           25
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Gender_Male           0
Married_yes           0
Not Graduate          0
Self_Employed_yes     0
Semiurban             0
Urban                 0
dtype: int64

In [244]:
data.columns

Index(['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male', 'Married_yes',
       'Not Graduate', 'Self_Employed_yes', 'Semiurban', 'Urban'],
      dtype='object')

In [245]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dependents         956 non-null    object 
 1   ApplicantIncome    981 non-null    int64  
 2   CoapplicantIncome  981 non-null    float64
 3   LoanAmount         954 non-null    float64
 4   Loan_Amount_Term   961 non-null    float64
 5   Credit_History     902 non-null    float64
 6   Gender_Male        981 non-null    uint8  
 7   Married_yes        981 non-null    uint8  
 8   Not Graduate       981 non-null    uint8  
 9   Self_Employed_yes  981 non-null    uint8  
 10  Semiurban          981 non-null    uint8  
 11  Urban              981 non-null    uint8  
dtypes: float64(4), int64(1), object(1), uint8(6)
memory usage: 48.0+ KB


In [246]:
d=pd.get_dummies(data.Dependents,drop_first=True)

In [247]:
data=pd.concat([data,d],axis=1)

In [248]:
data.drop(['Dependents'],axis=1,inplace=True)

In [261]:
data.columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male', 'Married_yes',
       'Not Graduate', 'Self_Employed_yes', 'Semiurban', 'Urban', '1', '2',
       '3+'],
      dtype='object')

In [262]:
data.isnull().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Gender_Male           0
Married_yes           0
Not Graduate          0
Self_Employed_yes     0
Semiurban             0
Urban                 0
1                     0
2                     0
3+                    0
dtype: int64

# Missing Value Imputation for Numerical Variable 

In [263]:
i_data=mice(data.values)

In [264]:
i_data.shape

(981, 14)

In [265]:
i_data=pd.DataFrame(i_data)

In [266]:
i_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,5849.0,0.0,123.669696,360.0,1.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,4583.0,1508.0,128.000000,360.0,1.000000,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3000.0,0.0,66.000000,360.0,1.000000,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2583.0,2358.0,120.000000,360.0,1.000000,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,6000.0,0.0,141.000000,360.0,1.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,4009.0,1777.0,113.000000,360.0,1.000000,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
977,4158.0,709.0,115.000000,360.0,1.000000,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
978,3250.0,1993.0,126.000000,360.0,0.840945,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
979,5000.0,2393.0,158.000000,360.0,1.000000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [267]:
i_data.columns=['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male', 'Married_yes',
       'Not Graduate', 'Self_Employed_yes', 'Semiurban', 'Urban', '1', '2',
       '3+']

In [268]:
i_data

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_yes,Not Graduate,Self_Employed_yes,Semiurban,Urban,1,2,3+
0,5849.0,0.0,123.669696,360.0,1.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,4583.0,1508.0,128.000000,360.0,1.000000,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3000.0,0.0,66.000000,360.0,1.000000,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2583.0,2358.0,120.000000,360.0,1.000000,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,6000.0,0.0,141.000000,360.0,1.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,4009.0,1777.0,113.000000,360.0,1.000000,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
977,4158.0,709.0,115.000000,360.0,1.000000,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
978,3250.0,1993.0,126.000000,360.0,0.840945,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
979,5000.0,2393.0,158.000000,360.0,1.000000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [269]:
i_data.isnull().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Gender_Male          0
Married_yes          0
Not Graduate         0
Self_Employed_yes    0
Semiurban            0
Urban                0
1                    0
2                    0
3+                   0
dtype: int64

In [270]:
i_data.corr()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_yes,Not Graduate,Self_Employed_yes,Semiurban,Urban,1,2,3+
ApplicantIncome,1.0,-0.114247,0.554342,-0.023388,0.024776,0.060444,0.052126,-0.138909,0.113106,-0.009034,0.009171,0.027478,-0.021144,0.150853
CoapplicantIncome,-0.114247,1.0,0.177385,-0.043921,-0.038541,0.082428,0.061606,-0.06038,-0.018861,-0.026793,-0.007484,-0.007011,0.018375,-0.00881
LoanAmount,0.554342,0.177385,1.0,0.057869,-0.006939,0.097699,0.161102,-0.168123,0.108014,-0.007421,-0.030579,0.040175,0.05243,0.111366
Loan_Amount_Term,-0.023388,-0.043921,0.057869,1.0,-0.018447,-0.071042,-0.050114,-0.024387,-0.032192,0.032331,-0.052339,-0.046658,-0.008707,-0.072426
Credit_History,0.024776,-0.038541,-0.006939,-0.018447,1.0,0.025939,0.026497,-0.068144,0.038242,0.010577,0.014395,0.031074,-0.003153,-0.071163
Gender_Male,0.060444,0.082428,0.097699,-0.071042,0.025939,1.0,0.327012,0.040649,0.024719,-0.094498,0.029989,-0.016439,0.118421,0.071256
Married_yes,0.052126,0.061606,0.161102,-0.050114,0.026497,0.327012,1.0,0.026211,0.013666,-0.006909,0.008825,0.118849,0.245805,0.148371
Not Graduate,-0.138909,-0.06038,-0.168123,-0.024387,-0.068144,0.040649,0.026211,1.0,-0.010848,-0.013089,-0.036017,-0.023597,0.042769,0.07419
Self_Employed_yes,0.113106,-0.018861,0.108014,-0.032192,0.038242,0.024719,0.013666,-0.010848,1.0,-0.015231,-0.042498,0.047252,0.013448,-0.000417
Semiurban,-0.009034,-0.026793,-0.007421,0.032331,0.010577,-0.094498,-0.006909,-0.013089,-0.015231,1.0,-0.543647,0.017742,-0.028364,0.011934


In [271]:
i_data.shape


(981, 14)

In [272]:
x_train=i_data

# Data Splitting

In [273]:
x_train=i_data.iloc[0:614,:]

In [274]:
x_train.shape

(614, 14)

In [275]:
x_test=i_data.iloc[614:,:]

In [276]:
x_test.shape

(367, 14)

# Data Balancing

In [308]:
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [309]:
sm=SMOTE(random_state=27,sampling_strategy=1)

In [310]:
new_x_train , new_y_train =sm.fit_sample(x_train,y_train)

In [311]:
new_x_train

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_yes,Not Graduate,Self_Employed_yes,Semiurban,Urban,1,2,3+
0,5849.000000,0.000000,123.669696,360.000000,1.000000,1.00000,0.0,0.0,0.000000,0.000000,1.000000,0.0,0.000000,0.000000
1,4583.000000,1508.000000,128.000000,360.000000,1.000000,1.00000,1.0,0.0,0.000000,0.000000,0.000000,1.0,0.000000,0.000000
2,3000.000000,0.000000,66.000000,360.000000,1.000000,1.00000,1.0,0.0,1.000000,0.000000,1.000000,0.0,0.000000,0.000000
3,2583.000000,2358.000000,120.000000,360.000000,1.000000,1.00000,1.0,1.0,0.000000,0.000000,1.000000,0.0,0.000000,0.000000
4,6000.000000,0.000000,141.000000,360.000000,1.000000,1.00000,0.0,0.0,0.000000,0.000000,1.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,11685.874132,0.000000,168.561171,360.000000,0.801688,1.00000,0.0,0.0,0.198312,0.801688,0.198312,0.0,0.000000,0.000000
840,3176.211307,1841.095268,119.136342,360.000000,0.000000,1.00000,1.0,1.0,0.000000,0.648758,0.000000,0.0,0.351242,0.000000
841,6195.883258,0.000000,130.882448,360.000000,0.882358,1.00000,0.0,1.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
842,2978.139957,1665.175093,87.626391,356.263909,0.000000,1.00000,1.0,1.0,0.000000,0.000000,1.000000,0.0,0.000000,0.412454


In [225]:
new_y_train.value_counts()

1    422
0    422
Name: Loan_Status, dtype: int64

# Logistic regression

In [374]:
from sklearn.linear_model import LogisticRegression

In [375]:
LR=LogisticRegression(max_iter=1000)

In [228]:
LR.fit(new_x_train,new_y_train)

LogisticRegression(max_iter=1000)

In [229]:
y_test=LR.predict(x_test)

In [230]:
y_test=pd.DataFrame(y_test)
y_test

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0
...,...
362,1
363,1
364,1
365,1


In [231]:
final=test_data.Loan_ID
final=pd.DataFrame(final)
final['Loan_Status']=y_test

In [232]:
final

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,0
...,...,...
362,LP002971,1
363,LP002975,1
364,LP002980,1
365,LP002986,1


In [233]:
final.Loan_Status.replace({1:'Y',0:'N'},inplace=True)


In [234]:
final

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,N
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y


In [237]:
final.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_with smote LR.csv')

In [238]:
final.Loan_Status.value_counts()

Y    268
N     99
Name: Loan_Status, dtype: int64

In [177]:
test_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


# KNN

In [265]:
knn=KNeighborsClassifier()
knn_params={'n_neighbors':np.arange(1,200),'weights':['uniform','distance']}

In [266]:
GS=GridSearchCV(knn,knn_params,cv=3,scoring='roc_auc')

In [242]:
GS.fit(new_x_train,new_y_train)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92...
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
       183, 184, 185, 186, 18

In [243]:
GS.best_params_

{'n_neighbors': 13, 'weights': 'distance'}

In [244]:
KNN=KNeighborsClassifier(n_neighbors=13,weights='distance')

In [245]:
KNN.fit(new_x_train,new_y_train)

KNeighborsClassifier(n_neighbors=13, weights='distance')

In [246]:
knn_y_test=KNN.predict(x_test)

In [248]:
knn_y_test=pd.DataFrame(knn_y_test)

In [249]:
final_knn=test_data.Loan_ID
final_knn=pd.DataFrame(final_knn)
final_knn['Loan_Status']=knn_y_test

In [250]:
final_knn

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,0
2,LP001031,0
3,LP001035,1
4,LP001051,1
...,...,...
362,LP002971,1
363,LP002975,1
364,LP002980,1
365,LP002986,0


In [251]:
final_knn.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [252]:
final_knn.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_knn new dependent smote.csv')

In [253]:
final_knn.Loan_Status.value_counts()

Y    200
N    167
Name: Loan_Status, dtype: int64

# Decision Tree

In [295]:
def GridSearch_DT(x_train,y_train):
    dt=DecisionTreeClassifier(random_state=0)
    dt_params={'max_depth':np.arange(1,100),'min_samples_leaf':np.arange(1,75),'criterion':['gini','entropy']}
    GS_dt=GridSearchCV(dt,dt_params,cv=3,scoring='roc_auc')
    GS_dt.fit(x_train,y_train)
    print(GS_dt.best_params_)

In [296]:
GridSearch_DT(x_train,y_train)

{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 68}


In [297]:
DT=DecisionTreeClassifier(criterion= 'entropy', max_depth= 4, min_samples_leaf= 68)

In [298]:
DT.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=68)

In [299]:
dt_y_test=DT.predict(x_test)

In [300]:
dt_y_test=pd.DataFrame(dt_y_test)

In [301]:
final_dt=test_data.Loan_ID
final_dt=pd.DataFrame(final_dt)
final_dt['Loan_Status']=dt_y_test

In [302]:
final_dt.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [303]:
final_dt.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_dt new dependent without smote log.csv')

In [304]:
final_dt.Loan_Status.value_counts()

Y    303
N     64
Name: Loan_Status, dtype: int64

In [264]:
new_x_train.head(20)

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_yes,Not Graduate,Self_Employed_yes,Semiurban,Urban
0,4.0,5849.0,0.0,123.932852,360.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,3.0,4583.0,1508.0,128.0,360.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2,4.0,3000.0,0.0,66.0,360.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
3,4.0,2583.0,2358.0,120.0,360.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,4.0,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
5,2.0,5417.0,4196.0,267.0,360.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
6,4.0,2333.0,1516.0,95.0,360.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
7,1.0,3036.0,2504.0,158.0,360.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
8,2.0,4006.0,1526.0,168.0,360.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
9,3.0,12841.0,10968.0,349.0,360.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0


# Scaling

In [130]:
sc=StandardScaler()
scaled_x_train=sc.fit_transform(new_x_train)
scaled_x_train=pd.DataFrame(scaled_x_train)

In [131]:
scaled_x_train.columns=['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male', 'Married_yes',
       'Not Graduate', 'Self_Employed_yes', 'Semiurban', 'Urban']

In [132]:
sc=StandardScaler()
scaled_x_test=sc.fit_transform(x_test)
scaled_x_test=pd.DataFrame(scaled_x_test)

In [133]:
scaled_x_test.columns=['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male', 'Married_yes',
       'Not Graduate', 'Self_Employed_yes', 'Semiurban', 'Urban']

# Logistic Regression

In [303]:
LR_scaled=LogisticRegression(max_iter=1000)

In [304]:
LR_scaled.fit(principalDf_x_train,y_train)

LogisticRegression(max_iter=1000)

In [305]:
lr_scaled_y_test=LR_scaled.predict(principalDf_x_test)
lr_scaled_y_test=pd.DataFrame(lr_scaled_y_test)

In [306]:
final_scaled_lr=test_data.Loan_ID
final_scaled_lr=pd.DataFrame(final_scaled_lr)
final_scaled_lr['Loan_Status']=lr_scaled_y_test
final_scaled_lr.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [293]:
final_scaled_lr.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_lr_with_new_encoded_Dependent_dummy_pca.csv')

In [307]:
final_scaled_lr.Loan_Status.value_counts()

Y    366
N      1
Name: Loan_Status, dtype: int64

# KNN

In [383]:
def GridSearch_KNN(x_train,y_train):
    knn=KNeighborsClassifier()
    knn_params={'n_neighbors':np.arange(1,200),'weights':['uniform','distance']}
    GS=GridSearchCV(knn,knn_params,cv=3,scoring='roc_auc')
    GS.fit(x_train,y_train)
    print(GS.best_params_)

In [384]:
GridSearch_KNN(x_train,y_train)

{'n_neighbors': 15, 'weights': 'distance'}


In [385]:
KNN=KNeighborsClassifier(n_neighbors= 15, weights='distance')

In [386]:
KNN.fit(x_train,y_train)
knn_y_test=pd.DataFrame(KNN.predict(x_test))

In [387]:
final_knn=pd.DataFrame(test_data.Loan_ID)
final_knn['Loan_Status']=knn_y_test
final_knn.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [388]:
final_knn.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_knn new dependent dummy without smote.csv')

In [389]:
final_knn.Loan_Status.value_counts()

Y    326
N     41
Name: Loan_Status, dtype: int64

In [167]:
LR_wosmote_woscaled=LogisticRegression(max_iter=1000)

In [168]:
LR_wosmote_woscaled.fit(x_train,y_train)

LogisticRegression(max_iter=1000)

In [169]:
imp_y_test=pd.DataFrame(LR_wosmote_woscaled.predict(scaled_x_test))


In [170]:
imp_lr=pd.DataFrame(test_data.Loan_ID)
imp_lr['Loan_Status']=imp_y_test
imp_lr.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [171]:
imp_lr.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_imp_lr.csv')

In [173]:
imp_lr.Loan_Status.value_counts()

Y    307
N     60
Name: Loan_Status, dtype: int64

In [174]:
GridSearch_KNN(x_train,y_train)

{'n_neighbors': 15, 'weights': 'distance'}


In [181]:
KNN_without_smote=KNeighborsClassifier(n_neighbors= 15, weights='distance')
KNN_without_smote.fit(x_train,y_train)
imp_knn_y_test=pd.DataFrame(KNN_without_smote.predict(x_test))

In [182]:
imp_knn=pd.DataFrame(test_data.Loan_ID)
imp_knn['Loan_Status']=imp_knn_y_test
imp_knn.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [183]:
imp_knn.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_imp_knn.csv')

In [184]:
imp_knn.Loan_Status.value_counts()

Y    326
N     41
Name: Loan_Status, dtype: int64

In [185]:
GridSearch_DT(x_train,y_train)

{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 68}


In [394]:
pipeline = Pipeline(
    [
     ('selector',SelectKBest(f_classif)),
     ('model',LogisticRegression())
    ]
)

search = GridSearchCV(
    estimator = pipeline,
    param_grid = {'selector__k':[2,3,4,5,6,7,8,9,10,11,12,13]},
    n_jobs=-1,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=3
)


In [395]:
search.fit(x_train,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    1.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.1s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selector', SelectKBest()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'selector__k': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13]},
             scoring='neg_mean_squared_error', verbose=3)

In [396]:
search.best_params_

{'selector__k': 6}

In [397]:
search.best_score_

-0.18727175796348128

In [399]:
x_train.columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male', 'Married_yes',
       'Not Graduate', 'Self_Employed_yes', 'Semiurban', 'Urban', '1', '2',
       '3+'],
      dtype='object')

In [401]:
k_x_train=x_train[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male']]

In [402]:
L=LogisticRegression(max_iter=1000)

In [403]:
L.fit(k_x_train,y_train)

LogisticRegression(max_iter=1000)

In [404]:
k_x_test=x_test[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Male']]

In [405]:
k_y_test=L.predict(k_x_test)
k_y_test=pd.DataFrame(k_y_test)

In [406]:
k_lr=test_data.Loan_ID
k_lr=pd.DataFrame(k_lr)
k_lr['Loan_Status']=k_y_test
k_lr.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [407]:
k_lr.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\k_l_new dependent dummy.csv')

In [408]:
k_lr.Loan_Status.value_counts()


Y    308
N     59
Name: Loan_Status, dtype: int64

In [108]:
pipeline = Pipeline(
    [
     ('selector',SelectKBest(chi2)),
     ('model',KNeighborsClassifier())
    ]
)

search = GridSearchCV(
    estimator = pipeline,
    param_grid = {'selector__k':[2,3,4,5,6,7,8,9,10,11,12,13],
                  'model__n_neighbors':np.arange(1,200),
                  'model__weights':['uniform','distance']
                 },
    n_jobs=-1,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=3
)


In [109]:
search.fit(x_train,y_train)

Fitting 5 folds for each of 4776 candidates, totalling 23880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 1032 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 1928 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 3080 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 4488 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 6152 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 8072 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 10248 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 12680 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 15368 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 18312 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 21512 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 23880 out of 23880 | elapsed:  5.6min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selector',
                                        SelectKBest(score_func=<function chi2 at 0x0F1F4AE0>)),
                                       ('model', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'model__n_neighbors': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  5...
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
       183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
       196, 197, 198, 199]),
                         'model__weights': ['uniform'

In [110]:
search.best_params_

{'model__n_neighbors': 25, 'model__weights': 'uniform', 'selector__k': 2}

In [111]:
k_

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_yes,Not Graduate,Self_Employed_yes,Semiurban,Urban
0,0.0,5849.0,0.0,123.933538,360.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,4583.0,1508.0,128.000000,360.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.0,3000.0,0.0,66.000000,360.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
3,0.0,2583.0,2358.0,120.000000,360.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,0.0,6000.0,0.0,141.000000,360.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,2900.0,0.0,71.000000,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
610,3.0,4106.0,0.0,40.000000,180.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
611,1.0,8072.0,240.0,253.000000,360.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
612,2.0,7583.0,0.0,187.000000,360.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0


In [113]:
k_x_train=x_train[['Dependents', 'ApplicantIncome']]
k_x_test=x_test[['Dependents', 'ApplicantIncome']]
            

In [118]:
K=KNeighborsClassifier(n_neighbors=25,weights='uniform')

In [122]:
K.fit(k_x_train,y_train)

KNeighborsClassifier(n_neighbors=25)

In [123]:
k_y_test=K.predict(k_x_test)
k_y_test=pd.DataFrame(k_y_test)

In [124]:
k_knn=test_data.Loan_ID
k_knn=pd.DataFrame(k_knn)
k_knn['Loan_Status']=k_y_test
k_knn.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [125]:
k_knn.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\k_knn.csv')

In [126]:
k_knn.Loan_Status.value_counts()

Y    360
N      7
Name: Loan_Status, dtype: int64

In [135]:
bestfeatures = SelectKBest(score_func=f_classif, k=10)
fit = bestfeatures.fit(scaled_x_train,new_y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(scaled_x_train.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))  #print 10 best features

                Specs       Score
5      Credit_History  363.509935
10          Semiurban   23.742512
7         Married_yes    6.847113
8        Not Graduate    6.463758
2   CoapplicantIncome    3.222894
11              Urban    2.760412
4    Loan_Amount_Term    1.860989
1     ApplicantIncome    1.287270
6         Gender_Male    0.454746
3          LoanAmount    0.174760


In [136]:
def GridSearch_DT(x_train,y_train):
    dt=DecisionTreeClassifier(random_state=0)
    dt_params={'max_depth':np.arange(1,100),'min_samples_leaf':np.arange(1,75),'criterion':['gini','entropy']}
    GS_dt=GridSearchCV(dt,dt_params,cv=3,scoring='roc_auc')
    GS_dt.fit(x_train,y_train)
    print(GS_dt.best_params_)

In [137]:
GridSearch_DT(x_train,y_train)

{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 68}


In [138]:
D=DecisionTreeClassifier(criterion='gini', max_depth= 4, min_samples_leaf= 68)

In [139]:
D.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=4, min_samples_leaf=68)

In [140]:
dt_y_test=D.predict(x_test)

In [141]:
dt_y_test=pd.DataFrame(dt_y_test)
final_dt=test_data.Loan_ID
final_dt=pd.DataFrame(final_dt)
final_dt['Loan_Status']=dt_y_test
final_dt.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [143]:
final_dt.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\sample_submission_dt_without_smote.csv')

# coding dependent variable

In [145]:
i_data.Dependents.replace({0:0,1:1,2:2,3:'3+'},inplace=True)

In [146]:
i_data.head(20)

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_yes,Not Graduate,Self_Employed_yes,Semiurban,Urban
0,0,5849.0,0.0,123.933538,360.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1,4583.0,1508.0,128.0,360.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0,3000.0,0.0,66.0,360.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
3,0,2583.0,2358.0,120.0,360.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,0,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
5,2,5417.0,4196.0,267.0,360.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
6,0,2333.0,1516.0,95.0,360.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
7,3+,3036.0,2504.0,158.0,360.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
8,2,4006.0,1526.0,168.0,360.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
9,1,12841.0,10968.0,349.0,360.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0


In [147]:
train_data.head(20)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,1
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,1
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,0
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,1
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,0


In [313]:
from sklearn.decomposition import PCA
b=[]
for i in range(1,14):
    pca = PCA(n_components=i)
    principalComponents = pca.fit_transform(new_x_train)
    a=pca.explained_variance_ratio_
    print ('Component No')
    print ( i)
    print ('Variance Captured')
    print (a.sum())
    print ('----------------------------------------------------------')
    b.append(a.sum())
    a=[]

Component No
1
Variance Captured
0.7530224907481091
----------------------------------------------------------
Component No
2
Variance Captured
0.9998169202632191
----------------------------------------------------------
Component No
3
Variance Captured
0.9999184063131924
----------------------------------------------------------
Component No
4
Variance Captured
0.9999999626776128
----------------------------------------------------------
Component No
5
Variance Captured
0.9999999706586523
----------------------------------------------------------
Component No
6
Variance Captured
0.9999999770015738
----------------------------------------------------------
Component No
7
Variance Captured
0.9999999813561659
----------------------------------------------------------
Component No
8
Variance Captured
0.999999985115662
----------------------------------------------------------
Component No
9
Variance Captured
0.9999999886816421
----------------------------------------------------------
Co

In [315]:
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(x_test)
principalDf_x_train = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2','principal component 3', 'principal component 4','principal component 5'])

In [316]:
pca.explained_variance_ratio_

array([8.18393508e-01, 1.81375427e-01, 1.43688430e-04, 8.73233522e-05,
       1.18894086e-08])

In [317]:
from sklearn.decomposition import PCA
b=[]
for i in range(1,14):
    pca = PCA(n_components=i)
    principalComponents = pca.fit_transform(x_test)
    a=pca.explained_variance_ratio_
    print ('Component No')
    print ( i)
    print ('Variance Captured')
    print (a.sum())
    print ('----------------------------------------------------------')
    b.append(a.sum())
    a=[]

Component No
1
Variance Captured
0.8183935082752367
----------------------------------------------------------
Component No
2
Variance Captured
0.9997689348415766
----------------------------------------------------------
Component No
3
Variance Captured
0.9999126232716635
----------------------------------------------------------
Component No
4
Variance Captured
0.9999999466238715
----------------------------------------------------------
Component No
5
Variance Captured
0.99999995851328
----------------------------------------------------------
Component No
6
Variance Captured
0.9999999675394733
----------------------------------------------------------
Component No
7
Variance Captured
0.9999999736119837
----------------------------------------------------------
Component No
8
Variance Captured
0.9999999787294657
----------------------------------------------------------
Component No
9
Variance Captured
0.9999999833584536
----------------------------------------------------------
Com

In [302]:
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(x_test)
principalDf_x_test = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2','principal component 3', 'principal component 4','principal component 5'])

In [318]:
from sklearn.ensemble import RandomForestClassifier

In [344]:
rf=RandomForestClassifier(random_state=0)
rf_params={'n_estimators':np.arange(1,150)}

In [345]:
GS_rf=GridSearchCV(rf,rf_params,cv=3,scoring='roc_auc')

In [346]:
GS_rf.fit(new_x_train,new_y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0),
             param_grid={'n_estimators': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149])},
             scoring='roc_auc')

In [347]:
GS_rf.best_params_

{'n_estimators': 128}

In [348]:
rf=RandomForestClassifier(n_estimators=128)

In [372]:
rf.fit(rf_x_train,y_train)

RandomForestClassifier(n_estimators=128)

In [373]:
rf.feature_importances_

array([0.31019698, 0.25674032, 0.23071407, 0.13339561, 0.02408428,
       0.02535593, 0.01951281])

In [359]:
df=pd.DataFrame(new_x_train.columns)
df.columns=['features']

In [360]:
df['score']=rf.feature_importances_

In [366]:
df.sort_values(ascending=False,by='score',inplace=True)

In [367]:
df

Unnamed: 0,features,score
4,Credit_History,0.286943
0,ApplicantIncome,0.14306
2,LoanAmount,0.137201
1,CoapplicantIncome,0.085119
9,Semiurban,0.068429
6,Married_yes,0.043994
10,Urban,0.040958
3,Loan_Amount_Term,0.040742
7,Not Graduate,0.036911
5,Gender_Male,0.033058


In [368]:
df.features

4        Credit_History
0       ApplicantIncome
2            LoanAmount
1     CoapplicantIncome
9             Semiurban
6           Married_yes
10                Urban
3      Loan_Amount_Term
7          Not Graduate
5           Gender_Male
12                    2
11                    1
8     Self_Employed_yes
13                   3+
Name: features, dtype: object

In [370]:
rf_x_train=x_train[['Credit_History' ,   'ApplicantIncome',
        'LoanAmount',
     'CoapplicantIncome',
             'Semiurban',
           'Married_yes',
                'Urban']]

rf_x_test=x_test[['Credit_History' ,   'ApplicantIncome',
        'LoanAmount',
     'CoapplicantIncome',
             'Semiurban',
           'Married_yes',
                'Urban']]

In [375]:
rf_y_test=pd.DataFrame(rf.predict(rf_x_test))

In [376]:
final_rf=pd.DataFrame(test_data.Loan_ID)
final_rf['Loan_Status']=rf_y_test

In [377]:
final_rf.Loan_Status.value_counts()

1    284
0     83
Name: Loan_Status, dtype: int64

In [378]:
final_rf.Loan_Status.replace({1:'Y',0:'N'},inplace=True)

In [379]:
final_rf.to_csv('C:\\Users\\ADMIN\\Desktop\\datasets\\Loan Prediction AV\\final_rf.csv')

In [338]:
auc_avg = []
auc_var = []
for ne in np.arange(1,100):
    RF = RandomForestClassifier(n_estimators = ne , random_state=0 )
    kfold = model_selection.KFold(shuffle = True , n_splits = 5 , random_state = 0)
    auc = model_selection.cross_val_score(RF , x_train , y_train , cv = kfold , 
                                                 scoring = 'roc_auc')
    auc_var.append(np.var(auc, ddof = 1))
    auc_avg.append(1 - np.mean(auc))
print(auc_var)
print(auc_avg)

NameError: name 'model_selection' is not defined