In [92]:
# import 
import pandas as pd
import numpy as np

from sklearn.preprocessing import Imputer, LabelEncoder, StandardScaler

from sklearn.model_selection import KFold, cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression
from sklearn.naive_bayes import BernoulliNB


from sklearn.metrics import accuracy_score

In [35]:
# reading train and test data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(len(train), len(test))

614 367


In [3]:
print(train[:3])

    Loan_ID Gender Married Dependents Education Self_Employed  \
0  LP001002   Male      No          0  Graduate            No   
1  LP001003   Male     Yes          1  Graduate            No   
2  LP001005   Male     Yes          0  Graduate           Yes   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0         Urban           Y  


In [9]:
# checking summary
print(train.describe())

       ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
count       614.000000         614.000000  592.000000         600.00000   
mean       5403.459283        1621.245798  146.412162         342.00000   
std        6109.041673        2926.248369   85.587325          65.12041   
min         150.000000           0.000000    9.000000          12.00000   
25%        2877.500000           0.000000         NaN               NaN   
50%        3812.500000        1188.500000         NaN               NaN   
75%        5795.000000        2297.250000         NaN               NaN   
max       81000.000000       41667.000000  700.000000         480.00000   

       Credit_History  
count      564.000000  
mean         0.842199  
std          0.364878  
min          0.000000  
25%               NaN  
50%               NaN  
75%               NaN  
max          1.000000  




In [14]:
# NULL checkng
print(sum(train['LoanAmount'].isnull()))
print(sum(train['Loan_Amount_Term'].isnull()))
print(sum(train['Credit_History'].isnull()))

22
14
50


In [36]:
# simple null checking
print(train.isnull().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


We need to handle the NULLs/NaN values first

In [37]:
print(train.isnull().sum().sum())

149


In [38]:
#let's remove all the rows which is having nulls
# train = train[~train.isnull()]
train = train.dropna()

In [39]:
# let's check the count
print(train.isnull().sum().sum())

0


In [40]:
# checking the summary once again
# checking summary
print(train.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [63]:
# now encoding the values
enc = LabelEncoder()
train['Gender'] = enc.fit_transform(train['Gender'])
train['Self_Employed'] = enc.fit_transform(train['Self_Employed'])
train['Married'] = enc.fit_transform(train['Married'])
train['Education'] = enc.fit_transform(train['Education'])
train['Property_Area'] = enc.fit_transform(train['Property_Area'])
train['Loan_Status'] = enc.fit_transform(train['Loan_Status'])
train['Dependents'] = enc.fit_transform(train['Dependents'])


In [64]:
train.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2,1


In [65]:
# now splittng the data into X and Y
X_train = train.iloc[:,1:-1]
X_train.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2


In [66]:
Y_train = train.iloc[:,-1]
Y_train.head(5)

1    0
2    1
3    1
4    1
5    1
Name: Loan_Status, dtype: int64

In [108]:
#clf = DecisionTreeClassifier()
#clf = LogisticRegression()
clf = BernoulliNB()
kf = KFold(n_splits=5)
scores = cross_val_score(X=X_train, y=Y_train, cv=kf, estimator=clf, n_jobs=1)
print(scores)
print(np.mean(scores))

[ 0.80208333  0.75        0.79166667  0.83333333  0.85416667]
0.80625


In [69]:
# now reading the test data
test = pd.read_csv("test.csv")
print(test.describe())

       ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
count       367.000000         367.000000  362.000000        361.000000   
mean       4805.599455        1569.577657  136.132597        342.537396   
std        4910.685399        2334.232099   61.366652         65.156643   
min           0.000000           0.000000   28.000000          6.000000   
25%        2864.000000           0.000000         NaN               NaN   
50%        3786.000000        1025.000000         NaN               NaN   
75%        5060.000000        2430.500000         NaN               NaN   
max       72529.000000       24000.000000  550.000000        480.000000   

       Credit_History  
count      338.000000  
mean         0.825444  
std          0.380150  
min          0.000000  
25%               NaN  
50%               NaN  
75%               NaN  
max          1.000000  




In [74]:
test = test.dropna()

# now encoding the values
enc = LabelEncoder()
test['Gender'] = enc.fit_transform(test['Gender'])
test['Self_Employed'] = enc.fit_transform(test['Self_Employed'])
test['Married'] = enc.fit_transform(test['Married'])
test['Education'] = enc.fit_transform(test['Education'])
test['Property_Area'] = enc.fit_transform(test['Property_Area'])
test['Dependents'] = enc.fit_transform(test['Dependents'])

test = test.iloc[:,1:]

In [75]:
test_pred = clf.predict(test)

In [83]:
print(test_pred[:5])
test_pred = enc.inverse_transform(test_pred)
print(test_pred[:5])

[1 1 1 1 0]
[1 1 1 1 0]
