In [1]:
# import
import pandas as pd
import numpy as np
from collections import defaultdict

from sklearn.preprocessing import Imputer, LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
# reading the data
data = pd.read_csv("train.csv")
data[:4]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y


In [3]:
#let's remove all the rows which is having nulls
# train = train[~train.isnull()]
data = data.dropna()

In [4]:
enc = dict()

# Encoding the variable
for x in ['Gender', 'Married', 'Education', 'Dependents', 'Self_Employed', 'Property_Area']:
    enc[x]=LabelEncoder()
    enc[x].fit(data[x])
    print(enc[x].classes_)
    data[x]=enc[x].transform(data[x])
#print(enc)

#fit = data.apply(lambda x: d[x.name].fit_transform(x))

# Inverse the encoded
#fit.apply(lambda x: d[x.name].inverse_transform(x))

# Using the dictionary to label future data
#data.apply(lambda x: d[x.name].transform(x))

['Female' 'Male']
['No' 'Yes']
['Graduate' 'Not Graduate']
['0' '1' '2' '3+']
['No' 'Yes']
['Rural' 'Semiurban' 'Urban']


In [5]:
data_id = data.iloc[:, 0]

In [6]:
# scaling the data
sc = StandardScaler()
data.iloc[:,1:-1] = sc.fit_transform(data.iloc[:,1:-1])

In [7]:
# Splitting the data into X and y
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

print(X[:2])
print(y[:2])

     Gender   Married  Dependents  Education  Self_Employed  ApplicantIncome  \
1  0.467198  0.737162    0.218599  -0.503253      -0.399275        -0.137970   
2  0.467198  0.737162   -0.762033  -0.503253       2.504541        -0.417536   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
1          -0.027952   -0.208089          0.275542        0.413197   
2          -0.604633   -0.979001          0.275542        0.413197   

   Property_Area  
1      -1.318868  
2       1.259774  
1    N
2    Y
Name: Loan_Status, dtype: object


In [8]:
# Split the data into 25% test and 75% training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

** Implementing RandomForestClassifier **

In [9]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10000, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [10]:
# Print the name and gini importance of each feature
for feature in zip(X_train.columns, clf.feature_importances_):
    print(feature)

('Gender', 0.021641038906192175)
('Married', 0.029685146387984803)
('Dependents', 0.049374880515425511)
('Education', 0.021661303118526379)
('Self_Employed', 0.020061592095794289)
('ApplicantIncome', 0.20259018981388396)
('CoapplicantIncome', 0.11398999775876162)
('LoanAmount', 0.20349136926820866)
('Loan_Amount_Term', 0.044481963103573892)
('Credit_History', 0.24249326406846211)
('Property_Area', 0.050529254963187553)


In [11]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.01
sfm = SelectFromModel(clf, threshold=0.02)

In [12]:
sfm.fit(X_train, y_train)

# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(X_train.columns[feature_list_index])

Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area


In [13]:
# Transform the data to create a new dataset containing only the most important features
# Note: We have to apply the transform to both the training X and test X data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [14]:
# Create a new random forest classifier for the most important features
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the new classifier on the new dataset containing the most important features
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10000, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [15]:
# Apply The Full Featured Classifier To The Test Data
y_pred = clf.predict(X_test)

# View The Accuracy Of Our Full Feature Model
accuracy_score(y_test, y_pred)

0.77500000000000002

In [16]:
# Apply The Full Featured Classifier To The Test Data
y_important_pred = clf_important.predict(X_important_test)

# View The Accuracy Of Our Limited Feature  Model
accuracy_score(y_test, y_important_pred)

0.77500000000000002

As we can see there is no improvement after selecting some features from the given dataset  
** Implementing Logistic Regression **

In [17]:
clf2 = LogisticRegression()

clf2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
y2_pred = clf2.predict(X_test)

accuracy_score(y_test, y2_pred)

0.77500000000000002

** Implementing LogisticRegression with KFold **

In [19]:
kf = KFold(n_splits=5)
scores = cross_val_score(X=X_train, y=y_train, cv=kf, estimator=clf2, n_jobs=1)
print(scores)
print(np.mean(scores))

[ 0.79166667  0.77777778  0.84722222  0.84722222  0.80555556]
0.813888888889


** Implementing DecisionTreeClassifier **

In [20]:
clf3 = DecisionTreeClassifier()

clf3.fit(X_train, y_train)

y3_pred = clf3.predict(X_test)

accuracy_score(y_test, y3_pred)

0.68333333333333335

In [21]:
clf4 = BernoulliNB()

clf4.fit(X_train, y_train)

y4_pred = clf3.predict(X_test)

accuracy_score(y_test, y4_pred)

0.68333333333333335

In [22]:
kf = KFold(n_splits=5)
scores = cross_val_score(X=X_train, y=y_train, cv=kf, estimator=clf3, n_jobs=1)
print(scores)
print(np.mean(scores))

[ 0.76388889  0.73611111  0.77777778  0.63888889  0.65277778]
0.713888888889


In [23]:
test_data = pd.read_csv('test.csv')
test_data[:4]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban


In [24]:
test_data = test_data.dropna()

In [25]:
enc = dict()

# Encoding the variable
for x in ['Gender', 'Married', 'Education', 'Dependents', 'Self_Employed', 'Property_Area']:
    enc[x]=LabelEncoder()
    enc[x].fit(test_data[x])
    print(enc[x].classes_)
    test_data[x]=enc[x].transform(test_data[x])

['Female' 'Male']
['No' 'Yes']
['Graduate' 'Not Graduate']
['0' '1' '2' '3+']
['No' 'Yes']
['Rural' 'Semiurban' 'Urban']


In [26]:
test_data_id = test_data.iloc[:, 0]

# scaling the data
sc = StandardScaler()
test_data.iloc[:,1:-1] = sc.fit_transform(test_data.iloc[:,1:-1])

# Splitting the data into X and y
X_test = test_data.iloc[:, 1:]

In [27]:
y_test_pred = clf2.predict(X_test)

In [30]:
#y_test_pred = ['Y' if x==1 else 'N' for x in list(y_test_pred)]

result = pd.DataFrame(list(zip(test_data_id, y_test_pred)))

pd.DataFrame.to_csv(result, 'result.csv', index=None, header = ['Loan_ID','Loan_Status'])

In [29]:
list(zip(test_data_id, y_test_pred))

[('LP001015', 'Y'),
 ('LP001022', 'Y'),
 ('LP001031', 'Y'),
 ('LP001051', 'Y'),
 ('LP001054', 'Y'),
 ('LP001055', 'Y'),
 ('LP001056', 'N'),
 ('LP001067', 'Y'),
 ('LP001078', 'Y'),
 ('LP001096', 'Y'),
 ('LP001099', 'Y'),
 ('LP001105', 'Y'),
 ('LP001107', 'Y'),
 ('LP001108', 'Y'),
 ('LP001115', 'Y'),
 ('LP001121', 'Y'),
 ('LP001124', 'Y'),
 ('LP001135', 'Y'),
 ('LP001149', 'Y'),
 ('LP001153', 'N'),
 ('LP001169', 'Y'),
 ('LP001176', 'Y'),
 ('LP001177', 'Y'),
 ('LP001183', 'Y'),
 ('LP001185', 'Y'),
 ('LP001187', 'Y'),
 ('LP001190', 'Y'),
 ('LP001203', 'N'),
 ('LP001210', 'Y'),
 ('LP001211', 'Y'),
 ('LP001219', 'Y'),
 ('LP001220', 'Y'),
 ('LP001221', 'Y'),
 ('LP001226', 'Y'),
 ('LP001230', 'Y'),
 ('LP001231', 'Y'),
 ('LP001242', 'Y'),
 ('LP001270', 'Y'),
 ('LP001284', 'Y'),
 ('LP001291', 'Y'),
 ('LP001298', 'Y'),
 ('LP001312', 'Y'),
 ('LP001313', 'N'),
 ('LP001317', 'Y'),
 ('LP001321', 'Y'),
 ('LP001323', 'N'),
 ('LP001324', 'Y'),
 ('LP001332', 'Y'),
 ('LP001335', 'Y'),
 ('LP001338', 'Y'),
