In [1]:
#Import libraries

#System level library
import sys

#Library for scientific computation
import scipy

#Library for graph plotting (For visualization)
import matplotlib

#Library for analyzing tabular data
import pandas

#Library to perform vector/array operation
import numpy

#Library for machine learning algorithm
import sklearn

import seaborn

In [2]:
#Import necessary libraries

from pandas.plotting import scatter_matrix

from matplotlib import pyplot as plt

from sklearn import model_selection

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [36]:
trainurl = '/Users/ayushjain/Downloads/Loan Prediction Project/clean data/cleaned_train_data.csv'
testurl = '/Users/ayushjain/Downloads/Loan Prediction Project/clean data/cleaned_test_data.csv'

train = pandas.read_csv(trainurl)
test = pandas.read_csv(testurl)

train = train.sample(frac=1).reset_index(drop=True)

#Convert all the non-numeric data to numeric data by using dummies function from pandas library
cols = ['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area']

traindata = pandas.get_dummies(train, columns = cols)
testdata = pandas.get_dummies(test, columns = cols)

Y = traindata.Loan_Status

loanid = test.Loan_ID

traindata = traindata.drop(['Loan_ID', 'Loan_Status'], axis = 1)

X = traindata.astype(numpy.float32).reset_index(drop=True)

testdata = testdata.drop('Loan_ID', axis = 1)

testdata.insert(8, 'Married_Unknown', 0)

#Using the train_test_split() to create both training and test data.
#Training data is used to create the model
#Test data is used for validation of the model which was created using training data

#Let's split the full_data in 80-20, where 80% is training data & test_data as 20%
test_data_size = 0.20

train_test_list = model_selection.train_test_split(X, Y, test_size = test_data_size, random_state = 10)

X_train_data = train_test_list[0]

X_test_data = train_test_list[1]

Y_train_data = train_test_list[2]

Y_test_data = train_test_list[3]

#We have splitted the data into training and test data.

In [37]:
#We will now run the Gradient Boosting Algorithm on the training data and fit the test data. 

#For the one with which we get highest accuracy, we will take that parameters and fit the model using that parameters.

#Then we estimate the Loan_Status on our test data set.

#Learning rate : It determines the impact of each tree on the final outcome.
#number of estimators : Number of sequential trees to be modeled.
#The more the number of estimators, the better. But beware, we may overfit the model from training data.
#Choosing the value for n_estimator is critical.
learning_rates = [0.05, 0.25, 0.5, 0.75, 1]

n_estimators = [20,50,100,200,300,400]

for n_estimator in n_estimators:
    for learning_rate in learning_rates:
        gb = GradientBoostingClassifier(n_estimators = n_estimator, learning_rate = learning_rate, max_features = 2, max_depth = 2)
        gb.fit(X_train_data, Y_train_data)
        print('For learning rate:'+str(learning_rate))
        print('Estimators:'+ str(n_estimator))
        print('Score:'+str(gb.score(X_train_data, Y_train_data)))
        print()
        
#We see that with 400 estimators we get almost 100% accuracy, but by choosing 400 estimators we are overfitting training dataset.

#Choosing the parameters is very CRITICAL.




For learning rate:0.05
Estimators:20
Score:0.7107942973523421

For learning rate:0.25
Estimators:20
Score:0.8167006109979633

For learning rate:0.5
Estimators:20
Score:0.8187372708757638

For learning rate:0.75
Estimators:20
Score:0.8289205702647657

For learning rate:1
Estimators:20
Score:0.8289205702647657

For learning rate:0.05
Estimators:50
Score:0.8065173116089613

For learning rate:0.25
Estimators:50
Score:0.8268839103869654

For learning rate:0.5
Estimators:50
Score:0.8492871690427699

For learning rate:0.75
Estimators:50
Score:0.8615071283095723

For learning rate:1
Estimators:50
Score:0.8757637474541752

For learning rate:0.05
Estimators:100
Score:0.8167006109979633

For learning rate:0.25
Estimators:100
Score:0.835030549898167

For learning rate:0.5
Estimators:100
Score:0.879837067209776

For learning rate:0.75
Estimators:100
Score:0.8940936863543788

For learning rate:1
Estimators:100
Score:0.9103869653767821

For learning rate:0.05
Estimators:200
Score:0.8187372708757638



In [38]:
#Let's choose number of estimators as 300 and learning rate as 0.75

model = GradientBoostingClassifier(n_estimators = 300, learning_rate = 0.75, max_features = 2, max_depth = 2)

#Fit the model on our training dataset
model.fit(X, Y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.75, loss='deviance', max_depth=2,
              max_features=2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [40]:
#We have our model ready.
#Let's make predictions on the test dataset

predictions = model.predict(testdata)

predictions = pandas.Series(predictions)

final = pandas.concat([loanid, predictions], names = ['Loan_ID', 'Loan_Status'], axis = 1)

final

Unnamed: 0,Loan_ID,0
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,N
5,LP001054,Y
6,LP001055,Y
7,LP001056,N
8,LP001059,Y
9,LP001067,Y


In [None]:
#We have successfully imputed the values on the test dataset.