In [1]:
import numpy as np
import pandas as pd
import warnings
from pylab import rcParams
from sklearn import metrics, preprocessing, tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
%matplotlib inline
rcParams['figure.figsize'] = 6, 6
warnings.filterwarnings("ignore")

In [3]:
file_loc = './loan_prediction.csv'

In [4]:
df = pd.read_csv(file_loc)

In [5]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849,0.0,0.0,360.0,1.0,1
1,4583,1508.0,128.0,360.0,1.0,0
2,3000,0.0,66.0,360.0,1.0,1
3,2583,2358.0,120.0,360.0,1.0,1
4,6000,0.0,141.0,360.0,1.0,1


In [6]:
df.shape

(614, 6)

In [7]:
clf = RandomForestClassifier(n_estimators=100, oob_score=True)

In [8]:
X = df.iloc[:,0:len(df.columns)-1].values
Y = df.iloc[:,-1].values

In [9]:
X.shape

(614, 5)

In [10]:
Y.shape

(614,)

In [11]:
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [12]:
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [13]:
clf.score(X_train, Y_train)

1.0

In [14]:
clf.oob_score_

0.7152173913043478

In [15]:
clf_pred = clf.predict(X_test)

In [16]:
accuracy_score(Y_test, clf_pred)

0.7662337662337663

In [17]:
parameters = {'max_depth':[1,2,3,4,5], 
              'min_samples_leaf':[1,2,3,4,5], 
              'min_samples_split':[2,3,4,5],
              'criterion' : ['gini','entropy']}
scorer = make_scorer(accuracy_score)

In [18]:
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

In [19]:
grid_fit = grid_obj.fit(X_train, Y_train)

In [20]:
best_clf = grid_fit.best_estimator_

In [21]:
best_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [22]:
best_clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [23]:
best_clf.score(X_train, Y_train)

0.7630434782608696

In [24]:
best_clf.oob_score_

0.7543478260869565

In [25]:
best_clf_pred = best_clf.predict(X_test)

In [26]:
accuracy_score(Y_test, best_clf_pred)

0.7922077922077922