# Hyperparameter Tuning of Decision Trees

In [4]:
# importing essential libraries
import numpy as np
import pandas as pd
import pydotplus
from IPython.display import Image
from pylab import rcParams
from sklearn import metrics, preprocessing, tree
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from io import StringIO
import warnings
%matplotlib inline

In [5]:
rcParams['figure.figsize'] = 6,6
warnings.filterwarnings('ignore')

In [6]:
# making a function to plot decision tree
def plot_decision_tree(clf,feature_name,target_name):
    '''
    inputs:
    clf : model
    feature_name : independent variable columns
    target_name : dependent variable column
    '''
    dot_data = StringIO()  
    tree.export_graphviz(clf, out_file=dot_data,  
                         feature_names=feature_name,  
                         class_names=target_name,  
                         filled=True, rounded=True,  
                         special_characters=True)  
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    return Image(graph.create_png())

In [7]:
# loading the dataset
df = pd.read_csv('loan_prediction.csv')
print(df.shape)
df.head()

(614, 6)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849,0.0,0.0,360.0,1.0,1
1,4583,1508.0,128.0,360.0,1.0,0
2,3000,0.0,66.0,360.0,1.0,1
3,2583,2358.0,120.0,360.0,1.0,1
4,6000,0.0,141.0,360.0,1.0,1


In [8]:
# importing decision tree and making an instance of it
from sklearn.tree import DecisionTreeClassifier as dt
clf = dt()

In [9]:
clf

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [10]:
# segregating dependent and independent variables
X = df.iloc[:,0:len(df.columns)-1].values
Y = df.iloc[:,-1].values

In [11]:
print("Shape of x",X.shape)
print("Shape of y",Y.shape)

Shape of x (614, 5)
Shape of y (614,)


In [12]:
# splitting into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,\
                                    test_size=0.25,\
                                    random_state=0)
print(X_train.shape)
print(X_test.shape)

(460, 5)
(154, 5)


In [13]:
# obtaining the cross validation score
scores = cross_val_score(clf, X_train, Y_train, cv=5,\
         scoring='f1_macro')
scores.mean()

0.6337042886760117

In [14]:
# fitting the model
clf.fit(X_train,Y_train)
# finding training set and test set predictions
train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)

In [15]:
# printing the parameters of the model
clf

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [16]:
# segregating feature columns and target column
feature_cols = df.columns[0:len(df.columns)-1]
target_cols = df.columns[-1]

In [17]:
# plotting decision tree
plot_decision_tree(clf,feature_cols,target_cols)

InvocationException: GraphViz's executables not found

In [18]:
# printing f1 score of training and test set
print('The training F1 Score is' ,\
      f1_score(train_predictions, Y_train))
print('The Test F1 Score is ',\
      f1_score(test_predictions, Y_test ))

The training F1 Score is 1.0
The Test F1 Score is  0.7579908675799087


In [19]:
# making parameter grid/dictionary for GridSearchCV 
parameters = {'max_depth':[1,2,3,4,5],\
              'min_samples_leaf':[1,2,3,4,5],\
             'min_samples_split':[2,3,4,5],\
             'criterion':['gini','entropy']}

# make_Scorer is used in GridSearchCV for scoring the models
scorer = make_scorer(f1_score)

In [20]:
# Making an instance of GridSearchCV and fitting it on X_train and y_train
# And finding the best estimator
grid_obj = GridSearchCV(clf, parameters, scoring = scorer )
grid_fit = grid_obj.fit(X_train, Y_train)
best_clf = grid_fit.best_estimator_
best_clf

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [24]:
# plotting best estimator
plot_decision_tree(best_clf, feature_cols, target_cols)

InvocationException: GraphViz's executables not found

In [22]:
# finding the cross validation score of best estimator using F1 scoring
scores = cross_val_score(best_clf, X_train, Y_train, cv=5,\
                        scoring='f1_macro')
scores.mean()

0.7058924321624135

In [23]:
best_clf.fit(X_train, Y_train)

best_train_predictions = best_clf.predict(X_train)
best_test_predictions = best_clf.predict(X_test)

# Training and test set scores of the best estimator

print("The Training F1 Score ",\
      f1_score(best_train_predictions, Y_train))
print("The testing F1 Score ",\
     f1_score(best_test_predictions,Y_test))

The Training F1 Score  0.8360902255639098
The testing F1 Score  0.8620689655172413
