In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [4]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=0)

In [5]:
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
print("Without hyper parameter tunning, the accuracy is = ",dt.score(x_test,y_test))

Without hyper parameter tunning, the accuracy is =  0.7402597402597403


In [6]:
path = dt.cost_complexity_pruning_path(x_train,y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_alphas = list(ccp_alphas)
ccp_alphas

[0.0,
 0.0016324626865671641,
 0.0016791044776119407,
 0.0017324093816631135,
 0.001812366737739872,
 0.002238805970149254,
 0.0023361453601557416,
 0.0024626865671641797,
 0.0024875621890547263,
 0.0024875621890547263,
 0.0024875621890547263,
 0.0024875621890547263,
 0.0024875621890547263,
 0.0024875621890547263,
 0.0025393864013266996,
 0.002660309563294638,
 0.0026687988628287134,
 0.002747625508819538,
 0.0027784156142365065,
 0.002798507462686567,
 0.0028617780661907865,
 0.003126939559627607,
 0.003135364842454396,
 0.0031982942430703633,
 0.0031982942430703633,
 0.0031982942430703633,
 0.0032662318658806793,
 0.0033056443970623066,
 0.0033167495854063015,
 0.0033167495854063015,
 0.0033582089552238797,
 0.0033582089552238814,
 0.0033921302578018998,
 0.0036263134063119207,
 0.0036551934206518442,
 0.00367211561241412,
 0.0037716180999763085,
 0.004132697855261507,
 0.004227116167414675,
 0.004443926202321725,
 0.004875621890547259,
 0.005192568625404446,
 0.005772109892134769,
 

In [7]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth': [int(x) for x in np.linspace(start=2, stop=100, num = 20)],
    'min_samples_leaf':[int(x) for x in np.linspace(start=1, stop=20, num=20)],
    'max_features': ['auto','sqrt','log2'],
    'min_samples_leaf': [int(x) for x in np.linspace(start=1, stop=20, num = 10)]
}

In [8]:
grid = GridSearchCV(estimator=dt,param_grid=param_grid,verbose=3, n_jobs=-1,cv=10, scoring='roc_auc')
grid.fit(x_train,y_train)

Fitting 10 folds for each of 2400 candidates, totalling 24000 fits


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 7, 12, 17, 22, 27, 32, 38, 43, 48, 53,
                                       58, 63, 69, 74, 79, 84, 89, 94, 100],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 3, 5, 7, 9, 11, 13, 15, 17,
                                              20],
                         'splitter': ['best', 'random']},
             scoring='roc_auc', verbose=3)

In [9]:
grid.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=27, max_features='log2',
                       min_samples_leaf=20)

In [10]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 27,
 'max_features': 'log2',
 'min_samples_leaf': 20,
 'splitter': 'best'}

In [11]:
grid.best_score_

0.7988616762494472

In [12]:
grid.score(x_test,y_test)

0.7894392033542976

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
rf = GradientBoostingClassifier()
rf.fit(x_train,y_train)
rf.score(x_test,y_test)

0.7748917748917749

In [15]:
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate':[float(x) for x in np.linspace(start=0.1, stop=1, num=5)],
    'criterion': ['friedman_mse', 'mse', 'mae'],
    'max_depth': [int(x) for x in np.linspace(start=2, stop=100, num = 20)],
    'min_samples_leaf':[int(x) for x in np.linspace(start=1, stop=20, num=20)],
    'max_features': ['auto','sqrt','log2']
}
    

In [None]:
grid = GridSearchCV(estimator=rf,param_grid=param_grid,verbose=3, n_jobs=-1,cv=10, scoring='roc_auc')
grid.fit(x_train,y_train)

Fitting 10 folds for each of 18000 candidates, totalling 180000 fits
