In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from catboost import CatBoostClassifier

In [2]:
df = pd.read_csv('diabetes.csv')

X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=0)

rf = RandomForestClassifier()
rf.fit(x_train,y_train)
rf.score(x_test,y_test)

0.7662337662337663

# Using Hyper Parameter Tunning

In [3]:
n_estimators      = [int(x) for x in np.linspace(start=100,stop=2000,num=50)]
criterion         = ['gini', 'entropy']
max_depth         = [int(x) for x in np.linspace(start=10, stop=100, num=50)]
min_samples_split = [int(x) for x in np.linspace(start=2,stop=100, num=50)]
min_samples_leaf  = [int(x) for x in np.linspace(start=1, stop=100, num=50)]
max_features      = ['auto', 'sqrt', 'log2']
max_leaf_nodes    = [int(x) for x in np.linspace(start=2,stop=100, num=50)]

In [4]:
random_grid = {
    'n_estimators'     : n_estimators,
    'criterion'        : criterion,
    'max_depth'        : max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf' :min_samples_leaf,
    'max_features'     : max_features,
    'max_leaf_nodes'   :max_leaf_nodes
    
}

In [5]:
rf = RandomForestClassifier()
randomCv_ = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,n_iter= 100,cv=3,n_jobs=-1, scoring='accuracy')
randomCv_.fit(x_train,y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 11, 13, 15, 17, 19,
                                                      21, 22, 24, 26, 28, 30,
                                                      32, 33, 35, 37, 39, 41,
                                                      43, 44, 46, 48, 50, 52,
                                                      54, 55, 57, 59, 61, 63, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'max_leaf_nodes': [2, 4, 6, 8, 10, 12,
                                                           14, 16, 18, 20, 22,
                                                           24, 26, 28, 30, 32,
                                                   

In [6]:
randomCv_.best_params_

{'n_estimators': 1069,
 'min_samples_split': 86,
 'min_samples_leaf': 17,
 'max_leaf_nodes': 26,
 'max_features': 'log2',
 'max_depth': 77,
 'criterion': 'gini'}

In [7]:
randomCv_.best_score_

0.759274370723746

In [8]:
randomCv_.best_estimator_

RandomForestClassifier(max_depth=77, max_features='log2', max_leaf_nodes=26,
                       min_samples_leaf=17, min_samples_split=86,
                       n_estimators=1069)

In [9]:
random_ = randomCv_.best_estimator_
random_.predict(x_test)
print("Using Random Forest, the accuracy is = ",random_.score(x_test,y_test))

Using Random Forest, the accuracy is =  0.7878787878787878


In [10]:
param_distributions = {
    'loss'              : ['deviance', 'exponential'],
    'learning_rate'     : [float(x) for x in np.linspace(start=0.1, stop= 1.0, num=10)],
    'n_estimators'      : [int(x) for x in np.linspace(start=100,stop=2000,num=50)],
    'criterion'         : ['friedman_mse', 'mse', 'mae'],
    'min_samples_split' : [int(x) for x in np.linspace(start=2,stop=100, num=50)],
    'max_depth'         : [int(x) for x in np.linspace(start=10, stop=100, num=50)]
    
}

In [11]:
gb      = GradientBoostingClassifier()
random_ = RandomizedSearchCV(estimator=gb, param_distributions=param_distributions,cv=3,n_jobs=-1, scoring='accuracy')
random_.fit(x_train,y_train)

RandomizedSearchCV(cv=3, estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['friedman_mse', 'mse',
                                                      'mae'],
                                        'learning_rate': [0.1, 0.2,
                                                          0.30000000000000004,
                                                          0.4, 0.5, 0.6,
                                                          0.7000000000000001,
                                                          0.8, 0.9, 1.0],
                                        'loss': ['deviance', 'exponential'],
                                        'max_depth': [10, 11, 13, 15, 17, 19,
                                                      21, 22, 24, 26, 28, 30,
                                                      32, 33, 35, 37, 39, 41,
                                                      43, 44, 46, 48, 50, 52,
                        

In [12]:
random_.best_params_

{'n_estimators': 1340,
 'min_samples_split': 34,
 'max_depth': 41,
 'loss': 'exponential',
 'learning_rate': 0.9,
 'criterion': 'mse'}

In [13]:
random_.best_estimator_

GradientBoostingClassifier(criterion='mse', learning_rate=0.9,
                           loss='exponential', max_depth=41,
                           min_samples_split=34, n_estimators=1340)

In [14]:
print("Using Gradient Boosting, the accuracy is = ",random_.score(x_test,y_test))

Using Gradient Boosting, the accuracy is =  0.7272727272727273


In [15]:
n_estimators = randomCv_.best_params_['n_estimators']
min_samples_split = randomCv_.best_params_['min_samples_split']
min_samples_leaf = randomCv_.best_params_['min_samples_leaf']
max_leaf_nodes = randomCv_.best_params_['max_leaf_nodes']
max_features = randomCv_.best_params_['max_features']
max_depth  = randomCv_.best_params_['max_depth']
criterion = randomCv_.best_params_['criterion']

In [16]:
param_grid_ ={
    'n_estimators': [n_estimators, n_estimators+100,n_estimators-150,n_estimators+200],
    'min_samples_split': [min_samples_split,min_samples_split+10,min_samples_split+20],
    'min_samples_leaf' :[min_samples_leaf,min_samples_leaf+10,min_samples_leaf+20],
    'max_leaf_nodes': [max_leaf_nodes,max_leaf_nodes+10,max_leaf_nodes+20],
    'max_features'  :['log2','auto','sqrt'],
    'max_depth': [max_depth,max_depth+10,max_depth+20],
    'criterion': [criterion]
}

In [17]:
rf = RandomForestClassifier()
grid_ = GridSearchCV(estimator=rf,param_grid=param_grid_,cv=3,n_jobs=-1,scoring='accuracy')
grid_.fit(x_train,y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [77, 87, 97],
                         'max_features': ['log2', 'auto', 'sqrt'],
                         'max_leaf_nodes': [26, 36, 46],
                         'min_samples_leaf': [17, 27, 37],
                         'min_samples_split': [86, 96, 106],
                         'n_estimators': [1069, 1169, 919, 1269]},
             scoring='accuracy')

In [18]:
grid_.best_score_

0.7629987655095934

In [19]:
grid_.best_estimator_

RandomForestClassifier(max_depth=87, max_features='log2', max_leaf_nodes=46,
                       min_samples_leaf=37, min_samples_split=96,
                       n_estimators=919)

In [20]:
grid_.param_grid

{'n_estimators': [1069, 1169, 919, 1269],
 'min_samples_split': [86, 96, 106],
 'min_samples_leaf': [17, 27, 37],
 'max_leaf_nodes': [26, 36, 46],
 'max_features': ['log2', 'auto', 'sqrt'],
 'max_depth': [77, 87, 97],
 'criterion': ['gini']}

In [21]:
grid_.score(x_test,y_test)

0.7878787878787878

In [23]:
cat = CatBoostClassifier()
cat.fit(x_train,y_train)
print("catboost, the accuray is = ",cat.score(x_test,y_test))

Learning rate set to 0.007894
0:	learn: 0.6885926	total: 6.55ms	remaining: 6.54s
1:	learn: 0.6854476	total: 9.34ms	remaining: 4.66s
2:	learn: 0.6819666	total: 11.8ms	remaining: 3.91s
3:	learn: 0.6778276	total: 14.5ms	remaining: 3.6s
4:	learn: 0.6748451	total: 17.2ms	remaining: 3.42s
5:	learn: 0.6702339	total: 18.9ms	remaining: 3.12s
6:	learn: 0.6673895	total: 21.2ms	remaining: 3.01s
7:	learn: 0.6635487	total: 24.1ms	remaining: 2.99s
8:	learn: 0.6601041	total: 26ms	remaining: 2.87s
9:	learn: 0.6577907	total: 27ms	remaining: 2.67s
10:	learn: 0.6542430	total: 28.8ms	remaining: 2.59s
11:	learn: 0.6507346	total: 31.6ms	remaining: 2.6s
12:	learn: 0.6470389	total: 34.2ms	remaining: 2.6s
13:	learn: 0.6437158	total: 36.8ms	remaining: 2.59s
14:	learn: 0.6407461	total: 39ms	remaining: 2.56s
15:	learn: 0.6377673	total: 41.9ms	remaining: 2.58s
16:	learn: 0.6352130	total: 45ms	remaining: 2.6s
17:	learn: 0.6322626	total: 48.2ms	remaining: 2.63s
18:	learn: 0.6290161	total: 50.8ms	remaining: 2.62s
19:	