In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
df = pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
import numpy as np
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin']=np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness']=np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])

In [5]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [5]:
X = df.drop('Outcome', axis=1)
y=df['Outcome']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=33)

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=10)
rf.fit(X_train,y_train)
prediction=rf.predict(X_test)

In [8]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))

[[85 14]
 [26 29]]
              precision    recall  f1-score   support

           0       0.77      0.86      0.81        99
           1       0.67      0.53      0.59        55

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154

0.7402597402597403


Manual Hyperparameter tuning

In [10]:
model=RandomForestClassifier(n_estimators=500,criterion='gini',max_features='sqrt',min_samples_leaf=10,random_state=100)
print(confusion_matrix(y_test,prediction))
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))

[[84 15]
 [27 28]]
              precision    recall  f1-score   support

           0       0.76      0.85      0.80        99
           1       0.65      0.51      0.57        55

    accuracy                           0.73       154
   macro avg       0.70      0.68      0.69       154
weighted avg       0.72      0.73      0.72       154

0.7272727272727273


Randomized Search

In [14]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [15]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [16]:
rf_randomcv.best_params_

{'n_estimators': 400,
 'min_samples_split': 14,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 670,
 'criterion': 'gini'}

In [18]:
rf_randomcv.best_estimator_

RandomForestClassifier(max_depth=670, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=14, n_estimators=400)

In [19]:
best_random_grid=rf_randomcv.best_estimator_

In [20]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[86 13]
 [25 30]]
Accuracy Score 0.7532467532467533
Classification report:               precision    recall  f1-score   support

           0       0.77      0.87      0.82        99
           1       0.70      0.55      0.61        55

    accuracy                           0.75       154
   macro avg       0.74      0.71      0.72       154
weighted avg       0.75      0.75      0.75       154



Grid SearchCV

In [21]:
rf_randomcv.best_params_

{'n_estimators': 400,
 'min_samples_split': 14,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 670,
 'criterion': 'gini'}

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+3, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 5],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'] - 5,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +3,
                          rf_randomcv.best_params_['min_samples_split'] + 6],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 300]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [670], 'max_features': ['sqrt'], 'min_samples_leaf': [2, 5, 7], 'min_samples_split': [13, 9, 14, 17, 20], 'n_estimators': [200, 300, 400, 500, 700]}


In [28]:
#### Fit the grid_search to the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [670],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [2, 5, 7],
                         'min_samples_split': [13, 9, 14, 17, 20],
                         'n_estimators': [200, 300, 400, 500, 700]},
             verbose=2)

In [29]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=670, max_features='sqrt', min_samples_leaf=5,
                       min_samples_split=17, n_estimators=400)

In [30]:
best_grid=grid_search.best_estimator_

In [31]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[88 11]
 [27 28]]
Accuracy Score 0.7532467532467533
Classification report:               precision    recall  f1-score   support

           0       0.77      0.89      0.82        99
           1       0.72      0.51      0.60        55

    accuracy                           0.75       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154



# Automated Hyper Tuning

## Bayesian Optimization

In [33]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [34]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [35]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x1981222e5e0>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x198122d43d0>,
 'max_features': <hyperopt.pyll.base.Apply at 0x198122d4520>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x198121b5970>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x198123863d0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x198123d5880>}

In [37]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'] 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [38]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|███████████████████████████████████████████████| 80/80 [07:17<00:00,  5.47s/trial, best loss: -0.7769159003065441]


{'criterion': 0,
 'max_depth': 1170.0,
 'max_features': 3,
 'min_samples_leaf': 0.055842423507544564,
 'min_samples_split': 0.03313645385019368,
 'n_estimators': 3}

In [42]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 15, 1: 67, 2: 300, 3: 750, 4: 1200,5:1300,6:1550}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

entropy
None
750


In [43]:
best['min_samples_leaf']

0.055842423507544564

In [44]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[86 13]
 [27 28]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.76      0.87      0.81        99
           1       0.68      0.51      0.58        55

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154



### Genetic Algorithms

In [9]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [10]:
from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

Optimization Progress:   0%|          | 0/84 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7801969272557507

Generation 2 - Current best internal CV score: 0.7801969272557507

Generation 3 - Current best internal CV score: 0.7801969272557507

Generation 4 - Current best internal CV score: 0.7801969272557507

Generation 5 - Current best internal CV score: 0.7801969272557507

Best pipeline: RandomForestClassifier(input_matrix, criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [11]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.7662337662337663


### Optuna

In [12]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-09-30 20:00:41,560][0m A new study created in memory with name: no-name-8aff8444-3955-4573-9e84-ff008dfc69e5[0m
[32m[I 2022-09-30 20:01:01,634][0m Trial 0 finished with value: 0.7655188904830225 and parameters: {'classifier': 'RandomForest', 'n_estimators': 270, 'max_depth': 20.84812061022293}. Best is trial 0 with value: 0.7655188904830225.[0m
[32m[I 2022-09-30 20:01:07,205][0m Trial 1 finished with value: 0.7704128805993943 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1680, 'max_depth': 22.56435052307719}. Best is trial 1 with value: 0.7704128805993943.[0m
[32m[I 2022-09-30 20:01:09,130][0m Trial 2 finished with value: 0.6530926191614858 and parameters: {'classifier': 'SVC', 'svc_c': 59559.60070413331}. Best is trial 1 with value: 0.7704128805993943.[0m
[32m[I 2022-09-30 20:01:12,272][0m Trial 3 finished with value: 0.7703969392635103 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1400, 'max_depth': 13.977260318743367}. Best

[32m[I 2022-09-30 20:02:43,926][0m Trial 35 finished with value: 0.6530926191614858 and parameters: {'classifier': 'SVC', 'svc_c': 727792346.5206801}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:02:48,073][0m Trial 36 finished with value: 0.7736489717838354 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1500, 'max_depth': 40.522473261951355}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:02:52,317][0m Trial 37 finished with value: 0.7655029491471385 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1460, 'max_depth': 40.13078398370642}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:02:52,380][0m Trial 38 finished with value: 0.6530926191614858 and parameters: {'classifier': 'SVC', 'svc_c': 0.11164729411127663}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:02:55,592][0m Trial 39 finished with value: 0.7704049099314524 and parameters: {

[32m[I 2022-09-30 20:04:48,097][0m Trial 70 finished with value: 0.7687788936712897 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1820, 'max_depth': 10.374765457692318}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:04:53,438][0m Trial 71 finished with value: 0.7720229555236728 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1920, 'max_depth': 41.091228116064}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:04:58,044][0m Trial 72 finished with value: 0.772038896859557 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1910, 'max_depth': 36.9565390036213}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:05:01,343][0m Trial 73 finished with value: 0.772038896859557 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1370, 'max_depth': 65.53102882178962}. Best is trial 24 with value: 0.7752989000478241.[0m
[32m[I 2022-09-30 20:05:04,370][0m Tri

Accuracy: 0.7752989000478241
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 1170, 'max_depth': 27.62752957583289}


In [14]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 1170,
 'max_depth': 27.62752957583289}

In [15]:
rf=RandomForestClassifier(n_estimators=330,max_depth=30)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=30, n_estimators=330)

In [16]:
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[84 15]
 [23 32]]
0.7532467532467533
              precision    recall  f1-score   support

           0       0.79      0.85      0.82        99
           1       0.68      0.58      0.63        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.75      0.75      0.75       154

