In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [8]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# handling zero values.

df['Glucose'] = np.where(df['Glucose'] == 0, df['Glucose'].median(), df['Glucose'])
df['Insulin'] = np.where(df['Insulin'] == 0, df['Insulin'].median(), df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness'] == 0, df['SkinThickness'].median(), df['SkinThickness'])

In [10]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [11]:
# Getting dependent and independent features
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [12]:
# Splitting into training and test set data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=33)

In [21]:
# Creating model

rf_classifier = RandomForestClassifier(n_estimators=10)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

In [25]:
# Checking the performance of the model

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy of the model = {accuracy_score(y_test, y_pred)}")

[[87 12]
 [28 27]]
              precision    recall  f1-score   support

           0       0.76      0.88      0.81        99
           1       0.69      0.49      0.57        55

    accuracy                           0.74       154
   macro avg       0.72      0.68      0.69       154
weighted avg       0.73      0.74      0.73       154

Accuracy of the model = 0.7402597402597403


### The parameters used in RandomForestClassifier: 
- criterion = the function used to evaluate the quality of a split.
- max_depth = maximum number of levels allowed in each tree.
- max_features = maximum number of features considered when splitting a node.
- min_samples_leaf = minimum number of samples which can be stored in a tree leaf.
- min_samples_split = minimum number of samples necessary in a node to cause node splitting.
- n_estimators = number of trees in the ensemble

### Manual Hyperparameter Tuning
Randomly selecting parameters on your own.

In [29]:
model = RandomForestClassifier(n_estimators=500,criterion='entropy',
                             max_features='sqrt',min_samples_leaf=10,random_state=100)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy of the model = {accuracy_score(y_test, y_pred)}")

[[88 11]
 [28 27]]
              precision    recall  f1-score   support

           0       0.76      0.89      0.82        99
           1       0.71      0.49      0.58        55

    accuracy                           0.75       154
   macro avg       0.73      0.69      0.70       154
weighted avg       0.74      0.75      0.73       154

Accuracy of the model = 0.7467532467532467


### Randomized Search CV:

In [66]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest 
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to select at every split
max_features = ['auto', 'sqrt', 'log2']

# Maximum number of levels
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]

# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4, 7, 9]

# Minimum number of samples required at a tree leaf
min_samples_leaf = [1, 2, 4, 6, 8]

# Creating a random grid (dictionary):
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'criterion': ['gini', 'entropy']
}

random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
 'min_samples_leaf': [1, 2, 4, 6, 8],
 'min_samples_split': [2, 3, 4, 7, 9],
 'criterion': ['gini', 'entropy']}

In [67]:
# Our model/estimator
model = RandomForestClassifier()

rf_randomcv = RandomizedSearchCV(estimator=model, param_distributions=random_grid, 
                                n_iter=100, cv=3, verbose=2, random_state=100, n_jobs=-1) 

# fitting the randomized model
rf_randomcv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.0min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 3, 4, 7, 9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [68]:
# the best estimator according to the RandomizedSearchCV: 
best_estimator = rf_randomcv.best_estimator_

In [69]:
y_pred = best_estimator.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy of the model = {accuracy_score(y_test, y_pred)}")

# As we see the accuracy has somewhat increased from 74 to 75 percent

[[84 15]
 [24 31]]
              precision    recall  f1-score   support

           0       0.78      0.85      0.81        99
           1       0.67      0.56      0.61        55

    accuracy                           0.75       154
   macro avg       0.73      0.71      0.71       154
weighted avg       0.74      0.75      0.74       154

Accuracy of the model = 0.7467532467532467


### Grid Search CV

In [70]:
rf_randomcv.best_params_

{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 10,
 'criterion': 'entropy'}

In [71]:
from sklearn.model_selection import GridSearchCV

In [72]:
criterion = [rf_randomcv.best_params_['criterion']]

min_samples_leaf = [rf_randomcv.best_params_['min_samples_leaf'],
                    rf_randomcv.best_params_['min_samples_leaf'] + 2,
                    rf_randomcv.best_params_['min_samples_leaf'] + 4]

max_features = [rf_randomcv.best_params_['max_features']]

max_depth = [rf_randomcv.best_params_['max_depth']]

min_samples_split = [rf_randomcv.best_params_['min_samples_split'] - 1,
                    rf_randomcv.best_params_['min_samples_split'],
                    rf_randomcv.best_params_['min_samples_split'] + 1,
                    rf_randomcv.best_params_['min_samples_split'] + 2]

n_estimators = [rf_randomcv.best_params_['n_estimators'] - 200,
               rf_randomcv.best_params_['n_estimators'] - 100,
               rf_randomcv.best_params_['n_estimators'],
               rf_randomcv.best_params_['n_estimators'] + 100,
               rf_randomcv.best_params_['n_estimators'] + 200]

grid_params = {
    'criterion' : criterion,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators
}

for key, value in grid_params.items():
    print(f"{key} : {value}")

criterion : ['entropy']
min_samples_leaf : [8, 10, 12]
max_features : ['log2']
max_depth : [10]
min_samples_split : [1, 2, 3, 4]
n_estimators : [1800, 1900, 2000, 2100, 2200]


In [73]:
# Now performing GridSearchCV
model = RandomForestClassifier()
rf_gridcv = GridSearchCV(estimator=model, param_grid=grid_params, n_jobs=-1,
                        cv=10, verbose=2)

rf_gridcv.fit(X_train, y_train)

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 10.5min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [10],
                         'max_features': ['log2'],
                         'min_samples_leaf': [8, 10, 12],
                         'min_samples_split': [1, 2, 3, 4],
                         'n_estimators': [1800, 1900, 2000, 2100, 2200]},
             verbose=2)

In [74]:
rf_gridcv.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=10, max_features='log2',
                       min_samples_leaf=12, min_samples_split=3,
                       n_estimators=1800)

In [75]:
best_grid = rf_gridcv.best_estimator_

In [76]:
y_pred = best_grid.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy of the model = {accuracy_score(y_test, y_pred)}")

[[86 13]
 [29 26]]
              precision    recall  f1-score   support

           0       0.75      0.87      0.80        99
           1       0.67      0.47      0.55        55

    accuracy                           0.73       154
   macro avg       0.71      0.67      0.68       154
weighted avg       0.72      0.73      0.71       154

Accuracy of the model = 0.7272727272727273


## Automated Hyperparameter Tuning
Automated Hyperparameter tuning can be done by using techniques such as:
- Bayesian Optimization
- Gradient Descent
- Evolutionary Algorithms

### Bayesian Optimization
Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time. In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.

- Objective Function = defines the loss function to minimize.
- Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
- Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration

In [7]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [6]:
# Creating Domain Space

space = {
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_depth': hp.quniform('max_depth', 10, 1200, 10),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
    'min_samples_split': hp.uniform('min_samples_split', 0, 1),
    'n_estimators': hp.choice('n_estimators', [50, 100, 200, 300, 600, 900, 1200, 1500])
}

space

{'criterion': <hyperopt.pyll.base.Apply at 0x1b39fcce788>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x1b39fcceb48>,
 'max_features': <hyperopt.pyll.base.Apply at 0x1b39fc46508>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x1b39fc9f088>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x1b39fc3f648>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x1b39fc3f948>}

In [16]:
# Creating Objective Function:

def objective(space):
    clf = RandomForestClassifier(criterion=space['criterion'], max_depth=space['max_depth'], 
                                max_features=space['max_features'], min_samples_leaf=space['min_samples_leaf'],
                                min_samples_split=space['min_samples_split'], n_estimators=space['n_estimators'])
    
    accuracy = cross_val_score(clf, X_train, y_train, cv=5).mean()
    
    return {'loss': -accuracy, 'status': STATUS_OK}

In [17]:
# Now we will run Hyperopt function(trials)

trials = Trials()
best = fmin(fn=objective,
           space=space,
           algo=tpe.suggest,
           max_evals=80,
           trials=trials)

100%|████████████████████████████████████████████████████████████████████████████| 80/80 [06:16<00:00,  4.71s/trial, best loss: -0.7753032120485139]


In [18]:
best

{'criterion': 0,
 'max_depth': 910.0,
 'max_features': 3,
 'min_samples_leaf': 0.0583022755265539,
 'min_samples_split': 0.10210919918535663,
 'n_estimators': 1}

In [32]:
crit = {0: 'gini', 1: 'entropy'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 50, 1: 100, 2: 200, 3: 300, 4: 600, 6: 900, 7: 1200, 8: 1500}


In [34]:
print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
None
100


In [35]:
# After getting all the parametes, now we can create the model

model = RandomForestClassifier(criterion=crit[best['criterion']],
                              max_depth=best['max_depth'],
                              max_features=feat[best['max_features']],
                              min_samples_leaf=best['min_samples_leaf'],
                              min_samples_split=best['min_samples_split'],
                              n_estimators=est[best['n_estimators']])

In [37]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [38]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy of the model = {accuracy_score(y_test, y_pred)}")

[[88 11]
 [27 28]]
              precision    recall  f1-score   support

           0       0.77      0.89      0.82        99
           1       0.72      0.51      0.60        55

    accuracy                           0.75       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154

Accuracy of the model = 0.7532467532467533


## Genetic Algorithms
Genetic Algorithms tries to apply natural selection mechanisms to Machine Learning contexts.

Let's immagine we create a population of N Machine Learning models with some predifined Hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that performs best). We can now generate some offsprings having similar Hyperparameters to the ones of the best models so that go get again a population of N models. At this point we can again caltulate the accuracy of each model and repeate the cycle for a defined number of generations. In this way, just the best models will survive at the end of the process.

In [2]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:

df['Glucose'] = np.where(df['Glucose'] == 0, df['Glucose'].median(), df['Glucose'])
df['Insulin'] = np.where(df['Insulin'] == 0, df['Insulin'].median(), df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness'] == 0, df['SkinThickness'].median(), df['SkinThickness'])

In [4]:
# Getting dependent and independent features
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=33)

In [6]:
from sklearn.model_selection import RandomizedSearchCV

In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}
