### All Techniques Of Hyper Parameter Optimization
   - GridSearchCV
   - RandomizedSearchCV
   - Bayesian Optimization -Automate Hyperparameter Tuning (Hyperopt)
   - Sequential Model Based Optimization (Tuning a scikit-learn estimator with skopt)
   - Optuna-Automate Hyperparameter Tuning
   - Genetic Algorithms (TPOT Classifier)
###### References
- https://github.com/fmfn/BayesianOptimization
- https://github.com/hyperopt/hyperopt
- https://www.jeremyjordan.me/hyperparameter-tuning/
- https://optuna.org/
- https://towardsdatascience.com/hyperparameters-optimization-526348bb8e2d(By Pier Paolo Ippolito )
- https://scikit-optimize.github.io/stable/auto_examples/hyperparameter-optimization.html


In [29]:
import pandas as pd

In [30]:
data = pd.read_csv("diabetes.csv")

In [31]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
import numpy as np
data["Glucose"]=np.where(data["Glucose"]==0,data["Glucose"].median(),data["Glucose"])
data["Insulin"]=np.where(data["Insulin"]==0,data["Insulin"].median(),data["Insulin"])
data["SkinThickness"]=np.where(data["SkinThickness"]==0,data["SkinThickness"].median(),data["SkinThickness"])

In [7]:
X=data.drop('Outcome',axis=1)
Y = data["Outcome"]

In [8]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(X,Y,test_size=0.3,random_state=35,stratify=Y)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
clf = RandomForestClassifier()
clf.fit(train_x,train_y)
pred_y = clf.predict(test_x)
print(confusion_matrix(test_y,pred_y))
print(classification_report(test_y,pred_y))

[[130  20]
 [ 26  55]]
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       150
           1       0.73      0.68      0.71        81

    accuracy                           0.80       231
   macro avg       0.78      0.77      0.78       231
weighted avg       0.80      0.80      0.80       231



##### cross_val_score

In [8]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier()
accuracy = cross_val_score(clf,train_x,train_y,cv=10,n_jobs=-1,scoring="accuracy",verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    3.9s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.2s finished


In [9]:
accuracy

array([0.68518519, 0.75925926, 0.74074074, 0.77777778, 0.75925926,
       0.7037037 , 0.74074074, 0.77358491, 0.67924528, 0.71698113])

In [10]:
accuracy.mean()

0.7336477987421384

### RandomizedSearchCV

In [11]:
import numpy as np
param = {
     "n_estimators":[np.int(x) for x in np.linspace(100,1000,10)],
    "criterion":["gini","entropy"],
    "max_depth":[x for x in np.linspace(10,512,10)],
    "min_samples_split":[2,3,4,5,6],
    "min_samples_leaf":[2,3,4,5,6],
    "max_features":['auto',"log2","squard"]}
param

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
 'criterion': ['gini', 'entropy'],
 'max_depth': [10.0,
  65.77777777777777,
  121.55555555555556,
  177.33333333333334,
  233.11111111111111,
  288.8888888888889,
  344.6666666666667,
  400.44444444444446,
  456.22222222222223,
  512.0],
 'min_samples_split': [2, 3, 4, 5, 6],
 'min_samples_leaf': [2, 3, 4, 5, 6],
 'max_features': ['auto', 'log2', 'squard']}

In [12]:
from sklearn.model_selection import RandomizedSearchCV

In [13]:
random_search=RandomizedSearchCV(clf,param_distributions=param,cv=10,scoring="accuracy",n_jobs=-1,verbose=3)
random_search.fit(train_x,train_y)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   45.0s finished


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10.0, 65.77777777777777,
                                                      121.55555555555556,
                                                      177.33333333333334,
                                                      233.11111111111111,
                                                      288.8888888888889,
                                                      344.6666666666667,
                                                      400.44444444444446,
                                                      456.22222222222223,
                                                      512.0],
                                        'max_features': ['auto', 'log2',
                                                         'squard'],
                                        'min_samples_le

In [14]:
params = random_search.best_params_
params

{'n_estimators': 400,
 'min_samples_split': 6,
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': 10.0,
 'criterion': 'entropy'}

In [15]:
random_search.best_score_

0.7486023759608665

In [16]:
clf = RandomForestClassifier(**params)
clf.fit(train_x,train_y)
pred_y = clf.predict(test_x)
print(confusion_matrix(test_y,pred_y))
print(classification_report(test_y,pred_y))

[[132  18]
 [ 29  52]]
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       150
           1       0.74      0.64      0.69        81

    accuracy                           0.80       231
   macro avg       0.78      0.76      0.77       231
weighted avg       0.79      0.80      0.79       231



### GridSearchCV

In [19]:
from sklearn.model_selection import GridSearchCV

In [24]:
param = {'n_estimators':[ params["n_estimators"],params["n_estimators"]-100,params["n_estimators"]-200,params["n_estimators"]+100,params["n_estimators"]+200],
 'min_samples_split': [params['min_samples_split'],params['min_samples_split']-1,
                       params['min_samples_split']-2,params['min_samples_split']+1,
                       params['min_samples_split']+2],
 'min_samples_leaf': [params['min_samples_leaf'],params['min_samples_leaf']-1,
                      params['min_samples_leaf']-2,params['min_samples_leaf']+1,
                      params['min_samples_leaf']+2],
 'max_features': ['auto'],
 'max_depth': [params['max_depth'],params['max_depth']-2,
               params['max_depth']-4,
               params['max_depth']+2,
               params['max_depth']+4],
 'criterion': ['entropy']}

In [25]:
grid_search = GridSearchCV(clf,param_grid=param,cv=10,scoring="accuracy",n_jobs=-1,verbose=3)
grid_search.fit(train_x,train_y)

Fitting 10 folds for each of 625 candidates, totalling 6250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 4600 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 5400 tasks      | elapsed: 36.4min
[Parallel(n_jobs=-1)]: Done 6250 out of 6250 | elapsed: 42.3min finished


GridSearchCV(cv=10,
             estimator=RandomForestClassifier(criterion='entropy',
                                              max_depth=10.0,
                                              min_samples_leaf=5,
                                              min_samples_split=6,
                                              n_estimators=400),
             n_jobs=-1,
             param_grid={'criterion': ['entropy'],
                         'max_depth': [10.0, 8.0, 6.0, 12.0, 14.0],
                         'max_features': ['auto'],
                         'min_samples_leaf': [5, 4, 3, 6, 7],
                         'min_samples_split': [6, 5, 4, 7, 8],
                         'n_estimators': [400, 300, 200, 500, 600]},
             scoring='accuracy', verbose=3)

In [28]:
grid_param = grid_search.best_params_
grid_param

{'criterion': 'entropy',
 'max_depth': 6.0,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 7,
 'n_estimators': 600}

In [27]:
grid_search.best_score_

0.7617051013277429

In [29]:
clf = RandomForestClassifier(**grid_param)
clf.fit(train_x,train_y)
pred_y = clf.predict(test_x)
print(confusion_matrix(test_y,pred_y))
print(classification_report(test_y,pred_y))

[[133  17]
 [ 30  51]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       150
           1       0.75      0.63      0.68        81

    accuracy                           0.80       231
   macro avg       0.78      0.76      0.77       231
weighted avg       0.79      0.80      0.79       231



### Bayesian Optimization -Automate Hyperparameter Tuning (Hyperopt)

### Automated Hyperparameter Tuning
Automated Hyperparameter Tuning can be done by using techniques such as 
- Bayesian Optimization
- Gradient Descent
- Evolutionary Algorithms

#### Bayesian Optimization
Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time.
In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.

- Objective Function = defines the loss function to minimize.
- Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
- Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.

In [11]:
# !pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.4-py2.py3-none-any.whl (964 kB)
Collecting networkx>=2.2
  Using cached networkx-2.5-py3-none-any.whl (1.6 MB)
Installing collected packages: networkx, hyperopt
Successfully installed hyperopt-0.2.4 networkx-2.5


You should consider upgrading via the 'g:\python program\data science\scripts\python.exe -m pip install --upgrade pip' command.


In [23]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [18]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, train_x, train_y, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [19]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|███████████████████████████████████████████████| 80/80 [11:08<00:00,  8.35s/trial, best loss: -0.7411907234337141]


{'criterion': 1,
 'max_depth': 410.0,
 'max_features': 0,
 'min_samples_leaf': 0.0005172313617776911,
 'min_samples_split': 0.003519177501550591,
 'n_estimators': 6}

In [20]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
auto
1500


In [24]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(train_x,train_y)
predictionforest = trainedforest.predict(test_x)
print(confusion_matrix(test_y,predictionforest))
print(accuracy_score(test_y,predictionforest))
print(classification_report(test_y,predictionforest))
acc5 = accuracy_score(test_y,predictionforest)

[[131  19]
 [ 25  56]]
0.8095238095238095
              precision    recall  f1-score   support

           0       0.84      0.87      0.86       150
           1       0.75      0.69      0.72        81

    accuracy                           0.81       231
   macro avg       0.79      0.78      0.79       231
weighted avg       0.81      0.81      0.81       231



#### Genetic Algorithms
Genetic Algorithms tries to apply natural selection mechanisms to Machine Learning contexts.

Let's immagine we create a population of N Machine Learning models with some predifined Hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that performs best). We can now generate some offsprings having similar Hyperparameters to the ones of the best models so that go get again a population of N models. At this point we can again caltulate the accuracy of each model and repeate the cycle for a defined number of generations. In this way, just the best models will survive at the end of the process.

In [25]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [26]:
# !pip install tpot

Collecting tpot
  Downloading TPOT-0.11.5-py3-none-any.whl (82 kB)
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting deap>=1.2
  Downloading deap-1.3.1-cp37-cp37m-win_amd64.whl (108 kB)
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py): started
  Building wheel for stopit (setup.py): finished with status 'done'
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11959 sha256=dc156526b9e184fca2aef52e8c61890cba952fb94063be5e9e19e9421744db41
  Stored in directory: c:\users\avinash mourya\appdata\local\pip\cache\wheels\e2\d2\79\eaf81edb391e27c87f51b8ef901ecc85a5363dc96b8b8d71e3
Successfully built stopit
Installing collected packages: stopit, update-checker, deap, tpot
Successfully installed deap-1.3.1 stopit-1.1.2 tpot-0.11.5 update-checker-0.18.0


You should consider upgrading via the 'g:\python program\data science\scripts\python.exe -m pip install --upgrade pip' command.


In [28]:
from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(train_x,train_y)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=84.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.7561359867330016
Generation 2 - Current best internal CV score: 0.7579878385848535
Generation 3 - Current best internal CV score: 0.7635572139303483
Generation 4 - Current best internal CV score: 0.7635572139303483
Generation 5 - Current best internal CV score: 0.7635572139303483
Best pipeline: RandomForestClassifier(input_matrix, criterion=gini, max_depth=340, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [33]:
accuracy = tpot_classifier.score(test_x, test_y)
print(accuracy)

0.7922077922077922


### Optimize hyperparameters of the model using Optuna

The hyperparameters of the above algorithm are `n_estimators` and `max_depth` for which we can try different values to see if the model accuracy can be improved. The `objective` function is modified to accept a trial object. This trial has several methods for sampling hyperparameters. We create a study to run the hyperparameter optimization and finally read the best hyperparameters.

In [35]:
# !pip install optuna

Collecting optuna
  Downloading optuna-2.0.0.tar.gz (226 kB)
Processing c:\users\avinash mourya\appdata\local\pip\cache\wheels\4e\b5\00\f93fe1c90b3d501774e91e2e99987f49d16019e40e4bd3afc3\alembic-1.4.2-py2.py3-none-any.whl
Collecting cliff
  Downloading cliff-3.4.0-py3-none-any.whl (76 kB)
Collecting cmaes>=0.5.1
  Downloading cmaes-0.6.0-py3-none-any.whl (9.6 kB)
Collecting colorlog
  Downloading colorlog-4.2.1-py2.py3-none-any.whl (14 kB)
Collecting sqlalchemy>=1.1.0
  Downloading SQLAlchemy-1.3.19-cp37-cp37m-win_amd64.whl (1.2 MB)
Collecting python-editor>=0.3
  Using cached python_editor-1.0.4-py3-none-any.whl (4.9 kB)
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.4.5-py2.py3-none-any.whl (110 kB)
Collecting cmd2!=0.8.3,>=0.8.0
  Downloading cmd2-1.3.8-py3-none-any.whl (130 kB)
Collecting PyYAML>=3.12
  Using cached PyYAML-5.3.1-cp37-cp37m-win_amd64.whl (216 kB)
Collecting stevedore>=2.0.1
  Downloading stevedore-3.2.0-py3-none-any.whl (42 kB)
Collecting PrettyTable<0.8,>=0.7.2


You should consider upgrading via the 'g:\python program\data science\scripts\python.exe -m pip install --upgrade pip' command.


In [38]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,train_x,train_y, n_jobs=-1, cv=3).mean()


In [39]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-08-30 04:13:32,511] Trial 0 finished with value: 0.6517690875232774 and parameters: {'classifier': 'SVC', 'svc_c': 0.06873662782239964}. Best is trial 0 with value: 0.6517690875232774.
[I 2020-08-30 04:13:33,929] Trial 1 finished with value: 0.6517690875232774 and parameters: {'classifier': 'SVC', 'svc_c': 1.9647042399796806}. Best is trial 0 with value: 0.6517690875232774.
[I 2020-08-30 04:13:33,985] Trial 2 finished with value: 0.6517690875232774 and parameters: {'classifier': 'SVC', 'svc_c': 0.00028104839404189064}. Best is trial 0 with value: 0.6517690875232774.
[I 2020-08-30 04:13:36,279] Trial 3 finished with value: 0.74487895716946 and parameters: {'classifier': 'RandomForest', 'n_estimators': 290, 'max_depth': 11.73307282474172}. Best is trial 3 with value: 0.74487895716946.
[I 2020-08-30 04:13:36,344] Trial 4 finished with value: 0.6517690875232774 and parameters: {'classifier': 'SVC', 'svc_c': 0.6555521231065513}. Best is trial 3 with value: 0.74487895716946.
[I 2020-

[I 2020-08-30 04:17:39,049] Trial 73 finished with value: 0.7486033519553073 and parameters: {'classifier': 'RandomForest', 'n_estimators': 640, 'max_depth': 27.385448069204017}. Best is trial 45 with value: 0.7523277467411545.
[I 2020-08-30 04:17:41,047] Trial 74 finished with value: 0.750465549348231 and parameters: {'classifier': 'RandomForest', 'n_estimators': 590, 'max_depth': 37.42607064190432}. Best is trial 45 with value: 0.7523277467411545.
[I 2020-08-30 04:17:42,561] Trial 75 finished with value: 0.7467411545623835 and parameters: {'classifier': 'RandomForest', 'n_estimators': 470, 'max_depth': 36.06983569532787}. Best is trial 45 with value: 0.7523277467411545.
[I 2020-08-30 04:17:44,514] Trial 76 finished with value: 0.7411545623836125 and parameters: {'classifier': 'RandomForest', 'n_estimators': 610, 'max_depth': 19.970791158594025}. Best is trial 45 with value: 0.7523277467411545.
[I 2020-08-30 04:17:46,567] Trial 77 finished with value: 0.74487895716946 and parameters: 

Accuracy: 0.7541899441340782
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 510, 'max_depth': 23.86949196431638}


In [40]:
trial

FrozenTrial(number=83, value=0.7541899441340782, datetime_start=datetime.datetime(2020, 8, 30, 4, 17, 57, 456439), datetime_complete=datetime.datetime(2020, 8, 30, 4, 17, 59, 75442), params={'classifier': 'RandomForest', 'n_estimators': 510, 'max_depth': 23.86949196431638}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntUniformDistribution(high=2000, low=200, step=10), 'max_depth': LogUniformDistribution(high=100, low=10)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=83, state=TrialState.COMPLETE)

In [41]:
rf=RandomForestClassifier(n_estimators=330,max_depth=30)
rf.fit(train_x,train_y)

RandomForestClassifier(max_depth=30, n_estimators=330)

In [42]:
y_pred=rf.predict(test_x)
print(confusion_matrix(test_y,y_pred))
print(accuracy_score(test_y,y_pred))
print(classification_report(test_y,y_pred))

[[130  20]
 [ 23  58]]
0.8138528138528138
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       150
           1       0.74      0.72      0.73        81

    accuracy                           0.81       231
   macro avg       0.80      0.79      0.79       231
weighted avg       0.81      0.81      0.81       231

