In [3]:
!pip install hyperopt
!pip install lightgbm
!pip install xgboost

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 1.4/1.4 MB 15.2 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.5.0
Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   - -------------------------------------- 3.1/124.9 MB 20.5 MB/s eta 0:00:06
   --- ------------------------------------ 10.5/124.9 MB 28.5 MB/s eta 0:00:05
   ----- ---------------------------------- 16.0/124.9 MB 28.8 MB/s eta 0:00:04
   ------ --------------------------------- 21.5/124.9 MB 27.2 MB/s eta 0:00:04
   --------- ------------------------------ 29.4/124.9 MB 29.6 MB/s eta 0:00:04
   -------

In [1]:
import pandas as pd
import numpy as  np

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

import xgboost as xgb
from hyperopt import hp, fmin, rand,  Trials

#HP : define the hyperparameter space
#fmin : function to minimize
#rand : random search
#Trials : store the results of the search


In [2]:
breast_cancer_X, breast_cancer_y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(breast_cancer_X)
y = pd.Series(breast_cancer_y).map({0: 1, 1: 0})

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Define the hyperparameter space

In [5]:
#Determine the hyperparameter space


param_grid = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
    'eta': hp.uniform('eta', 0.01, 1),
    'max_depth': hp.quniform('max_depth', 1, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'gamma': hp.uniform('gamma', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'objective': 'binary:logistic',
    'nthread': 4,
    'silent': 1,
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1),
    'reg_lambda': hp.uniform('reg_lambda', 1,20),
}

## Define the objective function
This is the hyperparameter response space, the function we want to minimize

In [6]:
# The objective function takes the hyperparameters as input and returns the cross-validated accuracy of the model

def objective(params):
    params_dict ={
        'n_estimators': int(params['n_estimators']),
        'eta': params['eta'],
        'max_depth': int(params['max_depth']),
        'min_child_weight': int(params['min_child_weight']),
        'subsample': params['subsample'],
        'gamma': params['gamma'],
        'colsample_bytree': params['colsample_bytree'],
        'objective': 'binary:logistic',
        'nthread': 4,
        'silent': 1,
        'colsample_bylevel': params['colsample_bylevel'],
        'reg_lambda': params['reg_lambda'],
    }

    # with ** we pass the items in the dictionary as parameters

    gbm = xgb.XGBClassifier(**params_dict)

    #Train with cv
    score = cross_val_score(gbm, X_train, y_train, scoring = 'accuracy', cv = 5, n_jobs = 4).mean()

    return -score


## Randomized Search


In [9]:
#fmin performs the minimization
# rand.suggest samples the parameter at random
# i.e., performs the random search

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

search = fmin(
    fn = objective,
    space = param_grid,
    algo = rand.suggest,
    max_evals = 100,
    trials = Trials(),

)

100%|██████████| 100/100 [00:25<00:00,  3.86trial/s, best loss: -0.9736263736263737]


In [10]:
type(search)

dict

In [11]:
search

{'colsample_bylevel': 0.8298302864008947,
 'colsample_bytree': 0.642131174089724,
 'eta': 0.7647345065794771,
 'gamma': 0.6227480761569197,
 'max_depth': 3.0,
 'min_child_weight': 1.0,
 'n_estimators': 433.0,
 'reg_lambda': 11.529309664559868,
 'subsample': 0.7673398259743814}

In [12]:
#Create another dictionary with the best hyperparameters to a new xgb
best_hp_dict = {
    'n_estimators': int(search['n_estimators']),
    'eta': search['eta'],
    'max_depth': int(search['max_depth']),
    'min_child_weight': int(search['min_child_weight']),
    'subsample': search['subsample'],
    'gamma': search['gamma'],
    'colsample_bytree': search['colsample_bytree'],
    'objective': 'binary:logistic',
    'nthread': 4,
    'silent': 1,
    'colsample_bylevel': search['colsample_bylevel'],
    'reg_lambda': search['reg_lambda'],
}



In [13]:
#after the search we can train the model with the best hyperparameters
gbm_final = xgb.XGBClassifier(**best_hp_dict)
gbm_final.fit(X_train, y_train)

Parameters: { "silent" } are not used.



In [15]:
X_train_preds = gbm_final.predict(X_train)
X_test_preds = gbm_final.predict(X_test)

train_accuracy = accuracy_score(y_train, X_train_preds)
test_accuracy = accuracy_score(y_test, X_test_preds)
print(f'Train accuracy: {train_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Train accuracy: 0.9956043956043956
Test accuracy: 0.9649122807017544


## Evaluating the search

In [16]:
trials = Trials()

In [17]:
second_search = fmin(
    fn = objective,
    space = param_grid,
    algo = rand.suggest, #randomized search
    max_evals = 100,
    trials = trials,
)

100%|██████████| 100/100 [00:22<00:00,  4.37trial/s, best loss: -0.9736263736263737]


In [18]:
#best parameters
second_search

{'colsample_bylevel': 0.8140453592059262,
 'colsample_bytree': 0.6838780609853796,
 'eta': 0.7280865856165323,
 'gamma': 0.7943913293601066,
 'max_depth': 3.0,
 'min_child_weight': 3.0,
 'n_estimators': 311.0,
 'reg_lambda': 2.6781490437084927,
 'subsample': 0.5454821717419962}

In [19]:
trials.argmin

{'colsample_bylevel': 0.8140453592059262,
 'colsample_bytree': 0.6838780609853796,
 'eta': 0.7280865856165323,
 'gamma': 0.7943913293601066,
 'max_depth': 3.0,
 'min_child_weight': 3.0,
 'n_estimators': 311.0,
 'reg_lambda': 2.6781490437084927,
 'subsample': 0.5454821717419962}

In [20]:
trials.best_trial

{'state': 2,
 'tid': 46,
 'spec': None,
 'result': {'loss': -0.9736263736263737, 'status': 'ok'},
 'misc': {'tid': 46,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'colsample_bylevel': [46],
   'colsample_bytree': [46],
   'eta': [46],
   'gamma': [46],
   'max_depth': [46],
   'min_child_weight': [46],
   'n_estimators': [46],
   'reg_lambda': [46],
   'subsample': [46]},
  'vals': {'colsample_bylevel': [0.8140453592059262],
   'colsample_bytree': [0.6838780609853796],
   'eta': [0.7280865856165323],
   'gamma': [0.7943913293601066],
   'max_depth': [3.0],
   'min_child_weight': [3.0],
   'n_estimators': [311.0],
   'reg_lambda': [2.6781490437084927],
   'subsample': [0.5454821717419962]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2025, 1, 3, 1, 55, 8, 647000),
 'refresh_time': datetime.datetime(2025, 1, 3, 1, 55, 8, 788000)}

In [24]:
pd.DataFrame(trials.results).sort_values('loss', ascending = False).head()

Unnamed: 0,loss,status
5,-0.927473,ok
6,-0.931868,ok
76,-0.934066,ok
24,-0.940659,ok
50,-0.942857,ok


In [25]:
results['loss'].plot()
plt.xlabel('Iteration')
plt.ylabel('Accuracy')

NameError: name 'results' is not defined