# Perform an initial search for optimal hyperparameters

__author__: Andrew Bartnof

__copyright__: Copyright 2025, Rocky Mountain Institute

__credits__: Alex Engel, Andrew Bartnof

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import dask
import os 

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
# from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from optuna.samplers import TPESampler

In [2]:
# Root folder of the project data, and the model-B training folder inside it.
data_dir = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker'
dir_working_model_b_training = os.path.join(
    data_dir,
    'working_data/model_b/model_b_training',
)
# Echo the resolved folder so the cell output records where files are read from.
dir_working_model_b_training

'/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_b/model_b_training'

In [3]:
# Training inputs read by fit_mod: features, labels, and the table holding the 'fold' column.
fn_x, fn_y, fn_id = (
    os.path.join(dir_working_model_b_training, parquet_name)
    for parquet_name in ('x.parquet', 'y.parquet', 'id.parquet')
)

# Ray Tune stores its experiment under this folder (in a 'gb_ray_tune' subfolder),
# and the ranked summary CSV is written into that same subfolder.
dir_hyperparameters = dir_working_model_b_training
# NOTE(review): the file name contains 'ann' although this notebook tunes a
# LightGBM model -- confirm that downstream readers expect this exact name.
fn_out = os.path.join(
    dir_working_model_b_training,
    'gb_ray_tune/model_b_ann_hp_search.csv',
)

# dir_hyperparameters = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_b/train'
# fn_out = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_b/train/gb_ray_tune/grid_search.csv'

In [4]:
def np_cleaning(X):
    """Replace non-finite values with 0.0 and clip everything to [-3, 3].

    The non-finite replacement deliberately runs *before* clipping. In the
    reverse order ``np.clip`` maps +inf/-inf to +3/-3 first, so the
    ``posinf=0.0`` / ``neginf=0.0`` replacements could never take effect.
    Results for finite and NaN inputs are the same in either order.

    Parameters
    ----------
    X : array-like
        Numeric values; in this notebook, the output of the fitted
        ``StandardScaler``.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X`` with NaN, +inf and -inf set to 0.0
        and all other values limited to the closed interval [-3, 3].
    """
    # Neutralise NaN/inf first so that infinities become 0.0 rather than +/-3.
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    return np.clip(X, a_min=-3, a_max=3)

In [5]:
def fit_mod(space):
    """Train one LightGBM model for a Ray Tune trial and report its validation metrics.

    The feature, label and fold tables are read from ``fn_x``, ``fn_y`` and
    ``fn_id``. One fold out of 0-4 is drawn at random and held out; all other
    rows are used for training. Features are standardised with a scaler fitted
    on the training rows only and then passed through ``np_cleaning``.

    Parameters
    ----------
    space : dict
        LightGBM parameters for this trial, handed unchanged to ``lgb.train``.
        They must request the ``binary_logloss`` and ``auc`` metrics, because
        both are read back from the recorded evaluation history.

    Returns
    -------
    None
        The last recorded ``binary_logloss`` and ``auc`` on the held-out fold
        are sent to Ray Tune through ``train.report``.
    """
    features = pd.read_parquet(fn_x)
    labels = pd.read_parquet(fn_y)
    fold_ids = pd.read_parquet(fn_id)

    # The held-out fold comes from NumPy's global RNG, which is not seeded in
    # this function, so different trials may be scored on different folds.
    held_out_fold = np.random.choice(np.arange(5), size=1)[0]
    train_rows = (fold_ids['fold'] != held_out_fold).values
    test_rows = ~train_rows

    # Fit the scaler on the training rows only; the held-out rows are merely
    # transformed with those statistics.
    scaler = StandardScaler().fit(features.loc[train_rows])
    train_matrix = np_cleaning(scaler.transform(features.loc[train_rows]))
    test_matrix = np_cleaning(scaler.transform(features.loc[test_rows]))

    train_set = lgb.Dataset(train_matrix, labels.loc[train_rows])
    test_set = lgb.Dataset(test_matrix, labels.loc[test_rows])

    # record_evaluation fills this dict with the metric history of each
    # validation set; the single set passed below is read back as 'valid_0'.
    history = {}
    lgb.train(
        space,
        train_set,
        valid_sets=[test_set],
        callbacks=[lgb.record_evaluation(history)],
    )
    held_out_metrics = history['valid_0']

    train.report(
        {
            "binary_logloss": held_out_metrics['binary_logloss'][-1],
            "auc": held_out_metrics['auc'][-1],
        }
    )

In [6]:
# Parameters handed to lgb.train for every trial. Entries built with tune.* are
# sampled per trial by the search algorithm; the rest are fixed.
space = dict(
    verbose=-1,
    # 'num_trees' is a LightGBM alias of 'num_iterations'.
    # tune.randint samples from [lower, upper): the upper bound is exclusive.
    num_trees=tune.randint(1, 1000),
    learning_rate=tune.uniform(0.0001, 0.75),
    min_data_in_leaf=tune.randint(1, 200),
    objective='binary',
    # A non-positive value switches early stopping off, so every trial trains
    # for its full sampled number of trees.
    early_stopping_round=-1,
    # Both metrics are read back by fit_mod after training.
    metrics=['binary_logloss', 'auc'],
)

In [7]:
# asha = ASHAScheduler(metric='binary_logloss', mode='min')

# Optuna proposes configurations that minimise the reported binary_logloss;
# the limiter caps the search at one concurrently running trial.
search_alg = ConcurrencyLimiter(
    OptunaSearch(metric="binary_logloss", mode="min"),
    max_concurrent=1,
)

In [8]:
# 250 sampled configurations, proposed by the search algorithm defined above.
tune_config = tune.TuneConfig(
    num_samples=250,
    search_alg=search_alg,
)

# Experiment state and results are stored under <dir_hyperparameters>/gb_ray_tune.
run_config = train.RunConfig(
    storage_path=dir_hyperparameters,
    name="gb_ray_tune",
)

tuner = tune.Tuner(
    fit_mod,
    tune_config=tune_config,
    param_space=space,
    run_config=run_config,
)

# Runs all trials. The run recorded in this notebook's output took about
# 39,100 seconds in total.
results = tuner.fit()

0,1
Current time:,2025-02-26 22:20:52
Running for:,10:51:47.66
Memory:,2.7/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_b9fa7f76,TERMINATED,127.0.0.1:59338,0.561238,171,692,1,176.955,8.56241,0.605504
fit_mod_accfe384,TERMINATED,127.0.0.1:59428,0.707286,155,427,1,102.146,0.0351214,0.373414
fit_mod_6ad59ba2,TERMINATED,127.0.0.1:59445,0.373086,28,195,1,86.6072,0.0564925,0.372775
fit_mod_7770997a,TERMINATED,127.0.0.1:59453,0.648973,60,760,1,132.519,0.0441865,0.943841
fit_mod_222ddd2d,TERMINATED,127.0.0.1:59459,0.556329,28,978,1,179.546,0.0851383,0.7657
fit_mod_50e7273e,TERMINATED,127.0.0.1:59528,0.226599,78,149,1,71.1514,3.91601,0.0751492
fit_mod_bd2e5220,TERMINATED,127.0.0.1:59536,0.368673,78,283,1,81.7855,0.0378754,0.806072
fit_mod_f1f37d33,TERMINATED,127.0.0.1:59547,0.157607,79,799,1,171.339,0.0483853,0.83194
fit_mod_777490ae,TERMINATED,127.0.0.1:59554,0.104643,42,817,1,189.5,0.039468,0.408563
fit_mod_5a0a7009,TERMINATED,127.0.0.1:59599,0.399154,198,286,1,86.3667,0.189297,0.471416


2025-02-26 22:20:52,953	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_b/model_b_training/gb_ray_tune' in 0.2210s.
2025-02-26 22:20:52,984	INFO tune.py:1041 -- Total run time: 39107.71 seconds (39107.43 seconds for the tuning loop).


In [15]:
# One row per trial. Work on an independent copy so that later edits (such as
# naming the index) do not touch the frame returned by `results`.
Grid = results.get_dataframe().copy(deep=True)

In [16]:
# Keep each trial's original position as the 'order' column, then rank the
# trials: lowest binary_logloss first, ties broken by highest auc.
Grid.index.name = 'order'
RankedGrid = (
    Grid
    .sort_values(by=['binary_logloss', 'auc'], ascending=[True, False])
    .reset_index()
    .rename_axis(index='rank')
)

# Persist the ranked table; the 'rank' index is written as the first CSV column.
RankedGrid.to_csv(fn_out)

In [17]:
# Ten trials with the lowest validation log-loss: metrics plus the sampled hyperparameters.
leaderboard_columns = [
    'binary_logloss',
    'auc',
    'config/num_trees',
    'config/learning_rate',
    'config/min_data_in_leaf',
]
RankedGrid.sort_values('binary_logloss').head(10)[leaderboard_columns]

Unnamed: 0_level_0,binary_logloss,auc,config/num_trees,config/learning_rate,config/min_data_in_leaf
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.000194,0.999964,899,0.01419,152
1,0.000197,0.999961,837,0.012723,149
2,0.000199,0.999992,888,0.010747,153
3,0.0002,0.999958,894,0.010668,153
4,0.000201,0.999992,856,0.012409,159
5,0.000203,0.999964,889,0.012008,152
6,0.000208,0.99999,936,0.015645,151
7,0.000209,0.999958,770,0.015908,141
8,0.000209,0.999992,853,0.011632,146
9,0.000219,0.99988,897,0.012356,151


In [18]:
# experiment_path = "/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/gb_ray_tune"
# restored_tuner = tune.Tuner.restore(experiment_path, trainable=fit_mod)

In [19]:
# fn_results = '/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/ray_tune_dataframe.csv'
# restored_tuner.get_results().get_dataframe().to_csv(fn_results)

In [21]:
# Export this notebook as a plain Python script via nbconvert
# (writes model_b_gbm_hyperparameter_search.py).
!jupyter nbconvert --to script model_b_gbm_hyperparameter_search.ipynb

[NbConvertApp] Converting notebook model_b_gbm_hyperparameter_search.ipynb to script
[NbConvertApp] Writing 4399 bytes to model_b_gbm_hyperparameter_search.py
