# Perform an initial search for optimal hyperparameters

__author__: Andrew Bartnof

__copyright__: Copyright 2025, Rocky Mountain Institute

__credits__: Alex Engel, Andrew Bartnof

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import dask
import os 

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
# from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from optuna.samplers import TPESampler

In [2]:
# Root folder of the project data, and the model A training folder beneath it
# (displayed as the cell output so the resolved path can be checked by eye).
data_dir = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker'
_model_a_training_subpath = 'working_data/model_a/model_a_training'
dir_working_model_a_training = os.path.join(data_dir, _model_a_training_subpath)
dir_working_model_a_training

'/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/model_a_training'

In [3]:
# Inputs read by every trial: the feature table, the label table, and the
# table whose 'fold' column assigns each row to a cross-validation fold.
fn_x, fn_y, fn_id = (
    os.path.join(dir_working_model_a_training, basename)
    for basename in ('x.parquet', 'y.parquet', 'id.parquet')
)

# Outputs: Ray Tune stores its experiment under dir_hyperparameters, and the
# ranked table of trials is written to fn_out.
# NOTE(review): the file name says "ann" although this notebook tunes a
# gradient-boosted model; confirm what downstream readers expect before renaming.
dir_hyperparameters = dir_working_model_a_training
fn_out = os.path.join(dir_working_model_a_training, 'gb_ray_tune/model_a_ann_hp_search.csv')

# Earlier output locations, kept for reference:
# dir_hyperparameters = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train'
# fn_out = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/gb_ray_tune/grid_search.csv'

In [4]:
def np_cleaning(X):
    """Bound the values of X to [-3, 3] and replace NaN with 0.

    Clipping runs first, so +inf / -inf have already become +3 / -3 by the
    time `np.nan_to_num` is applied; only NaN entries are replaced by that
    second step.

    Parameters
    ----------
    X : array-like
        Numeric values to clean. The input itself is not modified.

    Returns
    -------
    The clipped values with every NaN replaced by 0.0.
    """
    bounded = np.clip(X, a_min=-3, a_max=3)
    return np.nan_to_num(bounded, nan=0.0, posinf=0.0, neginf=0.0)

In [5]:
def fit_mod(space):
    """Train one LightGBM model for a Ray Tune trial and report validation metrics.

    On every call one fold label out of 0-4 is drawn at random and the rows
    carrying that label are held out for validation. No seed is set here, so
    different trials may be scored on different folds. Features are
    standardised using the training rows only and then passed through
    `np_cleaning`.

    Parameters
    ----------
    space : dict
        LightGBM parameters for this trial, handed to `lgb.train` unchanged.
        They must enable the `binary_logloss` and `auc` metrics, because both
        are read back from the recorded evaluation history.

    Returns
    -------
    None
        The last recorded validation `binary_logloss` and `auc` are sent to
        Ray through `train.report`.
    """
    # Load features, labels and fold assignments from the notebook-level paths
    features = pd.read_parquet(fn_x)
    labels = pd.read_parquet(fn_y)
    ids = pd.read_parquet(fn_id)

    # Draw the held-out fold; rows from every other fold are used for training
    held_out_fold = np.random.choice(np.arange(5), size=1)[0]
    is_train_mask = (ids['fold'] != held_out_fold).values
    is_test_mask = ~is_train_mask

    train_rows = features.loc[is_train_mask]
    test_rows = features.loc[is_test_mask]

    # Fit the scaler on training rows only, then apply it to both partitions
    scaler = StandardScaler().fit(train_rows)
    train_matrix = np_cleaning(scaler.transform(train_rows))
    test_matrix = np_cleaning(scaler.transform(test_rows))

    # Wrap both partitions for LightGBM
    train_set = lgb.Dataset(train_matrix, label=labels.loc[is_train_mask])
    valid_set = lgb.Dataset(test_matrix, label=labels.loc[is_test_mask])

    # record_evaluation fills this dict with the metric history during training;
    # the fitted model itself is not used afterwards.
    history = {}
    lgb.train(
        params=space,
        train_set=train_set,
        valid_sets=[valid_set],
        callbacks=[lgb.record_evaluation(history)],
    )

    # Only one validation set is supplied; its history is read under 'valid_0'
    valid_history = history['valid_0']
    train.report(
        {
            "binary_logloss": valid_history['binary_logloss'][-1],
            "auc": valid_history['auc'][-1],
        }
    )

In [18]:
# Parameters handed to every trial. Plain values are passed to LightGBM as
# they are; the tune.* entries are sampled anew for each trial.
space = dict(
    verbose=-1,
    num_trees=tune.randint(1, 1000),
    # NOTE(review): sampled on a linear scale; the best trials recorded in this
    # notebook sit near 0.01, so a log-scaled prior may be worth trying.
    learning_rate=tune.uniform(0.0001, 0.75),
    min_data_in_leaf=tune.randint(1, 200),
    objective='binary',
    early_stopping_round=-1,
    # Both metrics are read back by fit_mod after training.
    metrics=['binary_logloss', 'auc'],
)

In [19]:
# asha = ASHAScheduler(metric='binary_logloss', mode='min')

# Optuna proposes each new configuration, aiming to minimise the reported
# binary_logloss; the limiter caps the search at one concurrent trial.
search_alg = ConcurrencyLimiter(
    OptunaSearch(metric="binary_logloss", mode="min"),
    max_concurrent=1,
)

In [20]:
# Run 250 trials of fit_mod. Ray Tune keeps the experiment in the
# "gb_ray_tune" folder under dir_hyperparameters.
# The run recorded below took roughly 15 hours.
tune_config = tune.TuneConfig(
    # scheduler=asha,
    num_samples=250,
    search_alg=search_alg,
)
run_config = train.RunConfig(
    storage_path=dir_hyperparameters,
    name="gb_ray_tune",
)
tuner = tune.Tuner(
    fit_mod,
    tune_config=tune_config,
    param_space=space,
    run_config=run_config,
)
results = tuner.fit()

0,1
Current time:,2025-02-26 00:51:08
Running for:,15:00:53.40
Memory:,3.1/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_31ce5d79,TERMINATED,127.0.0.1:43721,0.00394297,134,230,1,91.8194,0.00115674,0.999182
fit_mod_9d1edc0c,TERMINATED,127.0.0.1:43729,0.561713,139,39,1,41.2985,0.0430312,0.434541
fit_mod_ac59ed12,TERMINATED,127.0.0.1:43733,0.320987,121,215,1,68.384,0.157805,0.630679
fit_mod_8fa0443a,TERMINATED,127.0.0.1:43742,0.287588,135,651,1,117.042,0.0473562,0.944032
fit_mod_dade9211,TERMINATED,127.0.0.1:43800,0.0271097,13,762,1,328.865,0.00472533,0.991497
fit_mod_778f93ba,TERMINATED,127.0.0.1:43876,0.128696,168,565,1,158.0,0.0489357,0.831074
fit_mod_51bbc241,TERMINATED,127.0.0.1:43899,0.356902,163,91,1,52.6034,0.0454682,0.934021
fit_mod_651e1ef4,TERMINATED,127.0.0.1:43910,0.265844,180,611,1,135.624,0.111482,0.788793
fit_mod_38817f59,TERMINATED,127.0.0.1:43935,0.260174,190,433,1,125.804,0.0243623,0.902341
fit_mod_f82216de,TERMINATED,127.0.0.1:43943,0.588541,81,508,1,105.404,1.14706,0.861848


2025-02-26 00:51:08,522	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/model_a_training/gb_ray_tune' in 0.2783s.
2025-02-26 00:51:08,555	INFO tune.py:1041 -- Total run time: 54053.44 seconds (54053.12 seconds for the tuning loop).


In [28]:
# One row per trial: the reported metrics plus the sampled config/* values.
# Work on a copy, because the next cell renames the frame's index.
trial_frame = results.get_dataframe()
Grid = trial_frame.copy()

In [29]:
# Keep each trial's original position as an 'order' column, then rank the
# trials: lowest binary_logloss first, ties broken by highest auc.
Grid.index.name = 'order'
RankedGrid = (
    Grid
    .sort_values(by=['binary_logloss', 'auc'], ascending=[True, False])
    .reset_index()
)
RankedGrid.index.name = 'rank'

# Save the ranked table, with the rank as the first CSV column
RankedGrid.to_csv(fn_out)

In [30]:
# Show the 20 trials with the lowest binary_logloss, limited to the two
# metrics and the three tuned settings.
summary_columns = [
    'binary_logloss',
    'auc',
    'config/num_trees',
    'config/learning_rate',
    'config/min_data_in_leaf',
]
RankedGrid.sort_values('binary_logloss').head(20)[summary_columns]

Unnamed: 0_level_0,binary_logloss,auc,config/num_trees,config/learning_rate,config/min_data_in_leaf
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.00013,0.999996,638,0.013884,105
1,0.00013,0.999996,638,0.010748,104
2,0.000133,0.999996,741,0.009355,105
3,0.000135,0.999996,742,0.011899,117
4,0.000136,0.999996,593,0.01399,105
5,0.000137,0.999996,657,0.0098,117
6,0.000139,0.999996,617,0.010613,103
7,0.00014,0.999988,626,0.015733,112
8,0.000142,0.999995,611,0.012979,111
9,0.000142,0.999993,639,0.010769,125


In [12]:
# Optional: reload a previously saved Ray Tune experiment from disk instead of re-running the search.
# experiment_path = "/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/gb_ray_tune"
# restored_tuner = tune.Tuner.restore(experiment_path, trainable=fit_mod)

In [13]:
# Optional: write the restored tuner's results table to CSV.
# fn_results = '/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/ray_tune_dataframe.csv'
# restored_tuner.get_results().get_dataframe().to_csv(fn_results)

In [27]:
# Export this notebook as a plain Python script (.py) with the same base name.
!jupyter nbconvert --to script model_a_gbm_hyperparameter_search.ipynb

[NbConvertApp] Converting notebook model_a_gbm_hyperparameter_search.ipynb to script
[NbConvertApp] Writing 4322 bytes to model_a_gbm_hyperparameter_search.py
