In [16]:
import pandas as pd
import numpy as np
import os

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter

In [17]:
# Root folder of the project data. Set the REMATCH_DATA_DIR environment variable
# to run this notebook on another machine; when it is unset, the original
# external-drive location is used, so existing runs are unaffected.
data_dir = os.environ.get(
    'REMATCH_DATA_DIR',
    '/Volumes/Extreme SSD/rematch_eia_ferc1_docker',
)

In [18]:
# Folder used as the Ray Tune storage path; the ranked trial table is also
# written here as a CSV.
dir_hyperparameters = os.path.join(
    data_dir,
    'working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21',
)
# Fixed: this line referenced the undefined name `dir_hyperparameter`, which
# raises NameError on a fresh kernel.
fn_hyperparameters = os.path.join(dir_hyperparameters, 'gbm_grid_2025_02_21.csv')

# Sub-folder holding the train/test parquet files that fit_mod reads.
# Derived from dir_hyperparameters instead of repeating the long sub-path.
dir_temp = os.path.join(dir_hyperparameters, 'temp_folder')

fn_x_train = os.path.join(dir_temp, 'x_train.parquet')
fn_x_test  = os.path.join(dir_temp, 'x_test.parquet')
fn_y_train = os.path.join(dir_temp, 'y_train.parquet')
fn_y_test  = os.path.join(dir_temp, 'y_test.parquet')

In [19]:
# Parameter space given to Ray Tune and forwarded by fit_mod to lgb.train.
# Plain values are fixed for every trial; tune.* entries are search dimensions.
space = dict(
    verbose=-1,
    num_trees=tune.randint(1, 1000),  # used to max at 500
    learning_rate=tune.uniform(0.0001, 0.75),
    min_data_in_leaf=tune.randint(1, 200),
    objective='binary',
    # Kept at -1; a value of 2 is the alternative that had been left commented out.
    # NOTE(review): per the LightGBM docs a non-positive value disables early
    # stopping - confirm that is the intent.
    early_stopping_round=-1,
    metrics=['binary_logloss', 'auc'],
)

In [20]:
# Seed for the Optuna sampler so that re-running the notebook proposes the same
# sequence of hyperparameter configurations (the search was previously unseeded
# and therefore not reproducible).
search_seed = 42

# Optuna proposes configurations that minimise the reported "binary_logloss";
# the limiter lets only one trial run at a time.
search_alg = ConcurrencyLimiter(
    OptunaSearch(metric="binary_logloss", mode="min", seed=search_seed),
    max_concurrent=1,
)

In [21]:
def fit_mod(space):
    """Train one LightGBM model for a Ray Tune trial and report its scores.

    Parameters
    ----------
    space : dict
        LightGBM parameters for this trial; passed to ``lgb.train`` as-is.

    The train/test parquet files are read from the module-level ``fn_*`` paths
    on every call. Nothing is returned: the ``binary_logloss`` and ``auc``
    entries recorded for the validation set are sent to Ray via
    ``train.report``.
    """
    # Load the four tables in the same order as before: X train, X test,
    # y train, y test.
    features_train, features_valid, labels_train, labels_valid = (
        pd.read_parquet(path)
        for path in (fn_x_train, fn_x_test, fn_y_train, fn_y_test)
    )

    # Wrap features and labels in LightGBM datasets.
    fit_data = lgb.Dataset(features_train, labels_train)
    valid_data = lgb.Dataset(features_valid, labels_valid)

    # Fit the booster, evaluating on the held-out set.
    booster = lgb.train(space, fit_data, valid_sets=[valid_data])

    # Scores for the single validation set are stored under 'valid_0'.
    valid_scores = booster.best_score['valid_0']
    train.report(
        {metric: valid_scores[metric] for metric in ("binary_logloss", "auc")}
    )

In [None]:
# Search budget and the search algorithm that proposes each configuration.
tune_config = tune.TuneConfig(
    num_samples=500,  # 250 at prev. stages
    search_alg=search_alg,
)

# Where Ray stores the experiment, and the experiment's name.
run_config = train.RunConfig(
    storage_path=dir_hyperparameters,
    name="gb_ray_tune",
)

# Build the tuner around fit_mod and run all trials.
tuner = tune.Tuner(
    fit_mod,
    tune_config=tune_config,
    param_space=space,
    run_config=run_config,
)
results = tuner.fit()

0,1
Current time:,2025-02-24 20:48:23
Running for:,01:01:55.16
Memory:,5.9/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_6e6659ff,RUNNING,127.0.0.1:32778,0.0523709,129,437,,,,
fit_mod_01c203b1,TERMINATED,127.0.0.1:32758,0.0855301,109,364,1.0,99.3569,1.10379,0.367466
fit_mod_05b1a6d9,TERMINATED,127.0.0.1:32017,0.368979,7,394,1.0,76.3629,15.3507,0.00144769
fit_mod_0a34ab4e,TERMINATED,127.0.0.1:32103,0.479899,123,210,1.0,49.8794,0.0526561,0.792984
fit_mod_0a7abff5,TERMINATED,127.0.0.1:32094,0.19396,150,5,1.0,17.6898,0.00906965,0.933266
fit_mod_22508025,TERMINATED,127.0.0.1:32146,0.00483009,72,780,1.0,109.838,0.000166225,0.999997
fit_mod_285fcf04,TERMINATED,127.0.0.1:32384,0.117563,23,688,1.0,145.683,0.012568,0.893448
fit_mod_2d7a220b,TERMINATED,127.0.0.1:32187,0.0150821,91,86,1.0,26.5502,0.000474048,0.999227
fit_mod_48be281b,TERMINATED,127.0.0.1:32199,0.00509173,101,309,1.0,61.4197,0.000523857,0.999994
fit_mod_4f72cda5,TERMINATED,127.0.0.1:32489,0.0149429,134,76,1.0,28.7228,0.000543794,0.999226




In [None]:
# Pull the trial results into a DataFrame and work on an independent copy.
trial_results = results.get_dataframe()
Grid = trial_results.copy()

In [None]:
# Label the original row position so it survives as an 'order' column.
Grid.index.name = 'order'

# Rank trials: lowest log-loss first, higher AUC breaking ties; the new
# 0-based index is the rank.
RankedGrid = (
    Grid
    .sort_values(by=['binary_logloss', 'auc'], ascending=[True, False])
    .reset_index()
    .rename_axis('rank')
)

# Save the ranked table next to the Ray Tune results.
RankedGrid.to_csv(fn_hyperparameters)

In [None]:
# RankedGrid is already ordered best-first (log-loss ascending, AUC descending
# as tie-break), so take the leading rows directly. The earlier re-sort on
# log-loss alone was redundant and, because pandas' default sort is not
# stable, could undo the AUC tie-break.
top_columns = [
    'binary_logloss',
    'auc',
    'config/num_trees',
    'config/learning_rate',
    'config/min_data_in_leaf',
]
RankedGrid.head(10)[top_columns]