In [10]:
import pandas as pd
import numpy as np
import os

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter

from sklearn.metrics import log_loss

In [18]:
# Root of the project data tree. Set the REMATCH_DATA_DIR environment variable
# to run on another machine; the original external-drive path is the fallback.
data_dir = os.environ.get('REMATCH_DATA_DIR', '/Volumes/Extreme SSD/rematch_eia_ferc1_docker')

In [10]:
# Sub-path (relative to data_dir) shared by every artifact of this tuning run.
_run_subdir = 'working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_03_21'

# Output: run directory and the ranked-grid CSV written at the end of the notebook.
dir_hyperparameters = os.path.join(data_dir, _run_subdir)
fn_hyperparameters = os.path.join(dir_hyperparameters, 'gbm_grid_2025_03_21.csv')

# Input: train/test feature and label tables, read by every trial.
dir_temp = os.path.join(data_dir, _run_subdir + '/temp_folder')
fn_x_train, fn_x_test, fn_y_train, fn_y_test = (
    os.path.join(dir_temp, f'{stem}.parquet')
    for stem in ('x_train', 'x_test', 'y_train', 'y_test')
)

In [11]:
# Parameter space given to the tuner. Plain values are the same for every
# trial; the tune.* entries are the three hyperparameters being searched.
space = dict(
    verbose=-1,
    num_trees=tune.randint(1, 1000),  # used to max at 500
    learning_rate=tune.uniform(0.0001, 0.75),
    min_data_in_leaf=tune.randint(1, 200),
    objective='binary',
    # NOTE(review): presumably a non-positive value turns early stopping off,
    # so each trial trains all `num_trees` rounds -- confirm in LightGBM docs.
    early_stopping_round=-1,
    # Both metrics are read back from the booster in fit_mod.
    metrics=['binary_logloss', 'auc'],
)

In [12]:
# Optuna-driven search that minimises validation log-loss, wrapped so that
# only one trial is in flight at a time.
# NOTE(review): no seed is passed to OptunaSearch, so the sampled trials
# presumably differ from run to run -- confirm whether that is intended.
search_alg = ConcurrencyLimiter(
    OptunaSearch(metric="binary_logloss", mode="min"),
    max_concurrent=1,
)

In [13]:
def fit_mod(space):
    """Train one LightGBM model for a trial and report its validation scores.

    Parameters
    ----------
    space : dict
        Parameter dictionary for this trial; it is passed unchanged to
        ``lgb.train`` as the training parameters.

    Returns
    -------
    None
        The ``binary_logloss`` and ``auc`` recorded for the validation set
        are sent to Ray through ``train.report``.

    Notes
    -----
    The four parquet tables are re-read from disk on every call, using the
    module-level ``fn_*`` paths.
    """
    # Load the feature and label tables for both splits.
    features_train = pd.read_parquet(fn_x_train)
    features_test = pd.read_parquet(fn_x_test)
    labels_train = pd.read_parquet(fn_y_train)
    labels_test = pd.read_parquet(fn_y_test)

    # Fit on the training split and evaluate on the test split, which is the
    # only validation set and therefore appears under the key 'valid_0'.
    booster = lgb.train(
        space,
        lgb.Dataset(features_train, label=labels_train),
        valid_sets=[lgb.Dataset(features_test, label=labels_test)],
    )

    validation_scores = booster.best_score['valid_0']
    train.report(
        {metric: validation_scores[metric] for metric in ("binary_logloss", "auc")}
    )

In [None]:
# Long-running cell: the recorded output below shows this search running for
# more than 11 hours.
tune_cfg = tune.TuneConfig(
    search_alg=search_alg,
    num_samples=500,  # 250 at prev. stages
)
# Trial artifacts are stored under <dir_hyperparameters>/gb_ray_tune.
run_cfg = train.RunConfig(
    name="gb_ray_tune",
    storage_path=dir_hyperparameters,
)

tuner = tune.Tuner(
    trainable=fit_mod,
    param_space=space,
    tune_config=tune_cfg,
    run_config=run_cfg,
)
results = tuner.fit()

0,1
Current time:,2025-03-22 02:47:27
Running for:,11:02:12.31
Memory:,4.8/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_537349ff,RUNNING,127.0.0.1:67777,0.0138494,89,510,,,,
fit_mod_0048a8b7,TERMINATED,127.0.0.1:61014,0.0152322,29,227,1.0,56.3774,0.000270607,0.999991
fit_mod_00962577,TERMINATED,127.0.0.1:61306,0.0271792,34,269,1.0,69.4385,0.00551349,0.955496
fit_mod_00ab32c7,TERMINATED,127.0.0.1:65257,0.0447054,112,641,1.0,182.212,0.00772642,0.954954
fit_mod_012b47e4,TERMINATED,127.0.0.1:64555,0.0289736,90,490,1.0,129.156,0.00644477,0.957278
fit_mod_0157d2ed,TERMINATED,127.0.0.1:61467,0.014878,43,226,1.0,48.7102,0.000240229,0.999992
fit_mod_03348c36,TERMINATED,127.0.0.1:61914,0.0285179,26,518,1.0,136.048,0.00895187,0.96116
fit_mod_0373cf18,TERMINATED,127.0.0.1:63731,0.0245907,72,581,1.0,138.128,0.00715003,0.979363
fit_mod_0481ec07,TERMINATED,127.0.0.1:65816,0.0263811,110,431,1.0,98.8221,0.00411037,0.967159
fit_mod_04afade7,TERMINATED,127.0.0.1:63617,0.0170142,69,531,1.0,89.6096,0.00020877,0.999995




In [None]:
# Tabulate the tuning results; work on a copy of the frame Ray returns.
Grid = (
    results
    .get_dataframe()
    .copy()
)

In [None]:
# Label the existing index 'order' so it survives as a column once the
# rows are re-ranked.
Grid.index.name = 'order'

# Best trials first: lowest log-loss, ties broken by highest AUC. The new
# 0-based index is the rank; the old index is kept as the 'order' column.
RankedGrid = (
    Grid
    .sort_values(by=['binary_logloss', 'auc'], ascending=[True, False])
    .reset_index()
    .rename_axis('rank')
)
RankedGrid.to_csv(fn_hyperparameters)

In [None]:
# Show the ten best trials. RankedGrid is already ordered by log-loss (ties
# broken by AUC), so it is sliced directly: re-sorting on 'binary_logloss'
# alone used a non-stable sort and could undo the AUC tie-break.
RankedGrid[
    ['binary_logloss', 'auc', 'config/num_trees', 'config/learning_rate', 'config/min_data_in_leaf']
].head(10)