In [1]:
import pandas as pd
import numpy as np
import os

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter

from sklearn.metrics import log_loss

In [2]:
# Root of the project data tree. Defaults to the original external-drive
# location; set the REMATCH_DATA_DIR environment variable to point the notebook
# at a different copy of the data without editing this cell.
data_dir = os.environ.get('REMATCH_DATA_DIR', '/Volumes/Extreme SSD/rematch_eia_ferc1_docker')

In [3]:
# Date stamp shared by the tuning-run folder and the exported grid file.
# Kept in one place so the run folder, its temp folder and the CSV name
# cannot drift apart when a new run is started.
run_date = '2025_03_02'

# Folder holding this tuning run (also passed to Ray Tune as its storage path).
dir_hyperparameters = os.path.join(
    data_dir,
    'working_data/model_second_stage/model_second_stage_training',
    f'gbm_raytune_{run_date}',
)
# CSV that receives the ranked hyperparameter grid.
fn_hyperparameters = os.path.join(dir_hyperparameters, f'gbm_grid_{run_date}.csv')

# Temp folder inside the run folder; derived from dir_hyperparameters instead
# of repeating the full path literal.
dir_temp = os.path.join(dir_hyperparameters, 'temp_folder')

# Train/test feature and label tables read by every trial.
fn_x_train = os.path.join(dir_temp, 'x_train.parquet')
fn_x_test  = os.path.join(dir_temp, 'x_test.parquet')
fn_y_train = os.path.join(dir_temp, 'y_train.parquet')
fn_y_test  = os.path.join(dir_temp, 'y_test.parquet')

In [4]:
# Parameter space for the search. Plain values are the same for every trial;
# entries built with tune.* are sampled per trial. The whole dictionary is what
# the trainable receives and forwards to lgb.train.
space = {
    # --- fixed ---
    'verbose': -1,

    # --- sampled ---
    # Number of boosting rounds (earlier runs capped this at 500).
    'num_trees': tune.randint(1, 1000),
    'learning_rate': tune.uniform(0.0001, 0.75),
    'min_data_in_leaf': tune.randint(1, 200),

    # --- fixed ---
    'objective': 'binary',
    # A value of 2 was tried previously; -1 is presumably meant to switch early
    # stopping off -- confirm against the installed LightGBM version.
    'early_stopping_round': -1,
    # Both metrics are read back from the booster after training.
    'metrics': ['binary_logloss', 'auc'],
}

In [5]:
# Fixed seed for the Optuna sampler so the sequence of suggested
# hyperparameters can be reproduced on a re-run of the notebook.
search_seed = 42

# Optuna-driven search that minimises the reported "binary_logloss".
optuna_search = OptunaSearch(metric="binary_logloss", mode="min", seed=search_seed)

# Wrap the searcher so that at most one trial is in flight at a time.
# A separate name is used for the raw searcher so `search_alg` always refers
# to the object that is handed to the tuner.
search_alg = ConcurrencyLimiter(optuna_search, max_concurrent=1)

In [6]:
def fit_mod(space):
    """Train one LightGBM model and report its validation scores to Ray.

    Parameters
    ----------
    space : dict
        Parameter dictionary forwarded unchanged to ``lgb.train``
        (presumably the per-trial sample of the search space supplied by
        Ray Tune -- confirm against the tuner configuration).

    Returns
    -------
    None
        The scores are delivered through ``train.report`` under the keys
        ``"binary_logloss"`` and ``"auc"``.
    """
    # The tables are loaded from parquet inside the function, so every call
    # reads its own copy of the data from disk.
    features_train = pd.read_parquet(fn_x_train)
    features_test = pd.read_parquet(fn_x_test)
    labels_train = pd.read_parquet(fn_y_train)
    labels_test = pd.read_parquet(fn_y_test)

    # Wrap features and labels in LightGBM datasets.
    train_set = lgb.Dataset(features_train, labels_train)
    test_set = lgb.Dataset(features_test, labels_test)

    # Fit the booster, evaluating on the test set as the only validation set.
    booster = lgb.train(space, train_set, valid_sets=[test_set])

    # Scores of the single validation set are stored under 'valid_0'.
    # NOTE(review): with early stopping disabled these are presumably the
    # final-iteration scores rather than a per-iteration best -- confirm.
    valid_scores = booster.best_score['valid_0']
    train.report(
        {
            "binary_logloss": valid_scores['binary_logloss'],
            "auc": valid_scores['auc'],
        }
    )

In [7]:
# Number of hyperparameter configurations to evaluate (250 at prev. stages).
n_trials = 500

# How the search is driven: trial budget and the search algorithm.
tune_config = tune.TuneConfig(num_samples=n_trials, search_alg=search_alg)

# Where the experiment is stored and under which name.
run_config = train.RunConfig(storage_path=dir_hyperparameters, name="gb_ray_tune")

tuner = tune.Tuner(
    fit_mod,
    tune_config=tune_config,
    param_space=space,
    run_config=run_config,
)

# Runs the whole search; the recorded run below took roughly 26 hours.
results = tuner.fit()

0,1
Current time:,2025-03-02 14:32:59
Running for:,"1 days, 01:55:58.98"
Memory:,3.7/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_532178db,TERMINATED,127.0.0.1:96153,0.20287,13,638,1,132.068,0.0139788,0.991089
fit_mod_b960ba43,TERMINATED,127.0.0.1:96215,0.137807,167,625,1,144.802,0.00614423,0.97093
fit_mod_c87273b0,TERMINATED,127.0.0.1:96299,0.389224,138,795,1,165.496,0.0345043,0.00962988
fit_mod_42dc31bd,TERMINATED,127.0.0.1:96309,0.204241,191,429,1,103.781,0.0226816,0.971593
fit_mod_b2e4e44a,TERMINATED,127.0.0.1:96374,0.506226,134,268,1,70.0116,0.0440023,0.0374327
fit_mod_5dfeee59,TERMINATED,127.0.0.1:96394,0.731965,109,547,1,85.1549,0.328998,0.92157
fit_mod_19516bba,TERMINATED,127.0.0.1:96481,0.330446,41,125,1,45.1324,0.116618,0.852711
fit_mod_be54c8e0,TERMINATED,127.0.0.1:96487,0.0203864,142,111,1,39.3927,0.00312896,0.947117
fit_mod_51faab6d,TERMINATED,127.0.0.1:96494,0.442073,139,985,1,181.167,0.0348531,0.277701
fit_mod_16a859ac,TERMINATED,127.0.0.1:96513,0.158967,151,572,1,125.742,0.0689124,0.0718158


[33m(raylet)[0m [2025-03-02 08:01:57,614 E 96141 5535557] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-03-01_12-36-58_476906_96129 is over 95% full, available space: 12089171968; capacity: 245107195904. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-02 08:02:07,675 E 96141 5535557] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-03-01_12-36-58_476906_96129 is over 95% full, available space: 12088991744; capacity: 245107195904. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-02 09:23:45,593 E 96141 5535557] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-03-01_12-36-58_476906_96129 is over 95% full, available space: 11680051200; capacity: 245107195904. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-02 12:14:50,655 E 96141 5535557] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-03-01_12-36-58_476906_96129 is over 95% full, available space: 12243423

In [11]:
# Pull the per-trial results into a DataFrame and work on an independent copy.
Grid = (
    results
    .get_dataframe()
    .copy()
)

In [12]:
# Label the original row index so it survives as an 'order' column after
# the index is reset below.
Grid.index.name = 'order'

# Rank trials: lowest binary_logloss first, ties broken by highest auc.
# The fresh 0..n-1 index becomes the 'rank'.
RankedGrid = (
    Grid
    .sort_values(by=['binary_logloss', 'auc'], ascending=[True, False])
    .reset_index()
    .rename_axis('rank')
)

# Persist the ranked grid.
RankedGrid.to_csv(fn_hyperparameters)

In [13]:
# RankedGrid is already ordered by binary_logloss (ascending) with auc as the
# tie-breaker, so the first rows are the best trials. Sorting again on
# binary_logloss alone is redundant and, because the default sort is not
# stable, could show tied rows out of rank order.
top_columns = ['binary_logloss', 'auc', 'config/num_trees', 'config/learning_rate', 'config/min_data_in_leaf']
RankedGrid.head(10)[top_columns]

Unnamed: 0_level_0,binary_logloss,auc,config/num_trees,config/learning_rate,config/min_data_in_leaf
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.000137,0.999994,580,0.01373,162
1,0.000141,0.999994,843,0.012509,129
2,0.000142,0.999992,861,0.012041,166
3,0.000143,0.999988,901,0.011118,162
4,0.000143,0.999995,925,0.01275,142
5,0.000143,0.999987,835,0.012754,162
6,0.000144,0.999995,541,0.012782,154
7,0.000145,0.99999,858,0.013276,169
8,0.000145,0.999996,867,0.014005,180
9,0.000145,0.999992,549,0.013758,166
