In [49]:
import pandas as pd
import numpy as np
import os

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter

from sklearn.metrics import log_loss

In [17]:
# Root folder of the project data. Set the REMATCH_DATA_DIR environment variable
# to point the notebook at another location; otherwise the original external-drive
# path is used, so existing runs behave exactly as before.
data_dir = os.environ.get('REMATCH_DATA_DIR', '/Volumes/Extreme SSD/rematch_eia_ferc1_docker')

In [18]:
# Folder holding every artifact of this tuning run (it is also handed to Ray Tune
# as its storage_path further down).
dir_hyperparameters = os.path.join(data_dir, 'working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21')
# CSV that receives the ranked trial results.
# Fixed: this previously referenced the undefined name `dir_hyperparameter`,
# which raises NameError on a fresh kernel.
fn_hyperparameters = os.path.join(dir_hyperparameters, 'gbm_grid_2025_02_21.csv')

# Scratch folder with the pre-split train/test parquet files; lives inside the run folder.
dir_temp = os.path.join(dir_hyperparameters, 'temp_folder')

# Feature (x) and label (y) files for the training and test splits.
fn_x_train = os.path.join(dir_temp, 'x_train.parquet')
fn_x_test  = os.path.join(dir_temp, 'x_test.parquet')
fn_y_train = os.path.join(dir_temp, 'y_train.parquet')
fn_y_test  = os.path.join(dir_temp, 'y_test.parquet')

In [19]:
# Search space handed to Ray Tune: constants are passed to LightGBM unchanged,
# `tune.*` entries are sampled per trial.
# NOTE(review): the recorded trial table shows several runs with large learning
# rates ending at very high log loss -- consider narrowing the range or sampling
# the learning rate on a log scale.
space = dict(
    verbose=-1,
    num_trees=tune.randint(1, 1000),  # used to max at 500
    learning_rate=tune.uniform(0.0001, 0.75),
    min_data_in_leaf=tune.randint(1, 200),
    objective='binary',
    early_stopping_round=-1,  # early stopping switched off (a value of 2 was tried before)
    metrics=['binary_logloss', 'auc'],
)

In [20]:
# Seed for the Optuna sampler so the sequence of proposed configurations can be
# reproduced across runs (previously unseeded).
SEARCH_SEED = 42

# Optuna-backed searcher minimising the validation binary log loss reported by each trial.
search_alg = OptunaSearch(metric="binary_logloss", mode="min", seed=SEARCH_SEED)
# Cap the searcher at one concurrent trial.
search_alg = ConcurrencyLimiter(search_alg, max_concurrent=1)

In [64]:
# Quick sanity fit: train a tiny 3-tree model outside Ray Tune and record the
# per-iteration validation metrics in `evals`.

# Load the cached feature / label splits
XTrain, XTest = pd.read_parquet(fn_x_train), pd.read_parquet(fn_x_test)
YTrain, YTest = pd.read_parquet(fn_y_train), pd.read_parquet(fn_y_test)

# Wrap them as LightGBM datasets
train_set = lgb.Dataset(data=XTrain, label=YTrain)
test_set  = lgb.Dataset(data=XTest,  label=YTest)

# Filled in by the record_evaluation callback during training
evals = {}

smoke_params = {
    'num_trees': 3,
    'min_data_in_leaf': 100,
    'objective': 'binary',
    'early_stopping_round': -1,
    'metrics': ['binary_logloss', 'auc'],
}

gbm = lgb.train(
    smoke_params,
    train_set,
    valid_sets=[test_set],
    callbacks=[lgb.record_evaluation(evals)],
)



[LightGBM] [Info] Number of positive: 2463, number of negative: 2463000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.277660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6415
[LightGBM] [Info] Number of data points in the train set: 2465463, number of used features: 142
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755


In [68]:
# Last recorded validation log loss from the sanity fit
valid_logloss_history = evals['valid_0']['binary_logloss']
valid_logloss_history[-1]

0.00408081481955259

In [21]:
def fit_mod(space):
    """Ray Tune trainable: fit one LightGBM model and report its validation scores.

    Parameters
    ----------
    space : dict
        LightGBM parameters for this trial; passed to ``lgb.train`` as-is.

    The train/test parquet files are re-read on every call. Nothing is returned;
    the ``binary_logloss`` and ``auc`` entries of ``best_score['valid_0']`` are
    sent back through ``train.report``.
    """
    # Load the four splits in a fixed order: x_train, x_test, y_train, y_test
    x_train, x_test, y_train, y_test = (
        pd.read_parquet(path)
        for path in (fn_x_train, fn_x_test, fn_y_train, fn_y_test)
    )

    # Wrap as LightGBM datasets
    dtrain = lgb.Dataset(x_train, y_train)
    dvalid = lgb.Dataset(x_test, y_test)

    # Fit with the trial's parameters, evaluating on the held-out split
    booster = lgb.train(params=space, train_set=dtrain, valid_sets=[dvalid])

    # Scores for the single validation set (LightGBM names it 'valid_0')
    valid_scores = booster.best_score['valid_0']
    train.report(
        {
            "binary_logloss": valid_scores['binary_logloss'],
            "auc": valid_scores['auc'],
        }
    )

In [22]:
# Number of configurations to evaluate (250 at prev. stages)
N_TRIALS = 500

# How the search is driven: trial budget and the (concurrency-limited) searcher
tune_config = tune.TuneConfig(num_samples=N_TRIALS, search_alg=search_alg)

# Where the experiment state and trial results are written
run_config = train.RunConfig(storage_path=dir_hyperparameters, name="gb_ray_tune")

tuner = tune.Tuner(
    fit_mod,
    param_space=space,
    tune_config=tune_config,
    run_config=run_config,
)
results = tuner.fit()

0,1
Current time:,2025-02-24 23:04:08
Running for:,03:17:40.02
Memory:,6.0/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_b178c5ac,RUNNING,127.0.0.1:34817,0.0355929,134,674,,,,
fit_mod_cd7f9a16,TERMINATED,127.0.0.1:31900,0.547356,71,911,1.0,170.433,0.00574189,0.966746
fit_mod_bab2caed,TERMINATED,127.0.0.1:31910,0.58635,81,450,1.0,94.1652,16.9202,0.0057974
fit_mod_d1cc55f9,TERMINATED,127.0.0.1:31915,0.0305092,197,655,1.0,172.08,0.0134868,0.97108
fit_mod_cf82945d,TERMINATED,127.0.0.1:31950,0.223507,116,962,1.0,146.774,0.0309057,0.890616
fit_mod_6e181f86,TERMINATED,127.0.0.1:31960,0.0946299,149,387,1.0,90.9329,0.0121427,0.995885
fit_mod_9c0c991d,TERMINATED,127.0.0.1:32002,0.314388,40,382,1.0,88.3214,13.9675,0.0154535
fit_mod_f2665887,TERMINATED,127.0.0.1:32007,0.373717,179,600,1.0,115.486,5.57453,0.0510425
fit_mod_05b1a6d9,TERMINATED,127.0.0.1:32017,0.368979,7,394,1.0,76.3629,15.3507,0.00144769
fit_mod_97f6a549,TERMINATED,127.0.0.1:32020,0.614538,63,216,1.0,56.4645,0.231475,0.00636599


2025-02-24 23:04:08,713	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21/gb_ray_tune' in 0.0988s.
2025-02-24 23:04:18,764	INFO tune.py:1041 -- Total run time: 11870.08 seconds (11859.92 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21/gb_ray_tune", trainable=...)


In [23]:
# Pull the tuning results into a DataFrame and work on a copy of it
trial_results = results.get_dataframe()
Grid = trial_results.copy()

In [24]:
# Label the existing index 'order' so it survives as a column after reset_index()
Grid.index.name = 'order'

# Rank trials: lowest log loss first, ties broken by highest AUC
rank_keys = ['binary_logloss', 'auc']
RankedGrid = Grid.sort_values(by=rank_keys, ascending=[True, False])
RankedGrid = RankedGrid.reset_index()
RankedGrid.index.name = 'rank'

# Persist the ranked table next to the tuning artifacts
RankedGrid.to_csv(fn_hyperparameters)

In [25]:
# Show the ten trials with the lowest validation log loss and their sampled settings
summary_cols = ['binary_logloss', 'auc', 'config/num_trees', 'config/learning_rate', 'config/min_data_in_leaf']
top_trials = RankedGrid.sort_values('binary_logloss').head(10)
top_trials[summary_cols]

Unnamed: 0_level_0,binary_logloss,auc,config/num_trees,config/learning_rate,config/min_data_in_leaf
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.000126,0.999997,882,0.012975,164
1,0.000128,0.999997,778,0.008274,158
2,0.000128,0.999996,678,0.009364,134
3,0.000138,0.999997,799,0.01154,135
4,0.000142,0.999997,720,0.012843,135
5,0.000161,0.999997,767,0.016259,135
6,0.000163,0.999997,757,0.016286,115
7,0.000166,0.999997,780,0.00483,72
8,0.000197,0.999995,727,0.017365,130
9,0.000203,0.999995,318,0.009283,99
