In [1]:
import pandas as pd
import numpy as np
import os

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter

from sklearn.metrics import log_loss

In [2]:
# Root folder of the project data. The default is the original external-drive
# location; set the REMATCH_DATA_DIR environment variable to run the notebook
# against a different location without editing this cell.
data_dir = os.environ.get(
    'REMATCH_DATA_DIR',
    '/Volumes/Extreme SSD/rematch_eia_ferc1_docker',
)

In [4]:
# Folder for this tuning run (also used as the Ray Tune storage path) and the
# CSV that receives the ranked trial results.
dir_hyperparameters = os.path.join(
    data_dir,
    'working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21',
)
fn_hyperparameters = os.path.join(dir_hyperparameters, 'gbm_grid_2025_02_21.csv')

# Folder holding the train/test parquet files read by the cells below.
dir_temp = os.path.join(
    data_dir,
    'working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21/temp_folder',
)

# One parquet file per split, in the order: x_train, x_test, y_train, y_test.
fn_x_train, fn_x_test, fn_y_train, fn_y_test = (
    os.path.join(dir_temp, stem + '.parquet')
    for stem in ('x_train', 'x_test', 'y_train', 'y_test')
)

In [5]:
# Search space handed to Ray Tune as `param_space`; the resolved dictionary is
# what fit_mod passes straight to lgb.train as LightGBM parameters.
# Plain values are the same for every trial; tune.* entries are sampled.
space = dict(
    verbose=-1,
    # Upper bound used to max at 500.
    num_trees=tune.randint(1, 1000),
    learning_rate=tune.uniform(0.0001, 0.75),
    min_data_in_leaf=tune.randint(1, 200),
    objective='binary',
    # A value of 2 was tried previously; -1 presumably turns early stopping
    # off (confirm against the LightGBM parameter docs).
    early_stopping_round=-1,
    metrics=['binary_logloss', 'auc'],
)

In [6]:
# Fixed seed so the sequence of configurations proposed by Optuna is
# repeatable from one run of the notebook to the next.
SEARCH_SEED = 42

# Optuna proposes each configuration, minimising the validation log-loss that
# fit_mod reports under the key "binary_logloss".
optuna_search = OptunaSearch(metric="binary_logloss", mode="min", seed=SEARCH_SEED)

# Limit the searcher to one in-flight trial, so trials run one after another.
search_alg = ConcurrencyLimiter(optuna_search, max_concurrent=1)

In [7]:
# Load the four pre-split tables (train/test features, then train/test labels).
XTrain, XTest, YTrain, YTest = (
    pd.read_parquet(fn)
    for fn in (fn_x_train, fn_x_test, fn_y_train, fn_y_test)
)

# Wrap features and labels in LightGBM dataset objects.
train_set = lgb.Dataset(data=XTrain, label=YTrain)
test_set = lgb.Dataset(data=XTest, label=YTest)

# Filled in by the record_evaluation callback during training.
evals = {}

# Small three-tree fit on the full data; this appears to be a quick check that
# loading and training work before the long search is launched.
gbm = lgb.train(
    params={
        'num_trees': 3,
        'min_data_in_leaf': 100,
        'objective': 'binary',
        'early_stopping_round': -1,
        'metrics': ['binary_logloss', 'auc'],
    },
    train_set=train_set,
    valid_sets=[test_set],
    callbacks=[lgb.record_evaluation(evals)],
)



[LightGBM] [Info] Number of positive: 2463, number of negative: 2463000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.313517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6415
[LightGBM] [Info] Number of data points in the train set: 2465463, number of used features: 142
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755


In [8]:
def fit_mod(space):
    """Train one LightGBM model for a single trial and report its validation scores.

    Parameters
    ----------
    space : dict
        LightGBM parameters for this trial; passed unchanged to ``lgb.train``.

    Reports
    -------
    A dictionary with the keys ``"binary_logloss"`` and ``"auc"``, taken from
    the booster's ``best_score`` entry for the first validation set
    (``'valid_0'``), sent through ``train.report``. Nothing is returned.
    """
    # The parquet files are re-read on every call (presumably so each trial
    # process loads its own copy of the data; confirm if this is intentional).
    features_train, features_valid, labels_train, labels_valid = (
        pd.read_parquet(path)
        for path in (fn_x_train, fn_x_test, fn_y_train, fn_y_test)
    )

    booster = lgb.train(
        params=space,
        train_set=lgb.Dataset(features_train, labels_train),
        valid_sets=[lgb.Dataset(features_valid, labels_valid)],
    )

    # Scores recorded for the only validation set supplied above.
    valid_scores = booster.best_score['valid_0']
    train.report(
        {name: valid_scores[name] for name in ("binary_logloss", "auc")}
    )

In [None]:
# Configure the search: fit_mod is run once per sampled configuration, and the
# run is stored under dir_hyperparameters in a folder named "gb_ray_tune".
tuner = tune.Tuner(
    trainable=fit_mod,
    param_space=space,
    tune_config=tune.TuneConfig(
        search_alg=search_alg,
        num_samples=500,  # 250 at prev. stages
    ),
    run_config=train.RunConfig(
        name="gb_ray_tune",
        storage_path=dir_hyperparameters,
    ),
)

# Launch the search (the captured output below shows a run of over 14 hours).
results = tuner.fit()

0,1
Current time:,2025-02-28 12:16:37
Running for:,14:07:54.87
Memory:,4.6/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_58a8c497,RUNNING,127.0.0.1:86034,0.336743,187,838,,,,
fit_mod_00a84993,TERMINATED,127.0.0.1:84818,0.000813544,182,709,1.0,113.33,0.00150025,0.999993
fit_mod_00d261d7,TERMINATED,127.0.0.1:84971,0.0505388,180,787,1.0,197.41,0.0107255,0.95699
fit_mod_00ee6af2,TERMINATED,127.0.0.1:78799,0.00627067,36,436,1.0,64.8027,0.00027022,0.999992
fit_mod_04965f04,TERMINATED,127.0.0.1:84499,0.0253593,188,695,1.0,178.416,0.0134463,0.966672
fit_mod_05f88de5,TERMINATED,127.0.0.1:83394,0.0258041,161,940,1.0,233.357,0.0143793,0.978768
fit_mod_06ffaeb2,TERMINATED,127.0.0.1:81077,0.343478,152,912,1.0,112.548,2.45541,0.919605
fit_mod_075bda1a,TERMINATED,127.0.0.1:78392,0.695205,126,960,1.0,181.459,7.32422,0.000871173
fit_mod_09e08373,TERMINATED,127.0.0.1:82931,0.0117459,126,925,1.0,131.729,0.000148072,0.999997
fit_mod_0a61adfa,TERMINATED,127.0.0.1:83120,0.000977478,135,981,1.0,143.266,0.000984247,0.999995


[33m(raylet)[0m [2025-02-28 07:11:56,049 E 78287 4530954] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-02-27_22-08-38_622866_78039 is over 95% full, available space: 12252905472; capacity: 245107195904. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-02-28 07:48:06,928 E 78287 4530954] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-02-27_22-08-38_622866_78039 is over 95% full, available space: 12226031616; capacity: 245107195904. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-02-28 07:48:16,940 E 78287 4530954] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-02-27_22-08-38_622866_78039 is over 95% full, available space: 12225482752; capacity: 245107195904. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-02-28 08:59:12,245 E 78287 4530954] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2025-02-27_22-08-38_622866_78039 is over 95% full, available space: 12232757

In [None]:
# Tabulate the tuning results as a DataFrame and keep an independent copy, so
# later edits to Grid cannot touch the object returned by Ray.
Grid = results.get_dataframe().copy(deep=True)

In [None]:
# Label the existing index so it survives reset_index() as an 'order' column.
Grid.index.name = 'order'

# Best trials first: lowest log-loss, ties broken by highest AUC.
RankedGrid = Grid.sort_values(
    by=['binary_logloss', 'auc'],
    ascending=[True, False],
)
RankedGrid = RankedGrid.reset_index().rename_axis('rank')

# Persist the ranked table; the 'rank' index is written as the first column.
RankedGrid.to_csv(fn_hyperparameters)

In [None]:
# Show the ten trials with the lowest log-loss, with their scores and the
# sampled hyperparameters.
(
    RankedGrid
    .sort_values(by='binary_logloss')
    .iloc[:10]
    .loc[:, ['binary_logloss', 'auc', 'config/num_trees', 'config/learning_rate', 'config/min_data_in_leaf']]
)