In [7]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import dask
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
# from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter
from sklearn.model_selection import train_test_split
# from optuna.samplers import TPESampler

In [8]:
# Directory that holds the model inputs and receives the Ray Tune experiment.
# Every other path in this cell is derived from it, so pointing the notebook
# at a different location is a one-line change.
dir_hyperparameters = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train'

# Predictors / labels used to fit each candidate model.
fn_train_x = f'{dir_hyperparameters}/train_x.parquet'
fn_train_y = f'{dir_hyperparameters}/train_y.parquet'

# Predictors / labels used to score each candidate model.
# (These "test" files sit in the same train/ directory as the files above.)
fn_test_x = f'{dir_hyperparameters}/test_x.parquet'
fn_test_y = f'{dir_hyperparameters}/test_y.parquet'

# Ranked table of trial results; the gb_ray_tune/ sub-directory matches the
# experiment name given to RunConfig when the tuner is built.
fn_out = f'{dir_hyperparameters}/gb_ray_tune/grid_search.csv'

In [9]:
def fit_mod(space):
    """Train one LightGBM model for a Ray Tune trial and report its scores.

    Parameters
    ----------
    space : dict
        LightGBM parameters for this trial; passed to ``lgb.train`` unchanged.
        The reported scores are looked up by name, so the parameters are
        expected to request both the ``binary_logloss`` and ``auc`` metrics.

    Returns
    -------
    None
        The validation ``binary_logloss`` and ``auc`` are handed to Ray via
        ``train.report`` rather than returned.

    Notes
    -----
    The input frames are read from the parquet files named by the module-level
    ``fn_train_x`` / ``fn_train_y`` and ``fn_test_x`` / ``fn_test_y`` on every
    call.
    """
    # NOTE(review): the validation set is built from the files named "test", so
    # the search is steered by the same data these scores are measured on --
    # confirm a separate hold-out exists for the final evaluation.

    # Load the fitting and validation frames.
    features_fit = pd.read_parquet(fn_train_x)
    labels_fit = pd.read_parquet(fn_train_y)
    features_val = pd.read_parquet(fn_test_x)
    labels_val = pd.read_parquet(fn_test_y)

    fit_data = lgb.Dataset(features_fit, labels_fit)
    val_data = lgb.Dataset(features_val, labels_val)

    # Fit the booster, evaluating against the single validation set.
    booster = lgb.train(space, fit_data, valid_sets=[val_data])

    # No valid_names are passed; the scores for the only validation set are
    # read from the 'valid_0' entry of best_score.
    val_scores = booster.best_score['valid_0']
    train.report(
        {
            "binary_logloss": val_scores['binary_logloss'],
            "auc": val_scores['auc'],
        }
    )

In [10]:
# Hyperparameters that Ray Tune samples afresh for every trial.
# NOTE(review): learning_rate is drawn on a linear scale over [0.0001, 1];
# consider whether a log-scaled range would cover small rates better.
searched_params = {
    'num_trees': tune.randint(1, 500),
    'learning_rate': tune.uniform(0.0001, 1),
    'min_data_in_leaf': tune.randint(1, 200),
}

# Settings held constant across trials. early_stopping_round is -1, so no
# early stopping is requested; both metrics are listed because fit_mod
# reports each of them.
constant_params = {
    'objective': 'binary',
    'early_stopping_round': -1,
    'metrics': ['binary_logloss', 'auc'],
}

# Full parameter space handed to the tuner. Key order: verbose, the sampled
# parameters, then the constant ones.
space = {'verbose': -1, **searched_params, **constant_params}

In [11]:
# Optuna proposes each trial's hyperparameters, minimising the binary_logloss
# that fit_mod reports. The sampler is seeded so that a re-run draws from the
# same random stream instead of starting from an arbitrary state; with two
# trials in flight the order in which results arrive can still differ between
# runs, so treat the seed as an aid to repeatability, not a guarantee.
# The limiter caps the search at two concurrent trials.
search_alg = ConcurrencyLimiter(
    OptunaSearch(metric="binary_logloss", mode="min", seed=42),
    max_concurrent=2,
)

In [None]:
# Search budget: 500 sampled configurations, proposed by search_alg.
# No scheduler is passed to TuneConfig.
tune_settings = tune.TuneConfig(
    num_samples=500,
    search_alg=search_alg,
)

# The experiment is stored as "gb_ray_tune" under dir_hyperparameters.
run_settings = train.RunConfig(
    storage_path=dir_hyperparameters,
    name="gb_ray_tune",
)

tuner = tune.Tuner(
    fit_mod,
    tune_config=tune_settings,
    param_space=space,
    run_config=run_settings,
)

# Runs the whole search; the captured output of a previous run shows this
# taking several hours.
results = tuner.fit()

0,1
Current time:,2024-12-26 04:43:41
Running for:,08:22:24.59
Memory:,5.9/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_trees,iter,total time (s),binary_logloss,auc
fit_mod_15660bba,RUNNING,127.0.0.1:14741,0.0557718,94,358,,,,
fit_mod_2b77914b,RUNNING,127.0.0.1:14756,0.0325914,91,358,,,,
fit_mod_020944e5,TERMINATED,127.0.0.1:8538,0.0430599,95,358,1.0,280.605,0.0238151,0.950697
fit_mod_02a74893,TERMINATED,127.0.0.1:7207,0.0958268,79,294,1.0,197.149,0.0093349,0.959127
fit_mod_06160ded,TERMINATED,127.0.0.1:3815,0.0171459,80,310,1.0,234.695,0.00338018,0.959696
fit_mod_0714bd41,TERMINATED,127.0.0.1:12348,0.00294853,28,364,1.0,225.62,0.000485403,0.999258
fit_mod_077e51f5,TERMINATED,127.0.0.1:14063,0.042769,87,326,1.0,232.741,0.0108824,0.961097
fit_mod_07ad2fc6,TERMINATED,127.0.0.1:5224,0.301999,74,428,1.0,250.717,0.0157027,0.858307
fit_mod_0870d815,TERMINATED,127.0.0.1:7897,0.0412037,76,412,1.0,326.701,0.00857063,0.969092
fit_mod_08865091,TERMINATED,127.0.0.1:8041,0.0179591,92,340,1.0,251.662,0.00387712,0.963504




In [None]:
# Tabulate the trial results as a DataFrame; work on an explicit deep copy so
# later edits to Grid cannot touch the object returned by get_dataframe().
Grid = results.get_dataframe().copy(deep=True)

In [None]:
# Label Grid's existing index 'order' so that reset_index() below turns it
# into a column of that name.
Grid.index.name = 'order'

# Best trials first: lowest binary_logloss, ties broken by highest auc.
# The fresh 0..n-1 index produced by reset_index() is labelled 'rank'.
RankedGrid = (
    Grid
    .sort_values(by=['binary_logloss', 'auc'], ascending=[True, False])
    .reset_index()
    .rename_axis('rank')
)

# Persist the ranked table (the 'rank' index is written as the first column).
RankedGrid.to_csv(path_or_buf=fn_out)

In [21]:
# Preview the five best-ranked trials.
RankedGrid.head(n=5)

Unnamed: 0_level_0,order,binary_logloss,auc,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,...,time_since_restore,iterations_since_restore,config/verbose,config/num_trees,config/learning_rate,config/min_data_in_leaf,config/objective,config/early_stopping_round,config/metrics,logdir
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,43,0.000297,0.999538,1735094903,,False,1,c28cd683,2024-12-24_18-48-23,213.021816,...,213.021816,1,-1,397,0.004754,78,binary,-1,"[binary_logloss, auc]",c28cd683
1,112,0.000306,0.997294,1735100466,,False,1,74150487,2024-12-24_20-21-06,164.082999,...,164.082999,1,-1,261,0.012356,62,binary,-1,"[binary_logloss, auc]",74150487
2,45,0.000308,0.998993,1735095035,,False,1,4c178663,2024-12-24_18-50-35,155.330574,...,155.330574,1,-1,263,0.009665,84,binary,-1,"[binary_logloss, auc]",4c178663
3,481,0.00031,0.998069,1735136501,,False,1,2dadf8b8,2024-12-25_06-21-41,159.621424,...,159.621424,1,-1,279,0.014149,67,binary,-1,"[binary_logloss, auc]",2dadf8b8
4,52,0.000326,0.999687,1735095604,,False,1,d5488a83,2024-12-24_19-00-04,175.706035,...,175.706035,1,-1,304,0.005784,81,binary,-1,"[binary_logloss, auc]",d5488a83


*** SIGTERM received at time=1735163063 ***
PC: @        0x19c6c101c  (unknown)  kevent
    @        0x157128660  (unknown)  absl::lts_20230125::WriteFailureInfo()
    @        0x1571283ac  (unknown)  absl::lts_20230125::AbslFailureSignalHandler()
    @        0x19c732e04  (unknown)  _sigtramp
    @        0x103080bf0  (unknown)  select_kqueue_control
    @        0x102a2b5a8  (unknown)  _PyEval_EvalFrameDefault
    @        0x102a1c05c  (unknown)  PyEval_EvalCode
    @        0x102a18c78  (unknown)  builtin_exec
    @        0x10296c19c  (unknown)  cfunction_vectorcall_FASTCALL_KEYWORDS
    @        0x10290ff1c  (unknown)  PyObject_Vectorcall
    @        0x102a2998c  (unknown)  _PyEval_EvalFrameDefault
    @        0x102ac00a8  (unknown)  pymain_run_module
    @        0x102abfad4  (unknown)  Py_RunMain
    @        0x102ac0940  (unknown)  pymain_main
    @        0x1028a8d60  (unknown)  main
    @        0x19c37c274  (unknown)  start
[2024-12-25 13:44:23,403 E 1147 34823] logging.cc

In [8]:
# experiment_path = "/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/gb_ray_tune"
# restored_tuner = tune.Tuner.restore(experiment_path, trainable=fit_mod)

In [9]:
# fn_results = '/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/ray_tune_dataframe.csv'
# restored_tuner.get_results().get_dataframe().to_csv(fn_results)

In [10]:
# !jupyter nbconvert --to script model_a_hyperparameter_search.ipynb