In [8]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import dask
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
# from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter

In [2]:
# Directory holding the model-A training data; also used as the Ray Tune storage path.
dir_hyperparameters = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train'
# Feature and label tables read by each trial; both live in the directory above.
fn_train_x = f'{dir_hyperparameters}/train_x.parquet'
fn_train_y = f'{dir_hyperparameters}/train_y.parquet'

In [3]:
def fit_mod(space):
    """Train one LightGBM model for a Ray Tune trial and report its validation scores.

    Reads the feature and label tables from ``fn_train_x`` / ``fn_train_y``,
    holds out 20% of the rows for validation, trains a booster with the
    hyperparameters in ``space`` and reports the validation metrics to Ray Tune.

    Parameters
    ----------
    space : dict
        LightGBM parameters sampled for this trial. Passed to ``lgb.train``
        unchanged. It must request the ``binary_logloss`` and ``auc`` metrics,
        because both are read back from the trained booster below.

    Returns
    -------
    None
        Results are delivered through ``train.report``.
    """
    # Fixed seed: every trial must be scored on the SAME held-out rows.
    # With the unseeded global RNG each trial gets a different split, so
    # differences in the reported metrics mix hyperparameter effects with
    # split noise and the search is not reproducible.
    split_seed = 0
    train_fraction = 0.8

    # ELT
    X = pd.read_parquet(fn_train_x)
    Y = pd.read_parquet(fn_train_y)

    # Split by index label; Y is looked up with the same labels as X, so the
    # two tables are assumed to share a unique index -- TODO confirm.
    rng = np.random.default_rng(split_seed)
    size_of_train_set = round(train_fraction * X.shape[0])
    rows_for_train_set = rng.choice(a=X.index.to_numpy(), size=size_of_train_set, replace=False)
    rows_for_val_set = np.setdiff1d(X.index, rows_for_train_set)

    train_set = lgb.Dataset(X.loc[rows_for_train_set], Y.loc[rows_for_train_set])
    val_set = lgb.Dataset(X.loc[rows_for_val_set], Y.loc[rows_for_val_set])

    # Model
    gbm = lgb.train(
        space,
        train_set,
        valid_sets=[val_set],
    )

    # 'valid_0' is the key under which the single, unnamed validation set is stored.
    validation_scores = gbm.best_score['valid_0']
    binary_logloss = validation_scores['binary_logloss']
    auc = validation_scores['auc']
    train.report(
        {
            "binary_logloss": binary_logloss,
            "auc": auc
        }
    )

In [4]:
# Hyperparameter search space handed to Ray Tune; each trial receives one
# concrete sample of this dict as its LightGBM parameter set.
space = {
    # Number of boosting rounds (tune.randint excludes the upper bound).
    'num_iterations': tune.randint(1, 1000),
    # The range spans four orders of magnitude, so sample on a log scale.
    # A plain uniform draw would put ~90% of the samples above 0.1 and
    # almost never try small learning rates.
    'learning_rate': tune.loguniform(0.0001, 1),
    'min_data_in_leaf': tune.randint(1, 200),
    'objective': 'binary',
    # Both metrics are read back from the trained booster in fit_mod.
    'metrics': ['binary_logloss', 'auc'],
}

In [9]:
# asha_scheduler = ASHAScheduler(
#     time_attr='training_iteration',
#     metric='binary_logloss',
#     mode='min',
#     max_t=1000,
#     grace_period=50,
#     reduction_factor=3,
#     brackets=1,
# )

# Optuna proposes configurations that minimise "binary_logloss"; the wrapper
# caps the number of concurrently running trials at two.
search_alg = ConcurrencyLimiter(
    OptunaSearch(mode="min", metric="binary_logloss"),
    max_concurrent=2,
)

In [6]:
# Where, and under which experiment name, the run is persisted.
run_config = train.RunConfig(
    name="gb_ray_tune",
    storage_path=dir_hyperparameters,
)

# Draw 1000 configurations from `space` through the configured search algorithm.
tune_config = tune.TuneConfig(
    num_samples=1000,
    search_alg=search_alg,
)

tuner = tune.Tuner(
    fit_mod,
    param_space=space,
    tune_config=tune_config,
    run_config=run_config,
)
results = tuner.fit()

0,1
Current time:,2024-12-23 15:44:01
Running for:,00:00:14.94
Memory:,6.7/8.0 GiB

Trial name,status,loc,learning_rate,min_data_in_leaf,num_iterations
fit_mod_7b3fe8e6,RUNNING,127.0.0.1:657,0.721522,155,811


[36m(pid=657)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(pid=657)[0m 
[36m(pid=657)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(pid=657)[0m This will raise in a future version.
[36m(pid=657)[0m 
2024-12-23 15:44:01,074	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/gb_ray_tune' in 0.0268s.
2024-12-23 15:44:11,150	INFO tune.py:1041 -- Total run time: 25.04 seconds (14.91 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/gb_ray_tune", trainable=...)


In [None]:
# experiment_path = "/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/gb_ray_tune"
# restored_tuner = tune.Tuner.restore(experiment_path, trainable=fit_mod)

In [None]:
# fn_results = '/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/ray_tune_dataframe.csv'
# restored_tuner.get_results().get_dataframe().to_csv(fn_results)

In [2]:
# !jupyter nbconvert --to script model_a_hyperparameter_search.ipynb

[NbConvertApp] Converting notebook model_a_hyperparameter_search.ipynb to script
[NbConvertApp] Writing 3141 bytes to model_a_hyperparameter_search.py
