In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import dask
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
# from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from optuna.samplers import TPESampler

In [2]:
# Training directory: holds the model inputs and is also used as Ray Tune's storage path.
dir_hyperparameters = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train'

# Model inputs read by fit_mod: feature matrix, labels, and the table carrying `fold_num`.
fn_x = f'{dir_hyperparameters}/x.parquet'
fn_y = f'{dir_hyperparameters}/y.parquet'
fn_id = f'{dir_hyperparameters}/id.parquet'

# fn_test_x = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/test_x.parquet'
# fn_test_y = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/test_y.parquet'

# Ranked trial table, written inside the "gb_ray_tune" experiment folder.
fn_out = f'{dir_hyperparameters}/gb_ray_tune/grid_search.csv'

In [3]:
def np_cleaning(X):
    """Bound values to [-3, 3] and replace NaN with 0.

    Parameters
    ----------
    X : array-like
        Numeric data; the input itself is not modified.

    Returns
    -------
    numpy.ndarray
        Clipped data in which every NaN has become 0.0.

    Notes
    -----
    The clip runs first, so +inf / -inf are already 3 / -3 by the time
    ``nan_to_num`` is called and its ``posinf`` / ``neginf`` arguments never
    take effect.
    NOTE(review): if infinities were meant to become 0, the two steps would
    have to be swapped — confirm the intent.
    """
    bounded = np.clip(X, a_min=-3, a_max=3)
    return np.nan_to_num(bounded, nan=0.0, posinf=0.0, neginf=0.0)

In [4]:
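# NOTE: this commented-out cell repeats the data-preparation steps that fit_mod()
# performs inside every trial; nothing in this cell is executed.
# # Read data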
# X = pd.read_parquet(fn_x)
# Y = pd.read_parquet(fn_y)
# ID = pd.read_parquet(fn_id)

# # Split data into testing and training
# fold_variable = 1
# is_train_mask = (ID['fold_num'] != fold_variable).values
# XTrain = X.loc[is_train_mask]
# XTest = X.loc[~is_train_mask]

# # Scale numeric values
# standard_scaler = StandardScaler()

# standard_scaler.fit(XTrain)
# XTrain = standard_scaler.transform(XTrain)
# XTest  = standard_scaler.transform(XTest)

# XTrain = np_cleaning(XTrain)
# XTest  = np_cleaning(XTest)

# # Package in training and testing objects
# train_set = lgb.Dataset(XTrain, Y.loc[is_train_mask])
# test_set  = lgb.Dataset(XTest,  Y.loc[~is_train_mask])

In [5]:
def fit_mod(space):
    """Train one LightGBM model for a Ray Tune trial and report its validation scores.

    Parameters
    ----------
    space : dict
        Parameter dictionary for the trial; it is passed unchanged to
        ``lgb.train``.

    Returns
    -------
    None
        The ``binary_logloss`` and ``auc`` of the validation set are sent to
        Ray through ``train.report``.
    """
    # Load features, labels and fold assignments (paths come from the config cell)
    features = pd.read_parquet(fn_x)
    labels = pd.read_parquet(fn_y)
    fold_table = pd.read_parquet(fn_id)

    # Every row outside the held-out fold is used for training
    fold_variable = 1
    is_train_mask = (fold_table['fold_num'] != fold_variable).values
    is_valid_mask = ~is_train_mask
    train_frame = features.loc[is_train_mask]
    valid_frame = features.loc[is_valid_mask]

    # Standardise with statistics learned from the training rows only,
    # then clip / de-NaN both splits
    standard_scaler = StandardScaler()
    standard_scaler.fit(train_frame)
    train_matrix = np_cleaning(standard_scaler.transform(train_frame))
    valid_matrix = np_cleaning(standard_scaler.transform(valid_frame))

    # Wrap both splits as LightGBM datasets
    train_set = lgb.Dataset(train_matrix, labels.loc[is_train_mask])
    valid_set = lgb.Dataset(valid_matrix, labels.loc[is_valid_mask])

    # Fit, evaluating on the held-out fold
    gbm = lgb.train(space, train_set, valid_sets=[valid_set])

    # 'valid_0' is the single validation set supplied above
    valid_scores = gbm.best_score['valid_0']
    binary_logloss = valid_scores['binary_logloss']
    auc = valid_scores['auc']
    train.report({"binary_logloss": binary_logloss, "auc": auc})

In [6]:
# Parameter space for the search: entries built with tune.* are sampled by Ray Tune
# for each trial, the remaining entries are constants. The whole dict is what
# fit_mod passes to lgb.train.
space = {
    # 'num_iterations': tune.randint(1, 1000),
    'verbose':-1,
    'num_trees': tune.randint(1, 500),
    'learning_rate': tune.uniform(0.0001, 0.75),
    'min_data_in_leaf': tune.randint(1, 200),
    'objective':'binary', 
    # 'early_stopping_round':2,
    # NOTE(review): -1 presumably disables early stopping — confirm against the LightGBM docs
    'early_stopping_round':-1,
    # Both metric names are read back from gbm.best_score in fit_mod
    'metrics':['binary_logloss', 'auc']
    }

In [7]:
# asha = ASHAScheduler(metric='binary_logloss', mode='min')

# Fixed seed so the Optuna sampler proposes the same configurations on a re-run.
# (With concurrent trials the order in which results arrive can still vary.)
SEARCH_SEED = 42

# Optuna-backed searcher that minimises the log-loss reported by fit_mod
search_alg = OptunaSearch(metric="binary_logloss", mode="min", seed=SEARCH_SEED)
# Allow at most two trials to run at the same time
search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2)

In [None]:
# Search settings: 500 sampled configurations proposed by `search_alg`
tune_config = tune.TuneConfig(
    # scheduler=asha,
    num_samples=500,
    search_alg=search_alg,
)

# Persistence settings: experiment "gb_ray_tune" stored under the training directory
run_config = train.RunConfig(
    storage_path=dir_hyperparameters,
    name="gb_ray_tune",
)

tuner = tune.Tuner(
    fit_mod,
    tune_config=tune_config,
    param_space=space,
    run_config=run_config,
)

# Runs every trial; this is the long-running step of the notebook
results = tuner.fit()

In [11]:
# Trial results as a DataFrame (reported metrics plus config/* columns);
# copied so that later edits do not touch the frame returned by Ray.
Grid = results.get_dataframe().copy()

In [12]:
# Name the index so that, after reset_index(), each trial's original position
# survives as an 'order' column
Grid.index.name = 'order'

# Best trials first: lowest log-loss, ties broken by highest AUC;
# the new positional index is labelled 'rank'
RankedGrid = (
    Grid
    .sort_values(by=['binary_logloss', 'auc'], ascending=[True, False])
    .reset_index()
    .rename_axis('rank')
)

# Persist the ranked table
RankedGrid.to_csv(fn_out)

In [13]:
# Preview the five best-ranked trials
RankedGrid.head()

Unnamed: 0_level_0,order,binary_logloss,auc,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,...,time_since_restore,iterations_since_restore,config/verbose,config/num_trees,config/learning_rate,config/min_data_in_leaf,config/objective,config/early_stopping_round,config/metrics,logdir
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,484,0.000163,0.999992,1735479114,,False,1,35faf222,2024-12-29_05-31-54,195.503513,...,195.503513,1,-1,488,0.012665,189,binary,-1,"[binary_logloss, auc]",35faf222
1,470,0.000165,0.999991,1735477449,,False,1,7d4896e8,2024-12-29_05-04-09,199.11973,...,199.11973,1,-1,486,0.012715,191,binary,-1,"[binary_logloss, auc]",7d4896e8
2,492,0.000167,0.999992,1735479945,,False,1,69968eed,2024-12-29_05-45-45,194.786236,...,194.786236,1,-1,486,0.012649,189,binary,-1,"[binary_logloss, auc]",69968eed
3,491,0.000169,0.999991,1735479839,,False,1,327ff249,2024-12-29_05-43-59,194.384423,...,194.384423,1,-1,477,0.013214,188,binary,-1,"[binary_logloss, auc]",327ff249
4,451,0.000169,0.99999,1735475481,,False,1,807218b2,2024-12-29_04-31-21,201.390625,...,201.390625,1,-1,492,0.012953,186,binary,-1,"[binary_logloss, auc]",807218b2


In [8]:
# experiment_path = "/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/gb_ray_tune"
# restored_tuner = tune.Tuner.restore(experiment_path, trainable=fit_mod)

In [9]:
# fn_results = '/Users/andrewbartnof/Documents/rmi/rematch_ferc_eia1/clean_data/model_full_gradient_boost/ray_tune/ray_tune_dataframe.csv'
# restored_tuner.get_results().get_dataframe().to_csv(fn_results)

In [10]:
# !jupyter nbconvert --to script model_a_hyperparameter_search.ipynb