# Perform cross-validation to dig into the most promising hyperparameters

__author__: Andrew Bartnof

__copyright__: Copyright 2025, Rocky Mountain Institute

__credits__: Alex Engel, Andrew Bartnof

In [2]:
import pandas as pd
import numpy as np
import itertools
import os

from keras import models, layers, regularizers, optimizers, callbacks, utils, losses, metrics
from tensorflow.keras.backend import clear_session
from tensorflow import convert_to_tensor

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search import ConcurrencyLimiter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, log_loss

from tqdm import tqdm
# from tqdm.notebook import tqdm

In [10]:
# Root of the project data on the external drive; every Model A training
# artefact used in this notebook lives in one sub-directory beneath it.
data_dir = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker'
dir_working_model_a_training = os.path.join(
    data_dir,
    'working_data/model_a/model_a_training',
)
dir_working_model_a_training

'/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/model_a_training'

In [11]:
# Input tables, read further down: X is scaled and fed to the network,
# Y supplies the 'is_match' label, ID supplies 'fold' and 'record_id_ferc1'.
fn_x, fn_y, fn_id = (
    os.path.join(dir_working_model_a_training, basename)
    for basename in ('x.parquet', 'y.parquet', 'id.parquet')
)

dir_hyperparameters = dir_working_model_a_training

# fn_hp is read (ranked hyperparameter search results); fn_history and
# fn_metrics are where this notebook writes its cross-validation outputs.
fn_hp, fn_history, fn_metrics = (
    os.path.join(dir_working_model_a_training, relpath)
    for relpath in (
        'ann_ray_tune/model_a_ann_hp_search.csv',
        'ann_ray_tune/history_cross_validation_of_best_candidates_ann.csv',
        'ann_ray_tune/metrics_cross_validation_of_best_candidates_ann.csv',
    )
)

In [12]:
# working_dir = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/
# 'model_a/train/x.parquet'
# # os.path.join('C:\Users\A\Desktop\Repo', filename)

# fn_x = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/x.parquet'
# fn_y = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/y.parquet'
# fn_id = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/id.parquet'

# # dir_hyperparameters = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train'
# fn_grid = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/ann/grid_search.csv'
# fn_history = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/ann/history_cross_validation_of_best_candidates_ann.csv'
# fn_metrics = '/Volumes/Extreme SSD/rematch_eia_ferc1_docker/working_data/model_a/train/ann/metrics_cross_validation_of_best_candidates_ann.csv'

In [13]:
def np_cleaning(X):
    """Bound feature values to [-3, 3] and replace NaN entries with 0.

    Clipping runs first, so +inf / -inf have already become 3 / -3 by the
    time ``np.nan_to_num`` is applied; in practice only NaN is mapped to 0.0
    and the ``posinf`` / ``neginf`` arguments never take effect.

    Returns a new array; the input is not modified.
    """
    bounded = np.clip(X, a_min=-3, a_max=3)
    return np.nan_to_num(bounded, nan=0.0, posinf=0.0, neginf=0.0)

In [14]:
# Load features, labels and row identifiers (in that order) from parquet.
X, Y, ID = (pd.read_parquet(path) for path in (fn_x, fn_y, fn_id))

In [15]:
# Map the search-result column names onto the short hyperparameter names
# used when building the network.
rename_dict = {
    'config/dropout_1': 'dropout_1',
    'config/dropout_2': 'dropout_2',
    'config/relu_1': 'relu_1',
    'config/relu_2': 'relu_2',
}  # , 'config/metrics':'metrics'}

# Read the ranked search results, indexed by 'rank', and keep only the
# renamed hyperparameter columns.
Grid = (
    pd.read_csv(fn_hp, index_col='rank')
    .rename(columns=rename_dict)
    .loc[:, list(rename_dict.values())]
)

# Lookup table: rank of a candidate model -> its hyperparameters as a dict.
param_dict = {rank: Grid.loc[rank].to_dict() for rank in Grid.index}
# param_dict[0]

In [16]:
# Cross-validate the top-ranked hyperparameter sets.
#
# For every (hyperparameter rank, fold) pair: hold out the rows whose
# ID['fold'] equals the fold, train a two-hidden-layer network on the rest,
# record the per-epoch training history, and score the held-out predictions.
# Results accumulate in history_list / metrics_list (one DataFrame per run).

# Set to True for a quick smoke test: 2 ranks x 2 folds, 2 epochs each.
test = False

if test:
    variables = [range(2), range(2)]
    max_epochs = 2
else:
    variables = [
        range(15),  # num hyperparameters to test
        range(5)  # number of folds in the ID table
    ]
    max_epochs = 500

history_list = []
metrics_list = []

# The product is materialised as a list so tqdm knows the total run count.
for (hp_rank, fold) in tqdm(list(itertools.product(*variables))):

    # Hyperparameters for this rank; raises KeyError if the rank is not in
    # the search-results table.
    space = param_dict[hp_rank]

    # Split data into training and validation.
    # Assumes X, Y and ID share the same row order -- TODO confirm upstream.
    is_train_mask = (ID['fold'] != fold).values

    XTrain = X.loc[is_train_mask]
    XVal = X.loc[~is_train_mask]
    y_train = Y.loc[is_train_mask, 'is_match']
    y_val = Y.loc[~is_train_mask, 'is_match']

    # X value processing: the scaler is fitted on the training rows only, so
    # no validation statistics leak into training.
    standard_scaler = StandardScaler()
    standard_scaler.fit(XTrain)
    XTrain = standard_scaler.transform(XTrain)
    XVal = standard_scaler.transform(XVal)

    XTrain = np_cleaning(XTrain)
    XVal = np_cleaning(XVal)

    XTrain = convert_to_tensor(XTrain)
    XVal = convert_to_tensor(XVal)

    # Fit model. clear_session() drops Keras state left by the previous run.
    clear_session()
    model = models.Sequential()
    model.add(layers.Dropout(rate=space["dropout_1"]))
    # Unit counts come out of a pandas row as floats, hence the int() casts.
    model.add(layers.Dense(units=int(space["relu_1"]), activation='relu'))
    model.add(layers.Dropout(rate=space["dropout_2"]))
    model.add(layers.Dense(units=int(space["relu_2"]), activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    # No optimizer is passed, so Keras falls back to its default.
    model.compile(
        loss=losses.BinaryCrossentropy(),
        metrics=[
            metrics.BinaryCrossentropy(),
            metrics.BinaryAccuracy(),
            metrics.AUC()
        ]
    )

    history = model.fit(
        XTrain, y_train, epochs=max_epochs, batch_size=128,  # hard-coded here
        validation_data=(XVal, y_val),
        # Keras documents `callbacks` as a list of Callback instances.
        # Early stopping uses its default monitored quantity and restores the
        # best weights, so the predictions below come from the best epoch.
        callbacks=[
            callbacks.EarlyStopping(patience=5, start_from_epoch=10, restore_best_weights=True)
        ],
        verbose=0
    )

    # Store history: one row per epoch, tagged with the run it belongs to.
    History = pd.DataFrame(history.history)
    History['hp_rank'] = hp_rank
    History['fold'] = fold
    History['epoch'] = History.index + 1
    history_list.append(History)

    # Get goodness of fit metrics on the best-scoring iteration of the model (see: callback)
    # This involves finding the best prediction per FERC record, setting those to 1, and the rest to 0

    y_fit = model.predict(XVal, verbose=0)
    # 'record_id_ferc1' keeps its original row labels as the frame's index;
    # the flattened prediction array is placed against it positionally.
    Framework = pd.DataFrame({
        'record_id_ferc1': ID[~is_train_mask]['record_id_ferc1'],
        'y_fit': y_fit.flatten()
    })
    Framework['groupwise_max_y_fit'] = Framework.groupby('record_id_ferc1')['y_fit'].transform('max')
    # Rows tied for the maximum within a record are all flagged True.
    Framework['y_fit_adj'] = Framework['y_fit'] == Framework['groupwise_max_y_fit']

    y_fit_adj = Framework['y_fit_adj'].values
    y_true = y_val.astype(bool).values
    # NOTE(review): every metric, including roc_auc and log_loss, is computed
    # on the thresholded 0/1 predictions rather than on the raw probabilities
    # in y_fit -- confirm this is intended.
    metric_dict = {'hp_rank':hp_rank,
        'fold':fold,
        'accuracy':accuracy_score(y_true, y_fit_adj),
        'roc_auc':roc_auc_score(y_true, y_fit_adj),
        'log_loss':log_loss(y_true, y_fit_adj),
        'precision':precision_score(y_true, y_fit_adj),
        'recall':recall_score(y_true, y_fit_adj)
    }
    # Single-row frame so the runs can be concatenated afterwards.
    Metrics = pd.DataFrame(metric_dict, index=range(1))
    metrics_list.append(Metrics)

  0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
# Stack the per-run epoch histories into one table with a fresh 0..n-1 index,
# save it, and display it.
CollectedHistory = pd.concat(history_list, ignore_index=True)
CollectedHistory.to_csv(fn_history, index=False)
CollectedHistory

Unnamed: 0,auc,binary_accuracy,binary_crossentropy,loss,val_auc,val_binary_accuracy,val_binary_crossentropy,val_loss,hp_rank,fold,epoch
0,0.943686,0.999436,0.002533,0.002533,0.963633,0.999717,0.001405,0.001405,0,0,1
1,0.960064,0.999679,0.001579,0.001579,0.971152,0.999774,0.001116,0.001116,0,0,2
2,0.942624,0.999396,0.002673,0.002673,0.960583,0.999703,0.001499,0.001499,0,1,1
3,0.963935,0.99966,0.001646,0.001646,0.969412,0.999754,0.001528,0.001528,0,1,2
4,0.942773,0.999455,0.002406,0.002406,0.962854,0.999736,0.001278,0.001278,1,0,1
5,0.958149,0.99963,0.001777,0.001777,0.969161,0.999751,0.00126,0.00126,1,0,2
6,0.948887,0.999432,0.0024,0.0024,0.973395,0.999729,0.001146,0.001146,1,1,1
7,0.965481,0.999664,0.001537,0.001537,0.959793,0.999719,0.00137,0.00137,1,1,2


In [20]:
# Stack the one-row metric frames (one per hp_rank/fold run), discard the
# repeated per-frame index, save the table, and display it.
CollectedMetrics = pd.concat(metrics_list).reset_index(drop=True)
CollectedMetrics.to_csv(fn_metrics, index=False)
CollectedMetrics

Unnamed: 0,hp_rank,fold,accuracy,roc_auc,log_loss,precision,recall
0,0,0,0.999861,0.965265,0.004998,0.930599,0.930599
1,0,1,0.999847,0.96221,0.005524,0.922276,0.924498
2,1,0,0.99986,0.96487,0.005055,0.929811,0.929811
3,1,1,0.999844,0.961406,0.005611,0.921411,0.922892


In [21]:
# CollectedMetrics.drop('fold', axis=1).boxplot(
#     by='hp_rank', 
#     sharey=False, 
#     grid=False, 
#     layout = (3, 2), 
#     figsize = (10, 6), 
#     meanline=True
# )