Steps here:
- Subset training data to 2/5 of the folds
- fit all four first-order models on this 2/5
- Subsequently, we'll need to train a GBM model on 2/5 of the (remaining) folds, and to test on the last 1/5
    - These training and testing sets for the second-tier model will need the __feature-engineered__ input X files for each model (A and B), as well as the __y-fit__ from each first-order model (ANN and GBM, for both A and B). Also, add a __descending rank__ for each prediction, where the highest prediction gets rank 1
    - It's easier if we pre-process this bit and save these training and testing sets in a 'temp folder', and then fit the second-tier model in a second script.
      
Near future steps:
- use raytune, get 250 models
- Repeat, using more/less of the original data, as a sensitivity test

In [1]:
import pandas as pd
import numpy as np
import os

from keras import models, layers, regularizers, optimizers, callbacks, utils, losses, metrics
from tensorflow.keras.backend import clear_session
from tensorflow import convert_to_tensor

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

# from ray import train, tune
# from ray.tune.search.optuna import OptunaSearch
# from ray.tune.schedulers import ASHAScheduler
# from ray.tune.search import ConcurrencyLimiter

from scipy import stats

# Establish file locations, define functions

In [2]:
# Root of the project data tree. It can be overridden with the REMATCH_DATA_DIR
# environment variable so the notebook is not tied to one machine's external
# drive; when the variable is unset the original location is used.
data_dir = os.environ.get('REMATCH_DATA_DIR', '/Volumes/Extreme SSD/rematch_eia_ferc1_docker')

# Folders that hold the training inputs (x / y / id parquet files) for models A and B.
dir_working_model_a_training = os.path.join(data_dir, 'working_data/model_a/model_a_training')
dir_working_model_b_training = os.path.join(data_dir, 'working_data/model_b/model_b_training')

In [3]:
# Input parquet files for each first-order model: features (x), labels (y)
# and the id table (id).
fn_x_a, fn_y_a, fn_id_a = (
    os.path.join(dir_working_model_a_training, basename)
    for basename in ('x.parquet', 'y.parquet', 'id.parquet')
)

fn_x_b, fn_y_b, fn_id_b = (
    os.path.join(dir_working_model_b_training, basename)
    for basename in ('x.parquet', 'y.parquet', 'id.parquet')
)

In [4]:
# (pd.read_parquet(fn_id_a) == pd.read_parquet(fn_id_b)).mean()

In [5]:
# dir_hyperparameter = os.path.join(data_dir, 'working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21')
# fn_hyperparameter = os.path.join(dir_hyperparameter, 'gbm_grid_2025_02_21.csv')

# Scratch folder for everything this notebook writes: the fitted first-order
# models, the second-tier train/test tables and the fold assignment.
dir_temp = os.path.join(
    data_dir,
    'working_data/model_second_stage/model_second_stage_training/gbm_raytune_2025_02_21/temp_folder',
)

# Fitted first-order models (Keras format for the ANNs, LightGBM text format for the GBMs).
fn_model_a_ann, fn_model_b_ann, fn_model_a_gbm, fn_model_b_gbm = (
    os.path.join(dir_temp, basename)
    for basename in ('model_a_ann.keras', 'model_b_ann.keras', 'model_a_gbm.txt', 'model_b_gbm.txt')
)

# Second-tier model inputs and labels.
fn_x_train, fn_x_test, fn_y_train, fn_y_test = (
    os.path.join(dir_temp, basename)
    for basename in ('x_train.parquet', 'x_test.parquet', 'y_train.parquet', 'y_test.parquet')
)

# Record of which fold went to which stage.
fn_split_characteristics = os.path.join(dir_temp, 'split_characteristics.csv')

In [6]:
def np_cleaning(X):
    """Clip values to [-3, 3] and replace NaN with 0.

    The clip runs before ``np.nan_to_num``, so +inf / -inf have already been
    bounded to +3 / -3 by the time the replacement happens; in practice only
    NaN entries are zeroed.
    """
    bounded = np.clip(X, a_min=-3, a_max=3)
    return np.nan_to_num(bounded, nan=0.0, posinf=0.0, neginf=0.0)

def prep_x(fit_scaler, X):
    """Scale ``X`` with an already-fitted scaler, then clean it with ``np_cleaning``.

    ``fit_scaler`` must expose ``transform``. Tensor conversion is not done
    here; callers convert the result themselves where they need a tensor.
    """
    scaled = fit_scaler.transform(X)
    return np_cleaning(scaled)

def get_dense_desc_rank(nn):
    """Return dense descending ranks of ``nn``: the largest value gets rank 1.

    Tied values share a rank and consecutive distinct values differ by one
    (``method='dense'``). Used to rank the first-order models' fitted values.
    """
    # rankdata ranks in ascending order, so negate to rank from the top down.
    negated = -nn
    ranks = stats.rankdata(negated, method='dense')
    return ranks

In [7]:
# Fixed hyperparameters for the four first-order ("premier") models.
# NOTE(review): these values presumably come from an earlier tuning run -- confirm provenance.

# Model A, neural network: dropout rates, hidden-layer widths and epoch count.
params_a_ann = {
    'dropout_1': 0.000120,
    'dropout_2': 0.0633,
    'relu_1': 33,
    'relu_2': 20,
    'epochs': 20
}

# Model A, LightGBM. early_stopping_round is -1 because no validation set is passed to lgb.train.
params_a_gbm = {
    'num_trees':482,
    'learning_rate':0.0134,
    'min_data_in_leaf':85,
    'objective':'binary',
    'early_stopping_round':-1,
    'metrics':['binary_logloss', 'auc']
}

# Model B, neural network.
params_b_ann = {
    'dropout_1': 0.0177,
    'dropout_2': 0.00595,
    'relu_1': 56,
    'relu_2': 29,
    'epochs': 14
}

# Model B, LightGBM.
params_b_gbm = {
    'num_trees':266,
    'learning_rate':0.0105,
    'min_data_in_leaf':42,
    'objective':'binary',
    'early_stopping_round':-1,
    'metrics':['binary_logloss', 'auc']
}

# Second-tier GBM search space. It is kept as a commented reference only: it
# needs `ray.tune`, whose import is commented out at the top of this notebook,
# so defining it here raised NameError on a fresh kernel. Nothing in this
# notebook reads it. Restore it together with `from ray import tune` in the
# script that runs the search.
# params_2_gbm = {
#     # 'num_iterations': tune.randint(1, 1000),
#     'verbose':-1,
#     'num_trees': tune.randint(1, 500),
#     'learning_rate': tune.uniform(0.0001, 0.75),
#     'min_data_in_leaf': tune.randint(1, 200),
#     'objective':'binary',
#     # 'early_stopping_round':2,
#     'early_stopping_round':-1,
#     'metrics':['binary_logloss', 'auc']
#     }

# Model A

In [8]:
# Model A - data prep: load the feature table, the label table and the id
# table (the id table supplies the `fold` column used for splitting).
X_a, Y_a, ID_a = (
    pd.read_parquet(path)
    for path in (fn_x_a, fn_y_a, fn_id_a)
)

In [9]:
# Split the folds into three disjoint groups:
#   - 2 folds to fit the first-order ("premier") models,
#   - 2 folds to train the second-tier model,
#   - 1 fold to test the second-tier model.
SPLIT_SEED = 20250221  # fixed so the fold assignment is reproducible after a kernel restart

# Sort before shuffling so the outcome depends only on the seed, not on the
# order in which fold ids happen to appear in ID_a.
fold_values = sorted(ID_a.fold.unique().tolist())
if len(fold_values) != 5:
    # The 2/2/1 slicing below would otherwise fail (fewer folds) or silently
    # drop data (more folds).
    raise ValueError(
        f'Expected exactly 5 folds for the 2/2/1 split, found {len(fold_values)}: {fold_values}'
    )
np.random.default_rng(SPLIT_SEED).shuffle(fold_values)

values_for_premier_model_fits = fold_values[0:2]
values_for_secondary_model_train = fold_values[2:4]
values_for_secondary_model_test = fold_values[4]  # a single fold id, not a list

# Row masks, one per group. They are positional: every table they are applied
# to must have the same row order as ID_a.
is_premier_model_fits    = np.isin(element=ID_a.fold.values, test_elements=values_for_premier_model_fits)
is_secondary_model_test  = np.isin(element=ID_a.fold.values, test_elements=values_for_secondary_model_test)
is_secondary_model_train = np.isin(element=ID_a.fold.values, test_elements=values_for_secondary_model_train)

# Record which fold went where; this table is saved next to the models.
frames = [
    pd.DataFrame({'name':'premier_model_fits', 'fold':values_for_premier_model_fits}),
    pd.DataFrame({'name':'secondary_model_train', 'fold':values_for_secondary_model_train}),
    pd.DataFrame({'name':'secondary_model_test', 'fold':[values_for_secondary_model_test]})
]
SplitCharacteristics = pd.concat(frames, ignore_index=True)
SplitCharacteristics

Unnamed: 0,name,fold
0,premier_model_fits,1
1,premier_model_fits,4
2,secondary_model_train,3
3,secondary_model_train,0
4,secondary_model_test,2


In [10]:
# Labels for each of the three fold groups, selected with the boolean row masks.
YPremierModelFits, YSecondaryModelTrain, YSecondaryModelTest = (
    Y_a.loc[row_mask]
    for row_mask in (is_premier_model_fits, is_secondary_model_train, is_secondary_model_test)
)

In [11]:
# Model A Premier Model, clean X
#
# The scaler is fit on the premier-model folds only and then reused for the
# second-tier train/test folds, so those folds do not influence the scaling.
standard_scaler_a = StandardScaler().fit(X_a.loc[is_premier_model_fits])

# Scale + clean each fold group with the same fitted scaler.
XAPremierModelFits, XASecondaryModelTrain, XASecondaryModelTest = (
    prep_x(fit_scaler=standard_scaler_a, X=X_a.loc[row_mask])
    for row_mask in (is_premier_model_fits, is_secondary_model_train, is_secondary_model_test)
)

## ANN A

In [13]:
clear_session()

# Model A network: two hidden ReLU layers, each preceded by dropout, and a
# single sigmoid output unit for the binary label.
model_a_ann = models.Sequential([
    layers.Dropout(rate=params_a_ann["dropout_1"]),
    layers.Dense(units=params_a_ann["relu_1"], activation='relu'),
    layers.Dropout(rate=params_a_ann["dropout_2"]),
    layers.Dense(units=params_a_ann["relu_2"], activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# No optimizer is passed, so compile() uses the Keras default.
model_a_ann.compile(
    loss=losses.BinaryCrossentropy(),
    metrics=[metrics.BinaryCrossentropy(), metrics.BinaryAccuracy(), metrics.AUC()],
)

# Fit on the premier-model folds only; there is no validation split here.
history_a_ann = model_a_ann.fit(
    x=convert_to_tensor(XAPremierModelFits),
    y=YPremierModelFits,
    batch_size=128,
    epochs=params_a_ann['epochs'],
    verbose=1,
)

Epoch 1/20
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 402us/step - auc: 0.8742 - binary_accuracy: 0.9964 - binary_crossentropy: 0.0111 - loss: 0.0111
Epoch 2/20
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 396us/step - auc: 0.9581 - binary_accuracy: 0.9996 - binary_crossentropy: 0.0018 - loss: 0.0018
Epoch 3/20
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 397us/step - auc: 0.9611 - binary_accuracy: 0.9996 - binary_crossentropy: 0.0017 - loss: 0.0017
Epoch 4/20
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 407us/step - auc: 0.9671 - binary_accuracy: 0.9997 - binary_crossentropy: 0.0015 - loss: 0.0015
Epoch 5/20
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 398us/step - auc: 0.9650 - binary_accuracy: 0.9997 - binary_crossentropy: 0.0015 - loss: 0.0015
Epoch 6/20
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 447us/step - auc: 0.9658 - binary_

## GBM A

In [14]:
# Fit the model A LightGBM booster on the premier-model folds.
train_set = lgb.Dataset(data=XAPremierModelFits, label=YPremierModelFits)
model_a_gbm = lgb.train(params=params_a_gbm, train_set=train_set)



[LightGBM] [Info] Number of positive: 2601, number of negative: 2601000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.149146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2849
[LightGBM] [Info] Number of data points in the train set: 2603601, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755


# Model B

In [15]:
# Model B- data prep

X_b = pd.read_parquet(fn_x_b)
Y_b = pd.read_parquet(fn_y_b)
# ID_b = pd.read_parquet(fn_id_b)

# Model B is split with the row masks built from ID_a and is fit against the
# labels taken from Y_a, so both model B tables must describe the same rows,
# in the same order, as model A. Fail loudly here rather than train on
# misaligned labels.
if len(X_b) != len(is_premier_model_fits):
    raise ValueError(
        f'X_b has {len(X_b)} rows but the fold masks cover {len(is_premier_model_fits)} rows'
    )
if not np.array_equal(Y_b.to_numpy(), Y_a.to_numpy()):
    raise ValueError('Y_b does not match Y_a; model B cannot reuse the model A labels and fold masks')

# The three X_b fold subsets are built (and scaled) in the next cell.

In [16]:
# Model B Premier Model, clean X
#
# Same recipe as model A: fit the scaler on the premier-model folds only, then
# apply it unchanged to the second-tier train/test folds.
standard_scaler_b = StandardScaler().fit(X_b.loc[is_premier_model_fits])

# Scale + clean each fold group with the same fitted scaler.
XBPremierModelFits, XBSecondaryModelTrain, XBSecondaryModelTest = (
    prep_x(fit_scaler=standard_scaler_b, X=X_b.loc[row_mask])
    for row_mask in (is_premier_model_fits, is_secondary_model_train, is_secondary_model_test)
)

## ANN B

In [17]:
clear_session()

# Model B network: two hidden ReLU layers, each preceded by dropout, and a
# single sigmoid output unit for the binary label.
model_b_ann = models.Sequential([
    layers.Dropout(rate=params_b_ann["dropout_1"]),
    layers.Dense(units=params_b_ann["relu_1"], activation='relu'),
    layers.Dropout(rate=params_b_ann["dropout_2"]),
    layers.Dense(units=params_b_ann["relu_2"], activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# No optimizer is passed, so compile() uses the Keras default.
model_b_ann.compile(
    loss=losses.BinaryCrossentropy(),
    metrics=[metrics.BinaryCrossentropy(), metrics.BinaryAccuracy(), metrics.AUC()],
)

# Fit on the premier-model folds. The labels are the ones selected from Y_a.
history_b_ann = model_b_ann.fit(
    x=convert_to_tensor(XBPremierModelFits),
    y=YPremierModelFits,
    batch_size=128,
    epochs=params_b_ann['epochs'],
    verbose=1,
)

Epoch 1/14
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 455us/step - auc: 0.8729 - binary_accuracy: 0.9984 - binary_crossentropy: 0.0076 - loss: 0.0076
Epoch 2/14
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 444us/step - auc: 0.9189 - binary_accuracy: 0.9994 - binary_crossentropy: 0.0026 - loss: 0.0026
Epoch 3/14
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 443us/step - auc: 0.9074 - binary_accuracy: 0.9994 - binary_crossentropy: 0.0028 - loss: 0.0028
Epoch 4/14
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 450us/step - auc: 0.9002 - binary_accuracy: 0.9995 - binary_crossentropy: 0.0029 - loss: 0.0029
Epoch 5/14
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 444us/step - auc: 0.8963 - binary_accuracy: 0.9995 - binary_crossentropy: 0.0029 - loss: 0.0029
Epoch 6/14
[1m20341/20341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 457us/step - auc: 0.9096 - binary

## GBM B

In [18]:
# Fit the model B LightGBM booster on the premier-model folds
# (labels are the ones selected from Y_a).
train_set = lgb.Dataset(data=XBPremierModelFits, label=YPremierModelFits)
model_b_gbm = lgb.train(params=params_b_gbm, train_set=train_set)



[LightGBM] [Info] Number of positive: 2601, number of negative: 2601000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.167289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1537
[LightGBM] [Info] Number of data points in the train set: 2603601, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755


# Save data to temporary folder

In [19]:
# Write models, split characteristics to disk

# Create the temp folder first: none of the writers below creates missing
# parent directories, so a fresh checkout would otherwise fail on the first write.
os.makedirs(dir_temp, exist_ok=True)

SplitCharacteristics.to_csv(fn_split_characteristics)
model_a_ann.save(filepath=fn_model_a_ann)
model_b_ann.save(filepath=fn_model_b_ann)
model_a_gbm.save_model(fn_model_a_gbm)
model_b_gbm.save_model(fn_model_b_gbm)

<lightgbm.basic.Booster at 0x345f90290>

In [20]:
# First-order ANN predictions on the second-tier folds (train first, then test;
# model A before model B within each).
y_fit_train_a_ann, y_fit_train_b_ann, y_fit_test_a_ann, y_fit_test_b_ann = (
    ann.predict(convert_to_tensor(features))
    for ann, features in (
        (model_a_ann, XASecondaryModelTrain),
        (model_b_ann, XBSecondaryModelTrain),
        (model_a_ann, XASecondaryModelTest),
        (model_b_ann, XBSecondaryModelTest),
    )
)

[1m77046/77046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 197us/step
[1m77046/77046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 203us/step
[1m40791/40791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 196us/step
[1m40791/40791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 202us/step


In [21]:
# First-order GBM predictions on the second-tier folds (train first, then test;
# model A before model B within each).
y_fit_train_a_gbm, y_fit_train_b_gbm, y_fit_test_a_gbm, y_fit_test_b_gbm = (
    booster.predict(features)
    for booster, features in (
        (model_a_gbm, XASecondaryModelTrain),
        (model_b_gbm, XBSecondaryModelTrain),
        (model_a_gbm, XASecondaryModelTest),
        (model_b_gbm, XBSecondaryModelTest),
    )
)

In [22]:
# Second-tier training matrix. Column order:
#   model A features, model B features, then for each first-order model
#   (A-ANN, B-ANN, A-GBM, B-GBM) its prediction followed by that prediction's
#   dense descending rank.
# np.column_stack turns the 1-D prediction/rank vectors into single columns
# and leaves the 2-D blocks as they are.
XSecondaryModelTrain = np.column_stack([
    XASecondaryModelTrain,
    XBSecondaryModelTrain,

    y_fit_train_a_ann,
    get_dense_desc_rank(y_fit_train_a_ann),

    y_fit_train_b_ann,
    get_dense_desc_rank(y_fit_train_b_ann),

    y_fit_train_a_gbm,
    get_dense_desc_rank(y_fit_train_a_gbm),

    y_fit_train_b_gbm,
    get_dense_desc_rank(y_fit_train_b_gbm),
])

In [23]:
# Second-tier test matrix. Column order:
#   model A features, model B features, then for each first-order model
#   (A-ANN, B-ANN, A-GBM, B-GBM) its prediction followed by that prediction's
#   dense descending rank.
# np.column_stack turns the 1-D prediction/rank vectors into single columns
# and leaves the 2-D blocks as they are.
XSecondaryModelTest = np.column_stack([
    XASecondaryModelTest,
    XBSecondaryModelTest,

    y_fit_test_a_ann,
    get_dense_desc_rank(y_fit_test_a_ann),

    y_fit_test_b_ann,
    get_dense_desc_rank(y_fit_test_b_ann),

    y_fit_test_a_gbm,
    get_dense_desc_rank(y_fit_test_a_gbm),

    y_fit_test_b_gbm,
    get_dense_desc_rank(y_fit_test_b_gbm),
])

In [24]:
# Write tables to disk

# The parquet writers do not create missing parent folders.
os.makedirs(dir_temp, exist_ok=True)

# Each feature matrix must have one row per label before it is persisted.
if XSecondaryModelTrain.shape[0] != len(YSecondaryModelTrain):
    raise ValueError(
        f'Train X has {XSecondaryModelTrain.shape[0]} rows but train Y has {len(YSecondaryModelTrain)}'
    )
if XSecondaryModelTest.shape[0] != len(YSecondaryModelTest):
    raise ValueError(
        f'Test X has {XSecondaryModelTest.shape[0]} rows but test Y has {len(YSecondaryModelTest)}'
    )

YSecondaryModelTrain.to_parquet(fn_y_train)
YSecondaryModelTest.to_parquet(fn_y_test)

# The X matrices are plain arrays, so they are wrapped in DataFrames with
# default (positional) column labels before being written.
pd.DataFrame(XSecondaryModelTrain).to_parquet(fn_x_train)
pd.DataFrame(XSecondaryModelTest).to_parquet(fn_x_test)