Stage 1 hyperparameters:
- These are a 'fait accompli', and the hyperparameters need only be loaded.

Stage 2, testable hyperparameters:
- Load the top n candidate hyperparameter sets from each tuning run.

Load the mostly feature-engineered stage 1 X and Y files

For each stage 2 hyperparameter x each fold number:
- Split the folds into premier_model_fits (x2), secondary_model_train (x2), secondary_model_test (x1)
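- Fit the scaler on the premier_model_fits X files, then normalize all three splits with it
- Train all four input (stage 1) models on the premier_model_fits files
- Predict with the four stage 1 models on the secondary_model_train and secondary_model_test files, and append those predictions (and their ranks) to the X files
- Fit a stage 2 model on secondary_model_train with the contender stage 2 hyperparameters
- Evaluate precision, recall, log loss and ROC AUC on secondary_model_test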

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import os

from keras import models, layers, regularizers, optimizers, callbacks, utils, losses, metrics
from tensorflow.keras.backend import clear_session
from tensorflow import convert_to_tensor

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler
from sklearn import metrics as sklearn_metrics
from scipy import stats
from tqdm.notebook import tqdm

In [2]:
# Root of the project data. Defaults to the original external-drive location, but can
# be redirected without editing the notebook by setting REMATCH_EIA_FERC1_DATA_DIR.
data_dir = os.environ.get('REMATCH_EIA_FERC1_DATA_DIR', '/Volumes/Extreme SSD/rematch_eia_ferc1_docker')

# Directories holding the training inputs for model A and model B.
dir_working_model_a_training = os.path.join(data_dir, 'working_data/model_a/model_a_training')
dir_working_model_b_training = os.path.join(data_dir, 'working_data/model_b/model_b_training')

In [3]:
# Every stage 2 tuning-result file below data_dir. glob returns matches in
# filesystem order, so sort them to keep the contender order stable across runs.
list_hp2_fn = sorted(glob(os.path.join(data_dir, '**/gbm_grid*.csv'), recursive=True))

In [4]:
# Model A training inputs; id.parquet is read from the model A directory as well.
fn_x_a, fn_y_a, fn_id = (
    os.path.join(dir_working_model_a_training, basename)
    for basename in ('x.parquet', 'y.parquet', 'id.parquet')
)

# Model B training inputs.
fn_x_b, fn_y_b = (
    os.path.join(dir_working_model_b_training, basename)
    for basename in ('x.parquet', 'y.parquet')
)

In [5]:
# Define functions

def np_cleaning(X):
    """Clamp X to the range [-3, 3] and replace NaN with 0.

    Clipping happens first, so +inf / -inf have already become 3 / -3 by the
    time `np.nan_to_num` runs; its `posinf` / `neginf` replacements therefore
    never apply and only NaN ends up mapped to 0.0.
    """
    clipped = np.clip(X, a_min=-3, a_max=3)
    return np.nan_to_num(clipped, nan=0.0, posinf=0.0, neginf=0.0)

def prep_x(fit_scaler, X):
    """Transform X with an already-fitted scaler, then run it through np_cleaning.

    fit_scaler: object exposing `transform(X)` (clean_x passes a fitted StandardScaler).
    Returns the scaled and cleaned values.
    """
    return np_cleaning(fit_scaler.transform(X))

def get_dense_desc_rank(nn):
    """Dense-rank the values of nn in descending order.

    The largest value receives rank 1 and tied values share a rank. Used to
    turn stage 1 predictions (y_fits) into rank features.
    """
    negated = -nn  # ranking the negated values yields a descending rank
    return stats.rankdata(negated, method='dense')

def define_folds(values_for_secondary_model_test, seed=None):
    """Assign the five folds (0..4) to the three roles used by the stacking test.

    The fold(s) in `values_for_secondary_model_test` are reserved for testing the
    stage 2 model. The remaining folds are shuffled; the first two go to
    premier_model_fits and the rest to secondary_model_train.

    Parameters
    ----------
    values_for_secondary_model_test : int or array-like of int
        Fold value(s) held out for the secondary model test.
    seed : int, optional
        When given, the shuffle uses its own `np.random.default_rng(seed)` and is
        reproducible. When None (default), NumPy's global RNG is used, exactly as
        before, so the result depends on the global random state.

    Returns
    -------
    tuple of np.ndarray
        (values_for_premier_model_fits, values_for_secondary_model_train,
        np.array(values_for_secondary_model_test))

    Raises
    ------
    ValueError
        If a requested test fold is not one of 0..4; such a value would otherwise
        silently produce an empty test split.
    """
    fold_values = np.arange(5)

    test_values = np.atleast_1d(values_for_secondary_model_test)
    if not np.isin(test_values, fold_values).all():
        raise ValueError(
            f'values_for_secondary_model_test must be drawn from {fold_values.tolist()}, '
            f'got {values_for_secondary_model_test!r}'
        )

    remaining_fold_values = np.setdiff1d(fold_values, values_for_secondary_model_test)
    if seed is None:
        np.random.shuffle(remaining_fold_values)
    else:
        np.random.default_rng(seed).shuffle(remaining_fold_values)

    values_for_premier_model_fits = remaining_fold_values[0:2]
    values_for_secondary_model_train = remaining_fold_values[2:]

    print('Values for premier model fits: ', values_for_premier_model_fits)
    print('Values for secondary model train: ', values_for_secondary_model_train)
    print('Values for secondary model test: ', values_for_secondary_model_test)

    return values_for_premier_model_fits, values_for_secondary_model_train, np.array(values_for_secondary_model_test)

In [6]:
def get_boolean_masks_for_folds(ID, values_for_premier_model_fits, values_for_secondary_model_train, values_for_secondary_model_test):
    """Turn the three fold assignments into row-level boolean masks.

    Each row of ID is flagged according to whether its `fold` value belongs to
    the corresponding group of fold values.

    Returns (is_premier_model_fits, is_secondary_model_train, is_secondary_model_test).
    """
    row_folds = ID.fold.values
    fold_groups = (
        values_for_premier_model_fits,
        values_for_secondary_model_train,
        values_for_secondary_model_test,
    )
    is_premier_model_fits, is_secondary_model_train, is_secondary_model_test = (
        np.isin(row_folds, group) for group in fold_groups
    )
    return is_premier_model_fits, is_secondary_model_train, is_secondary_model_test

In [7]:
def fit_ann(params, X, Y):
    """Build, compile and fit the stage 1 neural network (used for model A and B).

    Architecture: dropout -> dense relu -> dropout -> dense relu -> single
    sigmoid unit, trained with binary cross-entropy. No optimizer is passed to
    `compile`, so the Keras default is used.

    params: mapping providing "dropout_1", "relu_1", "dropout_2", "relu_2" and
        'epochs'; the unit counts and epoch count are cast to int.
    X: feature matrix, converted to a tensor before fitting.
    Y: training targets.

    Returns the fitted Keras model. The Keras session is cleared before the
    model is built.
    """
    clear_session()

    model_ann = models.Sequential([
        layers.Dropout(rate=params["dropout_1"]),
        layers.Dense(units=int(params["relu_1"]), activation='relu'),
        layers.Dropout(rate=params["dropout_2"]),
        layers.Dense(units=int(params["relu_2"]), activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])

    model_ann.compile(
        loss=losses.BinaryCrossentropy(),
        metrics=[metrics.BinaryCrossentropy(), metrics.BinaryAccuracy(), metrics.AUC()],
    )

    # The training history is not needed by any caller, so it is not kept.
    model_ann.fit(
        convert_to_tensor(X),
        Y,
        epochs=int(params['epochs']),
        batch_size=128,
        verbose=0,
    )
    return model_ann

In [8]:
def fit_gbm(params, X, Y):
    """Train a LightGBM booster on (X, Y) with the given parameters (model A or B).

    Returns the trained booster.
    """
    return lgb.train(params=params, train_set=lgb.Dataset(X, Y))

In [9]:
def clean_x(X, is_premier_model_fits, is_secondary_model_train, is_secondary_model_test):
    """Split X by the three boolean masks and scale/clean each part.

    The StandardScaler is fitted on the premier_model_fits rows only and then
    applied, via prep_x, to all three splits.

    Returns (XPremierModelFits, XSecondaryModelTrain, XSecondaryModelTest).
    """
    fitted_scaler = StandardScaler().fit(X.loc[is_premier_model_fits])

    split_masks = (is_premier_model_fits, is_secondary_model_train, is_secondary_model_test)
    XPremierModelFits, XSecondaryModelTrain, XSecondaryModelTest = (
        prep_x(fit_scaler=fitted_scaler, X=X.loc[mask]) for mask in split_masks
    )
    return XPremierModelFits, XSecondaryModelTrain, XSecondaryModelTest

# Collect Stage 1 Hyperparameters

Note that these are a 'fait accompli', and need only be read from the disk

In [10]:
# Model A, ANN: take the first row of the saved hyperparameter CSV as a flat dict.
fn_model_a_ann_hp = os.path.join(data_dir, 'working_data/model_a/model_a_training/model_a_ann_hp.csv')
hp1_a_ann = {
    name: column[0]
    for name, column in pd.read_csv(fn_model_a_ann_hp).to_dict(orient='list').items()
}

In [11]:
# Model A, GBM: take the first row of the saved hyperparameter CSV as a flat dict,
# then set the evaluation metrics explicitly.
fn_model_a_gbm_hp = os.path.join(data_dir, 'working_data/model_a/model_a_training/model_a_gbm_hp.csv')
hp1_a_gbm = {
    name: column[0]
    for name, column in pd.read_csv(fn_model_a_gbm_hp).to_dict(orient='list').items()
}
hp1_a_gbm['metrics'] = ['binary_logloss', 'auc']

In [12]:
# Model B, ANN: take the first row of the saved hyperparameter CSV as a flat dict.
fn_model_b_ann_hp = os.path.join(data_dir, 'working_data/model_b/model_b_training/model_b_ann_hp.csv')
hp1_b_ann = {
    name: column[0]
    for name, column in pd.read_csv(fn_model_b_ann_hp).to_dict(orient='list').items()
}

In [13]:
# Model B, GBM: take the first row of the saved hyperparameter CSV as a flat dict,
# then set the evaluation metrics explicitly.
fn_model_b_gbm_hp = os.path.join(data_dir, 'working_data/model_b/model_b_training/model_b_gbm_hp.csv')
hp1_b_gbm = {
    name: column[0]
    for name, column in pd.read_csv(fn_model_b_gbm_hp).to_dict(orient='list').items()
}
hp1_b_gbm['metrics'] = ['binary_logloss', 'auc']

# Collect Stage 2 Contender Hyperparameters

Filter to the top n contenders per run

In [14]:
# Fail early with a clear message: without any tuning files there is nothing to
# concatenate and no 'rank' column to filter on.
if not list_hp2_fn:
    raise FileNotFoundError(
        f"No 'gbm_grid*.csv' files were found under {data_dir!r}; cannot collect stage 2 contenders."
    )

# Read every tuning-result file, tag each row with its source file, and
# concatenate once (rather than growing a DataFrame inside the loop).
JoinedHP2 = pd.concat([pd.read_csv(fn).assign(fn=fn) for fn in list_hp2_fn])

# NB this is what the user can change to test more possible hyperparameters! 0 is best.
MAX_HP2_RANK = 1
mask_is_hp2_contender = JoinedHP2['rank'] <= MAX_HP2_RANK

In [15]:
# Tuning-result column -> LightGBM parameter name.
hp2_column_names = {
    'config/num_trees': 'num_trees',
    'config/min_data_in_leaf': 'min_data_in_leaf',
    'config/learning_rate': 'learning_rate',
}

# Keep only the contender rows and the columns needed downstream.
ContenderHP2 = (
    JoinedHP2
    .loc[mask_is_hp2_contender, [*hp2_column_names, 'fn', 'rank']]
    .reset_index(drop=True)
    .rename(columns=hp2_column_names)
)

# Pair every contender with each of the five fold numbers (cross join).
ContenderHP2 = pd.DataFrame({'fold_num': np.arange(5)}).merge(ContenderHP2, how='cross')
ContenderHP2

Unnamed: 0,fold_num,num_trees,min_data_in_leaf,learning_rate,fn,rank
0,0,580,162,0.01373,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,0
1,0,843,129,0.012509,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,1
2,0,815,196,0.010433,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,0
3,0,958,124,0.010048,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,1
4,0,664,157,0.012033,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,0
5,0,632,177,0.012811,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,1
6,1,580,162,0.01373,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,0
7,1,843,129,0.012509,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,1
8,1,815,196,0.010433,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,0
9,1,958,124,0.010048,/Volumes/Extreme SSD/rematch_eia_ferc1_docker/...,1


In [16]:
# One dict per (fold, contender) row, keyed by the row index of ContenderHP2.
stage_2_columns = ['fold_num', 'num_trees', 'min_data_in_leaf', 'learning_rate']
stage_2_param_dict = ContenderHP2.loc[:, stage_2_columns].to_dict(orient='index')
print(stage_2_param_dict[0])

{'fold_num': 0, 'num_trees': 580, 'min_data_in_leaf': 162, 'learning_rate': 0.0137300158848056}


# Load data

In [17]:
# Features for model A and model B, the targets, and the ID frame.
# Only model A's y file is read; fn_y_b is not loaded here
# (presumably both models share the same targets - TODO confirm).
X_a, X_b, Y, ID = (
    pd.read_parquet(path) for path in (fn_x_a, fn_x_b, fn_y_a, fn_id)
)

# Iterate

In [None]:
# Base seed for the fold split. define_folds shuffles with NumPy's global RNG, so
# seeding it per test fold makes the split reproducible and gives every contender
# that shares a test fold the identical premier/secondary split.
FOLD_SPLIT_SEED = 0

# Keys of a stage 2 contender that are bookkeeping, not LightGBM parameters.
NON_LGB_PARAM_KEYS = ('fold_num',)


def _stack_stage_2_features(XA, XB, y_fit_a_ann, y_fit_b_ann, y_fit_a_gbm, y_fit_b_gbm):
    """Build the stage 2 design matrix for one split.

    Column order: model A features, model B features, then for each stage 1
    model (A ANN, B ANN, A GBM, B GBM) its prediction followed by the dense
    descending rank of that prediction. Ranks are computed over all rows
    passed in.

    The ANN predictions are stacked as returned by `predict`; the GBM
    predictions and every rank vector are turned into single columns first.
    """
    def as_column(values):
        return np.array([values]).T

    return np.hstack([
        XA,
        XB,
        y_fit_a_ann,
        as_column(get_dense_desc_rank(y_fit_a_ann)),
        y_fit_b_ann,
        as_column(get_dense_desc_rank(y_fit_b_ann)),
        as_column(y_fit_a_gbm),
        as_column(get_dense_desc_rank(y_fit_a_gbm)),
        as_column(y_fit_b_gbm),
        as_column(get_dense_desc_rank(y_fit_b_gbm)),
    ])


collected_results = []
for i in tqdm(stage_2_param_dict.keys()):

    params = stage_2_param_dict[i]
    fold_num = params['fold_num']

    # 'fold_num' only selects the data split; keep it out of the LightGBM parameters.
    stage_2_lgb_params = {k: v for k, v in params.items() if k not in NON_LGB_PARAM_KEYS}

    # Note which fold we're on, and divvy up the data into test/train bits, based on that
    np.random.seed(FOLD_SPLIT_SEED + int(fold_num))
    values_for_premier_model_fits, values_for_secondary_model_train, values_for_secondary_model_test = define_folds(fold_num)

    is_premier_model_fits, is_secondary_model_train, is_secondary_model_test = get_boolean_masks_for_folds(
        ID=ID,
        values_for_premier_model_fits=values_for_premier_model_fits,
        values_for_secondary_model_train=values_for_secondary_model_train,
        values_for_secondary_model_test=values_for_secondary_model_test
    )

    # Y, ID
    YPremierModelFits = Y.loc[is_premier_model_fits]
    YSecondaryModelTrain = Y.loc[is_secondary_model_train]
    YSecondaryModelTest = Y.loc[is_secondary_model_test]

    IDSecondaryModelTest = ID.loc[is_secondary_model_test]

    # XA
    XAPremierModelFits, XASecondaryModelTrain, XASecondaryModelTest = clean_x(
        X=X_a,
        is_premier_model_fits=is_premier_model_fits,
        is_secondary_model_train=is_secondary_model_train,
        is_secondary_model_test=is_secondary_model_test
    )

    # XB
    XBPremierModelFits, XBSecondaryModelTrain, XBSecondaryModelTest = clean_x(
        X=X_b,
        is_premier_model_fits=is_premier_model_fits,
        is_secondary_model_train=is_secondary_model_train,
        is_secondary_model_test=is_secondary_model_test
    )

    # Fit models for stage 1, get y_fit
    # NOTE(review): the stage 1 models are refit for every contender, even when the
    # test fold is the same as in a previous iteration.
    model_a_ann = fit_ann(params=hp1_a_ann, X=XAPremierModelFits, Y=YPremierModelFits)
    model_a_gbm = fit_gbm(params=hp1_a_gbm, X=XAPremierModelFits, Y=YPremierModelFits)
    model_b_ann = fit_ann(params=hp1_b_ann, X=XBPremierModelFits, Y=YPremierModelFits)
    model_b_gbm = fit_gbm(params=hp1_b_gbm, X=XBPremierModelFits, Y=YPremierModelFits)

    # ANN, stage 2 Train
    y_fit_train_a_ann = model_a_ann.predict(convert_to_tensor(XASecondaryModelTrain))
    y_fit_train_b_ann = model_b_ann.predict(convert_to_tensor(XBSecondaryModelTrain))

    # ANN, stage 2 Test
    y_fit_test_a_ann = model_a_ann.predict(convert_to_tensor(XASecondaryModelTest))
    y_fit_test_b_ann = model_b_ann.predict(convert_to_tensor(XBSecondaryModelTest))

    # GBM, stage 2 Train
    y_fit_train_a_gbm = model_a_gbm.predict(XASecondaryModelTrain)
    y_fit_train_b_gbm = model_b_gbm.predict(XBSecondaryModelTrain)

    # GBM, stage 2 Test
    y_fit_test_a_gbm = model_a_gbm.predict(XASecondaryModelTest)
    y_fit_test_b_gbm = model_b_gbm.predict(XBSecondaryModelTest)

    # Collect the above into something the 2nd stage model can use
    XSecondaryModelTrain = _stack_stage_2_features(
        XASecondaryModelTrain, XBSecondaryModelTrain,
        y_fit_train_a_ann, y_fit_train_b_ann, y_fit_train_a_gbm, y_fit_train_b_gbm
    )
    XSecondaryModelTest = _stack_stage_2_features(
        XASecondaryModelTest, XBSecondaryModelTest,
        y_fit_test_a_ann, y_fit_test_b_ann, y_fit_test_a_gbm, y_fit_test_b_gbm
    )

    # Stage 2 model, trained on the secondary-train split only.
    # NOTE(review): no 'objective' is passed, so LightGBM uses its default objective
    # even though the fit is scored with log loss / ROC AUC below - confirm this
    # matches how the contenders were tuned.
    train_set = lgb.Dataset(XSecondaryModelTrain, YSecondaryModelTrain)
    gbm = lgb.train(stage_2_lgb_params, train_set)
    y_fit = gbm.predict(XSecondaryModelTest)

    # Goodness of fit: within each record_id_ferc1 group, the row(s) with the
    # highest y_fit are taken as the predicted match.
    Framework = IDSecondaryModelTest[['record_id_ferc1']].copy()
    Framework['y_fit'] = y_fit
    Framework['groupwise_max_y_fit'] = Framework.groupby('record_id_ferc1')['y_fit'].transform('max')
    Framework['y_fit_adj'] = Framework['y_fit'] == Framework['groupwise_max_y_fit']

    gof_dict = {
        'precision' : sklearn_metrics.precision_score(YSecondaryModelTest.values, Framework['y_fit_adj'].values*1),
        'recall' : sklearn_metrics.recall_score(YSecondaryModelTest.values, Framework['y_fit_adj'].values*1),
        'log_loss' : sklearn_metrics.log_loss(YSecondaryModelTest.values, y_fit),
        'roc_auc' : sklearn_metrics.roc_auc_score(YSecondaryModelTest.values, y_fit)
    }

    # Keep the full contender description (including fold_num) next to its scores.
    results = params | gof_dict
    collected_results.append(results)

  0%|          | 0/30 [00:00<?, ?it/s]

Values for premier model fits:  [1 4]
Values for secondary model train:  [3 2]
Values for secondary model test:  0


## Split data: iterate from here

In [18]:
# Key into stage_2_param_dict of the contender used by the step-by-step cells below.
i = 0

# Fold reserved for the secondary model test (passed to define_folds in the next cell).
fold_num = stage_2_param_dict[i]['fold_num']
# For reference, the remaining hyperparameters of contender 0 (see the printout of
# stage_2_param_dict[0] above):
# num_trees = 580
# min_data_in_leaf = 162
# learning_rate = 0.013

In [19]:
# Assign the folds to their roles; the non-test folds are shuffled, so the
# premier/secondary-train assignment can differ between runs.
values_for_premier_model_fits, values_for_secondary_model_train, values_for_secondary_model_test = define_folds(fold_num)

Values for premier model fits:  [3 4]
Values for secondary model train:  [2 1]
Values for secondary model test:  0


In [20]:
# Row-level boolean masks for the three splits, based on each row's ID.fold value.
is_premier_model_fits, is_secondary_model_train, is_secondary_model_test = get_boolean_masks_for_folds(
    ID=ID, 
    values_for_premier_model_fits=values_for_premier_model_fits, 
    values_for_secondary_model_train=values_for_secondary_model_train, 
    values_for_secondary_model_test=values_for_secondary_model_test
)

In [21]:
# Targets for each of the three splits.
YPremierModelFits = Y.loc[is_premier_model_fits]
YSecondaryModelTrain = Y.loc[is_secondary_model_train]
YSecondaryModelTest = Y.loc[is_secondary_model_test]

# ID rows of the test split; record_id_ferc1 is used later to group the predictions.
IDSecondaryModelTest = ID.loc[is_secondary_model_test]

In [22]:
# Model A features: scaler fitted on the premier_model_fits rows, then applied to all three splits.
XAPremierModelFits, XASecondaryModelTrain, XASecondaryModelTest = clean_x(
    X=X_a, 
    is_premier_model_fits=is_premier_model_fits, 
    is_secondary_model_train=is_secondary_model_train, 
    is_secondary_model_test=is_secondary_model_test
)

In [23]:
# Model B features: scaler fitted on the premier_model_fits rows, then applied to all three splits.
XBPremierModelFits, XBSecondaryModelTrain, XBSecondaryModelTest = clean_x(
    X=X_b, 
    is_premier_model_fits=is_premier_model_fits, 
    is_secondary_model_train=is_secondary_model_train, 
    is_secondary_model_test=is_secondary_model_test
)

In [24]:
# Stage 1, model A neural network, fitted on the premier_model_fits split.
model_a_ann = fit_ann(params=hp1_a_ann, X=XAPremierModelFits, Y=YPremierModelFits)

In [25]:
# Stage 1, model A LightGBM booster, fitted on the premier_model_fits split.
model_a_gbm = fit_gbm(params=hp1_a_gbm, X=XAPremierModelFits, Y=YPremierModelFits)



In [26]:
# Stage 1, model B neural network, fitted on the premier_model_fits split.
# fit_ann clears the Keras session before building this model.
model_b_ann = fit_ann(params=hp1_b_ann, X=XBPremierModelFits, Y=YPremierModelFits)

In [27]:
# Stage 1, model B LightGBM booster, fitted on the premier_model_fits split.
model_b_gbm = fit_gbm(params=hp1_b_gbm, X=XBPremierModelFits, Y=YPremierModelFits)



In [28]:
# Stage 1 ANN predictions on the secondary-train split (inputs for fitting stage 2).
y_fit_train_a_ann = model_a_ann.predict(convert_to_tensor(XASecondaryModelTrain))
y_fit_train_b_ann = model_b_ann.predict(convert_to_tensor(XBSecondaryModelTrain))

# Stage 1 ANN predictions on the secondary-test split (inputs for scoring stage 2).
y_fit_test_a_ann = model_a_ann.predict(convert_to_tensor(XASecondaryModelTest))
y_fit_test_b_ann = model_b_ann.predict(convert_to_tensor(XBSecondaryModelTest))

[1m81019/81019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 205us/step
[1m81019/81019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 206us/step
[1m38946/38946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 201us/step
[1m38946/38946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 207us/step


In [29]:
# Stage 1 GBM predictions on the secondary-train split (inputs for fitting stage 2).
y_fit_train_a_gbm = model_a_gbm.predict(XASecondaryModelTrain)
y_fit_train_b_gbm = model_b_gbm.predict(XBSecondaryModelTrain)

# Stage 1 GBM predictions on the secondary-test split (inputs for scoring stage 2).
y_fit_test_a_gbm = model_a_gbm.predict(XASecondaryModelTest)
y_fit_test_b_gbm = model_b_gbm.predict(XBSecondaryModelTest)

In [30]:
# Stage 2 training matrix: model A features, model B features, then each stage 1
# prediction followed by its dense descending rank (A ANN, B ANN, A GBM, B GBM).
train_rank_columns = [
    np.array([get_dense_desc_rank(fit)]).T
    for fit in (y_fit_train_a_ann, y_fit_train_b_ann, y_fit_train_a_gbm, y_fit_train_b_gbm)
]
# The GBM predictions are turned into single columns before stacking.
train_gbm_columns = [np.array([fit]).T for fit in (y_fit_train_a_gbm, y_fit_train_b_gbm)]

XSecondaryModelTrain = np.hstack([
    XASecondaryModelTrain,
    XBSecondaryModelTrain,
    y_fit_train_a_ann, train_rank_columns[0],
    y_fit_train_b_ann, train_rank_columns[1],
    train_gbm_columns[0], train_rank_columns[2],
    train_gbm_columns[1], train_rank_columns[3],
])

In [31]:
# Stage 2 test matrix, with the same column layout as the training matrix: model A
# features, model B features, then each stage 1 prediction followed by its dense
# descending rank (A ANN, B ANN, A GBM, B GBM).
test_rank_columns = [
    np.array([get_dense_desc_rank(fit)]).T
    for fit in (y_fit_test_a_ann, y_fit_test_b_ann, y_fit_test_a_gbm, y_fit_test_b_gbm)
]
# The GBM predictions are turned into single columns before stacking.
test_gbm_columns = [np.array([fit]).T for fit in (y_fit_test_a_gbm, y_fit_test_b_gbm)]

XSecondaryModelTest = np.hstack([
    XASecondaryModelTest,
    XBSecondaryModelTest,
    y_fit_test_a_ann, test_rank_columns[0],
    y_fit_test_b_ann, test_rank_columns[1],
    test_gbm_columns[0], test_rank_columns[2],
    test_gbm_columns[1], test_rank_columns[3],
])

In [34]:
# Stage 2 hyperparameters of the contender selected by `i` (set at the top of this
# section). 'fold_num' only selects the data split, so it is dropped before the
# dict is handed to LightGBM.
stage_2_params = {key: value for key, value in stage_2_param_dict[i].items() if key != 'fold_num'}
XTrain = XSecondaryModelTrain
XTest = XSecondaryModelTest
YTrain = YSecondaryModelTrain
YTest = YSecondaryModelTest

# Stage 2 model, trained on the secondary-train split only.
# NOTE(review): no 'objective' is passed, so LightGBM uses its default objective
# even though the fit is scored with log loss / ROC AUC later - confirm this
# matches how the contenders were tuned.
train_set = lgb.Dataset(XTrain, YTrain)
gbm = lgb.train(stage_2_params, train_set)

# Predictions for the held-out secondary-test split.
y_fit = gbm.predict(XTest)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.301950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6421
[LightGBM] [Info] Number of data points in the train set: 2592590, number of used features: 141
[LightGBM] [Info] Start training from score 0.000999


In [39]:
# Attach the stage 2 predictions to the test-split record ids.
Framework = IDSecondaryModelTest[['record_id_ferc1']].assign(y_fit=y_fit)

# Within each record_id_ferc1 group, flag the row(s) whose y_fit equals the group maximum.
groupwise_max = Framework.groupby('record_id_ferc1')['y_fit'].transform('max')
Framework = Framework.assign(
    groupwise_max_y_fit=groupwise_max,
    y_fit_adj=Framework['y_fit'] == groupwise_max,
)

# Precision / recall use the 0/1 group-winner flag; log loss and ROC AUC use the raw y_fit.
y_true = YTest.values
y_flag = Framework['y_fit_adj'].values*1
gof_dict = dict(
    precision=sklearn_metrics.precision_score(y_true, y_flag),
    recall=sklearn_metrics.recall_score(y_true, y_flag),
    log_loss=sklearn_metrics.log_loss(y_true, y_fit),
    roc_auc=sklearn_metrics.roc_auc_score(y_true, y_fit),
)

In [50]:
# Combine the contender's hyperparameters with its goodness-of-fit scores.
results = stage_2_param_dict[i] | gof_dict
# End the cell with the value so the combined dict is displayed.
results

{'fold_num': 0,
 'num_trees': 580,
 'min_data_in_leaf': 162,
 'learning_rate': 0.0137300158848056,
 'precision': 0.9840383080606544,
 'recall': 0.9903614457831326,
 'log_loss': 0.0003007053699996441,
 'roc_auc': 0.9975861479653554}