In [1]:
import pandas as pd
import numpy as np
import os

from keras import models, layers, regularizers, optimizers, callbacks, utils, losses, metrics
from tensorflow.keras.backend import clear_session
from tensorflow import convert_to_tensor

from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from tqdm.notebook import tqdm

In [2]:
def np_cleaning(X):
    """Replace non-finite entries with 0 and clip everything else to [-3, 3].

    Parameters
    ----------
    X : array-like
        Numeric values to clean. In this notebook it receives the output of
        ``StandardScaler.transform``.

    Returns
    -------
    numpy.ndarray
        A new array (the input is not modified) in which NaN, +inf and -inf
        are replaced by 0.0 and all remaining values are limited to [-3, 3].
    """
    # Zero out non-finite values *before* clipping. Clipping first saturates
    # +/-inf to +/-3, so the posinf/neginf replacements below would never
    # take effect.
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    X = np.clip(X, a_min=-3, a_max=3)
    return X

In [3]:
# Root folder of the project data. Defaults to the original external-drive
# location; set the REMATCH_EIA_FERC1_DATA_DIR environment variable to run the
# notebook on a machine where the data lives elsewhere.
data_dir = (
    os.environ.get('REMATCH_EIA_FERC1_DATA_DIR')
    or '/Volumes/Extreme SSD/rematch_eia_ferc1_docker'
)
# Folder holding the model B training inputs.
dir_working_model_b_training = os.path.join(data_dir, 'working_data/model_b/model_b_training')

In [4]:
# Destination parquet file for the combined per-fold predictions.
fn_y_fit_model_b = os.path.join(
    data_dir,
    'working_data/model_z/y_fit_model_b.parquet',
)

In [5]:
# Paths of the three training inputs stored in the model B training folder.
fn_x, fn_y, fn_id = (
    os.path.join(dir_working_model_b_training, parquet_name)
    for parquet_name in ('x.parquet', 'y.parquet', 'id.parquet')
)

# Load them in the same order: features, labels, then the ID table
# (the ID table carries the `fold` column used for cross-validation below).
X, Y, ID = (pd.read_parquet(parquet_path) for parquet_path in (fn_x, fn_y, fn_id))

In [6]:
# Settings for the model B neural network: dropout rates, widths of the two
# ReLU layers, and the number of training epochs.
params_model_b_ann = dict(
    dropout_1=0.0177,
    dropout_2=0.00595,
    relu_1=56,
    relu_2=29,
    epochs=14,
)

# Settings handed to lgb.train for the model B gradient-boosted trees.
# NOTE(review): early_stopping_round=-1 presumably disables early stopping;
# confirm against the LightGBM parameter docs.
params_model_b_gbm = dict(
    num_trees=266,
    learning_rate=0.0105,
    min_data_in_leaf=42,
    objective='binary',
    early_stopping_round=-1,
    metrics=['binary_logloss', 'auc'],
)

In [7]:
def _build_model_b_ann(params):
    """Build and compile the model B neural network.

    The network is Dropout -> Dense(relu) -> Dropout -> Dense(relu) ->
    Dense(1, sigmoid). Dropout rates and layer widths are read from `params`
    (keys 'dropout_1', 'relu_1', 'dropout_2', 'relu_2'). It is compiled with a
    binary cross-entropy loss and tracks binary cross-entropy, binary accuracy
    and AUC. No optimizer is passed to `compile`, so Keras' default is used.
    """
    model = models.Sequential([
        layers.Dropout(rate=params["dropout_1"]),
        layers.Dense(units=params["relu_1"], activation='relu'),
        layers.Dropout(rate=params["dropout_2"]),
        layers.Dense(units=params["relu_2"], activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss=losses.BinaryCrossentropy(),
        metrics=[
            metrics.BinaryCrossentropy(),
            metrics.BinaryAccuracy(),
            metrics.AUC()
        ]
    )
    return model


# ANN training settings. Both can be overridden through params_model_b_ann;
# the fallback batch size is the value that used to be hard-coded in fit().
ann_batch_size = params_model_b_ann.get('batch_size', 128)
ann_seed = params_model_b_ann.get('seed', 0)

# One DataFrame of out-of-fold predictions per fold; concatenated when saving.
results_list = []

for fold in tqdm(ID.fold.unique()):

    # Create Test and Train subsets based on fold num:
    # the current fold is held out, every other fold is used for training.
    is_train_mask = ID.fold != fold
    XTrain = X.loc[is_train_mask]
    XTest = X.loc[~is_train_mask]

    YTrain = Y.loc[is_train_mask]
    YTest = Y.loc[~is_train_mask]

    # Clean the X datasets, based on the training data characteristics:
    # the scaler is fitted on the training rows only and then applied to both
    # subsets, so no statistics of the held-out fold leak into training.
    standard_scaler = StandardScaler()
    standard_scaler.fit(XTrain)
    XTrain = standard_scaler.transform(XTrain)
    XTest = standard_scaler.transform(XTest)

    XTrain = np_cleaning(XTrain)
    XTest = np_cleaning(XTest)

    # Fit models
    # GBM
    train_set = lgb.Dataset(XTrain, YTrain)
    mod_b_gbm = lgb.train(
            params = params_model_b_gbm,
            train_set=train_set
        )

    # ANN
    # Drop the Keras state left over from the previous fold, then seed the
    # Python, NumPy and backend random generators so that weight
    # initialisation, dropout and shuffling are repeatable between runs.
    clear_session()
    utils.set_random_seed(ann_seed)
    mod_b_ann = _build_model_b_ann(params_model_b_ann)

    history = mod_b_ann.fit(
        XTrain, YTrain,
        epochs=params_model_b_ann['epochs'],
        batch_size=ann_batch_size,
        verbose=1
    )

    # Make predictions on the held-out fold
    yfit_b_gbm = mod_b_gbm.predict(XTest)

    # The ANN returns one column per output unit; flatten to a 1-D vector.
    yfit_b_ann = mod_b_ann.predict(XTest)
    yfit_b_ann = yfit_b_ann.reshape(-1,)

    # Join ID to YFit, store together.
    # The ID index is reset so that it lines up positionally with the
    # freshly created prediction frame in the column-wise concat below.
    RelevantID = ID.loc[~is_train_mask, ['record_id_ferc1', 'record_id_eia', 'fold']].reset_index(drop=True)

    RelevantYFit = pd.DataFrame({
        'y_fit_b_ann':yfit_b_ann,
        'y_fit_b_gbm':yfit_b_gbm
    })

    Results = pd.concat([RelevantID, RelevantYFit], axis=1)
    results_list.append(Results)

  0%|          | 0/5 [00:00<?, ?it/s]



[LightGBM] [Info] Number of positive: 5090, number of negative: 5090000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.334039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1502
[LightGBM] [Info] Number of data points in the train set: 5095090, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755
Epoch 1/14
[1m39806/39806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 492us/step - auc: 0.8830 - binary_accuracy: 0.9990 - binary_crossentropy: 0.0054 - loss: 0.0054
Epoch 2/14
[1m39806/39806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 455us/step - auc: 0.9026 - binary_accuracy: 0.9995 - binary_crossentropy: 0.0030 - loss: 0.0030
Epoch 3/14
[1m39806/39806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4



[LightGBM] [Info] Number of positive: 5115, number of negative: 5115000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.344006 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 5120115, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755
Epoch 1/14
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 447us/step - auc: 0.8774 - binary_accuracy: 0.9991 - binary_crossentropy: 0.0054 - loss: 0.0054
Epoch 2/14
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 449us/step - auc: 0.8887 - binary_accuracy: 0.9993 - binary_crossentropy: 0.0036 - loss: 0.0036
Epoch 3/14
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4



[LightGBM] [Info] Number of positive: 5068, number of negative: 5068000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.327798 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1533
[LightGBM] [Info] Number of data points in the train set: 5073068, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755
Epoch 1/14
[1m39634/39634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 444us/step - auc: 0.8867 - binary_accuracy: 0.9988 - binary_crossentropy: 0.0056 - loss: 0.0056
Epoch 2/14
[1m39634/39634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 444us/step - auc: 0.9124 - binary_accuracy: 0.9995 - binary_crossentropy: 0.0027 - loss: 0.0027
Epoch 3/14
[1m39634/39634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4



[LightGBM] [Info] Number of positive: 5137, number of negative: 5137000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.319797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1539
[LightGBM] [Info] Number of data points in the train set: 5142137, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755
Epoch 1/14
[1m40173/40173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 455us/step - auc: 0.8910 - binary_accuracy: 0.9984 - binary_crossentropy: 0.0062 - loss: 0.0062
Epoch 2/14
[1m40173/40173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 444us/step - auc: 0.9157 - binary_accuracy: 0.9994 - binary_crossentropy: 0.0028 - loss: 0.0028
Epoch 3/14
[1m40173/40173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4



[LightGBM] [Info] Number of positive: 5062, number of negative: 5062000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.326195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1538
[LightGBM] [Info] Number of data points in the train set: 5067062, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000999 -> initscore=-6.907755
[LightGBM] [Info] Start training from score -6.907755
Epoch 1/14
[1m39587/39587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 446us/step - auc: 0.8751 - binary_accuracy: 0.9984 - binary_crossentropy: 0.0063 - loss: 0.0063
Epoch 2/14
[1m39587/39587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 449us/step - auc: 0.9052 - binary_accuracy: 0.9994 - binary_crossentropy: 0.0030 - loss: 0.0030
Epoch 3/14
[1m39587/39587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4

In [9]:
# Stack the per-fold prediction frames into a single table.
y_fit_model_b = pd.concat(results_list).reset_index(drop=True)

# Create the destination folder if needed, so that a missing directory does
# not discard the results of the long cross-validation run above.
out_dir_y_fit_model_b = os.path.dirname(fn_y_fit_model_b)
if out_dir_y_fit_model_b:
    os.makedirs(out_dir_y_fit_model_b, exist_ok=True)

y_fit_model_b.to_parquet(fn_y_fit_model_b)