# Model Building for Hourly/Daily Predictions

    - labels are created separately (see 04-label-generation.ipynb)
    - labels are for alarms generated in the next 5 minutes, next 1 hour and next 1 day
    - data points can be at 5-minute intervals, hourly level, or daily level
    
    - this notebook uses embeddings generated from Transformer models
    - Transformer model returns [:, sequence_length, embedding_dimension] shaped embeddings
    - for the current model - the last one or the first one in the sequence is used, i.e., [:, -1, :] or [:, 0, :]

In [None]:
import datetime
import math
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import pickle

import optuna
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn import preprocessing


In [None]:
# Read the pickled object stored in 'inverter-data-v03.pkl'.
with open('inverter-data-v03.pkl', 'rb') as pkl_file:
    all_data = pickle.load(pkl_file)

# Report the size before and after discarding entries with missing values.
print(all_data.shape)
all_data = all_data.dropna()
all_data.shape

In [None]:
# The pickle unpacks into exactly three items; judging by how they are used
# below these are the embeddings, the labels and a frame with label columns.
with open('inverter-data-daily-embedded-initial.pkl', 'rb') as pkl_file:
    embed_data = pickle.load(pkl_file)
x_emb, y, label_df = embed_data

In [None]:
label_col = 'label_24h'
total = label_df[label_col].value_counts()
# Share of class 1 among the samples of classes 0 and 1, in percent.
positive_pct = 100 * total[1] / (total[0] + total[1])
print(f"Class ratio: {positive_pct:.3f} %")

In [None]:
# Start from an empty frame that holds only the label column, then move index
# level 0 into a regular column (the frame's previous contents are discarded).
all_data = pd.DataFrame()
all_data[label_col] = y
all_data = all_data.reset_index(level=0)
all_data

In [None]:
# Two successive 80/20 splits with the same seed: first hold out the test set,
# then carve the validation set out of the remaining training rows.
split_options = {"train_size": 0.8, "random_state": 100}
train, test = train_test_split(all_data, **split_options)
train, valid = train_test_split(train, **split_options)
print(train.shape, valid.shape, test.shape)

# Relative class frequencies in the training split.
train[label_col].value_counts(normalize=True)

In [None]:
x_emb = np.array(x_emb)

# The index labels of the training split are used as row positions into the
# embedding array; col_idx keeps every entry along axis 1. A subset could be
# selected instead with e.g. np.array([0, 2], dtype=np.intp).
col_idx = np.arange(x_emb.shape[1])
row_idx = np.array(train.index, dtype=np.intp)

# np.ix_ builds the same open-mesh (rows x columns) index as
# row_idx[:, np.newaxis] paired with col_idx.
x_train = x_emb[np.ix_(row_idx, col_idx)]
x_train.shape

In [None]:
# Same gather as for the training rows, driven by the validation split's index.
col_idx = np.arange(x_emb.shape[1])
row_idx = np.array(valid.index, dtype=np.intp)
x_valid = x_emb[np.ix_(row_idx, col_idx)]
x_valid.shape

In [None]:
class Objective(object):
    """Optuna objective that trains a LightGBM booster and scores it on a validation set.

    The training and validation data are wrapped in ``lgb.Dataset`` objects
    once at construction time and reused by every trial.

    Args:
        x_train, y_train: Features and labels for the training ``lgb.Dataset``.
        x_valid, y_valid: Features and labels for the validation ``lgb.Dataset``
            (built with the training set as ``reference``).
        categoricals: Passed as ``categorical_feature`` to both datasets.
        fixed_params: LightGBM parameters applied to every trial; they
            override sampled values when a key clashes.
        param_set: Mapping ``parameter name -> (low, high)`` of parameters to
            tune. A value of ``None`` selects the built-in default range.
            ``None`` or an empty mapping means nothing is tuned.
        verbose_eval: Forwarded to ``lgb.train``.
    """

    def __init__(self, x_train, y_train, x_valid, y_valid, categoricals, fixed_params, param_set=None, verbose_eval=50):
        self.categoricals = categoricals
        self.fixed_params = fixed_params
        # A fresh dict per instance; a ``{}`` default would be shared by all calls.
        self.param_set = {} if param_set is None else param_set
        self.verbose_eval = verbose_eval
        self.dtrain = lgb.Dataset(
            x_train,
            label=y_train,
            categorical_feature=self.categoricals,
            free_raw_data=False
        )
        self.dvalid = lgb.Dataset(
            x_valid,
            label=y_valid,
            categorical_feature=self.categoricals,
            reference=self.dtrain,
            free_raw_data=False
        )
        # (low, high) bounds used when a ``param_set`` entry is ``None``.
        self.default_ranges = {
            "num_leaves": (2, 256),
            "min_data_in_leaf": (5, 100),
            "learning_rate": (1e-3, 1e-1),
            "feature_fraction": (0.4, 1.0),
            "bagging_freq": (1, 7),
            "bagging_fraction": (0.4, 1.0)
        }

    def get_params(self, trial):
        """Sample the tunable parameters from ``trial`` and merge in the fixed ones.

        Args:
            trial: Object exposing ``suggest_int`` and ``suggest_float``
                (an Optuna trial).

        Returns:
            dict of LightGBM parameters: sampled values first, then
            ``fixed_params`` applied on top.

        Raises:
            KeyError: If ``param_set`` names a parameter this class has no
                sampler (or, for ``None`` entries, no default range) for.
        """
        def suggest_log_float(name, low, high):
            # Log-uniform sampling; replaces the deprecated suggest_loguniform.
            return trial.suggest_float(name, low, high, log=True)

        param_funcs = {
            "num_leaves": trial.suggest_int,
            "min_data_in_leaf": trial.suggest_int,
            "learning_rate": suggest_log_float,
            "feature_fraction": trial.suggest_float,
            "bagging_freq": trial.suggest_int,
            "bagging_fraction": trial.suggest_float
        }
        params = {}
        for param, rng in self.param_set.items():
            bounds = self.default_ranges[param] if rng is None else rng
            params[param] = param_funcs[param](param, bounds[0], bounds[1])

        params.update(self.fixed_params)
        return params

    def __call__(self, trial):
        """Train one booster for ``trial`` and return its validation score.

        The score is the first metric recorded under ``best_score['valid_0']``.
        The best iteration, the feature names and the feature importances are
        stored on the trial as user attributes.
        """
        params = self.get_params(trial)
        # NOTE(review): ``verbose_eval`` is not accepted by ``lgb.train`` in
        # LightGBM >= 4.0 -- confirm the installed version before upgrading.
        bst = lgb.train(
            params,
            self.dtrain,
            valid_sets=[self.dvalid],
            verbose_eval=self.verbose_eval
        )
        # get best value of objective (first metric of the validation set)
        valid_0 = bst.best_score['valid_0']
        score = valid_0[list(valid_0)[0]]

        trial.set_user_attr('best_iteration', bst.best_iteration)
        # Ask the booster for the names so they line up with the importance
        # vector; the Dataset attribute may still hold the unresolved 'auto'
        # placeholder when no explicit names were given.
        trial.set_user_attr('features', bst.feature_name())
        trial.set_user_attr('importance', bst.feature_importance().tolist())

        return score

class EarlyStoppingExceeded(optuna.exceptions.OptunaError):
    """Raised by the early-stopping study callback to abort an optimisation run."""

class EarlyStoppingCallback(object):
    """Study callback that aborts optimisation once the best value stops improving.

    Adapted from https://github.com/optuna/optuna/issues/1001#issuecomment-596478792

    The comparison treats a lower ``study.best_value`` as better. A call counts
    as an improvement only when the best value dropped by more than
    ``min_delta`` below the last recorded best score.

    Args:
        early_stopping_rounds: Number of non-improving calls tolerated; the
            callback raises on the first call where the counter already
            exceeds this number.
        min_delta: Minimum decrease of the best value that counts as an
            improvement.
    """

    def __init__(self, early_stopping_rounds, min_delta):
        self.early_stopping_rounds = early_stopping_rounds
        self.min_delta = min_delta
        self.early_stopping_count = 0
        self.best_score = None

    def __call__(self, study, trial):
        """Update the stall counter after a trial.

        Raises:
            EarlyStoppingExceeded: When the counter is already above
                ``early_stopping_rounds`` and no improvement was seen.
        """
        if self.best_score is None:
            # First call (or first call after a reset): adopt the current best.
            self.best_score = study.best_value

        if study.best_value < self.best_score - self.min_delta:
            self.best_score = study.best_value
            self.early_stopping_count = 0
            return

        if self.early_stopping_count > self.early_stopping_rounds:
            # Reset the tracking state on the instance before aborting so the
            # callback starts clean if it is reused for another study.
            self.early_stopping_count = 0
            self.best_score = None
            raise EarlyStoppingExceeded()

        self.early_stopping_count += 1
    

def tune_model(x_train, y_train, x_valid, y_valid, categoricals, fixed_params, param_set, n_trials=50, verbose_eval=50, show_progress=True, early_stop_callback=None, tpe_mode="independent"):
    """Run an Optuna study over an ``Objective`` and return the study.

    Args:
        x_train, y_train, x_valid, y_valid, categoricals, fixed_params,
        param_set, verbose_eval: Forwarded unchanged to ``Objective``.
        n_trials: Number of trials passed to ``study.optimize``.
        show_progress: Passed as ``show_progress_bar`` to ``study.optimize``.
        early_stop_callback: Optional study callback; when given it is the
            only callback registered.
        tpe_mode: ``"multivariate"`` turns on the TPE sampler's multivariate
            flag; any other value leaves it off.

    Returns:
        The study, also when the run was cut short by ``EarlyStoppingExceeded``.
    """
    use_multivariate = tpe_mode == "multivariate"
    study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(multivariate=use_multivariate)
    )
    objective = Objective(
        x_train=x_train,
        y_train=y_train,
        x_valid=x_valid,
        y_valid=y_valid,
        categoricals=categoricals,
        fixed_params=fixed_params,
        param_set=param_set,
        verbose_eval=verbose_eval,
    )
    callbacks = [] if early_stop_callback is None else [early_stop_callback]

    try:
        study.optimize(
            objective,
            n_trials=n_trials,
            show_progress_bar=show_progress,
            callbacks=callbacks,
        )
    except EarlyStoppingExceeded:
        # Early stop is an expected way to finish; report it and hand back the study.
        print(f'EarlyStopping Exceeded: No new best scores on iters {early_stop_callback.early_stopping_rounds}')
    return study



In [None]:
obj_func = 'binary'
num_rounds = 1000
early_stopping_rounds = 50

print("Tune hyperparameters...")
# Every tunable parameter maps to None, which makes Objective.get_params fall
# back to its default search range for that parameter.
param_set = dict.fromkeys([
    "num_leaves",
    "min_data_in_leaf",
    "learning_rate",
    "feature_fraction",
    "bagging_freq",
    "bagging_fraction",
])

# Settings shared by every trial.
fixed_params = dict(
    objective=obj_func,
    metric=[obj_func, "auc"],
    num_rounds=num_rounds,
    early_stopping_rounds=early_stopping_rounds,
    first_metric_only=True,
    force_row_wise=True,
    feature_pre_filter=False,
    verbose=1,
)

categoricals = []
early_stopping = EarlyStoppingCallback(10, 0.001)
study = tune_model(
    x_train, train[label_col],
    x_valid, valid[label_col],
    categoricals, fixed_params, param_set,
    n_trials=100,
    verbose_eval=0,
    show_progress=False,
    early_stop_callback=early_stopping,
)

print("Saving best model parameters...")
# Wrap every best value in a one-element list.
best_params = {name: [value] for name, value in study.best_params.items()}


In [None]:
print('best parameters:', best_params)

# Reuse the best trial's iteration count as the number of boosting rounds.
num_rounds = study.best_trial.user_attrs["best_iteration"]
fixed_params["num_rounds"] = num_rounds

# Tuned values first, fixed settings on top (fixed wins on a key clash).
params = {**study.best_params, **fixed_params}
# The early-stopping setting is removed from the final parameter set.
del params["early_stopping_rounds"]

In [None]:
# Set the logging verbosity and the evaluation metrics for the final fit.
params.update({'verbose': 1, 'metric': ['binary', 'auc']})
# params['is_unbalance'] = True
params

In [None]:
y_train = train[label_col]
y_val = valid[label_col]

# Hyperparameters copied one-to-one from the assembled parameter dict.
tuned_keys = (
    'num_leaves',
    'min_data_in_leaf',
    'learning_rate',
    'feature_fraction',
    'bagging_freq',
    'bagging_fraction',
)
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric=params['metric'],
    num_rounds=params['num_rounds'],
    # is_unbalance=params['is_unbalance'],
    **{key: params[key] for key in tuned_keys},
)

# Fit on the training rows; the validation rows are evaluated under the name 'eval'.
model.fit(
    X=x_train,
    y=y_train,
    eval_set=[(x_valid, y_val)],
    eval_names=['eval'],
)

In [None]:
# Gather the embedding rows of the test split (all entries along axis 1).
col_idx = np.arange(x_emb.shape[1])
row_idx = np.array(test.index, dtype=np.intp)
x_test = x_emb[np.ix_(row_idx, col_idx)]
print(x_test.shape)
y_test = test[label_col]

# Class-probability predictions for each split, in train / valid / test order.
train_pred, val_pred, test_pred = (
    model.predict_proba(features) for features in (x_train, x_valid, x_test)
)
train_pred.shape, val_pred.shape, test_pred.shape

In [None]:
# ROC AUC per split, scored on the second probability column.
train_auc, val_auc, test_auc = (
    roc_auc_score(y_true=labels, y_score=proba[:, 1])
    for labels, proba in (
        (y_train, train_pred),
        (y_val, val_pred),
        (y_test, test_pred),
    )
)
print(f"train-auc: {train_auc}, val-auc: {val_auc}, test-auc: {test_auc}")

## Precision-Recall Curve for the Test Data

In [None]:
# Precision and decision threshold plotted against recall for the test split.
pos_label = 1
average_precision = average_precision_score(y_test, test_pred[:,1])
precision, recall, thresholds = precision_recall_curve(y_test, test_pred[:,1])

line_kwargs1 = {"drawstyle": "steps-post", 'label': 'precision'}
line_kwargs2 = {"drawstyle": "steps-post", 'label': 'threshold'}

sns.set(font_scale = 1)
# Size the figure that actually holds the axes. A separate plt.figure(...) call
# followed by plt.subplots() would only leave an empty extra figure behind and
# the requested size would never reach the plot.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(recall, precision, **line_kwargs1)
# The last recall entry is skipped so both sequences have the same length
# (the original code slices recall[:-1] for the thresholds curve).
ax.plot(recall[:-1], thresholds, **line_kwargs2)
info_pos_label = (f" (Positive label: {pos_label})"
                  if pos_label is not None else "")
xlabel = "Recall" + info_pos_label
ylabel = "Precision" + info_pos_label
ax.set(xlabel=xlabel, ylabel=ylabel, title=f"Average Precision = {average_precision:0.2f}")
ax.legend(loc="lower left")