## Predicting the Alarm Type

    - given that there is an alarm in the next T hours
    - there could be multiple alarms with more than one count for each alarm
    - simplified to predicting multiple binary outputs
    - for alarm type x the label is 1 if the count is greater than 0 else 0

In [None]:
import datetime
import math
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import pickle

import optuna
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [None]:
# Alarm codes handled by this notebook. The labels file carries a
# `count_<code>` column per code (dropped here); the notebook later selects a
# `label_<code>` column per code as the binary target.
target_codes = [7006, 3511, 7502, 7501, 3504, 6448, 1500, 7704]

# Read the labels, parse the timestamp column and discard every column that is
# not needed for the per-alarm binary targets.
label_df = (
    pd.read_csv('multiclass_labels.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .drop(
        columns=['only_date', 'label', 'label_1h', 'label_24h', 'alarm_24h_concat']
        + [f'count_{code}' for code in target_codes]
    )
)
label_df

In [None]:
# Sanity check: (rows, columns) of the label table after the column drop.
label_df.shape

In [None]:
# Load the pickled inverter data (used below as a DataFrame).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load this from a trusted source.
with open('inverter-data-v03.pkl', 'rb') as handle:
    all_data = pickle.load(handle)
    
# Show the shape before and after discarding rows that contain any NaN.
print(all_data.shape)
all_data.dropna(inplace=True)
all_data.shape

In [None]:
# List the columns available in the inverter data.
all_data.columns

In [None]:
# Attach the inverter data to every label row, matching on date and inverter.
# A left join keeps each label row even when no inverter data matches it.
data_df = pd.merge(
    label_df,
    all_data,
    how='left',
    on=['date', 'inverter'],
)
data_df.shape

In [None]:
# Remove rows with any missing value (for example label rows that found no
# match in the left join) and show the resulting shape.
data_df.dropna(inplace=True)
data_df.shape

In [None]:
# Full column list of the merged frame.
data_df.columns.to_list()

In [None]:
# Drop the inverter identifier and the aggregate label columns; only the
# per-alarm `label_<code>` columns are used as targets.
# Note: date is dropped later, just before tuning / fitting.
data_df = data_df.drop(columns=['inverter', 'label', 'label_1h', 'label_24h'])

# Calendar features are categorical. Cast them on `data_df` -- the frame that
# is split and modelled below -- rather than on `all_data`, which is not used
# again after the merge in this notebook (casting it had no effect on the
# models).
categoricals = ['hour', "day", "dayofweek", "weekofyear", "month"]
for cat in categoricals:
    data_df[cat] = data_df[cat].astype('category')

In [None]:
class Objective(object):
    """Optuna objective: train a LightGBM booster and return its validation score.

    The train / validation ``lgb.Dataset`` objects are built once in the
    constructor and reused by every trial.

    NOTE: the label column name is read from the module-level variable
    ``label_col``, which must be defined before an instance is created; it is
    not a constructor argument.

    Args:
        df_train: Training frame holding the features and the ``label_col`` column.
        df_valid: Validation frame holding the features and the ``label_col`` column.
        categoricals: Column names LightGBM should treat as categorical.
        fixed_params: LightGBM parameters used for every trial. They are applied
            after sampling, so they override a sampled parameter of the same name.
        param_set: Mapping ``parameter name -> (low, high)`` of the parameters
            to tune. A value of ``None`` selects the range from
            ``default_ranges``. ``None`` (the default) tunes nothing.
        verbose_eval: Forwarded to ``lgb.train`` as ``verbose_eval``.
    """

    # How each tunable parameter is sampled: name of the Trial method to call
    # and the extra keyword arguments to pass to it.
    _SAMPLERS = {
        "num_leaves": ("suggest_int", {}),
        "min_data_in_leaf": ("suggest_int", {}),
        # Sampled on a log scale; suggest_float(log=True) replaces the
        # deprecated suggest_loguniform.
        "learning_rate": ("suggest_float", {"log": True}),
        "feature_fraction": ("suggest_float", {}),
        "bagging_freq": ("suggest_int", {}),
        "bagging_fraction": ("suggest_float", {}),
    }

    def __init__(self, df_train, df_valid, categoricals, fixed_params, param_set=None, verbose_eval=50):
        self.categoricals = categoricals
        self.fixed_params = fixed_params
        # Avoid a shared mutable default argument.
        self.param_set = {} if param_set is None else param_set
        self.verbose_eval = verbose_eval
        # free_raw_data=False keeps the raw data attached to the Dataset so it
        # can be reused across trials.
        self.dtrain = lgb.Dataset(
            df_train.drop([label_col], axis=1),
            label=df_train[label_col],
            categorical_feature=self.categoricals,
            free_raw_data=False
        )
        self.dvalid = lgb.Dataset(
            df_valid.drop([label_col], axis=1),
            label=df_valid[label_col],
            categorical_feature=self.categoricals,
            reference=self.dtrain,
            free_raw_data=False
        )
        # (low, high) search range used when param_set maps a name to None.
        self.default_ranges = {
            "num_leaves": (2, 256),
            "min_data_in_leaf": (5, 100),
            "learning_rate": (1e-3, 1e-1),
            "feature_fraction": (0.4, 1.0),
            "bagging_freq": (1, 7),
            "bagging_fraction": (0.4, 1.0)
        }

    def get_params(self, trial):
        """Sample the tunable parameters for ``trial`` and merge in the fixed ones.

        Args:
            trial: Optuna trial (any object providing ``suggest_int`` and
                ``suggest_float``).

        Returns:
            dict of LightGBM parameters: one sampled value per key of
            ``param_set``, updated with ``fixed_params``.

        Raises:
            KeyError: If ``param_set`` names a parameter that has no sampler.
        """
        params = {}
        for param, rng in self.param_set.items():
            method_name, extra_kwargs = self._SAMPLERS[param]
            if rng is None:
                rng = self.default_ranges[param]
            suggest = getattr(trial, method_name)
            params[param] = suggest(param, rng[0], rng[1], **extra_kwargs)

        params.update(self.fixed_params)
        return params

    def __call__(self, trial):
        """Train one booster with the trial's parameters and return its score.

        The returned value is the best validation score of the first metric
        recorded for the validation set. The best iteration, the feature names
        and the feature importances are stored as user attributes of the trial.
        """
        params = self.get_params(trial)
        # NOTE: ``verbose_eval`` was removed from lgb.train in LightGBM 4.0;
        # when upgrading, log through callbacks=[lgb.log_evaluation(...)].
        bst = lgb.train(
            params,
            self.dtrain,
            valid_sets=[self.dvalid],
            verbose_eval=self.verbose_eval
        )
        # 'valid_0' is the name LightGBM gives the first (only) validation
        # set; take the first metric recorded for it.
        valid_0 = bst.best_score['valid_0']
        first_metric = list(valid_0)[0]
        score = valid_0[first_metric]

        trial.set_user_attr('best_iteration', bst.best_iteration)
        trial.set_user_attr('features', self.dtrain.feature_name)
        trial.set_user_attr('importance', bst.feature_importance().tolist())

        return score

class EarlyStoppingExceeded(optuna.exceptions.OptunaError):
    """Raised by EarlyStoppingCallback to abort a study that stopped improving."""

class EarlyStoppingCallback(object):
    """Optuna callback that aborts a study once its best value stops improving.

    Adapted from https://github.com/optuna/optuna/issues/1001#issuecomment-596478792

    The comparison assumes a minimising study: a call counts as an improvement
    when ``study.best_value`` is lower than the recorded best score by more
    than ``min_delta``.

    Args:
        early_stopping_rounds: Tolerance for non-improving calls.
            ``EarlyStoppingExceeded`` is raised on a non-improving call made
            while the counter already exceeds this number, i.e. on the
            ``early_stopping_rounds + 2``-th consecutive non-improving call.
        min_delta: Minimum decrease of the best value that counts as an
            improvement.
    """

    def __init__(self, early_stopping_rounds, min_delta):
        self.early_stopping_rounds = early_stopping_rounds
        self.min_delta = min_delta
        self.early_stopping_count = 0
        self.best_score = None

    def __call__(self, study, trial):
        """Update the no-improvement counter after a trial.

        Raises:
            EarlyStoppingExceeded: When the tolerance is exhausted. The
                internal state is reset first, so the instance can be reused
                for another study.
        """
        current_best = study.best_value
        if self.best_score is None:
            # First call: nothing to compare against yet.
            self.best_score = current_best

        if current_best < self.best_score - self.min_delta:
            self.best_score = current_best
            self.early_stopping_count = 0
            return

        if self.early_stopping_count > self.early_stopping_rounds:
            # Reset the state on the instance (the original assigned a local
            # variable here, leaving a stale best score behind).
            self.early_stopping_count = 0
            self.best_score = None
            raise EarlyStoppingExceeded()

        self.early_stopping_count += 1
    

def tune_model(df_train, df_valid, categoricals, fixed_params, param_set, n_trials=50, verbose_eval=50, show_progress=True, early_stop_callback=None, tpe_mode="independent"):
    """Run an Optuna TPE study over LightGBM hyperparameters and return it.

    Args:
        df_train, df_valid, categoricals, fixed_params, param_set, verbose_eval:
            Forwarded unchanged to ``Objective``.
        n_trials: Maximum number of trials to run.
        show_progress: Whether Optuna shows its progress bar.
        early_stop_callback: Optional study callback; if it raises
            ``EarlyStoppingExceeded`` the study ends early and a message is printed.
        tpe_mode: ``"multivariate"`` enables the multivariate TPE sampler; any
            other value uses the independent sampler.

    Returns:
        The ``optuna`` study (possibly stopped before ``n_trials``).
    """
    use_multivariate = tpe_mode == "multivariate"
    study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(multivariate=use_multivariate)
    )
    callbacks = [] if early_stop_callback is None else [early_stop_callback]
    try:
        objective = Objective(
            df_train=df_train,
            df_valid=df_valid,
            categoricals=categoricals,
            fixed_params=fixed_params,
            param_set=param_set,
            verbose_eval=verbose_eval,
        )
        study.optimize(
            objective,
            n_trials=n_trials,
            show_progress_bar=show_progress,
            callbacks=callbacks,
        )
    except EarlyStoppingExceeded:
        # Only the early-stop callback raises this, so it is not None here.
        print(f'EarlyStopping Exceeded: No new best scores on iters {early_stop_callback.early_stopping_rounds}')
    return study



In [None]:
# Train and evaluate one binary LightGBM model per alarm code:
# split -> Optuna hyperparameter search -> refit with the best parameters ->
# AUC, feature importance and precision-recall plots.
for t in [7501]:
    # for t in target_codes:
    print(f"Modeling for {t} label")
    # Keep only this alarm's label; drop the label columns of the other codes.
    drop_labels = ['label_'+str(tc) for tc in target_codes if tc != t]
    # NOTE: `label_col` is also read as a global by Objective.__init__.
    label_col = 'label_' + str(t)
    df_all = data_df.drop(drop_labels, axis=1).copy(deep=True)
    # 80/20 train/test split, then 80/20 train/validation split of the train part.
    train, test = train_test_split(df_all, train_size=0.8, random_state=100)
    train, valid = train_test_split(train, train_size=0.8, random_state=100)
    print(train.shape, valid.shape, test.shape)
    print(train[label_col].value_counts(True))
    print(test[label_col].value_counts(True))

    # Hyperparameter tuning
    obj_func = 'binary'
    num_rounds = 1000
    early_stopping_rounds = 50

    print("Tune hyperparameters...")
    # None -> use the default search range defined in Objective.
    param_set = {
        "num_leaves": None,
        "min_data_in_leaf": None,
        "learning_rate": None,
        "feature_fraction": None,
        "bagging_freq": None,
        "bagging_fraction": None,
    }

    fixed_params = {
        "objective": obj_func,
        "metric": [obj_func, "auc"],
        "num_rounds": num_rounds,
        "early_stopping_rounds": early_stopping_rounds,
        "first_metric_only": True,
        "force_row_wise": True,
        "feature_pre_filter": False,
        "verbose": 1,
    }

    early_stopping = EarlyStoppingCallback(10, 0.001)

    study = tune_model(
        train.drop(columns=["date"]),
        valid.drop(columns=["date"]),
        categoricals, fixed_params, param_set, n_trials=100, verbose_eval=0,
        show_progress=False, early_stop_callback=early_stopping,
    )

    print("Saving best model parameters...")
    best_params = {k: [v] for (k, v) in study.best_params.items()}

    print('best parameters:', best_params)
    # Refit with exactly the number of boosting rounds the best trial needed.
    num_rounds = study.best_trial.user_attrs["best_iteration"]

    fixed_params["num_rounds"] = num_rounds
    params = study.best_params.copy()

    params.update(fixed_params)
    # The final fit runs a fixed number of rounds, so no early stopping.
    del params["early_stopping_rounds"]

    params['verbose'] = 1
    params['metric'] = ['binary', 'auc']
    # params['is_unbalance'] = True
    print(params)

    model = lgb.LGBMClassifier(boosting_type='gbdt',
                               num_leaves=params['num_leaves'],
                               min_data_in_leaf=params['min_data_in_leaf'],
                               learning_rate=params['learning_rate'],
                               feature_fraction=params['feature_fraction'],
                               bagging_freq=params['bagging_freq'],
                               bagging_fraction=params['bagging_fraction'],
                               objective='binary',
                               metric=params['metric'],
                               num_rounds=params['num_rounds'],
                               # is_unbalance=params['is_unbalance']
                               )
    x_train, y_train = train.drop(columns=[label_col, "date"]), train[label_col]
    x_val, y_val = valid.drop(columns=[label_col, "date"]), valid[label_col]

    # Pass the categorical columns explicitly so the final model treats them
    # the same way as the boosters the hyperparameters were tuned on
    # (Objective builds its Datasets with categorical_feature=categoricals).
    model.fit(X=x_train, y=y_train,
              eval_set=[(x_val, y_val)],
              eval_names=['eval'],
              categorical_feature=categoricals,
              )

    # Evaluation
    x_test, y_test = test.drop(columns=[label_col, "date"]), test[label_col]

    train_pred = model.predict_proba(x_train)
    val_pred = model.predict_proba(x_val)
    test_pred = model.predict_proba(x_test)
    print(train_pred.shape, val_pred.shape, test_pred.shape)

    # AUC (column 1 of predict_proba is the positive-class probability)
    train_auc = roc_auc_score(y_true=y_train, y_score=train_pred[:, 1])
    val_auc = roc_auc_score(y_true=y_val, y_score=val_pred[:, 1])
    test_auc = roc_auc_score(y_true=y_test, y_score=test_pred[:, 1])
    print(f"train-auc: {train_auc}, val-auc: {val_auc}, test-auc: {test_auc}")

    # Feature Importance (top `num` features of the fitted model)
    num = 10
    feature_imp = pd.DataFrame({'Value': model.feature_importances_,
                                'Feature': x_train.columns})
    plt.figure(figsize=(10, 5))
    sns.set(font_scale=1.5)
    sns.barplot(x="Value", y="Feature",
                data=feature_imp.sort_values(by="Value", ascending=False)[0:num])
    plt.title(f'Feature Importance - {t} prediction')
    plt.tight_layout()
    # plt.savefig('lgbm_importances-01.png')
    plt.show()

    # Test Precision-Recall Curve
    pos_label = 1
    average_precision = average_precision_score(y_test, test_pred[:, 1])
    precision, recall, thresholds = precision_recall_curve(y_test, test_pred[:, 1])
    line_kwargs1 = {"drawstyle": "steps-post", 'label': 'precision'}
    line_kwargs2 = {"drawstyle": "steps-post", 'label': 'threshold'}
    sns.set(font_scale=1)
    # Create a single figure of the intended size; a separate plt.figure()
    # call before plt.subplots() would leave an empty extra figure behind.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(recall, precision, **line_kwargs1)
    # precision/recall have one more entry than thresholds.
    ax.plot(recall[:-1], thresholds, **line_kwargs2)
    info_pos_label = (f" (Positive label: {pos_label})"
                      if pos_label is not None else "")
    xlabel = "Recall" + info_pos_label
    ylabel = "Precision" + info_pos_label
    title_txt = f"{t} Prediction - Average Precision = {average_precision:0.2f}"
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title_txt)
    ax.legend(loc="lower left")
    # Show inside the loop so each alarm code gets its own rendered figure.
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

# Class-conditional density plots: for a few features, compare the
# distribution of the positive (alarm) and negative rows of one alarm code.

# t = 7502
# features = ['power_max8640', 'power_max6048', 'power_max4032', 'power_max2016']

t = 7501
# features = ['power_max8640', 'power_max6048', 'power_max4032', 'power_mean8640']
# limits = [(300,500), (300,500), (300,500), (10,200)]
features = ['power_median2016', 'power_median288', 'power_median6048', 'power_median4032']
# x-axis range to evaluate the densities on, one (low, high) pair per feature
limits = [(0,50), (0,50), (0,50), (0,50)]

# t = 1500
# features = ['power_max288', 'power_max4032', 'power_max8640', 'power_max6048', 'temp2']
drop_labels = ['label_'+str(tc) for tc in target_codes if tc != t]
label_col = 'label_' + str(t)
df_all = data_df.drop(drop_labels, axis=1).copy(deep=True)

# Fixed KDE bandwidth factor, shared by both classes.
kde_bandwidth = .25

for col, lim in zip(features, limits):
    df = df_all[[col, label_col]]
    print(df[label_col].value_counts())
    print(df.groupby(label_col).mean())

    data1 = df.loc[df[label_col] == 1, col]  # positive
    data2 = df.loc[df[label_col] == 0, col]  # negative

    # A scalar bw_method is used directly as the KDE factor; this is the
    # public equivalent of overriding covariance_factor and calling the
    # private _compute_covariance().
    density1 = gaussian_kde(data1, bw_method=kde_bandwidth)
    density2 = gaussian_kde(data2, bw_method=kde_bandwidth)

    xs = np.linspace(lim[0], lim[1], 100)
    plt.plot(xs, density1(xs), label='positive')
    plt.plot(xs, density2(xs), label='negative')
    plt.title(f'Density plot: {col} for Alarm-{t}')
    plt.legend()
    plt.show()

In [None]:
# Count missing values in `data2`, the negative-class sample left over from
# the last iteration of the density-plot loop.
data2.isna().sum()

In [None]:
import shap

In [None]:
# Compute SHAP values for the test features from the fitted LightGBM booster.
# NOTE(review): index 0 is used from here on; with shap versions that return
# one array per class for binary models this is the negative class -- confirm
# that index 1 (the alarm class) is not what is wanted.
shap_values = shap.TreeExplainer(model.booster_).shap_values(x_test)
shap_values[0].shape

In [None]:
# Global importance of each feature = mean |SHAP value| over the test rows.
# Keep exactly one entry per column of x_test: this still drops a trailing
# bias / expected-value column if the installed shap version returns one,
# but no longer discards the last real feature when it does not.
global_importances = np.abs(shap_values[0]).mean(0)[:x_test.shape[1]]

In [None]:
data = x_test
# make a bar chart that shows the global importance of the top 20 features
# (fewer bars when the model has fewer than 20 features, instead of failing
# on a length mismatch between the bar positions and the values)
top_n = min(20, len(global_importances))
inds = np.argsort(-global_importances)
f = plt.figure(figsize=(5,10))
y_pos = np.arange(top_n)
# reverse so that the most important feature is drawn at the top
inds2 = np.flip(inds[:top_n], 0)
plt.barh(y_pos, global_importances[inds2], align='center', color="#1E88E5")
plt.yticks(y_pos, data.columns[inds2], fontsize=13)
plt.xlabel('mean abs. SHAP value (impact on model output)', fontsize=13)
plt.gca().xaxis.set_ticks_position('bottom')
plt.gca().yaxis.set_ticks_position('none')
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

In [None]:
# SHAP summary plot of the test-set explanations (`data` is x_test).
shap.summary_plot(shap_values[0], data)

In [None]:
# SHAP dependence plot for the single feature `power_median2016`.
shap.dependence_plot("power_median2016", shap_values[0], data)