# Code for Multivariate LGBM Implementation - testing.

***

Use cross-validation to evaluate a set of parameters - compare cross validation results for different parameter combinations.

## Steps to implement:

* ~~max epochs and initial learning rate set to 1200 and 0.075. - not going to mess with these, just use defaults~~
* ~~MAE as loss function in model training?~~

* ~~k-folds cross validation for window length?~~
* Direct modeling - train one LGBM model for every horizon step

* ~~early stopping mechanism when validation error does not improve over 5 consecutive epochs. - only do this if necessary.~~

***

In [1]:
from forecasting_functions import *
from data_processing_functions import *
import lightgbm as lgb
import numpy as np

### Read in the data.

In [2]:
# Read the cleaned M3 monthly micro time series; every row of the file is one series.
# header=None together with skiprows=1 drops the file's header row so the columns get integer names.
full_data = pd.read_csv("../../Data/Train/Clean/full_m3_monthly_micro_clean.csv", header=None, skiprows=1)

In [3]:
# one Series per row of the frame; dropping the NaN entries lets the series differ in length
full_data = [row.dropna() for _, row in full_data.iterrows()]

In [4]:
# forecast horizon: the number of trailing observations held out of every series for testing,
# also passed to pre_process / post_process as target_forecast_period
h = 1

In [5]:
# training part: everything except the last h observations of each series
Y = [series.iloc[:-h] for series in full_data]
# test part: the last h observations, re-indexed from 0 and transposed so each column is one series
Test = pd.DataFrame(
    [series.iloc[-h:].reset_index(drop=True) for series in full_data]
).T

***

### Perform reduction on training data.

In [6]:
# transformation settings handed to pre_process: windows of 36 lags, additive deseasonalization with period 12
transform_dict = {
    "windows": {"window_length": 36},
    "deseasonalize": {"sp": 12, "seasonality_type": "additive"},
}

Y_processed, last_window_dt, last_window, full_lags = pre_process(
    ts_data=Y,
    target_forecast_period=h,
    mean_normalize=True,
    log=True,
    sp=12,
    transform_dict=transform_dict,
)

***

### Convert `Y_processed` into Train and Validation Data for LGBM

In [9]:
# features are all columns but the last; the last column is the regression target
train, label = Y_processed.iloc[:, :-1], Y_processed.iloc[:, -1]
train_data = lgb.Dataset(train, label=label)

In [10]:
# validation_data = train_data.create_valid(train, label)

In [54]:
import itertools

In [56]:
# candidate values for the grid search
learning_rate = [0.05, 0.1]
num_boost_rounds = [100, 200]
# every (learning rate, boosting rounds) pair, learning rate varying slowest
c = [(rate, rounds) for rate in learning_rate for rounds in num_boost_rounds]

In [61]:
# show every (learning_rate, num_boost_rounds) combination in the grid
for p in c:
    print(p)

(0.05, 100)
(0.05, 200)
(0.1, 100)
(0.1, 200)


In [62]:
# LightGBM settings: MAE as both the training objective and the evaluation metric
params = dict(objective="mae", metrics="mae", learning_rate=0.1)

In [63]:
# 10-fold, unstratified cross-validation with at most 1000 boosting rounds;
# the early_stopping callback is configured with stopping_rounds=5
bst_vals = lgb.cv(params,
                  train_data,
                  num_boost_round=1000,
                  nfold=10,
                  stratified=False,
                  callbacks=[lgb.early_stopping(stopping_rounds=5)])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 23310, number of used features: 36
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 23310, number of used features: 36
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 23310, number of used features: 36
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 23310, number of used features: 36
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 23310, number of used features: 36
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info]

In [51]:
# fit a single model on all training rows (fixed 1000 boosting rounds, no early stopping)
bst = lgb.train(params,
                train_data,
                num_boost_round=1000)

# one predicted value per entry of last_window_dt
fcasts = bst.predict(last_window_dt)

fcasts = [pd.Series(x) for x in fcasts]

# size the loops from the predictions instead of hard-coding the number of series (was 474)
fcast_indexes = [last_window_dt[i].index[-1]+h for i in range(len(fcasts))]

# add correct time index back to forecasts
for i in range(len(fcasts)):
    fcasts[i].index = [fcast_indexes[i]]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 25905, number of used features: 36
[LightGBM] [Info] Start training from score -0.000476


In [52]:
# undo the pre-processing on the forecasts, using the same settings that were passed to pre_process
fcasts = post_process(
    full_ts_data=Y,
    target_forecast_period=h,
    forecasts=fcasts,
    last_window_with_trend=last_window,
    mean_normalize=True,
    log=True,
    sp=12,
    transform_dict=transform_dict,
)

In [53]:
# MAE of the post-processed forecasts against the held-out observations, averaged uniformly over outputs
mean_absolute_error(Test, fcasts, multioutput="uniform_average")

648.4268112753832

## currently at train_and_forecast

In [None]:
def multivariate_lgbm_window_length(ts_data, h, param_grid):
    """
    Choose the window length for the multivariate LGBM model by cross-validation.

    For every candidate in ``param_grid['window_length']`` the series are
    pre-processed with that window length, ``lgb.cv`` is run on the resulting
    table (last column as label, all other columns as features) and the mean
    validation MAE of the final boosting round is recorded.

    Parameters
    ----------
    ts_data :
        Time series data, forwarded to ``pre_process`` as ``ts_data``.
    h : int
        Forecast horizon, forwarded to ``pre_process`` as ``target_forecast_period``.
    param_grid : dict
        Must contain the key ``'window_length'`` mapping to the candidate window lengths.

    Returns
    -------
    The candidate window length with the lowest cross-validated MAE
    (the first one wins in case of a tie).

    Raises
    ------
    ValueError
        If ``param_grid['window_length']`` is empty.
    KeyError
        If the ``lgb.cv`` results contain no mean-MAE entry.
    """
    window_lengths = list(param_grid['window_length'])
    if not window_lengths:
        raise ValueError("param_grid['window_length'] must contain at least one candidate")

    # set model parameters
    params = {"objective": "mae",
              "metrics": "mae"}

    # use cross-validation to choose best window length from param_grid
    maes = []

    for w in window_lengths:

        transform_dict = {"windows": {"window_length": w}, "deseasonalize": {"sp": 12, "seasonality_type": "additive"}}

        # only the processed table is needed here; the other return values are discarded
        Y_processed, _, _, _ = pre_process(ts_data=ts_data,
                                           target_forecast_period=h,
                                           mean_normalize=True,
                                           log=True,
                                           sp=12,
                                           transform_dict=transform_dict)

        train = Y_processed.iloc[:,:-1]
        label = Y_processed.iloc[:,-1]
        train_data = lgb.Dataset(train, label=label)

        cv_results = lgb.cv(params,
                            train_data,
                            stratified=False)

        # older LightGBM releases name the entry 'l1-mean', releases from 4.0 on 'valid l1-mean'
        for mae_key in ('l1-mean', 'valid l1-mean'):
            if mae_key in cv_results:
                break
        else:
            raise KeyError("lgb.cv results contain neither 'l1-mean' nor 'valid l1-mean'")

        # score of the last boosting round, i.e. of a model trained for the same number of rounds
        maes.append(cv_results[mae_key][-1])

    which_best = int(np.argmin(maes))

    return window_lengths[which_best]

In [None]:
# pick the better of the two candidate window lengths by cross-validated MAE
best_window_length = multivariate_lgbm_window_length(Y, h, param_grid={'window_length':[24, 36]})

In [None]:
# display the selected window length
best_window_length

In [None]:
# redo the pre-processing with the window length selected by cross-validation
transform_dict = {
    "windows": {"window_length": best_window_length},
    "deseasonalize": {"sp": 12, "seasonality_type": "additive"},
}

Y_processed, last_window_dt, last_window, full_lags = pre_process(
    ts_data=Y,
    target_forecast_period=h,
    mean_normalize=True,
    log=True,
    sp=12,
    transform_dict=transform_dict,
)

In [None]:
def multivariate_lgbm_forecast(ts_data, last_window_dt, num_series, horizon=None):
    """
    Train the multivariate LGBM model and forecast one value per series.

    The window length chosen by cross-validation is applied during
    pre-processing, so ``ts_data`` must be the *processed* table: its last
    column is used as the label and all other columns as features.

    Parameters
    ----------
    ts_data : pandas.DataFrame
        Processed training table (features followed by the label column).
    last_window_dt :
        Model inputs for the forecast; ``last_window_dt[i].index[-1]`` must give
        the last time index of series ``i``.
    num_series : int
        Number of series to forecast.
    horizon : int, optional
        Offset added to the last time index of each series to label its forecast.
        When omitted, the notebook-level ``h`` is used, as before this parameter existed.

    Returns
    -------
    list of pandas.Series
        One single-element Series per series, indexed by its forecast time index.
    """
    # fall back to the notebook-level horizon so existing three-argument calls keep working
    step = h if horizon is None else horizon

    # set model parameters
    params = {"objective": "mae",
              "metrics": "mae"}

    train = ts_data.iloc[:,:-1]
    label = ts_data.iloc[:,-1]

    train_data = lgb.Dataset(train, label=label)

    bst = lgb.train(params, train_data)

    fcasts = [pd.Series(x) for x in bst.predict(last_window_dt)]

    fcast_indexes = [last_window_dt[i].index[-1]+step for i in range(num_series)]

    # add correct time index back to forecasts
    for i in range(num_series):
        fcasts[i].index = [fcast_indexes[i]]

    return fcasts

In [None]:
# derive the number of series from the training data instead of hard-coding it (was 474)
fcasts = multivariate_lgbm_forecast(Y_processed, last_window_dt, len(Y))

In [None]:
# train = Y_processed.iloc[:,:-1]
# label = Y_processed.iloc[:,-1]

# train_data = lgb.Dataset(train, label=label)
# # validation_data = train_data.create_valid(train, label)
# # validation_data = lgb.Dataset('validation.svm', reference=train_data)

### Set Parameters for LGBM Model

In [None]:
# params = {"objective": "mae", 
#           "metrics": "mae"}

### Run with 5-fold cross-validation

In [None]:
# bst = lgb.cv(params,
#              train_data,
#              stratified=False,
#              callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [None]:
# lowest mean validation MAE over the boosting rounds
# NOTE(review): this expects `bst` to be the dict returned by lgb.cv, but the lgb.cv cell above is
# commented out - confirm which object `bst` holds when this cell runs
best_mae = np.min(bst['l1-mean'])

In [None]:
# rebuild the Dataset from the current Y_processed: the earlier train_data was created before
# Y_processed was re-computed, so its features may not match last_window_dt any more
train_data = lgb.Dataset(Y_processed.iloc[:, :-1], label=Y_processed.iloc[:, -1])
bst = lgb.train(params, train_data)

In [None]:
# predict from the model inputs returned by pre_process
fcasts = bst.predict(last_window_dt)

In [None]:
# wrap every predicted value in its own one-element Series
fcasts = [pd.Series(value) for value in fcasts]

In [None]:
# each series gets a single forecast located h steps after the end of its last window, so each
# index holds exactly one label (same values as the previous np.arange for h = 1);
# the count comes from the forecasts instead of a hard-coded 474
fcast_indexes = [[last_window[i].index[-1]+h] for i in range(len(fcasts))]

In [None]:
# add correct time index back to forecasts (loop over however many forecasts there are, not a fixed 474)
for i in range(len(fcasts)):
    fcasts[i].index = fcast_indexes[i]

In [None]:
# undo the pre-processing on the forecasts; target_forecast_period is passed explicitly,
# matching the other post_process calls in this notebook
fcasts = post_process(full_ts_data=Y,
                      target_forecast_period=h,
                      forecasts=fcasts,
                      last_window_with_trend=last_window,
                      mean_normalize=True,
                      log=True,
                      sp=12,
                      transform_dict=transform_dict)

In [None]:
# MAE of the post-processed forecasts against the held-out observations
mean_absolute_error(Test, fcasts, multioutput="uniform_average")

***

***

### Use Cross-validation to choose window length

In [None]:
# candidate window lengths to compare by cross-validation
window_lengths = [24, 30, 36]

In [None]:
# maes = []

# for w in window_lengths:

#     transform_dict = {"windows": {"window_length": w}, "deseasonalize": {"sp": 12, "seasonality_type": "additive"}}
    
#     Y_processed, last_window_dt, last_window, full_lags = pre_process(ts_data=Y,
#                                                                       mean_normalize=True,
#                                                                       log=True,
#                                                                       sp=12,
#                                                                       transform_dict=transform_dict)
    
#     train = Y_processed.iloc[:,:-1]
#     label = Y_processed.iloc[:,-1]
#     train_data = lgb.Dataset(train, label=label)
    
#     bst = lgb.cv(params,
#                  train_data,
#                  stratified=False)
    
#     best_mae = bst['l1-mean'][-1]
#     maes.append(best_mae)

In [None]:
# NOTE(review): `maes` is only assigned inside the commented-out loop in the cell above,
# so it is undefined on a fresh kernel - re-enable that loop before running this cell
maes

In [None]:
# position of the smallest cross-validated MAE, used to index window_lengths
which_best = np.argmin(maes)

In [None]:
# redo the pre-processing with the best window length; target_forecast_period is passed
# explicitly, matching the other pre_process calls in this notebook
transform_dict = {"windows": {"window_length": window_lengths[which_best]}, "deseasonalize": {"sp": 12, "seasonality_type": "additive"}}

Y_processed, last_window_dt, last_window, full_lags = pre_process(ts_data=Y,
                                                                  target_forecast_period=h,
                                                                  mean_normalize=True,
                                                                  log=True,
                                                                  sp=12,
                                                                  transform_dict=transform_dict)

In [None]:
# features are all columns but the last; the last column is the regression target
train = Y_processed.iloc[:,:-1]
label = Y_processed.iloc[:,-1]

# LightGBM dataset built from the freshly processed table
train_data = lgb.Dataset(train, label=label)
# validation_data = train_data.create_valid(train, label)
# validation_data = lgb.Dataset('validation.svm', reference=train_data)

In [None]:
# fit the final model on the full training table (num_boost_round left at the library default)
bst = lgb.train(params, train_data)

In [None]:
# predict from the model inputs returned by pre_process
fcasts = bst.predict(last_window_dt)

In [None]:
# wrap every predicted value in its own one-element Series
fcasts = [pd.Series(value) for value in fcasts]

In [None]:
# fcast_indexes = [np.arange(last_window[i].index[-1]+1, last_window[i].index[-1]+h+1) for i in range(474)]

In [None]:
# time index of each forecast: h steps after the end of the series' last window;
# the count comes from the forecasts because `num_series` is not defined at notebook level
fcast_indexes = [last_window[i].index[-1]+h for i in range(len(fcasts))]

In [None]:
# add correct time index back to forecasts; each entry of fcast_indexes is a single label,
# so it is wrapped in a list (a Series index cannot be assigned a bare scalar)
for i in range(len(fcasts)):
    fcasts[i].index = [fcast_indexes[i]]

In [None]:
# undo the pre-processing on the forecasts, using the same settings that were passed to pre_process
fcasts = post_process(
    full_ts_data=Y,
    target_forecast_period=h,
    forecasts=fcasts,
    last_window_with_trend=last_window,
    mean_normalize=True,
    log=True,
    sp=12,
    transform_dict=transform_dict,
)

In [None]:
# MAE of the post-processed forecasts against the held-out observations
mean_absolute_error(Test, fcasts, multioutput="uniform_average")

***

In [None]:
# direct modelling: run the full forecasting pipeline once for every horizon step 1..H
H = 18
fcasts = [
    full_forecast_analysis(
        Y=Y,
        h=step,
        forecasting_model="Multivariate_LGBM",
        window_length=None,
        make_stationary=False,
        seasonality_type="additive",
        sp=12,
        remove_seasonality=True,
        mean_normalize=True,
        log=True,
        param_grid={'window_length':[30]},
    )
    for step in range(1, H + 1)
]

In [None]:
# combine fcast dataframes into one by stacking the per-horizon results along axis 0.
# Note: this rebinds `fcasts`, so the cell cannot be re-run without re-running the cell above.
fcasts = pd.concat(fcasts, axis=0)

In [None]:
# display the combined forecasts
fcasts

In [None]:
# display the held-out test observations
Test

In [None]:
# MAE of the combined multi-horizon forecasts against the held-out observations
# NOTE(review): `Test` was built near the top of the notebook with h = 1 while these forecasts
# cover H = 18 horizons - presumably Y and Test must be re-split with the last H observations
# held out before this comparison is meaningful; confirm
mean_absolute_error(Test, fcasts, multioutput="uniform_average")