# Examining the Effects of Additive Noise on LGBM Forecast Accuracy

***

In [1]:
# general modules
import pandas as pd
import numpy as np
import lightgbm as lgb

##### The `helper_functions.py` file contains the custom functions we wrote to aid our analysis.
##### `full_coding_analysis` combines all of the following steps: splitting the data into train and test sets,
##### protecting the data, training the models, comparing accuracies, and returning the accuracy results.
from helper_functions import *

In [2]:
# Load the cleaned weekly finance time series: the CSV is parsed as
# comma-separated numbers with its first (header) line skipped, and the
# resulting array is wrapped in a DataFrame.
Y = pd.DataFrame(
    np.genfromtxt(
        "../../Data/Train/Clean/weekly_finance_clean.csv",
        delimiter=",",
        skip_header=1,
    )
)

In [3]:
# NOTE(review): optional detrending step, currently disabled. If re-enabled it
# would detrend each row of Y, shift every detrended series by |min| + 1.0, and
# rebuild Y with one series per row. `Detrender` is not imported explicitly in
# the visible cells -- presumably it arrives via `from helper_functions import *`;
# confirm before uncommenting.
# detrender = Detrender()
# detrended_series = [detrender.fit_transform(series) for _ , series in Y.iterrows()]
# detrended_series = [i+np.abs(np.min(i))+1.0 for i in detrended_series]
# Y = pd.concat(detrended_series, axis=1).T

***

## Simple Model (window length = 10)

In [4]:
# Settings for the "simple" model. Both values are handed to
# `full_coding_analysis` in the loop cell below.
# NOTE(review): window_length is presumably the number of lagged observations
# used as model inputs -- confirm in helper_functions.py.
window_length = 10

# LightGBM regressor constructed with the library's default hyperparameters.
forecaster = lgb.LGBMRegressor()

In [5]:
# Empty containers for the window-length-10 runs; each one is filled per
# "h=<horizon>, <n> stan. devs" key by the analysis loop.
results_dict_10, fcasts_10, fcasts_protected_10, tests = {}, {}, {}, {}

# Experiment grid: values passed as `num_stdev` and `forecast_horizon`.
num_stdevs = [1, 2]
horizons = [1, 20]

In [6]:
# Run `full_coding_analysis` once for every (num_stdev, horizon) combination
# using the window-length-10 settings, storing its four return values under a
# shared key.
# NOTE(review): the four values are presumably the accuracy summary, the test
# data, the forecasts from the original data and the forecasts from the
# protected data -- confirm against helper_functions.py.
for n in num_stdevs:
    for h in horizons:
        idx = f"h={h}, {n} stan. devs"
        (
            results_dict_10[idx],
            tests[idx],
            fcasts_10[idx],
            fcasts_protected_10[idx],
        ) = full_coding_analysis(
            time_series_data=Y,
            forecasting_model=forecaster,
            forecast_horizon=h,
            num_stdev=n,
            window_length=window_length,
        )

In [7]:
# Display the accuracy summaries collected for the window-length-10 runs
# (one entry per "h=<horizon>, <n> stan. devs" key).
results_dict_10

{'h=1, 1 stan. devs': {'Mean Accuracies': array([67.4396, 67.4396]),
  'Protected Mean Accuracies:': array([289.8237, 289.8237]),
  '% Change Mean accuracy:': array([-3.2975, -3.2975]),
  '% Change Median accuracy:': array([-1.4905, -1.4905]),
  '% Forecasted Points adjusted downward:': 0.5488,
  '% Forecasted Points adjusted upward:': 0.4512,
  '% Series with improved accuracy:': array([0.2683, 0.2683]),
  '% Series with reduced accuracy:': array([0.7317, 0.7317]),
  'Original Mean Absolute Error Upward Adjusted:': 93.0647,
  'Original Mean Absolute Error Downward Adjusted:': 46.3701,
  'Protected Mean Absolute Error Upward Adjusted:': 474.7413,
  'Protected Mean Absolute Error Downward Adjusted:': 137.7803},
 'h=20, 1 stan. devs': {'Mean Accuracies': array([115.1224, 138.0561]),
  'Protected Mean Accuracies:': array([266.9036, 289.6825]),
  '% Change Mean accuracy:': array([-1.3184, -1.0983]),
  '% Change Median accuracy:': array([-0.4752, -0.5819]),
  '% Forecasted Points adjusted d

***
***

## 'Medium' Model (window length = 20)

In [8]:
# Settings for the "medium" model. Both values are handed to
# `full_coding_analysis` in the loop cell below.
# NOTE(review): window_length is presumably the number of lagged observations
# used as model inputs -- confirm in helper_functions.py.
window_length = 20

# LightGBM regressor constructed with the library's default hyperparameters.
forecaster = lgb.LGBMRegressor()

In [9]:
# Empty containers for the window-length-20 runs; each one is filled per
# "h=<horizon>, <n> stan. devs" key by the analysis loop.
results_dict_20, fcasts_20, fcasts_protected_20, tests = {}, {}, {}, {}

# Experiment grid: values passed as `num_stdev` and `forecast_horizon`.
num_stdevs = [1, 2]
horizons = [1, 20]

In [10]:
# Run `full_coding_analysis` once for every (num_stdev, horizon) combination
# using the window-length-20 settings, storing its four return values under a
# shared key.
# NOTE(review): the four values are presumably the accuracy summary, the test
# data, the forecasts from the original data and the forecasts from the
# protected data -- confirm against helper_functions.py.
for n in num_stdevs:
    for h in horizons:
        idx = f"h={h}, {n} stan. devs"
        (
            results_dict_20[idx],
            tests[idx],
            fcasts_20[idx],
            fcasts_protected_20[idx],
        ) = full_coding_analysis(
            time_series_data=Y,
            forecasting_model=forecaster,
            forecast_horizon=h,
            num_stdev=n,
            window_length=window_length,
        )

In [11]:
# Display the accuracy summaries collected for the window-length-20 runs
# (one entry per "h=<horizon>, <n> stan. devs" key).
results_dict_20

{'h=1, 1 stan. devs': {'Mean Accuracies': array([64.9424, 64.9424]),
  'Protected Mean Accuracies:': array([197.0099, 197.0099]),
  '% Change Mean accuracy:': array([-2.0336, -2.0336]),
  '% Change Median accuracy:': array([-1.297, -1.297]),
  '% Forecasted Points adjusted downward:': 0.5549,
  '% Forecasted Points adjusted upward:': 0.4451,
  '% Series with improved accuracy:': array([0.2317, 0.2317]),
  '% Series with reduced accuracy:': array([0.7683, 0.7683]),
  'Original Mean Absolute Error Upward Adjusted:': 82.2189,
  'Original Mean Absolute Error Downward Adjusted:': 51.0831,
  'Protected Mean Absolute Error Upward Adjusted:': 218.9173,
  'Protected Mean Absolute Error Downward Adjusted:': 179.4357},
 'h=20, 1 stan. devs': {'Mean Accuracies': array([125.6744, 149.4553]),
  'Protected Mean Accuracies:': array([280.1729, 301.3825]),
  '% Change Mean accuracy:': array([-1.2294, -1.0165]),
  '% Change Median accuracy:': array([-0.2378, -0.1969]),
  '% Forecasted Points adjusted dow

***
***

## Complex Model (window length = 40)

In [12]:
# Settings for the "complex" model. Both values are handed to
# `full_coding_analysis` in the loop cell below.
# NOTE(review): window_length is presumably the number of lagged observations
# used as model inputs -- confirm in helper_functions.py.
window_length = 40

# LightGBM regressor constructed with the library's default hyperparameters.
forecaster = lgb.LGBMRegressor()

In [13]:
# Empty containers for the window-length-40 runs; each one is filled per
# "h=<horizon>, <n> stan. devs" key by the analysis loop.
results_dict_40, fcasts_40, fcasts_protected_40, tests = {}, {}, {}, {}

# Experiment grid: values passed as `num_stdev` and `forecast_horizon`.
num_stdevs = [1, 2]
horizons = [1, 20]

In [14]:
# Run `full_coding_analysis` once for every (num_stdev, horizon) combination
# using the window-length-40 settings, storing its four return values under a
# shared key.
# NOTE(review): the four values are presumably the accuracy summary, the test
# data, the forecasts from the original data and the forecasts from the
# protected data -- confirm against helper_functions.py.
for n in num_stdevs:
    for h in horizons:
        idx = f"h={h}, {n} stan. devs"
        (
            results_dict_40[idx],
            tests[idx],
            fcasts_40[idx],
            fcasts_protected_40[idx],
        ) = full_coding_analysis(
            time_series_data=Y,
            forecasting_model=forecaster,
            forecast_horizon=h,
            num_stdev=n,
            window_length=window_length,
        )

In [15]:
# Display the accuracy summaries collected for the window-length-40 runs
# (one entry per "h=<horizon>, <n> stan. devs" key).
results_dict_40

{'h=1, 1 stan. devs': {'Mean Accuracies': array([67.7525, 67.7525]),
  'Protected Mean Accuracies:': array([209.7504, 209.7504]),
  '% Change Mean accuracy:': array([-2.0958, -2.0958]),
  '% Change Median accuracy:': array([-1.6916, -1.6916]),
  '% Forecasted Points adjusted downward:': 0.5549,
  '% Forecasted Points adjusted upward:': 0.4451,
  '% Series with improved accuracy:': array([0.2073, 0.2073]),
  '% Series with reduced accuracy:': array([0.7927, 0.7927]),
  'Original Mean Absolute Error Upward Adjusted:': 92.3736,
  'Original Mean Absolute Error Downward Adjusted:': 48.0014,
  'Protected Mean Absolute Error Upward Adjusted:': 312.9471,
  'Protected Mean Absolute Error Downward Adjusted:': 126.9662},
 'h=20, 1 stan. devs': {'Mean Accuracies': array([166.4712, 191.1059]),
  'Protected Mean Accuracies:': array([399.1456, 444.8764]),
  '% Change Mean accuracy:': array([-1.3977, -1.3279]),
  '% Change Median accuracy:': array([-0.5531, -0.5066]),
  '% Forecasted Points adjusted d