## Analyzing the Effects of Top/Bottom Coding on The Accuracy of Global LGBM Forecasts

***

## Import Modules

In [1]:
# general modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# light gradient boosting model package
import lightgbm as lgb

##### the `helper_functions.py` file contains many custom functions we wrote to aid in our analysis
##### `full_coding_analysis` combines all of the following - train-test split data,
##### data protection, train models, compare accuracies, return accuracy results
from helper_functions import full_coding_analysis
from helper_functions import *

# import detrender
from sktime.transformations.series.detrend import Detrender
# nice time series plots
from sktime.utils.plotting import plot_series

## Import data

In [2]:
# load the weekly finance time series (header row skipped) directly into a DataFrame
Y = pd.DataFrame(
    np.genfromtxt(
        "../../Data/Train/Clean/weekly_finance_clean.csv",
        delimiter=",",
        skip_header=1,
    )
)

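This notebook experiments with applying top and bottom coding to detrended data: intuition suggests that detrended series are a better candidate for this type of protection than the original finance series. The next cell contains the code that removes the trend from the finance data. Note that this code is currently commented out, so, as written, the analysis below runs on the original (non-detrended) series.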

In [3]:
# NOTE: detrending is currently disabled -- every statement in this cell is commented
# out, so `Y` reaches the analysis cells unchanged. When uncommented, the lines below
# detrend each row of `Y`, shift each detrended series upward by |min| + 1.0, and
# rebuild `Y` with one series per row.
# detrender = Detrender()
# detrended_series = [detrender.fit_transform(series) for _ , series in Y.iterrows()]
# detrended_series = [i+np.abs(np.min(i))+1.0 for i in detrended_series]
# Y = pd.concat(detrended_series, axis=1).T

***

We obtain results for a combination of forecast horizons, coding types (top and bottom), coding percentages, and model complexities:

* Forecast Horizons: (1, 20)
* Coding Types: (Top, Bottom)
* Coding Percentages: (0.10, 0.20, 0.40)
* Model complexities (window length): (10, 20, 40)

## Simple Model (window length = 10)

In [4]:
# window length for the 'simple' model (forwarded to `full_coding_analysis`)
window_length = 10

# LightGBM regressor constructed with its default hyperparameters
forecaster = lgb.LGBMRegressor()

In [5]:
# result containers for the window-length-10 runs; each one is a separate empty dict
results_dict_10, fcasts_10, fcasts_protected_10, tests = {}, {}, {}, {}

# experiment grid: every combination of these values is analysed
horizons = [1, 20]
types = ["Top", "Bottom"]
percentages = [0.10, 0.20, 0.40]

In [6]:
# run the full coding analysis for every (coding type, percentage, horizon) combination
for t in types:
    for p in percentages:
        for h in horizons:
            # key identifying this combination, e.g. "h=1, Top 0.1"
            idx = f"h={h}, {t} {p}"
            (
                results_dict_10[idx],
                tests[idx],
                fcasts_10[idx],
                fcasts_protected_10[idx],
            ) = full_coding_analysis(
                time_series_data=Y,
                forecasting_model=forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [7]:
# display the window-length-10 results, keyed by "h=<horizon>, <coding type> <percentage>"
results_dict_10

{'h=1, Top 0.1': {'Mean Accuracies': array([67.4396, 67.4396]),
  'Protected Mean Accuracies:': array([112.4631, 112.4631]),
  '% Change Mean accuracy:': array([-0.6676, -0.6676]),
  '% Change Median accuracy:': array([-1.2035, -1.2035]),
  '% Forecasted Points adjusted downward:': 0.6829,
  '% Forecasted Points adjusted upward:': 0.3171,
  '% Series with improved accuracy:': array([0.3354, 0.3354]),
  '% Series with reduced accuracy:': array([0.6646, 0.6646]),
  'Original Mean Absolute Error Upward Adjusted:': 118.0343,
  'Original Mean Absolute Error Downward Adjusted:': 43.9492,
  'Protected Mean Absolute Error Upward Adjusted:': 110.2835,
  'Protected Mean Absolute Error Downward Adjusted:': 113.4751},
 'h=20, Top 0.1': {'Mean Accuracies': array([115.1224, 138.0561]),
  'Protected Mean Accuracies:': array([134.0747, 155.5238]),
  '% Change Mean accuracy:': array([-0.1646, -0.1265]),
  '% Change Median accuracy:': array([-0.0703, -0.0654]),
  '% Forecasted Points adjusted downward:'

***
***

## 'Medium' Model (window length = 20)

In [8]:
# window length for the 'medium' model (forwarded to `full_coding_analysis`)
window_length = 20

# fresh LightGBM regressor constructed with its default hyperparameters
forecaster = lgb.LGBMRegressor()

In [9]:
# result containers for the window-length-20 runs; each one is a separate empty dict
# (note: `tests` is rebound to a new empty dict here)
results_dict_20, fcasts_20, fcasts_protected_20, tests = {}, {}, {}, {}

# experiment grid: every combination of these values is analysed
horizons = [1, 20]
types = ["Top", "Bottom"]
percentages = [0.10, 0.20, 0.40]

In [10]:
# run the full coding analysis for every (coding type, percentage, horizon) combination
for t in types:
    for p in percentages:
        for h in horizons:
            # key identifying this combination, e.g. "h=1, Top 0.1"
            idx = f"h={h}, {t} {p}"
            (
                results_dict_20[idx],
                tests[idx],
                fcasts_20[idx],
                fcasts_protected_20[idx],
            ) = full_coding_analysis(
                time_series_data=Y,
                forecasting_model=forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [11]:
# display the window-length-20 results, keyed by "h=<horizon>, <coding type> <percentage>"
results_dict_20

{'h=1, Top 0.1': {'Mean Accuracies': array([64.9424, 64.9424]),
  'Protected Mean Accuracies:': array([105.4467, 105.4467]),
  '% Change Mean accuracy:': array([-0.6237, -0.6237]),
  '% Change Median accuracy:': array([-1.1011, -1.1011]),
  '% Forecasted Points adjusted downward:': 0.689,
  '% Forecasted Points adjusted upward:': 0.311,
  '% Series with improved accuracy:': array([0.372, 0.372]),
  '% Series with reduced accuracy:': array([0.628, 0.628]),
  'Original Mean Absolute Error Upward Adjusted:': 93.0727,
  'Original Mean Absolute Error Downward Adjusted:': 52.2464,
  'Protected Mean Absolute Error Upward Adjusted:': 84.1944,
  'Protected Mean Absolute Error Downward Adjusted:': 115.0384},
 'h=20, Top 0.1': {'Mean Accuracies': array([125.6744, 149.4553]),
  'Protected Mean Accuracies:': array([146.8518, 169.8595]),
  '% Change Mean accuracy:': array([-0.1685, -0.1365]),
  '% Change Median accuracy:': array([-0.036 , -0.0328]),
  '% Forecasted Points adjusted downward:': 0.7381

***
***

## More Complex Model (window length = 40)

In [12]:
# window length for the 'more complex' model (forwarded to `full_coding_analysis`)
window_length = 40

# fresh LightGBM regressor constructed with its default hyperparameters
forecaster = lgb.LGBMRegressor()

In [13]:
# result containers for the window-length-40 runs; each one is a separate empty dict
# (note: `tests` is rebound to a new empty dict here)
results_dict_40, fcasts_40, fcasts_protected_40, tests = {}, {}, {}, {}

# experiment grid: every combination of these values is analysed
horizons = [1, 20]
types = ["Top", "Bottom"]
percentages = [0.10, 0.20, 0.40]

In [14]:
# run the full coding analysis for every (coding type, percentage, horizon) combination
for t in types:
    for p in percentages:
        for h in horizons:
            # key identifying this combination, e.g. "h=1, Top 0.1"
            idx = f"h={h}, {t} {p}"
            (
                results_dict_40[idx],
                tests[idx],
                fcasts_40[idx],
                fcasts_protected_40[idx],
            ) = full_coding_analysis(
                time_series_data=Y,
                forecasting_model=forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [15]:
# display the window-length-40 results, keyed by "h=<horizon>, <coding type> <percentage>"
results_dict_40

{'h=1, Top 0.1': {'Mean Accuracies': array([67.7525, 67.7525]),
  'Protected Mean Accuracies:': array([111.5692, 111.5692]),
  '% Change Mean accuracy:': array([-0.6467, -0.6467]),
  '% Change Median accuracy:': array([-1.4123, -1.4123]),
  '% Forecasted Points adjusted downward:': 0.75,
  '% Forecasted Points adjusted upward:': 0.25,
  '% Series with improved accuracy:': array([0.311, 0.311]),
  '% Series with reduced accuracy:': array([0.689, 0.689]),
  'Original Mean Absolute Error Upward Adjusted:': 99.1719,
  'Original Mean Absolute Error Downward Adjusted:': 57.2793,
  'Protected Mean Absolute Error Upward Adjusted:': 91.8678,
  'Protected Mean Absolute Error Downward Adjusted:': 118.1363},
 'h=20, Top 0.1': {'Mean Accuracies': array([166.4712, 191.1059]),
  'Protected Mean Accuracies:': array([181.7919, 205.7167]),
  '% Change Mean accuracy:': array([-0.092 , -0.0765]),
  '% Change Median accuracy:': array([ 0.0239, -0.005 ]),
  '% Forecasted Points adjusted downward:': 0.7399,
