## Analyzing the Effects of Top/Bottom Coding on the Accuracy of Global LGBM Forecasts

***

## Import Modules

In [1]:
# general modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# light gradient boosting model package
import lightgbm as lgb

##### the `helper_functions.py` file contains many custom functions we wrote to aid in our analysis
##### `full_coding_analysis` combines all of the following - train-test split data,
##### data protection, train models, compare accuracies, return accuracy results
from helper_functions import full_coding_analysis
from helper_functions import *

# import detrender
from sktime.transformations.series.detrend import Detrender
# nice time series plots
from sktime.utils.plotting import plot_series

## Import data

In [2]:
# load the cleaned weekly finance series (header row skipped) straight into a DataFrame
Y = pd.DataFrame(
    np.genfromtxt(
        "../../Data/Train/Clean/weekly_finance_clean.csv",
        delimiter=',',
        skip_header=1,
    )
)

This notebook experiments with applying top and bottom coding to detrended data. Intuition suggests that detrended series are better candidates for this type of protection than the original finance series, so we first remove the trend from the finance data here.

In [3]:
# one Detrender instance is re-fit on every row of Y (each row is handled as its own series)
detrender = Detrender()
# detrend each row, then shift it by |min| + 1.0 before collecting it
detrended_series = [
    resid + np.abs(np.min(resid)) + 1.0
    for resid in (detrender.fit_transform(row) for _, row in Y.iterrows())
]
# reassemble with one series per row, replacing the original Y
Y = pd.concat(detrended_series, axis=1).T

***

We obtain results for a combination of forecast horizons, coding types (top and bottom), coding percentages, and model complexities:

* Forecast Horizons: (1, 20)
* Coding Types: (Top, Bottom)
* Coding Percentages: (0.10, 0.20, 0.40)
* Model complexities (window length): (10, 20, 40)

## Simple Model (window length = 10)

In [4]:
# simple model: window length of 10, LightGBM regressor built with no arguments
window_length = 10
forecaster = lgb.LGBMRegressor()

In [5]:
# empty containers, filled per scenario by the grid loop that follows
results_dict_10, fcasts_10, fcasts_protected_10, tests = {}, {}, {}, {}

# experiment grid: coding type x coding percentage x forecast horizon
types = ["Top", "Bottom"]
percentages = [0.10, 0.20, 0.40]
horizons = [1, 20]

In [6]:
# run the full coding analysis for every (type, percentage, horizon) combination;
# each scenario is stored under a key of the form "h=<horizon>, <type> <percentage>"
for t in types:
    for p in percentages:
        for h in horizons:
            idx = f"h={h}, {t} {p}"
            (results_dict_10[idx],
             tests[idx],
             fcasts_10[idx],
             fcasts_protected_10[idx]) = full_coding_analysis(
                time_series_data=Y,
                forecasting_model=forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

***

In [7]:
# display the per-scenario accuracy summaries collected for window length 10
results_dict_10

{'h=1, Top 0.1': {'Mean Accuracies': array([0.3457, 0.3457]),
  'Protected Mean Accuracies:': array([0.3675, 0.3675]),
  '% Change Mean accuracy:': array([-0.063, -0.063]),
  '% Change Median accuracy:': array([-0.2091, -0.2091]),
  '% Forecasted Points adjusted downward:': 0.3537,
  '% Forecasted Points adjusted upward:': 0.6463,
  '% Series with improved accuracy:': array([0.4329, 0.4329]),
  '% Series with reduced accuracy:': array([0.5671, 0.5671]),
  'Original Mean Absolute Error Upward Adjusted:': 0.351,
  'Original Mean Absolute Error Downward Adjusted:': 0.336,
  'Protected Mean Absolute Error Upward Adjusted:': 0.3625,
  'Protected Mean Absolute Error Downward Adjusted:': 0.3765},
 'h=20, Top 0.1': {'Mean Accuracies': array([0.485 , 0.5894]),
  'Protected Mean Accuracies:': array([0.4547, 0.5566]),
  '% Change Mean accuracy:': array([0.0624, 0.0557]),
  '% Change Median accuracy:': array([0.1249, 0.0919]),
  '% Forecasted Points adjusted downward:': 0.421,
  '% Forecasted Poin

***
***

## 'Medium' Model (window length = 20)

In [8]:
# medium model: window length of 20, LightGBM regressor built with no arguments
window_length = 20
forecaster = lgb.LGBMRegressor()

In [9]:
# empty containers, filled per scenario by the grid loop that follows
# (note: `tests` is reset here, discarding the entries from the previous section)
results_dict_20, fcasts_20, fcasts_protected_20, tests = {}, {}, {}, {}

# experiment grid: coding type x coding percentage x forecast horizon
types = ["Top", "Bottom"]
percentages = [0.10, 0.20, 0.40]
horizons = [1, 20]

In [10]:
# run the full coding analysis for every (type, percentage, horizon) combination;
# each scenario is stored under a key of the form "h=<horizon>, <type> <percentage>"
for t in types:
    for p in percentages:
        for h in horizons:
            idx = f"h={h}, {t} {p}"
            (results_dict_20[idx],
             tests[idx],
             fcasts_20[idx],
             fcasts_protected_20[idx]) = full_coding_analysis(
                time_series_data=Y,
                forecasting_model=forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [11]:
# display the per-scenario accuracy summaries collected for window length 20
results_dict_20

{'h=1, Top 0.1': {'Mean Accuracies': array([0.3431, 0.3431]),
  'Protected Mean Accuracies:': array([0.3648, 0.3648]),
  '% Change Mean accuracy:': array([-0.0634, -0.0634]),
  '% Change Median accuracy:': array([-0.1924, -0.1924]),
  '% Forecasted Points adjusted downward:': 0.4146,
  '% Forecasted Points adjusted upward:': 0.5854,
  '% Series with improved accuracy:': array([0.4268, 0.4268]),
  '% Series with reduced accuracy:': array([0.5732, 0.5732]),
  'Original Mean Absolute Error Upward Adjusted:': 0.3431,
  'Original Mean Absolute Error Downward Adjusted:': 0.343,
  'Protected Mean Absolute Error Upward Adjusted:': 0.3605,
  'Protected Mean Absolute Error Downward Adjusted:': 0.371},
 'h=20, Top 0.1': {'Mean Accuracies': array([0.4815, 0.5946]),
  'Protected Mean Accuracies:': array([0.4718, 0.5843]),
  '% Change Mean accuracy:': array([0.0201, 0.0175]),
  '% Change Median accuracy:': array([0.0439, 0.0638]),
  '% Forecasted Points adjusted downward:': 0.4293,
  '% Forecasted P

***
***

## More Complex Model (window length = 40)

In [12]:
# more complex model: window length of 40, LightGBM regressor built with no arguments
window_length = 40
forecaster = lgb.LGBMRegressor()

In [13]:
# empty containers, filled per scenario by the grid loop that follows
# (note: `tests` is reset here, discarding the entries from the previous section)
results_dict_40, fcasts_40, fcasts_protected_40, tests = {}, {}, {}, {}

# experiment grid: coding type x coding percentage x forecast horizon
types = ["Top", "Bottom"]
percentages = [0.10, 0.20, 0.40]
horizons = [1, 20]

In [14]:
# run the full coding analysis for every (type, percentage, horizon) combination;
# each scenario is stored under a key of the form "h=<horizon>, <type> <percentage>"
for t in types:
    for p in percentages:
        for h in horizons:
            idx = f"h={h}, {t} {p}"
            (results_dict_40[idx],
             tests[idx],
             fcasts_40[idx],
             fcasts_protected_40[idx]) = full_coding_analysis(
                time_series_data=Y,
                forecasting_model=forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [15]:
# display the per-scenario accuracy summaries collected for window length 40
results_dict_40

{'h=1, Top 0.1': {'Mean Accuracies': array([0.3589, 0.3589]),
  'Protected Mean Accuracies:': array([0.3624, 0.3624]),
  '% Change Mean accuracy:': array([-0.0099, -0.0099]),
  '% Change Median accuracy:': array([-0.0686, -0.0686]),
  '% Forecasted Points adjusted downward:': 0.4268,
  '% Forecasted Points adjusted upward:': 0.5732,
  '% Series with improved accuracy:': array([0.4756, 0.4756]),
  '% Series with reduced accuracy:': array([0.5244, 0.5244]),
  'Original Mean Absolute Error Upward Adjusted:': 0.2203,
  'Original Mean Absolute Error Downward Adjusted:': 0.545,
  'Protected Mean Absolute Error Upward Adjusted:': 0.2135,
  'Protected Mean Absolute Error Downward Adjusted:': 0.5624},
 'h=20, Top 0.1': {'Mean Accuracies': array([0.4757, 0.5814]),
  'Protected Mean Accuracies:': array([0.4513, 0.5601]),
  '% Change Mean accuracy:': array([0.0514, 0.0366]),
  '% Change Median accuracy:': array([ 0.0235, -0.0572]),
  '% Forecasted Points adjusted downward:': 0.3765,
  '% Forecaste

In [16]:
# pull the original forecasts, protected forecasts and test values for one scenario
# (window length 40, horizon 20, bottom coding at 0.1) out of their respective stores
original_forecasts, protected_forecasts, test = (
    store['h=20, Bottom 0.1'] for store in (fcasts_40, fcasts_protected_40, tests)
)

In [17]:
# boolean flags per forecasted point: did protection move the forecast up or down?
# each comparison frame is flattened row by row into a single long Series
adjusted_up, adjusted_down = (
    pd.concat([flags for _, flags in moved.iterrows()])
    for moved in (
        original_forecasts < protected_forecasts,
        original_forecasts > protected_forecasts,
    )
)

In [18]:
# point-wise absolute forecast errors against the test values, before and after protection
absolute_error_original, absolute_error_protected = (
    np.abs(test - forecasts) for forecasts in (original_forecasts, protected_forecasts)
)

In [19]:
# boolean flags per forecasted point: did the absolute error shrink (improved) or grow (worsened)
# under protection? flattened row by row, the same way as the adjustment flags
improved, worsened = (
    pd.concat([flags for _, flags in changed.iterrows()])
    for changed in (
        absolute_error_original > absolute_error_protected,
        absolute_error_original < absolute_error_protected,
    )
)

In [20]:
# among points whose absolute error decreased under protection,
# the fraction whose forecast was adjusted downward
np.mean(adjusted_down[improved])

0.28313253012048195

In [21]:
# among points whose absolute error decreased under protection,
# the fraction whose forecast was adjusted upward
np.mean(adjusted_up[improved])

0.7168674698795181