## Analyzing the Effects of Top/Bottom Coding on the Accuracy of Global LGBM Forecasts

***

## Import Modules

In [1]:
# general modules
import numpy as np
import pandas as pd

# light gradient boosting model package
import lightgbm as lgb

##### The `helper_functions.py` file contains the custom functions we wrote to support this analysis.
##### `full_coding_analysis` runs the whole pipeline in one call: train-test split of the data,
##### data protection, model training, accuracy comparison, and return of the accuracy results.
from helper_functions import full_coding_analysis

## Import data

In [2]:
# load the weekly finance time series: skip the CSV header row, parse the
# comma-separated values with numpy, and wrap the resulting array in a DataFrame
Y = pd.DataFrame(
    np.genfromtxt(
        "../../Data/Train/Clean/weekly_finance_clean.csv",
        delimiter=",",
        skip_header=1,
    )
)

***

We obtain results for every combination of the following forecast horizons, coding types (top and bottom), coding percentages, and model complexities:

* Forecast Horizons: (1, 5, 15)
* Coding Types: (Top, Bottom)
* Coding Percentages: (0.10, 0.20, 0.40)
* Model complexities (window length): (10, 20, 40)

## Simple Model (window length = 10)

In [3]:
# window length passed to `full_coding_analysis`; this section is the window-length-10 model
window_length = 10

# LightGBM regressor created with no arguments, i.e. the library's default hyperparameters
forecaster = lgb.LGBMRegressor()

In [4]:
# experiment grid for the window-length-10 runs
horizons = [1, 5, 15]             # forecast horizons
percentages = [0.10, 0.20, 0.40]  # coding percentages
types = ["Top", "Bottom"]         # coding types

# starts empty; filled with one entry per (horizon, type, percentage) combination
results_dict_10 = dict()

In [5]:
# run the full analysis once per (coding type, coding percentage, forecast horizon)
# combination; results are stored under keys such as "h=1, Top 0.1"
for t in types:
    for p in percentages:
        for h in horizons:
            results_dict_10[f"h={h}, {t} {p}"] = full_coding_analysis(
                Y,
                forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [6]:
# display what `full_coding_analysis` returned for each "h=<horizon>, <type> <percentage>" key (window length 10)
results_dict_10

{'h=1, Top 0.1': {'% of forecasted points adjusted downward:': 55.50000000000001,
  '% of forecasted points adjusted upward:': 44.5,
  '% Series with improved accuracy:': array([31.1, 31.1]),
  '% Series with worsened accuracy:': array([68.9, 68.9]),
  '% Series with unchanged accuracy:': array([0., 0.]),
  '% Change global accuracy:': array([-51., -51.])},
 'h=5, Top 0.1': {'% of forecasted points adjusted downward:': 66.7,
  '% of forecasted points adjusted upward:': 33.300000000000004,
  '% Series with improved accuracy:': array([34.8, 36. ]),
  '% Series with worsened accuracy:': array([65.2, 64. ]),
  '% Series with unchanged accuracy:': array([0., 0.]),
  '% Change global accuracy:': array([-39.7, -35.5])},
 'h=15, Top 0.1': {'% of forecasted points adjusted downward:': 65.5,
  '% of forecasted points adjusted upward:': 34.5,
  '% Series with improved accuracy:': array([37.2, 39.6]),
  '% Series with worsened accuracy:': array([62.8, 60.4]),
  '% Series with unchanged accuracy:':

***
***

## 'Medium' Model (window length = 20)

In [7]:
# window length passed to `full_coding_analysis`; this section is the window-length-20 model
window_length = 20

# fresh LightGBM regressor created with no arguments, i.e. the library's default hyperparameters
forecaster = lgb.LGBMRegressor()

In [8]:
# experiment grid for the window-length-20 runs
horizons = [1, 5, 15]             # forecast horizons
percentages = [0.10, 0.20, 0.40]  # coding percentages
types = ["Top", "Bottom"]         # coding types

# starts empty; filled with one entry per (horizon, type, percentage) combination
results_dict_20 = dict()

In [9]:
# run the full analysis once per (coding type, coding percentage, forecast horizon)
# combination; results are stored under keys such as "h=1, Top 0.1"
for t in types:
    for p in percentages:
        for h in horizons:
            results_dict_20[f"h={h}, {t} {p}"] = full_coding_analysis(
                Y,
                forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [10]:
# display what `full_coding_analysis` returned for each "h=<horizon>, <type> <percentage>" key (window length 20)
results_dict_20

{'h=1, Top 0.1': {'% of forecasted points adjusted downward:': 54.900000000000006,
  '% of forecasted points adjusted upward:': 45.1,
  '% Series with improved accuracy:': array([33.5, 33.5]),
  '% Series with worsened accuracy:': array([66.5, 66.5]),
  '% Series with unchanged accuracy:': array([0., 0.]),
  '% Change global accuracy:': array([-50., -50.])},
 'h=5, Top 0.1': {'% of forecasted points adjusted downward:': 65.2,
  '% of forecasted points adjusted upward:': 34.8,
  '% Series with improved accuracy:': array([40.2, 40.2]),
  '% Series with worsened accuracy:': array([59.8, 59.8]),
  '% Series with unchanged accuracy:': array([0., 0.]),
  '% Change global accuracy:': array([-32.5, -28.7])},
 'h=15, Top 0.1': {'% of forecasted points adjusted downward:': 65.4,
  '% of forecasted points adjusted upward:': 34.599999999999994,
  '% Series with improved accuracy:': array([39. , 41.5]),
  '% Series with worsened accuracy:': array([61. , 58.5]),
  '% Series with unchanged accuracy:'

***
***

## More Complex Model (window length = 40)

In [11]:
# window length passed to `full_coding_analysis`; this section is the window-length-40 model
window_length = 40

# fresh LightGBM regressor created with no arguments, i.e. the library's default hyperparameters
forecaster = lgb.LGBMRegressor()

In [12]:
# experiment grid for the window-length-40 runs
horizons = [1, 5, 15]             # forecast horizons
percentages = [0.10, 0.20, 0.40]  # coding percentages
types = ["Top", "Bottom"]         # coding types

# starts empty; filled with one entry per (horizon, type, percentage) combination
results_dict_40 = dict()

In [13]:
# run the full analysis once per (coding type, coding percentage, forecast horizon)
# combination; results are stored under keys such as "h=1, Top 0.1"
for t in types:
    for p in percentages:
        for h in horizons:
            results_dict_40[f"h={h}, {t} {p}"] = full_coding_analysis(
                Y,
                forecaster,
                forecast_horizon=h,
                coding_type=t,
                coding_percentage=p,
                window_length=window_length,
            )

In [14]:
# display what `full_coding_analysis` returned for each "h=<horizon>, <type> <percentage>" key (window length 40)
results_dict_40

{'h=1, Top 0.1': {'% of forecasted points adjusted downward:': 62.2,
  '% of forecasted points adjusted upward:': 37.8,
  '% Series with improved accuracy:': array([30.5, 30.5]),
  '% Series with worsened accuracy:': array([69.5, 69.5]),
  '% Series with unchanged accuracy:': array([0., 0.]),
  '% Change global accuracy:': array([-62.1, -62.1])},
 'h=5, Top 0.1': {'% of forecasted points adjusted downward:': 67.9,
  '% of forecasted points adjusted upward:': 32.1,
  '% Series with improved accuracy:': array([39. , 36.6]),
  '% Series with worsened accuracy:': array([61. , 63.4]),
  '% Series with unchanged accuracy:': array([0., 0.]),
  '% Change global accuracy:': array([-32.5, -28.2])},
 'h=15, Top 0.1': {'% of forecasted points adjusted downward:': 65.60000000000001,
  '% of forecasted points adjusted upward:': 34.4,
  '% Series with improved accuracy:': array([37.2, 37.8]),
  '% Series with worsened accuracy:': array([62.8, 62.2]),
  '% Series with unchanged accuracy:': array([0., 