## Restructure Data Analysis Framework

***

In [1]:
import pandas as pd
import numpy as np

from data_protection_functions import *
from data_processing_functions import *
from forecasting_functions import *

# nice time series plots
from sktime.utils.plotting import plot_series

import sktime
import lightgbm

from sktime.performance_metrics.forecasting import mean_absolute_percentage_error, median_absolute_percentage_error

### Step 1: Import Time Series Data

In [2]:
# import weekly finance time series
# ignore header and skip the first row to use integers as column names
full_data = pd.read_csv("../../Data/Train/Clean/full_m3_monthly_micro_clean.csv", header=None, skiprows=1)

In [3]:
# convert to a list of series, potentially with different lengths
full_data = [x.dropna() for _, x in full_data.iterrows()]

In [4]:
# forecast horizon: the number of trailing observations held out of each
# series as test data, also passed to the forecaster as its horizon length
h = 1

In [5]:
# training part: every observation except the final h of each series
Y = [series.iloc[:-h] for series in full_data]
# held-out part: the final h observations of each series
Test = [series.iloc[-h:] for series in full_data]

***

### Step 2: Apply Data Protection to Generate Protected Series

At the end of this step, we will have two time series datasets:

* Confidential (original) dataset
* Protected dataset

In [6]:
# label for this protection setting; it is passed to train_and_forecast and
# used in the file name of the saved protected forecasts
protection_method = "Top_10"

# top coding at 10% -- the exact rule lives in apply_data_protection (TODO confirm)
Y_protected = apply_data_protection(
    Y,
    coding_type="Top",
    coding_percentage=0.10,
)

***

### Step 3: Pre-process the Confidential and Protected Datasets

In [7]:
window_length = 20

# transformation options handed to both pre_process and post_process,
# keyed by transformation name
transform_dict = {
    "windows": {"window_length": window_length},
    "deseasonalize": {"sp": 12, "seasonality_type": "additive"},
}

In [8]:
# Alternative configuration, left disabled: an empty dict, presumably to skip
# the windowing and deseasonalizing transforms -- TODO confirm against pre_process.
# transform_dict = {}

In [9]:
# both datasets go through pre_process with exactly the same options
preprocess_options = dict(mean_normalize=True, log=True, transform_dict=transform_dict)

# confidential (original) series
Y_processed, Y_last_window, Y_last_window_trend = pre_process(Y, **preprocess_options)

# protected series
(Y_protected_processed,
 Y_protected_last_window,
 Y_protected_last_window_trend) = pre_process(Y_protected, **preprocess_options)

### Step 4: Train Models and Generate Forecasts

In [10]:
forecasting_model = "Multivariate_LGBM"

forecasts_original = train_and_forecast(ts_data=Y_processed, 
                                        horizon_length=h, 
                                        forecasting_model=forecasting_model, 
                                        protection_method=protection_method, 
                                        last_window=Y_last_window)

forecasts_protected = train_and_forecast(ts_data=Y_protected_processed, 
                                         horizon_length=h, 
                                         forecasting_model=forecasting_model, 
                                         protection_method=protection_method, 
                                         last_window=Y_protected_last_window)

### Step 5: Post Process the Forecasts

In [11]:
forecasts_original = post_process(full_ts_data=Y, 
                                  forecasts=forecasts_original, 
                                  last_window_with_trend=Y_last_window_trend,
                                  mean_normalize=True,
                                  log=True,
                                  transform_dict=transform_dict)

In [12]:
forecasts_protected = post_process(full_ts_data=Y_protected, 
                                   forecasts=forecasts_protected, 
                                   last_window_with_trend=Y_protected_last_window_trend,
                                   mean_normalize=True,
                                   log=True,
                                   transform_dict=transform_dict)

### Step 6: Saving Forecasts

In [13]:
# save the forecasts to CSV files named after the model, the horizon and
# (for the protected run) the protection method
output_prefix = "../../Outputs/Forecasts/" + forecasting_model + "_" + "h" + str(h)

forecasts_original.to_csv(output_prefix + "_original.csv")
forecasts_protected.to_csv(output_prefix + "_" + protection_method + ".csv")

### Step 7: Forecast Evaluation

In [14]:
# TODO: things to calculate
#   - series-level MAPE and MdAPE
#   - the percentage of series whose accuracy improved / worsened

In [15]:
Test = pd.DataFrame([x.reset_index(drop=True) for x in Test]).T

In [16]:
forecast_results(test_data=Test, original_forecasts=forecasts_original, protected_forecasts=forecasts_protected)

{'Global MAPE': 0.2265,
 'Global Protected MAPE': 0.2232,
 'Global MAE': 666.7842,
 'Global Protected MAE': 646.5698,
 'Global MdAE': 666.7842,
 'Global Protected MdAE': 646.5698,
 'Original MAE Up': 655.2745,
 'Protected MAE Up': 655.0543,
 'Original MAE Down': 676.6691,
 'Protected MAE Down': 639.2831,
 'Original MdAE Up': 379.6991,
 'Protected MdAE Up': 389.3447,
 'Original MdAE Down': 413.2852,
 'Protected MdAE Down': 386.5654}

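***

### Consolidated Run Using `full_forecast_analysis`

The cells below repeat the workflow above through a single call to `full_forecast_analysis`; the reported metrics match the step-by-step results.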

In [2]:
# import weekly finance time series
# ignore header and skip the first row to use integers as column names
full_data = pd.read_csv("../../Data/Train/Clean/full_m3_monthly_micro_clean.csv", header=None, skiprows=1)

In [3]:
# convert to a list of series, potentially with different lengths
full_data = [x.dropna() for _, x in full_data.iterrows()]

In [17]:
# --- data split and labels ---
h = 1                                    # forecast horizon (observations held out per series)
protection_method = "Top_10"             # label of the protection setting
forecasting_model = "Multivariate_LGBM"  # model identifier handed to the analysis function

# --- pre-processing options ---
window_length = 20
remove_seasonality = True
seasonality_type = "additive"
sp = 12                                  # presumably the seasonal period (monthly data) -- TODO confirm
mean_normalize = True

In [18]:
# training part: every observation except the final h of each series
Y = [series.iloc[:-h] for series in full_data]
# held-out part: the final h observations of each series
Test = [series.iloc[-h:] for series in full_data]

In [19]:
# top coding at 10% -- the exact rule lives in apply_data_protection (TODO confirm)
Y_protected = apply_data_protection(
    Y,
    coding_type="Top",
    coding_percentage=0.10,
)

In [21]:
# settings forwarded unchanged to the analysis function; note that `log` is
# given as a literal here while every other option comes from a variable
analysis_options = {
    "h": h,
    "protection_method": protection_method,
    "forecasting_model": forecasting_model,
    "window_length": window_length,
    "seasonality_type": seasonality_type,
    "sp": sp,
    "remove_seasonality": remove_seasonality,
    "mean_normalize": mean_normalize,
    "log": True,
}

# the returned metrics are displayed as the cell output
full_forecast_analysis(Y=Y, Y_protected=Y_protected, Test=Test, **analysis_options)

{'Global MAPE': 0.2265,
 'Global Protected MAPE': 0.2232,
 'Global MAE': 666.7842,
 'Global Protected MAE': 646.5698,
 'Global MdAE': 666.7842,
 'Global Protected MdAE': 646.5698,
 'Original MAE Up': 655.2745,
 'Protected MAE Up': 655.0543,
 'Original MAE Down': 676.6691,
 'Protected MAE Down': 639.2831,
 'Original MdAE Up': 379.6991,
 'Protected MdAE Up': 389.3447,
 'Original MdAE Down': 413.2852,
 'Protected MdAE Down': 386.5654}