In [131]:
from statsmodels.tsa.vector_ar.var_model import VAR

# load modules
import numpy as np
import pandas as pd

from data_protection_functions import *
from data_processing_functions import *
# from forecasting_functions import *

# nice time series plots
from sktime.utils.plotting import plot_series

from sktime.performance_metrics.forecasting import mean_absolute_error

***

In [132]:
# import weekly finance time series
# ignore header and skip the first row to use integers as column names
full_data = pd.read_csv("../../Data/Train/Clean/full_m3_monthly_micro_clean.csv", header=None, skiprows=1)

In [133]:
# convert to a list of series, potentially with different lengths
full_data = [x.dropna() for _, x in full_data.iterrows()]

In [134]:
# forecast horizon
h = 1

In [135]:
Y = [x.iloc[:-h] for x in full_data]
test = pd.read_csv("../../Outputs/Forecasts/Test_h1.csv")

***

### Apply Data Protection

In [136]:
epsilons = [1]

In [137]:
eps = epsilons[0]
Y_protected = apply_data_protection(Y, epsilon=eps)

***

### Extract A Subset of the Data

In [138]:
Y = Y[:5]
Y_protected = Y_protected[:5]

***

### Pre-Process the Data

In [139]:
Y_processed, Y_last_window, Y_last_window_trend, _, full_lags = pre_process(ts_data=Y,
                                                                            target_forecast_period=h,
                                                                            log=True,
                                                                            make_stationary=False,
                                                                            sp=12)

Y_protected_processed, Y_protected_last_window, Y_protected_last_window_trend, _, full_lags_protected = pre_process(ts_data=Y_protected,
                                                                                                                    target_forecast_period=h,
                                                                                                                    log=True,
                                                                                                                    make_stationary=False,
                                                                                                                    sp=12)

### Train Models and Generate Forecasts

In [140]:
def VAR_forecast(ts_data, h, param_grid, noisy_protection=False):
    
    intercept_list = []
    lag_list = []
    coef_list = []
    fitted_list = []

    # get the length of each series
    lengths = [len(y) for y in ts_data]

    # store the unique length values
    unique_lengths = np.unique(lengths)

    # store the forecasts in an array of all forecasts using the stored series indices
    full_forecasts = np.zeros([len(ts_data), h])

    for k, l in enumerate(unique_lengths):

        # get the indexes of each series with the lth length
        Y_ids = np.nonzero(lengths == l)[0]

        split_ids = split(Y_ids, 5)

        for i, j in enumerate(split_ids):
                
            # store series in a list
            group = [ts_data[m].reset_index(drop=True) for m in j]

            # convert list to TxK dataframe
            group = pd.concat(group, axis=1, ignore_index=True)
            
            ####################################################
            
            forecaster = VAR(endog=group)
            results = forecaster.fit()
            
            fitted = results.fittedvalues
            fitted_list = fitted
            
            # extract intercept coefficients
            intercepts = results.coefs_exog
            intercept_list.append(intercepts)
            
            # extract lag coefficients
            coefs = results.coefs
            coef_list.append(coefs)
            
            # number of lags in VAR model
            lag_order = results.k_ar
            lag_list.append(lag_order)
            
            # forecast nfs steps ahead using lag_order prior values
            y_pred = results.forecast(np.array(group[-lag_order:]), steps=h)
            
            # forecaster = VAR()
            # forecaster.fit(group)
            # y_pred = forecaster.predict(h)

            # store forecasts in dataframe for all series
            full_forecasts[j,:] = y_pred.T

    full_forecasts = [pd.Series(full_forecasts[i,:]) for i in range(full_forecasts.shape[0])]

    for i in range(len(full_forecasts)):
        last_time = ts_data[i].index[-1]
        full_forecasts[i].index = np.arange(last_time+1, last_time+1+h)

    return full_forecasts, intercept_list, lag_list, coef_list, fitted_list

In [141]:
forecasts_original, orig_intercepts, orig_lags, orig_coefs, orig_fitted = VAR_forecast(Y_processed, h=1, param_grid=None)

In [142]:
forecasts_protected, protected_intercepts, protected_lags, protected_coefs, protected_fitted = VAR_forecast(Y_protected_processed, h=1, param_grid=None)

### Post-process Forecasts

In [143]:
forecasts_original = post_process(full_ts_data=Y, 
                                  forecasts=forecasts_original, 
                                  target_forecast_period=h,
                                  log=True,
                                  make_stationary=False,
                                  sp=12,
                                  full_lags=full_lags)

forecasts_protected = post_process(full_ts_data=Y_protected, 
                                   forecasts=forecasts_protected, 
                                   target_forecast_period=h,
                                   log=True,
                                   make_stationary=False,
                                   sp=12,
                                   full_lags=full_lags_protected)

### Calculate Errors

In [144]:
mean_absolute_error(forecasts_original, test.iloc[:,:5])

908.8075071489199

In [145]:
mean_absolute_error(forecasts_protected, test.iloc[:,:5])

3998.5449032339266

### Perform Analysis for Range of Epsilon Values

In [146]:
epsilons = [0.1, 1, 4.6, 10, 20]

In [147]:
protected_intercepts_dict = {}
protected_lags_dict = {}
protected_coefs_dict = {}
num_diffs_dict = {}
fitted_dict = {}

for eps in epsilons:
    
    # protect the data using the current epsilon
    Y_protected = apply_data_protection(Y, epsilon=eps)
    
    # extract first five protected series
    Y_protected = Y_protected[:5]
    
    Y_protected_processed, Y_protected_last_window, Y_protected_last_window_trend, _, full_lags_protected = pre_process(ts_data=Y_protected,
                                                                                                                        target_forecast_period=h,
                                                                                                                        log=True,
                                                                                                                        make_stationary=False,
                                                                                                                        sp=12)
    
    forecasts_protected, protected_intercepts, protected_lags, protected_coefs, protected_fitted = VAR_forecast(Y_protected_processed, h=1, param_grid=None)
    
    forecasts_protected = post_process(full_ts_data=Y_protected, 
                                       forecasts=forecasts_protected, 
                                       target_forecast_period=h,
                                       log=True,
                                       make_stationary=False,
                                       sp=12,
                                       full_lags=full_lags_protected)
    
    num_diffs_dict[str(eps)] = full_lags_protected
    protected_intercepts_dict[str(eps)] = protected_intercepts
    protected_lags_dict[str(eps)] = protected_lags
    protected_coefs_dict[str(eps)] = protected_coefs
    fitted_dict[str(eps)] = protected_fitted

In [148]:
num_diffs_dict['orig'] = full_lags

In [149]:
num_diffs_dict

{'0.1': None, '1': None, '4.6': None, '10': None, '20': None, 'orig': None}

In [150]:
protected_coefs_dict['orig'] = orig_coefs

In [151]:
protected_intercepts_dict['orig'] = orig_intercepts

In [152]:
fitted_dict['orig'] = orig_fitted

### Save Values

The number of first differences taken for each series.

In [153]:
[x[1]['0'][0] for x in num_diffs_dict.items()]

TypeError: 'NoneType' object is not subscriptable

In [154]:
fitted = [x[1] for x in fitted_dict.items()]

In [155]:
fitted = pd.concat(fitted, keys=['0.1', '1', '4.6', '10', '20', 'Original']).reset_index()

In [156]:
fitted = fitted.iloc[:, [0, 2, 3, 4, 5, 6]]

In [158]:
fitted.to_csv("../../Outputs/Tables/var_fitted_vals.csv", index=False)

In [160]:
coefs_df = pd.concat([pd.DataFrame(i[1][0][0,:].T) for i in protected_coefs_dict.items()], axis=1)
coefs_df.to_csv("../../Outputs/Tables/var_coefs_dp.csv", index=False)

In [162]:
ints_df = pd.concat([pd.DataFrame(i[1][0]) for i in protected_intercepts_dict.items()], axis=1)
ints_df.to_csv("../../Outputs/Tables/var_ints_dp.csv", index=False)