## Startup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsmodels.tsa.api import VAR
from statsmodels.tools.eval_measures import aic, bic

import copy
import os
import pickle

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Fix the state of NumPy's global random generator so np.random draws repeat across runs.
np.random.seed(seed=42)

In [4]:
# Sibling folders of the notebook's working directory: inputs, scratch dumps, outputs.
dataroute, dumproute, resultsroute = (
    os.path.join("..", folder) for folder in ("data", "dump", "results")
)

In [5]:
# Project-level settings. This notebook reads params["tablename"] (used to build
# input/output file names) and params["tickerlist"] (the symbols to model).
from scripts.params import get_params

params = get_params()

## Data Retrieval

In [6]:
# Read the train and test frames that were pickled for this table.
# NOTE: pickle.load can run arbitrary code while deserialising; only point this
# at files the project itself produced.
_frames = {}
for _split in ("train", "test"):
    name = "finaldf_{}_{}.pickle".format(_split, params["tablename"])
    filename = os.path.join(dataroute, name)
    with open(filename, "rb") as handle:
        _frames[_split] = pickle.load(handle)

df = _frames["train"]
df_test = _frames["test"]

In [7]:
# Symbols to model; the estimation loop further down iterates over this list.
tickerlist = params["tickerlist"]

In [8]:
# Last row of the training frame: shows where the in-sample period ends.
df.tail(1)

Unnamed: 0,^BVSP_rets,^BVSP_log_rets,^BVSP_gk_vol,VALE3.SA_rets,VALE3.SA_log_rets,VALE3.SA_gk_vol,VALE_rets,VALE_log_rets,VALE_gk_vol,PETR3.SA_rets,...,ABEV3.SA_gk_vol,ABEV_rets,ABEV_log_rets,ABEV_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,^BVSP_USD_rets,^BVSP_USD_log_rets,^BVSP_USD_gk_vol
2023-05-31,-0.0058,-0.005817,5.1e-05,-0.007312,-0.007339,0.000245,-0.00627,-0.006289,0.000215,-0.012175,...,0.000109,-0.014134,-0.014235,0.000251,0.004457,0.004447,9e-06,-0.005988,-0.006006,5.1e-05


In [9]:
# First row of the test frame: shows where the out-of-sample period starts.
df_test.head(1)

Unnamed: 0,^BVSP_rets,^BVSP_log_rets,^BVSP_gk_vol,VALE3.SA_rets,VALE3.SA_log_rets,VALE3.SA_gk_vol,VALE_rets,VALE_log_rets,VALE_gk_vol,PETR3.SA_rets,...,ABEV3.SA_gk_vol,ABEV_rets,ABEV_log_rets,ABEV_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,^BVSP_USD_rets,^BVSP_USD_log_rets,^BVSP_USD_gk_vol
2023-06-01,0.020584,0.020375,0.000402,0.021157,0.020936,0.000315,0.028391,0.027996,0.000488,0.0291,...,0.00013,0.010753,0.010695,0.000163,-0.013985,-0.014083,3.5e-05,0.030699,0.030237,0.000402


In [10]:
def generate_columns(stock: str, contains_vol: bool, contains_USD: bool):
    """Return the list of column names for a given model specification.

    The stock's ``<stock>_log_rets`` column always comes first. When
    ``contains_vol`` is true ``<stock>_gk_vol`` follows, and when
    ``contains_USD`` is true ``USD_log_rets`` and ``USD_gk_vol`` are appended
    last.

    No de-duplication is done: calling this with ``stock="USD"`` and
    ``contains_USD=True`` lists the USD columns twice.
    """
    selected = [f"{stock}_log_rets"]

    if contains_vol:
        selected += [f"{stock}_gk_vol"]

    if contains_USD:
        selected += ["USD_log_rets", "USD_gk_vol"]

    return selected

In [12]:
# Final rows of the training frame, for a visual check of the most recent in-sample values.
df.tail()

Unnamed: 0,^BVSP_rets,^BVSP_log_rets,^BVSP_gk_vol,VALE3.SA_rets,VALE3.SA_log_rets,VALE3.SA_gk_vol,VALE_rets,VALE_log_rets,VALE_gk_vol,PETR3.SA_rets,...,ABEV3.SA_gk_vol,ABEV_rets,ABEV_log_rets,ABEV_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,^BVSP_USD_rets,^BVSP_USD_log_rets,^BVSP_USD_gk_vol
2023-05-25,0.011526,0.01146,0.000272,-0.003075,-0.003079,0.000204,-0.019055,-0.019239,0.000269,-0.00739,...,0.000201,-0.010204,-0.010257,0.000151,0.014761,0.014653,1.7e-05,-0.001131,-0.001132,0.000272
2023-05-26,0.007742,0.007712,0.000156,0.022822,0.022565,0.00016,0.030303,0.029853,0.000164,0.016582,...,0.000159,0.0,0.0,0.000252,-0.004774,-0.004786,2e-06,0.010514,0.010459,0.000156
2023-05-29,-0.005167,-0.00518,4.9e-05,-0.007538,-0.007567,0.00063,0.0,0.0,0.0,-0.006325,...,0.000159,0.0,0.0,0.0,0.00094,0.000939,1e-05,-0.010572,-0.010629,4.9e-05
2023-05-30,-0.012381,-0.012458,0.00037,-0.023546,-0.023827,0.001086,-0.037707,-0.038437,0.000416,-0.00938,...,0.000735,-0.027491,-0.027876,0.000564,0.008492,0.008456,2.5e-05,-0.021534,-0.02177,0.00037
2023-05-31,-0.0058,-0.005817,5.1e-05,-0.007312,-0.007339,0.000245,-0.00627,-0.006289,0.000215,-0.012175,...,0.000109,-0.014134,-0.014235,0.000251,0.004457,0.004447,9e-06,-0.005988,-0.006006,5.1e-05


In [13]:
# Opening rows of the test frame, for a visual check of the first out-of-sample values.
df_test.head()

Unnamed: 0,^BVSP_rets,^BVSP_log_rets,^BVSP_gk_vol,VALE3.SA_rets,VALE3.SA_log_rets,VALE3.SA_gk_vol,VALE_rets,VALE_log_rets,VALE_gk_vol,PETR3.SA_rets,...,ABEV3.SA_gk_vol,ABEV_rets,ABEV_log_rets,ABEV_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,^BVSP_USD_rets,^BVSP_USD_log_rets,^BVSP_USD_gk_vol
2023-06-01,0.020584,0.020375,0.000402,0.021157,0.020936,0.000315,0.028391,0.027996,0.000488,0.0291,...,0.00013,0.010753,0.010695,0.000163,-0.013985,-0.014083,3.535259e-05,0.030699,0.030237,0.000402
2023-06-02,0.018026,0.017865,0.000373,0.042664,0.041779,7.5e-05,0.04908,0.047913,0.00012,0.011976,...,0.000335,0.028369,0.027974,0.00046,-0.009057,-0.009098,5.5209e-07,0.031513,0.031027,0.000373
2023-06-05,0.001226,0.001225,7.1e-05,-0.008243,-0.008277,0.000475,-0.005117,-0.00513,0.000254,0.006246,...,7.8e-05,-0.006897,-0.00692,0.000114,-0.000801,-0.000801,2.723537e-06,0.005134,0.005121,7.1e-05
2023-06-06,0.016984,0.016841,0.000278,0.003265,0.00326,0.00011,0.009552,0.009507,0.000105,0.020255,...,0.000686,0.034722,0.034133,0.001061,-0.006883,-0.006906,5.04636e-05,0.018346,0.01818,0.000278
2023-06-07,0.007661,0.007632,9.3e-05,0.015828,0.015704,0.000272,0.014556,0.014451,0.000156,0.03106,...,0.000331,0.003356,0.00335,0.000277,0.001856,0.001855,2.761701e-05,0.012536,0.012458,9.3e-05


In [14]:
def generate_VAR_samples_residuals(
    stock: str,
    lags: int,
    insample_data: pd.DataFrame,
    oos_data: pd.DataFrame,
    contains_vol: bool,
    contains_USD: bool,
    window: int = 252,
):
    """Rolling-window, one-step-ahead VAR forecasts over the out-of-sample rows.

    The selected columns of ``insample_data`` and ``oos_data`` are stacked (in
    that order). For every row of ``oos_data`` a VAR is fitted with ``lags``
    lags on the ``window`` rows immediately before it and asked for a one-step
    forecast.

    Parameters
    ----------
    stock : str
        Symbol whose columns are modelled (see ``generate_columns``).
    lags : int
        Lag order passed to ``VAR.fit``.
    insample_data, oos_data : pd.DataFrame
        Frames containing the columns returned by ``generate_columns``.
    contains_vol, contains_USD : bool
        Specification switches forwarded to ``generate_columns``.
    window : int, default 252
        Number of rows in each estimation window. When fewer rows are
        available before a forecast row, the window is shortened to whatever
        exists instead of wrapping around to negative positions.

    Returns
    -------
    forecasts : pd.DataFrame
        One row per row of ``oos_data`` (same index), holding the forecast.
    residuals : pd.DataFrame
        One row per row of ``oos_data`` (same index).
        NOTE(review): each row is the fitted model's residual for the LAST row
        of its estimation window (the row just before the forecast row), not
        the forecast error of that date. Confirm this alignment is intended.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window}")

    columns = generate_columns(
        stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
    )

    # Nothing to forecast: hand back empty, correctly labelled frames instead
    # of failing on oos_data.index[0] / np.concatenate([]).
    if len(oos_data) == 0:
        empty = pd.DataFrame(columns=columns, index=oos_data.index, dtype=float)
        return empty, empty.copy()

    combined_data = pd.concat([insample_data[columns], oos_data[columns]])

    # The first out-of-sample row sits right after the in-sample block. This is
    # the position an index lookup of oos_data.index[0] yields whenever that
    # lookup resolves to a single integer, without repeating it every step.
    split_loc = len(insample_data)

    fcast_holder = []
    resid_holder = []

    for step in range(len(oos_data)):
        fitend = split_loc + step
        # Clamp at 0: a negative start would make iloc count from the end of
        # the frame and produce an empty or misplaced window.
        fitstart = max(0, fitend - window)

        stock_data = combined_data.iloc[fitstart:fitend]

        results = VAR(stock_data).fit(lags)

        fcast_holder.append(results.forecast(y=stock_data.values, steps=1))
        # Last in-window residual (see the note in the docstring).
        resid_holder.append(results.resid.iloc[-1:])

    forecasts = pd.DataFrame(
        np.concatenate(fcast_holder), columns=columns, index=oos_data.index
    )
    residuals = pd.DataFrame(
        np.concatenate(resid_holder), columns=columns, index=oos_data.index
    )

    return forecasts, residuals


In [16]:
def estimate_best_residuals(
    stock: str,
    criterion: str,
    insample_data: pd.DataFrame,
    oos_data: pd.DataFrame,
    contains_vol: bool,
    contains_USD: bool,
    maxlags: int = 15,
):
    """Pick a VAR lag order on the in-sample data, then run the rolling forecasts.

    The lag order is chosen once, on the whole of ``insample_data``, by
    ``VAR.select_order`` (constant trend) and read from its
    ``selected_orders`` mapping under ``criterion``. That order is then passed
    to ``generate_VAR_samples_residuals``.

    Parameters
    ----------
    stock : str
        Symbol whose columns are modelled (see ``generate_columns``).
    criterion : str
        Key into the ``selected_orders`` mapping; this notebook uses ``"aic"``
        and ``"bic"``.
    insample_data, oos_data : pd.DataFrame
        Frames containing the columns returned by ``generate_columns``.
    contains_vol, contains_USD : bool
        Specification switches forwarded to ``generate_columns``.
    maxlags : int, default 15
        Largest lag order considered during selection.

    Returns
    -------
    tuple
        ``(best_lag, forecasts, residuals)``.

    Raises
    ------
    KeyError
        If ``criterion`` is not a key of the ``selected_orders`` mapping.
    """
    columns = generate_columns(
        stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
    )

    order_selection = VAR(insample_data[columns]).select_order(
        maxlags=maxlags, trend="c"
    )
    best_lag = order_selection.selected_orders[criterion]

    forecasts, residuals = generate_VAR_samples_residuals(
        stock=stock,
        lags=best_lag,
        insample_data=insample_data,
        oos_data=oos_data,
        contains_vol=contains_vol,
        contains_USD=contains_USD,
    )

    # Sanity check on the helper's contract; isinstance also accepts
    # DataFrame subclasses, unlike an exact type comparison.
    assert isinstance(residuals, pd.DataFrame), "residuals must be a DataFrame"

    return best_lag, forecasts, residuals

In [17]:
def save_as_pickle(data, contains_USD: bool, criterion: str, type_save: str):
    """Pickle ``data`` into the results folder under a run-specific file name.

    The file is named
    ``VAR_<spec>_<tablename>_<criterion>_best_<type_save>.pickle`` where
    ``<spec>`` is ``multiv`` when ``contains_USD`` is true and ``with_vol``
    otherwise. ``<tablename>`` and the destination folder come from the
    module-level ``params`` and ``resultsroute``.
    """
    spec = "multiv" if contains_USD else "with_vol"
    tablename = params["tablename"]
    target = os.path.join(
        resultsroute,
        f"VAR_{spec}_{tablename}_{criterion}_best_{type_save}.pickle",
    )

    with open(target, "wb") as output_file:
        pickle.dump(data, output_file)

In [18]:
# Result containers, addressed as container[criterion]["contains_USD=<bool>"][stock].
best_lags = {
    "aic": {"contains_USD=True": {}, "contains_USD=False": {}},
    "bic": {"contains_USD=True": {}, "contains_USD=False": {}},
}
best_forecasts = copy.deepcopy(best_lags)
best_residuals = copy.deepcopy(best_lags)

for criterion in ["aic", "bic"]:
    for contains_USD in [True, False]:
        usdstring = f"contains_USD={contains_USD}"

        for stock in tickerlist:
            best_lag, forecasts, residuals = estimate_best_residuals(
                stock=stock,
                criterion=criterion,
                insample_data=df,
                oos_data=df_test,
                contains_vol=True,
                contains_USD=contains_USD,
            )

            # Share (in %) of missing forecasts in the first column.
            pct_nan = forecasts.iloc[:, 0].isna().sum() / len(forecasts.index) * 100

            if pct_nan > 5:
                # A session-wide "ignore" filter would swallow this alert, so
                # force it through for this one call only.
                with warnings.catch_warnings():
                    warnings.simplefilter("always")
                    warnings.warn(f"{stock} % na: {pct_nan}")

            # Carry the last valid value forward. .ffill() replaces the
            # deprecated fillna(method="ffill") and avoids in-place mutation.
            forecasts = forecasts.ffill()
            residuals = residuals.ffill()

            best_lags[criterion][usdstring][stock] = best_lag
            best_forecasts[criterion][usdstring][stock] = forecasts
            best_residuals[criterion][usdstring][stock] = residuals

        # One file per (criterion, specification, kind of result).
        for type_save, store in (
            ("lags", best_lags),
            ("forecasts", best_forecasts),
            ("residuals", best_residuals),
        ):
            save_as_pickle(
                data=store[criterion][usdstring],
                contains_USD=contains_USD,
                criterion=criterion,
                type_save=type_save,
            )

In [19]:
# Print every (criterion, stock, specification) whose first residual column
# still contains missing values, together with the missing fraction.
for crit, per_spec in best_residuals.items():
    for cols, per_stock in per_spec.items():
        for stock, dataframe in per_stock.items():
            first_column = dataframe.iloc[:, 0]
            na_share = first_column.isna().sum() / len(dataframe.index)
            if na_share > 0:
                print(crit, stock, cols, na_share)