## Startup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsmodels.tsa.api import VAR
from statsmodels.tools.eval_measures import aic, bic

import copy
import os
import pickle

In [2]:
import warnings

# Silence every warning category for the rest of the kernel session
# (presumably to keep library noise out of the cell outputs).
# NOTE(review): this blanket filter also hides warnings raised with
# warnings.warn by this notebook's own code unless that code overrides the
# filter locally — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Seed NumPy's legacy global RNG so any later np.random call is repeatable.
np.random.seed(42)

In [4]:
# Project folders, each located one level above the current working directory.
dataroute, dumproute, resultsroute = (
    os.path.join("..", folder) for folder in ("data", "dump", "results")
)

In [5]:
from scripts.params import get_params

# Project-wide settings; later cells read params["tablename"] and
# params["tickerlist"] from this object.
params = get_params()

## Data Retrieval

In [6]:
# Load the training frame; the file name is keyed by params["tablename"].
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only open pickles produced by a trusted pipeline.
name = f"""finaldf_train_{params["tablename"]}.pickle"""
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df = pickle.load(handle)

# Load the matching test frame (same naming scheme, "test" instead of "train").
# `name`, `filename` and `handle` are reused here, so after this cell they
# refer to the test file.
name = f'finaldf_test_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df_test = pickle.load(handle)

In [7]:
# Tickers to model; the estimation loop at the end of the notebook iterates
# over this collection.
tickerlist = params["tickerlist"]

In [8]:
# Last row of the training frame, to see where the in-sample period ends.
df.tail(1)

Unnamed: 0,^MERV_rets,^MERV_log_rets,^MERV_gk_vol,GGAL.BA_rets,GGAL.BA_log_rets,GGAL.BA_gk_vol,GGAL_rets,GGAL_log_rets,GGAL_gk_vol,YPFD.BA_rets,...,BBAR.BA_gk_vol,BBAR_rets,BBAR_log_rets,BBAR_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,USD_^MERV_rets,USD_^MERV_log_rets,USD_^MERV_gk_vol
2023-05-31,-0.003825,-0.003832,0.000113,0.021748,0.021515,0.001402,0.01836,0.018193,0.001315,-0.022577,...,0.001981,-0.022321,-0.022574,0.000603,0.004597,0.004586,1.5e-05,0.001323,0.001322,0.000113


In [9]:
# First row of the test frame, to see where the out-of-sample period starts.
df_test.head(1)

Unnamed: 0,^MERV_rets,^MERV_log_rets,^MERV_gk_vol,GGAL.BA_rets,GGAL.BA_log_rets,GGAL.BA_gk_vol,GGAL_rets,GGAL_log_rets,GGAL_gk_vol,YPFD.BA_rets,...,BBAR.BA_gk_vol,BBAR_rets,BBAR_log_rets,BBAR_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,USD_^MERV_rets,USD_^MERV_log_rets,USD_^MERV_gk_vol
2023-06-01,0.033019,0.032486,0.000989,0.02532,0.025004,0.000851,0.018534,0.018364,0.001232,0.040686,...,0.004033,0.03653,0.035878,0.001698,0.012531,0.012453,0.000162,0.026511,0.026166,0.000989


In [10]:
def generate_columns(stock: str, contains_vol: bool, contains_USD: bool):
    """Return the list of column names for a given model specification.

    The stock's ``"{stock}_log_rets"`` column always comes first. It is
    followed by ``"{stock}_gk_vol"`` when ``contains_vol`` is truthy, and then
    by ``"USD_log_rets"`` and ``"USD_gk_vol"`` when ``contains_USD`` is truthy.
    """
    own_vol = [f"{stock}_gk_vol"] if contains_vol else []
    usd_block = ["USD_log_rets", "USD_gk_vol"] if contains_USD else []

    return [f"{stock}_log_rets", *own_vol, *usd_block]

In [11]:
# Exploratory check: which lag order each information criterion picks for a
# two-variable VAR on the BBAR_log_rets and BBAR_gk_vol columns, fitted with
# a constant term.
bbar_var = VAR(df[["BBAR_log_rets", "BBAR_gk_vol"]])
selected_orders = bbar_var.select_order(maxlags=None, trend="c")
selected_orders.selected_orders

{'aic': 6, 'bic': 2, 'hqic': 4, 'fpe': 6}

In [12]:
# Last five rows of the training frame (DataFrame.tail defaults to n=5).
df.tail()

Unnamed: 0,^MERV_rets,^MERV_log_rets,^MERV_gk_vol,GGAL.BA_rets,GGAL.BA_log_rets,GGAL.BA_gk_vol,GGAL_rets,GGAL_log_rets,GGAL_gk_vol,YPFD.BA_rets,...,BBAR.BA_gk_vol,BBAR_rets,BBAR_log_rets,BBAR_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,USD_^MERV_rets,USD_^MERV_log_rets,USD_^MERV_gk_vol
2023-05-23,0.00533,0.005315,0.000464,0.017734,0.017578,0.000697,0.007916,0.007884,0.001497,0.038068,...,0.001741,0.030733,0.03027,0.002791,0.01592,0.015794,0.000216,-0.007137,-0.007162,0.000464
2023-05-24,0.002593,0.00259,0.000161,0.017608,0.017455,0.000874,0.026178,0.025841,0.00206,0.011437,...,0.002068,0.011468,0.011403,0.00146,-0.0083,-0.008335,0.000126,0.005553,0.005537,0.000161
2023-05-29,0.013755,0.013661,0.000283,0.023161,0.022897,0.00062,0.0,0.0,0.0,0.010033,...,0.000856,0.0,0.0,0.0,0.007615,0.007586,1.7e-05,0.008795,0.008757,0.000283
2023-05-30,-0.008595,-0.008632,0.000265,0.004404,0.004395,0.001201,0.036815,0.036153,0.002979,-0.020036,...,0.000446,0.018182,0.018018,0.001212,-0.01736,-0.017512,0.000187,-0.007206,-0.007232,0.000265
2023-05-31,-0.003825,-0.003832,0.000113,0.021748,0.021515,0.001402,0.01836,0.018193,0.001315,-0.022577,...,0.001981,-0.022321,-0.022574,0.000603,0.004597,0.004586,1.5e-05,0.001323,0.001322,0.000113


In [13]:
# First five rows of the test frame (DataFrame.head defaults to n=5).
df_test.head()

Unnamed: 0,^MERV_rets,^MERV_log_rets,^MERV_gk_vol,GGAL.BA_rets,GGAL.BA_log_rets,GGAL.BA_gk_vol,GGAL_rets,GGAL_log_rets,GGAL_gk_vol,YPFD.BA_rets,...,BBAR.BA_gk_vol,BBAR_rets,BBAR_log_rets,BBAR_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,USD_^MERV_rets,USD_^MERV_log_rets,USD_^MERV_gk_vol
2023-06-01,0.033019,0.032486,0.000989,0.02532,0.025004,0.000851,0.018534,0.018364,0.001232,0.040686,...,0.004033,0.03653,0.035878,0.001698,0.012531,0.012453,0.000162,0.026511,0.026166,0.000989
2023-06-02,-0.001236,-0.001237,0.000117,-0.021765,-0.022005,0.001171,-0.022332,-0.022586,0.00161,0.014547,...,0.000801,-0.006608,-0.00663,0.000876,-0.004449,-0.004459,2.6e-05,-0.005255,-0.005269,0.000117
2023-06-05,0.022352,0.022106,0.000512,0.039877,0.039102,0.001169,0.043147,0.042242,0.002685,0.021411,...,0.001479,0.035477,0.034862,0.001976,0.009517,0.009472,0.000436,0.000804,0.000804,0.000512
2023-06-06,0.05524,0.053768,0.002593,0.122284,0.115366,0.006882,0.129765,0.12201,0.013107,0.056152,...,0.008509,0.122056,0.115162,0.011983,-0.008866,-0.008906,0.000514,0.060322,0.058573,0.002593
2023-06-07,-0.000858,-0.000858,0.000234,0.040329,0.039537,0.000975,0.040919,0.040104,0.003104,0.006279,...,0.00176,0.038168,0.037458,0.00235,0.005694,0.005678,5.3e-05,0.000512,0.000512,0.000234


In [14]:
def generate_VAR_samples_residuals(
    stock: str,
    lags: int,
    insample_data: pd.DataFrame,
    oos_data: pd.DataFrame,
    contains_vol: bool,
    contains_USD: bool,
    window: int = 252,
):
    """Roll a fixed-size VAR window over the out-of-sample period.

    At every step a VAR with ``lags`` lags is re-fitted on the ``window`` rows
    that end just before the target row, a one-step-ahead forecast is
    produced, and the last residual of that fit is kept.

    Parameters
    ----------
    stock : str
        Ticker prefix used by ``generate_columns`` to pick the columns.
    lags : int
        Lag order passed to ``VAR.fit``.
    insample_data, oos_data : pd.DataFrame
        Training and test frames; they are stacked in that order and the
        position of ``oos_data.index[0]`` in the stacked index marks the split.
    contains_vol, contains_USD : bool
        Forwarded to ``generate_columns`` to choose the specification.
    window : int, default 252
        Maximum number of rows used for each fit. When fewer rows are
        available before the target row, all of them are used.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(forecasts, residuals)``, each with one row per step and a fresh
        ``0..n-1`` integer index. Both are empty DataFrames when ``oos_data``
        has fewer than two rows.
    """
    columns = generate_columns(
        stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
    )

    combined_data = pd.concat([insample_data[columns], oos_data[columns]])

    # Position of the first out-of-sample row; it does not change per step.
    split_loc = combined_data.index.get_loc(oos_data.index[0])
    dates_to_forecast = len(oos_data)

    # Collect per-step frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies it on every iteration.
    forecast_frames = []
    residual_frames = []

    # NOTE(review): the loop starts at 1, so the first out-of-sample row is
    # only ever part of a fitting window and is never the forecast target;
    # the outputs therefore have len(oos_data) - 1 rows. Kept unchanged so the
    # row count of the returned frames stays the same — confirm it is intended.
    for i in range(1, dates_to_forecast):
        end_loc = split_loc + i
        # Clamp at 0: iloc reads a negative start as an offset from the end of
        # the frame, which would silently select the wrong rows.
        fitstart = max(0, end_loc - window)

        stock_data = combined_data.iloc[fitstart:end_loc]

        results = VAR(stock_data).fit(lags)

        fcast = results.forecast(y=stock_data.values, steps=1)
        forecast_frames.append(pd.DataFrame(fcast, columns=columns))

        # Residual of the most recent observation inside the fitting window.
        residual_frames.append(results.resid.iloc[-1:])

    if not forecast_frames:
        return pd.DataFrame(), pd.DataFrame()

    forecasts = pd.concat(forecast_frames, ignore_index=True)
    residuals = pd.concat(residual_frames, ignore_index=True)

    return forecasts, residuals


In [15]:
def estimate_best_residuals(
    stock: str,
    criterion: str,
    insample_data: pd.DataFrame,
    oos_data: pd.DataFrame,
    contains_vol: bool,
    contains_USD: bool,
):
    """Choose a VAR lag order by information criterion, then run the rolling fit.

    The lag order is selected on ``insample_data`` (up to 15 lags, constant
    trend) using the entry of ``selected_orders`` named by ``criterion``. That
    order is then handed to ``generate_VAR_samples_residuals``.

    Returns
    -------
    tuple
        ``(best_lag, forecasts, residuals)``.
    """
    # Arguments shared by the column builder and the rolling estimator.
    spec = {
        "stock": stock,
        "contains_vol": contains_vol,
        "contains_USD": contains_USD,
    }

    endog = insample_data[generate_columns(**spec)]
    order_search = VAR(endog).select_order(maxlags=15, trend="c")
    best_lag = order_search.selected_orders[criterion]

    forecasts, residuals = generate_VAR_samples_residuals(
        lags=best_lag,
        insample_data=insample_data,
        oos_data=oos_data,
        **spec,
    )

    assert type(residuals) == pd.DataFrame

    return best_lag, forecasts, residuals

In [16]:
def save_as_pickle(data, contains_USD: bool, criterion: str, type_save: str):
    """Pickle ``data`` into the results folder under a name describing the run.

    The file name combines a specification tag ("multiv" when ``contains_USD``
    is truthy, "with_vol" otherwise), ``params["tablename"]``, the
    ``criterion`` and ``type_save``. The target folder is the module-level
    ``resultsroute``.
    """
    string = "multiv" if contains_USD else "with_vol"

    target = os.path.join(
        resultsroute,
        f"""VAR_{string}_{params["tablename"]}_{criterion}_best_{type_save}.pickle""",
    )

    with open(target, "wb") as output_file:
        pickle.dump(data, output_file)

In [17]:
# Result containers, nested as criterion -> "contains_USD=<bool>" -> ticker.
best_lags = {
    "aic": {"contains_USD=True": {}, "contains_USD=False": {}},
    "bic": {"contains_USD=True": {}, "contains_USD=False": {}},
}
best_forecasts = copy.deepcopy(best_lags)
best_residuals = copy.deepcopy(best_lags)

for criterion in ["aic", "bic"]:
    for contains_USD in [True, False]:
        usdstring = f"contains_USD={contains_USD}"

        for stock in tickerlist:
            best_lag, forecasts, residuals = estimate_best_residuals(
                stock=stock,
                criterion=criterion,
                insample_data=df,
                oos_data=df_test,
                contains_vol=True,
                contains_USD=contains_USD,
            )

            # Share of missing forecasts in the first column, in percent.
            pct_nan = forecasts.iloc[:, 0].isna().sum() / len(forecasts.index) * 100

            if pct_nan > 5:
                # A global "ignore" warnings filter is installed near the top
                # of the notebook and would swallow this alert, so warnings
                # are re-enabled just for this call.
                with warnings.catch_warnings():
                    warnings.simplefilter("always")
                    warnings.warn(f"{stock} % na: {pct_nan}")

            # Forward-fill gaps. DataFrame.ffill() replaces the deprecated
            # fillna(method="ffill") spelling and avoids in-place mutation.
            forecasts = forecasts.ffill()
            residuals = residuals.ffill()

            best_lags[criterion][usdstring][stock] = best_lag
            best_forecasts[criterion][usdstring][stock] = forecasts
            best_residuals[criterion][usdstring][stock] = residuals

        # Persist the three result sets for this (criterion, specification).
        for type_save, store in (
            ("lags", best_lags),
            ("forecasts", best_forecasts),
            ("residuals", best_residuals),
        ):
            save_as_pickle(
                data=store[criterion][usdstring],
                contains_USD=contains_USD,
                criterion=criterion,
                type_save=type_save,
            )