# Comparison


In [2]:
import pandas as pd
import numpy as np
import os

pd.set_option("display.max_columns", None)

In [3]:
from scripts.params import get_params

params = get_params()

In [19]:
from epftoolbox.evaluation import DM, GW, plot_multivariate_GW_test, plot_multivariate_DM_test

# note: to install this package, execute
    #git clone https://github.com/jeslago/epftoolbox.git
    #cd epftoolbox
    #pip install .
# may also need to modify epftoolbox/setup.py to be able to install it after python 3.11

In [5]:
dataroute = os.path.join("..", "data")
dumproute = os.path.join("..", "dump")
resultsroute = os.path.join("..", "results")

In [6]:
start_test = params["start_test"]

In [7]:
all_residuals = {}

for filename in os.listdir(resultsroute):
    file_path = os.path.join(resultsroute, filename)
    if os.path.isfile(file_path) and "residual" in filename:
        all_residuals[filename] = file_path

print(all_residuals)

{'GARCH_AR_^MERV_aic_best_residuals.pickle': '..\\results\\GARCH_AR_^MERV_aic_best_residuals.pickle', 'GARCH_AR_^MERV_bic_best_residuals.pickle': '..\\results\\GARCH_AR_^MERV_bic_best_residuals.pickle', 'HMM_multiv_AR_^MERV_aic_best_residuals.pickle': '..\\results\\HMM_multiv_AR_^MERV_aic_best_residuals.pickle', 'HMM_multiv_AR_^MERV_aic_residuals.pickle': '..\\results\\HMM_multiv_AR_^MERV_aic_residuals.pickle', 'HMM_multiv_AR_^MERV_bic_best_residuals.pickle': '..\\results\\HMM_multiv_AR_^MERV_bic_best_residuals.pickle', 'HMM_multiv_AR_^MERV_bic_residuals.pickle': '..\\results\\HMM_multiv_AR_^MERV_bic_residuals.pickle', 'HMM_univ_AR_^MERV_aic_best_residuals.pickle': '..\\results\\HMM_univ_AR_^MERV_aic_best_residuals.pickle', 'HMM_univ_AR_^MERV_aic_residuals.pickle': '..\\results\\HMM_univ_AR_^MERV_aic_residuals.pickle', 'HMM_univ_AR_^MERV_bic_best_residuals.pickle': '..\\results\\HMM_univ_AR_^MERV_bic_best_residuals.pickle', 'HMM_univ_AR_^MERV_bic_residuals.pickle': '..\\results\\HMM_un

In [8]:
def get_only_log_rets(dict_with_dfs: dict, stock: str):
    if type(dict_with_dfs[stock]) == pd.Series:
        # univariate models are saved as series
        df = pd.DataFrame(dict_with_dfs[stock])

    else:
        try:
            # multivariate models are saved as dataframes
            df = pd.DataFrame(dict_with_dfs[stock][f"{stock}_log_rets"])
        except:  # TODO: SACAR ESTO! Es un chanchullo pq hay algunas que son guardadas como None
            pass
    return df

In [9]:
residual_df = pd.DataFrame()

for name, dir in all_residuals.items():
    dict_with_dfs = pd.read_pickle(dir)
    print(name)

    for stock in dict_with_dfs.keys():
        df = get_only_log_rets(dict_with_dfs, stock)

        modelname = (
            name.replace("residuals.pickle", "")
            .replace("best", "")
            .replace(params["tablename"], "")
            .replace("__", "_")
            .replace("__", "_")
        )

        df.columns = [modelname + "_" + stock]

        residual_df = pd.merge(
            residual_df, df, left_index=True, right_index=True, how="outer"
        )

residual_df.index = pd.to_datetime(residual_df.index)
residual_df = residual_df[residual_df.index > start_test]

GARCH_AR_^MERV_aic_best_residuals.pickle
GARCH_AR_^MERV_bic_best_residuals.pickle
HMM_multiv_AR_^MERV_aic_best_residuals.pickle
HMM_multiv_AR_^MERV_aic_residuals.pickle
HMM_multiv_AR_^MERV_bic_best_residuals.pickle
HMM_multiv_AR_^MERV_bic_residuals.pickle
HMM_univ_AR_^MERV_aic_best_residuals.pickle
HMM_univ_AR_^MERV_aic_residuals.pickle
HMM_univ_AR_^MERV_bic_best_residuals.pickle
HMM_univ_AR_^MERV_bic_residuals.pickle
HMM_with_vol_AR_^MERV_aic_best_residuals.pickle
HMM_with_vol_AR_^MERV_aic_residuals.pickle
HMM_with_vol_AR_^MERV_bic_best_residuals.pickle
HMM_with_vol_AR_^MERV_bic_residuals.pickle
VAR_multiv_AR_^MERV_aic_best_residuals.pickle
VAR_multiv_AR_^MERV_bic_best_residuals.pickle
VAR_with_vol_AR_^MERV_aic_best_residuals.pickle
VAR_with_vol_AR_^MERV_bic_best_residuals.pickle


In [10]:
def subset_of_columns(df: pd.DataFrame, substring: str):
    filtered_columns = [col for col in df.columns if substring in col]
    return df[filtered_columns]

In [11]:
aic_residuals = subset_of_columns(residual_df, "aic")
bic_residuals = subset_of_columns(residual_df, "bic")

In [12]:
# estadisticos de nans
(residual_df.isna().sum() / len(residual_df.index) * 100).describe()

count    198.000000
mean       0.829433
std        2.654380
min        0.000000
25%        0.000000
50%        0.000000
75%        0.813008
max       17.886179
dtype: float64

In [13]:
# estadisticos de nans
((residual_df.isna().sum(axis=0) / len(residual_df.index)) * 100).nlargest(10)
# VAR tiene problemas con NANs

HMM_multiv_aic__YPF_x     17.886179
HMM_multiv_aic__YPF_y     17.886179
HMM_multiv_bic__YPF_x     17.886179
HMM_multiv_bic__YPF_y     17.886179
HMM_multiv_aic__BBAR_x     6.504065
HMM_multiv_aic__BBAR_y     6.504065
HMM_multiv_bic__BBAR_x     6.504065
HMM_multiv_bic__BBAR_y     6.504065
HMM_multiv_aic__BMA_x      3.252033
HMM_multiv_aic__BMA_y      3.252033
dtype: float64

In [17]:
stock_dict={}

for stock in params["tickerlist"]:
     stock_dict[stock]= subset_of_columns(residual_df, stock)

In [43]:
pd.DataFrame(np.zeros_like(stock_dict[stock].iloc[:,0]))

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
118,0.0
119,0.0
120,0.0
121,0.0


In [48]:
stock_dict[stock].iloc[:,0].reset_index(drop=True)

0     -0.000395
1     -0.000367
2     -0.000334
3     -0.000627
4     -0.000627
         ...   
118    0.179704
119   -0.004316
120   -0.102741
121   -0.000747
122         NaN
Name: GARCH_aic__BBAR.BA, Length: 123, dtype: float64

In [49]:
plot_multivariate_DM_test(real_price=pd.DataFrame(np.zeros_like(stock_dict[stock].iloc[:,0])), 
                          forecasts=stock_dict[stock].reset_index(drop=True), 
                          title=f"DM test {stock}")

ValueError: cannot reshape array of size 123 into shape (24)

In [59]:
model_list = ["GARCH", "HMM_univ", "HMM_multiv", "VAR_multiv", "VAR_with_vol"]

aggregating_dict = {"aic": {}, "bic": {}}

for criteria, dataframe in zip(("aic", "bic"), (aic_residuals, bic_residuals)):
    for model in model_list:
        aggregating_dict[criteria][model] = subset_of_columns(dataframe, model)

aggregating_dict["bic"]["GARCH"].head()

Unnamed: 0,GARCH_bic__^MERV,GARCH_bic__GGAL.BA,GARCH_bic__GGAL,GARCH_bic__YPFD.BA,GARCH_bic__YPF,GARCH_bic__EDN.BA,GARCH_bic__EDN,GARCH_bic__BMA.BA,GARCH_bic__BMA,GARCH_bic__BBAR.BA,GARCH_bic__BBAR
2023-06-02,-0.005047,-0.025661,-0.022235,0.007722,0.012681,0.063604,0.100486,-0.001323,0.003509,-0.000395,-0.008184
2023-06-05,0.01625,0.03536,0.038094,0.013768,0.018403,0.086247,0.093646,0.095998,0.042085,-0.000367,0.033212
2023-06-06,0.044891,0.110336,0.11144,0.044727,0.056079,0.00374,0.026269,0.110873,0.132402,-0.000334,0.111272
2023-06-07,-0.005159,0.034746,0.035308,-0.000354,0.00243,-0.014999,-0.031311,0.036467,0.023737,-0.000627,0.034402
2023-06-08,-0.003221,0.018402,0.009482,0.008347,0.003759,0.007923,0.008615,-0.016975,-0.027999,-0.000627,-0.038866


In [63]:
metrics_df = pd.DataFrame(index=["mse", "meanabs", "medianabs"])

for criteria, dictionary in aggregating_dict.items():
    for model, dataframe in dictionary.items():
        metrics_df.loc["mse", f"{criteria}_{model}"] = (
            (dataframe**2).mean().mean()
        )
        metrics_df.loc["meanabs", f"{criteria}_{model}"] = (
            dataframe.abs().mean().mean()
        )
        metrics_df.loc["medianabs", f"{criteria}_{model}"] = (
            (dataframe.abs()).median().median()
        )

metrics_df = metrics_df * 100
metrics_df

Unnamed: 0,aic_GARCH,aic_HMM_univ,aic_HMM_multiv,aic_VAR_multiv,aic_VAR_with_vol,bic_GARCH,bic_HMM_univ,bic_HMM_multiv,bic_VAR_multiv,bic_VAR_with_vol
mse,6653008.0,0.153527,0.176854,0.293429,0.417043,6653008.0,0.171918,0.184005,0.431818,0.460903
meanabs,707.5074,2.899991,3.042162,2.460573,2.84471,707.5155,2.99063,3.096038,3.09916,3.216367
medianabs,2.272801,2.207734,2.220856,1.758667,2.130095,2.28419,2.292984,2.380128,2.227783,2.206785


In [64]:
for criteria in ["aic", "bic"]:
    print(criteria)
    filtered_columns = [col for col in metrics_df.columns if criteria in col]
    for metric in metrics_df.index:
        print(metric)
        print(metrics_df[filtered_columns].loc[metric].idxmin())
        print(np.round(metrics_df[filtered_columns].loc[metric].min(), 5))
        print()
    print()

aic
mse
aic_HMM_univ
0.15353

meanabs
aic_VAR_multiv
2.46057

medianabs
aic_VAR_multiv
1.75867


bic
mse
bic_HMM_univ
0.17192

meanabs
bic_HMM_univ
2.99063

medianabs
bic_VAR_with_vol
2.20679




In [None]:
DM(p_real=0,)