## Startup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pomegranate as pm
import torch
from scipy.special import logsumexp

import logging
import os
import pickle
import warnings

In [2]:
from pomegranate.distributions import Normal
from pomegranate.hmm import DenseHMM

In [3]:
random_state = 42
np.random.seed(random_state)
# logging.captureWarnings(True)

In [4]:
from scripts.params import get_params
from scripts.aux_functions import (
    generate_columns,
    save_as_pickle,
    get_all_results_matching,
    clean_modelname,
)

params = get_params()

## Data Retrieval

In [5]:
dataroute = params["dataroute"]
resultsroute = params["resultsroute"]
dumproute = params["dumproute"]

In [6]:
name = f'finaldf_train_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df = pickle.load(handle)

In [7]:
df.head()

Unnamed: 0,^BVSP_rets,^BVSP_log_rets,^BVSP_gk_vol,VALE3.SA_rets,VALE3.SA_log_rets,VALE3.SA_gk_vol,VALE_rets,VALE_log_rets,VALE_gk_vol,PETR3.SA_rets,...,ABEV3.SA_gk_vol,ABEV_rets,ABEV_log_rets,ABEV_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,^BVSP_USD_rets,^BVSP_USD_log_rets,^BVSP_USD_gk_vol
2013-01-03,0.012182,0.012109,0.000218,-0.017007,-0.017153,0.00019,-0.011168,-0.011231,0.000204,0.037298,...,0.000185,0.00692,0.006896,0.000123,0.005423,0.005409,5e-06,0.008609,0.008572,0.000218
2013-01-04,-0.012462,-0.01254,0.000163,-0.015455,-0.015576,0.000512,-0.008471,-0.008507,0.000265,0.003401,...,0.00027,0.000711,0.000711,5.6e-05,-0.00911,-0.009152,0.000127,-0.012968,-0.013053,0.000163
2013-01-07,-0.009437,-0.009481,0.00018,-0.019681,-0.019878,0.000541,-0.01851,-0.018683,0.000324,-0.013075,...,0.000146,-0.007814,-0.007845,6.5e-05,0.002544,0.002541,5.6e-05,-0.004489,-0.004499,0.00018
2013-01-08,-0.012998,-0.013083,0.00025,-0.007887,-0.007919,0.000184,-0.01499,-0.015104,0.000108,-0.02846,...,0.000141,0.005967,0.005949,6.1e-05,0.002794,0.00279,3e-05,-0.017548,-0.017704,0.00025
2013-01-09,0.007378,0.007351,8.7e-05,0.004577,0.004567,0.000137,0.001964,0.001962,0.000136,0.010101,...,0.000309,0.007117,0.007092,3.7e-05,0.003096,0.003092,2.8e-05,0.009302,0.009259,8.7e-05


## HMM Training

In [8]:
range_states = range(1, 16)
emptydf = pd.DataFrame(columns=["AIC", "BIC"], index=range_states)
emptydf.fillna(np.inf, inplace=True)
results_dict_df = {stock: emptydf for stock in params["tickerlist"]}

In [9]:
def from_df_to_reshaped(data: pd.DataFrame):
    npdata = data.values
    data_reshaped = npdata[:, :, np.newaxis]
    return data_reshaped

In [10]:
def GaussianHMM(data_reshaped: np.ndarray, n_state: int):
    model = DenseHMM(distributions=[Normal() for _ in range(n_state)], sample_length=1)

    res = model.fit(data_reshaped)
    return res

In [11]:
cols = ["VALE3.SA_log_rets", "VALE3.SA_gk_vol", "USD_log_rets", "USD_gk_vol"]
data = df[cols]
data_reshaped = from_df_to_reshaped(data)
res = GaussianHMM(data_reshaped, n_state=3)

In [12]:
def n_params(res: pm.hmm.dense_hmm.DenseHMM):
    n_dist = res.n_distributions
    params_from_dists = n_dist * 2  # mean and variance for Normal
    transmat_elements = n_dist * (
        n_dist - 1
    )  # square matrix (minus last row bc must sum to one)
    n_params = params_from_dists + transmat_elements
    return n_params

In [13]:
def get_aic(res: pm.hmm.dense_hmm.DenseHMM, data: np.ndarray):
    """
    Log Likelihood of the model is the Logsumexp of the log likelihood
    see https://stats.stackexchange.com/questions/60902/how-to-calculate-the-log-likelihood-in-hmm-from-the-output-of-the-forward-algori
    """
    aic = 2 * n_params(res) - 2 * logsumexp(res.log_probability(data))
    return aic

In [14]:
def get_bic(res: pm.hmm.dense_hmm.DenseHMM, data: np.ndarray):
    """
    bic = k * np.log(len(data)) - 2 * model.log_likelihood(data)
    """
    bic = n_params(res) * np.log(len(data)) - 2 * logsumexp(res.log_probability(data))
    return bic

In [15]:
prediction = res.predict_proba(from_df_to_reshaped(data))
print(
    prediction,
    """
esto es un array
en donde cada matriz (separada por espacios) coresponde a una observacion (un dia)
cada fila de esa matriz (horizontalmente suma 1) la probabilidad
cada columna un estado
""",
)

tensor([[[0.0000e+00, 8.1205e-01, 1.8795e-01],
         [9.9910e-01, 1.3628e-18, 8.9840e-04],
         [0.0000e+00, 9.9973e-01, 2.7367e-04],
         [1.0000e+00, 7.3519e-25, 8.5546e-09]],

        [[0.0000e+00, 8.3456e-01, 1.6544e-01],
         [9.9837e-01, 2.3580e-18, 1.6286e-03],
         [0.0000e+00, 9.9947e-01, 5.2770e-04],
         [1.0000e+00, 1.0840e-24, 1.2614e-08]],

        [[0.0000e+00, 7.5852e-01, 2.4148e-01],
         [9.9744e-01, 3.9464e-18, 2.5555e-03],
         [3.0526e-27, 9.9932e-01, 6.7720e-04],
         [1.0000e+00, 1.5816e-24, 1.8405e-08]],

        ...,

        [[0.0000e+00, 8.9881e-01, 1.0119e-01],
         [9.9825e-01, 1.9912e-11, 1.7540e-03],
         [2.6507e-11, 9.9953e-01, 4.6820e-04],
         [1.0000e+00, 2.5394e-23, 1.4433e-08]],

        [[0.0000e+00, 4.1683e-01, 5.8317e-01],
         [6.4123e-01, 5.2219e-16, 3.5877e-01],
         [0.0000e+00, 8.9471e-01, 1.0529e-01],
         [1.0000e+00, 2.6601e-22, 3.0953e-06]],

        [[0.0000e+00, 9.0090e-01, 9.

In [16]:
print(prediction.argmax(axis=1))
print("""esto not really me hace falta""")

tensor([[3, 2, 0],
        [3, 2, 0],
        [3, 2, 0],
        ...,
        [3, 2, 0],
        [3, 2, 0],
        [3, 2, 0]])
esto not really me hace falta


In [26]:
def select_best(data: pd.DataFrame, max_states=15):

    aic = {"criterion": np.inf, "best_model": None, "n_state": None}
    bic = {"criterion": np.inf, "best_model": None, "n_state": None}

    data_reshaped = from_df_to_reshaped(data)

    for num_states in range(2, max_states + 1):
        res = GaussianHMM(data_reshaped, n_state=num_states)

        aic_result = get_aic(res, data_reshaped)
        bic_result = get_bic(res, data_reshaped)

        if aic_result < aic["criterion"]:
            aic["criterion"] = aic_result
            aic["best_model"] = res
            aic["n_state"] = num_states
        if bic_result < bic["criterion"]:
            bic["criterion"] = bic_result
            bic["best_model"] = res
            bic["n_state"] = num_states

    return aic, bic

In [27]:
aic, bic = select_best(data, 10)

In [28]:
def find_best_all_assets(
    df: pd.DataFrame,
    max_states: int = 10,
    contains_vol: bool = False,
    contains_USD: bool = False,
):
    best = {stock: {"aic": None, "bic": None} for stock in params["assetlist"]}

    for stock in params["assetlist"]:
        print(stock)
        cols = generate_columns(
            stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
        )
        aic, bic = select_best(df[cols], max_states=max_states)
        best[stock]["aic"] = aic
        best[stock]["bic"] = bic

    return best

In [29]:
df[["USD_^BVSP_log_rets", "USD_^BVSP_gk_vol"]] = df[
    ["^BVSP_log_rets", "^BVSP_gk_vol"]
].copy()
# transitorio pq issue #71

In [30]:
best_with_vol = find_best_all_assets(
    df, max_states=10, contains_vol=True, contains_USD=False
)
# this cell sometimes crashes unexpectedly - just run again

^BVSP
USD_^BVSP
VALE3.SA
VALE
PETR3.SA
PBR
EMBR3.SA
ERJ
ABEV3.SA
ABEV


In [31]:
best_multiv = find_best_all_assets(
    df, max_states=10, contains_vol=True, contains_USD=True
)

^BVSP
USD_^BVSP
VALE3.SA
VALE
PETR3.SA
PBR
EMBR3.SA
ERJ
ABEV3.SA
ABEV


In [32]:
best_multiv

{'^BVSP': {'aic': {'criterion': -32.508628845214844,
   'best_model': DenseHMM(
     (start): Silent()
     (end): Silent()
     (distributions): ModuleList(
       (0-2): 3 x Normal()
     )
   ),
   'n_state': 3},
  'bic': {'criterion': 7.570280750408855,
   'best_model': DenseHMM(
     (start): Silent()
     (end): Silent()
     (distributions): ModuleList(
       (0-1): 2 x Normal()
     )
   ),
   'n_state': 2}},
 'USD_^BVSP': {'aic': {'criterion': -32.511051177978516,
   'best_model': DenseHMM(
     (start): Silent()
     (end): Silent()
     (distributions): ModuleList(
       (0-2): 3 x Normal()
     )
   ),
   'n_state': 3},
  'bic': {'criterion': 6.964316089764324,
   'best_model': DenseHMM(
     (start): Silent()
     (end): Silent()
     (distributions): ModuleList(
       (0-1): 2 x Normal()
     )
   ),
   'n_state': 2}},
 'VALE3.SA': {'aic': {'criterion': -28.513328552246094,
   'best_model': DenseHMM(
     (start): Silent()
     (end): Silent()
     (distributions): Mod

# Generating out of sample data

In [33]:
name = f'finaldf_test_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df_test = pickle.load(handle)

In [34]:
def return_residuals(actual: pd.DataFrame, forecasts: pd.DataFrame):
    residuals = actual - forecasts
    return residuals

In [35]:
prob_matrix = res.predict_proba(data_reshaped)[-1]

prob_states = prob_matrix.sum(axis=0) / prob_matrix.sum()

print(prob_states)

tensor([0.4999, 0.4752, 0.0249])


In [38]:
param_means = [dist.means for dist in res.distributions]

param_tensor = torch.cat(param_means, dim=0)

result = torch.dot(prob_states, param_tensor)

print(prob_states)

print(param_tensor)

print(result*100)



tensor([0.4999, 0.4752, 0.0249])
tensor([0.0002, 0.0001, 0.0013])
tensor(0.0186)


In [55]:
n_state=3
insample_data=df.copy()
oos_data=df_test.copy()

columns = oos_data.columns

split_date = oos_data.index[0]
dates_to_forecast = len(oos_data.index)

probabilities = pd.DataFrame(columns=range(n_state), index=oos_data.index)
forecasts = pd.DataFrame(columns=oos_data.columns, index=oos_data.index)

full_data = pd.concat([insample_data, oos_data])
index = full_data.index
end_loc = np.where(index >= split_date)[0].min()
# esto es un int del iloc
# preciso usar ints de iloc porque el timedelta se me va a romper con el fin de semana
rolling_window = 252

model_list = []

In [57]:
for i in range(1, dates_to_forecast):
    # recursive window forecasting
    date_of_first_forecast = full_data.index[end_loc + i - 1]

    fitstart = end_loc - rolling_window + i
    fitend = end_loc + i

    # fit model with last year
    fit_data = full_data.iloc[fitstart:fitend][columns]
    reshaped_fit_data= from_df_to_reshaped(fit_data)
    
    res = GaussianHMM(data_reshaped=reshaped_fit_data, n_state=n_state)
    model_list.append(res)
    
    prob_matrix = res.predict_proba(data_reshaped)[-1]
    
    last_day_state_probs = prob_matrix.sum(axis=0) / prob_matrix.sum()
    # hotfix véase https://github.com/alfsn/regime-switching-hmm/issues/72

    probabilities.loc[date_of_first_forecast] = last_day_state_probs
    
    param_means = [dist.means for dist in res.distributions]

    param_tensor = torch.cat(param_means, dim=0)

    expected_means = torch.dot(prob_states, param_tensor)
    
    forecasts.loc[date_of_first_forecast] = expected_means

forecasts.fillna(method="ffill", inplace=True)

residuals = return_residuals(oos_data, forecasts)
    

  forecasts.fillna(method="ffill", inplace=True)


In [58]:
residuals

Unnamed: 0,^BVSP_rets,^BVSP_log_rets,^BVSP_gk_vol,VALE3.SA_rets,VALE3.SA_log_rets,VALE3.SA_gk_vol,VALE_rets,VALE_log_rets,VALE_gk_vol,PETR3.SA_rets,...,ABEV3.SA_gk_vol,ABEV_rets,ABEV_log_rets,ABEV_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,^BVSP_USD_rets,^BVSP_USD_log_rets,^BVSP_USD_gk_vol
2023-06-01,0.015999,0.015790,-0.004184,0.016571,0.016350,-0.004270,0.023805,0.023410,-0.004097,0.024514,...,-0.004455,0.006167,0.006110,-0.004423,-0.018570,-0.018669,-0.004550,0.026113,0.025651,-0.004184
2023-06-02,0.012704,0.012543,-0.004949,0.037342,0.036457,-0.005247,0.043758,0.042591,-0.005202,0.006654,...,-0.004987,0.023047,0.022652,-0.004862,-0.014379,-0.014421,-0.005322,0.026191,0.025705,-0.004949
2023-06-05,-0.008152,-0.008153,-0.009307,-0.017621,-0.017655,-0.008904,-0.014495,-0.014508,-0.009124,-0.003132,...,-0.009300,-0.016275,-0.016299,-0.009265,-0.010179,-0.010179,-0.009375,-0.004245,-0.004258,-0.009307
2023-06-06,0.021174,0.021031,0.004468,0.007455,0.007450,0.004300,0.013742,0.013697,0.004295,0.024445,...,0.004876,0.038912,0.038323,0.005251,-0.002692,-0.002716,0.004241,0.022537,0.022370,0.004468
2023-06-07,0.011652,0.011623,0.004084,0.019820,0.019696,0.004263,0.018547,0.018442,0.004147,0.035051,...,0.004322,0.007347,0.007341,0.004269,0.005847,0.005846,0.004019,0.016527,0.016449,0.004084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-01,0.009031,0.009013,0.003053,0.021520,0.021350,0.003085,0.027635,0.027336,0.003398,-0.010294,...,0.003875,0.043114,0.042329,0.003350,-0.005152,-0.005185,0.002973,0.016266,0.016178,0.003053
2023-12-04,-0.006892,-0.006944,0.003374,-0.019212,-0.019468,0.003487,-0.033829,-0.034535,0.003763,-0.016368,...,0.003283,-0.007270,-0.007326,0.003317,0.015698,0.015621,0.003294,-0.017388,-0.017605,0.003374
2023-12-05,0.003778,0.003778,0.003021,-0.006258,-0.006301,0.003066,-0.002416,-0.002430,0.003082,-0.004414,...,0.003261,0.020720,0.020565,0.003331,-0.000147,-0.000152,0.002990,0.001876,0.001875,0.003021
2023-12-06,-0.006981,-0.007032,0.003261,-0.003483,-0.003505,0.003502,0.001068,0.001065,0.003472,-0.020651,...,0.003679,-0.010832,-0.010930,0.003224,-0.002270,-0.002285,0.003126,-0.002260,-0.002274,0.003261


In [63]:
def generate_samples_residuals(n_state, insample_data, oos_data):
    """
    This function only requires the number of normal distributions, which may be acquired from len(res.distributions)
    """
    # res.predict_proba(data_reshaped)[-1] es la matriz de cada estado
    columns = oos_data.columns

    split_date = oos_data.index[0]
    dates_to_forecast = len(oos_data.index)

    probabilities = pd.DataFrame(columns=range(n_state), index=oos_data.index)
    forecasts = pd.DataFrame(columns=oos_data.columns, index=oos_data.index)

    full_data = pd.concat([insample_data, oos_data])
    index = full_data.index
    end_loc = np.where(index >= split_date)[0].min()
    # esto es un int del iloc
    # preciso usar ints de iloc porque el timedelta se me va a romper con el fin de semana
    rolling_window = 252

    model_list = []

    for i in range(1, dates_to_forecast):
        # recursive window forecasting
        date_of_first_forecast = full_data.index[end_loc + i - 1]

        fitstart = end_loc - rolling_window + i
        fitend = end_loc + i

        # fit model with last year
        fit_data = full_data.iloc[fitstart:fitend][columns]
        reshaped_fit_data= from_df_to_reshaped(fit_data)
        
        res = GaussianHMM(data_reshaped=reshaped_fit_data, n_state=n_state)
        model_list.append(res)
        
        prob_matrix = res.predict_proba(data_reshaped)[-1]
        
        last_day_state_probs = prob_matrix.sum(axis=0) / prob_matrix.sum()
        # hotfix véase https://github.com/alfsn/regime-switching-hmm/issues/72

        probabilities.loc[date_of_first_forecast] = last_day_state_probs
        
        param_means = [dist.means for dist in res.distributions]

        param_tensor = torch.cat(param_means, dim=0)

        expected_means = torch.dot(prob_states, param_tensor)
        
        forecasts.loc[date_of_first_forecast] = expected_means

    forecasts.fillna(method="ffill", inplace=True)

    residuals = return_residuals(oos_data, forecasts)

    return probabilities, forecasts, residuals
        

In [64]:
probabilities, forecasts, residuals = generate_samples_residuals(3, df[cols], df_test[cols])

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130


  forecasts.fillna(method="ffill", inplace=True)


In [22]:
def generate_and_save_samples(
    best_model_dict: dict,
    modeltype: str,
    criterion: str,
    insample_data: pd.DataFrame,
    oos_data: pd.DataFrame,
    tickerlist: list,
    contains_vol: bool,
    contains_USD: bool,
):
    probabilities = {stock: None for stock in tickerlist}
    forecasts = {stock: None for stock in tickerlist}
    residuals = {stock: None for stock in tickerlist}
    failed = {stock: None for stock in tickerlist}

    print(">" * 10, modeltype, criterion)

    for stock in tickerlist:
        print(stock)
        columns = generate_columns(
            stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
        )

        proba, fcast, resid, fails = generate_HMM_samples_residuals(
            best_model_dict[stock],
            insample_data=insample_data[columns],
            oos_data=oos_data[columns],
        )

        probabilities[stock] = proba
        forecasts[stock] = fcast
        residuals[stock] = resid
        failed[stock] = fails

    save_as_pickle(
        data=forecasts,
        resultsroute=params["resultsroute"],
        model_type=f"HMM_{modeltype}",
        tablename=params["tablename"],
        criterion=criterion,
        type_save="forecasts",
    )

    save_as_pickle(
        data=residuals,
        resultsroute=params["resultsroute"],
        model_type=f"HMM_{modeltype}",
        tablename=params["tablename"],
        criterion=criterion,
        type_save="residuals",
    )

    save_as_pickle(
        data=failed,
        resultsroute=params["resultsroute"],
        model_type=f"HMM_{modeltype}",
        tablename=params["tablename"],
        criterion=criterion,
        type_save="model_fails",
    )

In [23]:
models_dict = {
    "aic": {
        "univ": (aic_best_model_univ, False, False),
        "with_vol": (aic_best_model_with_vol, True, False),
        "multiv": (aic_best_model_multi, True, True),
    },
    "bic": {
        "univ": (bic_best_model_univ, False, False),
        "with_vol": (bic_best_model_with_vol, True, False),
        "multiv": (bic_best_model_multi, True, True),
    },
}

In [24]:
for criterion, type_dict in models_dict.items():
    for modeltype, tupla in type_dict.items():
        best_dict, contains_vol, contains_USD = tupla
        try:
            generate_and_save_samples(
                best_model_dict=best_dict,
                modeltype=modeltype,
                criterion=criterion,
                insample_data=df,
                oos_data=df_test,
                tickerlist=params["tickerlist"],
                contains_vol=contains_vol,
                contains_USD=contains_USD,
            )
        except UnboundLocalError:
            print(f"MODEL FALILURE: {criterion}, {modeltype}")

>>>>>>>>>> univ aic
^BVSP
failed models:  0
VALE3.SA
failed models:  0
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> with_vol aic
^BVSP
failed models:  0
VALE3.SA
failed models:  0
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> multiv aic
^BVSP
failed models:  0
VALE3.SA
failed models:  3
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> univ bic
^BVSP
failed models:  0
VALE3.SA
failed models:  0
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> with_vol bic
^B

In [25]:
file = f"""HMM_multiv_{params["tablename"]}_aic_best_residuals.pickle"""
with open(os.path.join(resultsroute, file), "rb") as f:
    opened_pickle = pickle.load(f)

In [26]:
opened_pickle[params["index"]].tail()

Unnamed: 0,^BVSP_log_rets,^BVSP_gk_vol,USD_log_rets,USD_gk_vol
2023-12-01,0.005412,-0.000107,-0.007433,-2.4e-05
2023-12-04,-0.010738,-7.3e-05,0.012951,1e-05
2023-12-05,0.000191,-0.000159,-0.002534,-2.8e-05
2023-12-06,-0.01066,-3.3e-05,-0.004736,-8e-06
2023-12-07,0.002553,-0.000152,0.000697,5e-06


In [7]:
fails_dict = get_all_results_matching(resultsroute, ["fail"])

{'HMM_multiv_BR_^BVSP_aic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_multiv_BR_^BVSP_aic_best_model_fails.pickle', 'HMM_multiv_BR_^BVSP_bic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_multiv_BR_^BVSP_bic_best_model_fails.pickle', 'HMM_univ_BR_^BVSP_aic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_univ_BR_^BVSP_aic_best_model_fails.pickle', 'HMM_univ_BR_^BVSP_bic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_univ_BR_^BVSP_bic_best_model_fails.pickle', 'HMM_with_vol_BR_^BVSP_aic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_with_vol_BR_^BVSP_aic_best_model_fails.pickle', 'HMM_with_vol_BR_^BVSP_bic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_with_vol_BR_^BVSP_bic_best_model_fails.pickle'}


In [13]:
fails_df = pd.DataFrame()
for name, dir in fails_dict.items():
    dict_with_dfs = pd.read_pickle(dir)
    colname = clean_modelname(
        name, substring_to_replace="model_fails", tablename=params["tablename"]
    )
    fails_df[colname] = dict_with_dfs
    os.remove(dir)

fails_df = fails_df / len(df_test.index)
fails_df.to_csv(
    path_or_buf=os.path.join(
        params["resultsroute"], f"""HMM_{params["tablename"]}_fails.csv"""
    )
)

# Graficando

In [27]:
def plot_close_rets_vol(model, data, key, IC):
    prediction = model.predict(data)
    states = set(prediction)

    fig = plt.figure(figsize=(20, 20))
    plt.tight_layout()
    plt.title(
        f"{key} Log returns and intraday Vol\n{model.n_components} states / best by {IC}"
    )

    for subplot, var in zip(range(1, 3), data.columns):
        plt.subplot(2, 1, subplot)
        for i in set(prediction):
            state = prediction == i
            x = data.index[state]
            y = data[var].iloc[state]
            plt.plot(x, y, ".")
        plt.legend(states, fontsize=16)

        plt.grid(True)
        plt.xlabel("datetime", fontsize=16)
        plt.ylabel(var, fontsize=16)

    plt.savefig(os.path.join(resultsroute, "graphs", f"HMM", f"{key}_model_{IC}.png"))

In [28]:
# for dictionary, IC in zip([aic_best_model, bic_best_model], ["AIC", "BIC"]):
#    for key, model in dictionary.items():
#        columns = [f"{stock}_log_rets", f"{stock}_gk_vol"]
#        insample_data = df[columns]
#        oos_data = df_test[columns]
#        train_end = insample_data.index.max()
#        data = pd.concat([insample_data, oos_data])
#
#        plot_close_rets_vol(model, data, key, IC)