## Startup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pomegranate as pm

import logging
import os
import pickle
import warnings

In [2]:
from pomegranate.distributions import Uniform, Normal
from pomegranate.hmm import DenseHMM

In [3]:
random_state = 42
np.random.seed(random_state)
# logging.captureWarnings(True)

In [4]:
from scripts.params import get_params
from scripts.aux_functions import generate_columns, save_as_pickle, get_all_results_matching, clean_modelname

params = get_params()

## Data Retrieval

In [5]:
dataroute = params["dataroute"]
resultsroute = params["resultsroute"]
dumproute = params["dumproute"]

In [6]:
name = f'finaldf_train_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df = pickle.load(handle)

In [7]:
df.head()

Unnamed: 0,^BVSP_rets,^BVSP_log_rets,^BVSP_gk_vol,VALE3.SA_rets,VALE3.SA_log_rets,VALE3.SA_gk_vol,VALE_rets,VALE_log_rets,VALE_gk_vol,PETR3.SA_rets,...,ABEV3.SA_gk_vol,ABEV_rets,ABEV_log_rets,ABEV_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,^BVSP_USD_rets,^BVSP_USD_log_rets,^BVSP_USD_gk_vol
2013-01-03,0.012182,0.012109,0.000218,-0.017007,-0.017153,0.00019,-0.011168,-0.011231,0.000204,0.037298,...,0.000185,0.00692,0.006896,0.000123,0.005423,0.005409,5e-06,0.008609,0.008572,0.000218
2013-01-04,-0.012462,-0.01254,0.000163,-0.015455,-0.015576,0.000512,-0.008471,-0.008507,0.000265,0.003401,...,0.00027,0.000711,0.000711,5.6e-05,-0.00911,-0.009152,0.000127,-0.012968,-0.013053,0.000163
2013-01-07,-0.009437,-0.009481,0.00018,-0.019681,-0.019878,0.000541,-0.01851,-0.018683,0.000324,-0.013075,...,0.000146,-0.007814,-0.007845,6.5e-05,0.002544,0.002541,5.6e-05,-0.004489,-0.004499,0.00018
2013-01-08,-0.012998,-0.013083,0.00025,-0.007887,-0.007919,0.000184,-0.01499,-0.015104,0.000108,-0.02846,...,0.000141,0.005967,0.005949,6.1e-05,0.002794,0.00279,3e-05,-0.017548,-0.017704,0.00025
2013-01-09,0.007378,0.007351,8.7e-05,0.004577,0.004567,0.000137,0.001964,0.001962,0.000136,0.010101,...,0.000309,0.007117,0.007092,3.7e-05,0.003096,0.003092,2.8e-05,0.009302,0.009259,8.7e-05


## HMM Training

In [8]:
range_states = range(1, 16)
emptydf = pd.DataFrame(columns=["AIC", "BIC"], index=range_states)
emptydf.fillna(np.inf, inplace=True)
results_dict_df = {stock: emptydf for stock in params["tickerlist"]}

In [9]:
#np.array([[df[cols].values]]).shape

In [10]:
import torch

In [11]:
help(Normal)

Help on class Normal in module pomegranate.distributions.normal:

class Normal(pomegranate.distributions._distribution.Distribution)
 |  Normal(means=None, covs=None, covariance_type='full', min_cov=None, inertia=0.0, frozen=False, check_data=True)
 |  
 |  A normal distribution object.
 |  
 |  A normal distribution models the probability of a variable occurring under
 |  a bell-shaped curve. It is described by a vector of mean values and a
 |  covariance value that can be zero, one, or two dimensional. This
 |  distribution can assume that features are independent of the others if
 |  the covariance type is 'diag' or 'sphere', but if the type is 'full' then
 |  the features are not independent.
 |  
 |  There are two ways to initialize this object. The first is to pass in
 |  the tensor of probability parameters, at which point they can immediately be
 |  used. The second is to not pass in the rate parameters and then call
 |  either `fit` or `summary` + `from_summaries`, at which po

In [12]:
num_states=2

n1 = Normal(means=[0])


model = DenseHMM()
cols=["VALE3.SA_log_rets","VALE3.SA_gk_vol"]

model.add_distributions([n1 for _ in range(num_states*len(cols))])
#X = torch.randn(100, 50, 2) #
X= np.array([[df[cols].values]])
print(X.shape)

(1, 1, 2578, 2)


In [13]:
res=model.fit(X)

NEW TRY

In [15]:
data=df[cols].values

In [16]:
data.shape

(2578, 2)

In [17]:
data_reshaped = data[:, :, np.newaxis]

In [18]:
data_reshaped

array([[[-0.0171533 ],
        [ 0.00018979]],

       [[-0.01557612],
        [ 0.00051209]],

       [[-0.01987763],
        [ 0.00054097]],

       ...,

       [[-0.00756658],
        [ 0.00063023]],

       [[-0.0238272 ],
        [ 0.00108552]],

       [[-0.0073386 ],
        [ 0.00024527]]])

In [19]:
X3 = torch.randn(10, 2, 1)

In [20]:
model3 = DenseHMM([Normal(), Normal()], sample_length=1)
model3.fit(data_reshaped)

DenseHMM(
  (start): Silent()
  (end): Silent()
  (distributions): ModuleList(
    (0-1): 2 x Normal()
  )
)

In [25]:
(model3.predict(data_reshaped).sum(axis=1)==2).sum()

tensor(1763)

In [31]:
num_states=5

model = DenseHMM(distributions=[Normal() for _ in range(num_states)])

data=df[cols].values
data_reshaped=data[:, :, np.newaxis]
res=model.fit(data_reshaped)
prediction=res.predict(data_reshaped)


In [32]:
prediction

tensor([[2, 0],
        [2, 0],
        [2, 0],
        ...,
        [2, 0],
        [3, 4],
        [2, 0]])

In [45]:
def select_optimal_states(data, max_states=10):
  """
  Selects the optimal number of states for an HMM model using AIC.

  Args:
      data (list): List of observation sequences.
      max_states (int, optional): Upper limit for the number of states to explore. Defaults to 10.

  Returns:
      tuple: (number_of_states, model) - The optimal number of states and the fitted model.
  """

  best_aic = np.inf  # Initialize with positive infinity
  best_bic = np.inf
  best_model_aic = None
  best_model_bic = None

  for num_states in range(1, max_states + 1):
    # Create initial distributions, transition matrix (random), and emission with Normal Distribution
    initial_probs = Uniform(np.zeros(num_states), np.ones(num_states))
    transition_matrix = np.random.rand(num_states, num_states)
    transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)
    emission_probs = Normal()  # N(0,1)

    # Create the HMM model
    model = DenseHMM()

    # Fit the model to data
    model.fit(data)

    # Calculate AIC (same as before)
    aic = 2 * model.num_params() - 2 * model.log_likelihood(data)

    # Calculate BIC (same logic as before, but emission_params now considers mean and standard deviation)
    emission_params = 2 * len(data[0])  # Mean and standard deviation for each emission dimension

    # Number of parameters for transition matrix (excluding last row)
    transition_params = num_states * (num_states - 1)

    # Total number of free parameters
    k = model.num_states - 1 + emission_params + transition_params

    # Print statements for debugging (can be removed)
    print("num params", model.num_params())
    print("k", k)

    bic = k * np.log(len(data)) - 2 * model.log_likelihood(data)

    # Update best model if lower IC is found
    if aic < best_aic:
      best_aic = aic
      best_model_aic = model
    if bic < best_bic:
      best_bic = bic
      best_model_bic = model

  return best_aic, best_model_aic, best_bic, best_model_bic

In [46]:
# Example usage
data = [[0, 1, 0, 1], [1, 0, 1, 0]]
optimal_states, fitted_model = select_optimal_states(data)

print(f"Optimal number of states (AIC): {optimal_states}")
print("Fitted model:", fitted_model)

ValueError: Parameter X must have 2 dims

In [26]:
pm.distributions

AttributeError: module 'pomegranate' has no attribute 'distributions'

In [10]:
param_dict = {
    "covariance_type": "diag",
    "n_iter": 500,
    "random_state": random_state,
    # no voy a usar startprob_prior por devlog 20-06-23
}

In [11]:
def fit_hmm_model(
    df: pd.DataFrame,
    tickerlist: list,
    range_states,
    param_dict: dict,
    contains_vol: bool,
    contains_USD: bool,
):

    results_dict_df = {}

    for stock in tickerlist:
        results_dict_df[stock] = pd.DataFrame(
            index=range_states, columns=["AIC", "BIC"]
        )
        for nstate in range_states:
            columns = generate_columns(stock, contains_vol, contains_USD)

            insample_data = df[columns]

            model = hmm.GaussianHMM(n_components=nstate, **param_dict, verbose=False)
            results = model.fit(insample_data)

            convergence = results.monitor_.converged
            all_states_found = np.isclose(a=(model.transmat_.sum(axis=1)), b=1).all()
            startprob_check = model.startprob_.sum() == 1
            good_model = convergence and all_states_found and startprob_check

            if good_model:
                try:
                    results_dict_df[stock].loc[nstate, "AIC"] = model.aic(insample_data)
                    results_dict_df[stock].loc[nstate, "BIC"] = model.bic(insample_data)
                except ValueError:
                    pass

            else:
                print(">" * 10, f"{stock} {nstate} did not converge")
                results_dict_df[stock].loc[nstate, "AIC"] = np.inf
                results_dict_df[stock].loc[nstate, "BIC"] = np.inf

    return results_dict_df

In [12]:
results_dict_df_univ = fit_hmm_model(
    df, tickerlist, range_states, param_dict, contains_vol=False, contains_USD=False
)

>>>>>>>>>> ^BVSP 12 did not converge
>>>>>>>>>> ^BVSP 15 did not converge
>>>>>>>>>> VALE3.SA 7 did not converge
>>>>>>>>>> VALE3.SA 13 did not converge
>>>>>>>>>> VALE3.SA 15 did not converge
>>>>>>>>>> VALE 5 did not converge
>>>>>>>>>> PETR3.SA 8 did not converge
>>>>>>>>>> PETR3.SA 13 did not converge
>>>>>>>>>> PBR 5 did not converge
>>>>>>>>>> EMBR3.SA 11 did not converge
>>>>>>>>>> ERJ 6 did not converge
>>>>>>>>>> ABEV3.SA 5 did not converge
>>>>>>>>>> ABEV3.SA 6 did not converge
>>>>>>>>>> ABEV3.SA 8 did not converge
>>>>>>>>>> ABEV3.SA 12 did not converge
>>>>>>>>>> ABEV3.SA 14 did not converge
>>>>>>>>>> ABEV3.SA 15 did not converge
>>>>>>>>>> ABEV 5 did not converge
>>>>>>>>>> ABEV 8 did not converge
>>>>>>>>>> ABEV 10 did not converge
>>>>>>>>>> ABEV 15 did not converge


In [13]:
results_dict_df_with_vol = fit_hmm_model(
    df, tickerlist, range_states, param_dict, contains_vol=True, contains_USD=False
)

>>>>>>>>>> ^BVSP 5 did not converge
>>>>>>>>>> VALE3.SA 3 did not converge
>>>>>>>>>> VALE3.SA 7 did not converge
>>>>>>>>>> VALE 5 did not converge
>>>>>>>>>> VALE 7 did not converge
>>>>>>>>>> VALE 15 did not converge
>>>>>>>>>> PBR 3 did not converge
>>>>>>>>>> PBR 13 did not converge
>>>>>>>>>> PBR 15 did not converge
>>>>>>>>>> EMBR3.SA 8 did not converge
>>>>>>>>>> ERJ 2 did not converge
>>>>>>>>>> ERJ 6 did not converge
>>>>>>>>>> ABEV3.SA 3 did not converge
>>>>>>>>>> ABEV3.SA 14 did not converge
>>>>>>>>>> ABEV 2 did not converge
>>>>>>>>>> ABEV 5 did not converge
>>>>>>>>>> ABEV 15 did not converge


In [14]:
results_dict_df_multi = fit_hmm_model(
    df, tickerlist, range_states, param_dict, contains_vol=True, contains_USD=True
)

>>>>>>>>>> ^BVSP 5 did not converge
>>>>>>>>>> ^BVSP 14 did not converge
>>>>>>>>>> VALE3.SA 2 did not converge
>>>>>>>>>> VALE3.SA 5 did not converge
>>>>>>>>>> VALE3.SA 6 did not converge
>>>>>>>>>> VALE3.SA 7 did not converge
>>>>>>>>>> VALE3.SA 13 did not converge
>>>>>>>>>> VALE 12 did not converge
>>>>>>>>>> VALE 14 did not converge
>>>>>>>>>> VALE 15 did not converge
>>>>>>>>>> PETR3.SA 5 did not converge
>>>>>>>>>> PETR3.SA 6 did not converge
>>>>>>>>>> PETR3.SA 14 did not converge
>>>>>>>>>> PBR 5 did not converge
>>>>>>>>>> PBR 14 did not converge
>>>>>>>>>> PBR 15 did not converge
>>>>>>>>>> EMBR3.SA 6 did not converge
>>>>>>>>>> ERJ 3 did not converge
>>>>>>>>>> ERJ 5 did not converge
>>>>>>>>>> ABEV3.SA 5 did not converge
>>>>>>>>>> ABEV3.SA 6 did not converge
>>>>>>>>>> ABEV3.SA 7 did not converge
>>>>>>>>>> ABEV3.SA 14 did not converge
>>>>>>>>>> ABEV3.SA 15 did not converge
>>>>>>>>>> ABEV 13 did not converge
>>>>>>>>>> ABEV 14 did not converge


In [15]:
def select_best_model(
    df: pd.DataFrame,
    results_dict: dict,
    tickerlist: list,
    param_dict: dict,
    contains_vol: bool,
    contains_USD: bool,
):
    """"""
    aic_best_model = {stock: None for stock in tickerlist}
    bic_best_model = {stock: None for stock in tickerlist}

    for stock in tickerlist:
        columns = generate_columns(stock, contains_vol, contains_USD)
        insample_data = df[columns]

        best_aic_nstate = results_dict[stock]["AIC"].astype(float).idxmin()
        best_bic_nstate = results_dict[stock]["BIC"].astype(float).idxmin()

        print(
            f"For stock {stock}, best AIC: {best_aic_nstate} best BIC: {best_bic_nstate}"
        )

        aic_best_model[stock] = hmm.GaussianHMM(
            n_components=best_aic_nstate, **param_dict
        ).fit(insample_data)

        bic_best_model[stock] = hmm.GaussianHMM(
            n_components=best_bic_nstate, **param_dict
        ).fit(insample_data)

    return aic_best_model, bic_best_model

In [16]:
aic_best_model_univ, bic_best_model_univ = select_best_model(
    df=df,
    results_dict=results_dict_df_univ,
    tickerlist=tickerlist,
    param_dict=param_dict,
    contains_vol=False,
    contains_USD=False,
)

For stock ^BVSP, best AIC: 6 best BIC: 2
For stock VALE3.SA, best AIC: 5 best BIC: 2
For stock VALE, best AIC: 4 best BIC: 2
For stock PETR3.SA, best AIC: 4 best BIC: 4
For stock PBR, best AIC: 4 best BIC: 3
For stock EMBR3.SA, best AIC: 4 best BIC: 4
For stock ERJ, best AIC: 4 best BIC: 4
For stock ABEV3.SA, best AIC: 4 best BIC: 2
For stock ABEV, best AIC: 4 best BIC: 2


In [17]:
aic_best_model_with_vol, bic_best_model_with_vol = select_best_model(
    df=df,
    results_dict=results_dict_df_with_vol,
    tickerlist=tickerlist,
    param_dict=param_dict,
    contains_vol=False,
    contains_USD=False,
)

For stock ^BVSP, best AIC: 2 best BIC: 2
For stock VALE3.SA, best AIC: 2 best BIC: 2
For stock VALE, best AIC: 2 best BIC: 2
For stock PETR3.SA, best AIC: 2 best BIC: 2
For stock PBR, best AIC: 4 best BIC: 2
For stock EMBR3.SA, best AIC: 3 best BIC: 2
For stock ERJ, best AIC: 4 best BIC: 4
For stock ABEV3.SA, best AIC: 2 best BIC: 2
For stock ABEV, best AIC: 4 best BIC: 4


In [18]:
aic_best_model_multi, bic_best_model_multi = select_best_model(
    df=df,
    results_dict=results_dict_df_multi,
    tickerlist=tickerlist,
    param_dict=param_dict,
    contains_vol=False,
    contains_USD=False,
)

For stock ^BVSP, best AIC: 2 best BIC: 2
For stock VALE3.SA, best AIC: 3 best BIC: 3
For stock VALE, best AIC: 2 best BIC: 2
For stock PETR3.SA, best AIC: 2 best BIC: 2
For stock PBR, best AIC: 2 best BIC: 2
For stock EMBR3.SA, best AIC: 2 best BIC: 2
For stock ERJ, best AIC: 2 best BIC: 2
For stock ABEV3.SA, best AIC: 2 best BIC: 2
For stock ABEV, best AIC: 2 best BIC: 2


# Generating out of sample data

In [19]:
name = f'finaldf_test_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df_test = pickle.load(handle)

In [20]:
def return_residuals(actual: pd.DataFrame, forecasts: pd.DataFrame):
    residuals = (actual - forecasts)
    return residuals

In [21]:
def generate_HMM_samples_residuals(model, insample_data, oos_data):
    """_summary_

    Args:
        model (_type_): _description_
        insample_data (_type_): _description_
        oos_data (_type_): _description_
    """
    # pseudocodigo
    # agarra el mejor modelo (esto con una cantidad optima de params ya esta)
    # fittear t-j con t-j-252d
    # Darle un año de datos hasta t-j para que me prediga la secuencia (probabilidad) de estados.
    # Le pido que me prediga las probabilidades de cada estado durante el periodo t-j, t-j-252:
    # esto me da una matriz de (252 x n estados)
    # esto entiendo es https://hmmlearn.readthedocs.io/en/latest/api.html#hmmlearn.hmm.GaussianHMM.predict_proba
    # Tomo la ultima fila de la matriz
    # Multiplico esa por el vector de medias estimadas: este punto es mi forecast.
    # esto es model.means_ (!)
    nstate = model.n_components
    columns = oos_data.columns

    split_date = oos_data.index[0]
    dates_to_forecast = len(oos_data.index)

    probabilities = pd.DataFrame(columns=range(nstate), index=oos_data.index)
    forecasts = pd.DataFrame(columns=oos_data.columns, index=oos_data.index)

    full_data = pd.concat([insample_data, oos_data])
    del insample_data

    # vamos a implementar recursive window forecasting

    index = full_data.index
    end_loc = np.where(index >= split_date)[0].min()
    # esto es un int del iloc
    # preciso usar ints de iloc porque el timedelta se me va a romper con el fin de semana
    rolling_window = 252

    nstate = model.n_components
    model = hmm.GaussianHMM(n_components=nstate, **param_dict, verbose=False)

    model_list = []
    counter = 0

    for i in range(1, dates_to_forecast):
        date_of_first_forecast = full_data.index[end_loc + i - 1]

        fitstart = end_loc - rolling_window + i
        fitend = end_loc + i

        # fit model with last year
        fit_data = full_data.iloc[fitstart:fitend][columns]
        res = model.fit(fit_data)
        model_list.append(res)

        # obtenemos las probabilidades por estado del ultimo dia
        # son las probabilidades que maximizan la log/likelihood de toda la secuencia
        index = len(model_list)
        while index > 0:
            try:
                add_count = False
                last_day_state_probs = res.predict_proba(fit_data)[-1]
                probabilities.loc[date_of_first_forecast] = last_day_state_probs
                index = 0

            except ValueError:
                # this happens when startprob_ must sum to 1 (got nan)
                # si el modelo falla en el predict_proba, se utiliza el de t-1
                add_count = True
                index = index - 1
                res = model_list[index]

                if not "last_day_state_probs" in locals():
                    # this checks for failure of estimation in the first day
                    last_day_state_probs = np.full(nstate, (1 / nstate))
                    # inputs a flat prior if it has no previous day to fall back on

        if add_count:
            counter = counter + 1
        # model.means_ es es la media condicional a cada estado
        # cada columna representa cada columna del dataset
        # cada fila es un estado
        # el producto punto entre este y las probabilidades del ultimo día me da la media esperada por cada columna
        expected_means = np.dot(last_day_state_probs, model.means_)
        forecasts.loc[date_of_first_forecast] = expected_means

    pct_nan = forecasts.iloc[:, 0].isna().sum() / len(forecasts.index) * 100

    if pct_nan > 5:
        warnings.warn(f"{oos_data.columns[0]} % na: {pct_nan}")

    forecasts.fillna(method="ffill", inplace=True)

    residuals = return_residuals(oos_data, forecasts)

    print("failed models: ", counter)
    return probabilities, forecasts, residuals, counter

In [22]:
def generate_and_save_samples(
    best_model_dict: dict,
    modeltype: str,
    criterion: str,
    insample_data: pd.DataFrame,
    oos_data: pd.DataFrame,
    tickerlist: list,
    contains_vol: bool,
    contains_USD: bool,
):
    probabilities = {stock: None for stock in tickerlist}
    forecasts = {stock: None for stock in tickerlist}
    residuals = {stock: None for stock in tickerlist}
    failed = {stock: None for stock in tickerlist}

    print(">" * 10, modeltype, criterion)

    for stock in tickerlist:
        print(stock)
        columns = generate_columns(
            stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
        )

        proba, fcast, resid, fails = generate_HMM_samples_residuals(
            best_model_dict[stock],
            insample_data=insample_data[columns],
            oos_data=oos_data[columns],
        )

        probabilities[stock] = proba
        forecasts[stock] = fcast
        residuals[stock] = resid
        failed[stock] = fails

    save_as_pickle(
        data=forecasts,
        resultsroute=params["resultsroute"],
        model_type=f"HMM_{modeltype}",
        tablename=params["tablename"],
        criterion=criterion,
        type_save="forecasts",
    )

    save_as_pickle(
        data=residuals,
        resultsroute=params["resultsroute"],
        model_type=f"HMM_{modeltype}",
        tablename=params["tablename"],
        criterion=criterion,
        type_save="residuals",
    )

    save_as_pickle(
        data=failed,
        resultsroute=params["resultsroute"],
        model_type=f"HMM_{modeltype}",
        tablename=params["tablename"],
        criterion=criterion,
        type_save="model_fails",
    )

In [23]:
models_dict = {
    "aic": {
        "univ": (aic_best_model_univ, False, False),
        "with_vol": (aic_best_model_with_vol, True, False),
        "multiv": (aic_best_model_multi, True, True),
    },
    "bic": {
        "univ": (bic_best_model_univ, False, False),
        "with_vol": (bic_best_model_with_vol, True, False),
        "multiv": (bic_best_model_multi, True, True),
    },
}

In [24]:
for criterion, type_dict in models_dict.items():
    for modeltype, tupla in type_dict.items():
        best_dict, contains_vol, contains_USD = tupla
        try:
            generate_and_save_samples(
                best_model_dict=best_dict,
                modeltype=modeltype,
                criterion=criterion,
                insample_data=df,
                oos_data=df_test,
                tickerlist=params["tickerlist"],
                contains_vol=contains_vol,
                contains_USD=contains_USD,
            )
        except UnboundLocalError:
            print(f"MODEL FALILURE: {criterion}, {modeltype}")

>>>>>>>>>> univ aic
^BVSP
failed models:  0
VALE3.SA
failed models:  0
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> with_vol aic
^BVSP
failed models:  0
VALE3.SA
failed models:  0
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> multiv aic
^BVSP
failed models:  0
VALE3.SA
failed models:  3
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> univ bic
^BVSP
failed models:  0
VALE3.SA
failed models:  0
VALE
failed models:  0
PETR3.SA
failed models:  0
PBR
failed models:  0
EMBR3.SA
failed models:  0
ERJ
failed models:  0
ABEV3.SA
failed models:  0
ABEV
failed models:  0
>>>>>>>>>> with_vol bic
^B

In [25]:
file=f"""HMM_multiv_{params["tablename"]}_aic_best_residuals.pickle"""
with open(os.path.join(resultsroute, file), "rb") as f:
    opened_pickle=pickle.load(f)

In [26]:
opened_pickle[params["index"]].tail()


Unnamed: 0,^BVSP_log_rets,^BVSP_gk_vol,USD_log_rets,USD_gk_vol
2023-12-01,0.005412,-0.000107,-0.007433,-2.4e-05
2023-12-04,-0.010738,-7.3e-05,0.012951,1e-05
2023-12-05,0.000191,-0.000159,-0.002534,-2.8e-05
2023-12-06,-0.01066,-3.3e-05,-0.004736,-8e-06
2023-12-07,0.002553,-0.000152,0.000697,5e-06


In [7]:
fails_dict=get_all_results_matching(resultsroute, ["fail"])

{'HMM_multiv_BR_^BVSP_aic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_multiv_BR_^BVSP_aic_best_model_fails.pickle', 'HMM_multiv_BR_^BVSP_bic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_multiv_BR_^BVSP_bic_best_model_fails.pickle', 'HMM_univ_BR_^BVSP_aic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_univ_BR_^BVSP_aic_best_model_fails.pickle', 'HMM_univ_BR_^BVSP_bic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_univ_BR_^BVSP_bic_best_model_fails.pickle', 'HMM_with_vol_BR_^BVSP_aic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_with_vol_BR_^BVSP_aic_best_model_fails.pickle', 'HMM_with_vol_BR_^BVSP_bic_best_model_fails.pickle': '..\\results\\BR_^BVSP\\HMM_with_vol_BR_^BVSP_bic_best_model_fails.pickle'}


In [13]:
fails_df=pd.DataFrame()
for name, dir in fails_dict.items():
    dict_with_dfs = pd.read_pickle(dir)
    colname = clean_modelname(name, substring_to_replace="model_fails", tablename=params["tablename"])
    fails_df[colname]=dict_with_dfs
    os.remove(dir)

fails_df=fails_df/len(df_test.index)
fails_df.to_csv(path_or_buf=os.path.join(params["resultsroute"], f"""HMM_{params["tablename"]}_fails.csv"""))

# Graficando

In [27]:
def plot_close_rets_vol(model, data, key, IC):
    prediction = model.predict(data)
    states = set(prediction)

    fig = plt.figure(figsize=(20, 20))
    plt.tight_layout()
    plt.title(
        f"{key} Log returns and intraday Vol\n{model.n_components} states / best by {IC}"
    )

    for subplot, var in zip(range(1, 3), data.columns):
        plt.subplot(2, 1, subplot)
        for i in set(prediction):
            state = prediction == i
            x = data.index[state]
            y = data[var].iloc[state]
            plt.plot(x, y, ".")
        plt.legend(states, fontsize=16)

        plt.grid(True)
        plt.xlabel("datetime", fontsize=16)
        plt.ylabel(var, fontsize=16)

    plt.savefig(os.path.join(resultsroute, "graphs", f"HMM", f"{key}_model_{IC}.png"))

In [28]:
# for dictionary, IC in zip([aic_best_model, bic_best_model], ["AIC", "BIC"]):
#    for key, model in dictionary.items():
#        columns = [f"{stock}_log_rets", f"{stock}_gk_vol"]
#        insample_data = df[columns]
#        oos_data = df_test[columns]
#        train_end = insample_data.index.max()
#        data = pd.concat([insample_data, oos_data])
#
#        plot_close_rets_vol(model, data, key, IC)