## Startup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm

import os
import pickle

In [2]:
random_state=42
np.random.seed(random_state)

In [3]:
from scripts.params import get_params

params = get_params()

## Data Retrieval

In [4]:
dataroute=os.path.join("..",  "data")
dumproute=os.path.join("..",  "dump")
resultsroute=os.path.join("..",  "results")

In [5]:
name=f'finaldf_train_{params["tablename"]}.pickle'
filename=os.path.join(dataroute, name)
with open(filename, 'rb') as handle:
    df=pickle.load(handle)


In [6]:
df.head()

Unnamed: 0,^MERV_rets,^MERV_log_rets,^MERV_gk_vol,GGAL.BA_rets,GGAL.BA_log_rets,GGAL.BA_gk_vol,GGAL_rets,GGAL_log_rets,GGAL_gk_vol,YPFD.BA_rets,...,BBAR.BA_gk_vol,BBAR_rets,BBAR_log_rets,BBAR_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,USD_^MERV_rets,USD_^MERV_log_rets,USD_^MERV_gk_vol
2013-01-03,0.007552,0.007524,0.000129,0.010616,0.01056,0.000677,-0.012748,-0.01283,0.001228,-0.006862,...,0.000169,-0.005725,-0.005742,0.00096,0.00883,0.008792,1.4e-05,0.001247,0.001246,0.000129
2013-01-04,0.007092,0.007067,0.000158,-0.006303,-0.006323,0.000208,-0.010043,-0.010094,0.000554,0.004936,...,0.000406,-0.019194,-0.019381,0.000635,0.018043,0.017883,0.000133,-0.005727,-0.005744,0.000158
2013-01-07,-0.001035,-0.001035,2.2e-05,0.002114,0.002112,6.3e-05,-0.014493,-0.014599,0.000517,0.010805,...,0.000492,0.015656,0.015534,0.000511,-0.002489,-0.002492,4.8e-05,-0.009769,-0.009817,2.2e-05
2013-01-08,0.008285,0.008251,8.2e-05,-0.008439,-0.008475,0.000153,-0.016176,-0.016309,0.001085,0.049563,...,0.000438,-0.015414,-0.015534,0.000642,0.015356,0.015239,6.4e-05,-0.001117,-0.001118,8.2e-05
2013-01-09,0.017826,0.017669,0.000273,0.0,0.0,0.0,0.011958,0.011887,0.005238,0.0,...,0.0,-0.003914,-0.003921,0.000147,-0.008671,-0.008709,0.001065,0.017245,0.017098,0.000273


In [7]:
tickerlist=params["tickerlist"]

## HMM Training

In [8]:
range_states=range(1,16)
emptydf=pd.DataFrame(columns=["AIC", "BIC"], index=range_states)
emptydf.fillna(np.inf, inplace=True)
results_dict_df={stock:emptydf for stock in tickerlist}

In [9]:
aic_best_model={stock:None for stock in tickerlist}
bic_best_model={stock:None for stock in tickerlist}

In [None]:
for stock in tickerlist:
    columns = [f'{stock}_log_rets', f'{stock}_gk_vol']
    insample_data = df[columns]

    param_dict={
        "covariance_type" : "diag", 
        "n_iter" : 500,
        "random_state" : random_state
        #no voy a usar startprob_prior por devlog 20-06-23
        }

    for nstate in range_states:
        model = hmm.GaussianHMM(n_components= nstate, **param_dict, verbose=False)
        results = model.fit(insample_data)

        convergence=results.monitor_.converged
        # esta es la condición de si el modelo convergió

        all_states_found=np.isclose(a=(model.transmat_.sum(axis=1)), b=1).all()
        # esta es la condición de que todos los estados (nstates) hayan sido observados
        # si no, alguna fila en la matriz de transición del modelo son 0.
        # el errormsg es "Some rows of transmat_ have zero sum because no transition from the state was ever observed".

        startprob_check = (model.startprob_.sum()==1)
        # esta es la condición de que los estados al inicializar estén definidos
        
        good_model = convergence and all_states_found and startprob_check

        if good_model:
            try:
                results_dict_df[stock].loc[nstate, "AIC"]=model.aic(insample_data)
                results_dict_df[stock].loc[nstate, "BIC"]=model.bic(insample_data)
            except ValueError:
                pass
        else: 
            print(">"*10,f"{stock} {nstate} did not converge")
            results_dict_df[stock].loc[nstate, "BIC"]=np.inf
            results_dict_df[stock].loc[nstate, "BIC"]=np.inf

In [None]:
for stock in tickerlist:
    columns = [f'{stock}_log_rets', f'{stock}_gk_vol']
    insample_data = df[columns]
    
    best_aic_nstate=results_dict_df[stock]["AIC"].astype(float).idxmin()
    best_bic_nstate=results_dict_df[stock]["BIC"].astype(float).idxmin()
    print(f"For stock {stock}, best AIC: {best_aic_nstate} best BIC: {best_bic_nstate}")

    aic_best_model[stock]=hmm.GaussianHMM(n_components = best_aic_nstate, **param_dict).fit(insample_data)
    bic_best_model[stock]=hmm.GaussianHMM(n_components = best_bic_nstate, **param_dict).fit(insample_data)

# Generating out of sample data

In [12]:
name=f'finaldf_test_{params["tablename"]}.pickle'
filename=os.path.join(dataroute, name)
with open(filename, 'rb') as handle:
    df_test=pickle.load(handle)

In [118]:
model = hmm.GaussianHMM(n_components= 3, **param_dict, verbose=False)

In [119]:
oos_data=df_test.copy()
insample_data=df.copy()

In [120]:
split_date = oos_data.index[0]
dates_to_forecast = len(oos_data.index)

columns=["GGAL_log_rets","GGAL_gk_vol"]
forecasts=pd.DataFrame(columns=columns, index=oos_data.index)

oos_data = pd.concat([insample_data, oos_data])
del insample_data

# vamos a implementar recursive window forecasting

index = oos_data.index
end_loc = np.where(index >= split_date)[0].min()


In [121]:
end_loc

2524

In [142]:
i=1
rolling_window=252
date_of_first_forecast = oos_data.index[end_loc + i -1]

fitstart = end_loc - rolling_window + i
fitend = end_loc + i
fitend

2525

In [143]:
oos_data.iloc[fitstart:fitend].shape

(252, 39)

In [144]:
model.get_params()

{'algorithm': 'viterbi',
 'covariance_type': 'diag',
 'covars_prior': 0.01,
 'covars_weight': 1,
 'implementation': 'log',
 'init_params': 'stmc',
 'means_prior': 0,
 'means_weight': 0,
 'min_covar': 0.001,
 'n_components': 3,
 'n_iter': 500,
 'params': 'stmc',
 'random_state': 42,
 'startprob_prior': 1.0,
 'tol': 0.01,
 'transmat_prior': 1.0,
 'verbose': False}

In [145]:
df.columns

Index(['^MERV_rets', '^MERV_log_rets', '^MERV_gk_vol', 'GGAL.BA_rets',
       'GGAL.BA_log_rets', 'GGAL.BA_gk_vol', 'GGAL_rets', 'GGAL_log_rets',
       'GGAL_gk_vol', 'YPFD.BA_rets', 'YPFD.BA_log_rets', 'YPFD.BA_gk_vol',
       'YPF_rets', 'YPF_log_rets', 'YPF_gk_vol', 'EDN.BA_rets',
       'EDN.BA_log_rets', 'EDN.BA_gk_vol', 'EDN_rets', 'EDN_log_rets',
       'EDN_gk_vol', 'BMA.BA_rets', 'BMA.BA_log_rets', 'BMA.BA_gk_vol',
       'BMA_rets', 'BMA_log_rets', 'BMA_gk_vol', 'BBAR.BA_rets',
       'BBAR.BA_log_rets', 'BBAR.BA_gk_vol', 'BBAR_rets', 'BBAR_log_rets',
       'BBAR_gk_vol', 'USD_rets', 'USD_log_rets', 'USD_gk_vol',
       'USD_^MERV_rets', 'USD_^MERV_log_rets', 'USD_^MERV_gk_vol'],
      dtype='object')

In [146]:
fit_data=oos_data.iloc[fitstart:fitend][columns]
res=model.fit(fit_data)

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'
Even though the 'means_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'm'
Even though the 'covars_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'c'


In [147]:
res.predict_proba(fit_data)[-1]

array([1.97645064e-01, 8.02354936e-01, 2.70813830e-85])

In [148]:
model.means_

array([[ 0.00415891,  0.00156764],
       [-0.00128011,  0.00179141],
       [-0.06473003,  0.01143463]])

In [154]:
exp_values=np.dot(res.predict_proba(fit_data)[-1], model.means_)
exp_values

array([-0.00020512,  0.00174718])

In [155]:
date_of_first_forecast

datetime.date(2023, 6, 2)

In [158]:
forecasts.loc[date_of_first_forecast]=exp_values
forecasts

Unnamed: 0,GGAL_log_rets,GGAL_gk_vol
2023-06-02,-0.000205,0.001747
2023-06-05,,
2023-06-06,,
2023-06-07,,
2023-06-08,,
...,...,...
2023-11-24,,
2023-11-27,,
2023-11-28,,
2023-11-29,,


In [161]:
model.n_components

3

In [180]:
def generate_HMM_samples_residuals_2(model, insample_data, oos_data):
    """_summary_

    Args:
        model (_type_): _description_
        insample_data (_type_): _description_
        oos_data (_type_): _description_
    """
    # pseudocodigo
    # agarra el mejor modelo (esto con una cantidad optima de params ya esta)
    # fittear t-j con t-j-252d
    # Darle un año de datos hasta t-j para que me prediga la secuencia (probabilidad) de estados.
        # Le pido que me prediga las probabilidades de cada estado durante el periodo t-j, t-j-252: 
        # esto me da una matriz de (252 x n estados)
        # esto entiendo es https://hmmlearn.readthedocs.io/en/latest/api.html#hmmlearn.hmm.GaussianHMM.predict_proba
    # Tomo la ultima fila de la matriz
    # Multiplico esa por el vector de medias estimadas: este punto es mi forecast. 
        # esto es model.means_ (!)    
    nstate=model.n_components
    columns=oos_data.columns

    split_date = oos_data.index[0]
    dates_to_forecast = len(oos_data.index)

    probabilities=pd.DataFrame(columns=range(nstate), index=oos_data.index)
    forecasts=pd.DataFrame(columns=oos_data.columns, index=oos_data.index)

    oos_data = pd.concat([insample_data, oos_data])
    del insample_data

    # vamos a implementar recursive window forecasting
    
    index = oos_data.index
    end_loc = np.where(index >= split_date)[0].min()
    # esto es un int del iloc
    # preciso usar ints de iloc porque el timedelta se me va a romper con el fin de semana
    rolling_window = 252

    nstate=model.n_components
    model = hmm.GaussianHMM(n_components = nstate, **param_dict, verbose=False)

    for i in range(1, dates_to_forecast):
        date_of_first_forecast = oos_data.index[end_loc + i -1]
        
        fitstart = end_loc - rolling_window + i
        fitend = end_loc + i

        # fit model with last year
        fit_data=oos_data.iloc[fitstart:fitend][columns]
        res=model.fit(fit_data)
        # TODO: que pasa si fittea mal?
        
        # obtenemos las probabilidades por estado del ultimo dia
        # son las probabilidades que maximizan la log/likelihood de toda la secuencia
        try:
            last_day_state_probs = res.predict_proba(fit_data)[-1]
            probabilities.loc[date_of_first_forecast] = last_day_state_probs
        
        except ValueError:
            # this happens when startprob_ must sum to 1 (got nan)
            res=last_model
            last_day_state_probs = res.predict_proba(fit_data)[-1]
            probabilities.loc[date_of_first_forecast] = last_day_state_probs

        # model.means_ es es la media condicional a cada estado
            # cada columna representa cada columna del dataset
            # cada fila es un estado
        # el producto punto entre este y las probabilidades del ultimo día me da la media esperada por cada columna
        expected_means = np.dot(last_day_state_probs, model.means_)
        forecasts.loc[date_of_first_forecast]=expected_means
        
        last_model=res
        
    return probabilities, forecasts

In [181]:
df["GGAL_log_rets", "GGAL_gk_vol"].

SyntaxError: invalid syntax (775945070.py, line 1)

In [183]:
probabilities, forecasts= generate_HMM_samples_residuals_2(model,
                                                           insample_data=df[["GGAL.BA_log_rets", "GGAL.BA_gk_vol"]], 
                                                           oos_data=df_test[["GGAL.BA_log_rets", "GGAL.BA_gk_vol"]])

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'
Even though the 'means_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'm'
Even though the 'covars_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'c'
Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'
Even though the 'means_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'm'
Even though the 'covars_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'c'


Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'
Even though the 'means_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'm'
Even though the 'covars_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'c'
Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'
Even though the 'means_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'm'
Even though the 'covars_' attribute is set, it will be overwritten during initialization because 'init_params' contains 'c'


In [185]:
probabilities

Unnamed: 0,0,1,2
2023-06-02,0.000003,0.999997,0.0
2023-06-05,0.999882,0.000003,0.000115
2023-06-06,0.192408,0.799764,0.007829
2023-06-07,0.798641,0.201234,0.000125
2023-06-08,0.273423,0.72655,0.000028
...,...,...,...
2023-11-24,0.763938,0.235787,0.000274
2023-11-27,0.587768,0.412067,0.000165
2023-11-28,0.94647,0.04905,0.00448
2023-11-29,0.01792,0.982077,0.000003


In [186]:
forecasts

Unnamed: 0,GGAL.BA_log_rets,GGAL.BA_gk_vol
2023-06-02,0.00343,0.001261
2023-06-05,0.006796,0.00125
2023-06-06,0.005197,0.001534
2023-06-07,0.004264,0.001209
2023-06-08,0.006254,0.001314
...,...,...
2023-11-24,0.010615,0.002057
2023-11-27,0.008843,0.002031
2023-11-28,0.001849,0.002284
2023-11-29,0.012889,0.001942


In [56]:
def generate_HMM_samples_residuals(model, insample_data, oos_data):
    """
    
    """
    # como el modelo es memoryless, sólo necesito 1 día de observación para saber en qué estado estoy
    # por lo tanto, en vez de complicarme con dos datasets, puedo agregarle el ultimo día de insample_data al ppio de oos_data
    # pseudocodigo
    oos_data=pd.concat([insample_data[-1:], oos_data])
    del insample_data

    samples=pd.DataFrame(columns=oos_data.columns)
    residuals=pd.DataFrame(columns=oos_data.columns)

    # for i=0
    for i in range(1,
                   len(oos_data.index)):
        prev_obs=oos_data[i-1:i]

        todays_obs = oos_data[i:i+1]
        todays_date = todays_obs.index

        state=model.decode(prev_obs)[1][-1]
        # decode()[0] is the log probability, decode()[1] is the sequence of states, [-1] is the last state
        # since we have added the last datum of insample_data to oos_data, then the 
            # TODO: revisar que tenga sentido decodear solo el ultimo día.
            # La alternativa es agregar diez días de insample al principio y usar un decode con diez dias, 
            # me quedo con el ultimo valor del array que maximiza la log-likelihood de la secuencia entera
            # pero como es memoryless, not sure if it makesense
        
        sample = model.sample(n_samples=1, random_state=random_state, currstate=state)[0]
        # sample()[0] is the array with observations of the sampled variables, sample()[1] is the value of the currstate
        sample = pd.DataFrame(data=sample, columns=oos_data.columns, index=todays_date)

        samples=pd.concat([samples, sample])   
        # sampling given state t-1
        # observar realización en t+i
        residual = todays_obs-sample
        
        residuals=pd.concat([residuals, residual])
    
    return samples, residuals

In [57]:
aic_best_residuals={stock:None for stock in tickerlist}
bic_best_residuals={stock:None for stock in tickerlist}

In [60]:
for stock in tickerlist:
    columns = [f'{stock}_log_rets', f'{stock}_gk_vol']
    insample_data = df[columns]
    oos_data=df_test[columns]

    samples, aic_best_residuals[stock] = generate_HMM_samples_residuals(
        aic_best_model[stock], 
        insample_data=insample_data, 
        oos_data=oos_data)

    samples, bic_best_residuals[stock] = generate_HMM_samples_residuals(
        bic_best_model[stock], 
        insample_data=insample_data, 
        oos_data=oos_data)

# Guardado de datos

In [62]:
with open(os.path.join(resultsroute, f"""HMM_univ_{params["tablename"]}_aic_bestmodels.pickle"""), "wb") as output_file:
    pickle.dump(aic_best_model, output_file)

with open(os.path.join(resultsroute, f"""HMM_univ_{params["tablename"]}_bic_bestmodels.pickle"""), "wb") as output_file:
    pickle.dump(bic_best_model, output_file)

In [63]:
with open(os.path.join(resultsroute, f"""HMM_univ_{params["tablename"]}_aic_residuals.pickle"""), "wb") as output_file:
    pickle.dump(aic_best_residuals, output_file)

with open(os.path.join(resultsroute, f"""HMM_univ_{params["tablename"]}_bic_residuals.pickle"""), "wb") as output_file:
    pickle.dump(bic_best_residuals, output_file)

# Graficando

In [72]:
def plot_close_rets_vol(model, data, key, IC):
    prediction= model.predict(data)
    states=set(prediction)

    fig=plt.figure(figsize = (20, 20))
    plt.tight_layout()
    plt.title(f"{key} Log returns and intraday Vol\n{model.n_components} states / best by {IC}")

    for subplot, var in zip(range(1,3), data.columns):    
        plt.subplot(2,1,subplot)
        for i in set(prediction):
            state = (prediction == i)
            x = data.index[state]
            y = data[var].iloc[state]
            plt.plot(x, y, '.')
        plt.legend(states, fontsize=16)
        
        plt.grid(True)
        plt.xlabel("datetime", fontsize=16)
        plt.ylabel(var, fontsize=16)
            
    plt.savefig(os.path.join(resultsroute, "graphs", 
                             f"HMM", 
                             f"{key}_model_{IC}.png"))

In [None]:
for dictionary, IC in zip([aic_best_model, bic_best_model], ["AIC", "BIC"]):
    for key, model in dictionary.items():
        columns = [f'{stock}_log_rets', f'{stock}_gk_vol']
        insample_data = df[columns]
        oos_data=df_test[columns]
        train_end=insample_data.index.max()
        data=pd.concat([insample_data, oos_data])

        plot_close_rets_vol(model, data, key, IC)

## HMM Selection

Selecting the Number of States in Hidden Markov Models: Pragmatic Solutions Illustrated Using Animal Movement
https://sci-hub.st/10.1007/s13253-017-0283-8