## Startup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pomegranate as pm
import torch
from scipy.special import logsumexp

import logging
import os
import pickle
import warnings

In [2]:
from pomegranate.distributions import Normal
from pomegranate.hmm import DenseHMM

In [3]:
random_state = 42
np.random.seed(random_state)
logging.captureWarnings(True)

In [4]:
from scripts.params import get_params
from scripts.aux_functions import (
    generate_columns,
    save_as_pickle,
    get_all_results_matching,
    clean_modelname,
)

params = get_params()

## Data Retrieval

In [5]:
dataroute = params["dataroute"]
resultsroute = params["resultsroute"]
dumproute = params["dumproute"]

In [6]:
name = f'finaldf_train_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df = pickle.load(handle)

In [7]:
df.head()

Unnamed: 0,^MXX_rets,^MXX_log_rets,^MXX_gk_vol,WALMEX.MX_rets,WALMEX.MX_log_rets,WALMEX.MX_gk_vol,WMMVY_rets,WMMVY_log_rets,WMMVY_gk_vol,GFNORTEO.MX_rets,...,CEMEXCPO.MX_gk_vol,CX_rets,CX_log_rets,CX_gk_vol,USD_rets,USD_log_rets,USD_gk_vol,MXX_USD_rets,MXX_USD_log_rets,MXX_USD_gk_vol
2013-01-03,0.0015,0.001499,5.6e-05,0.004529,0.004519,0.0002,-0.000608,-0.000609,0.000141,-0.016141,...,0.000374,-0.002901,-0.002906,0.000505,0.001275,0.001274,1.4e-05,-0.001751,-0.001752,5.6e-05
2013-01-04,0.00432,0.004311,3.5e-05,-0.010916,-0.010976,0.000246,-0.006999,-0.007024,0.000143,-0.017928,...,0.000184,0.00388,0.003872,0.000187,-0.003042,-0.003047,1.6e-05,0.006798,0.006775,3.5e-05
2013-01-07,0.001427,0.001426,4.3e-05,0.011037,0.010976,0.00018,0.007049,0.007024,0.000124,0.014318,...,0.000258,0.008696,0.008658,0.000238,0.002933,0.002929,1.1e-05,0.001838,0.001837,4.3e-05
2013-01-08,-0.001226,-0.001227,1.4e-05,-0.006882,-0.006906,6.9e-05,-0.005478,-0.005493,5.9e-05,0.007999,...,0.000158,-0.000958,-0.000958,0.000205,-0.000919,-0.000919,1.2e-05,-0.000999,-0.000999,1.4e-05
2013-01-09,0.006537,0.006515,3.7e-05,0.009558,0.009512,9.7e-05,0.015606,0.015485,0.00018,0.014704,...,0.000145,0.011505,0.01144,0.000211,-0.003966,-0.003974,4e-06,0.007962,0.00793,3.7e-05


## HMM Training

In [8]:
range_states = range(1, 16)
emptydf = pd.DataFrame(columns=["AIC", "BIC"], index=range_states)
emptydf.fillna(np.inf, inplace=True)
results_dict_df = {stock: emptydf for stock in params["assetlist"]}

In [9]:
def from_df_to_reshaped(data: pd.DataFrame):
    npdata = data.values
    data_reshaped = npdata[:, :, np.newaxis]
    return data_reshaped

In [10]:
def GaussianHMM(data_reshaped: np.ndarray, n_state: int):
    model = DenseHMM(distributions=[Normal() for _ in range(n_state)], sample_length=1)

    res = model.fit(data_reshaped)
    return res

In [11]:
def n_params(res: pm.hmm.dense_hmm.DenseHMM):
    n_dist = res.n_distributions
    params_from_dists = n_dist * 2  # mean and variance for Normal
    transmat_elements = n_dist * (
        n_dist - 1
    )  # square matrix (minus last row bc must sum to one)
    n_params = params_from_dists + transmat_elements
    return n_params

In [12]:
def get_aic(res: pm.hmm.dense_hmm.DenseHMM, data: np.ndarray):
    """
    Log Likelihood of the model is the Logsumexp of the log likelihood
    see https://stats.stackexchange.com/questions/60902/how-to-calculate-the-log-likelihood-in-hmm-from-the-output-of-the-forward-algori
    """
    aic = 2 * n_params(res) - 2 * logsumexp(res.log_probability(data))
    return aic

In [13]:
def get_bic(res: pm.hmm.dense_hmm.DenseHMM, data: np.ndarray):
    """
    bic = k * np.log(len(data)) - 2 * model.log_likelihood(data)
    """
    bic = n_params(res) * np.log(len(data)) - 2 * logsumexp(res.log_probability(data))
    return bic

In [14]:
def select_best(data: pd.DataFrame, max_states=15):

    aic = {"criterion": np.inf, "best_model": None, "n_state": None}
    bic = {"criterion": np.inf, "best_model": None, "n_state": None}

    data_reshaped = from_df_to_reshaped(data)

    for num_states in range(2, max_states + 1):
        res = GaussianHMM(data_reshaped, n_state=num_states)

        aic_result = get_aic(res, data_reshaped)
        bic_result = get_bic(res, data_reshaped)

        if aic_result < aic["criterion"]:
            aic["criterion"] = aic_result
            aic["best_model"] = res
            aic["n_state"] = num_states
        if bic_result < bic["criterion"]:
            bic["criterion"] = bic_result
            bic["best_model"] = res
            bic["n_state"] = num_states

    return aic, bic

In [15]:
def find_best_all_assets(
    df: pd.DataFrame,
    max_states: int = 10,
    contains_vol: bool = False,
    contains_USD: bool = False,
):
    best = {stock: {"aic": None, "bic": None} for stock in params["assetlist"]}

    for stock in params["assetlist"]:
        print(stock)
        cols = generate_columns(
            stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
        )
        aic, bic = select_best(df[cols], max_states=max_states)
        best[stock]["aic"] = aic
        best[stock]["bic"] = bic

    return best

In [16]:
df[["USD_^MXX_log_rets", "USD_^MXX_gk_vol"]] = df[
    ["^MXX_log_rets", "^MXX_gk_vol"]
].copy()
# transitorio pq issue #71

In [17]:
for i in range(5):
    try:
        best_with_vol = find_best_all_assets(
            df, max_states=10, contains_vol=True, contains_USD=False
        )
        # this cell sometimes crashes unexpectedly - just run again
        break
    except IndexError:
        print(f"Fail {i}, try again")
        

MXX_USD
^MXX
WALMEX.MX
WMMVY
GFNORTEO.MX
GBOOY
FEMSAUBD.MX
FMX
CEMEXCPO.MX
CX


In [18]:
for i in range(5):
    try:
        best_multiv = find_best_all_assets(
            df, max_states=10, contains_vol=True, contains_USD=True
        )
        # this cell sometimes crashes unexpectedly - just run again
        break
    except IndexError:
        print(f"Fail {i}, try again")

MXX_USD
^MXX
WALMEX.MX
WMMVY
GFNORTEO.MX
GBOOY
FEMSAUBD.MX
FMX
CEMEXCPO.MX
CX


# Generating out of sample data

In [19]:
name = f'finaldf_test_{params["tablename"]}.pickle'
filename = os.path.join(dataroute, name)
with open(filename, "rb") as handle:
    df_test = pickle.load(handle)

In [21]:
def return_residuals(actual: pd.DataFrame, forecasts: pd.DataFrame):
    residuals = actual - forecasts
    return residuals

In [22]:
def generate_samples_residuals(n_state, insample_data, oos_data):
    """
    This function only requires the number of normal distributions, which may be acquired from len(res.distributions)
    """
    # res.predict_proba(data_reshaped)[-1] es la matriz de cada estado
    columns = oos_data.columns

    split_date = oos_data.index[0]
    dates_to_forecast = len(oos_data.index)

    probabilities = pd.DataFrame(columns=range(n_state), index=oos_data.index)
    forecasts = pd.DataFrame(columns=oos_data.columns, index=oos_data.index)

    full_data = pd.concat([insample_data, oos_data])
    index = full_data.index
    end_loc = np.where(index >= split_date)[0].min()
    # esto es un int del iloc
    # preciso usar ints de iloc porque el timedelta se me va a romper con el fin de semana
    rolling_window = 252

    model_list = []

    for i in range(1, dates_to_forecast):
        # recursive window forecasting
        date_of_first_forecast = full_data.index[end_loc + i - 1]

        fitstart = end_loc - rolling_window + i
        fitend = end_loc + i

        # fit model with last year
        fit_data = full_data.iloc[fitstart:fitend][columns]
        reshaped_fit_data= from_df_to_reshaped(fit_data)
        
        res = GaussianHMM(data_reshaped=reshaped_fit_data, n_state=n_state)
        model_list.append(res)
        
        prob_matrix = res.predict_proba(reshaped_fit_data)[-1]
        prob_states = prob_matrix.sum(axis=0)/prob_matrix.sum() # rescale to measure 1
        
        last_day_state_probs = prob_matrix.sum(axis=0) / prob_matrix.sum()
        # hotfix véase https://github.com/alfsn/regime-switching-hmm/issues/72

        probabilities.loc[date_of_first_forecast] = last_day_state_probs
        
        param_means = [dist.means for dist in res.distributions]
        param_tensor = torch.cat(param_means, dim=0)

        expected_means = torch.dot(prob_states, param_tensor)
        
        forecasts.loc[date_of_first_forecast] = expected_means

    forecasts.fillna(method="ffill", inplace=True)

    residuals = return_residuals(oos_data, forecasts)

    return probabilities, forecasts, residuals
        

In [23]:
def generate_and_save_samples(
    best_model_dict: dict,
    modeltype: str,
    insample_data: pd.DataFrame,
    oos_data: pd.DataFrame,
    contains_vol: bool,
    contains_USD: bool,
):
    generic_dict = {stock: None for stock in params["assetlist"]}
    probabilities = {"aic": generic_dict.copy(), "bic": generic_dict.copy()}
    forecasts = probabilities.copy()
    residuals = probabilities.copy()

    for stock in best_model_dict.keys():
        for criterion, specific_model in best_model_dict[stock].items():
            retries=5
            n_state = specific_model["n_state"]
            print(modeltype, criterion, stock, n_state)
            columns = generate_columns(
                stock=stock, contains_vol=contains_vol, contains_USD=contains_USD
            )
            
            for i in range(retries):
                try:
                    proba, fcast, resid= generate_samples_residuals(
                        n_state=n_state,
                        insample_data=insample_data[columns],
                        oos_data=oos_data[columns],
                    )
                    print("Converged")
                    break
                except IndexError:
                    print(f"Fail {i}, retrying...")

            probabilities[criterion][stock] = proba
            forecasts[criterion][stock] = fcast
            residuals[criterion][stock] = resid

    for criterion in ["aic", "bic"]:
        save_as_pickle(
            data=forecasts[criterion],
            resultsroute=params["resultsroute"],
            model_type=f"HMM_{modeltype}",
            tablename=params["tablename"],
            criterion=criterion,
            type_save="forecasts",
        )

        save_as_pickle(
            data=residuals[criterion],
            resultsroute=params["resultsroute"],
            model_type=f"HMM_{modeltype}",
            tablename=params["tablename"],
            criterion=criterion,
            type_save="residuals",
        )

In [24]:
models_dict = {
    "with_vol": (best_with_vol, True, False),
    "multiv": (best_multiv, True, True)
}

In [29]:
for i in range(5):
    try:
        for modeltype, tupla in models_dict.items():
            best_model_dict, contains_vol, contains_USD = tupla
            generate_and_save_samples(
                best_model_dict=best_model_dict,
                modeltype= modeltype,
                insample_data=df,
                oos_data=df_test,
                contains_vol= contains_vol,
                contains_USD=contains_USD)          
    # this cell sometimes crashes unexpectedly - just run again
        break
    except AttributeError:
        print(f"Fail {i}, try again")



with_vol aic MXX_USD 2
Converged
with_vol bic MXX_USD 2
Converged
with_vol aic ^MXX 2
Converged
with_vol bic ^MXX 2
Converged
with_vol aic WALMEX.MX 2
Converged
with_vol bic WALMEX.MX 2
Converged
with_vol aic WMMVY 2
Converged
with_vol bic WMMVY 2
Fail 0, retrying...
Converged
with_vol aic GFNORTEO.MX 2
Converged
with_vol bic GFNORTEO.MX 2
Converged
with_vol aic GBOOY 2
Converged
with_vol bic GBOOY 2
Converged
with_vol aic FEMSAUBD.MX 2
Converged
with_vol bic FEMSAUBD.MX 2
Converged
with_vol aic FMX 2
Converged
with_vol bic FMX 2
Converged
with_vol aic CEMEXCPO.MX 2
Converged
with_vol bic CEMEXCPO.MX 2
Converged
with_vol aic CX 2
Fail 0, retrying...
Converged
with_vol bic CX 2
Fail 0, retrying...
Fail 1, retrying...
Fail 2, retrying...
Converged
multiv aic MXX_USD 3
Converged
multiv bic MXX_USD 2
Converged
multiv aic ^MXX 3
Converged
multiv bic ^MXX 2
Converged
multiv aic WALMEX.MX 2
Converged
multiv bic WALMEX.MX 2
Converged
multiv aic WMMVY 3
Converged
multiv bic WMMVY 2
Converged
mu