In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: point `path` at your local checkout of the project.
# It is put on sys.path so that project modules can be imported by later cells.
path = "~/Documents/G3_2/regime-identification"
path = expanduser(path)
# Guard the append so that re-running this cell does not pile up duplicate entries.
if path not in sys.path:
    sys.path.append(path)

# Root folder for everything this simulation study reads and writes.
path_file = "~/data/G3_2/regime-identification/simulation"
path_file = expanduser(path_file)
# Sub-folders: raw simulated data, estimation results, and scores.
path_data = f"{path_file}/data"
path_estimation = f"{path_file}/estimation"
path_score = f"{path_file}/score"

In [2]:
import numpy as np
from sklearn.metrics import roc_auc_score

from numpy.random import RandomState
# One seeded generator shared by model construction and sampling below, so a
# fresh top-to-bottom run reproduces the same simulated data.
random_state = RandomState(seed=0)

In [3]:
from regime.simulation_helper import *

# 0-generate-data-estimate-true-model

In this notebook we systematically generate the simulation data, estimate the labels and state probabilities with the true HMM, and score those estimates.

# Original Jump paper
## Vanilla: 9 combinations

- scale: We use the parameters estimated in Hardy's classic paper and convert them to three time scales: **daily, weekly, monthly**, with decreasing persistence.
- length: We simulate sequences of different lengths: 250, 500, 1000.

For each combo, we simulate `n_t=1024` sequences. The data for each combo are saved as one batch, with shape `(n_t, n_s, n_f)`. Also, since we need to do feature engineering, every raw sequence is simulated 20 periods (`n_buffer`) longer, i.e. with length `n_s + n_buffer`.

In [4]:
# Time scales of the model parameters and the usable sequence lengths.
scale_lst = ["daily", "weekly", "monthly"]
n_s_lst = [250, 500, 1000]

# One key per (scale, length) combo, scale-major: "daily_250", "daily_500", ...
key_data_list = [
    f"{scale}_{n_s}"
    for scale in scale_lst
    for n_s in n_s_lst
]

n_buffer = 20  # extra leading periods simulated on top of n_s
n_t = 1024     # number of simulated sequences (trials) per combo
n_c = 2        # number of clusters / hidden states

In [5]:
# Create the output folders up front so the cell also runs on a fresh machine.
# np.save does not create missing parent directories; whether np_save_print
# does is not visible here. exist_ok=True makes this a no-op on later runs.
for out_dir in (path_data, path_estimation, path_score):
    os.makedirs(out_dir, exist_ok=True)

# For every (scale, length) combo: simulate from the true HMM, decode with that
# same model, score the decoding, and save data, estimates and scores to disk.
for scale in scale_lst:
    # get a true HMM model (one per scale, reused for all sequence lengths)
    hmm_true = get_GaussianHMM_model(*load_hardy_params(scale), random_state=random_state)
    for n_s in n_s_lst:
        # generate key for data
        key_data = f"{scale}_{n_s}"
        # simulate X_raw, Z: n_t sequences, each n_buffer periods longer than n_s.
        Xs, Zs = sample_from_hmm(hmm_true, n_trials=n_t, n_samples=n_s+n_buffer, random_state=random_state)
        # keep only the states of the last n_s periods; X is saved with the buffer.
        Zs = Zs[:, -n_s:]
        # save raw data
        np_save_print(f"{path_data}/X_raw_{key_data}.npy", Xs, "X raw")
        np_save_print(f"{path_data}/Z_{key_data}.npy", Zs, "Z")
        # estimate by the true HMM model, on the same last n_s periods as Zs.
        labels_arr, proba_arr = HMM_estimate_result(hmm_true, Xs[:, -n_s:])
        # save estimation results
        np_save_print(f"{path_estimation}/labels_{key_data}_true.npy", labels_arr, "labels")
        np_save_print(f"{path_estimation}/proba_{key_data}_true.npy", proba_arr, "proba")
        # score the estimation by the true model.
        acc_arr = scorer_batch(accuracy_each_cluster, Zs, labels_arr)
        # presumably the trials in which all n_c states occur; ROC AUC is
        # undefined when only one class is present -- confirm against the helper.
        idx = get_idx_have_all_clusters(Zs, n_c)
        # score with the probability column of state 1.
        roc_auc_arr = scorer_batch(roc_auc_score, Zs, proba_arr[:, :, 1], idx_subset=idx)
        # save scores
        # NOTE(review): no ".npy" suffix here, unlike the files above; if
        # np_save_print wraps np.save the suffix is appended automatically -- confirm.
        np_save_print(f"{path_score}/acc_{key_data}_true", acc_arr, "accuracy score")
        np_save_print(f"{path_score}/roc_auc_{key_data}_true", roc_auc_arr, "roc auc score")
        # print for sanity check: per-cluster accuracy is averaged over trials
        # (ignoring NaN) and then across clusters; ROC AUC is averaged ignoring NaN.
        print(f"{key_data} data. BAC: {np.nanmean(acc_arr, 0).mean()}, roc_auc: {np.nanmean(roc_auc_arr)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:08<00:00, 124.78it/s]


shape of the saved X raw: (1024, 270, 1).
shape of the saved Z: (1024, 250).
shape of the saved labels: (1024, 250).
shape of the saved proba: (1024, 250, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
daily_250 data. BAC: 0.9247409753546655, roc_auc: 0.9922781336083762


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:15<00:00, 64.58it/s]


shape of the saved X raw: (1024, 520, 1).
shape of the saved Z: (1024, 500).
shape of the saved labels: (1024, 500).
shape of the saved proba: (1024, 500, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
daily_500 data. BAC: 0.9285405045155555, roc_auc: 0.9921850567050384


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:31<00:00, 32.96it/s]


shape of the saved X raw: (1024, 1020, 1).
shape of the saved Z: (1024, 1000).
shape of the saved labels: (1024, 1000).
shape of the saved proba: (1024, 1000, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
daily_1000 data. BAC: 0.953140004988454, roc_auc: 0.9960976374560261


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:08<00:00, 124.99it/s]


shape of the saved X raw: (1024, 270, 1).
shape of the saved Z: (1024, 250).
shape of the saved labels: (1024, 250).
shape of the saved proba: (1024, 250, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
weekly_250 data. BAC: 0.8439488183410284, roc_auc: 0.9673186986651192


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:15<00:00, 64.37it/s]


shape of the saved X raw: (1024, 520, 1).
shape of the saved Z: (1024, 500).
shape of the saved labels: (1024, 500).
shape of the saved proba: (1024, 500, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
weekly_500 data. BAC: 0.8716447852338077, roc_auc: 0.9767307758057153


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:31<00:00, 32.82it/s]


shape of the saved X raw: (1024, 1020, 1).
shape of the saved Z: (1024, 1000).
shape of the saved labels: (1024, 1000).
shape of the saved proba: (1024, 1000, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
weekly_1000 data. BAC: 0.8817797280946338, roc_auc: 0.9798447515810371


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:08<00:00, 117.92it/s]


shape of the saved X raw: (1024, 270, 1).
shape of the saved Z: (1024, 250).
shape of the saved labels: (1024, 250).
shape of the saved proba: (1024, 250, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
monthly_250 data. BAC: 0.7377494275988151, roc_auc: 0.9062887936974405


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:16<00:00, 63.64it/s]


shape of the saved X raw: (1024, 520, 1).
shape of the saved Z: (1024, 500).
shape of the saved labels: (1024, 500).
shape of the saved proba: (1024, 500, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
monthly_500 data. BAC: 0.7539622723800392, roc_auc: 0.9147037591518162


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:31<00:00, 32.71it/s]


shape of the saved X raw: (1024, 1020, 1).
shape of the saved Z: (1024, 1000).
shape of the saved labels: (1024, 1000).
shape of the saved proba: (1024, 1000, 2).
shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).
monthly_1000 data. BAC: 0.7568206017963774, roc_auc: 0.9150253093039684
