In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to the location of your local checkout.
path = expanduser("~/Documents/G3_2/regime-identification")
# Make the checkout importable. The guard keeps the cell idempotent: re-running
# it no longer appends the same entry to sys.path again.
if path not in sys.path:
    sys.path.append(path)

# Folders read from / written to by the scoring cells below.
path_data = f"{path}/notebooks/simulation/data"
path_estimation = f"{path}/notebooks/simulation/estimation"
path_scores = f"{path}/notebooks/simulation/scores"

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

In [3]:
from regime.simulation_helper import *

In [4]:
# Scalar settings shared by the scoring cells.
n_c = 2      # passed to count_sample_in_each_cluster (presumably the number of states -- TODO confirm)
n_t = 1000   # the true-state array is sliced to its first n_t entries before scoring

# Grid of simulation settings; each (scale, n_s) pair maps to one file-name key.
scale_lst = ["daily", "weekly", "monthly"]
n_s_lst = [250, 500, 1000]

# Scale-major order: all n_s values for "daily", then "weekly", then "monthly".
key_list = [
    f"{scale}_{n_s}"
    for scale in scale_lst
    for n_s in n_s_lst
]

# Scoring: Jump models.
In this notebook we score the estimated jump models (discrete and continuous variants) against the true simulated states.

In [5]:
def label_switching(Zs, labels_, proba_ = None):
    """Flip estimated labels (and probabilities) wherever they look label-switched.

    Each estimate is scored against the truth with ``accuracy_score`` through
    ``score_trials``. Every entry whose accuracy is below 0.5 is replaced by
    ``1 - value``, which assumes the two states are coded as 0 and 1.

    Parameters
    ----------
    Zs
        True states, forwarded to ``score_trials``.
    labels_ : np.ndarray
        Estimated labels. Its first two axes are indexed by the two axes of
        the accuracy array returned by ``score_trials``.
    proba_ : np.ndarray, optional
        Estimated probabilities, indexed the same way as ``labels_``.

    Returns
    -------
    tuple
        ``(labels_new, proba_new)``. Both are copies; the inputs are left
        untouched. ``proba_new`` is ``None`` when ``proba_`` is ``None``.
    """
    trial_accuracy = score_trials(accuracy_score, Zs, labels_, lambdas=True)
    # Positions (on the two accuracy axes) where the estimate agrees with the
    # truth less than half of the time, i.e. the labelling is inverted.
    first_axis_idx, second_axis_idx = np.nonzero(trial_accuracy < .5)
    flip = (first_axis_idx, second_axis_idx)

    labels_new = labels_.copy()
    labels_new[flip] = 1 - labels_new[flip]

    proba_new = None
    if proba_ is not None:
        proba_new = proba_.copy()
        proba_new[flip] = 1 - proba_new[flip]
    return labels_new, proba_new

In [6]:
def run_model_scoring(model, keys=None):
    """Score one model's saved estimates against the true states and save the scores.

    For every key this loads the true states and the model's estimated labels
    (plus probabilities unless ``"discrete"`` appears in the model name),
    corrects label switching with ``label_switching``, and saves the
    per-cluster accuracy. For non-discrete models it also saves the ROC AUC.

    Parameters
    ----------
    model : str
        Model name used to build the input and output file names. Models whose
        name contains ``"discrete"`` have no probability file and get no ROC AUC.
    keys : iterable of str, optional
        Simulation keys to score. Defaults to every entry of ``key_list``, so
        existing ``run_model_scoring(model)`` calls behave as before. Pass a
        subset (e.g. ``key_list[:3]``) instead of redefining the function.

    Returns
    -------
    None
        Scores are written under ``path_scores`` through ``np_save_print``.
    """
    if keys is None:
        keys = key_list
    # Decide once whether this model comes with probability estimates.
    has_proba = "discrete" not in model
    for key in tqdm(keys):
        # true data
        Zs = np.load(f"{path_data}/Z_{key}.npy")[:n_t]
        # estimation res
        labels_ = np.load(f"{path_estimation}/labels_{model}_{key}.npy")
        if has_proba:
            proba_ = np.load(f"{path_estimation}/proba_{model}_{key}.npy")
        else:
            proba_ = None
        # switch labels
        labels_new, proba_new = label_switching(Zs, labels_, proba_)

        # accuracy
        accuracy_arr = score_trials(accuracy_each_cluster, Zs, labels_new, lambdas=True)
        np_save_print(f"{path_scores}/accuracy_{model}_{key}.npy", accuracy_arr)
        if has_proba:
            # ROC AUC: pass along the rows whose state counts are all positive
            # (presumably the trials in which every state occurs, since AUC
            # needs both classes -- TODO confirm against score_trials).
            state_counts = count_sample_in_each_cluster(Zs, n_c)
            idx_both_states = (state_counts > 0).all(1)
            roc_auc_arr = score_trials(roc_auc_score, Zs, proba_new, idx_subset=idx_both_states, lambdas=True)
            np_save_print(f"{path_scores}/roc_auc_{model}_{key}.npy", roc_auc_arr)

In [7]:
# Score the discrete model. Because its name contains "discrete", only the
# accuracy is computed and saved (no probability file, no ROC AUC).
model = "discrete_zheng_feat"
run_model_scoring(model)

 11%|█████                                        | 1/9 [00:01<00:15,  1.89s/it]

shape of the saved arr: (17, 1000, 2).


 22%|██████████                                   | 2/9 [00:03<00:13,  1.96s/it]

shape of the saved arr: (17, 1000, 2).


 33%|███████████████                              | 3/9 [00:06<00:12,  2.12s/it]

shape of the saved arr: (17, 1000, 2).


 44%|████████████████████                         | 4/9 [00:08<00:10,  2.02s/it]

shape of the saved arr: (17, 1000, 2).


 56%|█████████████████████████                    | 5/9 [00:10<00:08,  2.03s/it]

shape of the saved arr: (17, 1000, 2).


 67%|██████████████████████████████               | 6/9 [00:12<00:06,  2.12s/it]

shape of the saved arr: (17, 1000, 2).


 78%|███████████████████████████████████          | 7/9 [00:14<00:04,  2.05s/it]

shape of the saved arr: (17, 1000, 2).


 89%|████████████████████████████████████████     | 8/9 [00:16<00:02,  2.05s/it]

shape of the saved arr: (17, 1000, 2).


100%|█████████████████████████████████████████████| 9/9 [00:18<00:00,  2.08s/it]

shape of the saved arr: (17, 1000, 2).





In [8]:
# Score the continuous "mode" variant. Its name lacks "discrete", so the
# probability estimates are loaded and ROC AUC is saved alongside accuracy.
model = "cont_mode_zheng_feat"
run_model_scoring(model)

  0%|                                                     | 0/9 [00:00<?, ?it/s]

shape of the saved arr: (17, 1000, 2).


 11%|█████                                        | 1/9 [00:04<00:35,  4.50s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 22%|██████████                                   | 2/9 [00:10<00:37,  5.31s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 33%|███████████████                              | 3/9 [00:18<00:39,  6.53s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 44%|████████████████████                         | 4/9 [00:25<00:33,  6.60s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 56%|█████████████████████████                    | 5/9 [00:32<00:27,  6.93s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 67%|██████████████████████████████               | 6/9 [00:41<00:22,  7.50s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 78%|███████████████████████████████████          | 7/9 [00:48<00:14,  7.34s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 89%|████████████████████████████████████████     | 8/9 [00:55<00:07,  7.41s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


100%|█████████████████████████████████████████████| 9/9 [01:04<00:00,  7.16s/it]

shape of the saved arr: (17, 1000).





In [9]:
def run_model_scoring(model, keys=None):
    """Score one model's saved estimates on a subset of the simulation keys.

    NOTE(review): this redefines (and shadows) the ``run_model_scoring``
    defined in an earlier cell. The only difference is the default key set:
    here it is the first three entries of ``key_list``. Prefer keeping a
    single definition and passing the keys explicitly.

    Parameters
    ----------
    model : str
        Model name used to build the input and output file names. Models whose
        name contains ``"discrete"`` have no probability file and get no ROC AUC.
    keys : iterable of str, optional
        Simulation keys to score. Defaults to ``key_list[:3]``, so existing
        ``run_model_scoring(model)`` calls behave as before.

    Returns
    -------
    None
        Scores are written under ``path_scores`` through ``np_save_print``.
    """
    if keys is None:
        keys = key_list[:3]
    # Decide once whether this model comes with probability estimates.
    has_proba = "discrete" not in model
    for key in tqdm(keys):
        # true data
        Zs = np.load(f"{path_data}/Z_{key}.npy")[:n_t]
        # estimation res
        labels_ = np.load(f"{path_estimation}/labels_{model}_{key}.npy")
        if has_proba:
            proba_ = np.load(f"{path_estimation}/proba_{model}_{key}.npy")
        else:
            proba_ = None
        # switch labels
        labels_new, proba_new = label_switching(Zs, labels_, proba_)

        # accuracy
        accuracy_arr = score_trials(accuracy_each_cluster, Zs, labels_new, lambdas=True)
        np_save_print(f"{path_scores}/accuracy_{model}_{key}.npy", accuracy_arr)
        if has_proba:
            # ROC AUC: pass along the rows whose state counts are all positive
            # (presumably the trials in which every state occurs, since AUC
            # needs both classes -- TODO confirm against score_trials).
            state_counts = count_sample_in_each_cluster(Zs, n_c)
            idx_both_states = (state_counts > 0).all(1)
            roc_auc_arr = score_trials(roc_auc_score, Zs, proba_new, idx_subset=idx_both_states, lambdas=True)
            np_save_print(f"{path_scores}/roc_auc_{model}_{key}.npy", roc_auc_arr)

In [10]:
# Score the continuous "no mode" variant with the run_model_scoring redefined
# in the cell above, which by default covers only the first three keys.
model = "cont_no_mode_zheng_feat"
run_model_scoring(model)

  0%|                                                     | 0/3 [00:00<?, ?it/s]

shape of the saved arr: (17, 1000, 2).


 33%|███████████████                              | 1/3 [00:04<00:08,  4.49s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


 67%|██████████████████████████████               | 2/3 [00:10<00:05,  5.33s/it]

shape of the saved arr: (17, 1000).
shape of the saved arr: (17, 1000, 2).


100%|█████████████████████████████████████████████| 3/3 [00:18<00:00,  6.16s/it]

shape of the saved arr: (17, 1000).





In [13]:
# Scratch check: pick one simulation setting to inspect by hand.
key="daily_1000"

In [20]:
# Load the true states for `key`.
# NOTE(review): unlike run_model_scoring, no `[:n_t]` slice is applied here -- confirm this is intended.
Zs = np.load(f"{path_data}/Z_{key}.npy")
# Boolean mask of rows whose counts are all positive along axis 1
# (presumably the trials in which every one of the n_c states occurs -- TODO confirm).
state_counts = count_sample_in_each_cluster(Zs, n_c)
idx_both_states = (state_counts > 0).all(1)

In [16]:
# Load the saved label and probability estimates of the "cont_mode_zheng_feat" model for `key`.
labels_ = np.load(f"{path_estimation}/labels_cont_mode_zheng_feat_{key}.npy")
proba_ = np.load(f"{path_estimation}/proba_cont_mode_zheng_feat_{key}.npy")

In [21]:
# ROC AUC of the raw probabilities, i.e. before any label-switching correction.
roc_auc_arr = score_trials(roc_auc_score, Zs, proba_, lambdas=True, idx_subset = idx_both_states)

In [25]:
# Element-wise larger of AUC and 1 - AUC, i.e. the AUC under whichever of the
# two label orientations scores better.
np.maximum(roc_auc_arr, 1-roc_auc_arr)

array([[0.90519237, 0.91000219, 0.88679245, ..., 0.8636989 ,        nan,
        0.90856215],
       [0.92803179, 0.91365945, 0.88679245, ..., 0.8636989 ,        nan,
        0.90856215],
       [0.92803179, 0.91365945, 0.88679245, ..., 0.8636989 ,        nan,
        0.90856215],
       ...,
       [0.99942683, 0.99887002, 0.98949225, ..., 0.9927198 ,        nan,
        0.99826264],
       [0.99933949, 0.98959928, 0.9699095 , ..., 0.98047284,        nan,
        0.99932646],
       [0.99945412, 0.97741246, 0.96853812, ..., 0.8562631 ,        nan,
        0.84418552]])

In [27]:
# Average of the orientation-corrected AUC along axis 1, ignoring NaNs.
np.nanmean(np.maximum(roc_auc_arr, 1-roc_auc_arr), 1)

array([0.87292092, 0.8735229 , 0.87339283, 0.87388763, 0.87380449,
       0.87427003, 0.87571321, 0.87924687, 0.88806581, 0.90190728,
       0.9199709 , 0.93815217, 0.95522021, 0.96783212, 0.97194122,
       0.96652973, 0.91778304])

In [24]:
# Same average on the raw AUC, with no correction for label orientation.
np.nanmean(roc_auc_arr, 1)

array([0.54947852, 0.53649799, 0.54313878, 0.55398023, 0.54372008,
       0.50420324, 0.53768327, 0.54492191, 0.52258292, 0.53161377,
       0.5403218 , 0.54162713, 0.57400658, 0.55254199, 0.53185368,
       0.53591632, 0.54626475])

In [28]:
# Apply the accuracy-based label-switching correction to the loaded estimates.
labels_new, proba_new = label_switching(Zs, labels_, proba_)

In [33]:
# ROC AUC recomputed on the label-switched probabilities.
roc_auc_arr_new = score_trials(roc_auc_score, Zs, proba_new, lambdas=True, idx_subset=idx_both_states)

In [35]:
# Average AUC after label switching along axis 1, ignoring NaNs.
np.nanmean(roc_auc_arr_new, 1)

array([0.86853259, 0.86822596, 0.86846321, 0.86901086, 0.86838817,
       0.86940232, 0.87186604, 0.87526888, 0.88367696, 0.89663984,
       0.91649375, 0.934796  , 0.9496631 , 0.96190879, 0.96742539,
       0.96275467, 0.91703661])

In [29]:
# Display the label-switched probabilities.
proba_new

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [31]:
# Accuracy of the raw labels, before the label-switching correction.
score_trials(accuracy_score, Zs, labels_, lambdas=True)

array([[0.022, 0.849, 0.062, ..., 0.848, 0.563, 0.89 ],
       [0.02 , 0.144, 0.938, ..., 0.848, 0.563, 0.11 ],
       [0.98 , 0.144, 0.938, ..., 0.848, 0.438, 0.89 ],
       ...,
       [0.988, 0.013, 0.982, ..., 0.94 , 0.208, 0.985],
       [0.011, 0.968, 0.023, ..., 0.893, 0.   , 0.97 ],
       [0.99 , 0.043, 0.033, ..., 0.21 , 1.   , 0.963]])

In [18]:
# Display the raw probability estimates as loaded from disk.
proba_

array([[[1., 1., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 1., 1.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0.