In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to the location of the project on your machine.
# The folder is appended to sys.path, presumably so the `regime` imports below resolve.
path = expanduser("~/Documents/G3_2/regime-identification")
sys.path.append(path)

# Root folder of the simulation artifacts; every sub-folder below is derived from it.
path_file = expanduser("~/data/G3_2/regime-identification/simulation")
path_data = f"{path_file}/data"
path_estimation = f"{path_file}/estimation"
path_score = f"{path_file}/score"
path_figure = f"{path_file}/figure"

In [2]:
import numpy as np
from sklearn.metrics import roc_auc_score
from tqdm import trange, tqdm
from numpy.random import RandomState
# Seeded RNG (seed 0); the same instance is handed to every model built in the testing section.
random_state = RandomState(0)

In [3]:
from regime.jump import *
from regime.simulation_helper import *
from regime.stats import *

# Model fitting dev

In this notebook we develop the functionality for training a single model on a batch of data.

In [4]:
# # true model
# def model_true_fit_batch(model, Xs):
#     """
#     fit the true model on a batch of data. save model parameters, and fitted labels & proba
#     There is no need to estimate the model on the data; only call viterbi and forward-backward algos.
#     model parameters are copies of the true params.
        
#     Parameters:
#     ---------------------
#     model: a model instance.
    
#     Xs: array of size (n_t, n_s, n_f)
#         input data
        
#     Returns:
#     -------------------------
#     model_params_arr: (n_t, n_c**2 + n_c)
    
#     labels_arr: array of size (n_t, n_s)
    
#     proba_arr: array of size (n_t, n_s, n_c)
#     """
#     n_t, n_s, _ = Xs.shape
#     n_c = model.n_components
#     # model parameters, true values
#     model_params = combine_model_param_estimation(model.means_.squeeze(), model.covars_.squeeze(), model.transmat_)
#     model_params_arr = np.repeat(model_params[np.newaxis, ...], n_t, 0)
#     # fitted labels & proba
#     labels_arr = np.empty((n_t, n_s), dtype=np.int32)
#     proba_arr = np.empty((n_t, n_s, n_c))
    
#     # for i_trial in tqdm(range(n_t)):
#     for i_trial in range(n_t):   # really fast, no need to tqdm, unless the state space becomes large.
#         X = Xs[i_trial]
#         labels_arr[i_trial] = model.predict(X)
#         proba_arr[i_trial] = model.predict_proba(X)
#     return model_params_arr, labels_arr, proba_arr

In [18]:
# our models
def model_fit_batch(model, Xs, Zs):
    """
    Fit one model on every trial of a batch and return the aligned estimates.

    The model is re-fitted on each trial in turn; the per-trial results are
    extracted, stacked across trials, and aligned with the true labels by
    `align_estimation_results_batch` before being returned.

    Parameters:
    ---------------------
    model: a model instance.
        must expose `n_components` and `fit(X)`.

    Xs: array of size (n_t, n_s, n_f)
        input data. Only the first feature column of each trial is forwarded
        to `extract_results_from_model`.

    Zs: array (n_t, n_s)
        true labels, needed for the alignment step.

    Returns:
    -------------------------
    model_params_arr: (n_t, n_c**2 + n_c)

    labels_arr: array of size (n_t, n_s)

    proba_arr: array of size (n_t, n_s, n_c)
    """
    # unpacking requires Xs to be 3-d; reading n_components fails early for an unsuitable model.
    n_trials, n_samples, _ = Xs.shape
    n_components = model.n_components

    def _fit_and_extract(i_trial):
        # one trial: fit, then snapshot whatever the fitted model exposes
        X_trial, Z_trial = Xs[i_trial], Zs[i_trial]
        model.fit(X_trial)
        return extract_results_from_model(model, X_=X_trial[:, 0])

    per_trial_results = [_fit_and_extract(i_trial) for i_trial in range(n_trials)]
    # one dict of arrays, stacked along a leading trial axis
    stacked = combine_list_dict(per_trial_results)
    # pick, per trial, the label permutation that agrees best with the truth
    aligned = align_estimation_results_batch(Zs, stacked)
    model_params_arr = combine_model_param_estimation(
        aligned["means_"], aligned["covars_"], aligned["transmat_"]
    )
    return model_params_arr, aligned["labels_"], aligned["proba_"]
    
# helpers
def weighted_mean_vol(X, proba_):
    """
    Probability-weighted mean and mean squared deviation of a 1-d series, per component.

    Parameters:
    ---------------------
    X: array (n_s,)
        the observations.

    proba_: array (n_s, n_c)
        weight of every observation for every component.

    Returns:
    -------------------------
    means_: array (n_c,)
        weighted mean per component.

    covars_: array (n_c,)
        weighted mean of squared deviations from `means_` (no square root is taken).

    Components whose total weight is not positive are left as NaN in both outputs.
    """
    n_components = proba_.shape[1]
    weight_per_component = proba_.sum(axis=0)
    has_weight = weight_per_component > 0

    means_ = np.full(n_components, np.nan)
    covars_ = np.full(n_components, np.nan)

    first_moment = X @ proba_
    means_[has_weight] = first_moment[has_weight] / weight_per_component[has_weight]

    # deviations from a NaN mean stay NaN, but those columns are never written back
    deviations = X[:, np.newaxis] - means_[np.newaxis, :]
    second_moment = (deviations ** 2 * proba_).sum(axis=0)
    covars_[has_weight] = second_moment[has_weight] / weight_per_component[has_weight]
    return means_, covars_

def raise_labels_to_proba_(labels_, n_c):
    """
    Turn one sequence of hard labels into a 0/1 probability matrix.

    labels_: integer array (n_s,)
    n_c: number of components (columns of the output)

    Returns an array (n_s, n_c) with a 1. in column `labels_[i]` of row i and 0. elsewhere.
    """
    one_hot = np.zeros((len(labels_), n_c))
    # one column index per row, shaped (n_s, 1) as put_along_axis expects
    column_of_row = labels_[:, np.newaxis]
    np.put_along_axis(one_hot, column_of_row, 1., axis=-1)
    return one_hot

def raise_labels_to_proba_batch(labels_arr, n_c):
    """
    Batch version of the label -> 0/1 probability conversion.

    labels_arr: integer array (n_t, n_s)
    n_c: number of components

    Returns an array (n_t, n_s, n_c) holding a 1. at position `labels_arr[t, s]`
    along the last axis and 0. elsewhere.
    """
    n_trials, n_samples = labels_arr.shape
    one_hot = np.zeros((n_trials, n_samples, n_c))
    component_idx = labels_arr[:, :, np.newaxis]
    np.put_along_axis(one_hot, component_idx, 1., axis=-1)
    return one_hot

def extract_results_from_model(model, X_=None):
    """
    Gather the estimation results of one fitted model into a dict.

    Every quantity is read from the model when the attribute exists and is
    derived otherwise:
      - `proba_`: 0/1 probabilities built from `labels_`
      - `labels_`: argmax of `proba_`, as int32
      - `means_` / `covars_`: weighted by `proba_` from the 1d sequence `X_`
      - `transmat_`: `empirical_trans_mx` of the labels

    Parameters:
    ----------------------------------------
    model:
        fitted model exposing `n_components` and at least one of `proba_` / `labels_`.

    X_: array (n_s,). default None.
        only used when the model has no `means_`.

    Returns:
    ---------------------------------------
    result: dict
        keys "proba_", "labels_", "means_", "covars_", "transmat_".
    """
    n_c = model.n_components

    if hasattr(model, "proba_"):
        proba_ = model.proba_
    else:
        proba_ = raise_labels_to_proba_(model.labels_, n_c)

    if hasattr(model, "labels_"):
        labels_ = model.labels_
    else:
        labels_ = model.proba_.argmax(axis=-1).astype(np.int32)

    if hasattr(model, "means_"):
        means_, covars_ = model.means_, model.covars_
    else:
        # no fitted moments on the model: fall back to proba-weighted moments of X_
        means_, covars_ = weighted_mean_vol(X_, proba_)

    if hasattr(model, "transmat_"):
        transmat_ = model.transmat_
    else:
        # no fitted transition matrix: use the empirical one of the label path
        transmat_ = empirical_trans_mx(labels_)

    return {
        "proba_": proba_,
        "labels_": labels_,
        "means_": means_,
        "covars_": covars_,
        "transmat_": transmat_,
    }

def combine_list_dict(dict_list):
    """
    Stack a list of dicts (all sharing the keys of the first one) into a single dict.

    For every key of the first dict, the values found under that key in each
    dict of the list are collected, in list order, into one numpy array.
    """
    template = dict_list[0]
    stacked = {}
    for name in template.keys():
        stacked[name] = np.array([entry[name] for entry in dict_list])
    return stacked

def align_estimation_results_batch(Zs_true, res):
    """
    Align a batch of estimation results with the true labels.

    For every trial, each label permutation is scored by accuracy against the
    true labels and the best-scoring one is applied to `labels_`, `proba_`,
    `means_`, `covars_` and `transmat_`.

    The number of components is taken as the number of distinct values in
    `Zs_true`. `res` is updated in place and also returned.
    """
    n_c, n_t = len(np.unique(Zs_true)), len(Zs_true)

    # candidate permutations and the labels relabelled under each of them
    perms = generate_all_perms_as_arr(n_c)
    relabelled = permute_labels(res["labels_"], perms)

    # accuracy of every permutation on every trial; expected shape (n_t, n_p)
    accuracy = scorer_batch(accuracy_score, Zs_true, relabelled, has_params=True)
    winner_idx = accuracy.argmax(-1)   # (n_t,)
    winner_perm = perms[winner_idx]    # (n_t, n_c)

    # labels: pick the winning relabelling along the last (permutation) axis
    chosen_labels = np.take_along_axis(
        relabelled, winner_idx[:, np.newaxis, np.newaxis], axis=-1
    )
    res["labels_"] = chosen_labels.squeeze(axis=-1)

    # proba: reorder the component axis, same permutation for every time step of a trial
    res["proba_"] = np.take_along_axis(res["proba_"], winner_perm[:, np.newaxis, :], -1)

    # per-component parameters: reorder the last axis
    for key in ("means_", "covars_"):
        res[key] = np.take_along_axis(res[key], winner_perm, -1)

    # transition matrix: permute rows first, then columns, with the same permutation
    trial_idx = np.arange(n_t)[:, np.newaxis]
    rows_permuted = res["transmat_"][trial_idx, winner_perm]
    res["transmat_"] = np.take_along_axis(rows_permuted, winner_perm[:, np.newaxis, :], -1)
    return res

In [16]:
def model_fit_batch_with_params(model, Xs, Zs, param_grid=None):
    """
    Fit a model on a batch of data, once per hyperparameter setting.

    Without a grid this is exactly `model_fit_batch`. With a grid, the model is
    reconfigured through `model.set_params` for every grid point, fitted on the
    whole batch, and the per-setting outputs are stacked along a new last axis.
    The true labels are needed for the alignment done inside `model_fit_batch`.

    Parameters:
    ---------------------
    model: a model instance.

    Xs: array of size (n_t, n_s, n_f)
        input data

    Zs: array (n_t, n_s)
        true labels

    param_grid: dict, default None
        if None, will call `model_fit_batch` directly.

    Returns:
    -------------------------
    model_params_arr: (n_t, n_c**2 + n_c, n_l)

    labels_arr: array of size (n_t, n_s, n_l)

    proba_arr: array of size (n_t, n_s, n_c, n_l)

    (the trailing n_l axis, one entry per grid point, is absent when `param_grid` is None)
    """
    # no hyperparams: plain batch fit, no extra axis
    if param_grid is None:
        return model_fit_batch(model, Xs, Zs)

    fitted_per_setting = []
    for setting in tqdm(ParameterGrid(param_grid)):
        model.set_params(**setting)
        params_arr, labels_arr, proba_arr = model_fit_batch(model, Xs, Zs)
        fitted_per_setting.append((params_arr, labels_arr, proba_arr))

    # stack each of the three outputs across settings, setting index last
    return tuple(
        np.stack([fitted[position] for fitted in fitted_per_setting], axis=-1)
        for position in range(3)
    )

# testing

In [7]:
# Only the first n_t trials of each saved array are kept for these quick checks.
n_t=10
# Inputs stored under the "zheng" and "HMM" feature keys, and the matching true labels.
Xs_zheng = np.load(f"{path_data}/Xs_daily_1000_zheng.npy")[:n_t]
Xs_HMM = np.load(f"{path_data}/Xs_daily_1000_HMM.npy")[:n_t]
Zs = np.load(f"{path_data}/Zs_daily_1000.npy")[:n_t]

In [8]:
# Four 2-component models, all sharing the same seeded `random_state`:
# a Gaussian HMM, a discrete-state jump model, and two continuous-state jump
# models that differ only in `mode_loss`.
n_c=2
model_hmm = GaussianHMM_model(n_c, random_state=random_state)
model_discrete = jump_model(n_c, state_type="discrete", jump_penalty=1e2, random_state=random_state)
model_cont_mode = jump_model(n_c, state_type="cont", grid_size=.02, mode_loss=True, jump_penalty=1e3, random_state=random_state)
model_cont_no_mode = jump_model(n_c, state_type="cont", grid_size=.02, mode_loss=False, jump_penalty=1e3, random_state=random_state)

In [9]:
model_params_arr1, labels_arr1, proba_arr1 = model_fit_batch(model_hmm, Xs_HMM, Zs)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.41it/s]


In [10]:
model_params_arr2, labels_arr2, proba_arr2 = model_fit_batch(model_discrete, Xs_zheng, Zs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 147.82it/s]


In [11]:
model_params_arr3, labels_arr3, proba_arr3 = model_fit_batch(model_cont_mode, Xs_zheng, Zs)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.73it/s]


In [12]:
model_params_arr4, labels_arr4, proba_arr4 = model_fit_batch(model_cont_no_mode, Xs_zheng, Zs)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.38it/s]


In [15]:
param_grid = {"jump_penalty": [1e2, 1e3]}

In [19]:
model_params_arr5, labels_arr5, proba_arr5 = model_fit_batch_with_params(model_cont_no_mode, Xs_zheng, Zs, param_grid)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.16s/it]


# fitting on datasets

In [None]:
def model_fit_many_datas_models(key_data_list, key_feat_list, model_dict, param_grid, path_data, path_estimation, start = 0, end = -1, sub_job_no = ""):
    """
    train a collection of models, w/ hyperparams to tune, on a batch of data from many datasets.

    For every (data key, feature key) pair the inputs and true labels are loaded
    from `path_data`, restricted to the requested trial range, and every model in
    `model_dict` is fitted on them through `model_fit_batch_with_params`. Results
    are written with `save_estimation_results` and a progress line is printed
    after each (dataset, model) combination.

    Parameters:
    ---------------------
    key_data_list: list of data keys
        used in the file names `Xs_{key_data}_{key_feat}.npy` and `Zs_{key_data}.npy`.

    key_feat_list: list of feature keys

    model_dict: dict {key_model: model instance}

    param_grid: dict or None
        hyperparameter grid, forwarded unchanged for every model.

    path_data: str
        folder holding the `.npy` inputs.

    path_estimation: str
        forwarded to `save_estimation_results`.

    start: int, default 0
        index of the first trial to use.

    end: int or None, default -1
        exclusive index of the last trial to use. `-1` (the default) and `None`
        both mean "through the last trial". Any other value is used as a
        regular slice bound.

    sub_job_no: default ""
        forwarded to `save_estimation_results`.
    """
    # imported locally: the notebook does not import these names explicitly
    import time
    from itertools import product

    # A literal `[start:-1]` slice would silently drop the last trial of every
    # dataset, so the default is translated into an open-ended slice.
    stop = None if end is None or end == -1 else end
    trial_slice = slice(start, stop)

    N_combos = len(key_data_list) * len(key_feat_list) * len(model_dict)
    count = 0
    time_old = time.time(); total_time = 0.
    for key_data, key_feat in product(key_data_list, key_feat_list):
        # load data
        Xs = np.load(f"{path_data}/Xs_{key_data}_{key_feat}.npy")[trial_slice]
        Zs = np.load(f"{path_data}/Zs_{key_data}.npy")[trial_slice]
        for key_model, model in model_dict.items():
            # train the model, on a param grid, on a batch of data
            model_params_arr, labels_arr, proba_arr = model_fit_batch_with_params(model, Xs, Zs, param_grid)
            # save results
            save_estimation_results(model_params_arr, labels_arr, proba_arr, path_estimation, key_data, key_feat, key_model, sub_job_no)
            # timing bookkeeping: duration of this combination and running total
            time_now = time.time()
            time_this_iter = time_now - time_old
            total_time += time_this_iter
            time_old = time_now
            count += 1
            print(f"{count}/{N_combos} combos done. Time of this iter: {print_seconds(time_this_iter)}s. Total time: {print_seconds(total_time)}s.")

In [13]:
def temp(arr):
    """Column-wise mean of `arr`, returned as a pandas Series with one entry per column."""
    frame = pd.DataFrame(arr)
    return frame.mean()

In [26]:
temp(model_params_arr1)

0    0.000789
1    0.001830
2    0.007772
3    0.014325
4    0.004589
5    0.126405
dtype: float64

In [27]:
temp(model_params_arr2)

0    0.000723
1    0.000868
2    0.007905
3    0.016297
4    0.002576
5    0.014046
dtype: float64

In [None]:
temp(model_params_arr3)

0    0.000722
1    0.000545
2    0.008031
3    0.014882
4    0.002833
5    0.013157
dtype: float64

In [None]:
temp(model_params_arr4)

0    0.000729
1    0.000568
2    0.008035
3    0.014866
4    0.003206
5    0.014553
dtype: float64