In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to the location of your local project checkout.
# It is appended to sys.path so that project modules (e.g. `regime`) can be imported.
path = expanduser("~/Documents/G3_2/regime-identification")
sys.path.append(path)

# Root folder of the simulation artifacts; the four sub-folders below are derived from it.
path_file = expanduser("~/data/G3_2/regime-identification/simulation")
path_data = f"{path_file}/data"  # true label arrays (Zs_*.npy) are read from here
path_estimation = f"{path_file}/estimation"  # estimation arrays (*.npy) are read from here
path_score = f"{path_file}/score"  # score arrays and summary tables are written here
path_figure = f"{path_file}/figure"  # rendered summary images are written here

In [2]:
import numpy as np

In [79]:
from regime.simulation_helper import *

In [None]:
# Simulation settings. NOTE(review): n_s_lst, n_t and n_buffer are not referenced again in
# the visible cells; presumably they are leftovers or consumed elsewhere — confirm or remove.
n_s_lst = [250, 500, 1000]
n_t, n_buffer = 1024, 20

# generate_key_data_list_dict is not defined in this notebook; presumably it is provided by the
# star import from regime.simulation_helper. The result is indexed by `n_c` in a later cell.
key_data_list_dict = generate_key_data_list_dict()

In [None]:
# batch_size is passed to model_fit_many_datas_models in a later cell.
# NOTE(review): num_of_batch is not referenced in the visible cells; the loader/scorer functions
# take `number_of_batch` and fall back to their own defaults (1 and 32) — confirm this is intended.
batch_size, num_of_batch = 32, 1

In [5]:
def generate_off_diagonal_idx(n_c):
    """
    list the off-diagonal positions "ij" (1-based, i != j) of an n_c x n_c matrix.

    Parameters:
    ----------------------------------
    n_c: int
        the number of classes.

    Returns:
    ----------------------------------
    list of str, ordered by i first and then by j, e.g. ["12", "21"] when n_c is 2.
    """
    states = range(1, n_c+1)
    return [f"{row}{col}" for row, col in product(states, states) if row != col]

def generate_summary_index(n_c):
    """
    build the LaTeX row labels of the model parameter summary.

    Parameters:
    ----------------------------------
    n_c: int
        the number of classes.

    Returns:
    ----------------------------------
    list of str: the mu labels for classes 1..n_c, then the sigma labels for classes 1..n_c,
        then one gamma label per off-diagonal pair from generate_off_diagonal_idx(n_c).
    """
    # raw f-strings: a LaTeX backslash is not a valid Python escape, and a non-raw literal
    # triggers an invalid-escape-sequence warning; the resulting text is unchanged.
    index = [rf"$\mu_{i}$" for i in range(1, n_c+1)]
    index += [rf"$\sigma_{i}$" for i in range(1, n_c+1)]
    index += [rf"$\gamma_{{{ij}}}$" for ij in generate_off_diagonal_idx(n_c)]
    return index

def compute_BAC_std_from_acc_arr(acc_arr):
    """
    compute the std of the balanced accuracy (BAC) from per-class accuracies.

    BAC is the equal-weight average of the per-class accuracies, so its variance is the
    quadratic form w' C w, where C is the covariance matrix of the per-class accuracies
    (from pandas' DataFrame.cov) and w = (1/n_c, ..., 1/n_c).

    Parameters:
    ----------------------------------
    acc_arr: size (n_s, n_c) or (n_s, n_c, n_l)
        the accuracy of k-th class in n-th trial. Any array that is not 2-d is treated as
        a stack along its last axis and scored slice by slice.

    Returns:
    ----------------------------------
    BAC_std: a scalar for 2-d input (0 when the quadratic form is not positive),
        or an array of size (n_l,) for stacked input.
    """
    if acc_arr.ndim != 2:
        # stacked input: recurse on every slice of the last axis
        return np.array([compute_BAC_std_from_acc_arr(acc_arr[..., i_l]) for i_l in range(acc_arr.shape[-1])])
    n_classes = acc_arr.shape[1]
    weights = np.repeat(1/n_classes, n_classes)
    cov_mat = pd.DataFrame(acc_arr).cov().to_numpy()
    bac_var = weights @ (cov_mat @ weights)
    # a non-positive quadratic form is reported as zero spread instead of taking its root
    if bac_var <= 0:
        return 0
    return np.sqrt(bac_var)

In [59]:
def generate_summary_df(model_params_arr, scoring_results):
    """
    summarize estimated model parameters and scores by their mean and std over axis 0.

    Parameters:
    ----------------------------------
    model_params_arr: array
        estimated model parameters; reduced over axis 0 with nanmean/nanstd. The reduced
        first axis is labelled with generate_summary_index(n_c), so its length must match.

    scoring_results: dict
        maps a score name to its array. Must contain "acc", whose axis 1 is the class axis;
        every other entry is reduced over axis 0 and becomes one row named after its key.

    Returns:
    ----------------------------------
    means_df, stds_df: DataFrame
        columns are the parameter labels, "Accuracy k", "BAC", and the remaining score names.

    summary_df: DataFrame
        the "mean (std)" strings built by combine_means_std_df(means_df, stds_df).
    """
    def _listify(value):
        # pd.DataFrame cannot be built from a bare scalar
        return [value] if np.isscalar(value) else value

    def _as_row(values, label):
        return pd.DataFrame(_listify(values), columns=[label]).T

    def _stack_rows(frames_by_name):
        return pd.concat(frames_by_name.values(), axis=0)

    acc_arr = scoring_results["acc"]
    n_c = acc_arr.shape[1]
    param_index = generate_summary_index(n_c)
    acc_index = [f"Accuracy {i}" for i in range(1, n_c+1)]
    # insertion order fixes the column order of the result: params, accuracies, BAC, other scores
    means_dict = {
        "model_params": pd.DataFrame(np.nanmean(model_params_arr, axis=0), index=param_index),
        "acc": pd.DataFrame(np.nanmean(acc_arr, axis=0), index=acc_index),
    }
    stds_dict = {
        "model_params": pd.DataFrame(np.nanstd(model_params_arr, axis=0), index=param_index),
        "acc": pd.DataFrame(np.nanstd(acc_arr, axis=0), index=acc_index),
    }
    # BAC mean is the average of the per-class mean accuracies; its std needs their covariance
    means_dict["BAC"] = pd.DataFrame(means_dict["acc"].mean(0), columns=["BAC"]).T
    stds_dict["BAC"] = _as_row(compute_BAC_std_from_acc_arr(acc_arr), "BAC")
    for name, score_arr in scoring_results.items():
        if name != "acc":
            means_dict[name] = _as_row(np.nanmean(score_arr, axis=0), name)
            stds_dict[name] = _as_row(np.nanstd(score_arr, axis=0), name)
    means_df = _stack_rows(means_dict).T
    stds_df = _stack_rows(stds_dict).T
    return means_df, stds_df, combine_means_std_df(means_df, stds_df)


def combine_means_std_df(means_df, stds_df):
    """
    merge a table of means and a table of stds into one table of "mean (std)" strings.

    Parameters:
    ----------------------------------
    means_df, stds_df: DataFrame
        stds_df is looked up with the index and column labels of means_df.

    Returns:
    ----------------------------------
    DataFrame with the index and columns of means_df; every cell is formatted
    with four decimals, e.g. "0.8593 (0.1800)".
    """
    row_labels, col_labels = means_df.index, means_df.columns
    combined = pd.DataFrame(index=row_labels, columns=col_labels)
    for row in row_labels:
        for col in col_labels:
            mean_val = means_df.loc[row, col]
            std_val = stds_df.loc[row, col]
            combined.loc[row, col] = f"{mean_val:.4f} ({std_val:.4f})"
    return combined

In [84]:
def load_combine_estimation_arrs(name, key, path_estimation, number_of_batch=1, batch_size=32):
    """
    load one kind of estimation array from every batch file and stack the batches.

    The files "{path_estimation}/{name}{key}_{i}.npy" are read for i in range(number_of_batch);
    only the first batch_size entries (along axis 0) of each file are kept, and the kept
    pieces are concatenated along axis 0.
    """
    batches = []
    for i_batch in range(number_of_batch):
        batch_arr = np.load(f"{path_estimation}/{name}{key}_{i_batch}.npy")
        batches.append(batch_arr[:batch_size])
    return np.concatenate(batches, axis=0)

def load_combine_estimation_results(key, path_estimation, number_of_batch=1, batch_size=32):
    """
    load and combine the three estimation arrays of one model, from distributed locations.

    The arrays are read through load_combine_estimation_arrs with the file prefixes
    "modelParams_", "labels_" and "proba_", and are returned in that order.
    """
    model_params_, labels_, proba_ = (
        load_combine_estimation_arrs(prefix, key, path_estimation, number_of_batch, batch_size)
        for prefix in ("modelParams_", "labels_", "proba_")
    )
    return model_params_, labels_, proba_

In [74]:
"3StateDaily1000_HMM_true"
model_params_, labels_, proba_ = load_combine_estimation_results("2StateDaily1000_HMM_true", path_estimation)

'3StateDaily1000_HMM_true'

In [4]:
def scoring_labels_proba_(Zs_true, labels_arr, proba_arr):
    """
    score estimated labels and probabilities against the true labels.

    Parameters:
    ----------------------------------
    Zs_true: array
        the true labels; the number of classes is the number of unique values in it.

    labels_arr: array
        the estimated labels; a 3-d array switches scorer_batch to has_params=True.

    proba_arr: array
        the estimated probabilities; only index 1 of the third axis is scored.

    Returns:
    ----------------------------------
    dict with "acc" (accuracy_each_cluster via scorer_batch) and "ROC-AUC" (roc_auc_score via scorer_batch).
    """
    n_classes = len(np.unique(Zs_true))
    stacked = labels_arr.ndim == 3
    return {
        "acc": scorer_batch(accuracy_each_cluster, Zs_true, labels_arr, n_classes, has_params=stacked),
        "ROC-AUC": scorer_batch(roc_auc_score, Zs_true, proba_arr[:, :, 1], has_params=stacked, idx=True),
    }

def save_scoring_results(scoring_results, key, path_score):
    """
    save every score array of one model through np_save_print.

    Each entry of scoring_results goes to "{path_score}/{score_name}_{key}.npy",
    and the score name is forwarded to np_save_print as the third argument.
    """
    for metric in scoring_results:
        target = f"{path_score}/{metric}_{key}.npy"
        np_save_print(target, scoring_results[metric], metric)

In [None]:
def score_and_summary_model(key, path_data, path_estimation, path_score, number_of_batch=1, batch_size=32):
    """
    score and summary one model estimation results, from distributed locations.

    Parameters:
    ----------------------------------
    key: str
        the model key; the text before its first "_" is the data key used to locate
        the true labels file "{path_data}/Zs_{key_data}.npy".

    path_data, path_estimation, path_score: str
        folders of the true labels, the estimation arrays, and the saved scores.

    number_of_batch, batch_size: int
        the number of batch files to combine and the number of entries kept per batch;
        the true labels are truncated to number_of_batch * batch_size entries to match.

    Returns:
    ----------------------------------
    means_df, stds_df, summary_df: the three tables produced by generate_summary_df.
    """
    key_data = key.split("_")[0]
    # load true labels
    Zs = np.load(f"{path_data}/Zs_{key_data}.npy")[:number_of_batch * batch_size]
    # load estimation results
    model_params_, labels_, proba_ = load_combine_estimation_results(key, path_estimation, number_of_batch, batch_size)
    # scoring classification accuracy
    scoring_results = scoring_labels_proba_(Zs, labels_, proba_)
    # save scores
    save_scoring_results(scoring_results, key, path_score)
    # combine model params estimation with accuracy scores; the params come from the
    # array loaded above (there is no `estimation_results` dict in this function)
    means_df, stds_df, summary_df = generate_summary_df(model_params_, scoring_results)
    return means_df, stds_df, summary_df

In [None]:
# NOTE(review): n_c, key_feat_hmm, model_dict_hmm, param_grid_hmm and job_id are not assigned in
# any visible cell, so this call presumably fails on a fresh kernel — define them first or remove the cell.
model_fit_many_datas_models(key_data_list_dict[n_c], [key_feat_hmm], model_dict_hmm, param_grid_hmm, path_data, path_estimation, job_id, batch_size)

In [90]:
import imgkit
def save_df_as_fig(df, fig_path):
    """
    save a DataFrame as an image: its styled HTML table is handed to imgkit.from_string.

    Returns whatever imgkit.from_string returns for the given output path.
    """
    html_table = df.style.to_html()
    return imgkit.from_string(html_table, fig_path)

In [92]:
# Smoke test of save_df_as_fig on a random 4x3 frame (no seed is set, so the values differ per run).
df = pd.DataFrame(np.random.randn(4, 3))
save_df_as_fig(df, f"{path_figure}/test.jpeg")

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


True

In [None]:
def score_and_summary_many_models(key_data_list, key_feat_list, key_model_list, path_data, path_estimation, path_score, path_figure, number_of_batch=1, batch_size=32):
    """
    score and summary every (data, feature, model) combination, and save the resulting tables.

    For each combination the model key is "{key_data}_{key_feat}_{key_model}". The means, stds
    and summary tables from score_and_summary_model are written to
    "{path_score}/{means|stds|summary}_{key}.h5", and the summary table is also rendered to
    "{path_figure}/summary_{key}.jpeg".

    Note: score_and_summary_model recovers the data key as the text before the first "_"
    of the model key, so the entries of key_data_list must not contain "_".

    Returns:
    ----------------------------------
    None
    """
    for key_data, key_feat, key_model in product(key_data_list, key_feat_list, key_model_list):
        key = f"{key_data}_{key_feat}_{key_model}"
        means_df, stds_df, summary_df = score_and_summary_model(key, path_data, path_estimation, path_score, number_of_batch, batch_size)
        tables = {"means": means_df, "stds": stds_df, "summary": summary_df}
        for table_name, table in tables.items():
            # key/mode are passed by keyword: pandas deprecates passing them positionally to to_hdf
            table.to_hdf(f"{path_score}/{table_name}_{key}.h5", key=table_name, mode="w")
        save_df_as_fig(summary_df, f"{path_figure}/summary_{key}.jpeg")
    

In [None]:
# def score_and_summary_models(key_data_list, key_feat_list, key_model_list, number_of_batch=1):
#     """
#     score and summary models.
#     """
#     for key_data in key_data_list:
#         # true labels
#         Zs = np.load(f"{path_data}/Zs_{key_data}.npy")
#         n_c = len(np.unique(Zs))
#         for key_feat, key_model in product(key_feat_list, key_model_list):
#             key = f"{key_data}_{key_feat}_{key_model}"
#             estimation_restuls = {}
#             for name in ["model_params_", "labels_", "proba_"]:
#                 estimation_results[name] = load_and_combine_estimation_arrs(name, key, number_of_batch)
#             scoring_results = scoring_labels_proba(Zs, estimation_results["labels_"], estimation_results["proba_"])
                
                
            
            
            
            
#             # load estimation results
#             labels_arr, proba_arr = np.load(f"{path_estimation}/labels_{key}.npy"), np.load(f"{path_estimation}/proba_{key}.npy")
#             # scoring
#             acc_arr = scorer_batch(accuracy_each_cluster, Zs, labels_arr, n_c)
#             roc_auc_arr = scorer_batch(roc_auc_score, Zs, proba_arr[..., 1], idx=True)
#             # save scores
#             np_save_print(f"{path_score}/acc_{key}.npy", acc_arr, "accuracy score")
#             np_save_print(f"{path_score}/roc_auc_{key}.npy", roc_auc_arr, "roc auc score")
#             # load model params
#             model_params_arr = np.load(f"{path_estimation}/model_params_{key}.npy")
            

In [39]:
# def score_and_summary_model(key_data, key_feat, key_model, path_data, path_estimation, path_score, number_of_batch=1, batch_size=32):
#     key=f"{key_data}_{key_feat}_{key_model}"
#     # load true labels
#     Zs = np.load(f"{path_data}/Zs_{key_data}.npy")[:number_of_batch * batch_size]
#     # load and combine estimation results
#     model_params_arr, labels_arr, proba_arr = load_combine_estimation_results()
#     estimation_results = {}
#     for name in ["model_params_", "labels_", "proba_"]:
#         estimation_results[name] = load_and_combine_estimation_arrs(name, key, path_estimation, number_of_batch)[:number_of_batch * batch_size]
#     # scoring classification accuracy
#     scoring_results = scoring_labels_proba_(Zs, estimation_results["labels_"], estimation_results["proba_"])
#     # save scores
#     save_scoring_results(scoring_results, key, path_score)
#     # combine model params estimation with accuracy scores
#     means_df, stds_df, summary_df = generate_summary_df(estimation_results["model_params_"], scoring_results)
#     return means_df, stds_df, summary_df
#     # save as picture for convenience    

In [35]:
# NOTE(review): this call passes (key_data, key_feat, key_model, ...) separately, which matches the
# commented-out earlier definition but not the current score_and_summary_model(key, path_data, ...)
# signature; the recorded output presumably comes from that earlier version — update before re-running.
means_df, stds_df, summary_df = score_and_summary_model("daily_1000", "HMM", "HMM", path_data, path_estimation, path_score)

shape of the saved acc: (32, 2).
shape of the saved ROC-AUC: (32,).


In [40]:
# NOTE(review): stale call — it passes key_data, key_feat and key_model as three separate arguments,
# whereas the current score_and_summary_model takes a single combined `key` first; update before re-running.
means_df, stds_df, summary_df = score_and_summary_model("daily_1000", "HMM", "true", path_data, path_estimation, path_score)

shape of the saved acc: (32, 2).
shape of the saved ROC-AUC: (32,).


In [60]:
# NOTE(review): stale call — it passes key_data, key_feat and key_model as three separate arguments,
# whereas the current score_and_summary_model takes a single combined `key` first; update before re-running.
means_df, stds_df, summary_df = score_and_summary_model("daily_1000", "zheng", "discrete", path_data, path_estimation, path_score)

shape of the saved acc: (32, 2, 15).
shape of the saved ROC-AUC: (32, 15).
[0.85109964 0.85155089 0.85073145 0.85772333 0.85113359 0.88624524
 0.93599888 0.94829801 0.83063785 0.5        0.5        0.5
 0.5        0.5        0.5       ]


In [51]:
# Reload the saved per-class accuracy and ROC-AUC arrays of one model for manual inspection.
acc=np.load(f"{path_score}/acc_daily_1000_zheng_discrete.npy")
roc_auc=np.load(f"{path_score}/ROC-AUC_daily_1000_zheng_discrete.npy")

In [53]:
np.nanmean(roc_auc, 0)

array([0.85109964, 0.85155089, 0.85073145, 0.85772333, 0.85113359,
       0.88624524, 0.93599888, 0.94829801, 0.83063785, 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ])

In [49]:
np.nanmean(acc, axis=0)#.shape

array([[0.85929377, 0.86009896, 0.86086198, 0.86309048, 0.87171385,
        0.88683847, 0.93994484, 0.98006696, 0.98845667, 0.96875   ,
        0.96875   , 0.96875   , 0.96875   , 0.96875   , 0.96875   ],
       [0.78525851, 0.7852808 , 0.78292278, 0.79426534, 0.77001373,
        0.82620043, 0.89691497, 0.91510925, 0.67495668, 0.03703704,
        0.03703704, 0.03703704, 0.03703704, 0.03703704, 0.03703704]])

In [61]:
summary_df

Unnamed: 0,$\mu_1$,$\mu_2$,$\sigma_1$,$\sigma_2$,$\gamma_{12}$,$\gamma_{21}$,Accuracy 1,Accuracy 2,BAC,ROC-AUC
0,0.0007 (0.0009),-0.0005 (0.0024),0.0076 (0.0009),0.0150 (0.0041),0.0558 (0.0484),0.1315 (0.0482),0.8593 (0.1800),0.7853 (0.1786),0.8223 (0.1408),0.8511 (0.1236)
1,0.0007 (0.0009),-0.0006 (0.0024),0.0076 (0.0009),0.0151 (0.0041),0.0530 (0.0457),0.1260 (0.0452),0.8601 (0.1800),0.7853 (0.1782),0.8227 (0.1408),0.8516 (0.1235)
2,0.0007 (0.0009),-0.0006 (0.0024),0.0076 (0.0009),0.0151 (0.0042),0.0471 (0.0401),0.1145 (0.0407),0.8609 (0.1795),0.7829 (0.1774),0.8219 (0.1402),0.8507 (0.1228)
3,0.0009 (0.0008),-0.0007 (0.0022),0.0077 (0.0008),0.0150 (0.0042),0.0365 (0.0312),0.0908 (0.0365),0.8631 (0.1785),0.7943 (0.1800),0.8287 (0.1420),0.8577 (0.1245)
4,0.0008 (0.0008),-0.0007 (0.0021),0.0079 (0.0007),0.0149 (0.0045),0.0226 (0.0209),0.0588 (0.0256),0.8717 (0.1798),0.7700 (0.2249),0.8209 (0.1795),0.8511 (0.1632)
5,0.0008 (0.0006),-0.0005 (0.0020),0.0079 (0.0005),0.0149 (0.0044),0.0115 (0.0105),0.0326 (0.0154),0.8868 (0.1694),0.8262 (0.1842),0.8565 (0.1416),0.8862 (0.1220)
6,0.0008 (0.0005),-0.0006 (0.0022),0.0079 (0.0003),0.0152 (0.0040),0.0041 (0.0028),0.0200 (0.0147),0.9399 (0.1050),0.8969 (0.0874),0.9184 (0.0677),0.9360 (0.0473)
7,0.0007 (0.0002),-0.0002 (0.0023),0.0079 (0.0003),0.0157 (0.0032),0.0019 (0.0013),0.0197 (0.0225),0.9801 (0.0280),0.9151 (0.0838),0.9476 (0.0406),0.9483 (0.0397)
8,0.0007 (0.0003),-0.0008 (0.0023),0.0084 (0.0010),0.0170 (0.0012),0.0009 (0.0008),0.0094 (0.0063),0.9885 (0.0233),0.6750 (0.3316),0.8317 (0.1659),0.8306 (0.1629)
9,0.0005 (0.0003),0.0007 (0.0000),0.0096 (0.0015),0.0145 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.9688 (0.1740),0.0370 (0.1889),0.5029 (0.0000),0.5000 (0.0000)


In [41]:
summary_df

Unnamed: 0,$\mu_1$,$\mu_2$,$\sigma_1$,$\sigma_2$,$\gamma_{12}$,$\gamma_{21}$,Accuracy 1,Accuracy 2,BAC,ROC-AUC
0,0.0006 (0.0000),-0.0008 (0.0000),0.0078 (0.0000),0.0174 (0.0000),0.0021 (0.0000),0.0120 (0.0000),0.9915 (0.0186),0.9120 (0.1871),0.9517 (0.0943),0.9988 (0.0020)


In [36]:
summary_df

Unnamed: 0,$\mu_1$,$\mu_2$,$\sigma_1$,$\sigma_2$,$\gamma_{12}$,$\gamma_{21}$,Accuracy 1,Accuracy 2,BAC,ROC-AUC
0,0.0006 (0.0003),0.0002 (0.0058),0.0078 (0.0002),0.0142 (0.0056),0.0083 (0.0176),0.1665 (0.3205),0.9884 (0.0200),0.9130 (0.1873),0.9507 (0.0944),0.9756 (0.1162)


In [37]:
stds_df

Unnamed: 0,$\mu_1$,$\mu_2$,$\sigma_1$,$\sigma_2$,$\gamma_{12}$,$\gamma_{21}$,Accuracy 1,Accuracy 2,BAC,ROC-AUC
0,0.000313,0.005751,0.000249,0.005596,0.017612,0.320495,0.01997,0.187276,0.094413,0.116222


In [4]:
# NOTE(review): this file name uses the "model_params_" prefix and no batch suffix, whereas
# load_combine_estimation_arrs reads "modelParams_{key}_{i}.npy" — presumably an older naming; confirm.
model_params_arr = np.load(f"{path_estimation}/model_params_daily_1000_zheng_cont_mode0.npy")