In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
## actions required!!!!!!!!!!!!!!!!!!!! change your folder path 
path = expanduser("~/Documents/G3_2/regime-identification")
sys.path.append(path)

path_file = expanduser("~/data/G3_2/regime-identification/simulation")
path_data = f"{path_file}/data"
path_estimation = f"{path_file}/estimation"
path_score = f"{path_file}/score"
path_figure = f"{path_file}/figure"

In [2]:
import numpy as np
from sklearn.metrics import roc_auc_score
from itertools import product

from numpy.random import RandomState
random_state = RandomState(0)

In [3]:
from regime.simulation_helper import *

# 0-generate-data-estimate-true-model

In this notebook we systematically generate the simulation data, estimate the labels and probability by the true HMM model, and score them. 

# 2-state models: 9 combinations

- scale: We use the parameters estimated in the classical Hardy's paper, and convert into three scales: **daily, weekly, monthly**, with decreasing persistency.
- length: We simulate seqs of different length: 250, 500, 1000.

For each combo, we simulate `n_t=1000` seqs. The data in each combo are saved in a batch, thus in the shape of `(n_t, n_s, n_f)`. Also since we need to do feature engineering, every seq is 20 periods longer.

## data generation & true model estimation

In [4]:
scale_lst = ["daily", "weekly", "monthly"]
n_s_lst = [250, 500, 1000]
key_data_list = [f"{scale}_{n_s}" for scale in scale_lst for n_s in n_s_lst]

n_t, n_buffer = 1024, 20

In [None]:
key_feat, key_model = "HMM", "true"

for scale in scale_lst:
    # get a true HMM model
    hmm_true = get_GaussianHMM_model(*load_hardy_params(scale), random_state=random_state)
    for n_s in n_s_lst:
        # generate key for data
        key_data = f"{scale}_{n_s}"
        # simulate Xs, Zs.
        Xs, Zs = sample_from_model(hmm_true, n_trials=n_t, n_samples=n_s+n_buffer)
        np_save_print(f"{path_data}/Xs_{key_data}_raw.npy", Xs, "Xs raw")
        Xs, Zs = Xs[:, -n_s:], Zs[:, -n_s:]
        np_save_print(f"{path_data}/Xs_{key_data}_HMM.npy", Xs, "Xs")
        np_save_print(f"{path_data}/Zs_{key_data}.npy", Zs, "Zs") 
        # estimate by the true HMM model.
        model_params_arr, labels_arr, proba_arr = model_true_fit_batch(hmm_true, Xs)
        # save estimation results
        save_estimation_results(model_params_arr, labels_arr, proba_arr, path_estimation, key_data, key_feat, key_model)

## scoring & summary

In [10]:
def score_and_summary_models(key_data_list, key_feat_list, key_model_list):
    """
    score and summary models.
    """
    for key_data in key_data_list:
        # true labels
        Zs = np.load(f"{path_data}/Zs_{key_data}.npy")
        n_c = len(np.unique(Zs))
        for key_feat, key_model in product(key_feat_list, key_model_list):
            key = f"{key_data}_{key_feat}_{key_model}"
            # load estimation results
            labels_arr, proba_arr = np.load(f"{path_estimation}/labels_{key}.npy"), np.load(f"{path_estimation}/proba_{key}.npy")
            # scoring
            acc_arr = scorer_batch(accuracy_each_cluster, Zs, labels_arr, n_c)
            roc_auc_arr = scorer_batch(roc_auc_score, Zs, proba_arr[..., 1], idx=True)
            # save scores
            np_save_print(f"{path_score}/acc_{key}.npy", acc_arr, "accuracy score")
            np_save_print(f"{path_score}/roc_auc_{key}.npy", roc_auc_arr, "roc auc score")
            # load model params
            model_params_arr = np.load(f"{path_estimation}/model_params_{key}.npy")
            

In [19]:
key = f"{key_data}_{key_feat}_{key_model}"
model_params_arr = np.load(f"{path_estimation}/model_params_{key}.npy")
key

'daily_250_HMM_true'

In [106]:
print(f"{{}}")

{}


In [108]:
def generate_off_diagonal_idx(n_c):
    res = []
    for i, j in product(range(1, n_c+1), repeat=2):
        if i != j:
            res.append(f"{i}{j}")
    return res

def generate_summary_index(n_c):
    index=[]
    index += [f"$\mu_{i}$" for i in range(1, n_c+1)]
    index += [f"$\sigma_{i}$" for i in range(1, n_c+1)]
    index += [f"$\gamma_{{{ij}}}$" for ij in generate_off_diagonal_idx(n_c)]
    return index

In [71]:
def compute_acc_mean_std(acc_arr):
    """
    compute the mean/std of accuracy per class and balanced accuracy.
    
    Parameters:
    ----------------------------------
    acc_arr: size (n_s, n_c)
        the accuracy of k-th class in n-th trial.
        
    Returns:
    ----------------------------------
    acc_mean,
    
    acc_std
    """
    n_c = acc_arr.shape[1]
    acc_df = pd.DataFrame(acc_arr, columns=[f"Accuracy {i}" for i in range(1, n_c+1)])
    # mean
    acc_mean = acc_df.mean()
    acc_mean["BAC"] = acc_mean.mean()
    # std
    acc_std = acc_df.std()
    acc_cov = acc_df.cov()
    vec = np.repeat(1/n_c, n_c)
    acc_std["BAC"] = np.sqrt(vec @ (acc_cov.to_numpy() @ vec))
    # acc_std.index = [name + " std" for name in acc_std.index]
    return acc_mean, acc_std   #, BAC_mean, BAC_std

In [78]:
def get_summary_ser(model_params_arr, acc_arr, roc_auc_arr):
    """
    get summary ser for the estimation results of a model
    """
    n_c = acc_arr.shape[1]
    index = generate_summary_index(n_c)
    model_params_df = pd.DataFrame(model_params_arr, columns=index)
    model_params_mean = model_params_df.mean()
    model_params_std = model_params_df.std()
    model_params_std.iloc[np.isclose(model_params_std, 0)] = 0
    # model_params_std = model_params_std.mask(model_params_std<1e-10, 0)    
    
    # acc
    acc_mean, acc_std = compute_acc_mean_std(acc_arr)
    # roc auc
    roc_auc_df = pd.DataFrame(roc_auc_arr, columns=['ROC-AUC'])
    roc_auc_mean, roc_auc_std = roc_auc_df.mean(), roc_auc_df.std()
    means = pd.concat([model_params_mean, acc_mean, roc_auc_mean])
    stds = pd.concat([model_params_std, acc_std, roc_auc_std])
    stds.index = [name + " std" for name in stds.index]
    return pd.concat([means, stds])

In [80]:
from sklearn.model_selection import ParameterGrid

In [81]:
PG = ParameterGrid({"lambd": [.1, 10.], "w":[3, 5]})

In [83]:
PG[0]

{'w': 3, 'lambd': 0.1}

In [116]:
def get_summary_df_params(model_params_arr, acc_arr, roc_auc_arr, PG):
    """
    get summary df for the estimation results of a model w/ hyperparams
    """
    n_l = len(PG)
    summary_df = pd.concat([get_summary_ser(model_params_arr[..., i], acc_arr[..., i], roc_auc_arr[..., i]) for i in range(n_l)], axis=1).T
    summary_df.insert(0, "hyperparam", PG)
    return summary_df

In [94]:
def repeat_arr(arr, num):
    return np.repeat(arr[...,np.newaxis], num, -1)

In [95]:
repeat_arr(model_params_arr, 4).shape

(1024, 6, 4)

In [137]:
summary_df = get_summary_df_params(repeat_arr(model_params_arr, 4), repeat_arr(acc_arr, 4), repeat_arr(roc_auc_arr, 4), PG).T

In [133]:
import pandas as pd

# create a sample dataframe
df = pd.DataFrame({'Model': ['Model A', 'Model B', 'Model C'],
                   'Accuracy': [0.90, 0.92, 0.88],
                   'Precision': [0.91, 0.93, 0.89],
                   'Recall': [0.92, 0.94, 0.90]})

# set the index to be the Model column
df.set_index('Model', inplace=True)

# apply formatting to the dataframe using pandas.DataFrame.style method
styled_df = df.style#.background_gradient()

In [134]:
type(styled_df)

pandas.io.formats.style.Styler

In [135]:
import dataframe_image as dfi 

In [144]:
import imgkit



In [145]:
imgkit.from_url('http://google.com', 'out.jpg')
imgkit.from_file('test.html', 'out.jpg')
imgkit.from_string('Hello!', 'out.jpg')

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


True

In [146]:
imgkit.from_string(summary_df.style.to_html(), 'summary.jpeg')

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


True

In [151]:
summary_df.style.to_latex()

"\\begin{tabular}{lllll}\n & 0 & 1 & 2 & 3 \\\\\nhyperparam & {'lambd': 0.1, 'w': 3} & {'lambd': 0.1, 'w': 5} & {'lambd': 10.0, 'w': 3} & {'lambd': 10.0, 'w': 5} \\\\\n$\\mu_1$ & 0.000615 & 0.000615 & 0.000615 & 0.000615 \\\\\n$\\mu_2$ & -0.000785 & -0.000785 & -0.000785 & -0.000785 \\\\\n$\\sigma_1$ & 0.007759 & 0.007759 & 0.007759 & 0.007759 \\\\\n$\\sigma_2$ & 0.017397 & 0.017397 & 0.017397 & 0.017397 \\\\\n$\\gamma_{12}$ & 0.002116 & 0.002116 & 0.002116 & 0.002116 \\\\\n$\\gamma_{21}$ & 0.011982 & 0.011982 & 0.011982 & 0.011982 \\\\\nAccuracy 1 & 0.989162 & 0.989162 & 0.989162 & 0.989162 \\\\\nAccuracy 2 & 0.860320 & 0.860320 & 0.860320 & 0.860320 \\\\\nBAC & 0.924741 & 0.924741 & 0.924741 & 0.924741 \\\\\nROC-AUC & 0.992278 & 0.992278 & 0.992278 & 0.992278 \\\\\n$\\mu_1$ std & 0.000000 & 0.000000 & 0.000000 & 0.000000 \\\\\n$\\mu_2$ std & 0.000000 & 0.000000 & 0.000000 & 0.000000 \\\\\n$\\sigma_1$ std & 0.000000 & 0.000000 & 0.000000 & 0.000000 \\\\\n$\\sigma_2$ std & 0.000000 & 0

In [127]:
print(styled_df.to_latex())

\begin{tabular}{lrrr}
 & Accuracy & Precision & Recall \\
Model &  &  &  \\
Model A & 0.900000 & 0.910000 & 0.920000 \\
Model B & 0.920000 & 0.930000 & 0.940000 \\
Model C & 0.880000 & 0.890000 & 0.900000 \\
\end{tabular}



In [129]:
# save the styled dataframe as a PNG file
styled_df.to_image('performance_metrics.png')

AttributeError: 'Styler' object has no attribute 'to_image'

In [113]:
ser = get_summary_ser(model_params_arr, acc_arr, roc_auc_arr)

In [99]:
type(ser)

pandas.core.series.Series

In [115]:
pd.concat([ser, ser], axis=1)

Unnamed: 0,0,1
$\mu_1$,0.000615,0.000615
$\mu_2$,-0.000785,-0.000785
$\sigma_1$,0.007759,0.007759
$\sigma_2$,0.017397,0.017397
$\gamma_{12}$,0.002116,0.002116
$\gamma_{21}$,0.011982,0.011982
Accuracy 1,0.989162,0.989162
Accuracy 2,0.86032,0.86032
BAC,0.924741,0.924741
ROC-AUC,0.992278,0.992278


In [54]:
index = generate_summary_index(n_c)
model_params_df = pd.DataFrame(model_params_arr, columns=index)
model_params_mean = model_params_df.mean()
model_params_std = model_params_df.std()
model_params_std = model_params_std.mask(model_params_std<1e-10, 0)

In [55]:
acc_arr.shape[1]

(1024, 2)

In [64]:
get_summary_ser(model_params_arr, acc_arr, roc_auc_arr)

($\mu_1$        0.000615
 $\mu_2$       -0.000785
 $\sigma_1$     0.007759
 $\sigma_2$     0.017397
 $\gamma_12$    0.002116
 $\gamma_21$    0.011982
 Accuracy 1     0.989162
 Accuracy 2     0.860320
 BAC            0.924741
 ROC-AUC        0.992278
 dtype: float64,
 $\mu_1$        0.000000
 $\mu_2$        0.000000
 $\sigma_1$     0.000000
 $\sigma_2$     0.000000
 $\gamma_12$    0.000000
 $\gamma_21$    0.000000
 Accuracy 1     0.079449
 Accuracy 2     0.274427
 BAC            0.138061
 ROC-AUC        0.045641
 dtype: float64)

Unnamed: 0,ROC-AUC
0,1.000000
1,0.967611
2,0.999320
3,0.999558
4,1.000000
...,...
1019,
1020,0.998600
1021,
1022,


In [7]:
key_data_list_temp = key_data_list[:1]

In [8]:
key_feat_list=['HMM']
key_model_list=['true']

shape of the saved accuracy score: (1024, 2).
shape of the saved roc auc score: (1024,).


In [10]:
acc_arr = scorer_batch(accuracy_each_cluster, Zs, labels_arr)

In [18]:
roc_auc_arr = scorer_batch(roc_auc_score, Zs, proba_arr[..., 1], idx=True, n_c=2)