In [2]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
## ACTION REQUIRED: change the two folder paths below to match your machine.
# Local checkout of the repository, appended to the import search path
# (presumably so the `regime` package can be imported -- confirm).
path_repo = expanduser("~/Documents/G3_2/regime-identification")
sys.path.append(path_repo)
# Root directory of the simulation files.
path_file = expanduser("~/data/G3_2/regime-identification/simulation")
# Map each sub-folder label to its location under `path_file`.
path = {}
for folder in ("data", "estimation", "score", "figure", "latex"):
    path[folder] = "/".join((path_file, folder))

In [9]:
from regime.simulation_helper import *

In [1]:
def generate_file_name(path, folder, key_data, name, key_len=None, key_feat=None, key_model=None, job_id=None, suffix="npy"):
    """
    Build the file path ``{path[folder]}/{key_data}/{name}[_{key_len}[_{key_feat}[_{key_model}[_{job_id}]]]].{suffix}``.

    The optional keys are appended in the order key_len, key_feat, key_model, job_id,
    each preceded by an underscore. The first key that is None ends the name: every
    key after it is ignored, even if it is not None.
    """
    pieces = [f"{path[folder]}/{key_data}/{name}"]
    for key in (key_len, key_feat, key_model, job_id):
        if key is None:
            # a missing key cuts off all the keys that follow it
            break
        pieces.append(f"{key}")
    return "_".join(pieces) + "." + suffix

In [None]:
def check_dir_exist(file_name):
    """
    Make sure the parent directory of `file_name` exists, creating it if needed.

    Prints a message when the directory is created. A `file_name` without any
    directory component refers to the current working directory, so nothing is
    created in that case.
    """
    dirname = os.path.dirname(file_name)
    # dirname is "" for a bare file name; os.makedirs("") would raise FileNotFoundError
    if not dirname or os.path.exists(dirname):
        return
    # exist_ok guards against another job creating the folder between the check and this call
    os.makedirs(dirname, exist_ok=True)
    print(f"create folder: {dirname}.")
    return

In [7]:
def save_file(arr, path, folder, key_data, name=None, key_len=None, key_feat=None, key_model=None, job_id=None, suffix="npy"):
    """
    Save one object, or every object in a dict, to a file named by `generate_file_name`.

    If `arr` is a dict, each value is saved separately with its dict key used as
    `name`; the `name` argument itself is ignored in that case. Otherwise `arr`
    is written according to `suffix`:

    - "npy": `np.save`
    - "h5":  `arr.to_hdf`, stored under the key `name`, overwriting the file
    - "csv": `arr.to_csv`

    The parent folder is created if it does not exist, and the shape of the saved
    object is printed.

    Raises:
        NotImplementedError: if `suffix` is not one of "npy", "h5", "csv".
    """
    if isinstance(arr, dict):
        for name_, arr_ in arr.items():
            save_file(arr_, path, folder, key_data, name_, key_len, key_feat, key_model, job_id, suffix)
        return
    file_name = generate_file_name(path, folder, key_data, name, key_len, key_feat, key_model, job_id, suffix)
    check_dir_exist(file_name)
    if suffix == "npy":
        np.save(file_name, arr)
        print(f"shape of the saved {name}: {arr.shape}.")
    elif suffix == "h5":
        # key/mode are passed by keyword: newer pandas versions no longer accept them positionally
        arr.to_hdf(file_name, key=name, mode="w")
        print(f"save {name} to hdf: {arr.shape}")
    elif suffix == "csv":
        arr.to_csv(file_name)
        print(f"save {name} to csv: {arr.shape}")
    else:
        raise NotImplementedError(f"unsupported suffix: {suffix!r}")
    return

def load_file(path, folder, key_data, name, key_len=None, key_feat=None, key_model=None, job_id=None, suffix="npy", **kwargs):
    """
    Load the file whose path is built by `generate_file_name` from the given keys.

    The reader is chosen by `suffix`: "npy" uses `np.load`, "h5" uses `pd.read_hdf`,
    and "csv" uses `pd.read_csv`. Extra keyword arguments are forwarded to
    `pd.read_csv` only; they are ignored for the other two formats.

    Raises:
        NotImplementedError: if `suffix` is not one of "npy", "h5", "csv".
    """
    target = generate_file_name(path, folder, key_data, name, key_len, key_feat, key_model, job_id, suffix)
    if suffix == "npy":
        return np.load(target)
    if suffix == "h5":
        return pd.read_hdf(target)
    if suffix == "csv":
        return pd.read_csv(target, **kwargs)
    raise NotImplementedError()
    
    
def save_file_dict(arr_dict, path, folder, key_data, key_len=None, key_feat=None, key_model=None, job_id=None, suffix="npy"):
    """
    Save every entry of `arr_dict` with `save_file`, using the dict key as the file's `name`.

    All remaining arguments are passed through to `save_file` unchanged.
    """
    for entry_name, entry in arr_dict.items():
        save_file(
            entry, path, folder, key_data,
            name=entry_name,
            key_len=key_len,
            key_feat=key_feat,
            key_model=key_model,
            job_id=job_id,
            suffix=suffix,
        )

In [None]:
def np_save_print(file_name, arr, arr_name="arr"):
    """
    Write `arr` to `file_name` with `np.save` and print the shape of what was saved.

    The parent folder of `file_name` is created first if it does not exist.
    `arr_name` is only used as the label in the printed message.
    """
    check_dir_exist(file_name)  # make sure the target folder is there before writing
    np.save(file_name, arr)
    saved_shape = arr.shape
    print(f"shape of the saved {arr_name}: {saved_shape}.")


def print_seconds(x):
    """
    Format a duration of `x` seconds as a string.

    `x` is rounded up to a whole number of seconds, and the result is the string
    form of the corresponding `timedelta`.
    """
    return str(timedelta(seconds=math.ceil(x)))

In [None]:
def model_fit_many_datas_models(key_data_list, key_feat_list, model_dict, param_grid, path, job_id, batch_size, n_s_list=None):
    """
    Train a collection of models, with hyperparameters to tune, on one batch of data from many datasets.

    For every (key_data, key_feat) pair and every model in `model_dict`, each input
    file ``Xs_{n_s}_{key_feat}.npy`` in ``path["data"]/{key_data}`` is loaded together
    with the matching ``Zs_{n_s}`` file. Rows ``[job_id * batch_size, (job_id + 1) * batch_size)``
    of both are passed to `model_fit_batch_with_params`, and the three returned arrays
    are saved under the "estimation" folder as "modelParams", "labels" and "proba".
    Progress and timing are printed once per (dataset, feature, model) combination.

    Args:
        key_data_list: a dataset key, or a list of them.
        key_feat_list: a feature key, or a list of them.
        model_dict: mapping from model key to the model object to fit.
        param_grid: forwarded unchanged to `model_fit_batch_with_params`.
        path: dict of folder locations; the "data" and "estimation" entries are used.
        job_id: index of the batch to process; also appended to the output file names.
        batch_size: number of rows in one batch.
        n_s_list: optional sequence lengths to fit. When None, the input files are
            listed with `filter_filenames_in_folder(folder, key_feat)`.
    """
    def raise_str_to_list(x):
        # allow a single key to be passed as a plain string
        return [x] if isinstance(x, str) else x

    key_data_list = raise_str_to_list(key_data_list)
    key_feat_list = raise_str_to_list(key_feat_list)
    # row range of this job's batch
    start = job_id * batch_size
    end = start + batch_size
    N_combos = len(key_data_list) * len(key_feat_list) * len(model_dict)
    count = 0
    time_old = time.time()
    total_time = 0.
    for key_data, key_feat in product(key_data_list, key_feat_list):
        folder = f"{path['data']}/{key_data}"
        if n_s_list is not None:
            # generate_file_name returns a full path; keep only the base name so that the
            # names are relative to `folder`, as the loading and parsing code below expects
            # (assumes filter_filenames_in_folder also returns base names -- TODO confirm)
            filenames = [
                os.path.basename(generate_file_name(path, "data", key_data, "Xs", n_s, key_feat))
                for n_s in n_s_list
            ]
        else:
            filenames = filter_filenames_in_folder(folder, key_feat)
        for key_model, model in model_dict.items():
            count += 1
            print(f"{count}/{N_combos} combo starts.")
            for filename in filenames:
                Xs = np.load(f"{folder}/{filename}")[start:end]
                # file names look like "Xs_{n_s}_{key_feat}.npy"
                n_s = int(filename.split('_')[1])
                Zs = load_file(path, "data", key_data, "Zs", n_s)[start:end]
                # train the model, on a param grid, on a batch of data
                model_params_arr, labels_arr, proba_arr = model_fit_batch_with_params(model, Xs, Zs, param_grid)
                # save results
                save_file({"modelParams": model_params_arr,
                           "labels": labels_arr,
                           "proba": proba_arr},
                          path, "estimation", key_data, None, n_s, key_feat, key_model, job_id)
            time_now = time.time()
            time_this_iter = time_now - time_old
            total_time += time_this_iter
            time_old = time_now
            print(f"{count}/{N_combos} combo done. Time of this combo: {print_seconds(time_this_iter)}s. Total time: {print_seconds(total_time)}s.")
    return

In [None]:
def _n_s_from_filename(filename):
    """Return the first all-digit, underscore-separated token of a file name (extension dropped) as an int."""
    stem = os.path.splitext(os.path.basename(filename))[0]
    for token in stem.split('_'):
        if token.isdigit():
            return int(token)
    raise ValueError(f"cannot find a sequence length in file name: {filename}")


def feature_engineer(key_feat, key_data, n_b, path, n_s_list = None):
    """
    Compute features from the raw files of a dataset and save them next to the raw files.

    Both `key_feat` and `key_data` can be a single key or a list of keys; for lists
    the function is applied to every key in turn. For a single pair, each raw file
    ``Xs_{n_s}_raw.npy`` in ``path["data"]/{key_data}`` is loaded, transformed by the
    function selected through `key_feat`, trimmed by `n_b` entries at both ends of
    axis 1, and saved as ``Xs_{n_s}_{key_feat}.npy`` in the same folder.

    Args:
        key_feat: one of "zhengB", "zhengF", "ewm", or a list of them.
        key_data: a dataset key, or a list of them.
        n_b: number of entries dropped from each end of axis 1 of the feature array.
        path: dict of folder locations; the "data" entry is used.
        n_s_list: optional sequence lengths to process. When None, the raw files are
            listed with `filter_filenames_in_folder(folder, "raw")`.

    Raises:
        NotImplementedError: if `key_feat` is not a supported feature key.
    """
    if isinstance(key_data, list):
        for key_data_ in key_data:
            # pass n_s_list on, so the restriction also holds for every dataset in the list
            feature_engineer(key_feat, key_data_, n_b, path, n_s_list)
        return
    if isinstance(key_feat, list):
        for key_feat_ in key_feat:
            feature_engineer(key_feat_, key_data, n_b, path, n_s_list)
        return

    function_dict = {"zhengB": lambda x: feature_engineer_zheng_batch(x, True),
                     "zhengF": lambda x: feature_engineer_zheng_batch(x, False),
                     "ewm": feature_engineer_ewm_batch}
    if key_feat not in function_dict:
        raise NotImplementedError("feature not supported yet")

    # key_feat, key_data are single keys from here on
    folder = f"{path['data']}/{key_data}"
    if n_s_list is not None:
        # generate_file_name returns a full path; keep only the base name so that the
        # names are relative to `folder`, as the loading code below expects
        # (assumes filter_filenames_in_folder also returns base names -- TODO confirm)
        filenames = [
            os.path.basename(generate_file_name(path, "data", key_data, "Xs", n_s, "raw"))
            for n_s in n_s_list
        ]
    else:
        filenames = filter_filenames_in_folder(folder, "raw")
    for filename in filenames:
        Xs_raw = np.load(f"{folder}/{filename}")
        Xs_all = function_dict[key_feat](Xs_raw)
        # an explicit stop index keeps the whole axis when n_b == 0 (a `-0` stop would give an empty slice)
        Xs_feat = Xs_all[:, n_b:Xs_all.shape[1] - n_b]
        # save results; the sequence length is the numeric token of the file name,
        # e.g. 1000 for "Xs_1000_raw.npy"
        save_file(Xs_feat, path, 'data', key_data, "Xs", _n_s_from_filename(filename), key_feat)
    return