In [1]:
import numpy as np
import pandas as pd

import os

import json
from typing import List, Dict, Callable

In [2]:
%load_ext autoreload
%autoreload 2

from dataloading import DataLoader, Dataset
from model import GP

In [17]:
def make_dirs(dataset_name, split):
    """Create the output folders for one run of a dataset.

    Ensures that ``model_pred_and_gt/<dataset_name>/run_<split>`` contains the
    four sub-folders ``gt_test``, ``pred_test``, ``gt_valid`` and
    ``pred_valid``. Folders that already exist are left as they are.
    """
    run_root = f'model_pred_and_gt/{dataset_name}/run_{split}'
    subfolders = [
        f'{kind}_{phase}'
        for phase in ('test', 'valid')
        for kind in ('gt', 'pred')
    ]
    for subfolder in subfolders:
        os.makedirs(os.path.join(run_root, subfolder), exist_ok=True)

def save_res(path,dataset_name, model: GP, loader: DataLoader, mode, split=0):
    """Write ground truth and model predictions for one loader to CSV files.

    The ground truth is taken from ``loader.gen_gt()`` and the predictions
    from ``model.predict_last(loader)``. Both are written with ``np.savetxt``
    (comma separated) to ``<path>/gt_<mode>/data.csv`` and
    ``<path>/pred_<mode>/data.csv``.

    Args:
        path: Folder of the run the results belong to.
        dataset_name: Dataset name, forwarded to ``make_dirs``.
        model: Model whose ``predict_last(loader)`` output is saved.
        loader: Loader providing ``gen_gt()`` and passed to the model.
        mode: ``'test'`` or ``'valid'``; selects the output sub-folders.
        split: Split index, forwarded to ``make_dirs``.

    Raises:
        ValueError: If ``mode`` is neither ``'test'`` nor ``'valid'``.
            (Previously such a call ran the prediction and then silently
            saved nothing.)
    """
    if mode not in ('test', 'valid'):
        raise ValueError(f"mode must be 'test' or 'valid', got {mode!r}")

    gt = loader.gen_gt()
    pred = model.predict_last(loader)

    # Keeps the standard folder layout for this dataset/split in place.
    make_dirs(dataset_name, split)

    for kind, values in (('gt', gt), ('pred', pred)):
        out_dir = os.path.join(path, f'{kind}_{mode}')
        # `path` is an independent argument and need not be the folder that
        # make_dirs prepared, so make sure the actual target exists.
        os.makedirs(out_dir, exist_ok=True)
        np.savetxt(os.path.join(out_dir, 'data.csv'), values, delimiter=',')

In [34]:
# For each of the five dataset splits (0-4): build the loaders, fit a GP model
# on the training loader and store ground truth / predictions for the
# validation and test loaders.
dataset_name = 'synthea_preprocessed'
for split in range(5):
    run_dir = f'model_pred_and_gt/{dataset_name}/run_{split}'

    dataset = Dataset(dataset_name, split)
    train_loader, valid_loader, test_loader = dataset.gen_dataloaders()

    model = GP(dataset.vocab_size)
    model.fit(train_loader)

    # Validation results are written first, then the test results.
    for mode, loader in (('valid', valid_loader), ('test', test_loader)):
        save_res(run_dir, dataset_name, model, loader, mode, split=split)


## Data format conversion

This model was written for the older DNNTSP data format, so TCMBN-format data must be converted to that format before the model can be run.

In [None]:
import json
import os
import pickle

import numpy as np

import random

In [None]:
def unpickle_file(path, type_of_split, data, prefix, encoding):
    """Load one split from its pickle file and store it in ``data``.

    Opens ``<path>/<prefix><type_of_split>.pkl``, unpickles it with the given
    ``encoding`` and copies the entry stored under ``type_of_split`` into
    ``data[type_of_split]``. ``data`` is modified in place and also returned.

    Note: unpickling can execute arbitrary code, so only load trusted files.
    """
    pickle_path = os.path.join(path, prefix + type_of_split + '.pkl')
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle, encoding=encoding)
    data[type_of_split] = loaded[type_of_split]
    return data

def retrieve_dict(path, prefix, encoding):
    """Load the dev, train and test pickles of a dataset into one dict.

    The object unpickled from ``<path>/<prefix>dev.pkl`` is used as the base
    dict; the ``'train'`` and ``'test'`` entries are then filled in from their
    own pickle files via ``unpickle_file``.

    Note: unpickling can execute arbitrary code, so only load trusted files.
    """
    dev_pickle = os.path.join(path, prefix + 'dev.pkl')
    with open(dev_pickle, 'rb') as handle:
        data = pickle.load(handle, encoding=encoding)

    for split_name in ('train', 'test'):
        unpickle_file(path, split_name, data, prefix, encoding=encoding)

    return data

def change_busket_enc_format(one_hot_encoded: np.ndarray) -> list:
    '''
    Convert a one-hot encoded basket into the list of its active indices.

    Returns the positions (along the first axis) of the non-zero entries as a
    plain Python list, e.g. [0, 1, 0, 1] -> [1, 3].
    If the basket is empty (no non-zero entry) the result is [].
    '''
    active_positions = np.nonzero(one_hot_encoded)[0]
    return active_positions.tolist()

def retrieve_busket_seq(TCMBN_data_dict_record: list) -> list:
    '''
    Build the sequence of non-empty baskets for a single user.

    TCMBN_data_dict_record is expected to be a list with the records of one
    user id; the 'type_event' entry of each record holds the one-hot encoded
    basket. Every basket is converted to its list of active indices and
    baskets without any active index are skipped.
    '''
    busket_seq = []

    for record in TCMBN_data_dict_record:
        basket = change_busket_enc_format(record['type_event'])
        # Test the converted list instead of `~np.all(one_hot == 0)`: plain
        # truthiness is the correct logical test (`~` is a bitwise operator),
        # it also works when 'type_event' is a Python list (where `== 0`
        # compares the whole list to 0), and the vector is scanned only once.
        if basket:
            busket_seq.append(basket)

    return busket_seq

# def merge_splits_from_TCMBN(TCMBN_unpickled_data: dict) -> None:
#     TCMBN_merged_arr = []

#     for type_of_split in ['train', 'test', 'dev']:
#         TCMBN_merged_arr.extend(TCMBN_unpickled_data[type_of_split])

#     return TCMBN_merged_arr

def create_DNNTSP_dict(TCMBN_arr: list, last_id = 0) -> dict:
    '''
    Map every user of one TCMBN split to their sequence of baskets.

    TCMBN_arr is expected to be a list where TCMBN_arr[i] holds the records
    of the i-th user. The user at position i is stored under the string key
    str(last_id + i); users whose basket sequence is empty are left out.
    '''
    return {
        str(last_id + position): baskets
        for position, user_records in enumerate(TCMBN_arr)
        if (baskets := retrieve_busket_seq(user_records))
    }

def train_test_valid_split(DNNTSP_dict_: dict, ratio, seed) -> dict:
    '''
    Pool the ids of all three splits, shuffle them and cut them into new splits.

    DNNTSP_dict_ is expected to map 'train', 'test' and 'validate' to
    {id: basket sequence} dicts.

    ratio is expected as [train, test, valid], e.g. [0.7, 0.15, 0.15]. Only
    the train and test shares are used (each share is truncated to a whole
    number of ids); the validation split receives every id that is left over.

    seed makes the shuffle reproducible. A private random.Random(seed) is
    used: it yields the same permutation as random.seed(seed) followed by
    random.shuffle(...), but does not reseed the global random generator.

    Returns a new dict with the keys 'train', 'test' and 'validate'.

    Raises ValueError if fewer than two shares are given, or if the train and
    test shares are negative or add up to more than 1 (which would otherwise
    silently produce overlapping or truncated splits).
    '''
    import warnings

    shares = list(ratio)
    if len(shares) < 2:
        raise ValueError('ratio must contain at least the train and test shares')
    train_share, test_share = shares[0], shares[1]
    # Small tolerance so that shares such as 0.7 + 0.3 survive float rounding.
    if train_share < 0 or test_share < 0 or train_share + test_share > 1 + 1e-9:
        raise ValueError(f'invalid train/test shares in ratio: {shares}')

    parts = [DNNTSP_dict_[name] for name in ('train', 'test', 'validate')]
    merged = parts[0] | parts[1] | parts[2]
    if len(merged) != sum(len(part) for part in parts):
        # The same id in several input splits means the later entry replaced
        # the earlier one during the merge.
        warnings.warn(
            'duplicate ids across the input splits: some sequences were overwritten',
            stacklevel=2,
        )

    ids = list(merged.keys())
    random.Random(seed).shuffle(ids)

    num_of_ids = len(ids)
    train_end = int(train_share * num_of_ids)
    test_end = train_end + int(test_share * num_of_ids)

    return {
        'train': {key: merged[key] for key in ids[:train_end]},
        'test': {key: merged[key] for key in ids[train_end:test_end]},
        'validate': {key: merged[key] for key in ids[test_end:]},
    }

def save_TCMBN_to_DNNTSP_format(dataset_name: str, path_to_pickled_files: str, ratio=[0.6,0.2,0.2], seed=42, prefix='', encoding='ASCII') -> dict:
    '''
    Convert the pickled TCMBN splits of a dataset into one DNNTSP-format JSON file.

    The train/test/dev pickles in path_to_pickled_files are loaded, the
    records of every user are turned into a sequence of baskets, all users
    are pooled and re-split according to ratio ([train, test, valid],
    shuffled with seed), and the result is written to
    data/<dataset_name>/split_<seed>/<dataset_name>.json.

    prefix and encoding are forwarded to the pickle loading helpers.
    The default ratio list is only read, never modified.

    Returns the dict that was written to the JSON file.
    '''

    # TCMBN split name -> split name used in the DNNTSP file.
    splits = {'train' : 'train', 'test' : 'test', 'dev' : 'validate'}
    # Preferred first id of each split.
    base_ids = {'train' : 0, 'test' : 10000, 'dev' : 20000}

    TCMBN_unpickled_data = retrieve_dict(path_to_pickled_files, prefix, encoding=encoding)
    DNNTSP_dict = {}

    next_free_id = 0
    for key, value in splits.items():
        records = TCMBN_unpickled_data[key]
        # Keep the preferred offset when possible, but never start inside the
        # id range of a previous split: with fixed offsets a split with more
        # than 10000 users would reuse ids of the next split and those users
        # would be lost when the splits are merged.
        first_id = max(base_ids[key], next_free_id)
        DNNTSP_dict[value] = create_DNNTSP_dict(records, first_id)
        next_free_id = first_id + len(records)

    # Honour the caller's ratio (it used to be hard-coded to [0.6, 0.2, 0.2]).
    DNNTSP_dict = train_test_valid_split(DNNTSP_dict, ratio, seed=seed)

    new_path = os.path.join('data', dataset_name, f'split_{seed}')
    os.makedirs(new_path, exist_ok=True)

    with open(os.path.join(new_path, dataset_name + '.json'), 'w') as file:
        json.dump(DNNTSP_dict, file)

    return DNNTSP_dict



In [None]:
# Convert the TCMBN pickles of the chosen dataset once per seed; the output
# folder of each conversion is named after the seed.
dataset_name = 'instacart_preprocessed'
path_to_pickled_files = f'../tcmbn_data/{dataset_name}/split_1'
for seed in (0,):
    new_dict = save_TCMBN_to_DNNTSP_format(
        dataset_name,
        path_to_pickled_files,
        seed=seed,
        prefix='',
        encoding='latin-1',
    )