In [1]:
import pickle
import os
import numpy as np

In [9]:
def unpickle_file(path, type_of_split, data, prefix, encoding):
    '''Read one split from ``<prefix><type_of_split>.pkl`` and merge it into ``data``.

    The pickle located in ``path`` is loaded with the given ``encoding``; the
    value stored under the key ``type_of_split`` is then placed into ``data``
    under that same key.

    NOTE: unpickling can execute arbitrary code, so only use trusted files.

    Returns the ``data`` object that was passed in (it is mutated in place).
    '''
    pkl_file = os.path.join(path, prefix + type_of_split + '.pkl')
    with open(pkl_file, 'rb') as handle:
        loaded = pickle.load(handle, encoding=encoding)
    data[type_of_split] = loaded[type_of_split]
    return data

def load_pkl(path, type_of_split, encoding='ASCII'):
    '''Load and return the object pickled in ``<path>/<type_of_split>.pkl``.

    ``encoding`` is forwarded to ``pickle.load``.

    NOTE: unpickling can execute arbitrary code, so only use trusted files.
    '''
    pkl_file = os.path.join(path, type_of_split + '.pkl')
    with open(pkl_file, 'rb') as handle:
        return pickle.load(handle, encoding=encoding)

def save_pkl(data, path, type_of_split, encoding='ASCII'):
    '''Pickle ``data`` to ``<path>/<type_of_split>.pkl``.

    The directory ``path`` is created first if it does not exist yet.
    ``encoding`` is accepted but not used when writing.
    '''
    target = os.path.join(path, type_of_split + '.pkl')
    # exist_ok=True keeps repeated runs from failing on an existing directory.
    os.makedirs(path, exist_ok=True)
    with open(target, 'wb') as handle:
        pickle.dump(data, handle)

def preprocess_split(data: list[list[dict]]) -> list[list[dict]]:
    '''Drop empty baskets, then drop users left with at most one basket.

    A basket is empty when its ``'type_event'`` entry sums to zero. Users that
    start with one basket or fewer are discarded without inspecting their
    baskets. The input lists are not modified; the kept basket dicts are the
    same objects as in ``data``.
    '''
    kept_users = []
    for baskets in data:
        if len(baskets) > 1:
            non_empty = [b for b in baskets if b['type_event'].sum() != 0.0]
            # Filtering may shrink a history below the two-basket minimum.
            if len(non_empty) > 1:
                kept_users.append(non_empty)
    return kept_users

In [6]:
# Dataset to clean and the split directory its pickles are read from.
name = 'instacart'
path = f'tcmbn_data/{name}/split_1'
# Each split name is used both as the pickle file stem and as the key inside the loaded object.
types_of_split = ['train', 'dev', 'test']
# Cleaned pickles go to a separate directory, so the input files are not overwritten.
new_path = f'tcmbn_data/{name}_preprocessed/split_1'

In [7]:
# Clean every split and write it under new_path with the same file name.
for split in types_of_split:
    data = load_pkl(path, split)
    # Only the entry stored under the split name is filtered; any other
    # entries of the loaded object are saved back unchanged.
    data[split] = preprocess_split(data[split])
    save_pkl(data, new_path, split)
    

In [43]:
# Scratch check: an empty NumPy array reports size 0.
np.array([]).size

0

In [2]:
# Load the preprocessed DC train split so it can be inspected by hand.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('tcmbn_data/DC_preprocessed/split_1/train.pkl', 'rb') as f:
    d = pickle.load(f)

In [5]:
# Show the first user's sequence of the train split (a list of per-event dicts).
d['train'][0]

[{'time_since_start': 17,
  'time_since_last_event': 0,
  'type_event': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'time_since_start': 20,
  'time_since_last_event': 3,
  'type_event': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Data preparation for time information ablation study

In [6]:
def remove_time(data: list[list[dict]]) -> list[list[dict]]:
    '''Return a copy of ``data`` with the real timestamps stripped out.

    For every user, each basket is shallow-copied and its time fields are
    overwritten: ``'time_since_start'`` becomes the basket's position in the
    user's history and ``'time_since_last_event'`` becomes 0. All other keys
    are carried over unchanged.

    Args:
        data: list of users, each a list of basket dicts.

    Returns:
        A new list of users with new basket dicts; the input baskets are left
        untouched so the cell can be re-run safely.
    '''
    new_data = []
    for user in data:
        # Use a separate accumulator: rebinding ``user`` here would empty the
        # very sequence the inner loop iterates over.
        new_user = []
        for i, basket in enumerate(user):
            new_basket = dict(basket)
            new_basket['time_since_start'] = i
            # NOTE(review): the gap is fixed at 0 although consecutive
            # positions differ by 1 - confirm this is what the ablation needs.
            new_basket['time_since_last_event'] = 0
            new_user.append(new_basket)
        new_data.append(new_user)
    return new_data
        
# def remove_all_time(data: dict):
    

In [15]:
# Input for the time ablation: the already preprocessed mimic3 split.
# NOTE: this rebinds name/path/new_path from the earlier preprocessing cell,
# so the cells must be run in order.
name = 'mimic3_preprocessed'
path = f'tcmbn_data/{name}/split_1'
# Each split name is used both as the pickle file stem and as the key inside the loaded object.
types_of_split = ['train', 'dev', 'test']
# The "_wo_time" copies are written next to the input directory, not over it.
new_path = f'tcmbn_data/{name}_wo_time/split_1'

In [16]:
# Apply remove_time to every split and save the result under new_path.
for split in types_of_split:
    data = load_pkl(path, split)
    # The return value of remove_time is stored back under the split key, so
    # remove_time must return the rewritten user list for the saved file to be usable.
    data[split] = remove_time(data[split])
    save_pkl(data, new_path, split)
    