In [10]:
import pandas as pd
import numpy as np

In [11]:
max_pred = 5  # passed as max_pred to mask_trajectories; also embedded in the valid/test output file names

In [12]:
# Load the full dataset from HDF5 and keep a handle on its trajectory column.
df = pd.read_hdf("../data/humob/humob.h5", key="data")
trajectories = df["trajectory"]

In [13]:
from sklearn.model_selection import train_test_split

# Split into train, validation and test: hold out 30% of the rows with a fixed
# seed, then cut that held-out part in half (validation / test).
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
valid_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

In [14]:
# Build the token -> index vocabulary (word2idx).
# The four special tokens always occupy indices 0-3.
special_tokens = ['[PAD]', '[CLS]', '[SEP]', '[MASK]']
unique_tokens = set(token for trajectory in trajectories for token in trajectory)

# Sort the regular tokens so that indices are reproducible: iteration order of a
# set of str changes between interpreter runs (hash randomisation), which would
# silently give every run a different vocabulary. Special-token names are removed
# from the regular part so a data token can never overwrite a reserved index.
# NOTE: sorting assumes the tokens are mutually comparable (e.g. all str).
regular_tokens = sorted(unique_tokens.difference(special_tokens))
word2idx = {token: idx for idx, token in enumerate(special_tokens + regular_tokens)}

# Inverse mapping, used to check or decode indices later on.
idx2word = {idx: token for token, idx in word2idx.items()}

In [15]:
from random import shuffle, random, randint

def mask_trajectories(trajectories, max_pred=5, vocab=None):
    """
    Build masked-language-model (MLM) examples from token trajectories.

    Every trajectory is wrapped as ``[CLS] tokens... [SEP]``. ``max_pred``
    positions are picked at random among the non-special tokens and each picked
    position is corrupted with the usual MLM recipe, decided by a single
    random draw per position:

    * 80%: replaced by ``[MASK]``
    * 10%: replaced by a random non-special token from the vocabulary
    * 10%: left unchanged

    A trajectory is skipped when ``max(1, int(0.15 * (len(tokens) + 2)))`` is
    smaller than ``max_pred``, so the returned list can be shorter than the
    input and every returned example has exactly ``max_pred`` masked positions.

    Args:
        trajectories: Iterable of trajectories; each one is a sequence of str
            tokens. The input sequences are not modified.
        max_pred: Number of tokens to mask per trajectory.
        vocab: Optional mapping whose keys are the vocabulary tokens; random
            replacement tokens are drawn from its non-special keys. When
            omitted, the module-level ``word2idx`` is used.

    Returns:
        List of dicts, one per kept trajectory, with space-joined str values:
        'trajectory' (corrupted tokens, without [CLS]/[SEP]),
        'masked_tokens' (the original tokens at the masked positions) and
        'masked_pos' (their positions, counted with [CLS] at position 0).
    """
    if vocab is None:
        vocab = word2idx  # vocabulary built earlier at module level

    # Tokens that may be used as random replacements. Filtering by name replaces
    # the former "re-draw while index < 4" loop, which never terminated when the
    # vocabulary contained only special tokens.
    replacement_pool = [
        token for token in vocab
        if token not in ('[PAD]', '[CLS]', '[SEP]', '[MASK]')
    ]

    total_data = []

    for tokens in trajectories:
        input_ids = ['[CLS]'] + list(tokens) + ['[SEP]']

        # Decide how many tokens to mask; drop trajectories that are too short
        # to provide max_pred masked positions.
        n_pred = min(max_pred, max(1, int(len(input_ids) * 0.15)))
        if n_pred < max_pred:
            continue

        # Candidate positions: everything except the structural tokens.
        cand_masked_pos = [
            i for i, token in enumerate(input_ids)
            if token != '[CLS]' and token != '[SEP]' and token != '[PAD]'
        ]
        shuffle(cand_masked_pos)

        masked_tokens, masked_pos = [], []

        for pos in cand_masked_pos[:n_pred]:
            masked_pos.append(str(pos))
            masked_tokens.append(str(input_ids[pos]))

            # One draw decides the branch. Drawing a second number in the
            # "random token" branch would shrink its probability from 10% to 2%.
            roll = random()
            if roll < 0.8:
                input_ids[pos] = '[MASK]'
            elif roll < 0.9 and replacement_pool:
                input_ids[pos] = replacement_pool[randint(0, len(replacement_pool) - 1)]
            # Otherwise the original token is kept on purpose.

        total_data.append({
            'trajectory': ' '.join(input_ids[1:-1]),
            'masked_tokens': ' '.join(masked_tokens),
            'masked_pos': ' '.join(masked_pos)
        })

    return total_data

In [16]:
# Mask the validation trajectories first, then the test trajectories.
valid_data = mask_trajectories(valid_df['trajectory'], max_pred=max_pred)
test_data = mask_trajectories(test_df['trajectory'], max_pred=max_pred)

# Copy each generated field into the matching DataFrame column; the existing
# 'trajectory' column is overwritten with its masked, space-joined version.
for _frame, _records in ((valid_df, valid_data), (test_df, test_data)):
    for _column in ('trajectory', 'masked_tokens', 'masked_pos'):
        _frame[_column] = [_record[_column] for _record in _records]


In [17]:
# Write the three splits to HDF5 files; the validation and test file names
# carry the max_pred value used for masking.
for _frame, _path in (
    (train_df, "train.h5"),
    (valid_df, f"valid_{max_pred}.h5"),
    (test_df, f"test_{max_pred}.h5"),
):
    _frame.to_hdf(_path, key='data', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['trajectory'], dtype='object')]

  train_df.to_hdf("train.h5", key='data', mode='w')
