In [9]:
import pandas as pd
import numpy as np

In [26]:
# Sweep settings for the masking experiments.
min_pred = 3
# Mask fractions: 0.15, then 0.2 through 0.9 in steps of 0.1.
masked_percents = [0.15] + [tenth / 10 for tenth in range(2, 10)]
# Hidden fractions: 0 (nothing hidden), then 0.1 through 0.9 in steps of 0.1.
hidden_percents = [0] + [tenth / 10 for tenth in range(1, 10)]

In [27]:
# Read the stored frame and keep a handle on its 'trajectory' column,
# which is iterated token by token when the vocabulary is built.
etecsa_path = "../data/etecsa/etecsa_1.h5"
df = pd.read_hdf(etecsa_path, key='data')
trajectories = df.loc[:, 'trajectory']

In [28]:
from sklearn.model_selection import train_test_split

# Split into train, validation and test: 30 % of the rows are held out first,
# and that held-out part is then halved. Both calls use the same fixed
# random_state so the split is repeatable.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
valid_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

In [29]:
# Build the token -> index vocabulary. The four special tokens always take
# indices 0-3; data tokens follow in sorted order. Sorting matters: iterating
# a set of strings directly yields an order that depends on the interpreter's
# per-process hash salt, so the indices would change between kernel restarts.
special_tokens = ['[PAD]', '[CLS]', '[SEP]', '[MASK]']
unique_tokens = set(token for trajectory in trajectories for token in trajectory)
word2idx = {token: idx for idx, token in enumerate(special_tokens + sorted(unique_tokens))}

# Inverse mapping, used to decode indices back into tokens.
idx2word = {idx: token for token, idx in word2idx.items()}

In [21]:
import random

def mask_trajectories(trajectories, masked_percent=0.15, hidden_percent=0, min_pred=5, vocab=None):
    """
    Build masked-language-model examples from raw trajectories.

    For every trajectory the first and last tokens are always left intact.
    Among the interior positions, ``max(min_pred, int(n_interior * masked_percent))``
    are selected as prediction targets (all of them when fewer are available).
    Each target is replaced by '[MASK]' with probability 0.8, replaced by a
    random vocabulary token with probability 0.1, and left unchanged otherwise.
    Of the interior positions that were NOT selected, a fraction
    ``hidden_percent`` is overwritten with '[PAD]'.

    The random choices are driven by a generator seeded from the trajectory's
    own tokens, so the same trajectory always yields the same example and the
    global ``random`` state is left untouched.

    Args:
        trajectories: Iterable of trajectories; each trajectory is a sequence
            of string tokens.
        masked_percent: Fraction of interior positions to select as targets.
        hidden_percent: Fraction of the remaining interior positions to
            replace with '[PAD]'.
        min_pred: Lower bound on the number of targets per trajectory.
        vocab: Optional iterable of tokens that may be used as random
            replacements (special tokens in it are ignored). When omitted,
            the tokens of the notebook-level ``idx2word`` mapping are used.

    Returns:
        A list of dicts with the keys 'trajectory', 'masked_tokens' and
        'masked_pos', each a space-joined string. Positions are counted in
        the sequence that includes the leading '[CLS]', so the first token of
        the returned trajectory string is position 1. Trajectories with fewer
        than two usable tokens are skipped, so the list can be shorter than
        the input.
    """
    special_tokens = ('[PAD]', '[CLS]', '[SEP]', '[MASK]')

    # Tokens eligible as random replacements. Sorted so the draw does not
    # depend on the iteration order of whatever container supplied them.
    token_source = idx2word.values() if vocab is None else vocab
    replacement_pool = sorted({token for token in token_source if token not in special_tokens})

    total_data = []

    for tokens in trajectories:
        input_ids = ['[CLS]'] + list(tokens) + ['[SEP]']

        # Positions holding real tokens (boundary markers and padding excluded).
        known_positions = [
            i for i, token in enumerate(input_ids)
            if token not in ('[CLS]', '[SEP]', '[PAD]')
        ]
        if len(known_positions) < 2:
            continue  # nothing lies between a first and a last token

        # The first and last known tokens are never masked or hidden.
        cand_masked_pos = known_positions[1:-1]

        # Seed from the token text itself. hash() of strings is salted per
        # interpreter process, so it cannot provide a repeatable seed; a str
        # seed is expanded deterministically by the random module.
        rng = random.Random(' '.join(map(str, tokens)))
        rng.shuffle(cand_masked_pos)

        n_pred = max(min_pred, int(len(cand_masked_pos) * masked_percent))

        masked_tokens, masked_pos = [], []

        for pos in cand_masked_pos[:n_pred]:
            masked_pos.append(str(pos))
            masked_tokens.append(str(input_ids[pos]))

            # A single draw decides between the three outcomes so that the
            # split really is 80 % / 10 % / 10 %.
            roll = rng.random()
            if roll < 0.8:
                input_ids[pos] = '[MASK]'
            elif roll < 0.9 and replacement_pool:
                input_ids[pos] = rng.choice(replacement_pool)
            # otherwise the original token is kept

        # Hide a share of the interior positions that were not selected.
        remaining_pos = cand_masked_pos[n_pred:]
        n_hide = int(len(remaining_pos) * hidden_percent)
        for pos in remaining_pos[:n_hide]:
            input_ids[pos] = '[PAD]'

        total_data.append({
            'trajectory': ' '.join(input_ids[1:-1]),
            'masked_tokens': ' '.join(masked_tokens),
            'masked_pos': ' '.join(masked_pos)
        })

    return total_data

In [30]:
# Mask the validation and test splits.
valid_data = mask_trajectories(valid_df['trajectory'], min_pred=min_pred)
test_data = mask_trajectories(test_df['trajectory'], min_pred=min_pred)

# Write the masked trajectory and its targets back onto each frame,
# one column per key of the generated records.
for frame, records in ((valid_df, valid_data), (test_df, test_data)):
    for column in ('trajectory', 'masked_tokens', 'masked_pos'):
        frame[column] = [record[column] for record in records]

In [31]:
# Persist the three splits as HDF5 files; the validation and test file names
# carry the min_pred value they were masked with.
for frame, path in (
    (train_df, "train.h5"),
    (valid_df, f"valid_{min_pred}.h5"),
    (test_df, f"test_{min_pred}.h5"),
):
    frame.to_hdf(path, key='data', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['trajectory'], dtype='object')]

  train_df.to_hdf("train.h5", key='data', mode='w')


In [187]:
# Mask fraction used for every file written by this sweep. Naming it once keeps
# the masking call and the output file name in agreement.
masked_percent = 0.15

# Only the hidden fractions above 0.8 are generated by this cell.
for hidden_percent in [m for m in hidden_percents if m > 0.8]:
    # Re-create the split on every pass: the column assignments below overwrite
    # test_df['trajectory'] with masked strings, so the frame left over from a
    # previous pass cannot be masked again. The fixed random_state yields the
    # same rows each time.
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # Mask the test split with the current hidden fraction.
    test_data = mask_trajectories(
        test_df['trajectory'],
        masked_percent=masked_percent,
        hidden_percent=hidden_percent,
        min_pred=min_pred,
    )

    # Attach the masked trajectory and its targets to the frame.
    for column in ('trajectory', 'masked_tokens', 'masked_pos'):
        test_df[column] = [data[column] for data in test_data]

    # File name encodes both fractions as whole percentages. round() rather
    # than int(): float products such as 0.29 * 100 fall just below the whole
    # number and would be truncated to the wrong value.
    masked_value = round(masked_percent * 100)
    hidden_value = round(hidden_percent * 100)
    test_df.to_hdf(f"test_{min_pred}_{masked_value}_{hidden_value}.h5", key='data', mode='w')