In [31]:
import os
import pandas as pd

In [32]:
# Dataset location: files are read from <root_path>/<data_path>/<data_type>/...
root_path = '..'
data_path = 'data/'
data_type = 'humob'

# Both values are interpolated into the test file name
# (test_<pre_len>_<masked_percent>.h5).
# NOTE(review): presumably the prediction length and the percentage of masked
# tokens used when the test split was generated — confirm.
pre_len = 5
masked_percent = 15

In [33]:
# File names of the two splits, relative to <root_path>/<data_path>.
train_data_name = f'{data_type}/train.h5'
test_data_name = f'{data_type}/test_{pre_len}_{masked_percent}.h5'

# Read both splits first and report success only afterwards, so a failed
# read can never be preceded by a misleading "success" message.
train_df = pd.read_hdf(os.path.join(root_path, data_path, train_data_name))
test_df = pd.read_hdf(os.path.join(root_path, data_path, test_data_name))
print('success load ', train_data_name, test_data_name)

success load  humob/train.h5 humob/test_5_15.h5


In [34]:
class DataSet:
    """Convert the train/test DataFrames into plain Python record lists.

    Attributes:
        train_df: DataFrame providing the columns 'trajectory', 'user_index'
            and 'day'.
        test_df: DataFrame providing the columns 'trajectory', 'masked_pos',
            'masked_tokens', 'user_index' and 'day'.
    """

    def __init__(self, train_df, test_df):
        self.train_df = train_df
        self.test_df = test_df

    def gen_train_data(self):
        """Return one ``[trajectory, user_index, day]`` record per train row.

        The trajectory is passed through exactly as stored in the frame (it is
        not tokenized here). Prints the number of records produced.
        """
        frame = self.train_df
        # Column-wise iteration avoids building a Series per row, which is
        # what makes DataFrame.iterrows() slow and dtype-lossy.
        records = [
            [seq, user_index, day]
            for seq, user_index, day in zip(
                frame['trajectory'], frame['user_index'], frame['day']
            )
        ]
        print("All train length is " + str(len(records)))
        return records

    def gen_test_data(self):
        """Return one parsed record per test row.

        Each record is ``[seq, masked_pos, masked_tokens, user_index, day]``
        where ``seq`` is the whitespace-split trajectory (list of str) and
        ``masked_pos`` / ``masked_tokens`` are the whitespace-separated
        integer strings parsed into lists of int. Prints the number of
        records produced.
        """
        frame = self.test_df
        columns = (
            frame['trajectory'],
            frame['masked_pos'],
            frame['masked_tokens'],
            frame['user_index'],
            frame['day'],
        )
        records = []
        for seq, masked_pos, masked_tokens, user_index, day in zip(*columns):
            records.append([
                seq.split(),
                [int(pos) for pos in masked_pos.split()],
                [int(token) for token in masked_tokens.split()],
                user_index,
                day,
            ])
        print("All test length is " + str(len(records)))
        return records

In [35]:
# Wrap the loaded frames and turn each split into a list of plain records.
dataset = DataSet(train_df, test_df)
    
train_data = dataset.gen_train_data()  # [seq, user_index, day]
test_data = dataset.gen_test_data()  # [seq, masked_pos, masked_tokens, user_index, day]

All train length is 159073
All test length is 34088


In [36]:
from collections import defaultdict, Counter

def most_frequent_location_per_interval(data):
    """
    Find, for every user, the location seen most often in each time interval.

    The interval of a location is its index inside the trajectory. Entries
    equal to "[PAD]" are skipped. When several locations are equally frequent
    the one encountered first wins.

    Args:
    data (list): List of records, each of the form
                 [trajectory, user_id, day].

    Returns:
    dict: Maps user_id to a dictionary whose keys are interval indices and
          whose values are the most frequent location for that interval.
    """
    # user_id -> interval index -> Counter of the locations observed there
    visit_counts = defaultdict(lambda: defaultdict(Counter))

    for trajectory, user_id, _day in data:
        for slot, place in enumerate(trajectory):
            if place != "[PAD]":  # padding carries no location information
                visit_counts[user_id][slot][place] += 1

    # Reduce every counter to its single most common location
    top_locations = defaultdict(dict)
    for user_id, per_slot in visit_counts.items():
        for slot, counter in per_slot.items():
            (winner, _count), = counter.most_common(1)
            top_locations[user_id][slot] = winner

    return top_locations

In [37]:
# For each training user, the most frequent location in every time interval.
history = most_frequent_location_per_interval(train_data)

In [38]:
def predict_masked_values(data, history):
    """Fill masked positions from each user's history and score the result.

    For every masked position of a record, the location stored in
    ``history[user_id][position]`` (when there is one) is written into a copy
    of the trajectory and compared, as a string, with the ground-truth token.
    Positions without a history entry, or outside the trajectory, are left
    untouched and count as misses.

    Args:
        data (list): Records of the form
            ``[trajectory, masked_pos, masked_values, user_id, day]``.
        history (dict): Mapping ``user_id -> {position: location}``. It is
            only read; no entries are added for unknown users.

    Returns:
        tuple: ``(result, accuracy)``. ``result`` holds one
        ``[filled_trajectory, masked_pos, masked_values, user_id, day]``
        record per input record; ``accuracy`` is the number of correctly
        predicted masked tokens divided by the total number of masked
        positions (``0.0`` when there are none).
    """
    result = []

    total_masked_values = 0
    total_predicted_values = 0

    for trajectory, masked_pos, masked_values, user_id, day in data:
        total_masked_values += len(masked_pos)

        # Work on a copy so the caller's records keep their original tokens.
        filled = list(trajectory)
        # .get() does not trigger a defaultdict's factory, so looking up an
        # unknown user leaves `history` unchanged.
        user_history = history.get(user_id, {})

        # Visit the masked positions directly instead of scanning a fixed
        # range of slots and searching masked_pos for each one.
        for j, pos in enumerate(masked_pos):
            if pos not in user_history or not 0 <= pos < len(filled):
                continue
            prediction = user_history[pos]
            filled[pos] = prediction
            if j < len(masked_values) and str(prediction) == str(masked_values[j]):
                total_predicted_values += 1

        result.append([filled, masked_pos, masked_values, user_id, day])

    # Guard against empty input / records without masked positions.
    accuracy = total_predicted_values / total_masked_values if total_masked_values else 0.0
    return result, accuracy


In [39]:
# Run the historical-frequency baseline on the test records; `accuracy` is the
# fraction of masked positions whose token was predicted correctly.
result, accuracy = predict_masked_values(test_data, history)

In [40]:
# Show the baseline accuracy as the cell output.
accuracy

0.5142733639810039