###  Created by Luis A. Sanchez-Perez (l.alejandro.2011@gmail.com).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from the author.</span></p>

In [1]:
import pandas as pd
import numpy as np
import pathlib
import os
from tqdm import tqdm
import utils

In [2]:
# Register tqdm's `progress_apply` on pandas objects; the validation cells
# further down call it on groupby results to show a progress bar.
tqdm.pandas()

In [3]:
# Root directory of the raw datasets, read from the DATASETS environment
# variable. Raises KeyError if the variable is not set.
DATASETS = pathlib.Path(os.environ['DATASETS'])

In [4]:
# One integer user id per line. The position in the resulting list is the
# encoded user index, so index_to_user[i] gives the id used as key in `dataset`.
with open('data/ml-100k/splits/sequence/user_to_index.txt') as file:
    index_to_user = [int(line.strip()) for line in file]

In [5]:
# One integer movie id per line. The position in the resulting list is the
# encoded movie index, so index_to_movie[i] gives the id used as key in `dataset`.
with open('data/ml-100k/splits/sequence/movie_to_index.txt') as file:
    index_to_movie = [int(line.strip()) for line in file]

In [6]:
# Training split. The checks below read its 'userId' and 'target' columns by
# name and its columns at positions 1-5 by position (encoded movie indices).
train_df = pd.read_csv('data/ml-100k/splits/sequence/train_df.csv')

In [7]:
# Test split, indexed by the encoded userId so that a user's 'target' can be
# looked up directly with `.loc`.
test_df = pd.read_csv('data/ml-100k/splits/sequence/test_df.csv', index_col='userId')

In [8]:
# Raw ratings file (tab-separated, no header row). Indexed by
# (userId, movieId) so that `dataset.loc[user]` yields that user's ratings
# keyed by movieId.
dataset = pd.read_csv(
    DATASETS / 'recommender/movies/ml-100k/u.data',
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
    encoding='latin-1',
    index_col=['userId', 'movieId'],
)

In [9]:
def check_training_sequences(user_data):
    """Check that every training row of one user is ordered by rating time.

    For each row, the movies stored in the columns at positions 1 to 5 must
    have non-decreasing timestamps (ties are accepted) according to the raw
    ratings in ``dataset``.

    Args:
        user_data: the rows of ``train_df`` belonging to a single user. Column
            'userId' holds the encoded user index; the columns at positions
            1-5 hold encoded movie indices.

    Returns:
        ``True`` if no column holds a timestamp earlier than the column before
        it in any row, ``False`` otherwise.

    Raises:
        KeyError: if the user or one of the movies is missing from ``dataset``.

    NOTE(review): relies on the 'userId' grouping column being present in the
    group handed over by ``groupby(...).apply``; recent pandas releases
    deprecate that behaviour -- confirm against the installed version.
    """
    user_id = index_to_user[user_data['userId'].values[0]]
    user_ratings = dataset.loc[user_id]

    def timestamps_at(position):
        # Decode the movie indices stored in the column at `position` and
        # fetch the time at which the user rated each of them.
        movie_ids = [index_to_movie[entry] for entry in user_data.iloc[:, position].values]
        return user_ratings.loc[movie_ids, 'timestamp'].values

    earlier = timestamps_at(1)
    for position in range(2, 6):
        # Columns are fetched lazily so the check stops at the first violation.
        later = timestamps_at(position)
        if (earlier > later).any():
            return False
        earlier = later
    return True

In [10]:
# Run the ordering check once per user (with a progress bar) and fail the
# notebook if any user has a training row that is out of order.
assert train_df.groupby(by='userId', group_keys=False).progress_apply(check_training_sequences).all()

100%|██████████| 943/943 [00:01<00:00, 566.31it/s]


In [11]:
def check_user_last_target(user_data):
    """Check that a user's test target is not older than any training target.

    Looks up, in the raw ratings ``dataset``, the timestamp of every movie in
    the user's training 'target' column and compares the latest one with the
    timestamp of the user's 'target' movie in ``test_df``.

    Args:
        user_data: the rows of ``train_df`` belonging to a single user. Column
            'userId' holds the encoded user index and 'target' the encoded
            movie index.

    Returns:
        A boolean that is true when the latest training-target timestamp is
        less than or equal to the test-target timestamp.

    Raises:
        KeyError: if the user or one of the movies is missing from ``dataset``
            or the user is missing from ``test_df``.

    NOTE(review): relies on the 'userId' grouping column being present in the
    group handed over by ``groupby(...).apply``; recent pandas releases
    deprecate that behaviour -- confirm against the installed version.
    """
    index = user_data['userId'].values[0]
    user = index_to_user[index]
    movies = dataset.loc[user]

    train_movies = [index_to_movie[entry] for entry in user_data['target'].values]
    last_train_timestamp = movies.loc[train_movies, 'timestamp'].max()

    # Select row and column in a single .loc call. Going through the row first
    # (`test_df.loc[index]['target']`) builds a row Series with one common
    # dtype, so any float/object column in test_df would turn the target into
    # a non-integer that cannot be used to index the `index_to_movie` list.
    test_movie = index_to_movie[test_df.loc[index, 'target']]
    return last_train_timestamp <= movies.loc[test_movie, 'timestamp']

In [12]:
# Run the train-vs-test timestamp check once per user (with a progress bar)
# and fail the notebook if any user's test target predates a training target.
assert train_df.groupby(by='userId', group_keys=False).progress_apply(check_user_last_target).all()

100%|██████████| 943/943 [00:00<00:00, 1175.05it/s]
