In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Data Preprocessing

## Load Data

In [2]:
# Directory the raw ratings/users/movies .dat files are read from.
load_dir = os.path.join('data','ml-1m')
# Directory the per-split HDF5 stores are written to.
save_dir = os.path.join('data','ml-1m-split')
# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(save_dir, exist_ok=True)

In [3]:
def _read_dat(filename, columns, **read_kwargs):
    """Read one '::'-delimited file from `load_dir` into a DataFrame.

    The file has no header row, so `columns` supplies the column names.
    Extra keyword arguments are forwarded to `pd.read_csv`.
    """
    return pd.read_csv(
        os.path.join(load_dir, filename),
        sep='::',
        names=columns,
        engine='python',  # multi-character separators need the Python parser
        **read_kwargs,
    )


ratings_header = ['user_id','movie_id','rating','timestamp']
users_header = ['user_id','gender','age','occupation','zip_code']
movies_header = ['movie_id','title','genres']

ratings_df = _read_dat('ratings.dat', ratings_header)
users_df = _read_dat('users.dat', users_header)
# Only the movies file is read with an explicit ISO-8859-1 encoding.
movies_df = _read_dat('movies.dat', movies_header, encoding='iso-8859-1')

In [4]:
# Index each lookup table by its id column. The guards keep this cell safe to
# re-run: once the id column has become the index, calling set_index again
# would raise KeyError because the column no longer exists.
if 'user_id' in users_df.columns:
    users_df = users_df.set_index('user_id')
if 'movie_id' in movies_df.columns:
    movies_df = movies_df.set_index('movie_id')

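## Train / validation / test split

Users are split 80/10/10 into train, test and validation sets. Each user's ratings are then split chronologically into a past half (history) and a future half (samples).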

In [5]:
# Step 1: set aside 20% of the users as a holdout; the remaining 80% are the train users.
users_train_df, users_holdout_df = train_test_split(
    users_df,
    test_size=0.2,
    shuffle=True,
    random_state=2658,
)

# Step 2: cut the holdout in half, giving test and validation users.
# Fixed seeds keep both splits reproducible.
users_test_df, users_val_df = train_test_split(
    users_holdout_df,
    test_size=0.5,
    shuffle=True,
    random_state=4769,
)

In [6]:
def _past_len(user_ratings):
    """Return how many of a user's ratings belong to the 'past' part.

    The past part is the larger half when the count is odd, which is the size
    np.array_split(x, 2)[0] would have.
    """
    return (len(user_ratings) + 1) // 2


# Sort once up front instead of once per user per split; groupby keeps the row
# order inside each group, so every user's ratings arrive in timestamp order.
# A stable sort keeps ratings with equal timestamps in file order, so the
# past/future boundary does not depend on the sort implementation.
# Slicing with .iloc avoids np.array_split on a DataFrame, which relies on
# DataFrame.swapaxes (deprecated in pandas 2.1).
ratings_by_user = ratings_df.sort_values('timestamp', kind='stable').groupby('user_id')

# Both results are indexed by (user_id, position within that user's part).
past_df = ratings_by_user.apply(lambda g: g.iloc[:_past_len(g)].reset_index(drop=True))
future_df = ratings_by_user.apply(lambda g: g.iloc[_past_len(g):].reset_index(drop=True))

## Apply the user split to the past and future ratings

In [7]:
# The user frames are indexed by user_id, and so is the first level of
# past_df / future_df, so each split's index selects that split's ratings.
_split_user_ids = (users_train_df.index, users_val_df.index, users_test_df.index)

future_train_df, future_val_df, future_test_df = [
    future_df.loc[user_ids] for user_ids in _split_user_ids
]

past_train_df, past_val_df, past_test_df = [
    past_df.loc[user_ids] for user_ids in _split_user_ids
]

## Save the splits to HDF5

In [8]:
# Write the train split. mode='w' starts from an empty file on every run, and
# the context manager closes the store even if one of the writes raises.
with pd.HDFStore(os.path.join(save_dir,'users_train_dfs.h5'), mode='w') as store:
    store['users'] = users_train_df
    store['users_history'] = past_train_df
    store['samples_future'] = future_train_df


In [9]:


# Write the validation and test splits the same way as the train split.
# mode='w' (instead of the default append mode) recreates each file on every
# run, so re-running the notebook cannot leave stale data or grow the files.
# The context managers close each store even if a write raises.
with pd.HDFStore(os.path.join(save_dir,'users_val_dfs.h5'), mode='w') as store:
    store['users'] = users_val_df
    store['users_history'] = past_val_df
    store['samples_future'] = future_val_df

with pd.HDFStore(os.path.join(save_dir,'users_test_dfs.h5'), mode='w') as store:
    store['users'] = users_test_df
    store['users_history'] = past_test_df
    store['samples_future'] = future_test_df

In [10]:
# The movie table is shared by all splits, so it gets its own file,
# recreated from scratch on each run.
movies_store_path = os.path.join(save_dir, 'movies_dfs.h5')
movies_df.to_hdf(movies_store_path, key='movies', mode='w')

In [11]:
# Sanity check: read the train history back from disk and display it.
train_store_path = os.path.join(save_dir, 'users_train_dfs.h5')
pd.read_hdf(train_store_path, key='users_history')

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4036,0,4036,2028,5,965507074
4036,1,4036,3210,4,965507074
4036,2,4036,1258,5,965507106
4036,3,4036,3105,3,965507106
4036,4,4036,1431,3,965507106
...,...,...,...,...,...
456,74,456,750,4,976300620
456,75,456,151,3,976300620
456,76,456,2067,3,976300620
456,77,456,1263,2,976300620


In [15]:
# Sanity check: show one saved train user (positional row 234).
train_users = pd.read_hdf(os.path.join(save_dir, 'users_train_dfs.h5'), key='users')
train_users.iloc[234]

gender            F
age              35
occupation        0
zip_code      17870
Name: 494, dtype: object

In [16]:
# Sanity check: read the movie table back. No key is passed; read_hdf allows
# that when the file holds a single object.
movies_store_path = os.path.join(save_dir, 'movies_dfs.h5')
pd.read_hdf(movies_store_path)

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy
...,...,...
3948,Meet the Parents (2000),Comedy
3949,Requiem for a Dream (2000),Drama
3950,Tigerland (2000),Drama
3951,Two Family House (2000),Drama


In [22]:
# Sanity check: one future sample (positional row 234) as a plain tuple of its values.
train_future_samples = pd.read_hdf(os.path.join(save_dir, 'users_train_dfs.h5'), key='samples_future')
tuple(train_future_samples.iloc[234])

(4950, 2792, 4, 966097923)

In [64]:
# Sanity check: the same future sample (positional row 234) as a column -> value dict.
train_future_samples = pd.read_hdf(os.path.join(save_dir, 'users_train_dfs.h5'), key='samples_future')
train_future_samples.iloc[234].to_dict()

{'user_id': 4950, 'movie_id': 2792, 'rating': 4, 'timestamp': 966097923}

In [96]:
# Load the saved train history; the next cell turns it into per-user sequences.
train_store_path = os.path.join(save_dir, 'users_train_dfs.h5')
users_history = pd.read_hdf(train_store_path, key='users_history')

In [97]:
# NOTE: this cell rebinds `users_history`. Re-run the cell that loads it from
# the HDF5 store before re-running this one.

# Drop the outer index level so that 'user_id' refers only to the column, then
# put the rows in chronological order. One stable sort replaces the per-user
# groupby.apply(sort_values) + droplevel round trip; groupby keeps the row
# order inside each group, and rows with equal timestamps keep their stored order.
history_rows = users_history.droplevel('user_id').sort_values('timestamp', kind='stable')

# Length of the longest per-user history; shorter sequences are padded up to it.
max_seq_len = history_rows.groupby('user_id').size().max()

# One row per user; the movie_id and rating cells each hold that user's
# whole sequence as a list.
users_history = (
    history_rows
    .drop('timestamp', axis=1)
    .groupby('user_id')
    .agg(lambda col: col.tolist())
)

# Convert every list cell to an int array. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated in pandas 2.1.
users_history = users_history.apply(
    lambda col: col.map(lambda seq: np.asarray(seq, dtype=int))
)

# Left-pad every sequence with zeros to max_seq_len. `users_history` itself
# stays unpadded; the padded frame gets its own name and is displayed.
users_history_padded = users_history.apply(
    lambda col: col.map(
        lambda seq: np.pad(seq, (max_seq_len - len(seq), 0), 'constant', constant_values=(0, 0))
    )
)
users_history_padded

Unnamed: 0_level_0,movie_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
6034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6035,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6037,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6039,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
