In [19]:
import numpy as np
# import pickle
# pickle.HIGHEST_PROTOCOL = 4
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Data Preprocessing

## Load Data

In [20]:
# Raw MovieLens-1M files are read from `load_dir`; the per-split HDF5 files
# produced by this notebook are written to `save_dir`.
load_dir = os.path.join('data','ml-1m')
save_dir = os.path.join('data','ml-1m-split')
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() before calling makedirs().
os.makedirs(save_dir, exist_ok=True)

In [21]:
# Column names for the three '::'-delimited .dat files. They are passed via
# `names`, so every row of each file is read as data.
ratings_header = ['user_id','movie_id','rating','timestamp']
users_header = ['user_id','gender','age','occupation','zip_code']
movies_header = ['movie_id','title','genres']

# A multi-character separator is only supported by the python parsing engine.
ratings_df = pd.read_csv(
    os.path.join(load_dir, 'ratings.dat'),
    sep='::',
    names=ratings_header,
    engine='python',
)

users_df = pd.read_csv(
    os.path.join(load_dir, 'users.dat'),
    sep='::',
    names=users_header,
    engine='python',
)

# The movies file is decoded as iso-8859-1 rather than the default encoding.
movies_df = pd.read_csv(
    os.path.join(load_dir, 'movies.dat'),
    sep='::',
    names=movies_header,
    encoding='iso-8859-1',
    engine='python',
)

In [22]:
# Index the user and movie tables by their id column.
# The column check keeps this cell idempotent: once the id has been moved into
# the index, re-running the cell is a no-op instead of raising KeyError.
if 'user_id' in users_df.columns:
    users_df = users_df.set_index('user_id')
if 'movie_id' in movies_df.columns:
    movies_df = movies_df.set_index('movie_id')

## Train / validation / test split (by user)

In [23]:
# Stage 1: shuffle the users and set aside 20% of them as a holdout pool.
users_train_df, users_holdout_df = train_test_split(
    users_df,
    test_size=0.2,
    random_state=2658,
    shuffle=True,
)

# Stage 2: divide the holdout pool in half; the first part becomes the test
# users and the second part the validation users. Both seeds are fixed so the
# split is reproducible.
users_test_df, users_val_df = train_test_split(
    users_holdout_df,
    test_size=0.5,
    random_state=4769,
    shuffle=True,
)

In [24]:
def _chronological_half(user_ratings, part):
    """Return one half of a single user's ratings, ordered by timestamp.

    The ratings are sorted by ``timestamp`` and cut into two consecutive
    pieces. When the number of ratings is odd, the earlier piece receives the
    extra row (the same sizes ``np.array_split(..., 2)`` produces).

    Parameters
    ----------
    user_ratings : pd.DataFrame
        Ratings belonging to one user; must contain a ``timestamp`` column.
    part : int
        ``0`` for the earlier ("past") half, ``1`` for the later ("future") half.

    Returns
    -------
    pd.DataFrame
        The requested half with a fresh 0-based positional index.
    """
    ordered = user_ratings.sort_values('timestamp')
    n_past = (len(ordered) + 1) // 2
    # Plain positional slicing replaces np.array_split, which on a DataFrame
    # goes through the deprecated DataFrame.swapaxes.
    half = ordered.iloc[:n_past] if part == 0 else ordered.iloc[n_past:]
    return half.reset_index(drop=True)


# Per user: the earlier half of the ratings is the "past" (history) and the
# later half is the "future". The result is indexed by (user_id, position);
# rename_axis clears both index level names.
past_df = ratings_df.groupby('user_id').apply(lambda x: _chronological_half(x, 0)).rename_axis((None, None))
future_df = ratings_df.groupby('user_id').apply(lambda x: _chronological_half(x, 1)).rename_axis((None, None))

## Apply the user split to the past and future ratings

In [25]:
# User ids belonging to each split.
train_user_ids = users_train_df.index
val_user_ids = users_val_df.index
test_user_ids = users_test_df.index

# Select each split's users from the past and future rating frames by label.
past_train_df = past_df.loc[train_user_ids]
future_train_df = future_df.loc[train_user_ids]

past_val_df = past_df.loc[val_user_ids]
future_val_df = future_df.loc[val_user_ids]

past_test_df = past_df.loc[test_user_ids]
future_test_df = future_df.loc[test_user_ids]

In [29]:
# Summary statistics of the ratings in the training users' history.
past_train_df['rating'].describe()

count    410290.000000
mean          3.660803
std           1.111214
min           1.000000
25%           3.000000
50%           4.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64

## Save the splits to HDF5

In [8]:
# Write the training split to a fresh HDF5 file (mode 'w' overwrites any
# existing file). The context manager closes the store even if one of the
# writes raises, so the file handle is never left open.
with pd.HDFStore(os.path.join(save_dir,'users_train_dfs.h5'), 'w') as store:
    store.put('users', users_train_df, format='table')
    store.put('users_history', past_train_df, format='table')
    store.put('samples_future', future_train_df, format='table')


In [9]:


# Write the validation and test splits, one HDF5 file each, using the keys
# 'users', 'users_history' and 'samples_future'. Mode 'w' overwrites any
# existing file, and the context managers close each store even if a write
# raises, so no file handle is left open.
with pd.HDFStore(os.path.join(save_dir,'users_val_dfs.h5'), 'w') as store:
    store.put('users', users_val_df, format='table')
    store.put('users_history', past_val_df, format='table')
    store.put('samples_future', future_val_df, format='table')

with pd.HDFStore(os.path.join(save_dir,'users_test_dfs.h5'), 'w') as store:
    store.put('users', users_test_df, format='table')
    store.put('users_history', past_test_df, format='table')
    store.put('samples_future', future_test_df, format='table')

In [10]:
# Persist the movie table to its own HDF5 file under the key 'movies';
# mode 'w' replaces any existing file.
movies_df.to_hdf(
    os.path.join(save_dir,'movies_dfs.h5'),
    key='movies',
    mode='w',
    format='table',
)