In [2]:
import datetime as dt
import os
import random
import time

import ciso8601
import numpy as np
import pandas as pd


In [3]:
# Root folder that contains the `raw/` and `processed/` data directories.
# Set the GRU4REC_DATA_ROOT environment variable to run the notebook on another
# machine; when it is unset (or empty) the original location is used, so
# existing runs behave exactly as before.
_DATA_ROOT = os.environ.get('GRU4REC_DATA_ROOT') or 'C:/Users/BuiQuocBao/OneDrive/Desktop/GRU4REC/'
if not _DATA_ROOT.endswith(('/', '\\')):
    # File names are appended by plain string concatenation, so the root
    # must end with a path separator.
    _DATA_ROOT += '/'
PATH_TO_ORIGINAL_DATA = _DATA_ROOT + 'raw/'
PATH_TO_PROCESSED_DATA = _DATA_ROOT + 'processed/'

In [5]:
## load data
%time raw_data = pd.read_csv(PATH_TO_ORIGINAL_DATA + 'yoochoose-clicks.dat', sep=',', \
                   header=None, usecols=[0,1,2], dtype={0:np.int32, 1:str, 2:np.int64})
raw_data.columns = ['SessionId', 'TimeStr', 'ItemId']
raw_data.shape

Wall time: 40.5 s


(33003944, 3)

In [6]:
## parameters
# Whether to work on a random subset of sessions instead of the full data.
sampling = True
# Fraction of distinct sessions kept when `sampling` is enabled.
sample_rate = 0.1
# NOTE(review): not referenced in the cells visible here -- confirm where it is used.
single_process = True
# Label describing which variant of the data is being processed.
if sampling:
    file_type = "sample"
else:
    file_type = "full"

In [7]:
## sampling
## Keep a random subset of whole sessions: every click of a selected session is retained.
random.seed(1050)  # fixed seed so the same sessions are drawn on every fresh run
if sampling:
    u_sessid = raw_data.SessionId.unique()
    s_sessid = random.sample(u_sessid.tolist(), int(len(u_sessid)*sample_rate))
    # Series.isin replaces np.in1d (deprecated since NumPy 2.0).
    # .copy() makes the subset an independent frame, so adding columns to
    # raw_data afterwards does not trigger SettingWithCopyWarning.
    # NOTE: raw_data is rebound here; re-running this cell without reloading
    # the file samples again from the already reduced frame.
    raw_data = raw_data[raw_data.SessionId.isin(s_sessid)].copy()
raw_data.shape

(3301436, 3)

In [8]:
def timestr_to_timestamp(df):
    """Add a ``timestamp`` column derived from the ``TimeStr`` column.

    Each ``TimeStr`` value is parsed with ``ciso8601.parse_datetime`` and
    converted with ``datetime.timestamp()``, i.e. to a float number of
    seconds since the epoch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TimeStr`` column of date-time strings.
        NOTE(review): if the strings carry no UTC offset, ``timestamp()``
        interprets them as local time -- confirm the input format.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    def _to_epoch_seconds(time_str):
        return ciso8601.parse_datetime(time_str).timestamp()

    df['timestamp'] = df['TimeStr'].apply(_to_epoch_seconds)
    return df


In [9]:
## transpose timestr to timestamp
%time raw_data['timestamp'] = raw_data.TimeStr.apply(lambda x: ciso8601.parse_datetime(x).timestamp())
data = raw_data
del(data['TimeStr'])

Wall time: 3.85 s


In [10]:
## sorted by sessionid, timestamp
%time data = data.sort_values(['SessionId','timestamp'])
data[:5]

Wall time: 4.3 s


Unnamed: 0,SessionId,ItemId,timestamp
15,6,214701242,1396804000.0
16,6,214826623,1396804000.0
55,21,214838503,1396861000.0
56,21,214838503,1396861000.0
57,21,214838503,1396861000.0


In [11]:
## data length by sessionid
## Number of rows (clicks) per session.
session_lengths = data.groupby('SessionId').size()
print("length:", len(session_lengths))
# Series.min()/max() are vectorised; the builtin min()/max() iterate the
# Series element by element in Python.
print("min length", session_lengths.min())
print("max length", session_lengths.max())

length: 924972
min length 1
max length 200


In [12]:
## filter by session length
## Keep only sessions with at least 2 rows, based on session_lengths
## computed in the previous cell.
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
data = data[data.SessionId.isin(session_lengths[session_lengths>=2].index)]

In [13]:
## data length by itemid
## Number of rows (clicks) per item.
item_supports = data.groupby('ItemId').size()
print("length:", len(item_supports))
# Series.min()/max() are vectorised; the builtin min()/max() iterate the
# Series element by element in Python.
print("min length", item_supports.min())
print("max length", item_supports.max())

length: 34452
min length 1
max length 13262


In [14]:
## filter by item length
## Keep only rows whose item occurs at least 5 times, based on item_supports
## computed in the previous cell.
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
data = data[data.ItemId.isin(item_supports[item_supports>=5].index)]

In [15]:
## filter by session length
## Removing rare items can shorten sessions, so the lengths are recomputed
## and sessions left with fewer than 2 rows are dropped.
session_lengths = data.groupby('SessionId').size()
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
data = data[data.SessionId.isin(session_lengths[session_lengths>=2].index)]

In [16]:
## split train & test set
## A session goes to the test set when its last click lies within the final
## 86400 seconds (one day) before the latest timestamp in the data; all other
## sessions form the training set.
SECONDS_PER_DAY = 86400
tmax = data.timestamp.max()
split_time = tmax - SECONDS_PER_DAY
session_max_times = data.groupby('SessionId').timestamp.max()
session_train = session_max_times[session_max_times < split_time].index
session_test = session_max_times[session_max_times >= split_time].index
##
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
train = data[data.SessionId.isin(session_train)]
test = data[data.SessionId.isin(session_test)]
# Drop test rows whose item never appears in the training set.
test = test[test.ItemId.isin(train.ItemId)]