In [1]:
from collections import defaultdict
from datetime import datetime
import torch
import pandas as pd

In [2]:
# Directory containing the Retailrocket CSV files (events.csv is read from here).
# This is a relative path, so it resolves against the notebook's working directory.
data_path = '../../../../datasets/Retailrocket'

In [14]:
# Load the raw event log keyed by visitor id, discard the transaction id
# column and order the rows by visitor id.
file_path = data_path + '/events.csv'
events_data = (
    pd.read_csv(file_path, index_col='visitorid')
    .drop(columns='transactionid')
    .sort_index()
)

In [15]:
# Keep every event whose type is not 'view'.
events_data = events_data.loc[events_data['event'].ne('view')]

In [16]:
# Preview the events that remain after the 'view' rows were removed.
events_data

Unnamed: 0_level_0,timestamp,event,itemid
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,1440914628202,addtocart,65273
150,1433719818230,addtocart,452955
155,1441952340395,addtocart,224623
155,1441951932678,addtocart,368372
155,1441952265950,addtocart,442601
...,...,...,...
1407512,1432953262950,addtocart,54141
1407512,1432952383859,addtocart,107832
1407512,1432954196910,addtocart,398091
1407512,1432954281393,addtocart,398533


## SASRec handling
Only implicit feedback (the non-view events kept above, i.e. add-to-cart and transaction) is used to build each visitor's item sequence.

In [17]:
# Build the per-visitor action sequences from the non-view events.
#
# Items, and then visitors, with fewer than MIN_ACTIONS related actions are
# discarded. The item filter runs first, so an item can end up below the
# threshold once sparse visitors have been removed (same two-pass behaviour
# as before; the filters are not iterated to a fixed point).
MIN_ACTIONS = 5

# `transform('size')` broadcasts each group's row count back onto its rows,
# giving a boolean mask without calling a Python lambda per group.
item_counts = events_data.groupby('itemid')['itemid'].transform('size')
actions = events_data[item_counts >= MIN_ACTIONS]

visitor_counts = actions.groupby(level='visitorid')['itemid'].transform('size')
actions = actions[visitor_counts >= MIN_ACTIONS]

# Use timestamps to determine the sequence order of actions: sort by
# timestamp, then stably by visitor id, so rows end up grouped by visitor and
# chronological within each visitor. The stable sorts keep rows with equal
# timestamps in their incoming order.
actions = (
    actions
    .sort_values('timestamp', kind='mergesort')
    .sort_index(kind='mergesort')
)
actions

Unnamed: 0_level_0,timestamp,event,itemid
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
172,1439599816912,addtocart,10034
172,1439601219691,addtocart,465522
172,1439601228324,addtocart,465522
172,1439602141230,transaction,465522
172,1439602141686,transaction,10034
...,...,...,...
1404163,1435710164769,addtocart,316255
1404163,1435710167921,addtocart,11670
1404163,1435710195727,addtocart,409793
1404163,1435710325899,addtocart,233427


In [18]:
# Map raw visitor / item ids to dense, 1-based integer ids.
#
# Ids are numbered in order of first appearance in `actions`, which is the
# numbering a row-by-row scan of the frame produces. `unique()` preserves
# first-appearance order, so no explicit Python loop over rows is needed.
usermap = {
    raw_id: new_id
    for new_id, raw_id in enumerate(actions.index.unique().tolist(), start=1)
}
itemmap = {
    raw_id: new_id
    for new_id, raw_id in enumerate(actions['itemid'].unique().tolist(), start=1)
}

# Totals are kept as separate names because they are the largest assigned ids.
usernum = len(usermap)
itemnum = len(itemmap)

In [19]:
# Number of distinct visitors and items that received a dense id.
len(usermap), len(itemmap)

(1256, 4205)

In [20]:
# Replace raw visitor / item ids with the dense 1-based ids from the lookup
# tables. Both mappings are computed before `actions` is modified so that a
# failed check leaves the frame untouched.
mapped_items = actions['itemid'].map(itemmap)
mapped_users = actions.index.map(usermap)

# An id that is missing from a lookup table maps to NaN. That happens, for
# example, when this cell is run a second time on ids that were already
# remapped, so fail loudly instead of silently corrupting the data.
if mapped_items.isna().any() or mapped_users.isna().any():
    raise ValueError(
        'actions contains ids that are missing from usermap/itemmap; '
        'rebuild `actions` and the id maps before remapping again'
    )

actions['itemid'] = mapped_items
actions.index = mapped_users

In [21]:
# Inspect the frame after visitor and item ids were replaced by dense ids.
actions

Unnamed: 0_level_0,timestamp,event,itemid
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1439599816912,addtocart,1
1,1439601219691,addtocart,2
1,1439601228324,addtocart,2
1,1439602141230,transaction,2
1,1439602141686,transaction,1
...,...,...,...
1256,1435710164769,addtocart,1164
1256,1435710167921,addtocart,1166
1256,1435710195727,addtocart,237
1256,1435710325899,addtocart,1263


In [22]:
# Keep only the item id column; the visitor id remains in the index.
sas_data = actions.drop(columns=['timestamp', 'event'])
# Spot-check the item sequence stored under visitor id 1.
sas_data.loc[1]

Unnamed: 0_level_0,itemid
visitorid,Unnamed: 1_level_1
1,1
1,2
1,2
1,2
1,1


In [11]:
# Preview the visitor -> item pairs that are exported below.
# NOTE(review): the saved output of this cell lists visitor ids up to 75875,
# which does not match the 1256 visitors counted earlier, and its execution
# count is lower than the preceding cells' -- presumably a stale output from
# an earlier run; re-run the notebook top to bottom to refresh it.
sas_data

Unnamed: 0_level_0,itemid
visitorid,Unnamed: 1_level_1
1,1
1,1
1,2
1,3
1,4
...,...
75875,65712
75875,32303
75875,1327
75875,3346


In [12]:
# Export one "<visitor id> <item id>" pair per line.
#
# Rows are written in frame order, so visitors appear in index order and each
# visitor's items keep the order they have in `sas_data`. Streaming the rows
# directly also avoids a per-visitor label lookup, which would yield a scalar
# (not an iterable) for a visitor that has only a single row.
with open('retailrocket.txt', 'w') as f:
    f.writelines(
        '%d %d\n' % (user_id, item_id)
        for user_id, item_id in zip(sas_data.index, sas_data['itemid'])
    )

In [23]:
# Count the retained actions of every visitor (the length of each sequence),
# then show the average sequence length.
userActLength = actions.groupby(level='visitorid').size()
seq_avg_length = userActLength.mean()
seq_avg_length

15.180732484076433

In [24]:
# Shortest and longest per-visitor sequence length.
userActLength.min(), userActLength.max()

(5, 866)