In [1]:
from collections import defaultdict
from datetime import datetime
import torch
import pandas as pd

In [2]:
# Directory containing the Retailrocket files (events.csv is read from here).
# The path is relative, so it resolves against the notebook's working directory.
data_path = '../../../../datasets/Retailrocket'

In [3]:
# Load the raw event log with the visitor id as index, order the rows by
# visitor, and drop the transaction id column, which is not used afterwards.
file_path = f'{data_path}/events.csv'
events_data = (
    pd.read_csv(file_path, index_col='visitorid')
    .sort_index()
    .drop(columns='transactionid')
)

In [4]:
# Preview the loaded events: one row per event, indexed by visitorid.
events_data

Unnamed: 0_level_0,timestamp,event,itemid
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1442004589439,view,285930
0,1442004759591,view,357564
0,1442004917175,view,67045
1,1439487966444,view,72028
2,1438971444375,view,342816
...,...,...,...
1407575,1433972768922,view,121220
1407576,1433343689991,view,356208
1407577,1431899284867,view,427784
1407578,1431825683288,view,188736


## SASRec handling
Only implicit feedback is used: each visitor is represented by the time-ordered sequence of items they interacted with, and the event type is ignored.

In [5]:
# Minimum number of related actions an item / a visitor needs to be kept.
MIN_ACTIONS = 5

# Discard items, then visitors, with fewer than MIN_ACTIONS related actions.
# The visitor filter runs on the already item-filtered frame, so the order of
# the two steps matters.
# transform('size') broadcasts each group's size to its rows, which avoids
# calling a Python lambda once per group as groupby(...).filter(...) does.
# The masks are converted to plain arrays so that they are applied by position
# (the visitorid index contains duplicate labels).
item_counts = events_data.groupby('itemid')['itemid'].transform('size')
actions = events_data[(item_counts >= MIN_ACTIONS).to_numpy()]
visitor_counts = actions.groupby('visitorid')['itemid'].transform('size')
actions = actions[(visitor_counts >= MIN_ACTIONS).to_numpy()]

# Use timestamps to determine the sequence order of each visitor's actions:
# sort by timestamp, then stable-sort by visitor so that the rows of every
# visitor stay in chronological order.
# NOTE: events sharing a visitor and a timestamp keep their order from
# events_data; the previous per-group sort left that order unspecified.
actions = (
    actions
    .sort_values('timestamp', kind='mergesort')
    .sort_index(kind='mergesort')
)
actions

Unnamed: 0_level_0,timestamp,event,itemid
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,1438969904567,view,325215
2,1438970013790,view,325215
2,1438970212664,view,259884
2,1438970468920,view,216305
2,1438970905669,view,342816
...,...,...,...
1407573,1437973995037,view,57720
1407573,1438402327230,view,363109
1407573,1438402365311,view,463766
1407573,1438648367715,view,82278


In [6]:
# Build dense, 1-based ids for visitors and items, numbered in order of first
# appearance in `actions`.
# unique() returns values in order of appearance (it does not sort), so the
# numbering matches a row-by-row scan without the cost of iterrows().
# tolist() turns the values into plain Python ints before they become dict keys.
# NOTE(review): ids start at 1, presumably because 0 is reserved (e.g. for
# padding) by the downstream SASRec loader - confirm.
usermap = {
    raw_id: new_id
    for new_id, raw_id in enumerate(actions.index.unique().tolist(), start=1)
}
itemmap = {
    raw_id: new_id
    for new_id, raw_id in enumerate(actions['itemid'].unique().tolist(), start=1)
}

# Number of distinct visitors / items (equal to the largest id assigned).
usernum = len(usermap)
itemnum = len(itemmap)

In [7]:
# Number of distinct visitors and items that received a dense id.
len(usermap), len(itemmap)

(75875, 65712)

In [8]:
# Replace the raw item ids and visitor ids with their dense ids from
# itemmap / usermap.
# Both mappings are computed before `actions` is modified: map() produces NaN
# for any id that is not a key of the dict, which is what happens when this
# cell is run a second time on ids that were already re-mapped. Fail loudly in
# that case instead of silently writing NaN into `actions`.
mapped_items = actions['itemid'].map(itemmap)
mapped_users = actions.index.map(usermap)
if mapped_items.isna().any() or mapped_users.isna().any():
    raise ValueError(
        'Some ids are missing from itemmap/usermap; was this cell already run? '
        'Rebuild `actions` before re-mapping.'
    )
actions['itemid'] = mapped_items
actions.index = mapped_users

In [9]:
# Inspect `actions` after the visitor and item ids were re-mapped.
actions

Unnamed: 0_level_0,timestamp,event,itemid
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1438969904567,view,1
1,1438970013790,view,1
1,1438970212664,view,2
1,1438970468920,view,3
1,1438970905669,view,4
...,...,...,...
75875,1437973995037,view,65712
75875,1438402327230,view,32303
75875,1438402365311,view,1327
75875,1438648367715,view,3346


In [10]:
# Keep only the item id column; the visitor id remains available as the index.
sas_data = actions.drop(columns=['timestamp', 'event'])
# Sanity check: show the item sequence of the visitor with dense id 1.
sas_data.loc[1]

Unnamed: 0_level_0,itemid
visitorid,Unnamed: 1_level_1
1,1
1,1
1,2
1,3
1,4
1,4
1,3
1,1


In [11]:
# Preview the visitorid-indexed frame that holds only the item ids.
sas_data

Unnamed: 0_level_0,itemid
visitorid,Unnamed: 1_level_1
1,1
1,1
1,2
1,3
1,4
...,...
75875,65712
75875,32303
75875,1327
75875,3346


In [13]:
# Write one "<visitor id> <item id>" pair per line.
# Rows are emitted in the order they appear in sas_data, so the file order is
# deterministic and each visitor's items keep their order from the frame
# (iterating a set of ids gave no ordering guarantee between visitors).
# A single pass over the rows also avoids one .loc lookup per visitor, and it
# does not break for a visitor with exactly one row, where
# sas_data.loc[_id].itemid would be a scalar rather than an iterable.
with open('retailrocket.txt', 'w') as f:
    for user_id, item_id in zip(sas_data.index, sas_data['itemid']):
        f.write('%d %d\n' % (user_id, item_id))