## Build Input for Transformer Based Recommender with Adjacency Data

SASRec [1] takes sequence of items (interacted by a user) as input and predicts the same sequence (shifted). The idea is to enrich this item sequence with additional information coming from the associated users. Thus, each item will have a sequence 
of users (sorted by time) as additional attributes. 

How to take care of the time aspect? While creating this graph only the users interacted before this time should be taken into account. Thus, the new dataset will look like 
        
        user-id, item-id, all the users who interacted this item recently    
        
        11 56076 0
        11 14037 15,10,12,13
        11 4467 0
        11 33810 268,41162,60222,56206,49801,10441,13000,14299
        11 31260 12817,14614,30088,11039,25632,13,62373,47260,45849
        11 28006 32489
        11 16413 11359,11315,14025,34607,41079,11448,41139,40790,2541,10873,41072,41089,41083,13000,41099,29498,26935,9951,41060
        11 55039 59475,11315,14025,34607,11448,61040,41089,41099,41083,2541,13407,20417,9951
        11 20799 56213,56198
        11 58690 3616
        11 26147 0
        11 66039 51660,52634,58450,51306,59567,10873
        11 78708 58957
        11 18158 3597,951,61249,2901,40226,60070,32243,32556,3635

In [1]:
import os
import sys
import json
import re
import random
import copy
from tqdm import tqdm
import numpy as np
import pickle

from collections import defaultdict, Counter


In [2]:
interaction_filename = "/recsys_data/RecSys/SASRec-tf2/data/ae_original.txt"
output_filename = "/recsys_data/RecSys/SASRec-tf2/data/ae_graph.txt"
dict_filename = "/recsys_data/RecSys/SASRec-tf2/data/ae_graph_dict.pkl"

In [13]:
def data_process_with_time(fname, pname, sep="\t", file_write=False, max_seq_len=50, max_item_len=50):
    User = defaultdict(list)
    Items = set()
    user_dict, item_dict = {}, {}
    item_user = defaultdict(list)  # track user interaction time
    final_item_user = dict()

    with open(fname, 'r') as fr:
        for line in fr:
            u, i, t = line.rstrip().split(sep)
            t = float(t)
            User[u].append((i, t))
            Items.add(i)
            item_user[i].append((u, t))
            
    print(len(User), len(Items))
    
    item_count = 1  # always start with 1
    for item in Items:
        item_dict[item] = item_count
        item_count += 1

    count_del = 0
    user_count = 1  # start with 1

    # get the user-ids
    for user in User.keys():
        if len(User[user]) <= 2:
            count_del += 1
        else:
            User[user] = sorted(User[user], key=lambda x: x[1])
            user_dict[user] = user_count
            user_count += 1

    if file_write:
        print(f"Writing data in {pname}")
        with open(pname, 'w') as fw:
            for user in tqdm(User.keys()):
                if len(User[user]) > 2:
                    items = sorted(User[user], key=lambda x: x[1])
                    current_items = [x[0] for x in items]
                    user_id = user_dict[user]
                    missing_user = 0
                    for it in items:
                        item_name, item_time = it
                        ut = item_user[item_name]
                        item_id = item_dict[item_name]
                        prev_ut = [x for x in ut if x[1] < item_time]  # previous user-time
                        prev_ut = sorted(prev_ut, key=lambda x: item_time - x[1])
                        prev_u = [user] + [x[0] for x in prev_ut if x[0] in user_dict]
                        
                        # items interacted by these users (but before the current item_time)
                        # and not in the current user's item list
                        prev_it = [User[u] for u in prev_u]
                        prev_it = [item for sublist in prev_it for item in sublist]
                        prev_it = [x for x in prev_it if x[1] < item_time]
                        prev_it = sorted(prev_it, key=lambda x: item_time - x[1])
                        prev_i = [x[0] for x in prev_it if x[0] in item_dict]
                        prev_i = [item for item in prev_i if item not in current_items]
                        if len(prev_i) > 0:
                            prev_i = [item_dict[item] for item in prev_i][:max_item_len]
                            prev_i = [str(item) for item in prev_i]
                        else:
                            prev_i = ['0']
                        
                        prev_u = [str(user_id)] + [str(user_dict[x[0]]) for x in prev_ut if x[0] in user_dict][:max_seq_len]
                        hist_u = ','.join(prev_u)
                        hist_i = ','.join(prev_i)
                        fw.write(sep.join([str(user_id), str(item_id), hist_u, hist_i]) + '\n')
        
    print(user_count-1, count_del)
    return user_dict, item_dict, User, item_user

In [14]:
write_file = True
max_user_list = 49
max_item_list = 50
udict, idict, user_history, item_history = data_process_with_time(interaction_filename, 
                                                                  output_filename, 
                                                                  "\t", 
                                                                  write_file,
                                                                  max_user_list,
                                                                  max_item_list
                                                                 )

if write_file:
    with open(dict_filename, 'wb') as handle:
        pickle.dump((udict, idict, user_history), handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Retained {len(udict)} users with {len(idict)} items from {len(user_history)} users")

63161 85930


  0%|          | 4/63161 [00:00<34:21, 30.64it/s]

Writing data in /recsys_data/RecSys/SASRec-tf2/data/ae_graph.txt


100%|██████████| 63161/63161 [29:03<00:00, 36.24it/s]  


63114 47
Retained 63114 users with 85930 items from 63161 users


In [7]:
# 'B005UEB5TQ', 'B000W3LJ6Y', 'B0089MVZDW', 'B00005TQ09', 'B0001Y7UAI', 'B00020BJA8', 'B000BQ7GW8', 'B000EPR7XO', 'B000M2GYF6', 'B001EH8FZA', 'B001EZRYYU'
item_history['B005UEB5TQ']

[('A2C8I2RQ0WG940', 1383523200.0), ('AM8OIQGVZEEKT', 1405468800.0)]

In [8]:
user_history['A2C8I2RQ0WG940']

[('B003AVMRPM', 1276992000.0),
 ('B004AM5RB6', 1295568000.0),
 ('B003S68Q0Y', 1305504000.0),
 ('B007CS9WYI', 1339632000.0),
 ('B00BOZ1Y46', 1369440000.0),
 ('B002P3FQT0', 1370304000.0),
 ('B000VDCTCI', 1370736000.0),
 ('B000EOSHGQ', 1370736000.0),
 ('B0016ML7YE', 1370736000.0),
 ('B005XDASYC', 1370736000.0),
 ('B0095ZRQN0', 1379980800.0),
 ('B00CU9GKTO', 1382486400.0),
 ('B007G5NNOW', 1382659200.0),
 ('B005UEB5TQ', 1383523200.0),
 ('B007A4JTDI', 1390176000.0),
 ('B0045KGZOG', 1390176000.0),
 ('B005XUOQF2', 1390176000.0),
 ('B002JCSV8U', 1390176000.0),
 ('B003VTY2S8', 1390176000.0),
 ('B00005QIZ8', 1390176000.0),
 ('B0049WBZEK', 1392076800.0),
 ('B003OBUJIK', 1393113600.0),
 ('B007234F0O', 1393113600.0),
 ('B00FDPSH0W', 1395187200.0),
 ('B0037MH5W4', 1395878400.0),
 ('B00144KS6W', 1403740800.0)]

In [9]:
udict['A2C8I2RQ0WG940']

1

In [11]:
idict['B005UEB5TQ'], idict['B007A4JTDI']

(71628, 36145)

In [12]:
item_history['B007A4JTDI']

[('A1C82BC5GNABOA', 1395273600.0),
 ('A23RMOGZKJJS6N', 1386720000.0),
 ('A2LDL960FST9LW', 1354060800.0),
 ('A2C8I2RQ0WG940', 1390176000.0),
 ('AV4XQWYQ3XLGD', 1369008000.0),
 ('A1QQKJU6TZKJUQ', 1333497600.0),
 ('A3U84V957MFKFC', 1368403200.0)]

In [14]:
[udict[x[0]] for x in item_history['B007A4JTDI'] if x[1] < 1390176000.0]

[51424, 32273, 32682, 49237, 62539]