## Time Interval Based Transformer Model

The original Transformer-based recommender does not take into account the time interval between two successive interactions. The paper *Time Interval Aware Self-Attention for Sequential Recommendation* (Jiacheng Li, Yujie Wang, Julian McAuley; WSDM 2020) introduces a way to include this time information in the model.

The original Git repo with TF 1.x is https://github.com/JiachengLi1995/TiSASRec

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import json
import re
import random
import copy
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle

from collections import defaultdict, Counter

sys.path.insert(0, "/recsys_data/RecSys/SASRec-tf2/")

import download_and_process_amazon as dpa

In [9]:
# Dataset selection: Amazon "Electronics" reviews, 5-core subset.
category = "Electronics"
meta_filename = 'meta_Electronics.json'

# Where raw and processed files live, and the name of the encoded model input.
data_dir = "/recsys_data/RecSys/SASRec-tf2/data"
encoded_file = "ae_v3.txt"

# Remote archive and the local name/path of the extracted reviews file.
download_url = f"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_{category}_5.json.gz"
reviews_name = f"reviews_{category}_5.json"
reviews_file = os.path.join(data_dir, reviews_name)

In [4]:
# Fetch (if needed) and extract the raw reviews file; the recorded output shows
# the helper returning the local path of the extracted JSON file.
# NOTE(review): `download_url` from the config cell is not passed here —
# presumably the helper derives the URL from `reviews_name`; confirm in dpa.
print(f"Generating data for ***{category}***")
dpa.download_and_extract(reviews_name, reviews_file)

Generating data for ***Electronics***


'/recsys_data/RecSys/SASRec-tf2/data/reviews_Electronics_5.json'

In [5]:
# Derive the output path up front so `reviews_output_file` is always defined.
# Previously it was only assigned when preprocessing actually ran, so re-running
# the notebook with the "_output" file already on disk left the name undefined
# and later cells that read it failed with a NameError.
reviews_output_file = reviews_file + '_output'
if not os.path.exists(reviews_output_file):
    # Expensive step: only preprocess when the cached output is missing.
    reviews_output_file = dpa._reviews_preprocessing(reviews_file)

start reviews preprocessing...
Processed data in /recsys_data/RecSys/SASRec-tf2/data/reviews_Electronics_5.json_output


In [20]:
def _timestamp_sort_key(interaction):
    """Build a chronological sort key for an ``(item, timestamp)`` pair.

    Timestamps are read from the file as strings. Comparing them as strings
    orders "999999999" after "1000000000", so numeric timestamps are compared
    by value instead. Timestamps that are not numeric (or are NaN) sort after
    all numeric ones, in plain string order.
    """
    raw = interaction[1]
    try:
        value = float(raw)
    except ValueError:
        return (1, 0.0, raw)
    if value != value:  # NaN does not order consistently; treat as non-numeric
        return (1, 0.0, raw)
    # Empty third element keeps equal timestamps in their input order (stable sort).
    return (0, value, "")


def data_process_with_time(fname, pname, K=3, sep="\t", file_write=False, add_time=False):
    """Encode a ``user<sep>item<sep>timestamp`` file into integer user/item ids.

    Args:
        fname: Path of the input file; each non-blank line must hold exactly
            three ``sep``-separated fields: user, item, timestamp.
        pname: Path of the encoded output file. Only used when ``file_write``
            is true.
        K: Minimum number of interactions a user needs to receive an id.
        sep: Field separator for both the input and the output file.
        file_write: When true, write one ``user_id<sep>item_id`` line per
            interaction of every retained user to ``pname``.
        add_time: When true (and ``file_write`` is true), append the original
            timestamp string as a third column.

    Returns:
        ``(user_dict, item_dict, User)`` where ``user_dict`` maps each retained
        raw user to a 1-based id (in order of first appearance), ``item_dict``
        maps every raw item to a 1-based id (assigned in sorted item order so
        ids are reproducible between runs), and ``User`` maps every raw user
        read from the file to its list of ``(item, timestamp)`` tuples. The
        histories of retained users are sorted chronologically; users with
        fewer than ``K`` interactions stay in ``User`` in input order but get
        no id.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    User = defaultdict(list)
    Items = set()
    user_dict, item_dict = {}, {}

    with open(fname, 'r') as fr:
        for line_no, line in enumerate(fr, start=1):
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(sep)
            if len(fields) != 3:
                raise ValueError(
                    f"{fname}:{line_no}: expected 3 fields separated by {sep!r}, "
                    f"got {len(fields)}"
                )
            u, i, t = fields
            User[u].append((i, t))
            Items.add(i)

    print(len(User), len(Items))

    # Sort before numbering: set iteration order for strings changes between
    # interpreter runs, which would make the item ids irreproducible.
    for item_id, item in enumerate(sorted(Items), start=1):
        item_dict[item] = item_id

    count_del = 0
    fw = None
    if file_write:
        print(f"Writing data in {pname}")
        fw = open(pname, 'w')
    try:
        for user, history in User.items():
            if len(history) < K:
                count_del += 1
                continue
            # In-place sort so the returned history is chronological in both modes.
            history.sort(key=_timestamp_sort_key)
            user_id = len(user_dict) + 1
            user_dict[user] = user_id
            if fw is None:
                continue
            for item, timestamp in history:
                out_txt = [str(user_id), str(item_dict[item])]
                if add_time:
                    out_txt.append(str(timestamp))
                fw.write(sep.join(out_txt) + "\n")
    finally:
        if fw is not None:
            fw.close()

    print(len(user_dict), count_del)
    return user_dict, item_dict, User

In [12]:
# Encode the preprocessed reviews with the module-level helper, keeping the
# timestamp column (add_time=True) and using K=5.
# NOTE(review): this is `dpa.data_process_with_time`, not the notebook-local
# function of the same name — it takes an `item_set` argument and is unpacked
# into two values here, whereas the local definition returns three.
udict, idict = dpa.data_process_with_time(reviews_output_file,
                                      os.path.join(data_dir, encoded_file),
                                      K=5,
                                      sep="\t",
                                      item_set=None,
                                      add_time=True)
len(udict), len(idict)

Read 192403 users and 63001 items
Total 192403 users and 63001 items
Total 192403 users, 0 removed
Processed model input data in /recsys_data/RecSys/SASRec-tf2/data/ae_v3.txt


In [15]:
# Same module-level helper, now reading "ae_original.txt" with K=3.
# NOTE(review): the output path is the same `encoded_file` as the K=5 run, so
# this presumably replaces that file's contents — confirm this is intended.
udict, idict = dpa.data_process_with_time(os.path.join(data_dir, "ae_original.txt"),
                                          os.path.join(data_dir, encoded_file),
                                          K=3,
                                          sep="\t",
                                          item_set=None,
                                          add_time=True)
len(udict), len(idict)

Read 63161 users and 85930 items
27773 items have less than 3 interactions
47 users have less than 3 interactions
Total 63114 users and 58157 items
Total 63073 users, 41 removed
Processed model input data in /recsys_data/RecSys/SASRec-tf2/data/ae_v3.txt


(63073, 58157)

In [21]:
# Run the notebook-local `data_process_with_time` (three return values) and
# write the encoded file with timestamps.
# `user_history` contains every user read from the input, including those with
# fewer than K interactions, so its length is the pre-filter user count.
udict, idict, user_history = data_process_with_time(os.path.join(data_dir, "ae_original.txt"),
                                                    os.path.join(data_dir, encoded_file),
                                                    K=3,
                                                    sep="\t",
                                                    file_write=True,
                                                    add_time=True)
print(f"Retained {len(udict)} users with {len(idict)} items from {len(user_history)} users")

63161 85930
Writing data in /recsys_data/RecSys/SASRec-tf2/data/ae_v3.txt
63114 47
Retained 63114 users with 85930 items from 63161 users


In [24]:
# Load the encoded file; at this execution the result was bound to a single
# name and indexed by user id in the next cell.
# NOTE(review): a later cell unpacks six values from the same call, so the
# helper was presumably edited and auto-reloaded in between — this cell may
# not reproduce on a fresh kernel.
User = dpa.data_partition(os.path.join(data_dir, "ae_v3.txt"))

Preparing data...


In [26]:
User[1]

[[71865, 1276992000.0],
 [73699, 1295568000.0],
 [76752, 1305504000.0],
 [70038, 1339632000.0],
 [52031, 1369440000.0],
 [5655, 1370736000.0],
 [67712, 1370736000.0],
 [36497, 1382659200.0],
 [54084, 1390176000.0],
 [76563, 1390176000.0],
 [58972, 1390176000.0],
 [26213, 1390176000.0],
 [62645, 1390176000.0],
 [39023, 1392076800.0],
 [49569, 1393113600.0],
 [11443, 1395187200.0],
 [83584, 1395878400.0],
 [38275, 1403740800.0]]

In [29]:
# Split the encoded interactions into train/validation/test and get the
# user, item and time counts.
# NOTE(review): judging by the recorded `user_train[1]` output, items and
# timestamps appear to be re-indexed to small integers by the helper — confirm
# the exact time scaling in dpa.data_partition.
[user_train, user_valid, user_test, usernum, itemnum, timenum] = dpa.data_partition(os.path.join(data_dir, "ae_v3.txt"))

Preparing data...
Preparing done...


In [30]:
user_train[1]

[[33306, 1],
 [34165, 28],
 [35596, 42],
 [32497, 92],
 [24147, 135],
 [2640, 137],
 [31413, 137],
 [16854, 154],
 [25087, 165],
 [35505, 165],
 [27370, 165],
 [12196, 165],
 [29087, 165],
 [18026, 167],
 [23003, 169],
 [5337, 172]]

In [31]:
# Gap between the first two raw timestamps shown for user 1 above.
# Presumably Unix seconds, in which case 18576000 s is 215 days — confirm units.
1295568000 - 1276992000

18576000

In [32]:
import tensorflow as tf


In [41]:
# Toy model for inspecting Conv1D shapes: a (96, 6) sequence goes through a
# single convolution with 2 filters of width 2. The recorded summary reports an
# output of (None, 95, 2) and 26 parameters (kernel 2*6*2 plus 2 biases).
n_timesteps, n_features = 96, 6
in1 = tf.keras.Input(shape=(n_timesteps, n_features))
conv1 = tf.keras.layers.Conv1D(filters=2, kernel_size=2, strides=1)(in1)
model = tf.keras.Model(inputs=in1, outputs=conv1)

In [42]:
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 96, 6)]           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 95, 2)             26        
Total params: 26
Trainable params: 26
Non-trainable params: 0
_________________________________________________________________


In [43]:
model.weights

[<tf.Variable 'conv1d_2/kernel:0' shape=(2, 6, 2) dtype=float32, numpy=
 array([[[ 0.27043295, -0.43187496],
         [-0.473627  , -0.32877958],
         [ 0.16233104, -0.05000722],
         [ 0.26561534,  0.07231426],
         [-0.18085214,  0.58943754],
         [-0.39737538,  0.43327934]],
 
        [[ 0.4520176 , -0.3670225 ],
         [-0.45689282, -0.40269274],
         [ 0.00142598, -0.28955248],
         [ 0.33368218, -0.4050467 ],
         [-0.28567907,  0.04187793],
         [-0.04750389, -0.23195189]]], dtype=float32)>,
 <tf.Variable 'conv1d_2/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>]

In [None]:
# Mini-batch gradient descent update (notes, not executable code):
#   w = w - lr * grad
#   grad = average(grad_i), i = 1, ..., batch_size