## Build Knowledge Graph from Amazon Data

The idea is to create a graph where items are nodes, relations amongst the items are edges. The nodes would have features that can come from the textual description (later we can add images) of the items and target labels - that can be taken as the product category. Once we collect these information we can use some of the Graph Convolution Network (GCN) for node classification task that would result in node embeddings as a by-product.  

In [1]:
import os
import sys
import json
import re
import random
import copy
from tqdm import tqdm
import numpy as np
import pickle

from collections import defaultdict, Counter

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data_dir = "/recsys_data/RecSys/recommenders/tests/resources/deeprec/slirec"
meta_filename = 'meta_Electronics.json'
interaction_filename = "/recsys_data/RecSys/SASRec-tf2/data/ae_original.txt"
output_filename = "/recsys_data/RecSys/SASRec-tf2/data/ae_v2.txt"
dict_filename = "/recsys_data/RecSys/SASRec-tf2/data/ae_v2_dict.pkl"
text_embed_filename = "/recsys_data/RecSys/SASRec-tf2/data/ae_v2_text_embeddings.pkl"

# word embeddings
glove_dir = "/recsys_data/datasets/glove"
glove_file = "glove.6B.50d.txt"
embedding_dim = 50

### Gather all the relevant Items

While there are too many items in the meta data file we do not need all of them. So first we gather all the relevant items by going through the user interaction history (and do some filtering based on the frequency of interactions). We use K-filter where users and items with less than K interactions are filtered *in one pass*. This is different from K-core where items and users are filtered recursively till all of them have at most K-interactions. 

In [3]:
def data_process_with_time(fname, pname, sep="\t", file_write=False):
    User = defaultdict(list)
    Items = set()
    user_dict, item_dict = {}, {}

    with open(fname, 'r') as fr:
        for line in fr:
            u, i, t = line.rstrip().split(sep)
            User[u].append((i, t))
            Items.add(i)
    
    print(len(User), len(Items))
    item_count = 1
    for item in Items:
        item_dict[item] = item_count
        item_count += 1

    count_del = 0
    user_count = 1
    if file_write:
        print(f"Writing data in {pname}")
        with open(pname, 'w') as fw:
            for user in User.keys():
                if len(User[user]) <= 2:
#                     del User[user]
                    count_del += 1
                else:
                    # user_dict[user] = user_count
                    items = sorted(User[user], key=lambda x: x[1])
                    items = [item_dict[x[0]] for x in items]
                    for item in items:
                        fw.write(str(user_count) + ' ' + str(item) + '\n')
                    user_dict[user] = user_count
                    user_count += 1
    else:
        for user in User.keys():
            if len(User[user]) <= 2:
                # del User[user]
                count_del += 1
            else:
                User[user] = sorted(User[user], key=lambda x: x[1])
                user_dict[user] = user_count
                user_count += 1
        
    print(user_count-1, count_del)
    return user_dict, item_dict, User

In [4]:
write_file = False

# udict, idict, user_history = data_process_with_time(interaction_filename, "")
udict, idict, user_history = data_process_with_time(interaction_filename, output_filename, "\t", write_file)

if write_file:
    with open(dict_filename, 'wb') as handle:
        pickle.dump((udict, idict, user_history), handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Retained {len(udict)} users with {len(idict)} items from {len(user_history)} users")

63161 85930
63114 47
Retained 63114 users with 85930 items from 63161 users


In [7]:
os.getcwd()

'/recsys_data/RecSys/SASRec-tf2/notebooks'

In [5]:
all_keys = set()
all_vals = []
entity_dict = dict()
category_dict = dict()
count = 0
count_item = 0

# Note: item-id starts with 0 
with open(os.path.join(data_dir, meta_filename), 'r') as fr:
    for line in tqdm(fr.readlines()):
        jdict = eval(line)
        
        # keep items that are present in the user history
        # otherwise, the size becomes too big
        if jdict['asin'] not in idict:
            continue
#         jstr = line.strip()
#         jstr = re.sub(r"[0-9]\"", "", jstr)
#         jstr = re.sub(r"[^:]\s\"\w|\w\" ", " ", jstr)    
#         jstr = jstr.replace("\'", "\"")
#         jdict = json.loads(jstr)
        keys = jdict.keys()
        all_keys.update(keys)
#         entity_dict[jdict['asin']] = {'id': count}
#         if 'related' in jdict:
#             entity_dict[jdict['asin']]['related'] = jdict['related']
#         for cat in jdict['categories'][0]:
#             if cat not in category_dict:
#                 category_dict[cat] = []
#             category_dict[cat].append(jdict['asin'])
#         entity_dict[jdict['asin']]['categories'] = jdict['categories']
#         if 'categories' in jdict and ('description' in jdict and 'title' in jdict):
        jdict['id'] = count_item
        all_vals.append(jdict)
        count_item += 1
        entity_dict[jdict['asin']] = jdict
        count += 1
print(count, count_item)

100%|██████████| 498196/498196 [00:38<00:00, 12803.52it/s]


85930 85930


#### Sample Data

Note all items may not have description or title to create node features but they should have `categories` information for label

In [6]:
all_vals[1]

{'asin': '0594451647',
 'description': "HDTV Adapter Kit for NOOK HD and NOOK HD+\nThis handy kit enables you to stream content from your NOOK HD or NOOK HD+ to a high-definition TV, via the included adapter and High Speed HDMI Cable. The kit also includes a pass-through that allows you to charge your NOOK while streaming, so there's no danger of running out of battery power before you're done. (A compatible NOOK Power Kit is required for this function and is not included with the adapter kit.)",
 'title': 'Barnes &amp; Noble HDTV Adapter Kit for NOOK HD and NOOK HD+',
 'price': 49.95,
 'imUrl': 'http://ecx.images-amazon.com/images/I/51RjSETO23L._SX300_.jpg',
 'related': {'also_bought': ['B009L7EEZA',
   'B00AGAYQEU',
   'B00AGAS6XW',
   'B00BN1Q5JA',
   'B00AV1UWWY',
   'B00CPV9YOU',
   '1400699169',
   'B00AK2MHEU',
   'B00AFYBTSI',
   '0594481813',
   'B00AAVF0J0',
   '0594481902',
   'B00E9IAQ1C',
   'B00BKOWE6I',
   'B00AAKLZ6I',
   'B00AAKLUWC',
   'B007KZC6R8',
   'B00E9IKYKK',


### Create Node Features

We use the `title` and `description` of each item and convert them to a fixed length feature vector. We use Glove vectors for word embedding and average of word embeddings to represent a document (here title and description)

In [6]:
vocab_size = 5000
maxlen = 500
embedding_dim = 50

docs = []
for item in all_vals:
    doc_text = ''
    if 'title' in item:
        doc_text += item['title']
    if 'description' in item:
        doc_text += item['description']
    if len(doc_text) == 0:
        doc_text = 'not available'
    docs.append(doc_text)
print(f"Total {len(docs)} documents")

tokenizer = Tokenizer(num_words=vocab_size-1, lower=True, split=' ')  # 1 ... 4999
# tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ', oov_token='<OOV>')
tokenizer.fit_on_texts(docs)
print(f"Number of words found: {len(tokenizer.word_index)}")

vocab = [k for k,v in tokenizer.word_index.items() if v < vocab_size]  # 1 ... 4999
tensor = tokenizer.texts_to_sequences(docs)
tensor = pad_sequences(tensor, padding='post', maxlen=maxlen)
print(f"Tokenized each item description", tensor.shape)

Total 85930 documents
Number of words found: 260983
Tokenized each item description (85930, 500)


In [7]:
def create_embedding_matrix(filepath, word_index, embedding_dim, vocab_size):
    # vocab_size = len(word_index) + 1
    # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    all_words = set()
    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            all_words.add(word)
            if word in word_index:
                idx = word_index.index(word) + 1
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[
                    :embedding_dim
                ]
    count_missing = len(set(word_index) - all_words)
    if count_missing > 0:
        print(f"!!! {count_missing} words could not be mapped")
    return embedding_matrix, all_words


In [8]:
# create word embedding matrix - dimension = |V| X word-embedding dimension
embedding_matrix, glove_vocab = create_embedding_matrix(
    os.path.join(glove_dir, glove_file), vocab, embedding_dim, vocab_size)
embedding_matrix.shape

!!! 334 words could not be mapped


(5000, 50)

### Create Item Embedding Matrix

For each item we take all the word embeddings and take the average. We can also divide by sqrt(n) as was done by [?]

In [9]:
num_items, seq_len = tensor.shape
# big_tensor = np.zeros((num_items + 1, seq_len))
# big_tensor[1 : num_items + 1, :] = tensor
# del tensor
# num_items, seq_len

item_embeddings = np.zeros((num_items, embedding_matrix.shape[1]))
# item_embeddings = np.zeros((num_items + 1, embedding_matrix.shape[1]))

for item in tqdm(range(num_items)):
    word_indices = tensor[item, :]
    word_indices = [int(i) for i in word_indices if i != 0]
    if len(word_indices) > 0:
        word_vectors = embedding_matrix[word_indices, :]
        mean_vector = word_vectors.mean(axis=0)
        item_embeddings[item, :] = mean_vector
#     else:
#         print(f"Missing embedding for item-{item}")

print(f"Text based item embedding matrix", item_embeddings.shape)

100%|██████████| 85930/85930 [01:13<00:00, 1173.08it/s]

Text based item embedding matrix (85930, 50)





In [10]:
# write raw word embedding file - basic nodal features
with open(text_embed_filename, 'wb') as handle:
    pickle.dump(item_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)


### Write Node File

The file has all the node details, node-id, nodal features and class information. 

In [11]:
classes = []
class_index = 1
with open('ae_nodes.txt', 'w') as fw:
    for ii, item in tqdm(enumerate(all_vals)):
        out_txt = [item['id']] + list(item_embeddings[ii, :]) + [item['categories'][0][class_index]]
        out_txt = [str(e) for e in out_txt]
        fw.write('\t'.join(out_txt) + '\n')
        classes.append(item['categories'][0][class_index])


85930it [00:04, 18195.24it/s]


In [12]:
Counter(classes)

Counter({'GPS & Navigation': 1566,
         'Computers & Accessories': 37416,
         'eBook Readers & Accessories': 892,
         'Accessories & Supplies': 11932,
         'Camera & Photo': 15404,
         'Portable Audio & Video': 6996,
         'Car & Vehicle Electronics': 3819,
         'Home Audio': 2363,
         'Television & Video': 4029,
         'Security & Surveillance': 570,
         'Interior Accessories': 2,
         'Accessories': 457,
         'Cases': 243,
         'Electronics Warranties': 193,
         'Service & Replacement Plans': 21,
         'Tools & Equipment': 5,
         'C': 1,
         'Cell Phones': 10,
         'Luggage & Travel Gear': 4,
         'Hair Care': 1,
         'Connected Devices': 4,
         'Car Care': 2})

### Write Edge File

This file has three columns for each edge - starting node, ending node and edge type (if available)

In [14]:
count_edge = 0
criteria = ['bought_together', 'buy_after_viewing', 'also_bought'] # 'also_viewed', 'also_bought',
add_edge_type = True
with open('ae_edges.txt', 'w') as fw:
    for e in tqdm(entity_dict):
        if 'related' in entity_dict[e]:
            for criterion in criteria:
                if criterion in entity_dict[e]['related']:
                    connected = entity_dict[e]['related'][criterion]
                    for e2 in connected:
                        if e2 in entity_dict:
                            tuples = [entity_dict[e]['id'], entity_dict[e2]['id']]
                            if add_edge_type:
                                tuples.append(criterion)
                            out_txt = '\t'.join([str(e) for e in tuples])
                            fw.write(out_txt + '\n')
                            count_edge += 1

print(f"Total {count_edge} edges created")

100%|██████████| 85930/85930 [00:03<00:00, 23230.15it/s]


Total 1679007 edges created


### Run Graph Convolution Network Model

    - cluster-gcn-node-classification.ipynb for no edge type
    - rgcn-node-classification.ipynb for edges with edge type
    - load the node embeddings from a pickle file

In [1]:
import pickle

with open('ae_item_embeddings.pkl', 'rb') as handle:
    emb = pickle.load(handle)
    
emb.shape

(85930, 64)

In [28]:
inv_udict = {v:k for k, v in udict.items()}
inv_idict = {v:k for k, v in idict.items()}

In [30]:
inv_udict[3932]

'A2O2D7JV0QU630'

In [33]:
user_history['A2O2D7JV0QU630']

[('B00GTGETFG', '1400198400'),
 ('B009A5204K', '1393977600'),
 ('B000LRMS66', '1173484800'),
 ('B001BRXW74', '1393977600'),
 ('B003FVJYF8', '1390867200'),
 ('B001KZ6C74', '1278115200'),
 ('B000EXS1BS', '1173484800'),
 ('B004W2JKWG', '1318204800'),
 ('B00BUCLVZU', '1402099200'),
 ('B00004ZCJE', '1244937600'),
 ('B0001P4V36', '1162166400'),
 ('B000EG46O6', '1173484800'),
 ('B00003CWG2', '1209081600'),
 ('B005FW56WA', '1317340800'),
 ('B000DZDSME', '1178064000'),
 ('B000SKVZOM', '1232323200'),
 ('B0002EXJ8Y', '1145750400'),
 ('B0007KQWDC', '1123459200'),
 ('B00685423M', '1337904000'),
 ('B001IYS1QM', '1261612800'),
 ('B005TI1ILS', '1401580800'),
 ('B00065L5TE', '1185667200'),
 ('B002E3AIG2', '1269734400'),
 ('B000GWLL0K', '1173484800'),
 ('B00F6EL6O4', '1393977600'),
 ('B00006J07K', '1185667200'),
 ('B008TZJ126', '1376784000'),
 ('B00008VSLB', '1209081600'),
 ('B0002EXJ98', '1197849600'),
 ('B000COZ43C', '1170720000'),
 ('B001MKKA5C', '1278115200'),
 ('B007C8DWFI', '1362700800'),
 ('B009F

In [34]:
[idict[it[0]] for it in sorted(user_history['A2O2D7JV0QU630'], key=lambda x: x[1])]

[68787,
 81707,
 50744,
 62280,
 18720,
 68250,
 24776,
 890,
 38317,
 63560,
 48670,
 42709,
 83276,
 33038,
 47622,
 30823,
 62309,
 67597,
 50720,
 14088,
 69420,
 18548,
 69397,
 4945,
 38938,
 79255,
 50750,
 61332,
 18576,
 19084,
 83278,
 43810,
 26051]

In [12]:
items = [61528, 3052, 12906, 62823, 19661, 70994, 69609, 58134, 47850]
for it in reversed(items):
    print(inv_idict[it])

B003JTHMRS
B0030EV78M
B005LXNTT0
B004P616IU
B005UE894U
B00CB3E50U
B001AJJON4
B003960EYO
B0023SJCII


In [29]:
udict['A2STI6QRXQR2A9'], sorted(user_history['A2STI6QRXQR2A9'], key=lambda x: x[1])

(31227,
 [('B000BUK7KW', '1248912000'),
  ('B005LLS7WG', '1320451200'),
  ('B005O0R9UA', '1341446400'),
  ('B0038A9HSK', '1341964800'),
  ('B008DR5R1O', '1342483200'),
  ('B003VVH8Y6', '1344211200'),
  ('B005DKZTMG', '1353888000'),
  ('B0092HP3GS', '1365552000'),
  ('B006U3O566', '1369958400'),
  ('B00005T3FS', '1369958400'),
  ('B0090J652Y', '1375488000'),
  ('B002RL9XQM', '1376438400'),
  ('B004GJURHM', '1381190400'),
  ('B005B3VO24', '1382054400'),
  ('B008U69DDG', '1389916800'),
  ('B00BUCLVZU', '1389916800'),
  ('B0042X8W0Q', '1389916800'),
  ('B00DC8IEE6', '1389916800'),
  ('B008C1JC4O', '1393200000'),
  ('B00DBA9YD0', '1393200000'),
  ('B008DWIFP4', '1404345600'),
  ('B004HD4L2E', '1404691200')])

### Feature Creation Differences

    - original data (33 interactions)
            1       A2O2D7JV0QU630  B0007KQWDC      1123459200      Point & Shoot Digital Cameras
            1       A2O2D7JV0QU630  B0002EXJ8Y      1145750400      Remote-Control Extenders
            1       A2O2D7JV0QU630  B0001P4V36      1162166400      Camera
            1       A2O2D7JV0QU630  B000COZ43C      1170720000      Camera Batteries
            1       A2O2D7JV0QU630  B000LRMS66      1173484800      Vehicle Mounts
            1       A2O2D7JV0QU630  B000EXS1BS      1173484800      Vehicle GPS
            1       A2O2D7JV0QU630  B000EG46O6      1173484800      Digital Camera Lenses
            1       A2O2D7JV0QU630  B000GWLL0K      1173484800      MP3 Player Accessories
            1       A2O2D7JV0QU630  B000DZDSME      1178064000      Point & Shoot Digital Cameras
            1       A2O2D7JV0QU630  B00065L5TE      1185667200      Camera
            1       A2O2D7JV0QU630  B00006J07K      1185667200      Binocular, Camera & Camcorder Straps
            1       A2O2D7JV0QU630  B0002EXJ98      1197849600      Component Receivers
            1       A2O2D7JV0QU630  B00003CWG2      1209081600      Splitters
            1       A2O2D7JV0QU630  B00008VSLB      1209081600      Splitters
            1       A2O2D7JV0QU630  B000SKVZOM      1232323200      Shoe Mount Flashes
            1       A2O2D7JV0QU630  B00004ZCJE      1244937600      Skylight & UV Filters
            1       A2O2D7JV0QU630  B001IYS1QM      1261612800      USB Flash Drives
            1       A2O2D7JV0QU630  B002E3AIG2      1269734400      Routers
            1       A2O2D7JV0QU630  B001KZ6C74      1278115200      Camcorder
            1       A2O2D7JV0QU630  B001MKKA5C      1278115200      Vehicle Mounts
            1       A2O2D7JV0QU630  B005FW56WA      1317340800      Cases
            1       A2O2D7JV0QU630  B004W2JKWG      1318204800      Internal Solid State Drives
            1       A2O2D7JV0QU630  B00685423M      1337904000      Print Servers
            1       A2O2D7JV0QU630  B007C8DWFI      1362700800      Vehicle Mounts
            1       A2O2D7JV0QU630  B009FN72PO      1362700800      Splitters
            1       A2O2D7JV0QU630  B008TZJ126      1376784000      Streaming Media Players
            1       A2O2D7JV0QU630  B003FVJYF8      1390867200      F-Pin-Coaxial Tip
            1       A2O2D7JV0QU630  B009A5204K      1393977600      Bluetooth Headsets
            1       A2O2D7JV0QU630  B001BRXW74      1393977600      TV Antennas
            1       A2O2D7JV0QU630  B00F6EL6O4      1393977600      USB Network Adapters
            1       A2O2D7JV0QU630  B00GTGETFG      1400198400      Travel Chargers
            1       A2O2D7JV0QU630  B005TI1ILS      1401580800      Component Receivers
            1       A2O2D7JV0QU630  B00BUCLVZU      1402099200      Streaming Media Players
    
    - processed data (sub-sampled)
    train   1       A2O2D7JV0QU630  B000LRMS66      1173484800      Vehicle Mounts
    train   1       A2O2D7JV0QU630  B000EXS1BS      1173484800      Vehicle GPS
    train   1       A2O2D7JV0QU630  B0002EXJ98      1197849600      Component Receivers
    train   1       A2O2D7JV0QU630  B00004ZCJE      1244937600      Skylight & UV Filters
    train   1       A2O2D7JV0QU630  B001IYS1QM      1261612800      USB Flash Drives
    train   1       A2O2D7JV0QU630  B004W2JKWG      1318204800      Internal Solid State Drives
    train   1       A2O2D7JV0QU630  B003FVJYF8      1390867200      F-Pin-Coaxial Tip
    train   1       A2O2D7JV0QU630  B009A5204K      1393977600      Bluetooth Headsets
    train   1       A2O2D7JV0QU630  B001BRXW74      1393977600      TV Antennas
    train   1       A2O2D7JV0QU630  B00GTGETFG      1400198400      Travel Chargers
    valid   1       A2O2D7JV0QU630  B005TI1ILS      1401580800      Component Receivers
    test    1       A2O2D7JV0QU630  B00BUCLVZU      1402099200      Streaming Media Players

    - sli-rec type:
        - train_data
            1       A2O2D7JV0QU630  B000EXS1BS      Vehicle GPS     1173484800      B000LRMS66      Vehicle Mounts  1173484800
            1       A2O2D7JV0QU630  B0002EXJ98      Component Receivers     1197849600      B000LRMS66,B000EXS1BS   Vehicle Mounts,Vehicle GPS      1173484800,1173484800
            1       A2O2D7JV0QU630  B00004ZCJE      Skylight & UV Filters   1244937600      B000LRMS66,B000EXS1BS,B0002EXJ98        Vehicle Mounts,Vehicle GPS,Component Receivers  1173484800,1173484800,1197849600
            1       A2O2D7JV0QU630  B001IYS1QM      USB Flash Drives        1261612800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE     Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters    1173484800,1173484800,1197849600,1244937600
            1       A2O2D7JV0QU630  B004W2JKWG      Internal Solid State Drives     1318204800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM  Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives   1173484800,1173484800,1197849600,1244937600,1261612800
            1       A2O2D7JV0QU630  B003FVJYF8      F-Pin-Coaxial Tip       1390867200      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG       Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives       1173484800,1173484800,1197849600,1244937600,1261612800,1318204800
            1       A2O2D7JV0QU630  B009A5204K      Bluetooth Headsets      1393977600      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8    Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip     1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200
            1       A2O2D7JV0QU630  B001BRXW74      TV Antennas     1393977600      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets  1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600
            1       A2O2D7JV0QU630  B00GTGETFG      Travel Chargers 1400198400      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74      Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600

    valid_data
            1       A2O2D7JV0QU630  B005TI1ILS      Component Receivers     1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B001L1H0SC      Micro SD Cards  1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B000T95Q1U      Adapter Rings   1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B00BE68UZ6      Cases   1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B009WZRAQQ      Cases   1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B0018Z2Q2G      Headphones      1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B007KFLV9W      Weather Radios  1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B0001D3K8A      Routers 1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B002UT42UI      SD & SDHC Cards 1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400
            0       A2O2D7JV0QU630  B003FVVMS0      Subwoofer Cables        1401580800      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG   Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers      1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400

        - test data
            1       A2O2D7JV0QU630  B00BUCLVZU      Streaming Media Players 1402099200      B000LRMS66,B000EXS1BS,B0002EXJ98,B00004ZCJE,B001IYS1QM,B004W2JKWG,B003FVJYF8,B009A5204K,B001BRXW74,B00GTGETFG,B005TI1ILS        Vehicle Mounts,Vehicle GPS,Component Receivers,Skylight & UV Filters,USB Flash Drives,Internal Solid State Drives,F-Pin-Coaxial Tip,Bluetooth Headsets,TV Antennas,Travel Chargers,Component Receivers  1173484800,1173484800,1197849600,1244937600,1261612800,1318204800,1390867200,1393977600,1393977600,1400198400,1401580800


### SASRec Feature Creation

sorted user history: [68787, 81707, 50744, 62280, 18720, 68250, 24776, 890, 38317, 63560, 48670,
 42709, 83276, 33038, 47622, 30823, 62309, 67597, 50720, 14088, 69420, 18548, 69397, 4945, 38938,
 79255, 50750, 61332, 18576, 19084, 83278,
 `43810`,
 `26051`] where the last two are kept for validation and test, espectively.

user-id: 3932

*seq*: [0 ... 0, 68787 81707 50744 62280 18720 68250 24776   890 38317 63560
 48670 42709 83276 33038 47622 30823 62309 67597 50720 14088 69420 18548
 69397  4945 38938 79255 50750 61332 18576 19084]
 
*pos*: [0 ... 0, 81707 50744 62280 18720 68250 24776   890 38317 63560 48670
 42709 83276 33038 47622 30823 62309 67597 50720 14088 69420 18548 69397
  4945 38938 79255 50750 61332 18576 19084 83278]
  
*neg*: [0 ... 0, 35405 14579 57151 76896 66003 47718 36914 64425 12544 33579
 36497 39168 37315 77772 11931 30812 18501  2028 29749 68490 79277  4844
 78272 76118 31696 85199 51452 55959  3604 15751]


In [4]:
# for cat in category_dict:
#     ratio = len(category_dict[cat])/count
#     if ratio > 0.1:
#         print(cat, len(category_dict[cat]), ratio)
        
# for cat in category_dict:
#     if 10 < len(category_dict[cat]) < 100:
#         print(cat, len(category_dict[cat]))

In [7]:
def write_all_files(out_dir, categories, entities, max_examples=None):
    include_categories = []
    for cat in categories:
        ratio = len(categories[cat])/count
#         if ratio > cutoff and ratio < 0.8:
        if 10 < len(category_dict[cat]) < 100:
            include_categories.append(cat)
    
    relations = copy.deepcopy(include_categories)
    include_categories.extend(['also_viewed', 'buy_after_viewing'])
    print(f"Added {len(include_categories)} relations")
        
    # entity2id.txt
    with open(os.path.join(out_dir, 'AE', 'entity2id.txt'), 'w') as fw:
        fw.write(str(len(entities)) + '\n')
        for e in entities:
            fw.write(e + '\t' + str(entities[e]['id']) + '\n')
    
    # relation2id.txt
    with open(os.path.join(out_dir, 'AE', 'relation2id.txt'), 'w') as fw:
        fw.write(str(len(include_categories)) + '\n')
        for ii, r in enumerate(include_categories):
            fw.write(r + '\t' + str(ii) + '\n')
    
    # train2id.txt
    train, valid, test = [], [], []
    count_train, count_valid, count_test = 0, 0, 0

    for jj, cat in enumerate(relations):
        similar_entities = categories[cat]
        print(cat, len(similar_entities))
        for e1 in similar_entities:
            for e2 in similar_entities:
                if e1 != e2:
                    tuples = (entities[e1]['id'], entities[e2]['id'], jj)
                    rnd = random.random()
                    if rnd <= 0.7:
                        train.append(tuples)
                        count_train += 1
                    elif rnd <= 0.85:
                        valid.append(tuples)
                        count_valid += 1
                    else:
                        test.append(tuples)
                        count_test += 1
    
    print(f"Train: {count_train}, Validation: {count_valid} and Test: {count_test}")
    
    for e in entities:
        if 'related' in entities[e]:
            if 'also_viewed' in entities[e]['related']:
                connected = entities[e]['related']['also_viewed']
                jj = include_categories.index('also_viewed')
                for e2 in connected:
                    if e2 in entities:
                        tuples = (entities[e]['id'], entities[e2]['id'], jj)
                        rnd = random.random()
                        if rnd <= 0.7:
                            train.append(tuples)
                            count_train += 1
                        elif rnd <= 0.85:
                            valid.append(tuples)
                            count_valid += 1
                        else:
                            test.append(tuples)
                            count_test += 1
                            
            if 'buy_after_viewing' in entities[e]['related']:
                connected = entities[e]['related']['buy_after_viewing']
                jj = include_categories.index('buy_after_viewing')
                for e2 in connected:
                    if e2 in entities:
                        tuples = (entities[e]['id'], entities[e2]['id'], jj)
                        rnd = random.random()
                        if rnd <= 0.7:
                            train.append(tuples)
                            count_train += 1
                        elif rnd <= 0.85:
                            valid.append(tuples)
                            count_valid += 1
                        else:
                            test.append(tuples)
                            count_test += 1
    
    print(f"Train: {count_train}, Validation: {count_valid} and Test: {count_test}")
    
    random.shuffle(train)
    random.shuffle(valid)
    random.shuffle(test)
    
    if max_examples:
        train = train[:max_examples]
        valid = valid[:max_examples]
        test = test[:max_examples]
    
    with open(os.path.join(out_dir, 'AE', 'train2id.txt'), 'w') as fw:
        fw.write(str(len(train)) + '\n')
        for tup in train:
            out = [str(e) for e in tup]
            fw.write(' '.join(out) + '\n')
        
    with open(os.path.join(out_dir, 'AE', 'valid2id.txt'), 'w') as fw:
        fw.write(str(len(valid)) + '\n')
        for tup in valid:
            out = [str(e) for e in tup]
            fw.write(' '.join(out) + '\n')
        
    with open(os.path.join(out_dir, 'AE', 'test2id.txt'), 'w') as fw:
        fw.write(str(len(test)) + '\n')
        for tup in test:
            out = [str(e) for e in tup]
            fw.write(' '.join(out) + '\n')
    


In [8]:
write_all_files(out_dir='/recsys_data/RecSys/OpenKE/benchmarks/',
                categories=category_dict,
                entities=entity_dict,
                max_examples=10000
               )

Added 219 relations
Stylus Pens 63
USB Fans 91
VHS 72
Hi-8 45
Color Correction & Compensation Filters 94
APS Cameras 59
VHS-C 23
Systems 24
Wireless Jack Systems 25
VCR Rewinders 32
Caller ID Displays 89
Cable Straps 64
PS/2 Cables 50
SCSI Cables 57
Zip Discs 45
Minidisc Players 51
External Zip Drives 52
Internal Modems 84
Tape Decks 57
Minidiscs 32
Security Locks 72
Automotive 44
CD Recorders 55
Tuners 95
CD-RW Discs 77
SmartMedia Cards 36
Radar Systems 14
Changers 46
Handheld CB Radios 11
Catadioptric 84
Barlow Lenses 29
Telescope Cases 36
Photo Adapters 27
Tape Libraries 19
Multimedia Cards 48
Parallel Port Cards 59
DVD-RAM Discs 53
Bluetooth Headsets 99
Holsters & Clips 24
VTR Decks 17
Cassette Storage 21
Projectors 36
Slide Projectors 35
Single-Use Cameras 99
Presentation Supplies 32
Binding Machine Supplies 13
Calculator Accessories 93
Tabletop & Travel Tripods 22
TV Turntables 44
Port Tubes 22
Dew Caps 37
Fixed-mount CB Radios 74
Binocular Accessories 56
Clock Radios 61
Mounting

In [127]:
'0545016266' in entity_dict

False

In [85]:
entity_dict['BT008T2BGK']

{'id': 498195,
 'related': {'also_viewed': ['B0007T27H8', 'B00425S1H8', 'B000BI95W0'],
  'buy_after_viewing': ['B0007T27H8', 'B00425S1H8']}}

In [8]:
desc_dict = dict()
count_missing = 0
with open(os.path.join(data_dir, filename), 'r') as fr:
    for line in fr.readlines():
        jdict = eval(line)
        if 'description' not in jdict:
            count_missing += 1
        else:
            desc_dict[jdict['asin']] = jdict['description']
print(count_missing)

38726


In [9]:
img_dict = dict()
count_missing = 0
with open(os.path.join(data_dir, filename), 'r') as fr:
    for line in fr.readlines():
        jdict = eval(line)
        if 'imUrl' not in jdict:
            count_missing += 1
        else:
            img_dict[jdict['asin']] = jdict['imUrl']
print(count_missing)

175


In [7]:
all_keys

{'asin',
 'brand',
 'categories',
 'description',
 'imUrl',
 'price',
 'related',
 'salesRank',
 'title'}

In [47]:
import re
ss = 'Size: 2.875"(W) x 8.5"(L) x 1.125"(D)'
# ss.replace(r"[0-9]\"", '')
re.sub(r"[0-9]\"", "", ss)

#
ss = '"key": "HDTV" accented by a subtle "nook" logo'
re.sub(r"[^:]\s\"\w|\w\" ", " ", ss)

'Size: 2.87(W) x 8.(L) x 1.12(D)'

(63114, 85930)

In [16]:
inv_idict[1]

'B0013J5XZE'

In [17]:
entity_dict['B0013J5XZE']

{'id': 95536,
 'related': {'also_bought': ['B000XB9GV0',
   'B003HFB9B4',
   'B003U8YAWI',
   'B0028ZCXQ4',
   'B000XB9GUG',
   'B00494P6AW',
   'B006LZQ6O2',
   'B000XB4412',
   'B001ENW61I',
   'B00009UTL1',
   '032171105X',
   'B004BFZHO4',
   'B003HF3A34',
   'B003UOIMAS',
   'B00452V288',
   'B001M4HXB2',
   'B0009BX0AM',
   'B0028ZETAC',
   'B0083Z6R4W',
   'B00009UTZA',
   'B001OKBLEE',
   'B004BFXBXI',
   'B001HAER88',
   'B00DNADZME',
   'B007FH1KX2',
   'B000CLNHXY',
   'B003OU51LG',
   'B003GR6IUK',
   'B0090BSSZO',
   'B0007DDK7A',
   'B00BXA7N6A',
   'B002XUN2AA',
   'B003N8SKDU',
   'B007FH1LJU',
   'B002YWHSVG',
   'B003VBO4U2',
   'B004YG7JXW',
   'B003TYDBYQ',
   'B001DIG46M',
   'B003CRZFT4',
   'B006ISEJ9Q',
   '0321832752',
   'B00HO0BYQS',
   'B001E4LC6C',
   '0321580141',
   'B005GMWEI8',
   'B001M4HTAW',
   'B002Z3H36O',
   'B004CBTCFC',
   'B004YG7JV4',
   'B00AA478UQ',
   'B0034WR19Y',
   'B00009UTL9',
   'B0079M711S',
   'B001TANZ0W',
   'B00270VQTK',
   'B00B

In [18]:
all_keys

{'asin',
 'brand',
 'categories',
 'description',
 'imUrl',
 'price',
 'related',
 'salesRank',
 'title'}

## Find Items with Title & Descriptions

In [21]:
ddict = {}
with open(os.path.join(data_dir, filename), 'r') as fr:
    for line in fr.readlines():
        jdict = eval(line)
        if jdict['asin'] in idict:
            ddict[jdict['asin']] = {}
            if 'title' in jdict.keys():
                ddict[jdict['asin']]['title'] = jdict['title']
            if 'description' in jdict.keys():
                ddict[jdict['asin']]['description'] = jdict['description']

print(len(ddict))

85930


In [22]:
ddict['B0013J5XZE']

{'title': 'Honl Photo Professional Color Correction Gel Filter Kit for Photo Speed System',
 'description': 'Use Honl Color Correction filters to color correct your strobes for tungsten, fluorescent or cool daylight lighting environments. Use CTO filters to match your flash output to tungsten lights, or simply to give a pleasing warm tone to your portraits. Use Full + Green to match your flash output to fluorescent lights. These superior filters are pre-cut with velcro strips attached so they can be quickly fastened to a Speed Strap (sold separately). Honl Filters can be used in conjunction with Honl Snoots, Gobos and Grids. The usable filter area is 2.5" x 4" and can be trimmed to fit individual strobes'}

In [23]:
count = 0
for item in ddict:
    if len(ddict[item]) == 0:
        count += 1
print(count)

155


In [32]:
def nwords(txt, n):
    return ' '.join(txt.split()[:n])

In [33]:
keep_words = 100
max_len = 0
with open('/recsys_data/RecSys/SASRec-tf2/data/ae_item_description.txt', 'w') as fw:
    for item_number in range(1, 85931):
        item_name = inv_idict[item_number]
        temp_dict = ddict[item_name]
        title, desc = '', ''
        if 'title' in temp_dict:
            title = temp_dict['title'].replace('\n', '')
        if 'description' in temp_dict:
            desc = temp_dict['description'].replace('\n', '')
        otxt = title + ' ' + desc
        if len(otxt.split()) > max_len:
            max_len = len(otxt.split())
        otxt = nwords(otxt, keep_words)
        fw.write(otxt + '\n')
print(max_len)

18060


## Find Items with Images

In [34]:
imgdict = {}
with open(os.path.join(data_dir, filename), 'r') as fr:
    for line in fr.readlines():
        jdict = eval(line)
        if jdict['asin'] in idict:
            imgdict[jdict['asin']] = {}
            if 'imUrl' in jdict.keys():
                imgdict[jdict['asin']]['imUrl'] = jdict['imUrl']

print(len(imgdict))

85930


In [35]:
# Items with no image information
count = 0
for item in imgdict:
    if len(imgdict[item]) == 0:
        count += 1
print(count)

58


In [36]:
imgdict['B0013J5XZE']

{'imUrl': 'http://ecx.images-amazon.com/images/I/2135RN5gz3L.jpg'}

In [43]:
from os.path import basename
import urllib

item_name = 'B0013J5XZE'
imgUrl = imgdict[item_name]['imUrl']
imgData = urllib.request.urlopen(imgUrl).read()
fileName = item_name + '_' + imgUrl.split('/')[-1]
print(filename)
output = open(fileName,'wb')
output.write(imgData)
output.close()