In [1]:
import os
import time
import argparse
import tensorflow as tf
from tqdm import tqdm
import pickle
import sys
from sklearn.neighbors import DistanceMetric

sys.path.insert(0, "../")
from sasrec import SASREC
from sasrec_text import SASREC_TEXT
from ssept import SSEPT
from util import *


In [30]:
def text_processing(dataset, text_maxlen, vocab_size, text_embed,
                    data_dir='../data/',
                    glove_dir="/recsys_data/datasets/glove",
                    glove_file='glove.6B.50d.txt'):
    """Build one text embedding per item by averaging the word vectors of its description.

    Reads ``<data_dir>/<dataset>_item_description.txt`` (one description per line;
    line ``i`` is used as item ``i + 1``), tokenizes every line, pads/truncates the
    token ids to ``text_maxlen`` and averages the rows of the word-embedding matrix
    selected by the non-zero token ids of each item.

    Args:
        dataset: Dataset name, used as the prefix of the description file name.
        text_maxlen: Number of token ids kept per description.
        vocab_size: Vocabulary budget given to the tokenizer and to
            ``create_embedding_matrix``.
        text_embed: Embedding dimension passed to ``create_embedding_matrix``.
        data_dir: Directory holding the description file.
        glove_dir: Directory holding the pre-trained word-vector file.
        glove_file: File name of the pre-trained word vectors.

    Returns:
        Tuple ``(item_embeddings, embedding_matrix, docs)``:
        ``item_embeddings`` has ``num_items + 1`` rows, row 0 being an all-zero
        padding row; ``embedding_matrix`` is whatever ``create_embedding_matrix``
        returned as its first value; ``docs`` is the list of raw description lines.
        Items whose description yields no token ids keep an all-zero row and are
        reported on stdout.
    """
    print("Processing for textual features")
    description_path = os.path.join(data_dir, dataset + "_item_description.txt")
    with open(description_path, 'r') as fr:
        docs = fr.readlines()

    # NOTE(review): if `Tokenizer` is the Keras tokenizer, num_words=vocab_size-1
    # keeps token ids 1..vocab_size-2, while `vocab` below keeps ids
    # 1..vocab_size-1 -- confirm that this one-word difference is intended.
    tokenizer = Tokenizer(num_words=vocab_size-1, lower=True, split=' ')
    tokenizer.fit_on_texts(docs)
    print(f"Number of words found: {len(tokenizer.word_index)}")
    vocab = [word for word, index in tokenizer.word_index.items() if index < vocab_size]
    tensor = tokenizer.texts_to_sequences(docs)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=text_maxlen)
    print("Tokenized each item description", tensor.shape)

    # Prepend a zero row so that row index == item id (id 0 is padding).
    # The integer dtype is preserved so the ids can index the embedding matrix directly.
    num_items, seq_len = tensor.shape
    big_tensor = np.zeros((num_items+1, seq_len), dtype=tensor.dtype)
    big_tensor[1:, :] = tensor

    # The second return value of create_embedding_matrix is not needed here.
    embedding_matrix, _ = create_embedding_matrix(os.path.join(glove_dir, glove_file),
                                                  vocab,
                                                  text_embed,
                                                  vocab_size)

    item_embeddings = np.zeros((num_items+1, embedding_matrix.shape[1]))
    for item in tqdm(range(1, num_items+1)):
        token_ids = big_tensor[item, :]
        word_indices = token_ids[token_ids != 0]  # 0 is the padding id
        if len(word_indices) > 0:
            item_embeddings[item, :] = embedding_matrix[word_indices, :].mean(axis=0)
        else:
            # No usable token: the item keeps its all-zero embedding row.
            print(f"Missing embedding for item-{item}")
            print(f"{item}-text: {docs[item-1]}")

    print("Text based item embedding matrix", item_embeddings.shape)
    return item_embeddings, embedding_matrix, docs


In [3]:
# Dataset and text-feature settings
dataset = 'Beauty'
itemnum = 67310
text_maxlen = 100
vocab_size = 5000
text_embed = 50

# Sequence-model hyper-parameters
maxlen = 50
num_blocks = 2
hidden_units = 100
num_heads = 1
dropout_rate = 0.5
l2_emb = 0.0

# Evaluation setting
num_neg_test = 100

In [31]:
# text_processing returns (item_embeddings, embedding_matrix, docs): the per-item
# mean text embeddings, the word-embedding matrix and the raw description lines.
embed_matrix, word_embedddings, docs = text_processing(dataset, text_maxlen, vocab_size, text_embed)

Processing for textual features
Number of words found: 68940
Tokenized each item description (67310, 100)


  2%|▏         | 1281/67310 [00:00<00:05, 12799.97it/s]

!!! 239 words could not be mapped
Missing embedding for item-929
929-text: Revivasol



  9%|▉         | 6269/67310 [00:00<00:04, 12599.17it/s]

Missing embedding for item-4107
4107-text: Raydiant



 44%|████▍     | 29520/67310 [00:02<00:02, 12662.69it/s]

Missing embedding for item-27989
27989-text: Accentus: Transcriptions

Missing embedding for item-29688
29688-text: Euc-4338



 52%|█████▏    | 34670/67310 [00:02<00:02, 12827.27it/s]

Missing embedding for item-33371
33371-text: Mavala Scientifique



 79%|███████▉  | 53024/67310 [00:04<00:01, 13215.50it/s]

Missing embedding for item-50757
50757-text: Ponybun



100%|██████████| 67310/67310 [00:05<00:00, 12848.77it/s]

Missing embedding for item-65491
65491-text: Magick

Text based item embedding matrix (67311, 50)





In [5]:
# Shapes of the item-level embeddings and of the word-embedding matrix.
print(embed_matrix.shape, word_embedddings.shape)

(67311, 50) (5000, 50)


In [6]:
# Sanity check: value range of the item-level text embeddings.
embed_matrix.min(), embed_matrix.max()

(-2.4820001125335693, 3.955150008201599)

In [7]:
# Sanity check: value range of the word-embedding matrix.
word_embedddings.min(), word_embedddings.max()

(-2.940700054168701, 4.365699768066406)

In [8]:
# Dense pairwise Euclidean distances between all rows of embed_matrix.
# NOTE(review): the result is n x n; with the 67311 rows recorded for embed_matrix
# earlier that is ~4.5e9 entries (about 36 GB if float64 -- TODO confirm dtype).
# Row 0 (the all-zero padding row added in text_processing) is included as well.
dist = DistanceMetric.get_metric('euclidean')
dm = dist.pairwise(embed_matrix)

In [17]:
# For every row of the distance matrix `dm`, store the ids of its K nearest
# other items (closest first).
K = 5
# Item ids are integers, so keep them in an integer array.
similar_items = np.zeros((itemnum+1, K), dtype=np.int64)
for ii in tqdm(range(dm.shape[0])):
    vec = dm[ii, :]
    # Take K+2 candidates so that K remain after dropping the item itself and
    # the padding row.
    indx = np.argsort(vec)[:(K+2)]
    # Row 0 is the all-zero padding row built in text_processing, not a real item;
    # it sits at distance 0 from every item whose embedding is missing, so it
    # must not be reported as a neighbour.
    similar_items[ii, :] = [int(jj) for jj in indx if jj != ii and jj != 0][:K]
    

100%|██████████| 67311/67311 [06:25<00:00, 174.47it/s]


In [22]:
def print_similar_items(item, similar_items, docs):
    """Print the description of ``item`` followed by those of its stored neighbours.

    Item ids are 1-based while ``docs`` is 0-based, so item ``i`` maps to
    ``docs[i - 1]``. Id 0 is the padding id in this notebook and has no description.

    Args:
        item: 1-based item id; also the row index into ``similar_items``.
        similar_items: 2-D array whose row ``item`` holds neighbour item ids.
        docs: Sequence of item descriptions.

    Raises:
        ValueError: If ``item`` is not in ``1..len(docs)``.
    """
    # Without this check item 0 (or a negative id) would wrap around and silently
    # print a description from the end of `docs`.
    if not 1 <= item <= len(docs):
        raise ValueError(f"item must be in 1..{len(docs)}, got {item}")

    sitems = similar_items[item, :]
    print(docs[item-1])
    print("*************similar items**************")
    for it in sitems:
        neighbour = int(it)
        # Skip padding ids: docs[0 - 1] would print the last description instead.
        if neighbour < 1:
            continue
        print(docs[neighbour-1])

In [32]:
# Spot-check: description of item 20000 and of the neighbours stored for it.
print_similar_items(20000, similar_items, docs)

c. Booth derma M 36 Oxygen Infusion Cell Rejuvenation Peel Kit 1 kit Oxygen Infusion Cell Rejuvenation Peel KitA powerful Two-Step SystemErases dead cells so skin can breatheReverses sun damage and premature agingImmediate, breath-taking resultsYou feel sluggish without enough oxygen to breathe. So does your skin. So peel away the dead, dry, flaky skin cells that deprive your skin of oxygen. When every pore breathes freely, skin comes alive with renewed vibrancy. Fine lines virtually disappear. Sun damage is reversed. Breakouts? A thing of the past. Best of all, results are immediate and remarkable. Because a little oxygen goes a

*************similar items**************
Yes To Tomatoes Skin Clearing Facial Mask, 1.7 Fluid Ounce A skin clearing facial mask that not only absorbs sebum buildup and keeps pores clear, but also contains dead sea mud to exfoliate your skin and encourage a shine-free appearance. Our deep pore treatment contains the potent antioxidant lycopene from organic tom

In [29]:
# Raw descriptions of items 929, 4107 and 27989 (docs is 0-based, hence id - 1);
# these ids were reported as "Missing embedding" by text_processing.
docs[928], docs[4106], docs[27988]

('Revivasol\n', 'Raydiant\n', 'Accentus: Transcriptions\n')

In [33]:
import requests 

def download_url(url, save_path, chunk_size=128):
    """Stream the resource at ``url`` into the file ``save_path``.

    Args:
        url: Address to download.
        save_path: Destination file path; opened in binary write mode.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            destination file is not opened in that case, so an error page is
            never saved as if it were the payload.
    """
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

In [35]:
# Fetch the Reddit archive for GraphSAGE.
# NOTE(review): the destination is a hard-coded absolute path and the URL is plain
# http; both are environment-specific -- consider moving them to the config cell.
download_url(url='http://snap.stanford.edu/graphsage/reddit.zip', 
             save_path='/recsys_data/GraphSAGE/example_data/reddit.zip', chunk_size=128)

In [7]:
# Instantiate the text-aware model from the settings defined above, using the
# pre-computed item text embeddings. The token-level text inputs
# (max_seq_len_text, vocab_size, item_text_sequences) are intentionally not passed.
model = SASREC_TEXT(
    item_num=itemnum,
    seq_max_len=maxlen,
    num_blocks=num_blocks,
    embedding_dim=hidden_units,
    attention_dim=hidden_units,
    attention_num_heads=num_heads,
    dropout_rate=dropout_rate,
    l2_reg=l2_emb,
    num_neg_test=num_neg_test,
    text_embedding_dimension=text_embed,
    item_text_embedding_matrix=embed_matrix,
)

In [8]:
# Load the saved input batch.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
with open('../sample_nan.pkl', 'rb') as sample_file:
    inputs = pickle.load(sample_file)

# Restore the model weights; the returned load-status object is the cell output.
model.load_weights('../checkpoints/my_checkpoint')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fabb7e99c90>

In [9]:
# Inspect the restored variables; the recorded output shows (50, 100) variables
# filled with NaN.
model.weights

[<tf.Variable 'Variable:0' shape=(50, 100) dtype=float32, numpy=
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(50, 100) dtype=float32, numpy=
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(50, 100) dtype=float32, numpy=
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [n

In [14]:
# List every top-level layer of the model as "<name> <object repr>".
for sub_layer in model.layers:
    print(sub_layer.name, sub_layer)

embedding <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f9433ad0690>
item_embeddings <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f94302a6410>
positional_embeddings <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f94306a37d0>
dropout <tensorflow.python.keras.layers.core.Dropout object at 0x7f9433ac0b90>
encoder <sasrec_text.Encoder object at 0x7f9433ac0610>
masking <tensorflow.python.keras.layers.core.Masking object at 0x7f9433ac0f50>
layer_normalization_6 <sasrec_text.LayerNormalization object at 0x7f9433ac0e90>
text_encoder <sasrec_text.TextEncoder object at 0x7f942ed79f90>


In [16]:
# Weights of the first layer (an Embedding layer per the layer listing).
# NOTE(review): the recorded output is an empty list -- presumably the layer had
# not been built/called yet at that point; confirm.
print(model.layers[0].weights)
# print(model.layers[0].bias.numpy())
# print(model.layers[0].bias_initializer)

[]


In [17]:
# Show the first layer object itself.
model.layers[0]

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7f9433ad0690>

In [10]:
# Fields available in the loaded sample batch.
inputs.keys()

dict_keys(['users', 'input_seq', 'positive', 'negative'])

In [12]:
# First entry of the 'users' field.
inputs['users'][0]

array([35322])

In [11]:
# First input sequence; the recorded output is left-padded with zeros.
inputs['input_seq'][0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
       13599, 20493,  8465, 50130,  9629], dtype=int32)

In [13]:
# First 'positive' sequence; in the recorded output it is the input sequence
# shifted by one position, ending with one new item.
inputs['positive'][0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
       20493,  8465, 50130,  9629, 60954], dtype=int32)

In [15]:
# First 'negative' sequence, zero-padded like the input sequence.
inputs['negative'][0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
       23654, 15273, 41160, 36779, 40662], dtype=int32)

In [36]:
# NOTE(review): the recorded output here is (5000, 50), whereas an earlier cell
# recorded (67311, 50) for the same name -- `embed_matrix` was evidently rebound
# between runs. Re-run the notebook top to bottom to confirm which one is current.
embed_matrix.shape

(5000, 50)

In [38]:
# NOTE(review): `item_desc` is not defined in any visible cell -- presumably a
# leftover from an earlier kernel session holding padded token ids per item; this
# cell will fail on a fresh kernel. TODO confirm and define it explicitly.
item_desc[5265,:]

array([1.417e+03, 2.850e+02, 6.000e+00, 2.320e+02, 1.240e+02, 1.200e+03,
       9.270e+02, 2.200e+01, 1.330e+02, 1.161e+03, 1.400e+01, 3.619e+03,
       1.240e+02, 1.600e+01, 3.500e+01, 5.700e+01, 2.850e+02, 1.500e+01,
       6.000e+01, 1.019e+03, 5.020e+02, 2.460e+02, 1.000e+01, 6.000e+00,
       1.200e+02, 3.917e+03, 1.287e+03, 3.000e+00, 1.507e+03, 2.000e+00,
       1.950e+02, 4.000e+00, 1.103e+03, 2.768e+03, 1.400e+01, 2.200e+01,
       3.000e+00, 3.552e+03, 1.540e+02, 1.446e+03, 4.370e+02, 2.000e+00,
       2.320e+02, 2.100e+01, 9.300e+01, 2.000e+00, 1.821e+03, 1.000e+00,
       4.050e+02, 1.220e+02, 2.720e+02, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 