In [1]:
import os
import sys
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Display the TensorFlow version this kernel is running.
tf.__version__

'2.3.0'

In [2]:
# --- Input locations -------------------------------------------------------
data_dir = "/recsys_data/RecSys/SASRec-tf2/data/"
filename = "ae_item_description.txt"
glove_dir = "/recsys_data/datasets/glove"

# --- Text-processing settings ----------------------------------------------
maxlen = 100         # length every encoded description is padded/truncated to
vocab_size = 5000    # number of rows in the embedding matrix
embedding_dim = 50   # number of components kept from each pre-trained vector

# Each line of the description file becomes one document.
with open(os.path.join(data_dir, filename), 'r') as fr:
    docs = list(fr)

len(docs)

85930

In [3]:
# `num_words` is taken from `vocab_size` so the tokenizer limit, the `vocab`
# filter and the embedding-matrix size cannot drift apart.
# Note that fit_on_texts still records every distinct word in `word_index`
# (hence the large count below); the limit only applies when texts are encoded.
# https://stackoverflow.com/questions/64158898/what-does-keras-tokenizer-num-words-specify
tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ', oov_token='<OOV>')
tokenizer.fit_on_texts(docs)
len(tokenizer.word_index)

156986

In [4]:
# Keep only the words whose tokenizer index is below `vocab_size`,
# in the order they appear in `word_index`.
vocab = [
    word
    for word, rank in tokenizer.word_index.items()
    if rank < vocab_size
]

In [5]:
# Encode every document as a list of word indices, then pad (at the end)
# or truncate so that each row has exactly `maxlen` entries.
tensor = pad_sequences(
    tokenizer.texts_to_sequences(docs),
    maxlen=maxlen,
    padding='post',
)

In [6]:
def create_embedding_matrix(filepath, word_index, embedding_dim, vocab_size):
    """Build an embedding matrix from a GloVe-style text file.

    Every non-blank line of the file is expected to look like
    ``word v1 v2 ... vn`` with whitespace-separated fields.

    Args:
        filepath: Path to the pre-trained vectors file (read as UTF-8).
        word_index: Either a sequence of words, where the word at position
            ``i`` is stored in row ``i + 1`` (row 0 stays reserved), or a
            dict mapping ``word -> row`` such as ``Tokenizer.word_index``.
            Words whose row falls outside ``[0, vocab_size)`` are ignored.
        embedding_dim: Number of leading vector components kept per word.
        vocab_size: Number of rows of the returned matrix.

    Returns:
        ``(embedding_matrix, all_words)``: a ``(vocab_size, embedding_dim)``
        float array whose rows stay zero for words without a pre-trained
        vector, and the set of every word found in the file.
    """
    if isinstance(word_index, dict):
        candidates = word_index.items()
    else:
        # +1 because row 0 is reserved (the Keras tokenizer never emits it for a word).
        candidates = ((word, pos + 1) for pos, word in enumerate(word_index))

    # One-off word -> row table: O(1) lookups instead of scanning a list for
    # every line of the vectors file. setdefault keeps the first occurrence,
    # matching list.index() for duplicated words.
    row_of = {}
    for word, row in candidates:
        if 0 <= row < vocab_size:
            row_of.setdefault(word, row)

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    all_words = set()
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word = fields[0]
            all_words.add(word)
            row = row_of.get(word)
            if row is not None:
                # Only parse the numbers for words we actually keep.
                embedding_matrix[row] = np.array(fields[1:], dtype=np.float32)[:embedding_dim]

    count_missing = len(set(row_of) - all_words)
    if count_missing > 0:
        print(f"!!! {count_missing} words could not be mapped")
    return embedding_matrix, all_words

In [7]:
# The GloVe 6B file name encodes the vector width; derive it from
# `embedding_dim` so the file and the matrix width always agree.
embedding_matrix, glove_vocab = create_embedding_matrix(
    os.path.join(glove_dir, f'glove.6B.{embedding_dim}d.txt'),
    vocab,
    embedding_dim,
    vocab_size,
)

!!! 424 words could not be mapped


In [8]:
# Sanity check: the matrix was allocated as (vocab_size, embedding_dim).
embedding_matrix.shape

(5000, 50)

In [9]:
# Vocabulary words that have no entry in the GloVe file
# (their embedding rows are left as zeros).
set(vocab).difference(glove_vocab)

{'0em',
 '0mm',
 '0px',
 '1000d',
 '1005ha',
 '100mbps',
 '100v',
 '10ft',
 '10mp',
 '10px',
 '1100d',
 '110v',
 '11n',
 '11px',
 '120gb',
 '120hz',
 '120v',
 '12800',
 '128gb',
 '12px',
 '1333mhz',
 '13px',
 '13r',
 '140mm',
 '150mbps',
 '15px',
 '15r',
 '1600mhz',
 '160gb',
 '18x',
 '19v',
 '1aca',
 '1bas',
 '1brs',
 '1ds',
 '1mp',
 '1px',
 '1tb',
 '200w',
 '20d',
 '20px',
 '220v',
 '240v',
 '250gb',
 '250mm',
 '250w',
 '2dvi',
 '2em',
 '2ghz',
 '2mp',
 '2tb',
 '300d',
 '300mbps',
 '30d',
 '30fps',
 '30gb',
 '320gb',
 '350d',
 '384019',
 '3ft',
 '3gb',
 '400d',
 '42mm',
 '4400mah',
 '450d',
 '462890',
 '480mbps',
 '4em',
 '4ghz',
 '4mp',
 '4v',
 '500d',
 '500gb',
 '50d',
 '52x',
 '532h',
 '5400rpm',
 '54mbps',
 '550d',
 '5520',
 '580ex',
 '58mm',
 '5em',
 '5gb',
 '5gbps',
 '5ghz',
 '5mp',
 '5ms',
 "6'",
 '600d',
 '60d',
 '60gb',
 '60hz',
 '650d',
 '65w',
 '67mm',
 '6gb',
 '6ghz',
 "7''",
 '700d',
 '70d',
 '7200rpm',
 '750gb',
 '77mm',
 '7gb',
 '7v',
 '800mhz',
 '80gb',
 '8212',
 '821

In [10]:
import tensorflow as tf

In [12]:
# Keep `tensor` as the NumPy array produced by pad_sequences: the following cell
# selects rows with a Python list, which tf.Tensor indexing does not accept.
# The TensorFlow copy therefore gets its own name instead of rebinding `tensor`,
# which also makes this cell safe to re-run.
# NOTE(review): the InternalError recorded below (CUDA out of memory while
# initialising GPU:0) looks environmental, presumably GPU memory held by
# another process; confirm before re-running.
tensor_tf = tf.constant(tensor)
tf.gather(tensor_tf, [1, 10, 100])

InternalError: CUDA runtime implicit initialization on GPU:0 failed. Status: out of memory

In [15]:
# Peek at three padded rows. List-based (fancy) row selection requires
# `tensor` to be a NumPy array here, not a tf.Tensor.
tensor[[0, 10, 100]]

array([[   1,  303,  312,  103, 3736, 2101,  456,  114,    7,  303,  139,
          54,   53,    1,  103, 3736, 1088,    4,  103, 2523,    8,    1,
           7, 4952, 2969,   14, 1058, 3793,  869, 1833,   53,    1, 1088,
           4, 1233,    8,  120,  251,    4, 4952, 1432,   14,  442,    4,
         724,    5,    1, 3538, 2350,    4,    8, 3539,   53,   91,  672,
           4, 1233,    8,  120,  251,    4, 2969, 1432,  227,  562, 1088,
          43, 1069, 1016,    6, 1604,    1, 1081,  151,  367,   33,   59,
         830,    1,    4,    5,  139,  386,  999, 1789,    1, 1088,   33,
          59,  258,   10,    1,    6,    1,    1,    1,    3,    1,    2,
           1],
       [   1,    1,   10, 1011,  318, 1347,   20,  238,  116,   79,  204,
           6,  163,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    