In [1]:
import os
import sys
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Display the TensorFlow version this notebook was run with (useful when reproducing results).
tf.__version__

In [2]:
# Locations of the item-description corpus and the pre-trained GloVe vectors.
data_dir = "/recsys_data/RecSys/SASRec-tf2/data/"
filename = "ae_item_description.txt"
glove_dir = "/recsys_data/datasets/glove"

# Text-processing settings shared by the cells below.
maxlen = 100         # length every token sequence is padded/truncated to
vocab_size = 5000    # number of rows in the embedding matrix
embedding_dim = 50   # number of columns in the embedding matrix

# Decode explicitly instead of relying on the platform's default locale encoding,
# so the corpus is read the same way on every machine.
# NOTE(review): assumes the description file is UTF-8 encoded -- confirm.
with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as fr:
    docs = fr.readlines()  # one document per line; trailing newlines are kept

len(docs)

85930

In [3]:
# `num_words` does not shrink `word_index` (it still lists every word seen while
# fitting); it only caps which indices are emitted when texts are converted to sequences.
# See https://stackoverflow.com/questions/64158898/what-does-keras-tokenizer-num-words-specify
# Use the shared `vocab_size` constant rather than a second hard-coded 5000 so the
# token ids always stay in step with the embedding matrix sized from the same constant.
tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ', oov_token='<OOV>')
tokenizer.fit_on_texts(docs)
len(tokenizer.word_index)

156986

In [12]:
# Keep only the words whose tokenizer index is below vocab_size.
vocab = [word for word, rank in tokenizer.word_index.items() if rank < vocab_size]

In [5]:
# Convert every document to token ids, then pad at the end to exactly `maxlen` ids.
tensor = pad_sequences(
    tokenizer.texts_to_sequences(docs),
    maxlen=maxlen,
    padding='post',
)

In [19]:
def create_embedding_matrix(filepath, word_index, embedding_dim, vocab_size):
    """Build an embedding matrix from a GloVe-style text file.

    Each non-blank line of the file is expected to be a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    filepath : str
        Path to the UTF-8 encoded vector file.
    word_index : sequence of str or mapping of str to int
        Either a list of words, where the word at position ``i`` is written
        to row ``i + 1`` (row 0 is left as zeros), or a mapping from word to
        its row number. Words whose row falls outside ``[0, vocab_size)`` are
        ignored.
    embedding_dim : int
        Number of leading vector components to keep for each word.
    vocab_size : int
        Number of rows of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, embedding_dim)``; rows of words that do
        not occur in the file stay all-zero.

    Raises
    ------
    ValueError
        If a matched line carries fewer than ``embedding_dim`` components or
        a component that cannot be parsed as a float.
    """
    # Resolve every word to its target row once, so each line of the (large)
    # vector file costs a single hash lookup instead of a scan over the list.
    if hasattr(word_index, "items"):
        candidates = word_index.items()
    else:
        # Position i in the list corresponds to row i + 1.
        candidates = ((word, position + 1) for position, word in enumerate(word_index))

    word_to_row = {}
    for word, row in candidates:
        if 0 <= row < vocab_size:
            # setdefault keeps the first occurrence, like list.index() would.
            word_to_row.setdefault(word, row)

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    found = set()
    # Explicit encoding: the vector file contains non-ASCII tokens and must not
    # depend on the platform's default locale encoding.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines
            word = parts[0]
            row = word_to_row.get(word)
            if row is None:
                continue
            embedding_matrix[row] = np.array(parts[1:], dtype=np.float32)[:embedding_dim]
            found.add(word)

    count_missing = len(word_to_row) - len(found)
    if count_missing > 0:
        print(f"!!! {count_missing} words could not be mapped")
    return embedding_matrix

In [20]:
# Fill the embedding matrix for the truncated vocabulary from the 50-d GloVe file.
embedding_matrix = create_embedding_matrix(
    filepath=os.path.join(glove_dir, 'glove.6B.50d.txt'),
    word_index=vocab,
    embedding_dim=embedding_dim,
    vocab_size=vocab_size,
)

!!! 424 words could not be mapped


In [14]:
# Sanity check: the matrix should have vocab_size rows and embedding_dim columns.
embedding_matrix.shape

(5000, 50)

In [17]:
# Spot-check one row of the embedding matrix.
embedding_matrix[100]

array([ 2.27960005e-01, -8.47329974e-01,  1.39579999e+00,  1.67499995e+00,
       -1.82579994e-01, -3.53850007e-01, -2.24559996e-02, -8.55679989e-01,
       -3.85540009e-01,  1.28090000e+00,  9.04990017e-01,  2.61930004e-02,
       -1.28050005e+00,  1.33340001e-01,  6.39569998e-01,  6.83510005e-01,
       -1.68499994e+00,  7.39130020e-01, -1.72029994e-03, -5.89489996e-01,
        1.01370001e+00,  2.40459993e-01, -6.40860021e-01, -2.84009993e-01,
       -5.21189988e-01, -4.57659990e-01, -8.38559985e-01, -4.93090004e-01,
       -9.24409986e-01, -9.25719976e-01,  1.99049997e+00,  2.03189999e-01,
        1.05079997e+00,  3.07240009e-01, -1.42820001e-01, -6.85989976e-01,
        1.83270007e-01, -3.46329987e-01, -3.81449997e-01, -1.66270006e+00,
        3.12350005e-01, -6.63610026e-02, -2.49290004e-01, -5.20349979e-01,
       -1.08900003e-01,  6.49280012e-01,  2.74949998e-01, -2.13450000e-01,
       -1.27429998e+00,  2.87209988e-01])