In [1]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

# Load the TensorBoard notebook extension
%load_ext tensorboard


2023-10-11 10:25:05.120515: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-11 10:25:05.463213: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-11 10:25:05.465911: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE # automatyczne dostosowanie liczby wątków i buforów

# Create training example for one sentence 

In [3]:
# sentence = "The wide road shimmered in the hot sun"
sentence = "Mary has a little lamb"
tokens = sentence.lower().split()
len(tokens)

5

In [4]:
# create vocabulary and inverse vocabulary
vocab, inverse_vocab, i = {}, {}, 0

for token in ["<pad>"] + tokens:
    if token not in vocab:
        vocab[token] = i
        inverse_vocab[i] = token
        i += 1

vocab_size = len(vocab)
print(vocab, inverse_vocab, sep="\n")

{'<pad>': 0, 'mary': 1, 'has': 2, 'a': 3, 'little': 4, 'lamb': 5}
{0: '<pad>', 1: 'mary', 2: 'has', 3: 'a', 4: 'little', 5: 'lamb'}


In [5]:
# vectorize sentences
example_sequence = [vocab[word] for word in tokens]
example_sequence

[1, 2, 3, 4, 5]

In [6]:
# generate skip-grams
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)

print(len(positive_skip_grams))

for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

14
(3, 1): (a, mary)
(3, 5): (a, lamb)
(4, 3): (little, a)
(5, 4): (lamb, little)
(3, 4): (a, little)


In [7]:
# generate neagtive samples – context word other than assigned in positive skip-gram

target_word, context_word = positive_skip_grams[0]

num_ns = 4 # number of negative samples per positive context

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class, # class that should be sampled as 'positive'
    num_true=1, # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,
    unique=True, # the negative samples should be unique
    range_max=vocab_size, # pick index of the samples from [0, vocab_size]
    seed = SEED,
    name="negative_sampling" # name of this operation
)

print(inverse_vocab[target_word], inverse_vocab[context_word])
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])


a mary
tf.Tensor([2 1 3 0], shape=(4,), dtype=int64)
['has', 'mary', 'a', '<pad>']


2023-10-11 10:25:17.361953: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [8]:
squeezed_context_class = tf.squeeze(context_class, 1) # from [[5]] to [5]
context = tf.concat([squeezed_context_class, negative_sampling_candidates], 0)

label = tf.constant([1] + [0]*num_ns, dtype="int64") # first is positive, rest is negative
target = target_word

In [9]:
print(f"target          : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context         : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")


target          : 3
target_word     : a
context         : [1 2 1 3 0]
context_words   : ['mary', 'has', 'mary', 'a', '<pad>']
label           : [1 0 0 0 0]


# Combine it into a function

In [10]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(
    size=10) # prob of sampling i-th common word in dataset (assuming Zipf's distribution)

In [11]:
def generate_training_data(sequences: list, window_size: int, num_ns: int, vocab_size: int, seed: int) -> list:
    # @param sequence: list of lists of tokens
    # @param window_size: context window size
    # @param num_ns: number of negative samples per positive context word
    # @param vocab_size: size of the vocabulary
    # @param seed: seed for random number generator

    targets, contexts, labels = [], [], []

    # prob of sampling i-th common word in dataset (assuming Zipf's distribution)
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=vocab_size)

    for sequence in tqdm.tqdm(sequences): # tqdm creates progress bar
        # generate positive skip-grams
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
        sequence,
        sampling_table=sampling_table,
        vocabulary_size=vocab_size,
        window_size=window_size,
        negative_samples=0)

        # generate neagtive samples – context word other than assigned in positive skip-gram
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class, # class that should be sampled as 'positive'
                num_true=1, # each positive skip-gram has 1 positive context class
                num_sampled=num_ns,
                unique=True, # the negative samples should be unique
                range_max=vocab_size, # pick index of the samples from [0, vocab_size]
                seed = SEED,
                name="negative_sampling" # name of this operation
            )   

            targets.append(target_word)
            contexts.append(tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0))
            labels.append(tf.constant([1] + [0]*num_ns, dtype="int64"))  # first is positive, rest is negative        
        
    return targets, contexts, labels



In [12]:
tf.reshape(tf.constant(5, dtype="int64"), (1, 1)) == tf.expand_dims(
    tf.constant([5], dtype="int64"), 1)

<tf.Tensor: shape=(1, 1), dtype=bool, numpy=array([[ True]])>

# Prepare training data

In [13]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [14]:
with open(path_to_file) as f:
    for i in range(10):
        print(f.readline(), end='')

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:


In [15]:
# remove empty lines
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda x: tf.cast(tf.strings.length(x), bool)) # True if len > 0 False otherwise


In [16]:
def custom_standardization(input_data):
  ### lowercase and remove punctuation
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,'[%s]' % re.escape(string.punctuation), '')

vocab_size = 4096
sequence_length = 10

# normalize, split, and map strings to integers
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length # pad or truncate to the same length
    ) 

In [17]:
# build the vocabulary
vectorize_layer.adapt(text_ds.batch(1024))

inverse_vocab = vectorize_layer.get_vocabulary()
inverse_vocab[:5]

2023-10-11 10:25:17.654698: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


['', '[UNK]', 'the', 'and', 'to']

In [18]:
# Vectorize the data in text_ds using the built vocabulary
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [19]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))
for seq in sequences[:10]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

2023-10-11 10:25:26.259686: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype resource
	 [[{{node Placeholder/_9}}]]


32777
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[   7   41   34 1286  344    4  200   64    4 3690] => ['you', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[1286 1286    0    0    0    0    0    0    0    0] => ['resolved', 'resolved', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[  89    7   93 1187  225   12 2442  592    4    2] => ['first', 'you', 'kno

All the lines were changed to lowercase, splitted, the stopwords were removed. The lines were truncated or padded with '' to match the length 10.

In [20]:
# generate traing data
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 32777/32777 [01:46<00:00, 307.46it/s] 




targets.shape: (65601,)
contexts.shape: (65601, 5)
labels.shape: (65601, 5)


In [21]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>

# Model and training

Article about Embedding layer: https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce

In [22]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size, #input_size
                                      embedding_dim,     # output size
                                      input_length=1,    # pass one word at once
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1) # pass all context words at once 

  def call(self, pair):
    target, context = pair
    # target is (batch_size, 1), context is (batch_size, num_ns+1)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch_size,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch_size, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch_size, num_ns+1, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb) # dimensions of wordemb is batch_szie (b) and embed (e). For context is batch_size (b), num_ns+1 (c) and embed (e). Result is batch_size (b) and num_ns+1 (c).
    # So we perform the dot product over embedding (e) dimension.
    # dots: (batch_size, context)
    return dots

The network is trying to learn the context based on the target word. It generates the embeddings for both and then measures the simmilarity between them. We use two separate layers, because of other functions for embeddings. One of it learns word as it is and one of it learns the word as a context. It was proven to work better.

In [23]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [24]:
# for tensorboard statistics
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [25]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20


2023-10-11 10:27:38.424208: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [65601,5]
	 [[{{node Placeholder/_1}}]]
2023-10-11 10:27:38.424593: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int64 and shape [65601,5]
	 [[{{node Placeholder/_2}}]]


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fec940c6170>

In [26]:
#docs_infra: no_execute
%tensorboard --logdir logs

Analysis

In [27]:
words_wectors = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [28]:
imageout_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = words_wectors[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

NameError: name 'out_v' is not defined

https://projector.tensorflow.org/?_gl=1*1ec0dl2*_ga*NDE0Mzk1NjcyLjE2OTY2NjYyODU.*_ga_W0YLR4190T*MTY5Njk0NDQ2Ni43LjEuMTY5Njk0NzUxNy4wLjAuMA..

![Visualisation](embeddings2D.png)

# Test the embeddings

In [None]:
from gensim.models import KeyedVectors

def evaluate_tf_model(word2vec, embedding_dim):
    words_wectors = word2vec.get_layer('w2v_embedding').get_weights()[0]

    # Convert tf model to gensim one
    gensim_model = KeyedVectors(vector_size=embedding_dim)
    gensim_model.add_vectors(vocab, words_wectors)

    # Evaluate word similarity
    similarity_score = gensim_model.similarity('brother', 'sister')
    print("sim brother, sister", similarity_score)

    similarity_score = gensim_model.similarity('brother', 'make')
    print("sim brother, make", similarity_score)

    # Find similar words
    similar_words = gensim_model.most_similar('brother')
    print("most similar to brother: ", similar_words)

    # Evaluate word analogy
    analogy_result = gensim_model.most_similar(positive=['king', 'woman'], negative=['man'])
    print("king - man + woman = ", analogy_result)

In [None]:
evaluate_tf_model(word2vec, embedding_dim)

[('iv', 0.34329548478126526), ('triumphant', 0.34055984020233154), ('ye', 0.33662500977516174), ('conveyd', 0.33429524302482605), ('richard', 0.32999661564826965), ('ii', 0.30728983879089355), ('henry', 0.3062649667263031), ('iii', 0.3016456663608551), ('lovers', 0.3012474477291107), ('vi', 0.30031704902648926)]


The results doesn't seem good.

# Other params

In [None]:
embedding_dim_2 = 256
word2vec_2 = Word2Vec(vocab_size, embedding_dim_2)
word2vec_2.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])
word2vec_2.fit(dataset, epochs=35, callbacks=[tensorboard_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fd829e33dc0>

In [None]:
evaluate_tf_model(word2vec_2, embedding_dim_2)

sim brother, sister 0.038001135
sim brother, make -0.07777563
most similar to brother:  [('familiar', 0.3976895213127136), ('waked', 0.3845680058002472), ('stood', 0.3626787066459656), ('subtle', 0.3443647027015686), ('wounded', 0.3273954689502716), ('sceptres', 0.30579644441604614), ('choleric', 0.29051488637924194), ('impossible', 0.2879590094089508), ('wrongfully', 0.2862926423549652), ('himself', 0.28537046909332275)]
king - man + woman =  [('richard', 0.2890335023403168), ('conveyd', 0.2626268267631531), ('3', 0.2621176540851593), ('ii', 0.25475478172302246), ('purse', 0.2537199854850769), ('fourteen', 0.25072288513183594), ('vi', 0.24345341324806213), ('troubled', 0.24259331822395325), ('ye', 0.24201162159442902), ('kissing', 0.23922547698020935)]
