In [1]:
import io
import re
import string
import tensorflow as tf
import tqdm
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [73]:
# References
# 1. Tutorial: https://www.tensorflow.org/tutorials/text/word2vec
# 2. http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
# 3. https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf
# 4. https://towardsdatascience.com/word2vec-research-paper-explained-205cb7eecc30

In [2]:
num_ns = 4 # Number of negative samples drawn for each positive (target, context) pair

### Step-by-step dataset generation

In [21]:
# Lower-case the sample sentence and split it on whitespace to obtain its tokens.
sentence = "Данное устройство уже было ранее зарегистрированно в нашей сети"
lowered_sentence = sentence.lower()
tokens = lowered_sentence.split()
print(tokens)

['данное', 'устройство', 'уже', 'было', 'ранее', 'зарегистрированно', 'в', 'нашей', 'сети']


In [23]:
# Make vocabulary and inverse vocabulary

In [32]:
# Build the token -> id map and its inverse; id 0 is reserved for padding.
# Ids are assigned contiguously in order of first appearance, so a repeated token
# keeps its first id, inverse_vocab[vocab[t]] == t for every token, and the largest
# id equals vocab_size - 1 (ids derived from the enumerate position would leave gaps
# and make vocab_size smaller than the largest id when a token repeats).
vocab, inverse_vocab = {'<pad>': 0}, {0: '<pad>'}
for token in tokens:
    if token not in vocab:
        index = len(vocab)
        vocab[token] = index
        inverse_vocab[index] = token
vocab_size = len(vocab)
print("vocab:", vocab)
print("inverse_vocab:", inverse_vocab)

vocab: {'<pad>': 0, 'данное': 1, 'устройство': 2, 'уже': 3, 'было': 4, 'ранее': 5, 'зарегистрированно': 6, 'в': 7, 'нашей': 8, 'сети': 9}
inverse_vocab: {0: '<pad>', 1: 'данное', 2: 'устройство', 3: 'уже', 4: 'было', 5: 'ранее', 6: 'зарегистрированно', 7: 'в', 8: 'нашей', 9: 'сети'}


In [33]:
# Encode the sentence as the list of vocabulary ids of its tokens, in order.
example_sequence = [vocab[token] for token in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [28]:
# Generate positive skip-grams: [target word, context word] pairs, where the context word is taken
# from a window of window_size words on either side of the target word.

In [27]:
window_size = 2
# negative_samples=0 asks skipgrams for positive pairs only; the second element of
# its result (the labels) is not needed here.
skipgram_result = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0,
)
positive_skip_grams = skipgram_result[0]
print(positive_skip_grams)

[[3, 4], [4, 2], [8, 6], [2, 4], [4, 6], [7, 9], [6, 4], [8, 7], [2, 1], [7, 8], [3, 1], [9, 7], [2, 3], [5, 3], [7, 5], [4, 5], [5, 7], [1, 3], [3, 5], [4, 3], [8, 9], [3, 2], [1, 2], [9, 8], [7, 6], [6, 8], [5, 4], [6, 5], [5, 6], [6, 7]]


In [29]:
# Show the first five positive pairs as ids followed by the corresponding words.
for pair in positive_skip_grams[:5]:
    target_id, context_id = pair
    print(f"{pair} {inverse_vocab[target_id]} {inverse_vocab[context_id]}")

[3, 4] уже было
[4, 2] было устройство
[8, 6] нашей зарегистрированно
[2, 4] устройство было
[4, 6] было зарегистрированно


In [11]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
print(target_word, context_word)
print(context_class)

4 3
tf.Tensor([[3]], shape=(1, 1), dtype=int64)


In [35]:
# Generate negative skip-grams

In [36]:
# Draw num_ns unique candidate ids in [0, vocab_size) for the current context word.
# Only the first element of the sampler's result (the sampled ids) is used.
sampler_output = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,
    num_true=1,
    num_sampled=num_ns,
    unique=True,
    range_max=vocab_size,
)
negative_sampling_candidates = sampler_output[0]

target_text = inverse_vocab[target_word]
context_text = inverse_vocab[context_word]
print(target_word, context_word, target_text, context_text)
print(negative_sampling_candidates)

# Translate the sampled ids back to words for inspection.
negative_words = [
    inverse_vocab[candidate_id]
    for candidate_id in negative_sampling_candidates.numpy()
]
print(negative_words)

4 3 было уже
tf.Tensor([2 9 0 4], shape=(4,), dtype=int64)
['устройство', 'сети', '<pad>', 'было']


In [37]:
# Concatenate positive class (right context word) with negative class (wrong context word)

In [38]:
# Turn the sampled ids into a column, then stack them under the true context id so
# that row 0 is the positive class and the remaining rows are the negatives.
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, axis=1)
context_parts = [context_class, negative_sampling_candidates]
context = tf.concat(values=context_parts, axis=0)
print(context)

tf.Tensor(
[[3]
 [2]
 [9]
 [0]
 [4]], shape=(5, 1), dtype=int64)


In [39]:
# Make the label tensor and flatten all tensors

In [40]:
# One positive label followed by num_ns negative labels, in the same order as the
# rows of `context` (true context first).
label_values = [1] + [0] * num_ns
label = tf.constant(label_values, dtype="int64")

# Remove size-1 dimensions from the target, the context column and the labels.
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)
for tensor in (target, context, label):
    print(tensor)

tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor([3 2 9 0 4], shape=(5,), dtype=int64)
tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [None]:
# Combine all together

In [58]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed=None):
    """Build skip-gram training examples with negative sampling.

    For every positive (target, context) pair generated from each sequence,
    ``num_ns`` candidate ids are drawn with the log-uniform candidate sampler
    and stacked under the true context id.

    Args:
        sequences: Iterable of integer-encoded sentences.
        window_size: Skip-gram window size forwarded to ``skipgrams``.
        num_ns: Number of negative samples drawn per positive pair.
        vocab_size: Vocabulary size; used to build the sampling table, passed
            to ``skipgrams`` and used as ``range_max`` of the candidate sampler.
        seed: Optional seed forwarded to the candidate sampler. ``None``
            (the default) keeps the sampler unseeded.

    Returns:
        Tuple ``(targets, contexts, labels)`` of equal-length lists: target word
        ids, ``(num_ns + 1, 1)`` int64 context tensors with the true context
        first, and ``(num_ns + 1,)`` int64 label tensors (``1`` then zeros).
    """
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # The label layout (one positive, then num_ns negatives) is identical for
    # every example, so the tensor is built once and shared.
    label = tf.constant([1] + [0] * num_ns, dtype="int64")

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with a positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)

            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed)

            # Build the context vector for one target word: true context first,
            # followed by the sampled candidates as a column.
            negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            # Append each element of the training example to the result lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

### Apply the function to the real dataset

In [42]:
# Load the corpus file as a dataset of text lines.
# NOTE: the path is relative, so it is resolved against the kernel's working directory.
text_ds = tf.data.TextLineDataset("data/processing_phrases_more_one_word.txt")

In [43]:
# Define the vocabulary size and the number of words in a sequence. The sequence length is set to the
# third quartile of the sentence lengths.

In [44]:
# Vocabulary cap passed to TextVectorization as max_tokens. This rebinds the
# vocab_size computed for the toy sentence in the step-by-step section.
vocab_size = 10000
# Fixed length of every vectorized sentence (output_sequence_length).
sequence_length = 6

In [67]:
# Use the text vectorization layer

In [45]:
# Text vectorization layer producing integer token ids, limited to vocab_size
# tokens and a fixed output length of sequence_length ids per sentence.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [69]:
# Compute the vocabulary and create inverse_vocab

In [70]:
# Fit the layer's vocabulary on the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))
# inverse_vocab is rebound here from the toy dict to the layer's vocabulary list,
# where the list position is the token id.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'вы', 'в', 'на', 'я', 'не', 'за', 'ваш', 'и', 'мочь', 'по', 'у', 'роутер', 'интернет', 'помочь', 'обращение', 'с', 'быть', 'наш']


In [54]:
# AUTOTUNE is used as a bare name but no visible cell defines it, which raises a
# NameError on a fresh kernel; bind it to the TensorFlow constant here.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Vectorize the corpus in batches of 1024 lines, then unbatch so that each dataset
# element is the id sequence of a single sentence.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [55]:
# Materialize every vectorized sentence in memory as a list of NumPy arrays.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

749130


In [56]:
# Show the first five encoded sentences next to their decoded words.
for seq in sequences[:5]:
    decoded_words = [inverse_vocab[token_id] for token_id in seq]
    print(f"{seq} => {decoded_words}")

[118  31 464  71  23  39] => ['подсказать', 'как', 'добавить', 'устройство', 'к', 'договор']
[  17  182   17    2 1032  119] => ['с', 'который', 'с', 'вы', 'общаться', 'да']
[312 168   0   0   0   0] => ['ноутбук', 'возможно', '', '', '', '']
[433  17  60   3 108  39] => ['заходить', 'с', 'он', 'в', 'кабинет', 'договор']
[385   0   0   0   0   0] => ['мбс', '', '', '', '', '']


In [71]:
# Generate targets, contexts and labels by created function

In [59]:
# Build the training examples for the whole corpus.
# num_ns is taken from the module-level constant instead of a repeated literal,
# because the Word2Vec model also reads that constant to size its context input;
# the two must agree.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=num_ns,
    vocab_size=vocab_size)
print(len(targets), len(contexts), len(labels))

100%|██████████| 749130/749130 [01:12<00:00, 10349.40it/s]

686914 686914 686914





In [60]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

# Each element is ((target, context), label); shuffle with a BUFFER_SIZE-element
# buffer, then group into full batches only (the last partial batch is dropped).
features = (targets, contexts)
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [61]:
# AUTOTUNE is used as a bare name but no visible cell defines it, so the TensorFlow
# constant is referenced directly to avoid a NameError on a fresh kernel.
# NOTE(review): cache() is applied after shuffle()/batch(), so the cached batches may
# be replayed in the same order every epoch; confirm whether reshuffling per epoch
# is wanted.
dataset = dataset.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [72]:
# Make class inheriting from Model and implementing NN for train word2vec embedding

In [62]:
class Word2Vec(Model):
    """Skip-gram word2vec model that scores a target word against context candidates.

    Holds two embedding tables of size ``vocab_size`` x ``embedding_dim``: one for
    target words and one for context words. ``call`` returns the flattened dot
    products between the context embeddings and the target embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # The layer name is fixed because the trained weights are later fetched
        # with get_layer('w2v_embedding').
        self.target_embedding = Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="w2v_embedding",
        )
        # num_ns is read from module scope: one true context plus num_ns negatives.
        self.context_embedding = Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_ns + 1,
        )
        self.dots = Dot(axes=(3, 2))
        self.flatten = Flatten()

    def call(self, pair):
        """Return flattened context/target dot products for a (target, context) pair."""
        target, context = pair
        target_vectors = self.target_embedding(target)
        context_vectors = self.context_embedding(context)
        scores = self.dots([context_vectors, target_vectors])
        return self.flatten(scores)

In [63]:
def custom_loss(x_logit, y_true):
    """Return the element-wise sigmoid cross-entropy of ``x_logit`` against ``y_true``.

    NOTE(review): no visible cell passes this function to ``compile``. Keras calls
    loss functions as ``fn(y_true, y_pred)``, while this signature takes the logits
    first; confirm the argument order before using it as a Keras loss.
    """
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=x_logit)
    return loss

In [74]:
# Select embedding dimension

In [90]:
# Candidate embedding sizes; one model is trained and evaluated per value below.
embedding_dims = [64, 128, 256, 512]

In [93]:
# Train one model per candidate embedding size and record its accuracy.
# Note that evaluate() runs on the same dataset that was used for fit().
results = {}
for embedding_dim in embedding_dims:
    print("embedding_dim value: ", embedding_dim)
    w2v = Word2Vec(vocab_size, embedding_dim)
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    w2v.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
    w2v.fit(dataset, epochs=20)
    print("Evaluate")
    eval_loss, eval_accuracy = w2v.evaluate(dataset)
    results[embedding_dim] = eval_accuracy
results

embedding_dim value:  64
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Evaluate
embedding_dim value:  128
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Evaluate
embedding_dim value:  256
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Evaluate
embedding_dim value:  512
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Evaluate


{64: 0.8519618511199951,
 128: 0.892391562461853,
 256: 0.9115248918533325,
 512: 0.915403425693512}

In [75]:
# Embedding size used for the final model. In the recorded sweep, 512 improved
# accuracy only slightly over 256 (0.9154 vs 0.9115).
embedding_dim = 256

In [64]:
# Final model, compiled with the same optimizer, loss and metric as the sweep above.
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [66]:
# Train the final model for 40 epochs on the full training dataset.
word2vec.fit(dataset, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7f473920b1f0>

In [43]:
# Trained target-embedding matrix, fetched by the layer name set in Word2Vec.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
# `vocab` is rebound here from the toy dict to the vectorizer's vocabulary list.
vocab = vectorize_layer.get_vocabulary()

In [77]:
# save embedding vectors to the files

In [44]:
# Write one tab-separated embedding vector per line to the vectors file and the
# matching word per line to the metadata file. The `with` block closes both files
# even if an exception is raised while writing.
with io.open('vectors_norm_sentence_length_6_256_v2.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata_norm_sentence_length_6_256_v2.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")