<a href="https://colab.research.google.com/github/aakhterov/ML_projects/blob/master/Text_classification/word2vec_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the runtime's file system at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import io
import re
import string
import tensorflow as tf
import tqdm
import numpy as np
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [41]:
num_ns = 4 # Number of negative samples drawn for each positive (target, context) pair

In [42]:
# Example sentence used throughout the walkthrough.
sentence = "Данное устройство уже было ранее зарегистрированно в нашей сети"
# Lower-case the text and split it on whitespace; str.split() already returns a list.
tokens = sentence.lower().split()
print(tokens)

['данное', 'устройство', 'уже', 'было', 'ранее', 'зарегистрированно', 'в', 'нашей', 'сети']


In [43]:
# Make vocabulary and inverse vocabulary

In [44]:
# Build the token -> index mapping and its inverse. Index 0 is reserved for padding.
vocab, inverse_vocab = {'<pad>': 0}, {0: '<pad>'}
for token in tokens:
    # Give a fresh index only to tokens not seen before. Deriving the index from the
    # token's position would leave gaps for repeated tokens, make inverse_vocab disagree
    # with vocab, and make vocab_size smaller than the largest index in use.
    if token not in vocab:
        next_index = len(vocab)
        vocab[token] = next_index
        inverse_vocab[next_index] = token
# Indices are contiguous (0 .. vocab_size - 1), so len(vocab) is a valid vocabulary size.
vocab_size = len(vocab)
print("vocab:", vocab)
print("inverse_vocab:", inverse_vocab)

vocab: {'<pad>': 0, 'данное': 1, 'устройство': 2, 'уже': 3, 'было': 4, 'ранее': 5, 'зарегистрированно': 6, 'в': 7, 'нашей': 8, 'сети': 9}
inverse_vocab: {0: '<pad>', 1: 'данное', 2: 'устройство', 3: 'уже', 4: 'было', 5: 'ранее', 6: 'зарегистрированно', 7: 'в', 8: 'нашей', 9: 'сети'}


In [45]:
# Encode the example sentence as a list of vocabulary indices
# (an unknown token raises KeyError, exactly like vocab[token]).
example_sequence = list(map(vocab.__getitem__, tokens))
print(example_sequence)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [8]:
# Generate skip-grams: (word, word from the same window) pairs get label 1 (positive samples);
# (word, random word from the vocabulary) pairs get label 0 (negative samples).
# The context word is taken from a window of size window_size around the target word.

In [57]:
# Size of the context window passed to skipgrams().
window_size = 2

# Produce (target, context) index pairs together with their 0/1 labels for the
# example sentence; negative_samples=num_ns requests the negative pairs.
skip_grams, labels = tf.keras.preprocessing.sequence.skipgrams(
    sequence=example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=num_ns,
)

In [47]:
# Let's look at the labels, the skip-grams and the corresponding word pairs

In [58]:
# Print the first ten pairs as "<label>: <index pair> <target word> <context word>".
for idx in range(len(skip_grams[:10])):
    skip_gram = skip_grams[idx]
    print(f"{labels[idx]}: {skip_gram} {inverse_vocab[skip_gram[0]]} {inverse_vocab[skip_gram[1]]}")

0: [5, 5] ранее ранее
0: [7, 7] в в
0: [3, 9] уже сети
0: [2, 5] устройство ранее
0: [7, 6] в зарегистрированно
0: [9, 3] сети уже
1: [5, 3] ранее уже
0: [5, 9] ранее сети
0: [6, 7] зарегистрированно в
0: [2, 1] устройство данное


In [None]:
# Some skip-grams that should be positive are labelled negative instead (e.g. [2, 1]). This happens because the vocabulary is so small that randomly drawn negative words often coincide with real context words.
# Let's set the vocabulary size to 10000

In [60]:
# Same call as before, but with a hard-coded vocabulary_size of 10000 instead of vocab_size.
# NOTE(review): the real vocabulary has only 10 entries, so most sampled negative indices
# (e.g. 5586 in the output further down) have no entry in inverse_vocab.
skip_grams, labels = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=10000,
      window_size=window_size,
      negative_samples=num_ns)

In [None]:
# Let's look at one positive skip-gram and the corresponding negative skip-grams

In [61]:
# Separate the generated pairs by label: 1 -> positive, 0 -> negative.
positive_skip_grams = [pair for pair, flag in zip(skip_grams, labels) if flag == 1]
negative_skip_grams = [pair for pair, flag in zip(skip_grams, labels) if flag == 0]

# Use the first positive pair as the worked example.
target_word, context_word = positive_skip_grams[0]

# Keep only the negative pairs that share this target word, then take the first num_ns of them.
negative_skip_grams_candidates = list(
    filter(lambda pair: pair[0] == target_word, negative_skip_grams)
)
current_negative_skip_grams = negative_skip_grams_candidates[:num_ns]

In [71]:
# Show the chosen positive pair and the negative pairs selected for it.
for caption, value in (
    ("target word: ", target_word),
    ("context word: ", context_word),
    ("current_negative_skip_grams: ", current_negative_skip_grams),
):
    print(caption, value)

target word:  6
context word:  5
current_negative_skip_grams:  [[6, 5586], [6, 4499], [6, 995], [6, 6654]]


In [72]:
# ... and convert target_word, context and labels to tf.Tensor

In [63]:
# Context-word halves of the selected negative pairs.
negative_context_words = [negative_pair[1] for negative_pair in current_negative_skip_grams]

# One training example: the target word, the true context word followed by the
# negative context words, and labels in the same order (1 for the true context).
target = tf.squeeze(target_word)
context = tf.squeeze([context_word] + negative_context_words)
label = tf.constant([1] + [0] * num_ns)

for tensor in (target, context, label):
    print(tensor)

tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor([   5 5586 4499  995 6654], shape=(5,), dtype=int32)
tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int32)


In [None]:
# Let's get it all together

In [73]:
def generate_training_data(sequences, window_size, num_ns, vocab_size):
    """Build skip-gram training examples with negative sampling.

    For every sequence, ``tf.keras.preprocessing.sequence.skipgrams`` is asked for
    positive (label 1) and negative (label 0) index pairs. Each positive
    ``(target, context)`` pair is then combined with up to ``num_ns`` negative pairs
    that share its target word; every negative pair is used at most once, in the
    order in which ``skipgrams`` returned it.

    Args:
        sequences: Iterable of sequences of integer word indices.
        window_size: Forwarded to ``skipgrams`` as ``window_size``.
        num_ns: Number of negative samples attached to each positive pair; also
            forwarded to ``skipgrams`` as ``negative_samples``.
        vocab_size: Forwarded to ``skipgrams`` as ``vocabulary_size``.

    Returns:
        A tuple ``(targets, contexts, labels)`` of three lists with one entry per
        positive pair:
          * ``targets``  - the target word index, as returned by ``skipgrams``;
          * ``contexts`` - a 1-D tensor holding the true context word followed by
            the selected negative words;
          * ``labels``   - an int64 tensor ``[1, 0, ..., 0]`` of length ``num_ns + 1``.
    """
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):
        skip_grams, sg_labels = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=None,
            window_size=window_size,
            negative_samples=num_ns)

        # Single pass: collect the positive pairs and queue the negative context
        # words per target word. This replaces rescanning the whole negative list
        # and calling list.remove() for every positive pair, while keeping the
        # same selection order.
        positive_skip_grams = []
        negatives_by_target = {}
        for (pair_target, pair_context), sg_label in zip(skip_grams, sg_labels):
            if sg_label == 1:
                positive_skip_grams.append((pair_target, pair_context))
            elif sg_label == 0:
                negatives_by_target.setdefault(pair_target, []).append(pair_context)

        # Position of the next unused negative in each target's queue.
        next_unused = {}

        # Iterate over every positive skip-gram.
        for target_word, context_word in positive_skip_grams:
            queue = negatives_by_target.get(target_word, [])
            start = next_unused.get(target_word, 0)
            negative_words = queue[start:start + num_ns]
            # Mark the selected negatives as consumed so they are not reused.
            next_unused[target_word] = start + len(negative_words)

            # tf.constant keeps the tensor one-dimensional even when it holds a
            # single element (tf.squeeze would collapse it to a scalar).
            context = tf.constant([context_word] + negative_words)
            label = tf.constant([1] + [0] * num_ns, dtype="int64")

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [78]:
# Build the training examples for the single example sentence, sampling negatives from 10000 word indices.
targets, contexts, labels = generate_training_data(
    sequences=[example_sequence],
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=10000,
)

100%|██████████| 1/1 [00:00<00:00, 66.96it/s]


In [None]:
# Let's look at the contexts. The first element is the true context word and the others are negative samples (randomly drawn words)

In [75]:
# Display the first ten context tensors (true context word first, then the negative samples).
contexts[:10]

[<tf.Tensor: shape=(5,), dtype=int32, numpy=array([   7, 2447, 5627,  481, 6719], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   8, 4742, 8095, 1291, 4478], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   2, 7624, 4307, 2855, 3858], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   7, 6771, 5063, 4853, 2018], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   6, 6604, 6162, 9023, 3908], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   7, 5774, 1103, 3556, 9772], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   6, 7883, 9710, 1036, 4964], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   4, 9032, 9370, 4030, 7450], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   4, 3102,   55, 2041, 8935], dtype=int32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([   3, 1094, 5071, 9249, 3267], dtype=int32)>]

In [76]:
# Display the first ten label tensors (1 marks the true context word, 0 the negative samples).
labels[:10]

[<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>]

In [80]:
# Convert the three lists of examples to NumPy arrays.
targets, contexts, labels = map(np.array, (targets, contexts, labels))

# Report the resulting shapes, preceded by the same blank lines as before.
print('\n')
print("\n".join([
    f"targets.shape: {targets.shape}",
    f"contexts.shape: {contexts.shape}",
    f"labels.shape: {labels.shape}",
]))



targets.shape: (30,)
contexts.shape: (30, 5)
labels.shape: (30, 5)
