<a href="https://colab.research.google.com/github/aakhterov/ML_projects/blob/master/Text_classification/make_embedding_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import io
import re
import string
import tensorflow as tf
import tqdm
import numpy as np
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
# References
# 1. Tutorial: https://www.tensorflow.org/tutorials/text/word2vec
# 2. http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
# 3. https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf
# 4. https://towardsdatascience.com/word2vec-research-paper-explained-205cb7eecc30
# 5. https://colab.research.google.com/drive/1vZuJSHulZhn7ihoPWwVQGic-fGQCtamX#scrollTo=lqPFJRv28lHC (word2vec_train.ipynb)

In [4]:
num_ns = 4 # Number of negative samples drawn for each positive (target, context) pair
window_size = 2 # Size of the skip-gram sampling window
vocab_size = 10000 # Vocabulary size
sequence_length = 6 # Number of tokens per sequence; set to the third quartile of sentence lengths

In [5]:
def generate_training_data(sequences, window_size, num_ns, vocab_size):
    """Build skip-gram training examples with negative sampling.

    For every sequence, positive and negative (target, context) pairs are drawn
    with ``tf.keras.preprocessing.sequence.skipgrams``. Each positive pair is
    then combined with up to ``num_ns`` negative pairs that share its target
    word, producing one training example: the target word, a context tensor
    whose first entry is the true context word followed by the negative
    context words, and the label vector ``[1, 0, ..., 0]``.

    Args:
        sequences: Iterable of integer-encoded sequences (token ids).
        window_size: Window size forwarded to ``skipgrams``.
        num_ns: Number of negative samples attached to each positive pair;
            also forwarded to ``skipgrams`` as ``negative_samples``.
        vocab_size: Vocabulary size used for the sampling table and forwarded
            to ``skipgrams`` as ``vocabulary_size``.

    Returns:
        A tuple ``(targets, contexts, labels)`` of three equally long lists:
        target word ids, context tensors and label tensors.
    """
    targets, contexts, labels = [], [], []

    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # The label vector is the same for every example, so build it only once.
    label = tf.constant([1] + [0] * num_ns, dtype="int64")

    for sequence in tqdm.tqdm(sequences):
        skip_grams, sg_labels = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=num_ns)

        # Single pass: keep positives in order and bucket the negative context
        # words by target word. This replaces rescanning the full negative list
        # (and calling list.remove) for every positive pair.
        positive_pairs = []
        negatives_by_target = {}
        for (target_word, context_word), sg_label in zip(skip_grams, sg_labels):
            if sg_label == 1:
                positive_pairs.append((target_word, context_word))
            elif sg_label == 0:
                negatives_by_target.setdefault(target_word, []).append(context_word)

        for target_word, context_word in positive_pairs:
            # Consume the oldest unused negatives for this target so that no
            # negative pair is assigned to more than one positive pair.
            pending = negatives_by_target.get(target_word, [])
            negative_contexts = pending[:num_ns]
            del pending[:num_ns]

            context = tf.squeeze([context_word] + negative_contexts)

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [6]:
# Get preprocessed data

In [7]:
# Line-by-line dataset over the preprocessed phrases file on the mounted Drive.
text_ds = tf.data.TextLineDataset("/content/drive/MyDrive/Colab Notebooks/Data/processing_phrases_more_one_word.txt")

In [8]:
# Use the text vectorization layer

In [9]:
# Layer that turns each text line into a vector of integer token ids:
# the vocabulary is capped at `vocab_size` entries and every output has
# exactly `sequence_length` positions.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [10]:
# Compute the vocabulary and build inverse_vocab (token id -> token)

In [11]:
# Learn the vocabulary from the corpus, then keep the vocabulary list so that
# a token id can be mapped back to its token by list index.
vectorize_layer.adapt(text_ds.batch(1024))
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'вы', 'в', 'на', 'я', 'не', 'за', 'ваш', 'и', 'мочь', 'по', 'у', 'роутер', 'интернет', 'помочь', 'обращение', 'с', 'быть', 'наш']


In [12]:
# Vectorize the corpus in batches of 1024 lines, then flatten back to one
# vectorized sequence per element.
text_vector_ds = (
    text_ds
    .batch(1024)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)

In [13]:
# Materialize every vectorized sequence in memory as a NumPy array and report
# how many sequences were read.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

749130


In [14]:
# Sanity check: show the first five sequences next to their decoded tokens.
for seq in sequences[:5]:
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[118  31 464  71  23  39] => ['подсказать', 'как', 'добавить', 'устройство', 'к', 'договор']
[  17  182   17    2 1032  119] => ['с', 'который', 'с', 'вы', 'общаться', 'да']
[312 168   0   0   0   0] => ['ноутбук', 'возможно', '', '', '', '']
[433  17  60   3 108  39] => ['заходить', 'с', 'он', 'в', 'кабинет', 'договор']
[385   0   0   0   0   0] => ['мбс', '', '', '', '', '']


In [15]:
# Generate targets, contexts and labels with the function defined above

In [16]:
# Pass the configuration constants rather than repeating their literal values,
# so this call always stays in sync with the configuration cell.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_size)
print(len(targets), len(contexts), len(labels))

100%|██████████| 749130/749130 [03:40<00:00, 3403.32it/s]

687552 687552 687552





In [17]:
# Stack the per-example lists into dense NumPy arrays and report their shapes.
targets, contexts, labels = (
    np.array(values) for values in (targets, contexts, labels)
)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")



targets.shape: (687552,)
contexts.shape: (687552, 5)
labels.shape: (687552, 5)


In [33]:
# Pair the (target, context) inputs with their labels, then shuffle and batch.
# drop_remainder=True discards the final partial batch so every batch holds
# exactly BATCH_SIZE examples.
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int32, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [34]:
# Only prefetch here. A cache() placed after shuffle()/batch() stores the
# batches produced during the first epoch and replays them unchanged in every
# later epoch, which cancels the per-epoch reshuffle. The data was built with
# from_tensor_slices and already lives in memory, so caching adds nothing.
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int32, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [35]:
# Define a Model subclass implementing the network that trains the word2vec embeddings

In [36]:
class Word2Vec(Model):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two embedding tables of identical shape: one for target words
    (layer name ``w2v_embedding``) and one for context words. The forward pass
    scores every candidate context word against the target word with a dot
    product of their embeddings.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the target and context embedding layers.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_dim: Length of each embedding vector.
        """
        super().__init__()
        # `input_length` is deliberately not passed. It is only shape metadata
        # (and deprecated in recent Keras releases), and supplying it forced
        # this constructor to read the notebook-global `num_ns`.
        self.target_embedding = Embedding(vocab_size,
                                          embedding_dim,
                                          name="w2v_embedding")
        self.context_embedding = Embedding(vocab_size,
                                           embedding_dim)

    def call(self, pair):
        """Return one logit per candidate context word.

        Args:
            pair: Tuple ``(target, context)`` of token-id tensors. The einsum
                below needs the embedded target to be rank 2 and the embedded
                context to be rank 3, i.e. ``target`` shaped (batch,) and
                ``context`` shaped (batch, candidates).

        Returns:
            Tensor shaped (batch, candidates) holding the dot product of the
            target embedding with each candidate context embedding.
        """
        target, context = pair
        we = self.target_embedding(target)      # (batch, embed)
        ce = self.context_embedding(context)    # (batch, candidates, embed)
        dots = tf.einsum('be,bce->bc', we, ce)
        return dots

In [37]:
def custom_loss(x_logit, y_true):
    """Element-wise sigmoid cross-entropy between logits and labels.

    Args:
        x_logit: Tensor forwarded as ``logits``.
        y_true: Tensor forwarded as ``labels``.

    Returns:
        The result of ``tf.nn.sigmoid_cross_entropy_with_logits``.

    NOTE(review): the parameters are ordered (logits, labels), while Keras
    calls loss functions as (y_true, y_pred) - confirm the order before
    passing this function to ``compile``.
    """
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=x_logit)
    return loss

In [None]:
# Select embedding dimension

In [40]:
# embedding_dims = [64, 128, 256, 512]

In [41]:
# results = {}
# for embedding_dim in embedding_dims:
#     print("embedding_dim value: ", embedding_dim)
#     w2v = Word2Vec(vocab_size, embedding_dim)
#     w2v.compile(optimizer='adam',
#                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
#                 metrics=['accuracy'])
#     w2v.fit(dataset, epochs=20)
#     print("Evaluate")
#     _, results[embedding_dim] = w2v.evaluate(dataset)
# results

In [42]:
# Embedding vector length used for the model trained below.
embedding_dim = 256

In [43]:
# Build the model and train it with softmax cross-entropy over the raw logits
# (from_logits=True): the model outputs unnormalized dot products, one per
# candidate context word.
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [44]:
# Train on the batched dataset for 10 epochs.
word2vec.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4361f93ca0>

In [45]:
# Extract the trained target-embedding matrix (first weight of the layer named
# 'w2v_embedding') and the vocabulary list whose indices are the token ids.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Save the embedding vectors to files

In [None]:
# Export the embeddings as two TSV files with matching line order: one
# tab-separated vector per line, and the corresponding token per line.
# The context manager closes both files even if a write fails part-way.
with io.open('vectors_norm_sentence_length_6_256_v2.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata_norm_sentence_length_6_256_v2.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")