In [81]:
import os
import glob
import imageio
import matplotlib.pyplot as plt
import numpy as np
import PIL
from tensorflow.keras import layers
import time
import tensorflow as tf
import random
import re
import string
import random
import tqdm
from IPython import display


import requests
import tensorflow as tf
import tensorflow_text as tf_text


In [101]:
# --- Configuration -----------------------------------------------------------
BATCH_SIZE = 1024        # examples per batch
num_ns = 4               # negative samples drawn per positive (target, context) pair
embedding_dim = 128      # length of every embedding vector
vocab_size = 1000        # maximum number of tokens kept in the vocabulary
sequence_length = 10     # every line is padded / truncated to this many tokens
window_size = 4          # skip-gram context window size
BUFFER_SIZE = 10000      # shuffle buffer size (in examples)
buffer_size = 500
# Fixed seed so that a Restart & Run All hands the same seed to the negative
# sampler every time (previously a fresh random.randint(0, 1000) per run, which
# made results impossible to reproduce).
SEED = 42

In [121]:
# 2.1 and 2.2: read the corpus as a dataset of text lines.

# Sentinel that lets tf.data tune the prefetch buffer size at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# The corpus file is resolved relative to the current working directory.
path_to_file = os.path.abspath("./" +'bible.txt')

# One dataset element per line of the file; zero-length lines are filtered out.
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda line: tf.cast(tf.strings.length(line), bool))
def custom_standardization(input_data):
  """Lowercase `input_data` and delete every `string.punctuation` character.

  Args:
    input_data: String tensor handed over by the text vectorization layer.

  Returns:
    The lowercased strings with all punctuation characters removed.
  """
  # Character class matching any single (regex-escaped) punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowered, punctuation_pattern, '')

# Turn each line into `sequence_length` integer token ids, keeping at most
# `vocab_size` distinct tokens.
vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int')

# Build the vocabulary from the corpus.
vectorize_layer.adapt(text_ds.batch(1024))

# Index -> token lookup list (position i holds the token with id i).
inverse_vocab = vectorize_layer.get_vocabulary()

# Vectorize batch-wise, then flatten back to one sequence per element.
text_vector_ds = (
    text_ds
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)

# Materialise every vectorized line as a NumPy array in a Python list.
sequences = list(text_vector_ds.as_numpy_iterator())


In [84]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Create skip-gram training examples with negative sampling.

  For every positive (target, context) skip-gram pair found in `sequences`,
  `num_ns` negative context candidates are drawn and one training example is
  emitted.

  Args:
    sequences: Iterable of integer-encoded sentences.
    window_size: Skip-gram window size forwarded to `skipgrams`.
    num_ns: Number of negative samples drawn per positive pair.
    vocab_size: Vocabulary size; used for the sampling table, as
      `vocabulary_size` for `skipgrams` and as `range_max` for the sampler.
    seed: Seed forwarded to the candidate sampler.

  Returns:
    A tuple `(targets, contexts, labels)` of equally long lists:
      * targets: the target word id of each example,
      * contexts: int64 tensors of shape (num_ns + 1, 1) holding the positive
        context id first, followed by the sampled negatives,
      * labels: int64 tensors `[1, 0, ..., 0]` of length num_ns + 1.
  """
  # One entry is appended to each list per training example.
  targets, contexts, labels = [], [], []

  # Sampling table covering `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Walk over every sentence, showing a progress bar.
  for sequence in tqdm.tqdm(sequences):

    # Positive pairs only (negative_samples=0); negatives are drawn below.
    positive_pairs, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    for target_word, context_word in positive_pairs:
      # The true context id as a (1, 1) int64 tensor.
      true_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)

      negatives, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=true_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Give the negatives a trailing axis so they stack under the true class.
      negatives = tf.expand_dims(negatives, 1)

      # Positive context first, then the negatives; labels mark the positive.
      context = tf.concat([true_class, negatives], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [103]:
# 2.3
# Using the freedom granted in the task to use an alternative implementation of
# SkipGram and, for that, adding a label to the input format.

# NOTE(review): window_size is hard-coded to 2 here although the configuration
# cell defines window_size = 4 -- confirm which value is intended. It is left
# unchanged so the generated training data stays the same.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
# The stacked contexts are rank 3; keep index 0 of the last axis so that each
# row holds the context ids of one example.
contexts = np.array(contexts)[:, :, 0]
labels = np.array(labels)

# Each element is ((target, context_ids), labels).
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# prefetch() counts dataset elements, which are whole batches at this point, so
# reusing the shuffle buffer size (10000) as prefetch depth was unintended; let
# tf.data pick the depth instead.
dataset = dataset.cache().prefetch(tf.data.experimental.AUTOTUNE)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74644/74644 [00:09<00:00, 8256.18it/s]


In [104]:



class Word2Vec(tf.keras.Model):
  """Skip-gram model that scores a target word against candidate context words.

  Two separate embedding tables are kept: one for target words (the layer named
  "w2v_embedding", so it can be looked up by that name) and one for context
  words.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create the target and context embedding tables.

    Args:
      vocab_size: Number of rows in each embedding table.
      embedding_dim: Length of every embedding vector.
    """
    super().__init__()
    # `input_length` is deliberately not passed: it is only a static shape hint
    # that this subclassed model never relies on, it is deprecated in newer
    # Keras versions, and it previously made the constructor depend on the
    # module-level `num_ns` variable.
    self.target_embedding = layers.Embedding(vocab_size,
                                             embedding_dim,
                                             name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                              embedding_dim)

  def call(self, pair):
    """Compute one dot-product score per (target, context candidate).

    Args:
      pair: Tuple `(target, context)`. `target` holds one word id per example
        (a rank-2 target has its axis 1 squeezed away); `context` holds the
        candidate context ids of each example.

    Returns:
      Tensor with one raw (unnormalised) score per example and context
      candidate, i.e. laid out as (batch, candidates).
    """
    target, context = pair
    # Accept targets with an extra axis 1 by dropping that axis.
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    word_emb = self.target_embedding(target)
    context_emb = self.context_embedding(context)

    # b = batch, c = context candidate, e = embedding dimension.
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    return dots

In [105]:
# 2.4
# Build the model and configure training. The model outputs raw dot products,
# hence the loss is told to expect logits.
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [106]:
# Train using the training loop built into the fit() method
# (20 passes over `dataset`).
word2vec.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd7282f3100>

In [107]:
# Fetch the target-embedding layer by its name; get_weights() returns a list
# and its first entry is taken as the embedding table.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
# Sanity check: the captured output shows (1000, 128).
print(weights.shape)

(1000, 128)


In [116]:
def calculateDistance(position1, position2):
    """Return the squared Euclidean distance between two vectors.

    Only the first ``len(position1)`` components of ``position2`` are read.
    No square root is taken, so the value is suitable for comparing
    distances but is not the Euclidean distance itself.
    """
    total = 0
    for axis in range(len(position1)):
        delta = position1[axis] - position2[axis]
        total = total + delta * delta
    return total

def findeNN(index):
    """Return the row index of ``weights`` that lies closest to row ``index``.

    Closeness is measured with ``calculateDistance``. Row 0 and the query row
    itself are never returned (row 0 is presumably the padding token of the
    vectorizer -- confirm). On ties the lowest eligible row index wins.
    """
    # Start with the first eligible row: row 1, or row 2 if the query is row 1.
    nearest = 2 if index == 1 else 1
    best_distance = calculateDistance(weights[index], weights[nearest])

    for candidate in range(len(weights)):
        # Skip the query row and row 0.
        if candidate == index or candidate == 0:
            continue
        candidate_distance = calculateDistance(weights[index], weights[candidate])
        if best_distance > candidate_distance:
            best_distance = candidate_distance
            nearest = candidate
    return nearest

def printNN(string):
    """Print the vocabulary word whose embedding is nearest to ``string``.

    Args:
        string: Word to look up in ``inverse_vocab``. (The parameter name
            shadows the ``string`` module inside this function only.)

    Raises:
        ValueError: If ``string`` is not part of the vocabulary. Previously
            this case surfaced as an UnboundLocalError on ``number``.
    """
    try:
        number = inverse_vocab.index(string)
    except ValueError:
        raise ValueError("'" + str(string) + "' is not in the vocabulary") from None
    closest = inverse_vocab[findeNN(number)]
    print("for '" + str(string) + "' the nearest neighbours is '" + str(closest) +"'")

In [117]:
# Show the nearest neighbour for a handful of probe words.
for _probe_word in ('he', 'father', 'water', 'old', 'strong', 'day'):
    printNN(_probe_word)

for 'he' the nearest neighbours is 'she'
for 'father' the nearest neighbours is 'master'
for 'water' the nearest neighbours is 'wilderness'
for 'old' the nearest neighbours is 'years'
for 'strong' the nearest neighbours is 'wine'
for 'day' the nearest neighbours is 'year'
