In [23]:
pip install -U tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
import tensorflow as tf
import tensorflow_text as tf_txt
from tensorflow.keras import layers

from collections import Counter
import numpy as np
import math

import tqdm
import datetime

In [25]:
# Mount Google Drive (Colab only). The corpus file and the summary log
# directory used in this notebook live under /content/drive, so the mount must
# happen before those cells run on a fresh runtime.
import os
from google.colab import drive

drive.mount("/content/drive")
# No os.chdir is performed: the Drive paths in this notebook are absolute.

In [26]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the text has been read; the encoding is stated explicitly instead
# of relying on the platform default.
with open('/content/drive/MyDrive/Colab Notebooks/Week10/bible.txt', 'r', encoding='utf-8') as corpus_file:
  bible = corpus_file.read()

**Word Embeddings**

**Preprocessing**

In [27]:
def normalizetext(text):
  """Blank out newlines and basic punctuation, then lower-case the text.

  Every newline and every occurrence of one of ``. ; , : ? ! -`` is replaced
  by a single space (consecutive marks therefore yield consecutive spaces).

  Args:
    text: the string to normalize.

  Returns:
    The normalized, lower-cased string.
  """
  blanked_chars = '\n.;,:?!-'
  # One translation table maps each unwanted character to a space in a single pass.
  to_space = str.maketrans(blanked_chars, ' ' * len(blanked_chars))
  return text.translate(to_space).lower()

In [28]:
def subsample_formula(count_w, total, s=0.001):
  """Compute the subsampling probability of a word from its corpus frequency.

  With ``z = count_w / total`` the result is
  ``min(1, (sqrt(z / s) + 1) * s / z)``.

  Args:
    count_w: number of occurrences of the word (must be non-zero).
    total: total number of tokens in the corpus.
    s: subsampling threshold; smaller values lower the result for frequent words.

  Returns:
    The integer 1 when the formula reaches or exceeds 1, otherwise a float
    strictly below 1.
  """
  frequency = count_w / total
  probability = (np.sqrt(frequency / s) + 1) * s / frequency
  return min(1, probability)

In [29]:
def subsample(tokens):
  """Randomly drop frequent tokens and encode the remaining ones as integer ids.

  Every distinct token receives an id in order of first appearance. A token
  whose ``subsample_formula`` value is below 1 is kept at each occurrence only
  with that probability; all other tokens are always kept.

  Args:
    tokens: object whose ``.numpy()`` returns the token sequence
      (presumably the 1-D string tensor produced by the tokenizer — confirm
      against the caller).

  Returns:
    A tuple ``(subsampled_tokens, dic, dic_reverse)``:
      subsampled_tokens: list with the ids of the kept tokens, in corpus order.
      dic: mapping token -> id for every distinct token, including tokens
        whose occurrences were all dropped.
      dic_reverse: mapping id -> token.
  """
  # Convert to NumPy once; calling .numpy() on every element of the tensor is
  # far slower and yields the same values.
  token_values = tokens.numpy()

  counter = Counter(token_values)
  total = sum(counter.values())

  dic = {}
  dic_reverse = {}
  probabilities = {}  # only tokens that can actually be dropped (p < 1)

  for n, token in enumerate(counter):
    dic[token] = n
    dic_reverse[n] = token
    p = subsample_formula(counter[token], total)
    if p != 1:
      probabilities[token] = p

  subsampled_tokens = []
  for token in token_values:
    p = probabilities.get(token)
    # A random number is drawn only for droppable tokens.
    if p is None or p >= np.random.rand():
      subsampled_tokens.append(dic[token])

  return subsampled_tokens, dic, dic_reverse

In [30]:
def create_data(text, c = 4):
  """Turn raw text into skip-gram training data.

  The text is normalized, tokenized, subsampled and encoded as integer ids.
  For every position in the resulting id sequence the ids found up to ``c``
  positions to the left and right are collected as that position's context.

  Args:
    text: raw corpus string.
    c: context window radius (number of neighbours taken on each side).

  Returns:
    A tuple ``(dataset, targets, dic, dic_reverse)``:
      dataset: ``tf.data.Dataset`` yielding the token ids in corpus order.
      targets: dict mapping each position to the list of its context ids
        (the list is empty when the position has no neighbours).
      dic: mapping token -> id, as returned by ``subsample``.
      dic_reverse: mapping id -> token, as returned by ``subsample``.
  """
  tokenizer = tf_txt.UnicodeScriptTokenizer()
  # normalizetext is idempotent, so a single pass over the corpus is enough.
  tokens = tokenizer.tokenize(normalizetext(text))

  tokens, dic, dic_reverse = subsample(tokens)

  targets = {}
  for i in range(len(tokens)):
    # Slices clip at the sequence end; max() stops a negative start from wrapping around.
    targets[i] = tokens[max(0, i - c):i] + tokens[i + 1:i + c + 1]

  return tf.data.Dataset.from_tensor_slices(tokens), targets, dic, dic_reverse

In [31]:
# Build the training data from the loaded corpus: the dataset of token ids, the
# per-position context targets, and the token <-> id vocabularies.
dataset, targets, dic, dic_reverse = create_data(bible)

In [33]:
# BATCH_SIZE is also used by the training loop to compute each batch's offset
# into `targets`, so the dataset has to be batched with this exact value.
BATCH_SIZE = 128
ds = dataset.batch(BATCH_SIZE).prefetch(10)

**Model**

In [34]:
class skipgram(tf.keras.layers.Layer):
  """Skip-gram layer trained with noise-contrastive estimation (NCE).

  Holds an input embedding table (``w_matrix`` plus an additive ``w_bias``)
  and the output-side parameters (``s_matrix``, ``s_bias``) consumed by
  ``tf.nn.nce_loss``.

  Args:
    vocabsize: number of distinct token ids.
    embsize: dimensionality of the embedding vectors.
    neg_samples: number of classes sampled per batch by the NCE loss.
  """

  def __init__(self, vocabsize, embsize=64, neg_samples=500):
    super().__init__()
    self.vocabsize = vocabsize
    self.embsize = embsize
    self.neg_samples = neg_samples

  def build(self, _):
    """Create the trainable weights; the input shape argument is ignored."""
    # Input side: one embedding row per token id and a bias shared by all rows.
    self.w_matrix = self.add_weight(
        shape=(self.vocabsize, self.embsize),
        initializer='random_normal',
        trainable=True)
    self.w_bias = self.add_weight(
        shape=[self.embsize],
        initializer='random_normal',
        trainable=True)

    # Output side: class weights and per-class biases for the NCE loss.
    self.s_matrix = self.add_weight(
        shape=(self.vocabsize, self.embsize),
        initializer='random_normal',
        trainable=True)
    self.s_bias = self.add_weight(
        shape=[self.vocabsize],
        initializer='random_normal',
        trainable=True)

  def embedding(self, input):
    """Return the embedding row(s) of ``input`` with ``w_bias`` added."""
    rows = tf.nn.embedding_lookup(self.w_matrix, input)
    return rows + self.w_bias

  def call(self, input, target):
    """Return the mean NCE loss of predicting ``target`` ids from ``input`` ids."""
    per_example_loss = tf.nn.nce_loss(
        weights=self.s_matrix,
        biases=self.s_bias,
        labels=target,
        inputs=self.embedding(input),
        num_sampled=self.neg_samples,
        num_classes=self.vocabsize)
    return tf.reduce_mean(per_example_loss)


**Training**

In [35]:
def get_targets(word_n, targets, length=1):
  """Draw one random context id for each of ``length`` consecutive positions.

  Args:
    word_n: first position to draw a target for.
    targets: mapping position -> list of candidate context ids.
    length: number of consecutive positions, starting at ``word_n``.

  Returns:
    Tensor holding the drawn ids with a trailing axis of size 1 appended.
  """
  positions = range(word_n, word_n + length)
  drawn = [np.random.choice(targets[position]) for position in positions]
  return tf.expand_dims(drawn, -1)

In [36]:
def cosine_similarity(model, word, k= 5):
  """Return the ``k`` vocabulary words most similar to ``word``.

  Similarity is the cosine between the embeddings returned by
  ``model.embedding``. Candidates are all ids in the module-level
  ``dic_reverse`` except ``word`` itself.

  Args:
    model: object exposing ``embedding(ids)``.
    word: integer id of the query word.
    k: number of neighbours to report; fewer are returned if the vocabulary
      is smaller, none if ``k <= 0``.

  Returns:
    Scalar string tensor listing the neighbours, most similar first, each
    followed by ``', '``.
  """
  candidate_ids = np.array([i for i in dic_reverse if i != word], dtype=np.int64)
  if k <= 0 or candidate_ids.size == 0:
    return tf.constant(b'')

  # One batched lookup instead of a separate TensorFlow op per vocabulary entry.
  query = np.asarray(model.embedding(word))
  candidates = np.asarray(model.embedding(candidate_ids))

  similarities = (candidates @ query) / (
      np.linalg.norm(candidates, axis=1) * np.linalg.norm(query))

  # Rank by index rather than keying a dict on the similarity value, so that
  # equal scores cannot overwrite each other. The stable sort keeps vocabulary
  # order among ties.
  best = np.argsort(-similarities, kind='stable')[:k]

  text = b''.join(dic_reverse[int(candidate_ids[j])] + b', ' for j in best)
  return tf.constant(text)

In [37]:
@tf.function
def train_step(model, input, target, optimizer):
  """Run one optimization step and return the loss.

  Args:
    model: callable returning a scalar loss for ``(input, target)`` and
      exposing ``trainable_variables``.
    input: batch of input token ids.
    target: batch of target token ids.
    optimizer: optimizer used to apply the gradients.

  Returns:
    The loss computed for this batch, before the update is applied.
  """
  # Only the forward pass needs to be recorded by the tape.
  with tf.GradientTape() as tape:
    loss = model(input, target)

  # Differentiate and update outside the tape context so these ops are not traced.
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  return loss

In [38]:
# Load the tensorboard IPython extension into this notebook session.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [39]:
# Directory on the mounted Drive that receives the summaries written during training.
file_path = "/content/drive/MyDrive/Colab Notebooks/Week10/Week10_train"
summary_writer = tf.summary.create_file_writer(logdir=file_path)

In [40]:
# Ids of the probe words whose nearest neighbours are logged after every epoch.
# Looking a word up in `dic` raises KeyError if it is not in the vocabulary.
test_words = [
  dic[w]
  for w in (b'holy', b'father', b'wine', b'poison', b'death', b'hope', b'killed')
]

In [41]:
# Reset Keras' global state so that re-running this cell starts from scratch.
tf.keras.backend.clear_session()

epochs = 10
learning_rate = 0.001

model = skipgram(len(dic))
optimizer = tf.optimizers.Adam(learning_rate)

train_losses = []

for epoch in range(epochs):
  batch_losses = []

  for i, batch in enumerate(ds):
    # Batch i covers positions starting at i*BATCH_SIZE; one random context
    # word is drawn as the label for each position in the batch.
    labels = get_targets(i*BATCH_SIZE, targets, batch.shape[0])
    train_loss = train_step(model, batch, labels, optimizer)
    batch_losses.append(train_loss)

  epoch_loss_agg = tf.reduce_mean(batch_losses)
  train_losses.append(epoch_loss_agg)

  with summary_writer.as_default():
    tf.summary.scalar(name='train_loss', data=epoch_loss_agg, step=epoch)
    for w in test_words:
      word = dic_reverse[w].decode('UTF-8')
      tf.summary.text(name = f'nearest_neighbours_of_{word}', data = cosine_similarity(model, w), step = epoch)

  # Format the float32 loss explicitly; np.round on the tensor still printed
  # the full float expansion instead of four decimals.
  print(f'Epoch: {epoch} ending with training loss of {float(epoch_loss_agg):.4f}')


Epoch: 0 ending with training loss of 110.44779968261719
Epoch: 1 ending with training loss of 6.44189977645874
Epoch: 2 ending with training loss of 6.43179988861084
Epoch: 3 ending with training loss of 6.438399791717529
Epoch: 4 ending with training loss of 6.452899932861328
Epoch: 5 ending with training loss of 6.466000080108643
Epoch: 6 ending with training loss of 6.449100017547607
Epoch: 7 ending with training loss of 6.4822998046875
Epoch: 8 ending with training loss of 6.509200096130371
Epoch: 9 ending with training loss of 6.523399829864502
