In [None]:
# https://petamind.com/word2vec-with-tensorflow-2-0-a-simple-cbow-implementation/

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive so that files
# stored there (e.g. the corpus read later in this notebook) can be opened
# through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
import numpy as np
print(tf.__version__)
# Example output (the version string depends on the installed TensorFlow):
#   TensorFlow 2.x selected.
#   2.0.0-rc2

2.8.0


In [None]:
class Word2Vec:
  """Minimal two-layer word2vec-style model trained with softmax cross-entropy.

  A one-hot input word is projected through a linear embedding layer
  (``W1``, ``b1``) and a linear output layer (``W2``, ``b2``); the softmax of
  the output is compared against the one-hot target word.

  Args:
    vocab_size: width of the one-hot input and target vectors.
    embedding_dim: number of components of each learned word vector.
    optimizer: ``'adam'`` selects ``tf.optimizers.Adam()``; any other value
      selects ``tf.optimizers.SGD(learning_rate=0.1)``.
    epochs: number of gradient steps performed by each ``train`` call.
  """

  def __init__(self, vocab_size=0, embedding_dim=16, optimizer='sgd', epochs=1):
    self.vocab_size = vocab_size
    # Use the requested size instead of a fixed constant.
    self.embedding_dim = embedding_dim
    self.epochs = epochs
    if optimizer == 'adam':
      self.optimizer = tf.optimizers.Adam()
    else:
      self.optimizer = tf.optimizers.SGD(learning_rate=0.1)

  def _ensure_weights(self):
    """Create the trainable variables on first use; later calls keep them."""
    if hasattr(self, 'W1'):
      return
    self.W1 = tf.Variable(tf.random.normal([self.vocab_size, self.embedding_dim]))
    self.b1 = tf.Variable(tf.random.normal([self.embedding_dim]))  # bias
    self.W2 = tf.Variable(tf.random.normal([self.embedding_dim, self.vocab_size]))
    self.b2 = tf.Variable(tf.random.normal([self.vocab_size]))

  def train(self, x_train=None, y_train=None):
    """Run ``self.epochs`` full-batch gradient steps on one batch of word pairs.

    The weights are created on the first call and reused afterwards, so
    calling ``train`` repeatedly (e.g. once per sentence) keeps refining the
    same embeddings instead of restarting from random values.

    Args:
      x_train: float array/tensor of shape (num_pairs, vocab_size) holding the
        one-hot input words.
      y_train: float array/tensor of the same shape holding the one-hot
        target words.

    Returns:
      None. Does nothing when either argument is ``None`` or the batch has no
      rows, because there is nothing to learn from.
    """
    if x_train is None or y_train is None or len(x_train) == 0:
      return
    self._ensure_weights()
    variables = [self.W1, self.b1, self.W2, self.b2]
    for epoch in range(self.epochs):
      with tf.GradientTape() as tape:
        hidden_layer = tf.add(tf.matmul(x_train, self.W1), self.b1)
        logits = tf.add(tf.matmul(hidden_layer, self.W2), self.b2)
        # Fused op computes -sum(y * log_softmax(logits)) per row without
        # ever taking log(0), which would otherwise yield NaN losses.
        cross_entropy_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_train, logits=logits))
      grads = tape.gradient(cross_entropy_loss, variables)
      self.optimizer.apply_gradients(zip(grads, variables))
      if epoch % 1000 == 0:
        print(cross_entropy_loss)

  def vectorized(self, word_idx):
    """Return the embedding of ``word_idx``: its row of ``W1`` plus ``b1``.

    Requires ``train`` to have created the weights first.
    """
    # Index first so the bias is added to one row, not the whole matrix.
    return self.W1[word_idx] + self.b1

In [None]:
def to_one_hot(data_point_index, vocab_size):
  """Return a float vector of length ``vocab_size`` that is 1 at ``data_point_index`` and 0 elsewhere."""
  encoded = np.zeros(vocab_size)
  encoded[data_point_index] = 1
  return encoded

In [None]:
import re
import os
import numpy as np

# Build the vocabulary and train the model sentence by sentence on the first
# `max_num_lines` lines of the corpus file.
cwd = os.getcwd()
print(cwd)

lines = []  # not used in this cell
count = 0  # number of corpus lines fully processed so far
max_num_lines = 1  # upper bound on the number of corpus lines to train on
vocab_size = 30000  # width of the one-hot vectors fed to the model
EMBEDDING_DIM = 5  # size of the learned word vectors
WINDOW_SIZE = 2  # neighbours taken on each side of the centre word
word2int = {}
int2word = {}

# Characters to delete before tokenising: everything that is not a word
# character, whitespace or a period, plus underscores. Whitespace and periods
# must survive because they drive the word and sentence splitting below.
pattern = re.compile(r'[^\w\s.]|_')

cbow = Word2Vec(vocab_size=vocab_size, embedding_dim=EMBEDDING_DIM, optimizer='adam', epochs=10)

with open("drive/MyDrive/wiki_corpus.txt", encoding="utf-8", errors="ignore") as infile:
    for line in infile:
        if count >= max_num_lines:
            break

        # Strip punctuation (this also removes quotes) and lower-case the text.
        corpus_raw = pattern.sub('', line).lower()
        # raw_sentences is a list of sentences.
        raw_sentences = corpus_raw.split('.')

        # Assign the next free integer id to every previously unseen word.
        for raw in raw_sentences:
            for word in raw.split():
                if word not in word2int:
                    index = len(word2int)
                    word2int[word] = index
                    int2word[index] = word

        # Stop reading once the vocabulary no longer fits the one-hot width:
        # ids >= vocab_size cannot be encoded.
        if len(word2int) >= vocab_size:
            break

        for raw in raw_sentences:
            sentence = raw.split()

            # Collect (centre word, neighbour) pairs inside the window.
            # Slicing clamps the upper bound at the end of the sentence.
            data = []
            for word_index, word in enumerate(sentence):
                start = max(word_index - WINDOW_SIZE, 0)
                for nb_word in sentence[start:word_index + WINDOW_SIZE + 1]:
                    if nb_word != word:
                        data.append([word, nb_word])

            # Empty or single-word sentences produce no pairs; skip them
            # explicitly instead of letting the training call fail.
            if not data:
                continue

            x_train = []  # input word
            y_train = []  # output word
            for data_word in data:
                x_train.append(to_one_hot(word2int[data_word[0]], vocab_size))
                y_train.append(to_one_hot(word2int[data_word[1]], vocab_size))

            # convert them to numpy arrays
            x_train = np.asarray(x_train, dtype='float32')
            y_train = np.asarray(y_train, dtype='float32')

            try:
                cbow.train(x_train, y_train)
            except Exception as err:
                # Best effort: report the failure and move on to the next
                # sentence. KeyboardInterrupt is deliberately not caught.
                print(f"Skipping sentence after training error: {err!r}")
                continue

        count += 1

/content
tf.Tensor(16.74611, shape=(), dtype=float32)
tf.Tensor(19.9341, shape=(), dtype=float32)
tf.Tensor(17.83647, shape=(), dtype=float32)
tf.Tensor(13.183455, shape=(), dtype=float32)
tf.Tensor(16.293875, shape=(), dtype=float32)
tf.Tensor(14.329293, shape=(), dtype=float32)
tf.Tensor(14.428323, shape=(), dtype=float32)
tf.Tensor(13.620128, shape=(), dtype=float32)
tf.Tensor(15.616964, shape=(), dtype=float32)
tf.Tensor(14.906727, shape=(), dtype=float32)
tf.Tensor(15.17925, shape=(), dtype=float32)
tf.Tensor(16.418581, shape=(), dtype=float32)
tf.Tensor(14.129124, shape=(), dtype=float32)
tf.Tensor(18.029535, shape=(), dtype=float32)
tf.Tensor(15.793638, shape=(), dtype=float32)
tf.Tensor(14.077219, shape=(), dtype=float32)
tf.Tensor(14.192761, shape=(), dtype=float32)
tf.Tensor(14.8830385, shape=(), dtype=float32)
tf.Tensor(21.92309, shape=(), dtype=float32)
tf.Tensor(15.562655, shape=(), dtype=float32)
tf.Tensor(14.279499, shape=(), dtype=float32)
tf.Tensor(16.408, shape=(), dt

In [None]:
# Compare the learned embeddings of two vocabulary words.
# NOTE(review): the variable names do not match the words being looked up
# ('democratic' and 'white'); they are kept unchanged in case other cells
# refer to them.
criminal_vector, black_vector = (
    cbow.vectorized(word2int[token]) for token in ('democratic', 'white')
)

# Cosine similarity: dot product of the two L2-normalised vectors.
normalize_a = tf.nn.l2_normalize(criminal_vector, axis=0)
normalize_b = tf.nn.l2_normalize(black_vector, axis=0)
cos_similarity = tf.reduce_sum(normalize_a * normalize_b)

# Squared Euclidean distance (no square root is taken).
dist = tf.reduce_sum(tf.square(criminal_vector - black_vector))

print(cos_similarity)
print(criminal_vector)
print(black_vector)
print(dist)

tf.Tensor(0.28569484, shape=(), dtype=float32)
tf.Tensor([-0.614982   -0.14089316 -1.5024987  -3.2958884   1.5584078 ], shape=(5,), dtype=float32)
tf.Tensor([-0.02100319  1.2640468   1.7935528  -1.8290117   0.05875313], shape=(5,), dtype=float32)
tf.Tensor(17.591314, shape=(), dtype=float32)
