In [None]:
# https://petamind.com/word2vec-with-tensorflow-2-0-a-simple-cbow-implementation/

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
import numpy as np
print(tf.__version__)
##Output
#TensorFlow 2.x selected.
#2.0.0-rc2

2.5.0


In [2]:
class Word2Vec:
  def __init__(self, vocab_size=0, embedding_dim=16, optimizer='sgd', epochs=10000):
    self.vocab_size=vocab_size
    self.embedding_dim=5
    self.epochs=epochs
    if optimizer=='adam':
      self.optimizer = tf.optimizers.Adam()
    else:
      self.optimizer = tf.optimizers.SGD(learning_rate=0.1)
  
  def train(self, x_train=None, y_train=None):
    self.W1 = tf.Variable(tf.random.normal([self.vocab_size, self.embedding_dim]))
    self.b1 = tf.Variable(tf.random.normal([self.embedding_dim])) #bias
    self.W2 = tf.Variable(tf.random.normal([self.embedding_dim, self.vocab_size]))
    self.b2 = tf.Variable(tf.random.normal([self.vocab_size]))
    for _ in range(self.epochs):
      with tf.GradientTape() as t:
        #print(x_train, self.W1)
        hidden_layer = tf.add(tf.matmul(x_train,self.W1),self.b1)
        output_layer = tf.nn.softmax(tf.add( tf.matmul(hidden_layer, self.W2), self.b2))
        cross_entropy_loss = tf.reduce_mean(-tf.math.reduce_sum(y_train * tf.math.log(output_layer), axis=[1]))
      grads = t.gradient(cross_entropy_loss, [self.W1, self.b1, self.W2, self.b2])
      self.optimizer.apply_gradients(zip(grads,[self.W1, self.b1, self.W2, self.b2]))
      if(_ % 1000 == 0):
        print(cross_entropy_loss)
  
  def vectorized(self, word_idx):
    return (self.W1+self.b1)[word_idx]

In [18]:
import re


corpus_file = open("wiki_corpus.txt")
lines = []
i = 0
max_num_lines = 10
for line in corpus_file:
    if i > max_num_lines:
        break
    
    lines.append(line)
    i += 1

corpus_raw = '.'.join(lines)

corpus_raw = corpus_raw.replace("'", "")
pattern = re.compile('[\W_-–]+')
pattern.sub('', corpus_raw)
# print(corpus_raw)
# convert to lower case
corpus_raw = corpus_raw.lower()
# raw sentences is a list of sentences.
raw_sentences = corpus_raw.split('.')
# print(raw_sentences)
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())
# print(sentences)
#sentences:
data = []
WINDOW_SIZE = 2
for sentence in sentences:
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1] :
            if nb_word != word:
                data.append([word, nb_word])
words = set()
for i in raw_sentences:
    for word in i.split():
        words.add(word)
        
word2int = {}
int2word = {}
vocab_size = len(words) # gives the total number of unique words
for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word
def to_one_hot(data_point_index, vocab_size):
  temp = np.zeros(vocab_size)
  temp[data_point_index] = 1
  return temp
x_train = [] # input word
y_train = [] # output word
for data_word in data:
    x_train.append(to_one_hot(word2int[ data_word[0] ], vocab_size))
    y_train.append(to_one_hot(word2int[ data_word[1] ], vocab_size))
# convert them to numpy arrays
x_train = np.asarray(x_train, dtype='float32')
y_train = np.asarray(y_train, dtype='float32')

In [None]:
cbow = Word2Vec(vocab_size=vocab_size, optimizer='adam', epochs=10000)
cbow.train(x_train, y_train)

tf.Tensor(8.816945, shape=(), dtype=float32)
tf.Tensor(2.3829265, shape=(), dtype=float32)
tf.Tensor(1.7946837, shape=(), dtype=float32)
tf.Tensor(1.4863819, shape=(), dtype=float32)
tf.Tensor(1.396186, shape=(), dtype=float32)
tf.Tensor(1.3485827, shape=(), dtype=float32)
tf.Tensor(1.334331, shape=(), dtype=float32)
tf.Tensor(1.330012, shape=(), dtype=float32)
tf.Tensor(1.3283578, shape=(), dtype=float32)
tf.Tensor(1.3276052, shape=(), dtype=float32)


In [None]:
cbow.vectorized(word2int['criminal'])


<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-5.0990295 ,  1.1574619 , -0.52969515,  1.1401453 , -0.35821998],
      dtype=float32)>