# Using Word2Vec to create self-Supervised Embeddings

We will use _TensorFlow_ to build a 3-layer `Word2Vec` model and train word embeddings with it.

In [13]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as k
# `k` above is only a local alias; the import system resolves real module
# names, so sub-module imports must spell out the full `tensorflow.keras` path.
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers as tf_layer
from sklearn.manifold import TSNE
from sklearn import preprocessing
import matplotlib.pyplot as plt




ModuleNotFoundError: No module named 'numpy'

### *Cleaning up our Corpus* 

In [None]:
# Take the corpus and lower-case it so that capitalisation does not create
# separate vocabulary entries for the same word.
corpus = "nice food, amazing restaurant, too good, horrible sevice, highly disgusting, never recommending this to anyone"
corpus = corpus.lower()

# TODO : Use Tokenizer Instead

# Split on whitespace and strip the punctuation glued to each token
# ("food," -> "food"), so the keys match the tokens produced when the corpus
# is later split into sentences. Tokens that were pure punctuation are dropped.
words = []
for token in corpus.split():
    cleaned = token.strip('.,')
    if cleaned:
        words.append(cleaned)

# Build the lookup tables over *unique* words, numbered in first-seen order,
# so every index in [0, VOCAB_SIZE) maps to exactly one word.
word2int = {}
int2word = {}

for word in words:
    if word not in word2int:
        next_index = len(word2int)
        word2int[word] = next_index
        int2word[next_index] = word

# Number of distinct words (size of the one-hot vectors).
VOCAB_SIZE = len(word2int)

In [None]:
# Sanity-check both lookup tables: word -> index, then index -> word
index_of_nice = word2int["nice"]
print(index_of_nice)
word_at_five = int2word[5]
print(word_at_five)

In [None]:
# Split the corpus into comma-separated sentences, each stored as a list of
# whitespace-separated tokens.
sentences = [chunk.split() for chunk in corpus.split(",")]

print(sentences)

### *Generating Training Data*

The window size decides how many words on each side of a central word are treated as its context.

A window size of 2 means that we create up to 2x2=4 data points 
for each central word; each data point pairs the central word with one of 
the words that actually surround it.

If the central word is at or close to the start or end of a sentence, fewer pairs are formed.

e.g. : 

![Image](http://mccormickml.com/assets/word2vec/training_data.png)

In [None]:
# (central word, neighbouring word) pairs collected from every sentence
data = []

# number of words considered on each side of the central word
WINDOW_SIZE = 2

# Creating Pairs
for sentence in sentences:
    for index, word in enumerate(sentence):
        # clamp the window to the sentence boundaries
        window_start = max(index - WINDOW_SIZE, 0)
        window_end = min(index + WINDOW_SIZE + 1, len(sentence))
        for neighbour_index in range(window_start, window_end):
            # Skip the central word by *position*. Comparing the strings with
            # `is` depends on object identity (string interning), which is an
            # implementation detail and can wrongly drop a repeated word.
            if neighbour_index != index:
                data.append([word, sentence[neighbour_index]])

print(data)

### *Converting our Data into `One Hot Vectors`*

In [None]:
# containers for the training inputs and the training targets
x_train, y_train = [], []

def to_one_hot(data_point_index, VOCAB_SIZE):
    """Return a 1-D float array of length ``VOCAB_SIZE`` that is all zeros
    except for a 1 at position ``data_point_index``."""
    one_hot_vector = np.zeros(VOCAB_SIZE)
    one_hot_vector[data_point_index] = 1
    return one_hot_vector

# One-hot encode every pair: the first word of the pair becomes an input row,
# the second word becomes the matching target row. The results are stacked
# into numpy arrays.
x_train = np.asarray([to_one_hot(word2int[pair[0]], VOCAB_SIZE) for pair in data])
y_train = np.asarray([to_one_hot(word2int[pair[1]], VOCAB_SIZE) for pair in data])

print(x_train.shape, y_train.shape)
print(x_train[:10])

### *Creating Model*

![Model Structure](https://miro.medium.com/max/700/1*Os5hj9qg1t6sr0S3DF4gyA.jpeg)

In [None]:
# Graph inputs: one row per training pair, one column per vocabulary word.
# `x` is fed with x_train and `y_label` with y_train.
x = tf.placeholder(tf.float32, shape=[None, VOCAB_SIZE])
y_label = tf.placeholder(tf.float32, shape=[None, VOCAB_SIZE])

# Width of the hidden layer, i.e. the size of each embedding vector
EMBEDDING_DIM = 5

# First layer parameters, randomly initialised: embedding matrix and bias
W1 = tf.Variable(tf.random_normal([VOCAB_SIZE, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))

# Linear hidden layer (no activation): x @ W1 + b1
projected_input = tf.matmul(x, W1)
hidden_representations = tf.add(projected_input, b1)

### *Predicting the Neighbouring words to train Embeddings*

![2nd Layer Model](https://miro.medium.com/max/700/1*KxWiUoe-FXPpBdATP-IHOw.jpeg)

In [15]:
# Output layer parameters: project the hidden layer back to vocabulary size
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, VOCAB_SIZE]))
b2 = tf.Variable(tf.random_normal([VOCAB_SIZE]))

# Softmax over the vocabulary gives, for each input word, a probability for
# every candidate neighbouring word. The hidden-layer tensor is named
# `hidden_representations` (plural) where it is defined.
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representations, W2), b2))

### *Finally, to summarize*

![softmax](https://miro.medium.com/max/700/1*cnzY08TWRxG3lMKExbslHw.jpeg)

## *Training our Model*

In [None]:
# Build the complete training graph first, so that the initializer created
# below covers every variable the graph contains.

# define the loss function: mean cross-entropy between the one-hot targets and
# the softmax output. The probabilities are clipped away from 0 because
# tf.log(0) is -inf, and 0 * -inf would turn the whole loss into NaN.
clipped_prediction = tf.clip_by_value(prediction, 1e-10, 1.0)
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(clipped_prediction), axis=1))

# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iters = 10000
# report the loss only every LOG_EVERY iterations instead of flooding the
# cell output with one line per iteration
LOG_EVERY = 1000

# Creating and Initializing our Session
# (the session is left open on purpose: later cells read W1 and b1 from it)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# the full data set is fed on every step (batch gradient descent)
training_feed = {x: x_train, y_label: y_train}

# train for n_iter iterations
for iteration in range(1, n_iters + 1):
    sess.run(train_step, feed_dict=training_feed)

    if iteration % LOG_EVERY == 0 or iteration == n_iters:
        print('loss is : ', sess.run(cross_entropy_loss, feed_dict=training_feed))

### *Getting W1 and b1 Embeddings*

In [None]:
# Read the current values of the first-layer weights and bias from the
# session in a single call, then print them with a separator line after each.
w1_values, b1_values = sess.run([W1, b1])
separator = '----------'
print(w1_values)
print(separator)
print(b1_values)
print(separator)

In [None]:
# Embedding of word i = row i of W1 plus the bias b1 (b1 is broadcast over rows)
embedding_op = tf.add(W1, b1)
vectors = sess.run(embedding_op)
print(vectors.shape)
print(vectors)

In [None]:
# Look up the embedding row that belongs to a single vocabulary word
nice_index = word2int['nice']
print(vectors[nice_index])

In [None]:
# Functions to get closest Words
def euclidean_dist(vec1, vec2):
    """Return the Euclidean (L2) distance between ``vec1`` and ``vec2``."""
    return np.sqrt(np.sum((vec1 - vec2) ** 2))

def find_closest(word_index, vectors):
    """Return the index of the vector closest to ``vectors[word_index]``.

    Only the query position itself is excluded, so another word whose vector
    is identical to the query is a valid (distance 0) answer. Ties are
    resolved in favour of the lowest index.

    Parameters:
        word_index: position of the query vector inside ``vectors``.
        vectors: indexable sequence of equally shaped numeric vectors.

    Returns:
        The index of the nearest other vector, or -1 when ``vectors`` holds
        no vector besides the query.

    Raises:
        IndexError: if ``word_index`` is out of range for ``vectors``.
    """
    # Indexing a range normalises negative indices and raises IndexError for
    # out-of-range ones, so the position comparison below is always valid.
    query_index = range(len(vectors))[word_index]
    query_vector = vectors[query_index]

    min_dist = float('inf')  # a real infinity: no distance is ever "too large"
    min_index = -1
    for index, vector in enumerate(vectors):
        if index == query_index:
            continue
        dist = euclidean_dist(vector, query_vector)  # computed once per candidate
        if dist < min_dist:
            min_dist = dist
            min_index = index
    return min_index

In [None]:
# The query words must exist in the vocabulary built from the corpus;
# looking up a word that is not a key of word2int raises KeyError.
for query_word in ('nice', 'amazing', 'horrible'):
    closest_index = find_closest(word2int[query_word], vectors)
    print(int2word[closest_index])

# Reducing Dimensionality with t-SNE

In [None]:
# t-SNE requires perplexity < number of samples. The library default (30) is
# larger than this tiny vocabulary, so cap it at n_samples - 1.
tsne_perplexity = min(30, len(vectors) - 1)

model = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=0)
np.set_printoptions(suppress=True)
# NOTE: `vectors` is overwritten with its 2-D projection; re-run the cell that
# builds `vectors` from W1 and b1 before running this cell a second time.
vectors = model.fit_transform(vectors)

In [None]:
# normalizing values to show in matplotlib
# The norm is a constructor option of Normalizer; the second positional
# argument of fit_transform is `y`, which Normalizer ignores.
# NOTE(review): Normalizer rescales every row to unit L2 norm, so all 2-D
# points end up on the unit circle - confirm this is the intended scaling.
normalizer = preprocessing.Normalizer(norm='l2')
vectors = normalizer.fit_transform(vectors)

In [None]:
# Plotting 2D graph
fig, ax = plt.subplots()
for word in words:
    x_pos, y_pos = vectors[word2int[word]][0], vectors[word2int[word]][1]
    print(word, y_pos)
    # annotate() alone does not extend the axis limits, so labels outside the
    # default [0, 1] range would be invisible; drawing the point as well makes
    # the axes autoscale to include it.
    ax.scatter(x_pos, y_pos, color='tab:blue')
    ax.annotate(word, (x_pos, y_pos))

ax.set_title('Word embeddings (2-D projection)')
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')

plt.show()