# Word2Vec

We learn word embeddings for the words in the song lyrics using skip-gram word2vec.

In [35]:
import tensorflow as tf
import numpy as np
import utils.utils as utils
import os
import math
import re
import pickle

In [2]:
#Read the raw lyrics, then normalise their whitespace.

# Inner pass: every run of newlines becomes one newline followed by a space.
# Outer pass: every run of spaces is squeezed down to a single space.
doc = re.sub(
    ' +', ' ',
    re.sub('\n+', '\n ', utils.load_data(filepath='data/mj_lyrics.txt')),
)



In [20]:
#Constants for network

# Number of (target, context) pairs per training batch; Word2Vec.build also
# uses it to fix the shapes of the input and label placeholders.
batch_size = 256
# Width of each learned word vector; passed to Word2Vec.build.
embedding_dims = 64
# Number of negative samples for the NCE loss. The name is a misspelling of
# "neg_samples" but is kept because other cells refer to it by this name.
neg_smaples = 32
# NOTE(review): the training cell passes its own `epochs` value to train(),
# so this constant looks unused -- confirm before relying on it.
epochs = 1



In [4]:
#Create word2index and index2word mappings
# utils.word_idx_mappings is a project helper; as used in this notebook it
# yields the word->index map, the index->word map and the vocabulary size
# (the latter sizes the embedding matrix and the NCE weights/biases).
word2idx, idx2word, vocab_size = utils.word_idx_mappings(doc)

In [5]:
#Build skip-gram [target, context] index pairs using a window of one word on each side.

words = doc.lower().split()

skip_gram_pairs = []

# Slide a three-word window over the text: for every interior word emit one
# pair with its left neighbour and one with its right neighbour, in that order.
for left, centre, right in zip(words, words[1:], words[2:]):
    left_idx = word2idx[left]
    right_idx = word2idx[right]
    target_idx = word2idx[centre]

    skip_gram_pairs.append([target_idx, left_idx])
    skip_gram_pairs.append([target_idx, right_idx])



In [6]:
def get_skipgram_batch(batch_size, pairs=None):
    """Draw a random batch of skip-gram pairs without replacement.

    Args:
        batch_size: Maximum number of pairs to return. If fewer pairs are
            available, all of them are returned (in random order).
        pairs: Sequence of ``[target_index, context_index]`` pairs to sample
            from. Defaults to the notebook-level ``skip_gram_pairs``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a 1-D array of target indices and
        ``y`` is a column array (one row per sample) holding the matching
        context indices.
    """
    if pairs is None:
        pairs = skip_gram_pairs

    # A NumPy permutation replaces building and shuffling a Python list of
    # every index on each call; only the first `batch_size` entries are used.
    chosen = np.random.permutation(len(pairs))[:batch_size]

    x = np.array([pairs[i][0] for i in chosen])
    y = np.array([pairs[i][1] for i in chosen])

    # The labels are returned as a column, one row per sample.
    return x, y[:, np.newaxis]

In [7]:
class Word2Vec():
    """Skip-gram word2vec model trained with a noise-contrastive (NCE) loss.

    The class relies on notebook-level names being defined before it is used:
    ``batch_size`` and ``vocab_size`` when building the graph, and
    ``skip_gram_pairs`` / ``get_skipgram_batch`` when training.
    """

    def __init__(self):
        # Name of the variable scope that owns the model's variables.
        self.name = 'word2vec'

    def build(self, embedding_dims, neg_samples):
        """Create placeholders, the embedding matrix, the NCE loss and the train op.

        Args:
            embedding_dims: Size of each word vector.
            neg_samples: Number of negative classes the NCE loss samples.
        """
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):

            # One target word index per sample and one context index per sample.
            self._inputs = tf.placeholder(tf.int32, shape=[batch_size], name='inputs')
            self._labels = tf.placeholder(tf.int32, shape=[batch_size, 1], name='labels')

            with tf.name_scope('embeddings'):
                self.embeddings = tf.get_variable(
                    dtype=tf.float32,
                    initializer=tf.random_uniform(
                        shape=[vocab_size, embedding_dims], minval=-1.0, maxval=1.0),
                    name='embed_matrix')

                #lookup table
                self.embed = tf.nn.embedding_lookup(self.embeddings, self._inputs)

            with tf.name_scope('Loss'):

                self.nce_weights = tf.get_variable(
                    dtype=tf.float32,
                    shape=[vocab_size, embedding_dims],
                    name='nce_weignts',
                    initializer=tf.random_normal_initializer())

                self.nce_biases = tf.get_variable(
                    dtype=tf.float32,
                    shape=[vocab_size],
                    name='nce_biases',
                    initializer=tf.random_normal_initializer())

                # Use the `neg_samples` argument rather than a notebook global,
                # so the value handed to build() is the one that takes effect.
                self.loss = tf.reduce_mean(
                    tf.nn.nce_loss(
                        weights=self.nce_weights,
                        biases=self.nce_biases,
                        inputs=self.embed,
                        labels=self._labels,
                        num_sampled=neg_samples,
                        num_classes=vocab_size))

                global_step = tf.Variable(0, trainable=False)

                learning_rate = tf.train.exponential_decay(
                    learning_rate=1e-2,
                    global_step=global_step,
                    decay_steps=1000,
                    decay_rate=0.95,
                    staircase=True)

                # Passing global_step makes the optimizer increment it on every
                # update; without it the step stays at 0 and the learning rate
                # never decays.
                self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(
                    self.loss, global_step=global_step)

    def train(self, epochs):
        """Train the model and return the L2-normalised embedding matrix.

        Args:
            epochs: Number of passes; each pass runs
                ``len(skip_gram_pairs) // batch_size`` randomly drawn batches.

        Returns:
            The embedding matrix evaluated after training, with every row
            divided by its L2 norm.
        """
        with tf.Session() as s:

            # Variables are (re-)initialised on every call to train().
            s.run(tf.global_variables_initializer())

            steps = len(skip_gram_pairs) // batch_size

            for e in range(epochs):
                print(f'Epoch {e+1}:')
                for step in range(steps):
                    X_batch, y_batch = get_skipgram_batch(batch_size)

                    _ = s.run([self.train_step], feed_dict={self._inputs: X_batch, self._labels: y_batch})

                    # Report the loss on the batch just trained on, every 100 steps.
                    if step % 100 == 0:
                        tr_loss = s.run([self.loss], feed_dict={self._inputs: X_batch, self._labels: y_batch})
                        print(f'Step {step}, LOSS: {tr_loss}')

            #Normalize embeddings here

            norm = tf.sqrt(tf.reduce_sum(tf.square(self.embeddings), 1, keepdims=True))
            norm_embeddings = self.embeddings / norm

            norm_embeddings_matrix = s.run(norm_embeddings)

            return norm_embeddings_matrix
                    
                    
                
            
        

In [22]:
# Instantiate the model and construct its TensorFlow graph.
w2v = Word2Vec()

# Build with the embedding width and negative-sample count from the constants cell.
w2v.build(embedding_dims,neg_smaples)


In [23]:
# Train for 10 epochs; `em` receives the row-normalised embedding matrix
# returned by Word2Vec.train. The epoch count is given here explicitly rather
# than taken from the `epochs` constant.
em = w2v.train(epochs=10)

Epoch 1:
Step 0, LOSS: [138.04443]
Step 100, LOSS: [111.20104]
Step 200, LOSS: [77.61796]
Step 300, LOSS: [51.187477]
Epoch 2:
Step 0, LOSS: [65.94971]
Step 100, LOSS: [44.89686]
Step 200, LOSS: [71.28148]
Step 300, LOSS: [21.60888]
Epoch 3:
Step 0, LOSS: [39.668777]
Step 100, LOSS: [45.002]
Step 200, LOSS: [20.35807]
Step 300, LOSS: [18.900162]
Epoch 4:
Step 0, LOSS: [22.980154]
Step 100, LOSS: [5.3417177]
Step 200, LOSS: [16.045723]
Step 300, LOSS: [29.083668]
Epoch 5:
Step 0, LOSS: [7.6336718]
Step 100, LOSS: [6.8079696]
Step 200, LOSS: [5.772628]
Step 300, LOSS: [13.515457]
Epoch 6:
Step 0, LOSS: [21.24993]
Step 100, LOSS: [3.1834276]
Step 200, LOSS: [4.573825]
Step 300, LOSS: [3.5942945]
Epoch 7:
Step 0, LOSS: [6.5755787]
Step 100, LOSS: [3.9641469]
Step 200, LOSS: [21.544903]
Step 300, LOSS: [2.9790657]
Epoch 8:
Step 0, LOSS: [3.9184122]
Step 100, LOSS: [3.078453]
Step 200, LOSS: [2.7673883]
Step 300, LOSS: [2.9712257]
Epoch 9:
Step 0, LOSS: [9.139435]
Step 100, LOSS: [2.8860111]

In [29]:
# Look up the embedding of the query word.
ref_word = em[word2idx['childhood']]

# Rows of `em` were divided by their L2 norm in Word2Vec.train, so this dot
# product is the cosine similarity of every word with the query word
# (higher means more similar, despite the "dists" name).
cosine_dists = np.dot(em,ref_word)

# Sort by descending similarity, skip the top hit (presumably the query word
# itself) and keep the next nine indices.
wrds = np.argsort(cosine_dists)[::-1][1:10]

# Print each neighbour followed by its similarity score.
for w in wrds:
    print(idx2word[w])
    print(cosine_dists[w])


oh
0.8336346
thought
0.8287991
eyes
0.82837164
got
0.82503355
with
0.82464486
rhythm
0.8235712
a
0.8229039
baby,
0.81785923
devil
0.816272


In [34]:
# Persist the trained embedding matrix as an .npz archive.
# Create the output directory first so the write does not fail on a fresh checkout.
os.makedirs('data/word2vec', exist_ok=True)
# The context manager closes the file handle once the archive is written;
# the array is passed positionally, as before.
with open('data/word2vec/mj_lyrics_w2v.npz', 'wb') as handle:
    np.savez(handle, em)

In [36]:
# Pickle both vocabulary mappings next to the saved embeddings.
for path, mapping in (
    ('data/word2vec/word2idx.pkl', word2idx),
    ('data/word2vec/idx2word.pkl', idx2word),
):
    with open(path, 'wb') as handle:
        pickle.dump(mapping, handle)