## Word2Vec

In [1]:
import collections
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Configuration
batch_size = 20     # number of skip-gram pairs fed to the graph per training step
random_seed = 42    # fixed seed so the sampled mini-batches repeat across runs

# Mini-batches are drawn with np.random, so seeding NumPy here makes a
# "Restart & Run All" sample the same batches every time.
# NOTE: this does not seed TensorFlow's own random ops (variable
# initialisation, negative sampling), so loss values can still vary.
np.random.seed(random_seed)

In [0]:
# Dimension of the embedding vector. Two is too small to get
# any meaningful embeddings, but let's make it 2 for simple visualization.
embedding_size = 2
num_sampled = 15    # Number of negative examples to sample (passed to tf.nn.nce_loss).

### Input some sentences

In [0]:
# Toy corpus: one short English sentence per list entry.
sentences = [
    "I have something that I want to say to him",
    "How are you",
    "We can see many stars tonight",
    "That's our house",
    "sung likes cats",
    "she loves dogs",
    "Do you know what he has done",
    "cats are great companions when they want to be",
    "We need to invest in clean, renewable energy",
    "women love his man",
    "queen love his king",
    "girl love his boy",
    "The line is too long. Why don't you come back tomorrow",
    "man and women roam in park",
    "Does it really matter",
    "dynasty king remain mortal",
]

### Convert those sentences into words

In [0]:
# Flatten the corpus into a single token stream by splitting each sentence
# on whitespace (case and punctuation are left untouched).
words = [token for sentence in sentences for token in sentence.split()]

### List of words

In [6]:
# Display the full token stream produced by the whitespace split
words

['I',
 'have',
 'something',
 'that',
 'I',
 'want',
 'to',
 'say',
 'to',
 'him',
 'How',
 'are',
 'you',
 'We',
 'can',
 'see',
 'many',
 'stars',
 'tonight',
 "That's",
 'our',
 'house',
 'sung',
 'likes',
 'cats',
 'she',
 'loves',
 'dogs',
 'Do',
 'you',
 'know',
 'what',
 'he',
 'has',
 'done',
 'cats',
 'are',
 'great',
 'companions',
 'when',
 'they',
 'want',
 'to',
 'be',
 'We',
 'need',
 'to',
 'invest',
 'in',
 'clean,',
 'renewable',
 'energy',
 'women',
 'love',
 'his',
 'man',
 'queen',
 'love',
 'his',
 'king',
 'girl',
 'love',
 'his',
 'boy',
 'The',
 'line',
 'is',
 'too',
 'long.',
 'Why',
 "don't",
 'you',
 'come',
 'back',
 'tomorrow',
 'man',
 'and',
 'women',
 'roam',
 'in',
 'park',
 'Does',
 'it',
 'really',
 'matter',
 'dynasty',
 'king',
 'remain',
 'mortal']

### Count the occurrences of each word

In [0]:
# List of (word, frequency) tuples ordered from most to least frequent
count = collections.Counter(words).most_common()

### Build the dictionary lookup tables (word → id and id → word)

In [0]:
# Vocabulary lookup tables derived from the frequency-ranked word list
reverse_dictionary = [word for word, _freq in count]  # id -> word (list position is the id)
dic = dict(zip(reverse_dictionary, range(len(reverse_dictionary))))  # word -> id
voc_size = len(dic)  # number of distinct tokens

In [9]:
# Display the id -> word table (a word's list position is its id)
reverse_dictionary

['to',
 'you',
 'love',
 'his',
 'I',
 'want',
 'are',
 'We',
 'cats',
 'in',
 'women',
 'man',
 'king',
 'have',
 'something',
 'that',
 'say',
 'him',
 'How',
 'can',
 'see',
 'many',
 'stars',
 'tonight',
 "That's",
 'our',
 'house',
 'sung',
 'likes',
 'she',
 'loves',
 'dogs',
 'Do',
 'know',
 'what',
 'he',
 'has',
 'done',
 'great',
 'companions',
 'when',
 'they',
 'be',
 'need',
 'invest',
 'clean,',
 'renewable',
 'energy',
 'queen',
 'girl',
 'boy',
 'The',
 'line',
 'is',
 'too',
 'long.',
 'Why',
 "don't",
 'come',
 'back',
 'tomorrow',
 'and',
 'roam',
 'park',
 'Does',
 'it',
 'really',
 'matter',
 'dynasty',
 'remain',
 'mortal']

In [10]:
# Display the word -> id table
dic

{'Do': 32,
 'Does': 64,
 'How': 18,
 'I': 4,
 "That's": 24,
 'The': 51,
 'We': 7,
 'Why': 56,
 'and': 61,
 'are': 6,
 'back': 59,
 'be': 42,
 'boy': 50,
 'can': 19,
 'cats': 8,
 'clean,': 45,
 'come': 58,
 'companions': 39,
 'dogs': 31,
 "don't": 57,
 'done': 37,
 'dynasty': 68,
 'energy': 47,
 'girl': 49,
 'great': 38,
 'has': 36,
 'have': 13,
 'he': 35,
 'him': 17,
 'his': 3,
 'house': 26,
 'in': 9,
 'invest': 44,
 'is': 53,
 'it': 65,
 'king': 12,
 'know': 33,
 'likes': 28,
 'line': 52,
 'long.': 55,
 'love': 2,
 'loves': 30,
 'man': 11,
 'many': 21,
 'matter': 67,
 'mortal': 70,
 'need': 43,
 'our': 25,
 'park': 63,
 'queen': 48,
 'really': 66,
 'remain': 69,
 'renewable': 46,
 'roam': 62,
 'say': 16,
 'see': 20,
 'she': 29,
 'something': 14,
 'stars': 22,
 'sung': 27,
 'that': 15,
 'they': 41,
 'to': 0,
 'tomorrow': 60,
 'tonight': 23,
 'too': 54,
 'want': 5,
 'what': 34,
 'when': 40,
 'women': 10,
 'you': 1}

In [11]:
# Encode the token stream as a list of vocabulary ids
data = [dic[token] for token in words]
print('Sample data', data[:10], [reverse_dictionary[idx] for idx in data[:10]])

Sample data [4, 13, 14, 15, 4, 5, 0, 16, 0, 17] ['I', 'have', 'something', 'that', 'I', 'want', 'to', 'say', 'to', 'him']


In [12]:
# CBOW training triples with a window of one word on each side:
# each entry is [[left_id, right_id], center_id], e.g. for "the quick brown fox"
# ([the, brown], quick), ([quick, fox], brown), ...
# NOTE: the window slides over the flattened id stream, so contexts can
# straddle sentence boundaries; the first and last ids never act as centers.
cbow_pairs = [
    [[left, right], center]
    for left, center, right in zip(data, data[1:], data[2:])
]
print('Context pairs', cbow_pairs[:10])

Context pairs [[[4, 14], 13], [[13, 15], 14], [[14, 4], 15], [[15, 5], 4], [[4, 0], 5], [[5, 16], 0], [[0, 0], 16], [[16, 17], 0], [[0, 18], 17], [[17, 6], 18]]


In [13]:
# Skip-gram pairs: every CBOW triple yields one [center, neighbor] pair per
# context word, left neighbor first.
# (quick, the), (quick, brown), (brown, quick), (brown, fox), ...
skip_gram_pairs = [
    [center, neighbor]
    for context, center in cbow_pairs
    for neighbor in context
]
print('skip-gram pairs', skip_gram_pairs[:5])

skip-gram pairs [[13, 4], [13, 14], [14, 13], [14, 15], [15, 14]]


In [14]:
def generate_batch(size):
    """Draw a random mini-batch of skip-gram training pairs.

    Picks `size` distinct entries (sampling without replacement) from the
    module-level `skip_gram_pairs` list.

    Args:
        size: Number of pairs to draw; must not exceed len(skip_gram_pairs).

    Returns:
        A tuple (x_data, y_data). x_data is a list of `size` center-word ids;
        y_data is a list of `size` one-element lists, each holding the
        context-word id paired with the corresponding entry of x_data.

    Raises:
        AssertionError: if `size` is larger than the number of available pairs.
    """
    # Sampling without replacement only requires size <= population, so a
    # batch covering every pair (size == len(skip_gram_pairs)) is valid.
    assert size <= len(skip_gram_pairs)
    picked = np.random.choice(len(skip_gram_pairs), size, replace=False)
    x_data = [skip_gram_pairs[i][0] for i in picked]    # n dim
    y_data = [[skip_gram_pairs[i][1]] for i in picked]  # n, 1 dim
    return x_data, y_data

# Smoke-test generate_batch with a tiny batch of three pairs
print('Batches (x, y)', generate_batch(3))

Batches (x, y) ([65, 61, 66], [[64], [10], [67]])


In [15]:
# Input data: ids of the center words and, per center word, one context-word id
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs) # lookup table

# Construct the variables for the NCE loss
nce_weights = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))

# Compute the average NCE loss for the batch.
# This does the magic:
#   tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes ...)
# It automatically draws negative samples when we evaluate the loss.
# Arguments are passed by keyword: `labels` and `inputs` sit next to each
# other and their positional order has differed between TensorFlow releases,
# so naming them prevents a silent swap.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=train_labels,
                   inputs=embed,
                   num_sampled=num_sampled,
                   num_classes=voc_size))
# Use the adam optimizer
train_op = tf.train.AdamOptimizer(1e-1).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
# Train inside a session: 100 steps, each on a freshly sampled mini-batch
with tf.Session() as sess:
    # Variables must be initialised before the first training step
    sess.run(tf.global_variables_initializer())

    for step in range(100):
        batch_inputs, batch_labels = generate_batch(batch_size)
        _, loss_val = sess.run(
            [train_op, loss],
            feed_dict={train_inputs: batch_inputs, train_labels: batch_labels},
        )
        if not step % 10:
            print("Loss at ", step, loss_val) # Report the loss every 10th step

    # Pull the learned embedding matrix out of the graph while the session is
    # still open. Need to normalize for practical use.
    trained_embeddings = sess.run(embeddings)

Loss at  0 24.692923
Loss at  10 16.292217
Loss at  20 10.714836
Loss at  30 9.961512
Loss at  40 12.34342
Loss at  50 8.3872
Loss at  60 4.234043
Loss at  70 4.666297
Loss at  80 3.4948673
Loss at  90 3.8797882
