## Word2Vec

In [1]:
import collections
import math
import os
import errno
import random
import zipfile

import numpy as np
import tensorflow as tf
from six.moves import urllib, xrange

In [3]:
# Local directory for the corpus archive and the URL it is downloaded from;
# both are used as the default arguments of fetch_words_data.
data_dir = "./tensorflow_course_resources/04-Recurrent-Neural-Networks/word2vec_data/words"
data_url = "http://mattmahoney.net/dc/text8.zip"

In [8]:
def fetch_words_data(url=data_url, words_data=data_dir):
    """Download (if needed) and read the zipped text corpus.

    Args:
        url: Location of the zip archive to download when no local copy exists.
        words_data: Directory in which the archive is stored as ``words.zip``;
            created if missing.

    Returns:
        A list of whitespace-separated tokens read from the first member of
        the archive, decoded as ASCII.
    """
    # Make the dir if it does not exist
    os.makedirs(words_data, exist_ok=True)

    # Path to zip file
    zip_path = os.path.join(words_data, "words.zip")

    # If the zip file isn't there, download it from the data url.
    # The download goes to a temporary name and is renamed only once complete,
    # so an interrupted transfer never leaves a truncated "words.zip" behind
    # that later runs would mistake for a valid cached archive.
    if not os.path.exists(zip_path):
        partial_path = zip_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, zip_path)
        finally:
            # Only still present if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Now that the zip file is there, get the data from it
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])

    # Return a list of all the words in the data source
    return data.decode("ascii").split()

In [9]:
# Load the corpus as a flat list of tokens (downloads the archive on first run).
words = fetch_words_data()

In [12]:
from collections import Counter

In [13]:
def create_counts(vocab_size=50000):
    """Encode the global ``words`` list as integer ids over a fixed vocabulary.

    Id 0 is reserved for an ``"UNK"`` entry that stands for every word outside
    the ``vocab_size - 1`` most frequent ones, so rare words are no longer
    silently merged into the most frequent real word.

    Args:
        vocab_size: Total number of vocabulary entries, including ``"UNK"``.

    Returns:
        A tuple ``(data, vocab)`` where ``data`` is an integer array with one
        id per token of ``words`` and ``vocab`` is an array of strings such
        that ``vocab[i]`` is the word for id ``i`` (``vocab[0] == "UNK"``).
    """
    # Keep one slot free for the UNK entry so len(vocab) stays == vocab_size
    # whenever the corpus has at least vocab_size - 1 distinct words.
    most_common = Counter(words).most_common(max(vocab_size - 1, 0))
    top_words = [word for word, _ in most_common]
    vocab = np.array(["UNK"] + top_words)

    # Real words start at id 1; anything not in the dictionary falls back to 0.
    dictionary = {word: code for code, word in enumerate(top_words, start=1)}
    data = np.array([dictionary.get(word, 0) for word in words])

    return data, vocab

In [14]:
# Integer-encoded corpus and the id -> word lookup array (default vocab_size).
data, vocabulary = create_counts()

In [15]:
# Sanity check: number of encoded tokens and number of vocabulary entries.
data.shape, vocabulary.shape

((17005207,), (50000,))

In [31]:
def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the global ``data`` sequence.

    A window of ``2 * skip_window + 1`` consecutive ids slides over ``data``
    starting at the global cursor ``data_index``. For each window position the
    centre id is emitted ``num_skips`` times, each time paired with a distinct,
    randomly chosen other id from the same window.

    Args:
        batch_size: Number of (centre, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Pairs generated per centre word; at most ``2 * skip_window``.
        skip_window: Number of context positions on each side of the centre.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` holding
        the centre ids and ``(batch_size, 1)`` holding the context ids.

    Side effects:
        Advances the global ``data_index`` so that the next call continues
        with the centre word following the last one used here.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window | centre | skip_window ]
    buffer = collections.deque(maxlen=span)

    # Not enough ids left for a full window: restart from the beginning.
    if data_index + span > len(data):
        data_index = 0

    buffer.extend(data[data_index:data_index + span])
    data_index += span

    for i in range(batch_size // num_skips):
        target = skip_window  # start on the centre so the loop below re-draws
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Draw window positions until one is found that is neither the
            # centre nor already used for this centre word.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]

        # Slide the window by one id for the next centre word. This must run
        # once per centre word; otherwise every example in the batch shares
        # the same centre.
        if data_index == len(data):
            # Wrap around. deque has no slice assignment; because maxlen ==
            # span, extending with span items replaces the whole window.
            buffer.extend(data[:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1

    # Backtrack by one window so the next batch does not skip the ids that
    # were loaded into the buffer but not yet used as a centre word.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [32]:
# Batch and model hyperparameters.
batch_size = 128       # (centre, context) pairs per training step
embedding_size = 150   # dimensionality of each word vector
skip_window = 1        # context positions on each side of the centre word
num_skips = 2          # pairs generated per centre word (<= 2 * skip_window)

In [33]:
# Pick valid_size distinct word ids from the first valid_window ids to use as
# probe words for the similarity computation. Ids are positions in the
# vocabulary array returned by create_counts. No random seed is set, so the
# selection differs between runs.
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [34]:
# Training hyperparameters.
num_sampled = 64         # passed to tf.nn.nce_loss as num_sampled
learning_rate = 0.01     # step size for the Adam optimizer
vocabulary_size = 50000  # must match the vocab_size used in create_counts

In [35]:
# Start from an empty default graph so re-running the cells below does not
# pile duplicate ops and variables onto a previous graph.
tf.reset_default_graph()

In [36]:
# Graph inputs: centre-word ids (1-D, any length) and their context-word ids.
# Note that train_labels is pinned to exactly batch_size rows while
# train_inputs accepts any length; both are fed from generate_batch.
train_inputs = tf.placeholder(tf.int32, shape=[None])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# Fixed set of probe word ids used for the similarity computation.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [37]:
# Initial embedding matrix: one row per vocabulary id, uniform in [-1, 1).
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)

In [38]:
# Trainable embedding matrix, initialised from init_embeds.
embeddings = tf.Variable(init_embeds)

In [39]:
# Embedding rows for the centre-word ids fed through train_inputs.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [40]:
# Output-layer parameters for the NCE loss: one weight row and one bias per
# vocabulary id. Weights start from a truncated normal with
# stddev = 1 / sqrt(embedding_size); biases start at zero.
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0/np.sqrt(embedding_size)))
nce_bias = tf.Variable(tf.zeros([vocabulary_size]))

In [41]:
# Mean noise-contrastive-estimation loss over the batch. Arguments are passed
# positionally as (weights, biases, labels, inputs, num_sampled, num_classes),
# which is the TF 1.x argument order -- NOTE(review): confirm against the
# installed TensorFlow version.
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_bias, train_labels, embed, num_sampled, vocabulary_size))
# Adam optimizer and the op that applies one update step minimising the loss.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
trainer = optimizer.minimize(loss)

In [42]:
# Cosine similarity between the probe words and every vocabulary word.
# Each embedding row is divided by its L2 norm, so the dot products below are
# cosine similarities.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
# One row per probe word, one column per vocabulary id.
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [43]:
# Global read cursor into `data`; generate_batch reads and updates it.
data_index = 0

In [44]:
# Op that initialises every variable in the graph.
init = tf.global_variables_initializer()

num_steps = 5000
with tf.Session() as session:
    session.run(init)
    average_loss = 0

    for step in range(num_steps):
        # One batch of (centre, context) id pairs per optimisation step.
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        _, loss_val = session.run([trainer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        # Report the mean loss over the last 1000 steps (step 0 reports the
        # single first-step loss), then restart the accumulator.
        if step % 1000 == 0:
            if step > 0:
                average_loss = average_loss / 1000
            print("Average loss at steps: ", step, " is ", average_loss)
            average_loss = 0

    # Evaluate the normalised embeddings once, after training has finished.
    # Doing this inside the step loop recomputes and copies the whole
    # vocabulary_size x embedding_size matrix on every step.
    final_embeddings = normalized_embeddings.eval()

Average loss at steps:  0  is  308.1671447753906
Average loss at steps:  1000  is  222.73242475891112


KeyboardInterrupt: 