**Phase-I: Assemble Graph**
1. Import data (`tf.data` or define `placeholders` for input and output)
2. Define the `weights`
3. Define the inference `model` (i.e. Forward path of the model)
4. Define `cost` function
5. Define `optimizer`

**Phase-II: Execute Computation (i.e. Train Model)**
1. Initialize all model variables for the first time
2. Feed in the training data. Might involve randomizing the order of data samples
3. Execute the inference `model` on the training data
4. Compute the `loss`
5. Adjust the model `weights` to minimize/maximize `loss` depending on the model

**`Word Embedding`**
- Captures the semantic relationships between words

**Word2Vec: skip-gram**
- Softmax is computationally expensive (because of the size of word vocabulary)
- Negative sampling (a simplified version of `Noise Contrastive Estimation (NCE)`). NCE guarantees an approximation to softmax, whereas negative sampling does not.

In [1]:
import os
# Set before the TensorFlow import below so the setting is already in the
# environment when TensorFlow is loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Adjust verbosity to suppress information logs

import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe

# Switch TensorFlow to eager execution. The Word2Vec class and train() defined
# later in this notebook use the eager helpers (tfe.Variable, tfe.Iterator,
# tfe.implicit_value_and_gradients).
tfe.enable_eager_execution()

**Data Pre-Processing**

In [10]:
import gzip
import shutil
import zipfile
import random

from six.moves import urllib
from collections import Counter

# Utility functions
def make_dir(path):
    """Create the directory ``path`` if it does not already exist.

    Missing parent directories are created as well, and calling this on a
    directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created (e.g. permission denied,
            or ``path`` exists but is not a directory). Such failures used to
            be swallowed silently, which only postponed the error to the first
            attempt to write into the directory.
    """
    os.makedirs(path, exist_ok=True)
    
def download_one_file(download_url, local_dest, expected_byte=None, unzip_and_remove=False):
    """Download ``download_url`` to ``local_dest`` unless it is already present.

    Args:
        download_url: URL of the file to fetch.
        local_dest: Path the downloaded file is written to.
        expected_byte: Optional expected size in bytes. When given (and
            non-zero) the size of the downloaded file is compared against it.
        unzip_and_remove: If True, gunzip the downloaded file next to itself
            and delete the compressed file. The output name is ``local_dest``
            with its last three characters removed, i.e. a ``.gz`` suffix is
            assumed.

    The download is skipped when either ``local_dest`` or its stripped
    (already unzipped) counterpart exists. If the size check fails, a message
    is printed and the file is left untouched (it is not unzipped).
    """
    # Name of the decompressed file; the 3-character strip assumes '.gz'.
    unzipped_dest = local_dest[:-3]

    if os.path.exists(local_dest) or os.path.exists(unzipped_dest):
        print('%s already exists' % local_dest)
        return

    print('Downloading %s' % download_url)
    urllib.request.urlretrieve(download_url, local_dest)

    # Only verify the size when the caller supplied an expectation.
    if expected_byte and os.stat(local_dest).st_size != expected_byte:
        print('The downloaded file has unexpected number of bytes')
        return

    print('Successfully downloaded %s' % local_dest)

    # Unzipping no longer depends on expected_byte being provided.
    if unzip_and_remove:
        with gzip.open(local_dest, 'rb') as f_in, open(unzipped_dest, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(local_dest)

def read_data(file_path):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    For the text8 corpus there should be 17,005,207 tokens.
    """
    with zipfile.ZipFile(file_path) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # as_str() converts the raw bytes to str in a Python 2/3 compatible way
    text = tf.compat.as_str(raw_bytes)
    return text.split()

def build_vocab(tokens, vocab_size, visual_fld):
    """Build the token <-> index mappings and dump the vocabulary to disk.

    The vocabulary consists of 'UNK' (index 0) followed by the
    ``vocab_size - 1`` most frequent tokens, in decreasing frequency. One
    token per line is written to ``<visual_fld>/vocab.tsv``.

    Returns:
        (dictionary, index_dictionary): token -> index and index -> token.
    """
    # 'UNK' is placed first so that unknown words map to index 0
    count = [('UNK', -1)] + Counter(tokens).most_common(vocab_size - 1)

    # Make sure the output folder for vocab.tsv is there
    make_dir(visual_fld)
    vocab_path = os.path.join(visual_fld, 'vocab.tsv')

    dictionary = {}
    with open(vocab_path, "w") as vocab_file:
        for position, (token, _) in enumerate(count):
            dictionary[token] = position
            vocab_file.write(token + '\n')

    index_dictionary = {position: token for token, position in dictionary.items()}
    return dictionary, index_dictionary

def convert_tokens_to_index(tokens, dictionary):
    """Map every token to its vocabulary index; tokens not in ``dictionary`` map to 0."""
    unknown_index = 0
    return [dictionary.get(token, unknown_index) for token in tokens]

def generate_sample(index_tokens, context_window_size):
    """Yield (center, target) skip-gram training pairs.

    For each position a window radius is drawn uniformly from
    ``[1, context_window_size]``; every token within that radius before the
    center is yielded first, followed by every token within it after the center.
    """
    for position, center in enumerate(index_tokens):
        radius = random.randint(1, context_window_size)
        window_start = max(0, position - radius)
        window_end = position + radius + 1

        # Neighbours preceding the center token
        for target in index_tokens[window_start:position]:
            yield center, target
        # Neighbours following the center token
        for target in index_tokens[position + 1:window_end]:
            yield center, target
            
def most_common_tokens(visual_fld, num_visualize):
    """Write the first ``num_visualize`` vocabulary entries to their own file.

    Reads ``<visual_fld>/vocab.tsv`` (one token per line) and copies its first
    ``num_visualize`` lines to ``<visual_fld>/vocab_<num_visualize>.tsv`` so
    that they can be visualized on TensorBoard.

    Args:
        visual_fld: Directory containing ``vocab.tsv``; also the output directory.
        num_visualize: Number of leading lines to copy.
    """
    # Use a context manager so the source file handle is closed deterministically.
    with open(os.path.join(visual_fld, 'vocab.tsv'), 'r') as vocab_file:
        lines = vocab_file.readlines()[:num_visualize]

    path = os.path.join(visual_fld, 'vocab_' + str(num_visualize) + '.tsv')
    with open(path, 'w') as f:
        # Lines still carry their trailing newline, so they are written as-is.
        f.writelines(lines)

def batch_gen(download_url, expected_byte, vocab_size, batch_size, skip_window, visual_fld):
    """Yield batches of skip-gram training pairs built from the downloaded corpus.

    The archive is downloaded to ``./data/text8.zip`` (if not already there),
    tokenized, converted to vocabulary indices and turned into
    (center, target) pairs by ``generate_sample``.

    Args:
        download_url: URL of the zipped corpus.
        expected_byte: Expected size of the download in bytes.
        vocab_size: Vocabulary size passed to ``build_vocab``.
        batch_size: Number of pairs per batch.
        skip_window: Maximum context window radius passed to ``generate_sample``.
        visual_fld: Directory where ``build_vocab`` writes ``vocab.tsv``.

    Yields:
        (center_batch, target_batch): int32 arrays of shape ``[batch_size]``
        and ``[batch_size, 1]``. A trailing partial batch is dropped when the
        corpus runs out of pairs.
    """
    local_dest = './data/text8.zip'
    # The download cannot create its destination folder, so make sure it exists.
    make_dir(os.path.dirname(local_dest))
    download_one_file(download_url, local_dest, expected_byte)
    tokens = read_data(local_dest)
    dictionary, _ = build_vocab(tokens, vocab_size, visual_fld)
    index_tokens = convert_tokens_to_index(tokens, dictionary)
    del tokens  # To save memory
    single_gen = generate_sample(index_tokens, skip_window)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        # Integer dtype: these are vocabulary indices, and the dataset built
        # from this generator declares both outputs as int32.
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            try:
                center_batch[index], target_batch[index] = next(single_gen)
            except StopIteration:
                # A StopIteration escaping a generator body becomes a
                # RuntimeError (PEP 479); end this generator cleanly instead.
                return
        yield center_batch, target_batch

In [22]:
# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
# Size in bytes the downloaded archive is checked against
# (passed to download_one_file as expected_byte via batch_gen)
EXPECTED_BYTES = 31344016

# Model Hyper-parameters
# Vocabulary size, including the 'UNK' entry that build_vocab puts at index 0
VOCAB_SIZE = 50000
# Number of (center, target) pairs per batch produced by batch_gen
BATCH_SIZE = 64
EMBED_SIZE = 300  # Dimension of the word embedding vectors
# Upper bound of the per-token window radius drawn in generate_sample
SKIP_WINDOW = 1  # Context window size
NUM_SAMPLED = 25  # Number of negative examples to sample
# Learning rate handed to the gradient-descent optimizer in train()
LEARNING_RATE = 1.0
# Total number of optimization steps performed by train()
NUM_TRAIN_STEPS = 20000
SKIP_STEP = 2000  # Steps to skip before reporting the loss
VISUAL_FLD = 'visualization'  # Directory name

In [26]:
class Word2Vec(object):
    """Skip-gram Word2Vec model whose loss is computed with ``tf.nn.nce_loss``."""

    def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):
        """Create the embedding matrix and the NCE weight / bias variables.

        Args:
            vocab_size: Number of rows of the embedding and NCE weight matrices.
            embed_size: Length of each embedding vector.
            num_sampled: Number of negative classes sampled per batch by the loss.
        """
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.num_sampled = num_sampled

        # Both the embedding matrix and the NCE weights hold one row of
        # embed_size values per vocabulary entry.
        matrix_shape = [self.vocab_size, self.embed_size]

        # Embeddings start uniformly distributed in [-1, 1)
        embed_init = tf.random_uniform(matrix_shape, minval=-1, maxval=1)
        self.embed_matrix = tfe.Variable(initial_value=embed_init, name='embeddings')

        # NCE weights start from a truncated normal scaled by 1/sqrt(embed_size)
        nce_weight_init = tf.truncated_normal(matrix_shape,
                                              stddev=1.0/(self.embed_size**0.5))
        self.nce_weight = tfe.Variable(initial_value=nce_weight_init, name='nce_weight')

        # One bias per vocabulary entry, initialized to zero
        self.nce_bias = tfe.Variable(tf.zeros([self.vocab_size]), name='nce_bias')

    def compute_loss(self, center_words, target_words):
        """Return the mean NCE loss of predicting ``target_words`` from ``center_words``."""
        # Embedding vectors of the center words
        embed = tf.nn.embedding_lookup(self.embed_matrix, center_words, name='embed')

        # NCE loss values, averaged below into a single scalar
        nce_losses = tf.nn.nce_loss(weights=self.nce_weight,
                                    biases=self.nce_bias,
                                    labels=target_words,
                                    inputs=embed,
                                    num_sampled=self.num_sampled,
                                    num_classes=self.vocab_size)
        return tf.reduce_mean(nce_losses)
    


def generate():
    """Zero-argument generator of training batches, configured from the notebook constants."""
    yield from batch_gen(download_url=DOWNLOAD_URL,
                         expected_byte=EXPECTED_BYTES,
                         vocab_size=VOCAB_SIZE,
                         batch_size=BATCH_SIZE,
                         skip_window=SKIP_WINDOW,
                         visual_fld=VISUAL_FLD)
    

def train():
    """Train a Word2Vec model for NUM_TRAIN_STEPS steps with eager execution.

    Batches come from ``generate`` wrapped in a ``tf.data.Dataset``. Every
    SKIP_STEP steps the average loss over the last SKIP_STEP steps is printed.
    """
    # Dataset
    dataset = tf.data.Dataset.from_generator(generate, output_types=(tf.int32, tf.int32),
                                             output_shapes=(tf.TensorShape([BATCH_SIZE]),
                                                            tf.TensorShape([BATCH_SIZE, 1])))

    # Optimizer. The learning rate is passed as a scalar; wrapping it in a
    # list would hand the optimizer a rank-1 value instead of the documented
    # scalar learning rate.
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)

    # Create the model
    model = Word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)

    # Create the gradients function (using tfe.implicit_value_and_gradients):
    # calling it returns the loss value together with (gradient, variable) pairs.
    grad_fn = tfe.implicit_value_and_gradients(model.compute_loss)

    total_loss = 0.0  # For average loss in the last SKIP_STEP steps
    num_train_steps = 0
    # The outer loop restarts iteration over the dataset if it is exhausted
    # before NUM_TRAIN_STEPS steps have been taken.
    while num_train_steps < NUM_TRAIN_STEPS:
        for center_words, target_words in tfe.Iterator(dataset):
            if num_train_steps >= NUM_TRAIN_STEPS:
                break

            # Compute the loss and gradients, and take an optimization step
            loss_batch, grads = grad_fn(center_words, target_words)
            total_loss += loss_batch
            optimizer.apply_gradients(grads)

            if (num_train_steps + 1) % SKIP_STEP == 0:
                # The reported step number is the zero-based step index.
                print('Average loss at step {}: {:5.1f}'.format(num_train_steps,
                                                                total_loss/SKIP_STEP))
                total_loss = 0.0
            num_train_steps += 1

In [27]:
# Training
# Runs the full training loop defined in train(); progress is printed every
# SKIP_STEP steps.
train()

./data/text8.zip already exists
Average loss at step 1999:  68.2
Average loss at step 3999:  42.7
Average loss at step 5999:  32.3
Average loss at step 7999:  26.5
Average loss at step 9999:  22.2
Average loss at step 11999:  19.2
Average loss at step 13999:  16.8
Average loss at step 15999:  15.2
Average loss at step 17999:  13.2
Average loss at step 19999:  12.0


### Scratch

In [None]:
# Global Step
# NOTE(review): `skg` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. It also uses the graph-mode
# tf.Session API although eager execution is enabled in the first cell —
# presumably a leftover from an earlier graph-based version; confirm or delete.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(skg.global_step.eval())

In [None]:
# Scratch: how Counter.most_common combines with a leading placeholder entry
tokens = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'a', 'b', 'c', 'd', 'e', 'a', 'b', 'c']
token_counts = Counter(tokens)
print(token_counts)

top_three = token_counts.most_common(3)
print(top_three)

# Placeholder entry first, then the three most frequent tokens
count = [('u', -1)] + top_three
count

In [None]:
# Scratch: the vocabulary-building steps applied to a small sample text
text = """Randomness is the lack of pattern or predictability in events.
A random sequence of events, symbols or steps has no order and does not 
follow an intelligible pattern or combination. Individual random events 
are by definition unpredictable, but in many cases the frequency of 
different outcomes over a large number of events (or "trials") is 
predictable. For example, when throwing two dice, the outcome of any 
particular roll is unpredictable, but a sum of 7 will occur twice as 
often as 4. In this view, randomness is a measure of uncertainty of an 
outcome, rather than haphazardness, and applies to concepts of chance, 
probability, and information entropy."""

tokens = text.split()

# 'UNK' first (index 0), followed by the 50 most frequent tokens
count = [('UNK', -1)] + Counter(tokens).most_common(50)

# Token -> index and index -> token lookups
dictionary = {token: position for position, (token, _) in enumerate(count)}
index_dictionary = {position: token for token, position in dictionary.items()}

# Every token becomes its index; tokens outside the vocabulary become 0
index_tokens = [dictionary.get(token, 0) for token in tokens]

In [None]:
# Peek at the index assigned to the first token (relies on index_tokens from the previous cell)
index_tokens[0:1]

In [None]:
# Scratch: print each center word next to the words that precede it within a
# randomly sized window (uses index_tokens / index_dictionary from the cells above)
context_window_size = 5
for index, center in enumerate(index_tokens):
    context = random.randint(1, context_window_size)
    window_start = index - context
    print(window_start, index)
    # Clamp at 0 so a negative start does not wrap around to the end of the list
    preceding = index_tokens[max(0, window_start): index]
    for target in preceding:
        print(index_dictionary[center], index_dictionary[target])