### Structuring TensorFlow Models

**Phase-I: Assemble Graph**
1. Import data (`tf.data` or define `placeholders` for input and output)
2. Define the `weights`
3. Define the inference `model` (i.e. the forward pass of the model)
4. Define `cost` function
5. Define `optimizer`

**Phase-II: Execute Computation (i.e. Train Model)**
1. Initialize all model variables for the first time
2. Feed in the training data. Might involve randomizing the order of data samples
3. Execute the inference `model` on the training data
4. Compute the `loss`
5. Adjust the model `weights` to minimize/maximize `loss` depending on the model

### Structuring TensorFlow Models For Reuse

**Reusable Models**: Using Object Oriented Programming to make models reusable

- Define a `class` for the model
- Set up model in a collection (e.g. map)

> Reusing a model without rebuilding it: Big models that take a long time to build - save the `graph_def` in a file and then load it

**`Word Embedding`**
- Captures the semantic relationships between words

**Word2Vec: skip-gram**
- Softmax is computationally expensive (because of the size of word vocabulary)
- Negative sampling (a simplified version of `Noise Contrastive Estimation (NCE)`). NCE guarantees an approximation to softmax, whereas negative sampling does not.

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Adjust verbosity to suppress information logs

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector # For visualizing embeddings

**Data Pre-Processing**

In [5]:
import gzip
import shutil
import zipfile
import random

from six.moves import urllib
from collections import Counter

# Utility functions
def make_dir(path):
    """Create the directory ``path`` (and any missing parents) if it does not exist.

    An already existing directory is left untouched. Any other failure
    (for example permission denied, or ``path`` existing as a regular file)
    is re-raised instead of being silently ignored.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory could not be created and does not exist.
    """
    try:
        os.makedirs(path)
    except OSError:
        # "Directory is already there" is the only outcome we expect to ignore.
        if not os.path.isdir(path):
            raise
    
def download_one_file(download_url, local_dest, expected_byte=None, unzip_and_remove=False):
    """Download ``download_url`` into ``local_dest`` unless it is already present.

    The download is skipped when either ``local_dest`` or ``local_dest`` with
    its last three characters removed (i.e. without a ``.gz`` suffix) exists.

    Args:
        download_url: URL handed to ``urllib.request.urlretrieve``.
        local_dest: Path the downloaded file is written to.
        expected_byte: Optional expected size in bytes. When given and the
            downloaded size differs, a warning is printed and the file is
            left as downloaded (no decompression).
        unzip_and_remove: When True, gunzip ``local_dest`` into
            ``local_dest[:-3]`` and delete the compressed file. This now also
            happens when ``expected_byte`` is not provided.
    """
    # Name of the decompressed file: local_dest minus the 3-character '.gz'.
    unzipped_dest = local_dest[:-3]

    if os.path.exists(local_dest) or os.path.exists(unzipped_dest):
        print('%s already exists' % local_dest)
        return

    print('Downloading %s' % download_url)
    urllib.request.urlretrieve(download_url, local_dest)

    if expected_byte:
        if os.stat(local_dest).st_size != expected_byte:
            print('The downloaded file has unexpected number of bytes')
            return
        print('Successfully downloaded %s' % local_dest)

    if unzip_and_remove:
        with gzip.open(local_dest, 'rb') as f_in, open(unzipped_dest, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(local_dest)

def read_data(file_path):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    For the text8 corpus this should yield 17,005,207 tokens.

    Args:
        file_path: Path to the zip archive.

    Returns:
        List of token strings.
    """
    with zipfile.ZipFile(file_path) as archive:
        first_member = archive.namelist()[0]
        raw_text = archive.read(first_member)
    # as_str() converts the archive bytes to str on both Python 2 and 3.
    return tf.compat.as_str(raw_text).split()

def build_vocab(tokens, vocab_size, visual_fld):
    """Index the most frequent tokens and write them to ``<visual_fld>/vocab.tsv``.

    Index 0 is reserved for the unknown-word marker 'UNK'; the following
    indices go to the ``vocab_size - 1`` most common tokens in frequency order.

    Args:
        tokens: Iterable of token strings.
        vocab_size: Total vocabulary size including 'UNK'.
        visual_fld: Directory that receives ``vocab.tsv`` (created if missing).

    Returns:
        Tuple ``(dictionary, index_dictionary)`` mapping token -> index and
        index -> token respectively.
    """
    # 'UNK' goes first so that it always receives index 0.
    vocab_words = ['UNK']
    vocab_words.extend(word for word, _ in Counter(tokens).most_common(vocab_size - 1))

    make_dir(visual_fld)
    dictionary = {}
    with open(os.path.join(visual_fld, 'vocab.tsv'), "w") as vocab_file:
        for position, word in enumerate(vocab_words):
            dictionary[word] = position
            vocab_file.write(word + '\n')

    index_dictionary = {position: word for word, position in dictionary.items()}
    return dictionary, index_dictionary

def convert_tokens_to_index(tokens, dictionary):
    """Map every token to its vocabulary index, using 0 for unknown tokens."""
    indices = []
    for token in tokens:
        indices.append(dictionary.get(token, 0))
    return indices

def generate_sample(index_tokens, context_window_size):
    """Yield (center, target) skip-gram training pairs.

    For each position a window radius is drawn uniformly from
    ``1..context_window_size``; every token within that radius before the
    center is yielded first, followed by every token within it after the center.
    """
    for position, center in enumerate(index_tokens):
        radius = random.randint(1, context_window_size)
        preceding = index_tokens[max(0, position - radius):position]
        following = index_tokens[position + 1:position + radius + 1]
        for neighbours in (preceding, following):
            for target in neighbours:
                yield center, target
            
def most_common_tokens(visual_fld, num_visualize):
    """Write the first ``num_visualize`` vocabulary entries to their own file.

    Reads ``<visual_fld>/vocab.tsv`` and copies its first ``num_visualize``
    lines to ``<visual_fld>/vocab_<num_visualize>.tsv``, for use as
    TensorBoard embedding metadata.

    Args:
        visual_fld: Directory containing ``vocab.tsv``.
        num_visualize: Number of leading lines (tokens) to keep.
    """
    # Use a context manager so the vocabulary file handle is always closed.
    with open(os.path.join(visual_fld, 'vocab.tsv'), 'r') as vocab_file:
        tokens = vocab_file.readlines()[:num_visualize]

    path = os.path.join(visual_fld, 'vocab_' + str(num_visualize) + '.tsv')
    with open(path, 'w') as f:
        f.writelines(tokens)

def batch_gen(download_url, expected_byte, vocab_size, batch_size, skip_window, visual_fld):
    """Yield ``(center_batch, target_batch)`` skip-gram training batches.

    Downloads the corpus to ``./data/text8.zip`` if needed, builds the
    vocabulary (writing ``vocab.tsv`` into ``visual_fld``), converts the corpus
    to indices and then groups the generated training pairs into batches.

    Args:
        download_url: URL of the zipped corpus.
        expected_byte: Expected size of the download in bytes.
        vocab_size: Vocabulary size including the 'UNK' entry.
        batch_size: Number of pairs per batch.
        skip_window: Maximum context window radius.
        visual_fld: Directory that receives the vocabulary file.

    Yields:
        ``center_batch`` of shape ``[batch_size]`` (int32) and ``target_batch``
        of shape ``[batch_size, 1]``. The generator ends once the corpus is
        exhausted; a trailing incomplete batch is dropped.
    """
    local_dest = './data/text8.zip'
    # The download cannot create the destination directory itself.
    make_dir(os.path.dirname(local_dest))
    download_one_file(download_url, local_dest, expected_byte)
    tokens = read_data(local_dest)
    dictionary, _ = build_vocab(tokens, vocab_size, visual_fld)
    index_tokens = convert_tokens_to_index(tokens, dictionary)
    del tokens  # To save memory
    single_gen = generate_sample(index_tokens, skip_window)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        # NOTE(review): target_batch uses NumPy's default float dtype while
        # center_batch is int32 -- confirm the consumer casts as needed.
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            try:
                center_batch[index], target_batch[index] = next(single_gen)
            except StopIteration:
                # A StopIteration leaking out of a generator body becomes a
                # RuntimeError (PEP 479), so end the generator explicitly.
                return
        yield center_batch, target_batch

In [6]:
# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'  # Zipped text8 corpus
EXPECTED_BYTES = 31344016  # Presumably the expected size of the archive in bytes -- verify against caller

# Model Hyper-parameters
VOCAB_SIZE = 50000  # Vocabulary size
BATCH_SIZE = 64  # Number of training pairs per batch
EMBED_SIZE = 300  # Dimension of the word embedding vectors
SKIP_WINDOW = 1  # Context window size
NUM_SAMPLED = 25  # Number of negative examples to sample
LEARNING_RATE = 1.0  # Learning rate for the optimizer
NUM_TRAIN_STEPS = 20000  # Number of training steps
SKIP_STEP = 2000  # Steps to skip before reporting the loss
VISUAL_FLD = 'visualization'  # Directory name
NUM_VISUALIZE = 1000  # number of tokens to visualize

**Skip-Gram (Reusable)** - Build model as a `class`

In [10]:
# Model

class SkipGramModel:
    """
    Build the graph for Word2Vec (skip-gram) model.

    Batches of (center, target) word indices are read from a dataset iterator,
    the center words are looked up in an embedding matrix and the model is
    trained with NCE loss. Call ``build_graph()`` once before ``train()`` or
    ``visualize()``.
    """
    def __init__(self, dataset, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        """
        Store the hyper-parameters and create the global step variable.

        Args:
            dataset: Object exposing ``make_initializable_iterator()`` whose
                elements unpack into ``(center_words, target_words)``.
            vocab_size: Number of rows of the embedding and NCE weight matrices.
            embed_size: Dimension of each embedding vector.
            batch_size: Batch size (stored; not otherwise used by this class).
            num_sampled: Number of negative samples for NCE loss.
            learning_rate: Learning rate of the gradient descent optimizer.
        """
        self.dataset = dataset
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.learning_rate = learning_rate
        self.skip_step = SKIP_STEP
        self.global_step = tf.get_variable('global_step', initializer=tf.constant(0),
                                           trainable=False)

    def _import_data(self):
        """
        Step 1: Import data
        """
        self.iterator = self.dataset.make_initializable_iterator()
        self.center_words, self.target_words = self.iterator.get_next()

    def _create_embedding(self):
        """
        Step 2: Embeddings (i.e. weights)
        """
        with tf.name_scope('embeddings'):
            self.embed_matrix = tf.get_variable('embed_matrix',
                                                shape=[self.vocab_size, self.embed_size],
                                                initializer=tf.random_uniform_initializer())

            self.embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words,
                                                name='embedding')

    def _create_loss(self):
        """
        Step 3 + 4: Define the inference + the loss function
        """
        with tf.name_scope('loss'):
            # Create variables for NCE loss
            nce_weight = tf.get_variable(name='nce_weight',
                                         shape=[self.vocab_size, self.embed_size],
                                         initializer=tf.truncated_normal_initializer(
                                             stddev=1.0/self.embed_size ** 0.5))

            nce_bias = tf.get_variable(name='nce_bias', initializer=tf.zeros([self.vocab_size]))

            # Mean NCE loss across the batch
            self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                      biases=nce_bias,
                                                      labels=self.target_words,
                                                      inputs=self.embed,
                                                      num_sampled=self.num_sampled,
                                                      num_classes=self.vocab_size), name='loss')

    def _create_optimizer(self):
        """
        Step 5: Define optimizer
        """
        # Stored as `optimizer` because train() fetches self.optimizer.
        self.optimizer = tf.train.GradientDescentOptimizer(
            self.learning_rate).minimize(self.loss, global_step=self.global_step)

    def _create_summaries(self):
        """
        For visualization
        """
        with tf.name_scope(name='summaries'):
            tf.summary.scalar(name='loss', tensor=self.loss)
            tf.summary.histogram(name='histogram_loss', values=self.loss)

            # Because there are several summaries: Merge them all into one
            # op to make it easier to manage
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """
        Build graph for the skip-gram model
        """
        self._import_data()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()

    def train(self, num_train_steps):
        """
        Training loop

        Restores the latest checkpoint from ``checkpoints/`` when one exists,
        then runs ``num_train_steps`` steps, writing summaries on every step
        and printing the average loss and saving a checkpoint every
        ``self.skip_step`` steps.

        Args:
            num_train_steps: Number of loop iterations to run.
        """
        saver = tf.train.Saver()  # Defaults to saving all variables: embed_matrix, nce_weight, nce_bias

        make_dir('checkpoints')

        with tf.Session() as sess:
            # An initializable iterator must be initialized before its
            # get_next() tensors can be evaluated.
            sess.run(self.iterator.initializer)
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))

            # If checkpoint exists then restore from checkpoint
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            total_loss = 0.0  # For calculating average loss in last SKIP_STEP steps
            writer = tf.summary.FileWriter('graphs/word2vec/lr' + str(self.learning_rate), sess.graph)
            try:
                # Continue step numbering from the (possibly restored) global step.
                initial_step = self.global_step.eval()

                for index in range(initial_step, initial_step + num_train_steps):
                    try:
                        batch_loss, _, summary = sess.run([self.loss, self.optimizer, self.summary_op])
                        writer.add_summary(summary, global_step=index)
                        total_loss += batch_loss
                        if (index + 1) % self.skip_step == 0:
                            print('Average loss at step {}:{:5.1f}'.format(index,
                                                                           total_loss/self.skip_step))

                            total_loss = 0.0
                            saver.save(sess, 'checkpoints/skip-gram', index)

                    except tf.errors.OutOfRangeError:
                        # Dataset exhausted: restart the iterator.
                        sess.run(self.iterator.initializer)
            finally:
                # Flush and close the event file even if training fails.
                writer.close()

    def visualize(self, visual_fld=None, num_visualize=None):
        """
        Run `tensorboard --logdir='visualization'` to see the embeddings

        Args:
            visual_fld: Directory holding ``vocab.tsv`` and receiving the
                projector files. Defaults to the module-level VISUAL_FLD.
            num_visualize: Number of leading embedding rows to export.
                Defaults to the module-level NUM_VISUALIZE.
        """
        if visual_fld is None:
            visual_fld = VISUAL_FLD
        if num_visualize is None:
            num_visualize = NUM_VISUALIZE

        # Create a list of most common num_visualize words to visualize
        most_common_tokens(visual_fld, num_visualize)

        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))

            # If checkpoint exists then restore from checkpoint
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            final_embed_matrix = sess.run(self.embed_matrix)

            # Need to store embeddings in a new variable
            embedding_var = tf.Variable(final_embed_matrix[:num_visualize], name='embedding')
            sess.run(embedding_var.initializer)

            config = projector.ProjectorConfig()
            summary_writer = tf.summary.FileWriter(visual_fld)
            try:
                # Add embedding to the config file
                embedding = config.embeddings.add()
                embedding.tensor_name = embedding_var.name

                # Link the tensor to the metadata file written above.
                # NOTE(review): given relative to the log directory; confirm
                # this matches the installed TensorBoard version.
                embedding.metadata_path = 'vocab_' + str(num_visualize) + '.tsv'

                # Save the configuration file that TensorBoard reads at startup
                projector.visualize_embeddings(summary_writer, config)
                saver_embed = tf.train.Saver([embedding_var])
                saver_embed.save(sess, os.path.join(visual_fld, 'model.ckpt'), 1)
            finally:
                summary_writer.close()

In [11]:
def generator():
    """Placeholder that does nothing yet; calling it returns None."""
    return None

In [3]:
# Parameters
# These values repeat the hyper-parameters assigned in the earlier parameter cell.
VOCAB_SIZE = 50000  # Vocabulary size
BATCH_SIZE = 64  # Number of training pairs per batch
EMBED_SIZE = 300  # Dimension of the word embedding vectors
SKIP_WINDOW = 1  # Context window size
NUM_SAMPLED = 25  # Number of negative examples to sample
LEARNING_RATE = 1.0  # Learning rate for the optimizer
NUM_TRAIN_STEPS = 20000  # Number of training steps
SKIP_STEP = 2000  # Steps to skip before reporting the loss

# Word2Vec Graph for Skip-Gram
def word2vec(batch_gen):
    """Build the graph for word2vec model and train it.

    Reads the module-level constants BATCH_SIZE, VOCAB_SIZE, EMBED_SIZE,
    NUM_SAMPLED, LEARNING_RATE, NUM_TRAIN_STEPS and SKIP_STEP, writes the
    graph to ``./graphs/skip-gram/`` and prints the average loss every
    SKIP_STEP steps.

    Args:
        batch_gen: Iterator yielding ``(centers, targets)`` batches that are
            fed to the ``[BATCH_SIZE]`` and ``[BATCH_SIZE, 1]`` placeholders.
    """
    # Step 1: define the placeholders for input and output.
    # Word indices are fed directly instead of one-hot vectors (e.g. the
    # 1001th word of the vocabulary is fed as the number 1001), and
    # center_words has to be int to work with embedding lookup.
    center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    # A rank-1 shape [BATCH_SIZE] here fails with
    # "ValueError: Shape must be rank 2 but is rank 1".
    target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])

    # Step 2: define weights. In word2vec, it's actually the weights that we care about.
    # One word is a vector of size EMBED_SIZE, so the embedding matrix has shape
    # [VOCAB_SIZE, EMBED_SIZE]; it is initialized to random uniform -1 to 1.
    embed_matrix = tf.Variable(initial_value=tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], minval=-1, maxval=1))

    # Step 3: define the inference.
    # Each row of embed_matrix is the vector representation of the word at
    # that index, so the lookup returns the embeddings of the center words.
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')

    # Step 4: construct variables for NCE loss.
    # nce_weight (vocab size x embed size) is initialized to truncated_normal
    # with stddev=1.0 / (EMBED_SIZE ** 0.5); the bias (vocab size) to 0.
    nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0/(EMBED_SIZE**0.5)),
                             name='nce_weight')
    nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')

    # Define the loss function to be NCE loss. The inputs are the embeddings
    # of the center words, not the center words themselves.
    nce_loss = tf.nn.nce_loss(weights=nce_weight, biases=nce_bias, labels=target_words, inputs=embed,
                              num_sampled=NUM_SAMPLED, num_classes=VOCAB_SIZE)
    # Mean across the batch
    loss = tf.reduce_mean(nce_loss)

    # Step 5: define optimizer
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        total_loss = 0.0 # To calculate the average loss in the last SKIP_STEP steps
        writer = tf.summary.FileWriter('./graphs/skip-gram/', sess.graph)
        try:
            for index in range(NUM_TRAIN_STEPS):
                # Get batch data; the built-in next() works on Python 2 and 3,
                # whereas the .next() method exists only on Python 2.
                centers, targets = next(batch_gen)

                _, loss_batch = sess.run([optimizer, loss],
                                         feed_dict={center_words: centers, target_words: targets})

                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss/SKIP_STEP))
                    total_loss = 0.0
        finally:
            # Close the event file even if training is interrupted.
            writer.close()

In [4]:
# Get text data and pre-process it
# NOTE(review): process_data is not defined in the visible cells -- presumably it
# came from a helper module used by an earlier version of this notebook; confirm.
# The assignment also rebinds the name batch_gen, shadowing the batch_gen()
# function defined in the pre-processing cell.
batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)

# Train
word2vec(batch_gen)

Dataset is ready.
Average loss at step 1999: 140.7
Average loss at step 3999:  53.4
Average loss at step 5999:  31.1
Average loss at step 7999:  21.8
Average loss at step 9999:  15.2
Average loss at step 11999:  14.4
Average loss at step 13999:  11.6
Average loss at step 15999:  10.5
Average loss at step 17999:  10.3
Average loss at step 19999:   9.0


### Name Scope
- For grouping nodes together
```python
with tf.name_scope('name_of_scope'):
    # Declare Operation-1
    # Declare Operation-2
    # ...
```

**Skip Gram Class**

In [None]:
# From Course examples
def train_model(model, batch_gen, num_train_steps, weights_fld):
    """Train ``model`` with batches from ``batch_gen``, checkpointing as it goes.

    Restores the latest checkpoint from ``checkpoints/`` when one exists,
    writes a summary on every step to ``improved_graph/lr<LEARNING_RATE>`` and,
    every SKIP_STEP steps, prints the average loss and saves a checkpoint.

    Args:
        model: Built model exposing ``center_words``, ``target_words``,
            ``loss``, ``optimizer``, ``summary_op`` and ``global_step``.
        batch_gen: Iterator yielding ``(centers, targets)`` batches.
        num_train_steps: Number of training steps to run.
        weights_fld: Not used by this function; kept for compatibility with
            existing callers.
    """
    saver = tf.train.Saver() # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias

    make_dir('checkpoints')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Get CheckpointState proto from the "checkpoint" file.
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))

        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        total_loss = 0.0 # used to calculate the average loss in the last SKIP_STEP steps
        writer = tf.summary.FileWriter('improved_graph/lr' + str(LEARNING_RATE), sess.graph)
        try:
            # Continue step numbering from the (possibly restored) global step.
            initial_step = model.global_step.eval()
            for index in range(initial_step, initial_step + num_train_steps):
                # The built-in next() works on Python 2 and 3, whereas the
                # .next() method exists only on Python 2.
                centers, targets = next(batch_gen)
                feed_dict = {model.center_words: centers, model.target_words: targets}
                loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],
                                                  feed_dict=feed_dict)
                writer.add_summary(summary, global_step=index)
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                    total_loss = 0.0
                    saver.save(sess, 'checkpoints/skip-gram', index)
        finally:
            # Close the event file even if training is interrupted.
            writer.close()

        ####################
        # code to visualize the embeddings. uncomment the below to visualize embeddings
        # run "'tensorboard --logdir='processed'" to see the embeddings
        # final_embed_matrix = sess.run(model.embed_matrix)

        # # it has to be a variable. constants don't work here. you can't reuse model.embed_matrix
        # embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
        # sess.run(embedding_var.initializer)

        # config = projector.ProjectorConfig()
        # summary_writer = tf.summary.FileWriter('processed')

        # # add embedding to the config file
        # embedding = config.embeddings.add()
        # embedding.tensor_name = embedding_var.name

        # # link this tensor to its metadata file, in this case the first 1000 words of vocab
        # embedding.metadata_path = 'processed/vocab_1000.tsv'

        # # saves a configuration file that TensorBoard will read during startup.
        # projector.visualize_embeddings(summary_writer, config)
        # saver_embed = tf.train.Saver([embedding_var])
        # saver_embed.save(sess, 'processed/model3.ckpt', 1)

In [None]:
# Train using function
# Start from an empty default graph so the model's variables are created afresh.
tf.reset_default_graph()
# NOTE(review): these five positional arguments match the placeholder-based
# SkipGramModel(vocab_size, batch_size, embed_size, num_sampled, learning_rate)
# defined in a later cell, not the dataset-based class defined earlier -- confirm
# which definition is live when this cell runs.
skg = SkipGramModel(VOCAB_SIZE, BATCH_SIZE, EMBED_SIZE, NUM_SAMPLED, LEARNING_RATE)
skg.build_graph()
# NOTE(review): WEIGHTS_FLD is not defined in the visible cells -- confirm.
train_model(skg, batch_gen, NUM_TRAIN_STEPS, WEIGHTS_FLD)

In [None]:
class SkipGramModel:
    """
    Word2Vec skip-gram graph whose inputs are fed through placeholders.

    Call ``build_graph()`` to create the placeholders, embedding matrix,
    NCE loss, optimizer and summary ops.
    """

    def __init__(self, vocab_size, batch_size, embed_size, num_sampled, learning_rate):
        """Record the hyper-parameters and create the global step counter."""
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.embed_size = embed_size
        self.num_sampled = num_sampled
        self.learning_rate = learning_rate
        self.global_step = tf.Variable(
            initial_value=0,
            dtype=tf.int32,
            trainable=False,
            name='global_step',
        )

    def _create_placeholder(self):
        """Step 1: placeholders for the center words and their targets."""
        with tf.name_scope('data'):
            center_shape = [self.batch_size]
            target_shape = [self.batch_size, 1]
            self.center_words = tf.placeholder(tf.int32, shape=center_shape)
            self.target_words = tf.placeholder(tf.int32, shape=target_shape)

    def _create_embeddings(self):
        """Step 2: the embedding matrix (vocab size x embed size), placed on the CPU."""
        with tf.device('/cpu:0'), tf.name_scope('embeddings'):
            initial_embeddings = tf.random_uniform(
                [self.vocab_size, self.embed_size], minval=-1, maxval=1)
            self.embed_matrix = tf.Variable(initial_value=initial_embeddings,
                                            name='embed_matrix')

    def _create_loss(self):
        """Step 3 + 4: the embedding lookup (inference) and the NCE loss."""
        with tf.name_scope('loss'):
            # Step 3: embeddings of the center words
            self.embed = tf.nn.embedding_lookup(
                self.embed_matrix, self.center_words, name='embed')

            # Step 4: variables for the NCE loss
            initial_weight = tf.truncated_normal(
                [self.vocab_size, self.embed_size],
                stddev=1.0/(self.embed_size**0.5))
            self.nce_weight = tf.Variable(initial_weight, name='nce_weight')

            initial_bias = tf.zeros([self.vocab_size])
            self.nce_bias = tf.Variable(initial_bias, name='nce_bias')

            self.nce_loss = tf.nn.nce_loss(
                weights=self.nce_weight,
                biases=self.nce_bias,
                labels=self.target_words,
                inputs=self.embed,
                num_sampled=self.num_sampled,
                num_classes=self.vocab_size,
            )
            # Average over the batch
            self.loss = tf.reduce_mean(self.nce_loss)

    def _create_optimizer(self):
        """Step 5: gradient descent on the loss, advancing the global step."""
        sgd = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
        self.optimizer = sgd.minimize(self.loss, global_step=self.global_step)

    def _create_summaries(self):
        """Create the loss summaries and merge them into a single op."""
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram_loss", self.loss)

            # One merged op is easier to run than two separate summaries
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """Assemble the whole graph by running the build steps in order."""
        build_steps = (
            self._create_placeholder,
            self._create_embeddings,
            self._create_loss,
            self._create_optimizer,
            self._create_summaries,
        )
        for build_step in build_steps:
            build_step()
    
#     def train(self, batch_gen, num_train_steps, skip_step, weights_fld):
        
#         # Instantiate class
#         sgm = self.__class__(self.vocab_size, self.batch_size, self.embed_size, self.num_sampled, 
#                              self.learning_rate)
        
#         # Build graph
#         sgm.build_graph()
        
#         # Object to save and restore variables
#         saver = tf.train.Saver() # defaults to saving all variables - embed_matrix, nce_weight, nce_bias
        
#         # Create directory for checkpoints
#         make_dir('checkpoints')
        
#         with tf.Session() as sess:
            
#             # Initialize variables
#             sess.run(tf.global_variables_initializer())
                        
#             # Get CheckpointState proto from the "checkpoint" file.
#             ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
            
#             # If checkpoint exists then restroe from checkpoint
#             if ckpt and ckpt.model_checkpoint_path:
#                 saver.restore(sess, ckpt.model_checkpoint_path)
                
#             total_loss = 0.0 # To calculate late average loss in the last SKIP_STEP steps
            
#             # To write `Summary` protocol buffers to event files
#             writer = tf.summary.FileWriter('improved_graph/lr' + str(self.learning_rate), sess.graph)
            
#             initial_step = sgm.global_step.eval() # Why?
            
#             for index in range(initial_step, initial_step + num_train_steps):
#                 centers, targets = batch_gen.next()
#                 feed_dict = {sgm.center_words: centers, sgm.target_words: targets}
#                 batch_loss, _, summary = sess.run([sgm.loss, sgm.optimizer, sgm.summary_op],
#                                                   feed_dict=feed_dict)
                
#                 writer.add_summary(summary, global_step=index)
#                 total_loss += batch_loss
#                 if (index + 1) % skip_step == 0:
#                     print('Average loss at step {}: {:5.1f}'.format(index, total_loss / skip_step))
#                 total_loss = 0.0
#                 saver.save(sess, 'checkpoints/skip-gram', index)

In [None]:
# Train using SkipGramModel method
# Start from an empty default graph so the model's variables are created afresh.
tf.reset_default_graph()
skg = SkipGramModel(VOCAB_SIZE, BATCH_SIZE, EMBED_SIZE, NUM_SAMPLED, LEARNING_RATE)
# NOTE(review): the train() method taking these four arguments is commented out
# in the class cell above, and WEIGHTS_FLD is not defined in the visible cells --
# presumably this call fails until both are restored; confirm.
skg.train(batch_gen, NUM_TRAIN_STEPS, SKIP_STEP, WEIGHTS_FLD)

### Scratch

In [None]:
# Global Step
# Print the value of the model's global step right after variable initialization.
# eval() relies on the session opened by the `with` block being the default session.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(skg.global_step.eval())

In [None]:
# Scratch: how Counter.most_common() combines with a leading placeholder entry,
# mirroring the way build_vocab() assembles its `count` list.
tokens = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'a', 'b', 'c', 'd', 'e', 'a', 'b', 'c']
print(Counter(tokens))  # Frequency of every token
print(Counter(tokens).most_common(3))  # The three most frequent (token, count) pairs
count = [('u', -1)]  # Placeholder entry kept at position 0
count.extend(Counter(tokens).most_common(3))
count  # Displayed as the cell result

In [None]:
text = """Randomness is the lack of pattern or predictability in events.
A random sequence of events, symbols or steps has no order and does not 
follow an intelligible pattern or combination. Individual random events 
are by definition unpredictable, but in many cases the frequency of 
different outcomes over a large number of events (or "trials") is 
predictable. For example, when throwing two dice, the outcome of any 
particular roll is unpredictable, but a sum of 7 will occur twice as 
often as 4. In this view, randomness is a measure of uncertainty of an 
outcome, rather than haphazardness, and applies to concepts of chance, 
probability, and information entropy."""

# Scratch: repeat the build_vocab() / convert_tokens_to_index() steps on `text`.
tokens = text.split()
count = [('UNK', -1)]  # 'UNK' first so that it receives index 0
count.extend(Counter(tokens).most_common(50))

# Create dictionary and index dictionary
dictionary = dict()  # token -> index
index = 0
for token, _ in count:
    dictionary[token] = index
    index += 1

index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))  # index -> token

# Replace each token in the dataset with its index in the dictionary
# (tokens outside the dictionary map to 0, the 'UNK' index)
index_tokens = [dictionary[token] if token in dictionary else 0 for token in tokens]

In [None]:
# Display the index of the first token (a one-element slice of index_tokens).
index_tokens[0:1]

In [None]:
# Scratch: print the (center, target) word pairs taken from the left-hand
# context only, as in the first loop of generate_sample().
context_window_size = 5
for index, center in enumerate(index_tokens):
    context = random.randint(1, context_window_size)  # Window radius for this center
    # The printed start may be negative; the slice below clamps it to 0.
    print(index-context, index)
    for target in index_tokens[max(0, index - context): index]:
        print(index_dictionary[center], index_dictionary[target])