---------

# BiGram Model

----------

#### Built along the lines of &nbsp;&nbsp;&nbsp;&nbsp; [representation_learning/word2vec/BiGram_barebones_1_wiki.ipynb](http://localhost:8888/notebooks/representation_learning/word2vec/BiGram_barebones_1_wiki.ipynb)

### Added 
    1) tensorboard network visualizations   
    2) tensorboard loss visualizations
    3) similar words to words in validation data

#### Author : Anuj

#### Uses Wikipedia Dataset


In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline


In [2]:
import sys
import tensorflow as tf
import numpy as np
import random
import itertools

In [3]:
# Add the relative Utils directory to the module search path so that
# project-local modules can be imported (presumably where readWikiData lives).
sys.path.append("../../Utils/")

In [4]:
from readWikiData import get_wikipedia_data

## Load the data file - map tokens to Ids, convert data to Ids

In [5]:
def get_wiki_data(n_vocab_=1000):
    """Load the Wikipedia data and build the unique bigram training pairs.

    Args:
        n_vocab_: forwarded to ``get_wikipedia_data`` as ``n_vocab``.

    Returns:
        A tuple ``(vocab_size, training_data, word2idx, idx2word)``:
        ``vocab_size`` is ``len(word2idx)``, ``training_data`` is a list of
        unique ``(element, next_element)`` pairs taken from adjacent positions
        in each sentence, and ``idx2word`` is the inverse mapping of
        ``word2idx``.
    """
    sentences, word2idx = get_wikipedia_data(n_vocab=n_vocab_, n_files=10, by_paragraph=True)

    # Collect adjacent pairs directly into a set, so the duplicated pair list
    # is never materialised. The order of the pairs is lost, which is harmless
    # for a bigram model: all we want is the set of distinct pairs.
    unique_pairs = set()
    for sentence in sentences:
        unique_pairs.update(zip(sentence[:-1], sentence[1:]))
    training_data = list(unique_pairs)

    # .items() works on both Python 2 and Python 3 dictionaries.
    idx2word = {v: k for k, v in word2idx.items()}
    return len(word2idx), training_data, word2idx, idx2word

In [6]:
# Build the bigram training pairs and both vocabulary lookups (n_vocab_=9999).
vocab_size, training_data, word2idx, idx2word = get_wiki_data(n_vocab_=9999)

In [7]:
# Sanity check: vocabulary size, container type and number of unique bigram pairs.
print vocab_size
print type(training_data)
print len(training_data)

10000
<type 'list'>
1659944


In [8]:
# Both lookups should have the same number of entries (idx2word inverts word2idx).
print len(word2idx.keys())
print len(idx2word.keys())

10000
10000


## Build validation set - randomly choose `validation_size` (32) keys from idx2word

In [9]:
# Randomly pick `validation_size` word ids whose nearest neighbours are
# inspected during training.
validation_size = 32

# list(...) keeps random.sample working even where dict.keys() returns a view
# instead of a list.
validation_set = random.sample(list(idx2word.keys()), validation_size)

In [10]:
# Show the sampled validation ids and the words they map to.
print validation_set
print [idx2word[index] for index in validation_set]

[1375, 5115, 8555, 8342, 5003, 4755, 7461, 5688, 299, 2480, 9753, 626, 5782, 3637, 129, 4803, 7629, 8293, 8195, 9757, 7001, 4702, 3200, 7030, 5530, 4713, 8912, 7397, 1092, 9656, 8360, 4639]
['1992', 'funded', 'pennant', 'companions', 'minimal', 'lights', 'lattice', 'releasing', 'rather', 'peninsula', 'altruism', 'come', 'cruise', 'anxiety', 'include', 'emission', 'mountainous', 'codex', 'alcoholism', 'clone', 'cheaper', 'virus', 'fund', 'invested', 'overcome', 'innovation', 'enforce', 'nowadays', 'effective', 'jew', 'rooted', 'solely']


### Get batches

In [11]:
# Pairs already handed out during the current pass over the data.
bucket_list = []

# Shuffled pairs still available in the current pass, and the dataset object
# they were drawn from (a different dataset object starts a fresh pass).
_batch_pool = []
_batch_pool_source = None


def getNextBatch(bi_grams_, batch_size=1000):
    """Return the next random batch of bigram pairs, without repeats within a pass.

    Pairs are served from a shuffled copy of ``bi_grams_``. When fewer than
    ``batch_size`` unseen pairs remain (or a different ``bi_grams_`` object is
    passed), the pass is reset and the data is reshuffled. Each call costs
    O(batch_size), plus one O(len(bi_grams_)) shuffle per pass, instead of
    rebuilding a set difference over the whole dataset on every call.

    Args:
        bi_grams_: sequence of ``(input_id, target_id)`` pairs. In-place
            mutation of this sequence between calls is not detected.
        batch_size: number of pairs to return.

    Returns:
        ``(train_X, train_label)``: two lists of ``np.int32`` values of length
        ``batch_size`` holding the first and second element of each pair.

    Raises:
        ValueError: if ``batch_size`` is negative or larger than
            ``len(bi_grams_)``.
    """
    global bucket_list, _batch_pool, _batch_pool_source

    if not 0 <= batch_size <= len(bi_grams_):
        raise ValueError(
            "batch_size must be between 0 and len(bi_grams_) (%d), got %d"
            % (len(bi_grams_), batch_size))

    # Once the unseen pairs are exhausted (or the dataset changed), reset.
    if _batch_pool_source is not bi_grams_ or len(_batch_pool) < batch_size:
        bucket_list = []
        _batch_pool = list(bi_grams_)
        random.shuffle(_batch_pool)
        _batch_pool_source = bi_grams_

    # Take the batch off the end of the shuffled pool. The split index is
    # computed explicitly because a slice of [-0:] would return the whole pool.
    split = len(_batch_pool) - batch_size
    batch = _batch_pool[split:]
    del _batch_pool[split:]
    bucket_list += batch

    # Split the pairs into inputs and labels, cast to int32 like the placeholders.
    train_X = list(np.array([pair[0] for pair in batch], dtype=np.int32))
    train_label = list(np.array([pair[1] for pair in batch], dtype=np.int32))

    return train_X, train_label

In [12]:
#X, Y = getNextBatch(bi_grams_=training_data, batch_size=32)

In [13]:
#print len(X), len(Y)

In [14]:
#print X[:10], Y[:10]

# Network

In [15]:
batch_size=32
num_batches = len(training_data)/batch_size

print "Number of batches = %d" %num_batches


embedding_dims = 128

Number of batches = 51873


In [16]:
# Graph inputs: X and Y each hold one int32 id per example in the batch
# (fed from the first and second element of each bigram pair).
X = tf.placeholder(shape=(batch_size,), dtype = tf.int32, name='X_var')
Y = tf.placeholder(shape=(batch_size,), dtype = tf.int32, name='Y_var')
# Ids of the validation words, held in the graph.
# NOTE(review): nothing visible here updates these ids, so a tf.constant may fit better - confirm.
valid_X = tf.Variable(validation_set, dtype=tf.int32, name='X_valid')

In [17]:
# One-hot encode the target ids (Y); these are the labels for the softmax loss.
# Encoding X here would train the network to reproduce its own input word.
y_oh = tf.one_hot(indices=Y, depth=vocab_size, name='Converting_Y_to_Y_oh')

In [18]:
# Static shapes of the inputs and of the one-hot labels.
print X.get_shape()
print Y.get_shape()
print y_oh.get_shape()

(32,)
(32,)
(32, 10000)


In [19]:
# Embedding matrix of shape (vocab_size, embedding_dims), initialised from a
# truncated normal distribution with mean 0 and stddev 1.
embedding_layer_1 = tf.Variable(tf.truncated_normal(
    shape=(vocab_size, embedding_dims),mean=0.0, stddev=1.0, dtype=tf.float32), name="Embeddings_Matrix") 
# Select the embedding rows for the ids in X.
embeded = tf.nn.embedding_lookup(embedding_layer_1, ids=X, name="Embedding_LookUp")

In [20]:
# Static shape of the looked-up embeddings.
embeded.get_shape()

TensorShape([Dimension(32), Dimension(128)])

In [21]:
# Softmax layer parameters: weights of shape (embedding_dims, vocab_size)
# drawn from a truncated normal, and a zero-initialised bias of length vocab_size.
W = tf.Variable(tf.truncated_normal(
    shape=(embedding_dims, vocab_size),mean=0.0, stddev=1.0, dtype=tf.float32), name="Softmax_Weights_Matrix")
b = tf.Variable(tf.zeros(shape=(vocab_size,)), name="Softmax_Bias_Vector")


In [22]:
# Project the looked-up embeddings through the softmax layer: embeded . W + b
logits = tf.add(tf.matmul(embeded, W, name="WX"), b, name="WX_plus_b")

# Per-example softmax cross-entropy between the logits and the one-hot labels y_oh

loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_oh, name="Compute_Loss")
# Average the per-example losses into a single scalar for the batch
mean_loss = tf.reduce_mean(loss, name="Compute_mean_loss")

# Register the scalar so it is included in the merged tensorboard summaries
tf.summary.scalar("mean_loss", mean_loss)



<tf.Tensor 'mean_loss:0' shape=() dtype=string>

In [23]:
# Logits and one-hot labels must have matching shapes for the loss.
print logits.get_shape()
print y_oh.get_shape()

(32, 10000)
(32, 10000)


In [24]:
# Training op: plain gradient descent on mean_loss with a learning rate of 0.5.
optimizer = tf.train.GradientDescentOptimizer(0.5, name="Optimizer").minimize(mean_loss)

In [25]:
# Single op that evaluates every summary registered so far.
summary_op = tf.summary.merge_all()

In [26]:
# Compute the L2 norm of every embedding row; dividing by it gives unit-length
# rows, so a dot product between two rows is their cosine similarity.
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_layer_1), axis=1, keep_dims=True))
normalised_embeddings = embedding_layer_1 / norm

# Get the normalised embeddings of the validation words
validation_data_embeddings = tf.nn.embedding_lookup(normalised_embeddings, ids=valid_X, name="validation_embeddings_lookup")

# Similarity of each validation embedding to every normalised embedding:
# the dot product of validation_data_embeddings with the transposed normalised embeddings
similarity = tf.matmul(validation_data_embeddings, tf.transpose(normalised_embeddings))  # C.A = C x transpose(A)

# Training

In [27]:
# Training configuration
num_of_epochs = 20
# Directory handed to the tensorboard writer; also used as the checkpoint path
LOG_DIR = './bigram_wiki_chk_pts'

print "Number of batches = %d" %num_batches
print "Number of epochs = %d" %num_of_epochs


# NOTE(review): this rebinds validation_size (it was 32 when validation_set was
# sampled). From here on it is the number of validation words the training
# loop reports, so re-running the sampling cell afterwards would draw only 8 ids.
validation_size = 8

Number of batches = 51873
Number of epochs = 20


In [28]:
# Demo params: these override the values set above so that the training cell
# finishes quickly. Skip this cell for a full-length run.

num_of_epochs = 5
num_batches = 50

In [29]:


# A SIMPLE saver() to save the model
saver = tf.train.Saver()

with tf.Session() as sess:
    
    # writer to write graph to tensorboard
    writer = tf.summary.FileWriter(LOG_DIR, sess.graph)

    tf.global_variables_initializer().run()
    print "initialised\n"

    for epoch_id in range(num_of_epochs):

        av_batch_loss = 0

        for batch_id in range(num_batches):

            X_, Y_ = getNextBatch(bi_grams_=training_data, batch_size=batch_size)

            feed_dict = {}
            feed_dict[X] = X_
            feed_dict[Y] = Y_

            batch_loss, _, summary = sess.run([mean_loss, optimizer, summary_op], feed_dict=feed_dict)
            
            #writer.add_summary(batch_loss, epoch) 
            step_id = epoch_id * num_batches + batch_id
            print "step_id = %d" %step_id
            writer.add_summary(summary, global_step=step_id)

            av_batch_loss += batch_loss
            
            if batch_id % 500 == 0:
                print "\nFor epoch = %d, batch id = %d, batch loss = %f\n" %(epoch_id, batch_id, batch_loss)
            
            if batch_id % 1000 == 0:
                print "\nFor epoch = %d, batch id = %d, batch loss = %f\n" %(epoch_id, batch_id, batch_loss)
                
                #print validation data
                sim = similarity.eval() # compute similarity
                
                #iterate over each validation example
                
                for i in range(validation_size):
                    word = idx2word[validation_set[i]]
                    top_k = 8
                    # sort indexes and pick top k. we take 1:top_k+1 since 0th top pick will the same word itself
                    nearest = (-sim[i,:]).argsort()[1:top_k+1]
                    
                    #nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    
                    log = '\t Nearest to %s : ' %word
                    for k in range(top_k):
                        nearest_word = idx2word[nearest[k]]
                        log = '%s %s,' %(log, nearest_word)
                    print log        

        print "\nFor epoch = %d, Av loss = %f" %(epoch_id, av_batch_loss/num_batches)
        
        #batch.reset()
        
    save_path = saver.save(sess, LOG_DIR)
    print("Model saved in file: %s" % save_path)
        
        
        
        

initialised

step_id = 0

For epoch = 0, batch id = 0, batch loss = 32.895508


For epoch = 0, batch id = 0, batch loss = 32.895508

	 Nearest to 1992 :  oxygen, path, history, critics, 1846, furniture, chapel, decree,
	 Nearest to funded :  expectation, blockade, sudan, ed, 44, resident, richards, obvious,
	 Nearest to pennant :  defend, lung, shallow, educated, mind, contributing, geoffrey, reign,
	 Nearest to companions :  front, developers, poetic, fraud, earliest, lincolns, 1936, odd,
	 Nearest to minimal :  aviation, africans, sexual, them, crossbow, 300000, relied, vienna,
	 Nearest to lights :  volunteer, valve, darkness, 1924, legs, counsel, alike, underground,
	 Nearest to lattice :  relying, make, donald, tried, acupuncture, coma, churchill, coordinated,
	 Nearest to releasing :  dallas, assert, jewelry, tended, knight, concentrations, undergraduate, afonso,
step_id = 1
step_id = 2
step_id = 3
step_id = 4
step_id = 5
step_id = 6
step_id = 7
step_id = 8
step_id = 9
step_id = 

# Plot the Embeddings 

## Tensorboard way

In [None]:
# Scratch check of the global step numbering: prints epoch_id * num_batches + batch_id
# for every (epoch, batch) combination.
# NOTE(review): this rebinds num_of_epochs and num_batches and does no plotting,
# despite the section heading above - confirm whether this cell is still needed.
num_of_epochs = 5
num_batches = 50

for epoch_id in range(num_of_epochs):
    for batch_id in range(num_batches):
        
        step_id = epoch_id * num_batches + batch_id
        
        print step_id
