---------

# BiGram Model

----------

### Objective is correctness and clarity of concepts and not efficiency

#### Author : Anuj

#### Uses Wikipedia Dataset

#### Built on lines of  &nbsp;&nbsp;&nbsp;&nbsp;   [representation_learning/word2vec/BiGram_barebones_1_wiki.ipynb](http://localhost:8888/notebooks/representation_learning/word2vec/BiGram_barebones_1_wiki.ipynb)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline


In [2]:
import sys
import tensorflow as tf
import numpy as np
import random
import itertools

In [3]:
sys.path.append("../Utils/")

In [4]:
from readWikiData import get_wikipedia_data

## Load the data file - map tokens to Ids, convert data to Ids

In [5]:
def get_wiki_data(n_vocab_=1000):
    sentences, word2idx = get_wikipedia_data(n_vocab=n_vocab_, n_files=10, by_paragraph=True)
    training_data = []
    vocab_size = len(word2idx)
    for sentence in sentences:
        for elem1, elem2 in zip(sentence[:-1], sentence[1:]):
            training_data.append((elem1, elem2))
    
    # this destroys the order of words in a wondow but for bigram its harmless
    # all we want is - pair of all bigrams
    training_data = list(set(training_data))   
    
    idx2word = {v:k for k, v in word2idx.iteritems()}
    return len(word2idx), training_data, word2idx, idx2word

In [6]:
vocab_size, training_data, word2idx, idx2word = get_wiki_data(n_vocab_=9999)

In [7]:
print vocab_size
print type(training_data)
print len(training_data)

10000
<type 'list'>


## Build validation set - randomly choose 100 keys from idx2word

In [31]:
# randomly pick some validation words from data

validation_size = 32
#validation_set = random.sample(idx2word.keys(), validation_size)
validation_set = idx2word.keys()[500:500+validation_size]

In [32]:
print validation_set
print [idx2word[index] for index in validation_set]

[500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531]
['size', 'st', 'typically', 'model', 'eventually', 'head', 'title', 'far', 'religious', 'films', 'latin', 'television', 'china', 'album', 'born', 'soviet', 'class', 'society', 'strong', 'nature', 'food', 'therefore', 'value', 'includes', 'germany', 'half', 'influence', 'market', 'complex', 'culture', 'women', 'father']


### Get batches

In [30]:
# contains list of pairs that have already been selected
bucket_list = []

def getNextBatch(bi_grams_, batch_size=1000):
    
    global bucket_list
    
    # list of possible pairs to pick from
    docs_ids_to_select = list(set(bi_grams_) - set(bucket_list))
    
    # once you exhaust the possible pais, reset
    if len(docs_ids_to_select) < batch_size:
        bucket_list = []
        docs_ids_to_select = bi_grams_
        
    # Initialize two variables 
    train_X = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    train_label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    # pick a random chunks of pairs 
    random_docs = random.sample(docs_ids_to_select, batch_size)
    bucket_list += random_docs
    
    index = 0 
    
    # Iterate threw all the docs 
    for item in random_docs:
        train_X[index] = item[0]
        train_label[index] = item[1]  
        index += 1
        
    #flatten list of lists to a single list
    train_X = list(itertools.chain(*train_X))
    train_label = list(itertools.chain(*train_label))
            
    return train_X, train_label

In [11]:
#X, Y = getNextBatch(bi_grams_=training_data, batch_size=32)

In [12]:
#print len(X), len(Y)

32 32


In [13]:
#print X[:10], Y[:10]

# Network

In [14]:
batch_size=32
num_batches = len(training_data)/batch_size

embedding_dims = 128

In [15]:
X = tf.placeholder(shape=(batch_size,), dtype = tf.int32, name='X_var')
Y = tf.placeholder(shape=(batch_size,), dtype = tf.int32, name='Y_var')
valid_X = tf.Variable(validation_set, dtype=tf.int32, name='X_valid')

In [16]:
y_oh = tf.one_hot(indices=X, depth=vocab_size, name='Converting_Y_to_Y_oh')

In [17]:
print X.get_shape()
print Y.get_shape()
print y_oh.get_shape()

(32,)
(32,)
(32, 10000)


In [18]:
embedding_layer_1 = tf.Variable(tf.truncated_normal(
    shape=(vocab_size, embedding_dims),mean=0.0, stddev=1.0, dtype=tf.float32), name="Embeddings_Matrix") 
embeded = tf.nn.embedding_lookup(embedding_layer_1, ids=X, name="Embedding_LookUp")

In [19]:
embeded.get_shape()

TensorShape([Dimension(32), Dimension(128)])

In [20]:
#softmax weights, bias
W = tf.Variable(tf.truncated_normal(
    shape=(embedding_dims, vocab_size),mean=0.0, stddev=1.0, dtype=tf.float32), name="Softmax_Weights_Matrix")
b = tf.Variable(tf.zeros(shape=(vocab_size,)), name="Softmax_Bias_Vector")


In [21]:
logits = tf.add(tf.matmul(embeded, W, name="WX"), b, name="WX_plus_b")

#logits = tf.add(tf.matmul(embed, softmax_weights, name="WX"), softmax_bias, name="WX_plus_b")

loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_oh, name="Compute_Loss")
#mean_loss = tf.reduce_mean(loss)
mean_loss = tf.reduce_mean(loss, name="Compute_mean_loss")

tf.summary.scalar("mean_loss", mean_loss)



<tf.Tensor 'mean_loss:0' shape=() dtype=string>

In [22]:
print logits.get_shape()
print y_oh.get_shape()

(32, 10000)
(32, 10000)


In [23]:
optimizer = tf.train.GradientDescentOptimizer(0.5, name="Optimizer").minimize(mean_loss)

In [24]:
summary_op = tf.summary.merge_all()

In [25]:
#compute L2 norm for cosine similarity
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_layer_1), axis=1, keep_dims=True))
normalised_embeddings = embedding_layer_1 / norm

# get validation set embeddings
validation_data_embeddings = tf.nn.embedding_lookup(normalised_embeddings, ids=valid_X, name="validation_embeddings_lookup")

# similarity score of validation embeddings w.r.t normalised= dot product between validation_data_embeddings and mornalised embeddings
similarity = tf.matmul(validation_data_embeddings, tf.transpose(normalised_embeddings))  # C.A = C x transpose(A)

# Training

In [33]:
num_of_epochs = 20
LOG_DIR = './bigram_wiki_chk_pts'

print "Number of batches = %d" %num_batches
print "Number of epochs = %d" %num_of_epochs


validation_size = validation_size/4 # Tempararoy 

# A SIMPLE saver() to save the model
saver = tf.train.Saver()

#batch = BatchData(batch_size=32, list_of_token_ids=data)

with tf.Session() as sess:
    
    # writer to write graph to tensorboard
    writer = tf.summary.FileWriter(LOG_DIR, sess.graph)

    tf.global_variables_initializer().run()
    print "initialised\n"

    for epoch_id in range(num_of_epochs):

        av_batch_loss = 0

        for batch_id in range(num_batches):

            X_, Y_ = getNextBatch(bi_grams_=training_data, batch_size=batch_size)

            feed_dict = {}
            feed_dict[X] = X_
            feed_dict[Y] = Y_

            batch_loss, _, summary = sess.run([mean_loss, optimizer,summary_op], feed_dict=feed_dict)
            
            #writer.add_summary(batch_loss, epoch) 
            writer.add_summary(summary, global_step=epoch_id)

            av_batch_loss += batch_loss
            
            if batch_id % 500 == 0:
                print "\nFor epoch = %d, batch id = %d, batch loss = %f\n" %(epoch_id, batch_id, batch_loss)
            
            if batch_id % 1000 == 0:
                print "\nFor epoch = %d, batch id = %d, batch loss = %f\n" %(epoch_id, batch_id, batch_loss)
                
                #print validation data
                sim = similarity.eval() # compute similarity
                
                #iterate over each validation example
                
                for i in range(validation_size):
                    word = idx2word[validation_set[i]]
                    top_k = 8
                    # sort indexes and pick top k. we take 1:top_k+1 since 0th top pick will the same word itself
                    nearest = (-sim[i,:]).argsort()[1:top_k+1]
                    
                    #nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    
                    log = '\t Nearest to %s : ' %word
                    for k in range(top_k):
                        nearest_word = idx2word[nearest[k]]
                        log = '%s %s,' %(log, nearest_word)
                    print log        

        print "\nFor epoch = %d, Av loss = %f" %(epoch_id, av_batch_loss/num_batches)
        
        #batch.reset()
        
    save_path = saver.save(sess, LOG_DIR)
    print("Model saved in file: %s" % save_path)
        
        
        
        

initialised


For epoch = 0, batch id = 0, batch loss = 31.308790


For epoch = 0, batch id = 0, batch loss = 31.308790

	 Nearest to size :  nearest, aegean, n, gifts, ahead, acceleration, andrew, advice,
	 Nearest to st :  not, asl, superman, monopoly, inland, close, scripture, publications,
	 Nearest to typically :  postseason, track, lawrence, lions, analogy, temples, pounds, pennant,
	 Nearest to model :  counting, teach, domination, january, fall, losing, lincoln, note,
	 Nearest to eventually :  technological, embroidery, pedro, atari, reactions, advised, impose, concord,
	 Nearest to head :  grid, shed, conceived, much, complement, nomenclature, bernoulli, often,
	 Nearest to title :  navigation, evident, andrews, may, finals, squares, appeal, 1844,
	 Nearest to far :  regional, thereafter, classics, synonymous, everyday, abilities, therapeutic, northwest,

For epoch = 0, batch id = 500, batch loss = 20.307142


For epoch = 0, batch id = 1000, batch loss = 18.072203


For epoch

	 Nearest to far :  classics, regional, thereafter, northwest, abilities, rapidly, moor, investigations,

For epoch = 0, batch id = 8500, batch loss = 0.719281


For epoch = 0, batch id = 9000, batch loss = 0.527985


For epoch = 0, batch id = 9000, batch loss = 0.527985

	 Nearest to size :  nearest, ahead, n, aegean, donated, advice, andrew, gifts,
	 Nearest to st :  not, asl, presidency, finish, atp, defended, superman, proper,
	 Nearest to typically :  postseason, track, temples, lions, analogy, lawrence, pennant, transformed,
	 Nearest to model :  teach, counting, domination, global, january, note, consecrated, yeast,
	 Nearest to eventually :  technological, pedro, embroidery, requirement, reactions, most, assault, concord,
	 Nearest to head :  grid, shed, often, conceived, much, nomenclature, decreases, bernoulli,
	 Nearest to title :  navigation, may, evident, finals, andrews, squares, an, 130,
	 Nearest to far :  classics, regional, thereafter, northwest, abilities, moor, rapi

	 Nearest to far :  classics, regional, thereafter, northwest, 87, moor, rapidly, investigations,

For epoch = 0, batch id = 17500, batch loss = 0.016396


For epoch = 0, batch id = 18000, batch loss = 0.007790


For epoch = 0, batch id = 18000, batch loss = 0.007790

	 Nearest to size :  nearest, ahead, n, aegean, donated, andrew, advice, gifts,
	 Nearest to st :  not, asl, finish, presidency, atp, defended, proper, superman,
	 Nearest to typically :  postseason, track, temples, lions, lawrence, analogy, preservation, transformed,
	 Nearest to model :  teach, counting, domination, global, january, note, yeast, consecrated,
	 Nearest to eventually :  technological, pedro, requirement, embroidery, reactions, most, assault, 130,
	 Nearest to head :  grid, shed, often, conceived, nomenclature, much, decreases, bernoulli,
	 Nearest to title :  navigation, may, evident, finals, an, squares, andrews, engineering,
	 Nearest to far :  classics, regional, thereafter, northwest, 87, moor, rapidl


For epoch = 0, batch id = 26500, batch loss = 0.005152


For epoch = 0, batch id = 27000, batch loss = 0.007021


For epoch = 0, batch id = 27000, batch loss = 0.007021

	 Nearest to size :  nearest, ahead, n, donated, aegean, andrew, advice, gifts,
	 Nearest to st :  not, finish, asl, presidency, atp, defended, proper, superman,
	 Nearest to typically :  postseason, temples, track, lions, lawrence, analogy, preservation, transformed,
	 Nearest to model :  teach, counting, domination, global, january, note, yeast, weights,
	 Nearest to eventually :  technological, pedro, requirement, embroidery, reactions, most, assault, 130,
	 Nearest to head :  grid, shed, often, conceived, nomenclature, much, decreases, american,
	 Nearest to title :  navigation, may, evident, an, finals, squares, andrews, engineering,
	 Nearest to far :  classics, regional, thereafter, northwest, 87, moor, rapidly, investigations,

For epoch = 0, batch id = 27500, batch loss = 0.395552


For epoch = 0, batch id = 

	 Nearest to far :  classics, regional, thereafter, northwest, 87, rapidly, moor, investigations,

For epoch = 0, batch id = 35500, batch loss = 0.011230


For epoch = 0, batch id = 36000, batch loss = 0.003272


For epoch = 0, batch id = 36000, batch loss = 0.003272

	 Nearest to size :  nearest, ahead, n, donated, aegean, andrew, advice, gifts,
	 Nearest to st :  not, finish, asl, presidency, atp, defended, proper, superman,
	 Nearest to typically :  postseason, temples, track, lions, lawrence, analogy, preservation, transformed,
	 Nearest to model :  teach, counting, domination, global, january, note, yeast, weights,
	 Nearest to eventually :  technological, pedro, requirement, embroidery, reactions, most, assault, 130,
	 Nearest to head :  grid, shed, often, nomenclature, conceived, much, decreases, american,
	 Nearest to title :  navigation, may, evident, an, finals, squares, andrews, engineering,
	 Nearest to far :  classics, regional, thereafter, northwest, 87, rapidly, moor, in

	 Nearest to far :  classics, regional, thereafter, northwest, 87, rapidly, moor, investigations,

For epoch = 0, batch id = 44500, batch loss = 0.003638


For epoch = 0, batch id = 45000, batch loss = 0.002484


For epoch = 0, batch id = 45000, batch loss = 0.002484

	 Nearest to size :  nearest, ahead, n, donated, aegean, andrew, advice, gifts,
	 Nearest to st :  not, finish, asl, presidency, atp, defended, proper, superman,
	 Nearest to typically :  postseason, temples, track, lions, lawrence, analogy, preservation, transformed,
	 Nearest to model :  teach, counting, domination, global, january, note, yeast, weights,
	 Nearest to eventually :  technological, pedro, requirement, embroidery, reactions, most, assault, 130,
	 Nearest to head :  grid, shed, often, nomenclature, conceived, much, decreases, american,
	 Nearest to title :  navigation, may, evident, an, finals, squares, andrews, engineering,
	 Nearest to far :  classics, regional, thereafter, northwest, 87, rapidly, moor, in

KeyboardInterrupt: 