---------

# Gentle introduction to BiGram model

----------

### Objective is correctness and clarity of basic concepts and not efficiency

#### Author : Anuj

#### Uses Wikipedia Dataset

In [39]:
import sys
import tensorflow as tf
import numpy as np
import random
import itertools

In [40]:
sys.path.append("../../Utils/")

In [41]:
from readWikiData import get_wikipedia_data

In [42]:
def get_wiki_data(n_vocab_=1000):
    sentences, word2idx, _, _ = get_wikipedia_data(n_vocab=n_vocab_, n_files=10, by_paragraph=True)
    training_data = []
    vocab_size = len(word2idx)
    for sentence in sentences:
        for elem1, elem2 in zip(sentence[:-1], sentence[1:]):
            training_data.append((elem1, elem2))
    
    # this destroys the order of words in a wondow but for bigram its harmless
    # all we want is - pair of all bigrams
    training_data = list(set(training_data))   
    
    idx2word = {v:k for k, v in word2idx.items()}
    return len(word2idx), training_data, word2idx, idx2word

In [43]:
vocab_size, training_data, word2idx, idx2word = get_wiki_data(n_vocab_=9999)

In [44]:
print (vocab_size)
print (type(training_data))

10000
<class 'list'>


In [45]:
training_data[:10]

[(1151, 604),
 (7332, 2077),
 (147, 4616),
 (2488, 1459),
 (8721, 904),
 (9999, 5827),
 (632, 8197),
 (8883, 6811),
 (3136, 5063),
 (147, 5285)]

##### Get batches

In [46]:
# contains list of pairs that have already been selected
bucket_list = []

def getNextBatch(bi_grams_, batch_size=1000):
    
    global bucket_list
    
    # list of possible pairs to pick from
    docs_ids_to_select = list(set(bi_grams_) - set(bucket_list))
    
    # once you exhaust the possible pais, reset
    if len(docs_ids_to_select) < batch_size:
        bucket_list = []
        docs_ids_to_select = bi_grams_
        
    # Initialize two variables 
    train_X = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    train_label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    # pick a random chunks of pairs 
    random_docs = random.sample(docs_ids_to_select, batch_size)
    bucket_list += random_docs
    
    index = 0 
    
    # Iterate threw all the docs 
    for item in random_docs:
        train_X[index] = item[0]
        train_label[index] = item[1]  
        index += 1
        
    #flatten list of lists to a single list
    train_X = list(itertools.chain(*train_X))
    train_label = list(itertools.chain(*train_label))
            
    return train_X, train_label

In [47]:
X, Y = getNextBatch(bi_grams_=training_data, batch_size=32)

In [48]:
print (len(X), len(Y))

32 32


In [49]:
print (X[:10], Y[:10])

[16, 1397, 8199, 2597, 2283, 6847, 2, 8, 1236, 1548] [1743, 265, 2398, 9999, 739, 28, 3100, 1256, 167, 4]


# Network

In [50]:
batch_size=32
num_batches = len(training_data)/batch_size

print ("Number of batches = %d" %num_batches)

embedding_dims = 128



Number of batches = 42816


In [51]:
X = tf.placeholder(shape=(batch_size,), dtype = tf.int32)
Y = tf.placeholder(shape=(batch_size,), dtype = tf.int32)

In [52]:
y_oh = tf.one_hot(indices=X, depth=vocab_size)

In [53]:
print (X.get_shape())
print (Y.get_shape())
print (y_oh.get_shape())

(32,)
(32,)
(32, 10000)


In [54]:
embedding_layer_1 = tf.Variable(tf.truncated_normal(shape=(vocab_size, embedding_dims), 
                                                    mean=0.0, stddev=1.0, dtype=tf.float32))
embeded = tf.nn.embedding_lookup(embedding_layer_1, ids=X)

In [55]:
embeded.get_shape()

TensorShape([Dimension(32), Dimension(128)])

In [56]:
#softmax weights, bias
W = tf.Variable(tf.truncated_normal(shape=(embedding_dims, vocab_size),mean=0.0, stddev=1.0, dtype=tf.float32))
b = tf.Variable(tf.zeros(shape=(vocab_size,)))


In [57]:
logits = tf.add(tf.matmul(embeded, W), b )

#define loss
loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_oh)
mean_loss = tf.reduce_mean(loss)

In [58]:
print (logits.get_shape())
print (y_oh.get_shape())

(32, 10000)
(32, 10000)


In [59]:
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(mean_loss)

In [60]:
num_of_epochs = 20

#batch = BatchData(batch_size=32, list_of_token_ids=data)

with tf.Session() as sess:

    tf.global_variables_initializer().run()

    for epoch_id in range(num_of_epochs):

        av_batch_loss = 0

        for batch_id in range(int(num_batches)):

            X_, Y_ = getNextBatch(bi_grams_=training_data, batch_size=batch_size)

            feed_dict = {}
            feed_dict[X] = X_
            feed_dict[Y] = Y_

            batch_loss, _ = sess.run([mean_loss, optimizer], feed_dict=feed_dict)

            av_batch_loss += batch_loss
            
            if batch_id % 100 == 0:
                print ("For epoch = %d, batch id = %d, batch loss = %f" %(epoch_id, batch_id, batch_loss))

        print ("For epoch = %d, Av loss = %f" %(epoch_id, av_batch_loss/num_batches))
        
        batch.reset()
        
        
        
        
        
        

For epoch = 0, batch id = 0, batch loss = 32.628548
For epoch = 0, batch id = 100, batch loss = 29.545616
For epoch = 0, batch id = 200, batch loss = 27.514614
For epoch = 0, batch id = 300, batch loss = 22.346369
For epoch = 0, batch id = 400, batch loss = 21.961580
For epoch = 0, batch id = 500, batch loss = 17.446415
For epoch = 0, batch id = 600, batch loss = 22.300009
For epoch = 0, batch id = 700, batch loss = 14.901987
For epoch = 0, batch id = 800, batch loss = 13.546486
For epoch = 0, batch id = 900, batch loss = 12.539949
For epoch = 0, batch id = 1000, batch loss = 15.827174
For epoch = 0, batch id = 1100, batch loss = 13.441044
For epoch = 0, batch id = 1200, batch loss = 13.772314
For epoch = 0, batch id = 1300, batch loss = 12.863209
For epoch = 0, batch id = 1400, batch loss = 11.866285
For epoch = 0, batch id = 1500, batch loss = 12.184096
For epoch = 0, batch id = 1600, batch loss = 10.843259
For epoch = 0, batch id = 1700, batch loss = 7.720001
For epoch = 0, batch id

KeyboardInterrupt: 