---------

# Gentle introduction to BiGram model

----------

### Objective is correctness and clarity of basic concepts and not efficiency

#### Author : Anuj

#### Uses Wikipedia Dataset

In [1]:
import sys
import tensorflow as tf
import numpy as np
import random
import itertools

In [2]:
sys.path.append("../../Utils/")

In [3]:
from readWikiData import get_wikipedia_data

In [4]:
def get_wiki_data(n_vocab_=1000):
    sentences, word2idx = get_wikipedia_data(n_vocab=n_vocab_, n_files=10, by_paragraph=True)
    training_data = []
    vocab_size = len(word2idx)
    for sentence in sentences:
        for elem1, elem2 in zip(sentence[:-1], sentence[1:]):
            training_data.append((elem1, elem2))
    
    # this destroys the order of words in a wondow but for bigram its harmless
    # all we want is - pair of all bigrams
    training_data = list(set(training_data))   
    
    idx2word = {v:k for k, v in word2idx.iteritems()}
    return len(word2idx), training_data, word2idx, idx2word

In [5]:
vocab_size, training_data, word2idx, idx2word = get_wiki_data(n_vocab_=9999)

In [6]:
print vocab_size
print type(training_data)

10000
<type 'list'>


In [7]:
training_data[:10]

[(1760, 65),
 (178, 1155),
 (2711, 159),
 (607, 1522),
 (312, 241),
 (9999, 5827),
 (741, 1932),
 (3102, 2312),
 (2926, 6061),
 (390, 2036)]

##### Get batches

In [8]:
# contains list of pairs that have already been selected
bucket_list = []

def getNextBatch(bi_grams_, batch_size=1000):
    
    global bucket_list
    
    # list of possible pairs to pick from
    docs_ids_to_select = list(set(bi_grams_) - set(bucket_list))
    
    # once you exhaust the possible pais, reset
    if len(docs_ids_to_select) < batch_size:
        bucket_list = []
        docs_ids_to_select = bi_grams_
        
    # Initialize two variables 
    train_X = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    train_label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    # pick a random chunks of pairs 
    random_docs = random.sample(docs_ids_to_select, batch_size)
    bucket_list += random_docs
    
    index = 0 
    
    # Iterate threw all the docs 
    for item in random_docs:
        train_X[index] = item[0]
        train_label[index] = item[1]  
        index += 1
        
    #flatten list of lists to a single list
    train_X = list(itertools.chain(*train_X))
    train_label = list(itertools.chain(*train_label))
            
    return train_X, train_label

In [9]:
X, Y = getNextBatch(bi_grams_=training_data, batch_size=32)

In [10]:
print len(X), len(Y)

32 32


In [11]:
print X[:10], Y[:10]

[6307, 4386, 1937, 20, 855, 23, 8, 3634, 6, 536] [1064, 116, 13, 1275, 1660, 4032, 222, 3260, 5019, 4706]


# Network

In [23]:
batch_size=32
num_batches = len(training_data)/batch_size

print "Number of batches = %d" %num_batches

embedding_dims = 128



Number of batches = 51873


In [13]:
X = tf.placeholder(shape=(batch_size,), dtype = tf.int32)
Y = tf.placeholder(shape=(batch_size,), dtype = tf.int32)

In [14]:
y_oh = tf.one_hot(indices=X, depth=vocab_size)

In [15]:
print X.get_shape()
print Y.get_shape()
print y_oh.get_shape()

(32,)
(32,)
(32, 10000)


In [16]:
embedding_layer_1 = tf.Variable(tf.truncated_normal(shape=(vocab_size, embedding_dims), 
                                                    mean=0.0, stddev=1.0, dtype=tf.float32))
embeded = tf.nn.embedding_lookup(embedding_layer_1, ids=X)

In [17]:
embeded.get_shape()

TensorShape([Dimension(32), Dimension(128)])

In [18]:
#softmax weights, bias
W = tf.Variable(tf.truncated_normal(shape=(embedding_dims, vocab_size),mean=0.0, stddev=1.0, dtype=tf.float32))
b = tf.Variable(tf.zeros(shape=(vocab_size,)))


In [19]:
logits = tf.add(tf.matmul(embeded, W), b )

#define loss
loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_oh)
mean_loss = tf.reduce_mean(loss)

In [20]:
print logits.get_shape()
print y_oh.get_shape()

(32, 10000)
(32, 10000)


In [21]:
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(mean_loss)

In [22]:
num_of_epochs = 20

#batch = BatchData(batch_size=32, list_of_token_ids=data)

with tf.Session() as sess:

    tf.global_variables_initializer().run()

    for epoch_id in range(num_of_epochs):

        av_batch_loss = 0

        for batch_id in range(num_batches):

            X_, Y_ = getNextBatch(bi_grams_=training_data, batch_size=batch_size)

            feed_dict = {}
            feed_dict[X] = X_
            feed_dict[Y] = Y_

            batch_loss, _ = sess.run([mean_loss, optimizer], feed_dict=feed_dict)

            av_batch_loss += batch_loss
            
            if batch_id % 100 == 0:
                print "For epoch = %d, batch id = %d, batch loss = %f" %(epoch_id, batch_id, batch_loss)

        print "For epoch = %d, Av loss = %f" %(epoch_id, av_batch_loss/num_batches)
        
        batch.reset()
        
        
        
        
        
        

For epoch = 0, batch id = 0, batch loss = 33.704872
For epoch = 0, batch id = 100, batch loss = 30.799883
For epoch = 0, batch id = 200, batch loss = 25.704319
For epoch = 0, batch id = 300, batch loss = 18.266960
For epoch = 0, batch id = 400, batch loss = 18.454296
For epoch = 0, batch id = 500, batch loss = 19.174532
For epoch = 0, batch id = 600, batch loss = 16.017178
For epoch = 0, batch id = 700, batch loss = 11.960627
For epoch = 0, batch id = 800, batch loss = 14.330009
For epoch = 0, batch id = 900, batch loss = 16.777081
For epoch = 0, batch id = 1000, batch loss = 14.439182
For epoch = 0, batch id = 1100, batch loss = 9.905437
For epoch = 0, batch id = 1200, batch loss = 13.311388
For epoch = 0, batch id = 1300, batch loss = 10.752110
For epoch = 0, batch id = 1400, batch loss = 12.372339
For epoch = 0, batch id = 1500, batch loss = 8.329511
For epoch = 0, batch id = 1600, batch loss = 7.962450
For epoch = 0, batch id = 1700, batch loss = 9.623573
For epoch = 0, batch id = 

KeyboardInterrupt: 