---------

# BiGram Model

----------

#### Built on lines of  &nbsp;&nbsp;&nbsp;&nbsp;   [representation_learning/word2vec/BiGram_barebones_1_wiki.ipynb](http://localhost:8888/notebooks/representation_learning/word2vec/BiGram_barebones_1_wiki.ipynb)

### Added 
    1) tensorboard network visualizations   
    2) tensorboard loss visualizations
    3) similar words to words in validation data

#### Author : Anuj

#### Uses Wikipedia Dataset


In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline


In [0]:
import sys
import tensorflow as tf
import numpy as np
import random
import itertools

In [0]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:
sys.path.append("/content/drive//GoogleDrive_Utils/")

In [0]:
from readWikiData import get_wikipedia_data

## Load the data file - map tokens to Ids, convert data to Ids

In [0]:
def get_wiki_data(n_vocab_=1000):
    sentences, word2idx,  _, _ = get_wikipedia_data(n_vocab=n_vocab_, n_files=10, by_paragraph=True)
    training_data = []
    vocab_size = len(word2idx)
    for sentence in sentences:
        for elem1, elem2 in zip(sentence[:-1], sentence[1:]):
            training_data.append((elem1, elem2))
    
    # this destroys the order of words in a wondow but for bigram its harmless
    # all we want is - pair of all bigrams
    training_data = list(set(training_data))   
    
    idx2word = {v:k for k, v in word2idx.items()}
    return len(word2idx), training_data, word2idx, idx2word

In [0]:
vocab_size, training_data, word2idx, idx2word = get_wiki_data(n_vocab_=9999)

In [0]:
print (vocab_size)
print (type(training_data))
print (len(training_data))

10000
<class 'list'>
1664638


In [0]:
print (len(word2idx.keys()))
print (len(idx2word.keys()))

10000
10000


## Build validation set - randomly choose 100 keys from idx2word

In [0]:
# randomly pick some validation words from data

validation_size = 32
#validation_set = random.sample(idx2word.keys(), validation_size)
validation_set = random.sample(idx2word.keys(), validation_size)

In [0]:
print (validation_set)
print ([idx2word[index] for index in validation_set])

[9372, 9892, 4577, 3605, 8304, 4949, 2058, 8160, 7753, 8801, 5758, 6495, 2091, 9691, 7360, 2346, 1076, 6165, 2109, 6075, 2721, 736, 3125, 6011, 8746, 9144, 1177, 835, 9916, 7342, 5605, 6128]
['cerebral', '1700', 'algae', 'rear', '84', 'genres', '1972', 'deny', 'dive', 'deliberate', 'restaurant', 'encyclopedia', '1960', 'benedictine', 'viable', 'currency', 'necessary', 'linking', 'manner', 'farms', 'bulgaria', '100', 'cuisine', 'smiths', 'pursuing', 'asylum', 'marriage', 'mainly', 'suffrage', 'therapeutic', 'norm', 'clusters']


### Get batches

In [0]:
# contains list of pairs that have already been selected
bucket_list = []

def getNextBatch(bi_grams_, batch_size=1000):
    
    global bucket_list
    
    # list of possible pairs to pick from
    docs_ids_to_select = list(set(bi_grams_) - set(bucket_list))
    
    # once you exhaust the possible pais, reset
    if len(docs_ids_to_select) < batch_size:
        bucket_list = []
        docs_ids_to_select = bi_grams_
        
    # Initialize two variables 
    train_X = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    train_label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    # pick a random chunks of pairs 
    random_docs = random.sample(docs_ids_to_select, batch_size)
    bucket_list += random_docs
    
    index = 0 
    
    # Iterate threw all the docs 
    for item in random_docs:
        train_X[index] = item[0]
        train_label[index] = item[1]  
        index += 1
        
    #flatten list of lists to a single list
    train_X = list(itertools.chain(*train_X))
    train_label = list(itertools.chain(*train_label))
            
    return train_X, train_label

In [0]:
#X, Y = getNextBatch(bi_grams_=training_data, batch_size=32)

In [0]:
#print len(X), len(Y)

In [0]:
#print X[:10], Y[:10]

# Network

In [0]:
batch_size=32
num_batches = len(training_data)/batch_size

print ("Number of batches = %d" %num_batches)


embedding_dims = 128

Number of batches = 52019


In [0]:
X = tf.placeholder(shape=(batch_size,), dtype = tf.int32, name='X_var')
Y = tf.placeholder(shape=(batch_size,), dtype = tf.int32, name='Y_var')
valid_X = tf.Variable(validation_set, dtype=tf.int32, name='X_valid')

In [0]:
y_oh = tf.one_hot(indices=X, depth=vocab_size, name='Converting_Y_to_Y_oh')

In [0]:
print (X.get_shape())
print (Y.get_shape())
print (y_oh.get_shape())

(32,)
(32,)
(32, 10000)


In [0]:
embedding_layer_1 = tf.Variable(tf.truncated_normal(
    shape=(vocab_size, embedding_dims),mean=0.0, stddev=1.0, dtype=tf.float32), name="Embeddings_Matrix") 
embeded = tf.nn.embedding_lookup(embedding_layer_1, ids=X, name="Embedding_LookUp")

In [0]:
embeded.get_shape()

TensorShape([Dimension(32), Dimension(128)])

In [0]:
#softmax weights, bias
W = tf.Variable(tf.truncated_normal(
    shape=(embedding_dims, vocab_size),mean=0.0, stddev=1.0, dtype=tf.float32), name="Softmax_Weights_Matrix")
b = tf.Variable(tf.zeros(shape=(vocab_size,)), name="Softmax_Bias_Vector")


In [0]:
logits = tf.add(tf.matmul(embeded, W, name="WX"), b, name="WX_plus_b")

#logits = tf.add(tf.matmul(embed, softmax_weights, name="WX"), softmax_bias, name="WX_plus_b")

loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_oh, name="Compute_Loss")
#mean_loss = tf.reduce_mean(loss)
mean_loss = tf.reduce_mean(loss, name="Compute_mean_loss")

tf.summary.scalar("mean_loss", mean_loss)



<tf.Tensor 'mean_loss:0' shape=() dtype=string>

In [0]:
print (logits.get_shape())
print (y_oh.get_shape())

(32, 10000)
(32, 10000)


In [0]:
optimizer = tf.train.GradientDescentOptimizer(0.5, name="Optimizer").minimize(mean_loss)

In [0]:
summary_op = tf.summary.merge_all()

In [0]:
#compute L2 norm for cosine similarity
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_layer_1), axis=1, keepdims=True))
normalised_embeddings = embedding_layer_1 / norm

# get validation set embeddings
validation_data_embeddings = tf.nn.embedding_lookup(normalised_embeddings, ids=valid_X, name="validation_embeddings_lookup")

# similarity score of validation embeddings w.r.t normalised= dot product between validation_data_embeddings and mornalised embeddings
similarity = tf.matmul(validation_data_embeddings, tf.transpose(normalised_embeddings))  # C.A = C x transpose(A)

# Training

In [0]:
num_of_epochs = 20
LOG_DIR = './bigram_wiki_chk_pts'

print ("Number of batches = %d" %num_batches)
print ("Number of epochs = %d" %num_of_epochs)


validation_size = 8

Number of batches = 52019
Number of epochs = 20


In [0]:
# demo params 

num_of_epochs = 5
num_batches = 50

In [0]:


# A SIMPLE saver() to save the model
saver = tf.train.Saver()

with tf.Session() as sess:
    
    # writer to write graph to tensorboard
    writer = tf.summary.FileWriter(LOG_DIR, sess.graph)

    tf.global_variables_initializer().run()
    print ("initialised\n")

    for epoch_id in range(num_of_epochs):

        av_batch_loss = 0

        for batch_id in range(num_batches):

            X_, Y_ = getNextBatch(bi_grams_=training_data, batch_size=batch_size)

            feed_dict = {}
            feed_dict[X] = X_
            feed_dict[Y] = Y_

            batch_loss, _, summary = sess.run([mean_loss, optimizer, summary_op], feed_dict=feed_dict)
            
            #writer.add_summary(batch_loss, epoch) 
            step_id = epoch_id * num_batches + batch_id
            print ("step_id = %d" %step_id)
            writer.add_summary(summary, global_step=step_id)

            av_batch_loss += batch_loss
            
            if batch_id % 500 == 0:
                print ("\nFor epoch = %d, batch id = %d, batch loss = %f\n" %(epoch_id, batch_id, batch_loss))
            
            if batch_id % 1000 == 0:
                print ("\nFor epoch = %d, batch id = %d, batch loss = %f\n" %(epoch_id, batch_id, batch_loss))
                
                #print validation data
                sim = similarity.eval() # compute similarity
                
                #iterate over each validation example
                
                for i in range(validation_size):
                    word = idx2word[validation_set[i]]
                    top_k = 8
                    # sort indexes and pick top k. we take 1:top_k+1 since 0th top pick will the same word itself
                    nearest = (-sim[i,:]).argsort()[1:top_k+1]
                    
                    #nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    
                    log = '\t Nearest to %s : ' %word
                    for k in range(top_k):
                        nearest_word = idx2word[nearest[k]]
                        log = '%s %s,' %(log, nearest_word)
                    print (log)        

        print ("\nFor epoch = %d, Av loss = %f" %(epoch_id, av_batch_loss/num_batches))
        
        #batch.reset()
        
    save_path = saver.save(sess, LOG_DIR)
    print("Model saved in file: %s" % save_path)
        
        
        
        

initialised

step_id = 0

For epoch = 0, batch id = 0, batch loss = 32.715607


For epoch = 0, batch id = 0, batch loss = 32.715607

	 Nearest to cerebral :  transferred, mining, proprietary, seasons, 1000, toll, 1900, four,
	 Nearest to 1700 :  car, fate, verse, junction, meaning, includes, apartment, end,
	 Nearest to algae :  careful, therapies, ensure, such, split, persistent, age, sony,
	 Nearest to rear :  rom, defenders, commodity, conscription, lawyers, drake, skin, chemicals,
	 Nearest to 84 :  battleship, hitchcock, 1846, czechoslovakia, fleet, recovery, achieving, transformation,
	 Nearest to genres :  disappeared, toronto, proclamation, circulated, absent, 2000s, implementations, ethanol,
	 Nearest to 1972 :  commonwealth, dog, epistle, rebuilding, happened, keeping, registration, peter,
	 Nearest to deny :  wild, knife, active, 62, warner, pursuit, sends, commodities,
step_id = 1
step_id = 2
step_id = 3
step_id = 4
step_id = 5
step_id = 6
step_id = 7
step_id = 8
step_id = 

# Plot the Embeddings 

## Tensorboard way

In [0]:
num_of_epochs = 5
num_batches = 50

for epoch_id in range(num_of_epochs):
    for batch_id in range(num_batches):
        
        step_id = epoch_id * num_batches + batch_id
        
        print (step_id)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
