In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import collections
import math
import os
import random
import zipfile

In [3]:
#file download utilities
from six.moves import urllib
from six.moves import xrange

In [4]:
import numpy as np
import tensorflow as tf


In [5]:
print(np.__version__)
print(tf.__version__)

1.14.1
1.5.0


In [6]:
DOWNLOADED_FILENAME = 'SampleText.zip'

In [7]:
def maybe_download(url_path, expected_bytes):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)
    
    statinfo = os.stat(DOWNLOADED_FILENAME)
    if statinfo.st_size == expected_bytes:
        print('Found and verified file from this path: ', url_path)
        print('Downloaded file: ', DOWNLOADED_FILENAME)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify file from: ' + url_path + '. Can you get to it from a browser?')
        
    

In [8]:
def read_words():
    with zipfile.ZipFile(DOWNLOADED_FILENAME) as f:
        firstfile = f.namelist()[0]
        filestring = tf.compat.as_str(f.read(firstfile))
        words = filestring.split()
    return words

In [9]:
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILESIZE = 313344016

maybe_download(URL_PATH, FILESIZE)

31344016


Exception: Failed to verify file from: http://mattmahoney.net/dc/text8.zip. Can you get to it from a browser?

In [10]:
vocabulary = read_words()

In [11]:
len(vocabulary)

17005207

In [33]:
def build_dataset(words, n_words):
    word_counts = [['UNKNOWN', -1]]
    
    counter = collections.Counter(words)
    word_counts.extend(counter.most_common(n_words-1))
    
    dictionary = dict()
    
    for word, _ in word_counts:
        dictionary[word] = len(dictionary)#assign word to the dictionary length at this step
        #the most common word gets the least index. as word_counts contains words from highest to lowest occurence
        
    word_indexes = list()
    
    unknown_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0 #dictionary['UNKNOWN]
            unknown_count += 1
        
        word_indexes.append(index)#all the words in their index form with Zero for the words except the top n_words
        
    word_counts[0][1] = unknown_count
    
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    #maps values to its coutns {1:"is", 2356: "how", 43:"this"}
    
    return word_counts, word_indexes, dictionary, reversed_dictionary


In [34]:
VOCABULARY_SIZE = 5000

word_counts, word_indexes, dictionary, reversed_dictionary = build_dataset(vocabulary, VOCABULARY_SIZE)


In [35]:
word_counts[:10]

[['UNKNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [36]:
word_indexes[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [38]:
import random



In [41]:
for key in random.sample(list(dictionary), 10):
    print(key, ":", dictionary[key])


sight : 4426
sons : 2112
money : 815
forming : 2834
yellow : 2458
was : 18
battery : 4239
top : 563
cook : 3566
binding : 4060


In [44]:
for key in random.sample(list(reversed_dictionary), 10):
    print(key, ":", reversed_dictionary[key])

3403 : campbell
4462 : retrieved
4265 : hosts
3710 : superman
2118 : bwv
1973 : count
4620 : capabilities
848 : origin
1420 : christians
4838 : enabled


In [45]:
del vocabulary

In [46]:
#Global index into words maintained acroos batches
global_index = 0

In [49]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    #num_skips : the number of words we choose from a context window
    #num_skips number of words are chosen at random as target word which will be predicted using the context window center word.
    #skip_window : number of neighbors we want to consider to the left and to the right
    # if skip_window = 3 then context window contains 3 words to the left and 3 words to the right
    # The quick brown fox jumped over the lazy dog
    # if fox is center word oof the context window
    # then we pick num_skips i.e., 2 words at random to be predicted as a target word using center word "fox"
    # therefore neural network we'd set up uses  "fox" to predict -> "jumped" and "fox" -> "quick" jumped and quick are chosen at random

    global global_index # where we are in the document
    
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1), dtype=np.int32)
    #labels : array of arrays containing the indexes of the target predicted words
    #from the above example it would be the count of the target words, jumped and quick
    #[[1], [0], [0], ..... <batch_size>]
    
    span = 2 * skip_window + 1 # [skip_window input_word skip_window]
    #if the total size of the context window skip_window words on the left skip_wondw words on the right and the center word.
    buffer = collections.deque(maxlen = span)# double ended queue
    #very fast addition and removal of words from either ends
    
    for _ in range(span):
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)#making sure to reset global_index back to zero once done with span items
    
    for i in range(batch_size // num_skips):
        target = skip_window #input word at the center of the buffer
        targets_to_avoid = [skip_window]
        
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span-1)
            
            targets_to_avoid.append(target)
            
            batch[i * num_skips + j] = buffer[skip_window] #this is the input word
            labels[i * num_skips + j, 0] = buffer[target] # these are the context words
        
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
        
    global_index = (global_index + len(word_indexes) - span) % len(word_indexes)
    
            
    return batch, labels

In [50]:
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [51]:
batch

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156])

In [52]:
labels

array([[ 195],
       [  46],
       [   6],
       [ 195],
       [ 742],
       [ 156],
       [   0],
       [ 195],
       [3134],
       [ 742]])

In [54]:
for i in range(9):
    print(reversed_dictionary[batch[i]], ":", reversed_dictionary[labels[i][0]])

of : term
of : first
abuse : a
abuse : term
first : working
first : against
used : UNKNOWN
used : term
against : abuse


In [55]:
#Reset the global index backk to zero
global_index = 0

In [56]:
valid_size = 16
valid_window = 100

valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [58]:
batch_size = 128 #128 input words fed in and their targets correspondingly
embedding_size = 50 #number of dimensions our embedding will have. that's 50 neurons in the hidden layer of the neural network
skip_window = 2
num_skips = 2

In [61]:
tf.reset_default_graph()#fresh tensorflow graph

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])#every iteration = 128 bits of data fed in
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])


In [63]:
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [69]:
embeddings = tf.Variable(tf.random_uniform([VOCABULARY_SIZE, embedding_size], -1.0, 1.0))
#embeddings are generated using training data set and stored in variables 
#the shape = vocabsize:50 => every word in vocab has an embedding
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [70]:
embeddings

<tf.Variable 'Variable_4:0' shape=(5000, 50) dtype=float32_ref>

In [73]:
embed

<tf.Tensor 'embedding_lookup_3:0' shape=(128, 50) dtype=float32>

In [75]:
#the hidden layer will be setup usng a linear function with a linear equation 
# y = wx + b 
#where w is the set of weights or the weight matrix

In [76]:
weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE, embedding_size], stddev=1.0/math.sqrt(embedding_size)))

In [79]:
#biases are also initialized

In [80]:
biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))

In [82]:
#hidden layer is generated by hidden = (embed_matrix * weights_matrix) + biases
# transpose is to make them compatible with matrix multiplication as both the matrices have the same number of columns whic is the embedding size
#transposing maakes matrix1's columns match matrix2's row count

In [83]:
hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

In [84]:
hidden_out

<tf.Tensor 'add_1:0' shape=(128, 5000) dtype=float32>

In [85]:
#order of embed 128x50
#order of weights 5000x50 order of transpose of weights = 50x5000
#order of hidden_out 128x5000

In [96]:
train_one_hot = tf.one_hot(train_labels, VOCABULARY_SIZE)

#the input is converted into a one hot notation in order to be able to use the 
# SOFTMAX prediction layer at the output to interpret the hidden layer's output and give us the actual output

In [95]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, 
                                                              labels=train_one_hot))

In [100]:
#softmax prediction layer actually is slow and uses the cross entropy as the loss fucntion in order to minimise the error in the output
#uses the logit function 1/(1+e^(wx+B)) for one binary output and 
# uses 1/(e^-(wx+B)) fot the other binary output
#it can be further extended to any number of output probablities

In [101]:
#further this softmax's output is fed into a loss minimsation function that uses internally the gradient descent funtion

In [103]:
#cross entropy as a loss function is the measure of the matching between the actual and the predicted probability distributions
#More the difference => more the entropy => more the uncertainty

In [106]:
#need to be reduced. hence fed into the gradient descent optimizer with a step size of 0.1 to minimize the loss

In [108]:
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

In [109]:
#cosine similarity is useed
#l2 norms are to be calculated first for that 
#cosine(a,b) = a.b/(||a||.||b||)
#||a|| is the l2 norm of a which is nothing but the squre root of the summ of the squares of the vector
# if a = [1 2 3] l2norm(a) = sqrt(1^2 + 2^2 + 3^2)

In [111]:
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))

normalized_embeddings = embeddings / l2_norm

In [112]:
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

In [113]:
valid_embeddings

<tf.Tensor 'embedding_lookup_4:0' shape=(16, 50) dtype=float32>

In [114]:
normalized_embeddings

<tf.Tensor 'truediv:0' shape=(5000, 50) dtype=float32>

In [118]:
#gives the cosine similarity.

In [119]:
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [122]:
init = tf.global_variables_initializer()

In [126]:
num_steps = 2001

In [129]:
with tf.Session() as session:
    init.run()
    
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(word_indexes, batch_size, num_skips, skip_window)     
        
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        
        if step%200 == 0:
            if step > 0:
                average_loss /= 200
                
            print('Average loss at step ', step, ':', average_loss)
            average_loss = 0
            
        #Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 1000 == 0:
            sim = similarity.eval()
            
            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8 # number of  nearest neighbors
                
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = 'Nearest to %s: ' % valid_word
                
                for k in xrange(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
            print("\n")
            

Average loss at step  0 : 8.616981506347656
Nearest to which:  tropical, briefly, historic, algorithm, steel, achieved, province, respective,
Nearest to most:  sex, types, benjamin, avoid, colour, bringing, waters, whether,
Nearest to after:  taiwan, metro, punishment, signature, mathematician, might, cape, integers,
Nearest to and:  daily, contribution, inuit, twice, me, strict, invented, models,
Nearest to s:  charles, phenomenon, complete, john, sky, genesis, afc, sharp,
Nearest to system:  slavic, bibliography, equipped, shot, hamilton, vessels, piano, ford,
Nearest to all:  showing, elected, cd, si, substances, describe, drink, right,
Nearest to a:  general, actual, issued, board, types, norway, kernel, explained,
Nearest to more:  designed, arranged, storm, jacob, method, operator, did, differences,
Nearest to about:  caused, multi, compared, function, existence, ghost, begin, statistical,
Nearest to other:  farmers, annual, industries, max, apollo, doctrine, christmas, territori