In [2]:
import collections
import math
import os
import random
import zipfile

In [4]:
# services to be imported 

# urllib (imported through six.moves) provides the file-download helper used below
from six.moves import urllib
from six.moves import xrange


# six.moves exposes these utilities under the same names on both Python 2 and Python 3


In [5]:
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [6]:
# Print the NumPy and TensorFlow versions this notebook is running with.
print(np.__version__)
print(tf.__version__)

1.14.0
1.10.0


In [7]:
# Local file name under which the downloaded archive is stored on this machine.
DOWNLOAD_FILENAME='SampleText.zip'

In [15]:
def maybe_download(url_path,expected_bytes):
    """Download the archive if it is missing locally, then report on its size.

    Args:
        url_path: URL the archive is fetched from when DOWNLOAD_FILENAME does
            not exist yet.
        expected_bytes: size in bytes the local file is expected to have.

    Returns:
        None. The outcome of the size check is only printed.
    """
    already_present = os.path.exists(DOWNLOAD_FILENAME)
    if not already_present:
        urllib.request.urlretrieve(url_path, DOWNLOAD_FILENAME)

    # A size mismatch is used as the signal for a corrupted or partial download.
    actual_bytes = os.stat(DOWNLOAD_FILENAME).st_size
    if actual_bytes != expected_bytes:
        print("size not as expected download from website directly")
        print(actual_bytes)
        return

    print('found and verified file from this path', url_path)
    print('dounloaded file:', DOWNLOAD_FILENAME)
        

In [16]:
# function to read file and parse into individual words
def read_words():
    """Return the words of the first file inside the downloaded zip archive.

    The archive at DOWNLOAD_FILENAME is opened, its first member is read,
    converted to a string with tf.compat.as_str, and split on whitespace.

    Returns:
        A list of the whitespace-separated words, in file order.
    """
    with zipfile.ZipFile(DOWNLOAD_FILENAME) as archive:
        # Only the first member of the archive is used.
        first_member = archive.namelist()[0]
        text = tf.compat.as_str(archive.read(first_member))
        return text.split()

In [17]:
# Source URL of the archive and the size in bytes the downloaded file is expected to have.
URL_PATH="http://mattmahoney.net/dc/text8.zip"
FILESIZE= 31344016
# Download the archive unless it already exists locally, then report whether its size matches.
maybe_download(URL_PATH,FILESIZE)

found and verified file from this path http://mattmahoney.net/dc/text8.zip
dounloaded file: SampleText.zip


In [18]:
# Load every word of the input data set, in order, into one list.
vocabulary = read_words()
len(vocabulary)
# roughly 17 million words

17005207

In [33]:
# let us build the daya set in a format that will be useful to us 
# let's generate embeddings only for top n words=n_words
def build_dataset(words,n_words):
    """Encode a word sequence as integer ids restricted to the most common words.

    Args:
        words: sequence of word strings (the whole corpus, in order).
        n_words: size of the id space; id 0 is reserved for 'UNKNOWN' and the
            n_words - 1 most frequent words receive ids 1, 2, ... in
            descending order of frequency.

    Returns:
        A tuple (word_counts, word_indexes, dictionary, reversed_dictionary):
        word_counts is a list whose first entry is the list
        ['UNKNOWN', <count of out-of-vocabulary words>] followed by
        (word, count) tuples; word_indexes is the corpus as a list of ids;
        dictionary maps word -> id; reversed_dictionary maps id -> word.
    """
    # The first entry is a mutable list because its count is filled in at the end.
    word_counts = [['UNKNOWN', -1]]
    word_counts += collections.Counter(words).most_common(n_words - 1)

    # A neural network needs numeric input, so each kept word gets a unique id.
    # Entries are visited in descending frequency: the more frequent, the lower the id.
    dictionary = {}
    for token, _frequency in word_counts:
        dictionary[token] = len(dictionary)

    # Translate the corpus to ids; anything outside the kept words maps to id 0.
    word_indexes = []
    unknown_total = 0
    for token in words:
        token_id = dictionary.get(token)
        if token_id is None:
            token_id = 0
            unknown_total += 1
        word_indexes.append(token_id)

    # Record how many corpus words fell into the 'UNKNOWN' bucket.
    word_counts[0][1] = unknown_total

    # Reverse lookup, id -> word, e.g. {1: "is", 2356: "how", 43: "this", ...}
    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}

    return word_counts, word_indexes, dictionary, reversed_dictionary

In [34]:
# Size of the id space: id 0 for 'UNKNOWN' plus the VOCABULARY_SIZE - 1 most frequent words.
VOCABULARY_SIZE = 5000

# Build the frequency table, the id-encoded corpus, and the word <-> id lookup tables.
word_counts,word_indexes,dictionary,reversed_dictionary= build_dataset(vocabulary,VOCABULARY_SIZE)

In [35]:
# A softmax prediction layer is used below to learn the word embeddings.
# NOTE: training a full softmax over the vocabulary is very slow on a single machine.

# Sample of the word counts: the 'UNKNOWN' bucket first, then words in descending frequency.
word_counts[:10]

[['UNKNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [36]:
word_indexes[:10]
# word_indexes holds the integer id of every word of the corpus, in order (0 = 'UNKNOWN')

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [42]:
# Print a random sample of 10 (word, id) pairs from the word -> id dictionary.

import random 

for key in random.sample(list(dictionary),10):
    print(key,":",dictionary[key])
    
# Words that occur more frequently have smaller id values.

height : 2543
consciousness : 2506
hockey : 2464
levels : 1106
choose : 2439
battle : 377
him : 119
court : 350
wish : 3592
formats : 4172


In [43]:
# Print a random sample of 10 (id, word) pairs from the reversed (id -> word) dictionary.

import random 

for key in random.sample(list(reversed_dictionary),10):
    print(key,":",reversed_dictionary[key])
    
# Smaller ids belong to words that occur more frequently.

1333 : netherlands
4231 : creates
357 : america
3800 : interactive
2776 : corporate
4936 : syrian
377 : battle
3660 : purchased
4808 : harold
3963 : subset


In [44]:
# The data set has been generated, so the raw word list is no longer needed.

del vocabulary

# Deleting the name drops this reference, so Python can reclaim the memory once nothing else refers to the list.


In [45]:
# When the simple NN is trained, the training data is fed in batches.
# global_index is the current read position in word_indexes; generate_batch advances it.
global_index=0 # index into the list of word ids built from the input text file

In [52]:
# helper method to generate first batch
def generate_batch(word_indexes,batch_size,num_skips,skip_window):
    """Produce one skip-gram training batch starting at the module-level cursor.

    Args:
        word_indexes: sequence of integer word ids for the whole corpus.
        batch_size: number of (input, target) pairs to produce; must be a
            multiple of num_skips.
        num_skips: number of target words drawn at random for each input
            word; cannot exceed 2 * skip_window.
        skip_window: number of words considered on each side of the input word.

    Returns:
        (batch, labels): batch is an int32 array of shape (batch_size,) in
        which every input word id appears num_skips times in a row; labels is
        an int32 array of shape (batch_size, 1) holding the matching target ids.

    Side effects:
        Reads and updates the global ``global_index``, the position in
        word_indexes where the next batch starts (it wraps around at the end).
    """
    global global_index

    # Each input word contributes exactly num_skips rows, and no more targets
    # can be drawn than there are context positions around the input word.
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    corpus_len = len(word_indexes)
    # Window layout: skip_window words, the input word, skip_window words.
    span = 2 * skip_window + 1
    centre = skip_window

    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    # A bounded deque holds the current window: appending on the right
    # automatically drops the oldest word on the left.
    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % corpus_len

    row = 0
    for _ in range(batch_size // num_skips):
        # Start on the centre position so the first draw is always rejected;
        # the input word itself is never used as its own target.
        pick = centre
        taken = [centre]
        for _ in range(num_skips):
            while pick in taken:
                pick = random.randint(0, span - 1)
            taken.append(pick)

            batch[row] = window[centre]    # input word
            labels[row, 0] = window[pick]  # context word
            row += 1

        # Slide the window one word to the right for the next input word.
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % corpus_len

    # Step back by one window so the words at the end of this batch are
    # covered again at the start of the next one.
    global_index = (global_index + corpus_len - span) % corpus_len
    return batch, labels

    

In [53]:
batch,labels=generate_batch(word_indexes,10,2,5)
# positional arguments are batch_size=10, num_skips=2, skip_window=5

In [54]:
batch
# array of input word ids; every input word appears twice because num_skips=2

array([ 151,  151,  855,  855, 3581, 3581,    1,    1,  195,  195])

In [56]:
# each input word has to predict 2 target words from its context window
labels

array([[ 855],
       [   0],
       [   2],
       [3581],
       [   2],
       [  11],
       [   2],
       [ 195],
       [  59],
       [ 855]])

In [58]:
# The ids are easier to read as words, so map them back through the reverse dictionary
# and print each input word next to its target word.
# NOTE(review): the batch above holds 10 pairs but range(9) prints only the first 9 - confirm this is intended.
for i in range(9):
    print(reversed_dictionary[batch[i]],":",reversed_dictionary[labels[i][0]])

french : revolution
french : UNKNOWN
revolution : of
revolution : whilst
whilst : of
whilst : is
the : of
the : term
term : used


In [59]:
# The training data is ready; set up the validation words before building the network.

# Reset the batch cursor so training starts from the beginning of the corpus.
global_index=0
# Number of validation words whose nearest neighbours are inspected during training.
valid_size=16 
# Validation words are drawn from the ids below valid_window,
# i.e. from the most frequently occurring words.
valid_window=100
# Draw valid_size distinct ids from [0, valid_window). Sampling without
# replacement keeps one word from taking up several validation slots.
valid_examples=np.random.choice(valid_window,valid_size,replace=False)


In [60]:
# Number of input words (with their corresponding targets) fed to the network in one iteration.
batch_size=128
# Number of dimensions of each word embedding.
embedding_size=50
# Number of words considered on each side of the input word when sampling targets.
skip_window=2
# Number of target words sampled for each input word.
num_skips=2

In [62]:
# Start from a fresh TensorFlow default graph.
tf.reset_default_graph()
# Input placeholders: batch_size int32 word ids, and one int32 target id per input (shape [batch_size, 1]).
train_inputs= tf.placeholder(tf.int32,shape=[batch_size])
train_labels= tf.placeholder(tf.int32,shape=[batch_size,1])

In [63]:
# At every iteration batch_size (128) input word ids and their corresponding labels are fed in.
# valid_examples is a separate, randomly chosen set of word ids;
# it is stored in the graph as a constant and used to check on training.

valid_dataset=tf.constant(valid_examples,dtype=tf.int32)


In [64]:
# Embedding matrix of shape [VOCABULARY_SIZE, embedding_size]:
# one embedding_size-dimensional vector per word id, initialised with tf.random_uniform between -1.0 and 1.0.
embeddings=tf.Variable(tf.random_uniform([VOCABULARY_SIZE,embedding_size],-1.0,1.0))

# Look up the embedding row of every word id in the input batch.
embed=tf.nn.embedding_lookup(embeddings,train_inputs)

In [65]:
embeddings
# VOCABULARY_SIZE (5000) rows, embedding_size (50) columns

<tf.Variable 'Variable:0' shape=(5000, 50) dtype=float32_ref>

In [66]:
embed
# shape is batch_size x embedding_size = 128 x 50


<tf.Tensor 'embedding_lookup:0' shape=(128, 50) dtype=float32>

In [67]:
# Parameters of the linear layer: one weight row of length embedding_size per vocabulary word,
# initialised from a truncated normal with stddev 1/sqrt(embedding_size).
weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE,embedding_size],stddev=1.0/math.sqrt(embedding_size)))
# these weights are multiplied with the looked-up input embeddings

# one bias per vocabulary word, initialised to zero and determined during training
biases=tf.Variable(tf.zeros([VOCABULARY_SIZE]))

In [69]:

# Multiply the embedded inputs by the weights and add the biases.
# The result holds one score per vocabulary word for every example in the batch.
hidden_out=tf.matmul(embed,tf.transpose(weights))+biases

# weights is transposed so that the inner dimensions match for the matrix product

# this is a NN layer with no activation function, i.e. a linear layer

In [70]:
# Display the tensor to check its static shape.
hidden_out

<tf.Tensor 'add:0' shape=(128, 5000) dtype=float32>

In [71]:
# Convert the training labels to one-hot notation over VOCABULARY_SIZE classes.
# NOTE(review): train_labels is declared with shape [batch_size, 1], so the one-hot tensor
# presumably carries an extra axis of size 1 - confirm the loss handles that as intended.

train_one_hot=tf.one_hot(train_labels,VOCABULARY_SIZE)

In [74]:
# The output of the linear layer is passed as logits to a softmax prediction layer.
# The loss is the softmax cross entropy against the one-hot labels, averaged with reduce_mean.

loss= tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out,labels=train_one_hot))

In [75]:
# Training op: plain gradient descent on the loss with learning rate 0.1.
optimizer= tf.train.GradientDescentOptimizer(0.1).minimize(loss)

In [76]:
# L2 norm of every embedding row. keepdims=True keeps the reduced axis so the
# result can be divided into the embedding matrix row by row.
# keepdims replaces keep_dims, which TensorFlow reports as deprecated.
l2_norm=tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keepdims=True))

# Normalise the embeddings by dividing each row by its L2 norm.

normalized_embeddings=embeddings/l2_norm
# These normalised embeddings are used to compute cosine similarity on the validation data.
# That is a check on how well the word2vec embeddings are working:
# if similar words end up clustered together, training is going well.


# Pick out the normalised embeddings of the validation words.

valid_embeddings=tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [77]:
valid_embeddings
# shape is 16 x 50: 16 validation inputs, each with a 50-dimensional embedding

<tf.Tensor 'embedding_lookup_1:0' shape=(16, 50) dtype=float32>

In [78]:
normalized_embeddings
# shape is 5000 x 50: one normalised embedding for every word in the vocabulary

<tf.Tensor 'truediv:0' shape=(5000, 50) dtype=float32>

In [79]:
# Cosine similarity between each validation word and every word in the vocabulary:
# the normalised embeddings are multiplied with transpose_b=True, giving one row per validation word.
similarity=tf.matmul(valid_embeddings,normalized_embeddings,transpose_b=True)

In [81]:
# Display the tensor to check its static shape.
similarity

<tf.Tensor 'MatMul_2:0' shape=(16, 5000) dtype=float32>

In [82]:
# Everything is in place to train the network; create the op that initialises all global variables.

init= tf.global_variables_initializer()

In [83]:
# Number of training steps to run; each step processes one batch (a step is not a full pass over the data).
num_steps = 20001

In [94]:
# Train the network inside a session that is closed when the block exits.
with tf.Session() as session:
    # initialise all graph variables
    init.run()
    # running sum of the per-step losses since the last report
    average_loss=0
    for step in xrange(num_steps):
        batch_inputs,batch_labels=generate_batch(word_indexes,batch_size,num_skips,skip_window)
        # feed dict to feed train data and its labels
        feed_dict={train_inputs:batch_inputs,train_labels:batch_labels}
        
        # run one optimisation step and fetch the loss of this batch
        _,loss_val=session.run([optimizer,loss],feed_dict=feed_dict)
        average_loss+=loss_val
        
        # Every 2000 steps the accumulated loss is divided by 2000 to report the average
        # over the steps since the last report; at step 0 the single loss value is printed as is.
        
        
        if step%2000==0:
            if step>0:
                average_loss /=2000
            print('Average loss at step',step ,":",average_loss)
            average_loss=0
            
        # every 10000 steps, take the validation words and print the words with the closest embeddings
        if step%10000==0:
            sim=similarity.eval()
            for i in xrange(valid_size):
                valid_word=reversed_dictionary[valid_examples[i]]
                top_k=8 # no of nearest neighbors
                nearest=(-sim[i,:]).argsort()[1:top_k+1]
                # argsort sorts in ascending order, so the similarities are negated
                # to put the most similar words first.
                # Position 0 is skipped: it is normally the validation word itself,
                # which has the highest similarity with its own embedding.
                log_str='Nearest to %s:' %valid_word
                for k in xrange(top_k):
                    close_word=reversed_dictionary[nearest[k]]
                    log_str='%s %s' % (log_str, close_word)
                print(log_str)
            print("\n")
                    
                

Average loss at step 0 : 8.705581665039062
Nearest to three: crowd automatic chemistry poets serving mutual gas edward
Nearest to UNKNOWN: census rare easy down grew omega ac whatever
Nearest to were: enterprise mtv autonomy cult audio professor align views
Nearest to history: horses ideas remains america terrain date achieved primarily
Nearest to five: bavaria leads disk substantial births pop topics official
Nearest to on: public accessed cult series roles cats iso editing
Nearest to have: greater factors violent latin stone hunt credit task
Nearest to an: module html retrieved descriptions ann touch connections still
Nearest to of: fellow civilization past decisions always reflected supply fallen
Nearest to not: into trading drugs mythology aid archive moon repeated
Nearest to often: groups foreign stones superior temple whereas experiences europeans
Nearest to that: effective decide surgery tone noise variables hold romans
Nearest to he: clear opposition floor body chance links sta