In [110]:
import argparse
import math
import struct
import sys
import time
import warnings

import numpy as np
import itertools 


In [111]:
class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0
        self.code = None

In [112]:
class Vocabulary:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        fi = open(fi, 'r')

        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))

        for line in fi:
            tokens = line.split()
            print("\rReading line %s" %tokens)
            for token in tokens:
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    print ("\r\r token %s" %token)
                    print ("\t\t token value",vocab_hash[token])
                    vocab_items.append(VocabItem(token))
                    
                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                vocab_items[vocab_hash[token]].count += 1
                word_count += 1
                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d" % word_count)
                    sys.stdout.flush()

            # Add special tokens <bol> (beginning of line) and <eol> (end of line)
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2

        self.bytes = fi.tell()
        self.vocab_items = vocab_items # List of VocabItem objects
        self.vocab_hash = vocab_hash  # Mapping from each token to its index in vocab
        self.word_count = word_count # Total number of words in train file

        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        self.__sort(min_count)

        #assert self.word_count == sum([t.count for t in self.vocab_items]), 'word_count and sum of t.count do not agree'
        print ('Total words in training file: %d' % self.word_count)
        print ('Total bytes in training file: %d' % self.bytes)
        print ('Vocab size: %d' % len(self))

    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0
        
        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
                print("word setting as unknow:",token.word)
            else:
                tmp.append(token)

        tmp.sort(key=lambda token : token.count, reverse=True)

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash
        print ("printing vocab_hash")
        for key,value in vocab_hash.items():
            print (key,value)
        print ('Unknown vocab size:', count_unk)

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]

In [113]:
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constant
        print (norm)
        table_size = 10 # Length of the unigram table
        #print table_size
        table = np.zeros(table_size, dtype=np.uint32)

        print ('Filling unigram table')
        p = 0 # Cumulative probability
        i = 0
        old_i = 0 
        for j, unigram in enumerate(vocab):
            #print "j",j
            #print "unigram",unigram
            
            p += float(math.pow(unigram.count, power))/norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
            old_i = i - old_i
            
            print("propability for word %s is %f, kept it  %d times" %(unigram.word,p,old_i))
        self.table = table
    def sample(self, count):
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

def sigmoid(z):
        return 1 / (1 + math.exp(-z))

In [114]:
def init_net(dim, vocab_size):
    # Init syn0 with random numbers from a uniform distribution on the interval [-0.5, 0.5]/dim
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    input_word = tmp
  

    # Init syn1 with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    output_word = tmp
  

    return (input_word, output_word)

In [115]:
vocab = Vocabulary("trial_words.txt",0)


Reading line ['We', 'propose', 'the', 'concept']
 token We
		 token value 2
 token propose
		 token value 3
 token the
		 token value 4
 token concept
		 token value 5
printing vocab_hash
the 4
<bol> 0
<eol> 1
We 2
<unk> 6
propose 3
concept 5
Unknown vocab size: 0
Total words in training file: 6
Total bytes in training file: 22
Vocab size: 7


In [116]:
for j, unigram in enumerate(vocab):
    print (j, unigram.word,unigram.count)

0 <bol> 1
1 <eol> 1
2 We 1
3 propose 1
4 the 1
5 concept 1
6 <unk> 0


In [117]:
table = UnigramTable(vocab)

6.0
Filling unigram table
propability for word <bol> is 0.166667, kept it  2 times
propability for word <eol> is 0.333333, kept it  2 times
propability for word We is 0.500000, kept it  3 times
propability for word propose is 0.666667, kept it  4 times
propability for word the is 0.833333, kept it  5 times
propability for word concept is 1.000000, kept it  5 times
propability for word <unk> is 1.000000, kept it  5 times


In [118]:

input_word,output_word = init_net(5, len(vocab))

current_word_count=0
starting_alpha=0.025
win=3
iter_c = 0 
with open("trial_words.txt", 'r') as fi:
    for lines in fi:
        line = lines.strip()
        print (line)
        #print fi.tell()
        #print ("inside loop")
        sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])
        print ("sent",sent)
        for sent_pos, token in enumerate(sent):
            print ("for sent_pos,indices",sent_pos,token)
            if current_word_count % 100 == 0:
                current_word_count += 1
                    # Recalculate alpha
                alpha = starting_alpha * (1 - float(current_word_count) / vocab.word_count)
                if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001
            print ("Current alpha is ",alpha)
            current_win = np.random.randint(low=1, high=win+1)
            context_start = max(sent_pos - current_win, 0)
            context_end = min(sent_pos + current_win + 1, len(sent))
            print ("context window")
            print ("starting",context_start,"ending",context_end)
            context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end]
            print ("length of the context is",len(context))
            print (context)
            for context_word in context:
                error_back = np.zeros(5)
                classifiers = [(token, 1)] + [(target, 0) for target in table.sample(2)]
                print("classifiers",classifiers)
                
                for target, label in classifiers:
                    print ("\ttarget",target)
                    print ("\t input word[context_word]",input_word[context_word])
                    print ("\t output word [target]",output_word[target])
                    z = np.dot(input_word[context_word], output_word[target])
                    print ("\tdot product",z)
                    p = sigmoid(z)
                    print ("\tsoftmax",p)
                    g = alpha * (label - p)
                    print ("\tgradient",g)
                    error_back += g * output_word[target]# Error to backpropagate to input word
                    print("\terror",neu1e)
                    output_word[target] -= g * input_word[context_word]# Update output word
                    print ("\tupdated array of target",target,"is",output_word[target])
                    iter_c=iter_c+1
                        # Update input_word
                input_word[context_word] -= error_back
                print("updated array of context word",context_word,"is ",input_word[context_word])
            current_word_count += 1
print ("iteration count",iter_c)

We propose the concept
sent [0, 2, 3, 4, 5, 1]
for sent_pos,indices 0 0
Current alpha is  0.020833333333333336
context window
starting 0 ending 2
length of the context is 1
[2]
classifiers [(0, 1), (2, 0), (0, 0)]
	target 0
	 input word[context_word] [ 0.03351147  0.03172071  0.01760151 -0.04480493  0.06243219]
	 output word [target] [ 0.  0.  0.  0.  0.]
	dot product 0.0
	softmax 0.5
	gradient 0.010416666666666668
	error [  3.77011281e-06  -1.23477260e-05  -1.61005194e-05   1.29908859e-05
  -4.35445197e-06]
	updated array of target 0 is [-0.00034908 -0.00033042 -0.00018335  0.00046672 -0.00065034]
	target 2
	 input word[context_word] [ 0.03351147  0.03172071  0.01760151 -0.04480493  0.06243219]
	 output word [target] [ 0.  0.  0.  0.  0.]
	dot product 0.0
	softmax 0.5
	gradient -0.010416666666666668
	error [  3.77011281e-06  -1.23477260e-05  -1.61005194e-05   1.29908859e-05
  -4.35445197e-06]
	updated array of target 2 is [ 0.00034908  0.00033042  0.00018335 -0.00046672  0.00065034]
	

In [119]:
print(input_word[2])
print (input_word[3])
print (input_word[4])
print (input_word[5])

[ 0.03350482  0.03170136  0.01761725 -0.04485565  0.0624657 ]
[-0.04118667 -0.0032     -0.08003335 -0.04058045 -0.03781537]
[ 0.00905686  0.00739103  0.08426566  0.00703487  0.06915343]
[ 0.07792415  0.03851186  0.04708739 -0.09323393  0.03499396]


In [120]:
print(output_word)

[[  9.19118394e-04   8.51972889e-04   1.59546631e-03  -1.71399129e-03
    2.06043141e-03]
 [ -1.28177746e-03  -6.44439234e-04  -1.31391403e-03  -1.26419422e-03
    1.04130832e-03]
 [  4.12709736e-03   1.14821249e-03   2.66933528e-03  -6.03831374e-04
    3.84478750e-03]
 [ -8.16984013e-04   1.42913146e-04  -1.67785806e-03   9.66938417e-04
   -2.58749482e-03]
 [  4.70783223e-04  -1.54101178e-03  -2.07649918e-04   1.40448283e-05
    3.67907858e-03]
 [  1.00167434e-03   5.97472888e-05   6.51437283e-04  -3.72393368e-03
    1.43012905e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]]
