In [1]:
import argparse
import math
import struct
import sys
import time
import warnings
import numpy as np
import itertools 

In [2]:
class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0

In [9]:
class Vocabulary:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        fi = open(fi, 'r')
        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))
        for line in fi:
            tokens = line.split()
            #print("\rReading line %s" %tokens)
            for token in tokens:
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    #print ("\r\r token %s" %token)
                    #print ("\t\t token value",vocab_hash[token])
                    vocab_items.append(VocabItem(token))
                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                vocab_items[vocab_hash[token]].count += 1
                word_count += 1
                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d" % word_count)
                    sys.stdout.flush()

            # Add special tokens <bol> (beginning of line) and <eol> (end of line)
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2
        self.vocab_items = vocab_items # List of VocabItem objects
        self.vocab_hash = vocab_hash  # Mapping from each token to its index in vocab
        self.word_count = word_count # Total number of words in train file
        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        self.__sort(min_count)
        #assert self.word_count == sum([t.count for t in self.vocab_items]), 'word_count and sum of t.count do not agree'
        print ('Total words in training file: %d' % self.word_count)
        #print ('Total bytes in training file: %d' % self.bytes)
        print ('Vocab size: %d' % len(self))
    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0
        
        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
                #print("word setting as unknow:",token.word)
            else:
                tmp.append(token)

        tmp.sort(key=lambda token : token.count, reverse=True)

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash
        #print ("printing vocab_hash")
        #for key,value in vocab_hash.items():
         #   print (key,value)
        #print ('Unknown vocab size:', count_unk)

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]

In [5]:
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constant
        print (norm)
        table_size = 1e8 # Length of the unigram table depends on vocab
        #print table_size
        table = np.zeros(table_size, dtype=np.uint32)

        print ('Filling unigram table')
        p = 0 # Cumulative probability
        i = 0
        old_i = 0 
        for j, unigram in enumerate(vocab):
            #print "j",j
            #print "unigram",unigram
            
            p += float(math.pow(unigram.count, power))/norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
            old_i = i - old_i
            sys.stdout.write("\r propability for word '%s' is %f, kept it  %d times" %(unigram.word,p,old_i))
            sys.stdout.flush()
            #print("propability for word %s is %f, kept it  %d times" %(unigram.word,p,old_i))
        self.table = table
    def sample(self, count):
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

In [10]:
def sigmoid(z): #sigmoid function goes from -6 to +6
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))

In [11]:
def init_net(dim, vocab_size):
    # Init input words with random numbers from a uniform distribution on the interval [-0.5, 0.5]/dim
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    input_word = tmp 
    # Init weights with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    output_word = tmp
  

    return (input_word, output_word)

In [12]:
vocab = Vocabulary("text8",10) #create vocab with count of skip words


Reading word 17000000Total words in training file: 17005209
Vocab size: 47135


In [13]:
table = UnigramTable(vocab) #create unigram table

2159669.0869037905
Filling unigram table
 propability for word 'kirchenmusik' is 1.000000, kept it  50420761 times



In [None]:

input_word,output_word = init_net(5, len(vocab)) #initalize input words and weight matrix with 5 diminension for each word 

current_word_count=0  #count of words to update alpha
alpha_count = 0
last_alpha_count = 0
starting_alpha=0.025 #initial alpha
win=5 #max window size
#iter_c = 0 #iteration count
with open("text8", 'r') as fi:
    for lines in fi: #for each line get the indicies of the words
        line = lines.strip()
        #print (line)
        sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])
        #print ("sent",sent)
        for sent_pos, token in enumerate(sent): #use the position of the "input word" to create context window randomly
            #print ("for sent_pos,indices",sent_pos,token)
            #update alpha for every 1000 words and print status
            if current_word_count % 1000 == 0:
                alpha_count += (current_word_count - last_alpha_count)
                last_alpha_count = current_word_count
                alpha = starting_alpha * (1 - float(alpha_count) / vocab.word_count)
                if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001
                sys.stdout.write("\rAlpha: %f processed %d words Progress: %d of %d (%.2f%%)" %
                                 (alpha, current_word_count, alpha_count, vocab.word_count,
                                  float(alpha_count) / vocab.word_count * 100))
                sys.stdout.flush()
            
            #compute the random context
            current_win = np.random.randint(low=1, high=win+1)
            context_start = max(sent_pos - current_win, 0)
            context_end = min(sent_pos + current_win + 1, len(sent))
            #print ("context window")
            #print ("starting",context_start,"ending",context_end)
            
            #get the words which are either side of the input word
            context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end]
            #print ("length of the context is",len(context))
            #print (context)
            
            #for every context word
            for context_word in context:
                error_back = np.zeros(5)
                # classifiy inputword with 1 and sample words taken from unigram table as 0
                classifiers = [(token, 1)] + [(target, 0) for target in table.sample(2)]
                #print("classifiers",classifiers)
                
                #for every word in classifier
                for target, label in classifiers:
                    #print ("\ttarget",target)
                    #print ("\t input word[context_word]",input_word[context_word])
                    #print ("\t output word [target]",output_word[target])
                    
                    #get dot product of input word vector of context word and weight vector of classifier word
                    z = np.dot(input_word[context_word], output_word[target])
                    #print ("\tdot product",z)
                    
                    #compute sigmoid of dot product
                    p = sigmoid(z)
                    #print ("\tsigmoid",p)
                    
                    #gradient
                    g = alpha * (label - p)
                    #print ("\tgradient",g)
                    
                    #sum up the error computed for every word in the classifier
                    error_back += g * output_word[target]# Error to backpropagate to input word vector
                    #print("\terror",neu1e)
                    #update weight of the words in classifer
                    output_word[target] -= g * input_word[context_word]# Update output word
                    
                    #print ("\tupdated array of target",target,"is",output_word[target])
                    #iter_c=iter_c+1
                        # Update input_word
                #update input word vector of the context using summed error of the classfier
                input_word[context_word] -= error_back
                #print("updated array of context word",context_word,"is ",input_word[context_word])
            current_word_count += 1
#print ("iteration count",iter_c)

Alpha: 0.013621 processed 7740000 words Progress: 7740000 of 17005209 (45.52%)