In [9]:
import argparse
import math
import struct
import sys
import time
import warnings

import numpy as np


In [10]:
class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0
        self.path = None # Path (list of indices) from the root to the word (leaf)
        self.code = None

In [11]:
class Vocabulary:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        fi = open(fi, 'r')

        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))

        for line in fi:
            tokens = line.split()
            for token in tokens:
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    vocab_items.append(VocabItem(token))
                    
                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                vocab_items[vocab_hash[token]].count += 1
                word_count += 1
            
                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d" % word_count)
                    sys.stdout.flush()

            # Add special tokens <bol> (beginning of line) and <eol> (end of line)
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2

        self.bytes = fi.tell()
        self.vocab_items = vocab_items         # List of VocabItem objects
        self.vocab_hash = vocab_hash           # Mapping from each token to its index in vocab
        self.word_count = word_count           # Total number of words in train file

        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        self.__sort(min_count)

        #assert self.word_count == sum([t.count for t in self.vocab_items]), 'word_count and sum of t.count do not agree'
        print 'Total words in training file: %d' % self.word_count
        print 'Total bytes in training file: %d' % self.bytes
        print 'Vocab size: %d' % len(self)

    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0
        
        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
            else:
                tmp.append(token)

        tmp.sort(key=lambda token : token.count, reverse=True)

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash

        print
        print 'Unknown vocab size:', count_unk

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]
        
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constant
        print norm
        table_size = 1e8 # Length of the unigram table
        print table_size
        table = np.zeros(table_size, dtype=np.uint32)

        print 'Filling unigram table'
        p = 0 # Cumulative probability
        i = 0
        for j, unigram in enumerate(vocab):
            #print "j",j
            #print "unigram",unigram
            p += float(math.pow(unigram.count, power))/norm
            #print "propablity", p
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table

    def sample(self, count):
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

def sigmoid(z):
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))

In [14]:
vocab = Vocabulary("local.txt",2)



Unknown vocab size: 143
Total words in training file: 338
Total bytes in training file: 1938
Vocab size: 50


In [29]:
check = vocab.__iter__()

In [31]:
a=list(check)

In [36]:
a[2].word

'and'

In [15]:
table = UnigramTable(vocab)

171.04254241
100000000.0
Filling unigram table




In [16]:
table.sample(5)

[3, 0, 12, 0, 8]

In [17]:
fi = open("local.txt", 'r')

In [18]:
current_word_count=0


In [19]:
starting_alpha=0.025
win=5

In [20]:
def init_net(dim, vocab_size):
    # Init syn0 with random numbers from a uniform distribution on the interval [-0.5, 0.5]/dim
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    syn0 = tmp
  

    # Init syn1 with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    syn1 = tmp
  

    return (syn0, syn1)

In [21]:
 syn0, syn1 = init_net(5, len(vocab))

In [22]:
print syn1[0]

[ 0.  0.  0.  0.  0.]


In [30]:
with open("local.txt", 'r') as fi:
    for lines in fi:
        line = lines.strip()
        print line
        print fi.tell()
        print "inside loop"
        sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])
        print sent
        for sent_pos, token in enumerate(sent):
            print sent_pos,token
            if current_word_count % 100 == 0:
                current_word_count += 1
                    # Recalculate alpha
                alpha = starting_alpha * (1 - float(current_word_count) / vocab.word_count)
                if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001
            current_win = np.random.randint(low=1, high=win+1)
            context_start = max(sent_pos - current_win, 0)
            context_end = min(sent_pos + current_win + 1, len(sent))
            context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end]
            print len(context)
            for context_word in context:
                neu1e = np.zeros(5)
                classifiers = [(token, 1)] + [(target, 0) for target in table.sample(1)]
                for target, label in classifiers:
                    z = np.dot(syn0[context_word], syn1[target])
                    p = sigmoid(z)
                    g = alpha * (label - p)
                    neu1e += g * syn1[target]              # Error to backpropagate to syn0
                    syn1[target] += g * syn0[context_word] # Update syn1

                        # Update syn0
                syn0[context_word] += neu1e
            current_word_count += 1
        
        
            

People are converging on the outskirts of this small town on the borders of East Godavari and Visakhapatnam districts with the single point agenda of getting the Kapu community included in the list of Backward Classes (BCs). This moment gained momentum over the last few weeks in all towns and villages between Vijayawada and East Godavari and also in some Rayalaseema districts.
1938
inside loop
[4, 0, 10, 0, 8, 1, 0, 6, 14, 0, 0, 8, 1, 0, 6, 11, 12, 3, 21, 22, 15, 1, 0, 0, 0, 6, 0, 1, 9, 16, 0, 2, 1, 23, 6, 24, 25, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 3, 11, 12, 3, 0, 2, 26, 0, 0, 5]
0 4
5
1 0
6
2 10
6
3 0
4
4 8
2
5 1
10
6 0
2
7 6
4
8 14
4
9 0
2
10 0
4
11 8
10
12 1
10
13 0
4
14 6
8
15 11
8
16 12
8
17 3
2
18 21
2
19 22
8
20 15
10
21 1
2
22 0
4
23 0
8
24 0
10
25 6
4
26 0
2
27 1
10
28 9
4
29 16
10
30 0
10
31 2
8
32 1
10
33 23
4
34 6
6
35 24
6
36 25
6
37 0
4
38 0
8
39 0
4
40 0
8
41 0
2
42 0
10
43 1
8
44 0
2
45 0
2
46 0
4
47 2
2
48 0
8
49 0
4
50 3
8
51 0
8
52 0
4
53 0
2
54 3
6


In [31]:
def save(vocab, syn0, fo, binary):
    print 'Saving model to', fo
    dim = len(syn0[0])
    if binary:
        fo = open(fo, 'wb')
        fo.write('%d %d\n' % (len(syn0), 5))
        fo.write('\n')
        for token, vector in zip(vocab, syn0):
            fo.write('%s ' % token.word)
            for s in vector:
                fo.write(struct.pack('f', s))
            fo.write('\n')
    else:
        fo = open(fo, 'w')
        fo.write('%d %d\n' % (len(syn0), 5))
        for token, vector in zip(vocab, syn0):
            word = token.word
            vector_str = ' '.join([str(s) for s in vector])
            fo.write('%s %s\n' % (word, vector_str))

    fo.close()

In [34]:
save(vocab,syn0,"localmodel.txt",False)

Saving model to localmodel.txt


In [35]:
save(vocab,syn0,"kapus",True)

Saving model to kapus
