In [37]:
import numpy as np
import nltk
from nltk.corpus import reuters
import string
import re

POWER_FOR_NEGATIVE_SAMPLING = 3.0/4.0

def readPairs(fileName):
    f1 = open(fileName, "r")
    pairs = []
    for line in f1:
        pair = line.strip().split('\t')
        for i in range(len(pair)):
            pair[i] = int(pair[i])
        pairs.append(pair)
    f1.close()
    return pairs

def readVocabulary(fileName):
    f1 = open(fileName, "r")
    vocab = {}
    for line in f1:
        v = line.strip().split('\t')
        vocab[v[0]] = int(v[1])
    f1.close()
    return vocab

def readWord2id(fileName):
    f1 = open(fileName, "r")
    word2id = {}
    for line in f1:
        v = line.strip().split('\t')
        word2id[v[0]] = int(v[1])
    f1.close()
    return word2id

def readId2word(fileName):
    f1 = open(fileName, "r")
    id2word = {}
    for line in f1:
        v = line.strip().split('\t')
        id2word[int(v[0])] = v[1]
    f1.close()
    return id2word

def readAll():
    vocab = readVocabulary('data/vocab.txt')
    word2id = readWord2id('data/word2id.txt')
    id2word = readId2word('data/id2word.txt')
    return vocab, word2id, id2word

def getArrForNegativeSampling(vocab):
    idsForNegativeSampling = []
    for word, count in vocab.items():
        newCount = int(float(count)**(POWER_FOR_NEGATIVE_SAMPLING))
        for i in range(newCount):
            idsForNegativeSampling.append(word2id[word])
    return idsForNegativeSampling

print('Reading pairs')
pairs = readPairs('data/pairs.txt')
print('Reading vocab')
vocab, word2id, id2word = readAll()
print('Creating array for negative sampling')
idsForNegativeSampling = getArrForNegativeSampling(vocab)

print(len(idsForNegativeSampling), len(vocab), len(pairs))


Reading pairs
Reading vocab
Creating array for negative sampling
210795 28354 3846178


In [45]:
# logic for training:
# create random matrics W and C
# W: centre word embedding: d x vocab_size
# C: context word embedding: vocab_size x d
from random import randint
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))


vocab_size = len(vocab)
EMBEDDING_DIMENSION = 300
LR = 0.01
NUM_NEGATIVE_SAMPLES = 5
W = np.random.rand(EMBEDDING_DIMENSION, vocab_size)
C = np.random.rand(vocab_size, EMBEDDING_DIMENSION)
for epoch in range(100):
    numDone = 0
    for pair in pairs:
        a = int(pair[0]) # centre word id
        b = int(pair[1]) # context word id

        n = [] # to store negative context word ids
        for i in range(NUM_NEGATIVE_SAMPLES):
            x = randint(0, len(idsForNegativeSampling)-1)
            n.append(idsForNegativeSampling[x])
        
        # n now stores ids for negative samples
        wa = W.T[a]
        # get gradient for wa
        sigCbwa = sigmoid(np.dot(C[b], wa))
        gradwa = (1.0 - sigCbwa)*C[b]
        sigminusCniwa = {}
        for id in n:
            sigminusCniwa[id] = sigmoid(np.dot(-C[id], wa))
            gradwa += (sigminusCniwa[id] - 1.0)*C[id]
            
        # update context embedding for positive sample:
        C[b] += LR*(1.0 - sigCbwa)*wa
        
        #update context embedding for negative samples:
        for id in n:
            C[id] += LR*(sigminusCniwa[id] - 1.0)*wa
        #update wa
        W.T[a] += LR*gradwa
        CUR_OBJECTIVE = math.log(sigCbwa)
        for id in n:
            CUR_OBJECTIVE += math.log(sigminusCniwa[id])
        if(numDone%10000 == 0):
            print('Objective', CUR_OBJECTIVE)
        numDone += 1
    print('Epoch done', epoch)

Objective -369.8252819183574
Objective -1.6748379580564048
Objective -32.62534923103328
Objective -175.52861258024123
Objective -3.450468532406376
Objective -5.659135652950759
Objective -4.040083856615707
Objective -5.964640579071141
Objective -2.678388467662326
Objective -2.5711952051984723
Objective -4.868586832908714
Objective -0.6630235259460455
Objective -2.2298155173539036
Objective -16.844441440965376
Objective -4.625695351542134
Objective -1.5050016958831467
Objective -18.387213341118567
Objective -1.8898147541217238
Objective -3.779517477666791
Objective -2.4444869606725184
Objective -203.64208394185607
Objective -1.5048944933825112
Objective -2.2845700434239444
Objective -1.270336947384752
Objective -1.614033635185388
Objective -2.403228995327174
Objective -2.7559393050130088
Objective -2.342123598522421
Objective -2.4177319499149417
Objective -3.802688909546072
Objective -2.786153558625191
Objective -57.057923604437185
Objective -111.62183729837318
Objective -3.5058908644211

Objective -2.2051084350254175
Objective -4.065020507441029
Objective -3.181172080521367
Objective -2.370687277560304
Objective -1.554456996796584
Objective -4.211244986893946
Objective -20.718139868772752
Objective -1.6693169256158196
Objective -2.356503573006544
Objective -0.3506994798881242
Objective -3.5224533847760098
Objective -2.7158318323156894
Objective -2.0949635586908792
Objective -3.116160846394104
Objective -71.69110808899185
Objective -1.715289245285549
Objective -2.6195897622869992
Objective -3.900396403631365
Objective -2.9787792713467454
Objective -2.0916091053307575
Objective -40.55839768470301
Objective -0.977425689070678
Objective -2.50218372492913
Objective -0.6795793127010858
Objective -7.577895912039692
Objective -1.195568593445649
Objective -0.8239912639584607
Objective -1.9947818920176021
Objective -5.927284113715394
Objective -0.7290153483080475
Objective -4.371994455487665
Objective -2.395687675929439
Objective -0.20382000967484226
Objective -3.097929065743295

In [46]:
word2id['the']

9952

In [48]:
total_tokens = 0
for v in vocab:
    total_tokens += len(v)
total_tokens

211719

In [64]:
vocab['the']/total_tokens

0.2426943259697996