In [152]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-1,1)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        #pdb.set_trace()
        pos_u = pos_u.view(-1)
        emb_u = self.u_embeddings(pos_u)
        #neg_v = neg_v.view(len(pos_u),-1)
        #pdb.set_trace()
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        #pdb.set_trace()
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        #pdb.set_trace()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [149]:
from torch.utils.data import Dataset
from collections import defaultdict
from gensim.models.doc2vec import TaggedDocument
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75, neg_samples=2):
        self.dataset = dataset
        self.neg_samples=neg_samples
        self.word2idx = dict()
        self.idx2word = dict()
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab()
        self.pairs = self.generate_pairs(dataset,neg_samples)
        self.dataset_tensors=self.create_dataset_tensors()

        
    def generate_pairs(self, dataset, ctx_window):
            print("Generating pairs")
            pairs = []
            for sentence in dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,ctx_window):
                        if(i+j<len(sentence)):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
                            
            return pairs
        
    def __len__(self):
        return len(self.dataset_with_samples)
        
        
    def __getitem__(self, idx):
        return self.dataset_with_samples[idx]
    
    def get_neg_samples(self, count, batch_size):
        neg_v = []
        for x in range(1,batch_size+1):
            neg_v.append(random.sample(range(0,self.vocab_size),count))
        return torch.tensor(neg_v).view(batch_size,-1)
        
    
    def create_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        #print(pairs)
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
        return key_pairs
    
    def create_dataset_tensors(self):
        print("create_dataset_with_samples")
        dataset_tensors = []
        key_pairs = self.create_key_pairs(self.pairs)
        print("creating samples")
        #print(key_pairs)
        for x,y in key_pairs: 
            dataset_tensors.append((torch.tensor([x]),torch.tensor([y])))
        return dataset_tensors
    
    def create_vocab(self):
        print("Creating vocab")
        for i,sentence in enumerate(self.dataset):
            for word in sentence:
                self.vocab.add(word)
        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
    
   


In [148]:
tmp = []
for x in range(1,11):
    tmp.append(random.sample(range(0,50),2))
t = torch.tensor(tmp)
t.view(10,-1)
    

tensor([[41, 19],
        [ 5, 43],
        [32, 25],
        [21, 48],
        [22, 47],
        [43, 11],
        [13,  8],
        [14, 44],
        [14, 11],
        [24, 34]])

In [163]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import copy

class W2V():
    def __init__(self, data,dim=100, neg_samples=2, alpha=0.01, iterations=10, batch_size=50, 
                 shuffle=False,use_cuda=False,workers=4):
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers

        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        
        self.models = []
        self.optimizers = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        self.model.to(device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=alpha)

        self.iterations = iterations
        #self.train()        
 
    def train_with_loader(self):
        loader = DataLoader(self.data.dataset_tensors, self.batch_size, self.shuffle, num_workers=self.workers)
        print('starting training')
        for epoch in range(1,self.iterations):
            for i,(pos_u,pos_v) in enumerate(loader):
                neg_v = self.data.get_neg_samples(self.data.neg_samples,pos_v.size()[0])
                pos_u.to(device)
                neg_v.to(device)
                pos_v = pos_v.view(len(neg_v),-1)
                pos_v.to(device)
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                loss.backward()
                self.optimizer.step()
                prev_loss = loss
            print("loss = " + str(loss))
            print("{0:d} epoch of {1:d}".format(epoch+1, self.iterations))

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict

In [89]:
def train(model, dataset, epochs):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    print('starting training')
    for epoch in range(1,epochs):
        for i,(pos_u,pos_v,neg_v) in enumerate(dataset):
            pos_v = pos_v.view(-1,1)
            optimizer.zero_grad()
            loss = model.forward(pos_u,pos_v,neg_v)
            loss.backward()
            optimizer.step()
        print("loss = " + str(loss))
        print("{0:d} epoch of {1:d}".format(epoch+1, epochs))

In [90]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
text8_dataset = []
for x in dataset: 
    text8_dataset.append(x)   

In [113]:
text8_first_sentence = []
sentence = []
for i,x in enumerate(text8_dataset[0]):
    sentence.append(x)
    if (i%30 == 0 and i>0):
        text8_first_sentence.append(sentence)
        sentence=[]

In [164]:
text8_dataset_first_sentence = wDataSet((text8_first_sentence))
#text8_wDataset = wDataSet((text8_dataset))

Creating vocab
Generating pairs
create_dataset_with_samples
Generating key_pairs
creating samples


In [165]:
w2v = W2V(text8_dataset_first_sentence)
#w2v = W2V(text8_wDataset)

In [166]:
w2v.data.dataset_tensors[0][0].size()


torch.Size([1])

In [167]:
w2v.train_with_loader()

starting training
loss = tensor(2.0803, grad_fn=<DivBackward0>)
2 epoch of 10
loss = tensor(2.0665, grad_fn=<DivBackward0>)
3 epoch of 10
loss = tensor(2.0641, grad_fn=<DivBackward0>)
4 epoch of 10
loss = tensor(2.0609, grad_fn=<DivBackward0>)
5 epoch of 10
loss = tensor(2.0571, grad_fn=<DivBackward0>)
6 epoch of 10
loss = tensor(2.0514, grad_fn=<DivBackward0>)
7 epoch of 10
loss = tensor(2.0440, grad_fn=<DivBackward0>)
8 epoch of 10
loss = tensor(2.0466, grad_fn=<DivBackward0>)
9 epoch of 10
loss = tensor(2.0402, grad_fn=<DivBackward0>)
10 epoch of 10


### EVALUATION

In [70]:
dict_emb = get_embedding(model, text8_dataset)

In [110]:
from scipy import spatial

x = spatial.distance.cosine(dict_emb['artist'], dict_emb['music'])
y = spatial.distance.cosine(dict_emb['anarchism'],dict_emb['music'])
print(x)
print(y)

0.9448586255311966
0.7955727875232697


In [111]:
import itertools
import numpy as np
score = []
score_dict = dict()
for i,(x,y) in enumerate(itertools.product(text8_dataset.vocab,text8_dataset.vocab)):
    if(i%1000000==0):
        print(i)
    distance = spatial.distance.cosine(dict_emb[x], dict_emb[y])
    score_dict[(x,y)] = distance
    score.append(distance)
print(np.mean(score))
    


0
1000000
2000000
3000000
4000000


KeyboardInterrupt: 

In [77]:
np.std(score)
#print(score_dict[('anarchism','music')])

0.11336667971198654

In [206]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                print(x,y)
                closest = (x,y)
                distance = score
    return closest
get_closest(score_dict,'operation')

def get_list_closest(vocab, score_dict):
    list_closest = []
    for x in vocab:
        list_closest.append(get_closest(score_dict,x))
    return list_closest

def get_closest_with_score(dict_emb,y):
    distance = 100
    for x,emb in dict_emb.items():
        if(spatial.distance.cosine(dict_emb[x], dict_emb[y])<distance):
            closest = x
    return x
        

operative operation
quiet operation
popular operation
operation unite
operative existence
operative range
operative first
operative reportedly
operative sees
operative beliefs
operative william
operative felt
operative imaginary
operative cuddling
operative cgt
operative existence
existence december
existence represented
existence sees
existence concept
existence literate
existence abstention
existence be
existence spreading
existence prepare
existence peasants
operative range
range december
range everyday
range quiet
range sustaining
range organization
range summary
range examples
range wto
operative december
existence december
december extensively
december calming
december spain
december mutual
december elaborate
operative relating
existence relating
relating first
relating stimulations
relating levels
relating voluntary
operative experiences
existence experiences
experiences harsh
experiences beliefs
experiences deep
experiences trade
experiences around
operative outbursts
existence

operative dominance
extensively dominance
cost dominance
interfering dominance
dominance bolshevism
dominance culminated
dominance inherent
dominance annihilation
dominance septentrionale
dominance existing
operative minarchists
december minarchists
experiences minarchists
outbursts minarchists
dominance minarchists
minarchists seemed
minarchists william
minarchists term
minarchists institution
operative behaviors
existence behaviors
range behaviors
represented behaviors
quiet behaviors
sustaining behaviors
behaviors characteristics
behaviors stay
behaviors linking
behaviors spreading
operative exchange
range exchange
relating exchange
preservation exchange
exchange wasn
exchange amount
exchange favoured
operative competition
existence competition
concept competition
competition domination
competition punk
competition toes
operative garden
existence garden
first garden
proceeded garden
speculate garden
garden examples
garden literate
garden context
garden changes
garden dynamics
garden

operative anarcha
existence anarcha
anarcha if
anarcha travail
anarcha assist
anarcha product
anarcha terms
anarcha workers
anarcha opponents
operative turn
existence turn
extensively turn
turn federation
operative hear
relating hear
first hear
favors hear
hear feminists
hear marx
hear school
hear followers
operative experiment
first experiment
cost experiment
experiment move
experiment books
experiment predecessors
operative makhnovschina
existence makhnovschina
represented makhnovschina
equality makhnovschina
makhnovschina state
makhnovschina opponents
makhnovschina achievement
makhnovschina majority
operative refused
existence refused
range refused
represented refused
sees refused
refused spend
refused need
refused amount
refused from
refused supporting
operative context
existence context
garden context
context domination
context satisfaction
operative preventing
existence preventing
outbursts preventing
first preventing
reportedly preventing
clearly preventing
frustration preventin

operative teaching
existence teaching
equality teaching
teachers teaching
professionals teaching
dismiss teaching
religion teaching
operative forms
existence forms
experiences forms
represented forms
claim forms
inability forms
forms anger
forms specific
operative claims
experiences claims
sees claims
claims assist
claims self
operative well
range well
relating well
first well
proceeded well
well neopagan
well understanding
well distasteful
well terms
operative starhawk
existence starhawk
represented starhawk
domination starhawk
starhawk theft
operative cleaners
range cleaners
sees cleaners
beliefs cleaners
once cleaners
cleaners good
cleaners achievements
cleaners schedules
cleaners react
operative characteristics
outbursts characteristics
first characteristics
extensively characteristics
frustration characteristics
characteristics six
operative stateless
range stateless
culminated stateless
operative armed
existence armed
speculate armed
armed psychology
armed with
operative moment
f

operative etiology
delays etiology
concept etiology
etiology price
operative periods
represented periods
anarcho periods
contractarianism periods
periods infants
periods notion
operative freely
existence freely
range freely
december freely
trade freely
specifically freely
freely claiming
freely christiania
freely bible
operative feminist
delays feminist
cost feminist
abstention feminist
operative english
quiet english
english tests
english interpret
english anger
english attachment
operative respected
existence respected
experiences respected
minarchists respected
formed respected
respected misinterpret
operative andrews
represented andrews
prisons andrews
oxymoron andrews
domination andrews
claims andrews
andrews october
operative regulating
existence regulating
factors regulating
students regulating
levels regulating
annihilation regulating
regulating explosive
operative takes
existence takes
range takes
cost takes
quiet takes
speculate takes
has takes
dismiss takes
favors takes
take

operative industrial
existence industrial
first industrial
represented industrial
factors industrial
extensively industrial
frustration industrial
haymarket industrial
going industrial
context industrial
industrial marx
industrial wrote
operative classic
range classic
both classic
classic attitudes
classic hutterites
operative speech
represented speech
sustaining speech
fully speech
lead speech
speech unwinding
speech advocate
speech extreme
operative ration
extensively ration
everyday ration
ration ayn
ration stanley
operative autistics
preservation autistics
sees autistics
teachers autistics
feelings autistics
autistics base
autistics workerist
operative organized
represented organized
factors organized
stay organized
organized alienation
organized products
organized alternative
organized characterization
operative tolstoy
existence tolstoy
first tolstoy
harsh tolstoy
teachers tolstoy
speculate tolstoy
has tolstoy
tolstoy if
tolstoy proffessed
tolstoy birth
tolstoy writing
operative 

operative lasted
existence lasted
usher lasted
garden lasted
generates lasted
kind lasted
lasted noted
lasted improved
operative stages
existence stages
exponent stages
stages peasants
stages middle
operative pacifist
existence pacifist
represented pacifist
extensively pacifist
beliefs pacifist
shell pacifist
inherent pacifist
pacifist since
pacifist followers
operative expected
existence expected
spend expected
expected entering
expected victory
operative lessons
existence lessons
relating lessons
cost lessons
teachers lessons
students lessons
context lessons
lessons overwhelming
lessons prefers
lessons petite
operative l
relating l
represented l
shell l
will l
l become
l baron
l policy
l resist
operative guattari
existence guattari
visual guattari
garden guattari
phrase guattari
home guattari
coup guattari
guattari alliances
guattari developments
operative offshoots
represented offshoots
beliefs offshoots
context offshoots
offshoots infants
offshoots cuddling
offshoots activism
opera

operative hidden
existence hidden
experiences hidden
hidden teenage
hidden understand
hidden revolutions
operative expressing
represented expressing
routine expressing
formed expressing
expressing noted
expressing happily
expressing taxation
expressing years
operative teams
represented teams
teachers teams
inherent teams
teams create
operative amount
existence amount
relating amount
towards amount
avowed amount
refused amount
anarchy amount
amount underway
operative fairly
existence fairly
represented fairly
concept fairly
equality fairly
utility fairly
quiet fairly
music fairly
teaching fairly
inherent fairly
leader fairly
fairly justify
fairly collectives
fairly condoning
operative variety
existence variety
proceeded variety
factors variety
everyday variety
global variety
cars variety
field variety
variety privilege
variety motor
operative situations
represented situations
factors situations
sees situations
cost situations
stimulations situations
dominance situations
operative short


operative illusions
existence illusions
relating illusions
first illusions
prisons illusions
utility illusions
crass illusions
song illusions
illusions conscious
illusions vegan
illusions toys
illusions plain
operative needed
existence needed
harsh needed
sees needed
back needed
sovereignty needed
civil needed
listed needed
needed god
operative bureaucratic
range bureaucratic
first bureaucratic
frustration bureaucratic
quiet bureaucratic
avowed bureaucratic
engels bureaucratic
bureaucratic intervention
bureaucratic misinterpret
bureaucratic fail
bureaucratic johann
operative term
existence term
first term
represented term
sustaining term
primitivists term
operative references
towards references
dominance references
references sign
references just
operative price
delays price
concept price
domination price
price patriarchy
operative manifesto
existence manifesto
december manifesto
experiences manifesto
equality manifesto
interfering manifesto
manifesto races
operative those
existence th

operative supported
first supported
frustration supported
usher supported
dominance supported
nearly supported
since supported
supported incidence
supported travails
operative wide
range wide
harsh wide
cost wide
term wide
wide assert
wide opposition
operative revision
existence revision
stop revision
sustaining revision
professionals revision
religion revision
spreading revision
revision partially
revision another
operative uncommon
range uncommon
experiences uncommon
represented uncommon
teachers uncommon
away uncommon
claiming uncommon
unusual uncommon
peers uncommon
uncommon stalinist
uncommon flores
uncommon strategies
uncommon estimate
operative originally
existence originally
factors originally
reportedly originally
wildly originally
four originally
shell originally
originally seven
originally crucially
originally bordering
originally bible
originally dedicated
operative informal
outbursts informal
first informal
wildly informal
informal extending
informal spoilt
informal outspo

operative prevalent
existence prevalent
factors prevalent
sovereignty prevalent
prevalent enemy
prevalent motor
operative effects
existence effects
outbursts effects
extensively effects
tests effects
effects sing
operative noted
existence noted
first noted
beliefs noted
once noted
professionals noted
inability noted
dismiss noted
if noted
lasted noted
noted insanity
noted regulate
operative focuses
beliefs focuses
mcquinn focuses
acting focuses
formation focuses
fighting focuses
focuses covered
operative delayed
relating delayed
first delayed
frustration delayed
typically delayed
infants delayed
delayed occupation
operative october
claim october
sees october
sustaining october
dominance october
neopagan october
october order
operative successful
first successful
extensively successful
usher successful
feelings successful
votes successful
property successful
successful only
successful company
operative attempted
outbursts attempted
as attempted
moment attempted
helped attempted
lead att

KeyboardInterrupt: 

In [204]:
file = open("./data/questions-words.txt")
questions = file.readlines()
for i,x in enumerate(questions): 
    questions[i] = x.rstrip("\n").split()
    if x[0]==':':
        del questions[i]
    


In [209]:
def analogy_task(questions,dict_emb):
    score = []
    if all(word in dict_emb for question in questions):
        y = dict_emb[questions[0]] -  dict_emb[questions[1]] +  dict_emb[questions[2]]
        x = get_closest_with_score(dict_emb,y)
        if x == questions[3]:
            score.append(1)
        else: 
            score.append(0)
    return score
        

In [210]:
score = analogy_task(questions,dict_emb)        
    
        


NameError: name 'word' is not defined