In [2]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-1,1)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        #pdb.set_trace()
        pos_u = pos_u.view(-1)
        emb_u = self.u_embeddings(pos_u)
        #neg_v = neg_v.view(len(pos_u),-1)
        #pdb.set_trace()
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        #pdb.set_trace()
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        #pdb.set_trace()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [3]:
from torch.utils.data import Dataset
from collections import defaultdict
import random
class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75,ctx_window=2):
        self.ctx_window = ctx_window
        self.dataset = dataset
        self.word2idx = dict()
        self.idx2word = dict()
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab()
        self.pairs = self.generate_pairs()
        self.key_pairs = self.generate_key_pairs(self.pairs)
        #self.dataset_tensors=self.create_dataset_tensors()
        

        
    def generate_pairs(self):
            print("Generating pairs")
            pairs = []
            for sentence in self.dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,self.ctx_window):
                        if(i+j<len(sentence)):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
                            
            return pairs
        
    def __len__(self):
        return len(self.key_pairs)
        
        
    def __getitem__(self, idx):
        return self.key_pairs
    
    def get_neg_samples(self, count, batch_size):
        neg_v = []
        for x in range(1,batch_size+1):
            neg_v.append(random.sample(range(0,self.vocab_size),count))
        return torch.tensor(neg_v).view(batch_size,-1)
        
    
    def generate_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        #print(pairs)
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
        print("finished creating key_pairs")
        return key_pairs
    
    
    def create_vocab(self):
        print("Creating vocab")
        for i,sentence in enumerate(self.dataset):
            for word in sentence:
                self.vocab.add(word)
        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
    
   


In [3]:
import datetime
import time
currentDT = datetime.datetime.now()
print ("Current Hour is: %d" % currentDT.hour)


Current Hour is: 18


In [30]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import copy
import time

class W2V():
    def __init__(self, data,dim=100, neg_samples=2, alpha=0.01, iterations=10, batch_size=15000, 
                 shuffle=False,use_cuda=True,workers=7):
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers

        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        
        self.models = []
        self.optimizers = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        #self.model.cuda()
        print(device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=alpha)

        self.iterations = iterations
        #self.train()        
 
    def train_with_loader(self):
        loader = DataLoader(self.data.key_pairs, self.batch_size, self.shuffle, num_workers=self.workers)
        print('starting training')
        tenth = int(len(loader)/10)
        for epoch in range(1,self.iterations):
            percent = 0
            start = time.time()
            for i,(pos_u,pos_v) in enumerate(loader):
                if(i%tenth == 0 ):
                    end = time.time()
                    hours, rem = divmod(end-start, 3600)
                    minutes, seconds = divmod(rem, 60)
                    print("Time since start: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
                    print(str(percent) + "% of epoch is done" )
                    percent+=10
                neg_v = self.data.get_neg_samples(self.neg_samples,pos_v.size()[0])
                pos_v = pos_v.view(len(neg_v),-1)
                #pos_v.cuda()
                #pos_u.cuda()
                #neg_v.cuda()
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                loss.backward()
                self.optimizer.step()
                prev_loss = loss
            print("loss = " + str(loss))
            print("{0:d} epoch of {1:d}".format(epoch+1, self.iterations))

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict

In [5]:
def train(model, dataset, epochs):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    print('starting training')
    for epoch in range(1,epochs):
        for i,(pos_u,pos_v,neg_v) in enumerate(dataset):
            pos_v = pos_v.view(-1,1)
            optimizer.zero_grad()
            loss = model.forward(pos_u,pos_v,neg_v)
            loss.backward()
            optimizer.step()
        print("loss = " + str(loss))
        print("{0:d} epoch of {1:d}".format(epoch+1, epochs))

In [6]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
text8_dataset = []
for x in dataset: 
    text8_dataset.append(x)

In [7]:
text8_first_sentence = []
sentence = []
for i,x in enumerate(text8_dataset[0]):
    sentence.append(x)
    if (i%30 == 0 and i>0):
        text8_first_sentence.append(sentence)
        sentence=[]

In [27]:
#text8_dataset_first_sentence = wDataSet((text8_first_sentence))
text8_wDataset = wDataSet((text8_dataset),ctx_window=3)

Creating vocab
Generating pairs
Generating key_pairs
finished creating key_pairs


In [18]:
len(text8_wDataset.key_pairs)

34005311

In [31]:
#w2v = W2V(text8_dataset_first_sentence)
w2v = W2V(text8_wDataset,neg_samples=4)

cuda:0


In [20]:
w2v.neg_samples

4

In [None]:
w2v.train_with_loader()
bs = 'bs'+str(w2v.batch_size)
neg = 'neg'+str(w2v.neg_samples)
dim = 'dim' + str(w2v.dim)
epochs = 'epochs'+ str(w2v.iterations)
ctxw = 'ctxw' + str(w2v.data.ctx_window)
l = [bs,neg,dim,epochs,ctxw]
filename= "w2v"
for x in l: 
    filename += (str(x) + '_')
filename += '.pkl'
import pickle
dict_emb = w2v.get_embedding()
pickle_out = open( "dict_emb_" + filename ,"wb")
pickle.dump(dict_emb, pickle_out)
pickle_out.close()

starting training
Time since start: 00:00:02.08
0% of epoch is done
Time since start: 00:08:35.96
10% of epoch is done
Time since start: 00:17:05.11
20% of epoch is done
Time since start: 00:25:27.44
30% of epoch is done
Time since start: 00:33:51.47
40% of epoch is done
Time since start: 00:42:12.10
50% of epoch is done
Time since start: 00:50:41.31
60% of epoch is done
Time since start: 00:59:03.37
70% of epoch is done


In [21]:
import pickle 
bs = 'bs'+str(w2v.batch_size)
neg = 'neg'+str(w2v.neg_samples)
dim = 'dim' + str(w2v.dim)
epochs = 'epochs'+ str(w2v.iterations)
ctxw = 'ctxw' + str(w2v.data.ctx_window)
l = [bs,neg,dim,epochs,ctxw]
filename= "w2v"
for x in l: 
    filename += (str(x) + '_')
filename += '.pkl'
filename



'w2vbs20000_neg4_dim100_epochs10_ctxw3_.pkl'

In [22]:
pickle_out = open(filename,"wb")
pickle.dump(w2v, pickle_out)
pickle_out.close()

PicklingError: Can't pickle <class '__main__.W2V'>: it's not the same object as __main__.W2V

In [13]:
import pickle 
with open('w2v_bs10k_neg3_dim100.pkl', 'wb') as output:
        pickle.dump(w2v_loaded, output, pickle.HIGHEST_PROTOCOL)

In [29]:
import itertools
len(text8_wDataset.vocab)

253854

### EVALUATION

In [23]:
dict_emb = w2v.get_embedding()

In [25]:
from scipy import spatial

x = spatial.distance.cosine(dict_emb['artist'], dict_emb['music'])
y = spatial.distance.cosine(dict_emb['anarchism'],dict_emb['music'])
z = spatial.distance.cosine(dict_emb['revolution'],dict_emb['anarchism'])

l = ['music','anarchism','artist','revolution','philosophy','creatine']
print(x)
print(y)
z

0.8569162636995316
0.9923342950642109


0.9120782390236855

In [90]:
def get_distances(word):
    for x in dict_emb.keys():
        yield(x, spatial.distance.cosine(dict_emb[word],dict_emb[x]))

In [92]:
p = []
for x in l:
    p.append((list(get_distances(x))))

In [137]:
from operator import itemgetter
import heapq
for x in p:
    print((heapq.nsmallest(5,x, key=itemgetter(1)))[0])
    print((heapq.nlargest(5,x, key=itemgetter(1)))[0])

('music', 0.0)
('komnenos', 1.4891326129436493)
('anarchism', 0.0)
('gondwanda', 1.4429270923137665)
('artist', 0.0)
('frittura', 1.4317588806152344)
('revolution', 0.0)
('diacriticals', 1.4835915863513947)
('philosophy', 0.0)
('bomblet', 1.4533373713493347)
('creatine', 0.0)
('aupperle', 1.4633850157260895)


In [27]:
import itertools
import numpy as np
from scipy import spatial
score = []
score_dict = dict()
for i,(x,y) in enumerate(itertools.product(text8_wDataset.vocab,text8_wDataset.vocab)):
    if(i%10000000==0):
        print(i)
    distance = spatial.distance.cosine(dict_emb[x], dict_emb[y])
    score_dict[(x,y)] = distance
    score.append(distance)
print(np.mean(score))
    


0


NameError: name 'spatial' is not defined

In [77]:
np.std(score)
#print(score_dict[('anarchism','music')])

0.11336667971198654

In [27]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                closest = (x,y)
                distance = score
    return closest

def get_closest_with_score(dict_emb,y):
    distance = 100
    for x,emb in dict_emb.items():
        if(spatial.distance.cosine(dict_emb[x], dict_emb[y])<distance):
            closest = x
    return x
        

In [24]:
file = open("./data/questions-words.txt")
questions = file.readlines()
for i,x in enumerate(questions): 
    questions[i] = x.rstrip("\n").split()
    if x[0]==':':
        del questions[i]
    


In [25]:
def analogy_task(questions,dict_emb):
    score = []
    if all(word in dict_emb for word in questions):
        y = dict_emb[questions[0]] -  dict_emb[questions[1]] +  dict_emb[questions[2]]
        x = get_closest_with_score(dict_emb,y)
        if x == questions[3]:
            score.append(1)
        else: 
            score.append(0)
    return score
        

In [26]:
score = analogy_task(questions,dict_emb)
print(score)

[]


In [30]:
words = random.sample(dict_emb.keys(),10)

In [31]:
for x in words:
    print(get_closest(score_dict,x))

NameError: name 'score_dict' is not defined