In [87]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-1,1)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        #pdb.set_trace()
        pos_u = pos_u.view(-1)
        emb_u = self.u_embeddings(pos_u)
        #neg_v = neg_v.view(len(pos_u),-1)
        #pdb.set_trace()
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        #pdb.set_trace()
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        #pdb.set_trace()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [88]:
from torch.utils.data import Dataset
from collections import defaultdict
from gensim.models.doc2vec import TaggedDocument
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75, neg_samples=2):
        self.dataset = dataset
        self.neg_samples=neg_samples
        self.word2idx = dict()
        self.idx2word = dict()
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab()
        self.pairs = self.generate_pairs(dataset,neg_samples)
        self.dataset_with_samples=self.create_dataset_with_samples()

        
    def generate_pairs(self, dataset, ctx_window):
            print("Generating pairs")
            pairs = []
            for sentence in dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,ctx_window):
                        if(i+j<len(sentence)):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
                            
            return pairs
        
    def __len__(self):
        return len(self.dataset_with_samples)
        
        
    def __getitem__(self, idx):
        return self.dataset_with_samples[idx]
        
    
    def create_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        #print(pairs)
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
        return key_pairs
    
    def create_dataset_with_samples(self, neg_samples=2):
        print("create_dataset_with_samples")
        dataset_with_samples = []
        key_pairs = self.create_key_pairs(self.pairs)
        print("creating samples")
        #print(key_pairs)
        for x,y in key_pairs: 
            neg_v = []
            for z in random.sample(range(0,self.vocab_size),neg_samples):
                neg_v.append(z)
            dataset_with_samples.append((torch.tensor([x]),torch.tensor([y]),torch.tensor(neg_v)))
        return dataset_with_samples
    
    def create_vocab(self):
        print("Creating vocab")
        for i,sentence in enumerate(self.dataset):
            for word in sentence:
                self.vocab.add(word)
        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
    
   


In [106]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import copy

class W2V():
    def __init__(self, data,dim=100, neg_samples=2, alpha=0.01, iterations=5, batch_size=100, 
                 shuffle=False,use_cuda=False,workers=4):
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers

        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        
        self.models = []
        self.optimizers = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        if use_cuda:
            self.model.cuda()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=alpha)

        self.iterations = iterations
        #self.train()        
 
    def train_with_loader(self):
        loader = DataLoader(self.data.dataset_with_samples, self.batch_size, self.shuffle, num_workers=self.workers)
        print('starting training')
        for epoch in range(1,self.iterations):
            for i,(pos_u,pos_v,neg_v) in enumerate(loader):
                pos_u.to(device)
                neg_v.to(device)
                pos_v = pos_v.view(len(neg_v),-1)
                pos_v.to(device)
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                loss.backward()
                self.optimizer.step()
                prev_loss = loss
            print("loss = " + str(loss))
            print("{0:d} epoch of {1:d}".format(epoch+1, self.iterations))

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict

In [89]:
def train(model, dataset, epochs):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    print('starting training')
    for epoch in range(1,epochs):
        for i,(pos_u,pos_v,neg_v) in enumerate(dataset):
            pos_v = pos_v.view(-1,1)
            optimizer.zero_grad()
            loss = model.forward(pos_u,pos_v,neg_v)
            loss.backward()
            optimizer.step()
        print("loss = " + str(loss))
        print("{0:d} epoch of {1:d}".format(epoch+1, epochs))

In [90]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
text8_dataset = []
for x in dataset: 
    text8_dataset.append(x)   

In [113]:
text8_first_sentence = []
sentence = []
for i,x in enumerate(text8_tmp_dataset[0]):
    sentence.append(x)
    if (i%30 == 0 and i>0):
        text8_first_sentence.append(sentence)
        sentence=[]

In [114]:
text8_dataset_first_sentence = wDataSet((text8_first_sentence))
#text8_wDataset = wDataSet((text8_dataset))

Creating vocab
Generating pairs
create_dataset_with_samples
Generating key_pairs
creating samples


In [112]:
w2v = W2V(text8_dataset_first_sentence)
#w2v = W2V(text8_wDataset)

NameError: name 'text8_dataset_first_sentence' is not defined

In [109]:
w2v.train_with_loader()

starting training
loss = tensor(2.0782, grad_fn=<DivBackward0>)
2 epoch of 5
loss = tensor(2.0751, grad_fn=<DivBackward0>)
3 epoch of 5
loss = tensor(2.0721, grad_fn=<DivBackward0>)
4 epoch of 5
loss = tensor(2.0691, grad_fn=<DivBackward0>)
5 epoch of 5


### EVALUATION

In [70]:
dict_emb = get_embedding(model, text8_dataset)

In [110]:
from scipy import spatial

x = spatial.distance.cosine(dict_emb['artist'], dict_emb['music'])
y = spatial.distance.cosine(dict_emb['anarchism'],dict_emb['music'])
print(x)
print(y)

0.9448586255311966
0.7955727875232697


In [111]:
import itertools
import numpy as np
score = []
score_dict = dict()
for i,(x,y) in enumerate(itertools.product(text8_dataset.vocab,text8_dataset.vocab)):
    if(i%1000000==0):
        print(i)
    distance = spatial.distance.cosine(dict_emb[x], dict_emb[y])
    score_dict[(x,y)] = distance
    score.append(distance)
print(np.mean(score))
    


0
1000000
2000000
3000000
4000000


KeyboardInterrupt: 

In [77]:
np.std(score)
#print(score_dict[('anarchism','music')])

0.11336667971198654

In [83]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                print(x,y)
                closest = (x,y)
                distance = score
    return closest
get_closest(score_dict,'operation')
        

operative operation
experiences operation
delays operation
operation del
operation unite


('operation', 'unite')