In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.u_embeddings.to(device)
        self.v_embeddings.to(device)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-1,1)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        #pdb.set_trace()
        pos_u = pos_u.view(-1)
        emb_u = self.u_embeddings(pos_u)
        #neg_v = neg_v.view(len(pos_u),-1)
        #pdb.set_trace()
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        #pdb.set_trace()
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        #pdb.set_trace()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [2]:
from torch.utils.data import Dataset
from collections import defaultdict
from gensim.models.doc2vec import TaggedDocument
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75, neg_samples=2):
        self.dataset = dataset
        self.neg_samples=neg_samples
        self.word2idx = dict()
        self.idx2word = dict()
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab()
        self.pairs = self.generate_pairs(dataset,neg_samples)
        self.dataset_with_samples=self.create_dataset_with_samples()

        
    def generate_pairs(self, dataset, ctx_window):
            print("Generating pairs")
            pairs = []
            for sentence in dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,ctx_window):
                        if(i+j<len(sentence)):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
                            
            return pairs
        
    def __len__(self):
        return len(self.dataset_with_samples)
        
        
    def __getitem__(self, idx):
        return self.dataset_with_samples[idx]
        
    
    def create_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        #print(pairs)
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
        return key_pairs
    
    def create_dataset_with_samples(self, neg_samples=2):
        print("create_dataset_with_samples")
        dataset_with_samples = []
        key_pairs = self.create_key_pairs(self.pairs)
        print("creating samples")
        #print(key_pairs)
        for x,y in key_pairs: 
            neg_v = []
            for z in random.sample(range(0,self.vocab_size),neg_samples):
                neg_v.append(z)
            dataset_with_samples.append((torch.tensor([x]),torch.tensor([y]),torch.tensor(neg_v)))
        return dataset_with_samples
    
    def create_vocab(self):
        print("Creating vocab")
        for i,sentence in enumerate(self.dataset):
            for word in sentence:
                self.vocab.add(word)
        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
    
   


In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import copy

class W2V():
    def __init__(self, data,dim=100, neg_samples=2, alpha=0.025, iterations=5, batch_size=100, 
                 shuffle=False,use_cuda=False,workers=4):
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers

        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        
        self.models = []
        self.optimizers = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        if use_cuda:
            self.model.cuda()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=alpha)

        self.iterations = iterations
        #self.train()        
 
    def train_with_loader(self):
        optimizer = torch.optim.Adam(model.parameters(), lr=0.25)
        loader = DataLoader(dataset, self.batchSize, self.shuffle, self.num_workers)
        print('starting training')
        for epoch in range(1,self.iterations):
            for i,(pos_u,pos_v,neg_v) in enumerate(loader):
                pos_u.to(device)
                neg_v.to(device)
                pos_v = pos_v.view(len(neg_v),-1)
                pos_v.to(device)
                optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                loss.backward()
                optimizer.step()
                prev_loss = loss
            print("loss = " + str(loss))
            print("{0:d} epoch of {1:d}".format(epoch+1, epochs))

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict

In [3]:
def train(model, dataset, epochs):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    print('starting training')
    for epoch in range(1,epochs):
        for i,(pos_u,pos_v,neg_v) in enumerate(dataset):
            pos_v = pos_v.view(-1,1)
            optimizer.zero_grad()
            loss = model.forward(pos_u,pos_v,neg_v)
            loss.backward()
            optimizer.step()
        print("loss = " + str(loss))
        print("{0:d} epoch of {1:d}".format(epoch+1, epochs))

In [5]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
print(type(dataset))
text8_tmp_dataset = []
for x in dataset: 
    text8_tmp_dataset.append(x)   

<class 'text8.Dataset'>


'\nprint(len(p))\nsum = 0\nfor x in p: \n    sum += len(x)\nprint(sum/len(p))\nprint(len(p[0]))\n'

[['this', 'is'], ['a', 'test']]

10000

In [11]:
text8_first_sentence = []
sentence = []
for i,x in enumerate(text8_tmp_dataset[0]):
    sentence.append(x)
    if (i%30 == 0 and i>0):
        text8_first_sentence.append(sentence)
        sentence=[]


[['anarchism',
  'originated',
  'as',
  'a',
  'term',
  'of',
  'abuse',
  'first',
  'used',
  'against',
  'early',
  'working',
  'class',
  'radicals',
  'including',
  'the',
  'diggers',
  'of',
  'the',
  'english',
  'revolution',
  'and',
  'the',
  'sans',
  'culottes',
  'of',
  'the',
  'french',
  'revolution',
  'whilst',
  'the'],
 ['term',
  'is',
  'still',
  'used',
  'in',
  'a',
  'pejorative',
  'way',
  'to',
  'describe',
  'any',
  'act',
  'that',
  'used',
  'violent',
  'means',
  'to',
  'destroy',
  'the',
  'organization',
  'of',
  'society',
  'it',
  'has',
  'also',
  'been',
  'taken',
  'up',
  'as',
  'a'],
 ['positive',
  'label',
  'by',
  'self',
  'defined',
  'anarchists',
  'the',
  'word',
  'anarchism',
  'is',
  'derived',
  'from',
  'the',
  'greek',
  'without',
  'archons',
  'ruler',
  'chief',
  'king',
  'anarchism',
  'as',
  'a',
  'political',
  'philosophy',
  'is',
  'the',
  'belief',
  'that',
  'rulers',
  'are'],
 ['unnece

In [12]:
text8_dataset = wDataSet((text8_first_sentence))

Creating vocab
Generating pairs
create_dataset_with_samples
Generating key_pairs
creating samples


In [15]:
len(text8_dataset)

18983

In [69]:
model = SkipGramModel(text8_dataset.vocab_size,100)
print(text8_dataset.vocab_size)
#print(text8_dataset.vocab)
print('created model')
print(len(text8_dataset.dataset_with_samples))
print(text8_dataset.dataset_with_samples[0])
print(text8_dataset.dataset_with_samples[0][0].size())
model.to(device)
train_with_loader(model, text8_dataset.dataset_with_samples, 20 , 15)
#train(model,text8_dataset.dataset_with_samples,10)|

2520
created model
18983
(tensor([1268]), tensor([1330]), tensor([1919,  675]))
torch.Size([1])
starting training
loss = tensor(492.2577, grad_fn=<DivBackward0>)
2 epoch of 15
loss = tensor(24.2408, grad_fn=<DivBackward0>)
3 epoch of 15
loss = tensor(154.0272, grad_fn=<DivBackward0>)
4 epoch of 15
loss = tensor(-0., grad_fn=<DivBackward0>)
5 epoch of 15
loss = tensor(215.7445, grad_fn=<DivBackward0>)
6 epoch of 15
loss = tensor(87.0228, grad_fn=<DivBackward0>)
7 epoch of 15
loss = tensor(-0., grad_fn=<DivBackward0>)
8 epoch of 15
loss = tensor(548.7433, grad_fn=<DivBackward0>)
9 epoch of 15
loss = tensor(-0., grad_fn=<DivBackward0>)
10 epoch of 15
loss = tensor(475.6506, grad_fn=<DivBackward0>)
11 epoch of 15
loss = tensor(370.8564, grad_fn=<DivBackward0>)
12 epoch of 15
loss = tensor(208.6704, grad_fn=<DivBackward0>)
13 epoch of 15
loss = tensor(334.1071, grad_fn=<DivBackward0>)
14 epoch of 15
loss = tensor(-0., grad_fn=<DivBackward0>)
15 epoch of 15


In [70]:
dict_emb = get_embedding(model, text8_dataset)

In [27]:
print(text8_dataset.vocab)

{'operative', 'existence', 'range', 'december', 'relating', 'experiences', 'outbursts', 'first', 'represented', 'proceeded', 'adam', 'factors', 'bleed', 'reportedly', 'claim', 'harsh', 'extensively', 'everyday', 'clearly', 'frustration', 'wildly', 'preservation', 'visual', 'prisons', 'sees', 'delays', 'beliefs', 'nonviolent', 'concept', 'cost', 'equality', 'markedly', 'interfering', 'stimulations', 'stop', 'usher', 'utility', 'once', 'quiet', 'sustaining', 'parliamentary', 'caused', 'teachers', 'mcquinn', 'name', 'organization', 'towards', 'four', 'shell', 'killing', 'communicating', 'speculate', 'professionals', 'inability', 'as', 'has', 'feelings', 'summary', 'crass', 'dominance', 'minarchists', 'behaviors', 'exchange', 'competition', 'garden', 'characteristic', 'primary', 'ricardo', 'influence', 'oppressive', 'hague', 'ws', 'examples', 'students', 'haymarket', 'practice', 'outposts', 'deep', 'wasn', 'seemed', 'dismiss', 'both', 'altogether', 'literate', 'researchers', 'seems', 'espo

In [28]:
#dict_emb

In [78]:
from scipy import spatial

x = spatial.distance.cosine(dict_emb['artist'], dict_emb['music'])
y = spatial.distance.cosine(dict_emb['anarchism'],dict_emb['music'])
print(x)
print(y)

0.9450588449835777
0.7335448861122131


0.10818819506075589

In [75]:
import itertools
import numpy as np
score = []
score_dict = dict()
for i,(x,y) in enumerate(itertools.product(text8_dataset.vocab,text8_dataset.vocab)):
    if(i%1000000==0):
        print(i)
    distance = spatial.distance.cosine(dict_emb[x], dict_emb[y])
    score_dict[(x,y)] = distance
    score.append(distance)
print(np.mean(score))
    


0
1000000
2000000
3000000
4000000
5000000
6000000
0.9324737188040717


In [77]:
np.std(score)
#print(score_dict[('anarchism','music')])

0.11336667971198654

In [83]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                print(x,y)
                closest = (x,y)
                distance = score
    return closest
get_closest(score_dict,'operation')
        

operative operation
experiences operation
delays operation
operation del
operation unite


('operation', 'unite')

In [84]:
#model_new.v_embeddings(torch.tensor([1]))

In [53]:
model_new.v_embeddings

Embedding(2520, 100)