In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class SkipGramModel(nn.Module):

    def __init__(self, emb_size_u, emb_size_v,emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size_u, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(emb_size_v, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)
        
    def forward(self, pos_u, pos_v,neg_v):
        emb_u = self.u_embeddings(pos_u)
        neg_v = neg_v.view(len(pos_u),-1)
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [2]:
from torch.utils.data import Dataset
from collections import defaultdict
from gensim.models.doc2vec import TaggedDocument
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75, neg_samples=10):
        self.pairs = self.generate_pairs(dataset,5)
        self.neg_samples=neg_samples
        
    def generate_pairs(self, dataset, ctx_window):
            pairs = []
            for sentence in dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,ctx_window):
                        if(i+j<len(sentence[i])):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
                            
            return pairs
                            
        
    def __len__(self):
        return len(pairs)
        
    def __getitem__(self, idx):
        return pairs[idx]



In [3]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
print(type(dataset))
p = []
for x in dataset: 
    p.append(x)
    
"""
print(len(p))
sum = 0
for x in p: 
    sum += len(x)
print(sum/len(p))
print(len(p[0]))
"""

<class 'text8.Dataset'>


'\nprint(len(p))\nsum = 0\nfor x in p: \n    sum += len(x)\nprint(sum/len(p))\nprint(len(p[0]))\n'

In [7]:
#dataset = wDataSet(p)


In [26]:
from collections import Counter
tmp=[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital']]
print(type(tmp))
print(type(p))
small_dataset=tmp
vocabulary = []
for sentence in small_dataset:
    for word in sentence:
        if word not in vocabulary:
            vocabulary.append(word)
print(vocabulary)
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}


def create_pairs(dataset, ctx_window):
            pairs = []
            for sentence in dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,ctx_window+1):
                        if((i+j)<len(sentence)):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
            return pairs


<class 'list'>
<class 'list'>
['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france']


In [17]:


v_embeddings = nn.Embedding(5, 1, sparse=False)
u_embeddings = nn.Embedding(5, 1, sparse=False)
initrange = 0.5 / 5
u_embeddings.weight.data.uniform_(-initrange, initrange)
v_embeddings.weight.data.uniform_(-0, 0)
print(u_embeddings)
vocabulary_size = len(vocabulary)

Embedding(5, 1)


In [18]:
def one_hot_vector(index):
    x = torch.zeros((vocabulary_size)).long()
    x[index] = 1
    return x 


In [19]:
import pdb
class Test(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(Test, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        emb_u = self.u_embeddings(pos_u)
        neg_v = neg_v.view(len(pos_u),-1)
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)


In [20]:
model = Test(vocabulary_size, 40)

In [21]:
print(pos_v)
pos_v = pos_v.view(-1,1)
print(pos_v)
print(model.forward((one_hot_vector(1)),pos_v,(one_hot_vector(1))))

NameError: name 'pos_v' is not defined

In [22]:
def train(model, dataset, epochs):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    for epoch in range(1,epochs):
        for pos_u,pos_v,neg_v in dataset:
            pos_v = pos_v.view(-1,1)
            optimizer.zero_grad()
            loss = model.forward(pos_u,pos_v,neg_v)
            loss.backward()
            optimizer.step()
        print("loss = " + str(loss))
        print("{0:d} epoch of {1:d}".format(epoch+1, epochs))


In [23]:
def create_key_pairs(pairs):
    key_pairs = []
    for x,y in pairs:
        key_pairs.append((word2idx.get(x),word2idx.get(y)))
    return key_pairs

In [27]:
import random
def create_dataset_with_samples(dataset, vocab_size, neg_samples=2):
    dataset_with_samples = []
    pairs = create_key_pairs(create_pairs(small_dataset,neg_samples))
    for x,y in pairs: 
        neg_v = one_hot_vector(random.randint(0,vocab_size-1))
        for z in random.sample(range(0,vocab_size),neg_samples - 1):
            neg_v = torch.cat((neg_v,one_hot_vector(z)))
        dataset_with_samples.append((one_hot_vector(x),one_hot_vector(y).view(-1,1),neg_v))
    return dataset_with_samples
        
dataset_with_samples = create_dataset_with_samples(small_dataset,vocabulary_size)         
        
    


In [28]:
dataset_with_samples[0][2]

tensor([0, 0, 0,  ..., 0, 0, 0])

In [29]:
train(model,dataset_with_samples,100)

loss = tensor(2.0792, grad_fn=<DivBackward0>)
2 epoch of 100
loss = tensor(2.0789, grad_fn=<DivBackward0>)
3 epoch of 100
loss = tensor(2.0784, grad_fn=<DivBackward0>)
4 epoch of 100
loss = tensor(2.0776, grad_fn=<DivBackward0>)
5 epoch of 100
loss = tensor(2.0762, grad_fn=<DivBackward0>)
6 epoch of 100
loss = tensor(2.0738, grad_fn=<DivBackward0>)
7 epoch of 100
loss = tensor(2.0697, grad_fn=<DivBackward0>)
8 epoch of 100
loss = tensor(2.0630, grad_fn=<DivBackward0>)
9 epoch of 100
loss = tensor(2.0523, grad_fn=<DivBackward0>)
10 epoch of 100
loss = tensor(2.0361, grad_fn=<DivBackward0>)
11 epoch of 100
loss = tensor(2.0138, grad_fn=<DivBackward0>)
12 epoch of 100
loss = tensor(1.9870, grad_fn=<DivBackward0>)
13 epoch of 100
loss = tensor(1.9601, grad_fn=<DivBackward0>)
14 epoch of 100
loss = tensor(1.9382, grad_fn=<DivBackward0>)
15 epoch of 100
loss = tensor(1.9237, grad_fn=<DivBackward0>)
16 epoch of 100
loss = tensor(1.9159, grad_fn=<DivBackward0>)
17 epoch of 100
loss = tensor(1.

KeyboardInterrupt: 