In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class SkipGramModel(nn.Module):

    def __init__(self, emb_size_u, emb_size_v,emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size_u, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(emb_size_v, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)
        
    def forward(self, pos_u, pos_v,neg_v):
        emb_u = self.u_embeddings(pos_u)
        neg_v = neg_v.view(len(pos_u),-1)
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [2]:
from torch.utils.data import Dataset
from collections import defaultdict
from gensim.models.doc2vec import TaggedDocument
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75, neg_samples=10):
        self.pairs = self.generate_pairs(dataset,5)
        self.neg_samples=neg_samples
        
    def generate_pairs(self, dataset, ctx_window):
            pairs = []
            for sentence in dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,ctx_window):
                        if(i+j<len(sentence[i])):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
                            
            return pairs
                            
        
    def __len__(self):
        return len(pairs)
        
    def __getitem__(self, idx):
        return pairs[idx]



In [3]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
print(type(dataset))
p = []
for x in dataset: 
    p.append(x)
    
"""
print(len(p))
sum = 0
for x in p: 
    sum += len(x)
print(sum/len(p))
print(len(p[0]))
"""

<class 'text8.Dataset'>


'\nprint(len(p))\nsum = 0\nfor x in p: \n    sum += len(x)\nprint(sum/len(p))\nprint(len(p[0]))\n'

In [7]:
#dataset = wDataSet(p)


In [5]:
from collections import Counter
tmp=[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital']]
print(type(tmp))
print(type(p))
small_dataset=tmp
vocabulary = []
for sentence in small_dataset:
    for word in sentence:
        if word not in vocabulary:
            vocabulary.append(word)
print(vocabulary)
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}





<class 'list'>
<class 'list'>
tensor([0, 0, 1,  ..., 0, 0, 0])


In [6]:


v_embeddings = nn.Embedding(5, 1, sparse=False)
u_embeddings = nn.Embedding(5, 1, sparse=False)
initrange = 0.5 / 5
u_embeddings.weight.data.uniform_(-initrange, initrange)
v_embeddings.weight.data.uniform_(-0, 0)
print(u_embeddings)
vocabulary_size = len(vocabulary)

<class 'list'>
<class 'list'>
Embedding(5, 1)
[0, 1]
tensor([[ 0.0485],
        [ 0.0485],
        [-0.0991],
        ...,
        [ 0.0485],
        [ 0.0485],
        [ 0.0485]], grad_fn=<EmbeddingBackward>)


In [8]:
def one_hot_vector(index):
    x = torch.zeros((vocabulary_size)).long()
    x[index] = 1
    return x 


In [9]:
import pdb
class Test(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(Test, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        emb_u = self.u_embeddings(pos_u)
        neg_v = neg_v.view(len(pos_u),-1)
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)
