In [126]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
import numpy as np
import random
from collections import Counter

random.seed(1024)
np.random.seed(1024)

In [91]:
print(torch.__version__)
print(nltk.__version__)

0.4.1
3.2.2


In [92]:
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    gpus = [0]
    torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

## Data load and Preprocessing

In [93]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]

In [94]:
print(len(corpus), corpus[0])

(500, [u'[', u'moby', u'dick', u'by', u'herman', u'melville', u'1851', u']'])


### exclude sparse words

In [95]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [96]:
word_count = Counter(flatten(corpus))

In [97]:
MIN_COUNT = 3

In [98]:
exclude = [w for w, c in word_count.iteritems() if c < MIN_COUNT]

In [99]:
vocab = set(flatten(corpus)) - set(exclude)

In [100]:
word2index = {}
for vo in vocab:
    word2index[vo] = len(word2index)

In [101]:
index2word = {v:k for k, v in word2index.iteritems()}

In [102]:
WINDOW_SIZE = 5
# windows = flatten()
# nltk.ngrams(sequence, n)

In [103]:
windows =  flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])

In [104]:
print(len(windows))

10277


In [105]:
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if window[i] in exclude or window[WINDOW_SIZE] in exclude:
            continue
        if i == WINDOW_SIZE or window[i] == '<DUMMY>':
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))
        # 使用中心词预测context  

In [106]:
print(len(train_data), train_data[0])

(50242, (u'(', u'supplied'))


In [130]:
def prepare_word(word, word2index):
    return LongTensor([word2index[word]]) if word in word2index else LongTensor([word2index['<UNK>']])

def prepare_sequence(seq, word2index):
    idxs = [word2index[w] if w in word2index else word2index['<UNK>'] for w in seq]
    
    return LongTensor(idxs)

In [108]:
X_p = []
y_p = []

for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))

train_data = list(zip(X_p, y_p))

In [109]:
len(train_data)

50242

## Build Unigram Distribution ** 0.75

$P(w) = U(w)^{3/4}/Z$

In [110]:
Z = 0.001

In [111]:
word_count = Counter(flatten(corpus))
num_total_words = sum([float(c) for w, c in word_count.iteritems() 
                      if w not in exclude])

In [112]:
print(2/num_total_words)

0.000256476019492


In [113]:
unigram_table = []

for vo in vocab:
    if np.random.rand() > 0.99:
        print(vo, ((word_count[vo]/num_total_words)**0.75)/Z)
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

(u'particular', 3.4084550450597506)
(u'!', 15.059518116976372)
(u'could', 5.732315257888364)
(u'fifty', 2.746967479045098)
(u'yet', 6.261731753708576)
(u'death', 4.0293971981622505)


In [114]:
len(unigram_table)

3500

## Negative sampling

In [118]:
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].data.cpu().tolist()[0] if USE_CUDA else targets[i].data.tolist()[0]
        while len(nsample) < k: # num of sampling
            neg = np.random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
        
    return torch.cat(neg_samples)

## Modeling

$J_t(\theta) = log\sigma(u_{o}^{T}v_{c}) + \sum_{i=1}^{k}E[log\sigma(-u_{j}^{T}v_{c})]$

In [124]:
class SkipgramNegSampling(nn.Module):
    def __init__(self, vocab_size, projection_dim):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        self.logsigmoid = nn.LogSigmoid()
        
        initrange = (2.0 / (vocab_size + projection_dim)) ** 0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange)
        self.embedding_u.weight.data.uniform_(-0.0, 0.0)
    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # (B, 1, D)
        target_embeds = self.embedding_u(target_words) # (B, 1, D)
        
        neg_embeds = -self.embedding_u(negative_words) # (B, K, D)
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) #(B, 1)
        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        
        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score))

        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

## Train

In [135]:
EMBEDDING_SIZE = 30
BATCH_SIZE = 32
EPOCH = 20
NEG = 10 # num of negative sampling

In [127]:
losses = []
model = SkipgramNegSampling(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [133]:
def getBatch(batch_size, train_data):
    np.random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        sindex, eindex = eindex, eindex + batch_size
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [136]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs, targets = zip(*batch)
        inputs = torch.cat(inputs) # (B, 1)
        targets = torch.cat(targets)
        negs = negative_sampling(targets, unigram_table, NEG)
        model.zero_grad()
        
        loss = model(inputs, targets, negs)
        loss.backward() # self defined loss
        optimizer.step()
        
        losses.append(loss.data[0])
        print('Epoch: %d, mean_loss: %.02f' % (epoch, np.mean(losses)))
        
        



Epoch: 0, mean_loss: 1326.26
Epoch: 0, mean_loss: 1319.98
Epoch: 0, mean_loss: 1313.73
Epoch: 0, mean_loss: 1307.54
Epoch: 0, mean_loss: 1301.40
Epoch: 0, mean_loss: 1295.35
Epoch: 0, mean_loss: 1289.38
Epoch: 0, mean_loss: 1283.44
Epoch: 0, mean_loss: 1277.54
Epoch: 0, mean_loss: 1271.69
Epoch: 0, mean_loss: 1265.92
Epoch: 0, mean_loss: 1260.15
Epoch: 0, mean_loss: 1254.46
Epoch: 0, mean_loss: 1248.87
Epoch: 0, mean_loss: 1243.31
Epoch: 0, mean_loss: 1237.78
Epoch: 0, mean_loss: 1232.33
Epoch: 0, mean_loss: 1226.91
Epoch: 0, mean_loss: 1221.52
Epoch: 0, mean_loss: 1216.20
Epoch: 0, mean_loss: 1210.91
Epoch: 0, mean_loss: 1205.67
Epoch: 0, mean_loss: 1200.49
Epoch: 0, mean_loss: 1195.36
Epoch: 0, mean_loss: 1190.25
Epoch: 0, mean_loss: 1185.22
Epoch: 0, mean_loss: 1180.19
Epoch: 0, mean_loss: 1175.26
Epoch: 0, mean_loss: 1170.39
Epoch: 0, mean_loss: 1165.55
Epoch: 0, mean_loss: 1160.70
Epoch: 0, mean_loss: 1155.90
Epoch: 0, mean_loss: 1151.18
Epoch: 0, mean_loss: 1146.47
Epoch: 0, mean

AttributeError: 'torch.dtype' object has no attribute 'type'

## Test

In [150]:
def word_similarity(target, vocab):
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target:
            continue
        vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).cpu().data[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]
        

In [151]:
test = np.random.choice(list(vocab))
test

u'south'

In [153]:
word_similarity(test, list(vocab))

[[u'jaw', tensor(0.9920)],
 [u'lord', tensor(0.9911)],
 [u'webster', tensor(0.9903)],
 [u'mouth', tensor(0.9895)],
 [u'monster', tensor(0.9895)],
 [u'e', tensor(0.9894)],
 [u'don', tensor(0.9893)],
 [u'narrative', tensor(0.9893)],
 [u'ten', tensor(0.9893)],
 [u'ibid', tensor(0.9892)]]