In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [3]:
FloatTensor = torch.FloatTensor
LongTensor = torch.LongTensor
ByteTensor = torch.ByteTensor

In [4]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [5]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

## Data load and Preprocessing

In [6]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]

### Build Vocab

In [7]:
vocab = list(set(flatten(corpus)))

In [8]:
word2index = {}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word={v:k for k, v in word2index.items()}

In [9]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])

window_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': 
            continue
        window_data.append((window[WINDOW_SIZE], window[i]))

In [12]:
window_data[:10]

[('[', 'moby'),
 ('[', 'dick'),
 ('[', 'by'),
 ('[', 'herman'),
 ('[', 'melville'),
 ('moby', '['),
 ('moby', 'dick'),
 ('moby', 'by'),
 ('moby', 'herman'),
 ('moby', 'melville')]

## Weighting Function

![](https://user-images.githubusercontent.com/36406676/54082903-d3731700-435f-11e9-9588-49a6397d0ccd.jpg)

In [13]:
def weighting(w_i, w_j):
    try:
        x_ij = X_ik[(w_i, w_j)]
        ## 동시 출현 빈도
    except:
        x_ij = 1
        
    x_max = 100 #100 # fixed in paper
    alpha = 0.75
    
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha
    else:
        result = 1
    
    return result

### Build Co-occurence Matirx X
Because of model complexity, It is important to determine whether a tighter bound can be placed on the number of nonzero elements o f X

In [14]:
X_i = Counter(flatten(corpus)) # X_i

In [15]:
X_ik_window_5 = Counter(window_data) # Co-occurece in window size 5

In [20]:
X_ik_window_5.most_common(10) # 가장 많이 등장하는 조합은 ,,

[((',', ','), 342),
 ((',', 'the'), 280),
 (('the', ','), 280),
 (('the', 'of'), 248),
 (('of', 'the'), 248),
 ((',', 'and'), 229),
 (('and', ','), 229),
 (('the', 'the'), 220),
 ((',', 'a'), 167),
 (('a', ','), 167)]

In [21]:
X_ik = {}
weighting_dic = {}

In [22]:
from itertools import combinations_with_replacement

In [23]:
for bigram in combinations_with_replacement(vocab, 2):
    # ('fields', 'hunks')와 같은 조합을 배출한다. 
    if X_ik_window_5.get(bigram) is not None: # nonzero elements
        co_occer = X_ik_window_5[bigram]
        X_ik[bigram] = co_occer + 1 # log(Xik) -> log(Xik+1) to prevent divergence
        X_ik[(bigram[1],bigram[0])] = co_occer+1
    # ('fields','hunks'), ('hunks','fields') 둘다 딕셔너리에 넣어준다.
    else:
        pass
        
    weighting_dic[bigram] = weighting(bigram[0], bigram[1])
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0])

In [30]:
test = random.choice(window_data)
print(test)
try:
    print(X_ik[(test[0], test[1])] == X_ik[(test[1], test[0])])
except:
    1
    
# ('fields','hunks'), ('hunks','fields') 나 출현빈도는 같을 것이다.

('late', 'bedford')
True


## Prepare train data

In [31]:
u_p = [] # center vec
v_p = [] # context vec
co_p = [] # log(x_ij)
weight_p = [] # f(x_ij)


In [34]:
window_data[:5]

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('[', 'herman'), ('[', 'melville')]

In [35]:
for pair in window_data: 
    u_p.append(prepare_word(pair[0], word2index).view(1, -1)) # ('[','moby')에서 '['
    v_p.append(prepare_word(pair[1], word2index).view(1, -1)) # ('[','moby')에서 'moby'
    
    try:
        cooc = X_ik[pair]
    except: # 단어조합이 없을 경우 1
        cooc = 1

    co_p.append(torch.log(Variable(FloatTensor([cooc]))).view(1, -1))
    weight_p.append(Variable(FloatTensor([weighting_dic[pair]])).view(1, -1))

In [36]:
train_data = list(zip(u_p, v_p, co_p, weight_p))
del u_p # 용량 차지하니까 없에준다
del v_p
del co_p
del weight_p
print(train_data[0]) # tuple (center vec i, context vec j log(x_ij), weight f(w_ij))

(tensor([[2519]]), tensor([[2475]]), tensor([[0.6931]]), tensor([[0.0532]]))


## Modeling
- 목적함수를 이해해보자

## 목적함수를 이해해보자

![](https://user-images.githubusercontent.com/36406676/54083506-a3c80d00-4367-11e9-847e-1be9bf8f29b5.jpg)

![](https://user-images.githubusercontent.com/36406676/54083507-a9bdee00-4367-11e9-8d44-1b0f26995662.jpg)

In [45]:
class Glove(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Glove, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding matrix
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding matrix
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
        initrange = (2.0 / (vocab_size + projection_dim)) ** 0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-initrange, initrange) # init
        self.v_bias.weight.data.uniform_(-initrange, initrange) # init
        self.u_bias.weight.data.uniform_(-initrange, initrange) # init
        
    def forward(self, center_words, target_words, coocs, weights):
        center_embeds = self.embedding_v(center_words) # B X 1 X D
        target_embeds = self.embedding_u(target_words) # B X 1 X D
        
        center_bias = self.v_bias(center_words).squeeze(1) # B X 1
        target_bias = self.u_bias(target_words).squeeze(1) # B X 1
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2) # BX1
        
        loss = weights * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)
    
    def prediction(self, inputs):
        v_embeds = self.embedding_v(inputs) # B X 1 X D
        u_embeds = self.embedding_u(inputs) # B X 1 X D
        
        return v_embeds + u_embeds # final embed

## Train

In [46]:
EMBEDDING_SIZE = 50
BATCH_SIZE = 256
EPOCH = 50

In [47]:
losses = []
model = Glove(len(word2index), EMBEDDING_SIZE)
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [49]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets, coocs, weights = zip(*batch)
        
        inputs, targets, coocs, weights = zip(*batch)
        
        inputs = torch.cat(inputs) # B X 1
        targets = torch.cat(targets) # B X 1
        coocs = torch.cat(coocs)
        weights = torch.cat(weights)
        model.zero_grad()
        
        loss = model(inputs, targets, coocs, weights)
        
        loss.backward()
        optimizer.step()
        
        losses.append(loss.data)
        
    if epoch% 10 ==0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 225.57
Epoch : 10, mean_loss : 2.60
Epoch : 20, mean_loss : 0.55
Epoch : 30, mean_loss : 0.12
Epoch : 40, mean_loss : 0.04


## Test

In [51]:
def word_similarity(target, vocab):
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: 
            continue
        
        vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [52]:
test = random.choice(list(vocab))
test

'experiment'

In [53]:
word_similarity(test, vocab)

[['caravan', 0.8281784057617188],
 ['needles', 0.7806263566017151],
 ['ahoy', 0.7694544792175293],
 [',', 0.7557148337364197],
 ['cheever', 0.7556502819061279],
 ['ah', 0.7461116313934326],
 ['crucifix', 0.7429143786430359],
 ['hval', 0.7399888634681702],
 ['maxim', 0.7346227169036865],
 ['happen', 0.734491229057312]]