## 1.Skipgram with naiive softmax
Regference 김성동님 [DeepNLP-models-Pytorch](https://github.com/DSKSD/DeepNLP-models-Pytorch)

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [10]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


True

In [2]:
print(torch.__version__)
print(nltk.__version__)

1.0.0
3.3


In [202]:
FloatTensor = torch.FloatTensor
LongTensor = torch.LongTensor
ByteTensor = torch.ByteTensor

In [203]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch
        
        ## traindata의 수가 batch보다 작으면 그냥 그 데이터 한번에 학습

In [204]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))
## 

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))
## prepare_word : 해당단어의 wordindex LongTensor로 가져옴

## Data Load and Preprocessing

#### Load corpus : Gutenberg corpus

In [205]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100] #  sampling sentences for test
corpus = [[word.lower() for word in sent] for sent in corpus]

In [206]:
corpus[:5]

[['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
 ['etymology', '.'],
 ['(',
  'supplied',
  'by',
  'a',
  'late',
  'consumptive',
  'usher',
  'to',
  'a',
  'grammar',
  'school',
  ')'],
 ['the',
  'pale',
  'usher',
  '--',
  'threadbare',
  'in',
  'coat',
  ',',
  'heart',
  ',',
  'body',
  ',',
  'and',
  'brain',
  ';',
  'i',
  'see',
  'him',
  'now',
  '.'],
 ['he',
  'was',
  'ever',
  'dusting',
  'his',
  'old',
  'lexicons',
  'and',
  'grammars',
  ',',
  'with',
  'a',
  'queer',
  'handkerchief',
  ',',
  'mockingly',
  'embellished',
  'with',
  'all',
  'the',
  'gay',
  'flags',
  'of',
  'all',
  'the',
  'known',
  'nations',
  'of',
  'the',
  'world',
  '.']]

#### Extract Stopwords from unigram distribution's tails

In [207]:
word_count = Counter(flatten(corpus))
border = int(len(word_count) * 0.01)  ## 592개의 단어 있어서 상위 하위 5개 제외할 예정

In [208]:
stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border]

In [209]:
stopwords =[s[0] for s in stopwords]

In [210]:
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

## Build vocab

In [211]:
vocab = list(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')

In [212]:
print(len(set(flatten(corpus))), len(vocab))
# 원래 592개에서 10개 불용어 빼고 UNK하나 추가해서 583개

592 583


In [213]:
word2index = {'<UNK>':0}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

## {0:'<UNK>',1:'however',2:'foul'...}

## prepare train data
window data example

In [214]:
WINDOW_SIZE = 3
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])
# 각각의 corpus에 대해서 window size가 주어지면 첫번째 단어는 왼쪽에 단어가 없어서 Window_size만큼 Dummy를 주는 작업. 뒤에 코딩 편하도록

# [('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by'),
#  ('<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman'),
#  ('<DUMMY>', '[', 'moby', 'dick', 'by', 'herman', 'melville'),
#  ('[', 'moby', 'dick', 'by', 'herman', 'melville', '1851'),
#  ('moby', 'dick', 'by', 'herman', 'melville', '1851', ']'),
#  ('dick', 'by', 'herman', 'melville', '1851', ']', '<DUMMY>'),
#  ('by', 'herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>'),
#  ('herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>', '<DUMMY>')]

In [215]:
len(windows) ## 1463개의 training data

1463

In [216]:
windows[0]

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by')

In [217]:
train_data = []

for window in windows:                                 # Skip gram
    for i in range(WINDOW_SIZE * 2 + 1):               # centerword + 2 * WINDOW_SIZE = 7
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': # i  = WINDOW_SIZE는 centerword라는 뜻
            continue                                   # i번째 단어가 centerword이거나 'DUMMY'이면 넘어가라!
        train_data.append((window[WINDOW_SIZE], window[i]))
    
print(train_data[:WINDOW_SIZE*2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [218]:
X_p = []
y_p = []

In [219]:
train_data[0] # input이 '[' label 이 'moby'

('[', 'moby')

In [220]:
for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1,-1)) # -1은 미지수 n 차원일부로 늘리는거 (  [] 를 하나 더 씌운다.)
    y_p.append(prepare_word(tr[1], word2index).view(1,-1))

In [236]:
print(X_p[0])
print(y_p[0])
print(X_p[0].shape) # 차원하나 더늘려서 (1,1)됨  
                    # ex) [[64]]

tensor([[64]])
tensor([[281]])
torch.Size([1, 1])


In [222]:
train_data = list(zip(X_p, y_p))
len(train_data)

7606

## Modeling

In [223]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)
        
        self.embedding_v.weight.data.uniform_(-1,1)  # init
        self.embedding_u.weight.data.uniform_(0,0)   # init
        # self.out = nn.Linear(projection_dim, vocab_size)
        
    def forward(self, center_words, target_words, outer_words):
        # outer_words가 BXV 즉 [1,2,3,4,...582,0]이 Batchsize만큼 있는데
        # 이걸 넣으면 VXD의 W' matrix가 Batchsize만큼 있는거
        center_embeds = self.embedding_v(center_words) # B X 1 X D  center_word의 인덱스에 해당하는 하나의 행을 가져오는듯
        target_embeds = self.embedding_u(target_words) # B X 1 X D
        outer_embeds = self.embedding_u(outer_words)   # B X V X D 
        
        scores = target_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2)    # BX1XD * BXDX1 => BX1X1 => BX1   3번째 index가 1이면 없에줌
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2) # BXVXD * BXDX1 => BXVXD * BXDX1=> BXV
        
        nll = -torch.mean(torch.log(torch.exp(scores)/ torch.sum(torch.exp(norm_scores),1).unsqueeze(1))) 
        # log-softmax 2번째 index에 1을 넣어줌
        # torch.sum(torch.exp(num_scores),1) 각 배치별로 Vocab개수만큼의 exp값이 있으면
        return nll # negative log likelihood
    
    def prediction(self,inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

![](https://user-images.githubusercontent.com/36406676/54069443-6b5afd00-429b-11e9-97bf-ff5e525d93d7.jpg)

![](https://user-images.githubusercontent.com/36406676/54069351-ab6db000-429a-11e9-8e99-14764ac89a83.jpg)

In [224]:
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100

In [225]:
losses = []
model = Skipgram(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

In [243]:
vocabs = prepare_sequence(list(vocab), word2index).expand(BATCH_SIZE, len(vocab))

In [244]:
vocabs

tensor([[  1,   2,   3,  ..., 581, 582,   0],
        [  1,   2,   3,  ..., 581, 582,   0],
        [  1,   2,   3,  ..., 581, 582,   0],
        ...,
        [  1,   2,   3,  ..., 581, 582,   0],
        [  1,   2,   3,  ..., 581, 582,   0],
        [  1,   2,   3,  ..., 581, 582,   0]])

In [230]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs)  ## B X 1
        targets = torch.cat(targets)
        # tuple to tensor 해줘야함
        
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab))  # B x V
                                                                                               # [1,2,3,..,581,0]이게 batch_size만큼 뭉쳐있는게 vocabs                                                                                                
        model.zero_grad()

        loss = model(inputs, targets, vocabs)
        
        loss.backward()
        optimizer.step()

        losses.append(loss.data)

    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch,np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 6.09
Epoch : 10, mean_loss : 4.35
Epoch : 20, mean_loss : 3.47
Epoch : 30, mean_loss : 3.31
Epoch : 40, mean_loss : 3.26
Epoch : 50, mean_loss : 3.24
Epoch : 60, mean_loss : 3.22
Epoch : 70, mean_loss : 3.21
Epoch : 80, mean_loss : 3.21
Epoch : 90, mean_loss : 3.20


## Test

In [231]:
def word_similarity(target, vocab): # target과 전체 단어의 유사도
    target_V = model.prediction(prepare_word(target, word2index)) # d X 1
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: 
            continue
            
        vector = model.prediction(prepare_word(list(vocab)[i], word2index)) # 비교대상
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] # 타겟과의 cosine_similarity
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]     # sort by similarity       

In [233]:
test = random.choice(list(vocab))
test

'clearing'

In [234]:
word_similarity(test,vocab)

[['storied', 0.6760297417640686],
 ['leaving', 0.5904781222343445],
 ['before', 0.5829606652259827],
 ['seven', 0.5702384114265442],
 ['have', 0.5668636560440063],
 [':', 0.5515078902244568],
 ['out', 0.550243616104126],
 ['beating', 0.539977490901947],
 ['hampton', 0.5343126654624939],
 ['whether', 0.5281474590301514]]