## 1.Skipgram with naiive softmax
Regference 김성동님 [DeepNLP-models-Pytorch](https://github.com/DSKSD/DeepNLP-models-Pytorch)

In [6]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [7]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [8]:
print(torch.__version__)
print(nltk.__version__)

1.0.0
3.3


In [9]:
FloatTensor = torch.FloatTensor
LongTensor = torch.LongTensor
ByteTensor = torch.ByteTensor

In [10]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [11]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

# seq에 해당되는 key값의 value를 리스트에 모으는거


def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

## prepare_word : 해당단어의 wordindex LongTensor로 가져옴
## sequence와의 차이점은 loop가아니라는거

## Data Load and Preprocessing

#### Load corpus : Gutenberg corpus

In [12]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100] #  sampling sentences for test
corpus = [[word.lower() for word in sent] for sent in corpus]

In [13]:
corpus[:5]

[['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
 ['etymology', '.'],
 ['(',
  'supplied',
  'by',
  'a',
  'late',
  'consumptive',
  'usher',
  'to',
  'a',
  'grammar',
  'school',
  ')'],
 ['the',
  'pale',
  'usher',
  '--',
  'threadbare',
  'in',
  'coat',
  ',',
  'heart',
  ',',
  'body',
  ',',
  'and',
  'brain',
  ';',
  'i',
  'see',
  'him',
  'now',
  '.'],
 ['he',
  'was',
  'ever',
  'dusting',
  'his',
  'old',
  'lexicons',
  'and',
  'grammars',
  ',',
  'with',
  'a',
  'queer',
  'handkerchief',
  ',',
  'mockingly',
  'embellished',
  'with',
  'all',
  'the',
  'gay',
  'flags',
  'of',
  'all',
  'the',
  'known',
  'nations',
  'of',
  'the',
  'world',
  '.']]

#### Extract Stopwords from unigram distribution's tails

In [14]:
word_count = Counter(flatten(corpus))
border = int(len(word_count) * 0.01)  ## 592개의 단어 있어서 상위 하위 5개 제외할 예정

In [15]:
stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border]

In [16]:
stopwords =[s[0] for s in stopwords]

In [17]:
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

## Build vocab

In [18]:
vocab = list(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')

In [19]:
print(len(set(flatten(corpus))), len(vocab))
# 원래 592개에서 10개 불용어 빼고 UNK하나 추가해서 583개

592 583


In [20]:
word2index = {'<UNK>':0}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

## {0:'<UNK>',1:'however',2:'foul'...}

## prepare train data
window data example

In [21]:
WINDOW_SIZE = 3
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])
# 각각의 corpus에 대해서 window size가 주어지면 첫번째 단어는 왼쪽에 단어가 없어서 Window_size만큼 Dummy를 주는 작업. 뒤에 코딩 편하도록

# [('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by'),
#  ('<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman'),
#  ('<DUMMY>', '[', 'moby', 'dick', 'by', 'herman', 'melville'),
#  ('[', 'moby', 'dick', 'by', 'herman', 'melville', '1851'),
#  ('moby', 'dick', 'by', 'herman', 'melville', '1851', ']'),
#  ('dick', 'by', 'herman', 'melville', '1851', ']', '<DUMMY>'),
#  ('by', 'herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>'),
#  ('herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>', '<DUMMY>')]

In [22]:
len(windows) ## 1463개의 training data

1463

In [23]:
windows[0]

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by')

In [24]:
train_data = []

for window in windows:                                 # Skip gram
    for i in range(WINDOW_SIZE * 2 + 1):               # centerword + 2 * WINDOW_SIZE = 7
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': # i  = WINDOW_SIZE는 centerword라는 뜻
            continue                                   # i번째 단어가 centerword이거나 'DUMMY'이면 넘어가라!
        train_data.append((window[WINDOW_SIZE], window[i]))
    
print(train_data[:WINDOW_SIZE*2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [25]:
X_p = []
y_p = []

In [26]:
train_data[0] # input이 '[' label 이 'moby'

('[', 'moby')

In [27]:
for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1,-1)) # -1은 미지수 n 차원일부로 늘리는거 (  [] 를 하나 더 씌운다.)
    y_p.append(prepare_word(tr[1], word2index).view(1,-1))

In [28]:
print(X_p[0])
print(y_p[0])
print(X_p[0].shape) # 차원하나 더늘려서 (1,1)됨  
                    # ex) [[64]]

tensor([[210]])
tensor([[360]])
torch.Size([1, 1])


In [29]:
train_data = list(zip(X_p, y_p))
len(train_data)

7606

## nn.Embeding

In [30]:
# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10,3)
# a batch of 2 samples of 4 indices each
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
embedding(input)

tensor([[[ 0.6283, -0.0893,  0.7993],
         [ 0.3314, -0.0602,  1.7140],
         [ 0.1613,  0.6638,  0.6927],
         [ 0.2260,  1.2989, -0.6374]],

        [[ 0.1613,  0.6638,  0.6927],
         [ 1.1045, -0.1108,  0.5328],
         [ 0.3314, -0.0602,  1.7140],
         [-0.0591, -0.1103,  0.9381]]], grad_fn=<EmbeddingBackward>)

## Modeling

In [31]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)
        
        self.embedding_v.weight.data.uniform_(-1,1)  # init
        self.embedding_u.weight.data.uniform_(0,0)   # init
        # self.out = nn.Linear(projection_dim, vocab_size)
        
    def forward(self, center_words, target_words, outer_words):
        # outer_words가 BXV 즉 [1,2,3,4,...582,0]
        # 이걸 넣으면 BXV에 디멘션 추가 BXVXD 가 됨
        center_embeds = self.embedding_v(center_words) # B X 1 X D  center_word의 인덱스에 해당하는 하나의 행을 가져오는듯
        target_embeds = self.embedding_u(target_words) # B X 1 X D
        outer_embeds = self.embedding_u(outer_words)   # B X V X D 
        
        scores = target_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2)    # BX1XD * BXDX1 => BX1X1 => BX1   3번째 index가 1이면 없에줌
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2) # BXVXD * BXDX1 => BXVXD * BXDX1=> BXV
                                                                                # 여기 계산이 굉장히 비효울적. Vectorsize가 몇십만개 일수도있는데 ...
        
        nll = -torch.mean(torch.log(torch.exp(scores)/ torch.sum(torch.exp(norm_scores),1).unsqueeze(1))) 
        # log-softmax 2번째 index에 1을 넣어줌
        # torch.sum(torch.exp(num_scores),1) 각 배치별로 Vocab개수만큼의 exp값이 있으면
        return nll # negative log likelihood
    
    def prediction(self,inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

![](https://user-images.githubusercontent.com/36406676/54069443-6b5afd00-429b-11e9-97bf-ff5e525d93d7.jpg)

![](https://user-images.githubusercontent.com/36406676/54069351-ab6db000-429a-11e9-8e99-14764ac89a83.jpg)

In [32]:
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100

In [34]:
losses = []
model = Skipgram(len(word2index), EMBEDDING_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [35]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

In [40]:
vocabs = prepare_sequence(list(vocab), word2index)

{'<UNK>': 0,
 'great': 1,
 'somehow': 2,
 'them': 3,
 'grow': 4,
 'painstaking': 5,
 'has': 6,
 'men': 7,
 'fixed': 8,
 'pikes': 9,
 'splintered': 10,
 'foam': 11,
 'handkerchief': 12,
 'deep': 13,
 'solely': 14,
 't': 15,
 'burrower': 16,
 'exceeding': 17,
 'paunch': 18,
 'pale': 19,
 'dreadful': 20,
 'letter': 21,
 'rosy': 22,
 'down': 23,
 'many': 24,
 'pliny': 25,
 'created': 26,
 'slay': 27,
 'history': 28,
 'hie': 29,
 'brought': 30,
 'ignorance': 31,
 'received': 32,
 'please': 33,
 'hampton': 34,
 'loves': 35,
 'pekee': 36,
 'clear': 37,
 'morals': 38,
 'prepared': 39,
 'for': 40,
 'him': 41,
 'sw': 42,
 'sung': 43,
 'including': 44,
 'maine': 45,
 'from': 46,
 'when': 47,
 'security': 48,
 'spermacetti': 49,
 'monster': 50,
 'taken': 51,
 'besides': 52,
 'whatever': 53,
 'whether': 54,
 'boiling': 55,
 'bred': 56,
 '890': 57,
 'higgledy': 58,
 'very': 59,
 'danish': 60,
 'case': 61,
 'sub': 62,
 'with': 63,
 'mockingly': 64,
 'swallow': 65,
 'verbal': 66,
 'mouth': 67,
 'errom

In [41]:
vocabs

tensor([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
         29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
         57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
         71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,
         99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
        127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
        141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
        155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 1

In [230]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs)  ## B X 1
        targets = torch.cat(targets)
        # tuple to tensor 해줘야함
        
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab))  # B x V
                                                                                               # [1,2,3,..,581,0]이게 batch_size만큼 뭉쳐있는게 vocabs                                                                                                
        model.zero_grad()

        loss = model(inputs, targets, vocabs)
        
        loss.backward()
        optimizer.step()

        losses.append(loss.data)

    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch,np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 6.09
Epoch : 10, mean_loss : 4.35
Epoch : 20, mean_loss : 3.47
Epoch : 30, mean_loss : 3.31
Epoch : 40, mean_loss : 3.26
Epoch : 50, mean_loss : 3.24
Epoch : 60, mean_loss : 3.22
Epoch : 70, mean_loss : 3.21
Epoch : 80, mean_loss : 3.21
Epoch : 90, mean_loss : 3.20


## Test

In [231]:
def word_similarity(target, vocab): # target과 전체 단어의 유사도
    target_V = model.prediction(prepare_word(target, word2index)) # d X 1
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: 
            continue
            
        vector = model.prediction(prepare_word(list(vocab)[i], word2index)) # 비교대상
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] # 타겟과의 cosine_similarity
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]     # sort by similarity       

In [233]:
test = random.choice(list(vocab))
test

'clearing'

In [234]:
word_similarity(test,vocab) ## clearing과 유사한단어 상위10개

[['storied', 0.6760297417640686],
 ['leaving', 0.5904781222343445],
 ['before', 0.5829606652259827],
 ['seven', 0.5702384114265442],
 ['have', 0.5668636560440063],
 [':', 0.5515078902244568],
 ['out', 0.550243616104126],
 ['beating', 0.539977490901947],
 ['hampton', 0.5343126654624939],
 ['whether', 0.5281474590301514]]