In [1]:
from collections import defaultdict
import math
import time
import random
import torch
import torch.nn as nn
from torch.autograd import Variable

import torch.nn.functional as F

import numpy as np

In [2]:
# Corpus-reading helpers.
# NOTE: The data comes from the Penn Treebank, which has already been converted
#       into an easy-to-use format containing "<unk>" symbols. With any other
#       corpus we would have to pre-process it ourselves and decide how
#       unknown words are chosen.

# Word -> index map; looking up an unseen word assigns it the next free index.
w2i = defaultdict(lambda: len(w2i))
# The first two lookups reserve index 0 for "<s>" and index 1 for "<unk>".
S, UNK = w2i["<s>"], w2i["<unk>"]


def read_dataset(filename):
    """Lazily yield one list of word indices per line of `filename`.

    Each line is stripped and split on single spaces; every token is looked
    up in the module-level `w2i` mapping (which may assign new indices).
    """
    with open(filename, "r") as corpus:
        for raw_line in corpus:
            tokens = raw_line.strip().split(" ")
            indices = []
            for token in tokens:
                indices.append(w2i[token])
            yield indices

# Read in the data
train = list(read_dataset("../data/ptb/train.txt"))

# Snapshot the vocabulary right after the training pass: the model's
# vocabulary size and the index -> word table must only reflect words that
# were seen during training.
i2w = {v: k for k, v in w2i.items()}
n_words = len(w2i)

# Freeze the vocabulary before touching the validation data. read_dataset
# looks words up through the module-level name `w2i`, so that name itself has
# to be rebound; from here on unseen words map to UNK instead of receiving a
# fresh index the model has no embedding for. `w2i2` is kept as an alias of
# the frozen mapping for any code that still refers to it.
w2i = w2i2 = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))

In [3]:
# Feed-forward Neural Network Language Model

class FFNN_LM(nn.Module):
    """Feed-forward n-gram language model.

    The context word indices are embedded, the embeddings are concatenated
    into one vector per example, and a tanh hidden layer with dropout followed
    by a linear layer produces one logit per vocabulary word.
    """

    def __init__(self, n_words, embed_size, hidden_size, n_gram, dropout_rate = 0.5):
        super().__init__()
        # Width of the concatenated context embeddings fed to the hidden layer.
        context_width = n_gram * embed_size
        self.embedding = nn.Embedding(n_words, embed_size)
        hidden_layer = nn.Linear(context_width, hidden_size)
        output_layer = nn.Linear(hidden_size, n_words)
        self.fnn = nn.Sequential(
            hidden_layer,
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            output_layer,
        )

    def forward(self, word_idxs):
        """Return the logits over the vocabulary for each row of `word_idxs`."""
        vectors = self.embedding(word_idxs)
        # Keep the leading (batch) dimension and flatten everything else.
        batch_size = vectors.size(0)
        return self.fnn(vectors.view(batch_size, -1))

In [4]:
# Hyper-parameters of the language model.
N = 2                        # passed to FFNN_LM as n_gram (number of context words)
EMB_SIZE = HID_SIZE = 128    # embedding width and hidden-layer width
DROPOUT_RATE = 0.5

# Arguments are positional, in the order of FFNN_LM.__init__:
# (n_words, embed_size, hidden_size, n_gram, dropout_rate).
model = FFNN_LM(n_words, EMB_SIZE, HID_SIZE, N, DROPOUT_RATE)

In [5]:
def word_idxs_to_variable(word_idxs):
    """Convert (possibly nested) lists of word indices into an int64 tensor.

    Args:
        word_idxs: list of ints, or nested lists of ints of equal length.

    Returns:
        torch.Tensor of dtype torch.long with the same nesting as the input.
    """
    # torch.autograd.Variable is deprecated: plain tensors carry autograd
    # state themselves, so the tensor is built directly.
    return torch.tensor(word_idxs, dtype=torch.long)

In [8]:
def cal_logits_for_sents(sents):
    """Run the module-level `model` on `sents` and return its logits.

    `sents` is converted to a tensor with word_idxs_to_variable before being
    passed to the model.
    """
    return model(word_idxs_to_variable(sents))

In [9]:
def cal_loss(sents):
    """Compute the n-gram language-model loss for a batch of sentences.

    For every word of every sentence (plus a closing S symbol) one training
    example is built: the N preceding word indices as input and the word
    itself as target.

    Args:
        sents: non-empty list of sentences, each a list of word indices.

    Returns:
        Scalar tensor: F.cross_entropy of the model logits against the targets.

    Raises:
        ValueError: if `sents` contains no sentence.
    """
    n_gram_history = []
    target_word_history = []

    for sent in sents:
        # Every sentence starts from a fresh context of N start symbols so
        # that words of the previous sentence do not leak into this one.
        n_gram_sent = [S] * N

        # S is appended so the model also learns where a sentence ends.
        for target_word in sent + [S]:
            # For training, store the feature (the previous N words) and the
            # label (target_word). The context is stored BEFORE it is
            # advanced; otherwise the target would be part of its own input.
            n_gram_history.append(n_gram_sent)
            target_word_history.append(target_word)

            # Slide the window: drop the oldest word, append the current one.
            # This builds a new list, so the stored context is not mutated.
            n_gram_sent = n_gram_sent[1:] + [target_word]

    if not target_word_history:
        raise ValueError("cal_loss needs at least one sentence")

    logits = cal_logits_for_sents(n_gram_history)
    target = word_idxs_to_variable(target_word_history)
    loss = F.cross_entropy(input=logits, target=target)
    return loss

In [10]:
def train(train_data):
    """Train the module-level `model` with Adam and report the loss per epoch.

    Args:
        train_data: iterable of mini-batches. Each batch is handed to cal_loss
            unchanged (assumed to be a list of sentences, each a list of word
            indices -- confirm against the caller).

    Returns:
        list of float: the mean batch loss of each epoch.
    """
    # NOTE(review): defining this function rebinds the name `train`, which the
    # data-loading cell uses for the list of training sentences. Cells that
    # index `train[...]` only work before this cell has been executed.
    lr = 0.0001
    epochs = 5

    # Materialise once so a generator can be replayed on every epoch.
    batches = list(train_data)
    optimizer = torch.optim.Adam(model.parameters(), lr)
    epoch_losses = []

    model.train()  # make sure dropout is active while training
    for epoch in range(epochs):
        start = time.time()
        train_sents, train_cost = 0, 0.0

        for data in batches:
            optimizer.zero_grad()
            loss = cal_loss(data)
            loss.backward()
            optimizer.step()

            # .item() stores a plain float, so the autograd graph of every
            # batch can be freed instead of being kept alive by the sum.
            train_cost += loss.item()
            train_sents += len(data)

        mean_loss = train_cost / max(len(batches), 1)
        epoch_losses.append(mean_loss)
        print("epoch %d: mean batch loss=%.4f, sentences=%d, time=%.2fs"
              % (epoch, mean_loss, train_sents, time.time() - start))

    return epoch_losses


In [120]:
# Smoke test: loss for a batch that contains only the first training sentence.
# NOTE(review): relies on `train` still being the dataset list; the `def train`
# cell rebinds that name to a function.
loss = cal_loss([train[0]])

In [None]:
# Display the current value of `loss`.
loss

In [None]:
# NOTE(review): `cal_scores_for_sents` is not defined in the visible cells --
# possibly `cal_logits_for_sents` was intended; confirm before running.
cal_scores_for_sents(train[0])

In [13]:
# Random tensor of shape (1, 5) drawn from a standard normal distribution.
test = torch.randn(1,5)

In [19]:
# Display the random tensor.
test

tensor([[ 0.5469,  1.4084,  2.8021, -0.6644,  1.1593]])

In [48]:
# Turn the row of scores into a probability distribution and sample one index
# from it. `dim=1` is passed explicitly: relying on the implicit softmax
# dimension is deprecated and emits a warning; for this 2-D tensor dim=1 is
# the dimension the implicit choice would have used.
F.softmax(test, dim=1).multinomial(1).item()

  """Entry point for launching an IPython kernel.


2

In [None]:
# Inspect the optimizer package (the original `torch.opti,` raised
# AttributeError because `torch` has no attribute `opti`).
torch.optim