In [1]:
# 本文实现NNLM（神经网络语言模型）
import torch
import torch.nn as nn
import torch.optim as optim

In [7]:
# Model
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Linear(n_step * m, n_hidden, bias=False)
        self.d = nn.Parameter(torch.ones(n_hidden))
        self.U = nn.Linear(n_hidden, n_class, bias=False)
        self.W = nn.Linear(n_step * m, n_class, bias=False)
        self.b = nn.Parameter(torch.ones(n_class))
        
    def forward(self, X):
        X = self.C(X) # X: [batch_size, n_step, m]
        X = X.view(-1, n_step * m) # [batch_size, n_step * m]
        tanh = torch.tanh(self.d + self.H(X)) # [batch_size, n_hidden]
        output = self.b + self.W(X) + self.U(tanh) # [batch_size, n_class]
        return output

In [3]:
# 制造batch
def make_batch():
    input_batch = []
    target_batch = []
    
    for sen in sentences:
        word = sen.split() # 去掉空格
        input = [word_dict[n] for n in word[:-1]] # 创建1到n-1的词作为输入
        target = word_dict[word[-1]] # 将第n个词作为target，这种通常称作'casual language model'
        
        input_batch.append(input)
        target_batch.append(target)
    
    return input_batch, target_batch

In [12]:
# 超参数
n_step = 2 # 步骤数， 文中的n-1
n_hidden = 2 # 隐藏层大小， 文中的h
m = 2 # embedding大小，文中的m

sentences = ['i like dog', 'i love coffee', 'i hate milk']

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict) # 词典大小

model = NNLM()

criterion = nn.CrossEntropyLoss()
optimizer =optim.Adam(model.parameters(), lr=0.001)

input_batch, target_batch = make_batch()
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)

In [13]:
# training
for epoch in range(5000):
    optimizer.zero_grad()
    output = model(input_batch)
    
    # output: [batch_size, n_class] target_batch:[batch_size]
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    
    loss.backward()
    optimizer.step()

predict = model(input_batch).data.max(1, keepdim=True)[1]

# TEST
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])

Epoch: 1000 cost = 0.052230
Epoch: 2000 cost = 0.009138
Epoch: 3000 cost = 0.003213
Epoch: 4000 cost = 0.001457
Epoch: 5000 cost = 0.000741
[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']


In [18]:
print(predict)
print(number_dict)
print(model(input_batch))
# 模型输出向量维度[batch_size, n_class]，本质是词典大小下每个词的概率，取其中最大的作为预测词。

tensor([[6],
        [2],
        [0]])
{0: 'milk', 1: 'love', 2: 'coffee', 3: 'like', 4: 'i', 5: 'hate', 6: 'dog'}
tensor([[ 1.8795, -2.3168,  2.7281, -2.2479, -1.5696, -2.4741, 10.0732],
        [-0.0236, -1.8379,  9.1519, -1.1050, -1.5363, -0.7063,  1.5923],
        [ 9.9022, -2.0445,  0.6372, -0.7818, -0.6095, -0.3977,  1.9202]],
       grad_fn=<AddBackward0>)
