# model

In [1]:
import torch
import torch.nn as nn
class DiRNN(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> per-token vocabulary logits.

    The LSTM is created with ``bidirectional=False``, so every time step yields
    ``num_hiddens`` features (not ``2 * num_hiddens``), and the linear decoder is
    applied to every time step rather than to selected hidden states only.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout_rate):
        super().__init__()
        # Sub-modules are created in the order embedding, encoder, decoder.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        lstm_options = dict(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            dropout=dropout_rate,
            bidirectional=False,
        )
        self.encoder = nn.LSTM(**lstm_options)
        # One logit per vocabulary entry for each hidden state.
        self.decoder = nn.Linear(num_hiddens, vocab_size)

    def forward(self, inputs):
        """Return logits of shape (batch * seq_len, vocab_size).

        ``inputs`` holds token ids laid out as (batch, seq_len). The LSTM is used
        with the sequence dimension first, so the ids are transposed before the
        embedding lookup and the hidden states are transposed back afterwards.
        Rows of the result are batch-major: all time steps of sample 0 come
        first, then those of sample 1, and so on.
        """
        # (batch, seq_len) -> (seq_len, batch) -> (seq_len, batch, embed_size)
        embeddings = self.embedding(inputs.transpose(0, 1))
        # Only the per-step outputs of the top layer are used; (h, c) is dropped.
        hidden_states, _ = self.encoder(embeddings)
        # (seq_len, batch, num_hiddens) -> (batch, seq_len, num_hiddens)
        batch_first = hidden_states.transpose(0, 1)
        # Merge batch and time so the decoder sees one row per token.
        flat = batch_first.reshape(-1, batch_first.size(2))
        return self.decoder(flat)

# data

In [2]:
import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

In [3]:
class Corpus(object):
    """WikiText-2 wrapper that builds a vocabulary and pre-batched split tensors.

    After construction ``train_data``, ``val_data`` and ``test_data`` are 2-D
    tensors produced by :meth:`batchify`, and ``vocab`` / ``tokenizer`` are
    available for encoding further text.
    """

    def __init__(self, train_batch_size=20, eval_batch_size=10, bptt=35):
        self.bptt = bptt

        # First pass: count tokens of the training split to build the vocabulary.
        vocab_source = WikiText2(split='train')
        self.tokenizer = get_tokenizer('basic_english')
        token_counts = Counter()
        for raw_line in vocab_source:
            token_counts.update(self.tokenizer(raw_line))
        self.vocab = Vocab(token_counts)

        # Second pass: fresh iterators for all three splits, each encoded into
        # one flat tensor of token ids.
        train_raw, val_raw, test_raw = WikiText2()
        train_flat = self.data_process(train_raw)
        val_flat = self.data_process(val_raw)
        test_flat = self.data_process(test_raw)

        self.train_data = self.batchify(train_flat, train_batch_size)
        self.val_data = self.batchify(val_flat, eval_batch_size)
        self.test_data = self.batchify(test_flat, eval_batch_size)

    def data_process(self, raw_text_iter):
        """Encode every item of ``raw_text_iter`` and concatenate the results.

        Items that tokenize to nothing are dropped before concatenation.
        """
        encoded = []
        for item in raw_text_iter:
            ids = torch.tensor([self.vocab[token] for token in self.tokenizer(item)],
                               dtype=torch.long)
            if ids.numel() > 0:
                encoded.append(ids)
        return torch.cat(tuple(encoded))

    def batchify(self, data, batch_size):
        """Reshape a flat tensor into ``batch_size`` parallel columns.

        Trailing elements that do not fill a complete row are discarded. The
        result has shape (len(data) // batch_size, batch_size) and is moved to
        CUDA when available, otherwise to the CPU.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Number of elements that divide evenly across the batch_size columns.
        usable = (data.size(0) // batch_size) * batch_size
        columns = data[:usable].view(batch_size, -1)
        return columns.t().contiguous().to(device)

    def get_batch(self, source, i):
        """Return (data, target) for the window of at most ``bptt`` rows at ``i``.

        ``target`` is the same window shifted by one row and flattened.
        """
        remaining = len(source) - 1 - i
        seq_len = min(self.bptt, remaining)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].reshape(-1)
        return data, target

    def get_ntokens(self):
        """Return the number of entries in the vocabulary's ``stoi`` mapping."""
        return len(self.vocab.stoi)

# main

In [4]:
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model
class args:
    """Static run configuration, accessed as ``args.<name>`` throughout the notebook."""

    # --- model ---
    emsize = 100        # embedding width handed to DiRNN
    nhid = 200          # LSTM hidden units per layer
    nlayers = 5         # number of stacked LSTM layers
    dropout = 0.5       # dropout rate handed to the LSTM
    tied = False        # not referenced by the code in this notebook

    # --- optimisation ---
    lr = 2              # learning rate given to the SGD optimizer
    epochs = 1000       # number of passes of the outer training loop
    batch_size = 32     # used for both the training and evaluation splits
    bptt = 256          # maximum window length taken by get_batch

    # --- run settings ---
    seed = 1234         # not referenced by the code in this notebook
    cuda = True         # request moving the model to the GPU
    log_interval = 100  # only referenced from commented-out logging code
    save = 'model.pt'   # not referenced by the code in this notebook

In [5]:
# Build the corpus once. The same batch size is used for the training and the
# evaluation splits, and bptt is stored on the Corpus instance.
data_loader = Corpus(train_batch_size=args.batch_size,
                     eval_batch_size=args.batch_size,
                     bptt=args.bptt)

In [6]:
# Module-level aliases for the pre-batched split tensors; train() and the
# epoch loop read these globals directly.
train_data = data_loader.train_data
val_data = data_loader.val_data
test_data = data_loader.test_data

In [12]:
def get_batch(source, i, evaluation=False, bptt=None):
    """Cut one (input, target) window out of a batchified tensor.

    Args:
        source: 2-D tensor with time steps along dim 0 and batch columns along
            dim 1 (the layout produced by ``Corpus.batchify``).
        i: index of the first time step of the window.
        evaluation: unused; kept only so existing callers keep working.
        bptt: maximum window length. Defaults to ``args.bptt`` when omitted.

    Returns:
        ``(data, target)`` where ``data`` has shape (batch, seq_len) and
        ``target`` is the window shifted by one time step, transposed the same
        way and flattened to (batch * seq_len,), matching the batch-major row
        order of the model output.
    """
    if bptt is None:
        bptt = args.bptt
    # The last window may be shorter: one row must remain for the shifted target.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len].t()
    target = source[i + 1:i + 1 + seq_len].t().reshape(-1)
    return data, target

In [13]:
import torch
# Sanity check: print the shapes and a few values of the first three training
# batches. Note that this bare expression is not displayed, because only the
# last expression of a cell is echoed.
torch.__version__
print(train_data.size(0))
for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
    data, targets = get_batch(train_data, i)
    print(data.shape,targets.shape,data[-1],targets[-3])
    print(data.requires_grad)
    # Stop after batches 0, 1 and 2.
    if (batch>=2): break

64062
torch.Size([32, 256]) torch.Size([8192]) tensor([ 3878,     9,  1953,    20,    81,   314,  1054,     6,     2,   207,
            4,    20,  1878,     3,     2,  1503,   314,    32,  1062,    44,
         3783,     3, 16499,     2,   154,   123,   150,  1985,     6,  3074,
          599, 16329,   259,  7098,  6385,     0,     4,    43,    45,  4006,
            9,  2594,     6, 10097,   598,     3,   249,     2,     0,  3184,
            3,    33,    11,     9,   724, 11309,   832,     4,    39,    14,
          269,   205,     3,  1878,     3,  3638,    41,    40,  6334,    62,
           82,   188,    23,     3,    23,   101,  6286,    41,  1084,     9,
          185,  1295,     7, 23326,  1970,     4,     2,   913,    26,     2,
         1503,   314,    11,  7228,     3,   588,     8, 22864,  1185,     7,
           31,   687,     8,     0,     2,   317,   441,    12,    16,  2693,
         2451,     7,  3638,     4,    14,   276,   343,     3,  1878,     3,
            2,  1

In [9]:
# Vocabulary size, counted as the number of index-to-token entries.
len(data_loader.vocab.itos)

28783

In [10]:
# The vocabulary size sets both the embedding table height and the decoder width.
ntokens = len(data_loader.vocab.itos)
# Instantiate the LSTM language model defined above.
# NOTE: this rebinds the name `model`, replacing the module object bound by the
# earlier `import model` statement.
model = DiRNN(ntokens, args.emsize, args.nhid, args.nlayers, args.dropout)

# GloVe

In [11]:
import os
# Import the module under its own alias. Binding it to the name `Vocab` would
# shadow the `Vocab` class imported earlier, which Corpus.__init__ calls, so
# re-creating a Corpus after this cell would fail.
import torchtext.vocab as tt_vocab
# 100-dimensional GloVe 6B vectors, cached under data/glove.
glove_vocab = tt_vocab.GloVe(name='6B', dim=100, cache=os.path.join("data","glove"))

In [12]:
# Quick look at the GloVe table: number of entries and the row index of one token.
len(glove_vocab.itos), glove_vocab.stoi['999th']

(400000, 265062)

In [13]:
# Show the first twelve vocabulary entries as "index token".
for i, word in enumerate(data_loader.vocab.itos[:12]):
    print(i, word)

0 <unk>
1 <pad>
2 the
3 ,
4 .
5 of
6 and
7 in
8 to
9 a
10 =
11 was


In [23]:
def load_pretrained_embedding(l, pretrained_vocab, words=None):
    """Build an embedding matrix for ``words`` from a pretrained vector table.

    Args:
        l: number of rows of the returned matrix (the target vocabulary size).
        pretrained_vocab: object exposing ``stoi`` (token -> row index) and
            ``vectors`` (2-D tensor holding one vector per row).
        words: iterable of tokens; token number ``i`` fills row ``i``. Defaults
            to ``data_loader.vocab.itos`` so existing two-argument calls behave
            as before.

    Returns:
        Tensor of shape (l, vector_dim). Rows of tokens missing from
        ``pretrained_vocab.stoi`` stay zero; the number of such tokens is
        printed when it is non-zero.
    """
    if words is None:
        words = data_loader.vocab.itos
    # Take the vector width from the table itself instead of indexing the
    # vocabulary object with a dummy key.
    embed = torch.zeros(l, pretrained_vocab.vectors.shape[1])
    oov_count = 0  # tokens without a pretrained vector
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            oov_count += 1
        else:
            embed[i, :] = pretrained_vocab.vectors[idx]
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

# Overwrite the embedding table with the pretrained vectors, one row per
# vocabulary entry.
model.embedding.weight.data.copy_(load_pretrained_embedding(ntokens, glove_vocab))
# requires_grad = True keeps the embedding trainable, so the loaded vectors are
# updated during training.
# NOTE(review): the previous comment here said the pretrained weights need no
# updating, which would correspond to requires_grad = False — confirm intent.
model.embedding.weight.requires_grad = True

There are 1028 oov words.


In [15]:
# Move the model to the GPU only when one is actually present. An unconditional
# .to('cuda') raises on CPU-only machines, while Corpus.batchify already falls
# back to the CPU for the data tensors in that case.
if args.cuda and torch.cuda.is_available():
    model = model.to(torch.device('cuda'))

In [16]:
# Switch to evaluation mode and straight back; the net effect is training mode.
# As the last expression of the cell, model.train() also makes the notebook
# display the module structure.
model.eval()
model.train()

DiRNN(
  (embedding): Embedding(28783, 100)
  (encoder): LSTM(100, 200, num_layers=5, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=28783, bias=True)
)

In [24]:
# Cross-entropy between the per-token logits and the flattened target ids.
criterion = nn.CrossEntropyLoss()
# Plain SGD over all model parameters with the configured learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

In [18]:
def evaluate(data_source):
    """Compute the token-weighted mean loss and perplexity on ``data_source``.

    Args:
        data_source: batchified 2-D tensor, walked in windows of ``args.bptt``
            rows via ``get_batch``.

    Returns:
        ``(mean_loss, perplexity)`` as Python floats, where
        ``perplexity = exp(mean_loss)``.

    Raises:
        ZeroDivisionError: if ``data_source`` yields no batch at all.
    """
    model.eval()
    total_loss = 0.0
    total_words = 0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i)
            output = model(data)
            loss = criterion(output, targets)

            # Weight each batch loss by its token count so that a shorter
            # final window does not skew the average.
            n_words = targets.shape[0]
            total_loss += loss.item() * n_words
            total_words += n_words
    mean_loss = total_loss / total_words
    return mean_loss, math.exp(mean_loss)

def train():
    """Run one pass over ``train_data`` and report the mean loss and perplexity.

    Each window of ``args.bptt`` rows is one optimisation step. The loss is
    accumulated weighted by the number of target tokens in the window.

    Returns:
        ``(mean_loss, perplexity)`` as Python floats, where
        ``perplexity = exp(mean_loss)``.
    """
    model.train()
    total_loss = 0.0
    total_words = 0
    for i in range(0, train_data.size(0) - 1, args.bptt):
        data, targets = get_batch(train_data, i)
        # The optimizer was built from model.parameters(), so clearing its
        # gradients covers every parameter of the model.
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        # .item() yields a plain float, so the running total neither lives on
        # the GPU nor stays attached to any tensor.
        n_words = targets.shape[0]
        total_loss += loss.item() * n_words
        total_words += n_words

    cur_loss = total_loss / total_words
    print('train {:10d} words | loss {:5.5f} | ppl {:5.5}'.format(total_words, cur_loss, math.exp(cur_loss)))
    return cur_loss, math.exp(cur_loss)

In [19]:
import numpy as np
# Per-epoch histories, appended to by the training loop. The loop stores the
# perplexity values returned by train() and evaluate() in these lists.
Losstrain = []
Lossval = []
Losstest = []

In [25]:
import time
for T in range(args.epochs):
    # Header line with the epoch number and the current local time.
    print('Round : ',T,"  ",time.strftime('%Y-%m-%d %H:%M:%S'))

    # One training pass, then evaluation on the validation and test splits.
    trainl, trainp = train()
    vall, valp = evaluate(val_data)
    testl, testp = evaluate(test_data)

    # Record the perplexities and rewrite the history files after every epoch,
    # so that an interrupted run still leaves its curves on disk.
    Losstrain.append(trainp)
    Lossval.append(valp)
    Losstest.append(testp)
    np.save('model-glove-train', np.asarray(Losstrain))
    np.save('model-glove-val', np.asarray(Lossval))
    np.save('model-glove-test', np.asarray(Losstest))

    print('  valid {:5.5f} | test {:5.5f}'.format(valp, testp))
    print('-'*80)

Round :  0    2021-05-19 09:50:35
train    2049952 words | loss 6.38019 | ppl 590.04
  valid 454.11489 | test 425.52069
--------------------------------------------------------------------------------
Round :  1    2021-05-19 09:52:42
train    2049952 words | loss 6.33356 | ppl 563.16
  valid 508.25429 | test 472.89789
--------------------------------------------------------------------------------
Round :  2    2021-05-19 09:54:48
train    2049952 words | loss 6.31754 | ppl 554.21
  valid 438.05496 | test 410.52348
--------------------------------------------------------------------------------
Round :  3    2021-05-19 09:56:55
train    2049952 words | loss 6.30438 | ppl 546.96
  valid 475.06683 | test 442.55797
--------------------------------------------------------------------------------
Round :  4    2021-05-19 09:59:01
train    2049952 words | loss 6.29037 | ppl 539.35
  valid 417.35248 | test 389.60191
----------------------------------------------------------------------------

KeyboardInterrupt: 