# Word-level language modeling RNN
- https://github.com/pytorch/examples/tree/master/word_language_model

In [131]:
import os
import time
import math
import numpy as np
import torch
import torch.nn as nn

In [32]:
# Run configuration shared by the cells below.
seed = 1111                  # value handed to torch.manual_seed
cuda = False                 # when True the 'cuda' device is selected, otherwise 'cpu'
data = './data/wikitext-2'   # directory that Corpus reads train.txt / valid.txt / test.txt from
batch_size = 20              # number of parallel sequences in the training batches

In [4]:
# Seed PyTorch's RNG and choose the device according to the `cuda` flag.
torch.manual_seed(seed)
device = torch.device('cuda' if cuda else 'cpu')

In [5]:
device

device(type='cpu')

In [16]:
class Dictionary(object):
    """Two-way vocabulary: ``word2idx`` maps word -> index, ``idx2word`` maps index -> word."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it has not been seen yet and return its index."""
        index = self.word2idx.get(word)
        if index is None:
            # New word: it takes the next free slot at the end of idx2word.
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        return index

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.idx2word)

In [17]:
class Corpus(object):
    """Token-id tensors for the train/valid/test splits, sharing one vocabulary.

    ``path`` is a directory containing ``train.txt``, ``valid.txt`` and
    ``test.txt``. The splits are tokenized in that order, so word indices are
    assigned in order of first appearance across train, then valid, then test.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenize a UTF-8 text file into a 1-D int64 tensor of word indices.

        Each line is split on whitespace and terminated with an ``'<eos>'``
        token. Words not yet in ``self.dictionary`` are added to it.

        Args:
            path: path of the text file to read.

        Returns:
            A 1-D ``torch.int64`` tensor with one index per token (empty for an
            empty file).

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        # Explicit check instead of `assert`, which is skipped under `python -O`.
        if not os.path.exists(path):
            raise FileNotFoundError('corpus file not found: {}'.format(path))

        # Single pass: add_word returns the index of the (possibly new) word,
        # so the file does not need to be read a second time.
        add_word = self.dictionary.add_word
        ids = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                ids.extend(add_word(word) for word in line.split() + ['<eos>'])

        return torch.tensor(ids, dtype=torch.int64)

In [18]:
# Tokenize the three splits found under `data` and build the shared vocabulary.
corpus = Corpus(data)

In [25]:
corpus.train[:10]

tensor([ 0,  1,  2,  3,  4,  1,  0,  0,  5,  6])

In [29]:
[corpus.dictionary.idx2word[i] for i in corpus.train[:100]]

['<eos>',
 '=',
 'Valkyria',
 'Chronicles',
 'III',
 '=',
 '<eos>',
 '<eos>',
 'Senjō',
 'no',
 'Valkyria',
 '3',
 ':',
 '<unk>',
 'Chronicles',
 '(',
 'Japanese',
 ':',
 '戦場のヴァルキュリア3',
 ',',
 'lit',
 '.',
 'Valkyria',
 'of',
 'the',
 'Battlefield',
 '3',
 ')',
 ',',
 'commonly',
 'referred',
 'to',
 'as',
 'Valkyria',
 'Chronicles',
 'III',
 'outside',
 'Japan',
 ',',
 'is',
 'a',
 'tactical',
 'role',
 '@-@',
 'playing',
 'video',
 'game',
 'developed',
 'by',
 'Sega',
 'and',
 'Media.Vision',
 'for',
 'the',
 'PlayStation',
 'Portable',
 '.',
 'Released',
 'in',
 'January',
 '2011',
 'in',
 'Japan',
 ',',
 'it',
 'is',
 'the',
 'third',
 'game',
 'in',
 'the',
 'Valkyria',
 'series',
 '.',
 '<unk>',
 'the',
 'same',
 'fusion',
 'of',
 'tactical',
 'and',
 'real',
 '@-@',
 'time',
 'gameplay',
 'as',
 'its',
 'predecessors',
 ',',
 'the',
 'story',
 'runs',
 'parallel',
 'to',
 'the',
 'first',
 'game',
 'and',
 'follows',
 'the']

In [58]:
def batchify(data, bsz):
    """Arrange a 1-D token tensor into ``bsz`` parallel columns.

    Trailing tokens that do not fill a complete row of ``bsz`` are dropped.
    The sequence is cut into ``bsz`` consecutive pieces and each piece becomes
    one column, so the result has shape ``(data.size(0) // bsz, bsz)`` and is
    moved to the module-level ``device``.
    """
    rows = data.size(0) // bsz
    usable = data.narrow(0, 0, rows * bsz)
    # view(bsz, -1) puts one consecutive piece per row; transposing makes each
    # piece a column, so walking down dim 0 walks forward in the text.
    columns = usable.view(bsz, -1).t().contiguous()
    return columns.to(device)

In [59]:
# Batch width used for the validation and test splits (training uses batch_size).
eval_batch_size = 10
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
# Each result is laid out as (time steps, batch columns).
print(train_data.size())
print(val_data.size())
print(test_data.size())

torch.Size([104431, 20])
torch.Size([21764, 10])
torch.Size([24556, 10])


In [60]:
[corpus.dictionary.idx2word[i] for i in train_data[:, 0][:10]]

['<eos>',
 '=',
 'Valkyria',
 'Chronicles',
 'III',
 '=',
 '<eos>',
 '<eos>',
 'Senjō',
 'no']

In [91]:
# Toy illustration of the reshape used in batchify: 20 consecutive values end
# up as 4 columns holding 5 consecutive values each.
xx = torch.from_numpy(np.arange(20))
xx = xx.view(4, 5).t().contiguous()
xx

tensor([[  0,   5,  10,  15],
        [  1,   6,  11,  16],
        [  2,   7,  12,  17],
        [  3,   8,  13,  18],
        [  4,   9,  14,  19]])

In [64]:
# Vocabulary size; used as the embedding input size and decoder output size.
ntokens = len(corpus.dictionary)
ntokens

33278

In [154]:
# Architecture name handed to RNNModel as rnn_type
# ('LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU').
model = 'LSTM'
emsize = 200  # size of word embeddings
nhid = 200    # number of hidden units per layer
nlayers = 2   # number of layers
dropout = 0.2  # dropout probability passed to RNNModel
tied = False   # when True, the decoder shares the embedding weight matrix

In [155]:
class RNNModel(nn.Module):
    """Language model made of an embedding encoder, a recurrent core and a linear decoder.

    Args:
        rnn_type: one of ``'LSTM'``, ``'GRU'``, ``'RNN_TANH'`` or ``'RNN_RELU'``.
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding dimension fed to the recurrent core.
        nhid: hidden units per recurrent layer.
        nlayers: number of stacked recurrent layers.
        dropout: dropout probability applied to the embeddings, between
            recurrent layers, and to the recurrent output.
        tie_weights: if True, the decoder reuses the embedding weight matrix;
            this requires ``nhid == ninp``.

    Raises:
        ValueError: if ``rnn_type`` is not recognised, or if ``tie_weights`` is
            set while ``nhid != ninp``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            # Resolve nn.LSTM / nn.GRU by name.
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            nonlinearities = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}
            if rnn_type not in nonlinearities:
                # Name the offending value and the accepted options so the
                # caller can see what went wrong.
                raise ValueError(
                    "invalid rnn_type {!r}; expected one of "
                    "'LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU'".format(rnn_type))
            self.rnn = nn.RNN(ninp, nhid, nlayers,
                              nonlinearity=nonlinearities[rnn_type], dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            # Sharing one matrix only works when its two dimensions agree.
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Initialise encoder/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
        initrange = 0.1
        # nn.init helpers update the parameters in place without going through `.data`.
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.constant_(self.decoder.bias, 0.0)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        """Run the model over a chunk of token indices.

        Args:
            input: integer tensor of token indices; the first two dimensions of
                the recurrent output are preserved in the result.
            hidden: recurrent state as produced by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(decoded, hidden)`` where ``decoded`` has the recurrent output's
            first two dimensions followed by one score per vocabulary entry,
            and ``hidden`` is the updated recurrent state.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        steps, batch, width = output.size(0), output.size(1), output.size(2)
        # Flatten the first two dimensions for the linear layer, then restore them.
        decoded = self.decoder(output.view(steps * batch, width))
        return decoded.view(steps, batch, decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """Return an all-zero initial state for a batch of ``bsz`` sequences.

        The state is created from a model parameter, so it matches that
        parameter's dtype and device. LSTM gets a pair of tensors, other
        types a single tensor, each shaped ``(nlayers, bsz, nhid)``.
        """
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            # (num_layers, batch, hidden_size)
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

In [156]:
# getattr(nn, name) looks a layer class up by its name; RNNModel uses the same
# lookup to select nn.LSTM or nn.GRU from its rnn_type string.
print(getattr(nn, 'LSTM'))
print(getattr(nn, 'GRU'))
print(getattr(nn, 'Conv2d'))

<class 'torch.nn.modules.rnn.LSTM'>
<class 'torch.nn.modules.rnn.GRU'>
<class 'torch.nn.modules.conv.Conv2d'>


In [157]:
# NOTE: this rebinds `model` from the architecture-name string to the module
# instance. Re-running this cell without first re-running the hyperparameter
# cell passes the module itself as rnn_type, which RNNModel rejects with a
# ValueError.
model = RNNModel(model, ntokens, emsize, nhid, nlayers, dropout, tied).to(device)

In [158]:
model

RNNModel(
  (drop): Dropout(p=0.2)
  (encoder): Embedding(33278, 200)
  (rnn): LSTM(200, 200, num_layers=2, dropout=0.2)
  (decoder): Linear(in_features=200, out_features=33278, bias=True)
)

In [159]:
# Loss applied by train() and evaluate() to the flattened decoder scores and
# the flattened next-token targets.
criterion = nn.CrossEntropyLoss()

In [212]:
bptt = 35
def get_batch(source, i):
    """Cut one input chunk and its next-step targets out of ``source``.

    Starting at row ``i``, at most ``bptt`` rows are taken (fewer near the end,
    because the last row of ``source`` can only serve as a target).

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted one row forward, flattened to 1-D.
    """
    # Rows available as inputs from position i, keeping one row back so that
    # every input row has a successor.
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt <= rows_left else rows_left
    stop = i + seq_len
    inputs = source[i:stop]
    # The model should output the element that follows each position.
    # view(-1) flattens the targets into the 1-D tensor used when computing the loss.
    labels = source[i + 1:stop + 1].view(-1)
    return inputs, labels

In [201]:
# Toy check of get_batch on the 5x4 example tensor.
xx = torch.from_numpy(np.arange(20))
xx = xx.view(4, 5).t().contiguous()
# NOTE: this rebinds the module-level bptt that get_batch reads, so it stays 3
# until a later cell sets it again.
bptt = 3
print(xx)
# NOTE: this also rebinds the global `data`, which earlier held the dataset path.
data, target = get_batch(xx, 3)
print(data)
print(target)

tensor([[  0,   5,  10,  15],
        [  1,   6,  11,  16],
        [  2,   7,  12,  17],
        [  3,   8,  13,  18],
        [  4,   9,  14,  19]])
tensor([[  3,   8,  13,  18]])
tensor([[  4,   9,  14,  19]])


In [230]:
# Chunk length read by get_batch / train / evaluate; also restores the value
# after the toy get_batch cell set it to 3.
bptt = 35
clip = 0.25  # gradient clipping
log_interval = 200  # train() prints a progress line every this many batches

def repackage_hidden(h):
    """Return the hidden state(s) detached from the graph that produced them.

    A tensor is returned as ``h.detach()``; any other value is treated as an
    iterable of hidden states and converted, recursively, into a tuple of
    detached states.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

def train():
    """Run one pass of truncated-BPTT training over ``train_data``.

    Reads module-level state: ``model``, ``corpus``, ``criterion``,
    ``train_data``, ``batch_size``, ``bptt``, ``clip``, ``lr``,
    ``log_interval`` and ``epoch`` (the latter only for the progress line).
    Parameters are updated in place with plain SGD after gradient-norm
    clipping; nothing is returned.
    """
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)

        # The sequence is processed chunk by chunk. Each chunk starts from the
        # hidden state left by the previous one, but gradients must not flow
        # back into earlier chunks, so the carried state is detached first.
        hidden = repackage_hidden(hidden)

        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Plain SGD step. Done under no_grad with an explicit product rather
        # than the deprecated positional-scalar form add_(-lr, grad);
        # parameters that received no gradient are skipped.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p -= lr * p.grad

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            # NOTE: the first window holds log_interval + 1 losses (batches 0
            # through log_interval), so its reported average is slightly high.
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval,
                cur_loss,
                math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [231]:
def evaluate(data_source):
    """Return the mean per-time-step loss of ``model`` on ``data_source``.

    Args:
        data_source: 2-D tensor laid out as (time steps, batch columns), as
            produced by ``batchify``.

    Returns:
        The average of the per-chunk criterion values, weighted by the number
        of time steps in each chunk.

    Raises:
        ValueError: if ``data_source`` has fewer than two time steps, so no
            (input, target) pair exists.

    Reads module-level ``model``, ``corpus``, ``criterion`` and ``bptt``.
    """
    model.eval()
    total_loss = 0.0
    total_steps = 0
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data itself rather than from a global.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            # criterion averages within the chunk; weight by chunk length so
            # the shorter final chunk is not over-represented.
            total_loss += len(data) * criterion(output_flat, targets).item()
            total_steps += len(data)
            hidden = repackage_hidden(hidden)
    if total_steps == 0:
        raise ValueError('data_source needs at least two time steps to evaluate')
    # Divide by the number of steps actually scored (len(data_source) - 1);
    # the last row is only ever a target.
    return total_loss / total_steps

In [232]:
lr = 20                # SGD step size read by train(); divided by 4 by the training loop when validation loss does not improve
epochs = 40            # upper bound on the number of training epochs
best_val_loss = None   # lowest validation loss seen so far (None until the first epoch finishes)
save = 'model.pt'      # file the best model is written to

In [233]:
# Train for up to `epochs` epochs; Ctrl+C stops training early without raising.
try:
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Compare against None explicitly: a truthiness test would also treat
        # a best loss of 0.0 as "not set yet".
        if best_val_loss is None or val_loss < best_val_loss:
            # New best validation loss: checkpoint the whole model object.
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # No improvement: shrink the module-level step size that train() reads.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

-----------------------------------------------------------------------------------------
Exiting from training early
