# Initialization

In [1]:
import os
from io import open
import torch
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

In [2]:
class Config:
    def __init__(self):
        # training config
        self.data = 'data/Grimm_text.txt' # path to data text
        self.val_frac = 0.1 # fraction of validation
        self.model = 'GRU' # type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
        self.emsize = 200 # size of word embeddings
        self.nhid = 200 # number of hidden units per layer
        self.nlayers = 2 # number of layers
        self.lr = 20 # initial learning rate
        self.clip = 0.25 # gradient clipping
        self.epochs = 40 # upper epochs limit
        self.batch_size = 20 # batch size
        self.bptt = 35 # sequence length
        self.dropout = 0.2 # dropout applied to layers (0 = no dropout)
        self.seed = 1111 # random seed
        self.log_interval = 200 # report interval
        self.save = 'model/WordRNN.pt' # path to save the final model
        
        # generation config
        self.words = 1000 # number of words to generate
        self.temperature = 1.0 # temperature for generation - higher will increase diversity
        self.outf = 'output/generated.txt' # output file for generated text

In [3]:
args = Config()

In [4]:
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

<torch._C.Generator at 0x174dbea3db0>

In [5]:
# Check if GPU is available
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda" if train_on_gpu else "cpu")
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


# Data Loading

In [6]:
# maintain mapping between words and indices
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [7]:
# data loader that reads input text, splits into train and validation and maintain word indices
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        # Open text file and read in data as `text`
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        val_idx = int(len(lines)*(1-args.val_frac))
        train_lines, val_lines = lines[:val_idx], lines[val_idx:]
        self.train = self.tokenize(train_lines)
        self.valid = self.tokenize(val_lines)

    def tokenize(self, lines):
        tokens = 0
        for line in lines:
            words = line.split() + ['<eos>']
            tokens += len(words)
            for word in words:
                self.dictionary.add_word(word)

        ids = torch.LongTensor(tokens)
        token = 0
        for line in lines:
            words = line.split() + ['<eos>']
            for word in words:
                ids[token] = self.dictionary.word2idx[word]
                token += 1

        return ids

In [8]:
# load data
corpus = Corpus(args.data)

In [9]:
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

In [10]:
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)

# Model Definition

In [13]:
class WordRNN(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(WordRNN, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)


# Model Training

In [14]:
ntokens = len(corpus.dictionary)
model = WordRNN(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout).to(device)

In [15]:
criterion = nn.CrossEntropyLoss()

In [16]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [17]:
# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [18]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)

In [19]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [20]:
# Loop over epochs.
lr = args.lr
best_val_loss = None

In [21]:
# At any point you can use Kernel->Interrupt break out of training early.
try:
    for epoch in range(1, args.epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

| epoch   1 |   200/  393 batches | lr 20.00 | ms/batch 31.28 | loss  8.66 | ppl  5744.58
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 14.82s | valid loss  6.41 | valid ppl   607.68
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/  393 batches | lr 20.00 | ms/batch 29.18 | loss  5.89 | ppl   360.83
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 11.87s | valid loss  5.65 | valid ppl   284.40
-----------------------------------------------------------------------------------------
| epoch   3 |   200/  393 batches | lr 20.00 | ms/batch 28.67 | loss  5.20 | ppl   181.84
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 11.71s | valid loss  5.35 | valid ppl   211.42
-----------------------------------------------------------------------------------------
| epoch   4 |   200/  393 batches | lr 20.00 | ms/batch 29.09 | loss  4.90 | ppl   134.87
-----------------------------------------------------------------------------------------
| end of epoch   4 | time: 11.88s | valid loss  5.27 | valid ppl   194.75
----------------------------------------------------------

| epoch  26 |   200/  393 batches | lr 0.00 | ms/batch 36.62 | loss  3.80 | ppl    44.88
-----------------------------------------------------------------------------------------
| end of epoch  26 | time: 15.00s | valid loss  5.04 | valid ppl   154.41
-----------------------------------------------------------------------------------------
| epoch  27 |   200/  393 batches | lr 0.00 | ms/batch 36.65 | loss  3.80 | ppl    44.87
-----------------------------------------------------------------------------------------
| end of epoch  27 | time: 15.15s | valid loss  5.04 | valid ppl   154.41
-----------------------------------------------------------------------------------------
| epoch  28 |   200/  393 batches | lr 0.00 | ms/batch 36.79 | loss  3.81 | ppl    45.06
-----------------------------------------------------------------------------------------
| end of epoch  28 | time: 15.08s | valid loss  5.04 | valid ppl   154.41
-------------------------------------------------------------

In [22]:
# Load the best saved model.
with open(args.save, 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Generation

In [23]:
model.eval()
hidden = model.init_hidden(1)
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

In [24]:
with open(args.outf, 'w') as outf:    
    with torch.no_grad():  # no tracking history
        for i in range(args.words):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(args.temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]

            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % args.log_interval == 0:
                print('| Generated {}/{} words'.format(i, args.words))

| Generated 0/1000 words
| Generated 200/1000 words
| Generated 400/1000 words
| Generated 600/1000 words
| Generated 800/1000 words
