In [1]:
import time
import math
import os
import torch.nn as nn

import torch.onnx
import torch
from torch.autograd import Variable
import torch.optim as optim

import model as rnn_model
import data

In [2]:
args_seed = 1234
args_temperature = 1.
args_data = '../data/wikitext-2'
args_model = 'BiLSTM'
args_emsize = 650
args_nhid = 650
args_nlayers = 2
args_clip = 0.25
args_epochs = 40
args_batch_size = 20
args_bptt = 35
args_dropout = 0.2
args_log_interval = 200
args_save = 'model_800_bilstm.pt'
args_save_state = 'model_state_800_bilstm.pt'
args_tied = True

In [3]:
torch.manual_seed(args_seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f'using device: {device}')

using device: cuda


In [4]:
###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args_data)

In [5]:
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, args_batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [6]:
###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = rnn_model.RNNModel(args_model, ntokens, args_emsize, args_nhid, args_nlayers, args_dropout, args_tied).to(device)

criterion = nn.CrossEntropyLoss()



In [7]:
###############################################################################
# Use Adam optimizer
###############################################################################

# create your optimizer
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [8]:
###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [9]:
# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    seq_len = min(args_bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args_bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)


def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args_batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args_bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()   # zero the gradient buffers
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        optimizer.step()    # Does the update
        total_loss += loss.item()

        if batch % args_log_interval == 0 and batch > 0:
            cur_loss = total_loss / args_log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args_bptt, 
                elapsed * 1000 / args_log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


In [None]:
# Loop over epochs.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, args_epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args_save, 'wb') as f:
                torch.save(model, f)
            ## Save State Dictionary
            with open(args_save_state, 'wb') as f:
                torch.save(model.state_dict(), f)
            best_val_loss = val_loss

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

| epoch   1 |   200/ 2983 batches | ms/batch 173.20 | loss  9.11 | ppl  9072.09
| epoch   1 |   400/ 2983 batches | ms/batch 159.34 | loss  9.89 | ppl 19745.22
| epoch   1 |   600/ 2983 batches | ms/batch 159.42 | loss 11.06 | ppl 63393.51
| epoch   1 |   800/ 2983 batches | ms/batch 157.12 | loss 11.49 | ppl 98058.75
| epoch   1 |  1000/ 2983 batches | ms/batch 160.20 | loss 11.43 | ppl 92060.07
| epoch   1 |  1200/ 2983 batches | ms/batch 159.26 | loss 12.25 | ppl 208559.52
| epoch   1 |  1400/ 2983 batches | ms/batch 157.40 | loss 13.28 | ppl 584499.78
| epoch   1 |  1600/ 2983 batches | ms/batch 159.97 | loss 13.32 | ppl 611625.56
| epoch   1 |  1800/ 2983 batches | ms/batch 167.79 | loss 12.95 | ppl 419242.55
| epoch   1 |  2000/ 2983 batches | ms/batch 169.17 | loss 13.17 | ppl 525054.45
| epoch   1 |  2200/ 2983 batches | ms/batch 157.24 | loss 13.37 | ppl 639367.40
| epoch   1 |  2400/ 2983 batches | ms/batch 157.13 | loss 13.90 | ppl 1089058.14
| epoch   1 |  2600/ 2983 batche

In [None]:
# Load the best saved model.
with open(args_save, 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)


# Try to generate with this model now

In [None]:
args_data = '../data/wikitext-2'
args_checkpoint = args_save #'./model.pt'
args_state_dict = args_save_state # './model_state.pt'
args_outf = 'generated.txt'
args_words = 1000
args_seed = 1234
args_temperature = 1.0
args_log_interval = 100

In [None]:
# Set the random seed manually for reproducibility.
torch.manual_seed(args_seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

In [None]:
if args_temperature < 1e-3:
    print("args_temperature has to be greater or equal 1e-3")


In [None]:
corpus = data.Corpus(args_data)
ntokens = len(corpus.dictionary)

In [None]:
with open(args_checkpoint, 'rb') as f:
    model = torch.load(f).to(device)
# model = rnn_model.RNNModel(args_model, ntokens, args_emsize, args_nhid, args_nlayers, args_dropout, args_tied).to(device)
# with open(args_state_dict, 'rb') as f:
#     state_dict = torch.load(f)
#     model.load_state_dict(state_dict)

In [None]:
hidden = model.init_hidden(1)
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

In [None]:
model.eval()

In [None]:
with open(args_outf, 'w') as outf:
    with torch.no_grad():  # no tracking history
        for i in range(args_words):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(args_temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]

            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % args_log_interval == 0:
                print('| Generated {}/{} words'.format(i, args_words))

In [None]:
model.state_dict()

## What do the embeddings look like?

In [None]:
import scipy
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
model.state_dict().keys()

In [None]:
model.state_dict()['encoder.weight'].shape

In [None]:
embeds = model.state_dict()['encoder.weight']

In [None]:
np.array(list(corpus.dictionary.word2idx.keys()))

In [None]:
some_words = ['flour', 'water', 'bread', 'coffee', 'espresso', 'driving', 'car', 'horse', 'chicken', 'bird', 'cow', 'leg']
some_words

In [None]:
some_idxs = [corpus.dictionary.word2idx[word] for word in some_words]

In [None]:
print(embeds[some_idxs].shape)
np.array(embeds[some_idxs])

In [None]:
tsne = TSNE(n_components=2, random_state=123)
#np.set_printoptions(suppress=True)
Y = tsne.fit_transform(np.array(embeds[some_idxs]))


In [None]:
x_coords = Y[:, 0]
y_coords = Y[:, 1]
# display scatter plot
plt.scatter(x_coords, y_coords)

for label, x, y in zip(some_words, x_coords, y_coords):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
plt.show()

Well, this is still somewhat terrible. The words seem to be a bit better now, looks like some sentences are there, but this model has hit its limits and it doesn't appear that more training time will help at all.