In [1]:
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()

# One seed shared by every random number generator used in this notebook, so a
# fresh "Restart & Run All" is reproducible and the value is changed in one place.
SEED = 53113
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    torch.cuda.manual_seed(SEED)

BATCH_SIZE = 32          # batch_size handed to the BPTT iterators
EMBEDDING_SIZE = 650     # embedding width (`ninp` of RNNModel)
HIDDEN_SIZE = 1000       # recurrent hidden width (`nhid` of RNNModel)
MAX_VOCAB_SIZE = 50000   # `max_size` for build_vocab; special tokens come on top

# Load datasets and create iterators

In [2]:
%%time
from torchtext.data import Field
from torchtext.datasets import LanguageModelingDataset

# Field that lower-cases the text before it is numericalised.
TEXT = Field(lower=True)

# All three split files sit next to each other under data/text8.
train, val, test = LanguageModelingDataset.splits(
    text_field=TEXT,
    path='data/text8',
    train='train.txt',
    validation='dev.txt',
    test='test.txt',
)

# Vocabulary comes from the training split only; the printed size is
# MAX_VOCAB_SIZE plus the special tokens (<unk>, <pad>).
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
print(f'vocabulary size: {len(TEXT.vocab)}')



vocabulary size: 50002
CPU times: user 11.4 s, sys: 1.83 s, total: 13.3 s
Wall time: 13.3 s


In [3]:
# First 50 entries of the index -> token list, followed by a blank separator line.
print(TEXT.vocab.itos[:50], end='\n\n')
# First 50 (token, index) pairs of the reverse mapping.
print(list(TEXT.vocab.stoi.items())[:50])

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero', 'nine', 'two', 'is', 'as', 'eight', 'for', 's', 'five', 'three', 'was', 'by', 'that', 'four', 'six', 'seven', 'with', 'on', 'are', 'it', 'from', 'or', 'his', 'an', 'be', 'this', 'he', 'at', 'which', 'not', 'also', 'have', 'were', 'has', 'but', 'other', 'their', 'its', 'first', 'they', 'had']

[('<unk>', 0), ('<pad>', 1), ('the', 2), ('of', 3), ('and', 4), ('one', 5), ('in', 6), ('a', 7), ('to', 8), ('zero', 9), ('nine', 10), ('two', 11), ('is', 12), ('as', 13), ('eight', 14), ('for', 15), ('s', 16), ('five', 17), ('three', 18), ('was', 19), ('by', 20), ('that', 21), ('four', 22), ('six', 23), ('seven', 24), ('with', 25), ('on', 26), ('are', 27), ('it', 28), ('from', 29), ('or', 30), ('his', 31), ('an', 32), ('be', 33), ('this', 34), ('he', 35), ('at', 36), ('which', 37), ('not', 38), ('also', 39), ('have', 40), ('were', 41), ('has', 42), ('but', 43), ('other', 44), ('their', 45), ('its', 46), ('first', 47), ('they',

In [4]:
from torchtext.data import BPTTIterator

# Final vocabulary size (special tokens included); used to size the model's output layer.
VOCAB_SIZE = len(TEXT.vocab)

# One BPTT iterator per split: each batch is a (bptt_len x batch_size) slice of text
# together with the same slice shifted by one token as the target.
train_iter, val_iter, test_iter = BPTTIterator.splits(
    (train, val, test),
    bptt_len=50,
    batch_size=BATCH_SIZE,
    shuffle=True,
    repeat=False,
    device='cuda' if USE_CUDA else 'cpu',
)



In [5]:
# Show the first batch of each split (train, validation, test) to check shapes and device.
for split_iter in (train_iter, val_iter, test_iter):
    print(next(iter(split_iter)))




[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]

[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]

[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]


#### In a language model dataset, the target is always the next word.

In [6]:
it = iter(train_iter)
batch = next(it)
# Decode column 1 of the input and then of the target: the target is the input shifted by one word.
for column in (batch.text[:, 1], batch.target[:, 1]):
    print(' '.join(TEXT.vocab.itos[i] for i in column.data))

combine in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical forms of matter they had also suggested the possibility
in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical forms of matter they had also suggested the possibility of


#### Each column (slot) of a batch is one continuous text stream: column j of the next batch picks up exactly where column j of the current batch ended.

In [7]:
# For each of the first five columns of the current batch, print the column
# index and the decoded words: first for the input, then for the target.
for j in range(5):
    for field in (batch.text, batch.target):
        print(j)
        print(' '.join(TEXT.vocab.itos[i] for i in field[:, j].data))

0
anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the
0
originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization
1
combine in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical forms of matter they had also suggested the possibility
1
in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels w

In [8]:
# Pull five consecutive batches and decode column 2 of each (input, then target),
# so the continuation of one text stream across batches can be read off.
for j in range(5):
    batch = next(it)
    for field in (batch.text, batch.target):
        print(j)
        print(' '.join(TEXT.vocab.itos[i] for i in field[:, 2].data))

0
reject that the relationship goes beyond contact i e mutual borrowing of words between japanese and ainu in fact no attempt to show a relationship with ainu to any other language has gained wide acceptance and ainu is currently considered to be a language isolate culture traditional ainu culture is
0
that the relationship goes beyond contact i e mutual borrowing of words between japanese and ainu in fact no attempt to show a relationship with ainu to any other language has gained wide acceptance and ainu is currently considered to be a language isolate culture traditional ainu culture is quite
1
quite different from japanese culture never shaving after a certain age the men had full beards and <unk> men and women alike cut their hair level with the shoulders at the sides of the head but trimmed it <unk> behind the women tattooed their mouths arms <unk> and sometimes their
1
different from japanese culture never shaving after a certain age the men had full beards and <unk> men and wom

# Build RNN model

In [9]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Recurrent language model: embedding -> stacked RNN -> linear decoder.

    Dropout is applied to the embeddings, between the recurrent layers and to
    the recurrent output before decoding.

    Args:
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: number of rows of the embedding table and of output logits.
        ninp: embedding dimension (input size of the recurrent layer).
        nhid: hidden size of the recurrent layer.
        nlayers: number of stacked recurrent layers.
        dropout: dropout probability.

    Raises:
        ValueError: if `rnn_type` is not one of the supported options.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                # Single-line message; `from None` hides the irrelevant KeyError.
                raise ValueError(
                    "An invalid option for `rnn_type` was supplied, "
                    "options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']"
                ) from None
            # `nonlinearity` must be passed by keyword: on PyTorch releases where
            # nn.RNN only reads it from kwargs, a positional value lands in the
            # `bias` slot and the layer silently falls back to tanh.
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)

        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Initialise embedding and decoder weights uniformly in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, inp, hidden):
        """Run one chunk of token indices through the model.

        Args:
            inp: tensor of token indices fed to the embedding layer.
            hidden: recurrent state as produced by `init_hidden` or by a
                previous call to `forward`.

        Returns:
            (decoded, hidden): the decoder output, whose last dimension has
            size `ntoken`, and the updated recurrent state.
        """
        emb = self.drop(self.encoder(inp))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        return decoded, hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Create an all-zero initial state of shape (nlayers, bsz, nhid).

        The state is created on the same device and with the same dtype as the
        model parameters. For an LSTM a tuple (h0, c0) of two independent
        tensors is returned; for every other type a single tensor.
        """
        weight = next(self.parameters())
        shape = (self.nlayers, bsz, self.nhid)
        if self.rnn_type == 'LSTM':
            # Two separate allocations, so h0 and c0 never alias one another.
            return (
                weight.new_zeros(shape, requires_grad=requires_grad),
                weight.new_zeros(shape, requires_grad=requires_grad),
            )
        return weight.new_zeros(shape, requires_grad=requires_grad)

In [10]:
# Two-layer LSTM language model sized by the notebook-level constants.
model = RNNModel(
    rnn_type='LSTM',
    ntoken=VOCAB_SIZE,
    ninp=EMBEDDING_SIZE,
    nhid=HIDDEN_SIZE,
    nlayers=2,
    dropout=0.5,
)
# Move the parameters to the GPU when one is available.
if USE_CUDA:
    model = model.cuda()

#### When starting a new batch, we carry over the previous hidden state but detach it from its computation graph, so gradients do not flow back into earlier batches.

In [11]:
def repackage_hidden(h):
    """Return a copy of the hidden state that is cut off from its autograd history.

    A tensor is detached directly; any other iterable (e.g. the (h, c) pair of
    an LSTM) is processed element by element, recursively, and comes back as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

# Define loss function and optimizer

In [12]:
learning_rate = 0.001

# Cross-entropy between the flattened logits and the flattened next-word targets.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# Every scheduler.step() multiplies the learning rate by gamma (i.e. halves it).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

# Train RNN model

In [13]:
@torch.no_grad()
def evaluate(model, data):
    """Compute the average per-token loss of `model` over `data`.

    Args:
        model: a model exposing `init_hidden(bsz, requires_grad=...)` and
            `forward(inp, hidden) -> (output, hidden)`.
        data: iterable of batches with `.text` and `.target` tensors laid out
            as (sequence length, batch size).

    Returns:
        The token-weighted mean of `loss_fn` over all batches.

    Raises:
        ZeroDivisionError: if `data` yields no batches.

    The model is switched to eval mode for the duration of the call and is put
    back into whatever mode (train or eval) it was in before, even on error.
    """
    was_training = model.training
    model.eval()
    # Follow the model's own device instead of a global CUDA flag.
    device = next(model.parameters()).device
    total_loss = 0.
    total_count = 0
    hidden = None
    try:
        for batch in data:
            text, target = batch.text.to(device), batch.target.to(device)
            if hidden is None:
                # Size the initial state from the first batch (dim 1 is the batch axis).
                hidden = model.init_hidden(text.size(1), requires_grad=False)
            hidden = repackage_hidden(hidden)
            output, hidden = model(text, hidden)
            # Flatten to (tokens, vocab) / (tokens,) as expected by the loss.
            loss = loss_fn(output.view(-1, output.size(-1)), target.view(-1))
            # Weight each batch by its token count: the last batch may be shorter.
            count = text.numel()
            total_count += count
            total_loss += loss.item() * count
    finally:
        model.train(was_training)
    return total_loss / total_count

In [15]:
%%time
GRAD_CLIP = 1.
NUM_EPOCHS = 2

# Despite its name, this holds the LOWEST validation loss observed so far.
max_val_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    model.train()
    # Every epoch starts from a fresh all-zero recurrent state.
    hidden = model.init_hidden(BATCH_SIZE)
    for step, batch in enumerate(train_iter):
        inputs, target = batch.text, batch.target
        if USE_CUDA:
            inputs = inputs.cuda()
            target = target.cuda()

        # Keep the state's values but cut its history, so backpropagation
        # stops at the boundary of the current batch.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(inputs, hidden)
        flat_logits = output.view(-1, VOCAB_SIZE)
        loss = loss_fn(flat_logits, target.view(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if step % 1000 == 0:
            print("epoch", epoch, "iter", step, "loss", loss.item())

        if step % 2000 != 0:
            continue

        # Periodic validation: checkpoint on improvement, otherwise decay the learning rate.
        val_loss = evaluate(model, val_iter)
        if not (val_loss < max_val_loss):
            scheduler.step()
            continue
        print('best model, val loss: ', val_loss)
        torch.save(model.state_dict(), "lm-best.pt")
        max_val_loss = val_loss

epoch 0 iter 0 loss 10.73099136352539
best model, val loss:  10.460744624484246
epoch 0 iter 1000 loss 6.017040252685547
epoch 0 iter 2000 loss 6.039785385131836
best model, val loss:  5.763722616970248
epoch 0 iter 3000 loss 5.830297470092773
epoch 0 iter 4000 loss 5.506411075592041
best model, val loss:  5.3955653102316266
epoch 0 iter 5000 loss 5.906774997711182
epoch 0 iter 6000 loss 5.584385395050049
best model, val loss:  5.2004493789471855
epoch 0 iter 7000 loss 5.470476150512695
epoch 0 iter 8000 loss 5.342258930206299
best model, val loss:  5.064773329317368
epoch 0 iter 9000 loss 5.415655136108398
epoch 1 iter 0 loss 5.489630222320557
best model, val loss:  4.996402858925081
epoch 1 iter 1000 loss 5.135668754577637
epoch 1 iter 2000 loss 5.336435317993164
best model, val loss:  4.935460278357067
epoch 1 iter 3000 loss 5.196143627166748
epoch 1 iter 4000 loss 5.052638053894043
best model, val loss:  4.868982996849086
epoch 1 iter 5000 loss 5.493694305419922
epoch 1 iter 6000 l

# Load model for evaluation

In [17]:
# Rebuild the architecture with the same hyper-parameters used for training,
# then restore the best checkpoint written by the training loop.
best_model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, 2, dropout=0.5)
if USE_CUDA:
    best_model = best_model.cuda()
# map_location remaps the saved tensors to the device in use now, so a checkpoint
# written on a GPU machine still loads on a CPU-only one.
best_model.load_state_dict(
    torch.load("lm-best.pt", map_location="cuda" if USE_CUDA else "cpu")
)

<All keys matched successfully>

In [18]:
# Perplexity is the exponential of the mean per-token cross-entropy.
val_loss = evaluate(best_model, val_iter)
val_perplexity = np.exp(val_loss)
print("perplexity: ", val_perplexity)



perplexity:  119.56287800351764


In [19]:
# Same metric on the held-out test split: exp of the mean per-token cross-entropy.
test_loss = evaluate(best_model, test_iter)
test_perplexity = np.exp(test_loss)
print("perplexity: ", test_perplexity)

perplexity:  153.88809694501043


#### Generate a 100-word sequence using the trained language model

In [20]:
device = torch.device("cuda" if USE_CUDA else "cpu")

# Sampling is inference: switch dropout off so the predicted distribution is not
# perturbed by dropout noise.
best_model.eval()

hidden = best_model.init_hidden(1, requires_grad=False)
# Seed the sequence with one token drawn uniformly from the vocabulary.
inp = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device)
words = []
# No gradients are needed; without no_grad the autograd graph would keep
# growing through `hidden` over all 100 steps.
with torch.no_grad():
    for _ in range(100):
        output, hidden = best_model(inp, hidden)
        # softmax gives the same sampling distribution as exp(logits) but cannot overflow.
        word_probs = torch.softmax(output.squeeze(), dim=-1).cpu()
        word_idx = torch.multinomial(word_probs, 1)[0].item()
        # Feed the sampled word back in as the next input.
        inp.fill_(word_idx)
        words.append(TEXT.vocab.itos[word_idx])
print(" ".join(words))

watson and safeguard mccarthy johnny bradley quotes that the letters of any fantastic names in robin dictionary were remembered in morse code i people believe in a assisi the subject was added to a new stella but they took the war for both a read and the <unk> a day ago do the most epic conventions known with the words i said literally where they seldom overlay welcoming and do it upon us bear two in the story tears ended of the one stroke on time when images could re be coastal at the opposite of sin this difference is
