In [397]:

%reload_ext autoreload
%autoreload 2

In [435]:
import time
import numpy as np

import torch
import torch.nn as nn

from data_handlers import Dictionary, tokenise, batch, get_batch
import rnn
import lstm

In [436]:
cuda = False
device = torch.device("cuda" if cuda else "cpu")
path = './data/penn/'

batch_size = 40
emsize = 400
nhid = 1150
dropout = 0.5
tied = False

In [437]:
# LOAD DATA

dictionary = Dictionary()

# Tokenise data to replace characters with integer indexes
train_data, dictionary = tokenise(path+'train.txt', dictionary)
val_data, dictionary   = tokenise(path+'valid.txt', dictionary)
test_data, dictionary  = tokenise(path+'test.txt', dictionary)

# Batch data: reshapes vector as matrix where number of columns j 
# is the batch size.
train_data = batch(train_data, batch_size)
val_data = batch(val_data, batch_size)
test_data  = batch(test_data, batch_size)

In [438]:
# TRAINING CODE

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history"""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    

def evaluate(model, data, ntokens, batch_size, bptt):
    model.eval()
    total_loss = 0
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data.size(0) - 1, bptt):
            x, y = get_batch(data, i)
            output, hidden = model(x, y)
            output_flat = output.view(-1, ntokens)
            total_loss += len(x) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data) - 1)
            
    
def train(model, data, ntokens:int, batch_size:int, lr:float, bptt:int, clip):
    log_interval = 1
    
    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, data.size(0)-1, bptt)):
        inputs, targets = get_batch(data, i, bptt)
        # For each batch, detach hidden state from state created in previous
        # batches. Else, the model would attempt backpropagation through the 
        # entire dataset
        hidden = repackage_hidden(hidden)
        # Zero the gradients from previous iteration, ready for new values
        model.zero_grad()
        # Forward pass
        output, hidden = model(inputs, hidden)
        # Calculate loss
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        # Backpropagate
        loss.backward()
        
        # TODO: Check clipping config
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
            
        total_loss += loss.item()
        
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed  = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, np.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    
    
    
    

In [443]:
# BUILD MODEL

ntokens = len(dictionary)
LSTM = lstm.AWD_LSTM(ntokens, emsize, nhid).to(device)

# TODO: Check loss matches paper
criterion = nn.CrossEntropyLoss()


# TRAINING LOOP

epochs = 3
lr = 0.4
bptt = 35
clip = 0.25

for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    train(LSTM, train_data, ntokens, batch_size, lr, bptt, clip)
    val_loss = evaluate(LSTM, val_data, ntokens, batch_size, bptt)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, np.exp(val_loss)))
    print('-' * 89)
    
    

| epoch   1 |     1/  663 batches | lr 0.40 | ms/batch 13967.38 | loss 18.42 | ppl 99914517.55
| epoch   1 |     2/  663 batches | lr 0.40 | ms/batch 6511.13 | loss  9.20 | ppl  9924.77
| epoch   1 |     3/  663 batches | lr 0.40 | ms/batch 6536.60 | loss  9.20 | ppl  9866.28
| epoch   1 |     4/  663 batches | lr 0.40 | ms/batch 6507.09 | loss  9.19 | ppl  9823.16


KeyboardInterrupt: 

In [433]:
cuda = False
device = torch.device("cuda" if cuda else "cpu")
path = './data/penn/'

batch_size = 40
ntokens = 10000
embedding_size = 400
nhid = 1150


time_steps = 8
num_layers = 3
batch_size = 4



awd_lstm = lstm.AWD_LSTM(ntokens, embedding_size, nhid)


input = torch.randint(0, 10, (time_steps, batch_size, 10, embedding_size))
h0    = torch.randn(num_layers, batch_size, 20)
c0    = torch.randn(num_layers, batch_size, 20)
output = awd_lstm(input, (h0, c0))
print(output.size())

torch.Size([4, 10, 400, 400]) torch.Size([4, 20])


RuntimeError: size mismatch, m1: [4 x 20], m2: [1150 x 4600] at /Users/administrator/nightlies/pytorch-1.0.0/wheel_build_dirs/conda_3.6/conda/conda-bld/pytorch_1544137972173/work/aten/src/TH/generic/THTensorMath.cpp:940

In [386]:
torch.Tensor(np.random.randint(1, 10000, size=(time_steps, batch_size, 10)), dtype=torch.int64)

TypeError: new() received an invalid combination of arguments - got (numpy.ndarray, dtype=torch.dtype), but expected one of:
 * (torch.device device)
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, torch.device device)
      didn't match because some of the keywords were incorrect: dtype
 * (object data, torch.device device)
      didn't match because some of the keywords were incorrect: dtype


In [389]:
torch.Tensor.new_full(3, 1, dtype=torch.long, requires_grad=True)

TypeError: descriptor 'new_full' requires a 'torch._C._TensorBase' object but received a 'int'

In [394]:
torch.randint(0, 10, (2, 4, 5))

tensor([[[5, 7, 9, 4, 4],
         [9, 8, 0, 4, 7],
         [1, 6, 4, 4, 3],
         [6, 1, 7, 1, 0]],

        [[8, 5, 3, 4, 6],
         [2, 3, 7, 6, 2],
         [6, 8, 0, 6, 1],
         [3, 3, 3, 3, 0]]])