In [483]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [484]:
import time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn

from data_handlers import tokenise, batch, get_batch
import rnn

In [478]:
cuda = False
device = torch.device("cuda" if cuda else "cpu")
path = './data/penn/'
batch_size = 40

emsize = 400
nhid = 1150
nlayers = 3
dropout = 0.5
tied = False

In [479]:
# LOAD DATA

dictionary = data_handlers.Dictionary()

# Tokenise data to replace characters with integer indexes
train_data, dictionary = tokenise(path+'train.txt', dictionary)
val_data, dictionary   = tokenise(path+'valid.txt', dictionary)
test_data, dictionary  = tokenise(path+'test.txt', dictionary)

# Batch data: reshapes vector as matrix where number of columns j 
# is the batch size.
train_data = batch(train_data, batch_size)
val_data = batch(val_data, batch_size)
test_data  = batch(test_data, batch_size)

In [480]:
# BUILD MODEL

ntokens = len(dictionary)
LSTM = rnn.LSTMModel(ntokens, emsize, nhid, nlayers, dropout, tied).to(device)

# TODO: Check loss matches paper
criterion = nn.CrossEntropyLoss()


In [481]:
# TRAINING CODE

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history"""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    

def evaluate(model, data, ntokens, batch_size, bptt):
    model.eval()
    total_loss = 0
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data.size(0) - 1, bptt):
            x, y = get_batch(data, i)
            output, hidden = model(x, y)
            output_flat = output.view(-1, ntokens)
            total_loss += len(x) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data) - 1)
            
    
def train(model, data, ntokens:int, batch_size:int, lr:float, bptt:int, clip):
    log_interval = 1
    
    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, data.size(0)-1, bptt)):
        inputs, targets = get_batch(data, i, bptt)
        # For each batch, detach hidden state from state created in previous
        # batches. Else, the model would attempt backpropagation through the 
        # entire dataset
        hidden = repackage_hidden(hidden)
        # Zero the gradients from previous iteration, ready for new values
        model.zero_grad()
        # Forward pass
        output, hidden = model(inputs, hidden)
        # Calculate loss
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        # Backpropagate
        loss.backward()
        
        # TODO: Check clipping config
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
            
        total_loss += loss.item()
        
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed  = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    
    
    
    

In [485]:
# TRAINING LOOP

epochs = 3
lr = 0.4
bptt = 35
clip = 0.25

for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    train(LSTM, train_data, ntokens, batch_size, lr, bptt, clip)
    val_loss = evaluate(LSTM, val_data, ntokens, batch_size, bptt)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, np.exp(val_loss)))
    print('-' * 89)
    
    

| epoch   1 |     1/  663 batches | lr 0.40 | ms/batch 12070.84 | loss 18.39 | ppl 97453976.17
| epoch   1 |     2/  663 batches | lr 0.40 | ms/batch 5872.26 | loss  9.16 | ppl  9468.02
| epoch   1 |     3/  663 batches | lr 0.40 | ms/batch 5860.41 | loss  9.14 | ppl  9329.36
| epoch   1 |     4/  663 batches | lr 0.40 | ms/batch 6018.52 | loss  9.32 | ppl 11179.73
| epoch   1 |     5/  663 batches | lr 0.40 | ms/batch 6011.74 | loss  9.38 | ppl 11870.83
| epoch   1 |     6/  663 batches | lr 0.40 | ms/batch 5941.86 | loss  9.45 | ppl 12696.52


KeyboardInterrupt: 

In [468]:
from rnn import WeightDrop
import torch
from torch.nn import Parameter

module = torch.nn.GRUCell(2, 2)
# weights = ['weight_hh']
# weight_drop_gru = WeightDrop(gru, weights, dropout=0.9)

# input_ = torch.randn(3, 2)
# hidden_state = torch.randn(3, 2)
# weight_drop_gru(input_, hidden_state)

In [379]:
nn.functional.dropout(lstm.state_dict()['weight_hh_l0'], p=0.5, training=True)

tensor([[ 0.0000,  0.9465],
        [-0.0000, -0.7086],
        [-0.3301, -1.0996],
        [ 0.0000,  0.0371],
        [-0.0000,  0.7389],
        [ 0.0000, -0.0000],
        [-0.0000,  0.0000],
        [-0.0000, -0.0000]])

In [463]:
name_w = 'weight_hh'
w = getattr(lstm, name_w)
#del lstm.parameters[name_w]
lstm.register_parameter(name_w+'_raw', Parameter(w))

In [460]:
del lstm.state_dict()[name_w]

In [464]:
list(lstm.parameters())

[Parameter containing:
 tensor([[-0.4608, -0.5012],
         [ 0.5199, -0.2146],
         [-0.4698,  0.0663],
         [ 0.0585, -0.3524],
         [ 0.0013,  0.1005],
         [-0.5843, -0.6274],
         [-0.0040, -0.1587],
         [ 0.5232,  0.0326]], requires_grad=True), Parameter containing:
 tensor([[ 0.2775, -0.6964],
         [ 0.2408,  0.3445],
         [-0.5694,  0.1093],
         [-0.6746, -0.6683],
         [-0.3519,  0.5631],
         [-0.1682, -0.2012],
         [ 0.4750, -0.1853],
         [-0.2210,  0.2469]], requires_grad=True), Parameter containing:
 tensor([-0.2406, -0.3876, -0.1915,  0.1368, -0.3688, -0.3195, -0.3975,  0.1552],
        requires_grad=True), Parameter containing:
 tensor([ 0.2639,  0.6060,  0.2806,  0.1037,  0.0335, -0.5144, -0.5294,  0.1068],
        requires_grad=True), Parameter containing:
 tensor([[ 0.2775, -0.6964],
         [ 0.2408,  0.3445],
         [-0.5694,  0.1093],
         [-0.6746, -0.6683],
         [-0.3519,  0.5631],
         [-0.1

In [469]:
module.state_dict()

OrderedDict([('weight_ih', tensor([[ 0.1540, -0.2117],
                      [-0.4895,  0.1811],
                      [-0.5259,  0.2650],
                      [ 0.5868,  0.2686],
                      [-0.6825,  0.3709],
                      [-0.2350,  0.4248]])),
             ('weight_hh', tensor([[ 0.2631, -0.6679],
                      [ 0.4857, -0.0733],
                      [ 0.7027, -0.6393],
                      [ 0.0815, -0.6032],
                      [ 0.6067, -0.2080],
                      [-0.0372, -0.1714]])),
             ('bias_ih',
              tensor([-0.0945,  0.3338,  0.5635, -0.0618, -0.4097, -0.2291])),
             ('bias_hh',
              tensor([-0.3066,  0.3881, -0.6168,  0.5579, -0.6611,  0.1860]))])

In [467]:
raw_w = getattr(lstm, name_w + '_raw')
w = nn.functional.dropout(raw_w, p=0.5, training=True)
w

tensor([[ 0.0000, -1.3927],
        [ 0.0000,  0.0000],
        [-0.0000,  0.2186],
        [-0.0000, -1.3365],
        [-0.7039,  1.1263],
        [-0.3364, -0.4024],
        [ 0.0000, -0.3705],
        [-0.0000,  0.4938]], grad_fn=<DropoutBackward>)

In [472]:
from collections import OrderedDict
new_state = OrderedDict()

In [474]:
new_state['yo'] = 1

In [475]:
new_state

OrderedDict([('yo', 1)])