# AWD-LSTM

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
#export
from exp.nb_12 import *

# Data

In [3]:
path = untar_data(imdb_path); path

PosixPath('/home/ubuntu/learnai/dl/data/imdb')

In [4]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [5]:
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()

In [6]:
ll = label_by_func(sd, lambda x:0, proc_x=[proc_tok, proc_num])

In [7]:
# Cache the processed data and the vocabulary; `with` guarantees the files are
# flushed and closed even if pickling fails part-way.
with open(path/'ll_lm.pkl', 'wb') as pkl_file:
    pickle.dump(ll, pkl_file)
with open(path/'vocab_lm.pkl', 'wb') as pkl_file:
    pickle.dump(proc_num.vocab, pkl_file)

In [8]:
# Reload the cached data; only unpickle files this notebook wrote itself.
with open(path/'ll_lm.pkl', 'rb') as pkl_file:
    ll = pickle.load(pkl_file)

In [9]:
# Reload the cached vocabulary; only unpickle files this notebook wrote itself.
with open(path/'vocab_lm.pkl', 'rb') as pkl_file:
    vocab = pickle.load(pkl_file)

In [10]:
bs, bptt = 64, 70
data = lm_databunchify(ll, bs, bptt)

# AWD-LSTM

![](https://upload.wikimedia.org/wikipedia/commons/3/3b/The_LSTM_cell.png)

In [11]:
class LSTMCell(nn.Module):
    """One LSTM time step built from two plain linear layers.

    `ih` projects the input and `hh` projects the previous hidden state. Each
    projection is `4*nh` wide: one `nh`-sized slice per gate, in the order
    input gate, forget gate, output gate, cell candidate.
    """
    def __init__(self, ni, nh):
        super().__init__()
        # 4*nh outputs so a single matmul yields the pre-activations of all four gates.
        self.ih = nn.Linear(ni, 4*nh)
        self.hh = nn.Linear(nh, 4*nh)

    def forward(self, input, state):
        """Advance one step from `state = (h, c)`; returns `(h_new, (h_new, c_new))`."""
        prev_h, prev_c = state
        pre_acts = self.ih(input) + self.hh(prev_h)
        in_pre, forget_pre, out_pre, cell_pre = pre_acts.chunk(4, 1)
        ingate = torch.sigmoid(in_pre)
        forgetgate = torch.sigmoid(forget_pre)
        outgate = torch.sigmoid(out_pre)
        cellgate = cell_pre.tanh()

        # Keep part of the old cell state, add part of the new candidate.
        new_c = forgetgate * prev_c + ingate * cellgate
        new_h = outgate * new_c.tanh()
        return new_h, (new_h, new_c)

In [12]:
class LSTMLayer(nn.Module):
    "Runs a recurrent cell over every slice along dim 1 of the input"
    def __init__(self, cell, *cell_args):
        super().__init__()
        # `cell` is a factory (typically a class); the instance is built from `cell_args`.
        self.cell = cell(*cell_args)

    def forward(self, input, state):
        """Feed each step to the cell, threading `state` through.

        Returns the per-step outputs stacked back along dim 1, and the final state.
        """
        step_outputs = []
        for step_input in input.unbind(1):
            step_out, state = self.cell(step_input, state)
            step_outputs.append(step_out)
        return torch.stack(step_outputs, dim=1), state

In [13]:
lstm = LSTMLayer(LSTMCell, 300, 300)

In [14]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(64, 300), torch.zeros(64, 300))

CPU

In [15]:
%timeit -n 10 y, h1 = lstm(x, h)

163 ms ± 307 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [17]:
def time_fn(f):
    """Call `f()` and then block until queued CUDA work has finished.

    Meant to be wrapped in `%timeit`: synchronizing after the call makes the
    measurement include the GPU work that `f` launched. On a machine without
    CUDA the synchronization step is skipped, so the helper also works on CPU.
    """
    f()
    # Without a CUDA device nothing can be pending, and synchronize() would raise.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

CUDA

In [18]:
f = partial(lstm, x, h)
time_fn(f)

In [19]:
%timeit -n 10 time_fn(f)

32.3 ms ± 9.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Builtin version

In [20]:
lstm = nn.LSTM(input_size=300, hidden_size=300, num_layers=1, batch_first=True)

In [21]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(1, 64, 300), torch.zeros(1, 64, 300))

CPU

In [22]:
%timeit -n 10 y, h1 = lstm(x,h)

137 ms ± 9.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [24]:
f = partial(lstm, x, h)
time_fn(f)

CUDA

In [25]:
%timeit -n 10 time_fn(f)

9.46 ms ± 801 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Dropout

In [28]:
j = torch.rand(5)

In [29]:
j.new(10)

tensor([5.0962e+37, 4.5811e-41, 5.0962e+37, 4.5811e-41,        nan, 4.5811e-41,
        1.7753e+28, 2.7927e+20, 1.2723e+25, 9.8834e+17])

In [30]:
#export
def dropout_mask(x, sz, p):
    """Return a dropout mask of shape `sz` with the same dtype and device as `x`.

    Each element is 0 with probability `p` and `1/(1-p)` otherwise, so that
    multiplying by the mask keeps the expected value of the input unchanged.
    With `p == 1` everything is dropped and the mask is all zeros.
    """
    if p == 1:
        # bernoulli_(0).div_(0) would produce 0/0 = NaN everywhere.
        return x.new_zeros(*sz)
    return x.new_empty(*sz).bernoulli_(1-p).div_(1-p)

In [31]:
x = torch.randn(10, 10)
mask = dropout_mask(x, (10, 10), 0.5); mask

tensor([[0., 2., 2., 2., 2., 2., 0., 0., 2., 0.],
        [0., 2., 0., 0., 0., 2., 0., 0., 2., 2.],
        [0., 2., 2., 2., 2., 2., 2., 0., 0., 2.],
        [2., 0., 0., 0., 2., 2., 0., 2., 0., 0.],
        [2., 0., 2., 2., 2., 2., 0., 0., 2., 2.],
        [2., 0., 2., 2., 2., 2., 0., 2., 0., 0.],
        [0., 0., 0., 2., 2., 0., 2., 0., 2., 0.],
        [2., 0., 2., 0., 2., 0., 2., 0., 0., 2.],
        [2., 2., 0., 2., 2., 0., 0., 2., 0., 0.],
        [2., 2., 2., 0., 0., 0., 2., 0., 0., 2.]])

In [32]:
(x*mask).std(), x.std()

(tensor(1.4219), tensor(1.1419))

In [33]:
x = torch.randn(2,3,3)

In [34]:
x

tensor([[[ 0.7222, -0.1053,  1.7117],
         [-2.6635,  1.7559,  1.0049],
         [ 1.9689, -0.6475,  1.3450]],

        [[-0.6667, -2.4783, -0.0041],
         [ 1.1831, -0.8572, -0.4553],
         [ 0.4848, -0.3634, -1.3000]]])

In [35]:
x.new(*(x.size(0), 1, x.size(2)))

tensor([[[-2.0670e+34,  3.0787e-41, -6.1824e+25]],

        [[ 3.0787e-41,  8.9683e-44,  0.0000e+00]]])

In [36]:
jk=x.new(2,1,3).bernoulli_(0.5).div_(0.5)
jk

tensor([[[2., 2., 2.]],

        [[2., 0., 2.]]])

In [37]:
x * jk

tensor([[[ 1.4445, -0.2105,  3.4233],
         [-5.3269,  3.5118,  2.0099],
         [ 3.9378, -1.2949,  2.6899]],

        [[-1.3334, -0.0000, -0.0083],
         [ 2.3662, -0.0000, -0.9107],
         [ 0.9697, -0.0000, -2.6000]]])

In [38]:
#export
class RNNDropout(nn.Module):
    "Dropout for a 3-d input that shares a single mask along dim 1"
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        "Return `x` untouched outside training or when `p` is 0, else `x` times the mask"
        active = self.training and self.p != 0.
        if not active:
            return x
        # Size 1 on dim 1 makes the mask broadcast over that dimension.
        mask_shape = (x.size(0), 1, x.size(2))
        mask = dropout_mask(x.data, mask_shape, self.p)
        return x * mask

In [39]:
dp = RNNDropout(0.3)
tst_input = torch.randn(3,3,7)
tst_input, dp(tst_input)

(tensor([[[-0.4490,  2.1071,  0.5121, -0.2689, -0.2110,  1.7420,  0.9224],
          [ 2.1162, -1.2908,  0.8151,  1.1954,  0.9972,  1.3228,  0.8861],
          [ 0.8276,  0.0869, -0.9406, -0.4544,  0.1115,  0.3550, -0.4297]],
 
         [[-0.9620,  2.0118, -0.6758,  0.2074, -1.3335, -0.7043,  0.0327],
          [-0.0513,  1.1234,  0.7724, -0.7723,  2.4399, -1.1658, -0.7786],
          [ 1.3043, -0.7979, -0.5535,  0.5323,  1.6160,  0.7811,  0.6359]],
 
         [[-0.1526,  2.0579,  0.6918, -0.6077,  0.4659, -0.6876, -0.3519],
          [ 0.9676, -0.4449, -0.8816,  1.3766,  1.5724, -0.0510, -0.8011],
          [ 1.0664, -0.2363, -1.4780,  0.6300,  0.6918,  1.3439, -1.8077]]]),
 tensor([[[-0.6414,  3.0101,  0.7315, -0.3841, -0.0000,  2.4886,  1.3177],
          [ 3.0232, -1.8439,  1.1645,  1.7078,  0.0000,  1.8897,  1.2658],
          [ 1.1823,  0.1242, -1.3437, -0.6492,  0.0000,  0.5071, -0.6138]],
 
         [[-1.3743,  0.0000, -0.9655,  0.2964, -1.9050, -1.0061,  0.0468],
          [-0

In [40]:
#export
import warnings

WEIGHT_HH = 'weight_hh_l0'

class WeightDropout(nn.Module):
    """Wrap `module` and apply dropout to some of its weights on every forward pass.

    For each name in `layer_names`, a copy of the weight is registered on this
    wrapper as the trainable parameter `<name>_raw`. Before every forward the
    wrapped module's weight is replaced by `F.dropout(raw, weight_p)`, which
    leaves the weight unchanged in eval mode.

    Args:
        module: the module whose weights are dropped (e.g. an `nn.LSTM`).
        weight_p: dropout probability, a float in [0, 1].
        layer_names: names of the parameters of `module` to apply dropout to.
    """
    # `weight_p` defaults to a float: `F.dropout` compares `p` with numbers, so a
    # list default made `WeightDropout(module)` fail at construction.
    def __init__(self, module, weight_p=0., layer_names=[WEIGHT_HH]):
        super().__init__()
        self.module, self.weight_p, self.layer_names = module, weight_p, layer_names
        for layer in self.layer_names:
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self._install_weight(layer, F.dropout(w, p=self.weight_p, training=False))

    def _install_weight(self, layer, w):
        "Make `w` the weight that the wrapped module uses for `layer`"
        self.module._parameters[layer] = w
        # Some PyTorch RNN modules also keep a cached `_flat_weights` list that
        # forward() reads; writing to `_parameters` alone does not refresh it,
        # so keep the cached entry in step when the module has one.
        names = getattr(self.module, '_flat_weights_names', None)
        flat = getattr(self.module, '_flat_weights', None)
        if names is not None and flat is not None and layer in names:
            flat[names.index(layer)] = w

    def _setweights(self):
        "Recompute the dropped-out weights from the raw parameters"
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            # Dropout is applied to the weights themselves, not to activations.
            self._install_weight(layer, F.dropout(raw_w, p=self.weight_p, training=self.training))

    def forward(self, *args):
        self._setweights()
        with warnings.catch_warnings():
            # Silence warnings raised by the wrapped module during the call.
            warnings.simplefilter('ignore')
            return self.module.forward(*args)

In [41]:
module = nn.LSTM(5, 2)
dp_module = WeightDropout(module, 0.4)
getattr(dp_module.module, WEIGHT_HH)

Parameter containing:
tensor([[-0.1221, -0.0393],
        [-0.4811, -0.5771],
        [-0.3773, -0.2780],
        [ 0.0526, -0.3876],
        [ 0.3805, -0.4425],
        [-0.3881,  0.0485],
        [ 0.5752, -0.3762],
        [ 0.5243,  0.2080]], requires_grad=True)

In [42]:
tst_input = torch.randn(4, 20, 5)
h = (torch.zeros(1, 20, 2), torch.zeros(1, 20, 2))
x, h = dp_module(tst_input, h)
getattr(dp_module.module, WEIGHT_HH)

tensor([[-0.0000, -0.0654],
        [-0.8019, -0.9618],
        [-0.6289, -0.0000],
        [ 0.0876, -0.6460],
        [ 0.6342, -0.0000],
        [-0.6468,  0.0809],
        [ 0.9587, -0.0000],
        [ 0.8739,  0.3466]], grad_fn=<MulBackward0>)

In [43]:
#export
class EmbeddingDropout(nn.Module):
    """Look up embeddings in `emb` while dropping whole rows of its weight matrix.

    In training mode (and when `embed_p` is non-zero) a mask with one entry per
    row of `emb.weight` is applied, so a dropped token is zeroed everywhere it
    appears in the batch. Otherwise the weight matrix is used as is.

    Args:
        emb: the `nn.Embedding` to wrap; its lookup options are reused.
        embed_p: probability of dropping a row.
    """
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb, self.embed_p = emb, embed_p
        # Pass `None` through when the embedding has no padding index. A -1
        # sentinel would be read by `F.embedding` as "last row", which would
        # stop that token from receiving gradients.
        self.pad_idx = self.emb.padding_idx

    def forward(self, words, scale=None):
        "Embed `words`; a truthy `scale` multiplies the (masked) weights first"
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0), 1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else:
            masked_embed = self.emb.weight
        if scale:
            # Out of place on purpose: when no mask was applied `masked_embed` is the
            # embedding parameter itself and must not be modified.
            masked_embed = masked_embed * scale
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                          self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [44]:
enc = nn.Embedding(100, 7, padding_idx=1)
enc_dp = EmbeddingDropout(enc, 0.5)
tst_input = torch.randint(0, 100, (8,))
enc_dp(tst_input)

tensor([[-2.8426,  3.0972, -3.1211,  0.9516, -2.4718, -0.2973,  1.6370],
        [-0.8492,  2.2259,  3.0138, -0.0963, -3.0458, -2.7661,  0.9935],
        [ 0.0000, -0.0000, -0.0000,  0.0000, -0.0000, -0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000],
        [-1.2163,  4.1821,  4.4464, -1.2780, -1.6682, -1.5633, -0.4600],
        [ 0.0000, -0.0000, -0.0000, -0.0000,  0.0000,  0.0000,  0.0000],
        [-2.8426,  3.0972, -3.1211,  0.9516, -2.4718, -0.2973,  1.6370],
        [ 1.8341,  2.5440,  0.0676, -1.8668, -1.8180, -1.4580, -0.2391]],
       grad_fn=<EmbeddingBackward>)

# The main Model

In [45]:
#export
def to_detach(h):
    """Detaches `h` from its history.

    A tensor (including subclasses such as `nn.Parameter`) is returned detached.
    Anything else is treated as an iterable and converted, recursively, to a
    tuple of detached items.
    """
    # isinstance rather than an exact type check, so tensor subclasses are
    # detached as a whole instead of being iterated row by row.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(to_detach(v) for v in h)

In [46]:
#export
class AWD_LSTM(nn.Module):
    """Stack of weight-dropped LSTMs on top of an embedding, with several dropouts.

    The input goes through embedding dropout and input dropout, then through
    `n_layers` single-layer LSTMs wrapped in `WeightDropout`. Hidden dropout is
    applied between layers. The hidden state is kept on the module between
    calls (detached from the graph) and rebuilt when the batch size changes.

    Args:
        vocab_sz: number of rows of the embedding.
        emb_sz: embedding size; also the output size of the last LSTM.
        n_hid: hidden size of every LSTM except the last.
        n_layers: number of LSTM layers.
        pad_token: index passed to `nn.Embedding` as `padding_idx`.
        hidden_p, input_p, embed_p, weight_p: dropout probabilities for the
            between-layer outputs, the embedded input, the embedding rows and
            the LSTM weights respectively.
    """
    initrange=0.1
    
    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        self.bs, self.emb_sz, self.n_hid, self.n_layers = 1, emb_sz, n_hid, n_layers
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p)
        # The first layer reads embeddings; the last layer outputs `emb_sz` features.
        self.rnns = [nn.LSTM(emb_sz if l==0 else n_hid, (n_hid if l!=n_layers-1 else emb_sz), 1,
                            batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns])
        self.emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])
        
    def forward(self, input):
        """Encode a `(bs, sl)` batch of token ids.

        Returns `(raw_outputs, outputs)`, two lists with one tensor per layer:
        the LSTM outputs as produced, and the same outputs after hidden dropout
        (the last layer's output is not dropped here).
        """
        bs, sl = input.size()
        # Also (re)build the state when it was never created: `self.bs` starts at 1,
        # so a first batch of size 1 would otherwise find no `self.hidden`.
        if bs!=self.bs or getattr(self, 'hidden', None) is None:
            self.bs = bs
            self.reset()
        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden, raw_outputs, outputs = [], [], []
        for l, (rnn, hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l!=self.n_layers - 1:
                raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        # Keep the state for the next call but cut it off from this batch's graph.
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs
    
    def _one_hidden(self, l):
        "Return one zeroed hidden state of shape `(1, bs, nh)` for layer `l`"
        if l!=self.n_layers-1:
            nh = self.n_hid
        else:
            nh = self.emb_sz
        # Built from a parameter so it shares that parameter's dtype and device.
        return next(self.parameters()).new(1, self.bs, nh).zero_()
    
    def reset(self):
        "Reset the hidden states"
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]

In [47]:
#export
class LinearDecoder(nn.Module):
    """Dropout followed by a linear layer, applied to the last encoder output.

    When `tie_encoder` is given, the linear layer reuses `tie_encoder.weight`;
    otherwise its weight gets a Kaiming-uniform initialisation.
    """
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True):
        super().__init__()
        self.output_dp = RNNDropout(output_p)
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        if bias:
            self.decoder.bias.data.zero_()
        if not tie_encoder:
            init.kaiming_uniform_(self.decoder.weight)
        else:
            self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        "Takes `(raw_outputs, outputs)` and returns `(decoded, raw_outputs, outputs)`"
        raw_outputs, outputs = input
        dropped = self.output_dp(outputs[-1]).contiguous()
        # Merge the first two dims so the linear layer sees one row per position.
        n_rows, n_feat = dropped.size(0)*dropped.size(1), dropped.size(2)
        decoded = self.decoder(dropped.view(n_rows, n_feat))
        return decoded, raw_outputs, outputs

In [48]:
#export
class SequentialRNN(nn.Sequential):
    "An `nn.Sequential` whose `reset` is forwarded to every child that defines one"
    def reset(self):
        has_reset = lambda child: hasattr(child, 'reset')
        for child in filter(has_reset, self.children()):
            child.reset()

In [49]:
#export
def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, output_p=0.4, hidden_p=0.2, input_p=0.6,
                      embed_p=0.1, weight_p=0.5, tie_weights=True, bias=True):
    """Build an `AWD_LSTM` encoder followed by a `LinearDecoder`, as a `SequentialRNN`.

    With `tie_weights` the decoder shares its weight with the encoder's embedding.
    """
    encoder = AWD_LSTM(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token,
                       hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    tied_emb = None
    if tie_weights:
        tied_emb = encoder.emb
    decoder = LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=tied_emb, bias=bias)
    return SequentialRNN(encoder, decoder)

In [50]:
tok_pad = vocab.index(PAD)

In [51]:
tok_pad

1

In [52]:
tst_model = get_language_model(len(vocab), 300, 300, 2, tok_pad)
tst_model = tst_model.cuda()

In [53]:
x, y = next(iter(data.train_dl))

In [54]:
z = tst_model(x.cuda())

In [55]:
len(z)

3

In [56]:
decoded, raw_outputs, outputs = z

In [57]:
decoded.size()

torch.Size([4480, 60003])

In [58]:
len(raw_outputs), len(outputs)

(2, 2)

In [59]:
[o.size() for o in raw_outputs], [o.size() for o in outputs]

([torch.Size([64, 70, 300]), torch.Size([64, 70, 300])],
 [torch.Size([64, 70, 300]), torch.Size([64, 70, 300])])

## Callbacks to train the model

In [60]:
#export
class GradientClipping(Callback):
    "Clips the gradient norm of the model's parameters to `clip` after each backward pass"
    def __init__(self, clip=None):
        self.clip = clip

    def after_backward(self):
        # A falsy `clip` (None or 0) disables clipping.
        if not self.clip:
            return
        nn.utils.clip_grad_norm_(self.run.model.parameters(), self.clip)

In [61]:
#export
class RNNTrainer(Callback):
    """Callback for training the language model.

    Keeps only the decoded output as the prediction, and adds two penalties to
    the loss: activation regularization (AR, weight `alpha`) on the last
    layer's dropped-out output, and temporal activation regularization (TAR,
    weight `beta`) on the difference between consecutive steps of the last
    layer's raw output.
    """
    def __init__(self, alpha, beta):
        self.alpha, self.beta = alpha, beta
        
    def after_pred(self):
        # Save the extra outputs for later and keep only the true output as the prediction.
        self.raw_out, self.out = self.pred[1], self.pred[2]
        self.run.pred = self.pred[0]
        
    def after_loss(self):
        # AR: penalize large activations.
        if self.alpha != 0.:
            self.run.loss += self.alpha * self.out[-1].float().pow(2).mean()
        # TAR: penalize large changes between consecutive steps along dim 1.
        if self.beta != 0.:
            h = self.raw_out[-1]
            # The difference is taken along dim 1, so that is the dimension that
            # needs at least two entries; with a single step the slices are empty
            # and their mean would be NaN.
            if h.size(1)>1:
                self.run.loss += self.beta * (h[:,1:] - h[:, :-1]).float().pow(2).mean()
    
    def begin_epoch(self):
        # Let datasets that support it rebuild their batches at the start of each epoch.
        if hasattr(self.dl.dataset, "batchify"):
            self.dl.dataset.batchify()

In [62]:
#export
def cross_entropy_flat(input, target):
    "Cross entropy over every position of a 2-d `target`, with `input` flattened to match"
    n_batch, seq_len = target.size()
    n_tokens = n_batch * seq_len
    flat_logits = input.view(n_tokens, -1)
    flat_target = target.view(n_tokens)
    return F.cross_entropy(flat_logits, flat_target)

def accuracy_flat(input, target):
    "`accuracy` over every position of a 2-d `target`, with `input` flattened to match"
    n_batch, seq_len = target.size()
    n_tokens = n_batch * seq_len
    return accuracy(input.view(n_tokens, -1), target.view(n_tokens))

In [63]:
emb_sz, nh, nl = 300, 300, 2
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, input_p=0.6, output_p=0.4, weight_p=0.5,
                          embed_p=0.1, hidden_p=0.2)

In [64]:
cbs = [partial(AvgStatsCallback, accuracy_flat),
      CudaCallback, Recorder,
      partial(GradientClipping, clip=0.1),
      partial(RNNTrainer, alpha=2., beta=1.),
      ProgressCallback]

In [65]:
learn = Learner(model, data, cross_entropy_flat, lr=5e-3, cb_funcs=cbs, opt_func=adam_opt())

In [94]:
learn.fit(1)

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,5.141464,0.197553,4.608424,0.240792,46:43


# Export

In [66]:
!python notebook2script.py 12a_awd_lstm.ipynb

converted 12a_awd_lstm.ipynb to nb_12a.py
