<a href="https://colab.research.google.com/github/arnaujc91/experiments/blob/main/fixEmbeddingDropout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fastai==2.0.16

In [5]:
from fastai.text.all import *

In [6]:
# Deliberately tiny model so that the flattened layer list stays short and readable.
awd_lstm_original = AWD_LSTM(vocab_sz=3, emb_sz=5, n_hid=6, n_layers=2)

In [7]:
# Flatten the model into its list of layers, then display that list as the cell output.
modules = flatten_model(awd_lstm_original)
modules

[Embedding(3, 5, padding_idx=1),
 Embedding(3, 5, padding_idx=1),
 LSTM(5, 6, batch_first=True),
 ParameterModule(),
 LSTM(6, 5, batch_first=True),
 ParameterModule(),
 RNNDropout(),
 RNNDropout(),
 RNNDropout()]

### Problem

1. `flatten_model` contains duplicated layers:

In [8]:
# Compare the first two entries directly, then check the whole list for repeated objects.
first_two_equal = modules[0] == modules[1]
all_distinct = len(modules) == len(set(modules))
print('Are the first two layers the same? ', first_two_equal)
print('Are the layers unique? ', all_distinct)

Are the first two layers the same?  True
Are the layers unique?  False


This happens because `flatten_model` replaces every layer that has children with those children. In particular, `EmbeddingDropout` has the `nn.Embedding` layer as its child, so the second element of the list `modules` is not `EmbeddingDropout` but the very same `Embedding` *again*:

In [17]:
# Take the first child of the dropout wrapper and compare it with the model's embedding layer.
wrapped_embedding = next(awd_lstm_original.encoder_dp.children())
wrapped_embedding == awd_lstm_original.encoder

True

2. Forward hooks are not fired for the *Embedding* layers (the forward pass below prints no hook message):

In [10]:
def hook_fn(m, i, o):
    "Forward hook: print the name of module `m`; its input `i` and output `o` are ignored."
    print(f"Working for layer: -- {m._get_name()} --\n")

# Attach the printing hook to every flattened layer that owns parameters.
for layer in filter(has_params, flatten_model(awd_lstm_original)):
    layer.register_forward_hook(hook_fn)

# One forward pass on a random batch of token ids; any hook that fires prints its layer name.
awd_lstm_original(torch.randint(3, (1,4)))

tensor([[[ 0.0585,  0.0986, -0.0598,  0.0704,  0.0693],
         [ 0.0783,  0.1497, -0.0994,  0.1160,  0.1130],
         [ 0.0853,  0.1728, -0.1267,  0.1431,  0.1397],
         [ 0.0879,  0.1821, -0.1455,  0.1586,  0.1554]]],
       grad_fn=<TransposeBackward0>)

### Solution:

In [11]:
class EmbeddingDropout(nn.Embedding):
    "Apply dropout with probability `embed_p` to an embedding layer."
    def __init__(self, *args, embed_p, **kwargs):
        "Build the underlying `nn.Embedding` from `args`/`kwargs` and remember the dropout probability `embed_p`."
        super().__init__(*args, **kwargs)
        self.embed_p = embed_p

    def forward(self, words, scale=None):
        """Look up `words` in the embedding matrix.

        While training with a non-zero `embed_p`, the matrix is first multiplied by a dropout mask.
        A truthy `scale` multiplies the matrix used for the lookup; a falsy one (`None`, 0) leaves it as is.
        The stored parameter `self.weight` is never modified by this method.
        """
        if self.training and self.embed_p != 0:
            # `size` requests one mask entry per vocabulary row; the product broadcasts it over the embedding dim.
            size = (self.weight.size(0),1)
            mask = dropout_mask(self.weight.data, size, self.embed_p)
            masked_embed = self.weight * mask
        else: masked_embed = self.weight
        # Out-of-place on purpose: when dropout is inactive `masked_embed` *is* `self.weight`, so an in-place
        # `mul_` would raise on the leaf parameter, or permanently rescale it when run under `no_grad`.
        if scale: masked_embed = masked_embed * scale
        # Pass `padding_idx` through unchanged: `F.embedding` treats -1 as "last row", not as "no padding".
        return F.embedding(words, masked_embed, self.padding_idx, self.max_norm,
                           self.norm_type, self.scale_grad_by_freq, self.sparse)
        
class AWD_LSTM(Module):
    "AWD-LSTM inspired by https://arxiv.org/abs/1708.02182"
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1, hidden_p=0.2, input_p=0.6, embed_p=0.1,
                 weight_p=0.5, bidir=False):
        "Build the embedding, `n_layers` weight-dropped LSTMs and the dropout layers, then initialise the hidden state."
        store_attr('emb_sz,n_hid,n_layers,pad_token')
        self.bs = 1
        self.n_dir = 2 if bidir else 1
        # IMPORTANT: `EmbeddingDropout` inherits directly from `nn.Embedding`, so this single module is both the
        # lookup table and its dropout. The previous code was
        #     self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        #     self.encoder_dp = EmbeddingDropout(self.encoder, embed_p)
        # where `self.encoder` was only used to create `self.encoder_dp`. The code is now more compact with the
        # same functionality.
        self.encoder = EmbeddingDropout(vocab_sz, emb_sz, embed_p=embed_p, padding_idx=pad_token)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        # The first layer reads embeddings (`emb_sz`); every later layer reads `n_hid` features.
        self.rnns = nn.ModuleList([self._one_rnn(emb_sz if l == 0 else n_hid, self._layer_width(l),
                                                 bidir, weight_p, l) for l in range(n_layers)])
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for _ in range(n_layers)])
        self.reset()

    def forward(self, inp, from_embeds=False):
        "Run `inp` (token ids, or embeddings if `from_embeds`) through every layer and return the last layer's output"
        bs,_ = inp.shape[:2] if from_embeds else inp.shape
        # Resize the stored hidden state when the batch size differs from the previous call.
        if bs!=self.bs: self._change_hidden(bs)

        output = self.input_dp(inp if from_embeds else self.encoder(inp))
        new_hidden = []
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            output, new_h = rnn(output, self.hidden[l])
            new_hidden.append(new_h)
            # Hidden dropout is applied between layers only, not after the last one.
            if l != self.n_layers - 1: output = hid_dp(output)
        # Keep the new state for the next call, passed through `to_detach` first.
        self.hidden = to_detach(new_hidden, cpu=False, gather=False)
        return output

    def _change_hidden(self, bs):
        "Adapt every layer's hidden state to batch size `bs` and record `bs`"
        self.hidden = [self._change_one_hidden(l, bs) for l in range(self.n_layers)]
        self.bs = bs

    def _layer_width(self, l):
        "Per-direction size of layer `l`: `emb_sz` for the last layer, `n_hid` otherwise, divided by `n_dir`"
        return (self.n_hid if l != self.n_layers - 1 else self.emb_sz) // self.n_dir

    def _one_rnn(self, n_in, n_out, bidir, weight_p, l):
        "Return one of the inner rnn"
        rnn = nn.LSTM(n_in, n_out, 1, batch_first=True, bidirectional=bidir)
        return WeightDropout(rnn, weight_p)

    def _one_hidden(self, l):
        "Return one hidden state"
        nh = self._layer_width(l)
        return (one_param(self).new_zeros(self.n_dir, self.bs, nh), one_param(self).new_zeros(self.n_dir, self.bs, nh))

    def _change_one_hidden(self, l, bs):
        "Return layer `l`'s hidden state resized to batch size `bs`: zero-padded if larger, truncated if smaller"
        if self.bs < bs:
            nh = self._layer_width(l)
            return tuple(torch.cat([h, h.new_zeros(self.n_dir, bs-self.bs, nh)], dim=1) for h in self.hidden[l])
        if self.bs > bs: return (self.hidden[l][0][:,:bs].contiguous(), self.hidden[l][1][:,:bs].contiguous())
        return self.hidden[l]

    def reset(self):
        "Reset the hidden states"
        for rnn in self.rnns:
            if hasattr(rnn, 'reset'): rnn.reset()
        self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]

In [12]:
# Same tiny configuration as before, now built from the redefined classes above.
awd_lstm_modified = AWD_LSTM(vocab_sz=3, emb_sz=5, n_hid=6, n_layers=2)

In [13]:
# Flatten the modified model into its list of layers, then display that list as the cell output.
modules = flatten_model(awd_lstm_modified)
modules

[EmbeddingDropout(3, 5, padding_idx=1),
 LSTM(5, 6, batch_first=True),
 ParameterModule(),
 LSTM(6, 5, batch_first=True),
 ParameterModule(),
 RNNDropout(),
 RNNDropout(),
 RNNDropout()]

Duplication of layers?

In [14]:
# Compare the first two entries directly, then check the whole list for repeated objects.
first_two_equal = modules[0] == modules[1]
all_distinct = len(modules) == len(set(modules))
print('Are the first two layers the same? ', first_two_equal)
print('Are the layers unique? ', all_distinct)

Are the first two layers the same?  False
Are the layers unique?  True


Hooks fired for the Embedding layer?

In [15]:
def hook_fn(m, i, o):
    "Forward hook: print the name of module `m`; its input `i` and output `o` are ignored."
    print(f"Working for layer: -- {m._get_name()} --\n")

# Attach the printing hook to every flattened layer that owns parameters.
for layer in filter(has_params, flatten_model(awd_lstm_modified)):
    layer.register_forward_hook(hook_fn)

# One forward pass on a random batch of token ids; any hook that fires prints its layer name.
awd_lstm_modified(torch.randint(3, (1,4)))

Working for layer: -- EmbeddingDropout --



tensor([[[-0.0048,  0.0082, -0.0706,  0.0583, -0.0391],
         [-0.0160,  0.0125, -0.0939,  0.0878, -0.0568],
         [-0.0276,  0.0097, -0.1013,  0.1029, -0.0654],
         [-0.0388,  0.0047, -0.1059,  0.1119, -0.0689]]],
       grad_fn=<TransposeBackward0>)