In [1]:
1+1

2

In [53]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

from torchnlp.modules.transformer.sublayers import MultiHeadAttention, PositionwiseFeedForward
from torchnlp.modules.normalization import LayerNorm

class EncoderLayer(nn.Module):
    """
    One block of the Transformer encoder: a self-attention sub-layer followed by
    a position-wise feed-forward sub-layer, each combined with a residual connection.
    See Fig. 1 in https://arxiv.org/pdf/1706.03762.pdf
    NOTE: Layer normalization is applied to the input of each sub-layer (not its
    output), as per the latest version of T2T.
    """
    def __init__(self, hidden_size, total_key_depth, total_value_depth, filter_size, num_heads,
                 bias_mask=None, layer_dropout=0.0, attention_dropout=0.0, relu_dropout=0.0):
        """
        Parameters:
            hidden_size: Hidden size; also used as the output size of both sub-layers
            total_key_depth: Size of last dimension of keys. Must be divisible by num_heads
            total_value_depth: Size of last dimension of values. Must be divisible by num_heads
            filter_size: Hidden size of the middle layer in FFN
            num_heads: Number of attention heads
            bias_mask: Masking tensor to prevent connections to future elements
            layer_dropout: Dropout probability applied after each residual sum in this layer
            attention_dropout: Dropout probability after attention (Should be non-zero only during training)
            relu_dropout: Dropout probability after relu in FFN (Should be non-zero only during training)
        """
        super(EncoderLayer, self).__init__()

        # Self-attention sub-layer. hidden_size is passed twice: as the first
        # argument and as the fourth (presumably the output depth - TODO confirm
        # against MultiHeadAttention's signature).
        self.multi_head_attention = MultiHeadAttention(
            hidden_size, total_key_depth, total_value_depth, hidden_size,
            num_heads, bias_mask, attention_dropout)

        # Feed-forward sub-layer: hidden_size -> filter_size -> hidden_size.
        self.positionwise_feed_forward = PositionwiseFeedForward(
            hidden_size, filter_size, hidden_size,
            layer_config='cc', padding = 'both', dropout=relu_dropout)

        # A single dropout module is shared by both residual branches.
        self.dropout = nn.Dropout(layer_dropout)

        # Separate normalization modules for the two sub-layers.
        self.layer_norm_mha = LayerNorm(hidden_size)
        self.layer_norm_ffn = LayerNorm(hidden_size)

    def forward(self, inputs):
        """
        Run the layer on `inputs`.

        Returns a 4-tuple:
            the layer output,
            the normalized tensor that was fed to the attention sub-layer,
            and the second and third values returned by the attention sub-layer
            (named `attn` and `bias_mask` here).
        """
        # Normalize first; this tensor is handed back to the caller as well.
        normed_inputs = self.layer_norm_mha(inputs)

        # Self-attention: the same normalized tensor is passed as all three arguments.
        attn_out, attn, bias_mask = self.multi_head_attention(
            normed_inputs, normed_inputs, normed_inputs)

        # Residual sum with the un-normalized input, then dropout over the sum.
        hidden = self.dropout(inputs + attn_out)

        # Second sub-layer: normalize, then position-wise feed-forward.
        ffn_out = self.positionwise_feed_forward(self.layer_norm_ffn(hidden))

        # Second residual sum, again followed by dropout.
        outputs = self.dropout(hidden + ffn_out)

        return outputs, normed_inputs, attn, bias_mask

In [60]:
# Module-level accumulators that `hook` appends to on every call.
model_names, atten_hook = [], []


def hook(model, in_f, out_f):
    """
    Forward-hook callback: announce the call, then record the class of the
    hooked module and the (input, output) pair it was called with.
    """
    print("hooking")
    captured = (in_f, out_f)
    model_names.append(model.__class__)
    atten_hook.append(captured)

In [62]:
# Small encoder layer for the hook experiment; keyword arguments make the
# meaning of each size explicit.
tt = EncoderLayer(
    hidden_size=30,
    total_key_depth=30,
    total_value_depth=30,
    filter_size=20,
    num_heads=1,
)

In [66]:
# Attach `hook` to the attention sub-module so it fires on every forward pass
# through that module. The returned handle is displayed as the cell output.
tt.multi_head_attention.register_forward_hook(hook=hook)

<torch.utils.hooks.RemovableHandle at 0x7f6b62ddad10>

In [67]:
# Random test input. NOTE(review): presumably (batch=1, seq_len=2, hidden=30),
# with 30 matching the hidden size `tt` was built with - confirm.
x = torch.randn(1, 2, 30)
# Running the layer triggers the registered forward hook, which prints "hooking".
output=tt(x)

hooking


In [65]:
atten_hook

[]

In [68]:
type(tt)

__main__.EncoderLayer

In [72]:
# Walk every sub-module of `tt` (the root module has an empty name) and report
# whether each one is a MultiHeadAttention instance. The result must be printed
# explicitly: a bare expression inside a loop body is evaluated and discarded,
# so it would produce no output.
for name, m in tt.named_modules():
    print(name, isinstance(m, MultiHeadAttention))

 False
multi_head_attention True
multi_head_attention.query_linear False
multi_head_attention.key_linear False
multi_head_attention.value_linear False
multi_head_attention.output_linear False
multi_head_attention.dropout False
positionwise_feed_forward False
positionwise_feed_forward.layers False
positionwise_feed_forward.layers.0 False
positionwise_feed_forward.layers.0.pad False
positionwise_feed_forward.layers.0.conv False
positionwise_feed_forward.layers.1 False
positionwise_feed_forward.layers.1.pad False
positionwise_feed_forward.layers.1.conv False
positionwise_feed_forward.relu False
positionwise_feed_forward.dropout False
dropout False
layer_norm_mha False
layer_norm_ffn False


In [73]:
pwd

'/home/zijiao/research/atal'

In [76]:
from torchnlp.data.conll import conll2000_dataset

In [77]:
# Load the CoNLL-2000 data. NOTE(review): 50 is presumably the batch size -
# confirm against conll2000_dataset's signature.
con = conll2000_dataset(50)

# Inspection only: a bare expression that is not the last line of a cell displays nothing.
type(con)

# The returned object is indexed by key; 'iters' presumably holds the data iterators.
iters = con['iters']

# The first entry is treated as the training iterator here.
train_iter = iters[0]

# Inspection only (see above).
type(train_iter)

# NOTE(review): Iterable is imported but never used in the visible cells.
from collections.abc import Iterable

# Materialize one full pass over the iterator into a list of batches.
train_data = list(train_iter)

In [97]:
train_data[0].inputs_word

tensor([[   2,  686,    7,  ...,    1,    1,    1],
        [   2,   11,  148,  ...,    1,    1,    1],
        [   2, 1090,   14,  ...,    1,    1,    1],
        ...,
        [   2, 2894,  224,  ...,    1,    1,    1],
        [   2,    5,  183,  ...,    1,    1,    1],
        [   2,    5, 1298,  ...,    1,    1,    1]], device='cuda:0')

In [98]:
train_data[0].inputs_char

tensor([[[ 2,  2,  3,  ...,  1,  1,  1],
         [ 2, 34,  7,  ...,  1,  1,  1],
         [ 2,  7, 19,  ...,  1,  1,  1],
         ...,
         [ 1,  1,  1,  ...,  1,  1,  1],
         [ 1,  1,  1,  ...,  1,  1,  1],
         [ 1,  1,  1,  ...,  1,  1,  1]],

        [[ 2,  2,  3,  ...,  1,  1,  1],
         [ 2, 36,  9,  ...,  1,  1,  1],
         [ 2, 52,  6,  ...,  1,  1,  1],
         ...,
         [ 1,  1,  1,  ...,  1,  1,  1],
         [ 1,  1,  1,  ...,  1,  1,  1],
         [ 1,  1,  1,  ...,  1,  1,  1]],

        [[ 2,  2,  3,  ...,  1,  1,  1],
         [ 2, 37, 11,  ...,  1,  1,  1],
         [ 2, 28, 10,  ...,  1,  1,  1],
         ...,
         [ 1,  1,  1,  ...,  1,  1,  1],
         [ 1,  1,  1,  ...,  1,  1,  1],
         [ 1,  1,  1,  ...,  1,  1,  1]],

        ...,

        [[ 2,  2,  3,  ...,  1,  1,  1],
         [ 2, 36, 18,  ...,  1,  1,  1],
         [ 2, 34,  7,  ...,  1,  1,  1],
         ...,
         [ 1,  1,  1,  ...,  1,  1,  1],
         [ 1,  1,  1, 

In [96]:
# Where are the tags? They appear to be stored in the batch's `labels` field.
train_data[0].labels

tensor([[2, 5, 8,  ..., 1, 1, 1],
        [2, 6, 5,  ..., 1, 1, 1],
        [2, 5, 5,  ..., 1, 1, 1],
        ...,
        [2, 5, 4,  ..., 1, 1, 1],
        [2, 5, 4,  ..., 1, 1, 1],
        [2, 5, 4,  ..., 1, 1, 1]], device='cuda:0')

In [99]:
# Second, independent load of the same dataset with the same argument, so its
# batches can be compared against `train_data` from the first load.
con2 = conll2000_dataset(50)

iters_2 = con2['iters']

# First entry is again treated as the training iterator.
train_iter_2 = iters_2[0]


# NOTE(review): repeated import; Iterable is not used in the visible cells.
from collections.abc import Iterable

# One full pass over the second iterator, collected into a list of batches.
train_data_2 = list(train_iter_2)

In [100]:
train_data[0].inputs_word

tensor([[   2,  686,    7,  ...,    1,    1,    1],
        [   2,   11,  148,  ...,    1,    1,    1],
        [   2, 1090,   14,  ...,    1,    1,    1],
        ...,
        [   2, 2894,  224,  ...,    1,    1,    1],
        [   2,    5,  183,  ...,    1,    1,    1],
        [   2,    5, 1298,  ...,    1,    1,    1]], device='cuda:0')

In [101]:
train_data_2[0].inputs_word

tensor([[   2,  686,    7,  ...,    1,    1,    1],
        [   2,   11,  148,  ...,    1,    1,    1],
        [   2, 1090,   14,  ...,    1,    1,    1],
        ...,
        [   2, 2894,  224,  ...,    1,    1,    1],
        [   2,    5,  183,  ...,    1,    1,    1],
        [   2,    5, 1298,  ...,    1,    1,    1]], device='cuda:0')

In [103]:
train_data[1].inputs_word, train_data_2[1].inputs_word

(tensor([[   2,   10,    9,  ...,    1,    1,    1],
         [   2,  477, 3164,  ...,    1,    1,    1],
         [   2,    5, 2165,  ...,    1,    1,    1],
         ...,
         [   2,   36,   62,  ...,    1,    1,    1],
         [   2,   35, 4343,  ...,    1,    1,    1],
         [   2,  449,  348,  ...,    1,    1,    1]], device='cuda:0'),
 tensor([[   2,   10,    9,  ...,    1,    1,    1],
         [   2,  477, 3164,  ...,    1,    1,    1],
         [   2,    5, 2165,  ...,    1,    1,    1],
         ...,
         [   2,   36,   62,  ...,    1,    1,    1],
         [   2,   35, 4343,  ...,    1,    1,    1],
         [   2,  449,  348,  ...,    1,    1,    1]], device='cuda:0'))

In [104]:
%load_ext autoreload
%autoreload 2

In [None]:
train

In [105]:
from torchnlp.data.conll import conll2000_dataset

In [106]:
# Rebuild the second dataset after autoreload was enabled and
# conll2000_dataset was re-imported; this overwrites the earlier con2 / iters_2 /
# train_iter_2 / train_data_2 values.
con2 = conll2000_dataset(50)

iters_2 = con2['iters']

# First entry is again treated as the training iterator.
train_iter_2 = iters_2[0]


# NOTE(review): repeated import; Iterable is not used in the visible cells.
from collections.abc import Iterable

# One full pass over the rebuilt iterator, collected into a list of batches.
train_data_2 = list(train_iter_2)

In [107]:
train_data_2[0].inputs_word

tensor([[   2,  686,    7,  ...,    1,    1,    1],
        [   2,   11,  148,  ...,    1,    1,    1],
        [   2, 1090,   14,  ...,    1,    1,    1],
        ...,
        [   2, 2894,  224,  ...,    1,    1,    1],
        [   2,    5,  183,  ...,    1,    1,    1],
        [   2,    5, 1298,  ...,    1,    1,    1]], device='cuda:0')

In [132]:
# Turn shuffling off on the iterator in place. NOTE(review): this mutates the
# existing iterator object; presumably it only affects passes started afterwards - confirm.
train_iter_2.shuffle = False

In [133]:
# Re-initialize the iterator's epoch state after changing `shuffle`.
# NOTE(review): presumably needed for the new setting to take effect - confirm in the iterator's docs.
train_iter_2.init_epoch()

In [134]:
# A further full pass over the same iterator, kept under a new name so its
# batches can be compared with the earlier `train_data_2`.
train_data_2_2 = list(train_iter_2)

In [135]:
train_data_2_2[0].inputs_word

tensor([[   2,  686,    7,  ...,    1,    1,    1],
        [   2,   11,  148,  ...,    1,    1,    1],
        [   2, 1090,   14,  ...,    1,    1,    1],
        ...,
        [   2, 2894,  224,  ...,    1,    1,    1],
        [   2,    5,  183,  ...,    1,    1,    1],
        [   2,    5, 1298,  ...,    1,    1,    1]], device='cuda:0')

- [x] finished the data shuffle problem
- [ ] make the training iterator shuffle and the evaluation iterator not shuffle
- [ ] move the attention-saving step into the evaluation part
- [ ] test load evaluation
- [ ] finish the adversarial loss and start training
REST
---
- [ ] use random pre attention as baseline, to compare the divergence
- [ ] use above to test guiding classifiers