pad 和 pack 是如何实现的？先来段程序观察一下数据的情况

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

# Padded batch in batch-first layout: 10 sequences x 20 steps x 30 features.
x = Variable(torch.randn(10, 20, 30)).cuda()
# One length per sequence, in decreasing order: 20, 19, ..., 11.
lens = range(11, 21)[::-1]
print(lens)
x = pack_padded_sequence(x, lens, batch_first=True)
# The packed data is flat: one row per real (non-padding) step, sum(lens) rows in total.
print x.data.size()
# batch_sizes[t] is the number of sequences that still have data at step t.
print x.batch_sizes

lstm = nn.LSTM(30, 50, batch_first=True).cuda()
# Initial states of shape (1, 10, 50): 10 matches the batch, 50 the LSTM hidden size.
h0 = Variable(torch.zeros(1, 10, 50)).cuda()
c0 = Variable(torch.zeros(1, 10, 50)).cuda()

# With a PackedSequence as input, the per-step output is a PackedSequence as well;
# the final hidden and cell states come back as ordinary Variables.
packed_h, (packed_h_t, packed_c_t) = lstm(x, (h0, c0))
print packed_h.data.size(), type(packed_h)
print packed_h_t.size(), type(packed_h_t)
print packed_c_t.size(), type(packed_c_t)
# No batch_first argument is passed here, so the default (batch_first=False) applies.
h, _ = pad_packed_sequence(packed_h) 
print h.size() # Size 20 x 10 x 50 instead of 10 x 20 x 50:
# even though the input PackedSequence was built with batch_first=True,
# the padded output is still step-first.
print type(h)

[20, 19, 18, 17, 16, 15, 14, 13, 12, 11]
torch.Size([155, 30])
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
torch.Size([155, 50]) <class 'torch.nn.utils.rnn.PackedSequence'>
torch.Size([1, 10, 50]) <class 'torch.autograd.variable.Variable'>
torch.Size([1, 10, 50]) <class 'torch.autograd.variable.Variable'>
torch.Size([20, 10, 50])
<class 'torch.autograd.variable.Variable'>


In [2]:
# Inspect the packed batch: a flat data tensor plus the list of per-step batch sizes.
print(x)

PackedSequence(data=Variable containing:
 0.2981 -0.3993  1.3740  ...   1.0439  0.5730  0.2853
 0.5220  0.8148 -0.3350  ...   0.4287  3.6346 -0.9161
 0.6322  0.8707 -1.6416  ...   0.1630 -0.9379 -0.8117
          ...             ⋱             ...          
 0.1682 -0.3832  0.7674  ...   0.6986  0.9199  0.9385
 0.4829 -0.4175 -1.5790  ...  -0.5006 -2.2780 -1.6367
 2.1415  1.8825  0.5418  ...   1.3033 -1.0589  1.3188
[torch.cuda.FloatTensor of size 155x30 (GPU 0)]
, batch_sizes=[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])


In [3]:
# Unpack back to padded form; the result is a tuple whose first item is the
# padded Variable (step-first here, since batch_first is not passed).
t = pad_packed_sequence(x)
print(t)

(Variable containing:
(0 ,.,.) = 
  0.2981 -0.3993  1.3740  ...   1.0439  0.5730  0.2853
  0.5220  0.8148 -0.3350  ...   0.4287  3.6346 -0.9161
  0.6322  0.8707 -1.6416  ...   0.1630 -0.9379 -0.8117
           ...             ⋱             ...          
  0.4115 -1.4341  1.0239  ...   1.5278 -0.8339 -0.3697
  0.4981  1.2309  1.2682  ...   0.3588  0.0668  0.5734
  0.0620 -0.4905  0.7230  ...   1.1195 -1.1458 -0.4309

(1 ,.,.) = 
 -0.4845  0.8116  1.1948  ...   1.5190 -0.2912 -0.2472
 -0.4461  2.9652  0.6847  ...  -1.5527  0.5092  0.6569
 -1.3457  1.3938  0.7274  ...   1.3539 -0.6283  0.5273
           ...             ⋱             ...          
 -1.6433  0.6367  0.1309  ...  -0.7696 -1.0221 -1.7932
 -0.6540 -0.7953 -2.2092  ...  -0.2899 -1.3269 -1.2289
  0.4172  1.0223 -1.2531  ...   1.1317  0.9870 -0.8640

(2 ,.,.) = 
 -0.9000  0.2971  0.2086  ...  -1.0964 -1.9530  0.5754
 -2.0922  1.2616  0.3765  ...   0.1584  1.0004 -0.9046
  0.3324  0.0206  0.5188  ...   0.0596  1.8765 -1.8499
     

从上面的程序来看，pack_padded_sequence 会把 sequence 的 steps 和 batch size 两个维度根据 lengths 合并：输入（batch_first=True 时会先转置）从 (steps, batch size, [other dim]) 变成 (sum(lengths), [other dim])。下面先演示一下源码中用到的 reversed 迭代器的行为，再看看源码怎么写的：

In [34]:
# reversed() returns a one-shot iterator over the list, starting from the last
# element: next(b) consumes that element, so list(b) only sees what is left,
# while the list `a` itself is unchanged.
a = [1,2,3]
b = reversed(a)
print(next(b))
print(list(b))
print(a[-1])

3
[2, 1]
3


In [4]:
def pack_padded_sequence(input, lengths, batch_first=False):
    """Packs a Variable containing padded sequences of variable length.

    Input can be of size ``TxBx*`` where T is the length of the longest sequence
    (equal to ``lengths[0]``), B is the batch size, and * is any number of
    dimensions (including 0). If ``batch_first`` is True ``BxTx*`` inputs are expected.

    The sequences should be sorted by length in a decreasing order, i.e.
    ``input[:,0]`` should be the longest sequence, and ``input[:,B-1]`` the
    shortest one.

    Note:
        This function accepts any input that has at least two dimensions. You
        can apply it to pack the labels, and use the output of the RNN with
        them to compute the loss directly. A Variable can be retrieved from
        a :class:`PackedSequence` object by accessing its ``.data`` attribute.

    Arguments:
        input (Variable): padded batch of variable length sequences.
        lengths (list[int]): list of sequences lengths of each batch element.
        batch_first (bool, optional): if True, the input is expected in BxTx*
            format.

    Returns:
        a :class:`PackedSequence` object

    Raises:
        ValueError: if ``lengths`` is empty, does not have exactly one entry
            per batch element, contains a non-positive length, or is not
            sorted in decreasing order.
    """
    if batch_first:
        input = input.transpose(0, 1)

    # Validate the size before consuming the iterator so that a wrong-sized
    # (including empty) ``lengths`` reports a ValueError, not StopIteration.
    batch_size = input.size(1)
    if len(lengths) != batch_size:
        raise ValueError("lengths array has incorrect size")

    lengths_iter = reversed(lengths)
    # current_length starts at the shortest sequence length (the last entry).
    current_length = next(lengths_iter, None)
    if current_length is None:
        raise ValueError("lengths array must not be empty")
    # Steps are numbered from 1, so a length <= 0 would never match a step:
    # the batch would never shrink and padding would be packed silently.
    if current_length <= 0:
        raise ValueError("length of all samples has to be greater than 0, "
                         "but found an element in 'lengths' that is <= 0")

    steps = []
    batch_sizes = []
    for step, step_value in enumerate(input, 1):
        # Only the first ``batch_size`` sequences still have data at this step.
        steps.append(step_value[:batch_size])
        batch_sizes.append(batch_size)

        # When step equals current_length, some sequences end at this step.
        # The while loop counts how many sequences have just ended and
        # shrinks the batch size used for the next step accordingly.
        while step == current_length:
            try:
                new_length = next(lengths_iter)
            except StopIteration:
                # Every sequence has been fully consumed.
                current_length = None
                break

            if current_length > new_length:  # remember that new_length is the preceding length in the array
                raise ValueError("lengths array has to be sorted in decreasing order")
            batch_size -= 1
            current_length = new_length
        if current_length is None:
            break
    return PackedSequence(torch.cat(steps), batch_sizes)

In [5]:
def pad_packed_sequence(sequence, batch_first=False):
    """Turns a packed batch back into a zero-padded batch.

    This undoes :func:`pack_padded_sequence`.

    The padded result has size TxBx*, with T the number of time steps (the
    length of the longest sequence) and B the batch size. When
    ``batch_first`` is True the result is transposed to BxTx*.

    Batch elements are ordered by decreasing length.

    Arguments:
        sequence (PackedSequence): the packed batch, unpacked here as a
            ``(data, batch_sizes)`` pair.
        batch_first (bool, optional): if True, return the output as BxTx*.

    Returns:
        Tuple of the padded Variable and the list of lengths of each
        sequence in the batch.
    """
    packed, sizes = sequence
    widest = sizes[0]
    # Zero-filled buffer of the same tensor type as the packed data.
    padded = Variable(
        packed.data.new(len(sizes), widest, *packed.size()[1:]).zero_())

    ascending = []  # sequence lengths, shortest first
    cursor = 0
    previous = sizes[0]
    for step, width in enumerate(sizes):
        # The next ``width`` packed rows belong to this time step.
        stop = cursor + width
        padded[step, :width] = packed[cursor:stop]
        cursor = stop

        # Sequences that dropped out before this step have length ``step``.
        finished = previous - width
        if finished > 0:
            ascending += [step] * finished
        previous = width
    # Sequences still present at the last step span every step.
    ascending += [step + 1] * width
    lengths = ascending[::-1]

    if batch_first:
        return padded.transpose(0, 1), lengths
    return padded, lengths

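pack_padded_sequence 是按 time step 逐步处理的：每个 step 只保留还没有结束的前 batch_size 条序列，并把该 step 的 batch size 记录到 batch_sizes 中；pad_packed_sequence 则反过来遍历 batch_sizes（由 pack_padded_sequence 得到），按每个 step 的 batch size 从扁平的 data 中切出对应的行，填回补零后的张量。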

最后还有一个问题， 就是RNN是如何处理packedsequence作为输入的情况呢？追踪一下源码，可以发现是用的VariableRecurrent，源码如下：

In [None]:
def VariableRecurrent(batch_sizes, inner):
    """Builds a step-by-step recurrence over packed (variable-length) input.

    Arguments:
        batch_sizes: number of rows of the packed input consumed at each step.
        inner: the per-step cell, called as ``inner(step_input, hidden, *weight)``.

    Returns:
        A ``forward(input, hidden, weight)`` function that returns
        ``(hidden, output)``: the final state of every batch element and the
        per-step outputs concatenated along dimension 0.
    """
    def forward(input, hidden, weight):
        # A single (non-tuple) state is wrapped so both cases share one code path.
        single_state = not isinstance(hidden, tuple)
        state = (hidden,) if single_state else hidden

        step_outputs = []
        retired = []  # final states of sequences that have already ended
        cursor = 0
        active = batch_sizes[0]
        for width in batch_sizes:
            chunk = input[cursor:cursor + width]
            cursor += width

            # The batch shrank: set aside the trailing states, which belong
            # to the sequences that ended, and keep going with the rest.
            finished = active - width
            if finished > 0:
                retired.append(tuple(s[-finished:] for s in state))
                state = tuple(s[:-finished] for s in state)
            active = width

            if single_state:
                state = (inner(chunk, state[0], *weight),)
            else:
                state = inner(chunk, state, *weight)

            step_outputs.append(state[0])

        # The longest sequences were retired last but come first in the
        # batch, so stitch the saved pieces together in reverse order.
        retired.append(state)
        final_state = tuple(
            torch.cat(parts, 0) for parts in zip(*reversed(retired)))
        assert final_state[0].size(0) == batch_sizes[0]
        if single_state:
            final_state = final_state[0]

        return final_state, torch.cat(step_outputs, 0)

    return forward

可以看出这里也是按 step 把输入展开，即把 packed sequence 逐步恢复成每个 step 的 mini batch；由于 batch 中各个 sequence 的长度不同，不同 step 的 batch size 也不同。当某个 step 的 batch size 变小时，就需要把已经结束的序列对应的 hidden 裁剪下来并保存，最后再把这些 hidden 按原来的 batch 顺序拼接回去。