# 不定长序列pad 处理

In [31]:
import torch
import random
import numpy as np
import torch.utils.data as data
import torchvision.transforms as transforms

In [39]:
def set_seed(seed):
    """Seed every random number generator used in this file for repeatable runs.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU, plus every CUDA
    device when CUDA is available), and switches cuDNN to deterministic mode.

    :param seed: integer seed shared by all generators.
    :return: None
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # seed all GPUs (multi-GPU runs)
    # The cuDNN flags only influence cuDNN kernels, so setting them on a
    # CPU-only machine is harmless.
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and possibly pick a different
    # algorithm on each run, which defeats the purpose of seeding.
    torch.backends.cudnn.benchmark = False


# Fix the random seed
set_seed(0)

In [33]:

"""
Reference:
https://www.cnblogs.com/sbj123456789/p/9834018.html
Shows how to use pad_sequence when sequence lengths vary.
Unequal-length sequences are made equal by padding with zeros. Padding every
sample to the longest sequence in the whole dataset is wasteful; when working
batch by batch it is enough to pad to the longest sequence of the current
batch. This is done by calling pad_sequence inside the collate_fn hook that is
passed to the DataLoader.
"""
# Toy dataset: eight 1-D float tensors whose lengths are 9, 7, 6, 5, 4, 3, 2, 1
train_x = [
    torch.FloatTensor(values)
    for values in (
        [1, 2, 3, 4, 5, 6, 7, 8, 9],
        [1, 2, 3, 4, 5, 6, 7],
        [2, 3, 4, 5, 6, 7],
        [3, 4, 5, 6, 7],
        [4, 5, 6, 7],
        [5, 6, 7],
        [6, 7],
        [7],
    )
]

In [34]:
class MyData(data.Dataset):
    """
    Thin dataset wrapper around an in-memory sequence of samples.

    No padding happens here; samples are returned exactly as stored and the
    zero-padding is left to the ``collate_fn`` given to the DataLoader.
    """

    def __init__(self, train_x) -> None:
        """Keep a reference to the samples.

        :param train_x: indexable collection of samples that supports ``len``.
        """
        self.train_x = train_x

    def __len__(self) -> int:
        """Return the number of stored samples."""
        samples = self.train_x
        return len(samples)

    def __getitem__(self, item):
        """Return the sample stored at index ``item``, unmodified."""
        samples = self.train_x
        return samples[item]

In [35]:
def collate_fn(train_data):
    """
    Pad one batch of variable-length sequences to the longest sequence in it.

    The samples are ordered from longest to shortest (the order that
    ``pack_padded_sequence`` requires by default), zero-padded on the right,
    and given a trailing feature dimension of size 1.

    :param train_data: list or tuple of tensors, one per sample. The argument
        itself is left untouched.
    :return: tuple ``(padded, lengths)``. For 1-D samples ``padded`` has shape
        ``(batch, max_len, 1)``; ``lengths`` holds the original sequence
        lengths in descending order.
    """
    # sorted() builds a new list, so the caller's batch is not reordered in
    # place and tuples are accepted as well.
    ordered = sorted(train_data, key=len, reverse=True)
    data_length = [len(sample) for sample in ordered]
    padded = torch.nn.utils.rnn.pad_sequence(ordered, batch_first=True, padding_value=0)
    # Append the feature dimension expected by an LSTM with input_size=1.
    return padded.unsqueeze(-1), data_length

In [38]:
# Define the network: one input feature per time step, hidden size 5
net = torch.nn.LSTM(1, 5, batch_first=True)

train_data = MyData(train_x)

# Batch the data; collate_fn pads every batch to its own longest sequence
train_dataloader = data.DataLoader(train_data, batch_size=2, collate_fn=collate_fn)
for data_input, length in train_dataloader:
    # Pack the zero-padded batch so the LSTM does not process the padding
    data_ = torch.nn.utils.rnn.pack_padded_sequence(data_input, length, batch_first=True)
    print('the data_ is: \n', data_)
    output, (ht, ct) = net(data_)
    print("output is: ", output)
    # Unpack the result back into a zero-padded tensor. batch_first=True keeps
    # the (batch, seq, hidden) layout used by the input and by the LSTM;
    # without it the result comes back as (seq, batch, hidden).
    output_, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
    print(output_.shape)


def blstm():
    """Run a 4-layer bidirectional LSTM on random input and print shapes.

    Prints, in order: the shape of the output tensor, the shape of the last
    slice along dim 0 of the final hidden state, and the shape of the final
    cell state.

    :return: None
    """
    layers, directions = 4, 2
    batch, steps, features, hidden = 2, 6, 3, 5

    lstm = torch.nn.LSTM(
        input_size=features,
        hidden_size=hidden,
        num_layers=layers,
        batch_first=True,
        bidirectional=True,
    )
    x = torch.rand(batch, steps, features)

    # Initial states are shaped (num_layers * num_directions, batch, hidden_size)
    state_shape = (layers * directions, batch, hidden)
    h0 = torch.zeros(state_shape)
    c0 = torch.zeros(state_shape)
    outputs, (h_n, c_n) = lstm(x, (h0, c0))

    print(outputs.shape)
    print(h_n[-1].shape)
    print(c_n.shape)


blstm()


the data_ is: 
 PackedSequence(data=tensor([[1.],
        [1.],
        [2.],
        [2.],
        [3.],
        [3.],
        [4.],
        [4.],
        [5.],
        [5.],
        [6.],
        [6.],
        [7.],
        [7.],
        [8.],
        [9.]]), batch_sizes=tensor([2, 2, 2, 2, 2, 2, 2, 1, 1]), sorted_indices=None, unsorted_indices=None)
output is:  PackedSequence(data=tensor([[ 0.1052, -0.1472,  0.0265,  0.1616,  0.2554],
        [ 0.1052, -0.1472,  0.0265,  0.1616,  0.2554],
        [ 0.2213, -0.2098,  0.0769,  0.2237,  0.4416],
        [ 0.2213, -0.2098,  0.0769,  0.2237,  0.4416],
        [ 0.3324, -0.2415,  0.1375,  0.2272,  0.5453],
        [ 0.3324, -0.2415,  0.1375,  0.2272,  0.5453],
        [ 0.4152, -0.2645,  0.1953,  0.1752,  0.5986],
        [ 0.4152, -0.2645,  0.1953,  0.1752,  0.5986],
        [ 0.4578, -0.2880,  0.2404,  0.0654,  0.6206],
        [ 0.4578, -0.2880,  0.2404,  0.0654,  0.6206],
        [ 0.4656, -0.3140,  0.2682, -0.0961,  0.6211],
        