# Parameters

* *class torch.nn.Parameter()*    
        data    
        requires_grad(bool)

# Containers

* *class torch.nn.Module*   
        add_module(name, module)     
        children()   
        cpu()     
        cuda()   
        train()   
        eval()   
        load_state_dict()   
        modules()   
        parameters()  
        state_dict()   
        zero_grad()  

* *class torch.nn.Sequential()*

* *class torch.nn.ModuleList()*

In [1]:
import torch
import torch.nn as nn

In [2]:

class MyModule(nn.Module):
    """Toy module demonstrating nn.ModuleList: a stack of ten Linear(10, 10) layers."""

    def __init__(self):
        super().__init__()
        # Holding the layers in an nn.ModuleList registers them as submodules,
        # which is why they show up in the module's printed representation.
        self.linears = nn.ModuleList([nn.Linear(10, 10) for _ in range(10)])

    def forward(self, x):
        """Run x through the stack, mixing each layer with the layer at index i // 2.

        Args:
            x: tensor whose last dimension is 10 (the Linear in_features).

        Returns:
            Tensor of the same shape as x.
        """
        # A ModuleList can be indexed like a list as well as iterated.
        for i, l in enumerate(self.linears):
            # The indexed layer has to be *called* on x; adding the module
            # object itself to a tensor raises TypeError.
            x = self.linears[i // 2](x) + l(x)
        return x


mynet = MyModule()
mynet

MyModule(
  (linears): ModuleList(
    (0): Linear(in_features=10, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=10, bias=True)
    (2): Linear(in_features=10, out_features=10, bias=True)
    (3): Linear(in_features=10, out_features=10, bias=True)
    (4): Linear(in_features=10, out_features=10, bias=True)
    (5): Linear(in_features=10, out_features=10, bias=True)
    (6): Linear(in_features=10, out_features=10, bias=True)
    (7): Linear(in_features=10, out_features=10, bias=True)
    (8): Linear(in_features=10, out_features=10, bias=True)
    (9): Linear(in_features=10, out_features=10, bias=True)
  )
)

# 卷积 Conv

## Conv1d

**class torch.nn.Conv1d(in_channels,out_channels,kernel_size,  
           stride=1,padding=0,dilation=1,groups=1,bias=True)**
$$ out(N_i, C_{out_j})=bias(C_{out_j})+\sum^{C_{in}-1}_{k=0}weight(C_{out_j},k)\bigotimes input(N_i,k) $$
* 参数说明：   
    in_channels(int) – 输入信号的通道  
    out_channels(int) – 卷积产生的通道   
    kernel_size(int or tuple) - 卷积核的尺寸   
    stride(int or tuple, optional) - 卷积步长   
    padding (int or tuple, optional)- 输入的每一条边补充0的层数   
    dilation(int or tuple, optional) – 卷积核元素之间的间距   
    groups(int, optional) – 从输入通道到输出通道的阻塞连接数   
    bias(bool, optional) - 如果bias=True，添加偏置   
卷积核大小为 （kernel_size,embedding_size)
   
输入: (N,C_in,L_in)    
输出: (N,C_out,L_out)   
输入输出的计算方式：   
$$L_{out}=floor((L_{in}+2padding-dilation(kernel\_size-1)-1)/stride+1)$$

In [4]:
# 1-D convolution: 16 input channels -> 33 output channels, kernel size 3, stride 1.
m=nn.Conv1d(16,33,3,stride=1)
# Random input laid out as (N=20, C_in=16, L_in=50).
input = torch.randn(20,16,50)
output = m(input)
print(output.size())

# L_out = (L_in - kernel_size)/stride + 1 = (50-3)/1 + 1 = 48

torch.Size([20, 33, 48])


In [7]:
# Shape of the learned kernel of the Conv1d above: (out_channels, in_channels, kernel_size).
m.weight.data.size()

torch.Size([33, 16, 3])

## Conv2d

**class torch.nn.Conv2d(in_channels,out_channels,kernel_size,   
    stride=1,padding=0,dilation=1,groups=1,bias=True)**
* 参数说明同Conv1d
* shape:   
input: (N,C_in,H_in,W_in)   
output: (N,C_out,H_out,W_out)   

$$H_{out}=floor((H_{in}+2padding[0]-dilation[0](kernel\_size[0]-1)-1)/stride[0]+1)$$   

$$W_{out}=floor((W_{in}+2padding[1]-dilation[1](kernel\_size[1]-1)-1)/stride[1]+1)$$

In [12]:
# 3x3 convolution with stride=1 and padding=1 keeps H and W unchanged:
# (32 + 2*1 - 3)/1 + 1 = 32.
m =nn.Conv2d(16,32,3,stride=1,padding=1)
# Input laid out as (N=20, C_in=16, H_in=32, W_in=32).
input = torch.randn(20,16,32,32)
output = m(input)
print(output.size())

torch.Size([20, 32, 32, 32])


## Conv3d

**class torch.nn.Conv3d(in_channels,out_channels,kernel_size, stride=1,padding=0,dilation=1,groups=1,bias=True)**
* 参数说明同Conv2d  
* shape：   
input: (N,C_in,D_in,H_in,W_in)   
output: (N,C_out,D_out,H_out,W_out) 

$$D_{out}=floor((D_{in}+2padding[0]-dilation[0](kernel\_size[0]-1)-1)/stride[0]+1)$$  

$$H_{out}=floor((H_{in}+2padding[1]-dilation[1](kernel\_size[1]-1)-1)/stride[1]+1)$$

$$W_{out}=floor((W_{in}+2padding[2]-dilation[2](kernel\_size[2]-1)-1)/stride[2]+1)$$

In [15]:
# 3x3x3 convolution without padding: every spatial dimension shrinks by
# kernel_size - 1 = 2, i.e. (32, 32, 28) -> (30, 30, 26).
m =nn.Conv3d(16,32,3,stride=1,padding=0)
# Input laid out as (N=20, C_in=16, D_in=32, H_in=32, W_in=28).
input = torch.randn(20,16,32,32,28)
output = m(input)
print(output.size())

torch.Size([20, 32, 30, 30, 26])


## ConvTranspose1d   
1维的解卷积操作

**class torch.nn.ConvTranspose1d(in_channels,out_channels,kernel_size,
stride=1,padding=0,output_padding=0,groups=1,bias=True)**
 
* shape:   
input: (N,C_in,L_in)   
output: (N,C_out,L_out)  
$$L_{out}=(L_{in}-1)stride-2padding+kernel\_size+output\_padding$$

In [16]:
# 1-D transposed convolution: 20 input channels -> 16 output channels, kernel size 3.
dconv = nn.ConvTranspose1d(20,16,3)
# Input laid out as (N=32, C_in=20, L_in=18).
input =torch.randn(32,20,18)
output=dconv(input)
print(output.size())
# L_out = (L_in - 1)*stride + kernel_size = (18-1)*1 + 3 = 20

torch.Size([32, 16, 20])


## ConvTranspose2d

**class torch.nn.ConvTranspose2d(in_channels,out_channels,kernel_size, stride=1,padding=0,output_padding=0,groups=1,bias=True)**
* 参数说明同ConvTranspose1d
* shape:    
input: (N,C_in,H_in,W_in)      
output: (N,C_out,H_out,W_out)   

$$H_{out}=(H_{in}-1)stride[0]-2padding[0]+kernel\_size[0]+output\_padding[0]$$

$$W_{out}=(W_{in}-1)stride[1]-2padding[1]+kernel\_size[1]+output\_padding[1]$$

In [18]:
# Square kernel, same stride in both dimensions:
# H_out = (50-1)*2 + 3 = 101, W_out = (100-1)*2 + 3 = 201.
m1 = nn.ConvTranspose2d(16,33,3,stride=2)
# Non-square kernel with per-dimension stride and padding:
# H_out = (50-1)*2 - 2*4 + 3 = 93, W_out = (100-1)*1 - 2*2 + 5 = 100.
m2 = nn.ConvTranspose2d(16,33,(3,5),stride=(2,1),padding=(4,2))
# Input laid out as (N=20, C_in=16, H_in=50, W_in=100).
input = torch.randn(20,16,50,100)
out1 = m1(input)
out2 = m2(input)
print(out1.size())
print(out2.size())

torch.Size([20, 33, 101, 201])
torch.Size([20, 33, 93, 100])


## ConvTranspose3d

**class torch.nn.ConvTranspose3d(in_channels,out_channels,kernel_size, stride=1,padding=0,output_padding=0,groups=1,bias=True)**
* shape:   
input: (N,C_in,D_in,H_in,W_in)   
output: (N,C_out,D_out,H_out,W_out)   

$$D_{out}=(D_{in}-1)stride[0]-2padding[0]+kernel\_size[0]+output\_padding[0]$$

$$H_{out}=(H_{in}-1)stride[1]-2padding[1]+kernel\_size[1]+output\_padding[1]$$

$$W_{out}=(W_{in}-1)stride[2]-2padding[2]+kernel\_size[2]+output\_padding[2]$$

# 池化层

## MaxPool1d

**class torch.nn.MaxPool1d(kernel_size,stride=None,padding=0,
                          dilation=1,return_indices=False,ceil_mode=False)**
* 参数说明：   
kernel_size(int or tuple) - max pooling的窗口大小   
stride(int or tuple, optional) - max pooling的窗口移动的步长。默认值是kernel_size   
padding(int or tuple, optional) - 输入的每一条边补充0的层数   
dilation(int or tuple, optional) – 一个控制窗口中元素步幅的参数    
return_indices - 如果等于True，会返回输出最大值的序号，对于上采样操作会有帮助   
ceil_mode - 如果等于True，计算输出信号大小的时候，会使用向上取整，代替默认的向下取整的操作  
* shape:   
输入: (N,C,L_in)   
输出: (N,C,L_out)   

$$L_{out}=floor((L_{in} + 2padding - dilation(kernel\_size - 1) - 1)/stride + 1)$$

## MaxPool2d

**class torch.nn.MaxPool2d(kernel_size,stride=None,    padding=0,dilation=1,return_indices=False,ceil_mode=False)**
* 参数说明同Maxpool1d 
* shape:   
输入: (N,C,H_in,W_in)   
输出: (N,C,H_out,W_out)   

$$H_{out}=floor((H_{in} + 2padding[0] - dilation[0](kernel\_size[0] - 1) - 1)/stride[0] + 1)$$

$$W_{out}=floor((W_{in} + 2padding[1] - dilation[1](kernel\_size[1] - 1) - 1)/stride[1] + 1)$$



## MaxPool3d

**class torch.nn.MaxPool3d(kernel_size,stride=None,padding=0,dilation=1, return_indices=False,ceil_mode=False)**   
* 参数说明同Maxpool1d 
* shape:   
输入: (N,C,D_in,H_in,W_in)   
输出: (N,C,D_out,H_out,W_out)   

$$D_{out}=floor((D_{in} + 2padding[0] - dilation[0](kernel\_size[0] - 1) - 1)/stride[0] + 1)$$

$$H_{out}=floor((H_{in} + 2padding[1] - dilation[1](kernel\_size[1] - 1) - 1)/stride[1] + 1)$$

$$W_{out}=floor((W_{in} + 2padding[2] - dilation[2](kernel\_size[2] - 1) - 1)/stride[2] + 1)$$

## MaxUnpool1d   
Maxpool1d的逆过程

**class torch.nn.MaxUnpool1d(kernel_size, stride=None, padding=0)**
* 输入：   
input:需要转换的tensor    
indices：Maxpool1d的索引号    
output_size:一个指定输出大小的torch.Size  
* shape :   
input: (N,C,H_in)  
output:(N,C,H_out)  

$$H_{out}=(H_{in}-1)stride[0]-2padding[0]+kernel\_size[0]$$
也可以使用output_size指定输出的大小

In [24]:
# return_indices=True makes the pooling layer return (output, indices);
# MaxUnpool1d needs those indices to know where each maximum came from.
pool = nn.MaxPool1d(2,stride=2,return_indices=True)
unpool = nn.MaxUnpool1d(2,stride=2)
# Input laid out as (N=1, C=1, L=8).
input = torch.Tensor([[[10,2,3,4,5,6,7,8]]])
output,indices = pool(input)
# Each maximum is written back at its original position; every other slot is 0.
unpool(output,indices)

tensor([[[10.,  0.,  0.,  4.,  0.,  6.,  0.,  8.]]])

## MaxUnpool2d

**class torch.nn.MaxUnpool2d(kernel_size, stride=None, padding=0)**
* 输入：   
input:需要转换的tensor    
indices：MaxPool2d的索引号    
output_size:一个指定输出大小的torch.Size  
* shape :   
input: (N,C,H_in,W_in)   
output:(N,C,H_out,W_out)    

$$H_{out}=(H_{in}-1)stride[0]-2padding[0]+kernel\_size[0]$$

$$W_{out}=(W_{in}-1)stride[1]-2padding[1]+kernel\_size[1]$$

也可以使用output_size指定输出的大小

## MaxUnpool3d

**class torch.nn.MaxUnpool3d(kernel_size, stride=None, padding=0)**
* 输入：   
input:需要转换的tensor    
indices：MaxPool3d的索引号    
output_size:一个指定输出大小的torch.Size  
* shape :   
input: (N,C,D_in,H_in,W_in)
output:(N,C,D_out,H_out,W_out)
    
$$ D_{out}=(D_{in}-1)stride[0]-2padding[0]+kernel\_size[0]$$

$$H_{out}=(H_{in}-1)stride[1]-2padding[1]+kernel\_size[1]  $$

$$W_{out}=(W_{in}-1)stride[2]-2padding[2]+kernel\_size[2]  $$

也可以使用output_size指定输出的大小

## AvgPool1d
## AvgPool2d
## AvgPool3d

**class torch.nn.AvgPool2d(kernel_size, stride=None, 
                           padding=0, ceil_mode=False, count_include_pad=True)**
* 参数说明：                           
count_include_pad - 如果等于True，计算平均池化时，将包括padding填充的0
* shape:    
input: (N,C,H_in,W_in)  
output: (N,C,H_out,W_out)   
$$H_{out}=floor((H_{in}+2padding[0]-kernel\_size[0])/stride[0]+1) $$

 $$W_{out}=floor((W_{in}+2padding[1]-kernel\_size[1])/stride[1]+1) $$

# 非线性层 （激活函数）


## nn.ReLU
## nn.ReLU6  
* ${ReLU6}(x) = min(max(0,x), 6)$ 

## nn.ELU  
* $f(x) = max(0,x) + min(0, alpha * (e^x - 1))$ alpha=1.0

## nn.PReLU
nn.PReLU(num_parameters=1, init=0.25)

num_parameters：需要学习的a的个数，默认等于1   
init：a的初始值，默认等于0.25

* $PReLU(x) = max(0,x) + a * min(0,x)$

## nn.LeakyReLU
nn.LeakyReLU(negative_slope=0.01, inplace=False) 

## nn.Threshold
nn.Threshold(threshold, value, inplace=False) 

$y=x,if x>=threshold; \ y=value,if x<threshold$

## nn.Hardtanh
nn.Hardtanh(min_val=-1, max_val=1, inplace=False)

$f(x)=+1,if\ x>1; f(x)=−1,if \ x<−1; f(x)=x,otherwise$

## nn.Sigmoid

## nn.Tanh
$Tanh(x) = \frac{e^{x}-e^{-x} }{e^{x} + e^{-x}}$
## nn.LogSigmoid
$LogSigmoid(x) = log( 1 / ( 1 + e^{-x}))$

## nn.Softplus
nn.Softplus(beta=1, threshold=20)   
$Softplus(x) = \frac{1}{beta}*log(1+e^{(beta*x_i)})$ 
## nn.Softshrink  
nn.Softshrink(lambd=0.5)   
$f(x)=x−lambda,if\ x>lambda; f(x)=x+lambda,if\ x<−lambda; f(x)=0,otherwise$
## nn.Softsign
$f(x) = x / (1 + |x|)$
## nn.Tanhshrink
nn.Tanhshrink()

$Tanhshrink(x)=x−Tanh(x)$
## nn.Softmin
## nn.Softmax
## nn.LogSoftmax

# Normalization layers

## BatchNorm1d
## BatchNorm2d
## BatchNorm3d

**class torch.nn.BatchNorm1d(num_features, eps=1e-05, 
                             momentum=0.1, affine=True)**
* 参数说明：  
num_features： 来自期望输入的特征数，该期望输入的大小为'batch_size x num_features [x width]'   
eps： 为保证数值稳定性（分母不能趋近或取0）给分母加上的值。默认为1e-5。   
momentum： 动态均值和动态方差所使用的动量。默认为0.1。   

## GroupNorm
**torch.nn.GroupNorm(num_groups: int, num_channels: int, eps: float = 1e-05, affine: bool = True)**

## LayerNorm
**torch.nn.LayerNorm(normalized_shape: Union[int, List[int], torch.Size], eps: float = 1e-05, elementwise_affine: bool = True)**

# Recurrent layers

## RNN

**class torch.nn.RNN(\*args, \*\*kwargs)** 
 $$ h_t=tanh(w_{ih} x_t+b_{ih}+w_{hh} h_{t-1}+b_{hh}) $$

* 参数说明:  

input_size – 输入x的特征数量。  
hidden_size – 隐层的特征数量。    
num_layers – RNN的层数。    
nonlinearity – 指定非线性函数使用tanh还是relu。默认是tanh。    
bias – 如果是False，那么RNN层就不会使用偏置权重 $b_{ih}$和$b_{hh}$,默认是True   
batch_first – 如果True的话，那么输入Tensor的shape应该是[batch_size, time_step, feature],输出也是这样。  
dropout – 如果值非零，那么除了最后一层外，其它层的输出都会套上一个dropout层。   
bidirectional – 如果True，将会变成一个双向RNN，默认为False。    
* shape :   
输入 ： (input, h_0)    
      input : (seq_len, batch, input_size)   
      h_0 : (num_layers * num_directions, batch, hidden_size)    
输出 : (output, h_n)   
      output ：(seq_len, batch, hidden_size * num_directions)   
      h_n ：(num_layers * num_directions, batch, hidden_size)   

## LSTM

**class torch.nn.LSTM**   
    $$ i_t = sigmoid(W_{ii}x_t+b_{ii}+W_{hi}h_{t-1}+b_{hi}) $$
    $$ f_t = sigmoid(W_{if}x_t+b_{if}+W_{hf}h_{t-1}+b_{hf}) $$
    $$ o_t = sigmoid(W_{io}x_t+b_{io}+W_{ho}h_{t-1}+b_{ho}) $$ 
    $$ g_t = tanh(W_{ig}x_t+b_{ig}+W_{hg}h_{t-1}+b_{hg}) $$ 
    $$ c_t = f_tc_{t-1}+i_tg_t $$
    $$ h_t = o_t*tanh(c_t)  $$
    
$h_t$是时刻$t$的隐状态,$c_t$是时刻$t$的细胞状态，$x_t$是上一层的在时刻$t$的隐状态或者是第一层在时刻$t$的输入。$i_t, f_t, g_t, o_t$ 分别代表 输入门，遗忘门，细胞和输出门

* 参数说明:   
input_size – 输入的特征维度   
hidden_size – 隐状态的特征维度    
num_layers – 层数（和时序展开要区分开）    
bias – 如果为False，那么LSTM将不会使用$b_{ih},b_{hh}$，默认为True。    
batch_first – 如果为True，那么输入和输出Tensor的形状为(batch, seq, feature)     
dropout – 如果非零的话，将会在RNN的输出上加个dropout，最后一层除外。    
bidirectional – 如果为True，将会变成一个双向RNN，默认为False。   
* shape :
 输入：  input, (h_0, c_0)
      input ： (seq_len, batch, input_size)
      h_0 ：(num_layers * num_directions, batch, hidden_size)
      c_0 ：(num_layers * num_directions, batch, hidden_size)
 输出: output, (h_n, c_n)   
      output: (seq_len, batch, hidden_size * num_directions)
      h_n: (num_layers * num_directions, batch, hidden_size)
      c_n: (num_layers * num_directions, batch, hidden_size)

In [36]:
# LSTM with input_size=10, hidden_size=20, num_layers=2 (unidirectional).
lstm = nn.LSTM(10,20,2)
# input: (seq_len=5, batch=3, input_size=10)
input = torch.randn(5,3,10)
# Initial hidden and cell states: (num_layers * num_directions=2, batch=3, hidden_size=20).
h0=torch.randn(2,3,20)
c0=torch.randn(2,3,20)
# The second return value is the tuple (h_n, c_n), hence hn[0] and hn[1] below.
out,hn = lstm(input,(h0,c0))
print(out.size(),hn[0].size(),hn[1].size())

torch.Size([5, 3, 20]) torch.Size([2, 3, 20]) torch.Size([2, 3, 20])


## GRU

**class torch.nn.GRU**
$$  r_t=sigmoid(W_{ir}x_t+b_{ir}+W_{hr}h_{(t-1)}+b_{hr})$$ 
$$  i_t=sigmoid(W_{ii}x_t+b_{ii}+W_{hi}h_{(t-1)}+b_{hi})$$ 
$$  n_t=tanh(W_{in}x_t+b_{in}+r_t*(W_{hn}h_{(t-1)}+b_{hn}))$$ 
$$  h_t=(1-i_t)*n_t+i_t*h_{(t-1)} $$ 
$h_t$是时间$t$上的隐状态，$x_t$是前一层$t$时刻的隐状态或者是第一层的$t$时刻的输入，$r_t, i_t, n_t$分别是重置门，输入门和新门。
* 参数说明：    
input_size – 期望的输入$x$的特征值的维度      
hidden_size – 隐状态的维度  
num_layers – RNN的层数。    
bias – 如果为False，那么RNN层将不会使用bias，默认为True。    
batch_first – 如果为True的话，那么输入和输出的tensor的形状是(batch, seq, feature)。    
dropout – 如果非零的话，将会在RNN的输出上加个dropout，最后一层除外。    
bidirectional – 如果为True，将会变成一个双向RNN，默认为False。   

  输入： input, h_0      
      input (seq_len, batch, input_size)    
      h_0 (num_layers * num_directions, batch, hidden_size)   
  输出： output, h_n   
      output (seq_len, batch, hidden_size * num_directions)   
      h_n (num_layers * num_directions, batch, hidden_size)   

## RNNCell
RNNCell(input_size, hidden_size, bias=True, nonlinearity='tanh')
## LSTMCell
LSTMCell(input_size, hidden_size, bias=True)
## GRUCell
GRUCell(input_size, hidden_size, bias=True)

# Transformer Layer

## Transformer
**torch.nn.Transformer(d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6, num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1, activation: str = 'relu', custom_encoder: Optional[Any]=None, custom_decoder:Optional[Any] = None)**

## TransformerEncoder
**torch.nn.TransformerEncoder(encoder_layer, num_layers, norm=None)**

## TransformerDecoder
**torch.nn.TransformerDecoder(decoder_layer, num_layers, norm=None)**

## TransformerEncoderLayer
**torch.nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu')**

## TransformerDecoderLayer
**torch.nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu')**

# Linear layers

## Linear
**class torch.nn.Linear(in_features, out_features, bias=True)**

# Dropout layers

## Dropout
**class torch.nn.Dropout(p=0.5, inplace=False)**

# Sparse layers

## Embedding
**class torch.nn.Embedding(num_embeddings,embedding_dim,padding_idx=None,
                           max_norm=None,norm_type=2,scale_grad_by_freq=False,sparse=False)**
* 参数说明：   
num_embeddings (int) - 嵌入字典的大小   
embedding_dim (int) - 每个嵌入向量的大小   
padding_idx (int, optional) - 如果提供的话，输出遇到此下标时用零填充   
max_norm (float, optional) - 如果提供的话，会重新归一化词嵌入，使它们的范数小于提供的值   
norm_type (float, optional) - 对于max_norm选项计算p范数时的p   
scale_grad_by_freq (boolean, optional) - 如果提供的话，会根据字典中单词频率缩放梯度                              

# Distance functions 

## PairwiseDistance
**class torch.nn.PairwiseDistance(p=2, eps=1e-06)**    
按批计算向量v1, v2之间的距离

## CosineSimilarity
**torch.nn.CosineSimilarity(dim: int = 1, eps: float = 1e-08)**

# Loss functions

## L1Loss
**class torch.nn.L1Loss(size_average=True)**   
x,y之间差的绝对值的平均值

## MSELoss
**class torch.nn.MSELoss(size_average=True)**   
均方误差

## BCELoss
**class torch.nn.BCELoss(weight=None, size_average=True)**    
二进制交叉熵   
 $$ loss(o,t)=-\frac{1}{n}\sum_i(t[i] log(o[i])+(1-t[i]) log(1-o[i])) $$ 

## CrossEntropyLoss
**class torch.nn.CrossEntropyLoss(weight=None, size_average=True)**    
交叉熵

* nn.CrossEntropyLoss()为交叉熵损失函数，用于解决多分类问题，也可用于解决二分类问题。   
* BCELoss是Binary CrossEntropyLoss的缩写，nn.BCELoss()为二元交叉熵损失函数，只能解决二分类问题。   
* 在使用nn.CrossEntropyLoss()时，其内部会自动加上Softmax层。
* 在使用nn.BCELoss()作为损失函数时，需要在该层前面加上Sigmoid函数，一般使用nn.Sigmoid()即可。

## NLLLoss
**class torch.nn.NLLLoss(weight=None, size_average=True)**  
CrossEntropyLoss()=log_softmax() + NLLLoss()   

## NLLLoss2d
**class torch.nn.NLLLoss2d(weight=None, size_average=True)**


## KLDivLoss
**class torch.nn.KLDivLoss(weight=None, size_average=True)**   
KL散度     
$loss(x,target)=\frac{1}{n}\sum_i(target_i*(log(target_i)-x_i))$

## MarginRankingLoss
**class torch.nn.MarginRankingLoss(margin=0, size_average=True)**
$$loss(x,y)=max(0,-y*(x_1-x_2)+margin)$$

# Vision layers

## UpsamplingNearest2d
**class torch.nn.UpsamplingNearest2d(size=None, scale_factor=None)**   
对于多channel 输入 进行 2-D 最近邻上采样    

## UpsamplingBilinear2d
**class torch.nn.UpsamplingBilinear2d(size=None, scale_factor=None)**    
对于多channel 输入 进行 2-D bilinear 上采样

# 其他函数

## utils.clip_grad_norm 
**torch.nn.utils.clip_grad_norm(parameters, max_norm, norm_type=2)**

如果梯度超过阈值，那么就截断，将梯度变为阈值

g = 阈值/|g| * g


## utils.rnn.PackedSequence  
* 不需要单独创建，通过utils.rnn.pack_padded_sequence 或utils.rnn.pack_sequence 创建   
**torch.nn.utils.rnn.PackedSequence(_cls, data, batch_sizes)**

## utils.rnn.pack_padded_sequence 
* 将一个填充过的变长序列压紧   
**torch.nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=True)**
* 参数说明：   
enforce_sorted: 强制排序

In [7]:
batch_size = 3
max_length = 3
hidden_size = 3
n_layers = 1
# Three zero-padded sequences with one scalar feature per time step,
# laid out as (batch, max_length, 1).
# torch.autograd.Variable is deprecated; a plain float tensor is equivalent here.
input = torch.tensor(
    [[1, 0, 0], [1, 2, 3], [4, 5, 0]], dtype=torch.float
).view(batch_size, max_length, 1)
# True (unpadded) length of each sequence in the batch.
seq_len = [1, 3, 2]
# enforce_sorted=False allows the lengths to be given in any order; the
# resulting PackedSequence records the permutation in sorted_indices.
packed = nn.utils.rnn.pack_padded_sequence(
    input, seq_len, batch_first=True, enforce_sorted=False
)
print(packed)
print(packed.data, packed.batch_sizes)

PackedSequence(data=tensor([[1.],
        [4.],
        [1.],
        [2.],
        [5.],
        [3.]]), batch_sizes=tensor([3, 2, 1]), sorted_indices=tensor([1, 2, 0]), unsorted_indices=tensor([2, 0, 1]))
tensor([[1.],
        [4.],
        [1.],
        [2.],
        [5.],
        [3.]]) tensor([3, 2, 1])


## nn.utils.rnn.pad_packed_sequence
* 填充打包的可变长度序列批次   
**torch.nn.utils.rnn.pad_packed_sequence(sequence, batch_first=False, padding_value=0.0, total_length=None)**

In [9]:
# Inverse of packing: returns a tuple (padded tensor, lengths).
# batch_first is left at its default (False), so the padded tensor comes back
# time-major even though the sequences were packed with batch_first=True.
padded = nn.utils.rnn.pad_packed_sequence(packed)
print(padded)

(tensor([[[1.],
         [1.],
         [4.]],

        [[0.],
         [2.],
         [5.]],

        [[0.],
         [3.],
         [0.]]]), tensor([1, 3, 2]))


## Flatten

**torch.nn.Flatten(start_dim: int = 1, end_dim: int = -1)**