# Seq2seq

### 1. 介绍
Seq2Seq（Sequence-to-Sequence）模型适用于输出长度不确定的任务：它以一个序列作为输入，生成另一个长度可变的序列，常用于机器翻译等领域。

论文链接：https://arxiv.org/pdf/1409.3215

### 2. 模型架构
#### 编码器Encoder
将输入序列压缩为一个定长的向量，这个向量包含着输入序列的语义。

#### 解码器Decoder
将编码器输出的向量作为输入，生成指定的序列。 

Seq2Seq通过编码器-解码器框架，将序列转换任务分解为理解和生成两个阶段，
配合LSTM对长序列的建模能力，使机器翻译质量实现了质的飞跃，为后续的Transformer架构奠定了基础。

#### 注意力机制Attention
注意力机制可以有效改进Seq2seq模型。其主要思想是：在生成每个目标词时，将解码器的当前状态与编码器的每个隐藏状态逐一结合，计算它们之间的相关性分数，并据此决定应该重点关注源序列的哪些部分。原论文并没有使用Attention，但是Attention和Seq2seq的结合为后续Transformer的提出奠定了基础。

### 3. 代码实现
我们将代码分为带Attention版本与不带Attention版本。

下列版本不带Attention：

In [None]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    """LSTM encoder that condenses a source sequence into its final states.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout=0.5):
        super().__init__()
        # Seq2Seq reads these attributes to check that the encoder and the
        # decoder are compatible, so they must exist on the instance.
        self.hidden_dim = hidden_dim
        self.n_layers = num_layers

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer does nothing except emit a warning.
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return the final ``(hidden, cell)`` states.

        ``src`` holds token indices laid out as (src_len, batch), because the
        LSTM uses the default ``batch_first=False``. Both returned tensors
        have shape (num_layers, batch, hidden_dim).
        """
        embedded = self.dropout(self.embedding(src))
        # The per-step outputs are not needed without attention.
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decorder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        output_dim: Size of the target vocabulary.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Seq2Seq reads these attributes (vocabulary size for the output
        # buffer, sizes for the compatibility check), so they must exist.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer does nothing except emit a warning.
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Token indices for the current step, shape (batch,).
            hidden: LSTM hidden state, shape (n_layers, batch, hidden_dim).
            cell: LSTM cell state, shape (n_layers, batch, hidden_dim).

        Returns:
            ``(pred, hidden, cell)`` where ``pred`` holds unnormalised scores
            of shape (batch, output_dim) and the states are the updated ones.
        """
        # Add a length-1 time axis so the LSTM sees (1, batch).
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        pred = self.fc_out(output.squeeze(0))
        return pred, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: Module whose ``forward(src)`` returns ``(hidden, cell)`` and
            which owns an LSTM under the attribute ``rnn``.
        decoder: Module whose ``forward(input, hidden, cell)`` returns
            ``(pred, hidden, cell)`` and which owns ``rnn`` and ``fc_out``.

    Raises:
        ValueError: If the encoder and decoder LSTMs differ in hidden size or
            in number of layers; the encoder's final states are fed directly
            into the decoder, so both must match.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Read the sizes from the LSTM modules themselves so the check works
        # whether or not the encoder/decoder expose convenience attributes.
        # Explicit raises are used because asserts disappear under ``-O``.
        if encoder.rnn.hidden_size != decoder.rnn.hidden_size:
            raise ValueError("encoder and decoder must have the same hidden size")
        if encoder.rnn.num_layers != decoder.rnn.num_layers:
            raise ValueError("encoder and decoder must have the same number of layers")

    @staticmethod
    def _use_teacher_forcing(ratio):
        """Decide whether the next decoder input is the ground-truth token."""
        # The extreme values skip the random draw so that the default
        # behaviour does not consume any random numbers.
        if ratio >= 1.0:
            return True
        if ratio <= 0.0:
            return False
        return torch.rand(1).item() < ratio

    def forward(self, src, trg, teacher_forcing_ratio=1.0):
        """Return per-step vocabulary scores for the target sequence.

        Args:
            src: Source token indices, shape (src_len, batch).
            trg: Target token indices, shape (trg_len, batch); row 0 is the
                first token fed to the decoder.
            teacher_forcing_ratio: Probability of feeding the ground-truth
                token as the next input instead of the model's own
                prediction. The default of 1.0 always uses the ground truth.

        Returns:
            Tensor of shape (trg_len, batch, vocab). Row 0 is left as zeros
            because no prediction is made for the first target token.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = getattr(self.decoder, "output_dim", None)
        if trg_vocab_size is None:
            trg_vocab_size = self.decoder.fc_out.out_features

        # Allocate on the same device as the data so GPU runs do not fail
        # when the predictions are written into the buffer.
        outputs = torch.zeros(
            trg_length, batch_size, trg_vocab_size, device=trg.device
        )
        hidden, cell = self.encoder(src)
        step_input = trg[0, :]  # start from the first token of the target

        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = output
            if self._use_teacher_forcing(teacher_forcing_ratio):
                step_input = trg[t]
            else:
                step_input = output.argmax(1)

        return outputs

带有Attention机制的版本：

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    """LSTM encoder that also exposes its per-step outputs for attention.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout=0.5):
        super().__init__()
        # Kept on the instance so callers can check encoder/decoder sizes.
        self.hidden_dim = hidden_dim
        self.n_layers = num_layers

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer does nothing except emit a warning.
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return ``(outputs, hidden, cell)``.

        ``src`` holds token indices laid out as (src_len, batch), because the
        LSTM uses the default ``batch_first=False``. ``outputs`` has shape
        (src_len, batch, hidden_dim) and contains the top-layer state at
        every source position; ``hidden`` and ``cell`` have shape
        (num_layers, batch, hidden_dim).
        """
        embedded = self.dropout(self.embedding(src))
        # Unlike the attention-free encoder, the per-step outputs are kept:
        # the decoder attends over them.
        outputs, (hidden, cell) = self.rnn(embedded)
        return outputs, hidden, cell

class Decorder(nn.Module):
    """LSTM decoder that attends over the encoder outputs at every step.

    The attention score for each source position is produced by a single
    linear layer applied to the concatenation of the decoder's top-layer
    hidden state and the encoder output at that position.

    Args:
        output_dim: Size of the target vocabulary.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states; the encoder
            outputs must have the same feature size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Seq2Seq reads ``output_dim`` to size its output buffer, so it must
        # exist on the instance; the other sizes are kept for inspection.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        # Scores one (decoder state, encoder output) pair with a scalar.
        self.attn = nn.Linear(hidden_dim * 2, 1)
        # The LSTM input is the token embedding joined with the context.
        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer does nothing except emit a warning.
        self.rnn = nn.LSTM(
            embedding_dim + hidden_dim,
            hidden_dim,
            n_layers,
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.fc_out = nn.Linear(hidden_dim * 2 + embedding_dim, output_dim)

    def _attention_context(self, hidden, encoder_outputs):
        """Return the attention-weighted sum of encoder outputs, (1, batch, hidden_dim)."""
        query = hidden[-1]  # top-layer state, (batch, hidden_dim)
        src_len = encoder_outputs.shape[0]
        keys = encoder_outputs.permute(1, 0, 2)  # (batch, src_len, hidden_dim)
        # expand() broadcasts the query along src_len without copying it.
        queries = query.unsqueeze(1).expand(-1, src_len, -1)

        # Pair the current decoder state with every encoder output and score it.
        scores = self.attn(torch.cat((queries, keys), dim=2)).squeeze(2)
        weights = F.softmax(scores, dim=1)  # normalise over source positions

        # Batched matrix product: (batch, 1, src_len) x (batch, src_len, hidden_dim).
        context = torch.bmm(weights.unsqueeze(1), keys)
        return context.permute(1, 0, 2)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run a single attention-augmented decoding step.

        Args:
            input: Token indices for the current step, shape (batch,).
            hidden: LSTM hidden state, shape (n_layers, batch, hidden_dim).
            cell: LSTM cell state, shape (n_layers, batch, hidden_dim).
            encoder_outputs: Encoder outputs for every source position,
                shape (src_len, batch, hidden_dim).

        Returns:
            ``(pred, hidden, cell)`` where ``pred`` holds unnormalised scores
            of shape (batch, output_dim) and the states are the updated ones.
        """
        # Add a length-1 time axis so the LSTM sees (1, batch).
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))
        context = self._attention_context(hidden, encoder_outputs)

        # Feed the embedding together with the context vector into the LSTM.
        rnn_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # The projection sees the LSTM output, the context and the embedding
        # together, giving it direct access to all three signals.
        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim=1
        )
        pred = self.fc_out(features)

        return pred, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper for the attention-based decoder.

    Args:
        encoder: Module whose ``forward(src)`` returns
            ``(encoder_outputs, hidden, cell)``.
        decoder: Module whose ``forward(input, hidden, cell, encoder_outputs)``
            returns ``(pred, hidden, cell)``; its target vocabulary size is
            read from ``output_dim`` or, failing that, from ``fc_out``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    @staticmethod
    def _use_teacher_forcing(ratio):
        """Decide whether the next decoder input is the ground-truth token."""
        # The extreme values skip the random draw so that the default
        # behaviour does not consume any random numbers.
        if ratio >= 1.0:
            return True
        if ratio <= 0.0:
            return False
        return torch.rand(1).item() < ratio

    def forward(self, src, trg, teacher_forcing_ratio=0.0):
        """Return per-step vocabulary scores for the target sequence.

        Args:
            src: Source token indices, shape (src_len, batch).
            trg: Target token indices, shape (trg_len, batch); row 0 is the
                first token fed to the decoder.
            teacher_forcing_ratio: Probability of feeding the ground-truth
                token as the next input instead of the model's own
                prediction. The default of 0.0 always feeds back the
                highest-scoring predicted token.

        Returns:
            Tensor of shape (trg_len, batch, vocab). Row 0 is left as zeros
            because no prediction is made for the first target token.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        # Fall back to the projection layer when the decoder does not expose
        # ``output_dim`` directly.
        trg_vocab_size = getattr(self.decoder, "output_dim", None)
        if trg_vocab_size is None:
            trg_vocab_size = self.decoder.fc_out.out_features

        # Allocate on the same device as the data so GPU runs do not fail
        # when the predictions are written into the buffer.
        outputs = torch.zeros(
            trg_length, batch_size, trg_vocab_size, device=trg.device
        )
        encoder_outputs, hidden, cell = self.encoder(src)
        step_input = trg[0, :]  # start from the first token of the target

        for t in range(1, trg_length):
            pred, hidden, cell = self.decoder(
                step_input, hidden, cell, encoder_outputs
            )
            outputs[t] = pred
            if self._use_teacher_forcing(teacher_forcing_ratio):
                step_input = trg[t]
            else:
                step_input = pred.argmax(1)

        return outputs