# 第3课：RNN 循环神经网络

## 学习目标
- 理解序列数据和 RNN 原理
- 掌握 LSTM 和 GRU 结构
- 学会处理时序数据
- 完成序列预测任务

## 1. 循环神经网络简介

RNN 专门处理序列数据，通过循环连接传递历史信息。

**应用场景**：
- 自然语言处理
- 时间序列预测
- 语音识别
- 机器翻译

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"使用设备: {device}")

## 2. 基本 RNN

In [None]:
# RNN 单元原理
class SimpleRNNCell(nn.Module):
    """Minimal Elman RNN cell: ``h_t = tanh(W_ih x_t + W_hh h_{t-1})``.

    Both projections are ``nn.Linear`` layers, so each one contributes its
    own bias term.

    Args:
        input_size: Number of features in each input vector ``x_t``.
        hidden_size: Number of features in the hidden state ``h_t``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Input-to-hidden projection (W_ih).
        self.i2h = nn.Linear(input_size, hidden_size)
        # Hidden-to-hidden (recurrent) projection (W_hh).
        self.h2h = nn.Linear(hidden_size, hidden_size)

    def forward(self, x, hidden=None):
        """Advance the cell by one time step.

        Args:
            x: Input for the current step, shape ``(..., input_size)``.
            hidden: Previous hidden state, shape ``(..., hidden_size)``.
                When omitted, an all-zero state with the same leading
                dimensions, dtype and device as ``x`` is used.

        Returns:
            The new hidden state, shape ``(..., hidden_size)``.
        """
        if hidden is None:
            # Start of a sequence: no history yet, so begin from zeros.
            hidden = x.new_zeros((*x.shape[:-1], self.hidden_size))
        return torch.tanh(self.i2h(x) + self.h2h(hidden))

# Smoke test: push the same input through the cell for five time steps,
# feeding each new hidden state back in as the next step's history.
rnn_cell = SimpleRNNCell(input_size=10, hidden_size=20)
x = torch.randn((1, 10))  # (batch, input_size)
h = torch.zeros((1, rnn_cell.hidden_size))  # (batch, hidden_size)

for t in range(5):
    h = rnn_cell(x, h)
    print(f"时间步 {t}: 隐藏状态形状 = {h.shape}")

In [None]:
# Built-in PyTorch RNN: two stacked layers operating on batch-first tensors.
rnn = nn.RNN(10, 20, num_layers=2, batch_first=True)

# A batch of 3 sequences, each 5 steps long with 10 features per step,
# laid out as (batch, seq_len, input_size).
x = torch.randn((3, 5, 10))

# The module returns the per-step outputs and the final hidden state.
output, h_n = rnn(x)

print(f"输入形状: {x.shape}")
print(f"输出形状: {output.shape}")
print(f"最终隐藏状态形状: {h_n.shape}")

## 3. LSTM（长短期记忆网络）

In [None]:
# Illustrated overview of the LSTM cell: prints a text diagram of the cell
# followed by a short description of its three gates.
print("""
LSTM 单元结构:

      ┌─────────────────────────────────────────┐
      │                  Cell State             │
      │    ──────────────────────────────►      │
      │          ×         +                    │
      │          │         │                    │
      │    ┌─────┴─────┐   │                    │
      │    │  Forget   │   │                    │
      │    │   Gate    │   │                    │
      │    └───────────┘   │                    │
      │          σ         │                    │
      │          │   ┌─────┴─────┐              │
      │          │   │   Input   │×  tanh      │
      │          │   │   Gate    │              │
      │          │   └───────────┘              │
      │          │         σ                    │
      │    ┌─────┴─────────┴─────┐              │
 h_{t-1}──►│                     ├──────► h_t   │
      │    │      Concat         │              │
   x_t ───►│                     │              │
      │    └─────────────────────┘              │
      │                   │                     │
      │             ┌─────┴─────┐               │
      │             │  Output   │×  tanh       │
      │             │   Gate    │               │
      │             └───────────┘               │
      │                   σ                     │
      └─────────────────────────────────────────┘

三个门:
- 遗忘门 (Forget Gate): 决定丢弃哪些信息
- 输入门 (Input Gate): 决定存储哪些新信息
- 输出门 (Output Gate): 决定输出哪些信息
""")

In [None]:
# Built-in PyTorch LSTM: two stacked layers operating on batch-first tensors.
lstm = nn.LSTM(10, 20, num_layers=2, batch_first=True)

x = torch.randn((3, 5, 10))  # (batch, seq_len, input_size)

# Besides the per-step outputs, an LSTM returns a pair of final states:
# h_n is the hidden state and c_n is the cell state.
output, (h_n, c_n) = lstm(x)

print(f"输入形状: {x.shape}")
print(f"输出形状: {output.shape}")
print(f"最终隐藏状态形状: {h_n.shape}")
print(f"最终细胞状态形状: {c_n.shape}")

## 4. GRU（门控循环单元）

In [None]:
# Print a short comparison of GRU against LSTM.
print("""
GRU 相比 LSTM:
- 只有两个门：重置门和更新门
- 没有单独的细胞状态
- 参数更少，计算更快
- 在某些任务上效果相当
""")

# Built-in PyTorch GRU: two stacked layers operating on batch-first tensors.
gru = nn.GRU(10, 20, num_layers=2, batch_first=True)

# A GRU has no cell state, so only the hidden state is returned with the
# per-step outputs.
x = torch.randn((3, 5, 10))
output, h_n = gru(x)

print(f"GRU 输出形状: {output.shape}")
print(f"GRU 最终隐藏状态形状: {h_n.shape}")

## 5. 时间序列预测示例

In [None]:
# 生成正弦波数据
def generate_sin_data(seq_length, num_samples):
    """Generate sine-wave sequences for a next-step prediction task.

    Each sample is a sine wave evaluated at ``seq_length + 1`` evenly spaced
    points spanning ``4 * pi`` radians, starting from a phase drawn with
    ``np.random.uniform(0, 2 * pi)`` (NumPy's global random state, one draw
    per sample). The input is the wave without its last point and the target
    is the same wave shifted one step ahead.

    Args:
        seq_length: Number of time steps in every input/target sequence.
        num_samples: Number of sequences to generate. Zero or a negative
            value yields empty arrays.

    Returns:
        A tuple ``(X, y)`` of float arrays, each of shape
        ``(num_samples, seq_length)``. The result is always 2-D, including
        when no samples are requested.
    """
    n_rows = max(num_samples, 0)
    # Preallocate a 2-D buffer so the sequence axis survives even when there
    # are no rows; stacking an empty list would collapse to shape (0,).
    waves = np.empty((n_rows, seq_length + 1))

    for row in waves:
        start = np.random.uniform(0, 2 * np.pi)
        row[:] = np.sin(np.linspace(start, start + 4 * np.pi, seq_length + 1))

    # Copy so the two results do not share memory with each other.
    return waves[:, :-1].copy(), waves[:, 1:].copy()

# Build the training and test sets (training data is drawn first).
seq_length = 50
X_train, y_train = generate_sin_data(seq_length, 1000)
X_test, y_test = generate_sin_data(seq_length, 100)

# Convert every array to a float tensor and append a trailing feature axis,
# giving tensors of shape (batch, seq_len, 1).
X_train, y_train, X_test, y_test = (
    torch.FloatTensor(arr).unsqueeze(-1)
    for arr in (X_train, y_train, X_test, y_test)
)

print(f"训练数据形状: {X_train.shape}")

# Plot the first training sample: the input sequence against its target,
# which is the same wave shifted one step ahead.
plt.figure(figsize=(12, 4))
plt.plot(X_train[0, :, 0], label='输入')
plt.plot(y_train[0, :, 0], label='目标')
plt.legend()
plt.title('时间序列预测任务')
plt.show()

In [None]:
# 时间序列预测模型
class LSTMPredictor(nn.Module):
    """Sequence-to-sequence regressor: stacked LSTM plus a per-step linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per time step.
        dropout: Dropout probability applied between stacked LSTM layers.
            Ignored when ``num_layers`` is 1, because ``nn.LSTM`` only
            applies dropout between layers and warns otherwise.
    """

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1,
                 dropout=0.2):
        super().__init__()

        # With a single layer there is no "between layers" position, so pass
        # 0.0 to avoid PyTorch's non-zero-dropout warning.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Predict one output vector for every time step of ``x``.

        Args:
            x: Input of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, output_size)``.
        """
        # lstm_out: (batch, seq_len, hidden_size); the final states are unused.
        lstm_out, _ = self.lstm(x)
        # The linear head acts on the last dimension, i.e. on every step.
        return self.fc(lstm_out)

# Build the predictor with its default hyper-parameters, then move its
# parameters to the selected device.
model = LSTMPredictor()
model = model.to(device)
print(model)

In [None]:
# Train the model.
from torch.utils.data import DataLoader, TensorDataset

# Mean-squared error for regression, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Serve the training tensors in shuffled mini-batches of 32 sequences.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# Training schedule and per-epoch loss history.
num_epochs = 20
losses = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Clear stale gradients, run the forward pass, back-propagate,
        # then apply the update.
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Average the batch losses to get this epoch's loss.
    avg_loss = total_loss / len(train_loader)
    losses.append(avg_loss)

    # Report progress every fifth epoch.
    if not (epoch + 1) % 5:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.6f}')

# Plot the loss curve over epochs.
plt.plot(losses)
plt.title('训练损失')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# Evaluate the model: inference mode, no gradient tracking, results brought
# back to the CPU for plotting and scoring.
model.eval()
with torch.no_grad():
    X_test_device = X_test.to(device)
    predictions = model(X_test_device).cpu()

# Compare prediction and ground truth for the first four test samples,
# one per panel of a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

for i, ax in enumerate(axes.flat):
    ax.plot(y_test[i, :, 0], label='真实值', linewidth=2)
    ax.plot(predictions[i, :, 0], '--', label='预测值', linewidth=2)
    ax.set_title(f'样本 {i+1}')
    ax.legend()

plt.tight_layout()
plt.show()

# Mean-squared error over the whole test set.
test_loss = criterion(predictions, y_test)
print(f"测试 MSE: {test_loss.item():.6f}")

## 6. 文本分类示例

In [None]:
# 简单的文本分类模型
class TextClassifier(nn.Module):
    """Text classifier: embedding, two-layer bidirectional LSTM, linear head.

    Args:
        vocab_size: Number of distinct token indices.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.3,
            bidirectional=True,
        )
        # The head sees both directions concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(2 * hidden_dim, num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Logits of shape ``(batch, num_classes)``.
        """
        token_vectors = self.embedding(x)  # (batch, seq_len, embed_dim)
        _, (h_n, _) = self.lstm(token_vectors)

        # h_n is (num_layers * 2, batch, hidden_dim) for a bidirectional
        # LSTM; its last two entries are the top layer's forward and
        # backward final states.
        forward_last, backward_last = h_n[-2], h_n[-1]
        features = torch.cat([forward_last, backward_last], dim=1)

        return self.fc(self.dropout(features))

# Example instance: 10k-token vocabulary, 128-d embeddings, 256 hidden units
# per direction, two output classes.
text_model = TextClassifier(10000, 128, 256, 2)
print(text_model)
# Total number of parameter elements in the model.
print(f"\n参数量: {sum(p.numel() for p in text_model.parameters()):,}")

In [None]:
# Simulated text classification.
# Assumes the text has already been converted to sequences of token indices.
batch_size = 4
seq_len = 20
vocab_size = 10000

# Random token indices and random binary labels stand in for real data.
fake_text = torch.randint(0, vocab_size, (batch_size, seq_len))
fake_labels = torch.randint(0, 2, (batch_size,))

# Forward pass for inference: switch to eval mode so the dropout layers are
# inactive (otherwise the printed probabilities vary between runs on the same
# input), and skip gradient tracking since nothing is trained here.
text_model.eval()
with torch.no_grad():
    output = text_model(fake_text)
print(f"输入形状: {fake_text.shape}")
print(f"输出形状: {output.shape}")
print(f"预测: {torch.softmax(output, dim=1)}")

## 7. 双向 RNN

In [None]:
# Bidirectional LSTM: each layer runs over the sequence in both directions.
bi_lstm = nn.LSTM(
    10,
    20,
    num_layers=2,
    bidirectional=True,
    batch_first=True,
)

x = torch.randn((3, 5, 10))
output, (h_n, c_n) = bi_lstm(x)

# The output's feature axis holds both directions (hidden_size * 2) and the
# state's first axis holds one entry per layer and direction (num_layers * 2).
print("双向 LSTM:")
print(f"  输入形状: {x.shape}")
print(f"  输出形状: {output.shape}  # hidden_size * 2")
print(f"  h_n 形状: {h_n.shape}  # num_layers * 2")

## 8. 注意力机制简介

In [None]:
# 简单的注意力机制
class Attention(nn.Module):
    """Attention pooling over the time axis of a sequence of hidden states.

    Args:
        hidden_dim: Size of each hidden-state vector being scored.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        # One learned linear map turns each hidden state into a scalar score.
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_output):
        """Collapse a sequence into one context vector per batch element.

        Args:
            lstm_output: Tensor of shape ``(batch, seq_len, hidden_dim)``.

        Returns:
            A tuple ``(context, attention_weights)`` where ``context`` has
            shape ``(batch, hidden_dim)`` and ``attention_weights`` has shape
            ``(batch, seq_len, 1)`` and sums to 1 along the sequence axis.
        """
        scores = self.attention(lstm_output)  # (batch, seq_len, 1)
        # Normalise across time steps (dim=1), not across features.
        attention_weights = torch.softmax(scores, dim=1)

        # Weighted sum of the hidden states over the sequence axis.
        context = (attention_weights * lstm_output).sum(dim=1)

        return context, attention_weights

# 带注意力的 LSTM 分类器
class AttentionLSTM(nn.Module):
    """LSTM classifier that pools its per-step outputs with attention.

    Args:
        vocab_size: Number of distinct token indices.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Classify a batch of token-index sequences.

        Args:
            x: Tensor of token indices, one row per sequence.

        Returns:
            A tuple ``(output, attention_weights)``: the output of the final
            linear layer and the weights produced by the attention module.
        """
        # Only the per-step outputs are needed; the final states are dropped.
        lstm_out, _ = self.lstm(self.embedding(x))
        context, attention_weights = self.attention(lstm_out)
        return self.fc(context), attention_weights

# Smoke test: two random sequences of ten token indices each.
attn_model = AttentionLSTM(1000, 64, 128, 2)
x = torch.randint(0, 1000, size=(2, 10))
output, attn_weights = attn_model(x)

print(f"输出形状: {output.shape}")
print(f"注意力权重形状: {attn_weights.shape}")
# Weights of the first sample, one value per time step.
print(f"注意力权重 (样本1): {attn_weights[0].squeeze()}")

## 9. 练习题

### 练习：使用 LSTM 预测股票价格（模拟数据）

In [None]:
# Simulated stock prices: a random walk with Gaussian daily moves (scale 2)
# starting near 100, seeded so the series is reproducible.
np.random.seed(42)
n_days = 500
stock_prices = (2 * np.random.randn(n_days)).cumsum() + 100

# Plot the whole price series.
plt.figure(figsize=(12, 4))
plt.plot(stock_prices)
plt.xlabel('天数')
plt.ylabel('价格')
plt.title('模拟股票价格')
plt.show()

# 在这里编写代码
# 1. 准备训练数据（使用过去 N 天预测下一天）
# 2. 构建 LSTM 模型
# 3. 训练模型
# 4. 可视化预测结果


## 10. 本课小结

1. **RNN**：处理序列数据，存在梯度消失问题
2. **LSTM**：通过门控机制解决长期依赖
3. **GRU**：简化版 LSTM，参数更少
4. **双向 RNN**：同时考虑前后文
5. **注意力机制**：动态关注重要位置
6. **应用**：时序预测、文本分类、序列标注