# 第2课：CNN 卷积神经网络

## 学习目标
- 理解卷积操作的原理
- 掌握 CNN 的基本架构
- 学会使用 PyTorch 构建 CNN
- 完成图像分类任务

## 1. 卷积神经网络简介

CNN（Convolutional Neural Network，卷积神经网络）是一类特别适合处理图像等网格结构数据的神经网络，核心组件包括：
- **卷积层（Conv）**：提取局部特征
- **池化层（Pooling）**：降维，保留主要特征
- **全连接层（FC）**：分类决策

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np

# Pick the compute device: CUDA when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"使用设备: {device}")

## 2. 卷积操作详解

In [None]:
# Visualize a single convolution operation
def visualize_convolution():
    """Plot a 5x5 input, a 3x3 kernel and their convolution side by side.

    Builds a 5x5 image holding the values 1..25 and a 3x3 kernel that is 8 at
    the center and -1 everywhere else, convolves them with ``F.conv2d`` using
    no padding, shows the three tensors as grayscale images, and finally
    prints the shape of each tensor.
    """
    # Both operands are shaped (1, 1, H, W) before being passed to F.conv2d.
    image = torch.arange(1, 26, dtype=torch.float32).reshape(1, 1, 5, 5)

    kernel = torch.full((3, 3), -1.0, dtype=torch.float32)
    kernel[1, 1] = 8.0
    kernel = kernel.reshape(1, 1, 3, 3)

    # With padding=0 the 5x5 input and 3x3 kernel give a 3x3 output.
    output = F.conv2d(image, kernel, padding=0)

    # One (tensor, title) pair per subplot, drawn left to right.
    panels = (
        (image.squeeze(), '输入图像 (5x5)'),
        (kernel.squeeze(), '卷积核 (3x3)'),
        (output.squeeze().detach(), '输出特征图 (3x3)'),
    )
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    for ax, (picture, title) in zip(axes, panels):
        ax.imshow(picture, cmap='gray')
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

    print(f"输入尺寸: {image.shape}")
    print(f"卷积核尺寸: {kernel.shape}")
    print(f"输出尺寸: {output.shape}")

# Run the demo: draws the three panels and prints the three tensor shapes.
visualize_convolution()

In [None]:
# Convolution output-size formula and its symbols, followed by one blank line
print(
    "卷积输出尺寸公式: (W - K + 2P) / S + 1",
    "W: 输入尺寸, K: 卷积核尺寸, P: 填充, S: 步幅",
    "",
    sep="\n",
)

def calculate_output_size(W, K, P=0, S=1):
    """Return the convolution output size along one spatial dimension.

    Evaluates ``(W - K + 2 * P) // S + 1``; the division is a floor division.

    Args:
        W: Input size.
        K: Kernel size.
        P: Padding (the formula counts it twice). Defaults to 0.
        S: Stride. Defaults to 1.

    Returns:
        The output size given by the formula above.
    """
    span = W - K + 2 * P
    # Number of additional stride-sized steps after the first kernel position.
    extra_steps = span // S
    return extra_steps + 1

# Examples: each tuple is (W, K, P, S), in the order calculate_output_size takes them
examples = [
    (28, 3, 0, 1),  # MNIST, 3x3 kernel, no padding
    (28, 3, 1, 1),  # MNIST, 3x3 kernel, same padding
    (224, 7, 3, 2), # ImageNet, 7x7 kernel, stride 2
]

# Print the computed (square) output size for every configuration.
for W, K, P, S in examples:
    out = calculate_output_size(W, K, P, S)
    print(f"输入={W}x{W}, 卷积核={K}x{K}, 填充={P}, 步幅={S} -> 输出={out}x{out}")

## 3. 池化操作

In [None]:
# Max pooling vs. average pooling on a 4x4 input holding the values 1..16
x = torch.arange(1, 17, dtype=torch.float32).reshape(1, 1, 4, 4)

# Both layers use a 2x2 window with stride 2.
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
max_output, avg_output = max_pool(x), avg_pool(x)

# squeeze() drops the two leading size-1 axes so the tensors print as plain 2-D grids.
print("输入:")
print(x.squeeze())
print(f"\n最大池化输出 (2x2):")
print(max_output.squeeze())
print(f"\n平均池化输出 (2x2):")
print(avg_output.squeeze())

## 4. 构建 CNN 模型

In [None]:
# A simple CNN model
class SimpleCNN(nn.Module):
    """Two-block convolutional classifier for 1-channel 28x28 images.

    Each block is ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2, 2)``, taking
    the channels 1 -> 32 -> 64 while the spatial size goes 28x28 -> 14x14 ->
    7x7. The flattened ``64 * 7 * 7`` features feed a 128-unit hidden layer,
    dropout (p=0.5) and a final linear layer producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Convolution layers (padding=1 keeps the spatial size with a 3x3 kernel)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)

        # Pooling layer, shared by both blocks; halves height and width
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected layers
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, num_classes)

        # Dropout
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Tensor of shape ``(batch, 1, 28, 28)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding raw logits.

        Raises:
            RuntimeError: If the per-sample feature count after the second
                block is not ``64 * 7 * 7`` (raised by ``fc1``).
        """
        # First convolution block
        x = self.pool(F.relu(self.conv1(x)))  # 28x28 -> 14x14

        # Second convolution block
        x = self.pool(F.relu(self.conv2(x)))  # 14x14 -> 7x7

        # Flatten per sample. Pinning the batch dimension (rather than
        # view(-1, 64 * 7 * 7)) makes a wrong input size fail in fc1 instead
        # of silently regrouping features from several samples into one row.
        x = x.view(x.size(0), -1)

        # Fully connected head
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x

# Build the network, move it to the selected device, and show its layers.
model = SimpleCNN()
model = model.to(device)
print(model)

# Total parameter count: sum of the element counts of all parameter tensors
total_params = sum(param.numel() for param in model.parameters())
print(f"\n总参数量: {total_params:,}")

## 5. 加载 MNIST 数据集

In [None]:
# Preprocessing: convert each image to a tensor, then normalize its single
# channel with mean 0.1307 and std 0.3081 (presumably the MNIST training-set
# statistics — confirm).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Load the datasets from ./data (download=True allows fetching them if needed)
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('./data', train=False, download=True, transform=transform)

# Shuffled mini-batches of 64 for training; fixed-order batches of 1000 for testing.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")

In [None]:
# Visualize samples: the first 10 training items in a 2x5 grid
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flatten()):
    img, label = train_dataset[i]
    # squeeze() removes size-1 axes (presumably the channel axis) so imshow gets a 2-D image.
    ax.imshow(img.squeeze(), cmap='gray')
    ax.set_title(f'标签: {label}')
    ax.axis('off')
plt.tight_layout()
plt.show()

## 6. 训练模型

In [None]:
# Loss function and optimizer: cross-entropy on the model outputs, Adam with lr=0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimization pass over ``train_loader``.

    Puts ``model`` in training mode and, for every batch, moves the tensors
    to ``device``, zeroes the gradients, runs forward and backward, and takes
    one optimizer step.

    Args:
        model: Module being trained; called as ``model(data)``.
        train_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Callable ``criterion(output, target)`` returning a scalar loss.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``: the sum of the per-batch loss values
        divided by the number of batches, and the fraction of samples whose
        argmax prediction equals the target.
    """
    model.train()
    loss_sum = 0
    num_correct = 0
    num_seen = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Bookkeeping uses the logits computed before this step's weight update.
        loss_sum += batch_loss.item()
        predictions = logits.argmax(dim=1)
        num_correct += (predictions == labels).sum().item()
        num_seen += labels.size(0)

    mean_loss = loss_sum / len(train_loader)
    accuracy = num_correct / num_seen
    return mean_loss, accuracy

# Evaluation function
def evaluate(model, test_loader, criterion, device):
    """Measure loss and accuracy of ``model`` over ``test_loader``.

    Puts ``model`` in eval mode and runs every batch under
    ``torch.no_grad()``; no parameters are updated.

    Args:
        model: Module being evaluated; called as ``model(data)``.
        test_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Callable ``criterion(output, target)`` returning a scalar loss.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``: the sum of the per-batch loss values
        divided by the number of batches, and the fraction of samples whose
        argmax prediction equals the target.
    """
    model.eval()
    loss_sum = 0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            loss_sum += criterion(logits, labels).item()
            predictions = logits.argmax(dim=1)
            num_correct += (predictions == labels).sum().item()
            num_seen += labels.size(0)

    mean_loss = loss_sum / len(test_loader)
    accuracy = num_correct / num_seen
    return mean_loss, accuracy

In [None]:
# Train the model for num_epochs epochs, recording loss and accuracy after each one
num_epochs = 5
train_losses, test_losses = [], []
train_accs, test_accs = [], []

for epoch in range(num_epochs):
    # One training pass over train_loader, then one evaluation pass over test_loader.
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    # Keep the per-epoch history in the four lists created above.
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    # epoch counts from 0; it is printed 1-based.
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

In [None]:
# Visualize the training history: loss curves on the left, accuracy curves on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training vs. test loss (x positions are the 0-based list indices)
axes[0].plot(train_losses, label='训练损失')
axes[0].plot(test_losses, label='测试损失')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].set_title('损失曲线')

# Right panel: training vs. test accuracy
axes[1].plot(train_accs, label='训练准确率')
axes[1].plot(test_accs, label='测试准确率')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].legend()
axes[1].set_title('准确率曲线')

plt.tight_layout()
plt.show()

## 7. 可视化特征图

In [None]:
# Visualize the feature maps produced by the convolution layers
def visualize_feature_maps(model, image):
    """Show ``image`` next to the first 8 feature maps of ``conv1`` and ``conv2``.

    Temporary forward hooks capture the outputs of ``model.conv1`` and
    ``model.conv2`` during one forward pass run under ``torch.no_grad()``.
    The hooks are removed as soon as the pass finishes (even if it raises),
    so repeated calls do not pile up hooks on the model.

    Args:
        model: Module with ``conv1`` and ``conv2`` submodules, each producing
            at least 8 output channels.
        image: A single un-batched input tensor; a batch axis is added here.
            Assumed to squeeze to a 2-D image for display — confirm for
            inputs other than (1, H, W).

    Side effects:
        Leaves ``model`` in eval mode and shows a matplotlib figure.
    """
    model.eval()

    # Captured layer outputs, keyed by layer name
    activation = {}

    def get_activation(name):
        def hook(module, inputs, output):
            activation[name] = output.detach()
        return hook

    # Keep the handles so the hooks can be detached again.
    handles = [
        model.conv1.register_forward_hook(get_activation('conv1')),
        model.conv2.register_forward_hook(get_activation('conv2')),
    ]
    try:
        # Run on whatever device the model's parameters live on.
        model_device = next(model.parameters()).device
        with torch.no_grad():
            model(image.unsqueeze(0).to(model_device))
    finally:
        for handle in handles:
            handle.remove()

    fig, axes = plt.subplots(3, 8, figsize=(16, 6))

    # Row 0: the original image in the first cell, the other cells blanked
    axes[0, 0].imshow(image.squeeze().cpu(), cmap='gray')
    axes[0, 0].set_title('原图')
    for i in range(1, 8):
        axes[0, i].axis('off')

    # Rows 1 and 2: the first 8 channels of conv1 and conv2
    for row, (layer, prefix) in enumerate((('conv1', 'Conv1'), ('conv2', 'Conv2')), start=1):
        # Index 0 drops only the batch axis added above.
        features = activation[layer][0].cpu()
        for i in range(8):
            axes[row, i].imshow(features[i], cmap='viridis')
            axes[row, i].set_title(f'{prefix}-{i}')
            axes[row, i].axis('off')

    plt.tight_layout()
    plt.show()

# Take the first test-set sample and plot the feature maps it produces
sample_image, sample_label = test_dataset[0]
visualize_feature_maps(model, sample_image)

## 8. 经典 CNN 架构

In [None]:
# VGG-style network
class VGGStyle(nn.Module):
    """VGG-style classifier: two double-convolution blocks plus a linear head.

    ``features`` holds two blocks, each made of two
    ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU`` groups followed by
    ``MaxPool2d(2, 2)``; the channels go 1 -> 64 -> 128. ``classifier`` maps
    the flattened ``128 * 7 * 7`` features through a 256-unit hidden layer
    with dropout (p=0.5) to ``num_classes`` logits.

    Args:
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_bn_relu(in_channels, out_channels):
            # 3x3 convolution with padding 1, then BatchNorm and in-place ReLU
            return [
                nn.Conv2d(in_channels, out_channels, 3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ]

        # All layers sit in one flat Sequential, so the submodule indices run
        # from features.0 to features.13.
        self.features = nn.Sequential(
            # Block 1
            *conv_bn_relu(1, 64),
            *conv_bn_relu(64, 64),
            nn.MaxPool2d(2, 2),
            # Block 2
            *conv_bn_relu(64, 128),
            *conv_bn_relu(128, 128),
            nn.MaxPool2d(2, 2),
        )

        flattened_size = 128 * 7 * 7
        self.classifier = nn.Sequential(
            nn.Linear(flattened_size, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        feature_maps = self.features(x)
        # Flatten everything except the batch dimension.
        batch_size = feature_maps.size(0)
        flat = feature_maps.view(batch_size, -1)
        return self.classifier(flat)

# Instantiate the VGG-style network (not moved to `device`) and report its parameter count.
vgg_model = VGGStyle()
print(f"VGG风格网络参数量: {sum(p.numel() for p in vgg_model.parameters()):,}")

In [None]:
# Residual block (ResNet style)
class ResidualBlock(nn.Module):
    """Two 3x3 conv + BatchNorm layers whose output is added to a skip path.

    The skip path is an empty ``nn.Sequential`` (identity) when ``stride`` is
    1 and the channel counts match; otherwise it is a 1x1 convolution with
    the same stride followed by BatchNorm, so both paths have equal shapes.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels produced by the block.
        stride: Stride of the first convolution and of the 1x1 skip
            convolution. Defaults to 1.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Main path: only the first convolution applies the stride.
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip connection
        shape_is_preserved = stride == 1 and in_channels == out_channels
        if shape_is_preserved:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        main = self.conv1(x)
        main = F.relu(self.bn1(main))
        main = self.conv2(main)
        main = self.bn2(main)
        main += self.shortcut(x)  # residual connection
        return F.relu(main)

# A simple ResNet
class SimpleResNet(nn.Module):
    """Small residual network for 1-channel images.

    A 3x3 stem convolution (1 -> 32 channels) with BatchNorm and ReLU is
    followed by three ``ResidualBlock`` stages (32 -> 32, 32 -> 64 with
    stride 2, 64 -> 128 with stride 2), adaptive average pooling to 1x1, and
    a linear layer producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Stem
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)

        # Residual stages
        self.layer1 = ResidualBlock(32, 32)
        self.layer2 = ResidualBlock(32, 64, stride=2)
        self.layer3 = ResidualBlock(64, 128, stride=2)

        # Head: pooling to 1x1 leaves 128 features per sample.
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            x = stage(x)
        pooled = self.avg_pool(x)
        # Flatten everything except the batch dimension.
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

# Instantiate the simple ResNet (not moved to `device`) and report its parameter count.
resnet_model = SimpleResNet()
print(f"简单ResNet参数量: {sum(p.numel() for p in resnet_model.parameters()):,}")

## 9. 练习题

### 练习：修改 CNN 架构并比较性能

In [None]:
# 在这里编写代码
# 1. 增加/减少卷积层数量
# 2. 改变卷积核大小
# 3. 添加 Batch Normalization
# 4. 比较不同架构的性能


## 10. 本课小结

1. **卷积层**：提取局部特征，参数共享
2. **池化层**：降维，增加平移不变性
3. **经典架构**：VGG、ResNet 等
4. **关键技术**：BatchNorm、残差连接、Dropout
5. **应用场景**：图像分类、目标检测、语义分割