# Assignment 1

This code baseline is inspired by and modified from [this great tutorial](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).

This code can achieve an accuracy of approximately 86.50% on CIFAR-10. Please set up the environment and run your experiments starting from this baseline. You are expected to achieve an accuracy higher than this baseline.

In [None]:
# import some necessary packages
import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.datasets as tv_datasets
import torchvision.transforms as tv_transforms


import matplotlib.pyplot as plt

In [None]:
# Residual building block
class Resblock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages plus a shortcut.

    The main path is ``conv1 -> bn1 -> relu1 -> conv2 -> bn2``. Its output is
    added to the shortcut and passed through ``relu2``.

    The shortcut is the unchanged input, unless ``downsample`` is set or the
    channel count changes; in that case a bias-free 1x1 convolution
    (``conv3``) projects the input to the main path's output shape.

    Args:
        in_channel: Number of input channels.
        out_channel: Number of output channels.
        downsample: If true, ``conv1`` and the shortcut projection use
            stride 2 instead of stride 1.
    """

    def __init__(self, in_channel, out_channel, downsample=False):
        super().__init__()
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.downsample = downsample

        stride = 2 if downsample else 1

        # The projection is registered before the main-path layers so the
        # parameter order (and the order random initial weights are drawn
        # in) stays the same.
        if self._needs_projection():
            self.conv3 = nn.Conv2d(
                in_channel, out_channel,
                kernel_size=1, stride=stride, padding=0, bias=False,
            )

        self.conv1 = nn.Conv2d(
            in_channel, out_channel,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channel)
        self.relu1 = nn.ReLU(inplace=True)
        self.bn2 = nn.BatchNorm2d(out_channel)
        self.conv2 = nn.Conv2d(
            out_channel, out_channel,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.relu2 = nn.ReLU(inplace=True)

    def _needs_projection(self):
        """Return a truthy value when the shortcut must go through ``conv3``."""
        return self.downsample or self.in_channel != self.out_channel

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum."""
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)

        shortcut = self.conv3(x) if self._needs_projection() else x
        return self.relu2(out + shortcut)



In [None]:
# Attention modules
class SEBlock(nn.Module):
    """Squeeze-and-Excitation channel attention.

    Each channel is averaged over its spatial positions (squeeze), the
    resulting per-channel vector is passed through a two-layer bottleneck
    ending in a sigmoid (excitation), and the input is rescaled channel-wise
    by the resulting weights.

    Args:
        in_channels: Number of channels of the input feature map.
        reduction_ratio: Factor by which the bottleneck narrows the channel
            dimension. The bottleneck width is
            ``in_channels // reduction_ratio`` but never less than 1, so the
            block still learns something when ``in_channels`` is smaller
            than ``reduction_ratio``.
    """

    def __init__(self, in_channels, reduction_ratio=16):
        super().__init__()
        # Without the clamp, in_channels < reduction_ratio gives a zero-width
        # bottleneck: the second layer then outputs all zeros and every
        # channel weight degenerates to sigmoid(0) = 0.5.
        hidden_channels = max(1, in_channels // reduction_ratio)

        # Squeeze: global average pooling, [B, C, H, W] -> [B, C, 1, 1]
        self.squeeze = nn.AdaptiveAvgPool2d(1)

        # Excitation: two bias-free fully connected layers
        self.excitation = nn.Sequential(
            # First FC layer compresses C channels to the bottleneck width
            nn.Linear(in_channels, hidden_channels, bias=False),
            nn.ReLU(inplace=True),
            # Second FC layer restores the channel count to C
            nn.Linear(hidden_channels, in_channels, bias=False),
            nn.Sigmoid(),  # weights in (0, 1)
        )

    def forward(self, x):
        """Return ``x`` rescaled by its learned per-channel weights.

        Args:
            x: 4-D feature map; its second dimension must equal
                ``in_channels``.
        """
        b, c, _, _ = x.size()
        # [B, C, 1, 1] -> [B, C]
        y = self.squeeze(x).view(b, c)
        # [B, C] -> channel weights [B, C] -> [B, C, 1, 1]
        y = self.excitation(y).view(b, c, 1, 1)

        # Rescale: broadcasting stretches the weights over H and W.
        return x * y
# Channel half of CBAM; kept next to SEBlock because the two are closely related.
class ChannelAttentionModule(nn.Module):
    """Channel attention using both average- and max-pooled descriptors.

    The input is pooled over its spatial dimensions in two ways (mean and
    max). Both descriptors go through the same bias-free two-layer MLP, the
    two results are summed, and a sigmoid turns the sum into per-channel
    weights that rescale the input.

    Args:
        in_channels: Number of channels of the input feature map.
        reduction_ratio: Factor by which the MLP's hidden layer narrows the
            channel dimension. The hidden width is
            ``in_channels // reduction_ratio`` but never less than 1.
    """

    def __init__(self, in_channels, reduction_ratio=16):
        super().__init__()
        # Without the clamp, in_channels < reduction_ratio gives a zero-width
        # hidden layer and the attention collapses to a constant 0.5.
        hidden_channels = max(1, in_channels // reduction_ratio)

        # One MLP shared by both pooling branches
        self.shared_mlp = nn.Sequential(
            nn.Linear(in_channels, hidden_channels, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, in_channels, bias=False),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied by its channel attention weights.

        Args:
            x: 4-D feature map; its second dimension must equal
                ``in_channels``.
        """
        b, c, _, _ = x.size()

        # Average-pooling branch: mean over the two spatial dims -> [B, C]
        avg_out = self.shared_mlp(torch.mean(x, dim=[2, 3])).view(b, c, 1, 1)

        # Max-pooling branch: max over dim 3, then over dim 2 -> [B, C]
        max_out = self.shared_mlp(torch.max(x, dim=3)[0].max(dim=2)[0]).view(b, c, 1, 1)

        # Sum the two branches, then squash to (0, 1) to get the weights
        channel_weights = self.sigmoid(avg_out + max_out)

        # Apply the weights to the original input (broadcast over H and W)
        return x * channel_weights
class SpatialAttentionModule(nn.Module):
    """Spatial attention: weight every location by a learned single-channel map.

    The input is reduced along the channel dimension with both a mean and a
    max. The two resulting maps are stacked into a 2-channel tensor, and a
    bias-free convolution followed by a sigmoid turns it into a 1-channel
    map of weights that multiplies the input.

    Args:
        kernel_size: Side length of the convolution kernel. The padding is
            ``(kernel_size - 1) // 2``.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = nn.Conv2d(
            2, 1, kernel_size,
            padding=(kernel_size - 1) // 2,
            bias=False,
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied by its spatial attention map."""
        # Channel-wise statistics, each keeping a singleton channel dim
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]

        # Stack both statistics into a 2-channel descriptor
        pooled = torch.cat((channel_mean, channel_max), dim=1)

        # 1-channel map of weights in (0, 1)
        attention_map = self.sigmoid(self.conv(pooled))

        # Weight the input itself; the map broadcasts over the channel dim
        return x * attention_map
class CBAM(nn.Module):
    """Channel attention followed by spatial attention.

    Args:
        in_channels: Number of channels of the input feature map; passed to
            the channel attention module.
        reduction_ratio: Passed to the channel attention module.
        kernel_size: Passed to the spatial attention module.
    """

    def __init__(self, in_channels, reduction_ratio=16, kernel_size=7):
        super().__init__()
        self.channel_attention = ChannelAttentionModule(in_channels, reduction_ratio)
        self.spatial_attention = SpatialAttentionModule(kernel_size)

    def forward(self, x):
        """Apply channel attention first, then spatial attention."""
        channel_refined = self.channel_attention(x)
        return self.spatial_attention(channel_refined)

In [None]:
# Experimental setup: compute device and training hyper-parameters.

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

num_epochs = 128   # passes over the training set
batch_size = 128   # samples per mini-batch
num_workers = 2    # DataLoader worker processes
print_every = 200  # report the running training loss every N iterations

# Active optimizer: the name is looked up in torch.optim and the kwargs
# are forwarded to its constructor.
optim_name = "Adam"
optim_kwargs = {
    "lr": 1e-4,
    "weight_decay": 1e-6,
}

# Alternative: SGD with momentum
# optim_name = "SGD"
# optim_kwargs = dict(
#     lr=0.005,
#     momentum=0.9,
#     weight_decay=5e-4,
# )

# Alternative: AdamW
# optim_name = "AdamW"
# optim_kwargs = dict(
#     lr=3e-4,
#     weight_decay=1e-2,
# )

# Preprocessing pipelines for the input images, keyed by split name.
#
# Each split's pipeline is defined exactly once here. An earlier version
# built a first set of pipelines in a loop and then overwrote both entries
# in a second loop; only the pipelines that actually took effect are kept.
transformation = {
    # Training split: random augmentation, then tensor conversion.
    "train": tv_transforms.Compose([
        # 1. Apply TrivialAugmentWide first
        tv_transforms.TrivialAugmentWide(),
        tv_transforms.RandomRotation(degrees=15),
        # 2. Keep the core geometric augmentations
        tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        tv_transforms.RandomHorizontalFlip(),

        # 3. Convert to a tensor
        tv_transforms.ToTensor(),

        # 4. Normalize each channel with mean 0.5 and std 0.5
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),

        # 5. Random erasing runs last, on the normalized tensor
        tv_transforms.RandomErasing(p=0.25, scale=(0.02, 0.2)),
    ]),
    # Test split: no augmentation, only tensor conversion and normalization.
    "test": tv_transforms.Compose([
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]),
}

In [None]:
# Build the CIFAR-10 dataset and DataLoader for each split.
# Only the training split is shuffled; data is downloaded to ./data if needed.
dataset, loader = {}, {}
for data_type, is_train in (("train", True), ("test", False)):
    split = tv_datasets.CIFAR10(
        root="./data",
        train=is_train,
        download=True,
        transform=transformation[data_type],
    )
    dataset[data_type] = split
    loader[data_type] = torch.utils.data.DataLoader(
        split,
        batch_size=batch_size,
        shuffle=is_train,
        num_workers=num_workers,
    )


In [None]:
# Deeper and wider VGG-style network.
#
# Each stage is `num_convs` x (3x3 conv -> batch norm -> ReLU) followed by a
# 2x2 max-pool that halves the spatial size. For a 32x32 input:
#   stage 1: (3, 32, 32)   -> (64, 16, 16)
#   stage 2: (64, 16, 16)  -> (128, 8, 8)
#   stage 3: (128, 8, 8)   -> (256, 4, 4)
#   stage 4: (256, 4, 4)   -> (512, 2, 2)
_STAGES = (
    # (in_channels, out_channels, num_convs)
    (3, 64, 2),
    (64, 128, 2),
    (128, 256, 3),
    (256, 512, 3),
)

_layers = []
for _in_ch, _out_ch, _num_convs in _STAGES:
    for _conv_idx in range(_num_convs):
        # Only the first conv of a stage changes the channel count.
        _src_ch = _in_ch if _conv_idx == 0 else _out_ch
        _layers += [
            nn.Conv2d(_src_ch, _out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(_out_ch),
            nn.ReLU(inplace=True),
        ]
    _layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

# Flatten, then a three-layer classifier with dropout
_layers += [
    nn.Flatten(),
    nn.Linear(512 * 2 * 2, 4096),
    nn.ReLU(inplace=True),
    nn.Dropout(0.5),
    nn.Linear(4096, 4096),
    nn.ReLU(inplace=True),
    nn.Dropout(0.5),
    nn.Linear(4096, 10),
]

net = nn.Sequential(*_layers)

In [None]:
# augmentation (CutMix helpers)
import numpy as np

def rand_bbox(size, lam):
    """Sample a random bounding box for CutMix.

    The box is centred on a uniformly random pixel. Its target area is a
    ``1 - lam`` fraction of the image, and it is clipped to the image, so
    the returned box can be smaller than the target.

    x-coordinates are bounded by ``size[3]`` and y-coordinates by
    ``size[2]``. This matches ``cutmix_data``, which slices dimension 2 with
    the y-coordinates and dimension 3 with the x-coordinates. For square
    images the result is the same as before this convention was fixed.

    Args:
        size: Shape of the image batch; only ``size[2]`` and ``size[3]``
            are read.
        lam: Mixing ratio; ``sqrt(1 - lam)`` is used as the side-length
            ratio of the box, so it is expected to lie in ``[0, 1]``.

    Returns:
        ``(bbx1, bby1, bbx2, bby2)`` as plain ints, with
        ``0 <= bbx1 <= bbx2 <= size[3]`` and ``0 <= bby1 <= bby2 <= size[2]``.
    """
    # Dimension 2 is the one sliced with y-coordinates, dimension 3 with x.
    H = size[2]
    W = size[3]

    # Side lengths of the box derived from the mixing ratio
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # Uniformly random centre of the box
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    # Corner coordinates, clipped so the box stays inside the image
    bbx1 = int(np.clip(cx - cut_w // 2, 0, W))
    bby1 = int(np.clip(cy - cut_h // 2, 0, H))
    bbx2 = int(np.clip(cx + cut_w // 2, 0, W))
    bby2 = int(np.clip(cy + cut_h // 2, 0, H))

    return bbx1, bby1, bbx2, bby2

def cutmix_data(x, y, alpha=1.0):
    """Apply CutMix to one batch.

    A rectangular region of every image is replaced by the same region of
    another image drawn from a random permutation of the batch.

    Note that ``x`` is modified in place; the returned tensor is the same
    object as the argument.

    Args:
        x: Batch of input images (tensor); sliced as
            ``x[:, :, rows, cols]``.
        y: Labels for the batch (tensor), indexed along dimension 0.
            Assumed to be on the same device as ``x``.
        alpha: Parameter of the ``Beta(alpha, alpha)`` distribution the
            mixing ratio is drawn from. With ``alpha <= 0`` the ratio is
            fixed to 1, which yields an empty box and leaves ``x`` unchanged.

    Returns:
        ``(mixed_x, y_a, y_b, lam)``: the mixed images, the original labels,
        the labels of the images that supplied the pasted region, and the
        fraction of each image still belonging to ``y_a``.
    """
    # Sample the mixing ratio lambda
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    num_samples = x.size(0)
    # Random partner for every image. The index is moved to the batch's own
    # device rather than a module-level one, so the function works wherever
    # the batch lives.
    index = torch.randperm(num_samples).to(x.device)

    # Labels of the two images that get mixed
    y_a, y_b = y, y[index]

    # Sample the box to cut
    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)

    # Paste the partner's region into each image. The right-hand side is
    # gathered through `index` into a new tensor before the assignment.
    x[:, :, bby1:bby2, bbx1:bbx2] = x[index, :, bby1:bby2, bbx1:bbx2]

    # The box may have been clipped, so recompute lambda from its real area
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size()[-1] * x.size()[-2]))

    return x, y_a, y_b, lam
def cutmix_criterion(criterion, pred, y_a, y_b, lam):
    """Compute the CutMix loss as a weighted sum of two losses.

    Args:
        criterion: Base loss function (e.g. ``nn.CrossEntropyLoss``), called
            as ``criterion(pred, target)``.
        pred: Model output for the mixed batch.
        y_a, y_b: Labels of the two images that were mixed.
        lam: Mixing ratio; weight of the ``y_a`` term.

    Returns:
        ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)



In [None]:
# attention
# class NetWithAttention(nn.Module):
#     def __init__(self):
#         super(NetWithAttention, self).__init__()

#         # 将卷积层分解成更小的块，方便插入注意力
#         self.conv_block1 = nn.Sequential(
#             nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3)
#         )
#         self.conv_block2 = nn.Sequential(
#             nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3)
#         )
#         self.conv_block3 = nn.Sequential(
#             nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#             nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True)
#             # 在这里，我们准备插入SE模块
#         )

#         # --- 在这里定义我们的SE模块 ---
#         # 输入通道数是 512，因为 conv_block3 的输出通道是 512
#         self.attention = SEBlock(in_channels=512)

#         self.conv_block4 = nn.Sequential(
#             nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3)
#         )

#         self.classifier = nn.Sequential(
#             nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#             nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#             nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#             nn.Linear(128, 10),
#         )

#     def forward(self, x):
#         # 定义新的数据流
#         x = self.conv_block1(x)
#         x = self.conv_block2(x)
#         x = self.conv_block3(x)

#         # --- 在数据流中应用注意力 ---
#         x = self.attention(x)

#         x = self.conv_block4(x)
#         x = torch.flatten(x, 1)
#         x = self.classifier(x)
#         return x
class NetWithMultipleCBAM(nn.Module):
    """CNN with a CBAM attention module after each of its four conv blocks.

    Blocks 1, 2 and 4 each end with a 2x2 max-pool and dropout; block 3 has
    two convolutions and no pooling. With three poolings a 32x32 input
    (e.g. CIFAR-10) reaches the classifier as 256 x 4 x 4, which is the
    input size the first linear layer is built for. A 64x64 input would
    arrive as 256 x 8 x 8 and would need a different first linear layer.

    Args:
        num_classes: Size of the final output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_bn_relu(c_in, c_out):
            # Batch norm sits between the convolution and the ReLU.
            return [
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ]

        def pool_drop():
            return [nn.MaxPool2d(2), nn.Dropout(0.3)]

        def fc_relu_drop(n_in, n_out):
            return [nn.Linear(n_in, n_out), nn.ReLU(inplace=True), nn.Dropout(0.5)]

        # Conv block 1 + attention
        self.conv_block1 = nn.Sequential(*conv_bn_relu(3, 128), *pool_drop())
        self.attention1 = CBAM(in_channels=128)

        # Conv block 2 + attention
        self.conv_block2 = nn.Sequential(*conv_bn_relu(128, 256), *pool_drop())
        self.attention2 = CBAM(in_channels=256)

        # Conv block 3 (two convolutions, no pooling) + attention
        self.conv_block3 = nn.Sequential(
            *conv_bn_relu(256, 512),
            *conv_bn_relu(512, 512),
        )
        self.attention3 = CBAM(in_channels=512)

        # Conv block 4 + attention
        self.conv_block4 = nn.Sequential(*conv_bn_relu(512, 256), *pool_drop())
        self.attention4 = CBAM(in_channels=256)

        # Classifier; the input size 256 * 4 * 4 assumes 32x32 input images.
        self.classifier = nn.Sequential(
            *fc_relu_drop(256 * 4 * 4, 512),
            *fc_relu_drop(512, 256),
            *fc_relu_drop(256, 128),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Run the four conv + attention stages, then flatten and classify."""
        stages = (
            (self.conv_block1, self.attention1),
            (self.conv_block2, self.attention2),
            (self.conv_block3, self.attention3),
            (self.conv_block4, self.attention4),
        )
        for conv_block, attention in stages:
            x = attention(conv_block(x))

        # Flatten everything except the batch dimension, then classify
        return self.classifier(torch.flatten(x, 1))

In [None]:
# 残差
# class CIFAR_Net_With_Resblock(nn.Module):
#     def __init__(self):
#         super().__init__()

#         # --- 1. 初始卷积和下采样部分 (对应你原来的前两个卷积块) ---
#         self.stem = nn.Sequential(
#             nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
#             nn.BatchNorm2d(64),
#             nn.ReLU(inplace=True)
#         )

#         self.layer1 = Resblock(in_channel=64,  out_channel=128, downsample=True)
#         self.layer2 = Resblock(in_channel=128,  out_channel=128, downsample=False)
#         # 128x16x16 -> 256x8x8
#         self.layer3 = Resblock(in_channel=128, out_channel=256, downsample=True)
#         self.layer4 = Resblock(in_channel=256, out_channel=256, downsample=False)
#         # 256x8x8 -> 512x4x4
#         self.layer5 = Resblock(in_channel=256, out_channel=512, downsample=True)
#         # 为了增加网络深度，可以再加一个普通的Resblock
#         # 512x4x4 -> 512x4x4
#         self.layer6 = Resblock(in_channel=512, out_channel=512)

#         # --- 3. 全局池化和分类器 ---
#         self.final_pool = nn.AdaptiveAvgPool2d((1, 1))

#         self.classifier = nn.Sequential(
#             nn.Flatten(),
#             # 注意：这里的输入维度是最后一个残差块的输出通道数 (512)
#             nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.4),
#             nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.4),
#             nn.Linear(128, 10)
#         )

#     def forward(self, x):
#         x = self.stem(x)

#         x = self.layer1(x)
#         x = self.layer2(x)
#         x = self.layer3(x)
#         x = self.layer4(x)
#         x = self.layer5(x)
#         x = self.layer6(x)

#         x = self.final_pool(x)
#         x = self.classifier(x)

#         return x
# net = CIFAR_Net_With_Resblock()

In [None]:
# our network architecture
# net = nn.Sequential(
#     nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Flatten(),
#     nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(128, 10),
# )

# Move the model's parameters and buffers to the selected device.
net = net.to(device)

# Report the number of trainable parameters, in millions.
print(f"number of parameters: {sum(p.numel() for p in net.parameters() if p.requires_grad) / 1_000_000:.2f}M")

## Start Training

In [None]:
# Optimizer: look up the class named by `optim_name` in torch.optim and
# build it over the network's parameters.
optimizer_cls = getattr(optim, optim_name)
optimizer = optimizer_cls(net.parameters(), **optim_kwargs)

# Loss function
criterion = nn.CrossEntropyLoss()

# Bookkeeping for the curves plotted after training.
current_iter = 0  # global count of optimizer steps
iter_history, loss_history = [], []  # training loss samples
val_iter_history, val_loss_history, val_acc_history = [], [], []  # per-epoch validation metrics

# Training loop: one training pass and one evaluation pass per epoch.
net.train()
for epoch in range(num_epochs):
    # ---------------- training phase ----------------
    net.train()  # evaluation below switches to eval mode, so switch back
    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img = img.to(device)
        target = target.to(device)

        # Forward pass and loss
        pred = net(img)
        loss = criterion(pred, target)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Statistics
        running_loss += loss.item()
        current_iter += 1  # global iteration counter
        if i % print_every == print_every - 1:
            avg_loss = running_loss / print_every
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            # Record the iteration count together with its average loss
            iter_history.append(current_iter)
            loss_history.append(avg_loss)
            running_loss = 0.0

    # ---------------- evaluation phase ----------------
    # Metrics are computed on loader["test"] and reported as "validation".
    net.eval()
    val_loss = 0.0
    correct, total = 0, 0
    with torch.no_grad():
        for img, target in loader["test"]:
            img = img.to(device)
            target = target.to(device)

            pred = net(img)
            loss = criterion(pred, target)
            val_loss += loss.item()

            # Accuracy: the predicted class is the arg-max over dim 1
            total += target.size(0)
            correct += (pred.argmax(dim=1) == target).sum().item()

    # Mean of the per-batch losses, and accuracy as a percentage
    avg_val_loss = val_loss / len(loader["test"])
    accuracy = 100 * correct / total

    print(f"[Epoch {epoch + 1:3d}] Validation loss: {avg_val_loss:.3f}, Accuracy: {accuracy:.2f}%")

    # Record this epoch's validation metrics at the current iteration count
    val_iter_history.append(current_iter)
    val_loss_history.append(avg_val_loss)
    val_acc_history.append(accuracy)

print("Finished Training")
# Plot training loss, validation loss and validation accuracy against the
# iteration count. Losses use the left y-axis, accuracy the right one.

# Create the figure and the primary y-axis (for loss)
fig, ax1 = plt.subplots(figsize=(12, 6))
plt.title('Training & Validation Metrics')

# Training loss and validation loss on the left y-axis (ax1)
ax1.set_xlabel('Iteration')
ax1.set_ylabel('Loss', color='tab:red')
ax1.plot(iter_history, loss_history, color='tab:red', linestyle='--', alpha=0.7, label='Training Loss')
ax1.plot(val_iter_history, val_loss_history, color='tab:orange', marker='o', label='Validation Loss')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Secondary y-axis (for accuracy) sharing the same x-axis
ax2 = ax1.twinx()
ax2.set_ylabel('Accuracy (%)', color='tab:blue')
ax2.plot(val_iter_history, val_acc_history, color='tab:blue', marker='s', label='Validation Accuracy')
ax2.tick_params(axis='y', labelcolor='tab:blue')

# A figure-level legend collects the labelled lines from both axes
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))

# Final adjustments, then write the figure to disk
fig.tight_layout()  # keep labels from overlapping
plt.grid(True)
plt.savefig('all_metrics_curve.png')

# Only one file is written. The earlier extra message about
# training_loss_curve.png was removed because that file is never saved.
print("\nMetrics curve has been saved to all_metrics_curve.png")

## Evaluating its accuracy

In [None]:
# Final accuracy on the test split, with gradients disabled.
net.eval()
correct = total = 0
with torch.no_grad():
    for img, target in loader["test"]:
        img = img.to(device)
        target = target.to(device)

        # Predicted class = arg-max of the network output over dim 1
        predicted_labels = net(img).argmax(dim=1)

        # Accumulate the sample count and the number of correct predictions
        total += target.size(0)
        correct += (predicted_labels == target).sum().item()

print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")