1、尝试不同数量的神经元/滤波器

In [1]:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from torch import nn
import numpy as np
import torch
import os
import random
from tqdm import tqdm as tqdm
from IPython import display

from models.vgg import VGG_A_Small  # you need to implement this network
from data.loaders import get_cifar_loader

import torchvision.utils
from pathlib import Path


# This function is used to calculate the accuracy of model classification
def get_accuracy(model, dataloader, device):
    """
    Compute the top-1 classification accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode and left in that mode; a caller
    that keeps training afterwards has to call ``model.train()`` itself.

    :param model: model to evaluate; ``model(X)`` must return per-class
        scores with the classes along dimension 1
    :param dataloader: iterable yielding ``(X, y)`` batches (e.g. val_loader)
    :param device: device the batches are moved to before the forward pass
    :return: float, accuracy as a percentage (0-100); ``0.0`` when the
        dataloader yields no samples
    """
    model.eval()
    correct = 0
    total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            preds = model(X).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    # An empty dataloader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return 100.0 * correct / total


# Set a random seed to ensure reproducible results
def set_random_seeds(seed_value=0, device='cpu'):
    """
    Seed the Python, NumPy and PyTorch random number generators.

    :param seed_value: integer seed applied to every generator
    :param device: device string (e.g. ``'cpu'``, ``'cuda'``) or
        ``torch.device``; for a non-CPU device the CUDA generators are seeded
        as well and cuDNN is switched to deterministic, non-benchmark mode
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Normalise before comparing: a ``torch.device`` object is not guaranteed
    # to compare equal to the plain string 'cpu', so ``device != 'cpu'`` would
    # take the CUDA branch when called with torch.device("cpu").
    if torch.device(device).type != 'cpu':
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


# This function runs the entire training process.
# To plot the loss landscape, the loss value of every
# training step has to be recorded. As before, you can
# also evaluate the model after each training epoch
# and save the resulting curves to monitor how
# training progresses.
def train(model, optimizer, criterion, train_loader, val_loader, scheduler=None, epochs_n=100, best_model_path=None):
    """
    Train ``model`` and record the loss of every optimisation step.

    Relies on the module-level ``device`` variable (assigned in the
    ``__main__`` section) and on ``get_accuracy`` defined in this file.

    :param model: network to train; moved to ``device`` first
    :param optimizer: optimizer already bound to ``model``'s parameters
    :param criterion: loss function called as ``criterion(prediction, y)``
    :param train_loader: sized iterable of ``(x, y)`` training batches
    :param val_loader: iterable of ``(x, y)`` batches used for validation
    :param scheduler: optional learning-rate scheduler; ``scheduler.step()``
        is called once at the end of every epoch
    :param epochs_n: number of epochs to run
    :param best_model_path: if set, the model's ``state_dict`` is saved there
        whenever the validation accuracy improves
    :return: ``(losses_list, grads)`` where ``losses_list[e][b]`` is the loss
        of batch ``b`` in epoch ``e``; ``grads`` is always an empty list
        because gradients are not recorded
    """
    model.to(device)
    max_val_accuracy = 0

    batches_n = len(train_loader)
    losses_list = []
    grads = []

    for epoch in range(epochs_n):
        print(f"\nEpoch {epoch + 1}/{epochs_n}")

        model.train()
        loss_list = []

        # The context manager closes the progress bar even if a step raises.
        with tqdm(total=batches_n, desc="Training batches", unit="batch", leave=False) as epoch_progress:
            for x, y in train_loader:
                x = x.to(device)
                y = y.to(device)
                optimizer.zero_grad()
                prediction = model(x)
                loss = criterion(prediction, y)

                loss_value = loss.item()
                loss_list.append(loss_value)

                loss.backward()
                optimizer.step()

                # One progress-bar tick per batch.
                epoch_progress.update(1)
                epoch_progress.set_postfix(loss=f"{loss_value:.4f}")

        losses_list.append(loss_list)
        # Guard against an empty loader instead of dividing by zero.
        epoch_loss = sum(loss_list) / len(loss_list) if loss_list else float('nan')

        val_acc = get_accuracy(model, val_loader, device)

        print(f"Epoch {epoch + 1} - Loss: {epoch_loss:.4f} - Val Acc: {val_acc:.2f}%")

        # Keep the checkpoint with the best validation accuracy so far.
        if val_acc > max_val_accuracy:
            max_val_accuracy = val_acc
            if best_model_path:
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved new best model at epoch {epoch + 1} with val acc {val_acc:.2f}%")

        # Previously the scheduler argument was accepted but never used.
        if scheduler is not None:
            scheduler.step()

    return losses_list, grads


# loss landscape绘制，绘制loss文件夹下所有模型
def plot_loss_landscape(loss_dir,
                        output_image="loss_landscape.png",
                        step_size=10,
                        alpha=0.4,
                        use_log_scale=True,
                        figsize=(12, 8),
                        dpi=100,
                        title="Loss Landscape Comparison",
                        xlabel="Training Steps (grouped by iterations)",
                        ylabel="Loss Value",
                        grid_style=('--', 0.7),
                        legend_loc='upper right',
                        legend_fontsize=12,
                        title_fontsize=16,
                        label_fontsize=14,
                        colors=None):
    """
    Plot the loss landscape of several models as min/max bands.

    Every ``*.txt`` file in ``loss_dir`` is read as whitespace-separated loss
    values. The values are grouped into consecutive windows of ``step_size``
    (a trailing incomplete window is dropped) and the area between the
    per-window minimum and maximum is filled. Files are processed in sorted
    order so colours and legend order are stable between runs.

    Args:
        loss_dir (str): directory containing the loss files
        output_image (str): path of the image file to write
        step_size (int): number of iterations grouped into one step (>= 1)
        alpha (float): opacity of the filled bands (0-1)
        use_log_scale (bool): whether to use a logarithmic y axis
        figsize (tuple): figure size
        dpi (int): figure resolution
        title (str): plot title
        xlabel (str): x-axis label
        ylabel (str): y-axis label
        grid_style (tuple): grid ``(linestyle, alpha)``
        legend_loc (str): legend location
        legend_fontsize (int): legend font size
        title_fontsize (int): title font size
        label_fontsize (int): axis-label font size
        colors (list): custom colour list; the tab10 colormap when None

    Raises:
        ValueError: if ``step_size`` is smaller than 1
    """
    if step_size < 1:
        raise ValueError(f"step_size must be >= 1, got {step_size}")

    fig = plt.figure(figsize=figsize, dpi=dpi)
    try:
        plt.title(title, fontsize=title_fontsize)
        plt.xlabel(xlabel, fontsize=label_fontsize)
        plt.ylabel(ylabel, fontsize=label_fontsize)
        plt.grid(True, linestyle=grid_style[0], alpha=grid_style[1])

        if colors is None:
            colors = plt.cm.tab10(np.linspace(0, 1, 10))

        plotted_any = False
        # glob() order depends on the file system; sort for reproducibility.
        for idx, file_path in enumerate(sorted(Path(loss_dir).glob("*.txt"))):
            model_name = file_path.stem  # the file name doubles as the label

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    all_losses = [float(tok) for line in f for tok in line.split()]
            except (OSError, ValueError) as e:
                # Unreadable or non-numeric file: report it and move on.
                print(f"错误: 读取文件 {file_path} 时出错 - {e}")
                continue

            if not all_losses:
                print(f"警告: {file_path} 没有有效数据，跳过")
                continue

            print(f"处理模型: {model_name} ({len(all_losses)} 个loss值)")

            # One row per step; the incomplete tail window is discarded.
            num_steps = len(all_losses) // step_size
            windows = np.asarray(all_losses[:num_steps * step_size]).reshape(num_steps, step_size)
            min_curve = windows.min(axis=1)
            max_curve = windows.max(axis=1)

            plt.fill_between(np.arange(num_steps), min_curve, max_curve,
                             color=colors[idx % len(colors)],
                             alpha=alpha,
                             label=model_name)
            plotted_any = True

        # legend() warns when there is nothing labelled to show.
        if plotted_any:
            plt.legend(fontsize=legend_fontsize, loc=legend_loc)
        if use_log_scale:
            plt.yscale('log')  # a log axis shows the variation more clearly
        plt.tight_layout()

        plt.savefig(output_image, bbox_inches='tight')
    finally:
        # Free the figure; pyplot keeps every figure alive until closed.
        plt.close(fig)
    print(f"结果已保存至: {output_image}")


if __name__ == '__main__':
    # Experiment 1: train VGG_A_Small, save its per-step losses and plot them
    # together with every other loss file in the same directory.

    # ## Constants (parameters) initialization
    # NOTE(review): num_workers and batch_size are not passed to
    # get_cifar_loader below - confirm the loader's defaults match them.
    num_workers = 4
    batch_size = 128

    # Project directories derived from the parent of the working directory
    # (nothing is added to sys.path here).
    module_path = os.path.dirname(os.getcwd())
    home_path = module_path
    figures_path = os.path.join(home_path, 'reports', 'figures')
    models_path = os.path.join(home_path, 'reports', 'models')

    # No GPU is available, so everything runs on the CPU.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    device = torch.device("cpu")
    print(device)

    # Initialize the data loaders and sanity-check them by looking at a
    # single batch.
    train_loader = get_cifar_loader(train=True)
    val_loader = get_cifar_loader(train=False)
    for X, y in train_loader:
        # X: image tensor, y: label tensor (shapes are printed below)
        print("Batch shape:", X.shape)
        print("Labels shape:", y.shape)
        print("Labels:", y.tolist())

        # Tile the first 8 images into a grid; normalize=True rescales the
        # pixel values to the 0-1 range for display.
        grid = torchvision.utils.make_grid(X[:8], nrow=4, normalize=True)
        npimg = grid.permute(1, 2, 0).numpy()

        plt.figure(figsize=(6, 3))
        plt.imshow(npimg)
        plt.title("Sample Training Images")
        plt.axis("off")
        plt.savefig("sample.png")
        plt.close()  # release the sample figure once it is on disk
        break

    # Train your model
    # feel free to modify
    epo = 20
    loss_save_path = 'loss/optimization/loss_1'
    grad_save_path = 'grad/optimization/grad_1'
    # Create the output directories up front so a missing folder cannot
    # discard the results after the whole training run has finished.
    os.makedirs(loss_save_path, exist_ok=True)
    os.makedirs(grad_save_path, exist_ok=True)

    set_random_seeds(seed_value=2020, device=device)
    model = VGG_A_Small()
    model_name = type(model).__name__

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    loss, grads = train(model, optimizer, criterion, train_loader, val_loader, epochs_n=epo)
    np.savetxt(os.path.join(loss_save_path, f'{model_name}.txt'), loss, fmt='%s', delimiter=' ')
    np.savetxt(os.path.join(grad_save_path, f'{model_name}.txt'), grads, fmt='%s', delimiter=' ')

    # Plot every model whose loss file lives in the loss directory.
    plot_loss_landscape(loss_dir=loss_save_path, output_image="optimization_1.png")


cpu
Files already downloaded and verified
Files already downloaded and verified
Batch shape: torch.Size([128, 3, 32, 32])
Labels shape: torch.Size([128])
Labels: [4, 4, 2, 6, 3, 2, 0, 4, 0, 8, 8, 6, 6, 3, 0, 2, 6, 7, 1, 9, 0, 7, 4, 7, 6, 7, 3, 2, 9, 7, 7, 7, 9, 9, 0, 4, 8, 1, 0, 1, 4, 3, 3, 1, 9, 8, 3, 0, 7, 8, 6, 5, 7, 5, 5, 4, 3, 8, 8, 6, 6, 3, 6, 6, 2, 7, 8, 0, 9, 4, 7, 9, 7, 5, 8, 8, 4, 2, 4, 2, 0, 0, 8, 4, 2, 8, 2, 0, 9, 1, 1, 4, 9, 4, 7, 5, 3, 4, 0, 8, 6, 4, 8, 9, 5, 6, 1, 9, 3, 8, 9, 7, 0, 4, 0, 2, 7, 9, 1, 0, 6, 9, 4, 2, 9, 0, 7, 7]

Epoch 1/20


                                                                                   

Epoch 1 - Loss: 1.3156 - Val Acc: 62.46%

Epoch 2/20


                                                                                   

Epoch 2 - Loss: 0.9082 - Val Acc: 69.07%

Epoch 3/20


                                                                                   

Epoch 3 - Loss: 0.7637 - Val Acc: 70.98%

Epoch 4/20


                                                                                   

Epoch 4 - Loss: 0.6696 - Val Acc: 71.50%

Epoch 5/20


                                                                                   

Epoch 5 - Loss: 0.5871 - Val Acc: 72.62%

Epoch 6/20


                                                                                   

Epoch 6 - Loss: 0.5159 - Val Acc: 74.02%

Epoch 7/20


                                                                                   

Epoch 7 - Loss: 0.4582 - Val Acc: 72.81%

Epoch 8/20


                                                                                   

Epoch 8 - Loss: 0.4070 - Val Acc: 75.76%

Epoch 9/20


                                                                                   

Epoch 9 - Loss: 0.3538 - Val Acc: 75.22%

Epoch 10/20


                                                                                   

Epoch 10 - Loss: 0.3143 - Val Acc: 76.07%

Epoch 11/20


                                                                                   

Epoch 11 - Loss: 0.2692 - Val Acc: 75.41%

Epoch 12/20


                                                                                   

Epoch 12 - Loss: 0.2436 - Val Acc: 76.25%

Epoch 13/20


                                                                                   

Epoch 13 - Loss: 0.2084 - Val Acc: 75.30%

Epoch 14/20


                                                                                   

Epoch 14 - Loss: 0.1842 - Val Acc: 76.33%

Epoch 15/20


                                                                                   

Epoch 15 - Loss: 0.1722 - Val Acc: 75.84%

Epoch 16/20


                                                                                   

Epoch 16 - Loss: 0.1353 - Val Acc: 74.39%

Epoch 17/20


                                                                                   

Epoch 17 - Loss: 0.1237 - Val Acc: 75.50%

Epoch 18/20


                                                                                   

Epoch 18 - Loss: 0.1161 - Val Acc: 76.11%

Epoch 19/20


                                                                                   

Epoch 19 - Loss: 0.1166 - Val Acc: 75.35%

Epoch 20/20


                                                                                   

Epoch 20 - Loss: 0.1043 - Val Acc: 75.67%
处理模型: VGG_A_BatchNorm (7820 个loss值)
处理模型: VGG_A_Small (7820 个loss值)
结果已保存至: optimization_1.png


2、尝试不同的损失函数（带有不同的正则化）

In [2]:
from models.vgg import VGG_A_BatchNorm  

if __name__ == '__main__':
    # Experiment 2: train VGG_A_BatchNorm with a regularised objective
    # (label smoothing in the loss, weight decay in the optimizer).

    # ## Constants (parameters) initialization
    # NOTE(review): num_workers and batch_size are not passed to
    # get_cifar_loader below - confirm the loader's defaults match them.
    num_workers = 4
    batch_size = 128

    # Project directories derived from the parent of the working directory
    # (nothing is added to sys.path here).
    module_path = os.path.dirname(os.getcwd())
    home_path = module_path
    figures_path = os.path.join(home_path, 'reports', 'figures')
    models_path = os.path.join(home_path, 'reports', 'models')

    # No GPU is available, so everything runs on the CPU.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    device = torch.device("cpu")
    print(device)

    train_loader = get_cifar_loader(train=True)
    val_loader = get_cifar_loader(train=False)

    # Train your model
    # feel free to modify
    epo = 20
    loss_save_path = 'loss/optimization/loss_2'
    grad_save_path = 'grad/optimization/grad_2'
    # Create the output directories up front so a missing folder cannot
    # discard the results after the whole training run has finished.
    os.makedirs(loss_save_path, exist_ok=True)
    os.makedirs(grad_save_path, exist_ok=True)

    set_random_seeds(seed_value=2020, device=device)
    model = VGG_A_BatchNorm()
    model_name = type(model).__name__

    lr = 0.001
    criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
    # Use the lr variable so the learning rate is defined in one place.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss, grads = train(model, optimizer, criterion, train_loader, val_loader, epochs_n=epo)
    np.savetxt(os.path.join(loss_save_path, f'{model_name}.txt'), loss, fmt='%s', delimiter=' ')
    np.savetxt(os.path.join(grad_save_path, f'{model_name}.txt'), grads, fmt='%s', delimiter=' ')

    # Plot every model whose loss file lives in the loss directory.
    plot_loss_landscape(loss_dir=loss_save_path, output_image="optimization_2.png")


cpu
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/20


                                                                                   

Epoch 1 - Loss: 1.5732 - Val Acc: 59.88%

Epoch 2/20


                                                                                   

Epoch 2 - Loss: 1.1799 - Val Acc: 69.68%

Epoch 3/20


                                                                                   

Epoch 3 - Loss: 1.0260 - Val Acc: 74.19%

Epoch 4/20


                                                                                   

Epoch 4 - Loss: 0.9338 - Val Acc: 76.53%

Epoch 5/20


                                                                                   

Epoch 5 - Loss: 0.8720 - Val Acc: 77.85%

Epoch 6/20


                                                                                   

Epoch 6 - Loss: 0.8224 - Val Acc: 76.61%

Epoch 7/20


                                                                                   

Epoch 7 - Loss: 0.7804 - Val Acc: 80.37%

Epoch 8/20


                                                                                   

Epoch 8 - Loss: 0.7355 - Val Acc: 77.38%

Epoch 9/20


                                                                                   

Epoch 9 - Loss: 0.7059 - Val Acc: 80.26%

Epoch 10/20


                                                                                   

Epoch 10 - Loss: 0.6784 - Val Acc: 80.82%

Epoch 11/20


                                                                                   

Epoch 11 - Loss: 0.6524 - Val Acc: 81.18%

Epoch 12/20


                                                                                   

Epoch 12 - Loss: 0.6366 - Val Acc: 78.61%

Epoch 13/20


                                                                                   

Epoch 13 - Loss: 0.6234 - Val Acc: 81.98%

Epoch 14/20


                                                                                   

Epoch 14 - Loss: 0.6069 - Val Acc: 80.34%

Epoch 15/20


                                                                                   

Epoch 15 - Loss: 0.5984 - Val Acc: 79.25%

Epoch 16/20


                                                                                   

Epoch 16 - Loss: 0.5926 - Val Acc: 81.11%

Epoch 17/20


                                                                                   

Epoch 17 - Loss: 0.5870 - Val Acc: 81.46%

Epoch 18/20


                                                                                   

Epoch 18 - Loss: 0.5774 - Val Acc: 79.70%

Epoch 19/20


                                                                                   

Epoch 19 - Loss: 0.5808 - Val Acc: 82.45%

Epoch 20/20


                                                                                   

Epoch 20 - Loss: 0.5702 - Val Acc: 80.94%
处理模型: baseline VGG_A_BatchNorm (7820 个loss值)
处理模型: VGG_A_BatchNorm (7820 个loss值)
结果已保存至: optimization_2.png


3、尝试不同的激活函数

In [1]:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from torch import nn
import numpy as np
import torch
import os
import random
from tqdm import tqdm as tqdm
from IPython import display

from models.vgg import VGG_A_Leaky 
from data.loaders import get_cifar_loader

import torchvision.utils
from pathlib import Path


# This function is used to calculate the accuracy of model classification
def get_accuracy(model, dataloader, device):
    """
    Compute the top-1 classification accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode and left in that mode; a caller
    that keeps training afterwards has to call ``model.train()`` itself.

    :param model: model to evaluate; ``model(X)`` must return per-class
        scores with the classes along dimension 1
    :param dataloader: iterable yielding ``(X, y)`` batches (e.g. val_loader)
    :param device: device the batches are moved to before the forward pass
    :return: float, accuracy as a percentage (0-100); ``0.0`` when the
        dataloader yields no samples
    """
    model.eval()
    correct = 0
    total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            preds = model(X).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    # An empty dataloader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return 100.0 * correct / total


# Set a random seed to ensure reproducible results
def set_random_seeds(seed_value=0, device='cpu'):
    """
    Seed the Python, NumPy and PyTorch random number generators.

    :param seed_value: integer seed applied to every generator
    :param device: device string (e.g. ``'cpu'``, ``'cuda'``) or
        ``torch.device``; for a non-CPU device the CUDA generators are seeded
        as well and cuDNN is switched to deterministic, non-benchmark mode
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Normalise before comparing: a ``torch.device`` object is not guaranteed
    # to compare equal to the plain string 'cpu', so ``device != 'cpu'`` would
    # take the CUDA branch when called with torch.device("cpu").
    if torch.device(device).type != 'cpu':
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


# This function runs the entire training process.
# To plot the loss landscape, the loss value of every
# training step has to be recorded. As before, you can
# also evaluate the model after each training epoch
# and save the resulting curves to monitor how
# training progresses.
def train(model, optimizer, criterion, train_loader, val_loader, scheduler=None, epochs_n=100, best_model_path=None):
    """
    Train ``model`` and record the loss of every optimisation step.

    Relies on the module-level ``device`` variable (assigned in the
    ``__main__`` section) and on ``get_accuracy`` defined in this file.

    :param model: network to train; moved to ``device`` first
    :param optimizer: optimizer already bound to ``model``'s parameters
    :param criterion: loss function called as ``criterion(prediction, y)``
    :param train_loader: sized iterable of ``(x, y)`` training batches
    :param val_loader: iterable of ``(x, y)`` batches used for validation
    :param scheduler: optional learning-rate scheduler; ``scheduler.step()``
        is called once at the end of every epoch
    :param epochs_n: number of epochs to run
    :param best_model_path: if set, the model's ``state_dict`` is saved there
        whenever the validation accuracy improves
    :return: ``(losses_list, grads)`` where ``losses_list[e][b]`` is the loss
        of batch ``b`` in epoch ``e``; ``grads`` is always an empty list
        because gradients are not recorded
    """
    model.to(device)
    max_val_accuracy = 0

    batches_n = len(train_loader)
    losses_list = []
    grads = []

    for epoch in range(epochs_n):
        print(f"\nEpoch {epoch + 1}/{epochs_n}")

        model.train()
        loss_list = []

        # The context manager closes the progress bar even if a step raises.
        with tqdm(total=batches_n, desc="Training batches", unit="batch", leave=False) as epoch_progress:
            for x, y in train_loader:
                x = x.to(device)
                y = y.to(device)
                optimizer.zero_grad()
                prediction = model(x)
                loss = criterion(prediction, y)

                loss_value = loss.item()
                loss_list.append(loss_value)

                loss.backward()
                optimizer.step()

                # One progress-bar tick per batch.
                epoch_progress.update(1)
                epoch_progress.set_postfix(loss=f"{loss_value:.4f}")

        losses_list.append(loss_list)
        # Guard against an empty loader instead of dividing by zero.
        epoch_loss = sum(loss_list) / len(loss_list) if loss_list else float('nan')

        val_acc = get_accuracy(model, val_loader, device)

        print(f"Epoch {epoch + 1} - Loss: {epoch_loss:.4f} - Val Acc: {val_acc:.2f}%")

        # Keep the checkpoint with the best validation accuracy so far.
        if val_acc > max_val_accuracy:
            max_val_accuracy = val_acc
            if best_model_path:
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved new best model at epoch {epoch + 1} with val acc {val_acc:.2f}%")

        # Previously the scheduler argument was accepted but never used.
        if scheduler is not None:
            scheduler.step()

    return losses_list, grads


# loss landscape绘制，绘制loss文件夹下所有模型
def plot_loss_landscape(loss_dir,
                        output_image="loss_landscape.png",
                        step_size=10,
                        alpha=0.4,
                        use_log_scale=True,
                        figsize=(12, 8),
                        dpi=100,
                        title="Loss Landscape Comparison",
                        xlabel="Training Steps (grouped by iterations)",
                        ylabel="Loss Value",
                        grid_style=('--', 0.7),
                        legend_loc='upper right',
                        legend_fontsize=12,
                        title_fontsize=16,
                        label_fontsize=14,
                        colors=None):
    """
    Plot the loss landscape of several models as min/max bands.

    Every ``*.txt`` file in ``loss_dir`` is read as whitespace-separated loss
    values. The values are grouped into consecutive windows of ``step_size``
    (a trailing incomplete window is dropped) and the area between the
    per-window minimum and maximum is filled. Files are processed in sorted
    order so colours and legend order are stable between runs.

    Args:
        loss_dir (str): directory containing the loss files
        output_image (str): path of the image file to write
        step_size (int): number of iterations grouped into one step (>= 1)
        alpha (float): opacity of the filled bands (0-1)
        use_log_scale (bool): whether to use a logarithmic y axis
        figsize (tuple): figure size
        dpi (int): figure resolution
        title (str): plot title
        xlabel (str): x-axis label
        ylabel (str): y-axis label
        grid_style (tuple): grid ``(linestyle, alpha)``
        legend_loc (str): legend location
        legend_fontsize (int): legend font size
        title_fontsize (int): title font size
        label_fontsize (int): axis-label font size
        colors (list): custom colour list; the tab10 colormap when None

    Raises:
        ValueError: if ``step_size`` is smaller than 1
    """
    if step_size < 1:
        raise ValueError(f"step_size must be >= 1, got {step_size}")

    fig = plt.figure(figsize=figsize, dpi=dpi)
    try:
        plt.title(title, fontsize=title_fontsize)
        plt.xlabel(xlabel, fontsize=label_fontsize)
        plt.ylabel(ylabel, fontsize=label_fontsize)
        plt.grid(True, linestyle=grid_style[0], alpha=grid_style[1])

        if colors is None:
            colors = plt.cm.tab10(np.linspace(0, 1, 10))

        plotted_any = False
        # glob() order depends on the file system; sort for reproducibility.
        for idx, file_path in enumerate(sorted(Path(loss_dir).glob("*.txt"))):
            model_name = file_path.stem  # the file name doubles as the label

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    all_losses = [float(tok) for line in f for tok in line.split()]
            except (OSError, ValueError) as e:
                # Unreadable or non-numeric file: report it and move on.
                print(f"错误: 读取文件 {file_path} 时出错 - {e}")
                continue

            if not all_losses:
                print(f"警告: {file_path} 没有有效数据，跳过")
                continue

            print(f"处理模型: {model_name} ({len(all_losses)} 个loss值)")

            # One row per step; the incomplete tail window is discarded.
            num_steps = len(all_losses) // step_size
            windows = np.asarray(all_losses[:num_steps * step_size]).reshape(num_steps, step_size)
            min_curve = windows.min(axis=1)
            max_curve = windows.max(axis=1)

            plt.fill_between(np.arange(num_steps), min_curve, max_curve,
                             color=colors[idx % len(colors)],
                             alpha=alpha,
                             label=model_name)
            plotted_any = True

        # legend() warns when there is nothing labelled to show.
        if plotted_any:
            plt.legend(fontsize=legend_fontsize, loc=legend_loc)
        if use_log_scale:
            plt.yscale('log')  # a log axis shows the variation more clearly
        plt.tight_layout()

        plt.savefig(output_image, bbox_inches='tight')
    finally:
        # Free the figure; pyplot keeps every figure alive until closed.
        plt.close(fig)
    print(f"结果已保存至: {output_image}")

In [2]:


if __name__ == '__main__':
    # Experiment 3: train VGG_A_Leaky (a different activation function) with
    # the same loss and optimizer settings as the regularised run.

    # ## Constants (parameters) initialization
    # NOTE(review): num_workers and batch_size are not passed to
    # get_cifar_loader below - confirm the loader's defaults match them.
    num_workers = 4
    batch_size = 128

    # Project directories derived from the parent of the working directory
    # (nothing is added to sys.path here).
    module_path = os.path.dirname(os.getcwd())
    home_path = module_path
    figures_path = os.path.join(home_path, 'reports', 'figures')
    models_path = os.path.join(home_path, 'reports', 'models')

    # No GPU is available, so everything runs on the CPU.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    device = torch.device("cpu")
    print(device)

    train_loader = get_cifar_loader(train=True)
    val_loader = get_cifar_loader(train=False)

    # Train your model
    # feel free to modify
    epo = 20
    loss_save_path = 'loss/optimization/loss_3'
    grad_save_path = 'grad/optimization/grad_3'
    # Create the output directories up front so a missing folder cannot
    # discard the results after the whole training run has finished.
    os.makedirs(loss_save_path, exist_ok=True)
    os.makedirs(grad_save_path, exist_ok=True)

    set_random_seeds(seed_value=2020, device=device)
    model = VGG_A_Leaky()
    model_name = type(model).__name__

    lr = 0.001
    criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
    # Use the lr variable so the learning rate is defined in one place.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss, grads = train(model, optimizer, criterion, train_loader, val_loader, epochs_n=epo)
    np.savetxt(os.path.join(loss_save_path, f'{model_name}.txt'), loss, fmt='%s', delimiter=' ')
    np.savetxt(os.path.join(grad_save_path, f'{model_name}.txt'), grads, fmt='%s', delimiter=' ')

    # Plot every model whose loss file lives in the loss directory.
    plot_loss_landscape(loss_dir=loss_save_path, output_image="optimization_3.png")


cpu
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/20


                                                                                   

Epoch 1 - Loss: 1.5965 - Val Acc: 60.13%

Epoch 2/20


                                                                                   

Epoch 2 - Loss: 1.1995 - Val Acc: 65.83%

Epoch 3/20


                                                                                   

Epoch 3 - Loss: 1.0493 - Val Acc: 72.25%

Epoch 4/20


                                                                                   

Epoch 4 - Loss: 0.9606 - Val Acc: 73.88%

Epoch 5/20


                                                                                   

Epoch 5 - Loss: 0.9010 - Val Acc: 78.99%

Epoch 6/20


                                                                                   

Epoch 6 - Loss: 0.8491 - Val Acc: 78.60%

Epoch 7/20


                                                                                   

Epoch 7 - Loss: 0.8124 - Val Acc: 79.81%

Epoch 8/20


                                                                                   

Epoch 8 - Loss: 0.7714 - Val Acc: 80.01%

Epoch 9/20


                                                                                   

Epoch 9 - Loss: 0.7384 - Val Acc: 81.05%

Epoch 10/20


                                                                                   

Epoch 10 - Loss: 0.7107 - Val Acc: 80.37%

Epoch 11/20


                                                                                   

Epoch 11 - Loss: 0.6865 - Val Acc: 80.19%

Epoch 12/20


                                                                                   

Epoch 12 - Loss: 0.6669 - Val Acc: 78.02%

Epoch 13/20


                                                                                   

Epoch 13 - Loss: 0.6471 - Val Acc: 79.91%

Epoch 14/20


                                                                                   

Epoch 14 - Loss: 0.6263 - Val Acc: 82.16%

Epoch 15/20


                                                                                   

Epoch 15 - Loss: 0.6142 - Val Acc: 78.26%

Epoch 16/20


                                                                                   

Epoch 16 - Loss: 0.6111 - Val Acc: 80.51%

Epoch 17/20


                                                                                   

Epoch 17 - Loss: 0.6010 - Val Acc: 82.17%

Epoch 18/20


                                                                                   

Epoch 18 - Loss: 0.5873 - Val Acc: 80.74%

Epoch 19/20


                                                                                   

Epoch 19 - Loss: 0.5874 - Val Acc: 83.22%

Epoch 20/20


                                                                                   

Epoch 20 - Loss: 0.5825 - Val Acc: 81.79%
处理模型: VGG_A_BatchNorm (7820 个loss值)
处理模型: VGG_A_Leaky (7820 个loss值)
结果已保存至: optimization_3.png


4、使用 torch.optim 尝试不同的优化器
原始的优化器是Adam，这里尝试了'SGD', 'AdamW', 'RMSprop'三种优化器

In [1]:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from torch import nn
import numpy as np
import torch
import os
import random
from tqdm import tqdm as tqdm

from models.vgg import VGG_A_BatchNorm
from data.loaders import get_cifar_loader

import torchvision.utils
from pathlib import Path


def get_accuracy(model, dataloader, device):
    """
    Compute the top-1 classification accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode and left in that mode; a caller
    that keeps training afterwards has to call ``model.train()`` itself.

    :param model: model to evaluate; ``model(X)`` must return per-class
        scores with the classes along dimension 1
    :param dataloader: iterable yielding ``(X, y)`` batches
    :param device: device the batches are moved to before the forward pass
    :return: float, accuracy as a percentage (0-100); ``0.0`` when the
        dataloader yields no samples
    """
    model.eval()
    correct = 0
    total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            preds = model(X).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    # An empty dataloader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return 100.0 * correct / total


def set_random_seeds(seed_value=0, device='cpu'):
    """
    Seed the Python, NumPy and PyTorch random number generators.

    :param seed_value: integer seed applied to every generator
    :param device: device string (e.g. ``'cpu'``, ``'cuda'``) or
        ``torch.device``; for a non-CPU device the CUDA generators are seeded
        as well and cuDNN is switched to deterministic, non-benchmark mode
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Normalise before comparing: a ``torch.device`` object is not guaranteed
    # to compare equal to the plain string 'cpu', so ``device != 'cpu'`` would
    # take the CUDA branch when called with torch.device("cpu").
    if torch.device(device).type != 'cpu':
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def train(model, optimizer, criterion, train_loader, val_loader, scheduler=None, epochs_n=100, best_model_path=None):
    """
    Run the training loop and collect the loss of every batch.

    Uses the module-level ``device`` variable and the ``get_accuracy``
    function defined in this file.

    :param model: network to train; moved to ``device`` first
    :param optimizer: optimizer bound to ``model``'s parameters
    :param criterion: loss function called as ``criterion(prediction, y)``
    :param train_loader: sized iterable of ``(x, y)`` training batches
    :param val_loader: iterable of ``(x, y)`` validation batches
    :param scheduler: optional scheduler stepped once per epoch
    :param epochs_n: number of epochs to run
    :param best_model_path: if set, the ``state_dict`` is saved there each
        time the validation accuracy improves
    :return: ``(losses, grads)``: one list of batch losses per epoch, and a
        list that is always empty
    """
    model.to(device)

    steps_per_epoch = len(train_loader)
    per_epoch_losses = []
    grads = []
    best_val_acc = 0

    for epoch_idx in range(epochs_n):
        epoch_no = epoch_idx + 1
        print(f"\nEpoch {epoch_no}/{epochs_n}")
        bar = tqdm(total=steps_per_epoch, desc="Training batches", unit="batch", leave=False)
        model.train()
        step_losses = []
        running_loss = 0

        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            current = loss.item()
            step_losses.append(current)
            running_loss += current
            loss.backward()
            optimizer.step()
            bar.update(1)
            bar.set_postfix(loss=f"{current:.4f}")

        bar.close()
        per_epoch_losses.append(step_losses)
        mean_loss = running_loss / steps_per_epoch

        val_acc = get_accuracy(model, val_loader, device)
        print(f"Epoch {epoch_no} - Loss: {mean_loss:.4f} - Val Acc: {val_acc:.2f}%")

        # Checkpoint only on a strict improvement of validation accuracy.
        improved = val_acc > best_val_acc
        if improved:
            best_val_acc = val_acc
        if improved and best_model_path:
            torch.save(model.state_dict(), best_model_path)
            print(f"Saved new best model at epoch {epoch_no} with val acc {val_acc:.2f}%")

        if scheduler:
            scheduler.step()

    return per_epoch_losses, grads


def plot_loss_landscape(loss_dir, output_image="loss_landscape.png", step_size=10, alpha=0.4, use_log_scale=True,
                        figsize=(12, 8), dpi=100, title="Loss Landscape Comparison", xlabel="Training Steps",
                        ylabel="Loss Value", grid_style=('--', 0.7), legend_loc='upper right',
                        legend_fontsize=12, title_fontsize=16, label_fontsize=14, colors=None):
    """Plot a min/max loss band for every ``*.txt`` file in ``loss_dir``.

    Each file is read as whitespace-separated floats (all lines are
    concatenated). The values are split into consecutive windows of
    ``step_size`` values; the minimum and maximum of each window form the
    lower and upper edge of a filled band. A trailing window with fewer than
    ``step_size`` values is dropped. The legend label of a band is the file
    name without its extension.

    Args:
        loss_dir: Directory that is searched (non-recursively) for ``*.txt``.
        output_image: Path the figure is saved to.
        step_size: Number of loss values per window; must be at least 1.
        alpha: Opacity of the filled bands.
        use_log_scale: Use a logarithmic y axis when true.
        figsize: Figure size passed to ``plt.figure``.
        dpi: Figure resolution passed to ``plt.figure``.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        grid_style: ``(linestyle, alpha)`` pair for the grid.
        legend_loc: Legend location.
        legend_fontsize: Legend font size.
        title_fontsize: Title font size.
        label_fontsize: Axis label font size.
        colors: Optional sequence of colours, cycled over the plotted files.
            Defaults to ten samples of the ``tab10`` colormap.

    Raises:
        ValueError: If ``step_size`` is smaller than 1.
    """
    if step_size < 1:
        raise ValueError(f"step_size must be a positive integer, got {step_size}")

    fig = plt.figure(figsize=figsize, dpi=dpi)
    try:
        plt.title(title, fontsize=title_fontsize)
        plt.xlabel(xlabel, fontsize=label_fontsize)
        plt.ylabel(ylabel, fontsize=label_fontsize)
        plt.grid(True, linestyle=grid_style[0], alpha=grid_style[1])

        if colors is None:
            colors = plt.cm.tab10(np.linspace(0, 1, 10))

        plotted = 0
        # Sorted so colour and legend order do not depend on the file system.
        for file_path in sorted(Path(loss_dir).glob("*.txt")):
            model_name = file_path.stem
            all_losses = []
            try:
                with open(file_path, 'r') as f:
                    for line in f:
                        all_losses.extend(float(x) for x in line.split())
            except (OSError, ValueError) as e:
                # Unreadable file or a token that is not a float: skip it.
                print(f"错误: 读取文件 {file_path} 时出错 - {e}")
                continue

            if not all_losses:
                print(f"警告: {file_path} 没有有效数据，跳过")
                continue

            print(f"处理模型: {model_name} ({len(all_losses)} 个loss值)")
            num_steps = len(all_losses) // step_size
            if num_steps == 0:
                # Not even one full window: there is nothing to draw.
                print(f"警告: {file_path} 的loss值少于 step_size={step_size}，跳过")
                continue

            windows = [all_losses[i * step_size:(i + 1) * step_size] for i in range(num_steps)]
            min_curve = [min(window) for window in windows]
            max_curve = [max(window) for window in windows]

            plt.fill_between(np.arange(num_steps), min_curve, max_curve,
                             color=colors[plotted % len(colors)],
                             alpha=alpha,
                             label=model_name)
            plotted += 1

        if plotted:
            # Without labelled artists legend() would only emit a warning.
            plt.legend(fontsize=legend_fontsize, loc=legend_loc)
        if use_log_scale:
            plt.yscale('log')
        plt.tight_layout()
        plt.savefig(output_image, bbox_inches='tight')
    finally:
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close(fig)
    print(f"结果已保存至: {output_image}")


if __name__ == '__main__':
    # Loader settings kept from the original script. They are not passed to
    # get_cifar_loader below, so the loader's own defaults apply.
    num_workers = 4
    batch_size = 128

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    device = torch.device("cpu")
    print(device)

    train_loader = get_cifar_loader(train=True)
    val_loader = get_cifar_loader(train=False)

    # Sanity check: describe the first training batch and save an image grid
    # of its first eight samples, then stop iterating.
    for X, y in train_loader:
        print("Batch shape:", X.shape)
        print("Labels shape:", y.shape)
        print("Labels:", y.tolist())
        grid = torchvision.utils.make_grid(X[:8], nrow=4, normalize=True)
        # Channels-first tensor -> channels-last array for imshow.
        npimg = np.transpose(grid.numpy(), (1, 2, 0))
        plt.figure(figsize=(6, 3))
        plt.imshow(npimg)
        plt.title("Sample Training Images")
        plt.axis("off")
        plt.savefig("sample.png")
        break

    # === Training settings ===
    epo = 20
    lr = 0.001
    set_random_seeds(seed_value=2020, device=device)

    model = VGG_A_BatchNorm()
    model_name = type(model).__name__

    # === Optimizer selection ===
    # The baseline run used Adam.
    optimizer_type = 'AdamW'  # 'SGD', 'AdamW', 'RMSprop'

    optimizer_builders = {
        'SGD': lambda params: torch.optim.SGD(params, lr=lr, momentum=0.9),
        'AdamW': lambda params: torch.optim.AdamW(params, lr=lr),
        'RMSprop': lambda params: torch.optim.RMSprop(params, lr=lr),
    }
    if optimizer_type not in optimizer_builders:
        raise ValueError(f"Unsupported optimizer type: {optimizer_type}")
    optimizer = optimizer_builders[optimizer_type](model.parameters())

    # === Output directories ===
    loss_save_path = 'loss/optimization_2'
    grad_save_path = 'grad/optimization_2'
    for out_dir in (loss_save_path, grad_save_path):
        os.makedirs(out_dir, exist_ok=True)

    criterion = nn.CrossEntropyLoss()
    loss, grads = train(model, optimizer, criterion, train_loader, val_loader, epochs_n=epo)

    # === Save loss / grad ===
    # Both files share one name built from the model class and the optimizer.
    result_file = f'{model_name}_{optimizer_type}.txt'
    np.savetxt(os.path.join(loss_save_path, result_file), loss, fmt='%s', delimiter=' ')
    np.savetxt(os.path.join(grad_save_path, result_file), grads, fmt='%s', delimiter=' ')

    # === Plot the loss curves of every result file in loss_save_path ===
    plot_loss_landscape(loss_dir=loss_save_path, output_image="optimization_4.png")


cpu
Files already downloaded and verified
Files already downloaded and verified
Batch shape: torch.Size([128, 3, 32, 32])
Labels shape: torch.Size([128])
Labels: [8, 7, 2, 9, 6, 6, 5, 4, 1, 7, 2, 2, 4, 6, 8, 6, 7, 8, 8, 7, 2, 5, 5, 7, 3, 5, 4, 2, 7, 8, 0, 0, 7, 4, 5, 3, 8, 0, 3, 7, 2, 1, 3, 3, 6, 4, 9, 6, 5, 8, 5, 5, 5, 0, 1, 5, 1, 2, 6, 9, 8, 0, 9, 2, 7, 5, 4, 3, 5, 4, 3, 4, 5, 6, 2, 7, 1, 0, 9, 3, 8, 6, 2, 5, 0, 6, 5, 0, 3, 1, 4, 9, 8, 6, 7, 6, 5, 7, 0, 7, 6, 5, 7, 1, 2, 8, 6, 0, 5, 9, 4, 3, 0, 4, 3, 5, 6, 1, 9, 5, 5, 7, 6, 6, 4, 0, 8, 6]

Epoch 1/20


                                                                                   

Epoch 1 - Loss: 1.3793 - Val Acc: 60.89%

Epoch 2/20


                                                                                   

Epoch 2 - Loss: 0.8902 - Val Acc: 69.72%

Epoch 3/20


                                                                                   

Epoch 3 - Loss: 0.6857 - Val Acc: 73.78%

Epoch 4/20


                                                                                   

Epoch 4 - Loss: 0.5419 - Val Acc: 72.50%

Epoch 5/20


                                                                                   

Epoch 5 - Loss: 0.4390 - Val Acc: 80.33%

Epoch 6/20


                                                                                   

Epoch 6 - Loss: 0.3569 - Val Acc: 79.09%

Epoch 7/20


                                                                                   

Epoch 7 - Loss: 0.2865 - Val Acc: 79.74%

Epoch 8/20


                                                                                   

Epoch 8 - Loss: 0.2222 - Val Acc: 81.36%

Epoch 9/20


                                                                                   

Epoch 9 - Loss: 0.1698 - Val Acc: 79.29%

Epoch 10/20


                                                                                   

Epoch 10 - Loss: 0.1439 - Val Acc: 82.22%

Epoch 11/20


                                                                                   

Epoch 11 - Loss: 0.1133 - Val Acc: 81.71%

Epoch 12/20


                                                                                   

Epoch 12 - Loss: 0.0994 - Val Acc: 79.33%

Epoch 13/20


                                                                                   

Epoch 13 - Loss: 0.0825 - Val Acc: 81.45%

Epoch 14/20


                                                                                   

Epoch 14 - Loss: 0.0701 - Val Acc: 82.48%

Epoch 15/20


                                                                                   

Epoch 15 - Loss: 0.0646 - Val Acc: 79.26%

Epoch 16/20


                                                                                   

Epoch 16 - Loss: 0.0619 - Val Acc: 82.31%

Epoch 17/20


                                                                                   

Epoch 17 - Loss: 0.0595 - Val Acc: 83.21%

Epoch 18/20


                                                                                   

Epoch 18 - Loss: 0.0512 - Val Acc: 80.80%

Epoch 19/20


                                                                                   

Epoch 19 - Loss: 0.0475 - Val Acc: 83.02%

Epoch 20/20


                                                                                   

Epoch 20 - Loss: 0.0467 - Val Acc: 81.92%
处理模型: VGG_A_BatchNorm (7820 个loss值)
处理模型: VGG_A_BatchNorm_AdamW (7820 个loss值)
处理模型: VGG_A_BatchNorm_RMSprop (7820 个loss值)
处理模型: VGG_A_BatchNorm_SGD (7820 个loss值)
结果已保存至: optimization_4.png


通过比较上面四种优化策略，最终选择使用带有标签平滑（Label Smoothing）的交叉熵损失函数和 AdamW 优化器，其余设置保持不变。重新运行训练并保存模型权重。

In [4]:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from torch import nn
import numpy as np
import torch
import os
import random
from tqdm import tqdm as tqdm

from models.vgg import VGG_A_BatchNorm
from data.loaders import get_cifar_loader

import torchvision.utils
from pathlib import Path


def get_accuracy(model, dataloader, device):
    """Return the classification accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode (and left in it) and gradient
    tracking is disabled while the batches are scored. The predicted class of
    a sample is the index of the largest value along dimension 1 of the model
    output.

    Args:
        model: Model to evaluate; called as ``model(inputs)``.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device both tensors of a batch are moved to.

    Returns:
        Accuracy in percent (0-100) as a float.
    """
    model.eval()
    hits = 0
    seen = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            predicted = torch.argmax(model(inputs), dim=1)
            hits += int(predicted.eq(targets).sum())
            seen += targets.shape[0]

    return 100.0 * hits / seen


def set_random_seeds(seed_value=0, device='cpu'):
    """Seed the NumPy, PyTorch and Python random number generators.

    For any non-CPU device the CUDA generators are seeded as well and cuDNN
    is put into deterministic mode with benchmarking disabled.

    Args:
        seed_value: Seed applied to every generator.
        device: Device the run uses, given as a device string (e.g. ``'cpu'``,
            ``'cuda:0'``) or as a ``torch.device``.
    """
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    random.seed(seed_value)

    # Compare device *types*: a torch.device never compares equal to a plain
    # string, so `device != 'cpu'` would be True for torch.device('cpu') too.
    if isinstance(device, torch.device):
        device_type = device.type
    else:
        device_type = torch.device(device).type

    if device_type != 'cpu':
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def train(model, optimizer, criterion, train_loader, val_loader, scheduler=None, epochs_n=100, best_model_path=None):
    """Train ``model`` and keep track of the epoch with the best validation accuracy.

    The model is moved to the module-level ``device`` (a global read by this
    function). After every epoch the validation accuracy is measured with
    ``get_accuracy``; whenever it exceeds the best value so far, that epoch's
    mean training loss is remembered and, if ``best_model_path`` is set, the
    model's ``state_dict`` is saved there.

    Args:
        model: Network to train.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss callable, applied as ``criterion(prediction, y)``.
        train_loader: Iterable of ``(x, y)`` batches; must support ``len()``.
        val_loader: Loader handed to ``get_accuracy`` after each epoch.
        scheduler: Optional scheduler; ``step()`` is called once per epoch.
        epochs_n: Number of epochs to run.
        best_model_path: Optional checkpoint path for the best model.

    Returns:
        ``(best_loss, max_val_accuracy)``: the mean *training* loss of the
        best epoch and the highest validation accuracy (in percent).
    """
    model.to(device)
    best_epoch = -1
    best_loss = float('inf')
    max_val_accuracy = 0

    for epoch in range(epochs_n):
        epoch_no = epoch + 1
        print(f"\nEpoch {epoch_no}/{epochs_n}")
        model.train()
        running_loss = 0.0
        batches_n = len(train_loader)

        progress = tqdm(train_loader, desc=f"Training Epoch {epoch_no}", leave=False)
        for inputs, targets in progress:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        avg_loss = running_loss / batches_n
        val_acc = get_accuracy(model, val_loader, device)
        print(f"Epoch {epoch_no} - Loss: {avg_loss:.4f} - Val Acc: {val_acc:.2f}%")

        improved = val_acc > max_val_accuracy
        if improved:
            max_val_accuracy, best_loss, best_epoch = val_acc, avg_loss, epoch_no
        # The checkpoint (and its log line) is only written when a path was given.
        if improved and best_model_path:
            torch.save(model.state_dict(), best_model_path)
            print(f"保存新最佳模型：Epoch {best_epoch} | Loss: {best_loss:.4f} | Val Acc: {max_val_accuracy:.2f}%")

        if scheduler:
            scheduler.step()

    print(f"\n最佳模型信息：Epoch {best_epoch} | Loss: {best_loss:.4f} | Val Acc: {max_val_accuracy:.2f}%")
    return best_loss, max_val_accuracy


if __name__ == '__main__':
    # Loader settings kept from the original script. They are not passed to
    # get_cifar_loader below, so the loader's own defaults apply.
    num_workers = 4
    batch_size = 128

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    device = torch.device("cpu")
    print(device)

    train_loader = get_cifar_loader(train=True)
    val_loader = get_cifar_loader(train=False)

    # === Training settings ===
    epo = 20
    lr = 0.001
    set_random_seeds(seed_value=2020, device=device)

    model = VGG_A_BatchNorm()
    model_name = type(model).__name__

    # Final configuration: AdamW with label-smoothed cross entropy.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

    # === Checkpoint location, named after the model class ===
    best_model_dir = 'best_model'
    os.makedirs(best_model_dir, exist_ok=True)
    best_model_path = os.path.join(best_model_dir, f'{model_name}_best.pth')

    # === Run training ===
    best_loss, best_acc = train(model, optimizer, criterion, train_loader, val_loader,
                                epochs_n=epo, best_model_path=best_model_path)

    # === Final report ===
    print("\n训练完成！")
    print(f"最佳模型保存在: {best_model_path}")
    # train() returns the mean *training* loss of the best epoch, not a
    # validation loss, so the label must say so.
    print(f"最佳模型对应的训练 Loss: {best_loss:.4f}")
    print(f"验证集最佳 Accuracy: {best_acc:.2f}%")


cpu
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/20


                                                                   

Epoch 1 - Loss: 1.5725 - Val Acc: 58.99%
保存新最佳模型：Epoch 1 | Loss: 1.5725 | Val Acc: 58.99%

Epoch 2/20


                                                                   

Epoch 2 - Loss: 1.1723 - Val Acc: 68.64%
保存新最佳模型：Epoch 2 | Loss: 1.1723 | Val Acc: 68.64%

Epoch 3/20


                                                                   

Epoch 3 - Loss: 1.0092 - Val Acc: 74.77%
保存新最佳模型：Epoch 3 | Loss: 1.0092 | Val Acc: 74.77%

Epoch 4/20


                                                                   

Epoch 4 - Loss: 0.9066 - Val Acc: 76.66%
保存新最佳模型：Epoch 4 | Loss: 0.9066 | Val Acc: 76.66%

Epoch 5/20


                                                                   

Epoch 5 - Loss: 0.8289 - Val Acc: 80.81%
保存新最佳模型：Epoch 5 | Loss: 0.8289 | Val Acc: 80.81%

Epoch 6/20


                                                                   

Epoch 6 - Loss: 0.7660 - Val Acc: 81.32%
保存新最佳模型：Epoch 6 | Loss: 0.7660 | Val Acc: 81.32%

Epoch 7/20


                                                                   

Epoch 7 - Loss: 0.7110 - Val Acc: 80.02%

Epoch 8/20


                                                                   

Epoch 8 - Loss: 0.6655 - Val Acc: 81.18%

Epoch 9/20


                                                                   

Epoch 9 - Loss: 0.6305 - Val Acc: 81.49%
保存新最佳模型：Epoch 9 | Loss: 0.6305 | Val Acc: 81.49%

Epoch 10/20


                                                                    

Epoch 10 - Loss: 0.6058 - Val Acc: 79.82%

Epoch 11/20


                                                                    

Epoch 11 - Loss: 0.5834 - Val Acc: 81.73%
保存新最佳模型：Epoch 11 | Loss: 0.5834 | Val Acc: 81.73%

Epoch 12/20


                                                                    

Epoch 12 - Loss: 0.5720 - Val Acc: 81.59%

Epoch 13/20


                                                                    

Epoch 13 - Loss: 0.5643 - Val Acc: 82.50%
保存新最佳模型：Epoch 13 | Loss: 0.5643 | Val Acc: 82.50%

Epoch 14/20


                                                                    

Epoch 14 - Loss: 0.5624 - Val Acc: 82.64%
保存新最佳模型：Epoch 14 | Loss: 0.5624 | Val Acc: 82.64%

Epoch 15/20


                                                                    

Epoch 15 - Loss: 0.5492 - Val Acc: 80.15%

Epoch 16/20


                                                                    

Epoch 16 - Loss: 0.5489 - Val Acc: 83.06%
保存新最佳模型：Epoch 16 | Loss: 0.5489 | Val Acc: 83.06%

Epoch 17/20


                                                                    

Epoch 17 - Loss: 0.5429 - Val Acc: 80.96%

Epoch 18/20


                                                                    

Epoch 18 - Loss: 0.5415 - Val Acc: 81.75%

Epoch 19/20


                                                                    

Epoch 19 - Loss: 0.5418 - Val Acc: 81.56%

Epoch 20/20


                                                                    

Epoch 20 - Loss: 0.5360 - Val Acc: 82.62%

最佳模型信息：Epoch 16 | Loss: 0.5489 | Val Acc: 83.06%

训练完成！
最佳模型保存在: best_model\VGG_A_BatchNorm_best.pth
验证集最佳 Loss: 0.5489
验证集最佳 Accuracy: 83.06%
