In [None]:
import torchvision
import torchvision.transforms as transforms

# Mount Google Drive so checkpoints are written under save_path.
from google.colab import drive
drive.mount('/content/drive')
# Directory on the mounted drive where checkpoints are stored.
save_path = '/content/drive/My Drive/Colab Notebooks/Save/'
import os
# Create the path if it does not exist
os.makedirs(save_path, exist_ok=True)
previous_checkpoint = None  # Tracks the filename of the previous checkpoint


# Prepare the datasets and their preprocessing pipelines.

# Per-channel (R, G, B) mean and standard deviation handed to Normalize
# (the second argument is a standard deviation, not a variance).
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: augmentation first, then tensor conversion + normalization.
train_steps = [
    # Pad 4 pixels on every side, then take a random 32x32 crop.
    transforms.RandomCrop(32, padding=4),
    # Flip horizontally with the default probability of the transform.
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
]
transform_train = transforms.Compose(train_steps)

# Test pipeline: no augmentation, only tensor conversion + normalization.
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
]
transform_test = transforms.Compose(test_steps)

# Training split.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
# Test split.
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)



# Sanity check: print the tensor shape and label of the first training sample.
# Indexing directly replaces the enumerate-then-break loop whose index was unused.
x, y = trainset[0]
print(x.shape, y)



import torch


# Batch the datasets; the training loader reshuffles the samples each epoch,
# the test loader keeps a fixed order.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

# Sanity check: print the shape and labels of one training batch.
# next(iter(...)) replaces the enumerate-then-break loop whose index was unused.
x, y = next(iter(trainloader))
print(x.shape, y)




import torch
import torch.nn as nn
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"你正在使用：{device}")
class ResidualBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages plus a shortcut branch.

    The main branch (``left``) is conv-BN-ReLU-conv-BN; only its first
    convolution uses ``stride``. The ``shortcut`` branch is an empty
    ``nn.Sequential`` (identity) unless ``stride != 1`` or the channel count
    changes, in which case it is a 1x1 convolution followed by batch norm so
    that both branches produce tensors of the same shape.

    Args:
        inchannel: Number of input channels.
        outchannel: Number of output channels.
        stride: Stride of the first 3x3 convolution and of the 1x1 shortcut
            convolution.
    """

    def __init__(self, inchannel, outchannel, stride=1):
        super().__init__()

        # Main branch is built (and registered) first, shortcut second, so the
        # parameter order and state_dict keys stay "left.*" then "shortcut.*".
        main_branch = [
            nn.Conv2d(inchannel, outchannel, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
        ]
        self.left = nn.Sequential(*main_branch)

        # A projection is only needed when the main branch changes the shape.
        shape_preserved = stride == 1 and inchannel == outchannel
        projection = []
        if not shape_preserved:
            projection = [
                nn.Conv2d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outchannel),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Return ``relu(left(x) + shortcut(x))``."""
        combined = self.left(x) + self.shortcut(x)
        return F.relu(combined)

class ResNet(nn.Module):
    """ResNet-style classifier: a 3x3 stem, four residual stages, and a linear head.

    Args:
        ResidualBlock: Block class (or factory) called as
            ``block(in_channels, out_channels, stride)`` to build each block.
        num_classes: Number of outputs of the final linear layer.
        num_blocks: Number of blocks in each of the four stages. The default
            ``(1, 1, 1, 1)`` reproduces the layout this class always used, so
            existing checkpoints keep loading.

    Raises:
        ValueError: If ``num_blocks`` does not contain exactly four entries.
    """

    def __init__(self, ResidualBlock, num_classes=10, num_blocks=(1, 1, 1, 1)):
        super().__init__()
        if len(num_blocks) != 4:
            raise ValueError(
                f"num_blocks must have 4 entries (one per stage), got {len(num_blocks)}"
            )
        # Running channel count consumed and updated by make_layer.
        self.inchannel = 64
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.layer1 = self.make_layer(ResidualBlock, 64, num_blocks[0], stride=1)
        self.layer2 = self.make_layer(ResidualBlock, 128, num_blocks[1], stride=2)
        self.layer3 = self.make_layer(ResidualBlock, 256, num_blocks[2], stride=2)
        self.layer4 = self.make_layer(ResidualBlock, 512, num_blocks[3], stride=2)
        self.fc = nn.Linear(512, num_classes)

    def make_layer(self, block, channels, num_blocks, stride):
        """Build one stage of ``num_blocks`` blocks producing ``channels`` channels.

        Only the first block of the stage uses ``stride``; the rest use stride 1.
        ``self.inchannel`` is updated so the next block/stage sees the right
        input width.
        """
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for block_stride in strides:
            layers.append(block(self.inchannel, channels, block_stride))
            self.inchannel = channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling to 1x1. For 32x32 inputs the final feature map
        # is 4x4, so this equals the previous fixed avg_pool2d(out, 4); unlike
        # the fixed window it also works for other input resolutions.
        out = F.adaptive_avg_pool2d(out, 1)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

def ResNet18():
    """Build the ResNet defined in this file from ResidualBlock with 10 classes."""
    model = ResNet(ResidualBlock, num_classes=10)
    return model

# Instantiate the network, move it to the selected device, and print its structure.
net = ResNet18().to(device)
print(net)

# Path to a file containing model weights
#model_path = 'model_epoch_X.pth'  # replace X with the actual epoch number

# Load the model weights
#net.load_state_dict(torch.load(model_path))




def calc_param(model: nn.Module) -> int:
    """Return the total number of elements across all parameters of ``model``.

    Every tensor yielded by ``model.parameters()`` is counted; buffers (for
    example batch-norm running statistics) are not parameters and are excluded.
    """
    return sum(param.numel() for param in model.parameters())

# Report the total parameter count of the instantiated network.
print(f"The number of model parameters is:{calc_param(net)}")





import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Classification loss used for both training and the monitored loss.
criterion = nn.CrossEntropyLoss()

# Adam over all network parameters.
optimizer = optim.Adam(
    params=net.parameters(),
    lr=1e-3,
    weight_decay=5e-4,
)

# Learning-rate scheduler: multiply the learning rate by 0.5 when the monitored
# value (minimised, hence 'min') stops improving, with a patience of 5 steps.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)




def load_checkpoint(model, optimizer, scheduler, filename='model_checkpoint.pth', map_location='cpu'):
    """Restore model, optimizer and scheduler state from a checkpoint file.

    The checkpoint must be a dict with the keys ``'state_dict'``,
    ``'optimizer'``, ``'scheduler'``, ``'epoch'`` and ``'loss'``. The three
    objects must already be constructed; their state is overwritten in place.

    Args:
        model: Module whose ``load_state_dict`` receives ``'state_dict'``.
        optimizer: Optimizer whose ``load_state_dict`` receives ``'optimizer'``.
        scheduler: Scheduler whose ``load_state_dict`` receives ``'scheduler'``.
        filename: Path of the checkpoint file.
        map_location: Forwarded to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint written on a GPU runtime can still be read on a
            CPU-only one; the ``load_state_dict`` calls then copy the values
            into the already-placed model and optimizer.

    Returns:
        Tuple ``(model, optimizer, scheduler, epoch, loss)`` where ``epoch``
        and ``loss`` are the values stored in the checkpoint.
    """
    # Without map_location, tensors saved from CUDA are deserialized back onto
    # CUDA, which fails when no GPU is available.
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])

    # Bookkeeping values stored next to the state dicts.
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    return model, optimizer, scheduler, epoch, loss


# Example: restore the training state from a saved checkpoint when it exists.
RESUME_EPOCH = 43  # epoch number embedded in the checkpoint file name
checkpoint_path = os.path.join(save_path, f'model_checkpoint_{RESUME_EPOCH}.pth')
if os.path.isfile(checkpoint_path):
    model, optimizer, scheduler, start_epoch, val_loss = load_checkpoint(net, optimizer, scheduler, checkpoint_path)
    print(f"Resumed from {checkpoint_path} (epoch {start_epoch})")
else:
    # No checkpoint to resume from: keep the freshly initialised objects
    # instead of aborting with FileNotFoundError.
    model, start_epoch, val_loss = net, 0, None
    print(f"No checkpoint found at {checkpoint_path}; starting from scratch")






def compute_validation_loss(model, val_loader, device, loss_fn=None):
    """Return the per-sample average loss of ``model`` over ``val_loader``.

    The model is switched to evaluation mode and left in that mode; gradients
    are disabled while iterating.

    Args:
        model: Module to evaluate.
        val_loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device the batches are moved to before the forward pass.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor. Defaults to the module-level ``criterion``. The weighting
            by batch size below assumes the loss is a per-batch mean.

    Returns:
        Sum of (batch loss * batch size) divided by the number of samples.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    if loss_fn is None:
        # Fall back to the loss configured at module level.
        loss_fn = criterion

    model.eval()  # evaluation mode for the whole pass
    total_loss = 0.0
    total_count = 0

    with torch.no_grad():  # no gradients needed
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)
            # Weight by batch size so a smaller final batch is not over-counted.
            total_loss += loss_fn(model(x), y).item() * batch_size
            total_count += batch_size

    if total_count == 0:
        raise ValueError("val_loader yielded no samples; cannot compute an average loss")
    return total_loss / total_count

# Number of epochs to train in this session.
NUM_EPOCHS = 135

# Continue the epoch numbering from the restored checkpoint (start_epoch) so the
# logged epoch, the 'epoch' stored in new checkpoints and the checkpoint file
# names do not restart at 1 after a resume.
for epoch in range(start_epoch, start_epoch + NUM_EPOCHS):
    net.train()
    running_loss = 0.0
    for batch_idx, (x, y) in enumerate(trainloader, 0):
        inputs, labels = x.to(device), y.to(device)
        # Reset the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = net(inputs)
        # Compute the loss
        loss = criterion(outputs, labels)
        # Backward pass
        loss.backward()
        # Update the parameters
        optimizer.step()
        running_loss += loss.item()
        if (batch_idx + 1) % 100 == 0:
            # batch_idx is zero-based, so batch_idx + 1 losses have been summed.
            print(f"[EPOCH {epoch + 1:03} | IT {batch_idx + 1:05}] LOSS: {running_loss / (batch_idx + 1): .3f}")

    # Loss that drives the learning-rate scheduler.
    # NOTE(review): this is computed on trainloader (augmented training data),
    # not on a held-out split, although it is named val_loss. It is kept as-is
    # because the restored scheduler state was built from the same quantity.
    val_loss = compute_validation_loss(net, trainloader, device)
    scheduler.step(val_loss)

    # Save model weights plus optimizer and scheduler state.
    checkpoint = {
        'epoch': epoch + 1,
        'state_dict': net.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'loss': val_loss,
    }
    checkpoint_path = os.path.join(save_path, f'model_checkpoint_{epoch+1}.pth')
    torch.save(checkpoint, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

    # Remove the previous checkpoint only after the new one has been written,
    # so a failed save never leaves the run without any checkpoint.
    if previous_checkpoint is not None and previous_checkpoint != checkpoint_path:
        try:
            os.remove(previous_checkpoint)
            print(f"Removed previous checkpoint: {previous_checkpoint}")
        except OSError as e:
            print(f"Error: {previous_checkpoint} : {e.strerror}")

    # Remember the file just written for the next rotation.
    previous_checkpoint = checkpoint_path




def calc_acc(model: nn.Module, testloader: torch.utils.data.DataLoader) -> float:
    """Return the fraction of samples in ``testloader`` that ``model`` classifies correctly.

    ``model`` is switched to evaluation mode (and left in it). The predicted
    class is the index of the largest output along dimension 1.

    Args:
        model: Module to evaluate.
        testloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        ``correct / total`` as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``testloader`` yields no samples.
    """
    # Put the model that was passed in into eval mode (previously the global
    # ``net`` was switched instead, leaving ``model`` untouched).
    model.eval()

    # Evaluate on the device the model lives on; fall back to the module-level
    # ``device`` only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    total = 0
    correct = 0
    with torch.no_grad():
        for x, y in testloader:
            x, y = x.to(target_device), y.to(target_device)
            predicted = model(x).argmax(dim=1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

    if total == 0:
        raise ValueError("testloader yielded no samples; accuracy is undefined")
    return correct / total


# Final report: parameter count first, then accuracy on the test loader.
param_count = calc_param(net)
test_accuracy = calc_acc(net, testloader)
print(f"""
- 模型参数量：{param_count}
- 模型测试准确率：{test_accuracy}""")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files already downloaded and verified
Files already downloaded and verified
torch.Size([3, 32, 32]) 6
torch.Size([128, 3, 32, 32]) tensor([1, 1, 2, 3, 6, 6, 8, 6, 5, 5, 4, 2, 3, 5, 6, 5, 7, 0, 2, 7, 1, 8, 0, 8,
        2, 5, 0, 9, 3, 7, 1, 4, 0, 0, 8, 2, 4, 9, 0, 1, 9, 8, 3, 9, 3, 6, 8, 4,
        9, 7, 3, 3, 5, 8, 3, 1, 6, 3, 8, 8, 7, 4, 5, 9, 0, 7, 9, 5, 1, 4, 0, 4,
        8, 1, 2, 2, 0, 0, 5, 4, 7, 4, 6, 6, 1, 4, 6, 5, 4, 9, 1, 1, 1, 0, 1, 2,
        7, 9, 7, 0, 1, 1, 9, 3, 8, 6, 0, 1, 8, 0, 1, 0, 0, 9, 3, 2, 1, 2, 5, 2,
        3, 2, 0, 1, 8, 0, 2, 3])
你正在使用：cuda
ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (layer1): Sequential(
    (0): ResidualBlock(
      (left): Sequenti