In [1]:
# 加载数据集
from utils.loadData import *
from utils.tools import *
from utils.trainWandb import *

# 导入写好的 resnet 文件
import sys
sys.path.append("..")
from resnet import *
import os

# 数据处理用到的包
from tqdm import tqdm
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# 导入训练所需要使用的包
import torch
import torch.optim as optim
from torch.optim import lr_scheduler

# Train on the first CUDA device when one is visible to PyTorch, otherwise on the CPU.
_device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

### 关于 net 定义请选择其一运行

### 选择1:使用正常的resnet50训练

In [None]:
# Option 1: the unmodified ResNet-50, built by `get_resnet50`
# (presumably provided by the wildcard import from the local `resnet` module — confirm).
# Run either this cell or the dropout variant below, not both.
net = get_resnet50()

### 选择2:使用带有dropout的resnet50训练
`AdaptiveAvgPool2d` -> `Dropout` -> `Linear`

In [2]:
# Option 2: ResNet-50 with an extra dropout layer.
net = get_resnet50()

# Register a Dropout(p=0.5) module under the name 'dropout' on stage net[5].
# NOTE(review): per the original comment, net[5] is the stage holding
# AdaptiveAvgPool2d, so dropout runs after pooling — confirm against resnet.py.
dropout_layer = nn.Dropout(0.5)
net[5].add_module('dropout', dropout_layer)

In [3]:
# Move the model onto the selected device (GPU when available).
net = net.to(device)

# Optimizer: SGD with momentum over all model parameters.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

# Loss: cross-entropy.
criterion = nn.CrossEntropyLoss()

# Learning-rate schedule: multiply the LR by 0.5 every 5 scheduler steps
# (the training loop steps the scheduler once per epoch).
# The class is resolved through `optim.lr_scheduler` instead of the bare
# `lr_scheduler` name: this assignment rebinds `lr_scheduler` from the imported
# module to the StepLR instance, so the bare form raised AttributeError as soon
# as this cell was executed a second time. The variable name is kept because
# the training loop calls `lr_scheduler.step()`.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

In [4]:
# Number of training epochs.
EPOCHS = 50

# Mini-batch size.
batch_size = 128

# Build the CIFAR-10 train/test iterators.
# NOTE(review): `resize=224` presumably rescales images to 224x224 for ResNet-50 —
# confirm in utils.loadData.
train_iter, test_iter = load_data_cifar10(batch_size, resize=224)

Files already downloaded and verified
Files already downloaded and verified


## 训练日志的初始化

In [5]:
# Counters shared with the training loop below.
epoch = 0
batch_idx = 0
best_test_accuracy = 0

# Training log: seed it with one record tagged epoch 0 / batch 0, computed on
# the first batch of the training iterator.
# NOTE(review): `train_one_batch` receives the optimizer, so this call presumably
# already performs one optimisation step before the loop starts — confirm.
log_train = {'epoch': 0, 'batch': 0}
images, labels = next(iter(train_iter))
log_train.update(train_one_batch(images, labels, device, net, optimizer, criterion, epoch, batch_size))
# Build the one-row frame directly: DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
df_train_log = pd.DataFrame([log_train])

# Test log: seed it with the metrics measured before the epoch loop runs.
log_test = {'epoch': 0}
log_test.update(evaluate_testset(test_iter, device, net, criterion, epoch))
df_test_log = pd.DataFrame([log_test])

## 创建wandb可视化项目

In [6]:
# `time` is imported explicitly: the notebook otherwise only sees it through
# one of the wildcard imports in the first cell, which is a hidden dependency.
import time

import wandb

# Start a wandb run in the 'resnet50-train' project, named with the current
# local timestamp (month, day, hour, minute, second).
wandb.init(project='resnet50-train', name=time.strftime('%m%d%H%M%S'))

[34m[1mwandb[0m: Currently logged in as: [33mzhangenshuo[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from tqdm.autonotebook import tqdm

# Make sure the checkpoint directory exists; torch.save does not create it.
os.makedirs('checkpoints', exist_ok=True)

for epoch in range(1, EPOCHS+1):

    print(f'Epoch {epoch}/{EPOCHS}')

    ## Training phase
    net.train()
    epoch_train_records = []
    for images, labels in tqdm(train_iter):  # one batch of images and labels
        batch_idx += 1
        # NOTE(review): batch_idx is incremented but never written into the log
        # record, unlike the seed record's 'batch' key — confirm whether
        # train_one_batch is expected to report it.
        log_train = train_one_batch(images, labels, device, net, optimizer, criterion, epoch, batch_size)
        # Keep a shallow copy so later changes to the dict cannot alter the stored record.
        epoch_train_records.append(dict(log_train))
        # Send the batch metrics to wandb.
        wandb.log(log_train)

    # Add this epoch's records with a single concat. DataFrame.append was
    # removed in pandas 2.0, and appending per batch re-copied the whole frame
    # on every step.
    if epoch_train_records:
        df_train_log = pd.concat(
            [df_train_log, pd.DataFrame(epoch_train_records)],
            ignore_index=True,
        )

    # Advance the learning-rate schedule once per epoch.
    lr_scheduler.step()

    ## Evaluation phase
    net.eval()
    log_test = evaluate_testset(test_iter, device, net, criterion, epoch)
    df_test_log = pd.concat([df_test_log, pd.DataFrame([log_test])], ignore_index=True)
    # Send the test metrics to wandb.
    wandb.log(log_test)

    # Keep only the checkpoint with the best test accuracy so far.
    if log_test['test_accuracy'] > best_test_accuracy:
        # Remove the previous best checkpoint, if there is one.
        old_best_checkpoint_path = 'checkpoints/best-{:.3f}.pth'.format(best_test_accuracy)
        if os.path.exists(old_best_checkpoint_path):
            os.remove(old_best_checkpoint_path)
        # Save the new best model.
        # NOTE(review): torch.save(net, ...) pickles the whole module object, so
        # loading requires the same class definitions to be importable; kept
        # as-is to stay compatible with existing loaders.
        new_best_checkpoint_path = 'checkpoints/best-{:.3f}.pth'.format(log_test['test_accuracy'])
        torch.save(net, new_best_checkpoint_path)
        # Report the file that was just written; the original formatted the
        # previous best accuracy here, so it printed the path just deleted.
        print('保存新的最佳模型', new_best_checkpoint_path)
        best_test_accuracy = log_test['test_accuracy']

# Write both logs to CSV once training has finished.
df_train_log.to_csv('训练日志-训练集.csv', index=False)
df_test_log.to_csv('训练日志-测试集.csv', index=False)