In [1]:
import os

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.optim as optim
from torch.utils.data import DataLoader

from nets.frcnn import FasterRCNN
from nets.frcnn_training import (FasterRCNNTrainer, get_lr_scheduler,
                                 set_optimizer_lr, weights_init)
from utils.callbacks import LossHistory
from utils.dataloader import FRCNNDataset, frcnn_dataset_collate
from utils.utils import get_classes, show_config
from utils.utils_fit import fit_one_epoch



In [None]:
# ---------------------------------------------------------------------------
# Training configuration: every tunable value used by the cell lives here.
# ---------------------------------------------------------------------------

# Hardware / precision
Cuda            = True      # when True the model is wrapped in DataParallel and moved to GPU
train_gpu       = [0,]      # GPU ids exported through CUDA_VISIBLE_DEVICES
fp16            = False     # when True a torch.cuda.amp GradScaler is created

# Model
classes_path    = 'model_data/voc_classes.txt'
model_path      = 'model_data/voc_weights_resnet.pth'   # '' skips checkpoint loading
input_shape     = [600, 600]
backbone        = "resnet50"
pretrained      = False     # passed to FasterRCNN; when False weights_init(model) is applied
anchors_size    = [8, 16, 32]

# Epoch schedule and batch sizes.
# NOTE(review): Init_Epoch, Freeze_Epoch and Freeze_batch_size are read further
# down (show_config, the batch-size selection and the epoch loop) but were never
# assigned in this notebook, so a fresh kernel failed with NameError.
# Init_Epoch = 0 and Freeze_batch_size = 4 were chosen to be consistent with the
# recorded run; Freeze_Epoch = 50 is an assumption - confirm the intended value.
Init_Epoch          = 0     # first epoch index of the training loop
Freeze_Epoch        = 50    # epoch index at which the backbone is unfrozen (when Freeze_Train)
Freeze_batch_size   = 4     # batch size while the backbone is frozen
UnFreeze_Epoch      = 100   # exclusive upper bound of the epoch loop
Unfreeze_batch_size = 2     # batch size once the backbone is trainable
Freeze_Train        = True  # when True the extractor starts with requires_grad = False

# Optimisation
Init_lr             = 1e-4
Min_lr              = Init_lr * 0.01
optimizer_type      = "adam"    # 'adam' or 'sgd'
momentum            = 0.9       # Adam beta1 / SGD momentum
weight_decay        = 0
lr_decay_type       = 'cos'

# Logging / IO
save_period         = 5
save_dir            = 'logs'
num_workers         = 4
train_annotation_path   = '2007_train.txt'
val_annotation_path     = '2007_val.txt'

# Resolve the class list; get_classes returns the names together with their count.
class_names, num_classes = get_classes(classes_path)

# Restrict CUDA to the configured GPU ids and report how many were requested.
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, train_gpu))
ngpus_per_node = len(train_gpu)
print('Number of devices: {}'.format(ngpus_per_node))

# Build the detector. When the backbone is not created with pretrained weights,
# weights_init is applied to the freshly built network before any checkpoint is loaded.
model = FasterRCNN(num_classes, anchor_scales = anchors_size, backbone = backbone, pretrained = pretrained)
if not pretrained:
    weights_init(model)
if model_path != '':
    print('Load weights {}.'.format(model_path))

    # Map the checkpoint onto whichever device is actually available.
    # NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
    device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_dict      = model.state_dict()
    pretrained_dict = torch.load(model_path, map_location = device)

    # Keep only checkpoint entries whose key exists in the model with an identical
    # shape; everything else is recorded as not loaded.
    load_key, no_load_key, temp_dict = [], [], {}
    for k, v in pretrained_dict.items():
        if k in model_dict and np.shape(model_dict[k]) == np.shape(v):
            temp_dict[k] = v
            load_key.append(k)
        else:
            no_load_key.append(k)
    model_dict.update(temp_dict)
    model.load_state_dict(model_dict)

    # Surface what was (not) loaded; otherwise a checkpoint that matches nothing
    # would be skipped silently. Key lists are truncated to 500 characters.
    print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key))
    print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key))
    print("\n\033[1;33;44m温馨提示，head部分没有载入是正常现象，Backbone部分没有载入是错误的。\033[0m")

# Loss logger for this run; its outputs go under save_dir.
loss_history = LossHistory(save_dir, model, input_shape=input_shape)

# A gradient scaler is only needed for mixed-precision training.
scaler = None
if fp16:
    from torch.cuda.amp import GradScaler
    scaler = GradScaler()

# Put the network in training mode; on GPU, wrap it for data-parallel execution.
model_train = model.train()
if Cuda:
    cudnn.benchmark = True
    model_train = torch.nn.DataParallel(model_train).cuda()

# Read both annotation files as lists of lines and record their lengths.
with open(train_annotation_path, encoding='utf-8') as train_file:
    train_lines = train_file.readlines()
with open(val_annotation_path, encoding='utf-8') as val_file:
    val_lines = val_file.readlines()
num_train, num_val = len(train_lines), len(val_lines)

# Echo the run configuration; keyword order is kept as it determines the listing order.
show_config(
    classes_path        = classes_path,
    model_path          = model_path,
    input_shape         = input_shape,
    Init_Epoch          = Init_Epoch,
    Freeze_Epoch        = Freeze_Epoch,
    UnFreeze_Epoch      = UnFreeze_Epoch,
    Freeze_batch_size   = Freeze_batch_size,
    Unfreeze_batch_size = Unfreeze_batch_size,
    Freeze_Train        = Freeze_Train,
    Init_lr             = Init_lr,
    Min_lr              = Min_lr,
    optimizer_type      = optimizer_type,
    momentum            = momentum,
    lr_decay_type       = lr_decay_type,
    save_period         = save_period,
    save_dir            = save_dir,
    num_workers         = num_workers,
    num_train           = num_train,
    num_val             = num_val,
)
# Sanity-check the total number of optimizer steps against a recommended minimum
# (5e4 for SGD, 1.5e4 otherwise) and tell the user how many epochs would reach it.
wanted_step = 5e4 if optimizer_type == "sgd" else 1.5e4
total_step  = num_train // Unfreeze_batch_size * UnFreeze_Epoch
if total_step <= wanted_step:
    steps_per_epoch = num_train // Unfreeze_batch_size
    if steps_per_epoch == 0:
        # Fewer samples than one batch: fail with a clear message instead of a
        # ZeroDivisionError in the floor-division below.
        raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
    wanted_epoch = wanted_step // steps_per_epoch + 1
    print("[Warning] Total training steps: {} (num_train={}, Unfreeze_batch_size={}, UnFreeze_Epoch={}); "
          "at least {} steps are recommended for the {} optimizer - consider UnFreeze_Epoch >= {}.".format(
              total_step, num_train, Unfreeze_batch_size, UnFreeze_Epoch,
              int(wanted_step), optimizer_type, int(wanted_epoch)))
    
# ---------------------------------------------------------------------------
# Training: an optional frozen-extractor stage (epochs < Freeze_Epoch) followed
# by training with the extractor unfrozen, up to UnFreeze_Epoch.
# ---------------------------------------------------------------------------

# Learning-rate scaling constants; they depend only on the optimizer type,
# so they are computed once instead of once per stage.
nbs             = 16
lr_limit_max    = 1e-4 if optimizer_type == 'adam' else 5e-2
lr_limit_min    = 1e-4 if optimizer_type == 'adam' else 5e-4


def _fit_lr_bounds(batch_size):
    """Return (Init_lr_fit, Min_lr_fit): Init_lr / Min_lr scaled by batch_size / nbs and clamped."""
    init_lr_fit = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
    min_lr_fit  = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
    return init_lr_fit, min_lr_fit


def _steps_per_epoch(batch_size):
    """Return (train_steps, val_steps) for batch_size; raise ValueError when either is zero."""
    steps, steps_val = num_train // batch_size, num_val // batch_size
    if steps == 0 or steps_val == 0:
        raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
    return steps, steps_val


def _build_loaders(batch_size):
    """Return (train_loader, val_loader) over train_dataset / val_dataset with identical settings."""
    loader_kwargs = dict(shuffle = True, batch_size = batch_size, num_workers = num_workers,
                         pin_memory = True, drop_last = True, collate_fn = frcnn_dataset_collate)
    return DataLoader(train_dataset, **loader_kwargs), DataLoader(val_dataset, **loader_kwargs)


# Stage setup: optionally freeze the extractor, then freeze batch-norm layers.
UnFreeze_flag = False
if Freeze_Train:
    for param in model.extractor.parameters():
        param.requires_grad = False
model.freeze_bn()

batch_size = Freeze_batch_size if Freeze_Train else Unfreeze_batch_size

Init_lr_fit, Min_lr_fit = _fit_lr_bounds(batch_size)

# Build only the selected optimizer (an unknown optimizer_type still raises KeyError).
optimizer = {
    'adam'  : lambda: optim.Adam(model.parameters(), Init_lr_fit, betas = (momentum, 0.999), weight_decay = weight_decay),
    'sgd'   : lambda: optim.SGD(model.parameters(), Init_lr_fit, momentum = momentum, nesterov=True, weight_decay = weight_decay),
}[optimizer_type]()

lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)

epoch_step, epoch_step_val = _steps_per_epoch(batch_size)

train_dataset   = FRCNNDataset(train_lines, input_shape, train = True)
val_dataset     = FRCNNDataset(val_lines, input_shape, train = False)

gen, gen_val    = _build_loaders(batch_size)

train_util      = FasterRCNNTrainer(model_train, optimizer)

try:
    for epoch in range(Init_Epoch, UnFreeze_Epoch):
        # One-time switch to the unfrozen stage: new batch size, rescaled learning
        # rates, a fresh schedule, trainable extractor and rebuilt data loaders.
        if epoch >= Freeze_Epoch and not UnFreeze_flag and Freeze_Train:
            batch_size = Unfreeze_batch_size

            Init_lr_fit, Min_lr_fit = _fit_lr_bounds(batch_size)
            lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)

            for param in model.extractor.parameters():
                param.requires_grad = True

            model.freeze_bn()

            epoch_step, epoch_step_val = _steps_per_epoch(batch_size)
            gen, gen_val = _build_loaders(batch_size)

            UnFreeze_flag = True

        set_optimizer_lr(optimizer, lr_scheduler_func, epoch)

        fit_one_epoch(model, train_util, loss_history, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler, save_period, save_dir)
finally:
    # Close the log writer even when training fails or is interrupted part-way.
    loss_history.writer.close()

Number of devices: 1
initialize network with normal type
Load weights model_data/voc_weights_resnet.pth.


Epoch 1/100:   0%|          | 0/4137 [00:00<?, ?it/s<class 'dict'>]


Successful Load Key: ['extractor.0.weight', 'extractor.1.weight', 'extractor.1.bias', 'extractor.1.running_mean', 'extractor.1.running_var', 'extractor.1.num_batches_tracked', 'extractor.4.0.conv1.weight', 'extractor.4.0.bn1.weight', 'extractor.4.0.bn1.bias', 'extractor.4.0.bn1.running_mean', 'extractor.4.0.bn1.running_var', 'extractor.4.0.bn1.num_batches_tracked', 'extractor.4.0.conv2.weight', 'extractor.4.0.bn2.weight', 'extractor.4.0.bn2.bias', 'extractor.4.0.bn2.running_mean', 'extractor.4.0.bn2.running_var', 'e ……
Successful Load Key Num: 328

Fail To Load Key: [] ……
Fail To Load Key num: 0

[1;33;44m温馨提示，head部分没有载入是正常现象，Backbone部分没有载入是错误的。[0m
Configurations:
----------------------------------------------------------------------
|                     keys |                                   values|
----------------------------------------------------------------------
|             classes_path |               model_data/voc_classes.txt|
|               model_path |        mode

Epoch 1/100: 100%|██████████| 4137/4137 [1:24:00<00:00,  1.22s/it, lr=1e-5, roi_cls=0.177, roi_loc=0.561, rpn_cls=0.0375, rpn_loc=0.0617, total_loss=0.838]
Epoch 1/100:   0%|          | 0/1238 [00:00<?, ?it/s<class 'dict'>]

Finish Train
Start Validation


Epoch 1/100: 100%|██████████| 1238/1238 [12:34<00:00,  1.64it/s, val_loss=0.986]


Finish Validation
Epoch:1/100
Total Loss: 0.838 || Val Loss: 0.986 
Save best model to best_epoch_weights.pth


Epoch 2/100:   0%|          | 0/4137 [00:00<?, ?it/s<class 'dict'>]

Start Train


Epoch 2/100: 100%|██████████| 4137/4137 [1:24:06<00:00,  1.22s/it, lr=2e-5, roi_cls=0.18, roi_loc=0.564, rpn_cls=0.0376, rpn_loc=0.0594, total_loss=0.84]  
Epoch 2/100:   0%|          | 0/1238 [00:00<?, ?it/s<class 'dict'>]

Finish Train
Start Validation


Epoch 2/100: 100%|██████████| 1238/1238 [13:10<00:00,  1.57it/s, val_loss=0.991]


Finish Validation
Epoch:2/100
Total Loss: 0.840 || Val Loss: 0.991 


Epoch 3/100:   0%|          | 0/4137 [00:00<?, ?it/s<class 'dict'>]

Start Train


Epoch 3/100:  33%|███▎      | 1375/4137 [28:04<55:46,  1.21s/it, lr=5e-5, roi_cls=0.192, roi_loc=0.61, rpn_cls=0.0375, rpn_loc=0.0646, total_loss=0.904]   