# Fine-Tuning fasterrcnn_resnet50_fpn on the Kitti Dataset

The idea of fine-tuning is to train only the last layers (the ones used for class prediction and bounding-box drawing).
Alternatively, we can train all of the layers.

In [14]:
# #Split images in `images` directory and XML files in `xmls` directory into `train_images`, `valid_images`, and `train_xmls`, `valid_xmls` directories respectively

# import os
# import random
# import shutil

# # Validation split ratio.
# VALID_SPLIT = 0.17

# IMAGES_FOLDER = os.path.join('KittiVOC', 'images')
# XML_FOLDER = os.path.join('KittiVOC', 'annotations')

# TRAIN_IMAGES_DEST = os.path.join('data', 'train_images')
# TRAIN_XML_DEST = os.path.join('data', 'train_xmls')
# VALID_IMAGES_DEST = os.path.join('data', 'valid_images')
# VALID_XMLS_DEST = os.path.join('data', 'valid_xmls')

# os.makedirs(TRAIN_IMAGES_DEST, exist_ok=True)
# os.makedirs(TRAIN_XML_DEST, exist_ok=True)
# os.makedirs(VALID_IMAGES_DEST, exist_ok=True)
# os.makedirs(VALID_XMLS_DEST, exist_ok=True)

# all_src_images = sorted(os.listdir(IMAGES_FOLDER))
# all_src_xmls = sorted(os.listdir(XML_FOLDER))


# # Randomize the image list and the XML list in the same order.
# temp = list(zip(all_src_images, all_src_xmls))
# random.shuffle(temp)
# res1, res2 = zip(*temp)
# temp_images, temp_xmls = list(res1), list(res2)

# print(temp_images[:3])
# print(temp_xmls[:3])

# num_training_images = int(len(temp_images)*(1-VALID_SPLIT))
# num_valid_images = int(len(temp_images)-num_training_images)

# print(num_training_images, num_valid_images)

# train_images = temp_images[:num_training_images]
# train_xmls = temp_xmls[:num_training_images]

# valid_images = temp_images[num_training_images:len(all_src_images)]
# valid_xmls = temp_xmls[num_training_images:len(all_src_images)]

# print(train_images[:3])
# print(valid_images[:3])

# for i in range(len(train_images)):
#     shutil.copy(
#         os.path.join(IMAGES_FOLDER, train_images[i]),
#         os.path.join(TRAIN_IMAGES_DEST, train_images[i])
#     )
#     shutil.copy(
#         os.path.join(XML_FOLDER, train_xmls[i]),
#         os.path.join(TRAIN_XML_DEST, train_xmls[i])
#     )

# for i in range(len(valid_images)):
#     shutil.copy(
#         os.path.join(IMAGES_FOLDER, valid_images[i]),
#         os.path.join(VALID_IMAGES_DEST, valid_images[i])
#     )
#     shutil.copy(
#         os.path.join(XML_FOLDER, valid_xmls[i]),
#         os.path.join(VALID_XMLS_DEST, valid_xmls[i])
#     )

In [15]:
# Because the images have different numbers of boxes, we need to make them equal.
#output = model(images, annotations)

In [16]:
from torch_utils.engine import train_one_epoch, evaluate

from datasets import (
    create_train_dataset, create_valid_dataset, 
    create_train_loader, create_valid_loader
)
#from models.create_fasterrcnn_model import create_model
from utils.general import (
    set_training_dir, Averager, 
    save_model, save_loss_plot,
    show_tranformed_image,
    save_mAP, save_model_state, SaveBestModel
)
from utils.logging import (
    set_log, 
    coco_log
)

import argparse
import os
import sys

import numpy as np
import torch
import yaml

# Switch torch.multiprocessing to the 'file_system' tensor-sharing strategy.
# NOTE(review): presumably chosen to avoid running out of file descriptors when
# the data-loader worker processes share tensors -- confirm.
torch.multiprocessing.set_sharing_strategy('file_system')

import torchvision

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# For same annotation colors each time: seed NumPy's global RNG, which the
# COLORS array is later drawn from with np.random.uniform.
# NOTE(review): only NumPy is seeded here; torch is not seeded in this cell.
np.random.seed(42)

# Train

In [17]:
# Open the dataset config file.
# The location can be overridden with the KITTI_DATA_CONFIG environment variable
# so the notebook also runs on machines where the original absolute path does not
# exist; when the variable is unset the original path is used unchanged.
DATA_CONFIG_PATH = os.environ.get(
    'KITTI_DATA_CONFIG',
    '/home/aya/Desktop/Kitti_FasterRCNN/data_configs/data.yaml'
)
with open(DATA_CONFIG_PATH) as file:
    # safe_load only builds plain Python objects from the YAML document.
    data_configs = yaml.safe_load(file)

# Image / label folders for both splits, as listed in the config.
TRAIN_DIR_IMAGES = data_configs['TRAIN_DIR_IMAGES']
TRAIN_DIR_LABELS = data_configs['TRAIN_DIR_LABELS']
VALID_DIR_IMAGES = data_configs['VALID_DIR_IMAGES']
VALID_DIR_LABELS = data_configs['VALID_DIR_LABELS']
# Class names and class count used to size the detection head.
# NOTE(review): NC is read independently of CLASSES; presumably it must agree
# with len(CLASSES) -- confirm against the YAML file.
CLASSES = data_configs['CLASSES']
NUM_CLASSES = data_configs['NC']
# Flag forwarded to `evaluate` as `save_valid_preds`.
SAVE_VALID_PREDICTIONS = data_configs['SAVE_VALID_PREDICTION_IMAGES']

In [18]:
# Training hyper-parameters and runtime settings.
NUM_WORKERS = 4   # number of workers handed to the data loaders
NUM_EPOCHS = 100
BATCH_SIZE = 8    # batch size used when loading the data

# Train on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')

# Output directory for this run, as returned by `set_training_dir`.
OUT_DIR = set_training_dir("resultat")
# One random 3-component colour per entry in CLASSES.
COLORS = np.random.uniform(0, 1, size=(len(CLASSES), 3))

# Checkpoint handling.
WEIGHTS_PATH = None       # checkpoint file to start from, or None
RESUME_TRAINING = False   # also restore epoch / loss / mAP history and optimizer
COSINE_ANNEALING = True   # use cosine annealing warm restarts

# Set logging file.
set_log(OUT_DIR)
# writer = set_summary_writer(OUT_DIR)

In [19]:
# Width and height handed to the dataset builders (square input).
# NOTE(review): the printed model contains its own
# Resize(min_size=(800,), max_size=1333) transform, so images are presumably
# rescaled again inside the network -- confirm; this may matter for GPU memory.
IMAGE_WIDTH = IMAGE_HEIGHT = 350

# Load the split data

In [20]:
# Arguments shared by both dataset builders: target size and class names.
resize_and_classes = (IMAGE_WIDTH, IMAGE_HEIGHT, CLASSES)

# Training split: augmentation enabled, mosaic disabled.
train_dataset = create_train_dataset(
    TRAIN_DIR_IMAGES,
    TRAIN_DIR_LABELS,
    *resize_and_classes,
    use_train_aug=True,
    mosaic=False,
)

# Validation split: built without the training-only options.
valid_dataset = create_valid_dataset(
    VALID_DIR_IMAGES,
    VALID_DIR_LABELS,
    *resize_and_classes,
)

# Wrap both datasets in loaders using the configured batch size and workers.
train_loader = create_train_loader(train_dataset, BATCH_SIZE, NUM_WORKERS)
valid_loader = create_valid_loader(valid_dataset, BATCH_SIZE, NUM_WORKERS)

# Report how many samples ended up in each split.
print(
    f"Number of training samples: {len(train_dataset)}",
    f"Number of validation samples: {len(valid_dataset)}\n",
    sep="\n",
)

data/train_xmls
data/valid_xmls
Number of training samples: 6209
Number of validation samples: 1272



In [21]:
# Averager that tracks the training loss; the training loop resets it at the
# start of every epoch and reads its `.value` at the end of the epoch.
train_loss_hist = Averager()

# Per-iteration histories, extended after every epoch and used for the plots
# that cover all iterations until the end of training.
train_loss_list, loss_cls_list, loss_box_reg_list = [], [], []
loss_objectness_list, loss_rpn_list = [], []

# Per-epoch histories: the Averager's value and the validation mAP figures.
train_loss_list_epoch, val_map_05, val_map = [], [], []

# First epoch index of the training loop (replaced when resuming a checkpoint).
start_epochs = 0

In [22]:
# Build the detector.
# In both branches the network starts as torchvision's Faster R-CNN
# (ResNet-50 + FPN) with its DEFAULT pretrained weights, and the box predictor
# is replaced by a fresh FastRCNNPredictor.
pretrained_weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT

if RESUME_TRAINING and WEIGHTS_PATH is None:
    # The resume logic reads `checkpoint`, which only exists once a checkpoint
    # has been loaded; fail with a clear message instead of a NameError.
    raise ValueError(
        'RESUME_TRAINING is True but WEIGHTS_PATH is None: '
        'a checkpoint path is required to resume training.'
    )

if WEIGHTS_PATH is None:
    print('Building model from scratch...')
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=pretrained_weights)
    # Get the number of input features of the existing classification head.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # Define a new head for the detector with the required number of classes.
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)
else:
    # Load pretrained weights from the checkpoint at WEIGHTS_PATH.
    print('Loading pretrained weights...')

    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    checkpoint = torch.load(WEIGHTS_PATH, map_location=DEVICE)
    ckpt_state_dict = checkpoint['model_state_dict']
    # Number of classes the checkpoint was trained with, read from the first
    # dimension of its classification weight.
    old_classes = ckpt_state_dict['roi_heads.box_predictor.cls_score.weight'].shape[0]

    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=pretrained_weights)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # The head must have the checkpoint's class count, otherwise
    # load_state_dict fails on the cls_score / bbox_pred tensor shapes.
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, old_classes)
    # Load weights.
    model.load_state_dict(ckpt_state_dict)

    if old_classes != NUM_CLASSES:
        # Change output features for class predictor and box predictor
        # according to current dataset classes. This is done only when the class
        # count differs, so a checkpoint trained on the same classes keeps its
        # trained head (needed for resuming to be meaningful).
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor.cls_score = torch.nn.Linear(
            in_features=in_features, out_features=NUM_CLASSES, bias=True
        )
        model.roi_heads.box_predictor.bbox_pred = torch.nn.Linear(
            in_features=in_features, out_features=NUM_CLASSES*4, bias=True
        )

    if RESUME_TRAINING:
        print('RESUMING TRAINING...')
        # Update the starting epochs, the batch-wise loss list,
        # and the epoch-wise loss list. `.get` is used so a checkpoint that
        # lacks one of these optional entries is skipped instead of raising.
        if checkpoint.get('epoch'):
            start_epochs = checkpoint['epoch']
            print(f"Resuming from epoch {start_epochs}...")
        if checkpoint.get('train_loss_list'):
            print('Loading previous batch wise loss list...')
            train_loss_list = checkpoint['train_loss_list']
        if checkpoint.get('train_loss_list_epoch'):
            print('Loading previous epoch wise loss list...')
            train_loss_list_epoch = checkpoint['train_loss_list_epoch']
        if checkpoint.get('val_map'):
            print('Loading previous mAP list')
            val_map = checkpoint['val_map']
        if checkpoint.get('val_map_05'):
            val_map_05 = checkpoint['val_map_05']

Building model from scratch...


In [23]:
# Show the architecture, then move the network to the training device.
print(model)
model = model.to(DEVICE)

# Parameters with requires_grad set; this list is what the optimizer receives.
params = [weight for weight in model.parameters() if weight.requires_grad]

# Total parameters and trainable parameters.
total_params = sum(weight.numel() for weight in model.parameters())
total_trainable_params = sum(weight.numel() for weight in params)
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [24]:
# Optimizer over the trainable parameters collected in `params`.
# An SGD alternative is kept for reference:
#optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, nesterov=True)
optimizer = torch.optim.AdamW(
    params,
    lr=1e-4,
    weight_decay=5e-4,
)

In [25]:
if RESUME_TRAINING:
    # Load the optimizer state dictionary from the checkpoint so training
    # continues with the saved optimizer state.
    print('Loading optimizer state dictionary...')
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # `capturable=True` is a workaround for restored optimizer state living on
    # the GPU. It is only applied on CUDA because the optimizer step rejects
    # capturable=True for non-CUDA parameters, and it is set on every param
    # group rather than just the first.
    if DEVICE.type == 'cuda':
        for param_group in optimizer.param_groups:
            param_group['capturable'] = True


if COSINE_ANNEALING:
    # The learning rate decays towards zero over `steps` scheduler steps and
    # then restarts. `steps` is larger than NUM_EPOCHS.
    # NOTE(review): whether a restart happens during training depends on how
    # often `train_one_epoch` steps the scheduler -- confirm.
    steps = NUM_EPOCHS + 10
    # `verbose` is no longer passed: it is deprecated in current PyTorch and
    # the previously used value (False) was the default anyway.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=steps,
        T_mult=1
    )
else:
    scheduler = None

# Callable used by the training loop to keep the best checkpoint so far.
save_best_model = SaveBestModel()

In [26]:
# Name passed to the checkpoint-saving helpers.
model_name = "fasterrcnn_resnet50_fpn"

# Per-iteration component losses to plot after each epoch:
# (history list, y-axis label, file name). The lists are extended in place
# inside the loop, so these references always see the latest values.
component_loss_plots = (
    (loss_cls_list, 'loss cls', 'loss_cls'),
    (loss_box_reg_list, 'loss bbox reg', 'loss_bbox_reg'),
    (loss_objectness_list, 'loss obj', 'loss_obj'),
    (loss_rpn_list, 'loss rpn bbox', 'loss_rpn_bbox'),
)

# NOTE(review): the saved output of this cell is a CUDA out-of-memory error;
# lowering BATCH_SIZE is one thing to try -- confirm on the target GPU.
for epoch in range(start_epochs, NUM_EPOCHS):
    # Start the running loss average afresh for this epoch.
    train_loss_hist.reset()

    # One pass over the training data; the first returned item is unused here.
    (
        _,
        batch_loss_list,
        batch_loss_cls_list,
        batch_loss_box_reg_list,
        batch_loss_objectness_list,
        batch_loss_rpn_list,
    ) = train_one_epoch(
        model,
        optimizer,
        train_loader,
        DEVICE,
        epoch,
        train_loss_hist,
        print_freq=100,
        scheduler=scheduler,
    )

    # Evaluate on the validation loader.
    coco_evaluator, stats, val_pred_image = evaluate(
        model,
        valid_loader,
        device=DEVICE,
        save_valid_preds=SAVE_VALID_PREDICTIONS,
        out_dir=OUT_DIR,
        classes=CLASSES,
        colors=COLORS,
    )

    # Append this epoch's batch-wise losses to the running histories.
    for history, batch_values in (
        (train_loss_list, batch_loss_list),
        (loss_cls_list, batch_loss_cls_list),
        (loss_box_reg_list, batch_loss_box_reg_list),
        (loss_objectness_list, batch_loss_objectness_list),
        (loss_rpn_list, batch_loss_rpn_list),
    ):
        history.extend(batch_values)

    # Epoch-level figures: the Averager's value and the validation metrics.
    # stats[1] is stored as the mAP@0.5 history and stats[0] as the
    # mAP@0.5:0.95 history (going by the variable names).
    train_loss_list_epoch.append(train_loss_hist.value)
    val_map_05.append(stats[1])
    val_map.append(stats[0])

    # Loss plot for the batch-wise list (helper defaults for labels and name).
    save_loss_plot(OUT_DIR, train_loss_list)
    # Loss plot for the epoch-wise list.
    save_loss_plot(
        OUT_DIR,
        train_loss_list_epoch,
        'epochs',
        'train loss',
        save_name='train_loss_epoch'
    )
    # One plot per component loss, all against iterations.
    for loss_values, y_label, file_stem in component_loss_plots:
        save_loss_plot(
            OUT_DIR,
            loss_values,
            'iterations',
            y_label,
            save_name=file_stem
        )

    # Save mAP plots and log the evaluation stats.
    save_mAP(OUT_DIR, val_map_05, val_map)
    coco_log(OUT_DIR, stats)

    # Full checkpoint for this epoch (model, optimizer and histories);
    # this is the one that can be used to resume training.
    save_model(
        epoch,
        model,
        optimizer,
        train_loss_list,
        train_loss_list_epoch,
        val_map,
        val_map_05,
        OUT_DIR,
        data_configs,
        model_name
    )
    # Model state only, for the current epoch.
    save_model_state(model, OUT_DIR, data_configs, model_name)
    # Keep the best model: saved when the latest val_map entry beats the
    # previous highest.
    save_best_model(
        model,
        val_map[-1],
        epoch,
        OUT_DIR,
        data_configs,
        model_name
    )

RuntimeError: CUDA out of memory. Tried to allocate 40.00 MiB (GPU 0; 10.91 GiB total capacity; 9.80 GiB already allocated; 23.19 MiB free; 9.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF