In [1]:
import torch
import torch.nn as nn
import torchvision

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import os

from torch.utils.data import Dataset, DataLoader
# from torch.utils.data.sampler import Sampler
import torch.optim as optim
import sys
sys.path.append('../')
sys.path.append('../../')

from dataset import CocoDetection, train_transforms, val_transforms, test_transforms
from visualize import visualize
# from rcnn_model import fasterrcnn_resnet201_fpn, FastRCNNPredictor
from engine import evaluate
import utils
from models.swin import *

In [2]:
from models.detection.backbone_utils import swin_fpn_backbone, _validate_trainable_layers
from ops.feature_pyramid_network import LastLevelP6P7, LastLevelMaxPool
from models.detection.retinanet import RetinaNet
from torch.hub import load_state_dict_from_url
from models.detection.anchor_utils import AnchorGenerator
# from models.detection.backbone_utils import mobilenet_backbone

In [3]:
def retinanet_swin_t_fpn(pretrained=False, progress=True,
                           num_classes=91, pretrained_backbone=False, trainable_backbone_layers=None, **kwargs):
    """Build a RetinaNet detector on top of a Swin-T + FPN backbone.

    Args:
        pretrained: Must be falsy. No full-detector checkpoint is available
            for this architecture (see Raises).
        progress: Unused; kept so the signature stays compatible with
            existing callers.
        num_classes: Number of classes, passed straight to ``RetinaNet``.
        pretrained_backbone: Forwarded to ``swin_fpn_backbone``.
        trainable_backbone_layers: Checked by ``_validate_trainable_layers``
            (called with the extra arguments 5 and 3 -- presumably the
            maximum and default layer counts, confirm in backbone_utils)
            and then forwarded to the backbone as ``trainable_layers``.
        **kwargs: Forwarded unchanged to ``RetinaNet``.

    Returns:
        The constructed ``RetinaNet`` model.

    Raises:
        NotImplementedError: If ``pretrained`` is truthy. The previous
            implementation referenced ``model_urls`` and ``overwrite_eps``,
            which are never explicitly imported here, and pointed at the
            'retinanet_resnet50_fpn_coco' checkpoint, which is named for a
            ResNet-50 backbone and is not expected to match this Swin-T
            model's parameters. Failing early gives a clear message before
            any expensive model construction happens.
    """
    if pretrained:
        raise NotImplementedError(
            "retinanet_swin_t_fpn has no pretrained detector checkpoint; "
            "call it with pretrained=False (optionally pretrained_backbone=True)."
        )

    # `pretrained` is falsy from here on, so only the backbone flag matters.
    trainable_backbone_layers = _validate_trainable_layers(
        pretrained_backbone, trainable_backbone_layers, 5, 3)

    # One tuple of anchor sizes / aspect ratios per feature map: the three
    # returned backbone layers plus (presumably) the P6 and P7 levels added
    # by LastLevelP6P7.
    anchor_sizes = ((32, 64, 128, 256, 512), ) * 5
    aspect_ratios = ((0.5, 0.75, 1.0, 1.5, 2.0),) * len(anchor_sizes)
    rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

    # skip P2 because it generates too many anchors (according to their paper)
    backbone = swin_fpn_backbone('swin_t', pretrained_backbone, returned_layers=[2, 3, 4],
                                 extra_blocks=LastLevelP6P7(256, 256),
                                 trainable_layers=trainable_backbone_layers)

    return RetinaNet(backbone, num_classes, anchor_generator=rpn_anchor_generator, **kwargs)

In [4]:
import easydict

# Run configuration, wrapped in an EasyDict so entries can be read as
# attributes (args.lr, args.epochs, ...).
args = easydict.EasyDict({
    'batch_size': 4,        # batch size used by both data loaders
    'epochs': 90,           # upper bound of the training loop
    'data': 0,              # not read by any cell visible in this notebook
    'lr': 0.002,            # SGD learning rate
    'momentum': 0.9,        # SGD momentum
    'weight_decay': 1e-4,   # SGD weight decay
    'start_epoch': 0,
    'gpu': 1,               # index of the CUDA device to select
    'workers': 12,          # DataLoader worker processes
    'print_freq': 1000,     # forwarded to train_one_epoch
    'output_dir': '../trained_model/retinanet_swin_v2_t_fpn/',  # checkpoint directory
})

In [5]:
from pathlib import Path

# Make sure the checkpoint directory exists before training starts.
# Everything from the first 'checkpoint' substring onwards (if present)
# is cut off the configured path before the directory is created.
path = Path(args.output_dir.partition('checkpoint')[0])
path.mkdir(exist_ok=True, parents=True)

In [6]:
# Report how many CUDA devices are visible to this process.
ngpus_per_node = torch.cuda.device_count()
print(ngpus_per_node)

GPU_NUM = args.gpu  # index of the GPU to use

# Select the requested GPU when CUDA is available, otherwise fall back to CPU.
# torch.cuda.set_device() only accepts CUDA devices, so it must not be called
# on the CPU fallback path (doing so raised an error and made the fallback
# unusable).
if torch.cuda.is_available():
    device = torch.device(f'cuda:{GPU_NUM}')
    torch.cuda.set_device(device)
else:
    device = torch.device('cpu')
print(device)

3
cuda:1


In [7]:
NUM_CLASS = 91
IMG_SIZE = 448*2  # used for both min_size and max_size of the detector

# Build the detector without pretrained detector weights.
model = retinanet_swin_t_fpn(pretrained=False, min_size=IMG_SIZE, max_size=IMG_SIZE, num_classes=NUM_CLASS)

# Reuse the `device` that was already selected from args.gpu instead of
# rebinding it to a bare 'cuda' device: the rebinding dropped the explicit
# GPU index and ignored the CPU choice made when CUDA is unavailable.
model.to(device)
print('model is loaded to gpu')

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


return_layers {'layer2': '0', 'layer3': '1', 'layer4': '2'}
model is loaded to gpu


In [8]:
from dataset import (
    CocoDetection,
    train_transforms,
    val_transforms,
    test_transforms,
)

# Training split: train.json annotations with the training transforms.
train_dataset = CocoDetection(
    root='/home/beomgon/Dataset/scl/',
    annFile='../../data/train.json',
    transforms=train_transforms,
)

# Test split: test.json annotations with the validation transforms.
test_dataset = CocoDetection(
    root='/home/beomgon/Dataset/scl/',
    annFile='../../data/test.json',
    transforms=val_transforms,
)

loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [9]:
# Pull the first (image, target) pair the training dataset yields and show
# the target as the cell output -- a quick sanity check of the annotation
# fields before training.
image, target = next(iter(train_dataset))
target
                    

{'boxes': tensor([[152.6875, 353.0625, 228.8125, 423.5000]]),
 'category_id': tensor([1]),
 'labels': tensor([1]),
 'image_id': tensor([1]),
 'area': tensor([5362.0547]),
 'iscrowd': tensor([0])}

In [10]:
# Training batches: samples are drawn in random order.
train_sampler = torch.utils.data.RandomSampler(train_dataset)
train_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    sampler=train_sampler,
    num_workers=args.workers,
    collate_fn=utils.collate_fn,
)

# Test batches: samples are visited in dataset order.
test_sampler = torch.utils.data.SequentialSampler(test_dataset)
test_loader = DataLoader(
    test_dataset,
    batch_size=args.batch_size,
    sampler=test_sampler,
    num_workers=args.workers,
    collate_fn=utils.collate_fn,
)

In [11]:
# Hand only the parameters that still require gradients to the optimizer.
params = list(filter(lambda p: p.requires_grad, model.parameters()))

# Alternative optimizer kept for reference:
# optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.weight_decay)
optimizer = torch.optim.SGD(
    params,
    lr=args.lr,
    momentum=args.momentum,
    weight_decay=args.weight_decay,
)

# Multiply the learning rate by 0.5 at each listed epoch milestone.
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[15, 30, 45, 60, 75],
    gamma=0.5,
)

In [12]:
# g_attn = torch.randn([2, 1024, 96])
# print(g_attn.shape)
# attn_windows = torch.randn([2048, 7, 7, 96])
# print(attn_windows.shape)
# # attn_windows = attn_windows.view(B, self.H_, self.W_, self.window_size, self.window_size, C) + g_attn[:,:,None,None,:]
# b = attn_windows.view(2, -1, 7, 7, 96) + g_attn[:,:,None,None,:]
# b.shape

In [12]:
from engine import train_one_epoch

start_time = time.time()
final_epoch = args.epochs - 1

# Start from args.start_epoch so the configured value is honoured (it is 0 in
# the config above, which reproduces the previous range(args.epochs) loop).
# NOTE(review): resuming from a later epoch presumably also requires restoring
# the model / optimizer / lr_scheduler state from a checkpoint first.
for epoch in range(args.start_epoch, args.epochs):
    train_one_epoch(model, optimizer, train_loader, device, epoch, args.print_freq)
    lr_scheduler.step()

    is_final_epoch = epoch == final_epoch

    # Checkpoint every 5th epoch after epoch 60, and always after the final
    # epoch: with epochs=90 the last epoch is 89, which the modulo rule alone
    # would never save, so the fully trained weights would be lost.
    if args.output_dir and ((epoch > 60 and epoch % 5 == 0) or is_final_epoch):
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'args': args,
            'epoch': epoch
        }
        # Keep a per-epoch snapshot and also refresh the rolling checkpoint.
        utils.save_on_master(
            checkpoint,
            os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
        utils.save_on_master(
            checkpoint,
            os.path.join(args.output_dir, 'checkpoint.pth'))

    # Evaluate on the test loader every 5th epoch after epoch 5, and once
    # more after the final epoch so the finished model gets a score.
    if (epoch > 5 and epoch % 5 == 0) or is_final_epoch:
        evaluate(model, test_loader, device=device)
print('total time is {}'.format(time.time() - start_time))

Epoch: [0]  [   0/1549]  eta: 1:27:55  lr: 0.000004  loss: 2.4303 (2.4303)  classification: 1.6582 (1.6582)  bbox_regression: 0.7720 (0.7720)  time: 3.4055  data: 1.8936  max mem: 14158
Epoch: [0]  [1000/1549]  eta: 0:09:51  lr: 0.002000  loss: 1.8238 (1.9847)  classification: 1.2257 (1.3692)  bbox_regression: 0.5839 (0.6155)  time: 1.0638  data: 0.0169  max mem: 14531
Epoch: [0]  [1548/1549]  eta: 0:00:01  lr: 0.002000  loss: 1.6119 (1.9222)  classification: 0.9910 (1.3117)  bbox_regression: 0.6164 (0.6105)  time: 1.0415  data: 0.0158  max mem: 14539
Epoch: [0] Total time: 0:27:44 (1.0745 s / it)
Epoch: [1]  [   0/1549]  eta: 1:19:16  lr: 0.002000  loss: 1.8279 (1.8279)  classification: 1.1413 (1.1413)  bbox_regression: 0.6866 (0.6866)  time: 3.0709  data: 1.9376  max mem: 14539
Epoch: [1]  [1000/1549]  eta: 0:09:48  lr: 0.002000  loss: 1.4281 (1.6882)  classification: 0.9213 (1.1007)  bbox_regression: 0.5311 (0.5876)  time: 1.0668  data: 0.0173  max mem: 14539
Epoch: [1]  [1548/1549]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/beomgon/anaconda3/envs/pytorch/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_30590/1496413834.py", line 5, in <module>
    train_one_epoch(model, optimizer, train_loader, device, epoch, args.print_freq)
  File "/home/beomgon/project/scl/torchvision_Detection/notebooks/../engine.py", line 48, in train_one_epoch
    losses.backward()
  File "/home/beomgon/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/beomgon/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
    Variable._execution_engine.run_backward(
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File 

TypeError: object of type 'NoneType' has no len()

In [None]:
# Show the model's repr as the cell output to inspect its structure.
model