In [1]:
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List

import sys
sys.path.append('../')
from util.misc import NestedTensor, is_main_process

from models.position_encoding import build_position_encoding
from models.backbone import BackboneBase

import argparse
import datetime
import json
from models import build_model
from datasets import build_dataset
from datasets import get_coco_api_from_dataset
import util.misc as utils
from engine import evaluate, train_one_epoch
from models import build_model

from feed import LbpDataset

import pandas as pd
import numpy as np

In [2]:
from main import get_args_parser
parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
parser.add_argument('--workers', default=8, type=int)
args = parser.parse_args(args=[])
args.coco_path = '../COCO/'
args.coco_path
args.batch_size = 4
args.resume = '../trained_models/r50_deformable_detr-checkpoint.pth'
args.distributed = False
args.output_dir = '../trained_models/'
args.num_class = 1 # remove back ground task, only real task
args.epochs = 100
args.lr_drop = 50
args.lr = 0.0002
args.device = 'cuda:2'

In [3]:
# dataset_train = build_dataset(image_set='train', args=args)
# dataset_val = build_dataset(image_set='val', args=args)
# images, targets = next(iter(dataset_train))
# print(type(dataset_val))
# # targets
# dataset_val

In [4]:
from feed import CLASS_MAPPER, get_train_test_list, train_transforms, val_transforms
df = pd.read_csv('../../data/df.csv')
df.label = df.label.apply(lambda x : CLASS_MAPPER[str(x)])
print(df.shape)
# Data loading code#
train_list, test_list = get_train_test_list(df)
train_dataset = LbpDataset(train_list, default_path='/home/beomgon/Dataset/scl/', transform=train_transforms)
test_dataset = LbpDataset(test_list, default_path='/home/beomgon/Dataset/scl/', transform=val_transforms)  

(9735, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['xmax'] = df.apply(lambda x : x['xmin'] + x['w'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ymax'] = df.apply(lambda x : x['ymin'] + x['h'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['area'] = df.apply(lambda x : x['w'] * x['h'], axis=1)


total 7176 train 5382 test 1794
5382
1794


In [5]:
train_sampler = torch.utils.data.RandomSampler(train_dataset)
test_sampler = torch.utils.data.SequentialSampler(test_dataset)
data_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           num_workers=args.workers,
                                           # pin_memory=True,
                                           sampler=train_sampler,
                                           collate_fn=utils.collate_fn)    

data_loader_test = torch.utils.data.DataLoader(
    test_dataset, batch_size=args.batch_size,
    sampler=test_sampler, num_workers=args.workers,
    collate_fn=utils.collate_fn)


In [6]:
images, targets = next(iter(data_loader))
images.tensors.shape

torch.Size([4, 3, 1024, 1024])

In [7]:
# lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
def match_name_keywords(n, name_keywords):
    out = False
    for b in name_keywords:
        if b in n:
            out = True
            break
    return out

In [8]:
device = torch.device(args.device)
model, criterion, postprocessors = build_model(args)

model_without_ddp = model
n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('number of params:', n_parameters)
    
param_dicts = [
    {
        "params":
            [p for n, p in model_without_ddp.named_parameters()
             if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
        "lr": args.lr,
    },
    {
        "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
        "lr": args.lr_backbone,
    },
    {
        "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
        "lr": args.lr * args.lr_linear_proj_mult,
    }
]
optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                              weight_decay=args.weight_decay)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)    

number of params: 39847265


In [9]:
# model_without_ddp

In [10]:
checkpoint = torch.load(args.resume, map_location='cpu')
checkpoint.keys()

dict_keys(['model', 'optimizer', 'lr_scheduler', 'epoch', 'args'])

In [11]:
# if args.sgd:
#     optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
#                                 weight_decay=args.weight_decay)
# else:
#     optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
#                                   weight_decay=args.weight_decay)
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

In [12]:
if args.resume:
    if args.resume.startswith('https'):
        checkpoint = torch.hub.load_state_dict_from_url(
            args.resume, map_location='cpu', check_hash=True)
    else:
        checkpoint = torch.load(args.resume, map_location='cpu')
    print('pretrained model is used')
    model_without_ddp.load_state_dict(checkpoint['model'])
        
class_embed = nn.Linear(in_features=256, out_features=args.num_class + 1, bias=True)         
model_without_ddp.class_embed = nn.ModuleList([class_embed for _ in range(6)])

model_without_ddp.to(device)
print('model is loaded to gpu')

pretrained model is used
model is loaded to gpu


In [13]:
model.class_embed

ModuleList(
  (0): Linear(in_features=256, out_features=2, bias=True)
  (1): Linear(in_features=256, out_features=2, bias=True)
  (2): Linear(in_features=256, out_features=2, bias=True)
  (3): Linear(in_features=256, out_features=2, bias=True)
  (4): Linear(in_features=256, out_features=2, bias=True)
  (5): Linear(in_features=256, out_features=2, bias=True)
)

In [14]:
from engine import train_one_epoch
import time
from pathlib import Path
import os

output_dir = Path(args.output_dir)

print("Start training")
start_time = time.time()
for epoch in range(args.start_epoch, args.epochs):
    if args.distributed:
        sampler_train.set_epoch(epoch)
    train_stats = train_one_epoch(model_without_ddp, criterion, data_loader, optimizer, device, epoch, args.clip_max_norm)
    lr_scheduler.step()
    
    if epoch > 40 and epoch % 5 == 0 :
        if args.output_dir:
            checkpoint = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch
            }
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'checkpoint.pth'))

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))

Start training
Epoch: [0]  [   0/1346]  eta: 0:30:37  lr: 0.000200  class_error: 75.00  grad_norm: 3623768.50  loss: 1288.9016 (1288.9016)  loss_ce: 214.3728 (214.3728)  loss_bbox: 0.7220 (0.7220)  loss_giou: 1.5172 (1.5172)  loss_ce_0: 169.7816 (169.7816)  loss_bbox_0: 0.9199 (0.9199)  loss_giou_0: 1.4939 (1.4939)  loss_ce_1: 202.5924 (202.5924)  loss_bbox_1: 0.7543 (0.7543)  loss_giou_1: 1.5025 (1.5025)  loss_ce_2: 222.1283 (222.1283)  loss_bbox_2: 0.7662 (0.7662)  loss_giou_2: 1.4993 (1.4993)  loss_ce_3: 234.9000 (234.9000)  loss_bbox_3: 0.7531 (0.7531)  loss_giou_3: 1.5250 (1.5250)  loss_ce_4: 231.4132 (231.4132)  loss_bbox_4: 0.7383 (0.7383)  loss_giou_4: 1.5216 (1.5216)  loss_ce_unscaled: 107.1864 (107.1864)  class_error_unscaled: 75.0000 (75.0000)  loss_bbox_unscaled: 0.1444 (0.1444)  loss_giou_unscaled: 0.7586 (0.7586)  cardinality_error_unscaled: 297.7500 (297.7500)  loss_ce_0_unscaled: 84.8908 (84.8908)  loss_bbox_0_unscaled: 0.1840 (0.1840)  loss_giou_0_unscaled: 0.7469 (0.7

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [None]:
a = torch.tensor([0,1,2])

In [None]:
a[0][1]