In [1]:
import os

# Point W&B's run, cache and config directories at per-user folders under /tmp.
# The variables are exported ahead of the `wandb` import so they are already in
# place when the library is loaded.
os.environ.update(
    {
        "WANDB_DIR": "/tmp/user2_wandb",
        "WANDB_CACHE_DIR": "/tmp/user2_wandb_cache",
        "WANDB_CONFIG_DIR": "/tmp/user2_wandb_config",
    }
)

import wandb

# Authenticate; as the last expression of the cell, the result is displayed.
wandb.login()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Look up the account that the configured API key belongs to.
api = wandb.Api()
me = api.viewer

# Print the username and the teams of that account, one per line.
for label, attr in (("Username:", "username"), ("Teams:", "teams")):
    print(label, getattr(me, attr))

Username: kkhs4988
Teams: ['kkhs4988', 'cv_11']


In [2]:
# 모듈 import
import pandas as pd
from collections import Counter
from IPython.display import display

from mmengine.config import Config
from mmengine.runner import Runner
from mmdet.registry import DATASETS
from mmdet.utils import register_all_modules

In [None]:
# Project-specific settings.
# The position of a name in this tuple is the class index shown in the
# instance-count table of the dataset summary.
classes = (
    "General trash",
    "Paper",
    "Paper pack",
    "Metal",
    "Glass",
    "Plastic",
    "Styrofoam",
    "Plastic bag",
    "Battery",
    "Clothing",
)

# Dataset root and the annotation files used for each split.
root = "../../dataset/"
train_ann = "folds/train_fold0.json"
val_ann = "folds/val_fold0.json"
test_ann = "test.json"

# Load the base model config. To train the larger variant, load
# "configs/my_model/cascade_rcnn_swin_large.py" here instead.
cfg = Config.fromfile("configs/my_model/cascade_rcnn_swin_tiny.py")

# Register the mmdet modules and use "mmdet" as the default scope.
register_all_modules(init_default_scope=True)
cfg.default_scope = "mmdet"


########################################################
# 1) Train / Val / Test dataset configuration
########################################################
for ds_key, ann_path in (
    ("train_dataloader", train_ann),
    ("val_dataloader", val_ann),
    ("test_dataloader", test_ann),
):
    # Dataloaders that the base config does not define are left alone.
    if ds_key in cfg:
        loader_cfg = cfg[ds_key]
        # Use the nested `dataset` entry when there is one; otherwise the
        # dataloader entry itself is treated as the dataset config.
        ds = loader_cfg["dataset"] if "dataset" in loader_cfg else loader_cfg
        ds.metainfo = dict(classes=classes)
        ds.data_root = root
        ds.ann_file = ann_path
        ds.data_prefix = dict(img="")


# Per-split dataloader sizes: (batch_size, num_workers).
for loader_name, (loader_batch_size, loader_workers) in (
    ("train_dataloader", (1, 4)),
    ("val_dataloader", (2, 2)),
    ("test_dataloader", (1, 2)),
):
    loader_cfg = getattr(cfg, loader_name)
    loader_cfg.batch_size = loader_batch_size
    loader_cfg.num_workers = loader_workers


########################################################
# 2) Augmentation pipeline
########################################################
# Training: load image + boxes, flip, IoU-constrained crop, Albumentations
# colour/noise/geometry augmentation, multi-scale resize, then pack.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    # One probability per listed direction: 0.5 horizontal, 0.15 vertical.
    dict(type='RandomFlip', prob=[0.5, 0.15], direction=['horizontal', 'vertical']),
    dict(type='MinIoURandomCrop', min_ious=[0.4, 0.5, 0.6, 0.7], min_crop_size=0.3),
    dict(
        type='Albu',
        transforms=[
            # Colour group: one of these four, group probability 0.5.
            dict(
                type='OneOf',
                transforms=[
                    dict(type='RandomBrightnessContrast', p=1.0),
                    dict(type='HueSaturationValue', p=1.0),
                    dict(type='CLAHE', p=1.0),
                    dict(type='RGBShift', p=1.0),
                ],
                p=0.5
            ),
            # Noise / blur group: one of these three, group probability 0.2.
            dict(
                type='OneOf',
                transforms=[
                    dict(type='GaussNoise', p=1.0),
                    dict(type='GaussianBlur', p=1.0),
                    dict(type='Blur', p=1.0),
                ],
                p=0.2
            ),
            # Geometric group: one of these two, group probability 0.2.
            dict(
                type='OneOf',
                transforms=[
                    dict(type='ShiftScaleRotate', rotate_limit=10, p=1.0),
                    dict(type='RandomRotate90', p=1.0),
                ],
                p=0.2
            ),
        ],
        # Boxes are handed to Albumentations in pascal_voc format together
        # with their label and ignore-flag fields.
        bbox_params=dict(
            type='BboxParams',
            format='pascal_voc',
            label_fields=['gt_bboxes_labels', 'gt_ignore_flags'],
            min_visibility=0.0,
            filter_lost_elements=True
        ),
        # Map the pipeline's result keys to the names Albumentations expects.
        keymap={'img': 'image', 'gt_bboxes': 'bboxes'},
        skip_img_without_anno=True
    ),
    # Multi-scale training: one of the listed scales is chosen per sample.
    dict(
        type='RandomChoiceResize',
        scales=[(800, 800), (1024, 1024), (1200, 1200)],
        keep_ratio=True
    ),
    dict(type='PackDetInputs')
]

# Validation: single fixed scale, no augmentation.
val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', scale=(1024, 1024), keep_ratio=True),
    dict(type='LoadAnnotations', with_bbox=True),
    # Explicit meta_keys without 'flip' / 'flip_direction': no flip transform
    # runs in this pipeline, so those keys are never produced, and some
    # MMDetection 3.x releases assert that every requested meta key exists.
    # This is the key set used by the stock MMDetection COCO test pipeline.
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')
    )
]

# Test-time augmentation: three scales, with flipping enabled.
# NOTE(review): `MultiScaleFlipAug` with `flip=True` and an inner `Resize`
# without a scale looks like the MMDetection 2.x layout. Under 3.x, TTA is
# normally configured with `TestTimeAug` plus a `tta_model` wrapper. Left
# unchanged here; verify against the installed version before running
# test/inference.
tta_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        # --- Multi-scale TTA ---
        scales=[
            (1024, 1024),  # baseline
            (800, 800),    # smaller scale
            (1200, 1200),  # larger scale
        ],
        flip=True,  # flip TTA
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='PackDetInputs')
        ]
    ),
]

# Attach each pipeline to the dataset config of its dataloader.
for loader_name, split_pipeline in (
    ("test_dataloader", tta_pipeline),
    ("train_dataloader", train_pipeline),
    ("val_dataloader", val_pipeline),
):
    getattr(cfg, loader_name).dataset.pipeline = split_pipeline


########################################################
# 3) Enable validation
########################################################
# NOTE(review): only the validation evaluator is overridden here;
# `cfg.test_evaluator` keeps whatever the base config defines. Verify that it
# matches the test annotation file before running a test loop.
cfg.val_evaluator = {
    "type": "CocoMetric",
    "ann_file": f"{root}{val_ann}",
    "metric": ["bbox"],
    "classwise": True,  # also report results per class
}

cfg.val_cfg = {"type": "ValLoop"}
cfg.test_cfg = {"type": "TestLoop"}

# Train for 18 epochs and validate after every epoch.
cfg.train_cfg.max_epochs = 18
cfg.train_cfg.val_interval = 1



########################################################
# 4) Set the number of classes on the RoI bbox heads
########################################################
# Every configured bbox head gets the project's class count. Iterating over
# the list (instead of a fixed range(3)) keeps this correct when the base
# config defines a different number of heads.
for bbox_head in cfg.model.roi_head.bbox_head:
    bbox_head.num_classes = len(classes)


########################################################
# 5) Optimizer (AdamW wrapped in AmpOptimWrapper)
########################################################
cfg.optim_wrapper = {
    "type": "AmpOptimWrapper",
    "optimizer": {
        "type": "AdamW",
        "lr": 5e-5,
        "weight_decay": 0.05,
        "betas": (0.9, 0.999),
    },
    # Parameters matching the 'backbone' key use 0.1x the base learning rate.
    "paramwise_cfg": {"custom_keys": {"backbone": {"lr_mult": 0.1}}},
    # Gradient clipping by L2 norm.
    "clip_grad": {"max_norm": 35, "norm_type": 2},
}


########################################################
# 6) LR scheduler
########################################################
cfg.param_scheduler = [
    # Iteration-based linear schedule over iterations 0-100, starting from a
    # factor of 0.001.
    {
        "type": "LinearLR",
        "start_factor": 0.001,
        "by_epoch": False,
        "begin": 0,
        "end": 100,
    },
    # Epoch-based cosine annealing from epoch 0 to the configured number of
    # training epochs, with a floor of 1e-6.
    {
        "type": "CosineAnnealingLR",
        "by_epoch": True,
        "begin": 0,
        "end": cfg.train_cfg["max_epochs"],
        "eta_min": 1e-6,
    },
]


########################################################
# 7) Checkpointing
########################################################
# Save a checkpoint every epoch and additionally keep the best one according
# to coco/bbox_mAP (higher is better).
# NOTE(review): per the MMEngine CheckpointHook documentation,
# `max_keep_ckpts=3` caps how many periodic checkpoints are retained (the most
# recent ones). It is not a "top-3 by mAP" selection; only `save_best` is
# metric-based.
cfg.default_hooks["checkpoint"] = {
    "type": "CheckpointHook",
    "interval": 1,
    "max_keep_ckpts": 3,
    "save_best": "coco/bbox_mAP",
    "rule": "greater",
}

# Output directory for this run.
cfg.work_dir = "./work_dirs/cascade_rcnn_swin_tiny_last"


########################################################
# 8) W&B logging and runtime settings
########################################################
# Log both locally and to Weights & Biases (project `cv_11_OD`, team entity
# `cv_11`, run name `cascade_swin_tiny_last`).
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(
        type='WandbVisBackend',
        init_kwargs=dict(
            project='cv_11_OD',
            entity='cv_11',
            name='cascade_swin_tiny_last'
        )
    )
]

cfg.visualizer = dict(
    type='DetLocalVisualizer',
    vis_backends=vis_backends,
    name='visualizer'
)

# Epoch-based log processing with a window size of 50.
cfg.log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)

# NOTE(review): kept from the original; confirm that anything actually reads
# a top-level `device` key.
cfg.device = "cuda"

# Fix the RNG seed. Without this the recorded run drew a random
# `numpy_random_seed`, so augmentation and shuffling differed between runs and
# results could not be reproduced.
cfg.randomness = dict(seed=42)


In [3]:
# Build the training dataset from the config so its contents can be summarized.
train_ds_cfg = cfg.train_dataloader.dataset
train_ds = DATASETS.build(train_ds_cfg)

def summarize_dataset(ds, split="Train"):
    """Print the image count and display per-class instance counts of a dataset.

    Args:
        ds: Dataset object exposing ``full_init()``, ``__len__``, a
            ``metainfo`` mapping and ``get_data_info(idx)``; each data info may
            carry an ``"instances"`` list whose items hold a ``"bbox_label"``.
        split: Split name inserted into the printed header. The default
            ``"Train"`` reproduces the original message; pass e.g. ``"Val"``
            when summarizing another split.

    Returns:
        None. The header is printed and the count table is shown via
        ``display``.
    """
    ds.full_init()
    num_images = len(ds)
    class_names = list(ds.metainfo.get("classes", []))

    # Count instances per label index; instances without a label are skipped.
    counts = Counter(
        inst["bbox_label"]
        for idx in range(num_images)
        for inst in ds.get_data_info(idx).get("instances", [])
        if inst.get("bbox_label") is not None
    )

    # One row per known class, in class-index order (0 for absent classes).
    df = pd.DataFrame({
        "category": [f"{i} [{name}]" for i, name in enumerate(class_names)],
        "count": [counts.get(i, 0) for i in range(len(class_names))],
    })

    print(f"\n [Info] CocoDataset {split} dataset with number of images {num_images}, and instance counts:")
    display(df)

# Print the image count and per-class instance counts of the training dataset.
summarize_dataset(train_ds)

loading annotations into memory...
Done (t=0.07s)
creating index...
index created!

 [Info] CocoDataset Train dataset with number of images 3914, and instance counts:


Unnamed: 0,category,count
0,0 [General trash],3161
1,1 [Paper],5115
2,2 [Paper pack],706
3,3 [Metal],769
4,4 [Glass],835
5,5 [Plastic],2350
6,6 [Styrofoam],1026
7,7 [Plastic bag],4151
8,8 [Battery],143
9,9 [Clothing],377


In [4]:
# Train the model.
# NOTE(review): the recorded run below aborted during the first epoch with
# `OSError: [Errno 28] No space left on device`. Check the free disk space
# (presumably for `cfg.work_dir` and the W&B directories) before re-running.
runner = Runner.from_cfg(cfg)
runner.train()

12/10 18:56:08 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 874761115
    GPU 0: Tesla V100-SXM2-32GB
    CUDA_HOME: None
    GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
    PyTorch: 2.1.0+cu118
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 11.8
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=comput

12/10 18:56:27 - mmengine - [4m[97mINFO[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.
12/10 18:56:27 - mmengine - [4m[97mINFO[0m - Hooks will be executed in the following order:
before_run:
(VERY_HIGH   ) RuntimeInfoHook                    
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
before_train:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
before_train_epoch:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
(NORMAL      ) DistSamplerSeedHook                
 -------------------- 
before_train_iter:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
 -------------------- 
after_train_iter:
(VERY_HIGH   ) Runti



12/10 18:56:54 - mmengine - [4m[97mINFO[0m - Epoch(train)  [1][  50/3914]  base_lr: 2.4773e-05 lr: 2.4773e-06  eta: 8:56:50  time: 0.4575  data_time: 0.0089  memory: 5023  grad_norm: inf  loss: 2.2245  loss_rpn_cls: 0.5918  loss_rpn_bbox: 0.0613  s0.loss_cls: 0.9203  s0.acc: 99.8047  s0.loss_bbox: 0.0553  s1.loss_cls: 0.4363  s1.acc: 99.8047  s1.loss_bbox: 0.0174  s2.loss_cls: 0.1394  s2.acc: 99.8047  s2.loss_bbox: 0.0026
12/10 18:57:16 - mmengine - [4m[97mINFO[0m - Epoch(train)  [1][ 100/3914]  base_lr: 5.0000e-05 lr: 5.0000e-06  eta: 8:40:38  time: 0.4305  data_time: 0.0058  memory: 5043  grad_norm: 5.7125  loss: 0.7866  loss_rpn_cls: 0.1599  loss_rpn_bbox: 0.0345  s0.loss_cls: 0.3105  s0.acc: 96.6797  s0.loss_bbox: 0.1275  s1.loss_cls: 0.0852  s1.acc: 98.6328  s1.loss_bbox: 0.0396  s2.loss_cls: 0.0236  s2.acc: 99.0234  s2.loss_bbox: 0.0057
12/10 18:57:37 - mmengine - [4m[97mINFO[0m - Epoch(train)  [1][ 150/3914]  base_lr: 5.0000e-05 lr: 5.0000e-06  eta: 8:30:44  time: 0.419

OSError: [Errno 28] No space left on device

In [6]:
# List the compute processes on the GPU with their PID, name and memory use (CSV).
!nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv

pid, process_name, used_gpu_memory [MiB]
971666, [Not Found], 3068 MiB
1051891, [Not Found], 29374 MiB


In [10]:

# Show the command line of PID 1051891, the largest GPU-memory consumer in the
# nvidia-smi listing above. The PID is hard-coded from that run; update it
# before re-running.
!ps -o cmd= -p 1051891