In [1]:
#!export WANDB_API_KEY=2a631ea744b03506a1330798e0724d1d917a821f

In [2]:
%env WANDB_API_KEY=2a631ea744b03506a1330798e0724d1d917a821f

env: WANDB_API_KEY=2a631ea744b03506a1330798e0724d1d917a821f


In [3]:
# 모듈 import

import numpy as np
import random
import sys
sys.path.append("../mmdetection/")

from mmengine.hooks import Hook
from mmengine.config import Config
from mmengine.runner import Runner
from mmdet.registry import DATASETS
from mmdet.utils import register_all_modules


from torch.utils.data import SubsetRandomSampler

In [4]:
import wandb
wandb.login()
from wandb_custom_hooks import WandbInitHook

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmevest71[0m ([33mmevest71-boostcamp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# 모든 모듈 등록
register_all_modules()

In [6]:
classes = ("General trash", "Paper", "Paper pack", "Metal", "Glass", 
           "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing")

# config file 들고오기
cfg = Config.fromfile('../mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py')

root='./dataset/'

# dataset config 수정
cfg.dataset_type = 'CocoDataset'
cfg.data_root = root

# Train 데이터셋 설정
train_dataset_cfg = dict(
    data_root=cfg.data_root,
    ann_file='train.json',
    data_prefix=dict(img=''),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=cfg.train_pipeline,
    metainfo=dict(classes=classes)
)

Dataset = DATASETS.get(cfg.dataset_type)
full_train_dataset = Dataset(**train_dataset_cfg)

# Train 데이터셋의 10%만 사용하기 위한 인덱스 선택
total_train_size = len(full_train_dataset)
subset_train_size = int(total_train_size * 0.1)
train_indices = random.sample(range(total_train_size), subset_train_size)

# 선택된 인덱스만 사용하는 새로운 Train 데이터셋 설정
train_dataset_cfg['indices'] = train_indices

# Train dataset config 수정
cfg.train_dataloader = dict(
    batch_size=4,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    batch_sampler=dict(type='AspectRatioBatchSampler'),
    dataset=dict(
        type=cfg.dataset_type,
        **train_dataset_cfg
    )
)

# Test(Val) 데이터셋 설정
test_dataset_cfg = dict(
    data_root=cfg.data_root,
    ann_file='test.json',
    data_prefix=dict(img=''),
    test_mode=True,
    pipeline=cfg.test_pipeline,
    metainfo=dict(classes=classes)
)

full_test_dataset = Dataset(**test_dataset_cfg)

# Test 데이터셋의 10%만 사용하기 위한 인덱스 선택
total_test_size = len(full_test_dataset)
subset_test_size = int(total_test_size * 0.05)
test_indices = random.sample(range(total_test_size), subset_test_size)

# 선택된 인덱스만 사용하는 새로운 Test 데이터셋 설정
test_dataset_cfg['indices'] = test_indices

# Validation dataset config 수정 (Test와 동일하게 설정)
cfg.val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=cfg.dataset_type,
        **test_dataset_cfg
    )
)

# Test dataset config 수정 (Validation과 동일하게 설정)
cfg.test_dataloader = cfg.val_dataloader

# Train, val, test evaluator 설정
cfg.train_evaluator = dict(
    type='CocoMetric',
    ann_file=cfg.data_root + 'train.json',
    metric='bbox',
    format_only=False
)

cfg.val_evaluator = dict(
    type='CocoMetric',
    ann_file=cfg.data_root + 'test.json',
    metric='bbox',
    format_only=False
)

cfg.test_evaluator = cfg.val_evaluator

# 기타 설정
cfg.train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
cfg.val_cfg = dict(type='ValLoop')
cfg.test_cfg = dict(type='TestLoop')

# 체크포인트 pth와 로그 저장
cfg.default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='DetVisualizationHook'),
)

cfg.env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'),
)

cfg.work_dir = './work_dirs/faster_rcnn_r50_fpn_1x_trash'

cfg.model.roi_head.bbox_head.num_classes = 10

cfg.optim_wrapper.optimizer.lr = 0.02
cfg.optim_wrapper.clip_grad = dict(max_norm=35, norm_type=2)

# WandB 설정 추가
cfg.visualizer.update(
    type='DetLocalVisualizer',
    vis_backends=[
        dict(type='LocalVisBackend'),
        dict(
            type='WandbVisBackend',
            init_kwargs=dict(
                project='Project2', 
                name='experiment_2'
            )
        )
    ]
)

# WandB 로거 추가
cfg.log_processor = dict(
    type='LogProcessor',
    window_size=50,
    by_epoch=True,
    custom_cfg=[
        dict(
            data_src='wandb',
            log_name='wandb',
            type='WandbLoggerHook',
            init_kwargs={
                'project': 'Project2'
            }
        )
    ]
)

# Custom hooks 추가
cfg.custom_hooks = [
    dict(
        type='wandb_custom_hooks.WandbInitHook',
        project='Project2',
        name='ex5'
        )
    ]

# Runner 생성 및 학습 시작
runner = Runner.from_cfg(cfg)

runner.train()


loading annotations into memory...
Done (t=0.09s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
10/14 16:28:57 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
    CUDA available: True
    numpy_random_seed: 854419210
    GPU 0: Tesla V100-SXM2-32GB
    CUDA_HOME: None
    GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
    PyTorch: 1.12.1+cu116
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201402
  - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA 

10/14 16:29:06 - mmengine - [4m[97mINFO[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.
10/14 16:29:06 - mmengine - [4m[97mINFO[0m - Hooks will be executed in the following order:
before_run:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) WandbInitHook                      
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
before_train:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
before_train_epoch:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
(NORMAL      ) DistSamplerSeedHook                
 -------------------- 
before_train_iter:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
 ---------



VBox(children=(Label(value='0.050 MB of 0.050 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

10/14 16:29:16 - mmengine - [4m[97mINFO[0m - load model from: torchvision://resnet50
10/14 16:29:16 - mmengine - [4m[97mINFO[0m - Loads checkpoint by torchvision backend from path: torchvision://resnet50

unexpected key in source state_dict: fc.weight, fc.bias

10/14 16:29:18 - mmengine - [4m[97mINFO[0m - Checkpoints will be saved to /data/ephemeral/home/kjh/level2-objectdetection-cv-16/work_dirs/faster_rcnn_r50_fpn_1x_trash.
10/14 16:30:21 - mmengine - [4m[97mINFO[0m - Epoch(train)  [1][ 50/122]  lr: 1.9820e-03  eta: 0:29:45  time: 1.2630  data_time: 0.0141  memory: 4463  grad_norm: 6.8377  loss: 1.1411  loss_rpn_cls: 0.3835  loss_rpn_bbox: 0.0595  loss_cls: 0.6084  acc: 96.2402  loss_bbox: 0.0896
10/14 16:31:16 - mmengine - [4m[97mINFO[0m - Epoch(train)  [1][100/122]  lr: 3.9840e-03  eta: 0:26:57  time: 1.1091  data_time: 0.0088  memory: 4463  grad_norm: 2.5493  loss: 0.7430  loss_rpn_cls: 0.1436  loss_rpn_bbox: 0.0532  loss_cls: 0.3214  acc: 88.0371  loss_bbox: 0.2247



10/14 16:31:53 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][ 50/487]    eta: 0:01:01  time: 0.1396  data_time: 0.0050  memory: 4463  
10/14 16:31:59 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][100/487]    eta: 0:00:52  time: 0.1328  data_time: 0.0021  memory: 699  
10/14 16:32:06 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][150/487]    eta: 0:00:45  time: 0.1331  data_time: 0.0020  memory: 699  
10/14 16:32:13 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][200/487]    eta: 0:00:38  time: 0.1370  data_time: 0.0019  memory: 699  
10/14 16:32:20 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][250/487]    eta: 0:00:31  time: 0.1312  data_time: 0.0019  memory: 699  
10/14 16:32:27 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][300/487]    eta: 0:00:25  time: 0.1419  data_time: 0.0019  memory: 699  
10/14 16:32:33 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][350/487]    eta: 0:00:18  time: 0.1291  data_time: 0.0019  memory: 699  
10/14 16:32:40 - mmengine - [4m[97mINF



10/14 16:33:54 - mmengine - [4m[97mINFO[0m - Epoch(train)  [2][ 50/122]  lr: 6.8669e-03  eta: 0:25:27  time: 1.1754  data_time: 0.0107  memory: 4463  grad_norm: 2.2501  loss: 0.7764  loss_rpn_cls: 0.1191  loss_rpn_bbox: 0.0516  loss_cls: 0.3464  acc: 92.4805  loss_bbox: 0.2593
10/14 16:34:52 - mmengine - [4m[97mINFO[0m - Epoch(train)  [2][100/122]  lr: 8.8689e-03  eta: 0:24:25  time: 1.1711  data_time: 0.0084  memory: 4463  grad_norm: 2.3552  loss: 0.6651  loss_rpn_cls: 0.0905  loss_rpn_bbox: 0.0456  loss_cls: 0.3014  acc: 93.7012  loss_bbox: 0.2277
10/14 16:35:18 - mmengine - [4m[97mINFO[0m - Exp name: faster-rcnn_r50_fpn_1x_coco_20241014_162857
10/14 16:35:18 - mmengine - [4m[97mINFO[0m - Saving checkpoint at 2 epochs




10/14 16:35:29 - mmengine - [4m[97mINFO[0m - Epoch(val)  [2][ 50/487]    eta: 0:00:59  time: 0.1361  data_time: 0.0024  memory: 4463  
10/14 16:35:35 - mmengine - [4m[97mINFO[0m - Epoch(val)  [2][100/487]    eta: 0:00:52  time: 0.1329  data_time: 0.0022  memory: 699  
10/14 16:35:42 - mmengine - [4m[97mINFO[0m - Epoch(val)  [2][150/487]    eta: 0:00:44  time: 0.1298  data_time: 0.0019  memory: 699  
10/14 16:35:49 - mmengine - [4m[97mINFO[0m - Epoch(val)  [2][200/487]    eta: 0:00:38  time: 0.1393  data_time: 0.0020  memory: 699  
10/14 16:35:55 - mmengine - [4m[97mINFO[0m - Epoch(val)  [2][250/487]    eta: 0:00:31  time: 0.1320  data_time: 0.0021  memory: 699  
10/14 16:36:02 - mmengine - [4m[97mINFO[0m - Epoch(val)  [2][300/487]    eta: 0:00:25  time: 0.1349  data_time: 0.0022  memory: 699  
10/14 16:36:09 - mmengine - [4m[97mINFO[0m - Epoch(val)  [2][350/487]    eta: 0:00:18  time: 0.1318  data_time: 0.0019  memory: 699  
10/14 16:36:15 - mmengine - [4m[97mINF



10/14 16:37:26 - mmengine - [4m[97mINFO[0m - Epoch(train)  [3][ 50/122]  lr: 1.1752e-02  eta: 0:22:56  time: 1.1566  data_time: 0.0097  memory: 4463  grad_norm: 2.6546  loss: 0.7919  loss_rpn_cls: 0.1085  loss_rpn_bbox: 0.0567  loss_cls: 0.3490  acc: 95.2637  loss_bbox: 0.2777
10/14 16:38:25 - mmengine - [4m[97mINFO[0m - Epoch(train)  [3][100/122]  lr: 1.3754e-02  eta: 0:21:55  time: 1.1603  data_time: 0.0088  memory: 4463  grad_norm: 2.4301  loss: 0.7549  loss_rpn_cls: 0.0901  loss_rpn_bbox: 0.0456  loss_cls: 0.3495  acc: 90.9180  loss_bbox: 0.2697
10/14 16:38:50 - mmengine - [4m[97mINFO[0m - Exp name: faster-rcnn_r50_fpn_1x_coco_20241014_162857
10/14 16:38:50 - mmengine - [4m[97mINFO[0m - Saving checkpoint at 3 epochs




10/14 16:38:57 - mmengine - [4m[97mINFO[0m - Epoch(val)  [3][ 50/487]    eta: 0:00:47  time: 0.1089  data_time: 0.0023  memory: 4463  
10/14 16:39:03 - mmengine - [4m[97mINFO[0m - Epoch(val)  [3][100/487]    eta: 0:00:44  time: 0.1194  data_time: 0.0019  memory: 699  
10/14 16:39:10 - mmengine - [4m[97mINFO[0m - Epoch(val)  [3][150/487]    eta: 0:00:41  time: 0.1368  data_time: 0.0020  memory: 699  
10/14 16:39:17 - mmengine - [4m[97mINFO[0m - Epoch(val)  [3][200/487]    eta: 0:00:35  time: 0.1320  data_time: 0.0019  memory: 699  
10/14 16:39:24 - mmengine - [4m[97mINFO[0m - Epoch(val)  [3][250/487]    eta: 0:00:29  time: 0.1345  data_time: 0.0020  memory: 699  
10/14 16:39:30 - mmengine - [4m[97mINFO[0m - Epoch(val)  [3][300/487]    eta: 0:00:23  time: 0.1374  data_time: 0.0022  memory: 699  
10/14 16:39:37 - mmengine - [4m[97mINFO[0m - Epoch(val)  [3][350/487]    eta: 0:00:17  time: 0.1291  data_time: 0.0019  memory: 699  
10/14 16:39:43 - mmengine - [4m[97mINF



10/14 16:40:56 - mmengine - [4m[97mINFO[0m - Epoch(train)  [4][ 50/122]  lr: 1.6637e-02  eta: 0:20:30  time: 1.1764  data_time: 0.0100  memory: 4463  grad_norm: 2.6975  loss: 0.6882  loss_rpn_cls: 0.0772  loss_rpn_bbox: 0.0428  loss_cls: 0.3255  acc: 93.5547  loss_bbox: 0.2427


KeyboardInterrupt: 