In [1]:
import os
import timm
import logging
import argparse
import pandas as pd
from typing import Optional
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.utils.data
import torch.utils.data.distributed
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from torchmetrics import Accuracy, F1Score, Specificity

from pytorch_lightning import LightningModule
from pytorch_lightning.lite import LightningLite
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.strategies import ParallelStrategy
from pytorch_lightning.utilities.cli import LightningCLI
from pytorch_lightning import Trainer
# from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.plugins import DDPPlugin

import sys
sys.path.append('../')

from utils.dataset import PapsClsDataset, train_transforms, val_transforms, test_transforms, MAX_IMAGE_SIZE
from utils.collate import collate_fn
# from utils.sampler_by_group import GroupedBatchSampler, create_area_groups
from utils.losses import SupConLoss, FocalLoss

# from cls_utils.block import Bottleneck, TwoMLPHead, RoIPool
from cls_utils.model import PapsClassificationModel
from utils.collate import collate_fn
from utils.sampler import get_weight_random_sampler
from train_cls import PapsClsModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off from the command line.

    Args:
        value: The raw option value (a string, or already a bool).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


# Command-line style configuration for the run. Help texts use %(default)s so the
# documented default can never drift away from the actual one.
parser = argparse.ArgumentParser(description='PyTorch Lightning ImageNet Training')
parser.add_argument('--data_path', metavar='DIR', default='./lbp_data/',
                    help='path to dataset (default: %(default)s)')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                    help='model architecture: (default: %(default)s)')
parser.add_argument('-j', '--workers', default=12, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=15, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('-b', '--batch-size', default=16, type=int,
                    metavar='N',
                    help='mini-batch size (default: %(default)s), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')

parser.add_argument('--lr', '--learning-rate', default=0.0005, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')

parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')

parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')

# Each option string is listed once; the original repeated the same flag twice.
parser.add_argument('--accelerator', default='gpu', type=str, help='default: gpu')

parser.add_argument('--devices', default=2, type=int, help='number of gpus, default 2')
parser.add_argument('--img_size', default=400, type=int, help='input image resolution in swin models')
parser.add_argument('--num_classes', default=6, type=int, help='number of classes')

# _str2bool instead of bool so that "--pretrained False" really disables it.
parser.add_argument('--pretrained', default=True, type=_str2bool, help='set True if using pretrained weights')
parser.add_argument('--output_dir', default='./saved_models/classification', type=str, help='directory for model checkpoint')

_StoreAction(option_strings=['--output_dir'], dest='output_dir', nargs=None, const=None, default='./saved_models/classification', type=<class 'str'>, choices=None, help='directory for model checkpoint', metavar=None)

In [3]:
# Timestamp string; later cells pass it as the run name to both loggers.
now = datetime.now().strftime('%Y%m%d_%H%M%S')

# An explicit empty argv yields the parser defaults and keeps argparse from
# reading the notebook kernel's own command line.
args = parser.parse_args([])

# Keep accelerator/devices consistent with the hardware that is actually
# present. Without the fallback a CUDA-less machine would be left with the
# parser defaults (accelerator='gpu', devices=2).
if torch.cuda.is_available():
    args.accelerator = 'gpu'
    args.devices = torch.cuda.device_count()
else:
    args.accelerator = 'cpu'
    args.devices = 1

# Use the dataset module's image size rather than the parser default.
args.img_size = MAX_IMAGE_SIZE

In [4]:
# Override the parser default (15) for the number of training epochs in this run.
args.epochs = 12

In [5]:
# Override the parser default ('./lbp_data/') with a path in the parent directory.
# NOTE(review): presumably because this notebook runs from a sub-directory of the
# project (the import cell also appends '../' to sys.path) — confirm.
args.data_path = '../lbp_data/'

In [6]:
# Loggers: one TensorBoard log directory per architecture, plus a W&B run.
# Both use the timestamp in `now` as the run name.
logger_tb = TensorBoardLogger(f'./tuning_logs/{args.arch}', name=now)
logger_wandb = WandbLogger(project='Paps_clf', name=now, mode='online')  # mode: 'online' or 'disabled'

# Both checkpoint callbacks write into the same per-architecture directory.
checkpoint_dir = f'{args.output_dir}/{args.arch}'

# Refresh the progress bar every 50 batches.
progress_bar = TQDMProgressBar(refresh_rate=50)

# Two checkpoints monitor the same metric (val_acc1, higher is better):
# one file is named after the epoch and score, the other has a fixed name.
checkpoint_by_score = ModelCheckpoint(
    dirpath=checkpoint_dir,
    filename='paps_tunning_{epoch}_{val_acc1:.2f}',
    monitor="val_acc1",
    mode="max",
)
checkpoint_best = ModelCheckpoint(
    dirpath=checkpoint_dir,
    filename='paps_tunning_best',
    monitor="val_acc1",
    mode="max",
)

# Keyword arguments for the Trainer built in the next cell.
# Disabled alternatives kept from the original cell:
#   plugins="deepspeed_stage_2_offload", strategy="ddp",
#   devices=args.devices (or devices=1)
trainer_defaults = {
    'callbacks': [progress_bar, checkpoint_by_score, checkpoint_best],
    'logger': [logger_tb, logger_wandb],
    'max_epochs': args.epochs,
    'precision': 16,
    'benchmark': True,
    'accelerator': args.accelerator,  # auto, or select device, "gpu"
    'gpus': [1],  # a list selects devices by index, i.e. GPU 1 only
    'replace_sampler_ddp': False,
}

# Lightning module configured from the parsed arguments.
model = PapsClsModel(
    arch=args.arch,
    pretrained=args.pretrained,
    num_classes=args.num_classes,
    data_path=args.data_path,
    batch_size=args.batch_size,
    workers=args.workers,
    lr=args.lr,
    weight_decay=args.weight_decay,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbeomgon-yu[0m. Use [1m`wandb login --relogin`[0m to force relogin


=> creating model 'resnet18'


In [None]:
# Build the Trainer from the keyword arguments collected in `trainer_defaults`
# and train the model. No dataloaders are passed here.
# NOTE(review): presumably PapsClsModel supplies its own dataloaders — confirm.
trainer = Trainer(**trainer_defaults)
trainer.fit(model)  

# Run the test loop on the same `model` object that was just trained.
# NOTE(review): this looks like it evaluates the final in-memory weights rather
# than the best saved checkpoint — confirm that is the intended comparison.
trainer.test(model)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


(17828, 16)


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


(5449, 16)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                    | Params
--------------------------------------------------------
0 | model       | PapsClassificationModel | 11.7 M
1 | criterion   | CrossEntropyLoss        | 0     
2 | train_acc1  | Accuracy                | 0     
3 | eval_acc1   | Accuracy                | 0     
4 | f1          | F1Score                 | 0     
5 | specificity | Specificity             | 0     
--------------------------------------------------------
11.7 M    Trainable params
0         Non-trainable params
11.7 M    Total params
23.474    Total estimated model params size (MB)


Epoch 0:   0%|          | 0/1456 [00:00<?, ?it/s]                          



Epoch 0:  76%|███████▌  | 1100/1456 [13:27<04:21,  1.36it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Validation: 0it [00:00, ?it/s][A
Epoch 0:  79%|███████▉  | 1150/1456 [14:12<03:46,  1.35it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Epoch 0:  82%|████████▏ | 1200/1456 [14:45<03:08,  1.36it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Epoch 0:  86%|████████▌ | 1250/1456 [15:17<02:31,  1.36it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Epoch 0:  89%|████████▉ | 1300/1456 [15:49<01:53,  1.37it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Epoch 0:  93%|█████████▎| 1350/1456 [16:22<01:17,  1.37it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Epoch 0:  96%|█████████▌| 1400/1456 [16:51<00:40,  1.38it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Epoch 0: 100%|█████████▉| 1450/1456 [17:21<00:04,  1.39it/s, loss=0.931, v_num=za7z, train_acc=0.562]
Epoch 0: 100%|██████████| 1456/1456 [17:24<00:00,  1.39it/s, loss=0.93, v_num=za7z, train_acc=0.750, val_acc1=0.535, val_f1_score=0.486, val_specifici