In [1]:
import wandb

wandb.init(project="P_stage_img_classification", entity="boostcampe_recsys_03")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mboostcampe_recsys_03[0m (use `wandb login --relogin` to force relogin)
  warn("The `IPython.html` package has been deprecated since IPython 4.0. "


In [1]:
import argparse
import glob
import json
import multiprocessing
import os
import random
import re
from importlib import import_module
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from dataset import MaskBaseDataset
from loss import create_criterion

from util import EarlyStop, FishingKit
kit = FishingKit('print_time_backward')

In [2]:

def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)


def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']


def grid_image(np_images, gts, preds, n=16, shuffle=False):
    batch_size = np_images.shape[0]
    assert n <= batch_size

    choices = random.choices(range(batch_size), k=n) if shuffle else list(range(n))
    figure = plt.figure(figsize=(12, 18 + 2))  # cautions: hardcoded, 이미지 크기에 따라 figsize 를 조정해야 할 수 있습니다. T.T
    plt.subplots_adjust(top=0.8)               # cautions: hardcoded, 이미지 크기에 따라 top 를 조정해야 할 수 있습니다. T.T
    n_grid = np.ceil(n ** 0.5)
    tasks = ["mask", "gender", "age"]
    for idx, choice in enumerate(choices):
        gt = gts[choice].item()
        pred = preds[choice].item()
        image = np_images[choice]
        # title = f"gt: {gt}, pred: {pred}"
        gt_decoded_labels = MaskBaseDataset.decode_multi_class(gt)
        pred_decoded_labels = MaskBaseDataset.decode_multi_class(pred)
        title = "\n".join([
            f"{task} - gt: {gt_label}, pred: {pred_label}"
            for gt_label, pred_label, task
            in zip(gt_decoded_labels, pred_decoded_labels, tasks)
        ])

        plt.subplot(n_grid, n_grid, idx + 1, title=title)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(image, cmap=plt.cm.binary)

    return figure


def increment_path(path, exist_ok=False):
    """ Automatically increment path, i.e. runs/exp --> runs/exp0, runs/exp1 etc.

    Args:
        path (str or pathlib.Path): f"{model_dir}/{args.name}".
        exist_ok (bool): whether increment path (increment if False).
    """
    path = Path(path)
    if (path.exists() and exist_ok) or (not path.exists()):
        return str(path)
    else:
        dirs = glob.glob(f"{path}*")
        matches = [re.search(rf"%s(\d+)" % path.stem, d) for d in dirs]
        i = [int(m.groups()[0]) for m in matches if m]
        n = max(i) + 1 if i else 2
        return f"{path}{n}"



In [5]:

def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    if args.label_type:
        save_dir = increment_path(os.path.join(model_dir, args.name, args.label_type))
    else:
        save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)  # default: MaskBaseDataset
    dataset = dataset_module(
        data_dir=data_dir,
        # label_type=args.name,
        label_type=args.label_type,
    )
    num_classes = dataset.num_classes  # 18
    print("image 갯수", len(dataset.image_paths))
    print("mask_labels 갯수", len(dataset.mask_labels))
    
    # -- augmentation
    transform_module = getattr(import_module("dataset"), args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # -- data_loader
    train_set, val_set = dataset.split_dataset()

    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=multiprocessing.cpu_count()//2,
        shuffle=True,
        pin_memory=use_cuda,
        drop_last=True,
    )
    print("val_set =====================", len(val_set))
    val_loader = DataLoader(
        val_set,
        batch_size=args.valid_batch_size,
        num_workers=multiprocessing.cpu_count()//2,
        shuffle=False,
        pin_memory=use_cuda,
        drop_last=True,
    )

    # -- model
    if not args.saved_dir_model:
        model_module = getattr(import_module("model"), args.model)  # default: BaseModel
        model = model_module(
            num_classes=num_classes,
            saved_dir=args.saved_dir
        )
    else:
        print(f"기존의 학습된 모델 사용 중...\n해당 모델 경로 :\t{args.saved_dir_model}")
        model = torch.load(args.saved_dir_model)
    model = model.to(device)
    model = torch.nn.DataParallel(model)
    # hook을 이용한 각 모델마다의 시간 ############################
    # model = model.apply(kit.hook().attach)
    #############################################################
    
    # -- loss & metric
    criterion = create_criterion(args.criterion)  # default: cross_entropy
    opt_module = getattr(import_module("torch.optim"), args.optimizer)  # default: SGD
    optimizer = opt_module(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=args.lr,
        weight_decay=5e-4
    )
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    # -- logging
    logger = SummaryWriter(log_dir=save_dir)
    # w&b 사용하는 곳 #############################
    wandb.config = {
    "epoch":args.epochs,
    "dataset": args.dataset,
    "augmentation": args.augmentation,
    "resize": args.resize,
    "batch_size": args.batch_size,
    "valid_batch_size": args.valid_batch_size,
    "model": args.model,
    "optimizer": args.optimizer,
    "lr": args.lr,
    "val_ratio": args.val_ratio,
    "criterion": args.criterion,
    "lr_decay_step": args.lr_decay_step,
    "name": args.name,
    "label_type": args.label_type,
    "The number of images of dataset":dataset.image_paths,
    "ratio of mask imgs in dataset":dataset.mask_labels.__len__() / dataset.image_paths.__len__(),
    "mean of dataset":dataset.mean,
    "std of dataset":dataset.std,
    "num of classes":dataset.num_classes,
    "is_using_cuda": use_cuda,
    "multiprocessing.cpu_count()":multiprocessing.cpu_count()
    }

    # Optional
    # wandb.watch(model)
    ###############################################
    with open(os.path.join(save_dir, 'config.json'), 'w', encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    break_flag = False
    for epoch in range(args.epochs):
        # train loop
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Train ######################################
            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)

            loss.backward()
            optimizer.step()
            ##############################################

            loss_value += loss.item()
            matches += (preds == labels).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )
                # logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + idx)
                # logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + idx)
                wandb.log({
                "Train/loss": train_loss,
                "Train/accuracy": train_acc,
                })

                loss_value = 0
                matches = 0
            # w&b 사용하는 곳 #############################
            wandb.log({
                "loss": loss,
                'train_images':wandb.Image(inputs),
                'train_labels':labels
                })

            # Optional
            # wandb.watch(model)
            ###############################################

        scheduler.step()

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            # early-stop ###########################################
            early_stop = EarlyStop(rtol=1e-4,length=10)
            ########################################################
            # figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                acc_item = (labels == preds).sum().item()
                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)

                # if figure is None:
                #     inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy()
                #     inputs_np = dataset_module.denormalize_image(inputs_np, dataset.mean, dataset.std)
                #     figure = grid_image(
                #         inputs_np, labels, preds, n=16, shuffle=args.dataset != "MaskSplitByProfileDataset"
                #     )

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)
            best_val_loss = min(best_val_loss, val_loss)
            # w&b 사용하는 곳 #############################
            wandb.log({
                "Val/loss": val_loss,
                "Val/accuracy": val_acc
                })

            # Optional
            # wandb.watch(model)
            ###############################################
            # early-stop ###########################################
            if early_stop.append(val_acc):
                break_flag=True
            ########################################################
            if val_acc > best_val_acc:
                print(f"New best model for val accuracy : {val_acc:4.2%}! saving the best model..")
                torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
                best_val_acc = val_acc
            torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2} || "
                f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
            )
            # logger.add_scalar("Val/loss", val_loss, epoch)
            # logger.add_scalar("Val/accuracy", val_acc, epoch)
            # logger.add_figure("results", figure, epoch)
            print()
            
        if break_flag:
            break
            


In [None]:

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    from dotenv import load_dotenv
    import os
    load_dotenv(verbose=True)

    # Data and model checkpoints directories
    parser.add_argument('--seed', type=int, default=42, help='random seed (default: 42)')
    parser.add_argument('--epochs', type=int, default=50, help='number of epochs to train (default: 1)')
    parser.add_argument('--dataset', type=str, default='GakGakDataset', help='dataset augmentation type (default: MaskBaseDataset)')
    parser.add_argument('--augmentation', type=str, default='BaseAugmentation', help='data augmentation type (default: BaseAugmentation)')
    parser.add_argument("--resize", nargs="+", type=list, default=[128, 96], help='resize size for image when training')
    parser.add_argument('--batch_size', type=int, default=10, help='input batch size for training (default: 64)')
    parser.add_argument('--valid_batch_size', type=int, default=50, help='input batch size for validing (default: 1000)')
    parser.add_argument('--model', type=str, default='ResNetModel', help='model type (default: BaseModel)')
    parser.add_argument('--optimizer', type=str, default='SGD', help='optimizer type (default: Adam)')
    parser.add_argument('--lr', type=float, default=1e-3, help='learning rate (default: 1e-3)')
    parser.add_argument('--val_ratio', type=float, default=0.2, help='ratio for validaton (default: 0.2)')
    parser.add_argument('--criterion', type=str, default='cross_entropy', help='criterion type (default: cross_entropy)')
    parser.add_argument('--lr_decay_step', type=int, default=20, help='learning rate scheduler deacy step (default: 20)')
    parser.add_argument('--log_interval', type=int, default=20, help='how many batches to wait before logging training status')
    parser.add_argument('--name', default='GakGak_BaseAug_ResNetModel_SGD', help='model save at {SM_MODEL_DIR}/{name} (default: exp)')
    
    # 추가한 것들.
    parser.add_argument('--label_type', type=str, default=None, help='dataset에서 사용하고 싶은 label. "age", "gender", "mask" 중 택1')
    parser.add_argument('--saved_dir', type=str, default='./model', help='/opt/ml/workspace/baseline/model')
    parser.add_argument('--saved_dir_model', type=str, default=None, help='학습시킨 모델을 다시 로드함. 해당 경로는 사용하고 싶은 모델의 .pth를 정확히 기술해야 함.')
    

    # Container environment
    parser.add_argument('--data_dir', type=str, default=os.environ.get('SM_CHANNEL_TRAIN', '/opt/ml/input/data/train/images'))
    # parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR', './model/age'))
    parser.add_argument('--model_dir', type=str, default=f"/opt/ml/workspace/saved")
    

    args = parser.parse_args()

    data_dir = args.data_dir
    model_dir = args.model_dir
    
    train(data_dir, model_dir, args)

usage: ipykernel_launcher.py [-h] [--seed SEED] [--epochs EPOCHS]
                             [--dataset DATASET] [--augmentation AUGMENTATION]
                             [--resize RESIZE [RESIZE ...]]
                             [--batch_size BATCH_SIZE]
                             [--valid_batch_size VALID_BATCH_SIZE]
                             [--model MODEL] [--optimizer OPTIMIZER] [--lr LR]
                             [--val_ratio VAL_RATIO] [--criterion CRITERION]
                             [--lr_decay_step LR_DECAY_STEP]
                             [--log_interval LOG_INTERVAL] [--name NAME]
                             [--label_type LABEL_TYPE] [--saved_dir SAVED_DIR]
                             [--data_dir DATA_DIR] [--model_dir MODEL_DIR]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"6480822b-f672-4142-b7b0-169ce95667e3" --shell=9002 --transport="t

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


wandb.config에 기록해야 할 것

> config

dataset.image_paths
dataset.mask_labels / dataset.image_paths
dataset.mean
dataset.std
dataset.num_classes


use_cuda
multiprocessing.cpu_count()


args.epochs
args.dataset
args.augmentation
args.resize
args.batch_size
args.valid_batch_size
args.model
args.optimizer
args.lr
args.val_ratio
args.criterion
args.lr_decay_step
args.name
args.label_type