# Directory settings

In [1]:
# ! pip install -q pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

TRAIN_PATH = 'gs://ranzcr-data/'

# CFG

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    device='GPU' # ['TPU', 'GPU']
    nprocs=1 # [1, 8]
    print_freq=100
    num_workers=4
    model_name='resnet200d_320' # 
    size = 256#512
    scheduler='ReduceLROnPlateau' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']
    epochs = 10
    factor=0.1 # ReduceLROnPlateau. 0.2
    patience=0 # ReduceLROnPlateau. 4
    eps=1e-6 # ReduceLROnPlateau
    T_max=5 # CosineAnnealingLR. 4
    T_0 = 4 # CosineAnnealingWarmRestarts
    lr= 5e-4 # 1e-4, 5e-4 para step 2
    min_lr= 1e-6
    batch_size= 24 # 24
    weight_decay=1e-6
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=666
    target_size=11
    target_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal',
                 'NGT - Abnormal', 'NGT - Borderline', 'NGT - Incompletely Imaged', 'NGT - Normal', 
                 'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal',
                 'Swan Ganz Catheter Present']
    n_fold=5
    trn_fold=[0] # [0, 1, 2, 3, 4]. 
    train=True
    USE_TEST_FOLD = True
    CURRENT_FOLD = 0
    WARMUP = 10
    STEP = 0 # 0 sin annotaciones, 1, 2 y 3
    # step 2
    weights=[0.5, 1]
    teacher = '../input/resnet200d-step1-teachers/'
    # step 3
    student = '../input/resnet200-step2-students/'
    # MORE LAYERS: DenseSigmoid, BatchNorm y PReLU
    MORE_LAYERS_MODEL = False
    student_more_layers = '../input/resnet200dstudentsstep2/'
    # TODO para continuar entrenando
    CONTINUE_TRAINING = False
    continue_path = '../input/resnet200dstep3morelayersphase1/'
    lr_continue = 5e-5 # CAMBIAR A MANO
    # OTROS TIPOS DE MODELOS
    DO_EFFICIENT = False
    effnet_path = '../input/efficientnet-pytorch/'
    # SERESNET152D
    DO_SERESNET152D = False
    
# correciones
if CFG.debug:
    CFG.epochs = 1
    
if CFG.CONTINUE_TRAINING:
    CFG.epochs = 14
    CFG.lr = CFG.lr_continue
    
if CFG.MORE_LAYERS_MODEL:
    CFG.student = CFG.student_more_layers
    
if CFG.STEP == 3:
    CFG.USE_TEST_FOLD = True
    
if CFG.DO_SERESNET152D:
    CFG.model_name = 'seresnet152d_320'

In [4]:
if CFG.device == 'TPU':
    import os
    os.system('curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py')
    os.system('python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev')
    os.system('export XLA_USE_BF16=1')
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl
    import torch_xla.distributed.xla_multiprocessing as xmp
    CFG.lr = CFG.lr * CFG.nprocs
    CFG.batch_size = CFG.batch_size // CFG.nprocs
    
if CFG.DO_EFFICIENT:
    !pip install efficientnet_pytorch
    from efficientnet_pytorch import EfficientNet
    CFG.lr = 3e-3
    CFG.model_name = "efficientnet-b7"
    #CFG.size= 600
    CFG.epochs = 10 
    if CFG.device == 'GPU':
        CFG.batch_size = 8
    if CFG.device == 'TPU':
        CFG.batch_size = 16

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.0.tar.gz (20 kB)
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25ldone
[?25h  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.0-py3-none-any.whl size=16033 sha256=3eb817a52468eb038d30d5b8ce99717207e958bc9a7c19890018cd90ab48c5cc
  Stored in directory: /root/.cache/pip/wheels/b7/cc/0d/41d384b0071c6f46e542aded5f8571700ace4f1eb3f1591c29
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.0
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


# Library

In [5]:
# ====================================================
# Library
# ====================================================
import sys
sys.path.append('gs://extras-entrenamiento/pytorch_image_models')
# sys.path.append('gs://extras-entrenamiento/input/pytorch-images-seresnet')

import os
import ast
import copy
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.utils import check_random_state
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
print(torch.__version__)
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
# from warmup_scheduler import GradualWarmupScheduler

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose, HueSaturationValue, CoarseDropout
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform
import albumentations
from albumentations import *

import timm

if CFG.device == 'TPU':
    import ignite.distributed as idist
elif CFG.device == 'GPU':
    from torch.cuda.amp import autocast, GradScaler

import warnings 
warnings.filterwarnings('ignore')

1.7.0


# Utils

In [6]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    scores = []
    for i in range(y_true.shape[1]):
        score = roc_auc_score(y_true[:,i], y_pred[:,i])
        scores.append(score)
    avg_score = np.mean(scores)
    return avg_score, scores


@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    LOGGER.info(f'[{name}] done in {time.time() - t0:.0f} s.')


def init_logger(log_file=OUTPUT_DIR+'train.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(seed=CFG.seed)

# Data Loading

In [7]:
train = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train.csv')
dfx = pd.read_csv('../input/split-k-folds/train_folds.csv')
train_annotations = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train_annotations.csv')

print("Folds are: ", sorted(dfx.fold.unique()))
print("Selected folds are: ", CFG.trn_fold)

Folds are:  [0, 1, 2, 3, 4]
Selected folds are:  [0]


# Dataset

In [8]:
# ====================================================
# Dataset
# ====================================================
COLOR_MAP = {'ETT - Abnormal': (255, 0, 0),
             'ETT - Borderline': (0, 255, 0),
             'ETT - Normal': (0, 0, 255),
             'NGT - Abnormal': (255, 255, 0),
             'NGT - Borderline': (255, 0, 255),
             'NGT - Incompletely Imaged': (0, 255, 255),
             'NGT - Normal': (128, 0, 0),
             'CVC - Abnormal': (0, 128, 0),
             'CVC - Borderline': (0, 0, 128),
             'CVC - Normal': (128, 128, 0),
             'Swan Ganz Catheter Present': (128, 0, 128),
            }


class TrainDataset(Dataset):
    def __init__(self, df, df_annotations, annot_size=50, transform=None, use_annot=False):
        self.df = df
        self.df_annotations = df_annotations
        self.use_annot = use_annot
        self.annot_size = annot_size
        self.file_names = df['StudyInstanceUID'].values
        self.labels = df[CFG.target_cols].values
        self.transform = transform
        self.shape = df.shape

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}.jpg'
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        labels = torch.tensor(self.labels[idx]).float()
        # annotated imageç
        # STEP 1 o 2
        if CFG.STEP > 0 and CFG.STEP < 3:
            #STEP 2
            if CFG.STEP == 2 and self.use_annot:
                image_annot = image.copy()
            query_string = f"StudyInstanceUID == '{file_name}'"
            df = self.df_annotations.query(query_string)
            for i, row in df.iterrows():
                label = row["label"]
                data = np.array(ast.literal_eval(row["data"]))
                for d in data:
                    # STEP 2
                    if CFG.STEP == 2 and self.use_annot:
                        image_annot[d[1]-self.annot_size//2:d[1]+self.annot_size//2,
                                d[0]-self.annot_size//2:d[0]+self.annot_size//2,
                                :] = COLOR_MAP[label]
                    # STEP 1
                    else:
                        image[d[1]-self.annot_size//2:d[1]+self.annot_size//2,
                              d[0]-self.annot_size//2:d[0]+self.annot_size//2,
                              :] = COLOR_MAP[label]
        # aplicar transform
        if CFG.STEP == 2 and self.use_annot:
            if self.transform:
                augmented = self.transform(image=image, image_annot=image_annot)
                image = augmented['image']
                image_annot = augmented['image_annot']
            return image, image_annot, labels
        else:
            if self.transform:
                augmented = self.transform(image=image)
                image = augmented['image']
            return image, labels

# Transforms

In [9]:
# ====================================================
# Transforms
# ====================================================
# def get_transforms(*, data):
    
#     if data == 'train':
#         return Compose([
#            albumentations.RandomResizedCrop(CFG.size, CFG.size, scale=(0.9, 1), p=1), 
#            albumentations.HorizontalFlip(p=0.5),
#            albumentations.ShiftScaleRotate(p=0.5),
#            albumentations.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=10, val_shift_limit=10, p=0.7),
#            albumentations.RandomBrightnessContrast(brightness_limit=(-0.2,0.2), contrast_limit=(-0.2, 0.2), p=0.7),
#            albumentations.CLAHE(clip_limit=(1,4), p=0.5),
#            albumentations.OneOf([
#                albumentations.OpticalDistortion(distort_limit=1.0),
#                albumentations.GridDistortion(num_steps=5, distort_limit=1.),
#                albumentations.ElasticTransform(alpha=3),
#            ], p=0.2),
#            albumentations.OneOf([
#                albumentations.GaussNoise(var_limit=[10, 50]),
#                albumentations.GaussianBlur(),
#                albumentations.MotionBlur(),
#                albumentations.MedianBlur(),
#            ], p=0.2),
#           albumentations.Resize(CFG.size, CFG.size),
#           albumentations.OneOf([
#               JpegCompression(),
#               Downscale(scale_min=0.1, scale_max=0.15),
#           ], p=0.2),
#           IAAPiecewiseAffine(p=0.2),
#           IAASharpen(p=0.2),
#           albumentations.Cutout(max_h_size=int(CFG.size * 0.1), max_w_size=int(CFG.size * 0.1), num_holes=5, p=0.5),
#           albumentations.Normalize(),
#         ToTensorV2(),
#         ])

#     elif data == 'valid':
#         return Compose([
#             albumentations.Resize(CFG.size, CFG.size),
#            albumentations.Normalize(),
#         ToTensorV2(),
#         ])
def get_transforms(*, data):
    
    if data == 'train':
        return Compose([
            #Resize(CFG.size, CFG.size),
            RandomResizedCrop(CFG.size, CFG.size, scale=(0.85, 1.0)),
            HorizontalFlip(p=0.5),
            RandomBrightnessContrast(p=0.2, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2)),
            HueSaturationValue(p=0.2, hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2),
            ShiftScaleRotate(p=0.2, shift_limit=0.0625, scale_limit=0.2, rotate_limit=20),
            CoarseDropout(p=0.2),
            Cutout(p=0.2, max_h_size=16, max_w_size=16, fill_value=(0., 0., 0.), num_holes=16),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    elif data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])
    
def get_transforms_step2(*, data):
    
    if data == 'train':
        return Compose([
            #Resize(CFG.size, CFG.size),
            RandomResizedCrop(CFG.size, CFG.size, scale=(0.85, 1.0)),
            HorizontalFlip(p=0.5),
            RandomBrightnessContrast(p=0.2, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2)),
            HueSaturationValue(p=0.2, hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2),
            ShiftScaleRotate(p=0.2, shift_limit=0.0625, scale_limit=0.2, rotate_limit=20),
            CoarseDropout(p=0.2),
            Cutout(p=0.2, max_h_size=16, max_w_size=16, fill_value=(0., 0., 0.), num_holes=16),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ], additional_targets={'image_annot': 'image'})
    
    elif data == 'check':
        return Compose([
            #Resize(CFG.size, CFG.size),
            RandomResizedCrop(CFG.size, CFG.size, scale=(0.85, 1.0)),
            HorizontalFlip(p=0.5),
            RandomBrightnessContrast(p=0.2, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2)),
            HueSaturationValue(p=0.2, hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2),
            ShiftScaleRotate(p=0.2, shift_limit=0.0625, scale_limit=0.2, rotate_limit=20),
            CoarseDropout(p=0.2),
            Cutout(p=0.2, max_h_size=16, max_w_size=16, fill_value=(0., 0., 0.), num_holes=16),
            #Normalize(
            #    mean=[0.485, 0.456, 0.406],
            #    std=[0.229, 0.224, 0.225],
            #),
            ToTensorV2(),
        ], additional_targets={'image_annot': 'image'})

    elif data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

In [10]:
# from matplotlib import pyplot as plt


# # annotated
# if CFG.STEP == 1:
#     train_dataset = TrainDataset(dfx[dfx['StudyInstanceUID'].isin(train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True),
#                              df_annotations = train_annotations, transform = None)
# elif CFG.STEP == 2:
#     train_dataset = TrainDataset(dfx[dfx['StudyInstanceUID'].isin(train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True),
#                              df_annotations = train_annotations, transform=get_transforms_step2(data='check'), use_annot=True)
# else:
#     train_dataset = TrainDataset(dfx, df_annotations = train_annotations, transform=None)
    
# print(train_dataset.shape)

# for i in range(5):
#     if CFG.STEP == 2:
#         image, image_annot, label = train_dataset[i]
#         plt.subplot(1, 2, 1)
#         plt.imshow(image.transpose(0, 1).transpose(1, 2))
#         plt.subplot(1, 2, 2)
#         plt.imshow(image_annot.transpose(0, 1).transpose(1, 2))
#         plt.title(f'label: {label}')
#         plt.show() 
#     else:
#         image, label = train_dataset[i]
#         plt.imshow(image)
#         plt.title(f'label: {label}')
#         plt.show() 

# MODEL

In [11]:
# ====================================================
# MODEL RESNET 200D
# ====================================================
class CustomResNet200D(nn.Module):
    def __init__(self, model_name='resnet200d_320', pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=False)
        if pretrained:
            pretrained_path = '../input/resnet200d-pretrained-weight/resnet200d_ra2-bdba9bf9.pth'
            self.model.load_state_dict(torch.load(pretrained_path))
            print(f'load {model_name} pretrained model')
        n_features = self.model.fc.in_features # son 2048
        self.model.global_pool = nn.Identity()
        self.model.fc = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        if CFG.MORE_LAYERS_MODEL:
            self.sigmoid_fc = nn.Linear(in_features=n_features, out_features=512)
            self.bs = nn.BatchNorm1d(num_features = 512)
            self.prelu = nn.PReLU()
            self.fc = nn.Linear(512, CFG.target_size)
        else:
            self.fc = nn.Linear(n_features, CFG.target_size)

    def forward(self, x):
        bs = x.size(0)
        features = self.model(x)
        pooled_features = self.pooling(features).view(bs, -1)
        if CFG.MORE_LAYERS_MODEL:
            output = F.sigmoid(self.sigmoid_fc(pooled_features))
            output = self.bs(output)
            output = self.prelu(output)
            output = self.fc(output) #output pooled_features
        else:
            output = self.fc(pooled_features)
        # features, _, y_preds
        return features, pooled_features, output
    
    
# ====================================================
# MODEL EFFICIENT B7
# ====================================================
class CustomEfficientB7(nn.Module):
    def __init__(self, model_name='efficientnet-b7', pretrained=False):
        super().__init__()
        self.effnet = EfficientNet.from_pretrained(model_name) 
        n_features = self.effnet._fc.in_features
#         self.effnet._conv_stem.in_channels = 1
#         weight = self.effnet._conv_stem.weight.mean(1, keepdim=True)
#         self.effnet._conv_stem.weight = torch.nn.Parameter(weight)
        self.effnet._fc = nn.Identity()
        
        if CFG.MORE_LAYERS_MODEL:
            self.sigmoid_fc = nn.Linear(in_features=n_features, out_features=512)
            self.bs = nn.BatchNorm1d(num_features = 512)
            self.prelu = nn.PReLU()
            self.out = nn.Linear(512, CFG.target_size)
        else:
            self.out = nn.Linear(n_features, CFG.target_size)

    def forward(self, image):
        batch_size = image.size(0)
    
        x = self.effnet.extract_features(image)
        pooled_features = F.adaptive_avg_pool2d(x, 1).reshape(batch_size, -1)
        if CFG.MORE_LAYERS_MODEL:
            output = F.sigmoid(self.sigmoid_fc(pooled_features))
            output = self.bs(output)
            output = self.prelu(output)
            output = self.out(output)
        else:
            output = self.out(pooled_features)
        # features, _, y_preds
        return x, pooled_features, output

Funcion Loss para step 2
---

In [12]:
class CustomLoss(nn.Module):
    def __init__(self, weights=[1, 1]):
        super(CustomLoss, self).__init__()
        self.weights = weights
        
    def forward(self, teacher_features, features, y_pred, labels):
        consistency_loss = nn.MSELoss()(teacher_features.view(-1), features.view(-1))
        cls_loss = nn.BCEWithLogitsLoss()(y_pred, labels)
        loss = self.weights[0] * consistency_loss + self.weights[1] * cls_loss
        return loss

# Helper functions

In [13]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))
# https://www.kaggle.com/underwearfitting/single-fold-training-of-resnet200d-lb0-965
# class GradualWarmupSchedulerV2(GradualWarmupScheduler):
#     def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
#         super(GradualWarmupSchedulerV2, self).__init__(optimizer, multiplier, total_epoch, after_scheduler)
#     def get_lr(self):
#         if self.last_epoch > self.total_epoch:
#             if self.after_scheduler:
#                 if not self.finished:
#                     self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
#                     self.finished = True
#                 return self.after_scheduler.get_lr()
#             return [base_lr * self.multiplier for base_lr in self.base_lrs]
#         if self.multiplier == 1.0:
#             return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
#         else:
#             return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]


def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    if CFG.device == 'GPU':
        scaler = GradScaler()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if CFG.device == 'GPU':
            with autocast():
                _, _, y_preds = model(images)
                loss = criterion(y_preds, labels)
                # record loss
                losses.update(loss.item(), batch_size)
                if CFG.gradient_accumulation_steps > 1:
                    loss = loss / CFG.gradient_accumulation_steps
                scaler.scale(loss).backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
                if (step + 1) % CFG.gradient_accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    global_step += 1
        elif CFG.device == 'TPU':
            _, _, y_preds = model(images)
            loss = criterion(y_preds, labels)
            # record loss
            losses.update(loss.item(), batch_size)
            if CFG.gradient_accumulation_steps > 1:
                loss = loss / CFG.gradient_accumulation_steps
            loss.backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                xm.optimizer_step(optimizer, barrier=True)
                optimizer.zero_grad()
                global_step += 1
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if CFG.device == 'GPU':
            if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      'Grad: {grad_norm:.4f}  '
#                       'LR: {lr:.6f}  '
                      .format(
                       epoch+1, step, len(train_loader), batch_time=batch_time,
                       data_time=data_time, loss=losses,
                       remain=timeSince(start, float(step+1)/len(train_loader)),
                       grad_norm=grad_norm,
#                        lr=scheduler.get_lr()[0],
                       ))
        elif CFG.device == 'TPU':
            if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
                xm.master_print('Epoch: [{0}][{1}/{2}] '
                                'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                                'Elapsed {remain:s} '
                                'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                                'Grad: {grad_norm:.4f}  '
#                                 'LR: {lr:.6f}  '
                                .format(
                                epoch+1, step, len(train_loader), batch_time=batch_time,
                                data_time=data_time, loss=losses,
                                remain=timeSince(start, float(step+1)/len(train_loader)),
                                grad_norm=grad_norm,
#                                 lr=scheduler.get_lr()[0],
                                ))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to evaluation mode
    model.eval()
    trues = []
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            _, _, y_preds = model(images)
        loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # record accuracy
        trues.append(labels.to('cpu').numpy())
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if CFG.device == 'GPU':
            if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
                print('EVAL: [{0}/{1}] '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      .format(
                       step, len(valid_loader), batch_time=batch_time,
                       data_time=data_time, loss=losses,
                       remain=timeSince(start, float(step+1)/len(valid_loader)),
                       ))
        elif CFG.device == 'TPU':
            if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
                xm.master_print('EVAL: [{0}/{1}] '
                                'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                                'Elapsed {remain:s} '
                                'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                                .format(
                                step, len(valid_loader), batch_time=batch_time,
                                data_time=data_time, loss=losses,
                                remain=timeSince(start, float(step+1)/len(valid_loader)),
                                ))
    trues = np.concatenate(trues)
    predictions = np.concatenate(preds)
    return losses.avg, predictions, trues

In [14]:
# ****************************************************************************************************
# **************************************** FOR STEP 2 ************************************************
# ****************************************************************************************************
def train_fn_step2(train_loader, teacher_model, model, criterion, optimizer, epoch, scheduler, device):
    if CFG.device == 'GPU':
        scaler = GradScaler()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    for step, (images, images_annot, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        with torch.no_grad():
            teacher_features, _, _ = teacher_model(images_annot.to(device))
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if CFG.device == 'GPU':
            with autocast():
                features, _, y_preds = model(images)
                loss = criterion(teacher_features, features, y_preds, labels)
                # record loss
                losses.update(loss.item(), batch_size)
                if CFG.gradient_accumulation_steps > 1:
                    loss = loss / CFG.gradient_accumulation_steps
                scaler.scale(loss).backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
                if (step + 1) % CFG.gradient_accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    global_step += 1
        elif CFG.device == 'TPU':
            features, _, y_preds = model(images)
            loss = criterion(teacher_features, features, y_preds, labels)
            # record loss
            losses.update(loss.item(), batch_size)
            if CFG.gradient_accumulation_steps > 1:
                loss = loss / CFG.gradient_accumulation_steps
            loss.backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                xm.optimizer_step(optimizer, barrier=True) # 
                optimizer.zero_grad()
                global_step += 1
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if CFG.device == 'GPU':
            if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      'Grad: {grad_norm:.4f}  '
                      'LR: {lr:.6f}  '
                      .format(
                       epoch+1, step, len(train_loader), batch_time=batch_time,
                       data_time=data_time, loss=losses,
                       remain=timeSince(start, float(step+1)/len(train_loader)),
                       grad_norm=grad_norm,
                       lr=scheduler.get_lr()[0],
                       ))
        elif CFG.device == 'TPU':
            if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
                xm.master_print('Epoch: [{0}][{1}/{2}] '
                                'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                                'Elapsed {remain:s} '
                                'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                                'Grad: {grad_norm:.4f}  '
                                'LR: {lr:.6f}  '
                                .format(
                                epoch+1, step, len(train_loader), batch_time=batch_time,
                                data_time=data_time, loss=losses,
                                remain=timeSince(start, float(step+1)/len(train_loader)),
                                grad_norm=grad_norm,
                                lr=scheduler.get_lr()[0],
                                ))
    return losses.avg


# Train loop

In [15]:
# ====================================================
# Train loop
# ====================================================
def train_loop(fold):

    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")
    elif CFG.device == 'TPU':
        if CFG.nprocs == 1:
            LOGGER.info(f"========== fold: {fold} training ==========")
        elif CFG.nprocs == 8:
            xm.master_print(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = dfx[dfx['fold'] != fold].index
    val_idx = dfx[dfx['fold'] == fold].index

    train_folds = dfx.loc[trn_idx].reset_index(drop=True)
    valid_folds = dfx.loc[val_idx].reset_index(drop=True)
    
    print(train_folds.shape)
    print(valid_folds.shape)
    
    # para anotaciones
    if CFG.STEP == 1 or CFG.STEP == 2:
        train_folds = train_folds[train_folds['StudyInstanceUID'].isin(train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True)
        if CFG.STEP == 1:
            valid_folds = valid_folds[valid_folds['StudyInstanceUID'].isin(train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True)
        print(train_folds.shape)
        print(valid_folds.shape)

    # Para validar y asignar pesos de los modelos
    if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
        test_fold = 0
        train_folds = train_folds[train_folds.fold_test != test_fold].reset_index(drop=True)
        valid_folds = valid_folds[valid_folds.fold_test != test_fold].reset_index(drop=True)
        test_fold = dfx.loc[dfx['fold_test'] == test_fold].reset_index(drop=True)
        print(train_folds.shape)
        print(valid_folds.shape)
        print(test_fold.shape)
    
    valid_labels = valid_folds[CFG.target_cols].values

    if CFG.STEP == 2:
        use_annotations = True
        transform_function_to_use_Train = get_transforms_step2(data='train')
        transform_function_to_use_Valid = get_transforms_step2(data='valid')
    else:
        use_annotations = False
        transform_function_to_use_Train = get_transforms(data='train')
        transform_function_to_use_Valid = get_transforms(data='valid')
        
    train_dataset = TrainDataset(train_folds, df_annotations = train_annotations, use_annot = use_annotations,
                                 transform = transform_function_to_use_Train)
    valid_dataset = TrainDataset(valid_folds, df_annotations = train_annotations, use_annot = False,
                                 transform = transform_function_to_use_Valid)
    if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
        test_fold_dataset = TrainDataset(test_fold, df_annotations = train_annotations,
                                     transform=get_transforms(data='valid'))

    if CFG.device == 'GPU':
        train_loader = DataLoader(train_dataset, 
                                  batch_size=CFG.batch_size, 
                                  shuffle=True, 
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, 
                                  batch_size=CFG.batch_size * 2, 
                                  shuffle=False, 
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
        if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
            test_fold_loader = DataLoader(test_fold_dataset, 
                                      batch_size=CFG.batch_size * 2, 
                                      shuffle=False, 
                                      num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    elif CFG.device == 'TPU':
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                        num_replicas=xm.xrt_world_size(),
                                                                        rank=xm.get_ordinal(),
                                                                        shuffle=True)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=CFG.batch_size,
                                                   sampler=train_sampler,
                                                   drop_last=True,
                                                   num_workers=CFG.num_workers)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset,
                                                                        num_replicas=xm.xrt_world_size(),
                                                                        rank=xm.get_ordinal(),
                                                                        shuffle=False)
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=CFG.batch_size * 2,
                                                   sampler=valid_sampler,
                                                   drop_last=False,
                                                   num_workers=CFG.num_workers)
        if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
            test_fold_sampler = torch.utils.data.distributed.DistributedSampler(test_fold_dataset,
                                                                            num_replicas=xm.xrt_world_size(),
                                                                            rank=xm.get_ordinal(),
                                                                            shuffle=False)
            test_fold_loader = torch.utils.data.DataLoader(test_fold_dataset,
                                                       batch_size=CFG.batch_size * 2,
                                                       sampler=test_fold_sampler,
                                                       drop_last=False,
                                                       num_workers=CFG.num_workers)


    # ====================================================
    # scheduler 
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.device == 'TPU':
        device = xm.xla_device()
    elif CFG.device == 'GPU':
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
       
    # Leemos el modelo del STEP 1 
    if CFG.STEP == 2:
        teacher_model = CustomResNet200D(CFG.model_name, pretrained=False)
        teacher_model_path = CFG.teacher + "resnet200d_320_fold" + str(fold) + "_step1_best_loss_cpu.pth"
        teacher_model.load_state_dict(torch.load(teacher_model_path, map_location=torch.device('cpu'))['model'])
        for param in teacher_model.parameters():
            param.requires_grad = False
        teacher_model.eval()
        teacher_model.to(device)
        
    # Elegimos el tipo de modelo a entrenar
    if CFG.DO_EFFICIENT:
        model = CustomEfficientB7(CFG.model_name, pretrained=True)
    else:
        model = CustomResNet200D(CFG.model_name, pretrained=True)
        
    # Leemos estudiante si STEP es 3
    if CFG.STEP == 3 and CFG.CONTINUE_TRAINING == False:
        student_model_path = CFG.student + "resnet200d_320_fold" + str(fold) + "_step2_best_loss.pth"
        model.load_state_dict(torch.load(student_model_path, map_location=torch.device('cpu'))['model'])
        
    # Continuacion de entrenamiento
    if CFG.STEP == 3 and CFG.CONTINUE_TRAINING:
        continue_model_path = CFG.continue_path + "resnet200d_320_fold" + str(fold) + "_step3_best_loss_cpu.pth"
        model.load_state_dict(torch.load(continue_model_path, map_location=torch.device('cpu'))['model'])
        
    model.to(device)

    optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)
    
#     warmup_epo = 1 #++
#     scheduler_warmup = GradualWarmupSchedulerV2(optimizer, multiplier=10, total_epoch=warmup_epo, after_scheduler = scheduler) #++

    # ====================================================
    # loop
    # ====================================================
    if CFG.STEP == 2:
        train_criterion = CustomLoss(weights=CFG.weights)
    criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf
    
    for epoch in range(CFG.epochs):
        
        start_time = time.time()
        
#         scheduler_warmup.step(epoch) # ++ 
        
        # ********* train *********
        #  ********* TPU *********
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                if CFG.STEP == 2:
                    avg_loss = train_fn_step2(train_loader, teacher_model, model, train_criterion, optimizer, epoch, scheduler, device)
                else:
                    avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
            elif CFG.nprocs == 8:
                para_train_loader = pl.ParallelLoader(train_loader, [device])
                if CFG.STEP == 2:
                    avg_loss = train_fn_step2(para_train_loader.per_device_loader(device), teacher_model, model, train_criterion, optimizer, epoch, scheduler, device)
                else:
                    avg_loss = train_fn(para_train_loader.per_device_loader(device), model, criterion, optimizer, epoch, scheduler, device)
        #  ********* GPU *********
        elif CFG.device == 'GPU':
            if CFG.STEP == 2:
                avg_loss = train_fn_step2(train_loader, teacher_model, model, train_criterion, optimizer, epoch, scheduler, device)
            else:
                avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
                
        # ********* eval *********
        if CFG.device == 'TPU' and CFG.nprocs == 8: # TODO test_fold
                para_valid_loader = pl.ParallelLoader(valid_loader, [device])
                avg_val_loss, preds, valid_labels = valid_fn(para_valid_loader.per_device_loader(device), model, criterion, device)
                preds = idist.all_gather(torch.tensor(preds)).to('cpu').numpy()
                valid_labels = idist.all_gather(torch.tensor(valid_labels)).to('cpu').numpy()
        else: # para TPU (1 proc) y GPU
            avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion, device)
            if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
                test_fold_avg_val_loss, test_fold_preds, _ = valid_fn(test_fold_loader, model, criterion, device)
            
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # ********* scoring *********
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        if CFG.device == 'GPU':
            LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
            if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
                LOGGER.info(f'Epoch {epoch+1} - test_fold_avg_val_loss: {test_fold_avg_val_loss:.4f}')
            LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}')
        elif CFG.device == 'TPU':
            if CFG.nprocs == 1:
                LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
                if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
                    LOGGER.info(f'Epoch {epoch+1} - test_fold_avg_val_loss: {test_fold_avg_val_loss:.4f}')
                LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}')
            elif CFG.nprocs == 8:
                xm.master_print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
                if CFG.USE_TEST_FOLD and (CFG.STEP == 0 or CFG.STEP == 3):
                    xm.master_print(f'Epoch {epoch+1} - test_fold_avg_val_loss: {test_fold_avg_val_loss:.4f}')
                xm.master_print(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}')
        # ********************
         # SAVE MAX SCORE
        # ********************
        if score > best_score:
            best_score = score
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(), 
                            'preds': preds},
                           OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_score.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                xm.save({'model': model, 
                         'preds': preds}, 
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_score.pth')
         # ********************
         # SAVE MIN LOSS
        # ********************
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                torch.save({'model': model.state_dict(), 
                            'preds': preds},
                           OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_loss.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                xm.save({'model': model, 
                         'preds': preds}, 
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_loss.pth')
    
    if CFG.nprocs != 8:
        check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_score.pth')
        for c in [f'pred_{c}' for c in CFG.target_cols]:
            valid_folds[c] = np.nan
        valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']

    return valid_folds

In [16]:
# ====================================================
# main
# ====================================================
def main():

    """
    Prepare: 1.train  2.folds
    """

    def get_result(result_df):
        preds = result_df[[f'pred_{c}' for c in CFG.target_cols]].values
        labels = result_df[CFG.target_cols].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {np.round(scores, decimals=4)}')
    
    if CFG.train:
        # train 
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(fold)
                oof_df = pd.concat([oof_df, _oof_df])
                if CFG.nprocs != 8:
                    LOGGER.info(f"========== fold: {fold} result ==========")
                    get_result(_oof_df)
                    
        if CFG.nprocs != 8:
            # CV result
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)

In [17]:
if __name__ == '__main__':
    if CFG.device == 'TPU':
        print('TPU MODE')
        def _mp_fn(rank, flags):
            torch.set_default_tensor_type('torch.FloatTensor')
            a = main()
        FLAGS = {}
        xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=CFG.nprocs, start_method='fork')
    elif CFG.device == 'GPU':
        print('GPU MODE')
        main()



GPU MODE
(24080, 15)
(6003, 15)
(23091, 15)
(5747, 15)
(1245, 15)


Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b7-dcc49843.pth


  0%|          | 0.00/254M [00:00<?, ?B/s]

Loaded pretrained weights for efficientnet-b7
Epoch: [1][0/2886] Data 2.107 (2.107) Elapsed 0m 4s (remain 204m 41s) Loss: 0.7025(0.7025) Grad: inf  
Epoch: [1][100/2886] Data 0.000 (0.021) Elapsed 1m 11s (remain 32m 49s) Loss: 0.3079(0.3713) Grad: 4630.0894  
Epoch: [1][200/2886] Data 0.000 (0.011) Elapsed 2m 18s (remain 30m 46s) Loss: 0.3493(0.3575) Grad: 1765.6630  
Epoch: [1][300/2886] Data 0.000 (0.007) Elapsed 3m 24s (remain 29m 18s) Loss: 0.2624(0.3449) Grad: 605.0746  
Epoch: [1][400/2886] Data 0.000 (0.005) Elapsed 4m 31s (remain 28m 0s) Loss: 0.3914(0.3378) Grad: 3030.8811  
Epoch: [1][500/2886] Data 0.000 (0.004) Elapsed 5m 36s (remain 26m 40s) Loss: 0.2335(0.3326) Grad: 464.9859  
Epoch: [1][600/2886] Data 0.000 (0.004) Elapsed 6m 44s (remain 25m 37s) Loss: 0.3564(0.3261) Grad: 1377.2668  
Epoch: [1][700/2886] Data 0.000 (0.003) Elapsed 7m 50s (remain 24m 27s) Loss: 0.3464(0.3222) Grad: 450.5427  
Epoch: [1][800/2886] Data 0.000 (0.003) Elapsed 8m 55s (remain 23m 13s) Loss: 

Epoch 1 - avg_train_loss: 0.3076  avg_val_loss: 0.6948  time: 2119s
Epoch 1 - test_fold_avg_val_loss: 0.7753
Epoch 1 - Score: 0.5120  Scores: [0.6151 0.4979 0.5559 0.4281 0.4724 0.5113 0.5586 0.5294 0.4914 0.5087
 0.4635]
Epoch 1 - Save Best Score: 0.5120 Model


EVAL: [77/78] Data 0.000 (0.515) Elapsed 0m 55s (remain 0m 0s) Loss: 0.3854(0.7753) 


Epoch 1 - Save Best Loss: 0.6948 Model


Epoch: [2][0/2886] Data 1.195 (1.195) Elapsed 0m 2s (remain 107m 6s) Loss: 0.2366(0.2366) Grad: nan  
Epoch: [2][100/2886] Data 0.000 (0.012) Elapsed 1m 4s (remain 29m 51s) Loss: 0.2212(0.2925) Grad: 1614.6404  
Epoch: [2][200/2886] Data 0.000 (0.006) Elapsed 2m 5s (remain 28m 1s) Loss: 0.3181(0.2949) Grad: 1596.9803  
Epoch: [2][300/2886] Data 0.000 (0.004) Elapsed 3m 6s (remain 26m 44s) Loss: 0.2874(0.2951) Grad: 1646.7621  
Epoch: [2][400/2886] Data 0.000 (0.003) Elapsed 4m 7s (remain 25m 36s) Loss: 0.2039(0.2988) Grad: 1809.1692  
Epoch: [2][500/2886] Data 0.000 (0.003) Elapsed 5m 9s (remain 24m 33s) Loss: 0.2830(0.2993) Grad: 4674.8286  
Epoch: [2][600/2886] Data 0.000 (0.002) Elapsed 6m 11s (remain 23m 33s) Loss: 0.3408(0.2985) Grad: 1923.9854  
Epoch: [2][700/2886] Data 0.000 (0.002) Elapsed 7m 13s (remain 22m 32s) Loss: 0.3429(0.2972) Grad: 1899.5752  
Epoch: [2][800/2886] Data 0.000 (0.002) Elapsed 8m 15s (remain 21m 30s) Loss: 0.2831(0.2958) Grad: 1673.6250  
Epoch: [2][900/2

Epoch 2 - avg_train_loss: 0.2993  avg_val_loss: 0.2972  time: 2104s
Epoch 2 - test_fold_avg_val_loss: 0.3118
Epoch 2 - Score: 0.4670  Scores: [0.4249 0.4459 0.5547 0.4182 0.4367 0.5104 0.3923 0.4835 0.5181 0.4954
 0.4564]
Epoch 2 - Save Best Loss: 0.2972 Model


EVAL: [77/78] Data 0.000 (0.538) Elapsed 0m 57s (remain 0m 0s) Loss: 0.3770(0.3118) 
Epoch: [3][0/2886] Data 1.615 (1.615) Elapsed 0m 2s (remain 125m 24s) Loss: 0.2043(0.2043) Grad: nan  
Epoch: [3][100/2886] Data 0.000 (0.016) Elapsed 1m 7s (remain 30m 50s) Loss: 0.4203(0.3021) Grad: 4374.8091  
Epoch: [3][200/2886] Data 0.000 (0.008) Elapsed 2m 10s (remain 29m 0s) Loss: 0.3368(0.3010) Grad: 2067.6162  
Epoch: [3][300/2886] Data 0.000 (0.006) Elapsed 3m 13s (remain 27m 43s) Loss: 0.3655(0.3025) Grad: 2238.9250  
Epoch: [3][400/2886] Data 0.000 (0.004) Elapsed 4m 17s (remain 26m 34s) Loss: 0.2736(0.3049) Grad: 1378.1587  
Epoch: [3][500/2886] Data 0.000 (0.003) Elapsed 5m 20s (remain 25m 26s) Loss: 0.2690(0.3027) Grad: 1358.5730  
Epoch: [3][600/2886] Data 0.000 (0.003) Elapsed 6m 24s (remain 24m 22s) Loss: 0.4727(0.3006) Grad: 3299.8989  
Epoch: [3][700/2886] Data 0.000 (0.003) Elapsed 7m 28s (remain 23m 16s) Loss: 0.3026(0.3009) Grad: 1261.8976  
Epoch: [3][800/2886] Data 0.000 (0.00

Epoch 3 - avg_train_loss: 0.2995  avg_val_loss: 0.2965  time: 2168s
Epoch 3 - test_fold_avg_val_loss: 0.3110
Epoch 3 - Score: 0.4867  Scores: [0.5648 0.5183 0.4837 0.4112 0.4918 0.4901 0.5009 0.4937 0.4918 0.5008
 0.4067]
Epoch 3 - Save Best Loss: 0.2965 Model


EVAL: [77/78] Data 0.000 (0.582) Elapsed 1m 1s (remain 0m 0s) Loss: 0.3851(0.3110) 
Epoch: [4][0/2886] Data 1.512 (1.512) Elapsed 0m 2s (remain 131m 27s) Loss: 0.2940(0.2940) Grad: nan  
Epoch: [4][100/2886] Data 0.000 (0.015) Elapsed 1m 10s (remain 32m 20s) Loss: 0.3531(0.3120) Grad: 1848.3649  
Epoch: [4][200/2886] Data 0.000 (0.008) Elapsed 2m 16s (remain 30m 28s) Loss: 0.3248(0.3010) Grad: 1554.6338  
Epoch: [4][300/2886] Data 0.000 (0.005) Elapsed 3m 22s (remain 29m 3s) Loss: 0.2566(0.2950) Grad: 474.6259  
Epoch: [4][400/2886] Data 0.000 (0.004) Elapsed 4m 29s (remain 27m 49s) Loss: 0.3806(0.2960) Grad: 1099.2212  
Epoch: [4][500/2886] Data 0.000 (0.003) Elapsed 5m 34s (remain 26m 34s) Loss: 0.2046(0.2962) Grad: 703.5767  
Epoch: [4][600/2886] Data 0.000 (0.003) Elapsed 6m 40s (remain 25m 23s) Loss: 0.2887(0.2989) Grad: 428.2557  
Epoch: [4][700/2886] Data 0.000 (0.002) Elapsed 7m 46s (remain 24m 15s) Loss: 0.2185(0.2978) Grad: 358.9857  
Epoch: [4][800/2886] Data 0.000 (0.002) E

Epoch 4 - avg_train_loss: 0.2988  avg_val_loss: 0.3270  time: 2282s
Epoch 4 - test_fold_avg_val_loss: 0.3468
Epoch 4 - Score: 0.5102  Scores: [0.5076 0.593  0.4497 0.496  0.5563 0.4193 0.5632 0.5495 0.473  0.4982
 0.5067]


EVAL: [77/78] Data 0.000 (0.598) Elapsed 1m 2s (remain 0m 0s) Loss: 0.4618(0.3468) 
Epoch     4: reducing learning rate of group 0 to 3.0000e-04.
Epoch: [5][0/2886] Data 1.913 (1.913) Elapsed 0m 2s (remain 144m 1s) Loss: 0.3627(0.3627) Grad: nan  
Epoch: [5][100/2886] Data 0.000 (0.019) Elapsed 1m 10s (remain 32m 34s) Loss: 0.2955(0.2899) Grad: 931.6432  
Epoch: [5][200/2886] Data 0.000 (0.010) Elapsed 2m 17s (remain 30m 35s) Loss: 0.2156(0.3020) Grad: 1398.7312  
Epoch: [5][300/2886] Data 0.000 (0.007) Elapsed 3m 23s (remain 29m 5s) Loss: 0.2595(0.2979) Grad: 1048.1196  
Epoch: [5][400/2886] Data 0.000 (0.005) Elapsed 4m 29s (remain 27m 51s) Loss: 0.2067(0.2984) Grad: 1295.3871  
Epoch: [5][500/2886] Data 0.000 (0.004) Elapsed 5m 36s (remain 26m 39s) Loss: 0.4341(0.2977) Grad: 3445.2251  
Epoch: [5][600/2886] Data 0.000 (0.003) Elapsed 6m 42s (remain 25m 31s) Loss: 0.2661(0.2976) Grad: 903.4189  
Epoch: [5][700/2886] Data 0.000 (0.003) Elapsed 7m 49s (remain 24m 22s) Loss: 0.2890(0.29

Epoch 5 - avg_train_loss: 0.2982  avg_val_loss: 0.3034  time: 2245s
Epoch 5 - test_fold_avg_val_loss: 0.3151
Epoch 5 - Score: 0.4838  Scores: [0.4024 0.5279 0.505  0.4964 0.4081 0.5802 0.4521 0.5328 0.4947 0.4724
 0.4503]


EVAL: [77/78] Data 0.000 (0.587) Elapsed 1m 1s (remain 0m 0s) Loss: 0.3805(0.3151) 
Epoch     5: reducing learning rate of group 0 to 3.0000e-05.
Epoch: [6][0/2886] Data 1.820 (1.820) Elapsed 0m 3s (remain 159m 4s) Loss: 0.2311(0.2311) Grad: 6005.8550  
Epoch: [6][100/2886] Data 0.000 (0.018) Elapsed 1m 10s (remain 32m 20s) Loss: 0.2254(0.2981) Grad: 959.8301  
Epoch: [6][200/2886] Data 0.000 (0.009) Elapsed 2m 17s (remain 30m 40s) Loss: 0.3775(0.3054) Grad: 1647.7980  
Epoch: [6][300/2886] Data 0.000 (0.006) Elapsed 3m 25s (remain 29m 20s) Loss: 0.2310(0.3078) Grad: 1258.9050  
Epoch: [6][400/2886] Data 0.000 (0.005) Elapsed 4m 31s (remain 28m 0s) Loss: 0.2436(0.3053) Grad: 1169.2052  
Epoch: [6][500/2886] Data 0.000 (0.004) Elapsed 5m 38s (remain 26m 51s) Loss: 0.3642(0.3015) Grad: 1787.6428  
Epoch: [6][600/2886] Data 0.000 (0.003) Elapsed 6m 45s (remain 25m 41s) Loss: 0.3029(0.3007) Grad: 1911.0996  
Epoch: [6][700/2886] Data 0.000 (0.003) Elapsed 7m 51s (remain 24m 29s) Loss: 0.21

Epoch 6 - avg_train_loss: 0.2981  avg_val_loss: 0.3992  time: 2279s
Epoch 6 - test_fold_avg_val_loss: 0.9724
Epoch 6 - Score: 0.6029  Scores: [0.5756 0.6513 0.678  0.5876 0.6506 0.4733 0.7068 0.5306 0.559  0.4906
 0.7283]
Epoch 6 - Save Best Score: 0.6029 Model


EVAL: [77/78] Data 0.074 (0.606) Elapsed 1m 3s (remain 0m 0s) Loss: 0.3807(0.9724) 
Epoch     6: reducing learning rate of group 0 to 3.0000e-06.
Epoch: [7][0/2886] Data 1.413 (1.413) Elapsed 0m 3s (remain 154m 55s) Loss: 0.2190(0.2190) Grad: 6605.9248  
Epoch: [7][100/2886] Data 0.000 (0.014) Elapsed 1m 13s (remain 33m 39s) Loss: 0.2513(0.2993) Grad: 1254.2385  
Epoch: [7][200/2886] Data 0.000 (0.007) Elapsed 2m 22s (remain 31m 40s) Loss: 0.4329(0.2952) Grad: 2364.8252  
Epoch: [7][300/2886] Data 0.000 (0.005) Elapsed 3m 30s (remain 30m 11s) Loss: 0.3099(0.2943) Grad: 1236.7094  
Epoch: [7][400/2886] Data 0.000 (0.004) Elapsed 4m 39s (remain 28m 52s) Loss: 0.2769(0.2962) Grad: 990.2027  
Epoch: [7][500/2886] Data 0.000 (0.003) Elapsed 5m 48s (remain 27m 38s) Loss: 0.1498(0.2959) Grad: 2037.9340  
Epoch: [7][600/2886] Data 0.000 (0.003) Elapsed 6m 57s (remain 26m 26s) Loss: 0.2760(0.2961) Grad: 1189.8467  
Epoch: [7][700/2886] Data 0.000 (0.002) Elapsed 8m 6s (remain 25m 16s) Loss: 0.2

Epoch 7 - avg_train_loss: 0.2974  avg_val_loss: 0.4237  time: 2316s
Epoch 7 - test_fold_avg_val_loss: 1.1758
Epoch 7 - Score: 0.6116  Scores: [0.5726 0.6696 0.6927 0.6051 0.6565 0.493  0.7152 0.5323 0.5511 0.5038
 0.7354]
Epoch 7 - Save Best Score: 0.6116 Model


EVAL: [77/78] Data 0.271 (0.598) Elapsed 1m 2s (remain 0m 0s) Loss: 0.3805(1.1758) 
Epoch     7: reducing learning rate of group 0 to 3.0000e-07.
Epoch: [8][0/2886] Data 2.274 (2.274) Elapsed 0m 3s (remain 171m 27s) Loss: 0.3258(0.3258) Grad: nan  
Epoch: [8][100/2886] Data 0.000 (0.023) Elapsed 1m 12s (remain 33m 29s) Loss: 0.2983(0.3012) Grad: 859.6932  
Epoch: [8][200/2886] Data 0.000 (0.012) Elapsed 2m 21s (remain 31m 29s) Loss: 0.2362(0.3016) Grad: 959.1054  
Epoch: [8][300/2886] Data 0.000 (0.008) Elapsed 3m 29s (remain 30m 1s) Loss: 0.2569(0.2975) Grad: 1452.6658  
Epoch: [8][400/2886] Data 0.000 (0.006) Elapsed 4m 37s (remain 28m 40s) Loss: 0.2937(0.2977) Grad: 1178.2643  
Epoch: [8][500/2886] Data 0.000 (0.005) Elapsed 5m 46s (remain 27m 28s) Loss: 0.2382(0.2982) Grad: 803.3138  
Epoch: [8][600/2886] Data 0.000 (0.004) Elapsed 6m 54s (remain 26m 14s) Loss: 0.3185(0.2971) Grad: 1065.7411  
Epoch: [8][700/2886] Data 0.000 (0.003) Elapsed 8m 2s (remain 25m 3s) Loss: 0.2572(0.2968

Epoch 8 - avg_train_loss: 0.2973  avg_val_loss: 0.7001  time: 2298s
Epoch 8 - test_fold_avg_val_loss: 2.1195
Epoch 8 - Score: 0.6174  Scores: [0.5756 0.6648 0.6951 0.6063 0.6607 0.5458 0.7092 0.5347 0.5571 0.5156
 0.7261]
Epoch 8 - Save Best Score: 0.6174 Model


EVAL: [77/78] Data 0.196 (0.599) Elapsed 1m 2s (remain 0m 0s) Loss: 0.3789(2.1195) 
Epoch: [9][0/2886] Data 1.456 (1.456) Elapsed 0m 3s (remain 152m 55s) Loss: 0.2618(0.2618) Grad: 3919.2524  
Epoch: [9][100/2886] Data 0.000 (0.015) Elapsed 1m 11s (remain 33m 2s) Loss: 0.2640(0.3020) Grad: 845.6094  
Epoch: [9][200/2886] Data 0.000 (0.008) Elapsed 2m 20s (remain 31m 10s) Loss: 0.2729(0.3042) Grad: 731.0513  
Epoch: [9][300/2886] Data 0.000 (0.005) Elapsed 3m 28s (remain 29m 47s) Loss: 0.2981(0.3027) Grad: 1132.9706  
Epoch: [9][400/2886] Data 0.000 (0.004) Elapsed 4m 35s (remain 28m 30s) Loss: 0.3858(0.3036) Grad: 2935.3091  
Epoch: [9][500/2886] Data 0.000 (0.003) Elapsed 5m 44s (remain 27m 21s) Loss: 0.2484(0.3016) Grad: 1428.9496  
Epoch: [9][600/2886] Data 0.000 (0.003) Elapsed 6m 53s (remain 26m 10s) Loss: 0.3261(0.3014) Grad: 1045.2021  
Epoch: [9][700/2886] Data 0.000 (0.002) Elapsed 8m 1s (remain 25m 0s) Loss: 0.2678(0.2998) Grad: 1088.1326  
Epoch: [9][800/2886] Data 0.000 (0.

Epoch 9 - avg_train_loss: 0.2972  avg_val_loss: 0.5143  time: 2305s
Epoch 9 - test_fold_avg_val_loss: 1.4766
Epoch 9 - Score: 0.6197  Scores: [0.5718 0.6764 0.6976 0.6068 0.6583 0.5552 0.7147 0.5306 0.5557 0.5143
 0.7354]
Epoch 9 - Save Best Score: 0.6197 Model


EVAL: [77/78] Data 0.169 (0.593) Elapsed 1m 2s (remain 0m 0s) Loss: 0.3796(1.4766) 
Epoch: [10][0/2886] Data 1.527 (1.527) Elapsed 0m 2s (remain 133m 26s) Loss: 0.2860(0.2860) Grad: nan  
Epoch: [10][100/2886] Data 0.000 (0.015) Elapsed 1m 11s (remain 32m 38s) Loss: 0.2854(0.2865) Grad: 1571.6271  
Epoch: [10][200/2886] Data 0.000 (0.008) Elapsed 2m 18s (remain 30m 52s) Loss: 0.3968(0.2946) Grad: 2764.3787  
Epoch: [10][300/2886] Data 0.000 (0.005) Elapsed 3m 26s (remain 29m 32s) Loss: 0.2841(0.2974) Grad: 948.4014  
Epoch: [10][400/2886] Data 0.000 (0.004) Elapsed 4m 33s (remain 28m 12s) Loss: 0.3030(0.2957) Grad: 1203.1260  
Epoch: [10][500/2886] Data 0.000 (0.003) Elapsed 5m 40s (remain 27m 0s) Loss: 0.2477(0.2952) Grad: 944.1910  
Epoch: [10][600/2886] Data 0.000 (0.003) Elapsed 6m 48s (remain 25m 51s) Loss: 0.3455(0.2961) Grad: 1211.1790  
Epoch: [10][700/2886] Data 0.000 (0.002) Elapsed 7m 54s (remain 24m 40s) Loss: 0.2154(0.2961) Grad: 1307.4049  
Epoch: [10][800/2886] Data 0.00

Epoch 10 - avg_train_loss: 0.2973  avg_val_loss: 0.3697  time: 2272s
Epoch 10 - test_fold_avg_val_loss: 0.8931
Epoch 10 - Score: 0.6197  Scores: [0.5699 0.6759 0.7008 0.6056 0.6591 0.5548 0.7205 0.5242 0.5564 0.5144
 0.7352]


EVAL: [77/78] Data 0.090 (0.567) Elapsed 1m 0s (remain 0m 0s) Loss: 0.3806(0.8931) 


Score: 0.6197  Scores: [0.5718 0.6764 0.6976 0.6068 0.6583 0.5552 0.7147 0.5306 0.5557 0.5143
 0.7354]
Score: 0.6197  Scores: [0.5718 0.6764 0.6976 0.6068 0.6583 0.5552 0.7147 0.5306 0.5557 0.5143
 0.7354]


In [18]:
# save as cpu
if CFG.device == 'TPU':
    for fold in range(CFG.n_fold):
        if fold in CFG.trn_fold:
            # best score
            state = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_score.pth')
            torch.save({'model': state['model'].to('cpu').state_dict(), 
                        'preds': state['preds']}, 
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_score_cpu.pth')
            # best loss
            state = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_loss.pth')
            torch.save({'model': state['model'].to('cpu').state_dict(), 
                        'preds': state['preds']}, 
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_step{CFG.STEP}_best_loss_cpu.pth')