In [1]:
# !pip install pytorch_ranger

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [3]:
# Show what the competition input directory contains (files and sub-folders).
os.listdir('../input/cassava-leaf-disease-classification/')

['train_tfrecords',
 'sample_submission.csv',
 'test_tfrecords',
 'label_num_to_disease_map.json',
 'train_images',
 'train.csv',
 'test_images']

In [4]:
# Training labels are read from the separate "cassava-leaf-disease-merged" input,
# while the submission template and the label map come from the competition input.
train = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
test = pd.read_csv('../input/cassava-leaf-disease-classification//sample_submission.csv')
# orient='index' turns every JSON key into one row of the resulting frame.
label_map = pd.read_json('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json', orient='index')

# Preview the three frames that were just loaded.
display(train.head())
display(test.head())
display(label_map)

Unnamed: 0,image_id,label,source
0,1000015157.jpg,0,2020
1,1000201771.jpg,3,2020
2,100042118.jpg,1,2020
3,1000723321.jpg,1,2020
4,1000812911.jpg,3,2020


Unnamed: 0,image_id,label
0,2216849948.jpg,4


Unnamed: 0,0
0,Cassava Bacterial Blight (CBB)
1,Cassava Brown Streak Disease (CBSD)
2,Cassava Green Mottle (CGM)
3,Cassava Mosaic Disease (CMD)
4,Healthy


## Directory settings

In [5]:
# Directory used as the prefix for output files (e.g. the default log file of init_logger).
OUTPUT_DIR = './'
# Create the output directory on first use.
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Image folders read by TrainDataset and TestDataset respectively.
TRAIN_PATH = '../input/cassava-leaf-disease-merged/train'
TEST_PATH = '../input/cassava-leaf-disease-classification/test_images'

## CFG

In [6]:
class CFG:
    # Central experiment configuration, read as class attributes (CFG.<name>) by the cells below.
    # NOTE(review): get_scheduler also reads CFG.factor, CFG.patience, CFG.eps and CFG.T_max,
    # which are not defined here; only the 'CosineAnnealingWarmRestarts' branch works as configured.
    debug = False  # when True, the block after this class subsamples `train` to 1000 rows
    apex = False  # gates `from apex import amp` and the amp.scale_loss path in train_fn
    print_freq = 100  # train_fn / valid_fn print a progress line every `print_freq` steps
    num_workers = 4  # DataLoader worker count
    model_name = 'efficientnet_b0'  # presumably passed to timm.create_model - consumer not shown here
    size = 380  # side length used by RandomResizedCrop / Resize in get_transforms
    scheduler = 'CosineAnnealingWarmRestarts'  # selects the branch taken in get_scheduler
    loss_train = 'BiTemperedLoss'  # name of the training loss; the code that reads it is not shown here
    epochs = 10
    T_0 = 10  # T_0 argument of CosineAnnealingWarmRestarts
    lr_1 = 1e-3
    lr_2 = 1e-4
    t1 = 0.9  # presumably the bi-tempered loss temperatures - TODO confirm against train_loop
    t2 = 1.5
    smooth = 1e-2  # presumably the label-smoothing amount - TODO confirm against train_loop
    min_lr = 1e-6  # eta_min of the cosine schedulers
    batch_size = 32  # DataLoader batch size for both train and validation loaders
    weight_decay = 1e-6
    gradient_accumulation_steps = 1  # train_fn divides the loss by this and gates first_step on it
    max_grad_norm = 1000  # threshold given to clip_grad_norm_ in train_fn
    seed = 42  # used by seed_torch, StratifiedKFold and the debug subsample
    target_size = 5  # number of outputs of the replacement classifier head
    target_col = 'label'  # column used for stratification in the CV split
    n_fold = 5  # n_splits of StratifiedKFold
    trn_fold = [0]
    train = True
    inference = False

# Debug mode: train on a small random subset of the data.
# NOTE(review): CFG.epochs is set to 10 here, which equals the class default,
# so the assignment currently has no effect - confirm the intended value.
if CFG.debug:
    CFG.epochs = 10
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

## Library

In [7]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

sys.path.append('../input/bi-tempered-loss-pytorch')
from bi_tempered_loss import *

# sys.path.append('../input/pytorch-optimizer')
# import torch_optimizer as optim

sys.path.append('../input/pytorch-sam')
from sam import SAM

package_path = '../input/image-fmix/FMix-master' #'../input/efficientnet-pytorch-07/efficientnet_pytorch-0.7.0'
sys.path.append(package_path)
from fmix import sample_mask

from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
warnings.filterwarnings('ignore')

if CFG.apex:
    from apex import amp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [8]:
def get_score(y_true, y_pred):
    """Return the competition metric, i.e. sklearn's ``accuracy_score(y_true, y_pred)``."""
    return accuracy_score(y_true, y_pred)

@contextmanager
def timer(name):
    """Log the start of a named section and, once it finishes, its duration in seconds.

    Usage: ``with timer('something'): ...``. Messages go to the module-level LOGGER.
    If the wrapped body raises, the closing message is not logged.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f}')
    
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create the notebook logger that writes plain messages to the console and to a file.

    Args:
        log_file: path of the log file; defaults to ``train.log`` inside OUTPUT_DIR.

    Returns:
        The configured ``logging.Logger`` registered under ``__name__``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # would otherwise stack another pair of handlers and duplicate every message.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to every random number generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    # Trade cuDNN autotuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Seed every RNG once up front with the configured seed.
seed_torch(seed=CFG.seed)

## CV split

In [9]:
# Work on a copy so the extra 'fold' column is not added to `train` itself.
folds = train.copy()
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    # NOTE(review): the split yields row positions while .loc selects by index label,
    # so this assumes `folds` carries a default 0..N-1 index - confirm if `train` is ever re-indexed.
    folds.loc[val_index, 'fold'] = int(n)
# The column is filled fold by fold above; cast it to int once every row has a value.
folds['fold'] = folds['fold'].astype(int)
# Sanity check: class counts per fold.
print(folds.groupby(['fold', CFG.target_col]).size())

fold  label
0     0         299
      1         695
      2         604
      3        3092
      4         578
1     0         299
      1         695
      2         604
      3        3092
      4         578
2     0         298
      1         695
      2         603
      3        3093
      4         578
3     0         298
      1         695
      2         603
      3        3093
      4         578
4     0         298
      1         696
      2         603
      3        3092
      4         578
dtype: int64


## Dataset

In [10]:
class TrainDataset(Dataset):
    """Labelled image dataset backed by a DataFrame with 'image_id' and 'label' columns.

    Images are read from TRAIN_PATH with OpenCV and converted from BGR to RGB.
    """
    def __init__(self, df, transform=None):
        """
        Args:
            df: DataFrame providing the 'image_id' (file name) and 'label' columns.
            transform: optional callable invoked as ``transform(image=...)`` whose
                result exposes the transformed image under the 'image' key.
        """
        self.df = df
        self.file_names = df['image_id'].values
        self.labels = df['label'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``; the label is a long tensor.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Image is missing or unreadable: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).long()
        return image, label
    
class TestDataset(Dataset):
    """Unlabelled image dataset backed by a DataFrame with an 'image_id' column.

    Images are read from TEST_PATH with OpenCV and converted from BGR to RGB.
    """
    def __init__(self, df, transform=None):
        """
        Args:
            df: DataFrame providing the 'image_id' (file name) column.
            transform: optional callable invoked as ``transform(image=...)`` whose
                result exposes the transformed image under the 'image' key.
        """
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image for row ``idx``.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Image is missing or unreadable: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

In [11]:
# train_dataset = TrainDataset(train, transform=None)

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image)
#     plt.title(f'label: {label}')
#     plt.show()

## Transforms

In [12]:
def get_transforms(*, data):
    """Build the albumentations pipeline for the requested split.

    Args:
        data: 'train' for the augmenting pipeline, 'valid' for plain resizing.

    Returns:
        A ``Compose`` pipeline ending in Normalize + ToTensorV2, or ``None``
        when ``data`` is neither 'train' nor 'valid'.
    """
    if data != 'train' and data != 'valid':
        return None

    if data == 'train':
        steps = [
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5),
        ]
    else:
        steps = [Resize(CFG.size, CFG.size)]

    # Both splits share the same normalisation statistics and tensor conversion.
    steps.append(Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
    steps.append(ToTensorV2())
    return Compose(steps)

In [13]:
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image[0])
#     plt.title(f'label: {label}')
#     plt.show()

In [14]:
def mixup(data, target, alpha):
    """Blend every sample in the batch with a randomly chosen partner sample.

    Args:
        data: batch tensor whose first dimension is the batch dimension.
        target: labels indexed along the same first dimension.
        alpha: both shape parameters of the Beta distribution the mixing weight is drawn from.

    Returns:
        ``(mixed_data, (target, shuffled_target, lam))`` where
        ``mixed_data = lam * data + (1 - lam) * data[perm]``. The drawn weight is
        clipped to the range [0.3, 0.4]. ``data`` itself is not modified.
    """
    indices = torch.randperm(data.size(0))
    # Gather the partner batch once; the arithmetic below already allocates a new
    # tensor, so no defensive clone of `data` is needed.
    shuffled_data = data[indices]
    shuffled_target = target[indices]

    lam = np.clip(np.random.beta(alpha, alpha), 0.3, 0.4)
    new_data = lam * data + (1 - lam) * shuffled_data
    targets = (target, shuffled_target, lam)
    return new_data, targets

def rand_bbox(size, lam):
    """Sample a random box covering roughly a ``1 - lam`` fraction of the image area.

    Args:
        size: 4-element size of the batch; index 2 is treated as the height
            (y axis) and index 3 as the width (x axis), matching how cutmix
            slices ``[:, :, bby1:bby2, bbx1:bbx2]``.
        lam: mixing weight; the box side lengths are scaled by ``sqrt(1 - lam)``.

    Returns:
        ``(bbx1, bbx2, bby1, bby2)`` clipped to the image bounds.
    """
    H = size[2]
    W = size[3]
    cut_rat = np.sqrt(1. - lam)
    # The `np.int` alias no longer exists in current NumPy; the builtin is equivalent.
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # uniform
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bbx2, bby1, bby2

def cutmix(data, target, alpha):
    """Paste a random rectangle from a shuffled partner sample into every sample.

    Args:
        data: 4-D batch tensor; the box is cut along the last two dimensions.
        target: labels indexed along the batch dimension.
        alpha: both shape parameters of the Beta distribution the initial weight is drawn from.

    Returns:
        ``(new_data, (target, shuffled_target, lam))`` where ``lam`` is the fraction
        of each image that still comes from the original sample.
    """
    indices = torch.randperm(data.size(0))
    shuffled_target = target[indices]

    lam = np.clip(np.random.beta(alpha, alpha), 0.3, 0.4)
    bbx1, bbx2, bby1, bby2 = rand_bbox(data.size(), lam)
    new_data = data.clone()
    # Only the pasted region of the partner batch is gathered; a full shuffled copy is never needed.
    new_data[:, :, bby1:bby2, bbx1:bbx2] = data[indices, :, bby1:bby2, bbx1:bbx2]
    # adjust lambda to exactly match pixel ratio
    lam = 1-((bbx2 - bbx1) * (bby2 - bby1) / (data.size()[-1] * data.size()[-2]))
    targets = (target, shuffled_target, lam)
    return new_data, targets

def fmix(data, targets, alpha, decay_power, shape, max_soft=0.0, reformulate=False):
    """Mix every sample with a shuffled partner using a mask from ``sample_mask``.

    Args:
        data: batch tensor; the mask is multiplied with it element-wise.
        targets: labels indexed along the batch dimension.
        alpha, decay_power, shape, max_soft, reformulate: forwarded unchanged to ``sample_mask``.

    Returns:
        ``(mixed_data, (targets, shuffled_targets, lam))`` with ``lam`` as returned by ``sample_mask``.
    """
    lam, mask = sample_mask(alpha, decay_power, shape, max_soft, reformulate)
    perm = torch.randperm(data.size(0))
    partner_data, partner_targets = data[perm], targets[perm]
    # Masked part of the original batch plus the complementary part of the partner batch.
    kept_part = torch.from_numpy(mask).to(device)*data
    swapped_part = torch.from_numpy(1-mask).to(device)*partner_data
    return (kept_part+swapped_part), (targets, partner_targets, lam)

## MODEL

In [15]:
class CustomEfficientNetB0(nn.Module):
    """timm backbone whose ``classifier`` layer is replaced by a CFG.target_size-way linear head."""

    def __init__(self, model_name='efficientnet_b0', pretrained=False):
        """
        Args:
            model_name: architecture name handed to ``timm.create_model``.
            pretrained: forwarded to ``timm.create_model``.
        """
        super(CustomEfficientNetB0, self).__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Swap the stock head for one sized to this task, keeping its input width.
        backbone.classifier = nn.Linear(backbone.classifier.in_features, CFG.target_size)
        self.model = backbone

    def forward(self, x):
        return self.model(x)

In [16]:
# model = CustomEfficientNetB0(model_name=CFG.model_name, pretrained=False)
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, 
#                           num_workers=4, pin_memory=True, drop_last=True)

# for image, label in train_loader:
#     print(image.size())
#     output = model(image)
#     print(output)
#     break

## Loss Functions

In [17]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a smoothed target distribution.

    The true class receives ``1 - smoothing``; the remaining mass is spread
    evenly over the other ``classes - 1`` classes.
    """

    def __init__(self, classes=5, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the batch mean of ``-sum(true_dist * log_softmax(pred))``."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            true_dist = torch.full_like(log_probs, off_value)
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_sample)

In [18]:
class FocalLoss(nn.Module):
    """Focal loss on top of softmax cross-entropy.

    Each sample's cross-entropy ``ce`` is scaled by ``alpha * (1 - exp(-ce)) ** gamma``.

    Args:
        alpha: constant multiplier of the loss.
        gamma: focusing exponent.
        reduce: if True return the batch mean, otherwise the per-sample losses.
    """
    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        # The modulating factor must be computed per sample, so the cross-entropy
        # is kept unreduced here; reducing first would collapse it to one scalar.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-ce_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [19]:
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss against the one-hot target plus a weighted focal term.

    Args:
        alpha: constant multiplier of the focal term.
        gamma: focusing exponent of the focal term.
        xent: weight of the focal term in the final sum.
    """
    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        # "Similar pair" flag for the cosine loss. Kept on the CPU and moved to the
        # input's device in forward, so the module also works without a GPU.
        self.y = torch.ones(1)

    def forward(self, input, target, reduction="mean"):
        """Return ``cosine_loss + xent * focal_loss``.

        Args:
            input: logits with the class dimension last.
            target: integer class labels.
            reduction: forwarded to the cosine loss; the focal term is averaged
                only when it equals "mean".
        """
        # Cast the one-hot labels to the logits' dtype so both cosine inputs match.
        onehot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        same_flag = self.y.to(device=input.device, dtype=input.dtype).expand(input.size(0))
        cosine_loss = F.cosine_embedding_loss(input, onehot, same_flag, reduction=reduction)

        # reduction='none' is the supported spelling of the deprecated reduce=False.
        cent_loss = F.cross_entropy(F.normalize(input), target, reduction='none')
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [20]:
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum of cross-entropy and a second, clamped log-probability term.

    Args:
        alpha: weight of the standard cross-entropy term.
        beta: weight of the second term.
        num_classes: number of classes used for the one-hot encoding.
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=5):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Return ``alpha * ce + beta * rce`` with the requested reduction.

        Args:
            logits: tensor of shape (batch, num_classes).
            targets: integer class labels of shape (batch,).
            reduction: 'mean', 'sum', or anything else to keep the second term per-sample.
        """
        # Build the one-hot labels on the targets' own device instead of forcing CUDA,
        # so the loss runs wherever the inputs live.
        onehot_targets = F.one_hot(targets, self.num_classes).to(logits.dtype)
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        # NOTE(review): this term takes the log of the clamped predictions, which makes it
        # essentially the same quantity as ce_loss; symmetric CE is usually written with the
        # log of the clamped labels instead. Left numerically unchanged - confirm intent.
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [21]:
def log_t(u, t):
    """Tempered logarithm of tensor `u`: ``(u**(1-t) - 1) / (1-t)``, or ``log(u)`` when t == 1."""
    if t==1.0:
        return u.log()
    one_minus_t = 1.0 - t
    return (u.pow(one_minus_t) - 1.0) / one_minus_t

def exp_t(u, t):
    """Tempered exponential of tensor `u`: ``relu(1 + (1-t)*u) ** (1/(1-t))``, or ``exp(u)`` when t == 1."""
    if t==1:
        return u.exp()
    one_minus_t = 1.0 - t
    # relu clamps the base at zero before the power is taken.
    base = (1.0 + one_minus_t*u).relu()
    return base.pow(1.0 / one_minus_t)

def compute_normalization_fixed_point(activations, t, num_iters):

    """Returns the normalization value for each example (t > 1.0).

    Runs `num_iters` fixed-point updates on the max-shifted activations and
    converts the resulting partition sum into a normalization constant.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same shape as activation with the last dimension being 1.
    """
    # Shift by the per-example maximum so the largest activation becomes 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations_step_0 = activations - mu

    normalized_activations = normalized_activations_step_0

    # Each iteration rescales the *initial* shifted activations by the current
    # partition sum raised to (1 - t).
    for _ in range(num_iters):
        logt_partition = torch.sum(
                exp_t(normalized_activations, t), -1, keepdim=True)
        normalized_activations = normalized_activations_step_0 * \
                logt_partition.pow(1.0-t)

    # Final partition sum, mapped through log_t and shifted back by the maximum.
    logt_partition = torch.sum(
            exp_t(normalized_activations, t), -1, keepdim=True)
    normalization_constants = - log_t(1.0 / logt_partition, t) + mu

    return normalization_constants

def compute_normalization_binary_search(activations, t, num_iters):

    """Returns the normalization value for each example (t < 1.0).

    Bisects, for `num_iters` steps, on the value that is subtracted from the
    max-shifted activations, steering the summed tempered exponentials towards 1.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    # Shift by the per-example maximum so the largest activation becomes 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    # Number of classes per example whose shifted activation lies above -1/(1-t).
    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    # Initial search interval: [0, -log_t(1/effective_dim)].
    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        # update == 1 where the probabilities sum to less than 1: the midpoint
        # becomes the new upper bound there, and the new lower bound elsewhere.
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    # Midpoint of the final interval, shifted back by the maximum.
    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Class implementing custom backward pass for compute_normalization. See compute_normalization.

    The forward pass runs one of the two iterative solvers; the backward pass
    uses a closed-form gradient instead of differentiating through the iterations.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        # t < 1.0 uses the bisection solver, every other t the fixed-point solver.
        if t < 1.0:
            normalization_constants = compute_normalization_binary_search(activations, t, num_iters)
        else:
            normalization_constants = compute_normalization_fixed_point(activations, t, num_iters)

        # Keep what backward needs: both tensors, plus the (non-tensor) temperature on ctx.
        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        t = ctx.t
        normalized_activations = activations - normalization_constants 
        probabilities = exp_t(normalized_activations, t)
        # Gradient w.r.t. the activations: probabilities**t renormalized over the
        # class dimension, scaled by the incoming gradient.
        escorts = probabilities.pow(t)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)
        grad_input = escorts * grad_output
        
        # One entry per forward argument; `t` and `num_iters` receive no gradient.
        return grad_input, None, None

def compute_normalization(activations, t, num_iters=5):
    """Returns the normalization value for each example. 
    Backward pass is implemented.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    # Go through the autograd Function so its custom backward is used.
    return ComputeNormalization.apply(activations, t, num_iters)

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature, forwarded to `tempered_softmax`.
      num_iters: Number of iterations to run the method.
    Returns:
      The probability of the positive class, with the same shape as `activations`.
    """
    # Pair every activation with a zero activation for the negative class.
    negative_class = torch.zeros_like(activations)
    two_class = torch.stack([activations, negative_class], dim=-1)
    return tempered_softmax(two_class, t, num_iters)[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature; exactly 1.0 falls back to the ordinary softmax.
      num_iters: Number of iterations used to compute the normalization.
    Returns:
      A probabilities tensor with the same shape as `activations`.
    """
    if t != 1.0:
        shift = compute_normalization(activations, t, num_iters)
        return exp_t(activations - shift, t)
    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss, expressed as the two-class multiclass loss.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
      reduction: Forwarded to `bi_tempered_logistic_loss`.
    Returns:
      A loss tensor.
    """
    # Class 0 gets a zero activation and the complementary label probability.
    class1_labels = labels.to(activations.dtype)
    two_class_activations = torch.stack(
        [activations, torch.zeros_like(activations)], dim=-1)
    two_class_labels = torch.stack(
        [class1_labels, 1.0 - class1_labels], dim=-1)
    return bi_tempered_logistic_loss(
        two_class_activations,
        two_class_labels,
        t1,
        t2,
        label_smoothing = label_smoothing,
        num_iters = num_iters,
        reduction = reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over all examples; returns a scalar tensor.
        ``'sum'``: Loss is summed over all examples; returns a scalar tensor.
    Returns:
      A loss tensor.
    Raises:
      ValueError: if `reduction` is not one of the three supported values.
    """

    # Reject a misspelled reduction up front instead of silently returning None.
    if reduction not in ('none', 'mean', 'sum'):
        raise ValueError(
            "reduction must be 'none', 'mean' or 'sum', got {!r}".format(reduction))

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        # The class axis is the last one, so scatter along -1; this matches the
        # previous dim=1 for 2-D input and stays correct for higher ranks.
        labels_onehot.scatter_(-1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The small constant keeps log_t away from zero-valued labels.
    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    return loss_values.mean()

In [22]:
class BiTemperedLogisticLoss(nn.Module):
    """``nn.Module`` wrapper around ``bi_tempered_logistic_loss`` that returns the batch mean.

    Args:
        t1: first temperature, forwarded to the loss function.
        t2: second temperature, forwarded to the loss function.
        smoothing: label-smoothing amount, forwarded as ``label_smoothing``.
    """

    def __init__(self, t1, t2, smoothing=0.0):
        super().__init__()
        self.t1, self.t2 = t1, t2
        self.smoothing = smoothing

    def forward(self, logit_label, truth_label):
        # Request per-example losses and average them here.
        per_example = bi_tempered_logistic_loss(
            logit_label,
            truth_label,
            t1=self.t1,
            t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none',
        )
        return per_example.mean()

In [23]:
class AverageMeter(object):
    """Tracks the latest value together with a running weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighted as `n` observations."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Report the elapsed time and an estimate of the time remaining.

    Parameters
    ----------
    since : float
        Timestamp (as returned by ``time.time()``) at which the work started.
    percent : float
        Fraction of the work completed so far.

    Returns
    -------
    str
        '<elapsed> (remain <remaining>)', both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the completed fraction.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

def train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, shechduler, device):
    """Train for one epoch with a two-step (first_step / second_step) optimizer.

    Every batch is forwarded twice: the first backward pass feeds
    ``optimizer.first_step`` and the second one feeds ``optimizer.second_step``.
    With probability 0.5 a batch is passed through ``mixup`` first, in which case
    every criterion is evaluated against both label sets and blended with the
    mixing weight.

    Args:
        train_loader: iterable yielding ``(images, labels)`` batches.
        model: network to train; it is switched to train mode.
        loss_train: criterion that is back-propagated.
        loss_metric: criterion that is only recorded for reporting.
        optimizer: optimizer exposing ``first_step`` and ``second_step``.
        epoch: zero-based epoch index, used in the progress output only.
        shechduler: not used inside this function (name kept as-is so that
            keyword callers keep working).
        device: device the batches are moved to.

    Returns:
        The sample-weighted average of ``loss_metric`` over the epoch.
    """
    def _apply_criterion(criterion, preds, targets, mixed):
        """Evaluate `criterion`, blending both label sets when the batch was mixed."""
        if mixed:
            targets_a, targets_b, lam = targets
            return criterion(preds, targets_a) * lam + criterion(preds, targets_b) * (1. - lam)
        return criterion(preds, targets)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for step, (images, labels) in pbar:
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # mixup is applied to roughly half of the batches
        use_mixup = np.random.rand() < 0.5
        if use_mixup:
            images, labels = mixup(images, labels, alpha=1.)
        # Cast once so both forward passes see exactly the same input.
        images = images.float()

        # ---- first pass: gradient consumed by optimizer.first_step ----
        y_preds = model(images)
        metric = _apply_criterion(loss_metric, y_preds, labels, use_mixup)
        loss = _apply_criterion(loss_train, y_preds, labels, use_mixup)
        # record loss
        losses.update(metric.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        if CFG.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        # NOTE(review): first_step is gated by gradient_accumulation_steps while the second
        # pass below runs on every batch, so the two halves only stay in sync for the
        # default value of 1 - confirm before enabling accumulation.
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            optimizer.first_step(zero_grad=True)

        # ---- second pass: gradient consumed by optimizer.second_step ----
        second_loss = _apply_criterion(loss_train, model(images), labels, use_mixup)
        second_loss.backward()
        optimizer.second_step(zero_grad=True)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            # Fields are separated by a trailing space, as in valid_fn's progress line.
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses,
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          grad_norm=grad_norm))
    return losses.avg

def valid_fn(valid_loader, model, loss_metric, device):
    """Evaluate the model on the validation loader.

    Args:
        valid_loader: iterable yielding ``(images, labels)`` batches.
        model: network to evaluate; it is switched to eval mode.
        loss_metric: criterion whose value is averaged and reported.
        device: device the batches are moved to.

    Returns:
        ``(average_loss, predictions)`` where ``predictions`` is the NumPy array of
        softmax probabilities for all batches, concatenated in loader order.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute predictions and loss; nothing here needs a graph
        with torch.no_grad():
            y_preds = model(images)
            loss = loss_metric(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # keep class probabilities for scoring by the caller
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))

    predictions = np.concatenate(preds)
    return losses.avg, predictions

def inference(model, states, test_loader, device):
    """Predict class probabilities for the test set, averaged over several checkpoints.

    Args:
        model: network into which each checkpoint is loaded in turn.
        states: iterable of checkpoints; each must hold the weights under the 'model' key.
        test_loader: iterable yielding image batches (no labels).
        device: device used for the model and the batches.

    Returns:
        NumPy array of softmax probabilities, averaged across checkpoints and
        concatenated over all batches.
    """
    model.to(device)
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    probs = []
    for i, (images) in tk0:
        images = images.to(device)
        # One entry per checkpoint for the current batch. (Previously the list was
        # created under a different name, so the append below raised on the first batch.)
        avg_preds = []
        for state in states:
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            avg_preds.append(y_preds.softmax(1).to('cpu').numpy())
        avg_preds = np.mean(avg_preds, axis=0)
        probs.append(avg_preds)
    probs = np.concatenate(probs)
    return probs

## Train loop

In [24]:
# ======================================================
# Train loop
# ======================================================

def train_loop(folds, fold):
    """Train and validate the model on one cross-validation fold.

    Rows of ``folds`` whose ``'fold'`` column equals ``fold`` are used for
    validation; all other rows are used for training. During the first epoch
    only parameters whose name contains ``'classifier'`` are trainable; from
    the second epoch on every parameter is trained with a freshly built
    optimizer (``CFG.lr_2``) and scheduler.

    A checkpoint is written after every epoch, and a separate "best"
    checkpoint (weights plus validation predictions) is written whenever the
    validation score improves.

    Args:
        folds: DataFrame with a ``'fold'`` column and the ``CFG.target_col``
            label column.
        fold: index of the fold held out for validation.

    Returns:
        The validation rows with one probability column per class (named
        ``'0'`` .. ``str(CFG.target_size - 1)``) and a ``'preds'`` column
        holding the argmax class, both taken from the best checkpoint.

    Raises:
        ValueError: if ``CFG.scheduler`` or ``CFG.loss_train`` is not one of
            the supported names.
    """
    seed_torch(seed=CFG.seed)

    LOGGER.info(f'========== fold: {fold} training ============')

    # ======================================================
    # loader
    # ======================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ===============================================
    # scheduler
    # ===============================================
    def get_scheduler(optimizer):
        """Build the LR scheduler named by ``CFG.scheduler`` for ``optimizer``.

        NOTE: the ReduceLROnPlateau branch reads CFG.factor / CFG.patience /
        CFG.eps and the CosineAnnealingLR branch reads CFG.T_max; the CFG
        class at the top of the notebook defines none of these, so they must
        be added before selecting those schedulers.
        """
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f'Unsupported CFG.scheduler: {CFG.scheduler!r}')
        return scheduler

    # ===============================================
    # model & optimizer
    # ===============================================
    model = CustomEfficientNetB0(CFG.model_name, pretrained=True)

    # Freeze everything except the classifier layer for the first epoch.
    for name, param in model.model.named_parameters():
        if 'classifier' not in name:
            param.requires_grad = False

    model.to(device)

    base_optimizer = Adam
    optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_1, weight_decay=CFG.weight_decay, amsgrad=False)

    scheduler = get_scheduler(optimizer)

    # ===============================================
    # apex
    # ===============================================
    if CFG.apex:
        # amp.initialize returns the (model, optimizer) pair; unpack it
        # rather than storing the tuple as an attribute on the model.
        # NOTE(review): the optimizer rebuilt at epoch 1 below is not passed
        # through amp.initialize again - confirm before enabling CFG.apex.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ===============================================
    # loop
    # ===============================================
    def get_loss_train():
        """Build the training criterion named by ``CFG.loss_train``."""
        if CFG.loss_train == 'CrossEntropyLoss':
            loss_train = nn.CrossEntropyLoss()
        elif CFG.loss_train == 'LabelSmoothing':
            loss_train = LabelSmoothingLoss(classes=CFG.target_size, smoothing=CFG.smooth)
        elif CFG.loss_train == 'FocalLoss':
            loss_train = FocalLoss().to(device)
        elif CFG.loss_train == 'FocalCosineLoss':
            loss_train = FocalCosineLoss()
        elif CFG.loss_train == 'SymmetricCrossEntropyLoss':
            loss_train = SymmetricCrossEntropy().to(device)
        elif CFG.loss_train == 'BiTemperedLoss':
            loss_train = BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smooth)
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f'Unsupported CFG.loss_train: {CFG.loss_train!r}')
        return loss_train

    loss_train = get_loss_train()
    LOGGER.info(f'loss_train: {loss_train}')
    # Plain cross-entropy is used as the reported metric regardless of the
    # training criterion chosen above.
    loss_metric = nn.CrossEntropyLoss()

    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        if epoch == 1:

            # Unfreeze all weights from the second epoch on.
            for param in model.model.parameters():
                param.requires_grad = True

            # Rebuild optimizer and scheduler with the lower learning rate
            # (CFG.lr_1 -> CFG.lr_2).
            base_optimizer = Adam
            optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_2, weight_decay=CFG.weight_decay, amsgrad=False)
            scheduler = get_scheduler(optimizer)

            LOGGER.info('requires_grad of all parameters are unlocked')

        # train
        avg_loss = train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, loss_metric, device)
        valid_labels = valid_folds[CFG.target_col].values

        # ReduceLROnPlateau needs the monitored value; the cosine schedulers
        # are stepped once per epoch without arguments.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, (CosineAnnealingLR, CosineAnnealingWarmRestarts)):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                       OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')

        # Keep every epoch's weights so any of them can be used for inference.
        torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    # Attach the best epoch's validation predictions to the held-out rows.
    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)

    return valid_folds

In [25]:
# ====================================================
# main
# ====================================================
def main():
    """Run the training and/or inference stages selected in ``CFG``.

    Relies on the notebook-level globals ``folds``, ``test`` and ``device``.

    * ``CFG.train``: calls ``train_loop`` for every fold index in
      ``range(CFG.n_fold)`` that is listed in ``CFG.trn_fold``, logs the
      per-fold and combined score, and writes the out-of-fold predictions to
      ``OUTPUT_DIR + 'oof_df.csv'``.
    * ``CFG.inference``: loads the best checkpoint of every fold in
      ``CFG.trn_fold``, averages their predictions on the test set, and
      writes ``OUTPUT_DIR + 'submission.csv'``.
    """

    def get_result(result_df):
        """Log the score of the ``'preds'`` column against the target column."""
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')

    if CFG.train:
        # train
        # Collect per-fold frames and concatenate once instead of growing a
        # DataFrame inside the loop.
        oof_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(folds, fold)
                oof_frames.append(_oof_df)
                LOGGER.info(f'=============== fold: {fold} result ================')
                get_result(_oof_df)
        oof_df = pd.concat(oof_frames) if oof_frames else pd.DataFrame()

        # CV result
        LOGGER.info(f'============ CV ============')
        get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)

    if CFG.inference:
        # inference
        # Same class name as the one instantiated in train_loop.
        model = CustomEfficientNetB0(CFG.model_name, pretrained=False)
        states = [torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth') for fold in CFG.trn_fold]
        # Batching options belong to the DataLoader, not the dataset.
        # Assumes TestDataset accepts (df, transform=...) like TrainDataset
        # does - TODO confirm against its definition.
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = DataLoader(test_dataset,
                                 batch_size=CFG.batch_size,
                                 shuffle=False,
                                 num_workers=CFG.num_workers,
                                 pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        test['label'] = predictions.argmax(1)
        test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

In [26]:
# Record the selected compute device in the log before the run starts.
LOGGER.info(f'used device: {device}')

used device: cuda


In [27]:
# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_ra-3dd342df.pth
loss_train: BiTemperedLogisticLoss()


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [1][0/659]Data 2.603 (2.603)Elapsed 0m 4s (remain 47m 34s)Loss: 1.5831(1.5831)Grad: 0.9550  
Epoch: [1][100/659]Data 0.787 (0.294)Elapsed 0m 53s (remain 4m 57s)Loss: 1.3316(1.4254)Grad: 0.3693  
Epoch: [1][200/659]Data 0.471 (0.292)Elapsed 1m 45s (remain 4m 1s)Loss: 1.0869(1.3579)Grad: 0.3759  
Epoch: [1][300/659]Data 0.903 (0.293)Elapsed 2m 37s (remain 3m 7s)Loss: 2.2433(1.2880)Grad: 0.3767  
Epoch: [1][400/659]Data 1.090 (0.292)Elapsed 3m 28s (remain 2m 14s)Loss: 1.2840(1.2646)Grad: 0.2632  
Epoch: [1][500/659]Data 0.484 (0.290)Elapsed 4m 18s (remain 1m 21s)Loss: 0.9399(1.2463)Grad: 0.4566  
Epoch: [1][600/659]Data 0.550 (0.288)Elapsed 5m 8s (remain 0m 29s)Loss: 1.2310(1.2438)Grad: 0.2741  
Epoch: [1][658/659]Data 0.001 (0.286)Elapsed 5m 36s (remain 0m 0s)Loss: 0.7623(1.2375)Grad: 0.3514  

EVAL: [0/165] Data 1.733 (1.733) Elapsed 0m 1s (remain 4m 57s) Loss: 1.1436(1.1436) 
EVAL: [100/165] Data 1.124 (0.320) Elapsed 0m 40s (remain 0m 25s) Loss: 0.9267(0.9245) 


Epoch 1 - avg_train_loss: 1.2375 avg_val_loss: 0.9403 time: 400s
Epoch 1 - Accuracy: 0.6566059225512528
Epoch 1 - Save Best Score: 0.6566 Model
requires_grad of all parameters are unlocked


EVAL: [164/165] Data 0.243 (0.305) Elapsed 1m 3s (remain 0m 0s) Loss: 0.7440(0.9403) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [2][0/659]Data 1.996 (1.996)Elapsed 0m 2s (remain 32m 20s)Loss: 0.9463(0.9463)Grad: 2.1587  
Epoch: [2][100/659]Data 0.004 (0.024)Elapsed 1m 11s (remain 6m 36s)Loss: 0.2865(1.1558)Grad: 1.4016  
Epoch: [2][200/659]Data 0.005 (0.014)Elapsed 2m 19s (remain 5m 18s)Loss: 0.4702(1.1186)Grad: 1.5468  
Epoch: [2][300/659]Data 0.007 (0.011)Elapsed 3m 27s (remain 4m 7s)Loss: 2.0150(1.1447)Grad: 1.5658  
Epoch: [2][400/659]Data 0.005 (0.009)Elapsed 4m 36s (remain 2m 58s)Loss: 0.8143(1.1417)Grad: 1.5977  
Epoch: [2][500/659]Data 0.003 (0.008)Elapsed 5m 45s (remain 1m 48s)Loss: 1.9754(1.1257)Grad: 1.0757  
Epoch: [2][600/659]Data 0.005 (0.008)Elapsed 6m 53s (remain 0m 39s)Loss: 0.4672(1.1215)Grad: 1.3300  
Epoch: [2][658/659]Data 0.001 (0.007)Elapsed 7m 32s (remain 0m 0s)Loss: 2.4811(1.1332)Grad: 1.7846  

EVAL: [0/165] Data 1.737 (1.737) Elapsed 0m 1s (remain 4m 57s) Loss: 0.7050(0.7050) 
EVAL: [100/165] Data 0.677 (0.294) Elapsed 0m 37s (remain 0m 24s) Loss: 0.5750(0.5642) 


Epoch 2 - avg_train_loss: 1.1332 avg_val_loss: 0.5701 time: 512s
Epoch 2 - Accuracy: 0.8164388762338648
Epoch 2 - Save Best Score: 0.8164 Model


EVAL: [164/165] Data 0.000 (0.276) Elapsed 0m 58s (remain 0m 0s) Loss: 0.2015(0.5701) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [3][0/659]Data 2.752 (2.752)Elapsed 0m 3s (remain 39m 59s)Loss: 0.2664(0.2664)Grad: 1.0684  
Epoch: [3][100/659]Data 0.009 (0.032)Elapsed 1m 12s (remain 6m 39s)Loss: 2.0091(1.0809)Grad: 1.0017  
Epoch: [3][200/659]Data 0.002 (0.018)Elapsed 2m 20s (remain 5m 19s)Loss: 0.3034(1.0422)Grad: 0.9265  
Epoch: [3][300/659]Data 0.003 (0.014)Elapsed 3m 28s (remain 4m 7s)Loss: 2.6115(1.0736)Grad: 1.3272  
Epoch: [3][400/659]Data 0.005 (0.011)Elapsed 4m 35s (remain 2m 57s)Loss: 1.2922(1.0539)Grad: 1.4521  
Epoch: [3][500/659]Data 0.002 (0.010)Elapsed 5m 43s (remain 1m 48s)Loss: 1.3977(1.0644)Grad: 1.1652  
Epoch: [3][600/659]Data 0.003 (0.009)Elapsed 6m 51s (remain 0m 39s)Loss: 0.4065(1.0579)Grad: 1.1057  
Epoch: [3][658/659]Data 0.001 (0.008)Elapsed 7m 30s (remain 0m 0s)Loss: 0.7538(1.0551)Grad: 2.1268  

EVAL: [0/165] Data 2.226 (2.226) Elapsed 0m 2s (remain 6m 17s) Loss: 0.6555(0.6555) 
EVAL: [100/165] Data 0.937 (0.301) Elapsed 0m 38s (remain 0m 24s) Loss: 0.5129(0.4880) 


Epoch 3 - avg_train_loss: 1.0551 avg_val_loss: 0.4759 time: 510s
Epoch 3 - Accuracy: 0.8507972665148064
Epoch 3 - Save Best Score: 0.8508 Model


EVAL: [164/165] Data 0.000 (0.279) Elapsed 0m 59s (remain 0m 0s) Loss: 0.1994(0.4759) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [4][0/659]Data 2.304 (2.304)Elapsed 0m 3s (remain 34m 26s)Loss: 1.7000(1.7000)Grad: 1.0782  
Epoch: [4][100/659]Data 0.002 (0.027)Elapsed 1m 11s (remain 6m 36s)Loss: 1.3391(1.0491)Grad: 1.3201  
Epoch: [4][200/659]Data 0.002 (0.015)Elapsed 2m 19s (remain 5m 18s)Loss: 0.5059(0.9963)Grad: 1.1340  
Epoch: [4][300/659]Data 0.003 (0.012)Elapsed 3m 27s (remain 4m 6s)Loss: 1.1708(1.0004)Grad: 0.8691  
Epoch: [4][400/659]Data 0.008 (0.010)Elapsed 4m 35s (remain 2m 57s)Loss: 0.3914(1.0035)Grad: 1.2694  
Epoch: [4][500/659]Data 0.002 (0.009)Elapsed 5m 43s (remain 1m 48s)Loss: 2.2989(1.0204)Grad: 0.7723  
Epoch: [4][600/659]Data 0.004 (0.008)Elapsed 6m 51s (remain 0m 39s)Loss: 2.0086(1.0114)Grad: 1.0438  
Epoch: [4][658/659]Data 0.001 (0.008)Elapsed 7m 30s (remain 0m 0s)Loss: 0.6604(1.0248)Grad: 1.5842  

EVAL: [0/165] Data 1.850 (1.850) Elapsed 0m 1s (remain 5m 18s) Loss: 0.5774(0.5774) 
EVAL: [100/165] Data 0.559 (0.293) Elapsed 0m 37s (remain 0m 24s) Loss: 0.4448(0.4575) 


Epoch 4 - avg_train_loss: 1.0248 avg_val_loss: 0.4562 time: 510s
Epoch 4 - Accuracy: 0.8608580106302202
Epoch 4 - Save Best Score: 0.8609 Model


EVAL: [164/165] Data 0.000 (0.277) Elapsed 0m 59s (remain 0m 0s) Loss: 0.2635(0.4562) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [5][0/659]Data 2.068 (2.068)Elapsed 0m 2s (remain 31m 57s)Loss: 0.7365(0.7365)Grad: 1.3372  
Epoch: [5][100/659]Data 0.002 (0.024)Elapsed 1m 11s (remain 6m 34s)Loss: 2.2179(1.0743)Grad: 1.0995  
Epoch: [5][200/659]Data 0.010 (0.014)Elapsed 2m 19s (remain 5m 17s)Loss: 0.6211(1.0451)Grad: 1.1833  
Epoch: [5][300/659]Data 0.003 (0.011)Elapsed 3m 27s (remain 4m 6s)Loss: 1.2028(1.0418)Grad: 0.8512  
Epoch: [5][400/659]Data 0.002 (0.009)Elapsed 4m 35s (remain 2m 57s)Loss: 1.4499(1.0407)Grad: 1.3969  
Epoch: [5][500/659]Data 0.005 (0.008)Elapsed 5m 43s (remain 1m 48s)Loss: 0.9421(1.0290)Grad: 0.7889  
Epoch: [5][600/659]Data 0.002 (0.008)Elapsed 6m 51s (remain 0m 39s)Loss: 1.3521(1.0108)Grad: 1.0219  
Epoch: [5][658/659]Data 0.001 (0.007)Elapsed 7m 30s (remain 0m 0s)Loss: 1.7708(1.0107)Grad: 1.0427  

EVAL: [0/165] Data 1.500 (1.500) Elapsed 0m 1s (remain 4m 28s) Loss: 0.7087(0.7087) 
EVAL: [100/165] Data 0.564 (0.287) Elapsed 0m 37s (remain 0m 23s) Loss: 0.6215(0.5450) 


Epoch 5 - avg_train_loss: 1.0107 avg_val_loss: 0.5317 time: 510s
Epoch 5 - Accuracy: 0.8426347760060744


EVAL: [164/165] Data 0.000 (0.279) Elapsed 0m 59s (remain 0m 0s) Loss: 0.2434(0.5317) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [6][0/659]Data 2.214 (2.214)Elapsed 0m 3s (remain 34m 47s)Loss: 1.2397(1.2397)Grad: 0.8794  
Epoch: [6][100/659]Data 0.003 (0.026)Elapsed 1m 11s (remain 6m 37s)Loss: 1.5404(0.9036)Grad: 1.1308  
Epoch: [6][200/659]Data 0.002 (0.015)Elapsed 2m 19s (remain 5m 18s)Loss: 0.5175(0.9033)Grad: 1.0337  
Epoch: [6][300/659]Data 0.003 (0.011)Elapsed 3m 27s (remain 4m 7s)Loss: 1.8321(0.9340)Grad: 0.8797  
Epoch: [6][400/659]Data 0.003 (0.010)Elapsed 4m 35s (remain 2m 57s)Loss: 0.4842(0.9482)Grad: 1.1445  
Epoch: [6][500/659]Data 0.008 (0.008)Elapsed 5m 44s (remain 1m 48s)Loss: 1.8564(0.9515)Grad: 0.7889  
Epoch: [6][600/659]Data 0.003 (0.008)Elapsed 6m 52s (remain 0m 39s)Loss: 0.6110(0.9537)Grad: 0.7905  
Epoch: [6][658/659]Data 0.002 (0.007)Elapsed 7m 31s (remain 0m 0s)Loss: 0.7530(0.9639)Grad: 2.4114  

EVAL: [0/165] Data 1.424 (1.424) Elapsed 0m 1s (remain 4m 10s) Loss: 0.7061(0.7061) 
EVAL: [100/165] Data 0.228 (0.292) Elapsed 0m 37s (remain 0m 24s) Loss: 0.5493(0.5043) 


Epoch 6 - avg_train_loss: 0.9639 avg_val_loss: 0.4913 time: 511s
Epoch 6 - Accuracy: 0.8568716780561884


EVAL: [164/165] Data 0.000 (0.279) Elapsed 0m 59s (remain 0m 0s) Loss: 0.1542(0.4913) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [7][0/659]Data 1.950 (1.950)Elapsed 0m 2s (remain 31m 49s)Loss: 1.6205(1.6205)Grad: 0.8937  
Epoch: [7][100/659]Data 0.003 (0.024)Elapsed 1m 11s (remain 6m 34s)Loss: 0.5235(0.9090)Grad: 1.2708  
Epoch: [7][200/659]Data 0.007 (0.014)Elapsed 2m 19s (remain 5m 17s)Loss: 0.3829(0.9495)Grad: 1.2452  
Epoch: [7][300/659]Data 0.003 (0.011)Elapsed 3m 27s (remain 4m 6s)Loss: 1.1524(0.9809)Grad: 0.8587  
Epoch: [7][400/659]Data 0.002 (0.009)Elapsed 4m 35s (remain 2m 57s)Loss: 0.7946(0.9693)Grad: 1.3393  
Epoch: [7][500/659]Data 0.003 (0.008)Elapsed 5m 42s (remain 1m 48s)Loss: 0.7901(0.9629)Grad: 1.0432  
Epoch: [7][600/659]Data 0.003 (0.007)Elapsed 6m 50s (remain 0m 39s)Loss: 0.5140(0.9610)Grad: 1.1956  
Epoch: [7][658/659]Data 0.002 (0.007)Elapsed 7m 29s (remain 0m 0s)Loss: 1.1185(0.9580)Grad: 2.1851  

EVAL: [0/165] Data 1.473 (1.473) Elapsed 0m 1s (remain 4m 22s) Loss: 0.6507(0.6507) 
EVAL: [100/165] Data 1.096 (0.289) Elapsed 0m 37s (remain 0m 23s) Loss: 0.4394(0.4485) 


Epoch 7 - avg_train_loss: 0.9580 avg_val_loss: 0.4436 time: 509s
Epoch 7 - Accuracy: 0.8712984054669703
Epoch 7 - Save Best Score: 0.8713 Model


EVAL: [164/165] Data 0.053 (0.276) Elapsed 0m 59s (remain 0m 0s) Loss: 0.2870(0.4436) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [8][0/659]Data 2.087 (2.087)Elapsed 0m 2s (remain 31m 56s)Loss: 0.5422(0.5422)Grad: 0.9286  
Epoch: [8][100/659]Data 0.003 (0.026)Elapsed 1m 11s (remain 6m 34s)Loss: 0.3826(0.9403)Grad: 0.6618  
Epoch: [8][200/659]Data 0.005 (0.016)Elapsed 2m 19s (remain 5m 17s)Loss: 1.6330(0.9427)Grad: 0.8824  
Epoch: [8][300/659]Data 0.003 (0.012)Elapsed 3m 27s (remain 4m 6s)Loss: 1.5132(0.9259)Grad: 0.7896  
Epoch: [8][400/659]Data 0.002 (0.010)Elapsed 4m 35s (remain 2m 57s)Loss: 0.2784(0.9382)Grad: 1.1939  
Epoch: [8][500/659]Data 0.003 (0.009)Elapsed 5m 43s (remain 1m 48s)Loss: 0.8522(0.9246)Grad: 1.0810  
Epoch: [8][600/659]Data 0.005 (0.008)Elapsed 6m 51s (remain 0m 39s)Loss: 1.0470(0.9258)Grad: 0.8327  
Epoch: [8][658/659]Data 0.002 (0.008)Elapsed 7m 30s (remain 0m 0s)Loss: 1.0789(0.9373)Grad: 1.5619  

EVAL: [0/165] Data 1.563 (1.563) Elapsed 0m 1s (remain 4m 35s) Loss: 0.5893(0.5893) 
EVAL: [100/165] Data 0.381 (0.285) Elapsed 0m 37s (remain 0m 23s) Loss: 0.4918(0.4747) 


Epoch 8 - avg_train_loss: 0.9373 avg_val_loss: 0.4671 time: 510s
Epoch 8 - Accuracy: 0.8549734244495064


EVAL: [164/165] Data 0.000 (0.275) Elapsed 0m 59s (remain 0m 0s) Loss: 0.2564(0.4671) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [9][0/659]Data 2.119 (2.119)Elapsed 0m 3s (remain 34m 14s)Loss: 0.5021(0.5021)Grad: 1.1397  
Epoch: [9][100/659]Data 0.005 (0.025)Elapsed 1m 11s (remain 6m 33s)Loss: 0.3427(0.8590)Grad: 0.8760  
Epoch: [9][200/659]Data 0.002 (0.015)Elapsed 2m 19s (remain 5m 17s)Loss: 1.1844(0.8937)Grad: 0.8171  
Epoch: [9][300/659]Data 0.003 (0.012)Elapsed 3m 27s (remain 4m 6s)Loss: 1.1318(0.8668)Grad: 0.9010  
Epoch: [9][400/659]Data 0.002 (0.010)Elapsed 4m 34s (remain 2m 56s)Loss: 0.5778(0.8635)Grad: 1.2479  
Epoch: [9][500/659]Data 0.002 (0.009)Elapsed 5m 42s (remain 1m 48s)Loss: 1.5142(0.8642)Grad: 0.9688  
Epoch: [9][600/659]Data 0.005 (0.008)Elapsed 6m 51s (remain 0m 39s)Loss: 0.3557(0.8635)Grad: 0.8695  
Epoch: [9][658/659]Data 0.002 (0.008)Elapsed 7m 30s (remain 0m 0s)Loss: 0.0784(0.8637)Grad: 0.7634  

EVAL: [0/165] Data 1.625 (1.625) Elapsed 0m 1s (remain 4m 45s) Loss: 0.5841(0.5841) 
EVAL: [100/165] Data 1.114 (0.301) Elapsed 0m 38s (remain 0m 24s) Loss: 0.4637(0.4584) 


Epoch 9 - avg_train_loss: 0.8637 avg_val_loss: 0.4553 time: 510s
Epoch 9 - Accuracy: 0.8665527714502658


EVAL: [164/165] Data 0.014 (0.279) Elapsed 0m 59s (remain 0m 0s) Loss: 0.2240(0.4553) 


HBox(children=(FloatProgress(value=0.0, max=659.0), HTML(value='')))

Epoch: [10][0/659]Data 1.963 (1.963)Elapsed 0m 2s (remain 30m 13s)Loss: 0.4174(0.4174)Grad: 0.9059  
Epoch: [10][100/659]Data 0.003 (0.024)Elapsed 1m 11s (remain 6m 33s)Loss: 1.1002(0.9426)Grad: 0.7290  
Epoch: [10][200/659]Data 0.009 (0.014)Elapsed 2m 19s (remain 5m 17s)Loss: 2.1512(0.9465)Grad: 0.7559  
Epoch: [10][300/659]Data 0.015 (0.011)Elapsed 3m 27s (remain 4m 7s)Loss: 1.7537(0.9351)Grad: 0.9613  
Epoch: [10][400/659]Data 0.006 (0.009)Elapsed 4m 35s (remain 2m 57s)Loss: 1.1818(0.9103)Grad: 0.9723  
Epoch: [10][500/659]Data 0.002 (0.008)Elapsed 5m 44s (remain 1m 48s)Loss: 1.3227(0.9039)Grad: 0.7723  
Epoch: [10][600/659]Data 0.002 (0.007)Elapsed 6m 52s (remain 0m 39s)Loss: 0.9613(0.9028)Grad: 0.6797  
Epoch: [10][658/659]Data 0.001 (0.007)Elapsed 7m 31s (remain 0m 0s)Loss: 0.0711(0.9103)Grad: 1.1961  

EVAL: [0/165] Data 1.494 (1.494) Elapsed 0m 1s (remain 4m 22s) Loss: 0.5600(0.5600) 
EVAL: [100/165] Data 0.875 (0.288) Elapsed 0m 37s (remain 0m 23s) Loss: 0.4939(0.4665) 


Epoch 10 - avg_train_loss: 0.9103 avg_val_loss: 0.4596 time: 511s
Epoch 10 - Accuracy: 0.8610478359908884
Score: 0.87130
Score: 0.87130


EVAL: [164/165] Data 0.000 (0.274) Elapsed 0m 59s (remain 0m 0s) Loss: 0.2349(0.4596) 
