In [1]:
# !pip install pytorch_ranger

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [3]:
# List the competition input directory to confirm which files and folders are available.
os.listdir('../input/cassava-leaf-disease-classification/')

['train_tfrecords',
 'sample_submission.csv',
 'test_tfrecords',
 'label_num_to_disease_map.json',
 'train_images',
 'train.csv',
 'test_images']

In [4]:
# Training labels are read from the "merged" dataset CSV; `test` is the competition's sample submission.
train = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
test = pd.read_csv('../input/cassava-leaf-disease-classification//sample_submission.csv')
# Numeric label -> disease name, one row per label (orient='index').
label_map = pd.read_json('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json', orient='index')

# Preview each frame so the column layout is visible in the notebook output.
display(train.head())
display(test.head())
display(label_map)

Unnamed: 0,image_id,label,source
0,1000015157.jpg,0,2020
1,1000201771.jpg,3,2020
2,100042118.jpg,1,2020
3,1000723321.jpg,1,2020
4,1000812911.jpg,3,2020


Unnamed: 0,image_id,label
0,2216849948.jpg,4


Unnamed: 0,0
0,Cassava Bacterial Blight (CBB)
1,Cassava Brown Streak Disease (CBSD)
2,Cassava Green Mottle (CGM)
3,Cassava Mosaic Disease (CMD)
4,Healthy


## Directory settings

In [5]:
# Directory that receives the log file, checkpoints and CSV outputs.
# It is concatenated directly with file names elsewhere, so it must end with a path separator.
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Image folders read by TrainDataset and TestDataset respectively.
TRAIN_PATH = '../input/cassava-leaf-disease-merged/train'
TEST_PATH = '../input/cassava-leaf-disease-classification/test_images'

## CFG

In [6]:
class CFG:
    """Central experiment configuration read by every later cell."""
    debug = False  # True -> short run on a 1000-row sample (see the override below the class)
    apex = False  # True -> import apex.amp and scale the loss through it
    print_freq = 100  # print a progress line every N batches
    num_workers = 4  # DataLoader worker processes
    model_name = 'tf_efficientnet_b3'  # model name handed to timm.create_model
    size = 470  # side length used by the crop/resize transforms
    scheduler = 'CosineAnnealingWarmRestarts'  # or 'CosineAnnealingLR' / 'ReduceLROnPlateau'
    loss_train = 'BiTemperedLoss'  # key selecting the training loss in train_loop
    epochs = 10
    T_0 = 10  # CosineAnnealingWarmRestarts: epochs until the first restart
    # The settings below are read only by the non-default schedulers.  They were
    # missing, which made selecting those schedulers fail with AttributeError.
    T_max = 10  # CosineAnnealingLR: half-period of the cosine, in epochs
    factor = 0.2  # ReduceLROnPlateau: multiplicative LR decay
    patience = 4  # ReduceLROnPlateau: epochs without improvement before decaying
    eps = 1e-6  # ReduceLROnPlateau: minimal LR change that is still applied
    lr_1 = 5e-4  # learning rate while only the classifier head is trainable
    lr_2 = 5e-5  # learning rate once every layer has been unfrozen
    t1 = 0.9  # bi-tempered loss temperature 1
    t2 = 1.5  # bi-tempered loss temperature 2
    smooth = 1e-2  # label smoothing passed to the smoothing-aware losses
    min_lr = 1e-6  # eta_min of the cosine schedulers
    batch_size = 16
    weight_decay = 1e-6
    gradient_accumulation_steps = 1
    max_grad_norm = 1000  # threshold for gradient-norm clipping
    seed = 42
    target_size = 5  # number of classes (output features of the classifier head)
    target_col = 'label'
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]  # folds that main() is allowed to train
    train = True  # run the training branch of main()
    inference = False  # run the inference branch of main()

# Debug mode: fewer epochs and a small, reproducible sample of the training frame.
if CFG.debug:
    CFG.epochs = 3
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

## Library

In [7]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

sys.path.append('../input/bi-tempered-loss-pytorch')
from bi_tempered_loss import *

# sys.path.append('../input/pytorch-optimizer')
# import torch_optimizer as optim

sys.path.append('../input/pytorch-sam')
from sam import SAM

from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# apex's amp module is only needed (and therefore only imported) when CFG.apex is enabled.
if CFG.apex:
    from apex import amp

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [8]:
def get_score(y_true, y_pred):
    """Score predicted labels against true labels with ``accuracy_score``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        Whatever ``accuracy_score(y_true, y_pred)`` returns.
    """
    score = accuracy_score(y_true, y_pred)
    return score

@contextmanager
def timer(name):
    """Context manager that logs when a named section starts and how long it ran.

    Args:
        name: Label inserted into both log lines.

    The closing line is emitted from a ``finally`` clause, so the elapsed time
    is logged even when the wrapped code raises; the exception then propagates
    unchanged.
    """
    t0 = time.time()
    LOGGER.info(f'[{name}] start')
    try:
        yield
    finally:
        LOGGER.info(f'[{name}] done in {time.time() - t0:.0f}')
    
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create the notebook logger, writing bare messages to the stream and to a file.

    Args:
        log_file: Path of the log file (defaults to ``train.log`` in ``OUTPUT_DIR``).

    Returns:
        The logger named after this module, set to INFO level with exactly one
        stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object every time this cell is re-run, so
    # handlers from earlier runs must be removed or each message is emitted
    # once per run of the cell.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# Module-wide logger used by the training and evaluation code.
LOGGER = init_logger()

def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator and exported as
            ``PYTHONHASHSEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    # Disable the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed every RNG once, up front, with the configured seed.
seed_torch(seed=CFG.seed)

## CV split

In [9]:
# Work on a copy so the fold column is not added to `train` itself.
folds = train.copy()
# Stratify on the target column so each fold keeps the same class mix.
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# Rows in the validation part of split n receive fold id n.
# NOTE(review): `.loc` is label-based while `val_index` is positional; this assumes
# `folds` carries a default 0..N-1 index — confirm if `train` is ever filtered upstream.
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    folds.loc[val_index, 'fold'] = int(n)
# Cast the column to int once every row has been assigned.
folds['fold'] = folds['fold'].astype(int)
# Show the per-fold class counts.
print(folds.groupby(['fold', CFG.target_col]).size())

fold  label
0     0         299
      1         695
      2         604
      3        3092
      4         578
1     0         299
      1         695
      2         604
      3        3092
      4         578
2     0         298
      1         695
      2         603
      3        3093
      4         578
3     0         298
      1         695
      2         603
      3        3093
      4         578
4     0         298
      1         696
      2         603
      3        3092
      4         578
dtype: int64


## Dataset

In [10]:
class TrainDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs for images stored under ``TRAIN_PATH``.

    Args:
        df: DataFrame with an ``image_id`` column (file name) and a ``label`` column.
        transform: Optional callable invoked as ``transform(image=img)`` whose
            result is indexed with ``['image']``.  When omitted, the RGB array
            read from disk is returned unchanged.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.labels = df['label'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load item ``idx``.

        Returns:
            ``(image, label)`` where ``label`` is a long tensor.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread reports a missing/unreadable file by returning None; fail
        # here with the offending path instead of with an opaque cvtColor error.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).long()
        return image, label
    
class TestDataset(Dataset):
    """Dataset yielding unlabeled images stored under ``TEST_PATH``.

    Args:
        df: DataFrame with an ``image_id`` column holding the file names.
        transform: Optional callable invoked as ``transform(image=img)`` whose
            result is indexed with ``['image']``.  When omitted, the RGB array
            read from disk is returned unchanged.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load image ``idx``.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread reports a missing/unreadable file by returning None; fail
        # here with the offending path instead of with an opaque cvtColor error.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

In [11]:
# train_dataset = TrainDataset(train, transform=None)

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image)
#     plt.title(f'label: {label}')
#     plt.show()

## Transforms

In [12]:
def get_transforms(*, data):
    """Build the albumentations pipeline for one data split.

    Args:
        data: ``'train'`` for the augmented pipeline (random resized crop,
            flips, shift/scale/rotate, colour jitter) or ``'valid'`` for a
            plain resize.  Both end with the same normalisation followed by
            conversion to a tensor.

    Returns:
        A ``Compose`` object to be called as ``pipeline(image=img)``.

    Raises:
        ValueError: For any other ``data`` value.  Previously such a value fell
            through and returned ``None``, which silently disabled all
            preprocessing in the datasets.
    """
    if data not in ('train', 'valid'):
        raise ValueError(f"data must be 'train' or 'valid', got {data!r}")

    # Shared tail: identical normalisation statistics for both splits.
    tail = [
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2(),
    ]

    if data == 'train':
        head = [
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5),
        ]
    else:
        head = [Resize(CFG.size, CFG.size)]

    return Compose(head + tail)

In [13]:
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image[0])
#     plt.title(f'label: {label}')
#     plt.show()

## MODEL

In [14]:
class CustomEfficientNetB3(nn.Module):
    """timm backbone whose classifier layer is replaced by a ``CFG.target_size``-way linear head.

    Args:
        model_name: Name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
    """
    def __init__(self, model_name='tf_efficientnet_b3', pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        head_inputs = backbone.classifier.in_features
        # Swap the stock classifier for one sized to this task's class count.
        backbone.classifier = nn.Linear(head_inputs, CFG.target_size)
        self.model = backbone

    def forward(self, x):
        """Return the wrapped model's output for the batch ``x``."""
        return self.model(x)

In [15]:
# model = CustomEfficientNetB3(model_name=CFG.model_name, pretrained=False)
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, 
#                           num_workers=4, pin_memory=True, drop_last=True)

# for image, label in train_loader:
#     print(image.size())
#     output = model(image)
#     print(output)
#     break

## Loss Functions

In [16]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against smoothed one-hot targets.

    The true class receives weight ``1 - smoothing``; every other class
    receives ``smoothing / (classes - 1)``.

    Args:
        classes: Number of classes.
        smoothing: Total probability mass moved away from the true class.
        dim: Dimension over which the log-softmax and the class sum are taken.
    """
    def __init__(self, classes=5, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the batch-mean smoothed cross-entropy of logits ``pred`` against class indices ``target``."""
        log_probs = pred.log_softmax(dim=self.dim)
        # The target distribution is a constant; keep it out of the autograd graph.
        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            soft_targets = torch.full_like(log_probs, off_value)
            soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-soft_targets * log_probs, dim=self.dim)
        return per_sample.mean()

In [17]:
class FocalLoss(nn.Module):
    """Focal loss for multi-class classification on raw logits.

    Each sample's cross-entropy ``ce`` is scaled by ``alpha * (1 - p_t) ** gamma``
    with ``p_t = exp(-ce)``, so well-classified samples contribute less.

    Args:
        alpha: Constant multiplier applied to every sample.
        gamma: Focusing exponent.
        reduce: ``True`` returns the batch mean; ``False`` returns the
            per-sample loss vector.
    """
    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against class indices ``targets``."""
        # The cross-entropy must stay per-sample here: taking the batch mean
        # first would apply one shared modulation factor to the whole batch and
        # make reduce=False return a scalar.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-ce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduce:
            return torch.mean(focal)
        return focal

In [18]:
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss plus a down-weighted focal term.

    Args:
        alpha: Multiplier of the focal term.
        gamma: Focusing exponent of the focal term.
        xent: Weight of the focal term relative to the cosine term.
    """
    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        # Target value 1 for the cosine-embedding loss.  It is created on the
        # CPU and moved to the input's device in forward(), so the loss can be
        # constructed and used on machines without CUDA.
        self.y = torch.Tensor([1])

    def forward(self, input, target, reduction="mean"):
        """Return the combined loss of logits ``input`` against class indices ``target``.

        Args:
            input: Logits with the class dimension last.
            target: Class indices.
            reduction: Passed to the cosine term; the focal term is averaged
                when it equals ``"mean"`` and left per-sample otherwise.
        """
        one_hot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        cosine_loss = F.cosine_embedding_loss(input, one_hot, self.y.to(input.device), reduction=reduction)

        # reduction='none' is the current spelling of the deprecated reduce=False.
        cent_loss = F.cross_entropy(F.normalize(input), target, reduction='none')
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [19]:
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum of a cross-entropy term and a second, clamped log-probability term.

    Args:
        alpha: Weight of the standard cross-entropy term.
        beta: Weight of the second term.
        num_classes: Number of classes (width of the one-hot targets).
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=5):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Return the loss of ``logits`` (batch, classes) against class indices ``targets``.

        Args:
            reduction: ``'mean'`` or ``'sum'`` reduce both terms; any other
                value leaves the second term per-sample and is forwarded
                unchanged to ``F.cross_entropy``.
        """
        # Build the one-hot targets on the logits' device instead of forcing
        # CUDA, so the loss also works on CPU and never indexes a CPU tensor
        # with device-resident indices.
        onehot_targets = F.one_hot(targets, num_classes=self.num_classes).to(
            device=logits.device, dtype=logits.dtype)
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        # NOTE(review): this term weights log(softmax) by the one-hot targets,
        # i.e. it is numerically another cross-entropy.  A "reverse" term would
        # normally swap the roles of prediction and label — confirm the intent.
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [20]:
def log_t(u, t):
    """Tempered logarithm of tensor `u` at temperature `t` (the natural log when t == 1)."""
    if t != 1.0:
        exponent = 1.0 - t
        return (u.pow(exponent) - 1.0) / exponent
    return u.log()

def exp_t(u, t):
    """Tempered exponential of tensor `u` at temperature `t` (the plain exp when t == 1)."""
    if t == 1:
        return u.exp()
    one_minus_t = 1.0 - t
    # The base is clipped at zero before the power is taken.
    base = torch.relu(1.0 + one_minus_t * u)
    return base.pow(1.0 / one_minus_t)

def compute_normalization_fixed_point(activations, t, num_iters):

    """Returns the normalization value for each example (t > 1.0).

    Uses a fixed-point iteration on the max-shifted activations.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same shape as activation with the last dimension being 1.
    """
    # Shift by the per-example maximum so the largest shifted activation is 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations_step_0 = activations - mu

    normalized_activations = normalized_activations_step_0

    # Each pass sums exp_t over the classes and rescales the *initial* shifted
    # activations by that sum raised to (1 - t).
    for _ in range(num_iters):
        logt_partition = torch.sum(
                exp_t(normalized_activations, t), -1, keepdim=True)
        normalized_activations = normalized_activations_step_0 * \
                logt_partition.pow(1.0-t)

    # Turn the final partition sum into the normalization constant and undo the shift.
    logt_partition = torch.sum(
            exp_t(normalized_activations, t), -1, keepdim=True)
    normalization_constants = - log_t(1.0 / logt_partition, t) + mu

    return normalization_constants

def compute_normalization_binary_search(activations, t, num_iters):

    """Returns the normalization value for each example (t < 1.0).

    Bisects, per example, on the shift that is subtracted from the max-shifted
    activations before exp_t is applied and summed.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    # Shift by the per-example maximum so the largest shifted activation is 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    # Number of classes per example whose shifted activation exceeds -1 / (1 - t).
    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    # Initial search interval [lower, upper] for each example.
    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        # Evaluate the summed tempered probabilities at the interval midpoint.
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        # update == 1 where the sum is below 1: the midpoint becomes the new
        # upper bound; elsewhere it becomes the new lower bound.
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    # Midpoint of the final interval, with the max shift added back.
    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Class implementing custom backward pass for compute_normalization. See compute_normalization.

    The forward pass runs one of the two iterative solvers; the backward pass
    uses a closed-form expression rather than differentiating through the iterations.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        # t < 1 is handled by the binary search, everything else by the fixed-point solver.
        if t < 1.0:
            normalization_constants = compute_normalization_binary_search(activations, t, num_iters)
        else:
            normalization_constants = compute_normalization_fixed_point(activations, t, num_iters)

        # Keep what backward() needs: the inputs, the result and the temperature.
        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        t = ctx.t
        normalized_activations = activations - normalization_constants 
        probabilities = exp_t(normalized_activations, t)
        # Probabilities raised to the power t and renormalized over the class dimension.
        escorts = probabilities.pow(t)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)
        grad_input = escorts * grad_output

        # No gradient is returned for `t` or `num_iters`.
        return grad_input, None, None

def compute_normalization(activations, t, num_iters=5):
    """Returns the normalization value for each example. 
    Backward pass is implemented.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    # Go through the autograd Function so its custom backward is used.
    return ComputeNormalization.apply(activations, t, num_iters)

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor for the positive class.
    """
    # Pair every activation with a zero activation for the implicit negative class.
    negatives = torch.zeros_like(activations)
    two_class = torch.stack([activations, negatives], dim=-1)
    return tempered_softmax(two_class, t, num_iters)[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature > 1.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    if t != 1.0:
        # General case: subtract the per-example normalization, then apply exp_t.
        shift = compute_normalization(activations, t, num_iters)
        return exp_t(activations - shift, t)
    # t == 1 is the ordinary softmax.
    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss, expressed as the two-class multi-class loss.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
      reduction: Forwarded unchanged to bi_tempered_logistic_loss.
    Returns:
      A loss tensor.
    """
    positive = labels.to(activations.dtype)
    # Class 1 keeps its activation; the implicit class 0 gets activation zero.
    two_class_activations = torch.stack(
        (activations, torch.zeros_like(activations)), dim=-1)
    # Class-0 probability is the complement of the class-1 probability.
    two_class_labels = torch.stack((positive, 1.0 - positive), dim=-1)
    return bi_tempered_logistic_loss(
        two_class_activations,
        two_class_labels,
        t1,
        t2,
        label_smoothing=label_smoothing,
        num_iters=num_iters,
        reduction=reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over all examples; returns a scalar tensor.
        ``'sum'``: Loss is summed over all examples; returns a scalar tensor.
    Returns:
      A loss tensor.
    Raises:
      ValueError: If `reduction` is not one of the three supported values
        (previously such a value made the function return None).
    """
    if reduction not in ('none', 'mean', 'sum'):
        raise ValueError(
            f"reduction must be 'none', 'mean' or 'sum', got {reduction!r}")

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        # Classes live on the LAST dimension; scattering along it (instead of a
        # hard-coded dim 1) is identical for 2-D activations and also correct
        # for activations with extra leading dimensions.
        labels_onehot.scatter_(-1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The 1e-10 keeps log_t finite where the (smoothed) label is exactly zero.
    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    return loss_values.mean()

In [21]:
class BiTemperedLogisticLoss(nn.Module):
    """``nn.Module`` wrapper around ``bi_tempered_logistic_loss`` returning the batch mean.

    Args:
        t1: Temperature 1 forwarded to the loss.
        t2: Temperature 2 forwarded to the loss.
        smoothing: Label-smoothing value forwarded to the loss.
    """
    def __init__(self, t1, t2, smoothing=0.0):
        super().__init__()
        self.t1, self.t2 = t1, t2
        self.smoothing = smoothing

    def forward(self, logit_label, truth_label):
        """Return the mean of the per-example loss of ``logit_label`` against ``truth_label``."""
        per_example = bi_tempered_logistic_loss(
            logit_label,
            truth_label,
            t1=self.t1,
            t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none',
        )
        return per_example.mean()

In [22]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a quantity."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the weighted sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    # Seconds are truncated to a whole number, as ``%d`` does.
    return '{:d}m {:d}s'.format(minutes, int(seconds))

def timeSince(since, percent):
    """Report the elapsed time and a projection of the time still remaining.

    Parameters
    ----------
    since : float
        Timestamp (as returned by ``time.time()``) at which the work started.
    percent : float
        Fraction of the work completed so far.

    Returns
    -------
    str
        ``'<elapsed> (remain <remaining>)'`` with both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # Projected total duration at the current pace, minus what has already passed.
    remaining = elapsed / percent - elapsed
    return '{} (remain {})'.format(asMinutes(elapsed), asMinutes(remaining))

def train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, shechduler, device):
    """Train ``model`` for one epoch with a two-step (SAM-style) optimizer.

    For every batch the training loss is back-propagated and the gradient norm
    is clipped.  On each accumulation boundary the optimizer's ``first_step``
    is applied, the same batch is forwarded and back-propagated again, and
    ``second_step`` performs the actual update.

    Args:
        train_loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        model: Network to train.
        loss_train: Loss that is back-propagated.
        loss_metric: Loss that is only recorded and reported.
        optimizer: Optimizer exposing ``first_step(zero_grad=...)`` and
            ``second_step(zero_grad=...)``.
        epoch: Zero-based epoch number (used in the progress line only).
        shechduler: Unused; kept, with its original spelling, so existing
            callers keep working.
        device: Device the batches are moved to.

    Returns:
        Sample-weighted average of ``loss_metric`` over the epoch.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(images)
        metric = loss_metric(y_preds, labels)
        loss = loss_train(y_preds, labels)
        # record loss
        losses.update(metric.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        if CFG.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # The two optimizer steps belong together: second_step undoes the
            # perturbation made by first_step.  Previously only first_step was
            # gated, so with accumulation > 1 second_step ran on steps where no
            # first_step had been taken.
            optimizer.first_step(zero_grad=True)
            # Second forward/backward pass on the same batch at the perturbed weights.
            loss_train(model(images), labels).backward()
            optimizer.second_step(zero_grad=True)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            # Each field ends with a space so the fields no longer run together.
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses,
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          grad_norm=float(grad_norm)))
    return losses.avg

def valid_fn(valid_loader, model, loss_metric, device):
    """Evaluate ``model`` on a validation loader without computing gradients.

    Args:
        valid_loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        model: Network to evaluate (switched to eval mode).
        loss_metric: Loss recorded for reporting.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, predictions)``: the sample-weighted average loss and a
        NumPy array with the softmax probabilities of every sample, in loader order.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute predictions and loss; nothing here needs an autograd graph
        with torch.no_grad():
            y_preds = model(images)
            loss = loss_metric(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # keep the class probabilities for scoring
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))

    predictions = np.concatenate(preds)
    return losses.avg, predictions

def inference(model, states, test_loader, device):
    """Predict class probabilities, averaged over several checkpoints.

    Args:
        model: Network whose weights are replaced by each checkpoint in turn.
        states: Sequence of dicts, each holding a state dict under ``'model'``.
        test_loader: Iterable of image batches; must support ``len``.
        device: Device used for the forward passes.

    Returns:
        NumPy array with one row of checkpoint-averaged softmax probabilities
        per test sample, in loader order.
    """
    model.to(device)
    probs = []
    for images in tqdm(test_loader, total=len(test_loader)):
        images = images.to(device)
        # Predictions of every checkpoint for this batch.  The list was
        # previously initialised under a different name than the one appended
        # to, so the first append raised UnboundLocalError.
        avg_preds = []
        for state in states:
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            avg_preds.append(y_preds.softmax(1).to('cpu').numpy())
        probs.append(np.mean(avg_preds, axis=0))
    return np.concatenate(probs)

## Train loop

In [23]:
# ======================================================
# Train loop
# ======================================================

def train_loop(folds, fold):
    """Train and validate the model on one cross-validation fold.

    Rows of ``folds`` whose ``fold`` column equals ``fold`` form the validation
    set; every other row is used for training.  During the first epoch only
    the classifier head is trainable (learning rate ``CFG.lr_1``).  From the
    second epoch on all weights are trainable and a fresh SAM optimizer and
    scheduler are built with ``CFG.lr_2``.  A checkpoint is written after
    every epoch, plus a ``*_best.pth`` checkpoint whenever validation accuracy
    improves.

    Args:
        folds: DataFrame with ``image_id``, the target column and ``fold``.
        fold: Fold number to hold out for validation.

    Returns:
        The validation rows with one probability column per class (named
        ``'0'`` .. ``str(CFG.target_size - 1)``) and a ``preds`` column holding
        the arg-max class, both taken from the best checkpoint.

    Raises:
        ValueError: If ``CFG.scheduler`` or ``CFG.loss_train`` names an
            unsupported option.
    """
    seed_torch(seed=CFG.seed)

    LOGGER.info(f'========== fold: {fold} training ============')

    # ======================================================
    # loader
    # ======================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ===============================================
    # scheduler
    # ===============================================
    def get_scheduler(optimizer):
        """Build the LR scheduler selected by ``CFG.scheduler`` for ``optimizer``."""
        if CFG.scheduler=='ReduceLROnPlateau':
            return ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        if CFG.scheduler=='CosineAnnealingLR':
            return CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        if CFG.scheduler=='CosineAnnealingWarmRestarts':
            return CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        # An unknown name used to surface as an UnboundLocalError.
        raise ValueError(f'Unsupported CFG.scheduler: {CFG.scheduler!r}')

    # ===============================================
    # model & optimizer
    # ===============================================
    model = CustomEfficientNetB3(CFG.model_name, pretrained=True)

    # Freeze everything except the classifier layer; the remaining weights are
    # unfrozen at the start of the second epoch (see the loop below).
    for name, param in model.model.named_parameters():
        if 'classifier' not in name:
            param.requires_grad=False

    model.to(device)

    base_optimizer = Adam
    optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_1, weight_decay=CFG.weight_decay, amsgrad=False)

    scheduler = get_scheduler(optimizer)

    # ===============================================
    # apex
    # ===============================================
    if CFG.apex:
        # amp.initialize returns the patched (model, optimizer) pair; the
        # result was previously stored on `model.optimizer`, leaving both
        # local names pointing at the unpatched objects.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ===============================================
    # loop
    # ===============================================
    def get_loss_train():
        """Build the training loss selected by ``CFG.loss_train``."""
        if CFG.loss_train == 'CrossEntropyLoss':
            return nn.CrossEntropyLoss()
        if CFG.loss_train == 'LabelSmoothing':
            return LabelSmoothingLoss(classes=CFG.target_size, smoothing=CFG.smooth)
        if CFG.loss_train == 'FocalLoss':
            return FocalLoss().to(device)
        if CFG.loss_train == 'FocalCosineLoss':
            return FocalCosineLoss()
        if CFG.loss_train == 'SymmetricCrossEntropyLoss':
            return SymmetricCrossEntropy().to(device)
        if CFG.loss_train == 'BiTemperedLoss':
            return BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smooth)
        # An unknown name used to surface as an UnboundLocalError.
        raise ValueError(f'Unsupported CFG.loss_train: {CFG.loss_train!r}')

    loss_train = get_loss_train()
    LOGGER.info(f'loss_train: {loss_train}')
    # Plain cross-entropy is what gets reported, whichever loss is optimised.
    loss_metric = nn.CrossEntropyLoss()

    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        if epoch == 1:

            # Second epoch: unfreeze every weight.
            for param in model.model.parameters():
                param.requires_grad = True

            # Continue with the lower learning rate (CFG.lr_1 -> CFG.lr_2),
            # using a fresh optimizer and scheduler.
            base_optimizer = Adam
            optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_2, weight_decay=CFG.weight_decay, amsgrad=False)
            scheduler = get_scheduler(optimizer)

            LOGGER.info('requires_grad of all parameters are unlocked')

        # train
        avg_loss = train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, loss_metric, device)
        valid_labels = valid_folds[CFG.target_col].values

        # Only ReduceLROnPlateau needs the monitored value.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, (CosineAnnealingLR, CosineAnnealingWarmRestarts)):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')

        # Keep every epoch's weights so any of them can be used for inference.
        torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    # Attach the best epoch's out-of-fold probabilities and predicted classes.
    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)

    return valid_folds

In [24]:
# ====================================================
# main
# ====================================================
def main():
    """
    Run the training and/or inference pipeline selected by ``CFG``.

    When ``CFG.train`` is truthy, folds listed in ``CFG.trn_fold`` are passed
    to ``train_loop``; the returned out-of-fold frame is scored, logged and
    written to ``oof_df.csv`` under ``OUTPUT_DIR``. The loop currently stops
    after the first trained fold (see the ``break`` below).

    When ``CFG.inference`` is truthy, the ``*_best.pth`` checkpoint of every
    fold in ``CFG.trn_fold`` is loaded, the test frame is predicted and
    ``submission.csv`` is written under ``OUTPUT_DIR``.
    """

    def get_result(result_df):
        """Log the score of the ``preds`` column against the target column."""
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')

    if CFG.train:
        # train
        # Collect per-fold frames and concatenate once, instead of growing a
        # DataFrame inside the loop (avoids concat-with-empty-frame issues).
        fold_results = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(folds, fold)
                fold_results.append(_oof_df)
                LOGGER.info(f'=============== fold: {fold} result ================')
                get_result(_oof_df)

                # Only a single fold is trained in this run.
                break

        if fold_results:
            oof_df = pd.concat(fold_results)
            # CV result
            LOGGER.info(f'============ CV ============')
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        else:
            # Nothing in CFG.trn_fold matched range(CFG.n_fold); there is no
            # 'preds' column to score, so skip the CV summary instead of crashing.
            LOGGER.info('No fold was trained; skipping CV result.')

    if CFG.inference:
        # inference
        model = CustomEfficientNetB3(CFG.model_name, pretrained=False)
        # NOTE(review): this loads a checkpoint for every fold in CFG.trn_fold,
        # while the training loop above stops after one fold -- make sure the
        # checkpoints for all listed folds exist before enabling inference.
        states = [torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth') for fold in CFG.trn_fold]
        # The loader options (batch_size / shuffle / pin_memory) belong to the
        # DataLoader, not the Dataset, and `test_loader` must be created here
        # before it is handed to `inference`.
        # NOTE(review): assumes TestDataset(df, transform=...) and a
        # get_transforms(data='valid') helper defined earlier in this notebook
        # -- TODO confirm against their definitions.
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=CFG.batch_size,
                                                  shuffle=False,
                                                  num_workers=CFG.num_workers,
                                                  pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        test['label'] = predictions.argmax(1)
        test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

In [25]:
# Write the value of `device` to the run log so the log records which device is in use.
LOGGER.info(f'used device: {device}')

used device: cuda


In [26]:
# Entry point: invoke main(), which runs the train / inference stages enabled in CFG.
if __name__ == '__main__':
    main()

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_b3_aa-84b4657e.pth
loss_train: BiTemperedLogisticLoss()


Epoch: [1][0/1317]Data 1.566 (1.566)Elapsed 0m 3s (remain 78m 25s)Loss: 1.6745(1.6745)Grad: 1.0540  
Epoch: [1][100/1317]Data 0.000 (0.018)Elapsed 0m 39s (remain 7m 53s)Loss: 1.2096(1.3463)Grad: 0.5141  
Epoch: [1][200/1317]Data 0.000 (0.010)Elapsed 1m 14s (remain 6m 55s)Loss: 0.8368(1.3052)Grad: 0.4655  
Epoch: [1][300/1317]Data 0.002 (0.007)Elapsed 1m 50s (remain 6m 14s)Loss: 0.6912(1.2427)Grad: 0.6130  
Epoch: [1][400/1317]Data 0.000 (0.006)Elapsed 2m 26s (remain 5m 34s)Loss: 0.7895(1.1935)Grad: 0.4974  
Epoch: [1][500/1317]Data 0.000 (0.005)Elapsed 3m 2s (remain 4m 57s)Loss: 1.2968(1.1622)Grad: 0.5937  
Epoch: [1][600/1317]Data 0.000 (0.004)Elapsed 3m 38s (remain 4m 20s)Loss: 2.0250(1.1265)Grad: 0.6256  
Epoch: [1][700/1317]Data 0.000 (0.004)Elapsed 4m 14s (remain 3m 43s)Loss: 1.8086(1.1010)Grad: 0.6180  
Epoch: [1][800/1317]Data 0.000 (0.004)Elapsed 4m 51s (remain 3m 7s)Loss: 0.6399(1.0789)Grad: 0.4826  
Epoch: [1][900/1317]Data 0.012 (0.003)Elapsed 5m 26s (remain 2m 30s)Loss: 1.3

Epoch 1 - avg_train_loss: 1.0259 avg_val_loss: 0.8412 time: 561s
Epoch 1 - Accuracy: 0.7414578587699316
Epoch 1 - Save Best Score: 0.7415 Model


EVAL: [329/330] Data 0.000 (0.125) Elapsed 1m 23s (remain 0m 0s) Loss: 0.0300(0.8412) 


requires_grad of all parameters are unlocked


Epoch: [2][0/1317]Data 1.223 (1.223)Elapsed 0m 3s (remain 66m 31s)Loss: 0.8464(0.8464)Grad: 2.5129  
Epoch: [2][100/1317]Data 0.000 (0.012)Elapsed 1m 47s (remain 21m 32s)Loss: 0.9658(0.9099)Grad: 3.7640  
Epoch: [2][200/1317]Data 0.000 (0.006)Elapsed 3m 31s (remain 19m 36s)Loss: 0.5797(0.8358)Grad: 3.0845  
Epoch: [2][300/1317]Data 0.000 (0.004)Elapsed 5m 16s (remain 17m 48s)Loss: 0.2834(0.8057)Grad: 2.1353  
Epoch: [2][400/1317]Data 0.000 (0.003)Elapsed 7m 1s (remain 16m 1s)Loss: 1.1282(0.7665)Grad: 2.2329  
Epoch: [2][500/1317]Data 0.000 (0.003)Elapsed 8m 46s (remain 14m 16s)Loss: 0.7685(0.7514)Grad: 1.8899  
Epoch: [2][600/1317]Data 0.000 (0.002)Elapsed 10m 30s (remain 12m 31s)Loss: 0.2472(0.7364)Grad: 1.7523  
Epoch: [2][700/1317]Data 0.000 (0.002)Elapsed 12m 15s (remain 10m 45s)Loss: 0.8161(0.7239)Grad: 2.7122  
Epoch: [2][800/1317]Data 0.000 (0.002)Elapsed 13m 59s (remain 9m 0s)Loss: 0.2010(0.7136)Grad: 1.8443  
Epoch: [2][900/1317]Data 0.000 (0.002)Elapsed 15m 44s (remain 7m 16s

Epoch 2 - avg_train_loss: 0.6812 avg_val_loss: 0.5568 time: 1462s
Epoch 2 - Accuracy: 0.8616173120728929
Epoch 2 - Save Best Score: 0.8616 Model


EVAL: [329/330] Data 0.000 (0.119) Elapsed 1m 22s (remain 0m 0s) Loss: 0.0037(0.5568) 
Epoch: [3][0/1317]Data 1.414 (1.414)Elapsed 0m 2s (remain 58m 54s)Loss: 0.5524(0.5524)Grad: 2.4966  
Epoch: [3][100/1317]Data 0.000 (0.014)Elapsed 1m 47s (remain 21m 30s)Loss: 0.2932(0.5504)Grad: 1.9184  
Epoch: [3][200/1317]Data 0.000 (0.007)Elapsed 3m 32s (remain 19m 37s)Loss: 0.6849(0.5613)Grad: 1.6688  
Epoch: [3][300/1317]Data 0.000 (0.005)Elapsed 5m 16s (remain 17m 48s)Loss: 0.3336(0.5904)Grad: 2.3195  
Epoch: [3][400/1317]Data 0.000 (0.004)Elapsed 7m 0s (remain 16m 1s)Loss: 0.6719(0.6002)Grad: 2.2032  
Epoch: [3][500/1317]Data 0.000 (0.003)Elapsed 8m 44s (remain 14m 14s)Loss: 0.5170(0.5980)Grad: 1.4130  
Epoch: [3][600/1317]Data 0.000 (0.003)Elapsed 10m 29s (remain 12m 29s)Loss: 0.0292(0.5946)Grad: 0.7682  
Epoch: [3][700/1317]Data 0.000 (0.002)Elapsed 12m 13s (remain 10m 44s)Loss: 0.8591(0.6060)Grad: 2.0404  
Epoch: [3][800/1317]Data 0.000 (0.002)Elapsed 13m 57s (remain 8m 59s)Loss: 0.2425(0.

Epoch 3 - avg_train_loss: 0.5981 avg_val_loss: 0.5209 time: 1455s
Epoch 3 - Accuracy: 0.863705391040243
Epoch 3 - Save Best Score: 0.8637 Model


EVAL: [329/330] Data 0.000 (0.110) Elapsed 1m 19s (remain 0m 0s) Loss: 0.0421(0.5209) 
Epoch: [4][0/1317]Data 1.649 (1.649)Elapsed 0m 2s (remain 65m 35s)Loss: 0.0708(0.0708)Grad: 1.0884  
Epoch: [4][100/1317]Data 0.000 (0.016)Elapsed 1m 47s (remain 21m 32s)Loss: 1.5741(0.5615)Grad: 1.3022  
Epoch: [4][200/1317]Data 0.000 (0.008)Elapsed 3m 31s (remain 19m 32s)Loss: 0.5713(0.5332)Grad: 1.1164  
Epoch: [4][300/1317]Data 0.000 (0.006)Elapsed 5m 15s (remain 17m 43s)Loss: 0.4134(0.5568)Grad: 1.1504  
Epoch: [4][400/1317]Data 0.000 (0.004)Elapsed 6m 59s (remain 15m 57s)Loss: 0.7450(0.5500)Grad: 1.2081  
Epoch: [4][500/1317]Data 0.000 (0.003)Elapsed 8m 43s (remain 14m 12s)Loss: 0.4445(0.5600)Grad: 0.9693  
Epoch: [4][600/1317]Data 0.000 (0.003)Elapsed 10m 27s (remain 12m 27s)Loss: 0.2777(0.5703)Grad: 1.4078  
Epoch: [4][700/1317]Data 0.000 (0.003)Elapsed 12m 11s (remain 10m 42s)Loss: 0.6145(0.5800)Grad: 1.6564  
Epoch: [4][800/1317]Data 0.000 (0.002)Elapsed 13m 55s (remain 8m 58s)Loss: 1.2848(

Epoch 4 - avg_train_loss: 0.5621 avg_val_loss: 0.5434 time: 1449s
Epoch 4 - Accuracy: 0.8724373576309795
Epoch 4 - Save Best Score: 0.8724 Model


EVAL: [329/330] Data 0.000 (0.102) Elapsed 1m 17s (remain 0m 0s) Loss: 0.2823(0.5434) 
Epoch: [5][0/1317]Data 1.179 (1.179)Elapsed 0m 2s (remain 58m 31s)Loss: 0.4769(0.4769)Grad: 1.6053  
Epoch: [5][100/1317]Data 0.000 (0.012)Elapsed 1m 46s (remain 21m 26s)Loss: 1.4503(0.5210)Grad: 1.8440  
Epoch: [5][200/1317]Data 0.000 (0.006)Elapsed 3m 30s (remain 19m 31s)Loss: 0.1119(0.5031)Grad: 0.9612  
Epoch: [5][300/1317]Data 0.000 (0.004)Elapsed 5m 14s (remain 17m 41s)Loss: 0.1765(0.5154)Grad: 1.1497  
Epoch: [5][400/1317]Data 0.000 (0.003)Elapsed 6m 58s (remain 15m 55s)Loss: 0.2483(0.5224)Grad: 1.2735  
Epoch: [5][500/1317]Data 0.000 (0.003)Elapsed 8m 42s (remain 14m 10s)Loss: 0.0084(0.5242)Grad: 0.2846  
Epoch: [5][600/1317]Data 0.000 (0.002)Elapsed 10m 26s (remain 12m 25s)Loss: 0.8841(0.5248)Grad: 1.6083  
Epoch: [5][700/1317]Data 0.000 (0.002)Elapsed 12m 9s (remain 10m 41s)Loss: 0.0981(0.5209)Grad: 1.3186  
Epoch: [5][800/1317]Data 0.000 (0.002)Elapsed 13m 54s (remain 8m 57s)Loss: 0.0851(0

Epoch 5 - avg_train_loss: 0.5340 avg_val_loss: 0.5188 time: 1451s
Epoch 5 - Accuracy: 0.881169324221716
Epoch 5 - Save Best Score: 0.8812 Model


EVAL: [329/330] Data 0.000 (0.109) Elapsed 1m 19s (remain 0m 0s) Loss: 0.0306(0.5188) 
Epoch: [6][0/1317]Data 1.098 (1.098)Elapsed 0m 2s (remain 55m 3s)Loss: 0.6491(0.6491)Grad: 1.8243  
Epoch: [6][100/1317]Data 0.000 (0.011)Elapsed 1m 46s (remain 21m 23s)Loss: 2.5207(0.5469)Grad: 1.3112  
Epoch: [6][200/1317]Data 0.000 (0.006)Elapsed 3m 30s (remain 19m 30s)Loss: 0.6172(0.5250)Grad: 1.4717  
Epoch: [6][300/1317]Data 0.000 (0.004)Elapsed 5m 14s (remain 17m 41s)Loss: 0.3444(0.4982)Grad: 1.1341  
Epoch: [6][400/1317]Data 0.000 (0.003)Elapsed 6m 58s (remain 15m 56s)Loss: 0.7948(0.5111)Grad: 1.1244  
Epoch: [6][500/1317]Data 0.000 (0.002)Elapsed 8m 42s (remain 14m 11s)Loss: 0.1921(0.4908)Grad: 1.5225  
Epoch: [6][600/1317]Data 0.000 (0.002)Elapsed 10m 27s (remain 12m 27s)Loss: 0.0858(0.4905)Grad: 1.1239  
Epoch: [6][700/1317]Data 0.000 (0.002)Elapsed 12m 10s (remain 10m 42s)Loss: 0.5498(0.4973)Grad: 1.2770  
Epoch: [6][800/1317]Data 0.000 (0.002)Elapsed 13m 54s (remain 8m 57s)Loss: 1.1099(0

Epoch 6 - avg_train_loss: 0.5184 avg_val_loss: 0.5190 time: 1450s
Epoch 6 - Accuracy: 0.8802201974183751


EVAL: [329/330] Data 0.000 (0.112) Elapsed 1m 19s (remain 0m 0s) Loss: 0.0460(0.5190) 
Epoch: [7][0/1317]Data 1.210 (1.210)Elapsed 0m 2s (remain 55m 16s)Loss: 0.1201(0.1201)Grad: 1.6402  
Epoch: [7][100/1317]Data 0.000 (0.012)Elapsed 1m 46s (remain 21m 21s)Loss: 0.4022(0.5276)Grad: 1.3525  
Epoch: [7][200/1317]Data 0.000 (0.006)Elapsed 3m 30s (remain 19m 29s)Loss: 0.2065(0.5041)Grad: 1.4155  
Epoch: [7][300/1317]Data 0.000 (0.004)Elapsed 5m 14s (remain 17m 41s)Loss: 1.0173(0.5083)Grad: 1.0218  
Epoch: [7][400/1317]Data 0.000 (0.003)Elapsed 6m 58s (remain 15m 55s)Loss: 0.0083(0.5047)Grad: 0.4319  
Epoch: [7][500/1317]Data 0.000 (0.003)Elapsed 8m 42s (remain 14m 11s)Loss: 0.8222(0.4976)Grad: 1.9663  
Epoch: [7][600/1317]Data 0.000 (0.002)Elapsed 10m 26s (remain 12m 26s)Loss: 0.2255(0.4912)Grad: 1.7065  
Epoch: [7][700/1317]Data 0.000 (0.002)Elapsed 12m 10s (remain 10m 42s)Loss: 0.0215(0.4858)Grad: 0.5475  
Epoch: [7][800/1317]Data 0.000 (0.002)Elapsed 13m 55s (remain 8m 58s)Loss: 1.4106(

Epoch 7 - avg_train_loss: 0.4907 avg_val_loss: 0.5096 time: 1455s
Epoch 7 - Accuracy: 0.8819286256643888
Epoch 7 - Save Best Score: 0.8819 Model


EVAL: [329/330] Data 0.000 (0.114) Elapsed 1m 21s (remain 0m 0s) Loss: 0.0695(0.5096) 
Epoch: [8][0/1317]Data 1.210 (1.210)Elapsed 0m 2s (remain 63m 47s)Loss: 0.3628(0.3628)Grad: 1.9526  
Epoch: [8][100/1317]Data 0.000 (0.012)Elapsed 1m 47s (remain 21m 28s)Loss: 0.2833(0.4560)Grad: 0.8896  
Epoch: [8][200/1317]Data 0.000 (0.006)Elapsed 3m 31s (remain 19m 35s)Loss: 0.2097(0.4842)Grad: 1.2584  
Epoch: [8][300/1317]Data 0.000 (0.004)Elapsed 5m 16s (remain 17m 47s)Loss: 1.0089(0.5024)Grad: 1.2294  
Epoch: [8][400/1317]Data 0.000 (0.003)Elapsed 7m 0s (remain 16m 1s)Loss: 0.2523(0.4931)Grad: 1.8982  
Epoch: [8][500/1317]Data 0.000 (0.003)Elapsed 8m 45s (remain 14m 15s)Loss: 0.2445(0.4931)Grad: 0.9139  
Epoch: [8][600/1317]Data 0.000 (0.002)Elapsed 10m 29s (remain 12m 30s)Loss: 1.2841(0.4959)Grad: 1.1543  
Epoch: [8][700/1317]Data 0.000 (0.002)Elapsed 12m 14s (remain 10m 45s)Loss: 0.2716(0.4952)Grad: 0.7976  
Epoch: [8][800/1317]Data 0.000 (0.002)Elapsed 13m 59s (remain 9m 0s)Loss: 0.5736(0.4

Epoch 8 - avg_train_loss: 0.4797 avg_val_loss: 0.5684 time: 1462s
Epoch 8 - Accuracy: 0.8843963553530751
Epoch 8 - Save Best Score: 0.8844 Model


EVAL: [329/330] Data 0.000 (0.115) Elapsed 1m 21s (remain 0m 0s) Loss: 0.0319(0.5684) 
Epoch: [9][0/1317]Data 1.475 (1.475)Elapsed 0m 3s (remain 67m 47s)Loss: 0.0308(0.0308)Grad: 0.5594  
Epoch: [9][100/1317]Data 0.000 (0.015)Elapsed 1m 48s (remain 21m 40s)Loss: 0.5994(0.4030)Grad: 1.3832  
Epoch: [9][200/1317]Data 0.000 (0.008)Elapsed 3m 32s (remain 19m 39s)Loss: 0.1869(0.4316)Grad: 1.6477  
Epoch: [9][300/1317]Data 0.000 (0.005)Elapsed 5m 16s (remain 17m 49s)Loss: 0.1285(0.4679)Grad: 1.2685  
Epoch: [9][400/1317]Data 0.000 (0.004)Elapsed 7m 1s (remain 16m 3s)Loss: 0.8016(0.4754)Grad: 1.0535  
Epoch: [9][500/1317]Data 0.000 (0.003)Elapsed 8m 46s (remain 14m 16s)Loss: 0.3452(0.4740)Grad: 1.5856  
Epoch: [9][600/1317]Data 0.000 (0.003)Elapsed 10m 30s (remain 12m 31s)Loss: 0.1871(0.4877)Grad: 0.9159  
Epoch: [9][700/1317]Data 0.000 (0.002)Elapsed 12m 15s (remain 10m 46s)Loss: 1.0315(0.4917)Grad: 1.7682  
Epoch: [9][800/1317]Data 0.000 (0.002)Elapsed 14m 0s (remain 9m 1s)Loss: 0.0915(0.48

Epoch 9 - avg_train_loss: 0.4718 avg_val_loss: 0.5304 time: 1458s
Epoch 9 - Accuracy: 0.8880030372057707
Epoch 9 - Save Best Score: 0.8880 Model


EVAL: [329/330] Data 0.000 (0.110) Elapsed 1m 19s (remain 0m 0s) Loss: 0.0494(0.5304) 
Epoch: [10][0/1317]Data 1.317 (1.317)Elapsed 0m 2s (remain 59m 43s)Loss: 0.1846(0.1846)Grad: 1.4613  
Epoch: [10][100/1317]Data 0.000 (0.013)Elapsed 1m 47s (remain 21m 36s)Loss: 0.6663(0.4537)Grad: 2.5480  
Epoch: [10][200/1317]Data 0.000 (0.007)Elapsed 3m 31s (remain 19m 36s)Loss: 0.5803(0.4589)Grad: 1.7345  
Epoch: [10][300/1317]Data 0.000 (0.005)Elapsed 5m 15s (remain 17m 45s)Loss: 1.3256(0.4557)Grad: 1.2956  
Epoch: [10][400/1317]Data 0.000 (0.003)Elapsed 6m 59s (remain 15m 58s)Loss: 0.1696(0.4509)Grad: 1.3128  
Epoch: [10][500/1317]Data 0.000 (0.003)Elapsed 8m 44s (remain 14m 13s)Loss: 0.0077(0.4599)Grad: 0.3589  
Epoch: [10][600/1317]Data 0.000 (0.002)Elapsed 10m 28s (remain 12m 28s)Loss: 0.3215(0.4531)Grad: 1.4463  
Epoch: [10][700/1317]Data 0.000 (0.002)Elapsed 12m 12s (remain 10m 43s)Loss: 1.2166(0.4450)Grad: 0.9568  
Epoch: [10][800/1317]Data 0.000 (0.002)Elapsed 13m 56s (remain 8m 58s)Loss

Epoch 10 - avg_train_loss: 0.4585 avg_val_loss: 0.5088 time: 1456s
Epoch 10 - Accuracy: 0.8881928625664389
Epoch 10 - Save Best Score: 0.8882 Model


EVAL: [329/330] Data 0.000 (0.112) Elapsed 1m 20s (remain 0m 0s) Loss: 0.0800(0.5088) 


Score: 0.88819
Score: 0.88819
