In [1]:
# !pip install pytorch_ranger

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [3]:
# List the competition input directory to see which files and folders it contains.
os.listdir('../input/cassava-leaf-disease-classification/')

['train_tfrecords',
 'sample_submission.csv',
 'test_tfrecords',
 'label_num_to_disease_map.json',
 'train_images',
 'train.csv',
 'test_images']

In [4]:
# Training labels are read from the merged dataset; the test frame is the
# competition's sample submission file.
train = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
test = pd.read_csv('../input/cassava-leaf-disease-classification//sample_submission.csv')
# Numeric label -> disease name, one row per label (orient='index').
label_map = pd.read_json('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json', orient='index')

# Preview the three frames.
display(train.head())
display(test.head())
display(label_map)

Unnamed: 0,image_id,label,source
0,1000015157.jpg,0,2020
1,1000201771.jpg,3,2020
2,100042118.jpg,1,2020
3,1000723321.jpg,1,2020
4,1000812911.jpg,3,2020


Unnamed: 0,image_id,label
0,2216849948.jpg,4


Unnamed: 0,0
0,Cassava Bacterial Blight (CBB)
1,Cassava Brown Streak Disease (CBSD)
2,Cassava Green Mottle (CGM)
3,Cassava Mosaic Disease (CMD)
4,Healthy


## Directory settings

In [5]:
# Directory that receives the log file, checkpoints and CSV outputs.
OUTPUT_DIR = './'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Image folders read by TrainDataset and TestDataset respectively.
TRAIN_PATH = '../input/cassava-leaf-disease-merged/train'
TEST_PATH = '../input/cassava-leaf-disease-classification/test_images'

## CFG

In [6]:
class CFG:
    # Central experiment configuration; every attribute is read as CFG.<name>.
    # NOTE(review): the ReduceLROnPlateau and CosineAnnealingLR branches of the
    # scheduler factory in train_loop read CFG.factor, CFG.patience, CFG.eps and
    # CFG.T_max, none of which are defined here - add them before switching
    # `scheduler` away from 'CosineAnnealingWarmRestarts'.
    debug = False  # when True: 3 epochs on a 1000-row sample (see the override below)
    apex = False  # enables the apex amp import and the amp code paths
    print_freq = 20  # print progress every `print_freq` steps
    num_workers = 4  # DataLoader worker processes
    model_name = 'tf_efficientnet_b0_ns'  # name passed to timm.create_model
    size = 380  # height and width used by the crop/resize transforms
    scheduler = 'CosineAnnealingWarmRestarts'  # selects the LR scheduler in train_loop
    loss_train = 'BiTemperedLoss'  # selects the training loss in train_loop
    epochs = 10
    T_0 = 10  # T_0 argument of CosineAnnealingWarmRestarts
    lr_1 = 1e-3  # learning rate of the initial optimizer (only the classifier is trainable)
    lr_2 = 1e-4  # learning rate of the optimizer rebuilt once all weights are unfrozen
    t1 = 0.9  # bi-tempered loss temperature 1
    t2 = 1.5  # bi-tempered loss temperature 2
    smooth = 1e-2  # label smoothing passed to the smoothing-aware losses
    min_lr = 1e-6  # eta_min of the cosine schedulers
    batch_size = 32
    weight_decay = 1e-6
    gradient_accumulation_steps = 1
    max_grad_norm = 1000  # threshold passed to clip_grad_norm_
    seed = 42
    target_size = 5  # number of outputs of the classifier head
    target_col = 'label'  # label column used for stratification and scoring
    n_fold = 5  # number of stratified folds
    trn_fold = [1]  # folds that are actually trained
    train = True  # run the training branch of main()
    inference = False  # run the inference branch of main()
    
# Debug mode: shorten training and shrink the training frame to a reproducible sample.
if CFG.debug:
    CFG.epochs = 3
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

## Library

In [7]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

sys.path.append('../input/bi-tempered-loss-pytorch')
from bi_tempered_loss import *

# sys.path.append('../input/pytorch-optimizer')
# import torch_optimizer as optim

sys.path.append('../input/pytorch-sam')
from sam import SAM

from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
warnings.filterwarnings('ignore')

# apex is imported only when mixed precision is requested in CFG.
if CFG.apex:
    from apex import amp

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [8]:
def get_score(y_true, y_pred):
    """Return the accuracy of the predicted labels `y_pred` against `y_true`."""
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

@contextmanager
def timer(name):
    """Context manager that logs when the wrapped block starts and how long it took.

    Args:
        name: Label inserted into both log messages.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f}')
    
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create the notebook logger that writes to both the console and a file.

    Args:
        log_file: Path of the log file. Defaults to 'train.log' inside OUTPUT_DIR.

    Returns:
        The configured `logging.Logger` for this module name.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # would stack extra handlers and print each message several times.
    # Drop handlers left over from a previous call first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Module-wide logger used by the training, validation and main routines.
LOGGER = init_logger()

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    # Deterministic kernels, no auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed everything once up front with the configured seed.
seed_torch(seed=CFG.seed)

## CV split

In [9]:
# Work on a copy so the fold assignment does not modify `train`.
folds = train.copy()
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# Tag each row with the index of the split in which it belongs to the validation part.
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    folds.loc[val_index, 'fold'] = int(n)
# Cast the fold column to int once every row has been assigned.
folds['fold'] = folds['fold'].astype(int)
# Row counts per (fold, label) pair.
print(folds.groupby(['fold', CFG.target_col]).size())

fold  label
0     0         299
      1         695
      2         604
      3        3092
      4         578
1     0         299
      1         695
      2         604
      3        3092
      4         578
2     0         298
      1         695
      2         603
      3        3093
      4         578
3     0         298
      1         695
      2         603
      3        3093
      4         578
4     0         298
      1         696
      2         603
      3        3092
      4         578
dtype: int64


## Dataset

In [10]:
class TrainDataset(Dataset):
    """Dataset yielding (image, label) pairs read from TRAIN_PATH.

    Args:
        df: DataFrame with an 'image_id' column (file names) and a 'label' column.
        transform: Optional callable invoked as `transform(image=array)` whose
            result is indexed with 'image'.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.labels = df['label'].values
        self.transform = transform
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        """Load image `idx` as RGB, apply the transform and return it with its label.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).long()
        return image, label
    
class TestDataset(Dataset):
    """Dataset yielding unlabeled images read from TEST_PATH.

    Args:
        df: DataFrame with an 'image_id' column (file names).
        transform: Optional callable invoked as `transform(image=array)` whose
            result is indexed with 'image'.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        """Load image `idx` as RGB and return it after the optional transform.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

In [11]:
# train_dataset = TrainDataset(train, transform=None)

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image)
#     plt.title(f'label: {label}')
#     plt.show()

## Transforms

In [12]:
def get_transforms(*, data):
    """Build the albumentations pipeline for the requested split.

    Args:
        data: 'train' for the augmenting pipeline, 'valid' for resize-only.

    Returns:
        A `Compose` pipeline, or None when `data` is neither 'train' nor 'valid'.
    """
    def _normalize_and_tensorize():
        # Shared tail of both pipelines: normalization followed by tensor conversion.
        return [
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ]

    if data == 'train':
        augmentations = [
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5),
        ]
        return Compose(augmentations + _normalize_and_tensorize())

    if data == 'valid':
        return Compose([Resize(CFG.size, CFG.size)] + _normalize_and_tensorize())

    return None

In [13]:
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image[0])
#     plt.title(f'label: {label}')
#     plt.show()

## MODEL

In [14]:
class CustomEfficientNetB0ns(nn.Module):
    """timm backbone whose classifier is replaced by a CFG.target_size-way linear head.

    Args:
        model_name: Name passed to `timm.create_model`.
        pretrained: Forwarded to `timm.create_model`.
    """
    def __init__(self, model_name='tf_efficientnet_b0_ns', pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Swap the stock classifier for a fresh linear layer with the same input width.
        backbone.classifier = nn.Linear(backbone.classifier.in_features, CFG.target_size)
        self.model = backbone

    def forward(self, x):
        """Return the backbone output for batch `x`."""
        return self.model(x)

In [15]:
# model = CustomEfficientNetB0ns(model_name=CFG.model_name, pretrained=False)
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, 
#                           num_workers=4, pin_memory=True, drop_last=True)

# for image, label in train_loader:
#     print(image.size())
#     output = model(image)
#     print(output)
#     break

## Loss Functions

In [16]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a smoothed target distribution.

    The true class receives `1 - smoothing`; the remaining mass is split evenly
    across the other `classes - 1` classes.

    Args:
        classes: Number of classes.
        smoothing: Probability mass moved away from the true class.
        dim: Dimension holding the class scores.
    """
    def __init__(self, classes=5, smoothing=0.0, dim=-1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the batch-mean smoothed cross-entropy of logits `pred` vs. class indices `target`."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Start from the off-class value everywhere, then overwrite the true class.
            soft_targets = torch.zeros_like(log_probs)
            soft_targets.fill_(self.smoothing / (self.cls - 1))
            soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-soft_targets * log_probs, dim=self.dim)
        return torch.mean(per_sample)

In [17]:
class FocalLoss(nn.Module):
    """Multi-class focal loss computed from logits.

    Args:
        alpha: Scalar weight applied to every sample's loss.
        gamma: Focusing exponent applied to `1 - p_true`.
        reduce: When True return the batch mean, otherwise the per-sample losses.
    """
    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Return the focal loss of logits `inputs` against class indices `targets`."""
        # The focal weight must be applied per sample. Using the default 'mean'
        # reduction here would weight the batch-average loss with a single
        # factor and make `reduce=False` return a scalar.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        # exp(-CE) is the probability assigned to the true class.
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduce:
            return torch.mean(focal_loss)
        return focal_loss

In [18]:
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss plus a weighted focal loss on L2-normalized logits.

    Args:
        alpha: Scalar weight of the focal term.
        gamma: Focusing exponent of the focal term.
        xent: Weight of the focal term in the final sum.
    """
    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        # Target of +1 for cosine_embedding_loss. Created on the CPU and moved to
        # the input's device in forward(), so the loss also works without CUDA.
        self.y = torch.ones(1)

    def forward(self, input, target, reduction="mean"):
        """Return the combined loss of logits `input` against class indices `target`.

        Args:
            input: Logits with the classes in the last dimension.
            target: Integer class indices.
            reduction: Passed to the cosine term; the focal term is averaged
                only when this is "mean".
        """
        one_hot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        cosine_loss = F.cosine_embedding_loss(input, one_hot, self.y.to(input.device), reduction=reduction)

        # `reduction='none'` replaces the deprecated `reduce=False`.
        cent_loss = F.cross_entropy(F.normalize(input), target, reduction='none')
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [19]:
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum `alpha * ce + beta * rce` computed from logits.

    Args:
        alpha: Weight of the cross-entropy term.
        beta: Weight of the second ("rce") term.
        num_classes: Number of classes, used to build the one-hot targets.
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=5):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Return the loss of `logits` against integer class indices `targets`."""
        # Build the one-hot matrix on the logits' device instead of on the CPU
        # followed by `.cuda()`, so the loss runs on CPU and on any GPU.
        onehot_targets = torch.eye(self.num_classes, device=logits.device)[targets]
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        # NOTE(review): this term weights log(softmax) by the one-hot target,
        # which is the same quantity as the cross-entropy above. A reverse
        # cross-entropy would instead weight log(clamped one-hot) by the
        # predicted probabilities - confirm which is intended before changing
        # training behavior.
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [20]:
def log_t(u, t):
    """Tempered logarithm of tensor `u` at temperature `t` (plain log when t == 1)."""
    if t == 1.0:
        return u.log()
    one_minus_t = 1.0 - t
    return (u.pow(one_minus_t) - 1.0) / one_minus_t

def exp_t(u, t):
    """Tempered exponential of tensor `u` at temperature `t` (plain exp when t == 1)."""
    if t == 1:
        return u.exp()
    one_minus_t = 1.0 - t
    # relu clamps the base at zero before the power is taken.
    base = (1.0 + one_minus_t * u).relu()
    return base.pow(1.0 / one_minus_t)

def compute_normalization_fixed_point(activations, t, num_iters):
    """Fixed-point iteration for the tempered-softmax normalization (t > 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same shape as activation with the last dimension being 1.
    """
    # Shift by the per-example maximum before iterating.
    mu, _ = torch.max(activations, -1, keepdim=True)
    shifted = activations - mu

    current = shifted
    for _ in range(num_iters):
        partition = torch.sum(exp_t(current, t), -1, keepdim=True)
        current = shifted * partition.pow(1.0-t)

    partition = torch.sum(exp_t(current, t), -1, keepdim=True)
    return - log_t(1.0 / partition, t) + mu

def compute_normalization_binary_search(activations, t, num_iters):

    """Returns the normalization value for each example (t < 1.0).

    Bisects, per example, for the value that makes the tempered exponentials
    of the shifted activations sum to 1.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of bisection iterations to run.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    # Shift by the per-example maximum; it is added back to the result at the end.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    # Per example, count the entries above -1/(1-t); for t < 1 these are the
    # entries for which exp_t of the shifted activation is non-zero.
    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    # Initial bracket [lower, upper] for the search, one pair per example.
    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        # update == 1 where the midpoint makes the sum fall below 1: the
        # midpoint then becomes the new upper bound, otherwise the new lower bound.
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    # Return the midpoint of the final bracket, un-shifted.
    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Class implementing custom backward pass for compute_normalization. See compute_normalization.

    The forward pass runs one of the two iterative solvers; the backward pass
    computes the gradient in closed form from the saved tensors instead of
    differentiating through the iterations.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        # t < 1 uses the bisection solver, every other t the fixed-point solver.
        if t < 1.0:
            normalization_constants = compute_normalization_binary_search(activations, t, num_iters)
        else:
            normalization_constants = compute_normalization_fixed_point(activations, t, num_iters)

        # Keep what backward() needs: inputs, result and the temperature.
        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        t = ctx.t
        normalized_activations = activations - normalization_constants 
        probabilities = exp_t(normalized_activations, t)
        # probabilities**t renormalized over the last dimension.
        escorts = probabilities.pow(t)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)
        grad_input = escorts * grad_output
        
        # No gradient for the non-tensor arguments `t` and `num_iters`.
        return grad_input, None, None

def compute_normalization(activations, t, num_iters=5):
    """Normalization value for each example, with a custom backward pass.

    Thin wrapper that dispatches to `ComputeNormalization.apply`.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    normalization_constants = ComputeNormalization.apply(activations, t, num_iters)
    return normalization_constants

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    # Pair each activation with a zero activation for the other class.
    negative_class = torch.zeros_like(activations)
    two_class_activations = torch.stack([activations, negative_class], dim=-1)
    two_class_probabilities = tempered_softmax(two_class_activations, t, num_iters)
    return two_class_probabilities[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature; t == 1.0 falls back to the ordinary softmax.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    if t != 1.0:
        shift = compute_normalization(activations, t, num_iters)
        return exp_t(activations - shift, t)

    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):
    """Bi-Tempered binary logistic loss, evaluated as a two-class problem.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
      reduction: Forwarded unchanged to `bi_tempered_logistic_loss`.
    Returns:
      A loss tensor.
    """
    # Second class: zero activation, complementary label probability.
    positive = labels.to(activations.dtype)
    negative = 1.0 - labels.to(activations.dtype)
    two_class_activations = torch.stack(
        [activations, torch.zeros_like(activations)], dim=-1)
    two_class_labels = torch.stack([positive, negative], dim=-1)

    return bi_tempered_logistic_loss(
        two_class_activations,
        two_class_labels,
        t1,
        t2,
        label_smoothing=label_smoothing,
        num_iters=num_iters,
        reduction=reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over minibatch.
        ``'sum'``: Loss is summed over minibatch.
    Returns:
      A loss tensor.
    Raises:
      ValueError: If `reduction` is not one of the three supported values.
    """
    # Validate up front so a typo does not silently yield None after all the work.
    if reduction not in ('none', 'sum', 'mean'):
        raise ValueError(f"reduction must be 'none', 'mean' or 'sum', got {reduction!r}")

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        # Scatter along the class (last) dimension. Using dim 1 is only
        # equivalent for 2-D activations and mis-places the ones for inputs
        # with extra leading dimensions.
        labels_onehot.scatter_(-1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The 1e-10 offset keeps log_t finite where the (smoothed) label is exactly 0.
    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    return loss_values.mean()

In [21]:
class BiTemperedLogisticLoss(nn.Module):
    """`nn.Module` wrapper returning the batch-mean bi-tempered logistic loss.

    Args:
        t1: Temperature 1 forwarded to `bi_tempered_logistic_loss`.
        t2: Temperature 2 forwarded to `bi_tempered_logistic_loss`.
        smoothing: Label smoothing forwarded as `label_smoothing`.
    """
    def __init__(self, t1, t2, smoothing=0.0):
        super().__init__()
        self.t1, self.t2 = t1, t2
        self.smoothing = smoothing

    def forward(self, logit_label, truth_label):
        """Return the mean of the per-sample losses of `logit_label` vs. `truth_label`."""
        per_sample = bi_tempered_logistic_loss(
            logit_label,
            truth_label,
            t1=self.t1,
            t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none',
        )
        return per_sample.mean()

In [22]:
class AverageMeter(object):
    """Tracks the latest value together with a running weighted sum, count and mean."""
    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Report the elapsed time and a projection of the time remaining.

    Parameters
    ----------
    since : float
        Timestamp (as returned by `time.time()`) at which the run started.
    percent : float
        Fraction of the run completed so far; must be non-zero.

    Returns
    -------
    str
        '<elapsed> (remain <remaining>)', both formatted by `asMinutes`.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the completed fraction.
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

def train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, shechduler, device):
    """Run one training epoch with the two-step SAM optimizer.

    Every optimizer update consists of a forward/backward pass at the current
    weights, `optimizer.first_step`, a second forward/backward pass at the
    perturbed weights, and `optimizer.second_step`.

    Args:
        train_loader: Iterable of (images, labels) batches.
        model: Model being trained.
        loss_train: Loss that is back-propagated.
        loss_metric: Loss that is only recorded for reporting.
        optimizer: Optimizer exposing `first_step` / `second_step`.
        epoch: Zero-based epoch index, used for printing only.
        shechduler: Unused here; the caller steps the scheduler after each epoch.
        device: Device the batches are moved to.

    Returns:
        Sample-weighted average of `loss_metric` over the epoch.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accum_steps = max(1, CFG.gradient_accumulation_steps)
    last_step = len(train_loader) - 1
    # Batches whose gradients are currently accumulated. They are replayed for
    # the second SAM pass so that both passes see the same data.
    pending_batches = []
    grad_norm = 0.0
    # switch to train mode
    model.train()
    start = end = time.time()
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # first pass: gradient at the current weights
        y_preds = model(images)
        metric = loss_metric(y_preds, labels)
        loss = loss_train(y_preds, labels)
        # record loss
        losses.update(metric.item(), batch_size)
        if accum_steps > 1:
            loss = loss / accum_steps
        if CFG.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else: 
            loss.backward()
        pending_batches.append((images, labels))

        # The two SAM steps must always run as a pair. Calling second_step on
        # every batch while first_step runs only at accumulation boundaries
        # would restore weights that were never saved for that step.
        if (step + 1) % accum_steps == 0 or step == last_step:
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            optimizer.first_step(zero_grad=True)
            # second pass: gradient at the perturbed weights, same batches
            # NOTE(review): as in the original, this pass is not wrapped in
            # amp.scale_loss when CFG.apex is enabled.
            for held_images, held_labels in pending_batches:
                second_loss = loss_train(model(held_images), held_labels)
                if accum_steps > 1:
                    second_loss = second_loss / accum_steps
                second_loss.backward()
            optimizer.second_step(zero_grad=True)
            pending_batches = []

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}]'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})'
                  'Elapsed {remain:s}' 
                  'Loss: {loss.val:.4f}({loss.avg:.4f})' 
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, len(train_loader), batch_time=batch_time, 
                          data_time=data_time, loss=losses, 
                          remain=timeSince(start, float(step+1)/len(train_loader)), 
                          grad_norm=grad_norm))
    return losses.avg

def valid_fn(valid_loader, model, loss_metric, device):
    """Evaluate the model on the validation loader.

    Args:
        valid_loader: Iterable of (images, labels) batches.
        model: Model to evaluate; it is switched to eval mode.
        loss_metric: Loss recorded for reporting.
        device: Device the batches are moved to.

    Returns:
        Tuple of (sample-weighted average loss, array of softmax probabilities
        with one row per validation sample, in loader order).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute predictions and loss without building an autograd graph
        with torch.no_grad():
            y_preds = model(images)
            loss = loss_metric(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # keep the class probabilities for scoring
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
            
    predictions = np.concatenate(preds)
    return losses.avg, predictions

def inference(model, states, test_loader, device):
    """Predict class probabilities for the test set, averaged over checkpoints.

    Args:
        model: Model instance into which each checkpoint is loaded in turn.
        states: Iterable of checkpoint dicts holding the weights under 'model'.
        test_loader: Iterable of image batches.
        device: Device used for the forward passes.

    Returns:
        Array of probabilities with one row per test sample, in loader order.
    """
    model.to(device)
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    probs = []
    for i, (images) in tk0:
        images = images.to(device)
        # Initialised under the same name it is appended to below; the
        # mismatched name `avgpreds` raised a NameError on the first batch.
        avg_preds = []
        for state in states:
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            avg_preds.append(y_preds.softmax(1).to('cpu').numpy())
        # average the probabilities of all checkpoints for this batch
        avg_preds = np.mean(avg_preds, axis=0)
        probs.append(avg_preds)
    probs = np.concatenate(probs)
    return probs

## Train loop

In [23]:
# ======================================================
# Train loop
# ======================================================

def train_loop(folds, fold):
    """Train one cross-validation fold and return its out-of-fold predictions.

    Rows whose 'fold' differs from `fold` are used for training, the rest for
    validation. The backbone is frozen during the first epoch and unfrozen,
    with a freshly built optimizer and scheduler, from the second epoch on.
    A checkpoint is written after every epoch and whenever accuracy improves.

    Args:
        folds: DataFrame with 'image_id', the label column and a 'fold' column.
        fold: Index of the fold held out for validation.

    Returns:
        The validation rows with one probability column per class and a
        'preds' column holding the argmax, taken from the best-accuracy epoch.

    Raises:
        ValueError: If CFG.scheduler or CFG.loss_train names an unknown option.
    """
    seed_torch(seed=CFG.seed)

    LOGGER.info(f'========== fold: {fold} training ============')

    # ======================================================
    # loader
    # ======================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ===============================================
    # scheduler
    # ===============================================
    def get_scheduler(optimizer):
        # NOTE(review): the first two branches read CFG.factor / CFG.patience /
        # CFG.eps and CFG.T_max, which the CFG class shown in this notebook does
        # not define - add them before selecting those schedulers.
        if CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f'Unknown CFG.scheduler: {CFG.scheduler!r}')
        return scheduler

    # ===============================================
    # model & optimizer
    # ===============================================
    model = CustomEfficientNetB0ns(CFG.model_name, pretrained=True)

    # Freeze everything except the classifier layer for the first epoch.
    for name, param in model.model.named_parameters():
        if 'classifier' not in name:
            param.requires_grad=False

    model.to(device)

    base_optimizer = Adam
    optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_1, weight_decay=CFG.weight_decay, amsgrad=False)

    scheduler = get_scheduler(optimizer)

    # ===============================================
    # apex
    # ===============================================
    if CFG.apex:
        # amp.initialize returns the (model, optimizer) pair. Assigning the
        # result to `model.optimizer` left both local names un-patched.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ===============================================
    # loop
    # ===============================================
    def get_loss_train():
        if CFG.loss_train == 'CrossEntropyLoss':
            loss_train = nn.CrossEntropyLoss()
        elif CFG.loss_train == 'LabelSmoothing':
            loss_train = LabelSmoothingLoss(classes=CFG.target_size, smoothing=CFG.smooth)
        elif CFG.loss_train == 'FocalLoss':
            loss_train = FocalLoss().to(device)
        elif CFG.loss_train == 'FocalCosineLoss':
            loss_train = FocalCosineLoss()
        elif CFG.loss_train == 'SymmetricCrossEntropyLoss':
            loss_train = SymmetricCrossEntropy().to(device)
        elif CFG.loss_train == 'BiTemperedLoss':
            loss_train = BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smooth)
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f'Unknown CFG.loss_train: {CFG.loss_train!r}')
        return loss_train

    loss_train = get_loss_train()
    LOGGER.info(f'loss_train: {loss_train}')
    loss_metric = nn.CrossEntropyLoss()

    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        if epoch == 1:

            # Unfreeze all weights at the start of the second epoch.
            for param in model.model.parameters():
                param.requires_grad = True

            # Rebuild optimizer and scheduler with the lower learning rate
            # (CFG.lr_1 -> CFG.lr_2).
            # NOTE(review): when CFG.apex is enabled this new optimizer is not
            # passed through amp.initialize - confirm before using apex.
            base_optimizer = Adam
            optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_2, weight_decay=CFG.weight_decay, amsgrad=False)
            scheduler = get_scheduler(optimizer)

            LOGGER.info('requires_grad of all parameters are unlocked')

        # train
        avg_loss = train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, loss_metric, device)
        valid_labels = valid_folds[CFG.target_col].values

        # ReduceLROnPlateau is driven by the validation loss, the others by the epoch count.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')

        # Save every epoch's weights as well, for use at inference time.
        torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    # Attach the probabilities of the best epoch to the validation rows.
    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    # One column per class: follow CFG.target_size rather than a literal 5.
    valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)

    return valid_folds

In [24]:
# ====================================================
# main
# ====================================================
def main():
    """Run cross-validation training and/or test-set inference.

    Behaviour is driven by module-level configuration:

    * ``CFG.train`` -- for every fold index in ``range(CFG.n_fold)`` that is
      also listed in ``CFG.trn_fold``, call ``train_loop(folds, fold)``, log
      that fold's score, then log the score over all collected out-of-fold
      predictions and write them to ``OUTPUT_DIR + 'oof_df.csv'``.
    * ``CFG.inference`` -- load the ``*_best.pth`` checkpoint of every fold in
      ``CFG.trn_fold``, predict on ``test`` and write
      ``OUTPUT_DIR + 'submission.csv'``.

    Returns:
        None. Results are reported through ``LOGGER`` and written to disk.
    """

    def get_result(result_df):
        """Log the score of the ``preds`` column against the target column."""
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')

    if CFG.train:
        # Collect per-fold frames and concatenate once: avoids the quadratic
        # concat-in-a-loop pattern and concatenating onto an empty DataFrame.
        oof_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(folds, fold)
            oof_frames.append(_oof_df)
            LOGGER.info(f'=============== fold: {fold} result ================')
            get_result(_oof_df)

        if oof_frames:
            oof_df = pd.concat(oof_frames)
            # CV result
            LOGGER.info(f'============ CV ============')
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        else:
            # Previously this case crashed with KeyError('preds') on an empty
            # frame; report it explicitly instead.
            LOGGER.info('No fold in CFG.trn_fold was trained; skipping CV result and oof_df.csv')

    if CFG.inference:
        # inference
        model = CustomEfficientNetb0ns(CFG.model_name, pretrained=False)
        states = [torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth') for fold in CFG.trn_fold]
        # The original passed DataLoader keyword arguments to TestDataset and
        # then used an undefined `test_loader` (NameError). Build the dataset
        # and the loader as two separate objects.
        # NOTE(review): assumes TestDataset accepts `transform=` and that
        # get_transforms(data='valid') is the evaluation-time transform defined
        # earlier in this notebook -- confirm against those definitions.
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                                 num_workers=CFG.num_workers, pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        test['label'] = predictions.argmax(1)
        test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

In [25]:
# Log which compute device will be used; `device` is not defined in this cell
# and must already exist in the kernel namespace.
LOGGER.info(f'used device: {device}')

used device: cuda


In [26]:
# Entry point: run main(), which trains and/or runs inference depending on
# the CFG.train / CFG.inference flags.
if __name__ == '__main__':
    main()

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_b0_ns-c0e6a31c.pth
loss_train: BiTemperedLogisticLoss()


Epoch: [1][0/659]Data 2.079 (2.079)Elapsed 0m 4s (remain 45m 27s)Loss: 1.5950(1.5950)Grad: 0.6684  
Epoch: [1][20/659]Data 0.410 (0.266)Elapsed 0m 12s (remain 6m 19s)Loss: 0.9495(1.4705)Grad: 0.4298  
Epoch: [1][40/659]Data 0.166 (0.263)Elapsed 0m 22s (remain 5m 37s)Loss: 1.5944(1.5310)Grad: 0.2752  
Epoch: [1][60/659]Data 0.437 (0.278)Elapsed 0m 33s (remain 5m 28s)Loss: 0.8334(1.4926)Grad: 0.4466  
Epoch: [1][80/659]Data 0.449 (0.275)Elapsed 0m 43s (remain 5m 12s)Loss: 1.2794(1.4144)Grad: 0.3835  
Epoch: [1][100/659]Data 0.316 (0.281)Elapsed 0m 54s (remain 5m 3s)Loss: 1.5572(1.3729)Grad: 0.4062  
Epoch: [1][120/659]Data 0.135 (0.285)Elapsed 1m 5s (remain 4m 51s)Loss: 0.8932(1.3380)Grad: 0.3991  
Epoch: [1][140/659]Data 0.337 (0.284)Elapsed 1m 15s (remain 4m 38s)Loss: 1.3663(1.3055)Grad: 0.3033  
Epoch: [1][160/659]Data 0.489 (0.283)Elapsed 1m 26s (remain 4m 26s)Loss: 1.6031(1.2774)Grad: 0.3684  
Epoch: [1][180/659]Data 1.238 (0.285)Elapsed 1m 37s (remain 4m 17s)Loss: 1.4276(1.2555)Gra

Epoch 1 - avg_train_loss: 1.0742 avg_val_loss: 0.9084 time: 412s
Epoch 1 - Accuracy: 0.7118451025056948
Epoch 1 - Save Best Score: 0.7118 Model
requires_grad of all parameters are unlocked


EVAL: [164/165] Data 0.000 (0.315) Elapsed 1m 6s (remain 0m 0s) Loss: 1.1812(0.9084) 
Epoch: [2][0/659]Data 1.891 (1.891)Elapsed 0m 3s (remain 34m 30s)Loss: 1.2930(1.2930)Grad: 2.1135  
Epoch: [2][20/659]Data 0.000 (0.091)Elapsed 0m 17s (remain 9m 6s)Loss: 0.8756(0.9429)Grad: 1.6828  
Epoch: [2][40/659]Data 0.000 (0.047)Elapsed 0m 32s (remain 8m 3s)Loss: 0.9271(0.9107)Grad: 1.5012  
Epoch: [2][60/659]Data 0.000 (0.031)Elapsed 0m 46s (remain 7m 31s)Loss: 0.6637(0.8698)Grad: 1.4521  
Epoch: [2][80/659]Data 0.000 (0.024)Elapsed 1m 0s (remain 7m 13s)Loss: 1.3786(0.8862)Grad: 1.6239  
Epoch: [2][100/659]Data 0.000 (0.019)Elapsed 1m 14s (remain 6m 53s)Loss: 0.8578(0.8703)Grad: 1.2994  
Epoch: [2][120/659]Data 0.000 (0.016)Elapsed 1m 29s (remain 6m 36s)Loss: 1.1332(0.8443)Grad: 1.3345  
Epoch: [2][140/659]Data 0.000 (0.014)Elapsed 1m 43s (remain 6m 19s)Loss: 1.2947(0.8364)Grad: 1.3527  
Epoch: [2][160/659]Data 0.000 (0.012)Elapsed 1m 58s (remain 6m 5s)Loss: 0.4306(0.8174)Grad: 1.1485  
Epoch:

Epoch 2 - avg_train_loss: 0.7069 avg_val_loss: 0.6937 time: 535s
Epoch 2 - Accuracy: 0.8509870918754746
Epoch 2 - Save Best Score: 0.8510 Model


EVAL: [164/165] Data 0.000 (0.284) Elapsed 1m 1s (remain 0m 0s) Loss: 1.3829(0.6937) 
Epoch: [3][0/659]Data 2.267 (2.267)Elapsed 0m 3s (remain 41m 10s)Loss: 0.2362(0.2362)Grad: 1.1067  
Epoch: [3][20/659]Data 0.000 (0.109)Elapsed 0m 19s (remain 9m 38s)Loss: 0.4196(0.6079)Grad: 1.1851  
Epoch: [3][40/659]Data 0.000 (0.056)Elapsed 0m 32s (remain 8m 16s)Loss: 0.7603(0.5980)Grad: 0.9765  
Epoch: [3][60/659]Data 0.000 (0.038)Elapsed 0m 47s (remain 7m 45s)Loss: 0.8801(0.6008)Grad: 1.3798  
Epoch: [3][80/659]Data 0.000 (0.028)Elapsed 1m 2s (remain 7m 23s)Loss: 1.0166(0.6197)Grad: 1.1265  
Epoch: [3][100/659]Data 0.000 (0.023)Elapsed 1m 16s (remain 7m 3s)Loss: 0.5883(0.6241)Grad: 1.3487  
Epoch: [3][120/659]Data 0.000 (0.019)Elapsed 1m 30s (remain 6m 42s)Loss: 0.4446(0.6220)Grad: 1.1225  
Epoch: [3][140/659]Data 0.000 (0.017)Elapsed 1m 44s (remain 6m 25s)Loss: 0.5859(0.6258)Grad: 1.0427  
Epoch: [3][160/659]Data 0.000 (0.015)Elapsed 1m 59s (remain 6m 8s)Loss: 0.3763(0.6304)Grad: 0.9561  
Epoch

Epoch 3 - avg_train_loss: 0.6297 avg_val_loss: 0.6063 time: 536s
Epoch 3 - Accuracy: 0.8673120728929385
Epoch 3 - Save Best Score: 0.8673 Model


EVAL: [164/165] Data 0.015 (0.286) Elapsed 1m 2s (remain 0m 0s) Loss: 1.3494(0.6063) 
Epoch: [4][0/659]Data 2.303 (2.303)Elapsed 0m 3s (remain 36m 4s)Loss: 0.9179(0.9179)Grad: 1.4027  
Epoch: [4][20/659]Data 0.000 (0.111)Elapsed 0m 19s (remain 9m 43s)Loss: 0.2422(0.5432)Grad: 1.0325  
Epoch: [4][40/659]Data 0.000 (0.057)Elapsed 0m 33s (remain 8m 20s)Loss: 0.3803(0.5697)Grad: 1.0718  
Epoch: [4][60/659]Data 0.000 (0.038)Elapsed 0m 47s (remain 7m 45s)Loss: 0.6230(0.6580)Grad: 0.9413  
Epoch: [4][80/659]Data 0.004 (0.029)Elapsed 1m 1s (remain 7m 21s)Loss: 0.4494(0.6264)Grad: 1.4868  
Epoch: [4][100/659]Data 0.000 (0.023)Elapsed 1m 16s (remain 7m 0s)Loss: 1.0021(0.5967)Grad: 1.0373  
Epoch: [4][120/659]Data 0.000 (0.019)Elapsed 1m 30s (remain 6m 42s)Loss: 0.4757(0.5759)Grad: 1.5656  
Epoch: [4][140/659]Data 0.000 (0.017)Elapsed 1m 44s (remain 6m 24s)Loss: 1.0979(0.6160)Grad: 1.0301  
Epoch: [4][160/659]Data 0.000 (0.015)Elapsed 1m 58s (remain 6m 7s)Loss: 1.4722(0.6218)Grad: 1.4629  
Epoch:

Epoch 4 - avg_train_loss: 0.5956 avg_val_loss: 0.5868 time: 543s
Epoch 4 - Accuracy: 0.873006833712984
Epoch 4 - Save Best Score: 0.8730 Model


EVAL: [164/165] Data 0.052 (0.306) Elapsed 1m 5s (remain 0m 0s) Loss: 1.1321(0.5868) 
Epoch: [5][0/659]Data 2.649 (2.649)Elapsed 0m 4s (remain 46m 4s)Loss: 0.5005(0.5005)Grad: 1.2585  
Epoch: [5][20/659]Data 0.000 (0.127)Elapsed 0m 19s (remain 9m 47s)Loss: 1.1311(0.5780)Grad: 0.9424  
Epoch: [5][40/659]Data 0.000 (0.065)Elapsed 0m 33s (remain 8m 26s)Loss: 0.1855(0.5554)Grad: 0.7806  
Epoch: [5][60/659]Data 0.000 (0.044)Elapsed 0m 47s (remain 7m 46s)Loss: 1.5768(0.5509)Grad: 0.9062  
Epoch: [5][80/659]Data 0.004 (0.033)Elapsed 1m 2s (remain 7m 26s)Loss: 0.4707(0.5354)Grad: 0.6901  
Epoch: [5][100/659]Data 0.000 (0.027)Elapsed 1m 16s (remain 7m 3s)Loss: 0.8490(0.5592)Grad: 0.6598  
Epoch: [5][120/659]Data 0.000 (0.022)Elapsed 1m 31s (remain 6m 45s)Loss: 0.3273(0.5702)Grad: 0.9613  
Epoch: [5][140/659]Data 0.000 (0.019)Elapsed 1m 45s (remain 6m 26s)Loss: 0.4154(0.5544)Grad: 0.9030  
Epoch: [5][160/659]Data 0.000 (0.017)Elapsed 2m 0s (remain 6m 11s)Loss: 0.9062(0.5646)Grad: 0.9793  
Epoch:

Epoch 5 - avg_train_loss: 0.5704 avg_val_loss: 0.5808 time: 541s
Epoch 5 - Accuracy: 0.881169324221716
Epoch 5 - Save Best Score: 0.8812 Model


EVAL: [164/165] Data 0.059 (0.289) Elapsed 1m 3s (remain 0m 0s) Loss: 1.0761(0.5808) 
Epoch: [6][0/659]Data 2.238 (2.238)Elapsed 0m 3s (remain 35m 41s)Loss: 0.6287(0.6287)Grad: 1.0103  
Epoch: [6][20/659]Data 0.000 (0.107)Elapsed 0m 18s (remain 9m 6s)Loss: 0.2936(0.6413)Grad: 1.4582  
Epoch: [6][40/659]Data 0.002 (0.055)Elapsed 0m 32s (remain 8m 9s)Loss: 0.3472(0.6917)Grad: 1.0732  
Epoch: [6][60/659]Data 0.000 (0.037)Elapsed 0m 46s (remain 7m 35s)Loss: 0.5489(0.6155)Grad: 0.9653  
Epoch: [6][80/659]Data 0.000 (0.028)Elapsed 1m 1s (remain 7m 19s)Loss: 0.5158(0.6174)Grad: 0.9847  
Epoch: [6][100/659]Data 0.000 (0.023)Elapsed 1m 15s (remain 6m 58s)Loss: 0.2164(0.5883)Grad: 0.9434  
Epoch: [6][120/659]Data 0.000 (0.019)Elapsed 1m 30s (remain 6m 41s)Loss: 0.4012(0.5800)Grad: 1.1517  
Epoch: [6][140/659]Data 0.000 (0.016)Elapsed 1m 44s (remain 6m 22s)Loss: 1.0335(0.5799)Grad: 1.2473  
Epoch: [6][160/659]Data 0.000 (0.014)Elapsed 1m 58s (remain 6m 6s)Loss: 0.3992(0.5855)Grad: 1.1013  
Epoch:

Epoch 6 - avg_train_loss: 0.5496 avg_val_loss: 0.5774 time: 536s
Epoch 6 - Accuracy: 0.884206529992407
Epoch 6 - Save Best Score: 0.8842 Model


EVAL: [164/165] Data 0.169 (0.285) Elapsed 1m 2s (remain 0m 0s) Loss: 0.6657(0.5774) 
Epoch: [7][0/659]Data 2.736 (2.736)Elapsed 0m 3s (remain 39m 37s)Loss: 1.4231(1.4231)Grad: 1.1842  
Epoch: [7][20/659]Data 0.000 (0.132)Elapsed 0m 18s (remain 9m 17s)Loss: 0.2044(0.4527)Grad: 0.9522  
Epoch: [7][40/659]Data 0.000 (0.068)Elapsed 0m 32s (remain 8m 10s)Loss: 0.8456(0.4916)Grad: 1.0998  
Epoch: [7][60/659]Data 0.000 (0.046)Elapsed 0m 46s (remain 7m 35s)Loss: 0.9292(0.5448)Grad: 0.9999  
Epoch: [7][80/659]Data 0.000 (0.035)Elapsed 1m 1s (remain 7m 17s)Loss: 0.8781(0.5300)Grad: 1.1525  
Epoch: [7][100/659]Data 0.000 (0.028)Elapsed 1m 15s (remain 6m 58s)Loss: 1.7854(0.5311)Grad: 1.0397  
Epoch: [7][120/659]Data 0.000 (0.023)Elapsed 1m 29s (remain 6m 39s)Loss: 1.5952(0.5576)Grad: 1.0275  
Epoch: [7][140/659]Data 0.000 (0.020)Elapsed 1m 43s (remain 6m 21s)Loss: 0.2480(0.5731)Grad: 1.2351  
Epoch: [7][160/659]Data 0.000 (0.018)Elapsed 1m 57s (remain 6m 4s)Loss: 0.2173(0.5775)Grad: 0.6092  
Epoc

Epoch 7 - avg_train_loss: 0.5486 avg_val_loss: 0.5431 time: 536s
Epoch 7 - Accuracy: 0.876993166287016


EVAL: [164/165] Data 0.000 (0.290) Elapsed 1m 2s (remain 0m 0s) Loss: 0.5214(0.5431) 
Epoch: [8][0/659]Data 1.814 (1.814)Elapsed 0m 3s (remain 33m 49s)Loss: 0.8568(0.8568)Grad: 0.9325  
Epoch: [8][20/659]Data 0.000 (0.087)Elapsed 0m 19s (remain 9m 43s)Loss: 0.3891(0.5199)Grad: 0.4128  
Epoch: [8][40/659]Data 0.000 (0.045)Elapsed 0m 33s (remain 8m 19s)Loss: 0.5990(0.5035)Grad: 0.9435  
Epoch: [8][60/659]Data 0.000 (0.030)Elapsed 0m 47s (remain 7m 43s)Loss: 0.4136(0.5016)Grad: 1.0613  
Epoch: [8][80/659]Data 0.000 (0.023)Elapsed 1m 1s (remain 7m 17s)Loss: 0.3996(0.5005)Grad: 1.0130  
Epoch: [8][100/659]Data 0.000 (0.018)Elapsed 1m 16s (remain 7m 0s)Loss: 0.2713(0.5025)Grad: 0.7657  
Epoch: [8][120/659]Data 0.000 (0.015)Elapsed 1m 30s (remain 6m 41s)Loss: 0.2947(0.4940)Grad: 0.9070  
Epoch: [8][140/659]Data 0.002 (0.013)Elapsed 1m 44s (remain 6m 24s)Loss: 0.5865(0.4833)Grad: 0.7104  
Epoch: [8][160/659]Data 0.000 (0.012)Elapsed 1m 58s (remain 6m 7s)Loss: 0.6544(0.5000)Grad: 1.1272  
Epoch

Epoch 8 - avg_train_loss: 0.5133 avg_val_loss: 0.5348 time: 538s
Epoch 8 - Accuracy: 0.884206529992407


EVAL: [164/165] Data 0.000 (0.286) Elapsed 1m 2s (remain 0m 0s) Loss: 0.6380(0.5348) 
Epoch: [9][0/659]Data 1.973 (1.973)Elapsed 0m 2s (remain 32m 18s)Loss: 0.4634(0.4634)Grad: 0.9014  
Epoch: [9][20/659]Data 0.000 (0.095)Elapsed 0m 18s (remain 9m 32s)Loss: 1.3495(0.6075)Grad: 1.2239  
Epoch: [9][40/659]Data 0.000 (0.049)Elapsed 0m 32s (remain 8m 15s)Loss: 0.1140(0.6016)Grad: 0.7630  
Epoch: [9][60/659]Data 0.000 (0.033)Elapsed 0m 46s (remain 7m 39s)Loss: 0.2510(0.5459)Grad: 0.5809  
Epoch: [9][80/659]Data 0.000 (0.025)Elapsed 1m 1s (remain 7m 18s)Loss: 1.3427(0.5199)Grad: 0.9586  
Epoch: [9][100/659]Data 0.000 (0.020)Elapsed 1m 16s (remain 7m 1s)Loss: 0.1913(0.5159)Grad: 1.1050  
Epoch: [9][120/659]Data 0.000 (0.017)Elapsed 1m 30s (remain 6m 41s)Loss: 0.2568(0.5269)Grad: 0.9105  
Epoch: [9][140/659]Data 0.000 (0.014)Elapsed 1m 44s (remain 6m 23s)Loss: 0.3408(0.5119)Grad: 0.9612  
Epoch: [9][160/659]Data 0.000 (0.013)Elapsed 1m 58s (remain 6m 7s)Loss: 0.1430(0.5110)Grad: 0.9329  
Epoch

Epoch 9 - avg_train_loss: 0.5198 avg_val_loss: 0.5493 time: 537s
Epoch 9 - Accuracy: 0.8845861807137434
Epoch 9 - Save Best Score: 0.8846 Model


EVAL: [164/165] Data 0.000 (0.290) Elapsed 1m 2s (remain 0m 0s) Loss: 0.5777(0.5493) 
Epoch: [10][0/659]Data 2.059 (2.059)Elapsed 0m 3s (remain 33m 1s)Loss: 0.3766(0.3766)Grad: 1.3122  
Epoch: [10][20/659]Data 0.000 (0.099)Elapsed 0m 18s (remain 9m 24s)Loss: 0.2742(0.4069)Grad: 1.6748  
Epoch: [10][40/659]Data 0.000 (0.051)Elapsed 0m 33s (remain 8m 18s)Loss: 0.1074(0.4076)Grad: 0.7538  
Epoch: [10][60/659]Data 0.000 (0.034)Elapsed 0m 46s (remain 7m 39s)Loss: 0.2006(0.4529)Grad: 0.6015  
Epoch: [10][80/659]Data 0.000 (0.026)Elapsed 1m 1s (remain 7m 16s)Loss: 0.2503(0.4326)Grad: 0.6862  
Epoch: [10][100/659]Data 0.001 (0.021)Elapsed 1m 15s (remain 6m 59s)Loss: 0.3117(0.4384)Grad: 1.0045  
Epoch: [10][120/659]Data 0.000 (0.017)Elapsed 1m 30s (remain 6m 41s)Loss: 0.1726(0.4344)Grad: 0.9453  
Epoch: [10][140/659]Data 0.000 (0.015)Elapsed 1m 44s (remain 6m 23s)Loss: 0.6323(0.4441)Grad: 0.7402  
Epoch: [10][160/659]Data 0.000 (0.013)Elapsed 1m 58s (remain 6m 6s)Loss: 0.8580(0.4624)Grad: 0.684

Epoch 10 - avg_train_loss: 0.5050 avg_val_loss: 0.5422 time: 539s
Epoch 10 - Accuracy: 0.8830675778283978


EVAL: [164/165] Data 0.000 (0.291) Elapsed 1m 3s (remain 0m 0s) Loss: 0.5808(0.5422) 


Score: 0.88459
Score: 0.88459
