In [1]:
# !pip install pytorch_ranger

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [3]:
# List the competition input directory; as the last expression of the cell, the
# resulting file/folder names are shown as the cell output.
os.listdir('../input/cassava-leaf-disease-classification/')

['train_tfrecords',
 'sample_submission.csv',
 'test_tfrecords',
 'label_num_to_disease_map.json',
 'train_images',
 'train.csv',
 'test_images']

In [4]:
# `train` is read from merged.csv, `test` from the competition's sample submission,
# and `label_map` maps each numeric class index to its disease name.
train = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
test = pd.read_csv('../input/cassava-leaf-disease-classification//sample_submission.csv')
label_map = pd.read_json('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json', orient='index')

# Preview the three frames to sanity-check their columns.
display(train.head())
display(test.head())
display(label_map)

Unnamed: 0,image_id,label,source
0,1000015157.jpg,0,2020
1,1000201771.jpg,3,2020
2,100042118.jpg,1,2020
3,1000723321.jpg,1,2020
4,1000812911.jpg,3,2020


Unnamed: 0,image_id,label
0,2216849948.jpg,4


Unnamed: 0,0
0,Cassava Bacterial Blight (CBB)
1,Cassava Brown Streak Disease (CBSD)
2,Cassava Green Mottle (CGM)
3,Cassava Mosaic Disease (CMD)
4,Healthy


## Directory settings

In [5]:
# Directory that receives the log file, model checkpoints and OOF predictions.
OUTPUT_DIR = './'
# exist_ok=True keeps this cell idempotent on re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Folders the datasets read training / test images from.
TRAIN_PATH = '../input/cassava-leaf-disease-merged/train'
TEST_PATH = '../input/cassava-leaf-disease-classification/test_images'

## CFG

In [6]:
class CFG:
    """Central experiment configuration, read everywhere as class attributes (CFG.x)."""
    debug = False                 # True: fewer epochs and a 1000-row sample (see the check right after this class)
    apex = False                  # True: use apex mixed precision in the training code
    print_freq = 20               # print progress every N steps in train_fn / valid_fn
    num_workers = 4               # DataLoader worker processes
    model_name = 'tf_efficientnet_b1_ns'  # timm architecture name
    size = 410                    # side length images are cropped / resized to
    scheduler = 'CosineAnnealingWarmRestarts'  # one of the names handled in train_loop.get_scheduler
    loss_train = 'BiTemperedLoss' # one of the names handled in train_loop.get_loss_train
    epochs = 10
    T_0 = 10                      # CosineAnnealingWarmRestarts: iterations until the first restart
    T_max = 10                    # CosineAnnealingLR: half-period (only read when that scheduler is selected)
    factor = 0.2                  # ReduceLROnPlateau: LR decay factor (only read when that scheduler is selected)
    patience = 4                  # ReduceLROnPlateau: epochs without improvement before decaying
    eps = 1e-6                    # ReduceLROnPlateau: minimal LR change that is still applied
    lr_1 = 1e-3                   # learning rate while only the classifier head is trainable
    lr_2 = 1e-4                   # learning rate after the whole network is unfrozen
    t1 = 0.9                      # bi-tempered loss temperature 1
    t2 = 1.5                      # bi-tempered loss temperature 2
    smooth = 1e-2                 # label smoothing passed to the smoothing-aware losses
    min_lr = 1e-6                 # eta_min of the cosine schedulers
    batch_size = 32
    weight_decay = 1e-6
    gradient_accumulation_steps = 1
    max_grad_norm = 1000          # gradient-norm clipping threshold
    seed = 42
    target_size = 5               # number of classes
    target_col = 'label'
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]    # folds eligible for training
    train = True                  # run the training branch of main()
    inference = False             # run the inference branch of main()
    
# Debug mode: train for only 3 epochs on a reproducible 1000-row sample of `train`.
if CFG.debug:
    CFG.epochs = 3
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

## Library

In [7]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

sys.path.append('../input/bi-tempered-loss-pytorch')
from bi_tempered_loss import *

# sys.path.append('../input/pytorch-optimizer')
# import torch_optimizer as optim

sys.path.append('../input/pytorch-sam')
from sam import SAM

from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
warnings.filterwarnings('ignore')

# apex is imported only when CFG.apex is enabled, so it is not required otherwise.
if CFG.apex:
    from apex import amp

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [8]:
def get_score(y_true, y_pred):
    """Return the accuracy of the predicted labels `y_pred` against `y_true`."""
    score = accuracy_score(y_true, y_pred)
    return score

@contextmanager
def timer(name):
    """Context manager that logs when the wrapped block starts and how many seconds it took.

    The closing message is emitted only when the block finishes without raising.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f}')
    
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create the notebook logger writing plain messages to the console and to `log_file`.

    Args:
        log_file: Path of the log file. The default is evaluated once, when this
            function is defined.

    Returns:
        The configured `logging.Logger` for this module.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell would
    # otherwise stack another pair of handlers and duplicate every log line.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Module-level logger used by timer(), train_loop() and main().
LOGGER = init_logger()

def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs and make cuDNN deterministic."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Deterministic kernels on, auto-tuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs with the configured seed before splitting data or training.
seed_torch(seed=CFG.seed)

## CV split

In [9]:
# Assign every row of `train` to one of CFG.n_fold stratified folds; the fold number
# is stored in a new 'fold' column of the copy `folds`.
folds = train.copy()
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    # val_index holds positional indices, used here as .loc labels
    # (assumes `folds` has a default 0..n-1 index — TODO confirm for custom inputs).
    folds.loc[val_index, 'fold'] = int(n)
# Cast the 'fold' column to int once every row has been assigned.
folds['fold'] = folds['fold'].astype(int)
# Show the class distribution per fold to confirm the stratification.
print(folds.groupby(['fold', CFG.target_col]).size())

fold  label
0     0         299
      1         695
      2         604
      3        3092
      4         578
1     0         299
      1         695
      2         604
      3        3092
      4         578
2     0         298
      1         695
      2         603
      3        3093
      4         578
3     0         298
      1         695
      2         603
      3        3093
      4         578
4     0         298
      1         696
      2         603
      3        3092
      4         578
dtype: int64


## Dataset

In [10]:
class TrainDataset(Dataset):
    """Dataset yielding (image, label) pairs for the rows of a labelled DataFrame.

    Images are read from TRAIN_PATH using the 'image_id' column; labels come from
    the 'label' column.
    """
    def __init__(self, df, transform=None):
        """
        Args:
            df: DataFrame with 'image_id' and 'label' columns.
            transform: Optional callable invoked as transform(image=...) that returns
                a dict with an 'image' entry.
        """
        self.df = df
        self.file_names = df['image_id'].values
        self.labels = df['label'].values
#         self.labels = pd.get_dummies(df['label']).values  # One Hot Encoding
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}'
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None; fail
            # here with the path instead of with an opaque cvtColor assertion.
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).long()
        return image, label
    
class TestDataset(Dataset):
    """Dataset yielding images only, read from TEST_PATH by the 'image_id' column."""
    def __init__(self, df, transform=None):
        """
        Args:
            df: DataFrame with an 'image_id' column.
            transform: Optional callable invoked as transform(image=...) that returns
                a dict with an 'image' entry.
        """
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None; fail
            # here with the path instead of with an opaque cvtColor assertion.
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

In [11]:
# train_dataset = TrainDataset(train, transform=None)

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image)
#     plt.title(f'label: {label}')
#     plt.show()

## Transforms

In [12]:
def get_transforms(*, data):
    """Build the albumentations pipeline for the requested split.

    Args:
        data: 'train' for the augmenting pipeline, 'valid' for the deterministic
            resize-and-normalize pipeline.

    Returns:
        A `Compose` whose output dict holds a normalized tensor under 'image'.

    Raises:
        ValueError: If `data` is neither 'train' nor 'valid'.
    """

    if data == 'train':
        return Compose([
            RandomResizedCrop(CFG.size, CFG.size), 
            Transpose(p=0.5), 
            HorizontalFlip(p=0.5), 
            VerticalFlip(p=0.5), 
            ShiftScaleRotate(p=0.5), 
            HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5), 
            RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5), 
            Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
            ), 
            ToTensorV2(),
        ])

    elif data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size), 
            Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
            ), 
            ToTensorV2(),
        ])

    # Previously an unknown split fell through and returned None, which the datasets
    # interpret as "no transform" and hand raw arrays to the model.
    raise ValueError(f"Unknown data split {data!r}; expected 'train' or 'valid'")

In [13]:
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image[0])
#     plt.title(f'label: {label}')
#     plt.show()

## MODEL

In [14]:
class CustomEfficientNetB1ns(nn.Module):
    """timm model whose `classifier` layer is replaced by a CFG.target_size-way linear head."""
    def __init__(self, model_name='tf_efficientnet_b1_ns', pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Swap the stock head for one sized to this task, keeping its input width.
        in_features = backbone.classifier.in_features
        backbone.classifier = nn.Linear(in_features, CFG.target_size)
        self.model = backbone

    def forward(self, x):
        """Return the logits produced by the wrapped model for batch `x`."""
        return self.model(x)

In [15]:
# model = CustomEfficientNetB1ns(model_name=CFG.model_name, pretrained=False)
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, 
#                           num_workers=4, pin_memory=True, drop_last=True)

# for image, label in train_loader:
#     print(image.size())
#     output = model(image)
#     print(output)
#     break

## Loss Functions

In [16]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a smoothed target distribution.

    The true class receives probability `1 - smoothing`; the remaining `smoothing`
    mass is spread evenly over the other `classes - 1` classes.
    """
    def __init__(self, classes=5, smoothing=0.0, dim=-1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy of logits `pred` for class indices `target`."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Targets are constants: build them outside the autograd graph.
            off_value = self.smoothing / (self.cls - 1)
            true_dist = torch.full_like(log_probs, off_value)
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = (-true_dist * log_probs).sum(dim=self.dim)
        return per_sample.mean()

In [17]:
class FocalLoss(nn.Module):
    """Multi-class focal loss: alpha * (1 - p_t) ** gamma * CE, computed per sample.

    Args:
        alpha: Global scale applied to the loss.
        gamma: Focusing exponent; larger values down-weight well-classified samples.
        reduce: If True return the batch mean, otherwise the per-sample losses.
    """
    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Return the focal loss of logits `inputs` for class indices `targets`."""
        # The cross-entropy must stay per-sample: with the default 'mean' reduction
        # the focal weight would be applied to the batch average, which removes the
        # per-sample re-weighting and makes `reduce=False` return a scalar.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-ce_loss)  # probability assigned to the true class
        F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [18]:
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss against the one-hot target plus a weighted focal loss.

    Args:
        alpha: Scale of the focal term.
        gamma: Focusing exponent of the focal term.
        xent: Weight of the focal term relative to the cosine term.
    """
    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        # Target "+1" (= similar pair) for the cosine embedding loss. Kept on the CPU
        # here and moved to the input's device per call, so the loss can be built and
        # used without CUDA.
        self.y = torch.Tensor([1])

    def forward(self, input, target, reduction="mean"):
        """Return the combined loss of logits `input` for class indices `target`."""
        onehot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        cosine_loss = F.cosine_embedding_loss(input, onehot, self.y.to(input.device), reduction=reduction)

        # `reduction='none'` is the supported spelling of the deprecated `reduce=False`.
        cent_loss = F.cross_entropy(F.normalize(input), target, reduction='none')
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [19]:
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum `alpha * CE + beta * RCE` of two cross-entropy style terms.

    Args:
        alpha: Weight of the standard cross-entropy term.
        beta: Weight of the second ("reverse") term.
        num_classes: Number of classes used to one-hot encode the targets.
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=5):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Return the loss of `logits` (N, num_classes) for class indices `targets` (N,)."""
        # Build the one-hot matrix on the logits' device instead of hard-coding
        # `.cuda()`, so the loss also works on CPU and on non-default GPUs.
        onehot_targets = torch.eye(self.num_classes, device=logits.device)[targets]
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        # NOTE(review): this term is -sum(onehot * log(softmax)), i.e. a clamped CE;
        # the published symmetric CE uses -sum(softmax * log(clamped onehot)).
        # Left numerically unchanged here — confirm which form is intended.
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [20]:
def log_t(u, t):
    """Tempered logarithm of tensor `u` at temperature `t` (natural log when t == 1)."""
    if t != 1.0:
        return (u.pow(1.0 - t) - 1.0) / (1.0 - t)
    return u.log()

def exp_t(u, t):
    """Tempered exponential of tensor `u` at temperature `t` (plain exp when t == 1)."""
    if t != 1:
        # relu() clamps the base at zero before the power is taken.
        base = (1.0 + (1.0-t)*u).relu()
        return base.pow(1.0 / (1.0 - t))
    return u.exp()

def compute_normalization_fixed_point(activations, t, num_iters):
    """Fixed-point iteration for the tempered-softmax normalization (used for t > 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of fixed-point iterations to run.
    Return: A tensor shaped like `activations` except that the last dimension is 1.
    """
    # Shift by the per-example maximum so the largest activation becomes 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    shifted = activations - mu

    current = shifted
    for _ in range(num_iters):
        partition = exp_t(current, t).sum(-1, keepdim=True)
        current = shifted * partition.pow(1.0-t)

    partition = exp_t(current, t).sum(-1, keepdim=True)
    # Add the shift back in at the end.
    return - log_t(1.0 / partition, t) + mu

def compute_normalization_binary_search(activations, t, num_iters):

    """Bisection search for the tempered-softmax normalization (used for t < 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of bisection steps to run.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    # Shift by the per-example maximum so the largest activation becomes 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    # Per example, count the classes whose shifted activation lies above -1/(1-t).
    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    # Initial bracket [lower, upper] for the normalization, one value per example.
    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        # update == 1 where the probabilities at the midpoint sum to less than 1.
        update = (sum_probs < 1.0).to(activations.dtype)
        # Where update == 1 only `upper` moves to the midpoint; elsewhere only
        # `lower` does. The 0/1 mask implements this selection arithmetically.
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    # Return the midpoint of the final bracket, with the shift added back.
    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Autograd function wrapping the iterative normalization solvers with a
    hand-written backward pass. See compute_normalization.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        # Bisection handles t < 1.0; the fixed-point iteration handles the rest.
        if t < 1.0:
            solver = compute_normalization_binary_search
        else:
            solver = compute_normalization_fixed_point
        normalization_constants = solver(activations, t, num_iters)

        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        temperature = ctx.t
        probabilities = exp_t(activations - normalization_constants, temperature)
        # The gradient w.r.t. activations is grad_output times the probabilities raised
        # to the power t and renormalized over the class axis.
        escorts = probabilities.pow(temperature)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)

        # No gradients for `t` and `num_iters`.
        return escorts * grad_output, None, None

def compute_normalization(activations, t, num_iters=5):
    """Differentiable normalization value for each example.

    Delegates to ComputeNormalization, which supplies the backward pass.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    normalization_constants = ComputeNormalization.apply(activations, t, num_iters)
    return normalization_constants

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    # Pair every activation with a zero logit for the implicit negative class.
    negative_logits = torch.zeros_like(activations)
    two_class = torch.stack([activations, negative_logits], dim=-1)
    # Keep only the probability of the positive class.
    return tempered_softmax(two_class, t, num_iters)[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature > 1.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    if t != 1.0:
        normalization_constants = compute_normalization(activations, t, num_iters)
        return exp_t(activations - normalization_constants, t)
    # t == 1.0 is the ordinary softmax.
    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss, expressed as the two-class general loss.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
    Returns:
      A loss tensor.
    """
    positive = labels.to(activations.dtype)
    # Class 1 keeps its activation / probability; the complementary class gets a
    # zero activation and the remaining probability mass.
    two_class_activations = torch.stack(
        [activations, torch.zeros_like(activations)], dim=-1)
    two_class_labels = torch.stack([positive, 1.0 - positive], dim=-1)
    return bi_tempered_logistic_loss(
        two_class_activations,
        two_class_labels,
        t1,
        t2,
        label_smoothing = label_smoothing,
        num_iters = num_iters,
        reduction = reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over minibatch. Return shape (1,)
        ``'sum'``: Loss is summed over minibatch. Return shape (1,)
    Returns:
      A loss tensor.
    Raises:
      ValueError: If `reduction` is not one of the three supported values.
    """
    # Reject typos up front instead of silently returning None at the end.
    if reduction not in ('none', 'mean', 'sum'):
        raise ValueError(f"Unsupported reduction {reduction!r}; expected 'none', 'mean' or 'sum'")

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        # Classes live on the LAST axis (see `activations` above); scattering on a
        # fixed dim 1 was only correct for 2-D inputs.
        labels_onehot.scatter_(-1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The 1e-10 keeps log_t finite for zero entries of the one-hot labels.
    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    return loss_values.mean()

In [21]:
class BiTemperedLogisticLoss(nn.Module):
    """nn.Module wrapper around bi_tempered_logistic_loss that returns the batch mean.

    Args:
        t1: Temperature 1 forwarded to the loss.
        t2: Temperature 2 forwarded to the loss.
        smoothing: Label smoothing forwarded to the loss.
    """
    def __init__(self, t1, t2, smoothing=0.0):
        super().__init__()
        self.t1, self.t2 = t1, t2
        self.smoothing = smoothing

    def forward(self, logit_label, truth_label):
        """Return the mean bi-tempered loss of `logit_label` against `truth_label`."""
        per_sample = bi_tempered_logistic_loss(
            logit_label,
            truth_label,
            t1=self.t1,
            t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none',
        )
        return per_sample.mean()

In [22]:
class AverageMeter(object):
    """Tracks the latest value together with a running weighted sum, count and average."""
    def __init__(self):
        self.reset()

    def reset(self):
        """Reset all statistics to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Report the elapsed time and a linear estimate of the time remaining.

    Parameters
    ----------
    since : float
        Timestamp (as returned by time.time()) at which the work started.
    percent : float
        Fraction of the work completed so far; must be non-zero.

    Returns
    -------
    str
        '<elapsed> (remain <remaining>)', both parts formatted by asMinutes.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the progress made so far.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

def train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, shechduler, device):
    """Train `model` for one epoch with a two-step SAM-style optimizer.

    Args:
        train_loader: Iterable of (images, labels) batches.
        model: Network to optimize.
        loss_train: Loss that is back-propagated.
        loss_metric: Loss that is only recorded for reporting.
        optimizer: Optimizer exposing `first_step(zero_grad=...)` and
            `second_step(zero_grad=...)`.
        epoch: Zero-based epoch index, used only for the progress line.
        shechduler: Unused; kept (including its spelling) for caller compatibility.
        device: Device the batches are moved to.

    Returns:
        The sample-weighted average of `loss_metric` over the epoch.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    # Start from clean gradients so a partial accumulation window left over from a
    # previous call cannot leak into this epoch.
    optimizer.zero_grad()
    start = end = time.time()
    global_step = 0
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(images)
        metric = loss_metric(y_preds, labels)
        loss = loss_train(y_preds, labels)
        # record loss
        losses.update(metric.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        if CFG.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else: 
            loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # SAM step 1: move the weights to the perturbed point and clear grads.
            optimizer.first_step(zero_grad=True)
            # SAM step 2: gradient at the perturbed point, then the real update.
            # This must stay paired with first_step: calling second_step on its own
            # (as happened on non-boundary steps when accumulating) has no saved
            # weights to restore and wipes the accumulated gradients.
            loss_train(model(images), labels).backward()
            optimizer.second_step(zero_grad=True)
            global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            # Fields are space-separated, matching the progress line of valid_fn.
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, len(train_loader), batch_time=batch_time, 
                          data_time=data_time, loss=losses, 
                          remain=timeSince(start, float(step+1)/len(train_loader)), 
                          grad_norm=grad_norm))
    return losses.avg

def valid_fn(valid_loader, model, loss_metric, device):
    """Evaluate `model` on the validation loader.

    Args:
        valid_loader: Iterable of (images, labels) batches.
        model: Network to evaluate (switched to eval mode here).
        loss_metric: Loss recorded for reporting.
        device: Device the batches are moved to.

    Returns:
        (avg_loss, predictions): the sample-weighted average loss and a NumPy array
        of softmax probabilities for all batches, concatenated in loader order.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss; nothing is back-propagated during validation
        with torch.no_grad():
            y_preds = model(images)
            loss = loss_metric(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # record class probabilities
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))

    predictions = np.concatenate(preds)
    return losses.avg, predictions

def inference(model, states, test_loader, device):
    """Predict class probabilities for the test set, averaged over several checkpoints.

    Args:
        model: Network whose weights are overwritten with each checkpoint in turn.
        states: Sequence of checkpoint dicts, each holding a state dict under 'model'.
        test_loader: Iterable of image batches (no labels).
        device: Device to run on.

    Returns:
        NumPy array with one row of averaged softmax probabilities per test sample.
    """
    model.to(device)
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    probs = []
    for i, (images) in tk0:
        images = images.to(device)
        # Per-checkpoint probabilities for this batch. The list was previously
        # created as `avgpreds`, so the append below raised NameError.
        avg_preds = []
        for state in states:
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            avg_preds.append(y_preds.softmax(1).to('cpu').numpy())
        # Average across checkpoints.
        avg_preds = np.mean(avg_preds, axis=0)
        probs.append(avg_preds)
    probs = np.concatenate(probs)
    return probs

## Train loop

In [23]:
# ======================================================
# Train loop
# ======================================================

def train_loop(folds, fold):
    """Train and validate the model on one cross-validation fold.

    Rows of `folds` whose 'fold' value equals `fold` form the validation set; all
    other rows form the training set. A checkpoint is written to OUTPUT_DIR after
    every epoch, plus a "best" checkpoint whenever validation accuracy improves.

    Args:
        folds: DataFrame with 'image_id', the target column and a 'fold' column.
        fold: Fold number to hold out for validation.

    Returns:
        The validation rows with one probability column per class ('0', '1', ...)
        and a 'preds' column, both taken from the best checkpoint.

    Raises:
        ValueError: If CFG.scheduler or CFG.loss_train names an unknown option.
    """

    seed_torch(seed=CFG.seed)

    LOGGER.info(f'========== fold: {fold} training ============')

    # ======================================================
    # loader
    # ======================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds, 
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, 
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset, 
                              batch_size=CFG.batch_size, 
                              shuffle=True, 
                              num_workers=CFG.num_workers, 
                              pin_memory=True, 
                              drop_last=False)
    valid_loader = DataLoader(valid_dataset, 
                              batch_size=CFG.batch_size, 
                              shuffle=False, 
                              num_workers=CFG.num_workers, 
                              pin_memory=True, 
                              drop_last=False)

    # ===============================================
    # scheduler
    # ===============================================
    def get_scheduler(optimizer):
        """Build the LR scheduler selected by CFG.scheduler for `optimizer`."""
        if CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        else:
            # Without this branch an unknown name ended in UnboundLocalError.
            raise ValueError(f'Unknown CFG.scheduler: {CFG.scheduler!r}')
        return scheduler

    # ===============================================
    # model & optimizer
    # ===============================================
    model = CustomEfficientNetB1ns(CFG.model_name, pretrained=True)

    # Freeze everything except the classifier head for the first epoch; all weights
    # are unfrozen at the start of the second epoch (see `epoch == 1` below).
    for name, param in model.model.named_parameters():
        if 'classifier' not in name:
            param.requires_grad=False

    model.to(device)

    base_optimizer = Adam
    optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_1, weight_decay=CFG.weight_decay, amsgrad=False)

    scheduler = get_scheduler(optimizer)

    # ===============================================
    # apex 
    # ===============================================
    if CFG.apex:
        # amp.initialize returns the patched (model, optimizer) pair; it was
        # previously assigned to the attribute `model.optimizer`, which left the
        # training objects un-initialized for amp.
        # NOTE(review): the optimizer rebuilt at epoch == 1 below is not passed
        # through amp.initialize — confirm before enabling CFG.apex.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ===============================================
    # loop
    # ===============================================
    def get_loss_train():
        """Build the training loss selected by CFG.loss_train."""
        if CFG.loss_train == 'CrossEntropyLoss':
            loss_train = nn.CrossEntropyLoss()
        elif CFG.loss_train == 'LabelSmoothing':
            loss_train = LabelSmoothingLoss(classes=CFG.target_size, smoothing=CFG.smooth)
        elif CFG.loss_train == 'FocalLoss':
            loss_train = FocalLoss().to(device)
        elif CFG.loss_train == 'FocalCosineLoss':
            loss_train = FocalCosineLoss()
        elif CFG.loss_train == 'SymmetricCrossEntropyLoss':
            loss_train = SymmetricCrossEntropy().to(device)
        elif CFG.loss_train == 'BiTemperedLoss':
            loss_train = BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smooth)
        else:
            # Without this branch an unknown name ended in UnboundLocalError.
            raise ValueError(f'Unknown CFG.loss_train: {CFG.loss_train!r}')
        return loss_train

    loss_train = get_loss_train()
    LOGGER.info(f'loss_train: {loss_train}')
    # Plain cross-entropy is what gets reported, whatever loss is optimized.
    loss_metric = nn.CrossEntropyLoss()

    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        if epoch == 1:

            # From the second epoch on, unfreeze all weights.
            for param in model.model.parameters():
                param.requires_grad = True

            # Rebuild optimizer and scheduler with the lower fine-tuning learning
            # rate (CFG.lr_1 -> CFG.lr_2).
            base_optimizer = Adam
            optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_2, weight_decay=CFG.weight_decay, amsgrad=False)
            scheduler = get_scheduler(optimizer)

            LOGGER.info('requires_grad of all parameters are unlocked')


        # train
        avg_loss = train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, loss_metric, device)
        valid_labels = valid_folds[CFG.target_col].values

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(), 
                        'preds': preds}, 
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')

        # Also keep a checkpoint of every epoch so any of them can be used for inference.
        torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    # One probability column per class; follows CFG.target_size instead of a fixed 5.
    valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)

    return valid_folds

In [24]:
# ====================================================
# main
# ====================================================
def main():
    """
    Run the training and/or inference stage selected by the CFG flags.

    Training (``CFG.train``): fits the first fold listed in ``CFG.trn_fold``
    with ``train_loop``, logs the fold score and the overall score, and writes
    the out-of-fold predictions to ``oof_df.csv`` under ``OUTPUT_DIR``.

    Inference (``CFG.inference``): loads the ``*_best.pth`` checkpoints that
    exist for the folds in ``CFG.trn_fold``, predicts on ``test`` and writes
    ``submission.csv`` under ``OUTPUT_DIR``.

    Reads the module-level ``folds``, ``test``, ``device`` and ``LOGGER``.

    Raises:
        FileNotFoundError: inference was requested but no best checkpoint
            exists for any fold in ``CFG.trn_fold``.
    """

    def get_result(result_df):
        """Log and return get_score() of the 'preds' column vs. the target column."""
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')
        return score

    if CFG.train:
        # Collect per-fold frames and concatenate once, instead of growing a
        # DataFrame from an empty one inside the loop.
        fold_results = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(folds, fold)
            fold_results.append(_oof_df)
            LOGGER.info(f'=============== fold: {fold} result ================')
            get_result(_oof_df)

            # Only a single fold is used.
            break

        if fold_results:
            oof_df = pd.concat(fold_results)
            # CV result
            LOGGER.info(f'============ CV ============')
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        else:
            # Nothing was trained, so there is no 'preds' column to score or save.
            LOGGER.info('No fold in CFG.trn_fold was trained; skipping CV result')

    if CFG.inference:
        # Local import so this branch does not depend on a top-of-notebook import.
        from torch.utils.data import DataLoader

        model = CustomEfficientNetB1ns(CFG.model_name, pretrained=False)

        # Training above stops after one fold, so only load the best
        # checkpoints that were actually written.
        fold_paths = [(fold, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
                      for fold in CFG.trn_fold]
        missing_folds = [fold for fold, path in fold_paths if not os.path.exists(path)]
        available_paths = [path for _, path in fold_paths if os.path.exists(path)]
        if not available_paths:
            raise FileNotFoundError(
                f'No best checkpoint found in {OUTPUT_DIR} for folds {list(CFG.trn_fold)}')
        if missing_folds:
            LOGGER.info(f'No best checkpoint for folds {missing_folds}; they are skipped')
        states = [torch.load(path) for path in available_paths]

        # The loader options belong to DataLoader, not to the dataset; the
        # original passed them to TestDataset and never defined test_loader.
        # NOTE(review): assumes TestDataset accepts `transform` and that
        # get_transforms(data='valid') is the validation pipeline defined
        # earlier in this notebook -- confirm.
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                                 num_workers=CFG.num_workers, pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        test['label'] = predictions.argmax(1)
        test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

In [25]:
# Record in the log which device (e.g. CPU or CUDA) this run is using.
LOGGER.info(f'used device: {device}')

used device: cuda


In [26]:
# Entry point: run the pipeline only when this code is executed as the main module.
if __name__ == '__main__':
    main()

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_b1_ns-99dd0c41.pth
loss_train: BiTemperedLogisticLoss()


Epoch: [1][0/659]Data 4.016 (4.016)Elapsed 0m 5s (remain 62m 49s)Loss: 1.6382(1.6382)Grad: 0.6002  
Epoch: [1][20/659]Data 1.207 (0.350)Elapsed 0m 16s (remain 8m 24s)Loss: 2.1709(1.4684)Grad: 0.3376  
Epoch: [1][40/659]Data 0.669 (0.305)Elapsed 0m 28s (remain 7m 10s)Loss: 2.3188(1.5109)Grad: 0.3982  
Epoch: [1][60/659]Data 0.528 (0.293)Elapsed 0m 40s (remain 6m 38s)Loss: 1.5006(1.4751)Grad: 0.3425  
Epoch: [1][80/659]Data 0.211 (0.281)Elapsed 0m 52s (remain 6m 12s)Loss: 1.6148(1.4272)Grad: 0.2912  
Epoch: [1][100/659]Data 0.787 (0.281)Elapsed 1m 4s (remain 5m 54s)Loss: 1.0220(1.3914)Grad: 0.3275  
Epoch: [1][120/659]Data 0.534 (0.275)Elapsed 1m 16s (remain 5m 42s)Loss: 1.1155(1.3656)Grad: 0.2900  
Epoch: [1][140/659]Data 0.402 (0.276)Elapsed 1m 29s (remain 5m 27s)Loss: 1.1185(1.3246)Grad: 0.4386  
Epoch: [1][160/659]Data 0.258 (0.278)Elapsed 1m 41s (remain 5m 15s)Loss: 0.5616(1.2860)Grad: 0.3330  
Epoch: [1][180/659]Data 0.000 (0.277)Elapsed 1m 53s (remain 4m 59s)Loss: 1.3989(1.2650)Gr

Epoch 1 - avg_train_loss: 1.0378 avg_val_loss: 0.8284 time: 482s
Epoch 1 - Accuracy: 0.7262718299164769
Epoch 1 - Save Best Score: 0.7263 Model


EVAL: [164/165] Data 0.061 (0.322) Elapsed 1m 15s (remain 0m 0s) Loss: 0.7653(0.8284) 


requires_grad of all parameters are unlocked


Epoch: [2][0/659]Data 2.507 (2.507)Elapsed 0m 4s (remain 47m 34s)Loss: 0.7535(0.7535)Grad: 2.3011  
Epoch: [2][20/659]Data 0.000 (0.120)Elapsed 0m 27s (remain 13m 52s)Loss: 0.6595(0.8724)Grad: 2.0229  
Epoch: [2][40/659]Data 0.000 (0.061)Elapsed 0m 49s (remain 12m 25s)Loss: 1.1207(0.8566)Grad: 2.0475  
Epoch: [2][60/659]Data 0.000 (0.041)Elapsed 1m 11s (remain 11m 43s)Loss: 0.4173(0.8417)Grad: 1.4104  
Epoch: [2][80/659]Data 0.000 (0.031)Elapsed 1m 33s (remain 11m 10s)Loss: 0.7092(0.7969)Grad: 1.4891  
Epoch: [2][100/659]Data 0.000 (0.025)Elapsed 1m 56s (remain 10m 41s)Loss: 0.8247(0.7935)Grad: 1.7434  
Epoch: [2][120/659]Data 0.000 (0.021)Elapsed 2m 18s (remain 10m 16s)Loss: 1.0480(0.7762)Grad: 1.6517  
Epoch: [2][140/659]Data 0.000 (0.018)Elapsed 2m 40s (remain 9m 49s)Loss: 0.5279(0.7628)Grad: 1.0922  
Epoch: [2][160/659]Data 0.000 (0.016)Elapsed 3m 2s (remain 9m 24s)Loss: 0.6982(0.7636)Grad: 1.5371  
Epoch: [2][180/659]Data 0.000 (0.014)Elapsed 3m 24s (remain 9m 0s)Loss: 0.8651(0.76

Epoch 2 - avg_train_loss: 0.6906 avg_val_loss: 0.5789 time: 806s
Epoch 2 - Accuracy: 0.8545937737281701
Epoch 2 - Save Best Score: 0.8546 Model


EVAL: [164/165] Data 0.000 (0.297) Elapsed 1m 11s (remain 0m 0s) Loss: 0.4954(0.5789) 
Epoch: [3][0/659]Data 2.835 (2.835)Elapsed 0m 4s (remain 45m 49s)Loss: 0.3561(0.3561)Grad: 0.9115  
Epoch: [3][20/659]Data 0.000 (0.136)Elapsed 0m 26s (remain 13m 35s)Loss: 0.5125(0.6129)Grad: 1.4499  
Epoch: [3][40/659]Data 0.000 (0.070)Elapsed 0m 49s (remain 12m 21s)Loss: 0.6585(0.5813)Grad: 1.2559  
Epoch: [3][60/659]Data 0.000 (0.047)Elapsed 1m 11s (remain 11m 39s)Loss: 1.0232(0.5755)Grad: 1.2293  
Epoch: [3][80/659]Data 0.000 (0.035)Elapsed 1m 33s (remain 11m 5s)Loss: 0.5117(0.5861)Grad: 0.8472  
Epoch: [3][100/659]Data 0.000 (0.028)Elapsed 1m 55s (remain 10m 39s)Loss: 0.6597(0.5980)Grad: 0.9137  
Epoch: [3][120/659]Data 0.000 (0.024)Elapsed 2m 17s (remain 10m 12s)Loss: 0.3697(0.6174)Grad: 1.0194  
Epoch: [3][140/659]Data 0.000 (0.020)Elapsed 2m 39s (remain 9m 46s)Loss: 0.4175(0.6146)Grad: 0.8589  
Epoch: [3][160/659]Data 0.000 (0.018)Elapsed 3m 2s (remain 9m 24s)Loss: 1.1849(0.6280)Grad: 0.8857

Epoch 3 - avg_train_loss: 0.6059 avg_val_loss: 0.5748 time: 804s
Epoch 3 - Accuracy: 0.8709187547456341
Epoch 3 - Save Best Score: 0.8709 Model


EVAL: [164/165] Data 0.000 (0.288) Elapsed 1m 10s (remain 0m 0s) Loss: 0.2510(0.5748) 
Epoch: [4][0/659]Data 2.346 (2.346)Elapsed 0m 3s (remain 40m 54s)Loss: 1.4129(1.4129)Grad: 1.5668  
Epoch: [4][20/659]Data 0.000 (0.112)Elapsed 0m 26s (remain 13m 34s)Loss: 0.6068(0.6048)Grad: 1.3489  
Epoch: [4][40/659]Data 0.000 (0.057)Elapsed 0m 48s (remain 12m 18s)Loss: 0.6865(0.5668)Grad: 1.4281  
Epoch: [4][60/659]Data 0.000 (0.039)Elapsed 1m 10s (remain 11m 35s)Loss: 0.3624(0.5952)Grad: 0.9704  
Epoch: [4][80/659]Data 0.000 (0.029)Elapsed 1m 33s (remain 11m 6s)Loss: 0.5973(0.6174)Grad: 1.2833  
Epoch: [4][100/659]Data 0.000 (0.023)Elapsed 1m 55s (remain 10m 37s)Loss: 0.3537(0.6285)Grad: 1.1863  
Epoch: [4][120/659]Data 0.000 (0.020)Elapsed 2m 17s (remain 10m 10s)Loss: 0.2978(0.6155)Grad: 1.0259  
Epoch: [4][140/659]Data 0.000 (0.017)Elapsed 2m 39s (remain 9m 46s)Loss: 1.7014(0.6025)Grad: 1.2908  
Epoch: [4][160/659]Data 0.000 (0.015)Elapsed 3m 1s (remain 9m 22s)Loss: 1.0502(0.6010)Grad: 1.2859

Epoch 4 - avg_train_loss: 0.5734 avg_val_loss: 0.5518 time: 798s
Epoch 4 - Accuracy: 0.8758542141230068
Epoch 4 - Save Best Score: 0.8759 Model


EVAL: [164/165] Data 0.024 (0.274) Elapsed 1m 8s (remain 0m 0s) Loss: 0.2388(0.5518) 
Epoch: [5][0/659]Data 2.619 (2.619)Elapsed 0m 4s (remain 44m 25s)Loss: 0.3718(0.3718)Grad: 0.8947  
Epoch: [5][20/659]Data 0.000 (0.125)Elapsed 0m 26s (remain 13m 38s)Loss: 0.6858(0.6066)Grad: 0.8213  
Epoch: [5][40/659]Data 0.000 (0.064)Elapsed 0m 48s (remain 12m 13s)Loss: 0.6288(0.6439)Grad: 1.5623  
Epoch: [5][60/659]Data 0.000 (0.043)Elapsed 1m 10s (remain 11m 35s)Loss: 0.4858(0.5833)Grad: 1.0480  
Epoch: [5][80/659]Data 0.000 (0.033)Elapsed 1m 33s (remain 11m 3s)Loss: 0.5762(0.5656)Grad: 1.0986  
Epoch: [5][100/659]Data 0.000 (0.026)Elapsed 1m 54s (remain 10m 34s)Loss: 0.1494(0.5607)Grad: 1.2242  
Epoch: [5][120/659]Data 0.000 (0.022)Elapsed 2m 17s (remain 10m 9s)Loss: 0.7576(0.5593)Grad: 1.2996  
Epoch: [5][140/659]Data 0.000 (0.019)Elapsed 2m 38s (remain 9m 43s)Loss: 0.7887(0.5667)Grad: 0.9892  
Epoch: [5][160/659]Data 0.000 (0.017)Elapsed 3m 0s (remain 9m 18s)Loss: 0.1469(0.5520)Grad: 0.8450  

Epoch 5 - avg_train_loss: 0.5578 avg_val_loss: 0.5532 time: 794s
Epoch 5 - Accuracy: 0.8805998481397115
Epoch 5 - Save Best Score: 0.8806 Model


EVAL: [164/165] Data 0.000 (0.267) Elapsed 1m 6s (remain 0m 0s) Loss: 0.1672(0.5532) 
Epoch: [6][0/659]Data 2.425 (2.425)Elapsed 0m 3s (remain 41m 43s)Loss: 0.5635(0.5635)Grad: 1.2068  
Epoch: [6][20/659]Data 0.000 (0.116)Elapsed 0m 26s (remain 13m 11s)Loss: 0.8944(0.5291)Grad: 1.4249  
Epoch: [6][40/659]Data 0.000 (0.059)Elapsed 0m 47s (remain 12m 3s)Loss: 0.0408(0.5192)Grad: 0.6706  
Epoch: [6][60/659]Data 0.000 (0.040)Elapsed 1m 9s (remain 11m 25s)Loss: 0.3514(0.4813)Grad: 1.2286  
Epoch: [6][80/659]Data 0.000 (0.030)Elapsed 1m 31s (remain 10m 55s)Loss: 0.9154(0.4859)Grad: 0.7720  
Epoch: [6][100/659]Data 0.004 (0.024)Elapsed 1m 53s (remain 10m 29s)Loss: 0.3919(0.5149)Grad: 0.6549  
Epoch: [6][120/659]Data 0.000 (0.020)Elapsed 2m 15s (remain 10m 3s)Loss: 0.9740(0.5060)Grad: 1.4691  
Epoch: [6][140/659]Data 0.000 (0.017)Elapsed 2m 37s (remain 9m 39s)Loss: 0.2910(0.5332)Grad: 0.9127  
Epoch: [6][160/659]Data 0.000 (0.015)Elapsed 2m 59s (remain 9m 16s)Loss: 0.6902(0.5297)Grad: 1.2828  

Epoch 6 - avg_train_loss: 0.5160 avg_val_loss: 0.5673 time: 792s
Epoch 6 - Accuracy: 0.8849658314350797
Epoch 6 - Save Best Score: 0.8850 Model


EVAL: [164/165] Data 0.000 (0.271) Elapsed 1m 7s (remain 0m 0s) Loss: 0.1886(0.5673) 
Epoch: [7][0/659]Data 2.253 (2.253)Elapsed 0m 3s (remain 42m 16s)Loss: 0.2284(0.2284)Grad: 0.7383  
Epoch: [7][20/659]Data 0.000 (0.108)Elapsed 0m 26s (remain 13m 20s)Loss: 0.1942(0.4591)Grad: 1.2292  
Epoch: [7][40/659]Data 0.000 (0.055)Elapsed 0m 48s (remain 12m 13s)Loss: 0.4549(0.4639)Grad: 0.9169  
Epoch: [7][60/659]Data 0.000 (0.037)Elapsed 1m 10s (remain 11m 29s)Loss: 0.2392(0.4885)Grad: 0.9067  
Epoch: [7][80/659]Data 0.000 (0.028)Elapsed 1m 32s (remain 10m 57s)Loss: 1.0139(0.5096)Grad: 0.7241  
Epoch: [7][100/659]Data 0.000 (0.023)Elapsed 1m 54s (remain 10m 30s)Loss: 0.5502(0.4925)Grad: 0.7870  
Epoch: [7][120/659]Data 0.000 (0.019)Elapsed 2m 15s (remain 10m 3s)Loss: 0.5496(0.4936)Grad: 1.0142  
Epoch: [7][140/659]Data 0.000 (0.016)Elapsed 2m 37s (remain 9m 39s)Loss: 0.2390(0.4999)Grad: 1.2353  
Epoch: [7][160/659]Data 0.000 (0.014)Elapsed 2m 59s (remain 9m 15s)Loss: 0.0641(0.4949)Grad: 0.8834

Epoch 7 - avg_train_loss: 0.5052 avg_val_loss: 0.5393 time: 794s
Epoch 7 - Accuracy: 0.8861047835990888
Epoch 7 - Save Best Score: 0.8861 Model


EVAL: [164/165] Data 0.000 (0.275) Elapsed 1m 7s (remain 0m 0s) Loss: 0.0944(0.5393) 
Epoch: [8][0/659]Data 1.940 (1.940)Elapsed 0m 3s (remain 35m 56s)Loss: 0.4194(0.4194)Grad: 1.0749  
Epoch: [8][20/659]Data 0.000 (0.093)Elapsed 0m 25s (remain 12m 57s)Loss: 1.1758(0.5018)Grad: 1.2977  
Epoch: [8][40/659]Data 0.000 (0.048)Elapsed 0m 47s (remain 11m 57s)Loss: 0.2190(0.4882)Grad: 0.6022  
Epoch: [8][60/659]Data 0.000 (0.032)Elapsed 1m 9s (remain 11m 20s)Loss: 0.9705(0.5336)Grad: 1.4498  
Epoch: [8][80/659]Data 0.000 (0.024)Elapsed 1m 31s (remain 10m 52s)Loss: 0.1467(0.5196)Grad: 1.0508  
Epoch: [8][100/659]Data 0.000 (0.020)Elapsed 1m 53s (remain 10m 26s)Loss: 0.9369(0.5009)Grad: 1.1954  
Epoch: [8][120/659]Data 0.000 (0.016)Elapsed 2m 15s (remain 10m 0s)Loss: 0.4067(0.4939)Grad: 0.9373  
Epoch: [8][140/659]Data 0.000 (0.014)Elapsed 2m 37s (remain 9m 37s)Loss: 0.1777(0.4985)Grad: 0.6933  
Epoch: [8][160/659]Data 0.000 (0.012)Elapsed 2m 58s (remain 9m 13s)Loss: 0.5715(0.5044)Grad: 1.0837 

Epoch 8 - avg_train_loss: 0.4979 avg_val_loss: 0.5297 time: 792s
Epoch 8 - Accuracy: 0.8891419893697798
Epoch 8 - Save Best Score: 0.8891 Model


EVAL: [164/165] Data 0.123 (0.271) Elapsed 1m 7s (remain 0m 0s) Loss: 0.0625(0.5297) 
Epoch: [9][0/659]Data 2.041 (2.041)Elapsed 0m 3s (remain 37m 17s)Loss: 0.4323(0.4323)Grad: 0.7317  
Epoch: [9][20/659]Data 0.000 (0.098)Elapsed 0m 25s (remain 13m 7s)Loss: 0.7279(0.5852)Grad: 0.9814  
Epoch: [9][40/659]Data 0.000 (0.050)Elapsed 0m 47s (remain 12m 0s)Loss: 0.2186(0.4839)Grad: 0.9769  
Epoch: [9][60/659]Data 0.000 (0.034)Elapsed 1m 9s (remain 11m 24s)Loss: 0.2234(0.4916)Grad: 1.0474  
Epoch: [9][80/659]Data 0.000 (0.026)Elapsed 1m 31s (remain 10m 54s)Loss: 0.2652(0.4736)Grad: 1.1539  
Epoch: [9][100/659]Data 0.000 (0.021)Elapsed 1m 53s (remain 10m 27s)Loss: 0.4189(0.4732)Grad: 2.0119  
Epoch: [9][120/659]Data 0.000 (0.017)Elapsed 2m 15s (remain 10m 2s)Loss: 0.2625(0.4889)Grad: 1.0399  
Epoch: [9][140/659]Data 0.000 (0.015)Elapsed 2m 37s (remain 9m 38s)Loss: 0.0862(0.4787)Grad: 0.9901  
Epoch: [9][160/659]Data 0.000 (0.013)Elapsed 2m 59s (remain 9m 14s)Loss: 0.7727(0.4648)Grad: 1.3458  


Epoch 9 - avg_train_loss: 0.4774 avg_val_loss: 0.5254 time: 794s
Epoch 9 - Accuracy: 0.8904707668944571
Epoch 9 - Save Best Score: 0.8905 Model


EVAL: [164/165] Data 0.000 (0.278) Elapsed 1m 8s (remain 0m 0s) Loss: 0.0877(0.5254) 
Epoch: [10][0/659]Data 2.152 (2.152)Elapsed 0m 3s (remain 39m 23s)Loss: 0.2951(0.2951)Grad: 1.0198  
Epoch: [10][20/659]Data 0.000 (0.103)Elapsed 0m 25s (remain 13m 2s)Loss: 0.1455(0.4915)Grad: 0.6452  
Epoch: [10][40/659]Data 0.000 (0.053)Elapsed 0m 47s (remain 11m 55s)Loss: 0.9035(0.4601)Grad: 0.8902  
Epoch: [10][60/659]Data 0.000 (0.036)Elapsed 1m 9s (remain 11m 22s)Loss: 0.7273(0.4789)Grad: 0.8535  
Epoch: [10][80/659]Data 0.000 (0.027)Elapsed 1m 31s (remain 10m 52s)Loss: 0.3485(0.4801)Grad: 1.5740  
Epoch: [10][100/659]Data 0.000 (0.022)Elapsed 1m 53s (remain 10m 25s)Loss: 0.1991(0.4638)Grad: 0.6597  
Epoch: [10][120/659]Data 0.000 (0.018)Elapsed 2m 15s (remain 10m 2s)Loss: 0.0454(0.4701)Grad: 0.5666  
Epoch: [10][140/659]Data 0.000 (0.016)Elapsed 2m 37s (remain 9m 37s)Loss: 0.3288(0.4780)Grad: 0.8848  
Epoch: [10][160/659]Data 0.000 (0.014)Elapsed 2m 59s (remain 9m 15s)Loss: 0.2275(0.4844)Grad:

Epoch 10 - avg_train_loss: 0.4565 avg_val_loss: 0.5380 time: 795s
Epoch 10 - Accuracy: 0.8906605922551253
Epoch 10 - Save Best Score: 0.8907 Model


EVAL: [164/165] Data 0.000 (0.281) Elapsed 1m 8s (remain 0m 0s) Loss: 0.0761(0.5380) 


Score: 0.89066
Score: 0.89066
