In [1]:
# !pip install pytorch_ranger

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [3]:
os.listdir('../input/cassava-leaf-disease-classification/')

['train_tfrecords',
 'sample_submission.csv',
 'test_tfrecords',
 'label_num_to_disease_map.json',
 'train_images',
 'train.csv',
 'test_images']

In [4]:
# Training labels are read from the "merged" dataset CSV; the preview below shows an
# extra `source` column next to `image_id` and `label`.
train = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
# The sample submission doubles as the list of test image ids.
test = pd.read_csv('../input/cassava-leaf-disease-classification//sample_submission.csv')
# Maps the integer label (index) to the disease name.
label_map = pd.read_json('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json', orient='index')

# Quick visual sanity check of the three frames.
display(train.head())
display(test.head())
display(label_map)

Unnamed: 0,image_id,label,source
0,1000015157.jpg,0,2020
1,1000201771.jpg,3,2020
2,100042118.jpg,1,2020
3,1000723321.jpg,1,2020
4,1000812911.jpg,3,2020


Unnamed: 0,image_id,label
0,2216849948.jpg,4


Unnamed: 0,0
0,Cassava Bacterial Blight (CBB)
1,Cassava Brown Streak Disease (CBSD)
2,Cassava Green Mottle (CGM)
3,Cassava Mosaic Disease (CMD)
4,Healthy


## Directory settings

In [5]:
# Logs, checkpoints and the OOF CSV are written here. The value is used as a plain
# string prefix (OUTPUT_DIR + 'file'), so it must keep its trailing slash.
OUTPUT_DIR = './'
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Image folders read by TrainDataset / TestDataset.
TRAIN_PATH = '../input/cassava-leaf-disease-merged/train'
TEST_PATH = '../input/cassava-leaf-disease-classification/test_images'

## CFG

In [6]:
class CFG:
    """Experiment configuration, read as class attributes throughout the notebook."""
    # When True, the statements right after this class cut epochs to 3 and
    # subsample `train` to 1000 rows.
    debug = False
    # Gates the `apex.amp` import and the loss scaling in the training code.
    apex = False
    # Progress is printed every `print_freq` steps in train_fn / valid_fn.
    print_freq = 100
    # Worker processes for both DataLoaders.
    num_workers = 4
    # Model name handed to timm.create_model.
    model_name = 'tf_efficientnet_b4_ns'
    # Square side length used by RandomResizedCrop / Resize.
    size = 500
    # Name looked up by get_scheduler inside train_loop.
    scheduler = 'CosineAnnealingWarmRestarts'
    # Name looked up by get_loss_train inside train_loop.
    loss_train = 'BiTemperedLoss'
    epochs = 10
    # T_0 argument of CosineAnnealingWarmRestarts.
    T_0 = 10
    # Learning rate for the first epoch, while only the classifier head is trainable.
    lr_1 = 2.5e-4
    # Learning rate used once every parameter is unfrozen (from the second epoch on).
    lr_2 = 2.5e-5
    # Temperatures passed to BiTemperedLogisticLoss.
    t1 = 0.9
    t2 = 1.5
    # Label-smoothing amount for the LabelSmoothing and BiTempered losses.
    smooth = 1e-2
    # eta_min of the cosine schedulers.
    min_lr = 1e-6
    batch_size = 8
    # Forwarded to the SAM optimizer constructor.
    weight_decay = 1e-6
    gradient_accumulation_steps = 1
    # max_norm for clip_grad_norm_ in train_fn.
    max_grad_norm = 1000
    seed = 42
    # Number of output units of the classifier head.
    target_size = 5
    target_col = 'label'
    # Number of stratified folds, and which of them main() actually trains.
    n_fold = 5
    trn_fold = [4]
    # Switches for the two branches of main().
    train = True
    inference = False
    
if CFG.debug:
    CFG.epochs = 3
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

## Library

In [7]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

sys.path.append('../input/bi-tempered-loss-pytorch')
from bi_tempered_loss import *

# sys.path.append('../input/pytorch-optimizer')
# import torch_optimizer as optim

sys.path.append('../input/pytorch-sam')
from sam import SAM

from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
warnings.filterwarnings('ignore')

if CFG.apex:
    from apex import amp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [8]:
def get_score(y_true, y_pred):
    """Return the classification accuracy of `y_pred` against `y_true`."""
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

@contextmanager
def timer(name):
    """Context manager that logs when the wrapped block starts and how long it took.

    Args:
        name: Label printed in both log lines.

    The closing line is only logged when the block finishes without raising.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f}')
    
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create the notebook logger that writes plain messages to stderr and to a file.

    Args:
        log_file: Path of the log file.

    Returns:
        The configured `logging.Logger` named after this module.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell would
    # otherwise stack another pair of handlers and print every message twice.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = init_logger()

def seed_torch(seed=42):
    """Seed every random number generator the notebook relies on.

    Covers Python's `random`, the hash seed environment variable, NumPy and
    PyTorch (CPU and CUDA), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    # Disable the autotuner and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_torch(seed=CFG.seed)

## CV split

In [9]:
# Work on a copy so the raw `train` frame stays untouched.
folds = train.copy()
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# split() yields positional indices; they are used with .loc below, which assumes
# `folds` carries a default 0..n-1 index -- TODO confirm if `train` is ever filtered.
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    # Tag every row with the fold in which it is held out for validation.
    folds.loc[val_index, 'fold'] = int(n)
# The column is cast to int once every row has been assigned a fold.
folds['fold'] = folds['fold'].astype(int)
# Class counts per fold, to eyeball that stratification worked.
print(folds.groupby(['fold', CFG.target_col]).size())

fold  label
0     0         299
      1         695
      2         604
      3        3092
      4         578
1     0         299
      1         695
      2         604
      3        3092
      4         578
2     0         298
      1         695
      2         603
      3        3093
      4         578
3     0         298
      1         695
      2         603
      3        3093
      4         578
4     0         298
      1         696
      2         603
      3        3092
      4         578
dtype: int64


## Dataset

In [10]:
class TrainDataset(Dataset):
    """Labelled image dataset backed by a DataFrame and the TRAIN_PATH folder.

    Args:
        df: Frame with an `image_id` column (file names) and a `label` column.
        transform: Optional callable invoked as `transform(image=...)`; its
            `'image'` entry replaces the raw image.

    Each item is `(image, label)` where `label` is a long tensor.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.labels = df['label'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals a missing or unreadable file by returning None; fail here
        # with the offending path instead of an opaque error inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads BGR; the rest of the pipeline works in RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).long()
        return image, label
    
class TestDataset(Dataset):
    """Unlabelled image dataset backed by a DataFrame and the TEST_PATH folder.

    Args:
        df: Frame with an `image_id` column (file names).
        transform: Optional callable invoked as `transform(image=...)`; its
            `'image'` entry replaces the raw image.

    Each item is the (optionally transformed) image only.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals a missing or unreadable file by returning None; fail here
        # with the offending path instead of an opaque error inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads BGR; the rest of the pipeline works in RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

In [11]:
# train_dataset = TrainDataset(train, transform=None)

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image)
#     plt.title(f'label: {label}')
#     plt.show()

## Transforms

In [12]:
def get_transforms(*, data):
    """Build the albumentations pipeline for the requested split.

    Args:
        data: `'train'` for the augmenting pipeline, `'valid'` for the
            resize-only pipeline.

    Returns:
        A `Compose` for the two known values; `None` for anything else.
    """

    def _normalize_and_tensorize():
        # Shared tail of both pipelines: channel normalisation, then tensor conversion.
        return [
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ]

    if data == 'train':
        augmentations = [
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5),
        ]
        return Compose(augmentations + _normalize_and_tensorize())

    if data == 'valid':
        return Compose([Resize(CFG.size, CFG.size)] + _normalize_and_tensorize())

    return None

In [13]:
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image[0])
#     plt.title(f'label: {label}')
#     plt.show()

## MODEL

In [14]:
class CustomEfficientNetB4ns(nn.Module):
    """timm backbone whose classifier is replaced by a `CFG.target_size`-way linear head.

    Args:
        model_name: Name passed to `timm.create_model`.
        pretrained: Forwarded to `timm.create_model`.
    """
    def __init__(self, model_name='tf_efficientnet_b4_ns', pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Swap the stock head for one with the right number of outputs.
        head_inputs = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_inputs, CFG.target_size)
        self.model = backbone

    def forward(self, x):
        """Return the backbone's output for the batch `x`."""
        return self.model(x)

In [15]:
# model = CustomEfficientNetB4ns(model_name=CFG.model_name, pretrained=False)
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, 
#                           num_workers=4, pin_memory=True, drop_last=True)

# for image, label in train_loader:
#     print(image.size())
#     output = model(image)
#     print(output)
#     break

## Loss Functions

In [16]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a smoothed target distribution.

    The true class receives `1 - smoothing`; the remaining mass is split evenly
    over the other `classes - 1` classes.

    Args:
        classes: Number of classes.
        smoothing: Probability mass moved away from the true class.
        dim: Dimension over which log-softmax and the class sum are taken.
    """
    def __init__(self, classes=5, smoothing=0.0, dim=-1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the batch-mean smoothed cross-entropy of logits `pred` vs. indices `target`."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            smoothed = torch.full_like(log_probs, off_value)
            smoothed.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-smoothed * log_probs, dim=self.dim)
        return per_sample.mean()

In [17]:
class FocalLoss(nn.Module):
    """Focal loss on top of softmax cross-entropy.

    Each sample's cross-entropy `ce` is scaled by `alpha * (1 - exp(-ce)) ** gamma`.

    Args:
        alpha: Constant multiplier of the loss.
        gamma: Focusing exponent.
        reduce: If True return the batch mean, otherwise the per-sample losses.
    """
    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Return the focal loss of logits `inputs` against class indices `targets`."""
        # reduction='none' is essential: the focal weight must be computed from each
        # sample's own loss, not from the batch average.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-ce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduce:
            return torch.mean(focal)
        return focal

In [18]:
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss plus a weighted focal cross-entropy term.

    Args:
        alpha: Constant multiplier of the focal term.
        gamma: Focusing exponent of the focal term.
        xent: Weight of the focal term in the final sum.
    """
    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

    def forward(self, input, target, reduction="mean"):
        """Return `cosine_loss + xent * focal_loss` for logits `input` and indices `target`.

        Args:
            input: Logits with the class dimension last.
            target: Class indices.
            reduction: `'mean'`, `'sum'` or `'none'`, applied to both terms.
        """
        # Build the helper tensors on the input's device/dtype so the loss works on
        # CPU as well as GPU (no hard-coded .cuda()).
        onehot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        positives = torch.ones(input.size(0), dtype=input.dtype, device=input.device)
        cosine_loss = F.cosine_embedding_loss(input, onehot, positives, reduction=reduction)

        # Per-sample cross-entropy on L2-normalised logits feeds the focal weighting.
        cent_loss = F.cross_entropy(F.normalize(input), target, reduction='none')
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)
        elif reduction == "sum":
            focal_loss = torch.sum(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [19]:
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum of cross-entropy and a second, clamped log-probability term.

    Args:
        alpha: Weight of the standard cross-entropy term.
        beta: Weight of the second term.
        num_classes: Number of classes used to one-hot encode the targets.
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=5):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Return `alpha * ce + beta * rce` for `logits` and class-index `targets`."""
        # One-hot on the logits' own device/dtype: no hard-coded .cuda() and no
        # CPU tensor indexed by GPU indices.
        onehot_targets = F.one_hot(targets, num_classes=self.num_classes).to(logits.dtype)
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        # NOTE(review): as written this term is -sum(onehot * log(softmax)), i.e. the
        # same quantity as cross-entropy (up to the clamp); a "reverse" CE would swap
        # the roles of prediction and label. Kept unchanged -- confirm intent.
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [20]:
def log_t(u, t):
    """Tempered logarithm of tensor `u` at temperature `t`.

    Falls back to the natural logarithm when `t` equals 1.
    """
    if t != 1.0:
        exponent = 1.0 - t
        return (u.pow(exponent) - 1.0) / exponent
    return u.log()

def exp_t(u, t):
    """Tempered exponential of tensor `u` at temperature `t`.

    Falls back to the ordinary exponential when `t` equals 1; otherwise the base
    is clipped at zero before being raised to `1 / (1 - t)`.
    """
    if t != 1:
        base = torch.relu(1.0 + (1.0-t)*u)
        return base.pow(1.0 / (1.0 - t))
    return u.exp()

def compute_normalization_fixed_point(activations, t, num_iters):

    """Returns the normalization value for each example (t > 1.0).

    Runs `num_iters` fixed-point updates on max-shifted activations and then
    converts the resulting partition value into a normalization constant.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same shape as activation with the last dimension being 1.
    """
    # Shift by the per-example maximum so the largest activation becomes 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations_step_0 = activations - mu

    normalized_activations = normalized_activations_step_0

    # Each pass rescales the *original* shifted activations by the current
    # partition value raised to (1 - t).
    for _ in range(num_iters):
        logt_partition = torch.sum(
                exp_t(normalized_activations, t), -1, keepdim=True)
        normalized_activations = normalized_activations_step_0 * \
                logt_partition.pow(1.0-t)

    # Final partition value, mapped through log_t and shifted back by the maximum.
    logt_partition = torch.sum(
            exp_t(normalized_activations, t), -1, keepdim=True)
    normalization_constants = - log_t(1.0 / logt_partition, t) + mu

    return normalization_constants

def compute_normalization_binary_search(activations, t, num_iters):

    """Returns the normalization value for each example (t < 1.0).

    Bisects, per example, for the shift at which the tempered exponentials of
    the max-shifted activations sum to 1.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    # Shift by the per-example maximum so the largest activation becomes 0.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    # Count, per example, the classes whose shifted activation lies above
    # -1 / (1 - t); the count is used to set the initial upper bound.
    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    # Search interval [lower, upper], one value per example.
    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        # update == 1 where the probabilities sum to less than 1 at the midpoint:
        # there the midpoint becomes the new upper bound, elsewhere the new lower bound.
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    # Midpoint of the final interval, shifted back by the maximum.
    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Class implementing custom backward pass for compute_normalization. See compute_normalization.

    The forward pass delegates to the binary-search or fixed-point routine
    depending on `t`; the backward pass is written by hand rather than
    differentiating through the iterations.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        # t < 1.0 uses bisection; every other value uses the fixed-point iteration.
        if t < 1.0:
            normalization_constants = compute_normalization_binary_search(activations, t, num_iters)
        else:
            normalization_constants = compute_normalization_fixed_point(activations, t, num_iters)

        # Stash what backward needs: both tensors plus the (non-tensor) temperature.
        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        t = ctx.t
        normalized_activations = activations - normalization_constants 
        probabilities = exp_t(normalized_activations, t)
        # Probabilities raised to the power t and renormalised over the class axis
        # weight the incoming gradient.
        escorts = probabilities.pow(t)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)
        grad_input = escorts * grad_output
        
        # No gradients for the non-tensor arguments `t` and `num_iters`.
        return grad_input, None, None

def compute_normalization(activations, t, num_iters=5):
    """Returns the normalization value for each example.

    Thin wrapper around `ComputeNormalization.apply`, so the custom backward
    pass is used.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    return ComputeNormalization.apply(activations, t, num_iters)

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    # Pair every activation with a zero "negative class" activation on a new last axis.
    zeros = torch.zeros_like(activations)
    two_class = torch.stack([activations, zeros], dim=-1)
    # Keep only the positive-class probability.
    return tempered_softmax(two_class, t, num_iters)[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature > 1.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    if t != 1.0:
        shift = compute_normalization(activations, t, num_iters)
        return exp_t(activations - shift, t)
    # t == 1 is the ordinary softmax.
    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss, expressed as the two-class multiclass loss.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
      reduction: Forwarded unchanged to `bi_tempered_logistic_loss`.
    Returns:
      A loss tensor.
    """
    positive = labels.to(activations.dtype)
    # Stack (class 1, class 0) on a new last axis for both logits and targets.
    two_class_activations = torch.stack(
        [activations, torch.zeros_like(activations)], dim=-1)
    two_class_labels = torch.stack([positive, 1.0 - positive], dim=-1)
    return bi_tempered_logistic_loss(
        two_class_activations,
        two_class_labels,
        t1,
        t2,
        label_smoothing=label_smoothing,
        num_iters=num_iters,
        reduction=reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over minibatch.
        ``'sum'``: Loss is summed over minibatch.
    Returns:
      A loss tensor.
    Raises:
      ValueError: If `reduction` is not one of the three supported values.
    """
    # Reject a misspelled reduction up front instead of silently returning None.
    if reduction not in ('none', 'mean', 'sum'):
        raise ValueError(
            f"reduction must be 'none', 'mean' or 'sum', got {reduction!r}")

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        # Scatter along the LAST axis so activations with more than two dimensions
        # work as documented; for 2-D input this is the same as dim=1.
        labels_onehot.scatter_(-1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        # The true class ends at 1 - label_smoothing and every other class at
        # label_smoothing / (num_classes - 1).
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The 1e-10 keeps log_t finite where a label entry is exactly zero.
    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    return loss_values.mean()

In [21]:
class BiTemperedLogisticLoss(nn.Module):
    """`nn.Module` wrapper around `bi_tempered_logistic_loss` returning the batch mean.

    Args:
        t1: Temperature 1, forwarded to the functional loss.
        t2: Temperature 2, forwarded to the functional loss.
        smoothing: Label-smoothing amount, forwarded as `label_smoothing`.
    """
    def __init__(self, t1, t2, smoothing=0.0):
        super().__init__()
        self.t1, self.t2 = t1, t2
        self.smoothing = smoothing

    def forward(self, logit_label, truth_label):
        """Return the mean bi-tempered loss of `logit_label` against `truth_label`."""
        per_sample = bi_tempered_logistic_loss(
            logit_label,
            truth_label,
            t1=self.t1,
            t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none',
        )
        return per_sample.mean()

In [22]:
class AverageMeter(object):
    """Tracks the latest value and the weighted running average of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the running sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Report the elapsed time and an estimate of the time remaining.

    Parameters
    ----------
    since : float
        Timestamp (as returned by `time.time()`) at which the work started.
    percent : float
        Fraction of the work completed so far.

    Returns
    -------
    str
        '<elapsed> (remain <remaining>)', both formatted by `asMinutes`.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from progress so far, then subtract what is spent.
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

def train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, shechduler, device):
    """Run one training epoch using the two-pass SAM update.

    Args:
        train_loader: Iterable yielding `(images, labels)` batches.
        model: Network being trained.
        loss_train: Loss that is back-propagated.
        loss_metric: Loss that is only recorded (its batch-weighted average is returned).
        optimizer: SAM-style optimizer exposing `first_step` / `second_step`.
        epoch: Zero-based epoch index, used for logging only.
        shechduler: Unused here; kept (including its spelling) for interface compatibility.
        device: Device the batches are moved to.

    Returns:
        Average `loss_metric` over the epoch.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accumulation = CFG.gradient_accumulation_steps

    def _backward(loss):
        # Both SAM passes go through the same (optionally apex-scaled) backward.
        if CFG.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

    # switch to train mode
    model.train()
    start = end = time.time()
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        y_preds = model(images)
        loss = loss_train(y_preds, labels)
        # The monitoring loss never needs a graph.
        with torch.no_grad():
            metric = loss_metric(y_preds, labels)
        losses.update(metric.item(), batch_size)

        if accumulation > 1:
            loss = loss / accumulation
        _backward(loss)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % accumulation == 0:
            # SAM pass 1: perturb the weights along the accumulated gradient.
            optimizer.first_step(zero_grad=True)
            # SAM pass 2: gradient at the perturbed weights, then the real update.
            # This must stay paired with first_step; running it on steps where
            # first_step was skipped would apply second_step without a prior perturbation.
            # NOTE(review): with accumulation > 1 the second pass only sees the current
            # micro-batch, and a trailing partial window carries its gradients over.
            _backward(loss_train(model(images), labels))
            optimizer.second_step(zero_grad=True)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses,
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          grad_norm=grad_norm))
    return losses.avg

def valid_fn(valid_loader, model, loss_metric, device):
    """Evaluate the model on the validation loader.

    Args:
        valid_loader: Iterable yielding `(images, labels)` batches.
        model: Network to evaluate (switched to eval mode here).
        loss_metric: Loss recorded per batch.
        device: Device the batches are moved to.

    Returns:
        `(average loss, predictions)` where predictions is the concatenation of
        the per-batch softmax outputs as a NumPy array.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # Forward pass and loss without building a graph.
        with torch.no_grad():
            y_preds = model(images)
            loss = loss_metric(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # Keep class probabilities; accuracy is scored by the caller.
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))

    predictions = np.concatenate(preds)
    return losses.avg, predictions

def inference(model, states, test_loader, device):
    """Predict class probabilities for the test set, averaged over checkpoints.

    Args:
        model: Network whose weights are overwritten by each entry of `states`.
        states: Sequence of checkpoint dicts, each with a `'model'` state dict.
        test_loader: Iterable yielding image batches.
        device: Device used for the forward passes.

    Returns:
        NumPy array of the per-sample probabilities, averaged across `states`.
    """
    model.to(device)
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    probs = []
    for i, (images) in tk0:
        images = images.to(device)
        # One entry per checkpoint. The list must be the same name that is appended
        # to below (the original initialised `avgpreds` and crashed on first append).
        avg_preds = []
        for state in states:
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            avg_preds.append(y_preds.softmax(1).to('cpu').numpy())
        avg_preds = np.mean(avg_preds, axis=0)
        probs.append(avg_preds)
    probs = np.concatenate(probs)
    return probs

## Train loop

In [23]:
# ======================================================
# Train loop
# ======================================================

def train_loop(folds, fold):
    """Train and validate a single cross-validation fold.

    The backbone is frozen for the first epoch (only the classifier trains, at
    `CFG.lr_1`); from the second epoch on every parameter is trainable and a
    fresh optimizer/scheduler pair is created at `CFG.lr_2`.

    Args:
        folds: DataFrame with a `fold` column plus the image/label columns.
        fold: Index of the fold held out for validation.

    Returns:
        The validation rows of this fold, extended with one probability column
        per class and a `preds` column, taken from the best-accuracy checkpoint.

    Raises:
        ValueError: If `CFG.scheduler` or `CFG.loss_train` names an unknown option.
    """

    seed_torch(seed=CFG.seed)

    LOGGER.info(f'========== fold: {fold} training ============')

    # ======================================================
    # loader
    # ======================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ===============================================
    # scheduler
    # ===============================================
    def get_scheduler(optimizer):
        # NOTE(review): the ReduceLROnPlateau / CosineAnnealingLR branches read
        # CFG.factor, CFG.patience, CFG.eps and CFG.T_max, which the CFG class in
        # this notebook does not define -- add them before selecting those options.
        if CFG.scheduler=='ReduceLROnPlateau':
            return ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        if CFG.scheduler=='CosineAnnealingLR':
            return CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        if CFG.scheduler=='CosineAnnealingWarmRestarts':
            return CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        raise ValueError(f'Unknown CFG.scheduler: {CFG.scheduler!r}')

    # ===============================================
    # model & optimizer
    # ===============================================
    model = CustomEfficientNetB4ns(CFG.model_name, pretrained=True)

    # Freeze everything except the classifier head for the first epoch.
    for name, param in model.model.named_parameters():
        if 'classifier' not in name:
            param.requires_grad=False

    model.to(device)

    def build_optimizer(lr):
        # SAM wrapped around Adam; the same recipe is reused after unfreezing.
        return SAM(model.parameters(), Adam, lr=lr, weight_decay=CFG.weight_decay, amsgrad=False)

    optimizer = build_optimizer(CFG.lr_1)

    # ===============================================
    # apex 
    # ===============================================
    if CFG.apex:
        # amp.initialize returns the patched (model, optimizer) pair; unpack it
        # (the original assigned the tuple to `model.optimizer`).
        # NOTE(review): the optimizer rebuilt at the second epoch is not passed
        # through amp.initialize -- confirm this works with the apex version in use.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    scheduler = get_scheduler(optimizer)

    # ===============================================
    # loop
    # ===============================================
    def get_loss_train():
        if CFG.loss_train == 'CrossEntropyLoss':
            return nn.CrossEntropyLoss()
        if CFG.loss_train == 'LabelSmoothing':
            return LabelSmoothingLoss(classes=CFG.target_size, smoothing=CFG.smooth)
        if CFG.loss_train == 'FocalLoss':
            return FocalLoss().to(device)
        if CFG.loss_train == 'FocalCosineLoss':
            return FocalCosineLoss()
        if CFG.loss_train == 'SymmetricCrossEntropyLoss':
            return SymmetricCrossEntropy().to(device)
        if CFG.loss_train == 'BiTemperedLoss':
            return BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smooth)
        raise ValueError(f'Unknown CFG.loss_train: {CFG.loss_train!r}')

    loss_train = get_loss_train()
    LOGGER.info(f'loss_train: {loss_train}')
    # Plain cross-entropy is what gets logged, whatever loss is optimised.
    loss_metric = nn.CrossEntropyLoss()

    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        if epoch == 1:

            # From the second epoch on, unfreeze every parameter ...
            for param in model.model.parameters():
                param.requires_grad = True

            # ... and restart with a fresh optimizer/scheduler at the lower
            # learning rate (CFG.lr_1 -> CFG.lr_2).
            optimizer = build_optimizer(CFG.lr_2)
            scheduler = get_scheduler(optimizer)

            LOGGER.info('requires_grad of all parameters are unlocked')

        # train
        avg_loss = train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, loss_metric, device)
        valid_labels = valid_folds[CFG.target_col].values

        # ReduceLROnPlateau steps on the monitored value; the cosine schedulers do not.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, (CosineAnnealingLR, CosineAnnealingWarmRestarts)):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')

        # Also keep every epoch's weights so any of them can be used for inference.
        torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    # One probability column per class; follow CFG.target_size rather than a literal 5.
    valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)

    return valid_folds

In [24]:
# ====================================================
# main
# ====================================================
def main():
    """Run the training and/or inference stages enabled in ``CFG``.

    Training (``CFG.train``): calls ``train_loop(folds, fold)`` for every fold
    in ``range(CFG.n_fold)`` that is listed in ``CFG.trn_fold``, logs the score
    of each fold and of all trained folds combined, and writes the combined
    out-of-fold frame to ``OUTPUT_DIR + 'oof_df.csv'``.

    Inference (``CFG.inference``): loads the ``*_best.pth`` checkpoint of every
    fold in ``CFG.trn_fold``, runs ``inference`` over the test set, and writes
    ``OUTPUT_DIR + 'submission.csv'`` with the arg-max class per image.

    Returns:
        None. Results are written to disk and reported through ``LOGGER``.
    """

    def get_result(result_df):
        """Log the score of the ``preds`` column against ``CFG.target_col``."""
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')

    if CFG.train:
        # train
        # Collect per-fold frames and concatenate once, instead of growing a
        # DataFrame inside the loop starting from an empty frame.
        fold_results = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(folds, fold)
                fold_results.append(_oof_df)
                LOGGER.info(f'=============== fold: {fold} result ================')
                get_result(_oof_df)

        if fold_results:
            oof_df = pd.concat(fold_results)
            # CV result
            LOGGER.info(f'============ CV ============')
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        else:
            # Previously this path raised KeyError on the missing 'preds' column.
            LOGGER.warning('No fold from CFG.trn_fold was trained; skipping CV result')

    if CFG.inference:
        # Local import keeps this block self-contained regardless of which
        # torch names the notebook imported at the top.
        from torch.utils.data import DataLoader

        # inference
        model = CustomEfficientNetB4ns(CFG.model_name, pretrained=False)
        states = [torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth') for fold in CFG.trn_fold]
        # The original passed DataLoader keyword arguments to TestDataset and
        # never created `test_loader`, so inference failed before predicting.
        # NOTE(review): assumes TestDataset accepts `transform=` and that
        # get_transforms(data='valid') is the notebook's validation transform
        # factory -- confirm against their definitions earlier in the notebook.
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = DataLoader(test_dataset,
                                 batch_size=CFG.batch_size,
                                 shuffle=False,  # keep predictions aligned with rows of `test`
                                 num_workers=CFG.num_workers,
                                 pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        test['label'] = predictions.argmax(1)
        test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

In [25]:
# Record the selected compute device in the run log before training starts.
LOGGER.info(f'used device: {device}')

used device: cuda


In [26]:
# Entry point: run the stages enabled in CFG via main(). The guard prevents
# main() from running if this code is imported rather than executed directly.
if __name__ == '__main__':
    main()

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_b4_ns-d6313a46.pth
loss_train: BiTemperedLogisticLoss()


Epoch: [1][0/2634]Data 0.958 (0.958)Elapsed 0m 2s (remain 92m 1s)Loss: 1.5684(1.5684)Grad: 0.7996  
Epoch: [1][100/2634]Data 0.000 (0.010)Elapsed 0m 26s (remain 10m 57s)Loss: 1.6975(1.3986)Grad: 0.7264  
Epoch: [1][200/2634]Data 0.000 (0.005)Elapsed 0m 49s (remain 10m 0s)Loss: 0.5229(1.3874)Grad: 0.6896  
Epoch: [1][300/2634]Data 0.000 (0.004)Elapsed 1m 13s (remain 9m 28s)Loss: 1.4204(1.3920)Grad: 0.6751  
Epoch: [1][400/2634]Data 0.000 (0.003)Elapsed 1m 37s (remain 9m 1s)Loss: 0.6445(1.3530)Grad: 0.4929  
Epoch: [1][500/2634]Data 0.000 (0.003)Elapsed 2m 1s (remain 8m 35s)Loss: 1.4101(1.3466)Grad: 0.5456  
Epoch: [1][600/2634]Data 0.000 (0.002)Elapsed 2m 24s (remain 8m 8s)Loss: 0.8687(1.3225)Grad: 0.6008  
Epoch: [1][700/2634]Data 0.000 (0.002)Elapsed 2m 48s (remain 7m 44s)Loss: 1.0893(1.3152)Grad: 0.6351  
Epoch: [1][800/2634]Data 0.000 (0.002)Elapsed 3m 12s (remain 7m 20s)Loss: 0.8757(1.2961)Grad: 0.6917  
Epoch: [1][900/2634]Data 0.006 (0.002)Elapsed 3m 36s (remain 6m 55s)Loss: 1.91

Epoch 1 - avg_train_loss: 1.1446 avg_val_loss: 0.7970 time: 712s
Epoch 1 - Accuracy: 0.7184355420543004
Epoch 1 - Save Best Score: 0.7184 Model


EVAL: [658/659] Data 0.000 (0.010) Elapsed 1m 23s (remain 0m 0s) Loss: 1.0355(0.7970) 


requires_grad of all parameters are unlocked


Epoch: [2][0/2634]Data 0.750 (0.750)Elapsed 0m 1s (remain 86m 18s)Loss: 0.8898(0.8898)Grad: 4.5573  
Epoch: [2][100/2634]Data 0.000 (0.008)Elapsed 1m 28s (remain 37m 1s)Loss: 1.2570(0.9967)Grad: 4.0677  
Epoch: [2][200/2634]Data 0.000 (0.004)Elapsed 2m 55s (remain 35m 21s)Loss: 0.5997(0.9592)Grad: 2.6860  
Epoch: [2][300/2634]Data 0.000 (0.003)Elapsed 4m 22s (remain 33m 53s)Loss: 0.3879(0.9301)Grad: 1.7720  
Epoch: [2][400/2634]Data 0.000 (0.002)Elapsed 5m 49s (remain 32m 23s)Loss: 0.2918(0.8892)Grad: 1.7777  
Epoch: [2][500/2634]Data 0.000 (0.002)Elapsed 7m 15s (remain 30m 55s)Loss: 0.1940(0.8575)Grad: 2.8358  
Epoch: [2][600/2634]Data 0.000 (0.001)Elapsed 8m 42s (remain 29m 27s)Loss: 0.3917(0.8461)Grad: 3.0145  
Epoch: [2][700/2634]Data 0.000 (0.001)Elapsed 10m 9s (remain 27m 59s)Loss: 0.3697(0.8360)Grad: 2.3605  
Epoch: [2][800/2634]Data 0.000 (0.001)Elapsed 11m 35s (remain 26m 32s)Loss: 0.7874(0.8235)Grad: 3.1490  
Epoch: [2][900/2634]Data 0.000 (0.001)Elapsed 13m 2s (remain 25m 5s

Epoch 2 - avg_train_loss: 0.7259 avg_val_loss: 0.4856 time: 2366s
Epoch 2 - Accuracy: 0.8651984051642301
Epoch 2 - Save Best Score: 0.8652 Model


EVAL: [658/659] Data 0.000 (0.003) Elapsed 1m 21s (remain 0m 0s) Loss: 0.0735(0.4856) 
Epoch: [3][0/2634]Data 0.769 (0.769)Elapsed 0m 1s (remain 78m 42s)Loss: 0.0125(0.0125)Grad: 0.7072  
Epoch: [3][100/2634]Data 0.000 (0.008)Elapsed 1m 28s (remain 37m 8s)Loss: 1.3066(0.7332)Grad: 2.2775  
Epoch: [3][200/2634]Data 0.000 (0.004)Elapsed 2m 55s (remain 35m 26s)Loss: 0.1742(0.7206)Grad: 1.6612  
Epoch: [3][300/2634]Data 0.000 (0.003)Elapsed 4m 22s (remain 33m 55s)Loss: 0.6645(0.6932)Grad: 2.1823  
Epoch: [3][400/2634]Data 0.000 (0.002)Elapsed 5m 49s (remain 32m 27s)Loss: 1.5692(0.6772)Grad: 1.0248  
Epoch: [3][500/2634]Data 0.000 (0.002)Elapsed 7m 16s (remain 30m 58s)Loss: 0.1871(0.6625)Grad: 1.8187  
Epoch: [3][600/2634]Data 0.000 (0.001)Elapsed 8m 43s (remain 29m 29s)Loss: 0.2565(0.6383)Grad: 2.2376  
Epoch: [3][700/2634]Data 0.000 (0.001)Elapsed 10m 10s (remain 28m 2s)Loss: 0.1701(0.6558)Grad: 1.1149  
Epoch: [3][800/2634]Data 0.000 (0.001)Elapsed 11m 36s (remain 26m 34s)Loss: 1.8825(0.

Epoch 3 - avg_train_loss: 0.6153 avg_val_loss: 0.4758 time: 2367s
Epoch 3 - Accuracy: 0.87583064363015
Epoch 3 - Save Best Score: 0.8758 Model


EVAL: [658/659] Data 0.000 (0.004) Elapsed 1m 20s (remain 0m 0s) Loss: 0.0450(0.4758) 
Epoch: [4][0/2634]Data 0.820 (0.820)Elapsed 0m 1s (remain 78m 47s)Loss: 0.0130(0.0130)Grad: 0.4908  
Epoch: [4][100/2634]Data 0.000 (0.008)Elapsed 1m 28s (remain 36m 52s)Loss: 0.0626(0.6330)Grad: 1.6549  
Epoch: [4][200/2634]Data 0.000 (0.004)Elapsed 2m 54s (remain 35m 17s)Loss: 0.5729(0.6513)Grad: 1.7227  
Epoch: [4][300/2634]Data 0.000 (0.003)Elapsed 4m 21s (remain 33m 48s)Loss: 0.0050(0.6367)Grad: 0.3184  
Epoch: [4][400/2634]Data 0.000 (0.002)Elapsed 5m 48s (remain 32m 19s)Loss: 0.1059(0.6317)Grad: 1.6769  
Epoch: [4][500/2634]Data 0.000 (0.002)Elapsed 7m 15s (remain 30m 53s)Loss: 0.2262(0.6016)Grad: 2.0135  
Epoch: [4][600/2634]Data 0.000 (0.002)Elapsed 8m 42s (remain 29m 26s)Loss: 0.0320(0.6017)Grad: 1.1884  
Epoch: [4][700/2634]Data 0.000 (0.001)Elapsed 10m 8s (remain 27m 59s)Loss: 2.4028(0.6087)Grad: 3.8103  
Epoch: [4][800/2634]Data 0.000 (0.001)Elapsed 11m 36s (remain 26m 32s)Loss: 1.1310(0

Epoch 4 - avg_train_loss: 0.5907 avg_val_loss: 0.4748 time: 2369s
Epoch 4 - Accuracy: 0.8824757926713499
Epoch 4 - Save Best Score: 0.8825 Model


EVAL: [658/659] Data 0.000 (0.004) Elapsed 1m 20s (remain 0m 0s) Loss: 0.0244(0.4748) 
Epoch: [5][0/2634]Data 0.818 (0.818)Elapsed 0m 1s (remain 80m 36s)Loss: 0.2934(0.2934)Grad: 2.4936  
Epoch: [5][100/2634]Data 0.000 (0.008)Elapsed 1m 28s (remain 36m 56s)Loss: 1.2610(0.5473)Grad: 1.4227  
Epoch: [5][200/2634]Data 0.000 (0.004)Elapsed 2m 54s (remain 35m 18s)Loss: 0.0429(0.5828)Grad: 0.8477  
Epoch: [5][300/2634]Data 0.000 (0.003)Elapsed 4m 21s (remain 33m 49s)Loss: 0.0239(0.5655)Grad: 0.6262  
Epoch: [5][400/2634]Data 0.000 (0.002)Elapsed 5m 48s (remain 32m 19s)Loss: 2.4758(0.5735)Grad: 2.8125  
Epoch: [5][500/2634]Data 0.000 (0.002)Elapsed 7m 14s (remain 30m 51s)Loss: 0.5949(0.5429)Grad: 3.4200  
Epoch: [5][600/2634]Data 0.000 (0.001)Elapsed 8m 41s (remain 29m 24s)Loss: 0.8694(0.5436)Grad: 0.6795  
Epoch: [5][700/2634]Data 0.000 (0.001)Elapsed 10m 8s (remain 27m 56s)Loss: 2.1800(0.5741)Grad: 2.5271  
Epoch: [5][800/2634]Data 0.000 (0.001)Elapsed 11m 34s (remain 26m 29s)Loss: 1.2873(0

Epoch 5 - avg_train_loss: 0.5676 avg_val_loss: 0.4538 time: 2366s
Epoch 5 - Accuracy: 0.8870324662996013
Epoch 5 - Save Best Score: 0.8870 Model


EVAL: [658/659] Data 0.000 (0.003) Elapsed 1m 20s (remain 0m 0s) Loss: 0.0375(0.4538) 
Epoch: [6][0/2634]Data 0.619 (0.619)Elapsed 0m 1s (remain 80m 23s)Loss: 1.4918(1.4918)Grad: 1.9210  
Epoch: [6][100/2634]Data 0.000 (0.006)Elapsed 1m 28s (remain 36m 59s)Loss: 0.2748(0.5952)Grad: 1.3094  
Epoch: [6][200/2634]Data 0.000 (0.003)Elapsed 2m 55s (remain 35m 20s)Loss: 1.2635(0.6067)Grad: 1.9577  
Epoch: [6][300/2634]Data 0.000 (0.002)Elapsed 4m 21s (remain 33m 48s)Loss: 0.1332(0.5780)Grad: 1.3376  
Epoch: [6][400/2634]Data 0.000 (0.002)Elapsed 5m 48s (remain 32m 20s)Loss: 0.0206(0.5809)Grad: 0.8614  
Epoch: [6][500/2634]Data 0.000 (0.001)Elapsed 7m 15s (remain 30m 52s)Loss: 0.6608(0.5840)Grad: 1.4442  
Epoch: [6][600/2634]Data 0.000 (0.001)Elapsed 8m 41s (remain 29m 24s)Loss: 0.0175(0.5783)Grad: 0.4384  
Epoch: [6][700/2634]Data 0.000 (0.001)Elapsed 10m 8s (remain 27m 57s)Loss: 0.1251(0.5686)Grad: 1.5464  
Epoch: [6][800/2634]Data 0.000 (0.001)Elapsed 11m 35s (remain 26m 31s)Loss: 0.1629(0

Epoch 6 - avg_train_loss: 0.5511 avg_val_loss: 0.4644 time: 2367s
Epoch 6 - Accuracy: 0.8900702487184355
Epoch 6 - Save Best Score: 0.8901 Model


EVAL: [658/659] Data 0.000 (0.005) Elapsed 1m 21s (remain 0m 0s) Loss: 0.0184(0.4644) 
Epoch: [7][0/2634]Data 0.780 (0.780)Elapsed 0m 1s (remain 78m 23s)Loss: 0.0914(0.0914)Grad: 1.1535  
Epoch: [7][100/2634]Data 0.000 (0.008)Elapsed 1m 28s (remain 36m 57s)Loss: 0.9993(0.5873)Grad: 2.4593  
Epoch: [7][200/2634]Data 0.000 (0.004)Elapsed 2m 54s (remain 35m 17s)Loss: 1.6850(0.5604)Grad: 1.2680  
Epoch: [7][300/2634]Data 0.000 (0.003)Elapsed 4m 21s (remain 33m 48s)Loss: 0.1761(0.5639)Grad: 1.5368  
Epoch: [7][400/2634]Data 0.000 (0.002)Elapsed 5m 48s (remain 32m 20s)Loss: 0.1931(0.5456)Grad: 3.2645  
Epoch: [7][500/2634]Data 0.000 (0.002)Elapsed 7m 15s (remain 30m 52s)Loss: 0.3252(0.5323)Grad: 2.5508  
Epoch: [7][600/2634]Data 0.000 (0.001)Elapsed 8m 41s (remain 29m 24s)Loss: 0.2098(0.5304)Grad: 1.4188  
Epoch: [7][700/2634]Data 0.000 (0.001)Elapsed 10m 8s (remain 27m 57s)Loss: 0.1818(0.5241)Grad: 1.7843  
Epoch: [7][800/2634]Data 0.000 (0.001)Elapsed 11m 34s (remain 26m 30s)Loss: 0.3884(0

Epoch 7 - avg_train_loss: 0.5439 avg_val_loss: 0.4461 time: 2365s
Epoch 7 - Accuracy: 0.8904499715207899
Epoch 7 - Save Best Score: 0.8904 Model


EVAL: [658/659] Data 0.000 (0.005) Elapsed 1m 21s (remain 0m 0s) Loss: 0.0177(0.4461) 
Epoch: [8][0/2634]Data 0.783 (0.783)Elapsed 0m 1s (remain 78m 39s)Loss: 0.2274(0.2274)Grad: 1.4096  
Epoch: [8][100/2634]Data 0.000 (0.008)Elapsed 1m 28s (remain 36m 57s)Loss: 0.2701(0.5352)Grad: 2.6655  
Epoch: [8][200/2634]Data 0.000 (0.004)Elapsed 2m 54s (remain 35m 17s)Loss: 0.0107(0.6023)Grad: 0.4550  
Epoch: [8][300/2634]Data 0.000 (0.003)Elapsed 4m 21s (remain 33m 48s)Loss: 0.0549(0.5824)Grad: 0.7284  
Epoch: [8][400/2634]Data 0.000 (0.002)Elapsed 5m 48s (remain 32m 19s)Loss: 0.9910(0.5382)Grad: 1.4691  
Epoch: [8][500/2634]Data 0.000 (0.002)Elapsed 7m 14s (remain 30m 51s)Loss: 0.8349(0.5178)Grad: 1.0936  
Epoch: [8][600/2634]Data 0.000 (0.001)Elapsed 8m 41s (remain 29m 23s)Loss: 0.0363(0.5245)Grad: 1.1230  
Epoch: [8][700/2634]Data 0.000 (0.001)Elapsed 10m 8s (remain 27m 56s)Loss: 0.8801(0.5160)Grad: 1.9750  
Epoch: [8][800/2634]Data 0.000 (0.001)Elapsed 11m 34s (remain 26m 29s)Loss: 4.8063(0

Epoch 8 - avg_train_loss: 0.5382 avg_val_loss: 0.4521 time: 2364s
Epoch 8 - Accuracy: 0.8864628820960698


EVAL: [658/659] Data 0.000 (0.003) Elapsed 1m 20s (remain 0m 0s) Loss: 0.0322(0.4521) 
Epoch: [9][0/2634]Data 0.643 (0.643)Elapsed 0m 1s (remain 77m 33s)Loss: 0.1961(0.1961)Grad: 2.5605  
Epoch: [9][100/2634]Data 0.000 (0.007)Elapsed 1m 28s (remain 37m 3s)Loss: 2.8273(0.6139)Grad: 1.5933  
Epoch: [9][200/2634]Data 0.000 (0.003)Elapsed 2m 55s (remain 35m 25s)Loss: 0.0005(0.5934)Grad: 0.1474  
Epoch: [9][300/2634]Data 0.000 (0.002)Elapsed 4m 22s (remain 33m 52s)Loss: 1.1699(0.5586)Grad: 1.2154  
Epoch: [9][400/2634]Data 0.000 (0.002)Elapsed 5m 49s (remain 32m 24s)Loss: 1.2691(0.5409)Grad: 2.5975  
Epoch: [9][500/2634]Data 0.000 (0.001)Elapsed 7m 15s (remain 30m 55s)Loss: 0.2643(0.5236)Grad: 1.5622  
Epoch: [9][600/2634]Data 0.000 (0.001)Elapsed 8m 42s (remain 29m 27s)Loss: 0.1115(0.5132)Grad: 1.6861  
Epoch: [9][700/2634]Data 0.000 (0.001)Elapsed 10m 9s (remain 27m 59s)Loss: 1.0512(0.5017)Grad: 0.9661  
Epoch: [9][800/2634]Data 0.000 (0.001)Elapsed 11m 35s (remain 26m 32s)Loss: 0.4452(0.

Epoch 9 - avg_train_loss: 0.5164 avg_val_loss: 0.5045 time: 2367s
Epoch 9 - Accuracy: 0.8923485855325612
Epoch 9 - Save Best Score: 0.8923 Model


EVAL: [658/659] Data 0.000 (0.004) Elapsed 1m 21s (remain 0m 0s) Loss: 0.0165(0.5045) 
Epoch: [10][0/2634]Data 0.767 (0.767)Elapsed 0m 1s (remain 80m 59s)Loss: 0.4201(0.4201)Grad: 2.1624  
Epoch: [10][100/2634]Data 0.000 (0.008)Elapsed 1m 28s (remain 37m 3s)Loss: 0.0975(0.5395)Grad: 1.1003  
Epoch: [10][200/2634]Data 0.000 (0.004)Elapsed 2m 55s (remain 35m 26s)Loss: 0.0576(0.5000)Grad: 1.5630  
Epoch: [10][300/2634]Data 0.000 (0.003)Elapsed 4m 22s (remain 33m 54s)Loss: 0.0069(0.5355)Grad: 0.2961  
Epoch: [10][400/2634]Data 0.000 (0.002)Elapsed 5m 49s (remain 32m 24s)Loss: 1.2756(0.5280)Grad: 3.7298  
Epoch: [10][500/2634]Data 0.000 (0.002)Elapsed 7m 15s (remain 30m 56s)Loss: 0.0887(0.5076)Grad: 1.5821  
Epoch: [10][600/2634]Data 0.000 (0.001)Elapsed 8m 42s (remain 29m 28s)Loss: 0.0102(0.5002)Grad: 0.4507  
Epoch: [10][700/2634]Data 0.000 (0.001)Elapsed 10m 9s (remain 28m 0s)Loss: 0.3816(0.5019)Grad: 1.5036  
Epoch: [10][800/2634]Data 0.000 (0.001)Elapsed 11m 36s (remain 26m 32s)Loss: 0

Epoch 10 - avg_train_loss: 0.5166 avg_val_loss: 0.4546 time: 2365s
Epoch 10 - Accuracy: 0.8902601101196127


EVAL: [658/659] Data 0.000 (0.003) Elapsed 1m 20s (remain 0m 0s) Loss: 0.0133(0.4546) 


Score: 0.89235
Score: 0.89235
