In [1]:
# !pip install pytorch_ranger

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [3]:
# List the entries of the competition input directory (cell output shown below).
os.listdir('../input/cassava-leaf-disease-classification/')

['train_tfrecords',
 'sample_submission.csv',
 'test_tfrecords',
 'label_num_to_disease_map.json',
 'train_images',
 'train.csv',
 'test_images']

In [4]:
# Training labels are read from the merged dataset's CSV; the preview below shows it
# carries a `source` column in addition to `image_id` and `label`.
train = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
# The sample submission doubles as the list of test images to predict.
test = pd.read_csv('../input/cassava-leaf-disease-classification//sample_submission.csv')
# Label-id -> disease-name mapping, loaded with one row per label id.
label_map = pd.read_json('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json', orient='index')

# Preview the three tables.
display(train.head())
display(test.head())
display(label_map)

Unnamed: 0,image_id,label,source
0,1000015157.jpg,0,2020
1,1000201771.jpg,3,2020
2,100042118.jpg,1,2020
3,1000723321.jpg,1,2020
4,1000812911.jpg,3,2020


Unnamed: 0,image_id,label
0,2216849948.jpg,4


Unnamed: 0,0
0,Cassava Bacterial Blight (CBB)
1,Cassava Brown Streak Disease (CBSD)
2,Cassava Green Mottle (CGM)
3,Cassava Mosaic Disease (CMD)
4,Healthy


## Directory settings

In [5]:
# Directory that receives the log file, model checkpoints and result CSVs.
OUTPUT_DIR = './'
# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Image folders read by TrainDataset and TestDataset respectively.
TRAIN_PATH = '../input/cassava-leaf-disease-merged/train'
TEST_PATH = '../input/cassava-leaf-disease-classification/test_images'

## CFG

In [6]:
class CFG:
    """Experiment configuration, read everywhere as class-level constants."""
    debug = False                  # True: 3 epochs on a 1000-row sample (see below)
    apex = False                   # True: scale the loss through apex.amp in train_fn
    print_freq = 100               # print progress every N steps
    num_workers = 4                # DataLoader worker processes
    model_name = 'swsl_resnext50_32x4d'   # timm model name; also used in checkpoint file names
    size = 410                     # square image side produced by the transforms
    scheduler = 'CosineAnnealingWarmRestarts'   # one of the names handled in train_loop.get_scheduler
    loss_train = 'BiTemperedLoss'  # one of the names handled in train_loop.get_loss_train
    epochs = 10
    T_0 = 10                       # CosineAnnealingWarmRestarts: epochs until the first restart
    T_max = 10                     # CosineAnnealingLR only
    factor = 0.2                   # ReduceLROnPlateau only
    patience = 4                   # ReduceLROnPlateau only
    eps = 1e-6                     # ReduceLROnPlateau only
    lr_1 = 1e-3                    # learning rate for the first epoch (backbone frozen)
    lr_2 = 1e-4                    # learning rate once all parameters are unfrozen
    t1 = 0.9                       # bi-tempered loss temperature 1
    t2 = 1.5                       # bi-tempered loss temperature 2
    smooth = 1e-2                  # label smoothing passed to the training loss
    min_lr = 1e-6                  # eta_min of the cosine schedulers
    batch_size = 32
    weight_decay = 1e-6
    gradient_accumulation_steps = 1
    max_grad_norm = 1000           # gradient-norm clipping threshold
    seed = 42
    target_size = 5                # number of output classes
    target_col = 'label'
    n_fold = 5
    trn_fold = [0]                 # folds that are actually trained / used for inference
    train = True
    inference = False

# Debug mode: shorten training and subsample the training table.
if CFG.debug:
    CFG.epochs = 3
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

## Library

In [7]:
import sys
sys.path.append('../input/pytorch-image-models/')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

sys.path.append('../input/pytorch-sam')
from sam import SAM

from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
warnings.filterwarnings('ignore')

# apex is imported only when CFG.apex is enabled, so the notebook runs without it otherwise.
if CFG.apex:
    from apex import amp

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [8]:
def get_score(y_true, y_pred):
    """Return the accuracy of the predicted labels `y_pred` against `y_true`."""
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

@contextmanager
def timer(name):
    """Context manager that logs when the named section starts and how long it took.

    The "done" line is only logged when the wrapped body finishes without raising.
    """
    began = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    LOGGER.info(f'[{name}] done in {time.time() - began:.0f}')
    
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create the notebook logger, writing bare messages to the console and to `log_file`.

    Args:
        log_file: Path of the log file to write to.

    Returns:
        The configured `logging.Logger` named after this module.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell would
    # otherwise stack handlers and emit each message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = init_logger()

def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Seed all RNGs up front with the configured seed.
seed_torch(seed=CFG.seed)

## CV split

In [9]:
# Work on a copy so the original `train` table is left untouched.
folds = train.copy()
# Stratify on the target column so each fold keeps the class proportions (see the printed counts).
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    # Rows in the n-th validation split get fold id n.
    folds.loc[val_index, 'fold'] = int(n)
# Convert the fold column to integers once every row has been assigned.
folds['fold'] = folds['fold'].astype(int)
# Sanity check: per-fold class counts.
print(folds.groupby(['fold', CFG.target_col]).size())

fold  label
0     0         299
      1         695
      2         604
      3        3092
      4         578
1     0         299
      1         695
      2         604
      3        3092
      4         578
2     0         298
      1         695
      2         603
      3        3093
      4         578
3     0         298
      1         695
      2         603
      3        3093
      4         578
4     0         298
      1         696
      2         603
      3        3092
      4         578
dtype: int64


## Dataset

In [10]:
class TrainDataset(Dataset):
    """Labelled image dataset backed by a DataFrame with `image_id` and `label` columns.

    Images are read from `TRAIN_PATH/<image_id>` with OpenCV and converted to RGB.

    Args:
        df: DataFrame providing the `image_id` and `label` columns.
        transform: Optional callable invoked as `transform(image=...)` that returns
            a mapping with an `'image'` entry.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.labels = df['label'].values
#         self.labels = pd.get_dummies(df['label']).values  # One Hot Encoding
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return `(image, label)` for row `idx`; `label` is a long tensor.

        Raises:
            OSError: If OpenCV cannot read the image file.
        """
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising; fail here
        # with the offending path rather than with an opaque cvtColor error.
        if image is None:
            raise OSError(f'cv2.imread could not read image: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).long()
        return image, label
    
class TestDataset(Dataset):
    """Unlabelled image dataset backed by a DataFrame with an `image_id` column.

    Images are read from `TEST_PATH/<image_id>` with OpenCV and converted to RGB.

    Args:
        df: DataFrame providing the `image_id` column.
        transform: Optional callable invoked as `transform(image=...)` that returns
            a mapping with an `'image'` entry.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image for row `idx`.

        Raises:
            OSError: If OpenCV cannot read the image file.
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising; fail here
        # with the offending path rather than with an opaque cvtColor error.
        if image is None:
            raise OSError(f'cv2.imread could not read image: {file_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

In [11]:
# train_dataset = TrainDataset(train, transform=None)

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image)
#     plt.title(f'label: {label}')
#     plt.show()

## Transforms

In [12]:
def get_transforms(*, data):
    """Build the albumentations pipeline for one data split.

    Args:
        data: `'train'` for the augmenting pipeline, `'valid'` for the deterministic
            resize-and-normalize pipeline.

    Returns:
        A `Compose` that ends with `Normalize` and `ToTensorV2`.

    Raises:
        ValueError: If `data` is neither `'train'` nor `'valid'`. Returning None here
            would silently disable all preprocessing in the datasets.
    """
    if data == 'train':
        return Compose([
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    elif data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    raise ValueError(f"unknown data split {data!r}; expected 'train' or 'valid'")

In [13]:
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))

# for i in range(1):
#     image, label = train_dataset[i]
#     plt.imshow(image[0])
#     plt.title(f'label: {label}')
#     plt.show()

## MODEL

In [14]:
class Custom_swsl_resnext101_32x8d(nn.Module):
    """timm model whose `fc` head is replaced by a `CFG.target_size`-way linear layer.

    Args:
        model_name: Name passed to `timm.create_model`.
        pretrained: Forwarded to `timm.create_model`.
    """
    def __init__(self, model_name='swsl_resnext101_32x8d', pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        self.model = backbone
        # Swap the classifier for one sized to this task's class count.
        backbone.fc = nn.Linear(backbone.fc.in_features, CFG.target_size)

    def forward(self, x):
        """Return the wrapped model's output for `x`."""
        return self.model(x)

In [15]:
# model = Custom_swsl_resnext101_32x8d(model_name=CFG.model_name, pretrained=False)
# train_dataset = TrainDataset(train, transform=get_transforms(data='train'))
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, 
#                           num_workers=4, pin_memory=True, drop_last=True)

# for image, label in train_loader:
#     print(image.size())
#     output = model(image)
#     print(output)
#     break

## Loss Functions

In [16]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability `1 - smoothing`; the remaining mass is split
    evenly over the other `classes - 1` classes.

    Args:
        classes: Number of classes.
        smoothing: Probability mass moved away from the true class.
        dim: Dimension over which log-softmax and the class sum are taken.
    """
    def __init__(self, classes=5, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the batch-mean smoothed cross-entropy of logits `pred` vs class ids `target`."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            smoothed = torch.full_like(log_probs, off_value)
            # Overwrite the true-class column of every row with the confidence value.
            smoothed.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-smoothed * log_probs, dim=self.dim)
        return torch.mean(per_sample)

In [17]:
class FocalLoss(nn.Module):
    """Multi-class focal loss: `alpha * (1 - p_t) ** gamma * CE`, computed per sample.

    Args:
        alpha: Global scale applied to the loss.
        gamma: Focusing exponent; larger values down-weight well-classified samples.
        reduce: If True return the batch mean, otherwise the per-sample losses.
    """
    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Return the focal loss of logits `inputs` against class ids `targets`."""
        # The modulating factor must see each sample's own cross-entropy; applying it
        # to the batch-mean CE would weight every sample identically.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-ce_loss)  # model probability of the true class
        F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [18]:
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss plus `xent` times a focal loss on L2-normalized logits.

    Args:
        alpha: Scale of the focal term.
        gamma: Focusing exponent of the focal term.
        xent: Weight of the focal term in the returned sum.
    """
    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        # "Similar pair" target for cosine_embedding_loss. Created on the CPU and moved
        # to the logits' device in forward, so the loss also works without CUDA.
        self.y = torch.Tensor([1])

    def forward(self, input, target, reduction="mean"):
        """Return the combined loss of logits `input` against class ids `target`.

        `reduction` is forwarded to the cosine term; the focal term is averaged only
        when `reduction == "mean"`.
        """
        # Float one-hot on the same device/dtype as the logits.
        one_hot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        # One "similar" flag per sample, on the logits' device.
        similar = self.y.to(input.device).expand(input.size(0))
        cosine_loss = F.cosine_embedding_loss(input, one_hot, similar, reduction=reduction)

        # reduction='none' replaces the deprecated reduce=False.
        cent_loss = F.cross_entropy(F.normalize(input), target, reduction='none')
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [19]:
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum `alpha * CE + beta * RCE` of two cross-entropy terms.

    Args:
        alpha: Weight of the standard cross-entropy term.
        beta: Weight of the second ("reverse") term.
        num_classes: Number of classes used to one-hot encode the targets.
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes=5):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Return the loss of `logits` against class ids `targets`.

        `reduction` may be `'mean'`, `'sum'` or anything else for no reduction of the
        second term; it is also forwarded to `F.cross_entropy`.
        """
        # Build the one-hot matrix on the logits' device instead of hard-coding CUDA,
        # so the loss runs on CPU and indexing never mixes devices.
        onehot_targets = torch.eye(self.num_classes, device=logits.device)[targets]
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        # NOTE(review): this term is one-hot * log(softmax), i.e. ordinary cross-entropy
        # again; a reverse CE would be softmax * log(clamped one-hot). Numerics are kept
        # as they were - confirm which is intended before relying on the "symmetric" name.
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [20]:
def log_t(u, t):
    """Tempered logarithm of `u`: `(u^(1-t) - 1) / (1 - t)`, or `log(u)` when `t == 1`."""
    if t != 1.0:
        return (u.pow(1.0 - t) - 1.0) / (1.0 - t)
    return u.log()

def exp_t(u, t):
    """Tempered exponential of `u`: `relu(1 + (1-t)u)^(1/(1-t))`, or `exp(u)` when `t == 1`."""
    if t != 1:
        return (1.0 + (1.0-t)*u).relu().pow(1.0 / (1.0 - t))
    return u.exp()

def compute_normalization_fixed_point(activations, t, num_iters):
    """Fixed-point estimate of the tempered-softmax normalization constant (t > 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of fixed-point iterations.
    Return: A tensor shaped like `activations` except that the last dimension is 1.
    """
    # Shift by the per-row maximum before iterating.
    mu, _ = torch.max(activations, -1, keepdim=True)
    shifted = activations - mu

    current = shifted
    for _ in range(num_iters):
        partition = exp_t(current, t).sum(-1, keepdim=True)
        current = shifted * partition.pow(1.0-t)

    partition = exp_t(current, t).sum(-1, keepdim=True)
    # Undo the max shift when returning the constant.
    return mu - log_t(1.0 / partition, t)

def compute_normalization_binary_search(activations, t, num_iters):

    """Returns the normalization value for each example (t < 1.0).

    Bisects on the normalization constant so that the tempered exponentials of the
    shifted activations sum to approximately 1 along the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    # Shift by the per-row maximum; it is added back to the result at the end.
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    # Number of entries per row above -1/(1-t), i.e. where exp_t's relu argument is positive.
    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    # Initial bisection bracket [lower, upper] for the (shifted) normalization constant.
    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        # update == 1 where the midpoint makes the probabilities sum to less than 1:
        # those rows move `upper` down to the midpoint; all others move `lower` up to it.
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    # Midpoint of the final bracket, with the max shift restored.
    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """Autograd function wrapping the iterative normalization solvers with a closed-form backward.

    Forward picks the binary-search solver for `t < 1.0` and the fixed-point solver
    otherwise; backward propagates gradients through the normalized escort distribution.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        solver = (compute_normalization_binary_search if t < 1.0
                  else compute_normalization_fixed_point)
        normalization_constants = solver(activations, t, num_iters)

        ctx.save_for_backward(activations, normalization_constants)
        ctx.t = t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        temperature = ctx.t
        probabilities = exp_t(activations - normalization_constants, temperature)
        # Escort distribution: probabilities^t renormalized over the class dimension.
        escorts = probabilities.pow(temperature)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)

        # Only `activations` receives a gradient; `t` and `num_iters` are non-tensor inputs.
        return escorts * grad_output, None, None

def compute_normalization(activations, t, num_iters=5):
    """Differentiable normalization constant for the tempered softmax.

    Delegates to `ComputeNormalization`, which supplies the custom backward pass.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of solver iterations.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    normalization_constants = ComputeNormalization.apply(activations, t, num_iters)
    return normalization_constants

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax against a zero logit.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor for the positive class.
    """
    zero_logits = torch.zeros_like(activations)
    two_class = torch.stack([activations, zero_logits], dim=-1)
    # Index 0 of the last dimension is the positive class.
    return tempered_softmax(two_class, t, num_iters)[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature; `t == 1.0` falls back to the ordinary softmax.
      num_iters: Number of iterations used to find the normalization constant.
    Returns:
      A probabilities tensor.
    """
    if t != 1.0:
        shift = compute_normalization(activations, t, num_iters)
        return exp_t(activations - shift, t)
    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):
    """Bi-Tempered binary logistic loss, expressed as the two-class multi-class loss.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
      reduction: Forwarded to `bi_tempered_logistic_loss`.
    Returns:
      A loss tensor.
    """
    positive = labels.to(activations.dtype)
    # Class 1 gets the given activation/label; class 0 gets a zero logit and the complement.
    two_class_activations = torch.stack(
            [activations, torch.zeros_like(activations)], dim=-1)
    two_class_labels = torch.stack([positive, 1.0 - positive], dim=-1)
    return bi_tempered_logistic_loss(
            two_class_activations,
            two_class_labels,
            t1,
            t2,
            label_smoothing=label_smoothing,
            num_iters=num_iters,
            reduction=reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):
    """Bi-Tempered Logistic Loss.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: Either a one-hot tensor shaped like `activations`, or a long tensor of
        class ids with one dimension fewer.
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: per-example losses, shaped like `activations` without the last
        dimension.
        ``'mean'``: mean of the per-example losses.
        ``'sum'``: sum of the per-example losses.
        Any other value makes the function return None.
    Returns:
      A loss tensor.
    """
    labels_are_class_ids = len(labels.shape) < len(activations.shape)
    if labels_are_class_ids:
        labels_onehot = torch.zeros_like(activations)
        labels_onehot.scatter_(1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        keep_scale = 1 - label_smoothing * num_classes / (num_classes - 1)
        uniform_floor = label_smoothing / (num_classes - 1)
        labels_onehot = keep_scale * labels_onehot + uniform_floor

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The four terms are combined left to right in the original order.
    label_entropy = labels_onehot * log_t(labels_onehot + 1e-10, t1)
    cross_term = labels_onehot * log_t(probabilities, t1)
    label_power = labels_onehot.pow(2.0 - t1) / (2.0 - t1)
    prob_power = probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = label_entropy - cross_term - label_power + prob_power
    loss_values = loss_values.sum(dim=-1)  # sum over classes

    if reduction == 'mean':
        return loss_values.mean()
    if reduction == 'sum':
        return loss_values.sum()
    if reduction == 'none':
        return loss_values
    return None

In [21]:
class BiTemperedLogisticLoss(nn.Module):
    """`nn.Module` wrapper returning the batch mean of `bi_tempered_logistic_loss`.

    Args:
        t1: Temperature 1 passed to the loss.
        t2: Temperature 2 passed to the loss.
        smoothing: Label smoothing passed to the loss.
    """
    def __init__(self, t1, t2, smoothing=0.0):
        super().__init__()
        self.t1, self.t2 = t1, t2
        self.smoothing = smoothing

    def forward(self, logit_label, truth_label):
        """Return the mean per-example loss of `logit_label` against `truth_label`."""
        per_example = bi_tempered_logistic_loss(
            logit_label,
            truth_label,
            t1=self.t1,
            t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none',
        )
        return per_example.mean()

In [22]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric."""
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Report the elapsed time and a projection of the time remaining.

    Parameters
    ----------
    since : float
        Timestamp (as returned by `time.time()`) at which the work started.
    percent : float
        Fraction of the work completed so far; must be non-zero.

    Returns
    -------
    str
        '<elapsed> (remain <remaining>)', both formatted by `asMinutes`.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total time = elapsed / fraction done.
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

def train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, shechduler, device):
    """Train `model` for one epoch with a two-step (SAM-style) optimizer.

    Args:
        train_loader: Iterable of `(images, labels)` batches.
        model: Network to train.
        loss_train: Loss that is back-propagated.
        loss_metric: Loss that is only recorded for reporting.
        optimizer: Optimizer exposing `first_step(zero_grad=...)` and
            `second_step(zero_grad=...)`.
        epoch: Zero-based epoch index, used for the progress line.
        shechduler: Unused; kept (with its spelling) so existing positional and
            keyword callers keep working.
        device: Device the batches are moved to.

    Returns:
        The sample-weighted average of `loss_metric` over the epoch.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(images)
        metric = loss_metric(y_preds, labels)
        loss = loss_train(y_preds, labels)
        # record loss
        losses.update(metric.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        if CFG.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Both optimizer steps belong to the same update: first_step uses the
            # accumulated gradients, then a second forward/backward pass supplies the
            # gradients for second_step. Running second_step on iterations where
            # first_step was skipped (accumulation > 1) would apply it with no
            # matching first step.
            optimizer.first_step(zero_grad=True)
            # With accumulation > 1 the second pass only sees the current micro-batch.
            loss_train(model(images), labels).backward()
            optimizer.second_step(zero_grad=True)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}]'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})'
                  'Elapsed {remain:s}' 
                  'Loss: {loss.val:.4f}({loss.avg:.4f})' 
                  'Grad: {grad_norm:.4f}  '
                  .format(epoch+1, step, len(train_loader), batch_time=batch_time, 
                          data_time=data_time, loss=losses, 
                          remain=timeSince(start, float(step+1)/len(train_loader)), 
                          grad_norm=grad_norm))
    return losses.avg

def valid_fn(valid_loader, model, loss_metric, device):
    """Evaluate `model` on `valid_loader` without updating it.

    Args:
        valid_loader: Iterable of `(images, labels)` batches.
        model: Network to evaluate; switched to eval mode.
        loss_metric: Loss recorded for reporting.
        device: Device the batches are moved to.

    Returns:
        `(avg_loss, predictions)`: the sample-weighted average of `loss_metric` and
        the softmax outputs of every batch concatenated along the first axis.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute predictions and loss; nothing here needs gradients
        with torch.no_grad():
            y_preds = model(images)
            loss = loss_metric(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # record class probabilities
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))

    predictions = np.concatenate(preds)
    return losses.avg, predictions

def inference(model, states, test_loader, device):
    """Predict class probabilities, averaging over several checkpoints.

    Args:
        model: Network whose weights are overwritten by each checkpoint in turn.
        states: Sequence of checkpoint dicts, each holding a state dict under `'model'`.
        test_loader: Iterable of image batches (no labels).
        device: Device used for the forward passes.

    Returns:
        A NumPy array of the per-batch checkpoint-averaged softmax outputs,
        concatenated along the first axis.
    """
    model.to(device)
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    probs = []
    for i, (images) in tk0:
        images = images.to(device)
        # Softmax outputs of each checkpoint for this batch. The list was previously
        # created as `avgpreds`, so the append below raised NameError.
        avg_preds = []
        for state in states:
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                y_preds = model(images)
            avg_preds.append(y_preds.softmax(1).to('cpu').numpy())
        # Average over checkpoints, keeping the batch and class axes.
        avg_preds = np.mean(avg_preds, axis=0)
        probs.append(avg_preds)
    probs = np.concatenate(probs)
    return probs

## Train loop

In [23]:
# ======================================================
# Train loop
# ======================================================

def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    The model trains with everything except the `fc` head frozen for the first epoch
    at `CFG.lr_1`; from the second epoch on, all parameters are trainable and a fresh
    optimizer/scheduler is created at `CFG.lr_2`. The checkpoint with the best
    validation accuracy is saved, as are the checkpoints of epochs 6 and later.

    Args:
        folds: DataFrame with `image_id`, `CFG.target_col` and `fold` columns.
        fold: Fold id held out for validation.

    Returns:
        The validation rows of `folds` with one probability column per class
        (named '0', '1', ...) and a `preds` column holding the argmax, both taken
        from the best checkpoint.

    Raises:
        ValueError: If `CFG.scheduler` or `CFG.loss_train` names an unsupported option.
    """

    seed_torch(seed=CFG.seed)

    LOGGER.info(f'========== fold: {fold} training ============')

    # ======================================================
    # loader
    # ======================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ===============================================
    # scheduler
    # ===============================================
    def get_scheduler(optimizer):
        """Build the LR scheduler selected by CFG.scheduler for `optimizer`."""
        if CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        else:
            # Fail with the offending name instead of an UnboundLocalError on return.
            raise ValueError(f'unsupported CFG.scheduler: {CFG.scheduler!r}')
        return scheduler

    # ===============================================
    # model & optimizer
    # ===============================================
    model = Custom_swsl_resnext101_32x8d(CFG.model_name, pretrained=True)

    # Freeze everything except the classifier layer for the first epoch.
    for name, param in model.model.named_parameters():
        if 'fc' not in name:
            param.requires_grad=False

    model.to(device)

    base_optimizer = Adam
    optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_1, weight_decay=CFG.weight_decay, amsgrad=False)

    scheduler = get_scheduler(optimizer)

    # ===============================================
    # apex 
    # ===============================================
    if CFG.apex:
        # amp.initialize returns the (model, optimizer) pair; unpack it rather than
        # storing the tuple on an unrelated `model.optimizer` attribute.
        # NOTE(review): the optimizer re-created at epoch 1 below is not passed through
        # amp.initialize - confirm this is acceptable when CFG.apex is enabled.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ===============================================
    # loop
    # ===============================================
    def get_loss_train():
        """Build the training loss selected by CFG.loss_train."""
        if CFG.loss_train == 'CrossEntropyLoss':
            loss_train = nn.CrossEntropyLoss()
        elif CFG.loss_train == 'LabelSmoothing':
            loss_train = LabelSmoothingLoss(classes=CFG.target_size, smoothing=CFG.smooth)
        elif CFG.loss_train == 'FocalLoss':
            loss_train = FocalLoss().to(device)
        elif CFG.loss_train == 'FocalCosineLoss':
            loss_train = FocalCosineLoss()
        elif CFG.loss_train == 'SymmetricCrossEntropyLoss':
            loss_train = SymmetricCrossEntropy().to(device)
        elif CFG.loss_train == 'BiTemperedLoss':
            loss_train = BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smooth)
        else:
            # Fail with the offending name instead of an UnboundLocalError on return.
            raise ValueError(f'unsupported CFG.loss_train: {CFG.loss_train!r}')
        return loss_train

    loss_train = get_loss_train()
    LOGGER.info(f'loss_train: {loss_train}')
    # Plain cross-entropy is what gets reported, whatever loss is optimized.
    loss_metric = nn.CrossEntropyLoss()

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        if epoch == 1:

            # Unfreeze all weights at the second epoch.
            for param in model.model.parameters():
                param.requires_grad = True

            # Lower the learning rate: new optimizer and scheduler at CFG.lr_2.
            base_optimizer = Adam
            optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.lr_2, weight_decay=CFG.weight_decay, amsgrad=False)
            scheduler = get_scheduler(optimizer)

            LOGGER.info('requires_grad of all parameters are unlocked')


        # train
        avg_loss = train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, loss_metric, device)
        valid_labels = valid_folds[CFG.target_col].values

        # ReduceLROnPlateau steps on the monitored value; the cosine schedulers take none.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(), 
                        'preds': preds}, 
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
        if epoch > 4:
            # Keep every later-epoch checkpoint so it can be used for inference.
            torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    # Attach the best checkpoint's validation predictions to the held-out rows.
    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    # One probability column per class; follow CFG.target_size rather than a literal 5.
    valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)

    return valid_folds

In [24]:
# ====================================================
# main
# ====================================================
def main():
    """Run the stages enabled in ``CFG``: fold training and/or test inference.

    Training (``CFG.train``): calls ``train_loop(folds, fold)`` for every fold
    in ``range(CFG.n_fold)`` that is listed in ``CFG.trn_fold``, logs the score
    of each fold and of all trained folds together, and writes the collected
    out-of-fold rows to ``OUTPUT_DIR + 'oof_df.csv'``.

    Inference (``CFG.inference``): loads the ``*_best.pth`` checkpoint of every
    fold in ``CFG.trn_fold``, predicts on ``test`` and writes
    ``OUTPUT_DIR + 'submission.csv'``.

    Relies on notebook-level names defined in other cells (``folds``, ``test``,
    ``device``, ``LOGGER``, ``train_loop``, ``inference``, ``get_score`` ...).
    """

    def get_result(result_df):
        """Log ``get_score`` of the ``preds`` column against the target column."""
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')

    if CFG.train:
        # train
        # Collect the per-fold frames and concatenate once at the end instead of
        # re-copying a growing DataFrame on every iteration.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(folds, fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f'=============== fold: {fold} result ================')
                get_result(_oof_df)

        if fold_frames:
            oof_df = pd.concat(fold_frames)
            # CV result
            LOGGER.info('============ CV ============')
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        else:
            # Without this guard an empty frame has no 'preds' column and
            # get_result() would fail with a KeyError.
            LOGGER.info('No fold listed in CFG.trn_fold was trained; skipping CV result')

    if CFG.inference:
        # inference
        # NOTE(review): the class name mentions resnext101_32x8d while the
        # architecture actually comes from CFG.model_name -- confirm this is the
        # same model class that train_loop used to produce the checkpoints.
        model = Custom_swsl_resnext101_32x8d(CFG.model_name, pretrained=False)
        states = [torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth') for fold in CFG.trn_fold]
        # The original passed the loader options (batch_size/shuffle/pin_memory)
        # to TestDataset and never created `test_loader`, so the inference()
        # call below raised NameError. Build the dataset and the loader separately.
        # NOTE(review): assumes TestDataset takes a `transform` keyword and that
        # get_transforms(data='valid') and DataLoader are defined/imported
        # elsewhere in this notebook -- TODO confirm against those cells.
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        test['label'] = predictions.argmax(1)
        test[['image_id', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)

In [25]:
# Record in the run log which compute device this run uses.
LOGGER.info(f'used device: {device}')

used device: cuda


In [26]:
# Entry point: run the stages selected in CFG when this cell is executed as
# the top-level module.
if __name__ == '__main__':
    main()

Downloading: "https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext50_32x4-72679e44.pth" to /root/.cache/torch/hub/checkpoints/semi_weakly_supervised_resnext50_32x4-72679e44.pth
loss_train: BiTemperedLogisticLoss()


Epoch: [1][0/659]Data 2.806 (2.806)Elapsed 0m 4s (remain 50m 47s)Loss: 1.4669(1.4669)Grad: 3.4962  
Epoch: [1][100/659]Data 0.147 (0.131)Elapsed 1m 4s (remain 5m 54s)Loss: 0.8528(1.5841)Grad: 1.4041  
Epoch: [1][200/659]Data 0.247 (0.137)Elapsed 2m 4s (remain 4m 44s)Loss: 1.5428(1.3923)Grad: 2.1670  
Epoch: [1][300/659]Data 0.244 (0.136)Elapsed 3m 5s (remain 3m 40s)Loss: 0.7338(1.3189)Grad: 1.1531  
Epoch: [1][400/659]Data 0.157 (0.137)Elapsed 4m 5s (remain 2m 38s)Loss: 1.2477(1.2643)Grad: 1.1990  
Epoch: [1][500/659]Data 0.214 (0.142)Elapsed 5m 7s (remain 1m 36s)Loss: 0.9653(1.2364)Grad: 0.5536  
Epoch: [1][600/659]Data 0.210 (0.144)Elapsed 6m 7s (remain 0m 35s)Loss: 1.1143(1.2003)Grad: 0.9948  
Epoch: [1][658/659]Data 0.000 (0.143)Elapsed 6m 41s (remain 0m 0s)Loss: 1.2860(1.1817)Grad: 2.4573  
EVAL: [0/165] Data 1.940 (1.940) Elapsed 0m 2s (remain 5m 57s) Loss: 0.9463(0.9463) 
EVAL: [100/165] Data 0.604 (0.232) Elapsed 0m 47s (remain 0m 29s) Loss: 1.2497(0.8784) 


Epoch 1 - avg_train_loss: 1.1817 avg_val_loss: 0.9083 time: 476s
Epoch 1 - Accuracy: 0.7074791192103265
Epoch 1 - Save Best Score: 0.7075 Model


EVAL: [164/165] Data 0.000 (0.214) Elapsed 1m 13s (remain 0m 0s) Loss: 1.2252(0.9083) 


requires_grad of all parameters are unlocked


Epoch: [2][0/659]Data 2.605 (2.605)Elapsed 0m 4s (remain 50m 13s)Loss: 1.1936(1.1936)Grad: 13.6150  
Epoch: [2][100/659]Data 0.000 (0.026)Elapsed 2m 51s (remain 15m 48s)Loss: 1.1538(0.8884)Grad: 2.9236  
Epoch: [2][200/659]Data 0.000 (0.013)Elapsed 5m 38s (remain 12m 51s)Loss: 0.5349(0.7943)Grad: 3.3094  
Epoch: [2][300/659]Data 0.000 (0.009)Elapsed 8m 25s (remain 10m 0s)Loss: 0.3409(0.7360)Grad: 2.0752  
Epoch: [2][400/659]Data 0.000 (0.007)Elapsed 11m 11s (remain 7m 12s)Loss: 0.7219(0.7143)Grad: 2.3242  
Epoch: [2][500/659]Data 0.000 (0.005)Elapsed 13m 57s (remain 4m 24s)Loss: 0.5706(0.6902)Grad: 2.9619  
Epoch: [2][600/659]Data 0.000 (0.004)Elapsed 16m 44s (remain 1m 36s)Loss: 0.1246(0.6832)Grad: 1.3670  
Epoch: [2][658/659]Data 0.000 (0.004)Elapsed 18m 20s (remain 0m 0s)Loss: 1.1777(0.6785)Grad: 4.1596  
EVAL: [0/165] Data 2.338 (2.338) Elapsed 0m 2s (remain 7m 1s) Loss: 0.3645(0.3645) 
EVAL: [100/165] Data 0.576 (0.229) Elapsed 0m 46s (remain 0m 29s) Loss: 0.7949(0.6268) 


Epoch 2 - avg_train_loss: 0.6785 avg_val_loss: 0.6276 time: 1173s
Epoch 2 - Accuracy: 0.8513667425968109
Epoch 2 - Save Best Score: 0.8514 Model


EVAL: [164/165] Data 0.000 (0.209) Elapsed 1m 13s (remain 0m 0s) Loss: 0.2811(0.6276) 
Epoch: [3][0/659]Data 2.406 (2.406)Elapsed 0m 4s (remain 45m 51s)Loss: 0.4423(0.4423)Grad: 1.8798  
Epoch: [3][100/659]Data 0.000 (0.024)Elapsed 2m 50s (remain 15m 43s)Loss: 0.3225(0.6704)Grad: 1.3575  
Epoch: [3][200/659]Data 0.000 (0.012)Elapsed 5m 37s (remain 12m 48s)Loss: 0.2568(0.6409)Grad: 1.1884  
Epoch: [3][300/659]Data 0.000 (0.008)Elapsed 8m 23s (remain 9m 59s)Loss: 0.1737(0.6153)Grad: 3.2385  
Epoch: [3][400/659]Data 0.000 (0.006)Elapsed 11m 10s (remain 7m 11s)Loss: 0.2937(0.6112)Grad: 1.0422  
Epoch: [3][500/659]Data 0.000 (0.005)Elapsed 13m 56s (remain 4m 23s)Loss: 0.4168(0.6112)Grad: 2.3621  
Epoch: [3][600/659]Data 0.000 (0.004)Elapsed 16m 43s (remain 1m 36s)Loss: 0.8358(0.6126)Grad: 1.4819  
Epoch: [3][658/659]Data 0.000 (0.004)Elapsed 18m 18s (remain 0m 0s)Loss: 0.4349(0.6090)Grad: 2.5423  
EVAL: [0/165] Data 1.730 (1.730) Elapsed 0m 1s (remain 5m 22s) Loss: 0.4583(0.4583) 
EVAL: [10

Epoch 3 - avg_train_loss: 0.6090 avg_val_loss: 0.5677 time: 1171s
Epoch 3 - Accuracy: 0.8616173120728929
Epoch 3 - Save Best Score: 0.8616 Model


EVAL: [164/165] Data 0.000 (0.201) Elapsed 1m 11s (remain 0m 0s) Loss: 0.3202(0.5677) 
Epoch: [4][0/659]Data 2.369 (2.369)Elapsed 0m 4s (remain 45m 33s)Loss: 0.7164(0.7164)Grad: 1.5803  
Epoch: [4][100/659]Data 0.000 (0.024)Elapsed 2m 50s (remain 15m 43s)Loss: 0.5569(0.6387)Grad: 1.3846  
Epoch: [4][200/659]Data 0.000 (0.012)Elapsed 5m 37s (remain 12m 48s)Loss: 0.2969(0.6182)Grad: 1.6572  
Epoch: [4][300/659]Data 0.000 (0.008)Elapsed 8m 23s (remain 9m 58s)Loss: 0.5724(0.6187)Grad: 1.8976  
Epoch: [4][400/659]Data 0.000 (0.006)Elapsed 11m 9s (remain 7m 11s)Loss: 1.4205(0.5975)Grad: 2.1868  
Epoch: [4][500/659]Data 0.000 (0.005)Elapsed 13m 56s (remain 4m 23s)Loss: 0.9310(0.5926)Grad: 1.2042  
Epoch: [4][600/659]Data 0.000 (0.004)Elapsed 16m 42s (remain 1m 36s)Loss: 0.5343(0.5936)Grad: 1.6507  
Epoch: [4][658/659]Data 0.000 (0.004)Elapsed 18m 18s (remain 0m 0s)Loss: 0.4856(0.5963)Grad: 2.4087  
EVAL: [0/165] Data 2.648 (2.648) Elapsed 0m 2s (remain 8m 3s) Loss: 0.6388(0.6388) 
EVAL: [100/

Epoch 4 - avg_train_loss: 0.5963 avg_val_loss: 0.5659 time: 1170s
Epoch 4 - Accuracy: 0.8680713743356112
Epoch 4 - Save Best Score: 0.8681 Model


EVAL: [164/165] Data 0.000 (0.197) Elapsed 1m 11s (remain 0m 0s) Loss: 0.2782(0.5659) 
Epoch: [5][0/659]Data 2.315 (2.315)Elapsed 0m 4s (remain 45m 11s)Loss: 0.2312(0.2312)Grad: 1.8553  
Epoch: [5][100/659]Data 0.000 (0.023)Elapsed 2m 50s (remain 15m 41s)Loss: 0.2685(0.5746)Grad: 1.1448  
Epoch: [5][200/659]Data 0.000 (0.012)Elapsed 5m 36s (remain 12m 47s)Loss: 0.3577(0.5720)Grad: 0.9423  
Epoch: [5][300/659]Data 0.000 (0.008)Elapsed 8m 22s (remain 9m 58s)Loss: 0.7663(0.5815)Grad: 1.2712  
Epoch: [5][400/659]Data 0.000 (0.006)Elapsed 11m 9s (remain 7m 10s)Loss: 0.6766(0.5825)Grad: 1.5942  
Epoch: [5][500/659]Data 0.000 (0.005)Elapsed 13m 55s (remain 4m 23s)Loss: 0.2859(0.5752)Grad: 1.9247  
Epoch: [5][600/659]Data 0.000 (0.004)Elapsed 16m 42s (remain 1m 36s)Loss: 0.6818(0.5782)Grad: 1.8945  
Epoch: [5][658/659]Data 0.000 (0.004)Elapsed 18m 17s (remain 0m 0s)Loss: 0.1744(0.5858)Grad: 1.5581  
EVAL: [0/165] Data 1.766 (1.766) Elapsed 0m 2s (remain 5m 31s) Loss: 0.5756(0.5756) 
EVAL: [100

Epoch 5 - avg_train_loss: 0.5858 avg_val_loss: 0.5818 time: 1170s
Epoch 5 - Accuracy: 0.8694001518602885
Epoch 5 - Save Best Score: 0.8694 Model


EVAL: [164/165] Data 0.000 (0.201) Elapsed 1m 11s (remain 0m 0s) Loss: 0.1649(0.5818) 
Epoch: [6][0/659]Data 2.626 (2.626)Elapsed 0m 4s (remain 48m 19s)Loss: 0.2627(0.2627)Grad: 1.7325  
Epoch: [6][100/659]Data 0.000 (0.026)Elapsed 2m 50s (remain 15m 43s)Loss: 0.1630(0.5831)Grad: 1.1830  
Epoch: [6][200/659]Data 0.000 (0.013)Elapsed 5m 37s (remain 12m 48s)Loss: 0.4385(0.5565)Grad: 1.5004  
Epoch: [6][300/659]Data 0.000 (0.009)Elapsed 8m 23s (remain 9m 58s)Loss: 0.2736(0.5653)Grad: 1.5403  
Epoch: [6][400/659]Data 0.000 (0.007)Elapsed 11m 9s (remain 7m 11s)Loss: 1.0056(0.5573)Grad: 1.1386  
Epoch: [6][500/659]Data 0.000 (0.005)Elapsed 13m 56s (remain 4m 23s)Loss: 1.4909(0.5590)Grad: 1.5113  
Epoch: [6][600/659]Data 0.000 (0.005)Elapsed 16m 42s (remain 1m 36s)Loss: 0.9601(0.5544)Grad: 1.9644  
Epoch: [6][658/659]Data 0.000 (0.004)Elapsed 18m 18s (remain 0m 0s)Loss: 3.6334(0.5600)Grad: 2.9411  
EVAL: [0/165] Data 1.971 (1.971) Elapsed 0m 2s (remain 6m 1s) Loss: 0.5127(0.5127) 
EVAL: [100/

Epoch 6 - avg_train_loss: 0.5600 avg_val_loss: 0.5406 time: 1172s
Epoch 6 - Accuracy: 0.8754745634016705
Epoch 6 - Save Best Score: 0.8755 Model


EVAL: [164/165] Data 0.000 (0.211) Elapsed 1m 13s (remain 0m 0s) Loss: 0.2100(0.5406) 
Epoch: [7][0/659]Data 2.198 (2.198)Elapsed 0m 4s (remain 43m 58s)Loss: 0.3141(0.3141)Grad: 1.8467  
Epoch: [7][100/659]Data 0.000 (0.022)Elapsed 2m 50s (remain 15m 40s)Loss: 0.7018(0.5338)Grad: 1.1463  
Epoch: [7][200/659]Data 0.000 (0.011)Elapsed 5m 36s (remain 12m 47s)Loss: 0.1748(0.5329)Grad: 1.2071  
Epoch: [7][300/659]Data 0.000 (0.007)Elapsed 8m 23s (remain 9m 58s)Loss: 0.0424(0.5374)Grad: 0.8298  
Epoch: [7][400/659]Data 0.000 (0.006)Elapsed 11m 9s (remain 7m 10s)Loss: 0.3142(0.5341)Grad: 0.9118  
Epoch: [7][500/659]Data 0.000 (0.005)Elapsed 13m 55s (remain 4m 23s)Loss: 0.2192(0.5440)Grad: 1.9947  
Epoch: [7][600/659]Data 0.000 (0.004)Elapsed 16m 42s (remain 1m 36s)Loss: 0.1046(0.5430)Grad: 1.1110  
Epoch: [7][658/659]Data 0.000 (0.003)Elapsed 18m 17s (remain 0m 0s)Loss: 0.3430(0.5455)Grad: 0.8850  
EVAL: [0/165] Data 1.792 (1.792) Elapsed 0m 2s (remain 5m 31s) Loss: 0.3927(0.3927) 
EVAL: [100

Epoch 7 - avg_train_loss: 0.5455 avg_val_loss: 0.5259 time: 1167s
Epoch 7 - Accuracy: 0.8849658314350797
Epoch 7 - Save Best Score: 0.8850 Model


EVAL: [164/165] Data 0.000 (0.184) Elapsed 1m 9s (remain 0m 0s) Loss: 0.0806(0.5259) 
Epoch: [8][0/659]Data 2.466 (2.466)Elapsed 0m 4s (remain 46m 36s)Loss: 0.3549(0.3549)Grad: 1.0479  
Epoch: [8][100/659]Data 0.000 (0.025)Elapsed 2m 50s (remain 15m 42s)Loss: 0.5121(0.5367)Grad: 1.8342  
Epoch: [8][200/659]Data 0.000 (0.012)Elapsed 5m 37s (remain 12m 48s)Loss: 0.6003(0.5373)Grad: 1.3426  
Epoch: [8][300/659]Data 0.000 (0.008)Elapsed 8m 23s (remain 9m 58s)Loss: 0.4255(0.5222)Grad: 1.3039  
Epoch: [8][400/659]Data 0.000 (0.006)Elapsed 11m 9s (remain 7m 10s)Loss: 0.6617(0.5221)Grad: 1.5309  
Epoch: [8][500/659]Data 0.000 (0.005)Elapsed 13m 56s (remain 4m 23s)Loss: 0.5818(0.5167)Grad: 1.0085  
Epoch: [8][600/659]Data 0.000 (0.004)Elapsed 16m 42s (remain 1m 36s)Loss: 0.3506(0.5106)Grad: 1.2125  
Epoch: [8][658/659]Data 0.000 (0.004)Elapsed 18m 17s (remain 0m 0s)Loss: 0.2790(0.5138)Grad: 1.4792  
EVAL: [0/165] Data 1.587 (1.587) Elapsed 0m 1s (remain 5m 6s) Loss: 0.5131(0.5131) 
EVAL: [100/1

Epoch 8 - avg_train_loss: 0.5138 avg_val_loss: 0.5159 time: 1168s
Epoch 8 - Accuracy: 0.8868640850417616
Epoch 8 - Save Best Score: 0.8869 Model


EVAL: [164/165] Data 0.000 (0.193) Elapsed 1m 10s (remain 0m 0s) Loss: 0.1523(0.5159) 
Epoch: [9][0/659]Data 2.498 (2.498)Elapsed 0m 4s (remain 47m 25s)Loss: 0.2830(0.2830)Grad: 1.3473  
Epoch: [9][100/659]Data 0.000 (0.025)Elapsed 2m 50s (remain 15m 43s)Loss: 0.6008(0.5197)Grad: 1.4434  
Epoch: [9][200/659]Data 0.000 (0.013)Elapsed 5m 37s (remain 12m 48s)Loss: 0.5957(0.4851)Grad: 1.5370  
Epoch: [9][300/659]Data 0.000 (0.008)Elapsed 8m 23s (remain 9m 58s)Loss: 0.3186(0.4834)Grad: 0.8078  
Epoch: [9][400/659]Data 0.000 (0.006)Elapsed 11m 9s (remain 7m 10s)Loss: 0.8342(0.5107)Grad: 1.1229  
Epoch: [9][500/659]Data 0.000 (0.005)Elapsed 13m 55s (remain 4m 23s)Loss: 0.8568(0.5124)Grad: 1.3332  
Epoch: [9][600/659]Data 0.000 (0.004)Elapsed 16m 42s (remain 1m 36s)Loss: 0.4603(0.5115)Grad: 1.4583  
Epoch: [9][658/659]Data 0.000 (0.004)Elapsed 18m 17s (remain 0m 0s)Loss: 0.0565(0.5098)Grad: 1.0839  
EVAL: [0/165] Data 1.569 (1.569) Elapsed 0m 1s (remain 4m 59s) Loss: 0.5706(0.5706) 
EVAL: [100

Epoch 9 - avg_train_loss: 0.5098 avg_val_loss: 0.5358 time: 1167s
Epoch 9 - Accuracy: 0.889331814730448
Epoch 9 - Save Best Score: 0.8893 Model


EVAL: [164/165] Data 0.000 (0.188) Elapsed 1m 9s (remain 0m 0s) Loss: 0.0966(0.5358) 
Epoch: [10][0/659]Data 2.976 (2.976)Elapsed 0m 4s (remain 51m 59s)Loss: 0.2646(0.2646)Grad: 1.4778  
Epoch: [10][100/659]Data 0.000 (0.030)Elapsed 2m 51s (remain 15m 46s)Loss: 0.6253(0.5042)Grad: 1.4297  
Epoch: [10][200/659]Data 0.000 (0.015)Elapsed 5m 37s (remain 12m 48s)Loss: 0.4701(0.4974)Grad: 1.4723  
Epoch: [10][300/659]Data 0.000 (0.010)Elapsed 8m 23s (remain 9m 59s)Loss: 0.5391(0.5114)Grad: 1.3204  
Epoch: [10][400/659]Data 0.000 (0.008)Elapsed 11m 10s (remain 7m 11s)Loss: 0.1929(0.5107)Grad: 0.7630  
Epoch: [10][500/659]Data 0.000 (0.006)Elapsed 13m 56s (remain 4m 23s)Loss: 0.6962(0.5070)Grad: 1.4443  
Epoch: [10][600/659]Data 0.000 (0.005)Elapsed 16m 42s (remain 1m 36s)Loss: 0.1018(0.5024)Grad: 0.8529  
Epoch: [10][658/659]Data 0.000 (0.005)Elapsed 18m 18s (remain 0m 0s)Loss: 0.7189(0.4992)Grad: 2.9819  
EVAL: [0/165] Data 1.771 (1.771) Elapsed 0m 2s (remain 5m 33s) Loss: 0.5549(0.5549) 
EV

Epoch 10 - avg_train_loss: 0.4992 avg_val_loss: 0.5269 time: 1170s
Epoch 10 - Accuracy: 0.8902809415337889
Epoch 10 - Save Best Score: 0.8903 Model


EVAL: [164/165] Data 0.000 (0.198) Elapsed 1m 11s (remain 0m 0s) Loss: 0.0591(0.5269) 


Score: 0.89028
Score: 0.89028
