## Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR='./'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

## CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug = False
    print_freq = 100
    num_workers = 4
    model_name = 'tf_efficientnet_b3_ns'
    size = 512
    scheduler = 'custom'  # ['custom', 'ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']
    batch_size = 24
    epochs = 10  # changed
    #factor=0.2 # ReduceLROnPlateau
    #patience=4 # ReduceLROnPlateau
    #eps=1e-6 # ReduceLROnPlateau
    T_max=10 # CosineAnnealingLR
    #T_0=15 # CosineAnnealingWarmRestarts
    lr = 1e-5 * batch_size
    min_lr = 1e-7 * batch_size
    weight_decay = 1e-6
    loss_train = 'FocalLoss'
    smooth = 1e-2
    t1 = 0.9
    t2 = 1.5
    gradient_accumulation_steps = 1
    max_grad_norm = 50  # changed
    seed = 42
    target_size_list = [8811, 8812, 8811, 8811, 8811]
    target_size = 8811
    target_col = "label_group"
    scale = 30
    margin = 0.5
    fc_dim = 512
    n_fold = 5
    trn_fold = [0]
    train = True
    
    scheduler_params = {
        "lr_start": 1e-6 * batch_size,
        "lr_max": 1e-5 * batch_size,     # 1e-5 * 32 (if batch_size(=32) is different then)
        "lr_min": 1e-7 * batch_size,
        "lr_ramp_ep": 5,
        "lr_sus_ep": 0,
        "lr_decay": 0.8,
    }

## Library

In [3]:
# ====================================================
# Library
# ====================================================
import sys
sys.path.append('/home/yuki/shopee/input/pytorch-image-models/pytorch-image-models-master/')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from IPython.display import display

import scipy as sp
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, _LRScheduler

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data Loading

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('/home/yuki/shopee/input/shopee-product-matching/train.csv')
test = pd.read_csv('/home/yuki/shopee/input/shopee-product-matching/test.csv')
sample_sub = pd.read_csv('/home/yuki/shopee/input/shopee-product-matching/sample_submission.csv')

In [5]:
train.head()

if CFG.debug:
    CFG.epochs = 1
    train = train.sample(n=30, random_state=CFG.seed).reset_index(drop=True)

In [6]:
test.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [7]:
sample_sub.head()

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


## preprocess

In [8]:
ROOT_DIR = '/home/yuki/shopee/input/shopee-product-matching/'
TRAIN_PATH = ROOT_DIR + 'train_images/'
TEST_PATH = ROOT_DIR + 'test_images/'

train['file_path'] = train['image'].apply(lambda x: TRAIN_PATH + x)
test['file_path'] = test['image'].apply(lambda x: TEST_PATH + x)

labelencoder = LabelEncoder()
train['label_group'] = labelencoder.fit_transform(train['label_group'])

## Utils

In [9]:

# ====================================================
# Utils
# ====================================================
# def get_score(y_true, y_pred):
#     scores = []
#     for i in range(y_true.shape[1]):
#         score = roc_auc_score(y_true[:,i], y_pred[:,i])
#         scores.append(score)
#     avg_score = np.mean(scores)
#     return avg_score, scores

@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    LOGGER.info(f'[{name}] done in {time.time() - t0:.0f} s.')

def init_logger(log_file=OUTPUT_DIR+'train.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(seed=CFG.seed)

## CV split

In [10]:
folds = train.copy()
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = folds['label_group'].values
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col], groups)):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)
display(folds.groupby('fold').size())
folds.head()

fold
0    6851
1    6849
2    6850
3    6850
4    6850
dtype: int64

Unnamed: 0,posting_id,image,image_phash,title,label_group,file_path,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666,../input/shopee-product-matching/train_images/...,3
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572,../input/shopee-product-matching/train_images/...,3
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172,../input/shopee-product-matching/train_images/...,4
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509,../input/shopee-product-matching/train_images/...,3
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425,../input/shopee-product-matching/train_images/...,1


In [11]:
for fold in range(5):
    folds_ = folds[folds['fold'] != fold]
    print(folds_['label_group'].nunique())

8811
8812
8811
8811
8811


## Dataset

In [12]:
class TrainDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.labels = df[CFG.target_col].values
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        file_path = self.df['file_path'][idx]
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx])
        return image, label

## Transforms

In [13]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    
    if data == 'train':
        return Compose([
            Resize(CFG.size, CFG.size, always_apply=True),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            Rotate(limit=120, p=0.8),
            RandomBrightness(limit=(0.09, 0.06), p=0.5),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])
    
    if data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225 ],
            ),
            ToTensorV2(),
        ])

## Model

In [14]:
class ArcMarginProduct(nn.Module):
    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        
        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin
        
    def forward(self, x, label):
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin: 
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features

        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale
        return output

In [15]:
class CustomEfficientNet(nn.Module):
    def __init__(
        self,
        n_classes = CFG.target_size,
        model_name = CFG.model_name,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = True):
        
        super(CustomEfficientNet, self).__init__()
        print(f'Building Model Backbone for {model_name} model')
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc
        
        if use_fc:
            self.dropout = nn.Dropout(p=0.5)
            self.classifier = nn.Linear(in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            in_features = fc_dim
        
        self.final = ArcMarginProduct(
            in_features,
            n_classes,
            scale = scale,
            margin = False,
            easy_margin = False,
            ls_eps = 0.0
        )
    
    def _init_params(self):
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)
        
    def forward(self, image, label):
        features = self.extract_features(image)
        logits = self.final(features, label)
        return logits
        
    def extract_features(self, x):
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)
        
        if self.use_fc:
            x = self.dropout(x)
            x = self.classifier(x)
            x = self.bn(x)
        return x

## scheduler

In [16]:
class CustomScheduler(_LRScheduler):
    def __init__(self, optimizer, lr_start=5e-6, lr_max=1e-5,
                 lr_min=1e-6, lr_ramp_ep=5, lr_sus_ep=0, lr_decay=0.8,
                 last_epoch=-1):
        self.lr_start = lr_start
        self.lr_max = lr_max
        self.lr_min = lr_min
        self.lr_ramp_ep = lr_ramp_ep
        self.lr_sus_ep = lr_sus_ep
        self.lr_decay = lr_decay
        super(CustomScheduler, self).__init__(optimizer, last_epoch)
        
    def get_lr(self):
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.", UserWarning)
        
        if self.last_epoch == 0:
            self.last_epoch += 1
            return [self.lr_start for _ in self.optimizer.param_groups]
        
        lr = self._compute_lr_from_epoch()
        self.last_epoch += 1
        
        return [lr for _ in self.optimizer.param_groups]
    
    def _get_closed_form_lr(self):
        return self.base_lrs
    
    def _compute_lr_from_epoch(self):
        if self.last_epoch < self.lr_ramp_ep:
            lr = ((self.lr_max - self.lr_start) / 
                  self.lr_ramp_ep * self.last_epoch + 
                  self.lr_start)
        
        elif self.last_epoch < self.lr_ramp_ep + self.lr_sus_ep:
            lr = self.lr_max
            
        else:
            lr = ((self.lr_max - self.lr_min) * self.lr_decay**
                  (self.last_epoch - self.lr_ramp_ep - self.lr_sus_ep) + 
                  self.lr_min)
        return lr

## Loss Functions

In [17]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module): 
    def __init__(self, classes=5, smoothing=0.0, dim=-1): 
        super(LabelSmoothingLoss, self).__init__() 
        self.confidence = 1.0 - smoothing 
        self.smoothing = smoothing 
        self.cls = classes 
        self.dim = dim 
    def forward(self, pred, target): 
        pred = pred.log_softmax(dim=self.dim) 
        with torch.no_grad():
            true_dist = torch.zeros_like(pred) 
            true_dist.fill_(self.smoothing / (self.cls - 1)) 
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence) 
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [18]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss()(inputs, targets)

        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [19]:
class FocalCosineLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        self.y = torch.Tensor([1]).cuda()

    def forward(self, input, target, reduction="mean"):
        cosine_loss = F.cosine_embedding_loss(input, F.one_hot(target, num_classes=input.size(-1)), self.y, reduction=reduction)

        cent_loss = F.cross_entropy(F.normalize(input), target, reduce=False)
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [20]:
class SymmetricCrossEntropy(nn.Module):

    def __init__(self, alpha=0.1, beta=1.0, num_classes=CFG.target_size):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        onehot_targets = torch.eye(self.num_classes)[targets].cuda()
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss

In [21]:
def log_t(u, t):
    """Compute log_t for `u'."""
    if t==1.0:
        return u.log()
    else:
        return (u.pow(1.0 - t) - 1.0) / (1.0 - t)

def exp_t(u, t):
    """Compute exp_t for `u'."""
    if t==1:
        return u.exp()
    else:
        return (1.0 + (1.0-t)*u).relu().pow(1.0 / (1.0 - t))

def compute_normalization_fixed_point(activations, t, num_iters):

    """Returns the normalization value for each example (t > 1.0).
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same shape as activation with the last dimension being 1.
    """
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations_step_0 = activations - mu

    normalized_activations = normalized_activations_step_0

    for _ in range(num_iters):
        logt_partition = torch.sum(
                exp_t(normalized_activations, t), -1, keepdim=True)
        normalized_activations = normalized_activations_step_0 * \
                logt_partition.pow(1.0-t)

    logt_partition = torch.sum(
            exp_t(normalized_activations, t), -1, keepdim=True)
    normalization_constants = - log_t(1.0 / logt_partition, t) + mu

    return normalization_constants

def compute_normalization_binary_search(activations, t, num_iters):

    """Returns the normalization value for each example (t < 1.0).
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Class implementing custom backward pass for compute_normalization. See compute_normalization.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        if t < 1.0:
            normalization_constants = compute_normalization_binary_search(activations, t, num_iters)
        else:
            normalization_constants = compute_normalization_fixed_point(activations, t, num_iters)

        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        t = ctx.t
        normalized_activations = activations - normalization_constants 
        probabilities = exp_t(normalized_activations, t)
        escorts = probabilities.pow(t)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)
        grad_input = escorts * grad_output
        
        return grad_input, None, None

def compute_normalization(activations, t, num_iters=5):
    """Returns the normalization value for each example. 
    Backward pass is implemented.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    return ComputeNormalization.apply(activations, t, num_iters)

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid function.
    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    internal_activations = torch.stack([activations,
        torch.zeros_like(activations)],
        dim=-1)
    internal_probabilities = tempered_softmax(internal_activations, t, num_iters)
    return internal_probabilities[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax function.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature > 1.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    if t == 1.0:
        return activations.softmax(dim=-1)

    normalization_constants = compute_normalization(activations, t, num_iters)
    return exp_t(activations - normalization_constants, t)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss.
    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
    Returns:
      A loss tensor.
    """
    internal_activations = torch.stack([activations,
        torch.zeros_like(activations)],
        dim=-1)
    internal_labels = torch.stack([labels.to(activations.dtype),
        1.0 - labels.to(activations.dtype)],
        dim=-1)
    return bi_tempered_logistic_loss(internal_activations, 
            internal_labels,
            t1,
            t2,
            label_smoothing = label_smoothing,
            num_iters = num_iters,
            reduction = reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over minibatch. Return shape (1,)
        ``'sum'``: Loss is summed over minibatch. Return shape (1,)
    Returns:
      A loss tensor.
    """

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        labels_onehot.scatter_(1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    if reduction == 'mean':
        return loss_values.mean()

In [22]:
class BiTemperedLogisticLoss(nn.Module): 
    def __init__(self, t1, t2, smoothing=0.0): 
        super(BiTemperedLogisticLoss, self).__init__() 
        self.t1 = t1
        self.t2 = t2
        self.smoothing = smoothing
    def forward(self, logit_label, truth_label):
        loss_label = bi_tempered_logistic_loss(
            logit_label, truth_label,
            t1=self.t1, t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none'
        )
        
        loss_label = loss_label.mean()
        return loss_label

## Helper functions

In [23]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

def train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    tk0 = tqdm(enumerate(train_loader), total=len(train_loader))
    for step, (images, labels) in tk0:
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(images, labels)
        loss = loss_train(y_preds, labels)
        metric = loss_metric(y_preds, labels)
        # record loss
        losses.update(metric.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        # measure elapesd time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  #'LR: {lr:.6f}  '
                  .format(
                   epoch+1, step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(train_loader)),
                   grad_norm=grad_norm,
                   #lr=scheduler.get_lr()[0],
                   ))
    return losses.avg

def valid_fn(valid_loader, model, loss_metric, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    tk0 = tqdm(enumerate(valid_loader), total=len(valid_loader))
    for step, (images, labels) in tk0:
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            y_preds = model(images, labels)
        loss = loss_metric(y_preds, labels)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.softmax(axis=1).to('cpu').numpy())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [24]:
# ====================================================
# Train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index
    
    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    
    
    # ====================================================
    # preprocess
    # ====================================================
    label_group_list = list(set(train_folds['label_group']))
    new_set = set([0])
    label_group_map = {}
    for label in tqdm(label_group_list):
        old_label = label
        while label not in new_set:
            label -= 1
            label = int(label)
        label += 1
        label_group_map[old_label] = label
        new_set.add(label)
    train_folds['label_group'] = train_folds['label_group'].apply(lambda x: label_group_map[x]-1)
    CFG.target_size = CFG.target_size_list[fold]
    
    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))
    
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)
    # ====================================================
    # scheduler 
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler=='custom':
            scheduler = CustomScheduler(optimizer, **CFG.scheduler_params)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler
    
    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomEfficientNet(model_name=CFG.model_name, pretrained=True)
    model.to(device)
    
    if CFG.scheduler == 'custom':
        optimizer = Adam(model.parameters(), lr=CFG.scheduler_params['lr_start'])
    else:
        optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    
    scheduler = get_scheduler(optimizer)
    LOGGER.info(f'scheduler: {CFG.scheduler}')
    
    # ====================================================
    # loop
    # ====================================================
    def get_loss_train():
        if CFG.loss_train == 'CrossEntropyLoss':
            loss_train = nn.CrossEntropyLoss()
        elif CFG.loss_train == 'LabelSmoothing':
            loss_train = LabelSmoothingLoss(classes=CFG.target_size, smoothing=CFG.smooth)
        elif CFG.loss_train == 'FocalLoss':
            loss_train = FocalLoss(gamma=0.5).to(device)
        elif CFG.loss_train == 'FocalCosineLoss':
            loss_train = FocalCosineLoss()
        elif CFG.loss_train == 'SymmetricCrossEntropyLoss':
            loss_train = SymmetricCrossEntropy().to(device)
        elif CFG.loss_train == 'BiTemperedLoss':
            loss_train = BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smooth)
        return loss_train
    
    loss_train = get_loss_train()
    LOGGER.info(f'loss_train: {loss_train}')
    loss_metric = nn.CrossEntropyLoss()

    best_score = 0.
    best_loss = np.inf
    
    for epoch in range(CFG.epochs):
        
        start_time = time.time()
        
        # train
        avg_loss = train_fn(train_loader, model, loss_train, loss_metric, optimizer, epoch, scheduler, device)
        
        #eval
#         avg_val_loss, preds = valid_fn(valid_loader, model, loss_metric, device)
        valid_labels = valid_folds[CFG.target_col].values
        
        if isinstance(scheduler, CustomScheduler):
            scheduler.step()
        elif isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step() 
            
        elapsed = time.time() - start_time
        
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        
        if avg_loss < best_loss:
            best_loss = avg_loss 
            LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save({'model': model.state_dict()},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
        
#     check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
#     valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
#     valid_folds['preds'] = check_point['preds'].argmax(1)
    
    return valid_folds, best_loss

In [25]:
# ====================================================
# main
# ====================================================
def main():

    """
    Prepare: 1.train  2.test  3.submission  4.folds
    """
    
    if CFG.train:
        # train
        oof_df = pd.DataFrame()
        best_losses = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df, best_loss = train_loop(folds, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                best_losses.append(best_loss)
                LOGGER.info(f"========== fold: {fold} result ==========")
                LOGGER.info(f"Loss: {best_loss:.4f}")
        # CV result
        LOGGER.info(f"========== Result ==========")
        LOGGER.info(f"Loss: {np.mean(best_losses):.4f}")

In [26]:
if __name__ == '__main__':
    main()



  0%|          | 0/8811 [00:00<?, ?it/s]

Building Model Backbone for tf_efficientnet_b3_ns model


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_b3_ns-9d44bf68.pth
scheduler: custom
loss_train: CrossEntropyLoss()


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [1][0/1712] Data 1.492 (1.492) Elapsed 0m 3s (remain 102m 52s) Loss: 9.6590(9.6590) Grad: 29.4883  
Epoch: [1][100/1712] Data 0.002 (0.017) Elapsed 1m 4s (remain 17m 2s) Loss: 10.3332(9.9405) Grad: 30.5098  
Epoch: [1][200/1712] Data 0.002 (0.010) Elapsed 2m 5s (remain 15m 40s) Loss: 8.8662(9.9046) Grad: 29.6096  
Epoch: [1][300/1712] Data 0.002 (0.007) Elapsed 3m 5s (remain 14m 30s) Loss: 10.7454(9.9107) Grad: 30.1073  
Epoch: [1][400/1712] Data 0.002 (0.006) Elapsed 4m 6s (remain 13m 24s) Loss: 9.5913(9.8861) Grad: 30.1895  
Epoch: [1][500/1712] Data 0.002 (0.005) Elapsed 5m 6s (remain 12m 20s) Loss: 9.0586(9.8673) Grad: 29.4803  
Epoch: [1][600/1712] Data 0.002 (0.005) Elapsed 6m 7s (remain 11m 18s) Loss: 9.9554(9.8427) Grad: 29.5395  
Epoch: [1][700/1712] Data 0.002 (0.004) Elapsed 7m 7s (remain 10m 16s) Loss: 10.0139(9.8277) Grad: 29.9306  
Epoch: [1][800/1712] Data 0.001 (0.004) Elapsed 8m 7s (remain 9m 14s) Loss: 10.0225(9.8067) Grad: 29.7340  
Epoch: [1][900/1712] Data 0

Epoch 1 - avg_train_loss: 9.6436  time: 1041s
Epoch 1 - Save Best Loss: 9.6436 Model


Epoch: [1][1711/1712] Data 0.002 (0.003) Elapsed 17m 20s (remain 0m 0s) Loss: 9.6089(9.6436) Grad: 28.2736  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [2][0/1712] Data 1.235 (1.235) Elapsed 0m 2s (remain 57m 47s) Loss: 9.6420(9.6420) Grad: 28.6784  
Epoch: [2][100/1712] Data 0.002 (0.015) Elapsed 1m 1s (remain 16m 22s) Loss: 9.1830(9.1586) Grad: 28.2236  
Epoch: [2][200/1712] Data 0.001 (0.008) Elapsed 2m 0s (remain 15m 8s) Loss: 10.0998(9.0501) Grad: 28.8079  
Epoch: [2][300/1712] Data 0.003 (0.006) Elapsed 3m 0s (remain 14m 3s) Loss: 8.9161(8.9895) Grad: 27.9277  
Epoch: [2][400/1712] Data 0.002 (0.005) Elapsed 3m 59s (remain 13m 2s) Loss: 9.2448(8.9336) Grad: 28.7999  
Epoch: [2][500/1712] Data 0.001 (0.005) Elapsed 4m 58s (remain 12m 1s) Loss: 9.1135(8.8904) Grad: 27.8760  
Epoch: [2][600/1712] Data 0.002 (0.004) Elapsed 5m 57s (remain 11m 1s) Loss: 7.9615(8.8151) Grad: 26.6554  
Epoch: [2][700/1712] Data 0.002 (0.004) Elapsed 6m 56s (remain 10m 1s) Loss: 7.4555(8.7410) Grad: 27.6109  
Epoch: [2][800/1712] Data 0.002 (0.004) Elapsed 7m 56s (remain 9m 1s) Loss: 8.1931(8.6722) Grad: 27.3687  
Epoch: [2][900/1712] Data 0.001 

Epoch 2 - avg_train_loss: 8.0072  time: 1015s
Epoch 2 - Save Best Loss: 8.0072 Model


Epoch: [2][1711/1712] Data 0.001 (0.003) Elapsed 16m 55s (remain 0m 0s) Loss: 7.2783(8.0072) Grad: 27.4641  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [3][0/1712] Data 1.630 (1.630) Elapsed 0m 2s (remain 71m 29s) Loss: 4.9559(4.9559) Grad: 23.3622  
Epoch: [3][100/1712] Data 0.002 (0.018) Elapsed 1m 1s (remain 16m 26s) Loss: 5.9321(5.9198) Grad: 25.4503  
Epoch: [3][200/1712] Data 0.002 (0.010) Elapsed 2m 0s (remain 15m 6s) Loss: 5.6320(5.8236) Grad: 24.8477  
Epoch: [3][300/1712] Data 0.002 (0.007) Elapsed 2m 59s (remain 14m 2s) Loss: 5.6340(5.8086) Grad: 24.4071  
Epoch: [3][400/1712] Data 0.002 (0.006) Elapsed 3m 58s (remain 12m 59s) Loss: 5.7111(5.8139) Grad: 25.5731  
Epoch: [3][500/1712] Data 0.002 (0.005) Elapsed 4m 57s (remain 11m 59s) Loss: 5.6142(5.7715) Grad: 24.3864  
Epoch: [3][600/1712] Data 0.002 (0.005) Elapsed 5m 57s (remain 10m 59s) Loss: 5.6806(5.7336) Grad: 24.4447  
Epoch: [3][700/1712] Data 0.002 (0.004) Elapsed 6m 55s (remain 9m 59s) Loss: 5.7256(5.6932) Grad: 24.2968  
Epoch: [3][800/1712] Data 0.002 (0.004) Elapsed 7m 55s (remain 9m 0s) Loss: 3.7349(5.6311) Grad: 20.1257  
Epoch: [3][900/1712] Data 0.0

Epoch 3 - avg_train_loss: 5.1662  time: 1014s
Epoch 3 - Save Best Loss: 5.1662 Model


Epoch: [3][1711/1712] Data 0.005 (0.003) Elapsed 16m 53s (remain 0m 0s) Loss: 4.1618(5.1662) Grad: 21.5408  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [4][0/1712] Data 1.465 (1.465) Elapsed 0m 2s (remain 64m 34s) Loss: 2.9761(2.9761) Grad: 20.5990  
Epoch: [4][100/1712] Data 0.002 (0.017) Elapsed 1m 1s (remain 16m 17s) Loss: 2.7167(2.8724) Grad: 20.0543  
Epoch: [4][200/1712] Data 0.002 (0.009) Elapsed 2m 0s (remain 15m 5s) Loss: 3.1494(2.8830) Grad: 20.8663  
Epoch: [4][300/1712] Data 0.002 (0.007) Elapsed 2m 59s (remain 14m 1s) Loss: 2.6070(2.8656) Grad: 18.9084  
Epoch: [4][400/1712] Data 0.001 (0.006) Elapsed 3m 58s (remain 13m 0s) Loss: 2.7168(2.8595) Grad: 19.9243  
Epoch: [4][500/1712] Data 0.002 (0.005) Elapsed 4m 57s (remain 11m 59s) Loss: 3.0363(2.8586) Grad: 19.6910  
Epoch: [4][600/1712] Data 0.001 (0.004) Elapsed 5m 56s (remain 10m 59s) Loss: 3.1593(2.8394) Grad: 21.3667  
Epoch: [4][700/1712] Data 0.002 (0.004) Elapsed 6m 55s (remain 9m 59s) Loss: 2.8649(2.8292) Grad: 20.2870  
Epoch: [4][800/1712] Data 0.002 (0.004) Elapsed 7m 54s (remain 9m 0s) Loss: 1.6627(2.8293) Grad: 16.8389  
Epoch: [4][900/1712] Data 0.00

Epoch 4 - avg_train_loss: 2.6843  time: 1014s
Epoch 4 - Save Best Loss: 2.6843 Model


Epoch: [4][1711/1712] Data 0.001 (0.003) Elapsed 16m 54s (remain 0m 0s) Loss: 2.1464(2.6843) Grad: 18.4093  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [5][0/1712] Data 1.235 (1.235) Elapsed 0m 2s (remain 59m 38s) Loss: 1.4039(1.4039) Grad: 15.1362  
Epoch: [5][100/1712] Data 0.002 (0.014) Elapsed 1m 1s (remain 16m 18s) Loss: 1.4435(1.2526) Grad: 16.2676  
Epoch: [5][200/1712] Data 0.006 (0.008) Elapsed 2m 0s (remain 15m 6s) Loss: 0.8732(1.2502) Grad: 12.1924  
Epoch: [5][300/1712] Data 0.002 (0.006) Elapsed 3m 0s (remain 14m 4s) Loss: 1.0802(1.2593) Grad: 13.8974  
Epoch: [5][400/1712] Data 0.002 (0.005) Elapsed 3m 59s (remain 13m 3s) Loss: 1.5982(1.2566) Grad: 16.4681  
Epoch: [5][500/1712] Data 0.002 (0.005) Elapsed 4m 58s (remain 12m 2s) Loss: 1.3044(1.2575) Grad: 14.5880  
Epoch: [5][600/1712] Data 0.002 (0.004) Elapsed 5m 58s (remain 11m 2s) Loss: 1.1111(1.2622) Grad: 14.5398  
Epoch: [5][700/1712] Data 0.002 (0.004) Elapsed 6m 57s (remain 10m 2s) Loss: 1.6895(1.2699) Grad: 15.9774  
Epoch: [5][800/1712] Data 0.006 (0.004) Elapsed 7m 57s (remain 9m 2s) Loss: 1.0638(1.2718) Grad: 13.6996  
Epoch: [5][900/1712] Data 0.002 (

Epoch 5 - avg_train_loss: 1.2456  time: 1018s
Epoch 5 - Save Best Loss: 1.2456 Model


Epoch: [5][1711/1712] Data 0.001 (0.003) Elapsed 16m 58s (remain 0m 0s) Loss: 1.1730(1.2456) Grad: 14.8724  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [6][0/1712] Data 1.787 (1.787) Elapsed 0m 2s (remain 74m 22s) Loss: 0.9417(0.9417) Grad: 13.1186  
Epoch: [6][100/1712] Data 0.002 (0.020) Elapsed 1m 2s (remain 16m 30s) Loss: 0.7261(0.6647) Grad: 11.6175  
Epoch: [6][200/1712] Data 0.002 (0.011) Elapsed 2m 1s (remain 15m 12s) Loss: 0.4496(0.6583) Grad: 8.7406  
Epoch: [6][300/1712] Data 0.002 (0.008) Elapsed 3m 0s (remain 14m 7s) Loss: 0.7556(0.6549) Grad: 12.0517  
Epoch: [6][400/1712] Data 0.002 (0.007) Elapsed 3m 59s (remain 13m 4s) Loss: 0.4093(0.6531) Grad: 8.2930  
Epoch: [6][500/1712] Data 0.002 (0.006) Elapsed 4m 58s (remain 12m 2s) Loss: 0.3908(0.6573) Grad: 8.0896  
Epoch: [6][600/1712] Data 0.002 (0.005) Elapsed 5m 58s (remain 11m 2s) Loss: 0.7822(0.6609) Grad: 12.0641  
Epoch: [6][700/1712] Data 0.002 (0.005) Elapsed 6m 57s (remain 10m 2s) Loss: 0.9538(0.6656) Grad: 13.7298  
Epoch: [6][800/1712] Data 0.002 (0.004) Elapsed 7m 57s (remain 9m 3s) Loss: 0.5632(0.6648) Grad: 9.8520  
Epoch: [6][900/1712] Data 0.002 (0.0

Epoch 6 - avg_train_loss: 0.6722  time: 1019s
Epoch 6 - Save Best Loss: 0.6722 Model


Epoch: [6][1711/1712] Data 0.001 (0.003) Elapsed 16m 58s (remain 0m 0s) Loss: 0.5003(0.6722) Grad: 9.7540  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [7][0/1712] Data 1.268 (1.268) Elapsed 0m 2s (remain 61m 31s) Loss: 0.3018(0.3018) Grad: 7.3696  
Epoch: [7][100/1712] Data 0.002 (0.015) Elapsed 1m 1s (remain 16m 27s) Loss: 0.2275(0.4081) Grad: 5.9753  
Epoch: [7][200/1712] Data 0.002 (0.009) Elapsed 2m 0s (remain 15m 9s) Loss: 0.6132(0.4032) Grad: 10.6805  
Epoch: [7][300/1712] Data 0.002 (0.006) Elapsed 2m 59s (remain 14m 2s) Loss: 0.4119(0.4077) Grad: 8.4201  
Epoch: [7][400/1712] Data 0.002 (0.005) Elapsed 3m 58s (remain 13m 1s) Loss: 0.2801(0.4115) Grad: 6.5603  
Epoch: [7][500/1712] Data 0.002 (0.005) Elapsed 4m 57s (remain 12m 0s) Loss: 0.2046(0.4087) Grad: 5.3667  
Epoch: [7][600/1712] Data 0.008 (0.004) Elapsed 5m 57s (remain 11m 0s) Loss: 0.3872(0.4092) Grad: 8.2518  
Epoch: [7][700/1712] Data 0.002 (0.004) Elapsed 6m 56s (remain 10m 0s) Loss: 0.7801(0.4122) Grad: 12.5530  
Epoch: [7][800/1712] Data 0.002 (0.004) Elapsed 7m 55s (remain 9m 0s) Loss: 0.1642(0.4147) Grad: 4.3143  
Epoch: [7][900/1712] Data 0.002 (0.004)

Epoch 7 - avg_train_loss: 0.4251  time: 1014s
Epoch 7 - Save Best Loss: 0.4251 Model


Epoch: [7][1711/1712] Data 0.001 (0.003) Elapsed 16m 53s (remain 0m 0s) Loss: 0.2371(0.4251) Grad: 5.8528  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [8][0/1712] Data 1.395 (1.395) Elapsed 0m 2s (remain 66m 27s) Loss: 0.5795(0.5795) Grad: 10.4225  
Epoch: [8][100/1712] Data 0.002 (0.016) Elapsed 1m 1s (remain 16m 27s) Loss: 0.2246(0.3063) Grad: 5.6979  
Epoch: [8][200/1712] Data 0.001 (0.009) Elapsed 2m 0s (remain 15m 9s) Loss: 0.1902(0.3034) Grad: 5.4954  
Epoch: [8][300/1712] Data 0.008 (0.007) Elapsed 2m 59s (remain 14m 3s) Loss: 0.3893(0.3010) Grad: 8.1169  
Epoch: [8][400/1712] Data 0.001 (0.006) Elapsed 3m 58s (remain 13m 0s) Loss: 0.2919(0.2993) Grad: 6.8835  
Epoch: [8][500/1712] Data 0.002 (0.005) Elapsed 4m 57s (remain 12m 0s) Loss: 0.0914(0.3014) Grad: 2.5448  
Epoch: [8][600/1712] Data 0.001 (0.004) Elapsed 5m 57s (remain 11m 0s) Loss: 0.7856(0.3021) Grad: 11.5159  
Epoch: [8][700/1712] Data 0.001 (0.004) Elapsed 6m 56s (remain 10m 0s) Loss: 0.2643(0.3008) Grad: 6.7147  
Epoch: [8][800/1712] Data 0.001 (0.004) Elapsed 7m 55s (remain 9m 0s) Loss: 0.4454(0.3003) Grad: 8.8782  
Epoch: [8][900/1712] Data 0.002 (0.004)

Epoch 8 - avg_train_loss: 0.3140  time: 1013s
Epoch 8 - Save Best Loss: 0.3140 Model


Epoch: [8][1711/1712] Data 0.001 (0.003) Elapsed 16m 53s (remain 0m 0s) Loss: 0.4536(0.3140) Grad: 8.8496  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [9][0/1712] Data 1.675 (1.675) Elapsed 0m 2s (remain 73m 25s) Loss: 0.2403(0.2403) Grad: 6.3272  
Epoch: [9][100/1712] Data 0.001 (0.019) Elapsed 1m 1s (remain 16m 26s) Loss: 0.2309(0.2405) Grad: 6.6571  
Epoch: [9][200/1712] Data 0.001 (0.010) Elapsed 2m 0s (remain 15m 9s) Loss: 0.4682(0.2386) Grad: 9.0394  
Epoch: [9][300/1712] Data 0.002 (0.008) Elapsed 2m 59s (remain 14m 3s) Loss: 0.3781(0.2406) Grad: 8.3053  
Epoch: [9][400/1712] Data 0.002 (0.006) Elapsed 3m 59s (remain 13m 1s) Loss: 0.2195(0.2405) Grad: 6.0717  
Epoch: [9][500/1712] Data 0.002 (0.005) Elapsed 4m 58s (remain 12m 0s) Loss: 0.0769(0.2417) Grad: 2.4070  
Epoch: [9][600/1712] Data 0.002 (0.005) Elapsed 5m 57s (remain 11m 0s) Loss: 0.2880(0.2427) Grad: 6.8072  
Epoch: [9][700/1712] Data 0.008 (0.004) Elapsed 6m 56s (remain 10m 0s) Loss: 0.2408(0.2409) Grad: 5.8496  
Epoch: [9][800/1712] Data 0.003 (0.004) Elapsed 7m 55s (remain 9m 0s) Loss: 0.3630(0.2417) Grad: 8.1973  
Epoch: [9][900/1712] Data 0.002 (0.004) E

Epoch 9 - avg_train_loss: 0.2495  time: 1013s
Epoch 9 - Save Best Loss: 0.2495 Model


Epoch: [9][1711/1712] Data 0.001 (0.003) Elapsed 16m 52s (remain 0m 0s) Loss: 0.1049(0.2495) Grad: 3.5423  


  0%|          | 0/1712 [00:00<?, ?it/s]

Epoch: [10][0/1712] Data 1.463 (1.463) Elapsed 0m 2s (remain 62m 46s) Loss: 0.3318(0.3318) Grad: 7.6898  
Epoch: [10][100/1712] Data 0.001 (0.017) Elapsed 1m 1s (remain 16m 22s) Loss: 0.2132(0.2106) Grad: 6.3562  
Epoch: [10][200/1712] Data 0.001 (0.009) Elapsed 2m 0s (remain 15m 5s) Loss: 0.2917(0.2055) Grad: 6.6260  
Epoch: [10][300/1712] Data 0.002 (0.007) Elapsed 2m 59s (remain 14m 2s) Loss: 0.1981(0.2048) Grad: 5.5926  
Epoch: [10][400/1712] Data 0.001 (0.006) Elapsed 3m 58s (remain 13m 1s) Loss: 0.0900(0.2058) Grad: 2.5763  
Epoch: [10][500/1712] Data 0.002 (0.005) Elapsed 4m 58s (remain 12m 0s) Loss: 0.1319(0.2074) Grad: 4.2266  
Epoch: [10][600/1712] Data 0.001 (0.005) Elapsed 5m 57s (remain 11m 0s) Loss: 0.1732(0.2082) Grad: 4.9342  
Epoch: [10][700/1712] Data 0.002 (0.004) Elapsed 6m 56s (remain 10m 0s) Loss: 0.1460(0.2092) Grad: 4.2494  
Epoch: [10][800/1712] Data 0.002 (0.004) Elapsed 7m 55s (remain 9m 0s) Loss: 0.4769(0.2080) Grad: 9.4070  
Epoch: [10][900/1712] Data 0.002

Epoch 10 - avg_train_loss: 0.2121  time: 1013s
Epoch 10 - Save Best Loss: 0.2121 Model


Epoch: [10][1711/1712] Data 0.001 (0.003) Elapsed 16m 53s (remain 0m 0s) Loss: 0.3127(0.2121) Grad: 7.4287  


Loss: 0.2121
Loss: 0.2121
