## Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR='./'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

## CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug = False
    print_freq = 100
    num_workers = 4
    model_name = 'tf_efficientnet_b3_ns'
    size = 512
    scheduler = 'custom'  # ['custom', 'ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']
    batch_size = 12
    epochs = 10  # changed
    #factor=0.2 # ReduceLROnPlateau
    #patience=4 # ReduceLROnPlateau
    #eps=1e-6 # ReduceLROnPlateau
    T_max=10 # CosineAnnealingLR
    #T_0=15 # CosineAnnealingWarmRestarts
    lr = 1e-5 * batch_size
    min_lr = 1e-7 * batch_size
    weight_decay = 1e-6
    gradient_accumulation_steps = 1
    max_grad_norm = 50  # changed
    seed = 42
    target_size = 11014
    target_col = "label_group"
    scale = 30
    margin = 0.5
    fc_dim = 512
    n_fold = 5
    trn_fold = [0]
    train = True
    
    scheduler_params = {
        "lr_start": 1e-6 * batch_size,
        "lr_max": 1e-5 * batch_size,     # 1e-5 * 32 (if batch_size(=32) is different then)
        "lr_min": 1e-7 * batch_size,
        "lr_ramp_ep": 5,
        "lr_sus_ep": 0,
        "lr_decay": 0.8,
    }

## Library

In [3]:
# ====================================================
# Library
# ====================================================
import sys
sys.path.append('/home/yuki/shopee/input/pytorch-image-models/pytorch-image-models-master/')
sys.path.append('/home/yuki/shopee/input/pytorch-sam')

from sam import SAM

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, _LRScheduler

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data Loading

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('/home/yuki/shopee/input/shopee-product-matching/train.csv')
test = pd.read_csv('/home/yuki/shopee/input/shopee-product-matching/test.csv')
sample_sub = pd.read_csv('/home/yuki/shopee/input/shopee-product-matching/sample_submission.csv')

In [5]:
train.head()

if CFG.debug:
    CFG.epochs = 3
    train = train.sample(n=3000, random_state=CFG.seed).reset_index(drop=True)

In [6]:
test.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [7]:
sample_sub.head()

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


## preprocess

In [8]:
ROOT_DIR = '../input/shopee-product-matching/'
os.listdir(ROOT_DIR)

['sample_submission.csv',
 'train_images',
 'train.csv',
 'test.csv',
 'test_images']

In [9]:
ROOT_DIR = '../input/shopee-product-matching/'
TRAIN_PATH = ROOT_DIR + 'train_images/'
TEST_PATH = ROOT_DIR + 'test_images/'

train['file_path'] = train['image'].apply(lambda x: TRAIN_PATH + x)
test['file_path'] = test['image'].apply(lambda x: TEST_PATH + x)

labelencoder = LabelEncoder()
train['label_group'] = labelencoder.fit_transform(train['label_group'])

## Utils

In [10]:

# ====================================================
# Utils
# ====================================================
# def get_score(y_true, y_pred):
#     scores = []
#     for i in range(y_true.shape[1]):
#         score = roc_auc_score(y_true[:,i], y_pred[:,i])
#         scores.append(score)
#     avg_score = np.mean(scores)
#     return avg_score, scores

@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    LOGGER.info(f'[{name}] done in {time.time() - t0:.0f} s.')

def init_logger(log_file=OUTPUT_DIR+'train.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(seed=CFG.seed)

## CV split

In [11]:
folds = train.copy()
Fold = StratifiedKFold(n_splits=CFG.n_fold)
groups = folds['label_group'].values
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col], groups)):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)
display(folds.groupby('fold').size())
folds.head()

fold
0    6850
1    6850
2    6850
3    6850
4    6850
dtype: int64

Unnamed: 0,posting_id,image,image_phash,title,label_group,file_path,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666,../input/shopee-product-matching/train_images/...,0
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572,../input/shopee-product-matching/train_images/...,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172,../input/shopee-product-matching/train_images/...,0
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509,../input/shopee-product-matching/train_images/...,1
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425,../input/shopee-product-matching/train_images/...,3


## Dataset

In [12]:
class TrainDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.labels = df[CFG.target_col].values
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        file_path = self.df['file_path'][idx]
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx])
        return image, label

## Transforms

In [13]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    
    if data == 'train':
        return Compose([
            Resize(CFG.size, CFG.size, always_apply=True),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            Rotate(limit=120, p=0.8),
            RandomBrightness(limit=(0.09, 0.06), p=0.5),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])
    
    if data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225 ],
            ),
            ToTensorV2(),
        ])

## Model

In [14]:
class ArcMarginProduct(nn.Module):
    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        
        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin
        
    def forward(self, x, label):
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin: 
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features

        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale
        return output

In [15]:
class CustomEfficientNet(nn.Module):
    def __init__(
        self,
        n_classes = CFG.target_size,
        model_name = CFG.model_name,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = True):
        
        super(CustomEfficientNet, self).__init__()
        print(f'Building Model Backbone for {model_name} model')
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc
        
        if use_fc:
            self.dropout = nn.Dropout(p=0.1)
            self.classifier = nn.Linear(in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            in_features = fc_dim
        
        self.final = ArcMarginProduct(
            in_features,
            n_classes,
            scale = scale,
            margin = False,
            easy_margin = False,
            ls_eps = 0.0
        )
    
    def _init_params(self):
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)
        
    def forward(self, image, label):
        features = self.extract_features(image)
        logits = self.final(features, label)
        return logits
        
    def extract_features(self, x):
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)
        
        if self.use_fc:
            x = self.dropout(x)
            x = self.classifier(x)
            x = self.bn(x)
        return x

## scheduler

In [16]:
class CustomScheduler(_LRScheduler):
    def __init__(self, optimizer, lr_start=5e-6, lr_max=1e-5,
                 lr_min=1e-6, lr_ramp_ep=5, lr_sus_ep=0, lr_decay=0.8,
                 last_epoch=-1):
        self.lr_start = lr_start
        self.lr_max = lr_max
        self.lr_min = lr_min
        self.lr_ramp_ep = lr_ramp_ep
        self.lr_sus_ep = lr_sus_ep
        self.lr_decay = lr_decay
        super(CustomScheduler, self).__init__(optimizer, last_epoch)
        
    def get_lr(self):
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.", UserWarning)
        
        if self.last_epoch == 0:
            self.last_epoch += 1
            return [self.lr_start for _ in self.optimizer.param_groups]
        
        lr = self._compute_lr_from_epoch()
        self.last_epoch += 1
        
        return [lr for _ in self.optimizer.param_groups]
    
    def _get_closed_form_lr(self):
        return self.base_lrs
    
    def _compute_lr_from_epoch(self):
        if self.last_epoch < self.lr_ramp_ep:
            lr = ((self.lr_max - self.lr_start) / 
                  self.lr_ramp_ep * self.last_epoch + 
                  self.lr_start)
        
        elif self.last_epoch < self.lr_ramp_ep + self.lr_sus_ep:
            lr = self.lr_max
            
        else:
            lr = ((self.lr_max - self.lr_min) * self.lr_decay**
                  (self.last_epoch - self.lr_ramp_ep - self.lr_sus_ep) + 
                  self.lr_min)
        return lr

## Helper functions

In [17]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    tk0 = tqdm(enumerate(train_loader), total=len(train_loader))
    for step, (images, labels) in tk0:
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(images, labels)
        loss = criterion(y_preds, labels)
        # record loss
        losses.update(loss.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            optimizer.first_step(zero_grad=True)
#             optimizer.zero_grad()
            global_step += 1
            criterion(model(images, labels), labels).backward()
            optimizer.second_step(zero_grad=True)
        # measure elapesd time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  #'LR: {lr:.6f}  '
                  .format(
                   epoch+1, step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(train_loader)),
                   grad_norm=grad_norm,
                   #lr=scheduler.get_lr()[0],
                   ))
    return losses.avg

def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    tk0 = tqdm(enumerate(valid_loader), total=len(valid_loader))
    for step, (images, labels) in tk0:
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            y_preds = model(images, labels)
        loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.softmax(axis=1).to('cpu').numpy())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [18]:
# ====================================================
# Train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index
    
    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    
    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))
    
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)
    # ====================================================
    # scheduler 
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler=='custom':
            scheduler = CustomScheduler(optimizer, **CFG.scheduler_params)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler
    
    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomEfficientNet(model_name=CFG.model_name, pretrained=True)
    model.to(device)
    
    if CFG.scheduler == 'custom':
        base_optimizer = Adam
        optimizer = SAM(model.parameters(), base_optimizer, lr=CFG.scheduler_params['lr_start'])
    else:
        base_optimizer = Adam
        optimize = SAM(model.parameters(), base_optimizer, lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    
    
    scheduler = get_scheduler(optimizer)
    LOGGER.info(f'scheduler: {CFG.scheduler}')
    
    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss()

    best_score = 0.
    best_loss = np.inf
    
    for epoch in range(CFG.epochs):
        
        start_time = time.time()
        
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
        
        #eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds[CFG.target_col].values
        
        if isinstance(scheduler, CustomScheduler):
            scheduler.step()
        elif isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step() 
            
        elapsed = time.time() - start_time
        
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss 
            LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
        
    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    valid_folds[[str(c) for c in range(CFG.target_size)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)
    
    return valid_folds, best_loss

In [19]:
# ====================================================
# main
# ====================================================
def main():

    """
    Prepare: 1.train  2.test  3.submission  4.folds
    """
    
    if CFG.train:
        # train
        oof_df = pd.DataFrame()
        best_losses = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df, best_loss = train_loop(folds, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                best_losses.append(best_loss)
                LOGGER.info(f"========== fold: {fold} result ==========")
                LOGGER.info(f"Loss: {best_loss:.4f}")
        # CV result
        LOGGER.info(f"========== Result ==========")
        LOGGER.info(f"Loss: {np.mean(best_losses):.4f}")
        # save result
#         oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)

In [20]:
if __name__ == '__main__':
    main()



Building Model Backbone for tf_efficientnet_b3_ns model


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_b3_ns-9d44bf68.pth
scheduler: custom


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [1][0/2283] Data 1.293 (1.293) Elapsed 0m 3s (remain 131m 33s) Loss: 9.8696(9.8696) Grad: 37.8591  
Epoch: [1][100/2283] Data 0.009 (0.015) Elapsed 1m 34s (remain 33m 51s) Loss: 10.3753(10.1965) Grad: 37.5898  
Epoch: [1][200/2283] Data 0.002 (0.008) Elapsed 3m 4s (remain 31m 48s) Loss: 9.5625(10.1634) Grad: 37.0811  
Epoch: [1][300/2283] Data 0.002 (0.006) Elapsed 4m 34s (remain 30m 7s) Loss: 10.4415(10.1112) Grad: 36.6238  
Epoch: [1][400/2283] Data 0.002 (0.005) Elapsed 6m 4s (remain 28m 32s) Loss: 9.9186(10.0734) Grad: 36.3126  
Epoch: [1][500/2283] Data 0.011 (0.004) Elapsed 7m 34s (remain 26m 57s) Loss: 9.4830(10.0501) Grad: 36.1190  
Epoch: [1][600/2283] Data 0.003 (0.004) Elapsed 9m 4s (remain 25m 24s) Loss: 9.9252(10.0175) Grad: 36.5943  
Epoch: [1][700/2283] Data 0.002 (0.004) Elapsed 10m 34s (remain 23m 52s) Loss: 10.2858(9.9879) Grad: 37.0922  
Epoch: [1][800/2283] Data 0.002 (0.003) Elapsed 12m 4s (remain 22m 21s) Loss: 9.6629(9.9643) Grad: 36.1827  
Epoch: [1][900/

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 1.232 (1.232) Elapsed 0m 1s (remain 13m 17s) Loss: 9.0611(9.0611) 
EVAL: [100/571] Data 0.007 (0.101) Elapsed 0m 21s (remain 1m 40s) Loss: 9.0707(9.0479) 
EVAL: [200/571] Data 0.015 (0.096) Elapsed 0m 42s (remain 1m 18s) Loss: 8.8790(9.0020) 
EVAL: [300/571] Data 0.003 (0.094) Elapsed 1m 3s (remain 0m 56s) Loss: 9.4757(9.0373) 
EVAL: [400/571] Data 0.041 (0.094) Elapsed 1m 24s (remain 0m 35s) Loss: 9.2023(9.0847) 
EVAL: [500/571] Data 0.018 (0.093) Elapsed 1m 45s (remain 0m 14s) Loss: 9.1770(9.1377) 


Epoch 1 - avg_train_loss: 9.6719  avg_val_loss: 9.1819  time: 2180s
Epoch 1 - Save Best Loss: 9.1819 Model


EVAL: [570/571] Data 0.001 (0.092) Elapsed 1m 59s (remain 0m 0s) Loss: 9.6131(9.1819) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [2][0/2283] Data 1.090 (1.090) Elapsed 0m 2s (remain 85m 24s) Loss: 8.8744(8.8744) Grad: 31.9475  
Epoch: [2][100/2283] Data 0.002 (0.013) Elapsed 1m 31s (remain 33m 2s) Loss: 9.1671(9.2161) Grad: 31.3244  
Epoch: [2][200/2283] Data 0.002 (0.007) Elapsed 3m 1s (remain 31m 15s) Loss: 8.9802(9.2247) Grad: 31.9681  
Epoch: [2][300/2283] Data 0.002 (0.006) Elapsed 4m 30s (remain 29m 40s) Loss: 9.3466(9.2068) Grad: 32.8511  
Epoch: [2][400/2283] Data 0.002 (0.005) Elapsed 5m 59s (remain 28m 8s) Loss: 9.5335(9.1443) Grad: 34.1142  
Epoch: [2][500/2283] Data 0.002 (0.004) Elapsed 7m 29s (remain 26m 38s) Loss: 8.3616(9.0907) Grad: 34.0318  
Epoch: [2][600/2283] Data 0.002 (0.004) Elapsed 8m 58s (remain 25m 7s) Loss: 8.1233(9.0306) Grad: 33.4339  
Epoch: [2][700/2283] Data 0.002 (0.003) Elapsed 10m 28s (remain 23m 37s) Loss: 8.9611(8.9714) Grad: 33.0777  
Epoch: [2][800/2283] Data 0.002 (0.003) Elapsed 11m 57s (remain 22m 7s) Loss: 6.8277(8.9050) Grad: 32.4603  
Epoch: [2][900/2283] Data

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.960 (0.960) Elapsed 0m 1s (remain 10m 21s) Loss: 7.1235(7.1235) 
EVAL: [100/571] Data 0.185 (0.081) Elapsed 0m 19s (remain 1m 31s) Loss: 6.2821(5.7457) 
EVAL: [200/571] Data 0.316 (0.082) Elapsed 0m 39s (remain 1m 13s) Loss: 6.7606(5.6564) 
EVAL: [300/571] Data 0.007 (0.078) Elapsed 0m 58s (remain 0m 52s) Loss: 6.5292(5.7530) 
EVAL: [400/571] Data 0.002 (0.078) Elapsed 1m 17s (remain 0m 32s) Loss: 6.2727(5.9468) 
EVAL: [500/571] Data 0.060 (0.077) Elapsed 1m 37s (remain 0m 13s) Loss: 6.5591(6.1988) 


Epoch 2 - avg_train_loss: 8.0748  avg_val_loss: 6.4241  time: 2153s
Epoch 2 - Save Best Loss: 6.4241 Model


EVAL: [570/571] Data 0.001 (0.076) Elapsed 1m 49s (remain 0m 0s) Loss: 8.1106(6.4241) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [3][0/2283] Data 1.073 (1.073) Elapsed 0m 2s (remain 88m 2s) Loss: 4.6216(4.6216) Grad: 29.6373  
Epoch: [3][100/2283] Data 0.002 (0.012) Elapsed 1m 31s (remain 33m 2s) Loss: 5.5904(5.7260) Grad: 30.0510  
Epoch: [3][200/2283] Data 0.002 (0.007) Elapsed 3m 1s (remain 31m 16s) Loss: 6.7677(5.7400) Grad: 31.1089  
Epoch: [3][300/2283] Data 0.002 (0.005) Elapsed 4m 30s (remain 29m 40s) Loss: 5.4555(5.6851) Grad: 30.3818  
Epoch: [3][400/2283] Data 0.002 (0.004) Elapsed 5m 59s (remain 28m 9s) Loss: 5.6755(5.6645) Grad: 28.2531  
Epoch: [3][500/2283] Data 0.002 (0.004) Elapsed 7m 29s (remain 26m 38s) Loss: 7.2995(5.6713) Grad: 31.8570  
Epoch: [3][600/2283] Data 0.002 (0.004) Elapsed 8m 59s (remain 25m 8s) Loss: 5.3809(5.6387) Grad: 28.6750  
Epoch: [3][700/2283] Data 0.002 (0.003) Elapsed 10m 28s (remain 23m 38s) Loss: 6.3160(5.5924) Grad: 29.4799  
Epoch: [3][800/2283] Data 0.002 (0.003) Elapsed 11m 58s (remain 22m 8s) Loss: 6.0840(5.5517) Grad: 29.9899  
Epoch: [3][900/2283] Data 

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 1.059 (1.059) Elapsed 0m 1s (remain 11m 45s) Loss: 5.6463(5.6463) 
EVAL: [100/571] Data 0.028 (0.084) Elapsed 0m 20s (remain 1m 35s) Loss: 3.2052(3.3704) 
EVAL: [200/571] Data 0.002 (0.077) Elapsed 0m 39s (remain 1m 11s) Loss: 3.5077(3.3625) 
EVAL: [300/571] Data 0.025 (0.078) Elapsed 0m 58s (remain 0m 52s) Loss: 3.3836(3.4765) 
EVAL: [400/571] Data 0.146 (0.076) Elapsed 1m 18s (remain 0m 33s) Loss: 3.3190(3.6587) 
EVAL: [500/571] Data 0.110 (0.076) Elapsed 1m 37s (remain 0m 13s) Loss: 3.8145(3.9022) 


Epoch 3 - avg_train_loss: 5.1309  avg_val_loss: 4.1580  time: 2154s
Epoch 3 - Save Best Loss: 4.1580 Model


EVAL: [570/571] Data 0.001 (0.076) Elapsed 1m 51s (remain 0m 0s) Loss: 6.5771(4.1580) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [4][0/2283] Data 1.059 (1.059) Elapsed 0m 2s (remain 84m 6s) Loss: 1.1434(1.1434) Grad: 16.0309  
Epoch: [4][100/2283] Data 0.005 (0.012) Elapsed 1m 31s (remain 33m 3s) Loss: 2.2283(2.7103) Grad: 22.2913  
Epoch: [4][200/2283] Data 0.002 (0.007) Elapsed 3m 1s (remain 31m 17s) Loss: 3.2706(2.5985) Grad: 25.5156  
Epoch: [4][300/2283] Data 0.002 (0.005) Elapsed 4m 31s (remain 29m 44s) Loss: 2.9324(2.5946) Grad: 24.4927  
Epoch: [4][400/2283] Data 0.002 (0.004) Elapsed 6m 0s (remain 28m 12s) Loss: 2.8689(2.5829) Grad: 24.8746  
Epoch: [4][500/2283] Data 0.002 (0.004) Elapsed 7m 30s (remain 26m 41s) Loss: 2.9064(2.5814) Grad: 25.1368  
Epoch: [4][600/2283] Data 0.002 (0.004) Elapsed 8m 59s (remain 25m 10s) Loss: 1.8187(2.5836) Grad: 21.2363  
Epoch: [4][700/2283] Data 0.002 (0.003) Elapsed 10m 29s (remain 23m 39s) Loss: 2.8825(2.5911) Grad: 26.8020  
Epoch: [4][800/2283] Data 0.002 (0.003) Elapsed 11m 58s (remain 22m 9s) Loss: 1.1713(2.5817) Grad: 18.4328  
Epoch: [4][900/2283] Data

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.820 (0.820) Elapsed 0m 0s (remain 9m 0s) Loss: 4.1998(4.1998) 
EVAL: [100/571] Data 0.006 (0.083) Elapsed 0m 20s (remain 1m 33s) Loss: 2.5813(2.4607) 
EVAL: [200/571] Data 0.010 (0.075) Elapsed 0m 38s (remain 1m 11s) Loss: 2.4829(2.4242) 
EVAL: [300/571] Data 0.015 (0.076) Elapsed 0m 58s (remain 0m 52s) Loss: 2.2797(2.5134) 
EVAL: [400/571] Data 0.037 (0.076) Elapsed 1m 17s (remain 0m 32s) Loss: 2.6914(2.6325) 
EVAL: [500/571] Data 0.394 (0.076) Elapsed 1m 37s (remain 0m 13s) Loss: 2.3553(2.8017) 


Epoch 4 - avg_train_loss: 2.5400  avg_val_loss: 3.0013  time: 2155s
Epoch 4 - Save Best Loss: 3.0013 Model


EVAL: [570/571] Data 0.001 (0.076) Elapsed 1m 49s (remain 0m 0s) Loss: 5.1600(3.0013) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [5][0/2283] Data 1.059 (1.059) Elapsed 0m 2s (remain 86m 38s) Loss: 1.8067(1.8067) Grad: 21.5012  
Epoch: [5][100/2283] Data 0.002 (0.012) Elapsed 1m 31s (remain 33m 2s) Loss: 1.1765(1.0703) Grad: 16.9221  
Epoch: [5][200/2283] Data 0.002 (0.007) Elapsed 3m 1s (remain 31m 18s) Loss: 0.9120(1.0320) Grad: 16.8207  
Epoch: [5][300/2283] Data 0.002 (0.005) Elapsed 4m 31s (remain 29m 44s) Loss: 0.8191(1.0481) Grad: 14.1865  
Epoch: [5][400/2283] Data 0.002 (0.004) Elapsed 6m 0s (remain 28m 12s) Loss: 0.5459(1.0393) Grad: 10.1978  
Epoch: [5][500/2283] Data 0.002 (0.004) Elapsed 7m 30s (remain 26m 40s) Loss: 0.8850(1.0403) Grad: 15.4911  
Epoch: [5][600/2283] Data 0.002 (0.004) Elapsed 8m 59s (remain 25m 9s) Loss: 1.3825(1.0433) Grad: 18.8171  
Epoch: [5][700/2283] Data 0.002 (0.003) Elapsed 10m 28s (remain 23m 39s) Loss: 1.0510(1.0437) Grad: 17.0741  
Epoch: [5][800/2283] Data 0.011 (0.003) Elapsed 11m 58s (remain 22m 9s) Loss: 1.1873(1.0496) Grad: 17.6134  
Epoch: [5][900/2283] Data

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.842 (0.842) Elapsed 0m 0s (remain 9m 19s) Loss: 3.2243(3.2243) 
EVAL: [100/571] Data 0.095 (0.083) Elapsed 0m 20s (remain 1m 33s) Loss: 2.1969(2.0317) 
EVAL: [200/571] Data 0.072 (0.077) Elapsed 0m 38s (remain 1m 11s) Loss: 1.7118(1.9841) 
EVAL: [300/571] Data 0.003 (0.075) Elapsed 0m 58s (remain 0m 52s) Loss: 1.4097(2.0497) 
EVAL: [400/571] Data 0.013 (0.076) Elapsed 1m 17s (remain 0m 32s) Loss: 2.4039(2.1465) 
EVAL: [500/571] Data 0.013 (0.075) Elapsed 1m 37s (remain 0m 13s) Loss: 1.8948(2.2782) 


Epoch 5 - avg_train_loss: 1.0857  avg_val_loss: 2.4309  time: 2154s
Epoch 5 - Save Best Loss: 2.4309 Model


EVAL: [570/571] Data 0.001 (0.074) Elapsed 1m 49s (remain 0m 0s) Loss: 4.1441(2.4309) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [6][0/2283] Data 1.945 (1.945) Elapsed 0m 2s (remain 113m 48s) Loss: 0.5749(0.5749) Grad: 11.4470  
Epoch: [6][100/2283] Data 0.002 (0.021) Elapsed 1m 32s (remain 33m 15s) Loss: 0.4130(0.4901) Grad: 10.0669  
Epoch: [6][200/2283] Data 0.002 (0.011) Elapsed 3m 1s (remain 31m 23s) Loss: 0.5182(0.4799) Grad: 11.9237  
Epoch: [6][300/2283] Data 0.002 (0.008) Elapsed 4m 31s (remain 29m 45s) Loss: 0.6839(0.4799) Grad: 14.0172  
Epoch: [6][400/2283] Data 0.002 (0.007) Elapsed 6m 0s (remain 28m 12s) Loss: 0.3157(0.4876) Grad: 8.9139  
Epoch: [6][500/2283] Data 0.002 (0.006) Elapsed 7m 30s (remain 26m 40s) Loss: 0.1425(0.4861) Grad: 5.6270  
Epoch: [6][600/2283] Data 0.002 (0.005) Elapsed 8m 59s (remain 25m 10s) Loss: 0.5171(0.4929) Grad: 11.4059  
Epoch: [6][700/2283] Data 0.002 (0.005) Elapsed 10m 29s (remain 23m 39s) Loss: 0.5361(0.4960) Grad: 12.3576  
Epoch: [6][800/2283] Data 0.002 (0.004) Elapsed 11m 58s (remain 22m 9s) Loss: 0.1981(0.4957) Grad: 5.7467  
Epoch: [6][900/2283] Data

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.937 (0.937) Elapsed 0m 1s (remain 9m 59s) Loss: 2.7238(2.7238) 
EVAL: [100/571] Data 0.029 (0.083) Elapsed 0m 20s (remain 1m 34s) Loss: 1.9620(1.8386) 
EVAL: [200/571] Data 0.012 (0.080) Elapsed 0m 39s (remain 1m 13s) Loss: 1.8345(1.7922) 
EVAL: [300/571] Data 0.011 (0.078) Elapsed 0m 58s (remain 0m 52s) Loss: 1.2644(1.8450) 
EVAL: [400/571] Data 0.071 (0.077) Elapsed 1m 18s (remain 0m 33s) Loss: 1.7814(1.9180) 
EVAL: [500/571] Data 0.267 (0.078) Elapsed 1m 38s (remain 0m 13s) Loss: 1.1722(2.0274) 
EVAL: [570/571] Data 0.002 (0.077) Elapsed 1m 51s (remain 0m 0s) Loss: 3.6643(2.1570) 


Epoch 6 - avg_train_loss: 0.5291  avg_val_loss: 2.1570  time: 2156s
Epoch 6 - Save Best Loss: 2.1570 Model


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [7][0/2283] Data 1.169 (1.169) Elapsed 0m 2s (remain 85m 19s) Loss: 0.3636(0.3636) Grad: 10.1095  
Epoch: [7][100/2283] Data 0.002 (0.013) Elapsed 1m 31s (remain 33m 6s) Loss: 0.2942(0.2864) Grad: 8.1187  
Epoch: [7][200/2283] Data 0.002 (0.008) Elapsed 3m 1s (remain 31m 17s) Loss: 0.1248(0.2763) Grad: 4.8998  
Epoch: [7][300/2283] Data 0.002 (0.006) Elapsed 4m 30s (remain 29m 43s) Loss: 0.2176(0.2747) Grad: 7.3822  
Epoch: [7][400/2283] Data 0.002 (0.005) Elapsed 6m 0s (remain 28m 10s) Loss: 0.3605(0.2750) Grad: 9.9263  
Epoch: [7][500/2283] Data 0.002 (0.004) Elapsed 7m 29s (remain 26m 39s) Loss: 0.3912(0.2736) Grad: 11.3536  
Epoch: [7][600/2283] Data 0.002 (0.004) Elapsed 8m 59s (remain 25m 8s) Loss: 0.1161(0.2757) Grad: 4.2202  
Epoch: [7][700/2283] Data 0.002 (0.003) Elapsed 10m 28s (remain 23m 38s) Loss: 0.3065(0.2764) Grad: 8.3577  
Epoch: [7][800/2283] Data 0.002 (0.003) Elapsed 11m 57s (remain 22m 8s) Loss: 0.1113(0.2801) Grad: 3.4484  
Epoch: [7][900/2283] Data 0.002 

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.965 (0.965) Elapsed 0m 1s (remain 10m 46s) Loss: 2.6835(2.6835) 
EVAL: [100/571] Data 0.006 (0.084) Elapsed 0m 20s (remain 1m 35s) Loss: 1.8670(1.7540) 
EVAL: [200/571] Data 0.004 (0.081) Elapsed 0m 40s (remain 1m 14s) Loss: 1.7707(1.7008) 
EVAL: [300/571] Data 0.165 (0.080) Elapsed 0m 59s (remain 0m 53s) Loss: 1.1920(1.7402) 
EVAL: [400/571] Data 0.013 (0.080) Elapsed 1m 19s (remain 0m 33s) Loss: 1.7423(1.8037) 
EVAL: [500/571] Data 0.010 (0.078) Elapsed 1m 38s (remain 0m 13s) Loss: 1.1985(1.9043) 


Epoch 7 - avg_train_loss: 0.3003  avg_val_loss: 2.0229  time: 2160s
Epoch 7 - Save Best Loss: 2.0229 Model


EVAL: [570/571] Data 0.002 (0.078) Elapsed 1m 52s (remain 0m 0s) Loss: 3.6335(2.0229) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [8][0/2283] Data 1.193 (1.193) Elapsed 0m 2s (remain 89m 32s) Loss: 0.0535(0.0535) Grad: 1.6080  
Epoch: [8][100/2283] Data 0.002 (0.014) Elapsed 1m 32s (remain 33m 12s) Loss: 0.1615(0.1707) Grad: 5.5454  
Epoch: [8][200/2283] Data 0.002 (0.008) Elapsed 3m 1s (remain 31m 22s) Loss: 0.1847(0.1823) Grad: 7.6291  
Epoch: [8][300/2283] Data 0.002 (0.006) Elapsed 4m 31s (remain 29m 48s) Loss: 0.1437(0.1821) Grad: 4.8379  
Epoch: [8][400/2283] Data 0.002 (0.005) Elapsed 6m 1s (remain 28m 15s) Loss: 0.2107(0.1865) Grad: 6.9061  
Epoch: [8][500/2283] Data 0.002 (0.004) Elapsed 7m 31s (remain 26m 44s) Loss: 0.0638(0.1898) Grad: 2.4718  
Epoch: [8][600/2283] Data 0.002 (0.004) Elapsed 9m 0s (remain 25m 13s) Loss: 0.1248(0.1899) Grad: 4.2518  
Epoch: [8][700/2283] Data 0.002 (0.004) Elapsed 10m 30s (remain 23m 42s) Loss: 0.1087(0.1898) Grad: 3.4782  
Epoch: [8][800/2283] Data 0.002 (0.003) Elapsed 12m 0s (remain 22m 12s) Loss: 0.0981(0.1910) Grad: 3.1317  
Epoch: [8][900/2283] Data 0.002 (

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.930 (0.930) Elapsed 0m 1s (remain 10m 14s) Loss: 2.6778(2.6778) 
EVAL: [100/571] Data 0.011 (0.090) Elapsed 0m 20s (remain 1m 37s) Loss: 1.8303(1.7027) 
EVAL: [200/571] Data 0.014 (0.084) Elapsed 0m 40s (remain 1m 14s) Loss: 1.6737(1.6577) 
EVAL: [300/571] Data 0.016 (0.082) Elapsed 1m 0s (remain 0m 53s) Loss: 1.1897(1.6988) 
EVAL: [400/571] Data 0.067 (0.081) Elapsed 1m 19s (remain 0m 33s) Loss: 1.8169(1.7617) 
EVAL: [500/571] Data 0.242 (0.080) Elapsed 1m 39s (remain 0m 13s) Loss: 1.1648(1.8604) 


Epoch 8 - avg_train_loss: 0.2031  avg_val_loss: 1.9730  time: 2162s
Epoch 8 - Save Best Loss: 1.9730 Model


EVAL: [570/571] Data 0.001 (0.080) Elapsed 1m 53s (remain 0m 0s) Loss: 3.8047(1.9730) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [9][0/2283] Data 1.137 (1.137) Elapsed 0m 2s (remain 88m 43s) Loss: 0.4337(0.4337) Grad: 10.2670  
Epoch: [9][100/2283] Data 0.002 (0.013) Elapsed 1m 32s (remain 33m 8s) Loss: 0.2384(0.1452) Grad: 7.2224  
Epoch: [9][200/2283] Data 0.002 (0.008) Elapsed 3m 1s (remain 31m 20s) Loss: 0.0755(0.1451) Grad: 2.6365  
Epoch: [9][300/2283] Data 0.002 (0.006) Elapsed 4m 31s (remain 29m 45s) Loss: 0.0528(0.1509) Grad: 1.8211  
Epoch: [9][400/2283] Data 0.002 (0.005) Elapsed 6m 0s (remain 28m 13s) Loss: 0.1542(0.1497) Grad: 5.4153  
Epoch: [9][500/2283] Data 0.003 (0.004) Elapsed 7m 30s (remain 26m 42s) Loss: 0.0810(0.1510) Grad: 3.4645  
Epoch: [9][600/2283] Data 0.002 (0.004) Elapsed 9m 0s (remain 25m 11s) Loss: 0.1835(0.1496) Grad: 6.0886  
Epoch: [9][700/2283] Data 0.002 (0.004) Elapsed 10m 29s (remain 23m 41s) Loss: 0.0768(0.1472) Grad: 2.7146  
Epoch: [9][800/2283] Data 0.002 (0.003) Elapsed 11m 59s (remain 22m 11s) Loss: 0.1836(0.1474) Grad: 6.3953  
Epoch: [9][900/2283] Data 0.002 

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.898 (0.898) Elapsed 0m 1s (remain 9m 53s) Loss: 2.5391(2.5391) 
EVAL: [100/571] Data 0.006 (0.089) Elapsed 0m 20s (remain 1m 37s) Loss: 1.8499(1.6697) 
EVAL: [200/571] Data 0.027 (0.084) Elapsed 0m 40s (remain 1m 14s) Loss: 1.5990(1.6250) 
EVAL: [300/571] Data 0.007 (0.082) Elapsed 0m 59s (remain 0m 53s) Loss: 1.1558(1.6671) 
EVAL: [400/571] Data 0.014 (0.081) Elapsed 1m 20s (remain 0m 33s) Loss: 1.9388(1.7295) 
EVAL: [500/571] Data 0.010 (0.081) Elapsed 1m 40s (remain 0m 14s) Loss: 1.1538(1.8245) 


Epoch 9 - avg_train_loss: 0.1581  avg_val_loss: 1.9361  time: 2163s
Epoch 9 - Save Best Loss: 1.9361 Model


EVAL: [570/571] Data 0.001 (0.081) Elapsed 1m 53s (remain 0m 0s) Loss: 3.5805(1.9361) 


  0%|          | 0/2283 [00:00<?, ?it/s]

Epoch: [10][0/2283] Data 0.979 (0.979) Elapsed 0m 2s (remain 83m 54s) Loss: 0.0568(0.0568) Grad: 1.8397  
Epoch: [10][100/2283] Data 0.002 (0.012) Elapsed 1m 32s (remain 33m 9s) Loss: 0.3033(0.1328) Grad: 9.3592  
Epoch: [10][200/2283] Data 0.002 (0.007) Elapsed 3m 2s (remain 31m 25s) Loss: 0.2114(0.1304) Grad: 7.6926  
Epoch: [10][300/2283] Data 0.002 (0.005) Elapsed 4m 31s (remain 29m 50s) Loss: 0.0739(0.1269) Grad: 2.6091  
Epoch: [10][400/2283] Data 0.005 (0.004) Elapsed 6m 1s (remain 28m 16s) Loss: 0.1830(0.1284) Grad: 5.2780  
Epoch: [10][500/2283] Data 0.002 (0.004) Elapsed 7m 31s (remain 26m 44s) Loss: 0.1753(0.1250) Grad: 5.9102  
Epoch: [10][600/2283] Data 0.002 (0.004) Elapsed 9m 0s (remain 25m 13s) Loss: 0.2821(0.1263) Grad: 9.9694  
Epoch: [10][700/2283] Data 0.002 (0.003) Elapsed 10m 30s (remain 23m 43s) Loss: 0.1135(0.1257) Grad: 3.9529  
Epoch: [10][800/2283] Data 0.002 (0.003) Elapsed 12m 0s (remain 22m 13s) Loss: 0.0964(0.1261) Grad: 3.4827  
Epoch: [10][900/2283] Dat

  0%|          | 0/571 [00:00<?, ?it/s]

EVAL: [0/571] Data 0.860 (0.860) Elapsed 0m 0s (remain 9m 29s) Loss: 2.5715(2.5715) 
EVAL: [100/571] Data 0.040 (0.087) Elapsed 0m 20s (remain 1m 36s) Loss: 1.7841(1.6528) 
EVAL: [200/571] Data 0.002 (0.082) Elapsed 0m 40s (remain 1m 14s) Loss: 1.5427(1.6059) 
EVAL: [300/571] Data 0.015 (0.079) Elapsed 0m 59s (remain 0m 53s) Loss: 1.2132(1.6503) 
EVAL: [400/571] Data 0.002 (0.080) Elapsed 1m 19s (remain 0m 33s) Loss: 1.8754(1.7103) 
EVAL: [500/571] Data 0.019 (0.078) Elapsed 1m 38s (remain 0m 13s) Loss: 1.0343(1.8034) 


Epoch 10 - avg_train_loss: 0.1289  avg_val_loss: 1.9122  time: 2162s
Epoch 10 - Save Best Loss: 1.9122 Model


EVAL: [570/571] Data 0.001 (0.078) Elapsed 1m 52s (remain 0m 0s) Loss: 3.5594(1.9122) 


Loss: 1.9122
Loss: 1.9122
