In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!pip3 install tokenizers wandb sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip3 install transformers huggingface-hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip3 install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import os
os.chdir("drive/")
os.chdir('My Drive')
os.chdir('Experiment')
os.chdir('TransformerBased')

In [6]:
OUTPUT_DIR = './emotion_recognition/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [7]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Aug 29 15:49:42 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# CFG

In [8]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    _wandb_kernel='bluehills'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=50 # [0, 50, 100]
    epochs=10
    encoder_lr=1e-5 #2e-5
    decoder_lr=1e-5 #2e-5
    min_lr=5e-7
    eps=5e-7
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.1
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [9]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    import wandb
    try:
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")
        # wandb.login(key=secret_value_0)
        print('login to wandb')
        wandb.login()
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='Emotion Recognition', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

login to wandb


[34m[1mwandb[0m: Currently logged in as: [33mbluehills[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Library

In [10]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
from math import sqrt
import shutil
import string
import pickle
import random
import joblib
import itertools
import logging
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import torch.cuda.amp as amp

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification

import datasets
from datasets import list_datasets, load_dataset

import huggingface_hub
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats

%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.12.1+cu113
tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [11]:
# ====================================================
# Utils
# ====================================================

def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

# Load Data

In [12]:
dataset_name = 'jakeazcona/short-text-labeled-emotion-classification'
my_dataset = load_dataset(dataset_name)



  0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
train, test = my_dataset['train'], my_dataset['test']

train = train.to_pandas()
test = test.to_pandas()

print(train.shape)
print(test.shape)

(24032, 2)
(6008, 2)


In [14]:
train.head()

Unnamed: 0,sample,label
0,i tend to be a window shopper when im alone be...,3
1,i will hopefully be able to feel less inhibite...,5
2,i feel very fond of my pinky kids,3
3,i feel like i had so much to write then got di...,0
4,i knew i was going to look at the mess and fee...,5


In [15]:
test.head()

Unnamed: 0,sample,label
0,i will not feel so alone anymore,5
1,"This is a huge stretch, but I would enjoy seei...",2
2,i know it will be no picnic and i will not fee...,5
3,i can write as many entries as humanly possibl...,0
4,Id give my left nut for another [NAME]. ...and...,2


# CV

In [16]:
!pip3 install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train, columns=["label"]).groupby(["sample"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("label_") or c == "sample"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "sample"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train = train.merge(dfx[["sample", "fold"]], on="sample", how="left")
print(train.fold.value_counts())

19042 4756
19031 4767
19042 4756
19039 4759
19038 4760
3    4812
0    4807
4    4805
2    4804
1    4804
Name: fold, dtype: int64


In [17]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [18]:
train['text'] = '[CFG]' + train['sample'] + '[SEP]'
test['test'] = 'CFG' + test['sample'] + '[SEP]'

In [19]:
print('Describe train:', train.describe())
print('-------------------')
print('Describe test:', test.describe())

Describe train:               label          fold
count  24032.000000  24032.000000
mean       2.603196      2.000166
std        1.729178      1.414243
min        0.000000      0.000000
25%        1.000000      1.000000
50%        2.000000      2.000000
75%        4.000000      3.000000
max        5.000000      4.000000
-------------------
Describe test:              label
count  6008.000000
mean      2.600866
std       1.743349
min       0.000000
25%       1.000000
50%       2.000000
75%       4.000000
max       5.000000


In [20]:
train.head()

Unnamed: 0,sample,label,fold,text
0,i tend to be a window shopper when im alone be...,3,0,[CFG]i tend to be a window shopper when im alo...
1,i will hopefully be able to feel less inhibite...,5,3,[CFG]i will hopefully be able to feel less inh...
2,i feel very fond of my pinky kids,3,2,[CFG]i feel very fond of my pinky kids[SEP]
3,i feel like i had so much to write then got di...,0,2,[CFG]i feel like i had so much to write then g...
4,i knew i was going to look at the mess and fee...,5,2,[CFG]i knew i was going to look at the mess an...


In [21]:
test.head()

Unnamed: 0,sample,label,test
0,i will not feel so alone anymore,5,CFGi will not feel so alone anymore[SEP]
1,"This is a huge stretch, but I would enjoy seei...",2,"CFGThis is a huge stretch, but I would enjoy s..."
2,i know it will be no picnic and i will not fee...,5,CFGi know it will be no picnic and i will not ...
3,i can write as many entries as humanly possibl...,0,CFGi can write as many entries as humanly poss...
4,Id give my left nut for another [NAME]. ...and...,2,CFGId give my left nut for another [NAME]. ......


# Tokenizer

In [22]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [23]:
# ====================================================
# Define max_len
# ====================================================
# text_col = 'sample'
# lengths = []
# tk0 = tqdm(train[text_col].fillna("").values, total=len(train))
# for text in tk0:
#     length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#     lengths.append(length)

def define_max_len_(text_col, df):
    max_lenghts = []
    tk0 = tqdm(df[text_col].fillna("").values, total=len(df))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        max_lenghts.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(max_lenghts)}')
    
    return max_lenghts

# set the max length with the longest sample
CFG.max_len = max(define_max_len_('text', train)) + 4

LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/24032 [00:00<?, ?it/s]

text max(lengths): 76
INFO:__main__:text max(lengths): 76
max_len: 80
INFO:__main__:max_len: 80


In [24]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs



class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].values
        self.label = df['label'].values

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.text[item])
        label = torch.tensor(self.label[item], dtype=torch.long)
        return inputs, label

# Model

In [25]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False, num_of_classes=6, memory_max_len=CFG.max_len):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(memory_max_len * self.config.hidden_size, num_of_classes)
        self._init_weights(self.fc)
        

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        batch_size = feature.shape[0]
        feature = torch.reshape(feature, (batch_size, -1))
        output = self.fc(self.fc_dropout(feature))
        return output

# Helper Function

In [26]:
from sklearn import metrics


def get_score(y_pred, y_true):
    log_loss = metrics.log_loss
    # trues = []
    # a1, a2, a3 = [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]
    # for true in y_true:
    #     t = a1 if true == 0 else a2 if true == 1 else a3
    #     trues.append(t)
    score = log_loss(y_true, y_pred)
    return score

In [27]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds, labels)
        loss = torch.masked_select(loss, labels != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds, labels)
        loss = torch.masked_select(loss, labels != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [28]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['label'].values


    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # criterion = nn.BCEWithLogitsLoss(reduction="none")
    criterion = nn.CrossEntropyLoss()
    
    best_score = 1.

    for epoch in range(CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)
        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        # predictions = predictions.reshape((len(valid_folds), CFG.max_len))


        #TODO scoring
        score = get_score(predictions, valid_labels)


        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

        torch.cuda.empty_cache()
        gc.collect()

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']

    torch.cuda.empty_cache()
    gc.collect()
    
    return best_score

In [None]:
if CFG.train:
    avg_score = 0
    total_fold = 0
    for fold in range(CFG.n_fold):
        if fold in CFG.trn_fold:
            best_score = train_loop(train, fold)
            avg_score += best_score
            total_fold += 1
    avg_score = avg_score / total_fold
    LOGGER.info(f"========== CV ==========")
    LOGGER.info(f'CV score = {avg_score}')

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1201] Elapsed 0m 3s (remain 70m 56s) Loss: 5.6367(5.6367) Grad: nan  LR: 0.00000020  
Epoch: [1][100/1201] Elapsed 0m 28s (remain 5m 10s) Loss: 2.9336(4.3575) Grad: 462915.4062  LR: 0.00001000  
Epoch: [1][200/1201] Elapsed 0m 53s (remain 4m 26s) Loss: 2.5547(3.6791) Grad: 320011.3750  LR: 0.00001000  
Epoch: [1][300/1201] Elapsed 1m 18s (remain 3m 54s) Loss: 2.7031(3.2469) Grad: 385796.0938  LR: 0.00000999  
Epoch: [1][400/1201] Elapsed 1m 43s (remain 3m 26s) Loss: 0.5942(2.9120) Grad: 223659.3125  LR: 0.00000998  
Epoch: [1][500/1201] Elapsed 2m 8s (remain 2m 59s) Loss: 1.6807(2.6449) Grad: 249850.6250  LR: 0.00000996  
Epoch: [1][600/1201] Elapsed 2m 33s (remain 2m 32s) Loss: 0.6304(2.3840) Grad: 166657.5156  LR: 0.00000995  
Epoch: [1][700/1201] Elapsed 2m 58s (remain 2m 6s) Loss: 1.1230(2.1856) Grad: 245002.9219  LR: 0.00000993  
Epoch: [1][800/1201] Elapsed 3m 23s (remain 1m 41s) Loss: 1.7900(2.0341) Grad: 254419.0469  LR: 0.00000990  
Epoch: [1][900/1201] Elapsed 3m

Epoch 1 - avg_train_loss: 1.6059  avg_val_loss: 0.5174  time: 331s
INFO:__main__:Epoch 1 - avg_train_loss: 1.6059  avg_val_loss: 0.5174  time: 331s
Epoch 1 - Score: 1.1787
INFO:__main__:Epoch 1 - Score: 1.1787


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 1.8333(0.5174) 
Epoch: [2][0/1201] Elapsed 0m 0s (remain 15m 12s) Loss: 0.0015(0.0015) Grad: nan  LR: 0.00000977  
Epoch: [2][100/1201] Elapsed 0m 26s (remain 4m 43s) Loss: 0.6147(0.4172) Grad: 716023.3125  LR: 0.00000973  
Epoch: [2][200/1201] Elapsed 0m 51s (remain 4m 16s) Loss: 0.0645(0.4498) Grad: 305206.9062  LR: 0.00000969  
Epoch: [2][300/1201] Elapsed 1m 16s (remain 3m 50s) Loss: 0.3289(0.4556) Grad: 507084.5312  LR: 0.00000964  
Epoch: [2][400/1201] Elapsed 1m 42s (remain 3m 23s) Loss: 0.0041(0.4431) Grad: 21791.2559  LR: 0.00000959  
Epoch: [2][500/1201] Elapsed 2m 7s (remain 2m 57s) Loss: 0.5298(0.4492) Grad: 508516.2812  LR: 0.00000954  
Epoch: [2][600/1201] Elapsed 2m 32s (remain 2m 32s) Loss: 0.3733(0.4561) Grad: 539142.1250  LR: 0.00000948  
Epoch: [2][700/1201] Elapsed 2m 57s (remain 2m 6s) Loss: 0.0131(0.4593) Grad: 41897.1172  LR: 0.00000942  
Epoch: [2][800/1201] Elapsed 3m 22s (remain 1m 41s) Loss: 0.0891(0.4614) G

Epoch 2 - avg_train_loss: 0.4528  avg_val_loss: 0.4787  time: 331s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4528  avg_val_loss: 0.4787  time: 331s
Epoch 2 - Score: 0.9112
INFO:__main__:Epoch 2 - Score: 0.9112
Epoch 2 - Save Best Score: 0.9112 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.9112 Model


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 2.5160(0.4787) 
Epoch: [3][0/1201] Elapsed 0m 0s (remain 14m 59s) Loss: 0.3311(0.3311) Grad: nan  LR: 0.00000908  
Epoch: [3][100/1201] Elapsed 0m 27s (remain 4m 56s) Loss: 0.1033(0.3957) Grad: 360905.5938  LR: 0.00000900  
Epoch: [3][200/1201] Elapsed 0m 52s (remain 4m 19s) Loss: 0.2573(0.4157) Grad: 246581.4062  LR: 0.00000892  
Epoch: [3][300/1201] Elapsed 1m 17s (remain 3m 50s) Loss: 1.2578(0.4221) Grad: 472433.6562  LR: 0.00000884  
Epoch: [3][400/1201] Elapsed 1m 41s (remain 3m 22s) Loss: 0.9839(0.4194) Grad: 445305.0938  LR: 0.00000875  
Epoch: [3][500/1201] Elapsed 2m 6s (remain 2m 56s) Loss: 1.5254(0.4079) Grad: 423879.3125  LR: 0.00000866  
Epoch: [3][600/1201] Elapsed 2m 31s (remain 2m 31s) Loss: 0.0261(0.4138) Grad: 75639.6172  LR: 0.00000857  
Epoch: [3][700/1201] Elapsed 2m 56s (remain 2m 6s) Loss: 0.1846(0.4144) Grad: 203923.8906  LR: 0.00000848  
Epoch: [3][800/1201] Elapsed 3m 21s (remain 1m 40s) Loss: 0.1519(0.4235) 

Epoch 3 - avg_train_loss: 0.4384  avg_val_loss: 0.4906  time: 329s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4384  avg_val_loss: 0.4906  time: 329s
Epoch 3 - Score: 0.9334
INFO:__main__:Epoch 3 - Score: 0.9334


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 2.0556(0.4906) 
Epoch: [4][0/1201] Elapsed 0m 0s (remain 15m 44s) Loss: 0.0110(0.0110) Grad: nan  LR: 0.00000798  
Epoch: [4][100/1201] Elapsed 0m 25s (remain 4m 37s) Loss: 0.1102(0.2744) Grad: 363838.3125  LR: 0.00000787  
Epoch: [4][200/1201] Elapsed 0m 50s (remain 4m 10s) Loss: 0.0375(0.2743) Grad: 181747.6250  LR: 0.00000776  
Epoch: [4][300/1201] Elapsed 1m 15s (remain 3m 44s) Loss: 0.3201(0.2821) Grad: 704403.0625  LR: 0.00000765  
Epoch: [4][400/1201] Elapsed 1m 40s (remain 3m 19s) Loss: 0.0144(0.2792) Grad: 41635.8125  LR: 0.00000754  
Epoch: [4][500/1201] Elapsed 2m 5s (remain 2m 54s) Loss: 0.0604(0.2933) Grad: 145426.5625  LR: 0.00000742  
Epoch: [4][600/1201] Elapsed 2m 30s (remain 2m 30s) Loss: 1.4307(0.2954) Grad: 374065.9375  LR: 0.00000731  
Epoch: [4][700/1201] Elapsed 2m 55s (remain 2m 5s) Loss: 0.0010(0.2965) Grad: 2911.6814  LR: 0.00000719  
Epoch: [4][800/1201] Elapsed 3m 21s (remain 1m 40s) Loss: 0.0084(0.2990) Gr

Epoch 4 - avg_train_loss: 0.3100  avg_val_loss: 0.5349  time: 330s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3100  avg_val_loss: 0.5349  time: 330s
Epoch 4 - Score: 0.7655
INFO:__main__:Epoch 4 - Score: 0.7655
Epoch 4 - Save Best Score: 0.7655 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7655 Model


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 2.9012(0.5349) 
Epoch: [5][0/1201] Elapsed 0m 0s (remain 15m 23s) Loss: 0.2874(0.2874) Grad: nan  LR: 0.00000658  
Epoch: [5][100/1201] Elapsed 0m 27s (remain 4m 54s) Loss: 0.0528(0.2178) Grad: 218271.3594  LR: 0.00000646  
Epoch: [5][200/1201] Elapsed 0m 52s (remain 4m 22s) Loss: 0.2482(0.2311) Grad: 418077.6562  LR: 0.00000633  
Epoch: [5][300/1201] Elapsed 1m 17s (remain 3m 53s) Loss: 0.0026(0.2101) Grad: 17309.7168  LR: 0.00000621  
Epoch: [5][400/1201] Elapsed 1m 42s (remain 3m 24s) Loss: 0.5005(0.2113) Grad: 622147.1875  LR: 0.00000608  
Epoch: [5][500/1201] Elapsed 2m 7s (remain 2m 58s) Loss: 0.0004(0.2112) Grad: 2562.0544  LR: 0.00000595  
Epoch: [5][600/1201] Elapsed 2m 32s (remain 2m 32s) Loss: 0.2340(0.2168) Grad: 277288.3438  LR: 0.00000582  
Epoch: [5][700/1201] Elapsed 2m 57s (remain 2m 6s) Loss: 0.0062(0.2271) Grad: 13048.0537  LR: 0.00000569  
Epoch: [5][800/1201] Elapsed 3m 21s (remain 1m 40s) Loss: 0.0015(0.2321) Gra

Epoch 5 - avg_train_loss: 0.2336  avg_val_loss: 0.6742  time: 328s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2336  avg_val_loss: 0.6742  time: 328s
Epoch 5 - Score: 0.6468
INFO:__main__:Epoch 5 - Score: 0.6468
Epoch 5 - Save Best Score: 0.6468 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.6468 Model


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 2.2615(0.6742) 
Epoch: [6][0/1201] Elapsed 0m 0s (remain 15m 17s) Loss: 0.3713(0.3713) Grad: nan  LR: 0.00000503  
Epoch: [6][100/1201] Elapsed 0m 26s (remain 4m 49s) Loss: 0.0000(0.1625) Grad: 221.7269  LR: 0.00000490  
Epoch: [6][200/1201] Elapsed 0m 51s (remain 4m 17s) Loss: 0.0060(0.1537) Grad: 37023.1172  LR: 0.00000477  
Epoch: [6][300/1201] Elapsed 1m 16s (remain 3m 47s) Loss: 0.0017(0.1620) Grad: 5257.1763  LR: 0.00000464  
Epoch: [6][400/1201] Elapsed 1m 40s (remain 3m 21s) Loss: 0.0181(0.1539) Grad: 92228.8906  LR: 0.00000451  
Epoch: [6][500/1201] Elapsed 2m 5s (remain 2m 55s) Loss: 0.0001(0.1657) Grad: 431.2959  LR: 0.00000438  
Epoch: [6][600/1201] Elapsed 2m 30s (remain 2m 29s) Loss: 0.4629(0.1713) Grad: 272468.4375  LR: 0.00000425  
Epoch: [6][700/1201] Elapsed 2m 54s (remain 2m 4s) Loss: 0.0014(0.1854) Grad: 4460.3818  LR: 0.00000412  
Epoch: [6][800/1201] Elapsed 3m 19s (remain 1m 39s) Loss: 0.0000(0.1808) Grad: 96.00

Epoch 6 - avg_train_loss: 0.1888  avg_val_loss: 0.6228  time: 326s
INFO:__main__:Epoch 6 - avg_train_loss: 0.1888  avg_val_loss: 0.6228  time: 326s
Epoch 6 - Score: 0.6275
INFO:__main__:Epoch 6 - Score: 0.6275
Epoch 6 - Save Best Score: 0.6275 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.6275 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 2.6960(0.6228) 
Epoch: [7][0/1201] Elapsed 0m 0s (remain 14m 25s) Loss: 0.0000(0.0000) Grad: nan  LR: 0.00000348  
Epoch: [7][100/1201] Elapsed 0m 26s (remain 4m 48s) Loss: 0.0001(0.1006) Grad: 319.0548  LR: 0.00000336  
Epoch: [7][200/1201] Elapsed 0m 51s (remain 4m 15s) Loss: 0.0000(0.1243) Grad: 33.7988  LR: 0.00000323  
Epoch: [7][300/1201] Elapsed 1m 15s (remain 3m 47s) Loss: 0.5234(0.1266) Grad: 352108.1562  LR: 0.00000311  
Epoch: [7][400/1201] Elapsed 1m 40s (remain 3m 19s) Loss: 0.0000(0.1227) Grad: 4.0947  LR: 0.00000299  
Epoch: [7][500/1201] Elapsed 2m 5s (remain 2m 54s) Loss: 0.0237(0.1296) Grad: 76801.0312  LR: 0.00000287  
Epoch: [7][600/1201] Elapsed 2m 29s (remain 2m 29s) Loss: 0.4963(0.1355) Grad: 435703.4688  LR: 0.00000275  
Epoch: [7][700/1201] Elapsed 2m 53s (remain 2m 3s) Loss: 0.0000(0.1348) Grad: 8.9749  LR: 0.00000264  
Epoch: [7][800/1201] Elapsed 3m 17s (remain 1m 38s) Loss: 0.0007(0.1329) Grad: 3495.2500  

Epoch 7 - avg_train_loss: 0.1322  avg_val_loss: 0.7059  time: 326s
INFO:__main__:Epoch 7 - avg_train_loss: 0.1322  avg_val_loss: 0.7059  time: 326s
Epoch 7 - Score: 0.5600
INFO:__main__:Epoch 7 - Score: 0.5600
Epoch 7 - Save Best Score: 0.5600 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.5600 Model


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 2.4816(0.7059) 
Epoch: [8][0/1201] Elapsed 0m 0s (remain 14m 47s) Loss: 0.0000(0.0000) Grad: nan  LR: 0.00000208  
Epoch: [8][100/1201] Elapsed 0m 25s (remain 4m 43s) Loss: 0.0751(0.1127) Grad: 437761.0000  LR: 0.00000197  
Epoch: [8][200/1201] Elapsed 0m 51s (remain 4m 16s) Loss: 0.0000(0.0899) Grad: 0.0311  LR: 0.00000187  
Epoch: [8][300/1201] Elapsed 1m 16s (remain 3m 47s) Loss: 0.0545(0.0834) Grad: 191467.1875  LR: 0.00000177  
Epoch: [8][400/1201] Elapsed 1m 40s (remain 3m 21s) Loss: 0.0040(0.0834) Grad: 14672.8203  LR: 0.00000167  
Epoch: [8][500/1201] Elapsed 2m 5s (remain 2m 55s) Loss: 0.0000(0.0853) Grad: 3.3816  LR: 0.00000157  
Epoch: [8][600/1201] Elapsed 2m 30s (remain 2m 29s) Loss: 0.0000(0.0918) Grad: 39.9246  LR: 0.00000148  
Epoch: [8][700/1201] Elapsed 2m 55s (remain 2m 4s) Loss: 0.0000(0.0874) Grad: 174.2040  LR: 0.00000139  
Epoch: [8][800/1201] Elapsed 3m 19s (remain 1m 39s) Loss: 0.3372(0.0899) Grad: 386473.1562

Epoch 8 - avg_train_loss: 0.0947  avg_val_loss: 0.7068  time: 327s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0947  avg_val_loss: 0.7068  time: 327s
Epoch 8 - Score: 0.5562
INFO:__main__:Epoch 8 - Score: 0.5562
Epoch 8 - Save Best Score: 0.5562 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.5562 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 2.4537(0.7068) 
Epoch: [9][0/1201] Elapsed 0m 0s (remain 14m 38s) Loss: 0.0000(0.0000) Grad: nan  LR: 0.00000096  
Epoch: [9][100/1201] Elapsed 0m 25s (remain 4m 43s) Loss: 0.0000(0.0765) Grad: 3.9254  LR: 0.00000089  
Epoch: [9][200/1201] Elapsed 0m 51s (remain 4m 15s) Loss: 0.0000(0.0824) Grad: 7.5523  LR: 0.00000082  
Epoch: [9][300/1201] Elapsed 1m 15s (remain 3m 46s) Loss: 0.0234(0.0781) Grad: 79610.2188  LR: 0.00000075  
Epoch: [9][400/1201] Elapsed 1m 40s (remain 3m 20s) Loss: 0.0000(0.0794) Grad: 85.0652  LR: 0.00000068  
Epoch: [9][500/1201] Elapsed 2m 5s (remain 2m 54s) Loss: 0.0020(0.0782) Grad: 8984.9512  LR: 0.00000061  
Epoch: [9][600/1201] Elapsed 2m 29s (remain 2m 29s) Loss: 0.0000(0.0773) Grad: 120.7874  LR: 0.00000055  
Epoch: [9][700/1201] Elapsed 2m 54s (remain 2m 4s) Loss: 0.0000(0.0761) Grad: 0.2472  LR: 0.00000049  
Epoch: [9][800/1201] Elapsed 3m 19s (remain 1m 39s) Loss: 0.0000(0.0772) Grad: 0.2648  LR: 0.0000

Epoch 9 - avg_train_loss: 0.0792  avg_val_loss: 0.7247  time: 327s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0792  avg_val_loss: 0.7247  time: 327s
Epoch 9 - Score: 0.5466
INFO:__main__:Epoch 9 - Score: 0.5466
Epoch 9 - Save Best Score: 0.5466 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.5466 Model


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 2.4683(0.7247) 
Epoch: [10][0/1201] Elapsed 0m 0s (remain 14m 50s) Loss: 0.2910(0.2910) Grad: nan  LR: 0.00000025  
Epoch: [10][100/1201] Elapsed 0m 26s (remain 4m 48s) Loss: 0.0003(0.0843) Grad: 2161.2400  LR: 0.00000021  
Epoch: [10][200/1201] Elapsed 0m 51s (remain 4m 15s) Loss: 0.0006(0.0697) Grad: 2928.8630  LR: 0.00000017  
Epoch: [10][300/1201] Elapsed 1m 15s (remain 3m 47s) Loss: 0.0013(0.0607) Grad: 6086.7925  LR: 0.00000014  
Epoch: [10][400/1201] Elapsed 1m 40s (remain 3m 21s) Loss: 0.5684(0.0591) Grad: 306330.9688  LR: 0.00000011  
Epoch: [10][500/1201] Elapsed 2m 5s (remain 2m 55s) Loss: 0.0199(0.0640) Grad: 88499.5312  LR: 0.00000009  
Epoch: [10][600/1201] Elapsed 2m 29s (remain 2m 29s) Loss: 0.0001(0.0638) Grad: 497.0710  LR: 0.00000006  
Epoch: [10][700/1201] Elapsed 2m 54s (remain 2m 4s) Loss: 0.2227(0.0639) Grad: 290058.5000  LR: 0.00000004  
Epoch: [10][800/1201] Elapsed 3m 19s (remain 1m 39s) Loss: 0.0000(0.0618) 

Epoch 10 - avg_train_loss: 0.0669  avg_val_loss: 0.7341  time: 326s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0669  avg_val_loss: 0.7341  time: 326s
Epoch 10 - Score: 0.5460
INFO:__main__:Epoch 10 - Score: 0.5460
Epoch 10 - Save Best Score: 0.5460 Model
INFO:__main__:Epoch 10 - Save Best Score: 0.5460 Model


EVAL: [300/301] Elapsed 0m 28s (remain 0m 0s) Loss: 2.4078(0.7341) 


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1201] Elapsed 0m 0s (remain 9m 56s) Loss: 11.1172(11.1172) Grad: nan  LR: 0.00000020  
Epoch: [1][100/1201] Elapsed 0m 25s (remain 4m 40s) Loss: 3.4609(4.6316) Grad: 198090.6406  LR: 0.00001000  
Epoch: [1][200/1201] Elapsed 0m 50s (remain 4m 10s) Loss: 3.4219(3.7967) Grad: 206582.4688  LR: 0.00001000  
Epoch: [1][300/1201] Elapsed 1m 15s (remain 3m 44s) Loss: 2.7793(3.3356) Grad: 102079.4453  LR: 0.00000999  
Epoch: [1][400/1201] Elapsed 1m 39s (remain 3m 18s) Loss: 0.7734(2.9309) Grad: 72865.1719  LR: 0.00000998  
Epoch: [1][500/1201] Elapsed 2m 3s (remain 2m 52s) Loss: 1.0469(2.6236) Grad: 102452.7188  LR: 0.00000996  
Epoch: [1][600/1201] Elapsed 2m 28s (remain 2m 28s) Loss: 0.4407(2.3453) Grad: 81461.7422  LR: 0.00000995  
Epoch: [1][700/1201] Elapsed 2m 52s (remain 2m 3s) Loss: 2.2441(2.1478) Grad: 107395.9609  LR: 0.00000993  
Epoch: [1][800/1201] Elapsed 3m 17s (remain 1m 38s) Loss: 0.9507(1.9901) Grad: 157538.9688  LR: 0.00000990  
Epoch: [1][900/1201] Elapsed 3m 

Epoch 1 - avg_train_loss: 1.5622  avg_val_loss: 0.5916  time: 323s
INFO:__main__:Epoch 1 - avg_train_loss: 1.5622  avg_val_loss: 0.5916  time: 323s
Epoch 1 - Score: 0.6752
INFO:__main__:Epoch 1 - Score: 0.6752
Epoch 1 - Save Best Score: 0.6752 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6752 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.7597(0.5916) 
Epoch: [2][0/1201] Elapsed 0m 0s (remain 10m 41s) Loss: 0.1072(0.1072) Grad: nan  LR: 0.00000977  
Epoch: [2][100/1201] Elapsed 0m 25s (remain 4m 37s) Loss: 2.3027(0.5501) Grad: 480727.4375  LR: 0.00000973  
Epoch: [2][200/1201] Elapsed 0m 51s (remain 4m 13s) Loss: 0.4905(0.4920) Grad: 271353.0312  LR: 0.00000969  
Epoch: [2][300/1201] Elapsed 1m 15s (remain 3m 45s) Loss: 1.1992(0.4741) Grad: 503776.0625  LR: 0.00000964  
Epoch: [2][400/1201] Elapsed 1m 39s (remain 3m 19s) Loss: 0.3777(0.4650) Grad: 207479.3750  LR: 0.00000959  
Epoch: [2][500/1201] Elapsed 2m 4s (remain 2m 53s) Loss: 0.7402(0.4818) Grad: 294348.5312  LR: 0.00000954  
Epoch: [2][600/1201] Elapsed 2m 28s (remain 2m 28s) Loss: 0.2673(0.4717) Grad: 190192.0156  LR: 0.00000948  
Epoch: [2][700/1201] Elapsed 2m 52s (remain 2m 3s) Loss: 1.0830(0.4613) Grad: 379793.4375  LR: 0.00000942  
Epoch: [2][800/1201] Elapsed 3m 16s (remain 1m 38s) Loss: 0.1248(0.4569)

Epoch 2 - avg_train_loss: 0.4501  avg_val_loss: 0.5188  time: 322s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4501  avg_val_loss: 0.5188  time: 322s
Epoch 2 - Score: 0.5317
INFO:__main__:Epoch 2 - Score: 0.5317
Epoch 2 - Save Best Score: 0.5317 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5317 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 1.6997(0.5188) 
Epoch: [3][0/1201] Elapsed 0m 0s (remain 10m 46s) Loss: 0.0065(0.0065) Grad: nan  LR: 0.00000908  
Epoch: [3][100/1201] Elapsed 0m 25s (remain 4m 42s) Loss: 0.1415(0.3215) Grad: 757835.8750  LR: 0.00000900  
Epoch: [3][200/1201] Elapsed 0m 50s (remain 4m 12s) Loss: 0.1893(0.3119) Grad: 533933.3750  LR: 0.00000892  
Epoch: [3][300/1201] Elapsed 1m 14s (remain 3m 43s) Loss: 0.3010(0.3297) Grad: 515401.9062  LR: 0.00000884  
Epoch: [3][400/1201] Elapsed 1m 39s (remain 3m 18s) Loss: 0.1183(0.3265) Grad: 220700.8906  LR: 0.00000875  
Epoch: [3][500/1201] Elapsed 2m 3s (remain 2m 52s) Loss: 0.5938(0.3259) Grad: 228976.0000  LR: 0.00000866  
Epoch: [3][600/1201] Elapsed 2m 28s (remain 2m 28s) Loss: 0.0113(0.3310) Grad: 31827.9844  LR: 0.00000857  
Epoch: [3][700/1201] Elapsed 2m 52s (remain 2m 3s) Loss: 1.4443(0.3420) Grad: 451194.7188  LR: 0.00000848  
Epoch: [3][800/1201] Elapsed 3m 17s (remain 1m 38s) Loss: 0.5181(0.3455) 

Epoch 3 - avg_train_loss: 0.3448  avg_val_loss: 0.5527  time: 322s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3448  avg_val_loss: 0.5527  time: 322s
Epoch 3 - Score: 0.4494
INFO:__main__:Epoch 3 - Score: 0.4494
Epoch 3 - Save Best Score: 0.4494 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4494 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0126(0.5527) 
Epoch: [4][0/1201] Elapsed 0m 0s (remain 10m 33s) Loss: 0.3452(0.3452) Grad: nan  LR: 0.00000798  
Epoch: [4][100/1201] Elapsed 0m 25s (remain 4m 40s) Loss: 0.1257(0.2804) Grad: 393508.0938  LR: 0.00000787  
Epoch: [4][200/1201] Elapsed 0m 50s (remain 4m 11s) Loss: 0.0430(0.2645) Grad: 265023.5938  LR: 0.00000776  
Epoch: [4][300/1201] Elapsed 1m 15s (remain 3m 44s) Loss: 0.5703(0.2841) Grad: 733690.2500  LR: 0.00000765  
Epoch: [4][400/1201] Elapsed 1m 38s (remain 3m 17s) Loss: 0.0009(0.2687) Grad: 6750.3599  LR: 0.00000754  
Epoch: [4][500/1201] Elapsed 2m 3s (remain 2m 51s) Loss: 0.0898(0.2760) Grad: 264768.7188  LR: 0.00000743  
Epoch: [4][600/1201] Elapsed 2m 27s (remain 2m 27s) Loss: 0.3430(0.2807) Grad: 437212.1250  LR: 0.00000731  
Epoch: [4][700/1201] Elapsed 2m 51s (remain 2m 2s) Loss: 0.6611(0.2868) Grad: 636405.2500  LR: 0.00000719  
Epoch: [4][800/1201] Elapsed 3m 15s (remain 1m 37s) Loss: 0.4534(0.2850) G

Epoch 4 - avg_train_loss: 0.2968  avg_val_loss: 0.5357  time: 319s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2968  avg_val_loss: 0.5357  time: 319s
Epoch 4 - Score: 0.4214
INFO:__main__:Epoch 4 - Score: 0.4214
Epoch 4 - Save Best Score: 0.4214 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4214 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0652(0.5357) 
Epoch: [5][0/1201] Elapsed 0m 0s (remain 10m 20s) Loss: 0.1015(0.1015) Grad: nan  LR: 0.00000658  
Epoch: [5][100/1201] Elapsed 0m 25s (remain 4m 33s) Loss: 0.0047(0.2163) Grad: 29731.9746  LR: 0.00000646  
Epoch: [5][200/1201] Elapsed 0m 50s (remain 4m 9s) Loss: 0.0192(0.2199) Grad: 108381.8828  LR: 0.00000633  
Epoch: [5][300/1201] Elapsed 1m 14s (remain 3m 42s) Loss: 0.2400(0.2430) Grad: 637335.1875  LR: 0.00000621  
Epoch: [5][400/1201] Elapsed 1m 38s (remain 3m 17s) Loss: 0.4380(0.2505) Grad: 538393.5625  LR: 0.00000608  
Epoch: [5][500/1201] Elapsed 2m 2s (remain 2m 51s) Loss: 1.1191(0.2675) Grad: 260404.1562  LR: 0.00000595  
Epoch: [5][600/1201] Elapsed 2m 27s (remain 2m 26s) Loss: 0.6646(0.2757) Grad: 303304.5938  LR: 0.00000582  
Epoch: [5][700/1201] Elapsed 2m 51s (remain 2m 2s) Loss: 0.0002(0.2791) Grad: 603.0012  LR: 0.00000569  
Epoch: [5][800/1201] Elapsed 3m 16s (remain 1m 37s) Loss: 0.0064(0.2846) Grad

Epoch 5 - avg_train_loss: 0.2862  avg_val_loss: 0.6447  time: 321s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2862  avg_val_loss: 0.6447  time: 321s
Epoch 5 - Score: 0.3985
INFO:__main__:Epoch 5 - Score: 0.3985
Epoch 5 - Save Best Score: 0.3985 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.3985 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0031(0.6447) 
Epoch: [6][0/1201] Elapsed 0m 0s (remain 10m 13s) Loss: 0.0000(0.0000) Grad: nan  LR: 0.00000504  
Epoch: [6][100/1201] Elapsed 0m 25s (remain 4m 38s) Loss: 0.0052(0.2140) Grad: 28092.7109  LR: 0.00000490  
Epoch: [6][200/1201] Elapsed 0m 50s (remain 4m 10s) Loss: 0.0019(0.2055) Grad: 24164.8145  LR: 0.00000477  
Epoch: [6][300/1201] Elapsed 1m 14s (remain 3m 42s) Loss: 0.0043(0.1928) Grad: 13482.5176  LR: 0.00000464  
Epoch: [6][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.0002(0.2034) Grad: 575.2208  LR: 0.00000451  
Epoch: [6][500/1201] Elapsed 2m 2s (remain 2m 50s) Loss: 0.1499(0.2084) Grad: 314790.2188  LR: 0.00000438  
Epoch: [6][600/1201] Elapsed 2m 26s (remain 2m 25s) Loss: 0.0073(0.2133) Grad: 28495.7930  LR: 0.00000425  
Epoch: [6][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.1989(0.2172) Grad: 220862.0938  LR: 0.00000412  
Epoch: [6][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.2258(0.2157) Grad: 

Epoch 6 - avg_train_loss: 0.2126  avg_val_loss: 0.6034  time: 320s
INFO:__main__:Epoch 6 - avg_train_loss: 0.2126  avg_val_loss: 0.6034  time: 320s
Epoch 6 - Score: 0.4271
INFO:__main__:Epoch 6 - Score: 0.4271


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.2066(0.6034) 
Epoch: [7][0/1201] Elapsed 0m 0s (remain 10m 19s) Loss: 0.0727(0.0727) Grad: nan  LR: 0.00000348  
Epoch: [7][100/1201] Elapsed 0m 25s (remain 4m 32s) Loss: 0.2196(0.1549) Grad: 498664.2188  LR: 0.00000336  
Epoch: [7][200/1201] Elapsed 0m 49s (remain 4m 6s) Loss: 0.0001(0.1474) Grad: 262.0700  LR: 0.00000324  
Epoch: [7][300/1201] Elapsed 1m 14s (remain 3m 41s) Loss: 0.0076(0.1460) Grad: 47089.2852  LR: 0.00000311  
Epoch: [7][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.2496(0.1496) Grad: 293534.5625  LR: 0.00000299  
Epoch: [7][500/1201] Elapsed 2m 2s (remain 2m 51s) Loss: 0.7168(0.1591) Grad: 2436960.5000  LR: 0.00000287  
Epoch: [7][600/1201] Elapsed 2m 26s (remain 2m 26s) Loss: 0.0920(0.1575) Grad: 252119.7812  LR: 0.00000276  
Epoch: [7][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.0000(0.1615) Grad: 4.8102  LR: 0.00000264  
Epoch: [7][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.0001(0.1615) Grad: 44

Epoch 7 - avg_train_loss: 0.1550  avg_val_loss: 0.7159  time: 320s
INFO:__main__:Epoch 7 - avg_train_loss: 0.1550  avg_val_loss: 0.7159  time: 320s
Epoch 7 - Score: 0.3980
INFO:__main__:Epoch 7 - Score: 0.3980
Epoch 7 - Save Best Score: 0.3980 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.3980 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0456(0.7159) 
Epoch: [8][0/1201] Elapsed 0m 0s (remain 11m 7s) Loss: 0.3069(0.3069) Grad: nan  LR: 0.00000208  
Epoch: [8][100/1201] Elapsed 0m 25s (remain 4m 40s) Loss: 0.7163(0.1648) Grad: 496504.1875  LR: 0.00000198  
Epoch: [8][200/1201] Elapsed 0m 50s (remain 4m 10s) Loss: 0.0132(0.1443) Grad: 99668.7344  LR: 0.00000187  
Epoch: [8][300/1201] Elapsed 1m 14s (remain 3m 43s) Loss: 0.0005(0.1409) Grad: 3842.8491  LR: 0.00000177  
Epoch: [8][400/1201] Elapsed 1m 39s (remain 3m 17s) Loss: 0.0000(0.1295) Grad: 6.0489  LR: 0.00000167  
Epoch: [8][500/1201] Elapsed 2m 3s (remain 2m 52s) Loss: 0.0001(0.1236) Grad: 1179.8785  LR: 0.00000157  
Epoch: [8][600/1201] Elapsed 2m 27s (remain 2m 27s) Loss: 0.0000(0.1230) Grad: 31.0208  LR: 0.00000148  
Epoch: [8][700/1201] Elapsed 2m 51s (remain 2m 2s) Loss: 0.0000(0.1218) Grad: 0.9770  LR: 0.00000139  
Epoch: [8][800/1201] Elapsed 3m 15s (remain 1m 37s) Loss: 0.1398(0.1228) Grad: 255098.2812  

Epoch 8 - avg_train_loss: 0.1222  avg_val_loss: 0.7274  time: 321s
INFO:__main__:Epoch 8 - avg_train_loss: 0.1222  avg_val_loss: 0.7274  time: 321s
Epoch 8 - Score: 0.3811
INFO:__main__:Epoch 8 - Score: 0.3811
Epoch 8 - Save Best Score: 0.3811 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.3811 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0349(0.7274) 
Epoch: [9][0/1201] Elapsed 0m 0s (remain 10m 27s) Loss: 0.1892(0.1892) Grad: nan  LR: 0.00000097  
Epoch: [9][100/1201] Elapsed 0m 25s (remain 4m 35s) Loss: 0.0006(0.0876) Grad: 3669.3745  LR: 0.00000089  
Epoch: [9][200/1201] Elapsed 0m 50s (remain 4m 10s) Loss: 0.0000(0.0911) Grad: 23.7869  LR: 0.00000082  
Epoch: [9][300/1201] Elapsed 1m 14s (remain 3m 42s) Loss: 0.0000(0.1039) Grad: 3.4643  LR: 0.00000075  
Epoch: [9][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.0370(0.0931) Grad: 271068.0000  LR: 0.00000068  
Epoch: [9][500/1201] Elapsed 2m 2s (remain 2m 50s) Loss: 0.2766(0.0911) Grad: inf  LR: 0.00000061  
Epoch: [9][600/1201] Elapsed 2m 26s (remain 2m 26s) Loss: 0.1018(0.0947) Grad: 208409.4375  LR: 0.00000055  
Epoch: [9][700/1201] Elapsed 2m 49s (remain 2m 1s) Loss: 0.0271(0.0993) Grad: 92871.8359  LR: 0.00000049  
Epoch: [9][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.0003(0.1019) Grad: 949.7521  LR:

Epoch 9 - avg_train_loss: 0.0989  avg_val_loss: 0.7549  time: 317s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0989  avg_val_loss: 0.7549  time: 317s
Epoch 9 - Score: 0.3689
INFO:__main__:Epoch 9 - Score: 0.3689
Epoch 9 - Save Best Score: 0.3689 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.3689 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0002(0.7549) 
Epoch: [10][0/1201] Elapsed 0m 0s (remain 10m 27s) Loss: 0.0609(0.0609) Grad: nan  LR: 0.00000025  
Epoch: [10][100/1201] Elapsed 0m 24s (remain 4m 26s) Loss: 0.4165(0.0870) Grad: 742170.7500  LR: 0.00000021  
Epoch: [10][200/1201] Elapsed 0m 48s (remain 4m 3s) Loss: 0.0000(0.0901) Grad: 5.3506  LR: 0.00000017  
Epoch: [10][300/1201] Elapsed 1m 12s (remain 3m 37s) Loss: 0.0000(0.0830) Grad: 10.4633  LR: 0.00000014  
Epoch: [10][400/1201] Elapsed 1m 36s (remain 3m 12s) Loss: 0.0000(0.0797) Grad: 68.4723  LR: 0.00000011  
Epoch: [10][500/1201] Elapsed 2m 0s (remain 2m 48s) Loss: 0.7500(0.0829) Grad: 256838.2812  LR: 0.00000009  
Epoch: [10][600/1201] Elapsed 2m 24s (remain 2m 24s) Loss: 0.0000(0.0813) Grad: 17.3142  LR: 0.00000006  
Epoch: [10][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.0001(0.0780) Grad: 222.7366  LR: 0.00000004  
Epoch: [10][800/1201] Elapsed 3m 12s (remain 1m 36s) Loss: 0.0219(0.0803) Grad: 70983

Epoch 10 - avg_train_loss: 0.0820  avg_val_loss: 0.7590  time: 318s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0820  avg_val_loss: 0.7590  time: 318s
Epoch 10 - Score: 0.3676
INFO:__main__:Epoch 10 - Score: 0.3676
Epoch 10 - Save Best Score: 0.3676 Model
INFO:__main__:Epoch 10 - Save Best Score: 0.3676 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0005(0.7590) 


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1201] Elapsed 0m 0s (remain 10m 30s) Loss: 6.4414(6.4414) Grad: nan  LR: 0.00000020  
Epoch: [1][100/1201] Elapsed 0m 25s (remain 4m 41s) Loss: 4.1250(4.0645) Grad: 244029.8906  LR: 0.00001000  
Epoch: [1][200/1201] Elapsed 0m 50s (remain 4m 9s) Loss: 3.1035(3.5555) Grad: 177356.9688  LR: 0.00001000  
Epoch: [1][300/1201] Elapsed 1m 13s (remain 3m 40s) Loss: 1.5361(3.1811) Grad: 218131.0156  LR: 0.00000999  
Epoch: [1][400/1201] Elapsed 1m 37s (remain 3m 15s) Loss: 1.4512(2.9189) Grad: 135572.4219  LR: 0.00000998  
Epoch: [1][500/1201] Elapsed 2m 1s (remain 2m 50s) Loss: 1.4580(2.6298) Grad: 134618.0938  LR: 0.00000996  
Epoch: [1][600/1201] Elapsed 2m 25s (remain 2m 25s) Loss: 0.8364(2.3867) Grad: 77341.7734  LR: 0.00000995  
Epoch: [1][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.3855(2.2033) Grad: 77190.5859  LR: 0.00000993  
Epoch: [1][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.2615(2.0354) Grad: 49574.6016  LR: 0.00000990  
Epoch: [1][900/1201] Elapsed 3m 38s

Epoch 1 - avg_train_loss: 1.5927  avg_val_loss: 0.5009  time: 318s
INFO:__main__:Epoch 1 - avg_train_loss: 1.5927  avg_val_loss: 0.5009  time: 318s
Epoch 1 - Score: 1.1978
INFO:__main__:Epoch 1 - Score: 1.1978


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0243(0.5009) 
Epoch: [2][0/1201] Elapsed 0m 0s (remain 10m 1s) Loss: 1.0391(1.0391) Grad: nan  LR: 0.00000977  
Epoch: [2][100/1201] Elapsed 0m 24s (remain 4m 30s) Loss: 0.9551(0.4270) Grad: 695761.2500  LR: 0.00000973  
Epoch: [2][200/1201] Elapsed 0m 49s (remain 4m 5s) Loss: 0.4556(0.4327) Grad: 555203.8125  LR: 0.00000969  
Epoch: [2][300/1201] Elapsed 1m 13s (remain 3m 39s) Loss: 0.6685(0.4331) Grad: 436074.7812  LR: 0.00000964  
Epoch: [2][400/1201] Elapsed 1m 37s (remain 3m 14s) Loss: 0.5708(0.4259) Grad: 797101.3125  LR: 0.00000959  
Epoch: [2][500/1201] Elapsed 2m 1s (remain 2m 50s) Loss: 0.0403(0.4256) Grad: 135338.5000  LR: 0.00000954  
Epoch: [2][600/1201] Elapsed 2m 26s (remain 2m 25s) Loss: 0.1934(0.4307) Grad: 181267.3594  LR: 0.00000948  
Epoch: [2][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.0112(0.4290) Grad: 26810.3906  LR: 0.00000942  
Epoch: [2][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.0003(0.4214) Gr

Epoch 2 - avg_train_loss: 0.4317  avg_val_loss: 0.4258  time: 320s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4317  avg_val_loss: 0.4258  time: 320s
Epoch 2 - Score: 1.1528
INFO:__main__:Epoch 2 - Score: 1.1528


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1688(0.4258) 
Epoch: [3][0/1201] Elapsed 0m 0s (remain 10m 8s) Loss: 1.3867(1.3867) Grad: nan  LR: 0.00000908  
Epoch: [3][100/1201] Elapsed 0m 25s (remain 4m 32s) Loss: 0.8779(0.3310) Grad: 594708.7500  LR: 0.00000900  
Epoch: [3][200/1201] Elapsed 0m 49s (remain 4m 5s) Loss: 0.0002(0.3302) Grad: 628.5917  LR: 0.00000892  
Epoch: [3][300/1201] Elapsed 1m 13s (remain 3m 41s) Loss: 0.8013(0.3221) Grad: 786571.5000  LR: 0.00000884  
Epoch: [3][400/1201] Elapsed 1m 38s (remain 3m 15s) Loss: 0.3186(0.3329) Grad: 633129.7500  LR: 0.00000875  
Epoch: [3][500/1201] Elapsed 2m 2s (remain 2m 50s) Loss: 0.5308(0.3392) Grad: 565181.7500  LR: 0.00000866  
Epoch: [3][600/1201] Elapsed 2m 26s (remain 2m 26s) Loss: 0.0224(0.3352) Grad: 145876.0469  LR: 0.00000857  
Epoch: [3][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.3752(0.3341) Grad: 581678.9375  LR: 0.00000848  
Epoch: [3][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.3955(0.3316) Grad

Epoch 3 - avg_train_loss: 0.3515  avg_val_loss: 0.4468  time: 319s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3515  avg_val_loss: 0.4468  time: 319s
Epoch 3 - Score: 1.0069
INFO:__main__:Epoch 3 - Score: 1.0069


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 1.6246(0.4468) 
Epoch: [4][0/1201] Elapsed 0m 0s (remain 10m 48s) Loss: 0.4768(0.4768) Grad: nan  LR: 0.00000798  
Epoch: [4][100/1201] Elapsed 0m 24s (remain 4m 28s) Loss: 0.3914(0.3068) Grad: 392908.0938  LR: 0.00000787  
Epoch: [4][200/1201] Elapsed 0m 48s (remain 4m 3s) Loss: 0.0043(0.2925) Grad: 32419.9023  LR: 0.00000776  
Epoch: [4][300/1201] Elapsed 1m 12s (remain 3m 37s) Loss: 0.4592(0.2917) Grad: 356770.7812  LR: 0.00000765  
Epoch: [4][400/1201] Elapsed 1m 37s (remain 3m 13s) Loss: 0.1002(0.2983) Grad: 181066.3125  LR: 0.00000754  
Epoch: [4][500/1201] Elapsed 2m 0s (remain 2m 49s) Loss: 0.1271(0.3016) Grad: 200850.6562  LR: 0.00000743  
Epoch: [4][600/1201] Elapsed 2m 25s (remain 2m 25s) Loss: 1.0391(0.3032) Grad: 258553.9688  LR: 0.00000731  
Epoch: [4][700/1201] Elapsed 2m 49s (remain 2m 0s) Loss: 0.6523(0.3089) Grad: 354523.1562  LR: 0.00000719  
Epoch: [4][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.8428(0.3100) G

Epoch 4 - avg_train_loss: 0.3297  avg_val_loss: 0.4615  time: 316s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3297  avg_val_loss: 0.4615  time: 316s
Epoch 4 - Score: 0.9228
INFO:__main__:Epoch 4 - Score: 0.9228
Epoch 4 - Save Best Score: 0.9228 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.9228 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.3450(0.4615) 
Epoch: [5][0/1201] Elapsed 0m 0s (remain 10m 10s) Loss: 0.1423(0.1423) Grad: nan  LR: 0.00000658  
Epoch: [5][100/1201] Elapsed 0m 25s (remain 4m 35s) Loss: 0.4688(0.2292) Grad: 511903.6562  LR: 0.00000646  
Epoch: [5][200/1201] Elapsed 0m 50s (remain 4m 9s) Loss: 0.4495(0.2410) Grad: 523509.8125  LR: 0.00000633  
Epoch: [5][300/1201] Elapsed 1m 14s (remain 3m 41s) Loss: 0.3621(0.2296) Grad: 494915.1875  LR: 0.00000621  
Epoch: [5][400/1201] Elapsed 1m 38s (remain 3m 15s) Loss: 0.5010(0.2337) Grad: 360729.3750  LR: 0.00000608  
Epoch: [5][500/1201] Elapsed 2m 2s (remain 2m 50s) Loss: 0.0347(0.2427) Grad: 85433.5312  LR: 0.00000595  
Epoch: [5][600/1201] Elapsed 2m 26s (remain 2m 26s) Loss: 0.4221(0.2516) Grad: 222792.7656  LR: 0.00000582  
Epoch: [5][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.0006(0.2624) Grad: 861.9755  LR: 0.00000569  
Epoch: [5][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.0239(0.2673) Grad

Epoch 5 - avg_train_loss: 0.2935  avg_val_loss: 0.6137  time: 319s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2935  avg_val_loss: 0.6137  time: 319s
Epoch 5 - Score: 0.9271
INFO:__main__:Epoch 5 - Score: 0.9271


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0529(0.6137) 
Epoch: [6][0/1201] Elapsed 0m 0s (remain 10m 40s) Loss: 0.4053(0.4053) Grad: nan  LR: 0.00000504  
Epoch: [6][100/1201] Elapsed 0m 24s (remain 4m 26s) Loss: 0.3767(0.2674) Grad: 532613.1250  LR: 0.00000490  
Epoch: [6][200/1201] Elapsed 0m 48s (remain 4m 2s) Loss: 0.0112(0.2404) Grad: 62921.8477  LR: 0.00000477  
Epoch: [6][300/1201] Elapsed 1m 13s (remain 3m 38s) Loss: 0.0007(0.2358) Grad: 1481.5166  LR: 0.00000464  
Epoch: [6][400/1201] Elapsed 1m 37s (remain 3m 13s) Loss: 0.1228(0.2297) Grad: 174808.5000  LR: 0.00000451  
Epoch: [6][500/1201] Elapsed 2m 0s (remain 2m 48s) Loss: 0.1439(0.2267) Grad: 98947.0781  LR: 0.00000438  
Epoch: [6][600/1201] Elapsed 2m 25s (remain 2m 25s) Loss: 0.0061(0.2228) Grad: 10190.2275  LR: 0.00000425  
Epoch: [6][700/1201] Elapsed 2m 49s (remain 2m 1s) Loss: 0.0061(0.2221) Grad: 11641.7129  LR: 0.00000412  
Epoch: [6][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.0000(0.2232) Grad: 

Epoch 6 - avg_train_loss: 0.2153  avg_val_loss: 0.5954  time: 318s
INFO:__main__:Epoch 6 - avg_train_loss: 0.2153  avg_val_loss: 0.5954  time: 318s
Epoch 6 - Score: 0.6927
INFO:__main__:Epoch 6 - Score: 0.6927
Epoch 6 - Save Best Score: 0.6927 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.6927 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0060(0.5954) 
Epoch: [7][0/1201] Elapsed 0m 0s (remain 10m 12s) Loss: 0.6533(0.6533) Grad: nan  LR: 0.00000348  
Epoch: [7][100/1201] Elapsed 0m 24s (remain 4m 31s) Loss: 0.0006(0.1446) Grad: 3284.9390  LR: 0.00000336  
Epoch: [7][200/1201] Elapsed 0m 50s (remain 4m 8s) Loss: 0.6011(0.1343) Grad: 688679.0000  LR: 0.00000324  
Epoch: [7][300/1201] Elapsed 1m 13s (remain 3m 41s) Loss: 0.0988(0.1455) Grad: 454462.3438  LR: 0.00000311  
Epoch: [7][400/1201] Elapsed 1m 37s (remain 3m 15s) Loss: 0.0492(0.1467) Grad: 242287.4375  LR: 0.00000299  
Epoch: [7][500/1201] Elapsed 2m 1s (remain 2m 50s) Loss: 0.1912(0.1412) Grad: 572822.8125  LR: 0.00000287  
Epoch: [7][600/1201] Elapsed 2m 25s (remain 2m 25s) Loss: 0.0000(0.1418) Grad: 190.4690  LR: 0.00000276  
Epoch: [7][700/1201] Elapsed 2m 49s (remain 2m 1s) Loss: 0.0347(0.1450) Grad: 355014.0625  LR: 0.00000264  
Epoch: [7][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.0000(0.1471) Grad:

Epoch 7 - avg_train_loss: 0.1466  avg_val_loss: 0.6678  time: 318s
INFO:__main__:Epoch 7 - avg_train_loss: 0.1466  avg_val_loss: 0.6678  time: 318s
Epoch 7 - Score: 0.7235
INFO:__main__:Epoch 7 - Score: 0.7235


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0180(0.6678) 
Epoch: [8][0/1201] Elapsed 0m 0s (remain 10m 6s) Loss: 0.6021(0.6021) Grad: nan  LR: 0.00000208  
Epoch: [8][100/1201] Elapsed 0m 24s (remain 4m 23s) Loss: 0.0621(0.1344) Grad: 345864.9062  LR: 0.00000198  
Epoch: [8][200/1201] Elapsed 0m 47s (remain 3m 58s) Loss: 0.6592(0.1137) Grad: 578746.9375  LR: 0.00000187  
Epoch: [8][300/1201] Elapsed 1m 12s (remain 3m 37s) Loss: 0.0000(0.1198) Grad: 228.1860  LR: 0.00000177  
Epoch: [8][400/1201] Elapsed 1m 36s (remain 3m 12s) Loss: 0.0000(0.1217) Grad: 0.6769  LR: 0.00000167  
Epoch: [8][500/1201] Elapsed 2m 0s (remain 2m 48s) Loss: 0.0000(0.1241) Grad: 39.1546  LR: 0.00000157  
Epoch: [8][600/1201] Elapsed 2m 24s (remain 2m 24s) Loss: 0.0009(0.1174) Grad: 6103.0845  LR: 0.00000148  
Epoch: [8][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.1089(0.1158) Grad: 361963.0000  LR: 0.00000139  
Epoch: [8][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.0064(0.1208) Grad: 40722.12

Epoch 8 - avg_train_loss: 0.1222  avg_val_loss: 0.7165  time: 317s
INFO:__main__:Epoch 8 - avg_train_loss: 0.1222  avg_val_loss: 0.7165  time: 317s
Epoch 8 - Score: 0.6340
INFO:__main__:Epoch 8 - Score: 0.6340
Epoch 8 - Save Best Score: 0.6340 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.6340 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0023(0.7165) 
Epoch: [9][0/1201] Elapsed 0m 0s (remain 10m 26s) Loss: 0.0250(0.0250) Grad: nan  LR: 0.00000097  
Epoch: [9][100/1201] Elapsed 0m 25s (remain 4m 37s) Loss: 0.0000(0.0992) Grad: 41.1889  LR: 0.00000089  
Epoch: [9][200/1201] Elapsed 0m 49s (remain 4m 6s) Loss: 0.2466(0.0971) Grad: 638944.9375  LR: 0.00000082  
Epoch: [9][300/1201] Elapsed 1m 13s (remain 3m 38s) Loss: 0.0000(0.0908) Grad: 51.5275  LR: 0.00000075  
Epoch: [9][400/1201] Elapsed 1m 37s (remain 3m 13s) Loss: 0.1071(0.0943) Grad: 368606.6562  LR: 0.00000068  
Epoch: [9][500/1201] Elapsed 2m 0s (remain 2m 48s) Loss: 0.0041(0.1027) Grad: 38232.9453  LR: 0.00000061  
Epoch: [9][600/1201] Elapsed 2m 24s (remain 2m 24s) Loss: 0.0002(0.1044) Grad: 3503.2058  LR: 0.00000055  
Epoch: [9][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.0000(0.1046) Grad: 9.9605  LR: 0.00000049  
Epoch: [9][800/1201] Elapsed 3m 12s (remain 1m 36s) Loss: 0.0515(0.1029) Grad: 382431.406

Epoch 9 - avg_train_loss: 0.1053  avg_val_loss: 0.7147  time: 317s
INFO:__main__:Epoch 9 - avg_train_loss: 0.1053  avg_val_loss: 0.7147  time: 317s
Epoch 9 - Score: 0.6290
INFO:__main__:Epoch 9 - Score: 0.6290
Epoch 9 - Save Best Score: 0.6290 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.6290 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0012(0.7147) 
Epoch: [10][0/1201] Elapsed 0m 0s (remain 10m 30s) Loss: 0.0002(0.0002) Grad: nan  LR: 0.00000025  
Epoch: [10][100/1201] Elapsed 0m 25s (remain 4m 33s) Loss: 0.0000(0.1215) Grad: 124.5067  LR: 0.00000021  
Epoch: [10][200/1201] Elapsed 0m 50s (remain 4m 10s) Loss: 0.0950(0.1047) Grad: 524723.7500  LR: 0.00000017  
Epoch: [10][300/1201] Elapsed 1m 14s (remain 3m 43s) Loss: 0.0000(0.0991) Grad: 111.9821  LR: 0.00000014  
Epoch: [10][400/1201] Elapsed 1m 38s (remain 3m 17s) Loss: 0.0013(0.0921) Grad: 3066.0779  LR: 0.00000011  
Epoch: [10][500/1201] Elapsed 2m 3s (remain 2m 51s) Loss: 0.0116(0.0922) Grad: 21115.0059  LR: 0.00000009  
Epoch: [10][600/1201] Elapsed 2m 27s (remain 2m 27s) Loss: 0.0005(0.0950) Grad: 1499.9928  LR: 0.00000006  
Epoch: [10][700/1201] Elapsed 2m 51s (remain 2m 2s) Loss: 0.0023(0.0909) Grad: 5149.2852  LR: 0.00000004  
Epoch: [10][800/1201] Elapsed 3m 15s (remain 1m 37s) Loss: 0.0000(0.0906) Gra

Epoch 10 - avg_train_loss: 0.0972  avg_val_loss: 0.7161  time: 320s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0972  avg_val_loss: 0.7161  time: 320s
Epoch 10 - Score: 0.6263
INFO:__main__:Epoch 10 - Score: 0.6263
Epoch 10 - Save Best Score: 0.6263 Model
INFO:__main__:Epoch 10 - Save Best Score: 0.6263 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0010(0.7161) 


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1201] Elapsed 0m 0s (remain 11m 2s) Loss: 10.4531(10.4531) Grad: nan  LR: 0.00000020  
Epoch: [1][100/1201] Elapsed 0m 25s (remain 4m 38s) Loss: 3.0938(5.0090) Grad: 144125.3438  LR: 0.00001000  
Epoch: [1][200/1201] Elapsed 0m 49s (remain 4m 7s) Loss: 1.9941(3.9302) Grad: 120143.9297  LR: 0.00001000  
Epoch: [1][300/1201] Elapsed 1m 13s (remain 3m 39s) Loss: 2.2461(3.4590) Grad: 122260.5625  LR: 0.00000999  
Epoch: [1][400/1201] Elapsed 1m 37s (remain 3m 14s) Loss: 1.5625(3.0852) Grad: 192270.6406  LR: 0.00000998  
Epoch: [1][500/1201] Elapsed 2m 1s (remain 2m 50s) Loss: 0.8696(2.7774) Grad: 118042.6094  LR: 0.00000996  
Epoch: [1][600/1201] Elapsed 2m 25s (remain 2m 25s) Loss: 0.3069(2.5208) Grad: 76409.7109  LR: 0.00000995  
Epoch: [1][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.4382(2.2960) Grad: 73003.7422  LR: 0.00000993  
Epoch: [1][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.1128(2.1292) Grad: 39492.5078  LR: 0.00000990  
Epoch: [1][900/1201] Elapsed 3m 38

Epoch 1 - avg_train_loss: 1.6651  avg_val_loss: 0.5012  time: 319s
INFO:__main__:Epoch 1 - avg_train_loss: 1.6651  avg_val_loss: 0.5012  time: 319s
Epoch 1 - Score: 1.2237
INFO:__main__:Epoch 1 - Score: 1.2237


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.8310(0.5012) 
Epoch: [2][0/1201] Elapsed 0m 0s (remain 10m 50s) Loss: 0.1614(0.1614) Grad: nan  LR: 0.00000977  
Epoch: [2][100/1201] Elapsed 0m 24s (remain 4m 24s) Loss: 0.1932(0.5323) Grad: 337012.8438  LR: 0.00000973  
Epoch: [2][200/1201] Elapsed 0m 48s (remain 3m 58s) Loss: 0.1383(0.4854) Grad: 198076.4531  LR: 0.00000969  
Epoch: [2][300/1201] Elapsed 1m 11s (remain 3m 34s) Loss: 0.4231(0.4708) Grad: 289652.8438  LR: 0.00000964  
Epoch: [2][400/1201] Elapsed 1m 35s (remain 3m 10s) Loss: 0.0077(0.4673) Grad: 24561.6387  LR: 0.00000959  
Epoch: [2][500/1201] Elapsed 1m 59s (remain 2m 46s) Loss: 0.4165(0.4648) Grad: 334804.7812  LR: 0.00000954  
Epoch: [2][600/1201] Elapsed 2m 23s (remain 2m 23s) Loss: 0.2031(0.4604) Grad: 206426.2500  LR: 0.00000948  
Epoch: [2][700/1201] Elapsed 2m 47s (remain 1m 59s) Loss: 0.5737(0.4488) Grad: 486691.0938  LR: 0.00000942  
Epoch: [2][800/1201] Elapsed 3m 11s (remain 1m 35s) Loss: 0.0453(0.4491

Epoch 2 - avg_train_loss: 0.4369  avg_val_loss: 0.4241  time: 316s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4369  avg_val_loss: 0.4241  time: 316s
Epoch 2 - Score: 1.0476
INFO:__main__:Epoch 2 - Score: 1.0476


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 1.1323(0.4241) 
Epoch: [3][0/1201] Elapsed 0m 0s (remain 10m 17s) Loss: 0.1458(0.1458) Grad: nan  LR: 0.00000908  
Epoch: [3][100/1201] Elapsed 0m 24s (remain 4m 30s) Loss: 0.6382(0.2951) Grad: 316354.7500  LR: 0.00000900  
Epoch: [3][200/1201] Elapsed 0m 48s (remain 4m 3s) Loss: 1.1895(0.3571) Grad: 452810.4062  LR: 0.00000892  
Epoch: [3][300/1201] Elapsed 1m 13s (remain 3m 40s) Loss: 0.0870(0.3796) Grad: 111966.3047  LR: 0.00000883  
Epoch: [3][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.6240(0.3827) Grad: 116529.5234  LR: 0.00000875  
Epoch: [3][500/1201] Elapsed 2m 2s (remain 2m 51s) Loss: 1.5352(0.3983) Grad: 216072.2344  LR: 0.00000866  
Epoch: [3][600/1201] Elapsed 2m 27s (remain 2m 27s) Loss: 0.5049(0.3987) Grad: 132809.8906  LR: 0.00000857  
Epoch: [3][700/1201] Elapsed 2m 52s (remain 2m 2s) Loss: 0.3262(0.4067) Grad: 207420.6094  LR: 0.00000848  
Epoch: [3][800/1201] Elapsed 3m 17s (remain 1m 38s) Loss: 0.9062(0.4044) 

Epoch 3 - avg_train_loss: 0.4110  avg_val_loss: 0.5808  time: 324s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4110  avg_val_loss: 0.5808  time: 324s
Epoch 3 - Score: 1.0033
INFO:__main__:Epoch 3 - Score: 1.0033


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 1.4325(0.5808) 
Epoch: [4][0/1201] Elapsed 0m 0s (remain 10m 33s) Loss: 0.0507(0.0507) Grad: nan  LR: 0.00000798  
Epoch: [4][100/1201] Elapsed 0m 24s (remain 4m 31s) Loss: 1.0654(0.3638) Grad: 729181.8125  LR: 0.00000787  
Epoch: [4][200/1201] Elapsed 0m 49s (remain 4m 4s) Loss: 0.1497(0.3074) Grad: 526406.5000  LR: 0.00000776  
Epoch: [4][300/1201] Elapsed 1m 14s (remain 3m 41s) Loss: 0.0619(0.2910) Grad: 157119.4531  LR: 0.00000765  
Epoch: [4][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.0017(0.2934) Grad: 6649.6919  LR: 0.00000754  
Epoch: [4][500/1201] Elapsed 2m 3s (remain 2m 52s) Loss: 0.1700(0.2837) Grad: 229592.8906  LR: 0.00000742  
Epoch: [4][600/1201] Elapsed 2m 27s (remain 2m 27s) Loss: 0.5552(0.2882) Grad: 278112.5938  LR: 0.00000731  
Epoch: [4][700/1201] Elapsed 2m 52s (remain 2m 2s) Loss: 0.0000(0.2887) Grad: 121.8035  LR: 0.00000719  
Epoch: [4][800/1201] Elapsed 3m 17s (remain 1m 38s) Loss: 0.0012(0.2887) Grad:

Epoch 4 - avg_train_loss: 0.2892  avg_val_loss: 0.4621  time: 324s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2892  avg_val_loss: 0.4621  time: 324s
Epoch 4 - Score: 0.9984
INFO:__main__:Epoch 4 - Score: 0.9984
Epoch 4 - Save Best Score: 0.9984 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.9984 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 1.5305(0.4621) 
Epoch: [5][0/1201] Elapsed 0m 0s (remain 10m 34s) Loss: 0.0140(0.0140) Grad: nan  LR: 0.00000658  
Epoch: [5][100/1201] Elapsed 0m 26s (remain 4m 47s) Loss: 0.0054(0.2203) Grad: 32385.3516  LR: 0.00000646  
Epoch: [5][200/1201] Elapsed 0m 51s (remain 4m 16s) Loss: 0.6948(0.2178) Grad: 675376.3750  LR: 0.00000633  
Epoch: [5][300/1201] Elapsed 1m 15s (remain 3m 47s) Loss: 0.5103(0.2223) Grad: 508087.8750  LR: 0.00000620  
Epoch: [5][400/1201] Elapsed 1m 40s (remain 3m 20s) Loss: 0.0003(0.2151) Grad: 2056.0107  LR: 0.00000608  
Epoch: [5][500/1201] Elapsed 2m 5s (remain 2m 54s) Loss: 0.0262(0.2184) Grad: 193417.4375  LR: 0.00000595  
Epoch: [5][600/1201] Elapsed 2m 29s (remain 2m 29s) Loss: 0.6167(0.2148) Grad: 1408726.6250  LR: 0.00000582  
Epoch: [5][700/1201] Elapsed 2m 54s (remain 2m 4s) Loss: 0.0004(0.2114) Grad: 1759.0773  LR: 0.00000569  
Epoch: [5][800/1201] Elapsed 3m 18s (remain 1m 39s) Loss: 1.0654(0.2177) Gra

Epoch 5 - avg_train_loss: 0.2261  avg_val_loss: 0.5852  time: 324s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2261  avg_val_loss: 0.5852  time: 324s
Epoch 5 - Score: 0.9168
INFO:__main__:Epoch 5 - Score: 0.9168
Epoch 5 - Save Best Score: 0.9168 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.9168 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 1.4874(0.5852) 
Epoch: [6][0/1201] Elapsed 0m 0s (remain 10m 46s) Loss: 0.0001(0.0001) Grad: nan  LR: 0.00000503  
Epoch: [6][100/1201] Elapsed 0m 26s (remain 4m 44s) Loss: 0.0000(0.1345) Grad: 138.5698  LR: 0.00000490  
Epoch: [6][200/1201] Elapsed 0m 51s (remain 4m 14s) Loss: 0.1531(0.1603) Grad: 566069.5000  LR: 0.00000477  
Epoch: [6][300/1201] Elapsed 1m 15s (remain 3m 44s) Loss: 0.0013(0.1551) Grad: 8134.2212  LR: 0.00000464  
Epoch: [6][400/1201] Elapsed 1m 39s (remain 3m 18s) Loss: 0.0997(0.1582) Grad: 196408.3125  LR: 0.00000451  
Epoch: [6][500/1201] Elapsed 2m 3s (remain 2m 52s) Loss: 0.1379(0.1708) Grad: 221441.5781  LR: 0.00000438  
Epoch: [6][600/1201] Elapsed 2m 27s (remain 2m 27s) Loss: 0.0000(0.1767) Grad: 77.9641  LR: 0.00000425  
Epoch: [6][700/1201] Elapsed 2m 52s (remain 2m 2s) Loss: 0.0003(0.1776) Grad: 1132.6567  LR: 0.00000412  
Epoch: [6][800/1201] Elapsed 3m 16s (remain 1m 38s) Loss: 0.0314(0.1800) Grad: 1117

Epoch 6 - avg_train_loss: 0.1959  avg_val_loss: 0.5915  time: 321s
INFO:__main__:Epoch 6 - avg_train_loss: 0.1959  avg_val_loss: 0.5915  time: 321s
Epoch 6 - Score: 0.8837
INFO:__main__:Epoch 6 - Score: 0.8837
Epoch 6 - Save Best Score: 0.8837 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.8837 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 2.0685(0.5915) 
Epoch: [7][0/1201] Elapsed 0m 0s (remain 10m 28s) Loss: 0.0005(0.0005) Grad: nan  LR: 0.00000348  
Epoch: [7][100/1201] Elapsed 0m 26s (remain 4m 45s) Loss: 0.5840(0.1462) Grad: 526180.5625  LR: 0.00000336  
Epoch: [7][200/1201] Elapsed 0m 51s (remain 4m 14s) Loss: 0.0009(0.1404) Grad: 6438.3311  LR: 0.00000323  
Epoch: [7][300/1201] Elapsed 1m 15s (remain 3m 47s) Loss: 0.0474(0.1350) Grad: 283424.9375  LR: 0.00000311  
Epoch: [7][400/1201] Elapsed 1m 40s (remain 3m 20s) Loss: 0.0000(0.1314) Grad: 307.7647  LR: 0.00000299  
Epoch: [7][500/1201] Elapsed 2m 4s (remain 2m 53s) Loss: 0.0000(0.1332) Grad: 129.0074  LR: 0.00000287  
Epoch: [7][600/1201] Elapsed 2m 28s (remain 2m 28s) Loss: 0.0000(0.1351) Grad: 11.5863  LR: 0.00000275  
Epoch: [7][700/1201] Elapsed 2m 53s (remain 2m 3s) Loss: 0.0000(0.1341) Grad: 88.7914  LR: 0.00000263  
Epoch: [7][800/1201] Elapsed 3m 17s (remain 1m 38s) Loss: 0.0000(0.1304) Grad: 108.0008 

Epoch 7 - avg_train_loss: 0.1376  avg_val_loss: 0.6517  time: 324s
INFO:__main__:Epoch 7 - avg_train_loss: 0.1376  avg_val_loss: 0.6517  time: 324s
Epoch 7 - Score: 0.7878
INFO:__main__:Epoch 7 - Score: 0.7878
Epoch 7 - Save Best Score: 0.7878 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.7878 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 2.2353(0.6517) 
Epoch: [8][0/1201] Elapsed 0m 0s (remain 10m 42s) Loss: 0.0000(0.0000) Grad: nan  LR: 0.00000208  
Epoch: [8][100/1201] Elapsed 0m 26s (remain 4m 46s) Loss: 0.1113(0.1248) Grad: 914477.6250  LR: 0.00000197  
Epoch: [8][200/1201] Elapsed 0m 50s (remain 4m 13s) Loss: 0.0000(0.1202) Grad: 211.8895  LR: 0.00000187  
Epoch: [8][300/1201] Elapsed 1m 15s (remain 3m 44s) Loss: 0.0000(0.1173) Grad: 26.0221  LR: 0.00000177  
Epoch: [8][400/1201] Elapsed 1m 39s (remain 3m 18s) Loss: 0.0001(0.1139) Grad: 775.7560  LR: 0.00000167  
Epoch: [8][500/1201] Elapsed 2m 3s (remain 2m 52s) Loss: 1.3154(0.1163) Grad: 2659415.7500  LR: 0.00000157  
Epoch: [8][600/1201] Elapsed 2m 27s (remain 2m 27s) Loss: 0.0143(0.1158) Grad: 181044.1250  LR: 0.00000148  
Epoch: [8][700/1201] Elapsed 2m 51s (remain 2m 2s) Loss: 0.3379(0.1183) Grad: 589794.0625  LR: 0.00000139  
Epoch: [8][800/1201] Elapsed 3m 16s (remain 1m 38s) Loss: 0.0177(0.1181) Grad: 12

Epoch 8 - avg_train_loss: 0.1113  avg_val_loss: 0.6841  time: 321s
INFO:__main__:Epoch 8 - avg_train_loss: 0.1113  avg_val_loss: 0.6841  time: 321s
Epoch 8 - Score: 0.7451
INFO:__main__:Epoch 8 - Score: 0.7451
Epoch 8 - Save Best Score: 0.7451 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.7451 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 2.2964(0.6841) 
Epoch: [9][0/1201] Elapsed 0m 0s (remain 10m 30s) Loss: 0.0191(0.0191) Grad: nan  LR: 0.00000096  
Epoch: [9][100/1201] Elapsed 0m 25s (remain 4m 32s) Loss: 0.0188(0.0670) Grad: 292759.7500  LR: 0.00000089  
Epoch: [9][200/1201] Elapsed 0m 49s (remain 4m 8s) Loss: 0.0001(0.0913) Grad: 973.8676  LR: 0.00000081  
Epoch: [9][300/1201] Elapsed 1m 13s (remain 3m 41s) Loss: 0.1416(0.0899) Grad: 269698.4062  LR: 0.00000074  
Epoch: [9][400/1201] Elapsed 1m 37s (remain 3m 15s) Loss: 0.1204(0.0951) Grad: 410906.4062  LR: 0.00000068  
Epoch: [9][500/1201] Elapsed 2m 1s (remain 2m 50s) Loss: 0.0199(0.0969) Grad: 94987.7578  LR: 0.00000061  
Epoch: [9][600/1201] Elapsed 2m 24s (remain 2m 24s) Loss: 0.0000(0.0940) Grad: 14.4347  LR: 0.00000055  
Epoch: [9][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.0001(0.0947) Grad: 681.7233  LR: 0.00000049  
Epoch: [9][800/1201] Elapsed 3m 12s (remain 1m 36s) Loss: 0.9668(0.0968) Grad: 28605

Epoch 9 - avg_train_loss: 0.0977  avg_val_loss: 0.6657  time: 315s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0977  avg_val_loss: 0.6657  time: 315s
Epoch 9 - Score: 0.7373
INFO:__main__:Epoch 9 - Score: 0.7373
Epoch 9 - Save Best Score: 0.7373 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.7373 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 2.3954(0.6657) 
Epoch: [10][0/1201] Elapsed 0m 0s (remain 9m 58s) Loss: 0.0003(0.0003) Grad: nan  LR: 0.00000025  
Epoch: [10][100/1201] Elapsed 0m 24s (remain 4m 30s) Loss: 0.0003(0.0892) Grad: 1574.0310  LR: 0.00000021  
Epoch: [10][200/1201] Elapsed 0m 49s (remain 4m 5s) Loss: 0.0019(0.0816) Grad: 15334.2207  LR: 0.00000017  
Epoch: [10][300/1201] Elapsed 1m 13s (remain 3m 39s) Loss: 0.0029(0.0717) Grad: 25700.9492  LR: 0.00000014  
Epoch: [10][400/1201] Elapsed 1m 37s (remain 3m 13s) Loss: 0.0002(0.0869) Grad: 2121.9880  LR: 0.00000011  
Epoch: [10][500/1201] Elapsed 2m 0s (remain 2m 48s) Loss: 0.8760(0.0873) Grad: 522708.3125  LR: 0.00000008  
Epoch: [10][600/1201] Elapsed 2m 24s (remain 2m 24s) Loss: 0.0000(0.0865) Grad: 44.0727  LR: 0.00000006  
Epoch: [10][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.1627(0.0870) Grad: 835687.8125  LR: 0.00000004  
Epoch: [10][800/1201] Elapsed 3m 12s (remain 1m 36s) Loss: 0.0000(0.0858) Gr

Epoch 10 - avg_train_loss: 0.0812  avg_val_loss: 0.6671  time: 316s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0812  avg_val_loss: 0.6671  time: 316s
Epoch 10 - Score: 0.7376
INFO:__main__:Epoch 10 - Score: 0.7376


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 2.3951(0.6671) 


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1201] Elapsed 0m 0s (remain 10m 26s) Loss: 5.2891(5.2891) Grad: nan  LR: 0.00000020  
Epoch: [1][100/1201] Elapsed 0m 24s (remain 4m 22s) Loss: 4.0625(3.8716) Grad: 365100.0625  LR: 0.00001000  
Epoch: [1][200/1201] Elapsed 0m 48s (remain 3m 59s) Loss: 2.3438(3.4792) Grad: 325888.4375  LR: 0.00001000  
Epoch: [1][300/1201] Elapsed 1m 12s (remain 3m 35s) Loss: 2.2617(3.1684) Grad: 279460.4375  LR: 0.00000999  
Epoch: [1][400/1201] Elapsed 1m 35s (remain 3m 10s) Loss: 1.0596(2.9048) Grad: 295657.9688  LR: 0.00000998  
Epoch: [1][500/1201] Elapsed 1m 59s (remain 2m 47s) Loss: 1.2432(2.6321) Grad: 207041.3125  LR: 0.00000996  
Epoch: [1][600/1201] Elapsed 2m 23s (remain 2m 23s) Loss: 1.2549(2.3813) Grad: 236024.3125  LR: 0.00000995  
Epoch: [1][700/1201] Elapsed 2m 47s (remain 1m 59s) Loss: 1.2832(2.1889) Grad: 264251.7188  LR: 0.00000993  
Epoch: [1][800/1201] Elapsed 3m 11s (remain 1m 35s) Loss: 2.6094(2.0262) Grad: 262204.0312  LR: 0.00000990  
Epoch: [1][900/1201] Elapsed 

Epoch 1 - avg_train_loss: 1.6070  avg_val_loss: 0.4528  time: 315s
INFO:__main__:Epoch 1 - avg_train_loss: 1.6070  avg_val_loss: 0.4528  time: 315s
Epoch 1 - Score: 0.4242
INFO:__main__:Epoch 1 - Score: 0.4242
Epoch 1 - Save Best Score: 0.4242 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4242 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0029(0.4528) 
Epoch: [2][0/1201] Elapsed 0m 0s (remain 10m 26s) Loss: 1.5332(1.5332) Grad: nan  LR: 0.00000977  
Epoch: [2][100/1201] Elapsed 0m 25s (remain 4m 42s) Loss: 1.0596(0.4875) Grad: 1727419.5000  LR: 0.00000973  
Epoch: [2][200/1201] Elapsed 0m 50s (remain 4m 11s) Loss: 0.6758(0.4969) Grad: 601626.0625  LR: 0.00000969  
Epoch: [2][300/1201] Elapsed 1m 14s (remain 3m 42s) Loss: 1.3047(0.4779) Grad: 578119.8125  LR: 0.00000964  
Epoch: [2][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.2203(0.4742) Grad: 354765.8438  LR: 0.00000959  
Epoch: [2][500/1201] Elapsed 2m 2s (remain 2m 51s) Loss: 0.4099(0.4634) Grad: 464919.6562  LR: 0.00000954  
Epoch: [2][600/1201] Elapsed 2m 26s (remain 2m 26s) Loss: 0.1923(0.4720) Grad: 448751.0000  LR: 0.00000948  
Epoch: [2][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.0996(0.4666) Grad: 520844.7188  LR: 0.00000942  
Epoch: [2][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.0333(0.4559

Epoch 2 - avg_train_loss: 0.4492  avg_val_loss: 0.4279  time: 319s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4492  avg_val_loss: 0.4279  time: 319s
Epoch 2 - Score: 0.3074
INFO:__main__:Epoch 2 - Score: 0.3074
Epoch 2 - Save Best Score: 0.3074 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.3074 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0001(0.4279) 
Epoch: [3][0/1201] Elapsed 0m 0s (remain 11m 22s) Loss: 0.4861(0.4861) Grad: nan  LR: 0.00000908  
Epoch: [3][100/1201] Elapsed 0m 25s (remain 4m 35s) Loss: 0.0682(0.3891) Grad: 232099.1562  LR: 0.00000900  
Epoch: [3][200/1201] Elapsed 0m 49s (remain 4m 8s) Loss: 0.0145(0.3549) Grad: 71513.2734  LR: 0.00000892  
Epoch: [3][300/1201] Elapsed 1m 13s (remain 3m 40s) Loss: 1.0635(0.3845) Grad: 922420.8750  LR: 0.00000884  
Epoch: [3][400/1201] Elapsed 1m 37s (remain 3m 13s) Loss: 0.0176(0.3875) Grad: 56137.9336  LR: 0.00000875  
Epoch: [3][500/1201] Elapsed 2m 0s (remain 2m 48s) Loss: 0.0413(0.3875) Grad: 196426.1875  LR: 0.00000866  
Epoch: [3][600/1201] Elapsed 2m 24s (remain 2m 24s) Loss: 0.6001(0.3928) Grad: 452147.3750  LR: 0.00000857  
Epoch: [3][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.0565(0.3878) Grad: 209132.3594  LR: 0.00000848  
Epoch: [3][800/1201] Elapsed 3m 12s (remain 1m 35s) Loss: 0.4160(0.3852) Gr

Epoch 3 - avg_train_loss: 0.3949  avg_val_loss: 0.5461  time: 316s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3949  avg_val_loss: 0.5461  time: 316s
Epoch 3 - Score: 0.3065
INFO:__main__:Epoch 3 - Score: 0.3065
Epoch 3 - Save Best Score: 0.3065 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.3065 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.5461) 
Epoch: [4][0/1201] Elapsed 0m 0s (remain 10m 6s) Loss: 0.0054(0.0054) Grad: nan  LR: 0.00000798  
Epoch: [4][100/1201] Elapsed 0m 26s (remain 4m 48s) Loss: 0.3965(0.3043) Grad: 590763.8750  LR: 0.00000787  
Epoch: [4][200/1201] Elapsed 0m 51s (remain 4m 14s) Loss: 0.8081(0.2734) Grad: 642011.2500  LR: 0.00000776  
Epoch: [4][300/1201] Elapsed 1m 14s (remain 3m 43s) Loss: 0.0514(0.2932) Grad: 537173.3125  LR: 0.00000765  
Epoch: [4][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.0533(0.3092) Grad: 254289.8906  LR: 0.00000754  
Epoch: [4][500/1201] Elapsed 2m 2s (remain 2m 51s) Loss: 0.2109(0.3010) Grad: 545043.4375  LR: 0.00000743  
Epoch: [4][600/1201] Elapsed 2m 26s (remain 2m 25s) Loss: 0.3970(0.3022) Grad: 1313227.0000  LR: 0.00000731  
Epoch: [4][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.0007(0.3034) Grad: 2617.5686  LR: 0.00000719  
Epoch: [4][800/1201] Elapsed 3m 14s (remain 1m 36s) Loss: 0.0521(0.2949) G

Epoch 4 - avg_train_loss: 0.3131  avg_val_loss: 0.4566  time: 318s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3131  avg_val_loss: 0.4566  time: 318s
Epoch 4 - Score: 0.2802
INFO:__main__:Epoch 4 - Score: 0.2802
Epoch 4 - Save Best Score: 0.2802 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.2802 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.4566) 
Epoch: [5][0/1201] Elapsed 0m 0s (remain 9m 56s) Loss: 0.5957(0.5957) Grad: nan  LR: 0.00000658  
Epoch: [5][100/1201] Elapsed 0m 25s (remain 4m 32s) Loss: 0.0636(0.2057) Grad: 247937.9844  LR: 0.00000646  
Epoch: [5][200/1201] Elapsed 0m 49s (remain 4m 5s) Loss: 0.3381(0.2172) Grad: 526375.6875  LR: 0.00000633  
Epoch: [5][300/1201] Elapsed 1m 13s (remain 3m 39s) Loss: 0.8970(0.2275) Grad: 776224.3125  LR: 0.00000621  
Epoch: [5][400/1201] Elapsed 1m 37s (remain 3m 13s) Loss: 0.0016(0.2263) Grad: 10049.0947  LR: 0.00000608  
Epoch: [5][500/1201] Elapsed 2m 1s (remain 2m 49s) Loss: 0.2119(0.2305) Grad: 1043160.5000  LR: 0.00000595  
Epoch: [5][600/1201] Elapsed 2m 25s (remain 2m 25s) Loss: 0.0001(0.2317) Grad: 407.6133  LR: 0.00000582  
Epoch: [5][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.1827(0.2355) Grad: 441188.0625  LR: 0.00000569  
Epoch: [5][800/1201] Elapsed 3m 12s (remain 1m 36s) Loss: 0.2876(0.2386) Grad

Epoch 5 - avg_train_loss: 0.2665  avg_val_loss: 0.5070  time: 316s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2665  avg_val_loss: 0.5070  time: 316s
Epoch 5 - Score: 0.2969
INFO:__main__:Epoch 5 - Score: 0.2969


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.5070) 
Epoch: [6][0/1201] Elapsed 0m 0s (remain 9m 59s) Loss: 0.0063(0.0063) Grad: nan  LR: 0.00000504  
Epoch: [6][100/1201] Elapsed 0m 24s (remain 4m 24s) Loss: 0.0087(0.1521) Grad: 80451.6328  LR: 0.00000490  
Epoch: [6][200/1201] Elapsed 0m 48s (remain 4m 2s) Loss: 0.2959(0.2058) Grad: 392138.1875  LR: 0.00000477  
Epoch: [6][300/1201] Elapsed 1m 12s (remain 3m 36s) Loss: 0.0001(0.1927) Grad: 570.5211  LR: 0.00000464  
Epoch: [6][400/1201] Elapsed 1m 36s (remain 3m 12s) Loss: 0.0015(0.1888) Grad: 11614.6367  LR: 0.00000451  
Epoch: [6][500/1201] Elapsed 2m 0s (remain 2m 48s) Loss: 0.2722(0.1970) Grad: 394867.6250  LR: 0.00000438  
Epoch: [6][600/1201] Elapsed 2m 25s (remain 2m 24s) Loss: 0.0414(0.2020) Grad: 525962.8125  LR: 0.00000425  
Epoch: [6][700/1201] Elapsed 2m 49s (remain 2m 0s) Loss: 0.0115(0.2021) Grad: 114902.1875  LR: 0.00000412  
Epoch: [6][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.0004(0.2051) Grad: 

Epoch 6 - avg_train_loss: 0.2027  avg_val_loss: 0.5700  time: 318s
INFO:__main__:Epoch 6 - avg_train_loss: 0.2027  avg_val_loss: 0.5700  time: 318s
Epoch 6 - Score: 0.2694
INFO:__main__:Epoch 6 - Score: 0.2694
Epoch 6 - Save Best Score: 0.2694 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.2694 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.5700) 
Epoch: [7][0/1201] Elapsed 0m 0s (remain 10m 40s) Loss: 0.0058(0.0058) Grad: nan  LR: 0.00000348  
Epoch: [7][100/1201] Elapsed 0m 25s (remain 4m 36s) Loss: 0.0001(0.1179) Grad: 853.8406  LR: 0.00000336  
Epoch: [7][200/1201] Elapsed 0m 50s (remain 4m 11s) Loss: 0.0001(0.1095) Grad: 1530.1672  LR: 0.00000324  
Epoch: [7][300/1201] Elapsed 1m 14s (remain 3m 43s) Loss: 0.0031(0.1469) Grad: 27472.9395  LR: 0.00000311  
Epoch: [7][400/1201] Elapsed 1m 38s (remain 3m 16s) Loss: 0.3855(0.1494) Grad: 858002.6875  LR: 0.00000299  
Epoch: [7][500/1201] Elapsed 2m 2s (remain 2m 51s) Loss: 0.0016(0.1417) Grad: 24768.1621  LR: 0.00000287  
Epoch: [7][600/1201] Elapsed 2m 26s (remain 2m 26s) Loss: 0.1161(0.1368) Grad: 408820.7500  LR: 0.00000275  
Epoch: [7][700/1201] Elapsed 2m 50s (remain 2m 1s) Loss: 0.0385(0.1392) Grad: 208144.7500  LR: 0.00000264  
Epoch: [7][800/1201] Elapsed 3m 14s (remain 1m 37s) Loss: 0.8496(0.1411) Grad: 

Epoch 7 - avg_train_loss: 0.1476  avg_val_loss: 0.6013  time: 317s
INFO:__main__:Epoch 7 - avg_train_loss: 0.1476  avg_val_loss: 0.6013  time: 317s
Epoch 7 - Score: 0.2679
INFO:__main__:Epoch 7 - Score: 0.2679
Epoch 7 - Save Best Score: 0.2679 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.2679 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.6013) 
Epoch: [8][0/1201] Elapsed 0m 0s (remain 9m 42s) Loss: 0.0014(0.0014) Grad: nan  LR: 0.00000208  
Epoch: [8][100/1201] Elapsed 0m 25s (remain 4m 32s) Loss: 0.4204(0.0838) Grad: 664484.1875  LR: 0.00000197  
Epoch: [8][200/1201] Elapsed 0m 49s (remain 4m 7s) Loss: 1.0000(0.1020) Grad: 626687.1250  LR: 0.00000187  
Epoch: [8][300/1201] Elapsed 1m 13s (remain 3m 41s) Loss: 0.1219(0.0956) Grad: 409691.4375  LR: 0.00000177  
Epoch: [8][400/1201] Elapsed 1m 37s (remain 3m 15s) Loss: 0.0025(0.0903) Grad: 16192.5732  LR: 0.00000167  
Epoch: [8][500/1201] Elapsed 2m 2s (remain 2m 50s) Loss: 0.9629(0.0968) Grad: 491546.3125  LR: 0.00000157  
Epoch: [8][600/1201] Elapsed 2m 25s (remain 2m 25s) Loss: 0.0000(0.0965) Grad: 3.9207  LR: 0.00000148  
Epoch: [8][700/1201] Elapsed 2m 49s (remain 2m 1s) Loss: 0.0003(0.0984) Grad: 2296.0769  LR: 0.00000139  
Epoch: [8][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.0839(0.1027) Grad: 419

Epoch 8 - avg_train_loss: 0.1128  avg_val_loss: 0.6447  time: 316s
INFO:__main__:Epoch 8 - avg_train_loss: 0.1128  avg_val_loss: 0.6447  time: 316s
Epoch 8 - Score: 0.2637
INFO:__main__:Epoch 8 - Score: 0.2637
Epoch 8 - Save Best Score: 0.2637 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.2637 Model


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.6447) 
Epoch: [9][0/1201] Elapsed 0m 0s (remain 10m 31s) Loss: 0.0232(0.0232) Grad: nan  LR: 0.00000097  
Epoch: [9][100/1201] Elapsed 0m 25s (remain 4m 38s) Loss: 0.0000(0.0944) Grad: 1.5173  LR: 0.00000089  
Epoch: [9][200/1201] Elapsed 0m 50s (remain 4m 9s) Loss: 0.0252(0.0840) Grad: 82939.8750  LR: 0.00000082  
Epoch: [9][300/1201] Elapsed 1m 13s (remain 3m 40s) Loss: 0.1050(0.0928) Grad: 258111.0625  LR: 0.00000075  
Epoch: [9][400/1201] Elapsed 1m 37s (remain 3m 14s) Loss: 0.1555(0.0896) Grad: 293417.2500  LR: 0.00000068  
Epoch: [9][500/1201] Elapsed 2m 1s (remain 2m 49s) Loss: 0.4099(0.0964) Grad: 258463.0156  LR: 0.00000061  
Epoch: [9][600/1201] Elapsed 2m 25s (remain 2m 24s) Loss: 0.0000(0.0972) Grad: 5.0466  LR: 0.00000055  
Epoch: [9][700/1201] Elapsed 2m 48s (remain 2m 0s) Loss: 0.3411(0.0990) Grad: 299728.7812  LR: 0.00000049  
Epoch: [9][800/1201] Elapsed 3m 12s (remain 1m 36s) Loss: 0.0215(0.0964) Grad: 60175

Epoch 9 - avg_train_loss: 0.0995  avg_val_loss: 0.6464  time: 317s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0995  avg_val_loss: 0.6464  time: 317s
Epoch 9 - Score: 0.2668
INFO:__main__:Epoch 9 - Score: 0.2668


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.6464) 
Epoch: [10][0/1201] Elapsed 0m 0s (remain 10m 39s) Loss: 0.0049(0.0049) Grad: nan  LR: 0.00000025  
Epoch: [10][100/1201] Elapsed 0m 24s (remain 4m 25s) Loss: 0.0008(0.0996) Grad: 13379.2822  LR: 0.00000021  
Epoch: [10][200/1201] Elapsed 0m 48s (remain 4m 1s) Loss: 0.1754(0.0824) Grad: 606094.6875  LR: 0.00000017  
Epoch: [10][300/1201] Elapsed 1m 13s (remain 3m 38s) Loss: 0.1836(0.0821) Grad: 676663.6250  LR: 0.00000014  
Epoch: [10][400/1201] Elapsed 1m 36s (remain 3m 13s) Loss: 0.8813(0.0848) Grad: 734666.0625  LR: 0.00000011  
Epoch: [10][500/1201] Elapsed 2m 1s (remain 2m 49s) Loss: 0.0000(0.0825) Grad: 142.2573  LR: 0.00000009  
Epoch: [10][600/1201] Elapsed 2m 25s (remain 2m 24s) Loss: 0.5664(0.0822) Grad: 638513.5000  LR: 0.00000006  
Epoch: [10][700/1201] Elapsed 2m 49s (remain 2m 0s) Loss: 0.0117(0.0795) Grad: 86113.0625  LR: 0.00000004  
Epoch: [10][800/1201] Elapsed 3m 13s (remain 1m 36s) Loss: 0.0017(0.08

Epoch 10 - avg_train_loss: 0.0824  avg_val_loss: 0.6500  time: 318s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0824  avg_val_loss: 0.6500  time: 318s
Epoch 10 - Score: 0.2669
INFO:__main__:Epoch 10 - Score: 0.2669


EVAL: [300/301] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0000(0.6500) 


CV score = 0.5081608755318541
INFO:__main__:CV score = 0.5081608755318541
