In [1]:
# ====================================================
# CFG
# ====================================================
import numpy as np
class CFG:
    wandb = True
    DEBUG = False
    TO_KAGGLE = True
    score_path = "gs://feedback3/output/scores/scores3.csv"
    MEMO = "ベースライン"
    file_name = "001"
    model="xlm/xlm-roberta-large-squad2"
    patience = 3
    n_fold=4
    trn_fold=[0,1,2,3]
    model_config_path = f"/home/jupyter/models/{model}/"
    model_bin_path = f"/home/jupyter/models/{model}/"
    competition='FB3'
    apex=True
    print_freq=20
    num_workers=4
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=10
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    train=True
    
if CFG.DEBUG:
    CFG.epochs = 2
    #CFG.trn_fold = [0]

In [2]:
import os
import datetime
import pickle
import glob

# ====================================================
# datetime
# ====================================================
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)
date = now.strftime('%Y%m%d')
date2 = now.strftime('%Y%m%d%H%M')


# ====================================================
# file_path
# ====================================================
if "/" in CFG.model:
    model_name = CFG.model.replace("/","-")
else:
    model_name = CFG.model

path ="/home/jupyter/feedback-prize-english-language-learning/"
if CFG.DEBUG:
    OUTPUT_DIR = f'/home/jupyter/output/ex/DEBUG/{model_name}/{CFG.file_name}/{date2}/'
else:
    OUTPUT_DIR = f'/home/jupyter/output/ex/{model_name}/{CFG.file_name}/{date2}/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)



# ====================================================
# wandb 
# ====================================================
if CFG.wandb:
    import wandb
    
    def class2dict(f):
            return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))
        
    project='Feedback Prize - English Language Learning'
    if CFG.DEBUG:
        group = "DEBUG"
    else:
        group = model_name
    wandb_name = f"{CFG.file_name}_{date2}"
    job_type = CFG.file_name  #"train"

    wandb_api = "your_id"
    wandb.login(key=wandb_api)
    anony = None
    run = wandb.init(project=project, 
                         name = wandb_name,
                         config=class2dict(CFG),
                         group=group,
                         job_type=job_type,
                         anonymous=anony)

# Library

In [3]:
# !pip download transformers==4.21.2
# !pip download tokenizers==0.12.1

In [4]:
# !pip install transformers
# !pip install tokenizers

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
import shutil
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=/home/jupyter/code_baseline/FB3_pip_wheels transformers')
os.system('python -m pip install --no-index --find-links=/home/jupyter/code_baseline/FB3_pip_wheels tokenizers')
import tokenizers
import transformers
# print(f"tokenizers.__version__: {tokenizers.__version__}")
# print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')





Found existing installation: transformers 4.21.2
Uninstalling transformers-4.21.2:
  Successfully uninstalled transformers-4.21.2




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: /home/jupyter/code_baseline/FB3_pip_wheels
Processing /home/jupyter/code_baseline/FB3_pip_wheels/transformers-4.21.2-py3-none-any.whl
Processing /home/jupyter/code_baseline/FB3_pip_wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.1 requires transformers<4.21,>=4.1, but you have transformers 4.21.2 which is incompatible.


Successfully installed tokenizers-0.12.1 transformers-4.21.2
Looking in links: /home/jupyter/code_baseline/FB3_pip_wheels




env: TOKENIZERS_PARALLELISM=true


# Utils

In [6]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [7]:
# ====================================================
# Data Loading
# ====================================================

train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
submission = pd.read_csv(path+'sample_submission.csv')

Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)

if CFG.DEBUG:
    # display(train.groupby('fold').size())
    train = train.sample(n=50, random_state=0).reset_index(drop=True)
    # display(train.groupby('fold').size())

# tokenizer

In [8]:
# ====================================================
# tokenizer models/roberta/roberta-base/config.json
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(f"/home/jupyter/models/{CFG.model}/")
CFG.tokenizer = tokenizer

# Dataset

In [9]:
# ====================================================
# Define max_len
# ====================================================
# lengths = []
# tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
# for text in tk0:
#     length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#     lengths.append(length)
# CFG.max_len = max(lengths) + 3 # cls & sep & sep
# LOGGER.info(f"max_len: {CFG.max_len}")

In [10]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [11]:
# ====================================================
# Model
# ====================================================
#MeanPoolingはoutput_hidden_statesに関係している   https://qiita.com/niship2/items/f84751aed893da869cec
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model_config_path, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model_bin_path, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [12]:
# model_config_path = f"/home/jupyter/models/deberta/{model}/model"
# model_bin_path = f"/home/jupyter/models/deberta/{CFG.model}/model"

In [13]:
# conf = AutoConfig.from_pretrained(f"/home/jupyter/models/deberta/{CFG.model}/model", output_hidden_states=True)
# AutoModel.from_pretrained(f"/home/jupyter/models/deberta/{CFG.model}/model", config=conf)

# Loss

In [14]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [15]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [16]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf
    patience = CFG.patience
    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
            patience = CFG.patience
        else:
            patience -= 1
            if patience<=0:
                break

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [17]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
        return score
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                score = get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        score = round(get_result(oof_df),3)
        oof_df.to_pickle(OUTPUT_DIR+f'oof_df.pkl')
        
    # if CFG.DEBUG:
    #     import send2trash
    #     send2trash.send2trash(OUTPUT_DIR)
    CFG.OUTPUT_DIR = OUTPUT_DIR
    dict_cfg = {k: vars(CFG)[k] for k in vars(CFG) if "__" not in k}
    with open(OUTPUT_DIR+"dict_cfg", 'wb') as web:
        pickle.dump(dict_cfg , web)
    with open(OUTPUT_DIR+"class_cfg", 'wb') as web:
        pickle.dump(CFG , web)
    
    
    if CFG.wandb:
        wandb.config.update(class2dict(CFG))
        wandb.finish()

XLMRobertaConfig {
  "_name_or_path": "/home/jupyter/models/xlm/xlm-roberta-large-squad2/",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

Some weights of the model checkpoint at /home/jupyter/models/xlm/xlm-roberta-large-squad2/

Epoch: [1][0/366] Elapsed 0m 2s (remain 14m 1s) Loss: 2.7632(2.7632) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 28s (remain 7m 51s) Loss: 0.3811(0.8326) Grad: 147013.3438  LR: 0.00002000  
Epoch: [1][40/366] Elapsed 0m 56s (remain 7m 27s) Loss: 0.1968(0.5457) Grad: 212119.2812  LR: 0.00001999  
Epoch: [1][60/366] Elapsed 1m 24s (remain 7m 4s) Loss: 0.1343(0.4127) Grad: 64138.3906  LR: 0.00001999  
Epoch: [1][80/366] Elapsed 1m 52s (remain 6m 37s) Loss: 0.1248(0.3513) Grad: 58718.6523  LR: 0.00001998  
Epoch: [1][100/366] Elapsed 2m 20s (remain 6m 9s) Loss: 0.1264(0.3087) Grad: 44561.7188  LR: 0.00001996  
Epoch: [1][120/366] Elapsed 2m 48s (remain 5m 41s) Loss: 0.2087(0.2817) Grad: 275295.6562  LR: 0.00001995  
Epoch: [1][140/366] Elapsed 3m 16s (remain 5m 14s) Loss: 0.1031(0.2598) Grad: 52518.0312  LR: 0.00001993  
Epoch: [1][160/366] Elapsed 3m 44s (remain 4m 46s) Loss: 0.1239(0.2449) Grad: 114878.3750  LR: 0.00001990  
Epoch: [1][180/366] Elapsed 4m 12s (remain 4m 18s

Epoch 1 - avg_train_loss: 0.1810  avg_val_loss: 0.1216  time: 622s
Epoch 1 - Score: 0.4932  Scores: [0.5367866678028637, 0.5306789142073715, 0.4138750436479738, 0.5162673672084735, 0.4981927288883549, 0.4636062827026192]
Epoch 1 - Save Best Score: 0.4932 Model


EVAL: [60/62] Elapsed 1m 48s (remain 0m 1s) Loss: 0.1201(0.1216) 
EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1343(0.1216) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 10m 5s) Loss: 0.1059(0.1059) Grad: inf  LR: 0.00001951  
Epoch: [2][20/366] Elapsed 0m 29s (remain 8m 10s) Loss: 0.1122(0.0994) Grad: 269762.9062  LR: 0.00001946  
Epoch: [2][40/366] Elapsed 0m 58s (remain 7m 40s) Loss: 0.1709(0.0981) Grad: 178277.9062  LR: 0.00001940  
Epoch: [2][60/366] Elapsed 1m 26s (remain 7m 10s) Loss: 0.1451(0.1019) Grad: 96727.5078  LR: 0.00001934  
Epoch: [2][80/366] Elapsed 1m 54s (remain 6m 41s) Loss: 0.0970(0.1013) Grad: 115791.8516  LR: 0.00001928  
Epoch: [2][100/366] Elapsed 2m 22s (remain 6m 13s) Loss: 0.0673(0.1024) Grad: 83698.0781  LR: 0.00001921  
Epoch: [2][120/366] Elapsed 2m 50s (remain 5m 44s) Loss: 0.0852(0.1023) Grad: 44160.4883  LR: 0.00001914  
Epoch: [2][140/366] Elapsed 3m 18s (remain 5m 16s) Loss: 0.0868(0.1032) Grad: 29905.4922  LR: 0.00001907  
Epoch: [2][160/366] Ela

Epoch 2 - avg_train_loss: 0.1024  avg_val_loss: 0.1382  time: 623s
Epoch 2 - Score: 0.5270  Scores: [0.49467134805379953, 0.5730713683260458, 0.4539340100979298, 0.5339185070258419, 0.6054843192722549, 0.5012161419930862]


EVAL: [60/62] Elapsed 1m 48s (remain 0m 1s) Loss: 0.1123(0.1384) 
EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0656(0.1382) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 8m 30s) Loss: 0.0801(0.0801) Grad: inf  LR: 0.00001809  
Epoch: [3][20/366] Elapsed 0m 29s (remain 8m 0s) Loss: 0.0663(0.0807) Grad: 64637.2227  LR: 0.00001799  
Epoch: [3][40/366] Elapsed 0m 57s (remain 7m 33s) Loss: 0.0750(0.0759) Grad: 63260.2188  LR: 0.00001789  
Epoch: [3][60/366] Elapsed 1m 25s (remain 7m 6s) Loss: 0.0591(0.0744) Grad: 70978.1406  LR: 0.00001778  
Epoch: [3][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.1008(0.0735) Grad: 186877.5625  LR: 0.00001767  
Epoch: [3][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.0628(0.0717) Grad: 66204.3203  LR: 0.00001756  
Epoch: [3][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0691(0.0731) Grad: 83008.6797  LR: 0.00001745  
Epoch: [3][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0720(0.0732) Grad: 103896.4922  LR: 0.00001733  
Epoch: [3][160/366] Elapse

Epoch 3 - avg_train_loss: 0.0718  avg_val_loss: 0.1113  time: 623s
Epoch 3 - Score: 0.4728  Scores: [0.49837191509702594, 0.4584343390823896, 0.4548551250904761, 0.47093433759713377, 0.49074749018481856, 0.46352098969548916]
Epoch 3 - Save Best Score: 0.4728 Model


EVAL: [60/62] Elapsed 1m 48s (remain 0m 1s) Loss: 0.0951(0.1113) 
EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1187(0.1113) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 9m 46s) Loss: 0.0521(0.0521) Grad: 132745.0156  LR: 0.00001588  
Epoch: [4][20/366] Elapsed 0m 29s (remain 8m 6s) Loss: 0.0488(0.0569) Grad: 46250.7109  LR: 0.00001574  
Epoch: [4][40/366] Elapsed 0m 58s (remain 7m 41s) Loss: 0.0358(0.0576) Grad: 49216.5977  LR: 0.00001560  
Epoch: [4][60/366] Elapsed 1m 26s (remain 7m 10s) Loss: 0.0619(0.0569) Grad: 123806.8828  LR: 0.00001546  
Epoch: [4][80/366] Elapsed 1m 54s (remain 6m 41s) Loss: 0.0762(0.0573) Grad: 129057.1562  LR: 0.00001532  
Epoch: [4][100/366] Elapsed 2m 22s (remain 6m 13s) Loss: 0.0604(0.0569) Grad: 84102.4375  LR: 0.00001517  
Epoch: [4][120/366] Elapsed 2m 50s (remain 5m 44s) Loss: 0.0574(0.0572) Grad: 92633.7734  LR: 0.00001502  
Epoch: [4][140/366] Elapsed 3m 18s (remain 5m 16s) Loss: 0.0480(0.0564) Grad: 96718.7031  LR: 0.00001487  
Epoch: [4][160/36

Epoch 4 - avg_train_loss: 0.0540  avg_val_loss: 0.1067  time: 623s
Epoch 4 - Score: 0.4626  Scores: [0.4985723602926085, 0.45597746615564105, 0.4163348269269878, 0.46697285461720067, 0.47636559170165665, 0.4613733487291804]
Epoch 4 - Save Best Score: 0.4626 Model


EVAL: [60/62] Elapsed 1m 48s (remain 0m 1s) Loss: 0.0951(0.1068) 
EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0646(0.1067) 
Epoch: [5][0/366] Elapsed 0m 1s (remain 9m 46s) Loss: 0.0378(0.0378) Grad: 148103.5000  LR: 0.00001310  
Epoch: [5][20/366] Elapsed 0m 29s (remain 8m 5s) Loss: 0.0280(0.0375) Grad: 47660.7148  LR: 0.00001294  
Epoch: [5][40/366] Elapsed 0m 58s (remain 7m 40s) Loss: 0.0401(0.0379) Grad: 75312.2031  LR: 0.00001277  
Epoch: [5][60/366] Elapsed 1m 25s (remain 7m 9s) Loss: 0.0321(0.0376) Grad: 70184.6328  LR: 0.00001261  
Epoch: [5][80/366] Elapsed 1m 54s (remain 6m 41s) Loss: 0.0253(0.0378) Grad: 78812.6641  LR: 0.00001244  
Epoch: [5][100/366] Elapsed 2m 22s (remain 6m 12s) Loss: 0.0313(0.0379) Grad: 77736.7344  LR: 0.00001228  
Epoch: [5][120/366] Elapsed 2m 50s (remain 5m 44s) Loss: 0.0342(0.0380) Grad: 61787.0273  LR: 0.00001211  
Epoch: [5][140/366] Elapsed 3m 18s (remain 5m 16s) Loss: 0.0546(0.0382) Grad: 99550.6797  LR: 0.00001194  
Epoch: [5][160/366] 

Epoch 5 - avg_train_loss: 0.0378  avg_val_loss: 0.1122  time: 623s
Epoch 5 - Score: 0.4745  Scores: [0.4984753688756329, 0.46621928140446417, 0.42493921921220174, 0.4749665003560645, 0.5093030187745844, 0.47323244509094975]


Epoch: [6][0/366] Elapsed 0m 1s (remain 10m 27s) Loss: 0.0340(0.0340) Grad: 177189.2344  LR: 0.00001002  
Epoch: [6][20/366] Elapsed 0m 29s (remain 8m 4s) Loss: 0.0155(0.0226) Grad: 52476.7109  LR: 0.00000985  
Epoch: [6][40/366] Elapsed 0m 57s (remain 7m 35s) Loss: 0.0181(0.0225) Grad: 69039.0391  LR: 0.00000967  
Epoch: [6][60/366] Elapsed 1m 25s (remain 7m 8s) Loss: 0.0124(0.0228) Grad: 43984.3555  LR: 0.00000950  
Epoch: [6][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0225(0.0225) Grad: 72935.5000  LR: 0.00000933  
Epoch: [6][100/366] Elapsed 2m 21s (remain 6m 12s) Loss: 0.0099(0.0228) Grad: 34366.3047  LR: 0.00000916  
Epoch: [6][120/366] Elapsed 2m 49s (remain 5m 44s) Loss: 0.0169(0.0225) Grad: 79320.2812  LR: 0.00000899  
Epoch: [6][140/366] Elapsed 3m 18s (remain 5m 15s) Loss: 0.0276(0.0227) Grad: 101139.8125  LR: 0.00000882  
Epoch: [6][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0247(0.0226) Grad: 87793.7031  LR: 0.00000865  
Epoch: [6][180/366] Elapsed 4m 14s (remain 

Epoch 6 - avg_train_loss: 0.0235  avg_val_loss: 0.1139  time: 622s
Epoch 6 - Score: 0.4784  Scores: [0.5059763414630313, 0.46404916775952787, 0.44432623990686065, 0.48232324125493203, 0.4998354822694417, 0.4740429290619073]


EVAL: [60/62] Elapsed 1m 48s (remain 0m 1s) Loss: 0.1185(0.1139) 
EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0910(0.1139) 
Epoch: [7][0/366] Elapsed 0m 1s (remain 9m 43s) Loss: 0.0149(0.0149) Grad: 150319.0156  LR: 0.00000693  
Epoch: [7][20/366] Elapsed 0m 29s (remain 8m 4s) Loss: 0.0187(0.0169) Grad: 153871.6406  LR: 0.00000677  
Epoch: [7][40/366] Elapsed 0m 57s (remain 7m 35s) Loss: 0.0139(0.0155) Grad: 109869.0156  LR: 0.00000661  
Epoch: [7][60/366] Elapsed 1m 25s (remain 7m 7s) Loss: 0.0168(0.0155) Grad: 210545.8750  LR: 0.00000645  
Epoch: [7][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0153(0.0154) Grad: 111442.5078  LR: 0.00000629  
Epoch: [7][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.0120(0.0149) Grad: 104133.5391  LR: 0.00000613  
Epoch: [7][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0125(0.0147) Grad: 126569.4297  LR: 0.00000597  
Epoch: [7][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0108(0.0144) Grad: 115938.3516  LR: 0.00000581  
Epoch: [7][16

Epoch 7 - avg_train_loss: 0.0136  avg_val_loss: 0.1132  time: 623s
Epoch 7 - Score: 0.4770  Scores: [0.5079569411701885, 0.4690022775469264, 0.44292427486223657, 0.47869809493489757, 0.4895670704737082, 0.47361433832134187]


EVAL: [60/62] Elapsed 1m 48s (remain 0m 1s) Loss: 0.1156(0.1132) 
EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0860(0.1132) 


Score: 0.4626  Scores: [0.4985723602926085, 0.45597746615564105, 0.4163348269269878, 0.46697285461720067, 0.47636559170165665, 0.4613733487291804]
XLMRobertaConfig {
  "_name_or_path": "/home/jupyter/models/xlm/xlm-roberta-large-squad2/",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 43s) Loss: 2.4362(2.4362) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 29s (remain 8m 3s) Loss: 0.1578(0.8013) Grad: 156519.1562  LR: 0.00002000  
Epoch: [1][40/366] Elapsed 0m 57s (remain 7m 34s) Loss: 0.1995(0.5233) Grad: 93375.5859  LR: 0.00001999  
Epoch: [1][60/366] Elapsed 1m 25s (remain 7m 6s) Loss: 0.1865(0.4150) Grad: 59215.9219  LR: 0.00001999  
Epoch: [1][80/366] Elapsed 1m 53s (remain 6m 38s) Loss: 0.1660(0.3488) Grad: 75322.1953  LR: 0.00001998  
Epoch: [1][100/366] Elapsed 2m 21s (remain 6m 10s) Loss: 0.1223(0.3037) Grad: 32843.6641  LR: 0.00001996  
Epoch: [1][120/366] Elapsed 2m 49s (remain 5m 42s) Loss: 0.1696(0.2799) Grad: 52524.9531  LR: 0.00001995  
Epoch: [1][140/366] Elapsed 3m 17s (remain 5m 14s) Loss: 0.1114(0.2610) Grad: 44211.9180  LR: 0.00001993  
Epoch: [1][160/366] Elapsed 3m 45s (remain 4m 46s) Loss: 0.1213(0.2458) Grad: 38228.9180  LR: 0.00001991  
Epoch: [1][180/366] Elapsed 4m 13s (remain 4m 18s) L

Epoch 1 - avg_train_loss: 0.1848  avg_val_loss: 0.1176  time: 621s
Epoch 1 - Score: 0.4863  Scores: [0.5131988118420827, 0.4668221979423336, 0.44518492246740793, 0.48984420026718617, 0.5327238252570434, 0.46981069223246535]
Epoch 1 - Save Best Score: 0.4863 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 8m 56s) Loss: 0.1124(0.1124) Grad: inf  LR: 0.00001951  
Epoch: [2][20/366] Elapsed 0m 29s (remain 8m 3s) Loss: 0.0879(0.1078) Grad: 163347.5781  LR: 0.00001946  
Epoch: [2][40/366] Elapsed 0m 57s (remain 7m 37s) Loss: 0.1018(0.1028) Grad: 110720.9609  LR: 0.00001940  
Epoch: [2][60/366] Elapsed 1m 25s (remain 7m 6s) Loss: 0.1539(0.1026) Grad: 121738.7656  LR: 0.00001934  
Epoch: [2][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.0946(0.1006) Grad: 44519.9219  LR: 0.00001928  
Epoch: [2][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.1082(0.0994) Grad: 67131.3281  LR: 0.00001921  
Epoch: [2][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.1426(0.1006) Grad: 122727.9844  LR: 0.00001914  
Epoch: [2][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.1505(0.1012) Grad: 87121.8828  LR: 0.00001907  
Epoch: [2][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.1229(0.0996) Grad: 69639.7266  LR: 0.00001900  
Epoch: [2][180/366] Elapsed 4m 13s (remain 4m 19s

Epoch 2 - avg_train_loss: 0.0980  avg_val_loss: 0.1111  time: 622s
Epoch 2 - Score: 0.4725  Scores: [0.5031422693404599, 0.4560782040963218, 0.433535570621158, 0.47323334338062734, 0.5098245612244544, 0.45933614095082836]
Epoch 2 - Save Best Score: 0.4725 Model


Epoch: [3][0/366] Elapsed 0m 1s (remain 11m 9s) Loss: 0.0647(0.0647) Grad: 135275.6094  LR: 0.00001809  
Epoch: [3][20/366] Elapsed 0m 29s (remain 8m 11s) Loss: 0.0746(0.0732) Grad: 153401.2656  LR: 0.00001799  
Epoch: [3][40/366] Elapsed 0m 58s (remain 7m 40s) Loss: 0.0749(0.0725) Grad: 153507.6719  LR: 0.00001789  
Epoch: [3][60/366] Elapsed 1m 25s (remain 7m 9s) Loss: 0.0493(0.0694) Grad: 97417.5391  LR: 0.00001778  
Epoch: [3][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0521(0.0707) Grad: 82442.2656  LR: 0.00001767  
Epoch: [3][100/366] Elapsed 2m 22s (remain 6m 12s) Loss: 0.0772(0.0701) Grad: 114454.8203  LR: 0.00001756  
Epoch: [3][120/366] Elapsed 2m 49s (remain 5m 44s) Loss: 0.0576(0.0697) Grad: 68704.6484  LR: 0.00001745  
Epoch: [3][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0572(0.0703) Grad: 53787.9375  LR: 0.00001733  
Epoch: [3][160/366] Elapsed 3m 46s (remain 4m 47s) Loss: 0.0712(0.0702) Grad: 65239.2070  LR: 0.00001721  
Epoch: [3][180/366] Elapsed 4m 14s (remai

Epoch 3 - avg_train_loss: 0.0687  avg_val_loss: 0.1145  time: 622s
Epoch 3 - Score: 0.4797  Scores: [0.49916831951939417, 0.46291285265907633, 0.4276319713290357, 0.48314460457994757, 0.532845233537474, 0.47241941902862805]


Epoch: [4][0/366] Elapsed 0m 1s (remain 9m 45s) Loss: 0.0666(0.0666) Grad: 153214.7188  LR: 0.00001589  
Epoch: [4][20/366] Elapsed 0m 29s (remain 8m 4s) Loss: 0.0454(0.0495) Grad: 138386.4531  LR: 0.00001575  
Epoch: [4][40/366] Elapsed 0m 57s (remain 7m 36s) Loss: 0.0533(0.0506) Grad: 215325.2656  LR: 0.00001561  
Epoch: [4][60/366] Elapsed 1m 25s (remain 7m 8s) Loss: 0.0479(0.0492) Grad: 157749.4844  LR: 0.00001546  
Epoch: [4][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.0365(0.0489) Grad: 148711.6875  LR: 0.00001532  
Epoch: [4][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.0564(0.0490) Grad: 222308.3438  LR: 0.00001517  
Epoch: [4][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0620(0.0495) Grad: 168251.2344  LR: 0.00001502  
Epoch: [4][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0399(0.0496) Grad: 104696.6328  LR: 0.00001488  
Epoch: [4][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0631(0.0494) Grad: 85148.3438  LR: 0.00001473  
Epoch: [4][180/366] Elapsed 4m 13s (re

Epoch 4 - avg_train_loss: 0.0506  avg_val_loss: 0.1172  time: 622s
Epoch 4 - Score: 0.4858  Scores: [0.5314954021109568, 0.46297777654354705, 0.4342281515128991, 0.4875078302620349, 0.5159096935203543, 0.4827037383729002]


Epoch: [5][0/366] Elapsed 0m 1s (remain 9m 59s) Loss: 0.0307(0.0307) Grad: 109983.6719  LR: 0.00001310  
Epoch: [5][20/366] Elapsed 0m 29s (remain 8m 7s) Loss: 0.0389(0.0464) Grad: 98927.6797  LR: 0.00001294  
Epoch: [5][40/366] Elapsed 0m 57s (remain 7m 36s) Loss: 0.0223(0.0429) Grad: 101506.8359  LR: 0.00001278  
Epoch: [5][60/366] Elapsed 1m 25s (remain 7m 8s) Loss: 0.0367(0.0403) Grad: 134691.2188  LR: 0.00001261  
Epoch: [5][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0471(0.0394) Grad: 115892.6641  LR: 0.00001245  
Epoch: [5][100/366] Elapsed 2m 21s (remain 6m 12s) Loss: 0.0352(0.0386) Grad: 153667.7344  LR: 0.00001228  
Epoch: [5][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0313(0.0383) Grad: 106620.6328  LR: 0.00001211  
Epoch: [5][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0595(0.0384) Grad: 164260.2656  LR: 0.00001195  
Epoch: [5][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0414(0.0386) Grad: 166670.6406  LR: 0.00001178  
Epoch: [5][180/366] Elapsed 4m 14s (re

Epoch 5 - avg_train_loss: 0.0379  avg_val_loss: 0.1159  time: 623s
Epoch 5 - Score: 0.4832  Scores: [0.5127935906756294, 0.46571877356589425, 0.4290833726420945, 0.48674778400861035, 0.51559896295386, 0.489495699299651]
Score: 0.4725  Scores: [0.5031422693404599, 0.4560782040963218, 0.433535570621158, 0.47323334338062734, 0.5098245612244544, 0.45933614095082836]
XLMRobertaConfig {
  "_name_or_path": "/home/jupyter/models/xlm/xlm-roberta-large-squad2/",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "nu

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 52s) Loss: 2.4620(2.4620) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 29s (remain 8m 3s) Loss: 0.2504(0.6263) Grad: 233804.6094  LR: 0.00002000  
Epoch: [1][40/366] Elapsed 0m 57s (remain 7m 37s) Loss: 0.1101(0.4076) Grad: 73197.1328  LR: 0.00001999  
Epoch: [1][60/366] Elapsed 1m 25s (remain 7m 7s) Loss: 0.1102(0.3231) Grad: 73490.2500  LR: 0.00001999  
Epoch: [1][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0871(0.2778) Grad: 54031.8008  LR: 0.00001998  
Epoch: [1][100/366] Elapsed 2m 21s (remain 6m 12s) Loss: 0.0964(0.2473) Grad: 100273.4922  LR: 0.00001996  
Epoch: [1][120/366] Elapsed 2m 49s (remain 5m 44s) Loss: 0.1443(0.2270) Grad: 135112.1250  LR: 0.00001995  
Epoch: [1][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.1378(0.2191) Grad: 61673.8750  LR: 0.00001993  
Epoch: [1][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.1698(0.2102) Grad: 126272.8594  LR: 0.00001990  
Epoch: [1][180/366] Elapsed 4m 14s (remain 4m 19s

Epoch 1 - avg_train_loss: 0.1663  avg_val_loss: 0.1214  time: 622s
Epoch 1 - Score: 0.4943  Scores: [0.5157249668797449, 0.4803677011033084, 0.443541335003367, 0.4950996816028902, 0.4876365014702995, 0.5434322880248401]
Epoch 1 - Save Best Score: 0.4943 Model


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1000(0.1214) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 10m 19s) Loss: 0.0743(0.0743) Grad: 271503.0625  LR: 0.00001951  
Epoch: [2][20/366] Elapsed 0m 29s (remain 8m 7s) Loss: 0.1402(0.1054) Grad: 209470.2500  LR: 0.00001946  
Epoch: [2][40/366] Elapsed 0m 57s (remain 7m 38s) Loss: 0.0826(0.0979) Grad: 60800.4961  LR: 0.00001940  
Epoch: [2][60/366] Elapsed 1m 25s (remain 7m 9s) Loss: 0.1101(0.0965) Grad: 169033.2500  LR: 0.00001934  
Epoch: [2][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0980(0.0939) Grad: 169705.5781  LR: 0.00001928  
Epoch: [2][100/366] Elapsed 2m 22s (remain 6m 12s) Loss: 0.0843(0.0934) Grad: 70817.1719  LR: 0.00001921  
Epoch: [2][120/366] Elapsed 2m 49s (remain 5m 44s) Loss: 0.1669(0.0927) Grad: 191163.8281  LR: 0.00001914  
Epoch: [2][140/366] Elapsed 3m 18s (remain 5m 15s) Loss: 0.0921(0.0916) Grad: 98882.5234  LR: 0.00001907  
Epoch: [2][160/366] Elapsed 3m 46s (remain 4m 47s) Loss: 0.0915(0.0907) Grad: 917

Epoch 2 - avg_train_loss: 0.0977  avg_val_loss: 0.1247  time: 622s
Epoch 2 - Score: 0.5012  Scores: [0.5232468523138903, 0.4859627125660677, 0.4364795921373425, 0.5136213680639902, 0.5085812882874033, 0.5395387951364493]


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.3067(0.1247) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 10m 11s) Loss: 0.0839(0.0839) Grad: 161495.5625  LR: 0.00001809  
Epoch: [3][20/366] Elapsed 0m 29s (remain 8m 5s) Loss: 0.0813(0.0906) Grad: 232188.5938  LR: 0.00001799  
Epoch: [3][40/366] Elapsed 0m 57s (remain 7m 34s) Loss: 0.0629(0.0852) Grad: 88253.0938  LR: 0.00001789  
Epoch: [3][60/366] Elapsed 1m 25s (remain 7m 6s) Loss: 0.0494(0.0800) Grad: 74752.4922  LR: 0.00001778  
Epoch: [3][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.0584(0.0779) Grad: 50098.8125  LR: 0.00001767  
Epoch: [3][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.0761(0.0767) Grad: 238480.8906  LR: 0.00001756  
Epoch: [3][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0643(0.0763) Grad: 122135.9531  LR: 0.00001745  
Epoch: [3][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0625(0.0758) Grad: 73027.0859  LR: 0.00001733  
Epoch: [3][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0867(0.0751) Grad: 1046

Epoch 3 - avg_train_loss: 0.0748  avg_val_loss: 0.1255  time: 622s
Epoch 3 - Score: 0.5028  Scores: [0.5182010144514304, 0.4907475239955454, 0.45821112925854696, 0.4845147158034486, 0.5275421784371646, 0.5374932983373003]


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.2866(0.1255) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 10m 14s) Loss: 0.0858(0.0858) Grad: 194960.7812  LR: 0.00001588  
Epoch: [4][20/366] Elapsed 0m 29s (remain 8m 5s) Loss: 0.0598(0.0773) Grad: 128696.9766  LR: 0.00001574  
Epoch: [4][40/366] Elapsed 0m 57s (remain 7m 36s) Loss: 0.0581(0.0683) Grad: 134010.4375  LR: 0.00001560  
Epoch: [4][60/366] Elapsed 1m 25s (remain 7m 8s) Loss: 0.0482(0.0627) Grad: 155953.7344  LR: 0.00001546  
Epoch: [4][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0494(0.0612) Grad: 191266.5938  LR: 0.00001532  
Epoch: [4][100/366] Elapsed 2m 21s (remain 6m 12s) Loss: 0.0480(0.0605) Grad: 215338.7812  LR: 0.00001517  
Epoch: [4][120/366] Elapsed 2m 49s (remain 5m 44s) Loss: 0.0421(0.0593) Grad: 169692.0000  LR: 0.00001502  
Epoch: [4][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0519(0.0581) Grad: 177554.0312  LR: 0.00001487  
Epoch: [4][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0565(0.0577) Grad: 

Epoch 4 - avg_train_loss: 0.0551  avg_val_loss: 0.1124  time: 623s
Epoch 4 - Score: 0.4756  Scores: [0.5012376972651091, 0.4608111413914197, 0.43601867417293866, 0.48550010488855877, 0.5000250174750601, 0.47024359702190766]
Epoch 4 - Save Best Score: 0.4756 Model


EVAL: [61/62] Elapsed 1m 49s (remain 0m 0s) Loss: 0.1324(0.1124) 
Epoch: [5][0/366] Elapsed 0m 1s (remain 10m 13s) Loss: 0.0677(0.0677) Grad: 205641.2656  LR: 0.00001310  
Epoch: [5][20/366] Elapsed 0m 29s (remain 8m 9s) Loss: 0.0375(0.0437) Grad: 123372.0000  LR: 0.00001294  
Epoch: [5][40/366] Elapsed 0m 58s (remain 7m 44s) Loss: 0.0456(0.0417) Grad: 127782.3203  LR: 0.00001277  
Epoch: [5][60/366] Elapsed 1m 26s (remain 7m 11s) Loss: 0.0288(0.0427) Grad: 100761.2656  LR: 0.00001261  
Epoch: [5][80/366] Elapsed 1m 54s (remain 6m 42s) Loss: 0.0412(0.0424) Grad: 158146.2188  LR: 0.00001244  
Epoch: [5][100/366] Elapsed 2m 22s (remain 6m 13s) Loss: 0.0391(0.0408) Grad: 144348.1250  LR: 0.00001228  
Epoch: [5][120/366] Elapsed 2m 50s (remain 5m 45s) Loss: 0.0472(0.0402) Grad: 160873.6562  LR: 0.00001211  
Epoch: [5][140/366] Elapsed 3m 18s (remain 5m 16s) Loss: 0.0475(0.0397) Grad: 139794.2969  LR: 0.00001194  
Epoch: [5][160/366] Elapsed 3m 46s (remain 4m 48s) Loss: 0.0445(0.0394) Grad:

Epoch 5 - avg_train_loss: 0.0402  avg_val_loss: 0.1148  time: 623s
Epoch 5 - Score: 0.4808  Scores: [0.5072584689265096, 0.46613956994101813, 0.4325807505562029, 0.49907519782233234, 0.5053026323257701, 0.4742527989007901]


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1268(0.1148) 
Epoch: [6][0/366] Elapsed 0m 1s (remain 10m 4s) Loss: 0.0242(0.0242) Grad: 153885.9062  LR: 0.00001002  
Epoch: [6][20/366] Elapsed 0m 29s (remain 8m 8s) Loss: 0.0377(0.0278) Grad: 104585.7656  LR: 0.00000985  
Epoch: [6][40/366] Elapsed 0m 57s (remain 7m 38s) Loss: 0.0309(0.0269) Grad: 162130.0781  LR: 0.00000967  
Epoch: [6][60/366] Elapsed 1m 25s (remain 7m 8s) Loss: 0.0255(0.0265) Grad: 213676.2656  LR: 0.00000950  
Epoch: [6][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0213(0.0268) Grad: 114701.7734  LR: 0.00000933  
Epoch: [6][100/366] Elapsed 2m 21s (remain 6m 12s) Loss: 0.0314(0.0266) Grad: 116338.3125  LR: 0.00000916  
Epoch: [6][120/366] Elapsed 2m 50s (remain 5m 44s) Loss: 0.0193(0.0264) Grad: 122845.3281  LR: 0.00000899  
Epoch: [6][140/366] Elapsed 3m 18s (remain 5m 16s) Loss: 0.0279(0.0261) Grad: 173635.0625  LR: 0.00000882  
Epoch: [6][160/366] Elapsed 3m 46s (remain 4m 48s) Loss: 0.0312(0.0262) Grad: 1

Epoch 6 - avg_train_loss: 0.0255  avg_val_loss: 0.1182  time: 623s
Epoch 6 - Score: 0.4885  Scores: [0.5035003000364993, 0.47918568098729253, 0.4608154204359996, 0.49447419945092486, 0.5095977208874605, 0.4835504764307415]


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1708(0.1182) 
Epoch: [7][0/366] Elapsed 0m 1s (remain 10m 23s) Loss: 0.0172(0.0172) Grad: 176272.0469  LR: 0.00000693  
Epoch: [7][20/366] Elapsed 0m 29s (remain 8m 8s) Loss: 0.0240(0.0173) Grad: 113734.5156  LR: 0.00000677  
Epoch: [7][40/366] Elapsed 0m 57s (remain 7m 37s) Loss: 0.0133(0.0156) Grad: 97245.3750  LR: 0.00000661  
Epoch: [7][60/366] Elapsed 1m 25s (remain 7m 9s) Loss: 0.0096(0.0151) Grad: 102076.6562  LR: 0.00000645  
Epoch: [7][80/366] Elapsed 1m 54s (remain 6m 41s) Loss: 0.0109(0.0150) Grad: 115152.1484  LR: 0.00000629  
Epoch: [7][100/366] Elapsed 2m 22s (remain 6m 13s) Loss: 0.0101(0.0144) Grad: 86807.3984  LR: 0.00000613  
Epoch: [7][120/366] Elapsed 2m 50s (remain 5m 44s) Loss: 0.0088(0.0141) Grad: 112940.8203  LR: 0.00000597  
Epoch: [7][140/366] Elapsed 3m 18s (remain 5m 16s) Loss: 0.0124(0.0140) Grad: 96631.4766  LR: 0.00000581  
Epoch: [7][160/366] Elapsed 3m 46s (remain 4m 48s) Loss: 0.0228(0.0141) Grad: 115

Epoch 7 - avg_train_loss: 0.0136  avg_val_loss: 0.1181  time: 623s
Epoch 7 - Score: 0.4880  Scores: [0.516674901915653, 0.47324682623090336, 0.43922165873588714, 0.49934865662333805, 0.5142956683464883, 0.4850545719276853]


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1598(0.1181) 


Score: 0.4756  Scores: [0.5012376972651091, 0.4608111413914197, 0.43601867417293866, 0.48550010488855877, 0.5000250174750601, 0.47024359702190766]
XLMRobertaConfig {
  "_name_or_path": "/home/jupyter/models/xlm/xlm-roberta-large-squad2/",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 54s) Loss: 2.5352(2.5352) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 29s (remain 8m 0s) Loss: 0.2075(0.7182) Grad: 79676.1797  LR: 0.00002000  
Epoch: [1][40/366] Elapsed 0m 57s (remain 7m 36s) Loss: 0.0878(0.4640) Grad: 62942.4570  LR: 0.00001999  
Epoch: [1][60/366] Elapsed 1m 25s (remain 7m 7s) Loss: 0.1353(0.3683) Grad: 126171.1641  LR: 0.00001999  
Epoch: [1][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.1841(0.3178) Grad: 180307.0156  LR: 0.00001998  
Epoch: [1][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.1222(0.2840) Grad: 119581.6328  LR: 0.00001996  
Epoch: [1][120/366] Elapsed 2m 49s (remain 5m 44s) Loss: 0.1780(0.2639) Grad: 65341.8047  LR: 0.00001995  
Epoch: [1][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0673(0.2422) Grad: 35427.6758  LR: 0.00001993  
Epoch: [1][160/366] Elapsed 3m 46s (remain 4m 47s) Loss: 0.1553(0.2303) Grad: 96572.9531  LR: 0.00001990  
Epoch: [1][180/366] Elapsed 4m 13s (remain 4m 19s)

Epoch 1 - avg_train_loss: 0.1797  avg_val_loss: 0.1576  time: 622s
Epoch 1 - Score: 0.5639  Scores: [0.5173475824211443, 0.4636121429551024, 0.5423331902698354, 0.6235879930034115, 0.6085868632853003, 0.6279889383183976]
Epoch 1 - Save Best Score: 0.5639 Model


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1130(0.1576) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 10m 23s) Loss: 0.1730(0.1730) Grad: inf  LR: 0.00001951  
Epoch: [2][20/366] Elapsed 0m 29s (remain 8m 9s) Loss: 0.0880(0.1376) Grad: 77814.2734  LR: 0.00001946  
Epoch: [2][40/366] Elapsed 0m 57s (remain 7m 38s) Loss: 0.1317(0.1218) Grad: inf  LR: 0.00001940  
Epoch: [2][60/366] Elapsed 1m 25s (remain 7m 8s) Loss: 0.0888(0.1159) Grad: 86337.3672  LR: 0.00001934  
Epoch: [2][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.0714(0.1106) Grad: 105663.3672  LR: 0.00001928  
Epoch: [2][100/366] Elapsed 2m 21s (remain 6m 12s) Loss: 0.0848(0.1082) Grad: 51274.5391  LR: 0.00001921  
Epoch: [2][120/366] Elapsed 2m 50s (remain 5m 44s) Loss: 0.1292(0.1087) Grad: 100463.7578  LR: 0.00001914  
Epoch: [2][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.1460(0.1071) Grad: inf  LR: 0.00001907  
Epoch: [2][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.1042(0.1092) Grad: 26278.2754  LR: 0.00001900 

Epoch 2 - avg_train_loss: 0.1137  avg_val_loss: 0.1095  time: 622s
Epoch 2 - Score: 0.4695  Scores: [0.493610971032976, 0.450454632871778, 0.443137895055705, 0.48031916700421895, 0.4813843941534646, 0.4681586590369438]
Epoch 2 - Save Best Score: 0.4695 Model


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0724(0.1095) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 10m 25s) Loss: 0.0874(0.0874) Grad: 151401.7188  LR: 0.00001809  
Epoch: [3][20/366] Elapsed 0m 29s (remain 8m 3s) Loss: 0.1295(0.1064) Grad: 53860.0664  LR: 0.00001799  
Epoch: [3][40/366] Elapsed 0m 57s (remain 7m 39s) Loss: 0.0756(0.0968) Grad: 30016.6914  LR: 0.00001789  
Epoch: [3][60/366] Elapsed 1m 25s (remain 7m 8s) Loss: 0.0847(0.0918) Grad: 32932.3125  LR: 0.00001778  
Epoch: [3][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.0732(0.0903) Grad: 24621.1074  LR: 0.00001767  
Epoch: [3][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.0688(0.0880) Grad: 32202.3750  LR: 0.00001756  
Epoch: [3][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0869(0.0888) Grad: 33222.3828  LR: 0.00001745  
Epoch: [3][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0709(0.0882) Grad: 22398.7754  LR: 0.00001733  
Epoch: [3][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0824(0.0880) Grad: 18347.2

Epoch 3 - avg_train_loss: 0.0848  avg_val_loss: 0.1050  time: 622s
Epoch 3 - Score: 0.4597  Scores: [0.4868640387053845, 0.45240426994255234, 0.42607106540289724, 0.4617995508889472, 0.4802120612938904, 0.4511196012854824]
Epoch 3 - Save Best Score: 0.4597 Model


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0323(0.1050) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 9m 12s) Loss: 0.0535(0.0535) Grad: inf  LR: 0.00001588  
Epoch: [4][20/366] Elapsed 0m 29s (remain 8m 8s) Loss: 0.0546(0.0649) Grad: 121907.4531  LR: 0.00001574  
Epoch: [4][40/366] Elapsed 0m 58s (remain 7m 42s) Loss: 0.0502(0.0624) Grad: 84719.1328  LR: 0.00001560  
Epoch: [4][60/366] Elapsed 1m 26s (remain 7m 10s) Loss: 0.0800(0.0613) Grad: 56268.2539  LR: 0.00001546  
Epoch: [4][80/366] Elapsed 1m 54s (remain 6m 41s) Loss: 0.0819(0.0606) Grad: 56703.7930  LR: 0.00001532  
Epoch: [4][100/366] Elapsed 2m 22s (remain 6m 13s) Loss: 0.0811(0.0609) Grad: 55715.3203  LR: 0.00001517  
Epoch: [4][120/366] Elapsed 2m 50s (remain 5m 44s) Loss: 0.0429(0.0614) Grad: 42236.7266  LR: 0.00001502  
Epoch: [4][140/366] Elapsed 3m 18s (remain 5m 16s) Loss: 0.0719(0.0623) Grad: 35396.0234  LR: 0.00001487  
Epoch: [4][160/366] Elapsed 3m 46s (remain 4m 48s) Loss: 0.0569(0.0618) Grad: 44492.8906  LR

Epoch 4 - avg_train_loss: 0.0626  avg_val_loss: 0.1111  time: 623s
Epoch 4 - Score: 0.4730  Scores: [0.5197856527185196, 0.456761046670412, 0.4249485970827994, 0.4722005372004021, 0.4857987906643233, 0.47877073677698095]


EVAL: [61/62] Elapsed 1m 49s (remain 0m 0s) Loss: 0.0419(0.1111) 
Epoch: [5][0/366] Elapsed 0m 1s (remain 9m 1s) Loss: 0.0632(0.0632) Grad: inf  LR: 0.00001310  
Epoch: [5][20/366] Elapsed 0m 29s (remain 8m 2s) Loss: 0.0291(0.0519) Grad: 44647.5781  LR: 0.00001294  
Epoch: [5][40/366] Elapsed 0m 57s (remain 7m 34s) Loss: 0.0464(0.0506) Grad: 55791.2109  LR: 0.00001277  
Epoch: [5][60/366] Elapsed 1m 25s (remain 7m 7s) Loss: 0.0479(0.0508) Grad: 63710.7578  LR: 0.00001261  
Epoch: [5][80/366] Elapsed 1m 53s (remain 6m 39s) Loss: 0.0429(0.0513) Grad: 57836.0508  LR: 0.00001244  
Epoch: [5][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.0425(0.0512) Grad: 22537.8398  LR: 0.00001228  
Epoch: [5][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0520(0.0508) Grad: 27682.2129  LR: 0.00001211  
Epoch: [5][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0463(0.0507) Grad: 34159.8008  LR: 0.00001194  
Epoch: [5][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0514(0.0504) Grad: 56430.4844  LR: 0

Epoch 5 - avg_train_loss: 0.0496  avg_val_loss: 0.1067  time: 622s
Epoch 5 - Score: 0.4634  Scores: [0.4989622939005234, 0.45782037630429895, 0.42183483624793033, 0.4597552744422284, 0.4886923856270761, 0.4531969485195886]


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0290(0.1067) 
Epoch: [6][0/366] Elapsed 0m 1s (remain 10m 28s) Loss: 0.0225(0.0225) Grad: 163049.4844  LR: 0.00001002  
Epoch: [6][20/366] Elapsed 0m 29s (remain 8m 6s) Loss: 0.0471(0.0378) Grad: 87018.1094  LR: 0.00000985  
Epoch: [6][40/366] Elapsed 0m 57s (remain 7m 38s) Loss: 0.0279(0.0378) Grad: 74409.3672  LR: 0.00000967  
Epoch: [6][60/366] Elapsed 1m 25s (remain 7m 7s) Loss: 0.0409(0.0365) Grad: 106645.0078  LR: 0.00000950  
Epoch: [6][80/366] Elapsed 1m 53s (remain 6m 40s) Loss: 0.0412(0.0368) Grad: 89586.8672  LR: 0.00000933  
Epoch: [6][100/366] Elapsed 2m 21s (remain 6m 11s) Loss: 0.0342(0.0372) Grad: 43562.3555  LR: 0.00000916  
Epoch: [6][120/366] Elapsed 2m 49s (remain 5m 43s) Loss: 0.0342(0.0371) Grad: 61801.6641  LR: 0.00000899  
Epoch: [6][140/366] Elapsed 3m 17s (remain 5m 15s) Loss: 0.0319(0.0374) Grad: 51889.4688  LR: 0.00000882  
Epoch: [6][160/366] Elapsed 3m 45s (remain 4m 47s) Loss: 0.0291(0.0377) Grad: 59412.

Epoch 6 - avg_train_loss: 0.0369  avg_val_loss: 0.1109  time: 623s
Epoch 6 - Score: 0.4720  Scores: [0.5198984281268286, 0.45612299789337707, 0.422015262156808, 0.46082040577053063, 0.5148639641504099, 0.4580126219050994]


EVAL: [61/62] Elapsed 1m 48s (remain 0m 0s) Loss: 0.0251(0.1109) 


Score: 0.4597  Scores: [0.4868640387053845, 0.45240426994255234, 0.42607106540289724, 0.4617995508889472, 0.4802120612938904, 0.4511196012854824]
Score: 0.4677  Scores: [0.49749285995577996, 0.4563276076631515, 0.42805717777669, 0.4719590527102862, 0.4917965632960613, 0.4605687114477619]


In [18]:
print(OUTPUT_DIR)

/home/jupyter/output/ex/xlm-xlm-roberta-large-squad2/001/202211111601/


In [19]:
# import os
# OUTPUT_DIR = "/home/jupyter/output/ex/roberta-base/004/202210151736/"
# PREDICT_DIR = OUTPUT_DIR.replace("output","predict")
# if not os.path.exists(PREDICT_DIR):
#     os.makedirs(PREDICT_DIR)


In [20]:
if CFG.TO_KAGGLE:
    UPLOAD_DIR = OUTPUT_DIR
    EX_NO = f"{model_name}-{CFG.file_name}" # 実験番号などを入れる、folderのpathにする
    USERID = 'your_id'


    def dataset_upload():
        import json
        from kaggle.api.kaggle_api_extended import KaggleApi

        id = f'{USERID}/{EX_NO}'

        dataset_metadata = {}
        dataset_metadata['id'] = id
        dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
        dataset_metadata['title'] = f'{EX_NO}'

        with open(UPLOAD_DIR +'dataset-metadata.json', 'w') as f:
            json.dump(dataset_metadata, f, indent=4)

        api = KaggleApi()
        api.authenticate()

        # データセットがない場合
        if f'{USERID}/{EX_NO}' not in [str(d) for d in api.dataset_list(user=USERID, search=f'"{EX_NO}"')]:
            api.dataset_create_new(folder=UPLOAD_DIR,
                                   convert_to_csv=False,
                                   dir_mode='skip')
            
            
             #フォルダーを削除
            if f'{USERID}/{EX_NO}' not in [str(d) for d in api.dataset_list(user=USERID, search=f'"{EX_NO}"')]:
                remove_files = glob.glob(OUTPUT_DIR+"*")
                remove_files.remove(OUTPUT_DIR+"oof_df.pkl")
                for file in remove_files:
                    os.remove(file)
                print("folder upload")
                            #apiコマンドを書き込む
                f = open(f'{model_name}_api_command.txt', 'a')
                api_command = f"!kaggle datasets download -d hiroki8383/{EX_NO}\n"
                f.write(api_command)
                f.close()
            else:
                print("folder not upload")
            
            
        # データセットがある場合→更新されない場合がある（後で原因追及)
        else:
            print("this folder exsits")
            # api.dataset_create_version(folder=UPLOAD_DIR,
            #                            version_notes='update',
            #                            convert_to_csv=False,
            #                            delete_old_versions=False,
            #                            dir_mode='zip')

        

        
    dataset_upload()

Starting upload for file config.pth


100% 2.48k/2.48k [00:02<00:00, 1.09kB/s]


Upload successful: config.pth (2KB)
Starting upload for file xlm-xlm-roberta-large-squad2_fold1_best.pth


100% 2.09G/2.09G [00:46<00:00, 48.2MB/s]


Upload successful: xlm-xlm-roberta-large-squad2_fold1_best.pth (2GB)
Starting upload for file xlm-xlm-roberta-large-squad2_fold0_best.pth


100% 2.09G/2.09G [00:51<00:00, 43.8MB/s]


Upload successful: xlm-xlm-roberta-large-squad2_fold0_best.pth (2GB)
Starting upload for file class_cfg


100% 19.0/19.0 [00:02<00:00, 8.65B/s]


Upload successful: class_cfg (19B)
Starting upload for file oof_df.pkl


100% 9.09M/9.09M [00:02<00:00, 3.23MB/s]


Upload successful: oof_df.pkl (9MB)
Starting upload for file xlm-xlm-roberta-large-squad2_fold3_best.pth


100% 2.09G/2.09G [00:59<00:00, 37.5MB/s]


Upload successful: xlm-xlm-roberta-large-squad2_fold3_best.pth (2GB)
Starting upload for file train.log


100% 10.4k/10.4k [00:02<00:00, 5.20kB/s]


Upload successful: train.log (10KB)
Starting upload for file dict_cfg


100% 8.66M/8.66M [00:02<00:00, 3.25MB/s]


Upload successful: dict_cfg (9MB)
Starting upload for file xlm-xlm-roberta-large-squad2_fold2_best.pth


100% 2.09G/2.09G [01:06<00:00, 33.4MB/s]


Upload successful: xlm-xlm-roberta-large-squad2_fold2_best.pth (2GB)
folder upload


In [21]:
if not CFG.DEBUG:
    def to_write_score(CFG):
        df = pd.read_csv(CFG.score_path)
        def get_result2(oof_df):
                labels = oof_df[CFG.target_cols].values
                preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
                score, scores = get_score(labels, preds)
                LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
                return score,scores

        score,scores = get_result2(oof_df)
        name = "-".join(OUTPUT_DIR.split("/")[-4:-2])
        base = {"name":name,"score":score,"memo":CFG.MEMO} 
        base.update(dict(zip(CFG.target_cols,scores)))
        df = df.append(base,ignore_index=True)
        df.to_csv(CFG.score_path,index=False)
    to_write_score(CFG)

Score: 0.4677  Scores: [0.49749285995577996, 0.4563276076631515, 0.42805717777669, 0.4719590527102862, 0.4917965632960613, 0.4605687114477619]


In [22]:
df = pd.read_csv(CFG.score_path)
df

Unnamed: 0,name,memo,LB,score,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,base-discriminator-001,,0.46,0.466957,0.501431,0.45546,0.428477,0.46569,0.485904,0.46478
1,base-discriminator-002,,0.45,0.474078,0.504413,0.462707,0.432588,0.469546,0.500918,0.474296
2,deberta-v3-base-003,2epochs,,0.459342,0.496531,0.450482,0.419448,0.457413,0.476347,0.455832
3,deberta-v3-base-003,6epochs,0.44,0.456879,0.488333,0.448687,0.417134,0.458769,0.479633,0.448721
4,deberta-v3-base-006,15folds,,0.463635,0.47965,0.47591,0.442828,0.463557,0.456412,0.463457
5,deberta-v3-base-006,20folds,,0.449212,0.475585,0.433808,0.415852,0.457436,0.484686,0.427907
6,deberta-v3-base-006,10folds,,0.446338,0.490661,0.435001,0.40946,0.417236,0.4818,0.443871
7,distilroberta-base-001,4folds,,0.470528,0.501629,0.459765,0.42674,0.473205,0.501976,0.45985
8,distilroberta-base-002,"バッチ16,4folds",,0.471035,0.503345,0.458993,0.427209,0.471235,0.50485,0.46058
9,microsoft-deberta-v3-large-001,,0.46,0.450311,0.482961,0.445596,0.410873,0.449346,0.466162,0.446928
