In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Fri Feb 17 11:37:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    27W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = True
    freezing = True
    clean_content = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [6]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP008/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    return recall_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.70, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["abstract"]  

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 1 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:05<00:00, 953.56it/s]
max_len: 510
INFO:__main__:max_len: 510


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [16]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [18]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/233] Elapsed 0m 2s (remain 10m 1s) Loss: 0.7410(0.7410) Grad: nan  LR: 0.00002000  
Epoch: [1][100/233] Elapsed 1m 9s (remain 1m 30s) Loss: 0.5555(0.6317) Grad: 21297.5820  LR: 0.00001943  
Epoch: [1][200/233] Elapsed 2m 18s (remain 0m 22s) Loss: 0.5767(0.6316) Grad: 3764.7456  LR: 0.00001779  
Epoch: [1][232/233] Elapsed 2m 41s (remain 0m 0s) Loss: 0.5402(0.6301) Grad: 9779.7490  LR: 0.00001707  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 43s) Loss: 0.3859(0.3859) 


Epoch 1 - avg_train_loss: 0.6301  avg_val_loss: 0.5966  time: 197s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6301  avg_val_loss: 0.5966  time: 197s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 0.9864(0.5966) 
Epoch: [2][0/233] Elapsed 0m 1s (remain 4m 29s) Loss: 0.6262(0.6262) Grad: nan  LR: 0.00001705  
Epoch: [2][100/233] Elapsed 1m 9s (remain 1m 31s) Loss: 0.5775(0.5759) Grad: 119631.0391  LR: 0.00001430  
Epoch: [2][200/233] Elapsed 2m 17s (remain 0m 21s) Loss: 0.3747(0.5796) Grad: 81616.5781  LR: 0.00001108  
Epoch: [2][232/233] Elapsed 2m 40s (remain 0m 0s) Loss: 0.5688(0.5772) Grad: 25629.9531  LR: 0.00001000  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 48s) Loss: 0.3726(0.3726) 


Epoch 2 - avg_train_loss: 0.5772  avg_val_loss: 0.5828  time: 196s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5772  avg_val_loss: 0.5828  time: 196s
Epoch 2 - Score: 0.1890
INFO:__main__:Epoch 2 - Score: 0.1890
Epoch 2 - Save Best Score: 0.1890 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.1890 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 0.8877(0.5828) 
Epoch: [3][0/233] Elapsed 0m 0s (remain 2m 38s) Loss: 0.5681(0.5681) Grad: nan  LR: 0.00000997  
Epoch: [3][100/233] Elapsed 1m 8s (remain 1m 29s) Loss: 0.6951(0.5540) Grad: 114350.2500  LR: 0.00000666  
Epoch: [3][200/233] Elapsed 2m 15s (remain 0m 21s) Loss: 0.5343(0.5469) Grad: 42350.7969  LR: 0.00000373  
Epoch: [3][232/233] Elapsed 2m 37s (remain 0m 0s) Loss: 0.4389(0.5459) Grad: 31904.7090  LR: 0.00000293  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 46s) Loss: 0.3091(0.3091) 


Epoch 3 - avg_train_loss: 0.5459  avg_val_loss: 0.5735  time: 193s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5459  avg_val_loss: 0.5735  time: 193s
Epoch 3 - Score: 0.2336
INFO:__main__:Epoch 3 - Score: 0.2336
Epoch 3 - Save Best Score: 0.2336 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.2336 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 0.9929(0.5735) 
Epoch: [4][0/233] Elapsed 0m 0s (remain 3m 27s) Loss: 0.3409(0.3409) Grad: nan  LR: 0.00000291  
Epoch: [4][100/233] Elapsed 1m 8s (remain 1m 29s) Loss: 0.5924(0.5211) Grad: 109963.3047  LR: 0.00000097  
Epoch: [4][200/233] Elapsed 2m 16s (remain 0m 21s) Loss: 0.3779(0.5160) Grad: 74852.4219  LR: 0.00000006  
Epoch: [4][232/233] Elapsed 2m 38s (remain 0m 0s) Loss: 0.6802(0.5146) Grad: 47224.1406  LR: 0.00000000  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 41s) Loss: 0.3131(0.3131) 


Epoch 4 - avg_train_loss: 0.5146  avg_val_loss: 0.5749  time: 194s
INFO:__main__:Epoch 4 - avg_train_loss: 0.5146  avg_val_loss: 0.5749  time: 194s
Epoch 4 - Score: 0.2756
INFO:__main__:Epoch 4 - Score: 0.2756
Epoch 4 - Save Best Score: 0.2756 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.2756 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 0.9760(0.5749) 


Score: 0.2756
INFO:__main__:Score: 0.2756
ACC BEST Score: 0.7010
INFO:__main__:ACC BEST Score: 0.7010
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/233] Elapsed 0m 0s (remain 3m 26s) Loss: 0.4535(0.4535) Grad: nan  LR: 0.00002000  
Epoch: [1][100/233] Elapsed 1m 13s (remain 1m 36s) Loss: 0.3777(0.5890) Grad: 39964.2422  LR: 0.00001943  
Epoch: [1][200/233] Elapsed 2m 20s (remain 0m 22s) Loss: 0.5255(0.6085) Grad: 98355.2969  LR: 0.00001779  
Epoch: [1][232/233] Elapsed 2m 42s (remain 0m 0s) Loss: 0.7323(0.6078) Grad: 50410.7617  LR: 0.00001707  
EVAL: [0/39] Elapsed 0m 0s (remain 0m 37s) Loss: 0.4024(0.4024) 


Epoch 1 - avg_train_loss: 0.6078  avg_val_loss: 0.5812  time: 198s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6078  avg_val_loss: 0.5812  time: 198s
Epoch 1 - Score: 0.1470
INFO:__main__:Epoch 1 - Score: 0.1470
Epoch 1 - Save Best Score: 0.1470 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.1470 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 0.9514(0.5812) 
Epoch: [2][0/233] Elapsed 0m 0s (remain 3m 3s) Loss: 0.4653(0.4653) Grad: nan  LR: 0.00001705  
Epoch: [2][100/233] Elapsed 1m 10s (remain 1m 31s) Loss: 0.5717(0.5768) Grad: 18688.6914  LR: 0.00001430  
Epoch: [2][200/233] Elapsed 2m 18s (remain 0m 22s) Loss: 0.5658(0.5604) Grad: 25413.3203  LR: 0.00001108  
Epoch: [2][232/233] Elapsed 2m 39s (remain 0m 0s) Loss: 0.4693(0.5583) Grad: 20823.5098  LR: 0.00001000  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 39s) Loss: 0.3138(0.3138) 


Epoch 2 - avg_train_loss: 0.5583  avg_val_loss: 0.5651  time: 195s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5583  avg_val_loss: 0.5651  time: 195s
Epoch 2 - Score: 0.1864
INFO:__main__:Epoch 2 - Score: 0.1864
Epoch 2 - Save Best Score: 0.1864 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.1864 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 1.0769(0.5651) 
Epoch: [3][0/233] Elapsed 0m 0s (remain 3m 49s) Loss: 0.5373(0.5373) Grad: nan  LR: 0.00000997  
Epoch: [3][100/233] Elapsed 1m 8s (remain 1m 29s) Loss: 0.4255(0.4499) Grad: 65322.8398  LR: 0.00000666  
Epoch: [3][200/233] Elapsed 2m 18s (remain 0m 22s) Loss: 0.5074(0.4358) Grad: 38324.3281  LR: 0.00000373  
Epoch: [3][232/233] Elapsed 2m 41s (remain 0m 0s) Loss: 0.3616(0.4288) Grad: 40876.0156  LR: 0.00000293  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 45s) Loss: 0.3138(0.3138) 


Epoch 3 - avg_train_loss: 0.4288  avg_val_loss: 0.6327  time: 197s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4288  avg_val_loss: 0.6327  time: 197s
Epoch 3 - Score: 0.2861
INFO:__main__:Epoch 3 - Score: 0.2861
Epoch 3 - Save Best Score: 0.2861 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.2861 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 1.3709(0.6327) 
Epoch: [4][0/233] Elapsed 0m 0s (remain 2m 58s) Loss: 0.6487(0.6487) Grad: nan  LR: 0.00000291  
Epoch: [4][100/233] Elapsed 1m 12s (remain 1m 34s) Loss: 0.3263(0.3228) Grad: 125053.6719  LR: 0.00000097  
Epoch: [4][200/233] Elapsed 2m 21s (remain 0m 22s) Loss: 0.2514(0.3291) Grad: 125789.1562  LR: 0.00000006  
Epoch: [4][232/233] Elapsed 2m 43s (remain 0m 0s) Loss: 0.2213(0.3323) Grad: 77729.8359  LR: 0.00000000  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 38s) Loss: 0.4186(0.4186) 


Epoch 4 - avg_train_loss: 0.3323  avg_val_loss: 0.6561  time: 200s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3323  avg_val_loss: 0.6561  time: 200s
Epoch 4 - Score: 0.3386
INFO:__main__:Epoch 4 - Score: 0.3386
Epoch 4 - Save Best Score: 0.3386 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.3386 Model


EVAL: [38/39] Elapsed 0m 35s (remain 0m 0s) Loss: 1.3009(0.6561) 


Score: 0.3386
INFO:__main__:Score: 0.3386
ACC BEST Score: 0.6977
INFO:__main__:ACC BEST Score: 0.6977
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/233] Elapsed 0m 1s (remain 3m 53s) Loss: 0.5407(0.5407) Grad: nan  LR: 0.00002000  
Epoch: [1][100/233] Elapsed 1m 7s (remain 1m 28s) Loss: 0.5808(0.6125) Grad: 15518.9424  LR: 0.00001943  
Epoch: [1][200/233] Elapsed 2m 19s (remain 0m 22s) Loss: 0.7715(0.6026) Grad: 29704.2734  LR: 0.00001779  
Epoch: [1][232/233] Elapsed 2m 42s (remain 0m 0s) Loss: 0.6249(0.6018) Grad: 46011.4336  LR: 0.00001707  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 43s) Loss: 0.3131(0.3131) 


Epoch 1 - avg_train_loss: 0.6018  avg_val_loss: 0.5761  time: 197s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6018  avg_val_loss: 0.5761  time: 197s
Epoch 1 - Score: 0.0816
INFO:__main__:Epoch 1 - Score: 0.0816
Epoch 1 - Save Best Score: 0.0816 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0816 Model


EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.0675(0.5761) 
Epoch: [2][0/233] Elapsed 0m 0s (remain 3m 31s) Loss: 0.4608(0.4608) Grad: nan  LR: 0.00001705  
Epoch: [2][100/233] Elapsed 1m 11s (remain 1m 33s) Loss: 0.4768(0.5441) Grad: 23713.4082  LR: 0.00001430  
Epoch: [2][200/233] Elapsed 2m 19s (remain 0m 22s) Loss: 0.4297(0.5359) Grad: 25368.1367  LR: 0.00001108  
Epoch: [2][232/233] Elapsed 2m 42s (remain 0m 0s) Loss: 0.4422(0.5326) Grad: 28174.6172  LR: 0.00001000  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 50s) Loss: 0.2874(0.2874) 


Epoch 2 - avg_train_loss: 0.5326  avg_val_loss: 0.5934  time: 198s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5326  avg_val_loss: 0.5934  time: 198s
Epoch 2 - Score: 0.1789
INFO:__main__:Epoch 2 - Score: 0.1789
Epoch 2 - Save Best Score: 0.1789 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.1789 Model


EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.1446(0.5934) 
Epoch: [3][0/233] Elapsed 0m 1s (remain 3m 54s) Loss: 0.4257(0.4257) Grad: nan  LR: 0.00000997  
Epoch: [3][100/233] Elapsed 1m 11s (remain 1m 33s) Loss: 0.3776(0.4040) Grad: 82855.9297  LR: 0.00000666  
Epoch: [3][200/233] Elapsed 2m 21s (remain 0m 22s) Loss: 0.4271(0.3940) Grad: 103386.6562  LR: 0.00000373  
Epoch: [3][232/233] Elapsed 2m 43s (remain 0m 0s) Loss: 0.5305(0.3940) Grad: 242525.0469  LR: 0.00000293  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 44s) Loss: 0.2952(0.2952) 


Epoch 3 - avg_train_loss: 0.3940  avg_val_loss: 0.6573  time: 199s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3940  avg_val_loss: 0.6573  time: 199s
Epoch 3 - Score: 0.2763
INFO:__main__:Epoch 3 - Score: 0.2763
Epoch 3 - Save Best Score: 0.2763 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.2763 Model


EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.2888(0.6573) 
Epoch: [4][0/233] Elapsed 0m 1s (remain 6m 26s) Loss: 0.1437(0.1437) Grad: nan  LR: 0.00000291  
Epoch: [4][100/233] Elapsed 1m 10s (remain 1m 32s) Loss: 0.3462(0.3114) Grad: 198414.8906  LR: 0.00000097  
Epoch: [4][200/233] Elapsed 2m 19s (remain 0m 22s) Loss: 0.1887(0.3157) Grad: 88125.3828  LR: 0.00000006  
Epoch: [4][232/233] Elapsed 2m 43s (remain 0m 0s) Loss: 0.2961(0.3146) Grad: 193300.7500  LR: 0.00000000  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 43s) Loss: 0.3672(0.3672) 


Epoch 4 - avg_train_loss: 0.3146  avg_val_loss: 0.6524  time: 198s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3146  avg_val_loss: 0.6524  time: 198s
Epoch 4 - Score: 0.3526
INFO:__main__:Epoch 4 - Score: 0.3526
Epoch 4 - Save Best Score: 0.3526 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.3526 Model


EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.1115(0.6524) 


Score: 0.3526
INFO:__main__:Score: 0.3526
ACC BEST Score: 0.7031
INFO:__main__:ACC BEST Score: 0.7031
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/233] Elapsed 0m 0s (remain 3m 27s) Loss: 0.6811(0.6811) Grad: nan  LR: 0.00002000  
Epoch: [1][100/233] Elapsed 1m 10s (remain 1m 32s) Loss: 0.6250(0.6160) Grad: 44052.2539  LR: 0.00001943  
Epoch: [1][200/233] Elapsed 2m 19s (remain 0m 22s) Loss: 0.4652(0.6089) Grad: 7546.2681  LR: 0.00001779  
Epoch: [1][232/233] Elapsed 2m 42s (remain 0m 0s) Loss: 0.2773(0.6067) Grad: 62322.0625  LR: 0.00001707  
EVAL: [0/39] Elapsed 0m 0s (remain 0m 37s) Loss: 0.3129(0.3129) 


Epoch 1 - avg_train_loss: 0.6067  avg_val_loss: 0.5856  time: 197s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6067  avg_val_loss: 0.5856  time: 197s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.1358(0.5856) 
Epoch: [2][0/233] Elapsed 0m 1s (remain 3m 56s) Loss: 0.4583(0.4583) Grad: nan  LR: 0.00001705  
Epoch: [2][100/233] Elapsed 1m 11s (remain 1m 33s) Loss: 0.4172(0.5228) Grad: 134501.1406  LR: 0.00001430  
Epoch: [2][200/233] Elapsed 2m 20s (remain 0m 22s) Loss: 0.5277(0.5343) Grad: 24343.8125  LR: 0.00001108  
Epoch: [2][232/233] Elapsed 2m 42s (remain 0m 0s) Loss: 0.7897(0.5333) Grad: 74955.5078  LR: 0.00001000  
EVAL: [0/39] Elapsed 0m 0s (remain 0m 37s) Loss: 0.1678(0.1678) 
EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.7197(0.6625) 


Epoch 2 - avg_train_loss: 0.5333  avg_val_loss: 0.6625  time: 197s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5333  avg_val_loss: 0.6625  time: 197s
Epoch 2 - Score: 0.0551
INFO:__main__:Epoch 2 - Score: 0.0551
Epoch 2 - Save Best Score: 0.0551 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.0551 Model


Epoch: [3][0/233] Elapsed 0m 0s (remain 3m 28s) Loss: 0.6393(0.6393) Grad: nan  LR: 0.00000997  
Epoch: [3][100/233] Elapsed 1m 10s (remain 1m 31s) Loss: 0.3555(0.4274) Grad: 53954.7852  LR: 0.00000666  
Epoch: [3][200/233] Elapsed 2m 20s (remain 0m 22s) Loss: 0.1995(0.4265) Grad: 32362.3633  LR: 0.00000373  
Epoch: [3][232/233] Elapsed 2m 42s (remain 0m 0s) Loss: 0.3405(0.4194) Grad: 51666.8164  LR: 0.00000293  
EVAL: [0/39] Elapsed 0m 1s (remain 0m 38s) Loss: 0.2946(0.2946) 


Epoch 3 - avg_train_loss: 0.4194  avg_val_loss: 0.6550  time: 197s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4194  avg_val_loss: 0.6550  time: 197s
Epoch 3 - Score: 0.3018
INFO:__main__:Epoch 3 - Score: 0.3018
Epoch 3 - Save Best Score: 0.3018 Model


EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.3868(0.6550) 


INFO:__main__:Epoch 3 - Save Best Score: 0.3018 Model


Epoch: [4][0/233] Elapsed 0m 0s (remain 3m 39s) Loss: 0.4198(0.4198) Grad: nan  LR: 0.00000291  
Epoch: [4][100/233] Elapsed 1m 9s (remain 1m 30s) Loss: 0.2871(0.2884) Grad: 76030.2266  LR: 0.00000097  
Epoch: [4][200/233] Elapsed 2m 18s (remain 0m 22s) Loss: 0.3669(0.2877) Grad: 250164.0156  LR: 0.00000006  
Epoch: [4][232/233] Elapsed 2m 42s (remain 0m 0s) Loss: 0.2080(0.2863) Grad: 132604.0156  LR: 0.00000000  
EVAL: [0/39] Elapsed 0m 0s (remain 0m 37s) Loss: 0.4013(0.4013) 


Epoch 4 - avg_train_loss: 0.2863  avg_val_loss: 0.6775  time: 197s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2863  avg_val_loss: 0.6775  time: 197s
Epoch 4 - Score: 0.4094
INFO:__main__:Epoch 4 - Score: 0.4094
Epoch 4 - Save Best Score: 0.4094 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4094 Model


EVAL: [38/39] Elapsed 0m 34s (remain 0m 0s) Loss: 1.2266(0.6775) 


Score: 0.4094
INFO:__main__:Score: 0.4094
ACC BEST Score: 0.7056
INFO:__main__:ACC BEST Score: 0.7056
Score: 0.3441
INFO:__main__:Score: 0.3441
ACC BEST Score: 0.6990
INFO:__main__:ACC BEST Score: 0.6990


In [19]:
from google.colab import runtime
runtime.unassign()