# About this notebook
- [Luke](https://arxiv.org/pdf/2010.01057v1.pdf)-base starter notebook
- [Inference notebook](https://www.kaggle.com/yasufuminakama/jigsaw4-luke-base-starter-sub)
- Approach References
    - https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/286471
    - https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter
    - https://www.kaggle.com/debarshichanda/0-816-jigsaw-inference
    - Thanks for sharing @debarshichanda

# Directory settings

In [22]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './output/jigsaw_server_luke/luke_folder/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [23]:
# ====================================================
# CFG
# ====================================================
class CFG:
    competition='Jigsaw4'
    _wandb_kernel='nakama'
    apex=True
    print_freq=50
    num_workers=4
    model="../model/luke_base"
    model_name="studio-ousia/luke-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    encoder_lr=1e-4
    decoder_lr=1e-4
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=64
    fc_dropout=0.
    text="text"
    target="target"
    target_size=1
    head=32
    tail=32
    max_len=head+tail
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    margin=0.4
    seed=42
    
    
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    debug=False
    epochs=2

    translate_aug=False
    translate_path="../input/translatetoxic/comment_translation.csv"

In [24]:
HASH_NAME="lukemodel origin margin=0.  "
if CFG.debug==True:
    HASH_NAME="lukemodel debug "

# translate_text=["text_fr","text_de","text_es"]
translate_text=["text_fr","text_de"]

In [25]:
# ====================================================
# wandb
# ====================================================
import wandb

try:
#     from kaggle_secrets import UserSecretsClient
#     user_secrets = UserSecretsClient()
#     secret_value_0 = user_secrets.get_secret("wandb_api")
#     if CFG.debug==False:
#         wandb.login(key="ebe051612bfb733306f4e4b5df4b043050ebea6e")
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

    
def class2dict(f):
    return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))



# Library

In [26]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -q transformers -y')
# os.system('pip uninstall -q tokenizers -y')
# os.system('pip uninstall -q huggingface_hub -y')

# os.system('mkdir -p /tmp/pip/cache-tokenizers/')
# os.system('cp ../input/tokenizers-0103/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl /tmp/pip/cache-tokenizers/')
# os.system('pip install -q --no-index --find-links /tmp/pip/cache-tokenizers/ tokenizers')

# os.system('mkdir -p /tmp/pip/cache-huggingface-hub/')
# os.system('cp ../input/huggingface-hub-008/huggingface_hub-0.0.8-py3-none-any.whl /tmp/pip/cache-huggingface-hub/')
# os.system('pip install -q --no-index --find-links /tmp/pip/cache-huggingface-hub/ huggingface_hub')

# os.system('mkdir -p /tmp/pip/cache-transformers/')
# os.system('cp ../input/transformers-470/transformers-4.7.0-py3-none-any.whl /tmp/pip/cache-transformers/')
# os.system('pip install -q --no-index --find-links /tmp/pip/cache-transformers/ transformers')

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import LukeTokenizer, LukeModel, LukeConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.10.3
transformers.__version__: 4.15.0


# Utils

In [27]:
# ====================================================
# Utils
# ====================================================
def get_score(df):
    score = len(df[df['less_toxic_pred'] < df['more_toxic_pred']]) / len(df)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [28]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('./input/jigsaw-toxic-severity-rating/validation_data.csv')

from sklearn.preprocessing import LabelEncoder
def generate_comments(data):
    more_toxic_text=data["more_toxic"].values
    less_toxic_text=data["less_toxic"].values    
    comments=np.concatenate((more_toxic_text,less_toxic_text))
    comments=np.unique(comments)
    comments=pd.DataFrame({"text":comments})
    text_encoder=LabelEncoder()
    text_encoder.fit(comments)
    comments["encode_text"]=text_encoder.transform(comments["text"])
    comments["toxic_value"]=0
    comments["access_time"]=0
    data["encode_less"]=text_encoder.transform(data["less_toxic"])
    data["encode_more"]=text_encoder.transform(data["more_toxic"])
    
    return data,comments
train,comments=generate_comments(train)
if CFG.debug:
    train = train.sample(n=100, random_state=CFG.seed).reset_index(drop=True)
test = pd.read_csv('./input/jigsaw-toxic-severity-rating/comments_to_score.csv')
submission = pd.read_csv('./input/jigsaw-toxic-severity-rating/sample_submission.csv')
print(train.shape)
print(test.shape, submission.shape)
display(train.head())
display(test.head())
display(submission.head())

(30108, 5)
(7537, 2) (7537, 2)


Unnamed: 0,worker,less_toxic,more_toxic,encode_less,encode_more
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,2405,12151
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,7215,653
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",2632,7222
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,7973,12968
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",3524,3266


Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


Unnamed: 0,comment_id,score
0,114890,0.5
1,732895,0.5
2,1139051,0.5
3,1434512,0.5
4,2084821,0.5


# CV split

In [29]:
from sklearn.model_selection import GroupKFold
class UnionFind():
    def __init__(self, n):
        self.n = n
        self.parents = [-1] * n

    def find(self, x):
        if self.parents[x] < 0:
            return x
        else:
            self.parents[x] = self.find(self.parents[x])
            return self.parents[x]

    def union(self, x, y):
        x = self.find(x)
        y = self.find(y)
        if x == y:
            return
        if self.parents[x] > self.parents[y]:
            x, y = y, x
        self.parents[x] += self.parents[y]
        self.parents[y] = x


def get_group_unionfind(train: pd.DataFrame):
    less_unique_text = train['less_toxic'].unique()
    more_unique_text = train['more_toxic'].unique()
    unique_text = np.hstack([less_unique_text, more_unique_text])
    unique_text = np.unique(unique_text).tolist()    
    text2num = {text: i for i, text in enumerate(unique_text)}
    num2text = {num: text for text, num in text2num.items()}
    train['num_less_toxic'] = train['less_toxic'].map(text2num)
    train['num_more_toxic'] = train['more_toxic'].map(text2num)

    uf = UnionFind(len(unique_text))
    for seq1, seq2 in train[['num_less_toxic', 'num_more_toxic']].to_numpy():
        uf.union(seq1, seq2)

    text2group = {num2text[i]: uf.find(i) for i in range(len(unique_text))}
    train['group'] = train['less_toxic'].map(text2group)
    train = train.drop(columns=['num_less_toxic', 'num_more_toxic'])
    return train

In [30]:
# ====================================================
# CV split
# ====================================================
# Fold = GroupKFold(n_splits=CFG.n_fold)
# for n, (trn_index, val_index) in enumerate(Fold.split(train, train, train['worker'])):
#     train.loc[val_index, 'fold'] = int(n)
# train['fold'] = train['fold'].astype(int)
# display(train.groupby('fold').size())

In [31]:
data_df=train.copy()
data_df = get_group_unionfind(data_df)
group_kfold = GroupKFold(n_splits=CFG.n_fold)
for fold, (trn_idx, val_idx) in enumerate(group_kfold.split(data_df, data_df, data_df['group'])): 
    data_df.loc[val_idx , "fold"] = fold

data_df["fold"] = data_df["fold"].astype(int)
train=data_df.copy()

In [32]:
display(train.groupby('fold').size())

fold
0    6022
1    6022
2    6022
3    6021
4    6021
dtype: int64

# tokenizer

In [33]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = LukeTokenizer.from_pretrained(CFG.model, lowercase=True)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

# Dataset

In [34]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(text, cfg):
    if cfg.tail == 0:
        inputs = cfg.tokenizer.encode_plus(text, 
                                           return_tensors=None, 
                                           add_special_tokens=True, 
                                           max_length=cfg.max_len,
                                           pad_to_max_length=True,
                                           truncation=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
    else:
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None, 
                                           add_special_tokens=True, 
                                           truncation=True)
        for k, v in inputs.items():
            v_length = len(v)
            if v_length > cfg.max_len:
                v = np.hstack([v[:cfg.head], v[-cfg.tail:]])
            if k == 'input_ids':
                new_v = np.ones(cfg.max_len) * cfg.tokenizer.pad_token_id
            else:
                new_v = np.zeros(cfg.max_len)
            new_v[:v_length] = v 
            inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.less_toxic = df['less_toxic'].fillna("none").values
        self.more_toxic = df['more_toxic'].fillna("none").values

    def __len__(self):
        return len(self.less_toxic)

    def __getitem__(self, item):
        less_toxic_inputs = prepare_input(str(self.less_toxic[item]), self.cfg)
        more_toxic_inputs = prepare_input(str(self.more_toxic[item]), self.cfg)
        label = torch.tensor(1, dtype=torch.float)
        return less_toxic_inputs, more_toxic_inputs, label


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df[cfg.text].fillna("none").values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        inputs = prepare_input(text, self.cfg)
        return inputs

# Model

In [35]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        self.config.update({"hidden_dropout_prob": 0.0,"attention_probs_dropout_prob":0.0})  
        if pretrained:
            self.model = LukeModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = LukeModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = torch.mean(last_hidden_states, 1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output


# class AttentionBlock(nn.Module):
#     def __init__(self, in_features, middle_features, out_features):
#         super().__init__()
#         self.in_features = in_features
#         self.middle_features = middle_features
#         self.out_features = out_features
#         self.W = nn.Linear(in_features, middle_features)
#         self.V = nn.Linear(middle_features, out_features)

#     def forward(self, features):
#         att = torch.tanh(self.W(features))
#         score = self.V(att)
#         attention_weights = torch.softmax(score, dim=1)
#         context_vector = attention_weights * features
#         context_vector = torch.sum(context_vector, dim=1)
#         return context_vector

# class CustomModel(nn.Module):
#     def __init__(self, cfg, config_path=None, pretrained=False):
#         super().__init__()
#         self.cfg = cfg
#         if config_path is None:
#             self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
#         else:
#             self.config = torch.load(config_path)
#         self.config.update({"hidden_dropout_prob": 0.0,"attention_probs_dropout_prob":0.0})  
#         if pretrained:
#             self.model = LukeModel.from_pretrained(cfg.model, config=self.config)
#         else:
#             self.model = LukeModel(self.config)
            
#         self.seq_attn_head = nn.Sequential(
#             nn.LayerNorm(self.config.hidden_size),
#             # nn.Dropout(0.1),
#             AttentionBlock(self.config.hidden_size, self.config.hidden_size, 1),
#             # nn.Linear(self.config.hidden_size, 2 if kl_loss else 1),
#         )
#         self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        
#     def feature(self, inputs):
#         outputs = self.model(**inputs)
#         last_hidden_states = outputs[0]
        
#         feature = self.seq_attn_head(last_hidden_states)
#         return feature

#     def forward(self, inputs):
#         feature = self.feature(inputs)
#         output = self.fc(feature)
#         return output
    
    


In [36]:
config = LukeConfig.from_pretrained(CFG.model, output_hidden_states=True)

# Helpler functions

In [37]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (less_toxic_inputs, more_toxic_inputs, labels) in enumerate(train_loader):
        for k, v in less_toxic_inputs.items():
            less_toxic_inputs[k] = v.to(device)
        for k, v in more_toxic_inputs.items():
            more_toxic_inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            less_toxic_y_preds = model(less_toxic_inputs)
            more_toxic_y_preds = model(more_toxic_inputs)
            loss = criterion(more_toxic_y_preds, less_toxic_y_preds, labels)
        losses.update(loss.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        wandb.log({f"loss": losses.val,
                   f"lr": scheduler.get_lr()[0]})
    return losses.avg


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [38]:
# ====================================================
# train loop
# ====================================================

def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index
    
    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    validation = folds.loc[val_idx].reset_index(drop=True)
    if CFG.translate_aug==True:
        
        df_train=train_folds.copy()
        comment_translation=pd.read_csv(CFG.translate_path)
        comment_translation=comment_translation.merge(comments,on="text",how="left")
        df_train_encode=df_train.drop(["less_toxic","more_toxic"],axis=1)
        for language_text in translate_text:
            temp_train=df_train_encode
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_less",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"less_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_more",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"more_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            df_train=pd.concat([df_train,temp_train])
        train_folds=df_train
        
    valid_folds = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    valid_folds = pd.DataFrame({'text': valid_folds}).reset_index()
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TestDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.MarginRankingLoss(margin=CFG.margin)
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        preds = inference_fn(valid_loader, model, device)
        
        # scoring
        valid_folds['pred'] = preds
        if 'less_toxic_pred' in validation.columns:
            validation = validation.drop(columns='less_toxic_pred')
        if 'more_toxic_pred' in validation.columns:
            validation = validation.drop(columns='more_toxic_pred')
        rename_cols = {CFG.text: 'less_toxic', 'pred': 'less_toxic_pred'}
        validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                      on='less_toxic', how='left')
        rename_cols = {CFG.text: 'more_toxic', 'pred': 'more_toxic_pred'}
        validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                      on='more_toxic', how='left')
        score = get_score(validation)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        wandb.log({ 
                   f"avg_train_loss": avg_loss, 
                   f"score": score})
        
        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    preds = torch.load(OUTPUT_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                       map_location=torch.device('cpu'))['preds']
    valid_folds['pred'] = preds
    if 'less_toxic_pred' in validation.columns:
        validation = validation.drop(columns='less_toxic_pred')
    if 'more_toxic_pred' in validation.columns:
        validation = validation.drop(columns='more_toxic_pred')
    rename_cols = {CFG.text: 'less_toxic', 'pred': 'less_toxic_pred'}
    validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                  on='less_toxic', how='left')
    rename_cols = {CFG.text: 'more_toxic', 'pred': 'more_toxic_pred'}
    validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                  on='more_toxic', how='left')

    torch.cuda.empty_cache()
    gc.collect()
    
    return validation

In [39]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        score = get_score(oof_df)
        LOGGER.info(f'Score: {score:<.4f}')
        return score
    
    if CFG.train:
        # train 
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            run = wandb.init(project='Jigsaw', 
                 config=class2dict(CFG),
                 group=f'{HASH_NAME}-Baseline',
                 job_type="train",
                 name=f'{HASH_NAME}-fold-{fold}',
                 anonymous=anony)
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            run.finish()
        oof_df = oof_df.reset_index(drop=True)
        # CV result
        LOGGER.info(f"========== CV ==========")
        cv=get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        
        run = wandb.init(project='Jigsaw', 
                 config=class2dict(CFG),
                 group=f'{HASH_NAME}-Baseline',
                 job_type="cv",
                 name=f'cv',
                 anonymous=anony)
        wandb.log({"cv":cv})
        run.finish()

Some weights of the model checkpoint at ../model/luke_base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/376] Elapsed 0m 0s (remain 4m 28s) Loss: 0.4017(0.4017) Grad: nan  LR: 0.00010000  
Epoch: [1][50/376] Elapsed 0m 9s (remain 1m 2s) Loss: 0.2858(0.3198) Grad: 3575.2378  LR: 0.00009887  
Epoch: [1][100/376] Elapsed 0m 18s (remain 0m 51s) Loss: 0.2925(0.3060) Grad: 5201.5801  LR: 0.00009561  
Epoch: [1][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2294(0.2982) Grad: 3029.9248  LR: 0.00009038  
Epoch: [1][200/376] Elapsed 0m 37s (remain 0m 32s) Loss: 0.2553(0.2938) Grad: 2592.1562  LR: 0.00008338  
Epoch: [1][250/376] Elapsed 0m 46s (remain 0m 23s) Loss: 0.2231(0.2919) Grad: 2283.8789  LR: 0.00007494  
Epoch: [1][300/376] Elapsed 0m 55s (remain 0m 13s) Loss: 0.2975(0.2906) Grad: 1551.5574  LR: 0.00006541  
Epoch: [1][350/376] Elapsed 1m 4s (remain 0m 4s) Loss: 0.2718(0.2882) Grad: 1982.5321  LR: 0.00005521  
Epoch: [1][375/376] Elapsed 1m 9s (remain 0m 0s) Loss: 0.3638(0.2871) Grad: 2944.8665  LR: 0.00005000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:01<00:00, 22.57it/s]
Epoch 1 - avg_train_loss: 0.2871  time: 72s
Epoch 1 - avg_train_loss: 0.2871  time: 72s
Epoch 1 - Score: 0.6966
Epoch 1 - Score: 0.6966
Epoch 1 - Save Best Score: 0.6966 Model
Epoch 1 - Save Best Score: 0.6966 Model


Epoch: [2][0/376] Elapsed 0m 0s (remain 3m 46s) Loss: 0.2260(0.2260) Grad: nan  LR: 0.00004979  
Epoch: [2][50/376] Elapsed 0m 9s (remain 1m 2s) Loss: 0.2426(0.2403) Grad: 2653.0679  LR: 0.00003943  
Epoch: [2][100/376] Elapsed 0m 19s (remain 0m 52s) Loss: 0.1856(0.2421) Grad: 4148.5327  LR: 0.00002952  
Epoch: [2][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2764(0.2405) Grad: 2570.0237  LR: 0.00002051  
Epoch: [2][200/376] Elapsed 0m 37s (remain 0m 32s) Loss: 0.2052(0.2420) Grad: 2704.3743  LR: 0.00001278  
Epoch: [2][250/376] Elapsed 0m 47s (remain 0m 23s) Loss: 0.2128(0.2403) Grad: 2177.5461  LR: 0.00000666  
Epoch: [2][300/376] Elapsed 0m 56s (remain 0m 14s) Loss: 0.2153(0.2412) Grad: 2013.1694  LR: 0.00000243  
Epoch: [2][350/376] Elapsed 1m 5s (remain 0m 4s) Loss: 0.2893(0.2404) Grad: 2994.8003  LR: 0.00000027  
Epoch: [2][375/376] Elapsed 1m 10s (remain 0m 0s) Loss: 0.3035(0.2417) Grad: 3720.2893  LR: 0.00000000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 22.40it/s]
Epoch 2 - avg_train_loss: 0.2417  time: 73s
Epoch 2 - avg_train_loss: 0.2417  time: 73s
Epoch 2 - Score: 0.6908
Epoch 2 - Score: 0.6908
Score: 0.6966
Score: 0.6966


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,█▆▇▄▆▂▃▄▅▃▄▇▄▄▄▆▅▃▄▅▂▂▄▁▅▂▅▂▄▁▅▁▄▂▃▃▁▃▄▅
lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,█▁

0,1
avg_train_loss,0.24167
loss,0.30347
lr,0.0
score,0.6908


Some weights of the model checkpoint at ../model/luke_base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/376] Elapsed 0m 0s (remain 4m 20s) Loss: 0.3893(0.3893) Grad: nan  LR: 0.00010000  
Epoch: [1][50/376] Elapsed 0m 10s (remain 1m 4s) Loss: 0.2588(0.3199) Grad: 7290.3433  LR: 0.00009887  
Epoch: [1][100/376] Elapsed 0m 19s (remain 0m 52s) Loss: 0.3236(0.3065) Grad: 6544.6816  LR: 0.00009561  
Epoch: [1][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2570(0.3005) Grad: 3650.2034  LR: 0.00009038  
Epoch: [1][200/376] Elapsed 0m 37s (remain 0m 32s) Loss: 0.2180(0.2959) Grad: 3628.7937  LR: 0.00008338  
Epoch: [1][250/376] Elapsed 0m 47s (remain 0m 23s) Loss: 0.2152(0.2939) Grad: 3381.8157  LR: 0.00007494  
Epoch: [1][300/376] Elapsed 0m 56s (remain 0m 14s) Loss: 0.3284(0.2916) Grad: 2034.8888  LR: 0.00006541  
Epoch: [1][350/376] Elapsed 1m 5s (remain 0m 4s) Loss: 0.2845(0.2895) Grad: 2047.3544  LR: 0.00005521  
Epoch: [1][375/376] Elapsed 1m 10s (remain 0m 0s) Loss: 0.2262(0.2880) Grad: 2594.0454  LR: 0.00005000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:01<00:00, 22.58it/s]
Epoch 1 - avg_train_loss: 0.2880  time: 72s
Epoch 1 - avg_train_loss: 0.2880  time: 72s
Epoch 1 - Score: 0.6999
Epoch 1 - Score: 0.6999
Epoch 1 - Save Best Score: 0.6999 Model
Epoch 1 - Save Best Score: 0.6999 Model


Epoch: [2][0/376] Elapsed 0m 0s (remain 4m 2s) Loss: 0.1998(0.1998) Grad: nan  LR: 0.00004979  
Epoch: [2][50/376] Elapsed 0m 9s (remain 1m 3s) Loss: 0.2531(0.2411) Grad: 2855.5596  LR: 0.00003943  
Epoch: [2][100/376] Elapsed 0m 19s (remain 0m 52s) Loss: 0.3112(0.2405) Grad: 2721.0906  LR: 0.00002952  
Epoch: [2][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2534(0.2434) Grad: 2764.3293  LR: 0.00002051  
Epoch: [2][200/376] Elapsed 0m 38s (remain 0m 33s) Loss: 0.2810(0.2442) Grad: 2816.4067  LR: 0.00001278  
Epoch: [2][250/376] Elapsed 0m 47s (remain 0m 23s) Loss: 0.2403(0.2412) Grad: 2838.9741  LR: 0.00000666  
Epoch: [2][300/376] Elapsed 0m 57s (remain 0m 14s) Loss: 0.2671(0.2419) Grad: 2594.4448  LR: 0.00000243  
Epoch: [2][350/376] Elapsed 1m 6s (remain 0m 4s) Loss: 0.2520(0.2404) Grad: 2751.5803  LR: 0.00000027  
Epoch: [2][375/376] Elapsed 1m 11s (remain 0m 0s) Loss: 0.1987(0.2402) Grad: 2647.4048  LR: 0.00000000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 22.47it/s]
Epoch 2 - avg_train_loss: 0.2402  time: 73s
Epoch 2 - avg_train_loss: 0.2402  time: 73s
Epoch 2 - Score: 0.7031
Epoch 2 - Score: 0.7031
Epoch 2 - Save Best Score: 0.7031 Model
Epoch 2 - Save Best Score: 0.7031 Model
Score: 0.7031
Score: 0.7031


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,█▇▃▅▄▅▅█▅▆▅▄▅▃▄▄▄▃▃▄▃▄▃▃▁▃▃▅▂▆▄▂▂▃▄▆▃▃▃▂
lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,▁█

0,1
avg_train_loss,0.24018
loss,0.19875
lr,0.0
score,0.70309


Some weights of the model checkpoint at ../model/luke_base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/376] Elapsed 0m 0s (remain 4m 36s) Loss: 0.3940(0.3940) Grad: nan  LR: 0.00010000  
Epoch: [1][50/376] Elapsed 0m 9s (remain 1m 2s) Loss: 0.3120(0.3201) Grad: 6444.5977  LR: 0.00009887  
Epoch: [1][100/376] Elapsed 0m 19s (remain 0m 51s) Loss: 0.2622(0.3066) Grad: 3696.4834  LR: 0.00009561  
Epoch: [1][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2291(0.2997) Grad: 3030.4839  LR: 0.00009038  
Epoch: [1][200/376] Elapsed 0m 37s (remain 0m 32s) Loss: 0.2804(0.2944) Grad: 2386.8596  LR: 0.00008338  
Epoch: [1][250/376] Elapsed 0m 46s (remain 0m 23s) Loss: 0.2667(0.2912) Grad: 2462.8562  LR: 0.00007494  
Epoch: [1][300/376] Elapsed 0m 56s (remain 0m 14s) Loss: 0.2982(0.2899) Grad: 1757.0070  LR: 0.00006541  
Epoch: [1][350/376] Elapsed 1m 5s (remain 0m 4s) Loss: 0.2441(0.2868) Grad: 3010.6812  LR: 0.00005521  
Epoch: [1][375/376] Elapsed 1m 10s (remain 0m 0s) Loss: 0.3096(0.2859) Grad: 2535.6262  LR: 0.00005000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:01<00:00, 22.54it/s]
Epoch 1 - avg_train_loss: 0.2859  time: 72s
Epoch 1 - avg_train_loss: 0.2859  time: 72s
Epoch 1 - Score: 0.6915
Epoch 1 - Score: 0.6915
Epoch 1 - Save Best Score: 0.6915 Model
Epoch 1 - Save Best Score: 0.6915 Model


Epoch: [2][0/376] Elapsed 0m 0s (remain 4m 2s) Loss: 0.2791(0.2791) Grad: nan  LR: 0.00004979  
Epoch: [2][50/376] Elapsed 0m 9s (remain 1m 3s) Loss: 0.3003(0.2473) Grad: 2455.4458  LR: 0.00003943  
Epoch: [2][100/376] Elapsed 0m 19s (remain 0m 51s) Loss: 0.2010(0.2426) Grad: 2559.4663  LR: 0.00002952  
Epoch: [2][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2124(0.2384) Grad: 2494.4919  LR: 0.00002051  
Epoch: [2][200/376] Elapsed 0m 37s (remain 0m 32s) Loss: 0.2398(0.2388) Grad: 3317.5645  LR: 0.00001278  
Epoch: [2][250/376] Elapsed 0m 46s (remain 0m 23s) Loss: 0.2469(0.2392) Grad: 3088.7705  LR: 0.00000666  
Epoch: [2][300/376] Elapsed 0m 56s (remain 0m 13s) Loss: 0.2524(0.2391) Grad: 2941.7729  LR: 0.00000243  
Epoch: [2][350/376] Elapsed 1m 5s (remain 0m 4s) Loss: 0.1771(0.2389) Grad: 2963.4490  LR: 0.00000027  
Epoch: [2][375/376] Elapsed 1m 9s (remain 0m 0s) Loss: 0.1898(0.2380) Grad: 3939.5308  LR: 0.00000000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 22.01it/s]
Epoch 2 - avg_train_loss: 0.2380  time: 72s
Epoch 2 - avg_train_loss: 0.2380  time: 72s
Epoch 2 - Score: 0.6998
Epoch 2 - Score: 0.6998
Epoch 2 - Save Best Score: 0.6998 Model
Epoch 2 - Save Best Score: 0.6998 Model
Score: 0.6998
Score: 0.6998


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,██▆▄▅▅█▅▇▃▆▇▄▇▆▆▅▅▃▅▅▅▄▄▅▃▃▄▃▄▅▃▃▅▂▆▂▃▁▂
lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,▁█

0,1
avg_train_loss,0.23797
loss,0.18979
lr,0.0
score,0.69977


Some weights of the model checkpoint at ../model/luke_base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/376] Elapsed 0m 0s (remain 4m 27s) Loss: 0.4037(0.4037) Grad: nan  LR: 0.00010000  
Epoch: [1][50/376] Elapsed 0m 10s (remain 1m 4s) Loss: 0.2836(0.3207) Grad: 7318.3252  LR: 0.00009887  
Epoch: [1][100/376] Elapsed 0m 19s (remain 0m 52s) Loss: 0.3243(0.3077) Grad: 4362.7021  LR: 0.00009561  
Epoch: [1][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2808(0.2979) Grad: 4159.4878  LR: 0.00009038  
Epoch: [1][200/376] Elapsed 0m 38s (remain 0m 33s) Loss: 0.2506(0.2925) Grad: 6506.4873  LR: 0.00008338  
Epoch: [1][250/376] Elapsed 0m 47s (remain 0m 23s) Loss: 0.2786(0.2904) Grad: 2700.7090  LR: 0.00007494  
Epoch: [1][300/376] Elapsed 0m 57s (remain 0m 14s) Loss: 0.2607(0.2898) Grad: 3226.7295  LR: 0.00006541  
Epoch: [1][350/376] Elapsed 1m 6s (remain 0m 4s) Loss: 0.2772(0.2875) Grad: 3248.1182  LR: 0.00005521  
Epoch: [1][375/376] Elapsed 1m 11s (remain 0m 0s) Loss: 0.2798(0.2860) Grad: 3016.5940  LR: 0.00005000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 22.21it/s]
Epoch 1 - avg_train_loss: 0.2860  time: 73s
Epoch 1 - avg_train_loss: 0.2860  time: 73s
Epoch 1 - Score: 0.6937
Epoch 1 - Score: 0.6937
Epoch 1 - Save Best Score: 0.6937 Model
Epoch 1 - Save Best Score: 0.6937 Model


Epoch: [2][0/376] Elapsed 0m 0s (remain 3m 55s) Loss: 0.2623(0.2623) Grad: nan  LR: 0.00004979  
Epoch: [2][50/376] Elapsed 0m 9s (remain 1m 3s) Loss: 0.2962(0.2386) Grad: 6242.4644  LR: 0.00003943  
Epoch: [2][100/376] Elapsed 0m 19s (remain 0m 52s) Loss: 0.2339(0.2366) Grad: 6207.7456  LR: 0.00002952  
Epoch: [2][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2252(0.2393) Grad: 2830.8330  LR: 0.00002051  
Epoch: [2][200/376] Elapsed 0m 37s (remain 0m 32s) Loss: 0.2437(0.2401) Grad: 2770.2488  LR: 0.00001278  
Epoch: [2][250/376] Elapsed 0m 46s (remain 0m 23s) Loss: 0.1704(0.2398) Grad: 3081.7512  LR: 0.00000666  
Epoch: [2][300/376] Elapsed 0m 55s (remain 0m 13s) Loss: 0.2160(0.2402) Grad: 2650.2229  LR: 0.00000243  
Epoch: [2][350/376] Elapsed 1m 5s (remain 0m 4s) Loss: 0.3089(0.2399) Grad: 3032.3550  LR: 0.00000027  
Epoch: [2][375/376] Elapsed 1m 9s (remain 0m 0s) Loss: 0.2284(0.2384) Grad: 3269.5989  LR: 0.00000000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 22.21it/s]
Epoch 2 - avg_train_loss: 0.2384  time: 72s
Epoch 2 - avg_train_loss: 0.2384  time: 72s
Epoch 2 - Score: 0.6967
Epoch 2 - Score: 0.6967
Epoch 2 - Save Best Score: 0.6967 Model
Epoch 2 - Save Best Score: 0.6967 Model
Score: 0.6967
Score: 0.6967


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,█▇▂▅▄▅▇▆▅▅▄▆▇▄▄▄▃█▇▄▁▂▂▃▄▂▁▃▁▃▃▄▄▁▃▄▄▁▄▂
lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,▁█

0,1
avg_train_loss,0.23839
loss,0.22839
lr,0.0
score,0.69673


Some weights of the model checkpoint at ../model/luke_base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/376] Elapsed 0m 0s (remain 4m 34s) Loss: 0.4136(0.4136) Grad: nan  LR: 0.00010000  
Epoch: [1][50/376] Elapsed 0m 9s (remain 1m 3s) Loss: 0.2993(0.3122) Grad: 4205.0649  LR: 0.00009887  
Epoch: [1][100/376] Elapsed 0m 19s (remain 0m 52s) Loss: 0.3085(0.2946) Grad: 4208.1694  LR: 0.00009561  
Epoch: [1][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2278(0.2932) Grad: 2809.5981  LR: 0.00009038  
Epoch: [1][200/376] Elapsed 0m 37s (remain 0m 32s) Loss: 0.2872(0.2906) Grad: 3208.6365  LR: 0.00008338  
Epoch: [1][250/376] Elapsed 0m 46s (remain 0m 23s) Loss: 0.3583(0.2873) Grad: 4584.6260  LR: 0.00007494  
Epoch: [1][300/376] Elapsed 0m 56s (remain 0m 13s) Loss: 0.2356(0.2860) Grad: 2220.9226  LR: 0.00006541  
Epoch: [1][350/376] Elapsed 1m 5s (remain 0m 4s) Loss: 0.2601(0.2844) Grad: 2722.6099  LR: 0.00005521  
Epoch: [1][375/376] Elapsed 1m 10s (remain 0m 0s) Loss: 0.3066(0.2841) Grad: 3104.5920  LR: 0.00005000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 22.29it/s]
Epoch 1 - avg_train_loss: 0.2841  time: 72s
Epoch 1 - avg_train_loss: 0.2841  time: 72s
Epoch 1 - Score: 0.6972
Epoch 1 - Score: 0.6972
Epoch 1 - Save Best Score: 0.6972 Model
Epoch 1 - Save Best Score: 0.6972 Model


Epoch: [2][0/376] Elapsed 0m 0s (remain 3m 44s) Loss: 0.2698(0.2698) Grad: nan  LR: 0.00004979  
Epoch: [2][50/376] Elapsed 0m 9s (remain 1m 3s) Loss: 0.1917(0.2319) Grad: 2602.6995  LR: 0.00003943  
Epoch: [2][100/376] Elapsed 0m 19s (remain 0m 52s) Loss: 0.2743(0.2371) Grad: 3608.9495  LR: 0.00002952  
Epoch: [2][150/376] Elapsed 0m 28s (remain 0m 42s) Loss: 0.2132(0.2347) Grad: 2675.7192  LR: 0.00002051  
Epoch: [2][200/376] Elapsed 0m 38s (remain 0m 33s) Loss: 0.2222(0.2360) Grad: 2559.3853  LR: 0.00001278  
Epoch: [2][250/376] Elapsed 0m 47s (remain 0m 23s) Loss: 0.1652(0.2367) Grad: 2562.3076  LR: 0.00000666  
Epoch: [2][300/376] Elapsed 0m 56s (remain 0m 14s) Loss: 0.2356(0.2370) Grad: 2345.9412  LR: 0.00000243  
Epoch: [2][350/376] Elapsed 1m 6s (remain 0m 4s) Loss: 0.2771(0.2361) Grad: 2920.6804  LR: 0.00000027  
Epoch: [2][375/376] Elapsed 1m 11s (remain 0m 0s) Loss: 0.2302(0.2353) Grad: 2194.6548  LR: 0.00000000  


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:02<00:00, 22.32it/s]
Epoch 2 - avg_train_loss: 0.2353  time: 73s
Epoch 2 - avg_train_loss: 0.2353  time: 73s
Epoch 2 - Score: 0.6886
Epoch 2 - Score: 0.6886
Score: 0.6972
Score: 0.6972


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,█▇▆▃▄▆▇▄▃▂▅▅▄▆▅▄▃█▆▇▃▄▁▆▅▃▅▃▁▇▄▇▇▆▄▆▁▆▂▃
lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,█▁

0,1
avg_train_loss,0.23529
loss,0.2302
lr,0.0
score,0.68859


Score: 0.6987
Score: 0.6987


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.0, max=1.0)…

0,1
cv,▁

0,1
cv,0.69868


In [40]:
ipypath="./output/jigsaw_server_luke/luke_folder/.ipynb_checkpoints"
if os.path.exists(ipypath):
    os.removedirs(ipypath)
ipypath="./output/jigsaw_server_luke/.ipynb_checkpoints"
if os.path.exists(ipypath):
    os.removedirs(ipypath)