# About this notebook
- [Luke](https://arxiv.org/pdf/2010.01057v1.pdf)-base starter notebook
- [Inference notebook](https://www.kaggle.com/yasufuminakama/jigsaw4-luke-base-starter-sub)
- Approach References
    - https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/286471
    - https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter
    - https://www.kaggle.com/debarshichanda/0-816-jigsaw-inference
    - Thanks for sharing @debarshichanda

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    competition='Jigsaw4'
    _wandb_kernel='nakama'
    apex=True
    print_freq=50
    num_workers=4
    model="studio-ousia/luke-base"
    model_name="studio-ousia/luke-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    encoder_lr=1e-4
    decoder_lr=1e-4
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=64
    fc_dropout=0.
    text="text"
    target="target"
    target_size=1
    head=32
    tail=32
    max_len=head+tail
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    margin=0.4
    seed=42
    
    rank_diff=3000
    section_num=6
    sample_times=2
    max_sfold_diff=4
    min_sfold_diff=1
    
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    debug=False
    epochs=2

    translate_aug=False
    translate_path="../input/translatetoxic/comment_translation.csv"

In [3]:
HASH_NAME="luke persedo_com  rank_diff=2000 s=2"
if CFG.debug==True:
    HASH_NAME="lukemodel debug "

# translate_text=["text_fr","text_de","text_es"]
translate_text=["text_fr","text_de"]

In [4]:
# ====================================================
# wandb
# ====================================================
import wandb

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("wandb_api")
    if CFG.debug==False:
        wandb.login(key=secret_value_0)
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

    
def class2dict(f):
    return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))



If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


# Library

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -q transformers -y')
os.system('pip uninstall -q tokenizers -y')
os.system('pip uninstall -q huggingface_hub -y')

os.system('mkdir -p /tmp/pip/cache-tokenizers/')
os.system('cp ../input/tokenizers-0103/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl /tmp/pip/cache-tokenizers/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-tokenizers/ tokenizers')

os.system('mkdir -p /tmp/pip/cache-huggingface-hub/')
os.system('cp ../input/huggingface-hub-008/huggingface_hub-0.0.8-py3-none-any.whl /tmp/pip/cache-huggingface-hub/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-huggingface-hub/ huggingface_hub')

os.system('mkdir -p /tmp/pip/cache-transformers/')
os.system('cp ../input/transformers-470/transformers-4.7.0-py3-none-any.whl /tmp/pip/cache-transformers/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-transformers/ transformers')

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import LukeTokenizer, LukeModel, LukeConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.7.0 requires transformers<4.10,>=4.1, which is not installed.
datasets 1.14.0 requires huggingface-hub<0.1.0,>=0.0.19, but you have huggingface-hub 0.0.8 which is incompatible.


tokenizers.__version__: 0.10.3
transformers.__version__: 4.7.0


# Utils

In [6]:
# ====================================================
# Utils
# ====================================================
def get_score(df):
    score = len(df[df['less_toxic_pred'] < df['more_toxic_pred']]) / len(df)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [7]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

from sklearn.preprocessing import LabelEncoder
def generate_comments(data):
    more_toxic_text=data["more_toxic"].values
    less_toxic_text=data["less_toxic"].values    
    comments=np.concatenate((more_toxic_text,less_toxic_text))
    comments=np.unique(comments)
    comments=pd.DataFrame({"text":comments})
    text_encoder=LabelEncoder()
    text_encoder.fit(comments)
    comments["encode_text"]=text_encoder.transform(comments["text"])
    comments["toxic_value"]=0
    comments["access_time"]=0
    data["encode_less"]=text_encoder.transform(data["less_toxic"])
    data["encode_more"]=text_encoder.transform(data["more_toxic"])
    
    return data,comments
train,comments=generate_comments(train)
if CFG.debug:
    train = train.sample(n=100, random_state=CFG.seed).reset_index(drop=True)
test = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
print(train.shape)
print(test.shape, submission.shape)
display(train.head())
display(test.head())
display(submission.head())

(30108, 5)
(7537, 2) (7537, 2)


Unnamed: 0,worker,less_toxic,more_toxic,encode_less,encode_more
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,2405,12151
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,7215,653
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",2632,7222
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,7973,12968
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",3524,3266


Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


Unnamed: 0,comment_id,score
0,114890,0.5
1,732895,0.5
2,1139051,0.5
3,1434512,0.5
4,2084821,0.5


# CV split

In [8]:
from sklearn.model_selection import GroupKFold
class UnionFind():
    def __init__(self, n):
        self.n = n
        self.parents = [-1] * n

    def find(self, x):
        if self.parents[x] < 0:
            return x
        else:
            self.parents[x] = self.find(self.parents[x])
            return self.parents[x]

    def union(self, x, y):
        x = self.find(x)
        y = self.find(y)
        if x == y:
            return
        if self.parents[x] > self.parents[y]:
            x, y = y, x
        self.parents[x] += self.parents[y]
        self.parents[y] = x


def get_group_unionfind(train: pd.DataFrame):
    less_unique_text = train['less_toxic'].unique()
    more_unique_text = train['more_toxic'].unique()
    unique_text = np.hstack([less_unique_text, more_unique_text])
    unique_text = np.unique(unique_text).tolist()    
    text2num = {text: i for i, text in enumerate(unique_text)}
    num2text = {num: text for text, num in text2num.items()}
    train['num_less_toxic'] = train['less_toxic'].map(text2num)
    train['num_more_toxic'] = train['more_toxic'].map(text2num)

    uf = UnionFind(len(unique_text))
    for seq1, seq2 in train[['num_less_toxic', 'num_more_toxic']].to_numpy():
        uf.union(seq1, seq2)

    text2group = {num2text[i]: uf.find(i) for i in range(len(unique_text))}
    train['group'] = train['less_toxic'].map(text2group)
    train = train.drop(columns=['num_less_toxic', 'num_more_toxic'])
    return train

In [9]:
# ====================================================
# CV split
# ====================================================
# Fold = GroupKFold(n_splits=CFG.n_fold)
# for n, (trn_index, val_index) in enumerate(Fold.split(train, train, train['worker'])):
#     train.loc[val_index, 'fold'] = int(n)
# train['fold'] = train['fold'].astype(int)
# display(train.groupby('fold').size())

In [10]:
data_df=train.copy()
data_df = get_group_unionfind(data_df)
group_kfold = GroupKFold(n_splits=CFG.n_fold)
for fold, (trn_idx, val_idx) in enumerate(group_kfold.split(data_df, data_df, data_df['group'])): 
    data_df.loc[val_idx , "fold"] = fold

data_df["fold"] = data_df["fold"].astype(int)
train=data_df.copy()
display(train.groupby('fold').size())

fold
0    6022
1    6022
2    6022
3    6021
4    6021
dtype: int64

In [11]:
# ====================================================
# from model work
# ====================================================

luke_val=pd.read_csv("../input/luke-val-data/luke_val_0.860.csv")
luke_val=luke_val.rename(columns={"less_toxic_pred":"less_value","more_toxic_pred":"more_value"})
def generate_comments_luke(data):
    more_toxic_text=data[["more_toxic","encode_more","more_value"]]
    less_toxic_text=data[["less_toxic","encode_less","less_value"]]
    more_toxic_text=more_toxic_text.rename(columns={"more_toxic":"text","encode_more":"encode_text","more_value":"toxic_value"})
    less_toxic_text=less_toxic_text.rename(columns={"less_toxic":"text","encode_less":"encode_text","less_value":"toxic_value"})
    
    comments=pd.concat([more_toxic_text,less_toxic_text],axis=0)
    comments=comments.drop_duplicates(subset=None,keep="first",inplace=False) 
    return comments
comments_luke=generate_comments_luke(luke_val)
comments_luke["score"]=comments_luke["toxic_value"].rank(method="first").astype(int)

In [12]:
# ====================================================
# produce more data
# ====================================================
# bradley terry work
# comments_rank=pd.read_csv("../input/commentbt/comments.csv")

comments_rank=comments_luke
comments_rank=comments_rank[["text","score"]]

comments_rank=comments_rank.merge(comments[["text","encode_text"]],on="text",how="left")
less_fold=comments_rank.merge(train[["encode_less","fold"]],left_on="encode_text",right_on="encode_less",
                              how="left").drop_duplicates("text")["fold"]
more_fold=comments_rank.merge(train[["encode_more","fold"]],left_on="encode_text",right_on="encode_more",
                              how="left").drop_duplicates("text")["fold"]

less_fold=less_fold.rename("less_fold").reset_index(drop=True)
more_fold=more_fold.rename("more_fold").reset_index(drop=True)
fold_df=pd.concat([less_fold,more_fold],axis=1)
comments_rank["fold"]=fold_df.apply(lambda row:row["more_fold"] if pd.isna(row["less_fold"]) else row["less_fold"],axis=1)
comments_rank["fold"]=comments_rank.fold.astype(int)


In [13]:
from itertools import combinations

# def typicalsamling(group, typicalNDict):
#     name = group.name
#     n = typicalNDict[name]
#     return group.sample(n=n,replace=True,random_state=42)
def typicalsamling(group, sample_times):
    return group.sample(n=sample_times,replace=True,random_state=42)

def make_pair_data(data):
    train=pd.DataFrame(columns=['less_toxic',"encode_less", 'rank1','more_toxic',"encode_more", 'rank2'])
    
    for fold in range(CFG.n_fold):
        df=data[data["fold"]==fold]
        section_num=CFG.section_num
        
        df["sfold"]=pd.qcut(df["score"],[i *1/section_num for i in range(section_num+1)],labels=[i for i in range(section_num)])
        ## sort by score ensure toxic value of sentence1 <sentence2
        df=df.sort_values("score").reset_index(drop=True)
        df['sfold'] = df['sfold'].astype(int)

        id1 = []
        id2 = []
        for f in combinations(range(df.shape[0]),2):
            
            id1.append(f[0])
            id2.append(f[1])
        
        indices_df = pd.DataFrame(id1, columns=['id1'])
        indices_df['id2'] = id2

        # shuffle
        indices_df = indices_df.sample(frac=1).reset_index(drop=True)
    
        # map to texts and scores
        x1 = df.iloc[indices_df.id1][['text', "encode_text",'score',"sfold"]].rename(
            columns={"encode_text":"encode_less","text": "less_toxic", "score": "less_rank","sfold":"sfold1"}).reset_index(drop=True)
        x2 = df.iloc[indices_df.id2][['text', "encode_text",'score',"sfold"]].rename(
            columns={"encode_text":"encode_more","text": "more_toxic", "score": "more_rank","sfold":"sfold2"}).reset_index(drop=True)
        
        # combine
        x3 = pd.concat([x1,x2], axis = 1, ignore_index = True)
        x3.columns = ['less_toxic',"encode_less", 'rank1',"sfold1",'more_toxic',"encode_more", 'rank2',"sfold2"]
        
        choose_list=((x3["sfold2"]-x3["sfold1"])<=CFG.max_sfold_diff) & ((x3["sfold2"]-x3["sfold1"])>=CFG.min_sfold_diff)

        x3=x3[choose_list]
        comment_id=df["encode_text"]
        x4=pd.DataFrame(columns=x3.columns)


        result = x3.groupby(['encode_less',"sfold2"],as_index=False).apply(typicalsamling, CFG.sample_times)
        result=result[['less_toxic',"encode_less", 'rank1','more_toxic',"encode_more", 'rank2']]
        result["fold"]=fold
        result=result.reset_index(drop=True)
        
        train=pd.concat([train,result],axis=0)
        del x1,x2,x3,x4
    train=train.sample(frac=1).reset_index(drop=True)
    train['fold'] = train['fold'].astype(int)
    display(train.groupby('fold').size())
    return train

In [14]:
persedo_train=make_pair_data(comments_rank)
persedo_train["worker"]=99999
persedo_train["group"]=99999
persedo_train=persedo_train.drop(["rank1","rank2"],axis=1)

#combine together

train=train.append(persedo_train)
train=train.reset_index(drop=True)

# train.to_csv("./output/train_fold.csv")

fold
0    13308
1    13300
2    13292
3    13300
4    13308
dtype: int64

In [15]:
def check_data_leak(train,fold1,fold2):
    data_fold1=train[train["fold"]==fold1]
    data_fold2=train[train["fold"]==fold2]
    
    more_toxic_text1=data_fold1["more_toxic"].values
    less_toxic_text1=data_fold1["less_toxic"].values    
    comments1=np.concatenate((more_toxic_text1,less_toxic_text1))
    comments1=np.unique(comments1)
    comments1=pd.DataFrame({"text":comments1})
    
    more_toxic_text2=data_fold2["more_toxic"].values
    less_toxic_text2=data_fold2["less_toxic"].values   
    comments2=np.concatenate((more_toxic_text2,less_toxic_text2))
    comments2=np.unique(comments2)
    comments2=pd.DataFrame({"text":comments2})
    comments2["overlap"]=0
    
    comments1=comments1.merge(comments2,on="text",how="left")
    print((comments1.overlap.isna()==False).sum())
check_data_leak(train,0,1)

0


# tokenizer

In [16]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = LukeTokenizer.from_pretrained(CFG.model, lowercase=True)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

# Dataset

In [17]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(text, cfg):
    if cfg.tail == 0:
        inputs = cfg.tokenizer.encode_plus(text, 
                                           return_tensors=None, 
                                           add_special_tokens=True, 
                                           max_length=cfg.max_len,
                                           pad_to_max_length=True,
                                           truncation=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
    else:
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None, 
                                           add_special_tokens=True, 
                                           truncation=True)
        for k, v in inputs.items():
            v_length = len(v)
            if v_length > cfg.max_len:
                v = np.hstack([v[:cfg.head], v[-cfg.tail:]])
            if k == 'input_ids':
                new_v = np.ones(cfg.max_len) * cfg.tokenizer.pad_token_id
            else:
                new_v = np.zeros(cfg.max_len)
            new_v[:v_length] = v 
            inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.less_toxic = df['less_toxic'].fillna("none").values
        self.more_toxic = df['more_toxic'].fillna("none").values

    def __len__(self):
        return len(self.less_toxic)

    def __getitem__(self, item):
        less_toxic_inputs = prepare_input(str(self.less_toxic[item]), self.cfg)
        more_toxic_inputs = prepare_input(str(self.more_toxic[item]), self.cfg)
        label = torch.tensor(1, dtype=torch.float)
        return less_toxic_inputs, more_toxic_inputs, label


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df[cfg.text].fillna("none").values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        inputs = prepare_input(text, self.cfg)
        return inputs

# Model

In [18]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        self.config.update({"hidden_dropout_prob": 0.0,"attention_probs_dropout_prob":0.0})  
        if pretrained:
            self.model = LukeModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = LukeModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = torch.mean(last_hidden_states, 1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output


# class AttentionBlock(nn.Module):
#     def __init__(self, in_features, middle_features, out_features):
#         super().__init__()
#         self.in_features = in_features
#         self.middle_features = middle_features
#         self.out_features = out_features
#         self.W = nn.Linear(in_features, middle_features)
#         self.V = nn.Linear(middle_features, out_features)

#     def forward(self, features):
#         att = torch.tanh(self.W(features))
#         score = self.V(att)
#         attention_weights = torch.softmax(score, dim=1)
#         context_vector = attention_weights * features
#         context_vector = torch.sum(context_vector, dim=1)
#         return context_vector

# class CustomModel(nn.Module):
#     def __init__(self, cfg, config_path=None, pretrained=False):
#         super().__init__()
#         self.cfg = cfg
#         if config_path is None:
#             self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
#         else:
#             self.config = torch.load(config_path)
#         self.config.update({"hidden_dropout_prob": 0.0,"attention_probs_dropout_prob":0.0})  
#         if pretrained:
#             self.model = LukeModel.from_pretrained(cfg.model, config=self.config)
#         else:
#             self.model = LukeModel(self.config)
            
#         self.seq_attn_head = nn.Sequential(
#             nn.LayerNorm(self.config.hidden_size),
#             # nn.Dropout(0.1),
#             AttentionBlock(self.config.hidden_size, self.config.hidden_size, 1),
#             # nn.Linear(self.config.hidden_size, 2 if kl_loss else 1),
#         )
#         self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        
#     def feature(self, inputs):
#         outputs = self.model(**inputs)
#         last_hidden_states = outputs[0]
        
#         feature = self.seq_attn_head(last_hidden_states)
#         return feature

#     def forward(self, inputs):
#         feature = self.feature(inputs)
#         output = self.fc(feature)
#         return output
    
    


In [19]:
config = LukeConfig.from_pretrained(CFG.model, output_hidden_states=True)

Downloading:   0%|          | 0.00/761 [00:00<?, ?B/s]

# Helpler functions

In [20]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (less_toxic_inputs, more_toxic_inputs, labels) in enumerate(train_loader):
        for k, v in less_toxic_inputs.items():
            less_toxic_inputs[k] = v.to(device)
        for k, v in more_toxic_inputs.items():
            more_toxic_inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            less_toxic_y_preds = model(less_toxic_inputs)
            more_toxic_y_preds = model(more_toxic_inputs)
            loss = criterion(more_toxic_y_preds, less_toxic_y_preds, labels)
        losses.update(loss.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        wandb.log({f"loss": losses.val,
                   f"lr": scheduler.get_lr()[0]})
    return losses.avg


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [21]:
# ====================================================
# train loop
# ====================================================

def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index
    
    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    validation = folds.loc[val_idx].reset_index(drop=True)
    if CFG.translate_aug==True:
        
        df_train=train_folds.copy()
        comment_translation=pd.read_csv(CFG.translate_path)
        comment_translation=comment_translation.merge(comments,on="text",how="left")
        df_train_encode=df_train.drop(["less_toxic","more_toxic"],axis=1)
        for language_text in translate_text:
            temp_train=df_train_encode
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_less",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"less_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_more",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"more_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            df_train=pd.concat([df_train,temp_train])
        train_folds=df_train
        
    valid_folds = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    valid_folds = pd.DataFrame({'text': valid_folds}).reset_index()
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TestDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.MarginRankingLoss(margin=CFG.margin)
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        preds = inference_fn(valid_loader, model, device)
        
        # scoring
        valid_folds['pred'] = preds
        if 'less_toxic_pred' in validation.columns:
            validation = validation.drop(columns='less_toxic_pred')
        if 'more_toxic_pred' in validation.columns:
            validation = validation.drop(columns='more_toxic_pred')
        rename_cols = {CFG.text: 'less_toxic', 'pred': 'less_toxic_pred'}
        validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                      on='less_toxic', how='left')
        rename_cols = {CFG.text: 'more_toxic', 'pred': 'more_toxic_pred'}
        validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                      on='more_toxic', how='left')
        score = get_score(validation)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        wandb.log({ 
                   f"avg_train_loss": avg_loss, 
                   f"score": score})
        
        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    preds = torch.load(OUTPUT_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                       map_location=torch.device('cpu'))['preds']
    valid_folds['pred'] = preds
    if 'less_toxic_pred' in validation.columns:
        validation = validation.drop(columns='less_toxic_pred')
    if 'more_toxic_pred' in validation.columns:
        validation = validation.drop(columns='more_toxic_pred')
    rename_cols = {CFG.text: 'less_toxic', 'pred': 'less_toxic_pred'}
    validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                  on='less_toxic', how='left')
    rename_cols = {CFG.text: 'more_toxic', 'pred': 'more_toxic_pred'}
    validation = validation.merge(valid_folds[[CFG.text, 'pred']].rename(columns=rename_cols), 
                                  on='more_toxic', how='left')

    torch.cuda.empty_cache()
    gc.collect()
    
    return validation

In [22]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        score = get_score(oof_df)
        LOGGER.info(f'Score: {score:<.4f}')
        return score
    
    if CFG.train:
        # train 
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            run = wandb.init(project='Jigsaw', 
                 config=class2dict(CFG),
                 group=f'{HASH_NAME}-Baseline',
                 job_type="train",
                 name=f'{HASH_NAME}-fold-{fold}',
                 anonymous=anony)
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            run.finish()
        run = wandb.init(project='Jigsaw', 
                 config=class2dict(CFG),
                 group=f'{HASH_NAME}-Baseline',
                 job_type="cv",
                 name=f'cv',
                 anonymous=anony)
        
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== all CV ==========")
        cv=get_result(oof_df)
        wandb.log({"all cv":cv})
        
        oof_df=oof_df.query("worker!=99999")
        # CV result
        LOGGER.info(f"========== origin CV ==========")
        cv=get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR+'tokenizer/oof_df.csv', index=False)
        
        
        wandb.log({"origin cv":cv})
        run.finish()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Downloading:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

Some weights of the model checkpoint at studio-ousia/luke-base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1207] Elapsed 0m 2s (remain 58m 30s) Loss: 0.3647(0.3647) Grad: nan  LR: 0.00010000  
Epoch: [1][50/1207] Elapsed 0m 41s (remain 15m 36s) Loss: 0.1729(0.2396) Grad: 7431.4771  LR: 0.00009989  
Epoch: [1][100/1207] Elapsed 1m 19s (remain 14m 32s) Loss: 0.2376(0.2306) Grad: 9056.2451  LR: 0.00009957  
Epoch: [1][150/1207] Elapsed 1m 58s (remain 13m 45s) Loss: 0.2052(0.2186) Grad: 11042.9189  LR: 0.00009904  
Epoch: [1][200/1207] Elapsed 2m 36s (remain 13m 3s) Loss: 0.2273(0.2184) Grad: 4631.6997  LR: 0.00009830  
Epoch: [1][250/1207] Elapsed 3m 14s (remain 12m 22s) Loss: 0.1519(0.2157) Grad: 3698.9810  LR: 0.00009736  
Epoch: [1][300/1207] Elapsed 3m 53s (remain 11m 42s) Loss: 0.1568(0.2125) Grad: 4593.9961  LR: 0.00009622  
Epoch: [1][350/1207] Elapsed 4m 31s (remain 11m 2s) Loss: 0.1804(0.2099) Grad: 2544.5271  LR: 0.00009488  
Epoch: [1][400/1207] Elapsed 5m 10s (remain 10m 23s) Loss: 0.1915(0.2067) Grad: 4566.3452  LR: 0.00009335  
Epoch: [1][450/1207] Elapsed 5m 48s (re

100%|██████████| 45/45 [00:06<00:00,  7.13it/s]
Epoch 1 - avg_train_loss: 0.1796  time: 935s
Epoch 1 - Score: 0.8314
Epoch 1 - Save Best Score: 0.8314 Model


Epoch: [2][0/1207] Elapsed 0m 1s (remain 34m 51s) Loss: 0.1029(0.1029) Grad: nan  LR: 0.00004997  
Epoch: [2][50/1207] Elapsed 0m 40s (remain 15m 21s) Loss: 0.0720(0.1280) Grad: 2523.6663  LR: 0.00004672  
Epoch: [2][100/1207] Elapsed 1m 18s (remain 14m 25s) Loss: 0.1626(0.1298) Grad: 4818.0542  LR: 0.00004348  
Epoch: [2][150/1207] Elapsed 1m 57s (remain 13m 40s) Loss: 0.0739(0.1275) Grad: 3664.9265  LR: 0.00004027  
Epoch: [2][200/1207] Elapsed 2m 35s (remain 12m 59s) Loss: 0.1337(0.1285) Grad: 3789.7515  LR: 0.00003711  
Epoch: [2][250/1207] Elapsed 3m 14s (remain 12m 18s) Loss: 0.1989(0.1284) Grad: 5761.3145  LR: 0.00003399  
Epoch: [2][300/1207] Elapsed 3m 52s (remain 11m 39s) Loss: 0.1435(0.1289) Grad: 3842.1292  LR: 0.00003095  
Epoch: [2][350/1207] Elapsed 4m 30s (remain 11m 0s) Loss: 0.1161(0.1283) Grad: 4598.4155  LR: 0.00002798  
Epoch: [2][400/1207] Elapsed 5m 9s (remain 10m 21s) Loss: 0.2001(0.1275) Grad: 4070.8381  LR: 0.00002511  
Epoch: [2][450/1207] Elapsed 5m 47s (rem

100%|██████████| 45/45 [00:07<00:00,  6.04it/s]
Epoch 2 - avg_train_loss: 0.1283  time: 936s
Epoch 2 - Score: 0.8351
Epoch 2 - Save Best Score: 0.8351 Model
Score: 0.8351


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,▆▇▅▅▅▆▅▇▇▃▄▄▄█▄▄▆▃▃▄▃▃▂▃▅▄▅▄▁▄▂▃▃▃▅▆▃▃▄▃
lr,███████▇▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,▁█

0,1
avg_train_loss,0.12834
loss,0.11246
lr,0.0
score,0.83508


[34m[1mwandb[0m: Currently logged in as: [33manony-mouse-193644[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Some weights of the model checkpoint at studio-ousia/luke-base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1207] Elapsed 0m 1s (remain 31m 57s) Loss: 0.3791(0.3791) Grad: nan  LR: 0.00010000  
Epoch: [1][50/1207] Elapsed 0m 40s (remain 15m 9s) Loss: 0.1990(0.2401) Grad: 7149.9629  LR: 0.00009989  
Epoch: [1][100/1207] Elapsed 1m 18s (remain 14m 19s) Loss: 0.2246(0.2229) Grad: 7750.2646  LR: 0.00009957  
Epoch: [1][150/1207] Elapsed 1m 56s (remain 13m 37s) Loss: 0.1567(0.2126) Grad: 3088.8928  LR: 0.00009904  
Epoch: [1][200/1207] Elapsed 2m 35s (remain 12m 57s) Loss: 0.1714(0.2063) Grad: 3193.7407  LR: 0.00009830  
Epoch: [1][250/1207] Elapsed 3m 13s (remain 12m 17s) Loss: 0.1880(0.2016) Grad: 2846.4412  LR: 0.00009736  
Epoch: [1][300/1207] Elapsed 3m 52s (remain 11m 38s) Loss: 0.1175(0.1994) Grad: 3163.4617  LR: 0.00009622  
Epoch: [1][350/1207] Elapsed 4m 30s (remain 10m 59s) Loss: 0.1322(0.1968) Grad: 3030.3469  LR: 0.00009488  
Epoch: [1][400/1207] Elapsed 5m 9s (remain 10m 21s) Loss: 0.1956(0.1944) Grad: 2564.4248  LR: 0.00009335  
Epoch: [1][450/1207] Elapsed 5m 47s (rem

100%|██████████| 45/45 [00:06<00:00,  6.44it/s]
Epoch 1 - avg_train_loss: 0.1728  time: 935s
Epoch 1 - Score: 0.8301
Epoch 1 - Save Best Score: 0.8301 Model


Epoch: [2][0/1207] Elapsed 0m 1s (remain 31m 59s) Loss: 0.1101(0.1101) Grad: nan  LR: 0.00004997  
Epoch: [2][50/1207] Elapsed 0m 40s (remain 15m 9s) Loss: 0.1547(0.1284) Grad: 4271.1597  LR: 0.00004672  
Epoch: [2][100/1207] Elapsed 1m 18s (remain 14m 19s) Loss: 0.1299(0.1273) Grad: 3973.2083  LR: 0.00004348  
Epoch: [2][150/1207] Elapsed 1m 56s (remain 13m 38s) Loss: 0.1996(0.1291) Grad: 5114.3188  LR: 0.00004027  
Epoch: [2][200/1207] Elapsed 2m 35s (remain 12m 57s) Loss: 0.1518(0.1278) Grad: 4010.0601  LR: 0.00003711  
Epoch: [2][250/1207] Elapsed 3m 13s (remain 12m 18s) Loss: 0.1856(0.1284) Grad: 5793.7476  LR: 0.00003399  
Epoch: [2][300/1207] Elapsed 3m 52s (remain 11m 39s) Loss: 0.0791(0.1294) Grad: 3794.4155  LR: 0.00003095  
Epoch: [2][350/1207] Elapsed 4m 30s (remain 10m 59s) Loss: 0.1139(0.1281) Grad: 3922.9495  LR: 0.00002798  
Epoch: [2][400/1207] Elapsed 5m 8s (remain 10m 21s) Loss: 0.0874(0.1279) Grad: 4607.3027  LR: 0.00002511  
Epoch: [2][450/1207] Elapsed 5m 47s (rem

100%|██████████| 45/45 [00:06<00:00,  7.27it/s]
Epoch 2 - avg_train_loss: 0.1274  time: 934s
Epoch 2 - Score: 0.8330
Epoch 2 - Save Best Score: 0.8330 Model
Score: 0.8330


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,█▄▄▃▃▃▅▃▄▄▂▄▃▃▃▅▂▄▂▃▂▁▁▃▃▄▂▃▄▄▂▃▃▃▄▃▂▃▃▂
lr,███████▇▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,▁█

0,1
avg_train_loss,0.1274
loss,0.09653
lr,0.0
score,0.83304


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Some weights of the model checkpoint at studio-ousia/luke-base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1207] Elapsed 0m 2s (remain 43m 7s) Loss: 0.3854(0.3854) Grad: nan  LR: 0.00010000  
Epoch: [1][50/1207] Elapsed 0m 40s (remain 15m 18s) Loss: 0.1929(0.2388) Grad: 5355.6646  LR: 0.00009989  
Epoch: [1][100/1207] Elapsed 1m 18s (remain 14m 24s) Loss: 0.1652(0.2175) Grad: 5448.2173  LR: 0.00009957  
Epoch: [1][150/1207] Elapsed 1m 57s (remain 13m 41s) Loss: 0.2535(0.2051) Grad: 6028.2451  LR: 0.00009904  
Epoch: [1][200/1207] Elapsed 2m 35s (remain 12m 59s) Loss: 0.1632(0.2006) Grad: 12168.6055  LR: 0.00009830  
Epoch: [1][250/1207] Elapsed 3m 14s (remain 12m 19s) Loss: 0.2340(0.1968) Grad: 3031.0684  LR: 0.00009736  
Epoch: [1][300/1207] Elapsed 3m 52s (remain 11m 40s) Loss: 0.2100(0.1934) Grad: 4841.0825  LR: 0.00009622  
Epoch: [1][350/1207] Elapsed 4m 31s (remain 11m 0s) Loss: 0.2061(0.1918) Grad: 9391.1660  LR: 0.00009488  
Epoch: [1][400/1207] Elapsed 5m 9s (remain 10m 21s) Loss: 0.1848(0.1896) Grad: 4701.9951  LR: 0.00009335  
Epoch: [1][450/1207] Elapsed 5m 47s (rem

100%|██████████| 45/45 [00:07<00:00,  6.39it/s]
Epoch 1 - avg_train_loss: 0.1786  time: 936s
Epoch 1 - Score: 0.8124
Epoch 1 - Save Best Score: 0.8124 Model


Epoch: [2][0/1207] Elapsed 0m 1s (remain 36m 11s) Loss: 0.1598(0.1598) Grad: nan  LR: 0.00004997  
Epoch: [2][50/1207] Elapsed 0m 40s (remain 15m 12s) Loss: 0.1164(0.1334) Grad: 6041.5220  LR: 0.00004672  
Epoch: [2][100/1207] Elapsed 1m 18s (remain 14m 21s) Loss: 0.2088(0.1344) Grad: 9220.4482  LR: 0.00004348  
Epoch: [2][150/1207] Elapsed 1m 57s (remain 13m 38s) Loss: 0.1244(0.1361) Grad: 4251.0117  LR: 0.00004027  
Epoch: [2][200/1207] Elapsed 2m 35s (remain 12m 58s) Loss: 0.1907(0.1369) Grad: 5817.6177  LR: 0.00003711  
Epoch: [2][250/1207] Elapsed 3m 13s (remain 12m 18s) Loss: 0.1376(0.1360) Grad: 3995.4849  LR: 0.00003399  
Epoch: [2][300/1207] Elapsed 3m 52s (remain 11m 39s) Loss: 0.1262(0.1360) Grad: 5376.7085  LR: 0.00003095  
Epoch: [2][350/1207] Elapsed 4m 30s (remain 10m 59s) Loss: 0.0911(0.1363) Grad: 4200.6118  LR: 0.00002798  
Epoch: [2][400/1207] Elapsed 5m 8s (remain 10m 21s) Loss: 0.1102(0.1355) Grad: 5215.5029  LR: 0.00002511  
Epoch: [2][450/1207] Elapsed 5m 47s (re

100%|██████████| 45/45 [00:06<00:00,  7.25it/s]
Epoch 2 - avg_train_loss: 0.1333  time: 934s
Epoch 2 - Score: 0.8118
Score: 0.8124


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,▇▇▄▅▇▇▅█▄█▅▆█▄▆▅▅▂▄▆▃▄▄▇▅▆▂▇▃▁▄▂▄▇▅▆▇▅▃▃
lr,███████▇▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,█▁

0,1
avg_train_loss,0.13329
loss,0.07598
lr,0.0
score,0.81179


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Some weights of the model checkpoint at studio-ousia/luke-base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1207] Elapsed 0m 2s (remain 40m 35s) Loss: 0.3624(0.3624) Grad: nan  LR: 0.00010000  
Epoch: [1][50/1207] Elapsed 0m 40s (remain 15m 17s) Loss: 0.1967(0.2291) Grad: 4764.8896  LR: 0.00009989  
Epoch: [1][100/1207] Elapsed 1m 18s (remain 14m 23s) Loss: 0.1583(0.2140) Grad: 6803.3213  LR: 0.00009957  
Epoch: [1][150/1207] Elapsed 1m 57s (remain 13m 40s) Loss: 0.1890(0.2091) Grad: 24169.6016  LR: 0.00009904  
Epoch: [1][200/1207] Elapsed 2m 35s (remain 12m 59s) Loss: 0.1561(0.2024) Grad: 4232.9458  LR: 0.00009830  
Epoch: [1][250/1207] Elapsed 3m 14s (remain 12m 19s) Loss: 0.1588(0.1996) Grad: 4975.9829  LR: 0.00009736  
Epoch: [1][300/1207] Elapsed 3m 52s (remain 11m 39s) Loss: 0.1756(0.1984) Grad: 4593.9565  LR: 0.00009622  
Epoch: [1][350/1207] Elapsed 4m 30s (remain 11m 0s) Loss: 0.1495(0.1964) Grad: 4699.2861  LR: 0.00009488  
Epoch: [1][400/1207] Elapsed 5m 9s (remain 10m 21s) Loss: 0.2096(0.1952) Grad: 2381.8154  LR: 0.00009335  
Epoch: [1][450/1207] Elapsed 5m 47s (re

100%|██████████| 45/45 [00:06<00:00,  6.65it/s]
Epoch 1 - avg_train_loss: 0.1758  time: 935s
Epoch 1 - Score: 0.8270
Epoch 1 - Save Best Score: 0.8270 Model


Epoch: [2][0/1207] Elapsed 0m 1s (remain 35m 55s) Loss: 0.0895(0.0895) Grad: nan  LR: 0.00004997  
Epoch: [2][50/1207] Elapsed 0m 40s (remain 15m 15s) Loss: 0.1154(0.1317) Grad: 5459.8276  LR: 0.00004672  
Epoch: [2][100/1207] Elapsed 1m 18s (remain 14m 22s) Loss: 0.1640(0.1332) Grad: 3807.1423  LR: 0.00004348  
Epoch: [2][150/1207] Elapsed 1m 57s (remain 13m 39s) Loss: 0.1247(0.1320) Grad: 4818.7988  LR: 0.00004027  
Epoch: [2][200/1207] Elapsed 2m 35s (remain 12m 58s) Loss: 0.1184(0.1320) Grad: 4903.5229  LR: 0.00003711  
Epoch: [2][250/1207] Elapsed 3m 13s (remain 12m 18s) Loss: 0.1127(0.1326) Grad: 4249.5366  LR: 0.00003399  
Epoch: [2][300/1207] Elapsed 3m 52s (remain 11m 39s) Loss: 0.1272(0.1315) Grad: 4388.1147  LR: 0.00003095  
Epoch: [2][350/1207] Elapsed 4m 30s (remain 10m 59s) Loss: 0.1464(0.1325) Grad: 4387.3374  LR: 0.00002798  
Epoch: [2][400/1207] Elapsed 5m 8s (remain 10m 21s) Loss: 0.1519(0.1312) Grad: 4355.0791  LR: 0.00002511  
Epoch: [2][450/1207] Elapsed 5m 47s (re

100%|██████████| 45/45 [00:06<00:00,  6.96it/s]
Epoch 2 - avg_train_loss: 0.1289  time: 934s
Epoch 2 - Score: 0.8281
Epoch 2 - Save Best Score: 0.8281 Model
Score: 0.8281


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,▄▇▇▅▆▆█▆▇▃▇▅▅▅▇▆▅▅▁▇▇▄▁▃▂▂▆▆▄▅▆▃▂▂▆▄▇▂▂▅
lr,███████▇▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,▁█

0,1
avg_train_loss,0.12885
loss,0.14332
lr,0.0
score,0.82811


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Some weights of the model checkpoint at studio-ousia/luke-base were not used when initializing LukeModel: ['embeddings.position_ids']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1207] Elapsed 0m 1s (remain 36m 7s) Loss: 0.3874(0.3874) Grad: nan  LR: 0.00010000  
Epoch: [1][50/1207] Elapsed 0m 40s (remain 15m 9s) Loss: 0.1946(0.2327) Grad: 8731.4961  LR: 0.00009989  
Epoch: [1][100/1207] Elapsed 1m 18s (remain 14m 19s) Loss: 0.2114(0.2145) Grad: 5134.8442  LR: 0.00009957  
Epoch: [1][150/1207] Elapsed 1m 56s (remain 13m 36s) Loss: 0.1413(0.2075) Grad: 4523.4121  LR: 0.00009904  
Epoch: [1][200/1207] Elapsed 2m 35s (remain 12m 56s) Loss: 0.1555(0.2009) Grad: 2536.8333  LR: 0.00009830  
Epoch: [1][250/1207] Elapsed 3m 13s (remain 12m 17s) Loss: 0.2064(0.1954) Grad: 4182.6953  LR: 0.00009736  
Epoch: [1][300/1207] Elapsed 3m 51s (remain 11m 38s) Loss: 0.1847(0.1928) Grad: 3124.8411  LR: 0.00009622  
Epoch: [1][350/1207] Elapsed 4m 30s (remain 10m 59s) Loss: 0.1726(0.1896) Grad: 2035.8291  LR: 0.00009488  
Epoch: [1][400/1207] Elapsed 5m 8s (remain 10m 20s) Loss: 0.2709(0.1876) Grad: 2897.9758  LR: 0.00009335  
Epoch: [1][450/1207] Elapsed 5m 47s (rema

100%|██████████| 45/45 [00:06<00:00,  6.86it/s]
Epoch 1 - avg_train_loss: 0.1692  time: 934s
Epoch 1 - Score: 0.8115
Epoch 1 - Save Best Score: 0.8115 Model


Epoch: [2][0/1207] Elapsed 0m 1s (remain 38m 4s) Loss: 0.1820(0.1820) Grad: nan  LR: 0.00004997  
Epoch: [2][50/1207] Elapsed 0m 40s (remain 15m 15s) Loss: 0.1117(0.1302) Grad: 3414.6333  LR: 0.00004672  
Epoch: [2][100/1207] Elapsed 1m 18s (remain 14m 22s) Loss: 0.1087(0.1265) Grad: 3559.8525  LR: 0.00004348  
Epoch: [2][150/1207] Elapsed 1m 57s (remain 13m 38s) Loss: 0.0938(0.1269) Grad: 3978.1335  LR: 0.00004027  
Epoch: [2][200/1207] Elapsed 2m 35s (remain 12m 58s) Loss: 0.1346(0.1252) Grad: 4426.9473  LR: 0.00003711  
Epoch: [2][250/1207] Elapsed 3m 13s (remain 12m 18s) Loss: 0.1722(0.1264) Grad: 4902.9492  LR: 0.00003399  
Epoch: [2][300/1207] Elapsed 3m 52s (remain 11m 39s) Loss: 0.0928(0.1271) Grad: 3662.7559  LR: 0.00003095  
Epoch: [2][350/1207] Elapsed 4m 30s (remain 10m 59s) Loss: 0.0623(0.1293) Grad: 4186.4839  LR: 0.00002798  
Epoch: [2][400/1207] Elapsed 5m 8s (remain 10m 21s) Loss: 0.1111(0.1297) Grad: 3282.5264  LR: 0.00002511  
Epoch: [2][450/1207] Elapsed 5m 47s (rem

100%|██████████| 45/45 [00:06<00:00,  7.03it/s]
Epoch 2 - avg_train_loss: 0.1274  time: 934s
Epoch 2 - Score: 0.8204
Epoch 2 - Save Best Score: 0.8204 Model
Score: 0.8204


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
avg_train_loss,█▁
loss,▇▅▅▆▅▅▅▆▆▃▆▄▆▅▃▇█▄▄▄▂▃▂▄▄▄▄▄▃▃▁▆▂▂▄▂▂▄▂▄
lr,███████▇▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
score,▁█

0,1
avg_train_loss,0.12737
loss,0.0841
lr,0.0
score,0.82037


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Score: 0.8258
Score: 0.7094

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
all cv,▁
origin cv,▁

0,1
all cv,0.82581
origin cv,0.70945
