In [28]:
import os
import numpy as np
import pandas as pd
import random
import time
import string
from collections import defaultdict, deque

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging
from transformers import AlbertConfig, AlbertTokenizer, AlbertModel
from transformers import RobertaConfig,RobertaTokenizer, RobertaModel
from transformers import BertConfig, BertTokenizer, BertModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader
from torch.cuda.amp import autocast as autocast
from torch.cuda.amp import  GradScaler

from torch.optim import lr_scheduler
from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output
from torch.optim.swa_utils import AveragedModel, SWALR
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
logging.set_verbosity_error()

In [29]:


# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

#产生一个哈希值
# def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
#     return ''.join(random.SystemRandom().choice(chars) for _ in range(size))

# HASH_NAME = id_generator(size=12)


# config

In [30]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CONFIG={
    "TRAIN_BATCH_SIZE":48,
    "MAX_LENGTH":128,
    "DEV_BATCH_SIZE": 64,
    "LR":2e-5,
    "EPS":1e-8,
    "weight_decay":1e-6,
    
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "T_0":500,
    "margin":0.45,
    "fold_num":5,
    "seed":2021,
    "num_class":1,
    
    "EPOCHS":4,
    "evaluate_step":None,
    "swa_start":4,
    "model_init_lr":0.9e-4,
    "multiplier":0.9,
    "classifier_lr":1e-4 ,
    "swa_lr": 1e-5
}
input_dir="./input/jigsaw-toxic-severity-rating"


更换模型

In [31]:
hidden_size="hidden_size"
num_hidden_layers="num_hidden_layers"
OUT_DIR="./output/jigsaw_server_albert/albert_folder"
#for xlnet
# hidden_size="d_model"
# num_hidden_layers="n_layer"

# MODEL_DIR="../model/roberta-base"
# MODEL_DIR="../model/roberta-large"

MODEL_DIR="../model/albert-large-v2"
# MODEL_DIR="../model/xlnet-base-cased"

Model_type="Albert"
tokenizer_func_dict={"Albert":AlbertTokenizer,"auto":AutoTokenizer,"Roberta":RobertaTokenizer}
config_func_dict={"Albert":AlbertConfig,"auto":AutoConfig,"Roberta":RobertaConfig}
model_func_dict={"Albert":AlbertModel,"auto":AutoModel,"Roberta":RobertaModel}

##检查事项
* 提交之前 注意 run_db 是否打开 是否创建了正确的hash值
* test是否关闭
* 如果 换模型 model_dir 是否正确 model struct是否正确
* gpu 是否需要打开

In [32]:
DATASET_TEST=False
run_db=True
model_struct="OriginModel"

HASH_NAME="albert  margin=0.45"

swa_use=False
data_aug=False
translate_aug=False
FP16=True

#OriginModel MeanPoolingModel LastLayerCLSModel MaxPoolingModel
#SecondToLastLayerCLSModel ConcatenateLastFourModel WeightedLayerPoolingModel WeightedLayerPoolingModel
#AttentionPoolingModel
translate_text=["text_fr","text_de","text_es"]

CONFIG['group'] = f'{HASH_NAME}-Baseline'


# W&B

In [33]:
import wandb

try:
#     from kaggle_secrets import UserSecretsClient
#     user_secrets = UserSecretsClient()
#     api_key = user_secrets.get_secret("wandb_api")
    api_key="ebe051612bfb733306f4e4b5df4b043050ebea6e"
    wandb.login(key=api_key)
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/py/.netrc


# 数据处理

In [34]:
data_df=pd.read_csv(os.path.join(input_dir,"validation_data.csv"))

数据增强

In [35]:
from sklearn.preprocessing import LabelEncoder
def generate_comments(data):
    more_toxic_text=data["more_toxic"].values
    less_toxic_text=data["less_toxic"].values    
    comments=np.concatenate((more_toxic_text,less_toxic_text))
    comments=np.unique(comments)
    comments=pd.DataFrame({"text":comments})
    text_encoder=LabelEncoder()
    text_encoder.fit(comments)
    comments["encode_text"]=text_encoder.transform(comments["text"])
    comments["toxic_value"]=0
    comments["access_time"]=0
    data["encode_less"]=text_encoder.transform(data["less_toxic"])
    data["encode_more"]=text_encoder.transform(data["more_toxic"])
    
    return data,comments


In [36]:
def bsearch(start,more2less_dict):
    queue = deque([start])
    visit_list=[]
    while len(queue)!=0:
        visit_id=queue.popleft()
        if visit_id in visit_list:
            continue
        visit_list.append(visit_id)
        queue+=deque(more2less_dict[visit_id])
    visit_list.remove(start)
    return [ x for x in visit_list if x not in more2less_dict[start] ]

def search_lessText(more2less_dict):
    aug_dict= defaultdict(list)
    for start in list(more2less_dict.keys()):
        
        aug_list=bsearch(start,more2less_dict)
        aug_dict[start]=aug_list
    return aug_dict

def data_aug1(data_df,comments):
    data_df["label_min"]=data_df.apply(lambda row:row["encode_less"] 
                                   if row["encode_less"]<row["encode_more"] else row["encode_more"],axis=1)
    data_df["label_max"]=data_df.apply(lambda row:row["encode_more"] 
                                       if row["encode_less"]<row["encode_more"] else row["encode_less"],axis=1)

    data_df["win_min"]=data_df.apply(lambda row:1 if row["encode_more"]<row["encode_less"] else 0 ,axis=1)
    data_df["win_max"]=data_df.apply(lambda row:0 if row["encode_more"]<row["encode_less"] else 1 ,axis=1)

    data_df_agg=data_df.groupby(["label_min","label_max"]).agg({"win_min":"sum","win_max":"sum"}).reset_index()
    data_df_agg["encode_less"]=data_df_agg.apply(lambda row:row["label_min"] 
                                                 if row["win_min"]<row["win_max"] else row["label_max"],axis=1)
    data_df_agg["encode_more"]=data_df_agg.apply(lambda row:row["label_min"] 
                                                 if row["win_min"]>row["win_max"] else row["label_max"],axis=1)
    
    more2less_dict= defaultdict(list)
    data_df_agg.apply(lambda row:more2less_dict[row["encode_more"]].append(row["encode_less"]),axis=1)
    
    aug_dict=search_lessText(more2less_dict)
    aug_dict={key:value for key,value in aug_dict.items() if len(value)!=0}
    aug_df=pd.DataFrame(columns=(tuple(data_df.columns)))
    
    id2text_dict=comments.to_dict()["text"]
    
    for key,value in aug_dict.items():
        encode_more=key
        encode_less_list=value

        more_toxic=id2text_dict[encode_more]
        for encode_less in encode_less_list:
            less_toxic=id2text_dict[encode_less]
            row=pd.DataFrame({"worker":[999],"less_toxic":[less_toxic],"more_toxic":[more_toxic],"encode_less":[encode_less],
                                       "encode_more":[encode_more]})
            aug_df=aug_df.append(row,ignore_index=True)
    work_list=np.array([999]*len(aug_df),dtype=np.int64)
    aug_df["worker"]=work_list
    return aug_df

In [37]:
data_df,comments=generate_comments(data_df)
if translate_aug==True:
    comment_translation=pd.read_csv("../input/translate-toxic/comment_translation.csv")
    comment_translation=comment_translation.merge(comments,on="text",how="left")
if data_aug==True:
    aug_df=data_aug1(data_df,comments)
    data_df=pd.concat([data_df,aug_df],axis=0)
    data_df=data_df.reset_index(drop=True)

In [38]:
if DATASET_TEST==True:
    data_df=data_df[0:400]

交叉

In [39]:
from sklearn.model_selection import GroupKFold
class UnionFind():
    def __init__(self, n):
        self.n = n
        self.parents = [-1] * n

    def find(self, x):
        if self.parents[x] < 0:
            return x
        else:
            self.parents[x] = self.find(self.parents[x])
            return self.parents[x]

    def union(self, x, y):
        x = self.find(x)
        y = self.find(y)
        if x == y:
            return
        if self.parents[x] > self.parents[y]:
            x, y = y, x
        self.parents[x] += self.parents[y]
        self.parents[y] = x


def get_group_unionfind(train: pd.DataFrame):
    less_unique_text = train['less_toxic'].unique()
    more_unique_text = train['more_toxic'].unique()
    unique_text = np.hstack([less_unique_text, more_unique_text])
    unique_text = np.unique(unique_text).tolist()    
    text2num = {text: i for i, text in enumerate(unique_text)}
    num2text = {num: text for text, num in text2num.items()}
    train['num_less_toxic'] = train['less_toxic'].map(text2num)
    train['num_more_toxic'] = train['more_toxic'].map(text2num)

    uf = UnionFind(len(unique_text))
    for seq1, seq2 in train[['num_less_toxic', 'num_more_toxic']].to_numpy():
        uf.union(seq1, seq2)

    text2group = {num2text[i]: uf.find(i) for i in range(len(unique_text))}
    train['group'] = train['less_toxic'].map(text2group)
    train = train.drop(columns=['num_less_toxic', 'num_more_toxic'])
    return train

In [40]:
data_df = get_group_unionfind(data_df)
group_kfold = GroupKFold(n_splits=CONFIG["fold_num"])
for fold, (trn_idx, val_idx) in enumerate(group_kfold.split(data_df, data_df, data_df['group'])): 
    data_df.loc[val_idx , "kfold"] = fold

data_df["kfold"] = data_df["kfold"].astype(int)
data_df.to_csv('train_noleak.csv', index=False)
data_df.head()

Unnamed: 0,worker,less_toxic,more_toxic,encode_less,encode_more,group,kfold
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,2405,12151,2405,2
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,7215,653,697,2
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",2632,7222,2632,2
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,7973,12968,7973,2
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",3524,3266,3524,0


In [41]:
class DatasetRetriever(Dataset):
    def __init__(self,data,tokenizer,max_len=CONFIG["MAX_LENGTH"]):
        self.data=data
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.more_toxic=data["more_toxic"].values
        self.less_toxic=data["less_toxic"].values
        
    def __len__(self):
        return len(self.data)
    def __getitem__(self, item):
        more_toxic=self.more_toxic[item]
        less_toxic=self.less_toxic[item]

        features1=self.convert_examples_to_features(more_toxic)
        features2=self.convert_examples_to_features(less_toxic)
        features1={"input_ids":features1["input_ids"],"attention_mask":features1["attention_mask"]}
        features2={"input_ids":features2["input_ids"],"attention_mask":features2["attention_mask"]}
        target=1
        return {"more_toxic":{key:torch.tensor(value,dtype=torch.long) for key,value in features1.items()},
                "less_toxic":{key:torch.tensor(value,dtype=torch.long) for key,value in features2.items()},
                "target":torch.tensor(target,dtype=torch.long)}
    def convert_examples_to_features(self, example):
        encoded = self.tokenizer.encode_plus(
            example,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=False,
            )
        return encoded
def make_dataloader(data,batch_size,model_dir=MODEL_DIR,max_len=CONFIG["MAX_LENGTH"]):
    tokenizer=tokenizer_func_dict.get(Model_type).from_pretrained(model_dir)
    dataset=DatasetRetriever(data,tokenizer,max_len)
    sampler=RandomSampler(dataset)
    
    dataloader=DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=sampler
                         )
    return dataloader

class DatasetRetriever_cv(Dataset):
    def __init__(self,data,tokenizer,max_len=CONFIG["MAX_LENGTH"]):
        self.data=data
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.text=self.data["text"].values
        
    def __len__(self):
        return len(self.data)
    def __getitem__(self, item):
        text=self.text[item]
        features1=self.convert_examples_to_features(text)
        ##roberta 没有tokentype ids 为了统一这里也不进行输入 反正训练也用不着
        features1={"input_ids":features1["input_ids"],"attention_mask":features1["attention_mask"]}
        return {"text":{key:torch.tensor(value,dtype=torch.long) for key,value in features1.items()}}
    def convert_examples_to_features(self, example):
        encoded = self.tokenizer.encode_plus(
            example,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=False,
            )
        return encoded
def make_dataloader_cv(data,batch_size,model_dir=MODEL_DIR,max_len=CONFIG["MAX_LENGTH"]):
    tokenizer=AutoTokenizer.from_pretrained(model_dir)
    dataset=DatasetRetriever_cv(data,tokenizer,max_len)
    sampler=SequentialSampler(dataset)
    
    dataloader=DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=sampler
                         )
    return dataloader

In [42]:
def prepare_loaders(df,fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    if translate_aug==True:
        df_train_encode=df_train.drop(["less_toxic","more_toxic"],axis=1)
        for language_text in translate_text:
            temp_train=df_train_encode
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_less",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"less_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_more",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"more_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            df_train=pd.concat([df_train,temp_train])
    train_loader=make_dataloader(df_train,CONFIG["TRAIN_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
    
    valid_loader=make_dataloader(df_valid,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])

 
    return train_loader, valid_loader

** 模型输出结构

In [43]:
class OriginModel(nn.Module):
    def __init__(self,model_name):
        super(OriginModel,self).__init__()
        self.config=config_func_dict.get(Model_type).from_pretrained(model_name)
        self.config.update({"hidden_dropout_prob": 0.0,"attention_probs_dropout_prob":0.0
            })   
        self.model=model_func_dict.get(Model_type).from_pretrained(model_name,config=self.config)
        self.drop=nn.Dropout(p=0)
        
        self.linear=nn.Linear(self.config.to_dict()[hidden_size],CONFIG["num_class"])
           
        self.dense = nn.Linear(self.config.to_dict()[hidden_size], self.config.to_dict()[hidden_size])
        self.activation = nn.Tanh()
    def forward(self,input_ids,attention_mask):
        out=self.model(input_ids=input_ids,attention_mask=attention_mask,output_hidden_states=False)
        last_hidden_state = out[0]
        cls_embeddings = last_hidden_state[:,0]
        pooled_output = self.dense(cls_embeddings)
        pooled_output = self.activation(pooled_output)
        
        out=self.drop(pooled_output)
        
        outputs=self.linear(out)
        
        return outputs

In [44]:
func_dict={"OriginModel":OriginModel}
JigsawModel=func_dict.get(model_struct)


In [45]:
def criterion(outputs1, outputs2, targets):
    return nn.MarginRankingLoss(margin=CONFIG["margin"])(outputs1, outputs2, targets)

In [46]:
def get_parameters(model,model_init_lr,multiplier, classifier_lr):
    #权重分层，越靠近下游学习率越高
    parameters=[]
    lr=model_init_lr
    # 迭代器包含 层名字和参数 parameters()函数只包含参数
    #定义的层字典，参数的key必须叫params，否则在optimizer 父类中冲突
    for layer in range(model.config.to_dict()[num_hidden_layers]-1,-1,-1):
        layer_parameters={
            "params":[p for n,p in model.named_parameters() if f"encoder.layer.{layer}." in n],
            "lr":lr
        }
        lr*=multiplier
        parameters.append(layer_parameters)
    
    
    classify_parameters={
        #自己定义了什么分类层在此更改名字
        "params":[p for n,p in model.named_parameters() if "linear"  in n],
        "lr":classifier_lr
    }
        
    parameters.append(classify_parameters)
    return parameters

In [47]:
def fetch_scheduler(optimizer):
    if CONFIG['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'], 
                                                   eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == 'CosineAnnealingWarmRestarts':
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CONFIG['T_0'], 
                                                             eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == None:
        return None
        
    return scheduler

In [48]:
def evaluate_cv(model,test_dataloader):
    model.eval()
    Preds=[]
    for index,batch in enumerate(test_dataloader):

        text_inputs=batch["text"]
        
        text_inputs={key: value.to(DEVICE) for key,value in text_inputs.items()}
        with torch.no_grad():
            out_more=model(**text_inputs)
            Preds.append(out_more.view(-1).cpu().detach().numpy())
    
    Preds = np.concatenate(Preds) 
    gc.collect()
    
    return Preds
def evaluate_comments(model,df,fold):
    
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    comments_fold_id=np.concatenate((df_valid["encode_more"].values,df_valid["encode_less"].values))
    comments_fold_id=np.unique(comments_fold_id)
    select_fold_list=comments.apply(lambda row : True if row["encode_text"] in comments_fold_id else False ,axis=1)
    comments_fold=comments[select_fold_list]
    valid_loader=make_dataloader_cv(comments_fold,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])   
    
    preds=evaluate_cv(model,valid_loader)
    preds=np.array(preds)
    comments_fold["toxic_value"]=preds
    comments_fold.index=comments_fold["encode_text"]
    index_score_dict=comments_fold.to_dict()["toxic_value"]
    df_valid["less_value"]=df_valid["encode_less"].map(lambda x:index_score_dict[x])
    df_valid["more_value"]=df_valid["encode_more"].map(lambda x:index_score_dict[x])
    df_valid["pair_True"]=df_valid.apply(lambda row:True if row["more_value"]>row["less_value"] else False,axis=1)
    cv=df_valid["pair_True"].mean()
    return -1*cv



In [49]:
def evaluate(model,dev_dataloader):
    model.eval()
    dev_loss=0
    for index,batch in enumerate(dev_dataloader):
        
        more_toxic_inputs=batch["more_toxic"]
        less_toxic_inputs=batch["less_toxic"]
        target=batch["target"].to(DEVICE)

        more_toxic_inputs={key: value.to(DEVICE) for key,value in more_toxic_inputs.items()}
        less_toxic_inputs={key: value.to(DEVICE) for key,value in less_toxic_inputs.items()}
        with torch.no_grad():
            out_more=model(**more_toxic_inputs)
            out_less=model(**less_toxic_inputs)

            loss=criterion(out_more, out_less, target)
        
            dev_loss+=loss.item()
        
    return dev_loss/len(dev_dataloader)
def train(model,train_dataloader,dev_dataloader,evaluate_step=None,swa_start=None,fold=0):

    if run_db==True:
        wandb.watch(model,log_freq=100)
#     optimizer=AdamW(get_parameters(model, model_init_lr=CONFIG["model_init_lr"], multiplier=CONFIG["multiplier"], 
#                                    classifier_lr=CONFIG["classifier_lr"]),
#                     lr = CONFIG['LR'], eps = CONFIG['EPS'],weight_decay=CONFIG['weight_decay'])
    optimizer = AdamW(model.parameters(),lr= CONFIG['LR'], eps = CONFIG['EPS'],weight_decay=CONFIG['weight_decay'])
    if evaluate_step==None:
        evaluate_step=len(train_dataloader)
    if swa_use==True:
        swa_model=AveragedModel(model).to(DEVICE)
        swa_scheduler = SWALR(optimizer, swa_lr=CONFIG["swa_lr"])
    """
    get_linear_schedule_with_warmup:学习率先从0开始warm_up到设定学习率，再逐渐减到0
    num_warmup_steps：完成预热的步数
    num_training_steps：训练批次*epochs 训练的step数
    """
    scheduler = fetch_scheduler(optimizer)
    if scheduler==None:
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                                num_training_steps=len(train_dataloader) * CONFIG["EPOCHS"])
    best_val_loss=100
    best_model_param=None

    scaler = GradScaler()
    start=time.time()
    for epoch in range(CONFIG["EPOCHS"]):
        print(f"\n Epoch{epoch} train start \n")
        train_loss=0
        model.train()
        #total 更新进度 
        bar=tqdm(enumerate(train_dataloader),total=len(train_dataloader))
        for index,batch in bar:
            model.zero_grad()
            more_toxic_inputs=batch["more_toxic"]
            less_toxic_inputs=batch["less_toxic"]
            target=batch["target"].to(DEVICE)

            more_toxic_inputs={key: value.to(DEVICE) for key,value in more_toxic_inputs.items()}
            less_toxic_inputs={key: value.to(DEVICE) for key,value in less_toxic_inputs.items()}
            if FP16==True:
                with autocast():
                    out_more=model(**more_toxic_inputs)
                    out_less=model(**less_toxic_inputs)
                    loss=criterion(out_more, out_less, target)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                out_more=model(**more_toxic_inputs)
                out_less=model(**less_toxic_inputs)
                loss=criterion(out_more, out_less, target)
                loss.backward()
                optimizer.step()
            if swa_use==True and epoch>=swa_start-1:

                swa_model.update_parameters(model)
                swa_scheduler.step()
            else:
                scheduler.step()

            train_loss+=loss.item()
            if (index+1)%evaluate_step==0 or (index+1)==len(train_dataloader):
                if swa_use==True and epoch>=swa_start-1:
                    val_loss=evaluate(swa_model,dev_dataloader)
#                     val_loss=evaluate_comments(swa_model,data_df,fold)
                    
                    
                else:
                    val_loss=evaluate(model,dev_dataloader)
#                     val_loss=evaluate_comments(model,data_df,fold)
                    
                if run_db==True:
                    wandb.log({"Train LOSS":loss})
                    wandb.log({"Valid LOSS":val_loss})

                if val_loss<best_val_loss:
                    best_val_loss=val_loss
                    if swa_use==True and epoch>=swa_start-1:
                        best_model_param=swa_model.module.state_dict()
                    else:
                        best_model_param=model.state_dict()
                    print(f"best_model saved ,val_loss:{best_val_loss}")
        avg_train_loss=train_loss/len(train_dataloader)
        print(f"EPOCH:{epoch+1},train_loss:{avg_train_loss},val_loss:{val_loss}")

    end=time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
    time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    if run_db==True:
        run.summary["time (hour)"]=time_elapsed /3600
    return best_val_loss,best_model_param

In [23]:
for fold in range(CONFIG["fold_num"]):
    print(f"\n Fold{fold} train start")
    if run_db==True:
        run = wandb.init(project='Jigsaw', 
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
                     name=f'{HASH_NAME}-fold-{fold}',
                     anonymous='must')
    train_loader,dev_loader=prepare_loaders(data_df,fold)    
    model=JigsawModel(MODEL_DIR)
    model.to(DEVICE)
    dev_loss,best_model_param=train(model,train_loader,dev_loader,evaluate_step=CONFIG["evaluate_step"],swa_start=CONFIG["swa_start"],fold=fold)
    model_path=f"./output/jigsaw_server_albert/bestmodel-{fold}.pth"
    torch.save(best_model_param,model_path)
    if run_db==True:
        run.finish()
    
    del model,train_loader,dev_loader        
    gc.collect()


 Fold0 train start



 Epoch0 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.3100709993588297
EPOCH:1,train_loss:0.32408953081089187,val_loss:0.3100709993588297

 Epoch1 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:2,train_loss:0.3297381360989168,val_loss:0.44975944506494625

 Epoch2 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:3,train_loss:0.33516147569474947,val_loss:0.32381702112524136

 Epoch3 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:4,train_loss:0.31094562983607865,val_loss:0.3179875085228368
Training complete in 0h 27m 7s


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train LOSS,▁█▄▂
Valid LOSS,▁█▂▁

0,1
Train LOSS,0.32418
Valid LOSS,0.31799
time (hour),0.45189



 Fold1 train start



 Epoch0 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.30711959082829327
EPOCH:1,train_loss:0.32814309651039514,val_loss:0.30711959082829327

 Epoch1 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:2,train_loss:0.3009010499335855,val_loss:0.3257108885990946

 Epoch2 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:3,train_loss:0.3536465850721792,val_loss:0.32254256794327185

 Epoch3 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:4,train_loss:0.31571471272593,val_loss:0.3169236719608307
Training complete in 0h 27m 26s


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train LOSS,▅▇█▁
Valid LOSS,▁█▇▅

0,1
Train LOSS,0.2441
Valid LOSS,0.31692
time (hour),0.45718



 Fold2 train start



 Epoch0 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.3134022074310403
EPOCH:1,train_loss:0.32501104539371584,val_loss:0.3134022074310403

 Epoch1 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:2,train_loss:0.30371212407174814,val_loss:0.45022765084316857

 Epoch2 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:3,train_loss:0.4505133356349877,val_loss:0.4500026577397397

 Epoch3 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:4,train_loss:0.44999920294816753,val_loss:0.44999900805322746
Training complete in 0h 27m 14s


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train LOSS,▁█▆▆
Valid LOSS,▁███

0,1
Train LOSS,0.44999
Valid LOSS,0.45
time (hour),0.45381



 Fold3 train start



 Epoch0 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.33996284462903675
EPOCH:1,train_loss:0.3564360869595729,val_loss:0.33996284462903675

 Epoch1 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.3275292412230843
EPOCH:2,train_loss:0.32473164737462046,val_loss:0.3275292412230843

 Epoch2 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.3242498048041996
EPOCH:3,train_loss:0.3016152434733759,val_loss:0.3242498048041996

 Epoch3 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

EPOCH:4,train_loss:0.2848412596965691,val_loss:0.325944459908887
Training complete in 0h 27m 16s


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train LOSS,█▇▅▁
Valid LOSS,█▂▁▂

0,1
Train LOSS,0.2018
Valid LOSS,0.32594
time (hour),0.45438



 Fold4 train start



 Epoch0 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.3632579627789949
EPOCH:1,train_loss:0.3678912064171882,val_loss:0.3632579627789949

 Epoch1 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.35080386055143253
EPOCH:2,train_loss:0.3433732163027463,val_loss:0.35080386055143253

 Epoch2 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.3229456498434669
EPOCH:3,train_loss:0.3162900400589187,val_loss:0.3229456498434669

 Epoch3 train start 



  0%|          | 0/502 [00:00<?, ?it/s]

best_model saved ,val_loss:0.319041445380763
EPOCH:4,train_loss:0.29949449976364456,val_loss:0.319041445380763
Training complete in 0h 27m 25s


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train LOSS,█▅█▁
Valid LOSS,█▆▂▁

0,1
Train LOSS,0.21764
Valid LOSS,0.31904
time (hour),0.45693


# cv

In [24]:
def inference(model_paths,data_df,comments):
    
    for fold in range(CONFIG["fold_num"]):
        print(f"fold{fold} dev start")

        data_fold=data_df[data_df.kfold == fold]
#         data_fold.drop(["label_min","label_max","win_min","win_max"],axis=1,inplace=True)
    
        comments_fold_id=np.concatenate((data_fold["encode_more"].values,data_fold["encode_less"].values))
        comments_fold_id=np.unique(comments_fold_id)
        select_fold_list=comments.apply(lambda row : True if row["encode_text"] in comments_fold_id else False ,axis=1)
        comments_fold=comments[select_fold_list]
        comments_fold["access_time"]=comments_fold["access_time"]+1
        
        
        test_loader=make_dataloader_cv(comments_fold,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
        model=JigsawModel(MODEL_DIR)
        model.to(DEVICE)
        path=model_paths[fold]
        
        model.load_state_dict(torch.load(path))
        preds = evaluate_cv(model, test_loader)
        comments_fold["toxic_value"]=comments_fold["toxic_value"]+preds
        
        data_df.loc[data_fold.index]=data_fold
        comments.loc[comments_fold.index]=comments_fold

        del model,test_loader        
    
    return data_df,comments

In [25]:
if run_db==True:
    run = wandb.init(project='Jigsaw', 
             config=CONFIG,
             job_type='cv',
             group=CONFIG['group'],
             tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
             name=f'cv',
             anonymous='must')

MODEL_PATHS=[os.path.join(OUT_DIR,f"bestmodel-{num}.pth") for num in range(CONFIG["fold_num"])]

data_df,comments= inference(MODEL_PATHS, data_df,comments)

comments["toxic_value"]=comments["toxic_value"]/comments["access_time"]
comments.index=comments["encode_text"]
index_score_dict=comments.to_dict()["toxic_value"]
data_df["less_value"]=data_df["encode_less"].map(lambda x:index_score_dict[x])
data_df["more_value"]=data_df["encode_more"].map(lambda x:index_score_dict[x])
data_df["pair_True"]=data_df.apply(lambda row:True if row["more_value"]>row["less_value"] else False,axis=1)
cv=data_df["pair_True"].mean()
data_df.to_csv(os.path.join(OUT_DIR,"data_df_cv.csv"))
# data_df.to_csv("./output/jigsawserver/data_df_cv.csv")
print(cv)
if run_db==True:
    wandb.log({"cv":data_df["pair_True"].mean()})
    run.finish()

fold0 dev start
fold1 dev start
fold2 dev start
fold3 dev start
fold4 dev start


IsADirectoryError: [Errno 21] Is a directory: './output/jigsaw_server_albert/'

In [26]:
data_df.to_csv(os.path.join(OUT_DIR,"data_df_cv.csv"))
# data_df.to_csv("./output/jigsawserver/data_df_cv.csv")
print(cv)
if run_db==True:
    wandb.log({"cv":data_df["pair_True"].mean()})
    run.finish()

0.6463730569948186


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv,▁

0,1
cv,0.64637


In [27]:
torch.cuda.empty_cache()
gc.collect()
ipypath=os.path.join(OUT_DIR,".ipynb_checkpoints")
if os.path.exists(ipypath):
    os.removedirs(ipypath)

In [None]:
# jc_df=pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

# min_len = (jc_df['toxic'] == 1).sum()
# df_y0_undersample = jc_df[jc_df['toxic'] == 0].sample(n=min_len, random_state=201)
# comments_fold = pd.concat([jc_df[jc_df['toxic'] == 1], df_y0_undersample])

# comments_fold.rename(columns={"comment_text":"text"},inplace=True)
# comments_fold["toxic_value"]=0

In [None]:
# for fold in range(CONFIG["fold_num"]):
#     print(f"fold{fold} dev start")
#     test_loader=make_dataloader_cv(comments_fold,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
#     model=JigsawModel(MODEL_DIR)
#     model.to(DEVICE)
#     path=MODEL_PATHS[fold]

#     model.load_state_dict(torch.load(path))
#     preds = evaluate_cv(model, test_loader)
#     comments_fold["toxic_value"]=comments_fold["toxic_value"]+preds
#     del model,test_loader

In [None]:
# toxicSeperateValue=comments_fold["toxic_value"].min()+(comments_fold["toxic_value"].max()-comments_fold["toxic_value"].min())/2
# comments_fold["toxic_predict"]=comments_fold.apply(lambda row : 1 if row["toxic_value"]>=toxicSeperateValue else 0,axis=1)
# comments_fold["predict_acc"]=comments_fold.apply(lambda row : True if row["toxic_predict"]==row["toxic"] else False,axis=1)
# cv=comments_fold["predict_acc"].mean()
# print("cv in first competition data:",cv)
# if run_db==True:
#     wandb.log({"cv in first competition ":cv})
#     run.finish()