In [1]:
import os
import numpy as np
import pandas as pd
import random
import time
import string
from collections import defaultdict, deque

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging
from transformers import BertConfig, BertTokenizer, BertModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader
from torch.cuda.amp import autocast as autocast
from torch.cuda.amp import  GradScaler

from torch.optim import lr_scheduler
from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
logging.set_verbosity_error()

In [2]:
!pip install torchcontrib
from torchcontrib.optim import SWA


Collecting torchcontrib
  Downloading torchcontrib-0.0.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hBuilding wheels for collected packages: torchcontrib
  Building wheel for torchcontrib (setup.py) ... [?25l- \ done
[?25h  Created wheel for torchcontrib: filename=torchcontrib-0.0.2-py3-none-any.whl size=7532 sha256=788c518b664462d1a83550a3567edd5168a3b6ccd6a79cda17ce677f7b6fc46e
  Stored in directory: /root/.cache/pip/wheels/91/58/d0/f03811c3e34e1f14031294b5f30d8693689972af874d1225b8
Successfully built torchcontrib
Installing collected packages: torchcontrib
Successfully installed torchcontrib-0.0.2


In [3]:
# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

#产生一个哈希值
# def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
#     return ''.join(random.SystemRandom().choice(chars) for _ in range(size))

# HASH_NAME = id_generator(size=12)
HASH_NAME="2021_12_31 aug2"
print(HASH_NAME)

2021_12_31 aug2


# config

In [4]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CONFIG={
    "TRAIN_BATCH_SIZE":32,
    "MAX_LENGTH":128,
    "CONFIG["DEV_BATCH_SIZE"]": 64,
    "LR":2e-5,
    "EPS":1e-8,
    "weight_decay":1e-6,
    
    "scheduler": 'CosineAnnealingLRf',
    "min_lr": 1e-6,
    "T_max": 500,
    "T_0":500,
    
    "margin":0.5,
    "fold_num":5,
    "seed":2021,
    "num_class":1,
    "EPOCHS":3,
}



EPOCHS=3


CONFIG['group'] = f'{HASH_NAME}-Baseline'

input_dir="../input/jigsaw-toxic-severity-rating"
MODEL_DIR="../input/roberta-transformers-pytorch/roberta-base"

In [5]:
# comment_to_csv=pd.read_csv(os.path.join(input_dir,"comments_to_score.csv"))
data_df=pd.read_csv(os.path.join(input_dir,"validation_data.csv"))

In [6]:
data_aug=False
translate_aug=True
run_db=False
swa_use=False
DATASET_TEST=False
FP16=False

# translate_text=["text_fr","text_de","text_es"]
translate_text=["text_fr","text_de","text_es"]

# W&B

In [7]:
import wandb

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# 数据处理

数据增强

In [8]:
from sklearn.preprocessing import LabelEncoder
def generate_comments(data):
    more_toxic_text=data["more_toxic"].values
    less_toxic_text=data["less_toxic"].values    
    comments=np.concatenate((more_toxic_text,less_toxic_text))
    comments=np.unique(comments)
    comments=pd.DataFrame({"text":comments})
    text_encoder=LabelEncoder()
    text_encoder.fit(comments)
    comments["encode_text"]=text_encoder.transform(comments["text"])
    comments["toxic_value"]=0
    comments["access_time"]=0
    data["encode_less"]=text_encoder.transform(data["less_toxic"])
    data["encode_more"]=text_encoder.transform(data["more_toxic"])
    
    return data,comments


In [9]:
def bsearch(start,more2less_dict):
    queue = deque([start])
    visit_list=[]
    while len(queue)!=0:
        visit_id=queue.popleft()
        if visit_id in visit_list:
            continue
        visit_list.append(visit_id)
        queue+=deque(more2less_dict[visit_id])
    visit_list.remove(start)
    return [ x for x in visit_list if x not in more2less_dict[start] ]

def search_lessText(more2less_dict):
    aug_dict= defaultdict(list)
    for start in list(more2less_dict.keys()):
        
        aug_list=bsearch(start,more2less_dict)
        aug_dict[start]=aug_list
    return aug_dict

def data_aug1(data_df,comments):
    data_df["label_min"]=data_df.apply(lambda row:row["encode_less"] 
                                   if row["encode_less"]<row["encode_more"] else row["encode_more"],axis=1)
    data_df["label_max"]=data_df.apply(lambda row:row["encode_more"] 
                                       if row["encode_less"]<row["encode_more"] else row["encode_less"],axis=1)

    data_df["win_min"]=data_df.apply(lambda row:1 if row["encode_more"]<row["encode_less"] else 0 ,axis=1)
    data_df["win_max"]=data_df.apply(lambda row:0 if row["encode_more"]<row["encode_less"] else 1 ,axis=1)

    data_df_agg=data_df.groupby(["label_min","label_max"]).agg({"win_min":"sum","win_max":"sum"}).reset_index()
    data_df_agg["encode_less"]=data_df_agg.apply(lambda row:row["label_min"] 
                                                 if row["win_min"]<row["win_max"] else row["label_max"],axis=1)
    data_df_agg["encode_more"]=data_df_agg.apply(lambda row:row["label_min"] 
                                                 if row["win_min"]>row["win_max"] else row["label_max"],axis=1)
    
    more2less_dict= defaultdict(list)
    data_df_agg.apply(lambda row:more2less_dict[row["encode_more"]].append(row["encode_less"]),axis=1)
    
    aug_dict=search_lessText(more2less_dict)
    aug_dict={key:value for key,value in aug_dict.items() if len(value)!=0}
    aug_df=pd.DataFrame(columns=(tuple(data_df.columns)))
    
    id2text_dict=comments.to_dict()["text"]
    
    for key,value in aug_dict.items():
        encode_more=key
        encode_less_list=value

        more_toxic=id2text_dict[encode_more]
        for encode_less in encode_less_list:
            less_toxic=id2text_dict[encode_less]
            row=pd.DataFrame({"worker":[999],"less_toxic":[less_toxic],"more_toxic":[more_toxic],"encode_less":[encode_less],
                                       "encode_more":[encode_more]})
            aug_df=aug_df.append(row,ignore_index=True)
    work_list=np.array([999]*len(aug_df),dtype=np.int64)
    aug_df["worker"]=work_list
    return aug_df

In [10]:
data_df,comments=generate_comments(data_df)
if translate_aug==True:
    comment_translation=pd.read_csv("../input/translate-toxic/comment_translation.csv")
    comment_translation=comment_translation.merge(comments,on="text",how="left")
if data_aug==True:
    aug_df=data_aug1(data_df,comments)
    data_df=pd.concat([data_df,aug_df],axis=0)
    data_df=data_df.reset_index(drop=True)

In [11]:
if DATASET_TEST==True:
    data_df=data_df[0:400]

交叉

In [12]:
skf=StratifiedKFold(n_splits=CONFIG["fold_num"],shuffle=True,random_state=CONFIG["seed"])
for fold,(_,val_) in enumerate(skf.split(X=data_df,y=data_df.worker)):
    data_df.loc[val_,"kfold"]=int(fold)
    
data_df["kfold"]=data_df["kfold"].astype(int)
data_df.head()

Unnamed: 0,worker,less_toxic,more_toxic,encode_less,encode_more,kfold
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,2405,12151,4
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,7215,653,0
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",2632,7222,0
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,7973,12968,2
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",3524,3266,0


In [13]:
class DatasetRetriever(Dataset):
    def __init__(self,data,tokenizer,max_len=CONFIG["MAX_LENGTH"]):
        self.data=data
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.more_toxic=data["more_toxic"].values
        self.less_toxic=data["less_toxic"].values
        
    def __len__(self):
        return len(self.data)
    def __getitem__(self, item):
        more_toxic=self.more_toxic[item]
        less_toxic=self.less_toxic[item]

        features1=self.convert_examples_to_features(more_toxic)
        features2=self.convert_examples_to_features(less_toxic)
        target=1
        return {"more_toxic":{key:torch.tensor(value,dtype=torch.long) for key,value in features1.items()},
                "less_toxic":{key:torch.tensor(value,dtype=torch.long) for key,value in features2.items()},
                "target":torch.tensor(target,dtype=torch.long)}
    def convert_examples_to_features(self, example):
        encoded = self.tokenizer.encode_plus(
            example,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=False,
            )
        return encoded
def make_dataloader(data,batch_size,model_dir=MODEL_DIR,max_len=CONFIG["MAX_LENGTH"]):
    tokenizer=AutoTokenizer.from_pretrained(model_dir)
    dataset=DatasetRetriever(data,tokenizer,max_len)
    sampler=RandomSampler(dataset)
    
    dataloader=DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=sampler
                         )
    return dataloader

In [14]:
def prepare_loaders(df,fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    if translate_aug==True:
        df_train_encode=df_train.drop(["less_toxic","more_toxic"],axis=1)
        for language_text in translate_text:
            temp_train=df_train_encode
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_less",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"less_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            
            temp_train=temp_train.merge(comment_translation[["encode_text",language_text]],left_on="encode_more",right_on="encode_text",how="left")
            temp_train=temp_train.rename(columns={language_text:"more_toxic"})
            temp_train.drop(["encode_text"],axis=1,inplace=True)
            df_train=pd.concat([df_train,temp_train])
    train_loader=make_dataloader(df_train,CONFIG["TRAIN_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
    
    valid_loader=make_dataloader(df_valid,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
    
    return train_loader, valid_loader

In [15]:
class JigsawModel(nn.Module):
    def __init__(self,model_name):
        super(JigsawModel,self).__init__()
        self.model=AutoModel.from_pretrained(model_name)
        self.drop=nn.Dropout(p=0.2)
        self.config=AutoConfig.from_pretrained(model_name)
        self.linear=nn.Linear(self.config.to_dict()["hidden_size"],CONFIG["num_class"])
    
    def forward(self,input_ids,attention_mask):
        out=self.model(input_ids=input_ids,attention_mask=attention_mask,output_hidden_states=False)
        out=self.drop(out[1])
        
        outputs=self.linear(out)
        
        return outputs

In [16]:
def criterion(outputs1, outputs2, targets):
    return nn.MarginRankingLoss(margin=CONFIG["margin"])(outputs1, outputs2, targets)

In [17]:
def get_parameters(model,model_init_lr,multiplier, classifier_lr):
    #权重分层，越靠近下游学习率越高
    parameters=[]
    lr=model_init_lr
    # 迭代器包含 层名字和参数 parameters()函数只包含参数
    #定义的层字典，参数的key必须叫params，否则在optimizer 父类中冲突
    for layer in range(model.config.to_dict()["num_hidden_layers"]-1,-1,-1):
        layer_parameters={
            "params":[p for n,p in model.named_parameters() if f"encoder.layer.{layer}." in n],
            "lr":lr
        }
        lr*=multiplier
        parameters.append(layer_parameters)
    
    
    classify_parameters={
        #自己定义了什么分类层在此更改名字
        "params":[p for n,p in model.named_parameters() if "classify"  in n],
        "lr":classifier_lr
    }
        
    parameters.append(classify_parameters)
    return parameters

In [18]:
def fetch_scheduler(optimizer):
    if CONFIG['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'], 
                                                   eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == 'CosineAnnealingWarmRestarts':
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CONFIG['T_0'], 
                                                             eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == None:
        return None
        
    return scheduler

In [19]:
def evaluate(model,dev_dataloader):
    model.eval()
    dev_loss=0
    for index,batch in enumerate(dev_dataloader):
        
        more_toxic_inputs=batch["more_toxic"]
        less_toxic_inputs=batch["less_toxic"]
        target=batch["target"].to(DEVICE)

        more_toxic_inputs={key: value.to(DEVICE) for key,value in more_toxic_inputs.items()}
        less_toxic_inputs={key: value.to(DEVICE) for key,value in less_toxic_inputs.items()}
        with torch.no_grad():
            out_more=model(**more_toxic_inputs)
            out_less=model(**less_toxic_inputs)

            loss=criterion(out_more, out_less, target)
        
            dev_loss+=loss.item()
        
    return dev_loss/len(dev_dataloader)
def train(model,train_dataloader,dev_dataloader):
    
    if run_db==True:
        wandb.watch(model,log_freq=100)
    optimizer=AdamW(get_parameters(model, model_init_lr=3e-05, multiplier=0.975, classifier_lr=2e-5),
                    lr = CONFIG['LR'], eps = CONFIG['EPS'],weight_decay=CONFIG['weight_decay'])
    if swa_use==True:
        # swa_start :经过swa_start 个step后 使用swa_lr
        optimizer=SWA(optimizer,swa_start=4, swa_freq=1, swa_lr=1e-4)
    """
    get_linear_schedule_with_warmup:学习率先从0开始warm_up到设定学习率，再逐渐减到0
    num_warmup_steps：完成预热的步数
    num_training_steps：训练批次*epochs 训练的step数
    """
    scheduler = fetch_scheduler(optimizer)
    if scheduler==None:
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                                num_training_steps=len(train_dataloader) * CONFIG["EPOCHS"])
    best_val_loss=100
    best_model_param=None
    #用于fp16
    scaler = GradScaler()
    
    start=time.time()
    for epoch in range(CONFIG["EPOCHS"]):
        print(f"\n Epoch{epoch} train start \n")
        train_loss=0
        model.train()
        #total 更新进度 
        bar=tqdm(enumerate(train_dataloader),total=len(train_dataloader))
        for index,batch in bar:
            model.zero_grad()
            more_toxic_inputs=batch["more_toxic"]
            less_toxic_inputs=batch["less_toxic"]
            target=batch["target"].to(DEVICE)

            more_toxic_inputs={key: value.to(DEVICE) for key,value in more_toxic_inputs.items()}
            less_toxic_inputs={key: value.to(DEVICE) for key,value in less_toxic_inputs.items()}
            if FP16==True:
                with autocast():
                    out_more=model(**more_toxic_inputs)
                    out_less=model(**less_toxic_inputs)
                    loss=criterion(out_more, out_less, target)
                    
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
            else:
                out_more=model(**more_toxic_inputs)
                out_less=model(**less_toxic_inputs)
                loss=criterion(out_more, out_less, target)
                loss.backward()
                optimizer.step()

            scheduler.step()
            train_loss+=loss.item()
            
        avg_train_loss=train_loss/len(train_dataloader)
        if swa_use==True:
            optimizer.update_swa()
            optimizer.swap_swa_sgd()
            val_loss=evaluate(model,dev_dataloader)
            optimizer.swap_swa_sgd()
        else:
            val_loss=evaluate(model,dev_dataloader)
        
        print(f"EPOCH:{epoch+1}/{EPOCHS},train_loss:{avg_train_loss},val_loss:{val_loss}")
        if run_db==True:
            wandb.log({"Train LOSS":avg_train_loss})
            wandb.log({"Valid LOSS":val_loss})
        
        if val_loss<best_val_loss:
            best_val_loss=val_loss
            if swa_use==True:
                optimizer.swap_swa_sgd()
            best_model_param=model.state_dict()
            print(f"best_model saved ,val_loss:{best_val_loss}")

    end=time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
    time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    if run_db==True:
        run.summary["time (hour)"]=time_elapsed /3600
    return best_val_loss,best_model_param

In [20]:
for fold in range(CONFIG["fold_num"]):
    print(f"\n Fold{fold} train start")
    if run_db==True:
        run = wandb.init(project='Jigsaw', 
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
                     name=f'{HASH_NAME}-fold-{fold}',
                     anonymous='must')
    train_loader,dev_loader=prepare_loaders(data_df,fold)    
    model=JigsawModel(MODEL_DIR)
    model.to(DEVICE)
    dev_loss,best_model_param=train(model,train_loader,dev_loader)
    model_path=f"bestmodel-{fold}.pth"
    torch.save(best_model_param,model_path)
    if run_db==True:
        run.finish()
    
    del model,train_loader,dev_loader        
    gc.collect()


 Fold0 train start

 Epoch0 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:1/2,train_loss:0.34384405777744187,val_loss:0.3429202139377594
best_model saved ,val_loss:0.3429202139377594

 Epoch1 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:2/2,train_loss:0.3036093347019549,val_loss:0.37296849473526605
Training complete in 1h 20m 53s

 Fold1 train start

 Epoch0 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:1/2,train_loss:0.343128315624368,val_loss:0.34654534568912104
best_model saved ,val_loss:0.34654534568912104

 Epoch1 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:2/2,train_loss:0.2986937397413577,val_loss:0.3488291908251612
Training complete in 1h 20m 50s

 Fold2 train start

 Epoch0 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:1/2,train_loss:0.34388436825914426,val_loss:0.3467181340644234
best_model saved ,val_loss:0.3467181340644234

 Epoch1 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:2/2,train_loss:0.30182605935059603,val_loss:0.3559480044402574
Training complete in 1h 20m 46s

 Fold3 train start

 Epoch0 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:1/2,train_loss:0.3436528055256524,val_loss:0.3449101040237828
best_model saved ,val_loss:0.3449101040237828

 Epoch1 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:2/2,train_loss:0.30251074881004913,val_loss:0.3471742802544644
Training complete in 1h 20m 53s

 Fold4 train start

 Epoch0 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:1/2,train_loss:0.3448894509234645,val_loss:0.3435552091974961
best_model saved ,val_loss:0.3435552091974961

 Epoch1 train start 



  0%|          | 0/3011 [00:00<?, ?it/s]

EPOCH:2/2,train_loss:0.30416359837042933,val_loss:0.3583311399346904
Training complete in 1h 21m 23s


# cv

In [21]:
class DatasetRetriever_cv(Dataset):
    def __init__(self,data,tokenizer,max_len=CONFIG["MAX_LENGTH"]):
        self.data=data
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.text=self.data["text"].values
        
    def __len__(self):
        return len(self.data)
    def __getitem__(self, item):
        text=self.text[item]

        features1=self.convert_examples_to_features(text)
        return {"text":{key:torch.tensor(value,dtype=torch.long) for key,value in features1.items()}}
    def convert_examples_to_features(self, example):
        encoded = self.tokenizer.encode_plus(
            example,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=False,
            )
        return encoded
def make_dataloader_cv(data,batch_size,model_dir=MODEL_DIR,max_len=CONFIG["MAX_LENGTH"]):
    tokenizer=AutoTokenizer.from_pretrained(model_dir)
    dataset=DatasetRetriever_cv(data,tokenizer,max_len)
    sampler=SequentialSampler(dataset)
    
    dataloader=DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=sampler
                         )
    return dataloader
def evaluate_cv(model,test_dataloader):
    model.eval()
    Preds=[]
    for index,batch in enumerate(test_dataloader):
        
        text_inputs=batch["text"]
        
        text_inputs={key: value.to(DEVICE) for key,value in text_inputs.items()}
        with torch.no_grad():
            out_more=model(**text_inputs)
            Preds.append(out_more.view(-1).cpu().detach().numpy())
    
    Preds = np.concatenate(Preds) 
    gc.collect()
    
    return Preds

def inference(model_paths,data_df,comments):
    
    for fold in range(CONFIG["fold_num"]):
        print(f"fold{fold} dev start")

        data_fold=data_df[data_df.kfold == fold]
#         data_fold.drop(["label_min","label_max","win_min","win_max"],axis=1,inplace=True)
    
        comments_fold_id=np.concatenate((data_fold["encode_more"].values,data_fold["encode_less"].values))
        comments_fold_id=np.unique(comments_fold_id)
        select_fold_list=comments.apply(lambda row : True if row["encode_text"] in comments_fold_id else False ,axis=1)
        comments_fold=comments[select_fold_list]
        comments_fold["access_time"]=comments_fold["access_time"]+1
        
        
        test_loader=make_dataloader_cv(comments_fold,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
        model=JigsawModel(MODEL_DIR)
        model.to(DEVICE)
        path=model_paths[fold]
        
        model.load_state_dict(torch.load(path))
        preds = evaluate_cv(model, test_loader)
        comments_fold["toxic_value"]=comments_fold["toxic_value"]+preds
        
        data_df.loc[data_fold.index]=data_fold
        comments.loc[comments_fold.index]=comments_fold

        del model,test_loader        
    
    return data_df,comments

In [22]:
if run_db==True:
    run = wandb.init(project='Jigsaw', 
             config=CONFIG,
             job_type='cv',
             group=CONFIG['group'],
             tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
             name=f'{HASH_NAME}-fold-{fold}',
             anonymous='must')
MODEL_PATHS=[f"bestmodel-{num}.pth" for num in range(CONFIG["fold_num"])]
# MODEL_PATHS=[f"../input/baseline1-toxic-value/bestmodel-{num}.pth" for num in range(CONFIG["fold_num"])]
data_df,comments= inference(MODEL_PATHS, data_df,comments)

comments["toxic_value"]=comments["toxic_value"]/comments["access_time"]
comments.index=comments["encode_text"]
index_score_dict=comments.to_dict()["toxic_value"]
data_df["less_value"]=data_df["encode_less"].map(lambda x:index_score_dict[x])
data_df["more_value"]=data_df["encode_more"].map(lambda x:index_score_dict[x])
data_df["pair_True"]=data_df.apply(lambda row:True if row["more_value"]>row["less_value"] else False,axis=1)
cv=data_df["pair_True"].mean()
print(cv)
if run_db==True:
    wandb.log({"cv":data_fold["pair_True"].mean()})
    run.finish()

fold0 dev start
fold1 dev start
fold2 dev start
fold3 dev start
fold4 dev start
0.7633519330410522


In [23]:
jc_df=pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

min_len = (jc_df['toxic'] == 1).sum()
df_y0_undersample = jc_df[jc_df['toxic'] == 0].sample(n=min_len, random_state=201)
comments_fold = pd.concat([jc_df[jc_df['toxic'] == 1], df_y0_undersample])

comments_fold.rename(columns={"comment_text":"text"},inplace=True)
comments_fold["toxic_value"]=0

In [24]:
for fold in range(CONFIG["fold_num"]):
    print(f"fold{fold} dev start")
    test_loader=make_dataloader_cv(comments_fold,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
    model=JigsawModel(MODEL_DIR)
    model.to(DEVICE)
    path=MODEL_PATHS[fold]

    model.load_state_dict(torch.load(path))
    preds = evaluate_cv(model, test_loader)
    comments_fold["toxic_value"]=comments_fold["toxic_value"]+preds
    del model,test_loader

fold0 dev start
fold1 dev start
fold2 dev start
fold3 dev start
fold4 dev start


In [25]:
toxicSeperateValue=comments_fold["toxic_value"].min()+(comments_fold["toxic_value"].max()-comments_fold["toxic_value"].min())/2
comments_fold["toxic_predict"]=comments_fold.apply(lambda row : 1 if row["toxic_value"]>=toxicSeperateValue else 0,axis=1)
comments_fold["predict_acc"]=comments_fold.apply(lambda row : True if row["toxic_predict"]==row["toxic"] else False,axis=1)
cv=comments_fold["predict_acc"].mean()
print("cv in first competition data:",cv)

cv in first competition data: 0.8312410095462273


In [26]:
data_df.to_csv(f"data_df_aug1")