In [None]:
import os
import numpy as np
import pandas as pd
import random
import time
import string
from collections import defaultdict, deque

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging
from transformers import BertConfig, BertTokenizer, BertModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader
from torch.cuda.amp import autocast as autocast
from torch.cuda.amp import  GradScaler

from torch.optim import lr_scheduler
from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output
from torch.optim.swa_utils import AveragedModel, SWALR
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
logging.set_verbosity_error()

In [None]:
# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Generate a random hash value (disabled; HASH_NAME is assigned by hand in a later cell)
# def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
#     return ''.join(random.SystemRandom().choice(chars) for _ in range(size))

# HASH_NAME = id_generator(size=12)


# config

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Central hyper-parameter dictionary; it is also passed to wandb.init as the run config.
CONFIG={
    "TRAIN_BATCH_SIZE":32,  # batch size of the training DataLoader
    "MAX_LENGTH":128,  # tokenizer max_length (inputs are padded/truncated to it)
    "DEV_BATCH_SIZE": 64,  # batch size of the validation DataLoader
    "LR":1e-4,  # AdamW learning rate
    "EPS":1e-8,  # AdamW eps
    "weight_decay":1e-6,  # AdamW weight decay
    
    "scheduler": 'CosineAnnealingLR',  # branch selected in fetch_scheduler
    "min_lr": 1e-6,  # eta_min of the cosine schedulers
    "T_max": 500,  # T_max of CosineAnnealingLR
    "T_0":500,  # T_0 of CosineAnnealingWarmRestarts
    "margin":0.5,  # margin of the MarginRankingLoss
    "fold_num":5,  # number of GroupKFold splits / training folds
    "seed":2021,  # NOTE(review): only referenced by commented-out code in the visible cells
    "num_class":1,  # output width of the scoring head
    
    "EPOCHS":2,  # training epochs per fold
    "evaluate_step":None,  # None -> train() evaluates once per epoch
    "swa_start":2,  # SWA updates run while epoch >= swa_start-1 (only when swa_use is on)
    "model_init_lr":0.9e-4,  # get_parameters: learning rate of the last encoder layer
    "multiplier":0.9,  # get_parameters: decay factor applied per layer towards the input
    "classifier_lr":1e-4 ,  # get_parameters: learning rate of the "linear" head
    "swa_lr": 1e-5  # learning rate handed to SWALR
}


# Directory that holds validation_data.csv
input_dir="../input/jigsaw-toxic-severity-rating"


更换模型

In [None]:
# Names of the config entries (looked up in AutoConfig.to_dict()) that hold the
# encoder width and depth; they differ between architectures.
hidden_size="hidden_size"
num_hidden_layers="num_hidden_layers"
# Key names to use for XLNet instead
# hidden_size="d_model"
# num_hidden_layers="n_layer"
# Path of the pretrained checkpoint used for both tokenizer and encoder
MODEL_DIR="../input/roberta-transformers-pytorch/roberta-base"
# MODEL_DIR="../input/pretrained-albert-pytorch/albert-base-v1"
# MODEL_DIR="../input/transformers/xlnet-base-cased"

## 检查事项
* 提交之前：确认 `run_db` 是否打开，`HASH_NAME` 是否设置正确
* `DATASET_TEST` 是否关闭
* 如果更换模型：`MODEL_DIR` 是否正确，`model_struct` 是否正确
* GPU 是否需要打开

In [None]:
DATASET_TEST=False  # True -> keep only the first 400 rows of data_df (quick smoke run)
run_db=True  # True -> log the run to Weights & Biases
model_struct="OriginModel"  # key into func_dict selecting the model class

# Label used in the wandb group, tags and run names
HASH_NAME="复现starter "

swa_use=False  # stochastic weight averaging inside train()
data_aug=False  # add the pairs produced by data_aug1
translate_aug=False  # add translated copies of the training pairs
FP16=False  # mixed-precision training (autocast + GradScaler)

# Available model_struct values:
# OriginModel MeanPoolingModel LastLayerCLSModel MaxPoolingModel
# SecondToLastLayerCLSModel ConcatenateLastFourModel WeightedLayerPoolingModel
# AttentionPoolingModel
# Translation columns read from comment_translation when translate_aug is on
translate_text=["text_fr","text_de","text_es"]

CONFIG['group'] = f'{HASH_NAME}-Baseline'


# W&B

In [None]:
import wandb

# Log in with the API key stored in Kaggle secrets. When that is not possible
# (not on Kaggle, secret missing, login rejected) fall back to anonymous mode.
# `anony` records which of the two happened.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still stop the notebook instead of being swallowed here.
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

# 数据处理

In [None]:
# Load the pairwise annotations; later cells read its "less_toxic" and "more_toxic" columns.
data_df=pd.read_csv(os.path.join(input_dir,"validation_data.csv"))

数据增强

In [None]:
from sklearn.preprocessing import LabelEncoder
def generate_comments(data):
    """Build the comment vocabulary and integer-encode both sides of every pair.

    Args:
        data: DataFrame with string columns "more_toxic" and "less_toxic".
            It is modified in place: the integer columns "encode_less" and
            "encode_more" are added.

    Returns:
        (data, comments) where ``comments`` has one row per distinct comment
        text, sorted, with columns "text", "encode_text" (the integer code,
        equal to the row position), "toxic_value" and "access_time" (both 0).
    """
    pair_texts = np.concatenate((data["more_toxic"].values, data["less_toxic"].values))
    # np.unique returns the distinct texts sorted, so a text's position is its code.
    unique_texts = np.unique(pair_texts)
    # Direct text -> code lookup. The previous LabelEncoder was fitted on the
    # whole one-column DataFrame instead of a 1-D array and only worked through
    # sklearn's column-vector coercion; the codes produced here are the same.
    text_to_code = {text: code for code, text in enumerate(unique_texts)}

    comments = pd.DataFrame({"text": unique_texts})
    comments["encode_text"] = np.arange(len(unique_texts), dtype=np.int64)
    comments["toxic_value"] = 0
    comments["access_time"] = 0

    # Assign positionally (plain arrays), independent of data's index.
    data["encode_less"] = data["less_toxic"].map(text_to_code).to_numpy()
    data["encode_more"] = data["more_toxic"].map(text_to_code).to_numpy()

    return data, comments


In [None]:
def bsearch(start,more2less_dict):
    """Breadth-first search for the nodes reachable from ``start`` only indirectly.

    Args:
        start: node to start from.
        more2less_dict: mapping node -> list of directly connected nodes.
            It is only read: missing nodes are treated as having no
            successors, so plain dicts work and a defaultdict is not grown.

    Returns:
        Nodes reachable from ``start`` in breadth-first visiting order,
        excluding ``start`` itself and its direct successors.
    """
    queue = deque([start])
    seen = set()      # O(1) membership test
    order = []        # visiting order, needed for the returned list
    while queue:
        node = queue.popleft()
        if node in seen:
            continue
        seen.add(node)
        order.append(node)
        # .get() avoids inserting empty entries into a defaultdict
        queue.extend(more2less_dict.get(node, ()))
    direct = set(more2less_dict.get(start, ()))
    # order[0] is always `start`
    return [node for node in order[1:] if node not in direct]

def search_lessText(more2less_dict):
    """Run ``bsearch`` from every key of ``more2less_dict``.

    Returns a ``defaultdict(list)`` mapping each key to the list that
    ``bsearch`` returns for it.
    """
    # Snapshot the keys first: the search may touch the mapping while we iterate.
    start_nodes = tuple(more2less_dict.keys())
    indirect = defaultdict(list)
    indirect.update((node, bsearch(node, more2less_dict)) for node in start_nodes)
    return indirect

def data_aug1(data_df,comments):
    """Create extra (more_toxic, less_toxic) pairs by following chains of pairs.

    Votes for each unordered pair of comments are summed to decide which
    side is more toxic. The resulting directed graph is then searched
    (``search_lessText``) for comments that are reachable from a comment only
    indirectly, and each such pair becomes a new row.

    Args:
        data_df: pair frame with integer columns "encode_less"/"encode_more".
            Modified in place: the helper columns "label_min", "label_max",
            "win_min" and "win_max" are added.
        comments: frame whose index maps a code to its "text".
            # assumes the index equals the code used in encode_less/encode_more -- TODO confirm

    Returns:
        DataFrame with the columns of ``data_df``; only "worker" (999),
        "less_toxic", "more_toxic", "encode_less" and "encode_more" are filled.
    """
    less_codes = data_df["encode_less"].to_numpy()
    more_codes = data_df["encode_more"].to_numpy()
    # True where the comment with the smaller code was voted the more toxic one
    min_code_won = more_codes < less_codes

    data_df["label_min"] = np.minimum(less_codes, more_codes)
    data_df["label_max"] = np.maximum(less_codes, more_codes)
    data_df["win_min"] = min_code_won.astype(np.int64)
    data_df["win_max"] = (~min_code_won).astype(np.int64)

    votes = (data_df.groupby(["label_min","label_max"])
                    .agg({"win_min":"sum","win_max":"sum"})
                    .reset_index())
    # On a tie neither condition holds, so both sides resolve to label_max and
    # the pair degenerates to a self-edge that contributes no new pairs.
    less_side = np.where(votes["win_min"] < votes["win_max"], votes["label_min"], votes["label_max"])
    more_side = np.where(votes["win_min"] > votes["win_max"], votes["label_min"], votes["label_max"])

    more2less_dict = defaultdict(list)
    for more_code, less_code in zip(more_side.tolist(), less_side.tolist()):
        more2less_dict[more_code].append(less_code)

    aug_dict = search_lessText(more2less_dict)
    id2text_dict = comments["text"].to_dict()

    # Collect all rows first and build the frame once: DataFrame.append inside
    # a loop is quadratic and no longer exists in pandas 2.x.
    records = [
        {"worker": 999,
         "less_toxic": id2text_dict[encode_less],
         "more_toxic": id2text_dict[encode_more],
         "encode_less": encode_less,
         "encode_more": encode_more}
        for encode_more, encode_less_list in aug_dict.items()
        for encode_less in encode_less_list
    ]

    columns = list(data_df.columns)
    columns += [name for name in ("worker", "less_toxic", "more_toxic", "encode_less", "encode_more")
                if name not in columns]
    aug_df = pd.DataFrame(records, columns=columns)
    aug_df["worker"] = np.full(len(aug_df), 999, dtype=np.int64)
    return aug_df

In [None]:
# Integer-encode every comment and build the vocabulary frame.
data_df,comments=generate_comments(data_df)
if translate_aug==True:
    # Attach the codes to the translated texts so they can be joined by code later.
    comment_translation=pd.read_csv("../input/translate-toxic/comment_translation.csv")
    comment_translation=comment_translation.merge(comments,on="text",how="left")
if data_aug==True:
    # Append the derived pairs and renumber the rows.
    aug_df=data_aug1(data_df,comments)
    data_df=pd.concat([data_df,aug_df],axis=0)
    data_df=data_df.reset_index(drop=True)

In [None]:
# Smoke-test mode: keep only the first 400 pairs.
if DATASET_TEST==True:
    data_df=data_df[0:400]

交叉

In [None]:
from sklearn.model_selection import GroupKFold
class UnionFind():
    """Disjoint-set forest over the integers ``0 .. n-1``.

    ``parents[i]`` is the parent of ``i``; for a root it is the negated size
    of its set (so every entry starts at -1).
    """

    def __init__(self, n):
        self.n = n
        self.parents = [-1 for _ in range(n)]

    def find(self, x):
        """Return the root of ``x``'s set and point every node on the path at it."""
        # First pass: walk up to the root.
        root = x
        while self.parents[root] >= 0:
            root = self.parents[root]
        # Second pass: path compression.
        node = x
        while node != root:
            above = self.parents[node]
            self.parents[node] = root
            node = above
        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``; the larger set's root survives."""
        root_a = self.find(x)
        root_b = self.find(y)
        if root_a == root_b:
            return
        # Sizes are stored negated, so the "greater" value is the smaller set.
        if self.parents[root_a] > self.parents[root_b]:
            root_a, root_b = root_b, root_a
        self.parents[root_a] += self.parents[root_b]
        self.parents[root_b] = root_a


def get_group_unionfind(train: pd.DataFrame):
    """Label every pair with the id of its connected component of comments.

    Two comments are connected when they appear together in a pair; the
    component id is stored in a new "group" column so that a grouped split
    keeps every comment on one side of the split.

    Args:
        train: frame with string columns "less_toxic" and "more_toxic".
            It is not modified (the previous version left helper columns
            behind in the caller's frame).

    Returns:
        A copy of ``train`` with an added "group" column.
    """
    unique_text = np.unique(
        np.hstack([train['less_toxic'].unique(), train['more_toxic'].unique()])
    ).tolist()
    text2num = {text: i for i, text in enumerate(unique_text)}

    uf = UnionFind(len(unique_text))
    less_ids = train['less_toxic'].map(text2num).tolist()
    more_ids = train['more_toxic'].map(text2num).tolist()
    for less_id, more_id in zip(less_ids, more_ids):
        uf.union(less_id, more_id)

    text2group = {text: uf.find(i) for i, text in enumerate(unique_text)}
    # Work on a copy so the argument keeps its original columns.
    train = train.copy()
    train['group'] = train['less_toxic'].map(text2group)
    return train

In [None]:
# Add the "group" column (connected component of comments) used to split without leakage.
data_df = get_group_unionfind(data_df)
group_kfold = GroupKFold(n_splits=CONFIG["fold_num"])
# Record, for every row, the fold in which it belongs to the validation part.
# NOTE(review): val_idx is positional but used with .loc -- assumes data_df has a 0..n-1 index.
for fold, (trn_idx, val_idx) in enumerate(group_kfold.split(data_df, data_df, data_df['group'])): 
    data_df.loc[val_idx , "kfold"] = fold

data_df["kfold"] = data_df["kfold"].astype(int)
# Persist the split so it can be reused.
data_df.to_csv('train_noleak.csv', index=False)
data_df.head()

In [None]:
# skf=StratifiedKFold(n_splits=CONFIG["fold_num"],shuffle=True,random_state=CONFIG["seed"])
# for fold,(_,val_) in enumerate(skf.split(X=data_df,y=data_df.worker)):
#     data_df.loc[val_,"kfold"]=int(fold)
    
# data_df["kfold"]=data_df["kfold"].astype(int)
# data_df.head()

In [None]:
class DatasetRetriever(Dataset):
    """Dataset of (more_toxic, less_toxic) comment pairs.

    Each item is a dict with the tokenized "more_toxic" and "less_toxic"
    texts (``input_ids`` and ``attention_mask`` as long tensors) and a
    constant "target" of 1.
    """

    def __init__(self,data,tokenizer,max_len=CONFIG["MAX_LENGTH"]):
        self.data=data
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.more_toxic=data["more_toxic"].values
        self.less_toxic=data["less_toxic"].values

    def __len__(self):
        return len(self.data)

    def _encode_as_tensors(self, text):
        """Tokenize ``text`` and keep only the two model inputs, as long tensors."""
        encoded = self.convert_examples_to_features(text)
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }

    def __getitem__(self, item):
        return {
            "more_toxic": self._encode_as_tensors(self.more_toxic[item]),
            "less_toxic": self._encode_as_tensors(self.less_toxic[item]),
            # the first element of a pair is always the more toxic one
            "target": torch.tensor(1, dtype=torch.long),
        }

    def convert_examples_to_features(self, example):
        """Tokenize one text, padded/truncated to ``self.max_len`` tokens."""
        tokenizer_options = dict(
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=False,
        )
        return self.tokenizer.encode_plus(example, **tokenizer_options)
def make_dataloader(data,batch_size,model_dir=MODEL_DIR,max_len=CONFIG["MAX_LENGTH"]):
    """Wrap ``data`` in a randomly sampled DataLoader of comment pairs.

    The tokenizer is loaded from ``model_dir``; texts are padded/truncated
    to ``max_len`` tokens.
    """
    pair_dataset = DatasetRetriever(data, AutoTokenizer.from_pretrained(model_dir), max_len)
    return DataLoader(
        pair_dataset,
        batch_size=batch_size,
        sampler=RandomSampler(pair_dataset),
    )

In [None]:
def prepare_loaders(df,fold):
    """Build the train and validation loaders for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation set, all others
    the training set. With ``translate_aug`` on, one translated copy of the
    training pairs per column in ``translate_text`` is appended to the
    training set.
    """
    in_valid = df.kfold == fold
    df_train = df[~in_valid].reset_index(drop=True)
    df_valid = df[in_valid].reset_index(drop=True)

    if translate_aug==True:
        # Pairs without their texts; the texts are re-attached per language by code.
        encoded_pairs = df_train.drop(["less_toxic","more_toxic"],axis=1)
        for language_text in translate_text:
            lookup = comment_translation[["encode_text",language_text]]
            translated = encoded_pairs
            for code_column, text_column in (("encode_less","less_toxic"),
                                             ("encode_more","more_toxic")):
                translated = (
                    translated
                    .merge(lookup,left_on=code_column,right_on="encode_text",how="left")
                    .rename(columns={language_text:text_column})
                    .drop(["encode_text"],axis=1)
                )
            df_train = pd.concat([df_train,translated])

    train_loader = make_dataloader(df_train,CONFIG["TRAIN_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
    valid_loader = make_dataloader(df_valid,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
    return train_loader, valid_loader

## 模型输出结构

In [None]:
class OriginModel(nn.Module):
    """Encoder with a dense+tanh layer on the first token and a linear scoring head.

    Dropout inside the encoder is disabled through the config, and the
    dropout before the head has probability 0.
    """

    def __init__(self,model_name):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({"hidden_dropout_prob": 0.0,
                            "attention_probs_dropout_prob": 0.0})
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.drop = nn.Dropout(p=0)

        width = self.config.to_dict()[hidden_size]
        # The head is created before the dense layer, as before, so the order
        # in which random initial weights are drawn stays the same.
        self.linear = nn.Linear(width, CONFIG["num_class"])
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self,input_ids,attention_mask):
        """Return the head output for each sequence in the batch."""
        encoder_out = self.model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 output_hidden_states=False)
        first_token = encoder_out[0][:, 0]
        pooled = self.activation(self.dense(first_token))
        return self.linear(self.drop(pooled))

In [None]:
       
# First hidden-state index included in the weighted layer average
LAYER_START = 4   # for WeightedLayerPoolingModel

# Output width of the attention pooling projection
HIDDEN_DIM_FC = 128    # for AttentionPooling

class LastLayerCLSModel(nn.Module):
    """Scores a text from the first-token embedding of the last encoder layer."""

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.linear = nn.Linear(self.config.to_dict()[hidden_size], CONFIG["num_class"])
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-sequence predictions, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        cls_embeddings = outputs[0][:, 0]
        logits = self.linear(cls_embeddings)

        # Remove only the trailing class axis. The former second squeeze also
        # removed the batch axis for a batch of one, giving a 0-dim tensor
        # that the pairwise ranking loss cannot take.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds
        
        
class MeanPoolingModel(nn.Module):
    """Scores a text from the attention-mask-weighted mean of the last layer's tokens."""

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.linear = nn.Linear(self.config.to_dict()[hidden_size], 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-sequence predictions, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        last_hidden_state = outputs[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # clamp guards against division by zero for a fully masked sequence
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        logits = self.linear(mean_embeddings)

        # Remove only the trailing output axis; a second squeeze would also
        # drop the batch axis for a batch of one.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds
        
        
class MaxPoolingModel(nn.Module):
    """Scores a text from the element-wise max over the last layer's unmasked tokens.

    Dropout inside the encoder is disabled through the config.
    """

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({"hidden_dropout_prob": 0.0,"attention_probs_dropout_prob":0.0
            })
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.linear = nn.Linear(self.config.to_dict()[hidden_size], 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-sequence predictions, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        last_hidden_state = outputs[0]
        padded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()) == 0
        # Out-of-place fill with the dtype's own minimum: the encoder output is
        # left untouched and the fill value is representable in half precision
        # (the literal -1e9 is not).
        fill_value = torch.finfo(last_hidden_state.dtype).min
        masked_hidden = last_hidden_state.masked_fill(padded, fill_value)
        max_embeddings, _ = torch.max(masked_hidden, 1)
        logits = self.linear(max_embeddings)

        # Remove only the trailing output axis; a second squeeze would also
        # drop the batch axis for a batch of one.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds
        
        
class SecondToLastLayerCLSModel(nn.Module):
    """Scores a text from the first-token embedding of the second-to-last hidden state.

    The encoder is configured to return all hidden states and to use no dropout.
    """

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({'output_hidden_states':True})
        self.config.update({"hidden_dropout_prob": 0.0,"attention_probs_dropout_prob":0.0
            })
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.linear = nn.Linear(self.config.to_dict()[hidden_size], 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-sequence predictions, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        # assumes outputs[2] is the tuple of hidden states (embedding output
        # followed by one entry per layer) -- TODO confirm for the chosen architecture
        all_hidden_states = outputs[2]
        # Index -2 is the second-to-last entry. The former index
        # `num_hidden_layers-2` selected the third-to-last one, because the
        # tuple also holds the embedding output.
        cls_embeddings = all_hidden_states[-2][:, 0]
        logits = self.linear(cls_embeddings)

        # Remove only the trailing output axis; a second squeeze would also
        # drop the batch axis for a batch of one.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds
        
class ConcatenateLastFourModel(nn.Module):
    """Scores a text from the concatenated first-token embeddings of the last four hidden states."""

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({'output_hidden_states':True})
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.linear = nn.Linear(4*self.config.to_dict()[hidden_size], 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-sequence predictions, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        # assumes outputs[2] is the tuple of per-layer hidden states -- TODO confirm
        all_hidden_states = outputs[2]
        # Take the first token of each of the last four states (last first)
        # and join them along the feature axis.
        concatenate_pooling = torch.cat(
            [all_hidden_states[-offset][:, 0] for offset in (1, 2, 3, 4)], -1
        )
        logits = self.linear(concatenate_pooling)

        # Remove only the trailing output axis; a second squeeze would also
        # drop the batch axis for a batch of one.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds
        
        
class WeightedLayerPooling(nn.Module):
    """Weighted average of the stacked hidden states from ``layer_start`` onwards.

    When no ``layer_weights`` are supplied, one learnable weight per averaged
    layer is created, initialised to 1.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_averaged = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.ones(n_averaged, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Average ``all_hidden_states[layer_start:]`` over its first axis with the layer weights."""
        selected_layers = all_hidden_states[self.layer_start:]
        # Trailing singleton axes let the per-layer weights broadcast over
        # the three remaining axes of the selected states.
        per_layer_weight = self.layer_weights[:, None, None, None]
        weighted_sum = (per_layer_weight * selected_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()
    
class WeightedLayerPoolingModel(nn.Module):
    """Scores a text from a learned weighted average of the hidden states' first token.

    Hidden states from index ``LAYER_START`` on are averaged by ``self.pooling``,
    then layer-normalised and passed to a linear head.
    """

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({'output_hidden_states':True})
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.pooling = WeightedLayerPooling(self.config.to_dict()[num_hidden_layers],
                                      layer_start=LAYER_START,
                                      layer_weights=None)
        self.layer_norm = nn.LayerNorm(self.config.to_dict()[hidden_size])
        self.linear = nn.Linear(self.config.to_dict()[hidden_size], 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-sequence predictions, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        # assumes outputs[2] is the tuple of per-layer hidden states -- TODO confirm
        all_hidden_states = torch.stack(outputs[2])

        weighted_pooling_embeddings = self.pooling(all_hidden_states)[:, 0]
        norm_embeddings = self.layer_norm(weighted_pooling_embeddings)
        logits = self.linear(norm_embeddings)

        # Remove only the trailing output axis; a second squeeze would also
        # drop the batch axis for a batch of one.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds
        
class AttentionPooling(nn.Module):
    """Attention over the per-layer first-token embeddings.

    A learned query ``q`` (1 x hidden_size) scores each layer's first-token
    embedding; the softmax-weighted sum is projected by ``w_h``
    (hidden_size x hiddendim_fc). Dropout (p=0.1) is applied to the result.

    ``q`` and ``w_h`` are registered parameters, so they are trained, moved by
    ``.to(device)`` and stored in ``state_dict()``. Checkpoints saved before
    this change do not contain them.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_fc):
        super().__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_fc = hiddendim_fc
        self.dropout = nn.Dropout(0.1)

        # Convert to float32 *before* wrapping: calling .float()/.to() on an
        # nn.Parameter returns a plain tensor, which the module would not register.
        q_t = np.random.normal(loc=0.0, scale=0.1, size=(1, self.hidden_size))
        self.q = nn.Parameter(torch.from_numpy(q_t).float())
        w_ht = np.random.normal(loc=0.0, scale=0.1, size=(self.hidden_size, self.hiddendim_fc))
        self.w_h = nn.Parameter(torch.from_numpy(w_ht).float())

    def forward(self, all_hidden_states):
        """Pool ``all_hidden_states`` (indexable per layer, each (batch, seq, hidden)).

        Layers ``1 .. num_hidden_layers`` are used. Returns a tensor of shape
        (batch, hiddendim_fc).
        """
        # Stack along a new axis 1 to get (batch, layers, hidden) directly.
        # Stacking on the last axis and then calling view() reinterpreted the
        # memory and mixed up layers and features; .squeeze() additionally
        # dropped the batch axis for a batch of one.
        hidden_states = torch.stack(
            [all_hidden_states[layer_i][:, 0] for layer_i in range(1, self.num_hidden_layers+1)],
            dim=1,
        )
        out = self.attention(hidden_states)
        out = self.dropout(out)
        return out

    def attention(self, h):
        """Attend over axis 1 of ``h`` (batch, layers, hidden) and project with ``w_h``."""
        v = torch.matmul(self.q, h.transpose(-2, -1)).squeeze(1)       # (batch, layers)
        v = F.softmax(v, -1)
        v_temp = torch.matmul(v.unsqueeze(1), h).transpose(-2, -1)     # (batch, hidden, 1)
        v = torch.matmul(self.w_h.transpose(1, 0), v_temp).squeeze(2)  # (batch, hiddendim_fc)
        return v

class AttentionPoolingModel(nn.Module):
    """Scores a text from an attention-pooled summary of the per-layer first-token embeddings."""

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({'output_hidden_states':True})
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        self.pooler = AttentionPooling(self.config.to_dict()[num_hidden_layers], self.config.to_dict()[hidden_size], HIDDEN_DIM_FC)

        self.linear = nn.Linear(HIDDEN_DIM_FC, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-sequence predictions, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        # assumes outputs[2] is the tuple of per-layer hidden states -- TODO confirm
        all_hidden_states = torch.stack(outputs[2])

        attention_pooling_embeddings = self.pooler(all_hidden_states)
        logits = self.linear(attention_pooling_embeddings)

        # Remove only the trailing output axis; a second squeeze would also
        # drop the batch axis for a batch of one.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds
        
# class LitModel(nn.Module):
#     def __init__(self, n_hidden_state=4):
#         super().__init__()

#         self.config = AutoConfig.from_pretrained(model_name)
#         config.update({"output_hidden_states":True, 
#                        "hidden_dropout_prob": 0.0,
#                        "layer_norm_eps": 1e-7})                       
        
#         self.roberta = AutoModel.from_pretrained(model_name, config=self.config)
#         self.n_hidden_state = self.config.to_dict()[hidden_size]
        
#         for i in range(self.n_hidden_state):
#             layer_name = f"attention_{i+1}"
#             layer = nn.Sequential(
#                 nn.Linear(768, 512),
#                 nn.Tanh(),                       
#                 nn.Linear(512, 1),
#                 nn.Softmax(dim=1)
#             )
#             setattr(self, layer_name, layer)

#         self.regressor = nn.Sequential(                        
#             nn.Linear(768, 1)                        
#         )
        

#     def forward(self, input_ids, attention_mask):
#         roberta_output = self.roberta(input_ids=input_ids,
#                                       attention_mask=attention_mask)
        
#         weights = [
#             getattr(self, f"attention_{i+1}")(roberta_output.hidden_states[-(i+1)]) for i in range(self.n_hidden_state)
#         ]
        
#         context_vectors = [
#             torch.sum(weights[i] * roberta_output.hidden_states[-(i+1)], dim=1) for i in range(self.n_hidden_state)
#         ]
        
#         stacked = torch.stack(context_vectors)
#         hidden = torch.sum(stacked, dim=0) / self.n_hidden_state
        
#         # Now we reduce the context vector to the prediction score.
#         return self.regressor(hidden)

In [None]:
# Registry of the available model classes, keyed by the names accepted in `model_struct`.
func_dict={"OriginModel":OriginModel,"LastLayerCLSModel":LastLayerCLSModel,"MeanPoolingModel":MeanPoolingModel,
           "MaxPoolingModel":MaxPoolingModel,"SecondToLastLayerCLSModel":SecondToLastLayerCLSModel,"ConcatenateLastFourModel":
           ConcatenateLastFourModel,"WeightedLayerPoolingModel":WeightedLayerPoolingModel,"AttentionPoolingModel":AttentionPoolingModel}
# Class instantiated for every fold; None when `model_struct` is not a key of func_dict.
JigsawModel=func_dict.get(model_struct)


In [None]:
def criterion(outputs1, outputs2, targets):
    """Margin ranking loss between two score tensors.

    Args:
        outputs1: scores of the first text of each pair.
        outputs2: scores of the second text of each pair.
        targets: 1 where the first text should rank higher, -1 otherwise.

    Both score tensors are reshaped to the shape of ``targets`` first. A
    (batch, 1) score tensor against a (batch,) target would otherwise either
    broadcast to a batch x batch matrix (pairing every score with every
    target) or be rejected by torch versions that require equal
    dimensionality. The reshape raises if the element counts differ.
    """
    outputs1 = outputs1.reshape(targets.shape)
    outputs2 = outputs2.reshape(targets.shape)
    return nn.MarginRankingLoss(margin=CONFIG["margin"])(outputs1, outputs2, targets)

In [None]:
def get_parameters(model,model_init_lr,multiplier, classifier_lr):
    """Build optimizer parameter groups with layer-wise learning rates.

    The last encoder layer gets ``model_init_lr``; every layer closer to the
    input gets the previous rate times ``multiplier``. Parameters whose name
    contains "linear" form a final group with ``classifier_lr``.

    Returns:
        list of ``{"params": [...], "lr": float}`` dicts. The key has to be
        called "params", as expected by the optimizer base class.
    """
    # named_parameters() yields (name, tensor) pairs; parameters() only the tensors.
    named_params = list(model.named_parameters())
    n_layers = model.config.to_dict()[num_hidden_layers]

    parameters = []
    lr = model_init_lr
    for layer in reversed(range(n_layers)):
        layer_tag = f"encoder.layer.{layer}."
        parameters.append({
            "params": [param for name, param in named_params if layer_tag in name],
            "lr": lr,
        })
        lr *= multiplier

    # Adjust the substring below when the head layer is named differently.
    parameters.append({
        "params": [param for name, param in named_params if "linear" in name],
        "lr": classifier_lr,
    })
    return parameters

In [None]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler named by ``CONFIG['scheduler']``.

    Returns:
        The scheduler, or None when ``CONFIG['scheduler']`` is None.

    Raises:
        ValueError: for an unrecognised scheduler name (previously this
            surfaced as an UnboundLocalError on the return statement).
    """
    name = CONFIG['scheduler']
    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])
    raise ValueError(
        f"Unknown CONFIG['scheduler'] value {name!r}; expected 'CosineAnnealingLR', "
        "'CosineAnnealingWarmRestarts' or None"
    )

In [None]:
def evaluate(model,dev_dataloader):
    """Return the mean ranking loss of ``model`` over ``dev_dataloader``.

    The model is switched to eval mode and left in it.
    """
    model.eval()
    total_loss = 0
    for batch in dev_dataloader:
        target = batch["target"].to(DEVICE)
        more_inputs = {name: tensor.to(DEVICE) for name, tensor in batch["more_toxic"].items()}
        less_inputs = {name: tensor.to(DEVICE) for name, tensor in batch["less_toxic"].items()}

        with torch.no_grad():
            score_more = model(**more_inputs)
            score_less = model(**less_inputs)
            batch_loss = criterion(score_more, score_less, target)

        total_loss += batch_loss.item()

    return total_loss/len(dev_dataloader)
def train(model,train_dataloader,dev_dataloader,evaluate_step=None,swa_start=None):
    """Train ``model`` on comment pairs and keep the best validation snapshot.

    Args:
        model: scoring model called as ``model(input_ids=..., attention_mask=...)``.
        train_dataloader: loader yielding "more_toxic", "less_toxic", "target".
        dev_dataloader: validation loader of the same format.
        evaluate_step: evaluate every this many batches (and always on the last
            batch of an epoch); None means once per epoch.
        swa_start: with ``swa_use`` on, SWA updates and SWA evaluation are
            used while ``epoch >= swa_start-1`` (epoch counted from 0).

    Returns:
        (best_val_loss, best_model_param) where ``best_model_param`` is a
        CPU copy of the state dict taken when the best validation loss was
        measured.
    """
    if run_db==True:
        wandb.watch(model,log_freq=100)
    # Layer-wise learning rates are available through get_parameters(); a
    # single learning rate for all parameters is used here.
    optimizer = AdamW(model.parameters(),lr= CONFIG['LR'], eps = CONFIG['EPS'],weight_decay=CONFIG['weight_decay'])
    if evaluate_step is None:
        evaluate_step=len(train_dataloader)
    if swa_use==True:
        swa_model=AveragedModel(model).to(DEVICE)
        swa_scheduler = SWALR(optimizer, swa_lr=CONFIG["swa_lr"])

    scheduler = fetch_scheduler(optimizer)
    if scheduler is None:
        # Fallback: linear decay to 0 over all training steps, no warm-up.
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=len(train_dataloader) * CONFIG["EPOCHS"])
    # inf guarantees that the first evaluation always yields a snapshot
    best_val_loss=float("inf")
    best_model_param=None

    scaler = GradScaler()
    start=time.time()
    for epoch in range(CONFIG["EPOCHS"]):
        print(f"\n Epoch{epoch} train start \n")
        train_loss=0
        model.train()
        # short-circuits, so swa_start may stay None while swa_use is off
        swa_active = swa_use==True and epoch>=swa_start-1
        # total= lets the progress bar show the remaining batches
        bar=tqdm(enumerate(train_dataloader),total=len(train_dataloader))
        for index,batch in bar:
            model.zero_grad()
            more_toxic_inputs=batch["more_toxic"]
            less_toxic_inputs=batch["less_toxic"]
            target=batch["target"].to(DEVICE)

            more_toxic_inputs={key: value.to(DEVICE) for key,value in more_toxic_inputs.items()}
            less_toxic_inputs={key: value.to(DEVICE) for key,value in less_toxic_inputs.items()}
            if FP16==True:
                with autocast():
                    out_more=model(**more_toxic_inputs)
                    out_less=model(**less_toxic_inputs)
                    loss=criterion(out_more, out_less, target)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                out_more=model(**more_toxic_inputs)
                out_less=model(**less_toxic_inputs)
                loss=criterion(out_more, out_less, target)
                loss.backward()
                optimizer.step()
            if swa_active:
                swa_model.update_parameters(model)
                swa_scheduler.step()
            else:
                scheduler.step()

            train_loss+=loss.item()
            if (index+1)%evaluate_step==0 or (index+1)==len(train_dataloader):
                if swa_active:
                    val_loss=evaluate(swa_model,dev_dataloader)
                else:
                    val_loss=evaluate(model,dev_dataloader)
                # evaluate() leaves the model in eval mode; switch back so the
                # remaining batches of this epoch are trained in train mode.
                model.train()

                if run_db==True:
                    # log a plain float rather than the loss tensor
                    wandb.log({"Train LOSS":loss.item()})
                    wandb.log({"Valid LOSS":val_loss})

                if val_loss<best_val_loss:
                    best_val_loss=val_loss
                    best_source = swa_model.module if swa_active else model
                    # state_dict() returns references to the live weights, which
                    # later optimizer steps overwrite in place. Clone them so
                    # the snapshot really is the best model, not the final one.
                    best_model_param={name: tensor.detach().cpu().clone()
                                      for name,tensor in best_source.state_dict().items()}
                    print(f"best_model saved ,val_loss:{best_val_loss}")
        avg_train_loss=train_loss/len(train_dataloader)
        print(f"EPOCH:{epoch+1},train_loss:{avg_train_loss},val_loss:{val_loss}")

    end=time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
    time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    if run_db==True:
        # `run` is the wandb run created by the calling cell
        run.summary["time (hour)"]=time_elapsed /3600
    return best_val_loss,best_model_param

In [None]:
# Train one model per fold and save the best validation snapshot of each.
for fold in range(CONFIG["fold_num"]):
    print(f"\n Fold{fold} train start")
    if run_db==True:
        # `anony` comes from the W&B login cell: None after a successful
        # login, "must" otherwise. Hard-coding 'must' here sent the runs to an
        # anonymous account even when the login had succeeded.
        run = wandb.init(project='Jigsaw',
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
                     name=f'{HASH_NAME}-fold-{fold}',
                     anonymous=anony)
    train_loader,dev_loader=prepare_loaders(data_df,fold)
    model=JigsawModel(MODEL_DIR)
    model.to(DEVICE)
    dev_loss,best_model_param=train(model,train_loader,dev_loader,evaluate_step=CONFIG["evaluate_step"],swa_start=CONFIG["swa_start"])
    model_path=f"bestmodel-{fold}.pth"
    torch.save(best_model_param,model_path)
    if run_db==True:
        run.finish()

    # Release the fold's model and loaders before the next fold starts.
    del model,train_loader,dev_loader
    gc.collect()

# cv

In [None]:
class DatasetRetriever_cv(Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a frame on access.

    Each item is ``{"text": {"input_ids": LongTensor, "attention_mask": LongTensor}}``,
    built from the tokenizer output for a single string.
    """

    def __init__(self,data,tokenizer,max_len=CONFIG["MAX_LENGTH"]):
        self.data=data
        self.tokenizer=tokenizer
        self.max_len=max_len
        # Keep the raw strings as an array so items are looked up by position.
        self.text=self.data["text"].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        encoded=self.convert_examples_to_features(self.text[item])
        # Only these two tokenizer outputs are forwarded, in this order.
        tensors={
            name:torch.tensor(encoded[name],dtype=torch.long)
            for name in ("input_ids","attention_mask")
        }
        return {"text":tensors}

    def convert_examples_to_features(self, example):
        """Tokenize one string, asking the tokenizer to pad/truncate to ``max_len``."""
        return self.tokenizer.encode_plus(
            example,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=False,
        )
def make_dataloader_cv(data,batch_size,model_dir=MODEL_DIR,max_len=CONFIG["MAX_LENGTH"]):
    """Build a DataLoader that walks ``data`` in row order for scoring.

    Args:
        data: frame with a ``text`` column (read by ``DatasetRetriever_cv``).
        batch_size: number of rows per batch.
        model_dir: name or path passed to ``AutoTokenizer.from_pretrained``.
        max_len: maximum token length forwarded to the dataset.

    Returns:
        A ``DataLoader`` over ``DatasetRetriever_cv`` using a ``SequentialSampler``.
    """
    cv_dataset=DatasetRetriever_cv(data,AutoTokenizer.from_pretrained(model_dir),max_len)
    # Sequential sampling keeps predictions aligned with the rows of `data`.
    return DataLoader(
        cv_dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(cv_dataset),
    )
def evaluate_cv(model,test_dataloader):
    """Run ``model`` over ``test_dataloader`` and return one flat array of outputs.

    Args:
        model: module called as ``model(**batch["text"])``; switched to eval mode here.
        test_dataloader: iterable of batches shaped like ``{"text": {name: tensor}}``.

    Returns:
        1-D numpy array holding the flattened model outputs in iteration order,
        or an empty float32 array when the loader yields no batches.
    """
    model.eval()
    batch_preds=[]
    # Inference only: no autograd graph is needed for any batch.
    with torch.no_grad():
        for batch in test_dataloader:
            text_inputs={key: value.to(DEVICE) for key,value in batch["text"].items()}
            out_more=model(**text_inputs)
            batch_preds.append(out_more.view(-1).detach().cpu().numpy())

    gc.collect()

    # np.concatenate raises ValueError on an empty list, so an empty loader
    # (e.g. a fold with no rows) is handled explicitly.
    if not batch_preds:
        return np.empty(0,dtype=np.float32)
    return np.concatenate(batch_preds)

def inference(model_paths,data_df,comments):
    """Score each fold's held-out comments with that fold's saved model.

    For every fold, the comments referenced by that fold's rows of ``data_df``
    (columns ``encode_more`` / ``encode_less``) are selected from ``comments`` via
    ``encode_text``, scored, and accumulated back into ``comments``:
    ``toxic_value`` grows by the prediction and ``access_time`` by one.

    Args:
        model_paths: sequence indexed by fold number giving the checkpoint to load.
        data_df: pair table with ``kfold``, ``encode_more`` and ``encode_less``
            columns; only read here.
        comments: comment table with ``text``, ``encode_text``, ``toxic_value`` and
            ``access_time`` columns; updated in place.

    Returns:
        ``(data_df, comments)`` -- the same objects that were passed in.
    """
    for fold in range(CONFIG["fold_num"]):
        print(f"fold{fold} dev start")

        data_fold=data_df[data_df.kfold == fold]
        fold_comment_ids=np.unique(
            np.concatenate((data_fold["encode_more"].values,data_fold["encode_less"].values))
        )
        # Vectorised membership test; a row-wise apply rescans the id array for every comment.
        in_fold=comments["encode_text"].isin(fold_comment_ids)
        # Explicit copy so the updates below never write into a view of `comments`.
        comments_fold=comments[in_fold].copy()
        comments_fold["access_time"]=comments_fold["access_time"]+1

        test_loader=make_dataloader_cv(comments_fold,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
        model=JigsawModel(MODEL_DIR)
        model.to(DEVICE)
        # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(model_paths[fold],map_location=DEVICE))
        preds = evaluate_cv(model, test_loader)
        comments_fold["toxic_value"]=comments_fold["toxic_value"]+preds

        # The fold's slice of data_df is never modified, so only the comment
        # scores need to be written back.
        comments.loc[comments_fold.index]=comments_fold

        del model,test_loader

    return data_df,comments

In [None]:
if run_db==True:
    # The run name must not depend on `fold`: that variable only exists as a
    # leftover of the training loop and is undefined when that cell is skipped
    # (e.g. when scoring pre-trained checkpoints).
    run = wandb.init(project='Jigsaw',
             config=CONFIG,
             job_type='cv',
             group=CONFIG['group'],
             tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
             name=f'{HASH_NAME}-cv',
             anonymous='must')
MODEL_PATHS=[f"bestmodel-{num}.pth" for num in range(CONFIG["fold_num"])]
# MODEL_PATHS=[f"../input/baseline1-toxic-value/bestmodel-{num}.pth" for num in range(CONFIG["fold_num"])]
data_df,comments= inference(MODEL_PATHS, data_df,comments)

# Turn the accumulated score into a mean over the number of times each comment was scored.
comments["toxic_value"]=comments["toxic_value"]/comments["access_time"]
comments.index=comments["encode_text"]
# Only the score column is needed for the lookup, so only that column is converted.
index_score_dict=comments["toxic_value"].to_dict()
# Explicit lookups keep the KeyError for ids that are missing from `comments`.
data_df["less_value"]=data_df["encode_less"].map(lambda x:index_score_dict[x])
data_df["more_value"]=data_df["encode_more"].map(lambda x:index_score_dict[x])
# A pair counts as correct when the `more` comment scores strictly above the `less` one.
data_df["pair_True"]=data_df["more_value"]>data_df["less_value"]
cv=data_df["pair_True"].mean()
print(cv)
if run_db==True:
    wandb.log({"cv":cv})
    run.finish()

In [None]:
# jc_df=pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

# min_len = (jc_df['toxic'] == 1).sum()
# df_y0_undersample = jc_df[jc_df['toxic'] == 0].sample(n=min_len, random_state=201)
# comments_fold = pd.concat([jc_df[jc_df['toxic'] == 1], df_y0_undersample])

# comments_fold.rename(columns={"comment_text":"text"},inplace=True)
# comments_fold["toxic_value"]=0

In [None]:
# for fold in range(CONFIG["fold_num"]):
#     print(f"fold{fold} dev start")
#     test_loader=make_dataloader_cv(comments_fold,CONFIG["DEV_BATCH_SIZE"],MODEL_DIR,CONFIG["MAX_LENGTH"])
#     model=JigsawModel(MODEL_DIR)
#     model.to(DEVICE)
#     path=MODEL_PATHS[fold]

#     model.load_state_dict(torch.load(path))
#     preds = evaluate_cv(model, test_loader)
#     comments_fold["toxic_value"]=comments_fold["toxic_value"]+preds
#     del model,test_loader

In [None]:
# toxicSeperateValue=comments_fold["toxic_value"].min()+(comments_fold["toxic_value"].max()-comments_fold["toxic_value"].min())/2
# comments_fold["toxic_predict"]=comments_fold.apply(lambda row : 1 if row["toxic_value"]>=toxicSeperateValue else 0,axis=1)
# comments_fold["predict_acc"]=comments_fold.apply(lambda row : True if row["toxic_predict"]==row["toxic"] else False,axis=1)
# cv=comments_fold["predict_acc"].mean()
# print("cv in first competition data:",cv)
# if run_db==True:
#     wandb.log({"cv in first competition ":cv})
#     run.finish()

In [None]:
# Write the pair table, with the less_value / more_value / pair_True columns,
# to a CSV file named "data_df_aug1" (no extension) in the working directory.
data_df.to_csv("data_df_aug1")