In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
import os
import numpy as np
import pandas as pd
import random
import time

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging
from transformers import BertConfig, BertTokenizer, BertModel
from transformers import AlbertConfig, AlbertTokenizer, AlbertModel
from transformers import RobertaConfig,RobertaTokenizer, RobertaModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'whitegrid' theme for any figures drawn in this notebook.
sns.set_style('whitegrid')
# `logging` here is transformers.logging: only show error-level messages from the library.
logging.set_verbosity_error()

In [8]:
# Central settings dictionary.
# NOTE(review): the cells visible in this notebook only read "fold_num" and
# "num_class"; the remaining entries look like training-time settings carried
# over from the training notebook -- confirm before relying on them.
CONFIG = {
    # batching / tokenization
    "TRAIN_BATCH_SIZE": 32,
    "MAX_LENGTH": 128,
    "DEV_BATCH_SIZE": 64,
    # optimizer
    "LR": 3.6e-6,
    "EPS": 1e-8,
    "weight_decay": 1e-6,

    # scheduler / loss / data split
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "T_0": 500,
    "margin": 0.5,
    "fold_num": 5,
    "seed": 2021,
    "num_class": 1,

    # training loop
    "EPOCHS": 4,
    "evaluate_step": None,
    "swa_start": 3,
    "model_init_lr": 3.5e-6,
    "multiplier": 0.9,
    "classifier_lr": 3.6e-6,
    "swa_lr": 1e-5,
}

# Directory holding the competition input files.
input_dir = "../input/jigsaw-toxic-severity-rating"

# 检查事项
1. model_struct：如果需要更换模型，检查更换结构部分
2. 更新 model path 来源版本
3. 如果是来自 firefox 的，更换 MODEL_PATHS
4. fold 是否正确

In [9]:
# Known head/pooling variants: OriginModel MeanPoolingModel LastLayerCLSModel MaxPoolingModel
# SecondToLastLayerCLSModel ConcatenateLastFourModel WeightedLayerPoolingModel
# AttentionPoolingModel
# NOTE(review): only "OriginModel" is registered in func_dict in this notebook;
# the other variants are presumably defined in the training code -- confirm.
model_struct="OriginModel"

# One checkpoint path per fold (bestmodel-0.pth .. bestmodel-{fold_num-1}.pth).
# Alternative checkpoint sources are kept below as commented-out lines.
# MODEL_PATHS=[f"../input/baseline1-toxic-value/bestmodel-{num}.pth" for num in range(CONFIG["fold_num"])]
MODEL_PATHS=[f"../input/jigsawserver/bestmodel-{num}.pth" for num in range(CONFIG["fold_num"])]

# MODEL_PATHS=[f"../input/bert-from-firefox/bestmodel-{num}.pth" for num in range(CONFIG["fold_num"])]

更换模型

In [10]:
# Key names used to look values up in the backbone config's to_dict() output.
# hidden_size is read when building the model head; num_hidden_layers is not
# used by the cells visible here.
hidden_size="hidden_size"
num_hidden_layers="num_hidden_layers"
# XLNet names these config fields differently:
# hidden_size="d_model"
# num_hidden_layers="n_layer"

# Pretrained backbone directory; exactly one MODEL_DIR line should be active.
# MODEL_DIR="../input/roberta-transformers-pytorch/roberta-base"
# MODEL_DIR="../input/roberta-transformers-pytorch/roberta-large"
MODEL_DIR="../input/pretrained-albert-pytorch/albert-xlarge-v2"
# MODEL_DIR="../input/transformers/xlnet-base-cased"
# MODEL_DIR="../input/hatebert/hateBERT"

# Model_type selects the tokenizer/config/model classes from the registries
# below; it must be one of their keys and should match MODEL_DIR.
Model_type="Albert"
tokenizer_func_dict={"Albert":AlbertTokenizer,"auto":AutoTokenizer,"Roberta":RobertaTokenizer}
config_func_dict={"Albert":AlbertConfig,"auto":AutoConfig,"Roberta":RobertaConfig}
model_func_dict={"Albert":AlbertModel,"auto":AutoModel,"Roberta":RobertaModel}


In [11]:
# Tokenization length and batch sizes used by the data pipeline below.
# NOTE(review): these repeat the matching CONFIG entries; keep them in sync.
MAX_LENGTH = 128
TRAIN_BATCH_SIZE = 32
DEV_BATCH_SIZE = 64
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [12]:
# Load the comments that need a score from the competition input directory.
data_df=pd.read_csv(os.path.join(input_dir,"comments_to_score.csv"))
# Preview the first rows.
data_df.head()

In [13]:
class DatasetRetriever(Dataset):
    """Dataset that tokenizes the "text" column of `data` lazily, one row per access.

    Each item has the form
    ``{"text": {"input_ids": LongTensor, "attention_mask": LongTensor}}``,
    where the tokenizer is asked to pad/truncate every sequence to `max_len`.
    """

    def __init__(self, data, tokenizer, max_len=MAX_LENGTH):
        """Keep the frame, the tokenizer and the target length; cache the raw texts.

        Args:
            data: table-like object indexable by "text" whose result exposes `.values`.
            tokenizer: object providing `encode_plus`.
            max_len: sequence length requested from the tokenizer.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text = self.data["text"].values

    def __len__(self):
        """Number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self, item):
        """Tokenize row `item` and return its model inputs as long tensors."""
        encoded = self.convert_examples_to_features(self.text[item])
        # RoBERTa has no token_type_ids, so they are dropped for every model to
        # keep the inputs uniform (they are not needed for training either).
        kept_fields = ("input_ids", "attention_mask")
        tensors = {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in kept_fields
        }
        return {"text": tensors}

    def convert_examples_to_features(self, example):
        """Encode one raw text with special tokens, padded/truncated to `self.max_len`."""
        return self.tokenizer.encode_plus(
            example,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=False,
        )
def make_dataloader(data,batch_size,model_dir=MODEL_DIR,max_len=MAX_LENGTH):
    """Build a sequential (unshuffled) DataLoader over `data`.

    Args:
        data: table passed to DatasetRetriever (must provide a "text" column).
        batch_size: number of items per batch.
        model_dir: directory the tokenizer for `Model_type` is loaded from.
        max_len: sequence length passed to DatasetRetriever.

    Returns:
        A DataLoader that yields batches in the original row order.
    """
    tokenizer_cls = tokenizer_func_dict.get(Model_type)
    dataset = DatasetRetriever(data, tokenizer_cls.from_pretrained(model_dir), max_len)
    # SequentialSampler keeps predictions aligned with the rows of `data`.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(dataset),
    )

In [14]:
# Sequential loader over the comments to score, so predictions line up with data_df rows.
test_loader=make_dataloader(data_df,DEV_BATCH_SIZE,MODEL_DIR,MAX_LENGTH)

In [15]:
class OriginModel(nn.Module):
    """Pretrained encoder followed by a small head on the first-token embedding.

    Head: Linear(hidden, hidden) -> Tanh -> Dropout(0.2) -> Linear(hidden, num_class).
    """

    def __init__(self, model_name):
        """Load the encoder and its config for `Model_type` from `model_name` and build the head."""
        super().__init__()

        self.model = model_func_dict.get(Model_type).from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.config = config_func_dict.get(Model_type).from_pretrained(model_name)

        # The config key holding the encoder width differs between architectures,
        # so it is looked up through the module-level `hidden_size` key name.
        width = self.config.to_dict()[hidden_size]
        self.linear = nn.Linear(width, CONFIG["num_class"])
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch of token ids and attention masks."""
        encoder_out = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        # First element of the encoder output is the last hidden state; take
        # position 0 of every sequence as the sentence representation.
        first_token = encoder_out[0][:, 0]
        pooled = self.activation(self.dense(first_token))
        return self.linear(self.drop(pooled))

In [16]:
def evaluate(model,test_dataloader):
    """Run `model` over every batch of `test_dataloader` and collect its outputs.

    The model is switched to eval mode and called without gradient tracking as
    ``model(**batch["text"])``; each output is flattened with ``view(-1)``.

    Args:
        model: torch module accepting the tensors in ``batch["text"]`` as keyword arguments.
        test_dataloader: iterable of batches, each a dict whose "text" entry maps
            argument names to tensors.

    Returns:
        1-D numpy array with the flattened outputs of all batches in iteration
        order. An empty float32 array is returned when there are no batches.
    """
    model.eval()

    # Send inputs to the device the model's weights actually live on, so a
    # model/input device mismatch cannot occur. Parameter-free models fall back
    # to the notebook-wide DEVICE.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    batch_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            text_inputs = {key: value.to(device) for key, value in batch["text"].items()}
            out_more = model(**text_inputs)
            batch_preds.append(out_more.view(-1).cpu().detach().numpy())

    gc.collect()

    # np.concatenate raises on an empty list; an empty loader yields no predictions.
    if not batch_preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(batch_preds)
        


In [17]:
# Registry of model classes selectable through `model_struct`.
func_dict={"OriginModel":OriginModel}
# Fail here with a clear message instead of letting `.get()` return None and
# crash later with "'NoneType' object is not callable".
if model_struct not in func_dict:
    raise ValueError(
        f"Unsupported model_struct {model_struct!r}; available: {sorted(func_dict)}"
    )
JigsawModel=func_dict[model_struct]

In [18]:
def inference(model_paths, dataloader):
    """Average the predictions of several checkpoints over `dataloader`.

    For each path a fresh ``JigsawModel(MODEL_DIR)`` is built, the checkpoint's
    state dict is loaded into it, and predictions are produced by `evaluate`.

    Args:
        model_paths: iterable of checkpoint file paths (state dicts).
        dataloader: loader passed unchanged to `evaluate` for every checkpoint.

    Returns:
        numpy array holding the element-wise mean of the per-checkpoint predictions.

    Raises:
        ValueError: if `model_paths` is empty (there is nothing to average).
    """
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths must contain at least one checkpoint path")

    fold_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel(MODEL_DIR)
        # map_location="cpu" lets checkpoints saved on a GPU load on CPU-only
        # machines and avoids holding a second copy of the weights on the GPU.
        # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
        state_dict = torch.load(path, map_location="cpu")
        model.load_state_dict(state_dict)
        model.to(DEVICE)

        print(f"Getting predictions for model {i+1}")
        fold_preds.append(evaluate(model, dataloader))

        # Release this fold's weights before the next model is built so that
        # two large models are never resident at the same time.
        del model, state_dict
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return np.mean(np.stack(fold_preds), axis=0)

In [None]:
# Predictions averaged over every checkpoint listed in MODEL_PATHS.
preds = inference(MODEL_PATHS, test_loader)

In [None]:
# Attach the averaged predictions; this relies on preds following data_df's
# row order, which holds because the loader uses a sequential sampler.
data_df['score'] = preds
data_df.head()

In [None]:
# Replace the raw scores by their rank; method='first' breaks ties by order of appearance.
data_df['score'] = data_df['score'].rank(method='first')

In [None]:
# Write every column except the raw text to the submission file. The drop is
# not done in place, so data_df keeps its "text" column and this cell can be
# re-run without a KeyError.
data_df.drop(columns='text').to_csv("submission.csv", index=False)