In [1]:
#!/usr/bin/env python
# coding: utf-8
import os
import pandas as pd
import numpy as np
from collections import Counter
import json
import time
import random
import warnings
import wandb

warnings.filterwarnings("ignore")

import scipy as sp
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from scipy.special import softmax

import torch
import torch.nn as nn
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import transformers
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer
from utils import sample_context_by_list, bm25_sample
# from  dice_loss import  DiceLoss
# from  focalloss import  FocalLoss

transformers.__version__: 4.5.1


### 参数

In [2]:
# Restrict this process to GPU index 0 and pick the torch device accordingly
# (falls back to CPU when CUDA is not available).
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# NOTE(review): `device` is not referenced again in the visible cells; the
# Trainer used further down presumably handles device placement itself.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CFG:
    """Static run configuration: data locations, checkpoint paths, hyper-parameters."""

    # ---- data files --------------------------------------------------
    # Earlier experiments pointed at '../nlp_data/train.sample.txt' /
    # '../nlp_data/test.txt' and at '../../nlp_data/final/train1.txt' /
    # '../../nlp_data/final/valid1.txt' instead of the "mix" split below.
    train_file = '../../nlp_data/final/train.mix.txt'
    valid_file = '../../nlp_data/final/valid.mix.txt'
    test_file = '../../nlp_data/test.txt'

    # ---- model -------------------------------------------------------
    # Local directory of the pretrained checkpoint (previously
    # "/home/yjw/ZYJ_WorkSpace/PTMs/chinese-roberta-wwm-ext/").
    model = "/home/zyj/PTMs/chinese-roberta-wwm-ext/"
    # Where the Trainer writes its checkpoints.
    output_dir = './roberta-saved'

    # ---- optimisation ------------------------------------------------
    epochs = 5
    learning_rate = 2e-5
    batch_size = 16
    # Every example is padded/truncated to this many tokens.
    max_len = 512
    weight_decay = 0.01
    seed = 42


In [3]:
# ======= Seed every random number generator so results are reproducible =======
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and current CUDA device) with `seed`.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    # NOTE(review): exported at runtime, so presumably only child processes
    # observe this value -- confirm if hash determinism matters here.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The individual seeders are independent of one another.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

### mask数据format

#### prepare_input

In [4]:
# Label word for each sentiment score: the token the model is trained to
# predict at the corresponding mask position.
MAPPER = {-2:"坏",-1:"差",0:"平",1:"好",2:"棒"}


def prepare_input(content, entitys, labels=None, TOKENIZER=None):
    """Tokenize one prompt+context string and, if given, build its MLM labels.

    Args:
        content: Full text to encode; it is padded/truncated to CFG.max_len.
        entitys: Entity names of the sample. Unused here; kept for the
            existing call signature.
        labels: Sentiment scores (keys of MAPPER), one per mask token in
            `content` and in the same order. None for unlabeled data.
        TOKENIZER: Hugging Face tokenizer used for encoding.

    Returns:
        dict with tensors 'input_ids' and 'attention_mask'. When `labels` is
        given it also holds 'labels': the label-word token id at each mask
        position and -100 everywhere else.

    Raises:
        AssertionError: if `content` contains fewer mask tokens than labels.
        IndexError: if `content` contains more mask tokens than labels.
        KeyError: if a label is not a key of MAPPER.
    """
    inputs = TOKENIZER(content,
                       add_special_tokens=True,
                       truncation=True,
                       max_length=CFG.max_len,
                       padding="max_length",
                       return_offsets_mapping=False)
    input_ids = inputs['input_ids']

    ret = {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(inputs['attention_mask']),
    }
    if labels is None:
        # Unlabeled sample: there is nothing to supervise, so no 'labels'
        # entry is produced (previously this path crashed with a KeyError).
        return ret

    # Compare raw ids with the mask id instead of converting every token id
    # back to its string form.
    mask_id = TOKENIZER.mask_token_id
    label_ids = [TOKENIZER.convert_tokens_to_ids(MAPPER[label]) for label in labels]

    mlm_labels = []
    labels_idx = 0
    for tk in input_ids:
        if tk == mask_id:
            # Mask slot: supervise with the id of the mapped label word.
            mlm_labels.append(label_ids[labels_idx])
            labels_idx += 1
        else:
            # Anywhere else: -100 so the position is excluded from scoring.
            mlm_labels.append(-100)
    assert labels_idx == len(labels), (
        f'found {labels_idx} mask tokens but received {len(labels)} labels'
    )

    ret['labels'] = torch.tensor(mlm_labels)
    return ret

#### format_line

In [5]:
def format_line(line, TOKENIZER):
    """Turn one JSON line of the dataset into model inputs.

    The line must decode to an object with a 'content' string and an
    'entity' field that is either a dict (entity name -> sentiment label,
    labeled data) or a list of entity names (unlabeled data). A prompt with
    one mask token per entity is placed in front of a context excerpt and
    the result is encoded by `prepare_input`.

    Args:
        line: One JSON-encoded sample.
        TOKENIZER: Hugging Face tokenizer; supplies the mask token and is
            used to measure the prompt length.

    Returns:
        Whatever `prepare_input` returns for the assembled text.

    Raises:
        TypeError: if 'entity' is neither a dict nor a list.
    """
    tmp = json.loads(line.strip())
    raw_contents = tmp['content'].strip()

    entity_field = tmp['entity']
    if isinstance(entity_field, dict):
        entityArr = list(entity_field.keys())
        labels = list(entity_field.values())
    elif isinstance(entity_field, list):
        entityArr = entity_field
        labels = None
    else:
        # Previously this only printed a message and then failed later on an
        # unbound local variable; fail here with an explicit reason instead.
        raise TypeError(
            f'entity type error! expected dict or list, got {type(entity_field).__name__}'
        )

    # One "<entity>是[MASK]" clause per entity, in entity order.
    clauses = ''.join(f'，{entity}是{TOKENIZER.mask_token}' for entity in entityArr)
    prompt = '在这篇新闻中' + clauses + '。'
    prompt_token_len = len(TOKENIZER(prompt, add_special_tokens=False).input_ids)

    # NOTE(review): the budget leaves no room for the special tokens that
    # prepare_input adds, and the unit of `length` depends on
    # sample_context_by_list (defined in utils) -- confirm the excerpt is not
    # truncated more than intended.
    text = sample_context_by_list(entityArr, raw_contents, length=CFG.max_len-prompt_token_len)
    text = prompt + text

    inputs = prepare_input(text, entityArr, labels, TOKENIZER)
    return inputs

#### Dataset

In [6]:
# Module-level tokenizer loaded from the checkpoint directory in CFG.model;
# TrainDataset below reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
class TrainDataset(Dataset):
    """In-memory dataset of encoded samples read from a JSON-lines file.

    Every non-blank line of `input_file` is converted with `format_line`
    using the module-level `tokenizer` when the dataset is constructed, so
    item access is a plain list lookup.
    """

    def __init__(self, input_file):
        """Read `input_file` (UTF-8, one JSON sample per line) and encode it."""
        with open(input_file, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. extra newlines at the end of the file) are not
            # samples; passing them on would make JSON decoding fail.
            lines = [line.strip() for line in f if line.strip()]
        self.inputs = [format_line(line, tokenizer) for line in tqdm(lines)]
        print(f'load data from {input_file} len={len(self.inputs)}')

    def __len__(self):
        """Number of encoded samples."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the encoded sample stored at position `item`."""
        return self.inputs[item]

### metric

In [7]:
def compute_metrics(p) -> dict:
    """Score predictions at the supervised (mask) positions only.

    `p` unpacks into (predictions, labels). Predictions are reduced with an
    argmax over their last axis; positions whose label is -100 are dropped.
    A classification report is printed and the macro-averaged F1 is returned
    under the key 'macro_f1'.
    """
    logits, gold = p
    # Only the sentiment-label positions carry a real target.
    keep = gold != -100
    predicted = np.argmax(logits, axis=-1)[keep]
    gold = gold[keep]
    print(classification_report(gold, predicted))
    return {'macro_f1': f1_score(gold, predicted, average='macro')}

### 主程序

#### 加载数据和模型

In [8]:
# Read and encode both splits up front; TrainDataset keeps every encoded
# sample in memory (the class is reused for the validation file as well).
train_dataset = TrainDataset(CFG.train_file)
valid_dataset = TrainDataset(CFG.valid_file)

  0%|          | 0/96699 [00:00<?, ?it/s]

load data from ../../nlp_data/final/train.mix.txt len=96699


  0%|          | 0/5789 [00:00<?, ?it/s]

load data from ../../nlp_data/final/valid.mix.txt len=5789


In [9]:
# Load the pretrained checkpoint with a masked-LM head. Per the warning logged
# below, this instantiates BertForMaskedLM and leaves the checkpoint's
# cls.seq_relationship weights unused.
model = AutoModelForMaskedLM.from_pretrained(CFG.model)

Some weights of the model checkpoint at /home/zyj/PTMs/chinese-roberta-wwm-ext/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### 训练

In [None]:
# Evaluate and checkpoint three times per epoch. The count assumes one
# optimizer step per CFG.batch_size samples; max(1, ...) keeps the interval
# valid when the dataset holds fewer than three batches.
EVAL_STEP = max(1, len(train_dataset)//CFG.batch_size//3)

training_args = TrainingArguments(
    output_dir=CFG.output_dir,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEP,
    save_steps=EVAL_STEP,
    save_total_limit=10,
    # Optional, currently disabled: delay the first evaluation by three epochs.
#     eval_delay=len(train_dataset)//CFG.batch_size*3,
    logging_steps=1,
    learning_rate=CFG.learning_rate,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    num_train_epochs=CFG.epochs,
    weight_decay=CFG.weight_decay,
    # Use the configured seed instead of relying on the library default.
    seed=CFG.seed,
    disable_tqdm=False,
    # NOTE(review): predictions handed to compute_metrics are full per-token
    # vocabulary logits for the whole validation set; this presumably needs a
    # lot of host memory -- confirm it fits.
    eval_accumulation_steps = 5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    # Optional, currently disabled: a custom collator.
#     data_collator=data_collator,
    compute_metrics=compute_metrics,
)
wandb.init(project='sohu-2022-mlm')
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzyijie[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss
