In [8]:
import pandas as pd
import numpy as np
from collections import Counter
import copy
import json
import time
import random
import os
import warnings
# import wandb

warnings.filterwarnings("ignore")

import scipy as sp
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from scipy.special import softmax

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import transformers
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, BertForMaskedLM, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# from  dice_loss import  DiceLoss
# from  focalloss import  FocalLoss
transformers.logging.set_verbosity_error()

transformers.__version__: 4.5.1


## CFG

In [9]:
class CFG:
    max_len=512
    model = '/home/zyj/PTMs/chinese-roberta-wwm-ext/'
    train_file = '../../nlp_data/mlm.txt'
    
    '''mlm_config'''
    mlm_probability=0.15        # 被mask的token总数
    special_tokens_mask=None    # 特殊token
    prob_replace_mask=0.8       # 被替换成[MASK]的token比率
    prob_replace_rand=0.1       # 被随机替换成其他token比率
    prob_keep_ori=0.1           # 保留原token的比率
    assert sum([prob_replace_mask, prob_replace_rand, prob_keep_ori]) == 1,    ValueError("Sum of the probs must equal to 1.")
    
    '''train_config'''
    batch_size = 32
    epochs = 20
    learning_rate = 1e-5
    weight_decay = 0
    device='cuda:1'
    apex = True

## Content Sample

In [10]:
def merge_idx(idxArr, span, content):
    assert len(idxArr) >= 1
    if len(idxArr)==1:
        return content[max(0,idxArr[0]-span) : min(len(content),idxArr[0]+span)]
    i = 0
    ret = []
    while True:
        if i>=len(idxArr):break
        temp_i = i
        for j in range(i+1,len(idxArr)):
            if idxArr[j]-idxArr[temp_i] > 2*span:
                temp_i = j-1
                break
            else:
                temp_i = j
        ret.append(content[max(0,idxArr[i]-span) : min(len(content),idxArr[temp_i]+span)])    
        i = temp_i+1
    return '#'.join(ret)
            
def sample_context_by_list(entitys:list, content:str, length:int):
    '''
    通过entity列表筛选content中对应每个实体位置的前后文
    '''
    cnt = 0
    for entity in entitys:
        cnt += content.count(entity)
    if cnt == 0 or len(content)<=length:
        return content
    span = int(length/cnt/2)
    idxArr = []
    for entity in entitys:
        idx = content.find(entity,0)
        while idx != -1:
            idxArr.append(idx)
            idx = content.find(entity,idx+1)
    idxArr = sorted(idxArr)
    result = merge_idx(idxArr, span, content)
    return result

## 数据预处理

### Read Data

In [11]:
def get_train_data(input_file):
    corpus = []
    entitys = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            tmp = json.loads(line.strip())
            raw_contents = tmp['content'].strip()
            if type(tmp['entity']) == dict:
                entityArr = list(tmp['entity'].keys())
            elif type(tmp['entity']) == list:
                entityArr = tmp['entity']
            else:
                print('type error!')
            entity_content = ''.join(entityArr)
            
            text = sample_context_by_list(entityArr, raw_contents, length=CFG.max_len)
            
            corpus.append(text)
                
    train = {'content':corpus}
    train = pd.DataFrame(train)
    return train

### TrainDataset

In [12]:
class TrainDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.input_texts = df['content'].values
        self.ori_inputs = copy.deepcopy(df['content'].values)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts) // CFG.batch_size
    
    def mask_tokens(self, inputs):
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = torch.full(labels.shape, CFG.mlm_probability)
        if CFG.special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = CFG.special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, CFG.prob_replace_mask)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        current_prob = CFG.prob_replace_rand / (1 - CFG.prob_replace_mask)
        indices_random = torch.bernoulli(torch.full(labels.shape, current_prob)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels

    def __getitem__(self, item):
        batch_text = list(self.input_texts[: CFG.batch_size])
        features = self.tokenizer(batch_text, max_length=512, truncation=True, padding=True, return_tensors='pt')
        inputs, labels = self.mask_tokens(features['input_ids'])
        batch = {"inputs": inputs, "labels": labels}
        self.input_texts = self.input_texts[CFG.batch_size: ]
        if not len(self):
            self.input_texts = self.ori_inputs
        return batch

## 训练代码

In [13]:
def train(model, train_dataloader, save_path):
    assert CFG.device.startswith('cuda') or CFG.device == 'cpu', ValueError("Invalid device.")
    device = torch.device(CFG.device)
    
    model.to(device)
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    if not len(train_dataloader):
        raise EOFError("Empty train_dataloader.")
        
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]
    
    optimizer = AdamW(params=optimizer_grouped_parameters, lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
    
    for cur_epc in range(int(CFG.epochs)):
        training_loss = 0
        print("Epoch: {}".format(cur_epc))
        model.train()
        tk0 = tqdm(enumerate(train_dataloader),total=len(train_dataloader))
        for step, batch in tk0:
            input_ids = batch['inputs'].squeeze(0).to(device)
            labels = batch['labels'].squeeze(0).to(device)
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                loss = model(input_ids=input_ids, labels=labels).loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            training_loss += loss.item()
            tk0.set_postfix(Epoch=cur_epc, Loss=training_loss/(step+1))
        if cur_epc % 2 == 0:
            model_save_path = os.path.join(save_path,'epoch%d'%cur_epc)
            if not os.path.exists(model_save_path):
                os.makedirs(model_save_path)
            model.save_pretrained(model_save_path)                              
            print(f'save at {save_path}')
        print("Training loss: ", training_loss)

## 主程序

In [14]:
data = get_train_data(CFG.train_file)
print(f'load from {CFG.train_file} len={len(data)}')

tokenizer = AutoTokenizer.from_pretrained(CFG.model)
print('原始词表大小=', len(tokenizer))

tokenizer.save_pretrained('./mask_roberta_saved/')
train_dataset = TrainDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset)

bert_mlm_model = BertForMaskedLM.from_pretrained(CFG.model)
print(f'load PTM from {CFG.model}')

load from ../../nlp_data/mlm.txt len=141282
原始词表大小= 21128
load PTM from /home/zyj/PTMs/chinese-roberta-wwm-ext/


In [35]:
train(model=bert_mlm_model, train_dataloader=train_dataloader, save_path='./mask_ernie_saved/')

Epoch: 0


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_ernie_saved/
Training loss:  15245.878112077713
Epoch: 1


  0%|          | 0/4415 [00:00<?, ?it/s]

Training loss:  9509.89437174797
Epoch: 2


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_ernie_saved/
Training loss:  8333.385088205338
Epoch: 3


  0%|          | 0/4415 [00:00<?, ?it/s]

Training loss:  7682.473536014557
Epoch: 4


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_ernie_saved/
Training loss:  7239.4794890880585
Epoch: 5


  0%|          | 0/4415 [00:00<?, ?it/s]

Training loss:  6921.462135791779
Epoch: 6


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_ernie_saved/
Training loss:  6668.978135585785
Epoch: 7


  0%|          | 0/4415 [00:00<?, ?it/s]

Training loss:  6461.259273648262
Epoch: 8


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_ernie_saved/
Training loss:  6294.676764726639
Epoch: 9


  0%|          | 0/4415 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
train(model=bert_mlm_model, train_dataloader=train_dataloader, save_path='./mask_roberta_saved/')

Epoch: 0


  0%|          | 0/4357 [00:00<?, ?it/s]

save at ./mask_roberta_saved/
Training loss:  4010.3472693562508
Epoch: 1


  0%|          | 0/4415 [00:00<?, ?it/s]

Training loss:  3905.3408071398735
Epoch: 2


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_roberta_saved/
Training loss:  3829.033541381359
Epoch: 3


  0%|          | 0/4415 [00:00<?, ?it/s]

Training loss:  3769.4380006194115
Epoch: 4


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_roberta_saved/
Training loss:  3720.1822280287743
Epoch: 5


  0%|          | 0/4415 [00:00<?, ?it/s]

Training loss:  3682.223714828491
Epoch: 6


  0%|          | 0/4415 [00:00<?, ?it/s]

save at ./mask_roberta_saved/
Training loss:  3644.104201734066
Epoch: 7


  0%|          | 0/4415 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



save at ./mask_roberta_saved/
Training loss:  3585.7946223020554
Epoch: 9


  0%|          | 0/4415 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

