In [2]:
import os
import random
from typing import List
import jsonlines
import json
import pandas as pd
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import transformers
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig, BertTokenizer
# from torchmetrics import MetricCollection, Accuracy, Precision, Recall, F1Score
from sklearn.metrics import mean_squared_error, classification_report, f1_score
from tqdm.auto import tqdm
from scipy.special import softmax
from scipy import stats

transformers.logging.set_verbosity_error()

## 参数

In [3]:
class CFG:
    apex=True
    num_workers=0
    test_file = '../../nlp_data/test.txt'
    base_bert_path = "/home/zyj/PTMs/ernie-gram-zh/" 
    large_bert_path = "/home/zyj/PTMs/chinese-roberta-wwm-ext-large/"
    classify_model = './newbaseline/roberta-large_all_saved/'
    labeling_model = './seqLabeling/roberta-large_len3_random_saved/'
#     labeling_model = './seqLabeling/model_saved/allData_roberta-large_batchWeight/'
    max_len=512
    save_path = './output.txt'

    # max_grad_norm=1000  

os.environ["CUDA_VISIBLE_DEVICES"] = '1'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
# DEVICE = torch.device('cpu') 

# # 预训练模型目录
# base_tokenizer = AutoTokenizer.from_pretrained(CFG.base_bert_path)
# # large_tokenizer = AutoTokenizer.from_pretrained(CFG.large_bert_path)



## 模型定义

### SentenceClassifier

In [4]:
class SentenceClassifier(nn.Module):
    def __init__(self, cfg, config_path=None, model_path=False):
        super().__init__()
        self.cfg = cfg
        self.config = torch.load(config_path)
        self.model = AutoModel.from_config(self.config)
        self.model.load_state_dict(torch.load(model_path), map_location=torch.device('cuda'))
        self.model.cuda()
        print(f'load model from {model_path}')
        
        self.fc = nn.Linear(self.config.hidden_size, 5)
        self._init_weights(self.fc)
        self.drop1=nn.Dropout(0.1)
        self.drop2=nn.Dropout(0.2)
        self.drop3=nn.Dropout(0.3)
        self.drop4=nn.Dropout(0.4)
        self.drop5=nn.Dropout(0.5)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = torch.mean(outputs[0], axis=1)
        return last_hidden_states
    
    def loss(self,logits,labels):

        loss_fnc = nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([2,1,0.5,1,3])).float() ,
                                        size_average=True).cuda()

        loss = loss_fnc(logits, labels)
        return loss

    def forward(self, inputs, labels=None, training=True):
        feature = self.feature(inputs)
        if  training:
            logits1 = self.fc(self.drop1(feature))
            logits2 = self.fc(self.drop2(feature))
            logits3 = self.fc(self.drop3(feature))
            logits4 = self.fc(self.drop4(feature))
            logits5 = self.fc(self.drop5(feature))
            _loss=0
            if labels is not None:
                loss1 = self.loss(logits1,labels)
                loss2 = self.loss(logits2,labels)
                loss3 = self.loss(logits3,labels)
                loss4 = self.loss(logits4,labels)
                loss5 = self.loss(logits5,labels)
                _loss = (loss1 + loss2 + loss3 + loss4 + loss5)/5
            return _loss
        else:
            output = self.fc(feature)
            output = F.softmax(output, dim=1)
            return output
       

### SeqLabeling

In [5]:
class SeqLabeling(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.base_bert_path, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            print('='*10+f' load PTM from {cfg.base_bert_path} '+'='*10)
            self.model = AutoModel.from_pretrained(cfg.base_bert_path, config=self.config)
        else:
            print(self.config)
            self.model = AutoModel.from_config(self.config)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.drop1=nn.Dropout(0.1)
        self.drop2=nn.Dropout(0.2)
        self.drop3=nn.Dropout(0.3)
        self.drop4=nn.Dropout(0.4)
        self.drop5=nn.Dropout(0.5)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def loss(self,logits,labels,weights):
        # loss_fnc = FocalLoss(6)
        # loss_fnc = nn.CrossEntropyLoss()

        # loss_fnc = nn.CrossEntropyLoss(ignore_index=0)

        # loss_fnc = nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([0.1, 2, 1, 0.5, 1, 3])).float() ,
        #                                 size_average=True).cuda()
        loss_fnc = nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([0.1, 2, 1, 0.5, 1, 3])).float() ,
                                        size_average=True,
                                        reduction='none').cuda()
        # loss_fnc = nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([0.1, 2, 1, 0.5, 1, 3])).float() ,
        #                                 size_average=True,
        #                                 ignore_index=0).cuda()
        # loss_fnc = DiceLoss(smooth = 1, square_denominator = True, with_logits = True,  alpha = 0.01 )
        loss = loss_fnc(logits, labels)

        loss = (loss * weights).mean()
        return loss

    def forward(self, inputs, labels=None, weights=None, training=True):
        feature = self.model(**inputs)[0]
        # out = self.model(**inputs)
        # # first-last-avg
        # avg = torch.cat((out.hidden_states[1].unsqueeze(2), 
        #                  out.hidden_states[-1].unsqueeze(2)), dim=2)                # [batch, seq, 2, emd_dim]
        # feature = F.avg_pool2d(avg.transpose(2, 3),kernel_size=(1,2)).squeeze(-1)   # [batch, seq, emd_dim]

        if  training:
            logits1 = self.fc(self.drop1(feature))
            logits2 = self.fc(self.drop2(feature))
            logits3 = self.fc(self.drop3(feature))
            logits4 = self.fc(self.drop4(feature))
            logits5 = self.fc(self.drop5(feature))
            _loss=0
            if labels is not None:
                loss1 = self.loss(logits1.permute(0, 2, 1), labels, weights)
                loss2 = self.loss(logits2.permute(0, 2, 1), labels, weights)
                loss3 = self.loss(logits3.permute(0, 2, 1), labels, weights)
                loss4 = self.loss(logits4.permute(0, 2, 1), labels, weights)
                loss5 = self.loss(logits5.permute(0, 2, 1), labels, weights)
                _loss = (loss1 + loss2 + loss3 + loss4 + loss5) / 5
                # _loss = loss3
            return _loss
        else:
            output = self.fc(feature)
            # output = F.softmax(output, dim=1)
            return output


## formater

### BM25_Model

In [6]:
class BM25_Model(object):
    def __init__(self, documents_list, k1=2, k2=1, b=0.75):
        self.documents_list = documents_list
        self.documents_number = len(documents_list)
        self.avg_documents_len = sum([len(document) for document in documents_list]) / self.documents_number
        self.f = []
        self.idf = {}
        self.k1 = k1
        self.k2 = k2
        self.b = b
        self.init()

    def init(self):
        df = {}
        for document in self.documents_list:
            temp = {}
            for word in document:
                temp[word] = temp.get(word, 0) + 1
            self.f.append(temp)
            for key in temp.keys():
                df[key] = df.get(key, 0) + 1
        for key, value in df.items():
            self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5))

    def get_score(self, index, query):
        score = 0.0
        document_len = len(self.f[index])
        qf = Counter(query)
        for q in query:
            if q not in self.f[index]:
                continue
            score += self.idf[q] * (self.f[index][q] * (self.k1 + 1) / (
                    self.f[index][q] + self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len))) * (
                             qf[q] * (self.k2 + 1) / (qf[q] + self.k2))

        return score

    def get_documents_score(self, query):
        score_list = []
        for i in range(self.documents_number):
            score_list.append(self.get_score(i, query))
        return score_list
    
def split_sentence(content:str):
    for each in '。；！!?？':
        content = content.replace(each, each+'##')
    return content.split('##')

def bm25_sample(content, query, augment=1, length=512):
    """
    bm25相似度打分，然后进行截断，使其<=length
    :param query:
    :param content:
    :param length:
    :return:
    """

    if len(content) <= length:
        return [content]
    else:
        document_list = split_sentence(content.strip())
        rest_document_list = list()
        for document in document_list:
            if len(document) != 0:
                rest_document_list.append(document)

        document_list = rest_document_list
        model = BM25_Model(document_list)
        scores = model.get_documents_score(query)
        index_scores = []
        for index, score_i in enumerate(scores):
            index_scores.append((index, score_i))

        index_scores.sort(key=lambda index_score: index_score[1], reverse=True)
        
        save_document = [0] * len(document_list)
        content_length = 0
        
        for item in index_scores:
            index = item[0]
            save_document[index] = 1
            if content_length + len(document_list[index]) > length:
                break
            else:
                content_length += len(document_list[index])
        if augment ==1: # 不进行数据增强
            new_content = ""
            for i, save in enumerate(save_document):
                if save != 0:
                    new_content += document_list[i]
            # print(len(new_content),'|',new_content)
            return [new_content]

### sample_context_by_list

In [7]:
def merge_idx(idxArr, span, content):
    assert len(idxArr) >= 1
    if len(idxArr)==1:
        return content[max(0,idxArr[0]-span) : min(len(content),idxArr[0]+span)]
    i = 0
    ret = []
    while True:
        if i>=len(idxArr):break
        temp_i = i
        for j in range(i+1,len(idxArr)):
            if idxArr[j]-idxArr[temp_i] > 2*span:
                temp_i = j-1
                break
            else:
                temp_i = j
        ret.append(content[max(0,idxArr[i]-span) : min(len(content),idxArr[temp_i]+span)])    
        i = temp_i+1
    return '#'.join(ret)
            
def sample_context_by_list(entitys:list, content:str, length:int):
    '''
    通过entity列表筛选content中对应每个实体位置的前后文
    '''
    cnt = 0
    for entity in entitys:
        cnt += content.count(entity)
    if cnt == 0 or len(content)<=length:
        return content
    span = int(length/cnt/2)
    idxArr = []
    for entity in entitys:
        idx = content.find(entity,0)
        while idx != -1:
            idxArr.append(idx)
            idx = content.find(entity,idx+1)
    idxArr = sorted(idxArr)
    result = merge_idx(idxArr, span, content)
    return result

### SentenceClassifier Formater

In [8]:
def sc_prepare_input(text, feature_text):
    inputs = base_tokenizer(text, feature_text, 
                               add_special_tokens=True,
                               truncation = True,
                               max_length=CFG.max_len,
                               padding="max_length",
                               return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs

def sc_formater(dic:dict):
    ret = []
    raw_contents = dic['content']
    raw_entitys = dic['entity']
    for entity in raw_entitys:
        raw_content = raw_contents.strip()
        # text = sample_context(entity, text, CFG.max_len-20)
        texts = bm25_sample(raw_content, entity, length=CFG.max_len-len(entity))
        text = texts[0]
        inputs = sc_prepare_input(text, entity)
        ret.append(inputs)
    return ret

### SeqLabeling Formater

In [9]:
def maxtch_token(token_ids:list, sentence_ids:list):
    # 得到实体的token list在句子的所有开始位置
    ret = []
    startId = token_ids[0]
    for idx, candId in enumerate(sentence_ids):
        if candId == startId and sentence_ids[idx:idx+len(token_ids)] == token_ids:
            ret.append(idx)
    assert len(ret) > 0
    return ret

def tag_entity_span(entity_startIndexs:list, entityLen:int, tag:int, targets:list):
    for index in entity_startIndexs:
        for span in range(entityLen):
            targets[index+span] = tag
    return targets

def getTrainEntityInfo(entityDic, TOKENIZER):
    # entityDic{实体名:标签}
    tagDic = {}  # {实体名:[实体ids, 实体情感标签, 实体在原字典中的次序, 实体ids长度]}
    for idx, (entity,label) in enumerate(entityDic.items()):
        entityIds = TOKENIZER(entity).input_ids[1:-1]
        tagDic[entity] = [
            entityIds,
            int(label)+3,  #-2~2 => 1~5
            idx+1, len(entityIds)
        ]
    '''
    按实体ids长度从短到长排序，后面标注时若出现嵌套实体，会先标注短实体，然后标注长实体覆盖短实体标签
    '''
    return sorted(tagDic.items(), key=lambda x:x[1][-1])

def getTestEntityInfo(entityArr, TOKENIZER):
    # entityArr[实体名]
    tagDic = {}  # {实体名:[实体ids, 实体在原字典中的次序, 实体ids长度]}
    for idx, entity in enumerate(entityArr):
        entityIds = TOKENIZER(entity).input_ids[1:-1]
        tagDic[entity] = [
            entityIds,
            idx+1, len(entityIds)
        ]
    '''
    按实体ids长度从短到长排序，后面标注时若出现嵌套实体，会先标注短实体，然后标注长实体覆盖短实体标签
    '''
    return sorted(tagDic.items(), key=lambda x:x[1][-1])


def SL_formater(entityArr, text, TOKENIZER):
    entity_content = '、'.join(entityArr)
    text = sample_context_by_list(entityArr, text, CFG.max_len-len(entity_content))
    inputs = TOKENIZER(text, entity_content, 
               add_special_tokens=True,
               truncation = True,
               max_length=CFG.max_len,
               padding="max_length",
               return_offsets_mapping=False)

    labels_ids = [0] * len(inputs.input_ids)
    entityInfoItems = getTestEntityInfo(entityArr, TOKENIZER)
    for entity, info in entityInfoItems:
#         print(1111, entity, info)
        entity_startIndexs = maxtch_token(info[0], inputs.input_ids)
        labels_ids = tag_entity_span(entity_startIndexs, info[-1], info[1], labels_ids) # 标注labels_ids序列

    assert len(inputs['input_ids']) == len(labels_ids)

    # 转换为tensor
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    labels_ids = torch.tensor(labels_ids, dtype=torch.long)

    return inputs, labels_ids

### MaskSeqLabel Formater

In [10]:
def mask_SL_formater(entityArr, text, TOKENIZER):
    
    text = sample_context_by_list(entityArr, text, CFG.max_len)
    # 保证每个实体出现在文本中
    text = '你对%s怎么看？' % '、'.join(entityArr) + text
    
    entitys = []
    temp = {}
    for i,entity in enumerate(entityArr):
        key = '[et%d]' % i
        entitys.append(key)
        temp[entity] = len(entity)
    temp = sorted(temp.items(), key=lambda x:-x[1]) # 实体按长度排序，避免长词包含短词的情况

    for idx, item in enumerate(temp):
        key = '[et%d]' % idx
        text = text.replace(item[0], key) # 替换原实体
        
    inputs = TOKENIZER(text,
               add_special_tokens=True,
               truncation = True,
               max_length=CFG.max_len,
               padding="max_length",
               return_offsets_mapping=False)

    idDic = {}  # label_id
    label_ids = []
    for idx, entity in enumerate(entitys):
        idDic[tokenizer(entity).input_ids[1]] = idx+1
    for each in inputs.input_ids:
        if each in idDic:
            label_ids.append(idDic[each])  
        else:
            label_ids.append(0) 
#     print(text)
#     print(entitys)
#     print(inputs.input_ids)
#     en_cnt = 0
#     for each in label_ids:
#         if each :
#             en_cnt += 1
#     print(en_cnt, label_ids)
    label_ids = torch.tensor(label_ids, dtype=torch.long)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    
    return inputs, label_ids

In [11]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

## inference

### seqLabel模型

In [40]:
# CFG.base_bert_path = '/home/zyj/PTMs/albert_chinese_base/'
CFG.base_bert_path = '/home/zyj/PTMs/chinese-roberta-wwm-ext/'
# CFG.base_bert_path = '/home/zyj/PTMs/ernie-gram-zh/'
path1 = './seqLabeling/final_roberta_epoch16_saved/'
model = SeqLabeling(CFG, config_path=path1+'config.pth', pretrained=True)
model.to(DEVICE)
model_path = os.path.join(path1, f"model_fold0_best.bin")
print(f'=========== load model from {model_path} ===========')
model.load_state_dict(torch.load(model_path,map_location=torch.device('cuda')))
model.eval()
logits_path = path1 + 'valid.npy'



In [41]:
cnt = 0
tokenizer = BertTokenizer.from_pretrained(CFG.base_bert_path)
raw_logits = [] # [样本数, 实体数, 5]
with open('./section1.txt', 'w') as fw:
    with open('../nlp_data/final/valid.mix.txt', 'r') as f:
#     with open('../nlp_data/newTest.txt', 'r') as f:
#     with open('../Recommendation/data/rec_data/all_content.txt', 'r') as f:
        lines = f.readlines()
        for line in tqdm(lines):
            cnt += 1
            dic = json.loads(line.strip())
            entityLen = len(dic['entity'])
            if type(dic['entity']) == dict:
                entityArr = list(dic['entity'].keys())
            elif type(dic['entity']) == list:
                entityArr = dic['entity']
            else:
                print('type error!')
            index = 0
#             print(dic['entity'])
            inputs, label_ids = SL_formater(dic['entity'], dic['content'], tokenizer)
            for k, v in inputs.items():
                inputs[k] = v.unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                with autocast():
                    pred_logits = model(inputs,training=False)
            pred_logits = pred_logits.squeeze(0).detach().cpu().numpy()
            tmp_result = {}
#             print(111, pred_logits.shape,max(label_ids))
#             print(111, label_ids)
            sample_logits = []  # [样本实体数, 5]
            for i in range(1, max(label_ids)+1):
                ind = np.where(label_ids==i)
                logits = pred_logits[ind]   # [实体字数, 6] 
                logits = logits[:,1: ]      # [实体字数, 5] 
                logits = softmax(logits, axis=-1) 
                
                sample_logits.append(np.mean(logits, axis=0))
                label = np.mean(logits, axis=0).argmax()-2
#                 print(dic['entity'][i], label)
                tmp_result[entityArr[i-1]] = label
                index += 1
            raw_logits.append(sample_logits)
#             print(1111, str(dic['id']), tmp_result)
            fw.write(str(dic['id']) + '	' + json.dumps(tmp_result, ensure_ascii=False, cls=NpEncoder) + '\n')
#             if cnt==5:
#                 break
np.save(logits_path, np.array(raw_logits))

  0%|          | 0/5789 [00:00<?, ?it/s]



### mask SeqLabel模型

In [55]:
CFG.base_bert_path = '/home/zyj/PTMs/chinese-roberta-wwm-ext-large/'
# path1 = './seqLabeling/model_saved/allData_roberta-large_batchWeight/'
# CFG.base_bert_path = '/home/zyj/PTMs/ernie-gram-zh/'
path1 = './seqLabeling/mask_final_roberta_epoch16_saved/'
model = SeqLabeling(CFG, config_path=path1+'config.pth', pretrained=False)
model.to(DEVICE)
model_path = os.path.join(path1, f"model_fold0_best.bin")
print(f'=========== load model from {model_path} ===========')
model.load_state_dict(torch.load(model_path,map_location=torch.device('cuda')))
model.eval()
logits_path = path1 + 'valid.npy'

BertConfig {
  "_name_or_path": "/home/zyj/sohu/SentimentClassification/domainAdaption/mask_roberta_saved/epoch16/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21158
}



In [56]:
# 载入预训练模型的分词器
tokenizer = AutoTokenizer.from_pretrained(CFG.base_bert_path)

print('原始词表大小=', len(tokenizer))
characters=[]
for i in range(30):
    characters.append('[et%d]' % i )
tokenizer.add_tokens(characters)
print('当前词表大小=',len(tokenizer))

原始词表大小= 21128
当前词表大小= 21158


In [58]:
cnt = 0
raw_logits = [] # [样本数, 实体数, 5]
with open('./section1.txt', 'w') as fw:
    with open('../nlp_data/final/valid.mix.txt', 'r') as f:
#     with open('../nlp_data/newTest.txt', 'r') as f:
        lines = f.readlines()
#         for line in tqdm(random.sample(lines,1)):
        for line in tqdm(lines):
            cnt += 1
            dic = json.loads(line.strip())
            entityLen = len(dic['entity'])
            if type(dic['entity']) == dict:
                entityArr = list(dic['entity'].keys())
            elif type(dic['entity']) == list:
                entityArr = dic['entity']
            else:
                print('type error!')
            index = 0
#             print(len(dic['content']), dic['content'],'\n')
            inputs, label_ids = mask_SL_formater(dic['entity'], dic['content'], tokenizer)
            for k, v in inputs.items():
                inputs[k] = v.unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                with autocast():
                    pred_logits = model(inputs,training=False)
            pred_logits = pred_logits.squeeze(0).detach().cpu().numpy()
            tmp_result = {}
#             print(111, pred_logits.shape,max(label_ids))
#             print(111, label_ids)
            sample_logits = []  # [样本实体数, 5]
            for i in range(1, max(label_ids)+1):
                ind = np.where(label_ids==i)
                logits = pred_logits[ind]   # [实体字数, 6] 
                logits = logits[:,1: ]      # [实体字数, 5] 
                logits = softmax(logits, axis=-1) 
                
                sample_logits.append(np.mean(logits, axis=0))
                label = np.mean(logits, axis=0).argmax()-2
#                 print(dic['entity'][i], label)
                tmp_result[entityArr[i-1]] = label
                index += 1
            raw_logits.append(sample_logits)
#             print(1111, str(dic['id']), tmp_result)
            fw.write(str(dic['id']) + '	' + json.dumps(tmp_result, ensure_ascii=False, cls=NpEncoder) + '\n')
#             if cnt==5:
#                 break
np.save(logits_path, np.array(raw_logits))

  0%|          | 0/5789 [00:00<?, ?it/s]



### 计算指标

In [80]:
import json
import random
import pandas as pd
import numpy as np

def loadNP(path):
    ret = []
    logits = np.load(path,allow_pickle=True)
    for each in logits:
        ret.append(np.array(each))
    return np.array(ret)
model_names = ['final_ernie_saved', 'final_ernie_epoch8_saved', 'final_roberta_saved', 'final_roberta_epoch16_saved',
              'mask_final_ernie_saved', 'mask_final_ernie_epoch8_saved', 'mask_final_roberta_saved', 'mask_final_roberta_epoch16_saved']
single_model_num = len(model_names)
logitsArr = []
for name in model_names:
    path = f'./seqLabeling/{name}/valid.npy'
    logitsArr.append(loadNP(path))

l1 = 0.5*logitsArr[0]+0.5*logitsArr[3]
l2 = 0.5*logitsArr[5]+0.5*logitsArr[7]
l3 = 0.9*l1+0.1*l2
l4 = 0.8*l1+0.2*l2
model_names.append('seqEnsemble1')
# model_names.append('seqEnsemble2')
model_names.append('maskEnsemble1')
# model_names.append('maskEnsemble2')
model_names.append('allEnsemble1')
model_names.append('allEnsemble2')
logitsArr.extend([l1, l2, l3, l4])

  # This is added back by InteractiveShellApp.init_path()


In [81]:
truths = []
data = '../nlp_data/final/valid.mix.txt'
with open(data, 'r') as f:
    for line in f.readlines():
        dic = json.loads(line.strip())
        truths.extend(list(dic['entity'].values()))

for idx, logits in enumerate(logitsArr):
    preds = []
    for entity_logits in logits:
        for logit in entity_logits:
            preds.append(logit.argmax()-2)
    print(model_names[idx], '\t', f1_score(truths, preds,average='macro'))
#     print(classification_report(truths, preds))
votes = []
print('单模数量 =', single_model_num)
for i in range(len(logitsArr[0])):
    for j in range(len(logitsArr[0][i])):
        temp = []
        for k in range(single_model_num):
            label = logitsArr[k][i][j].argmax()-2
            temp.append(label)
        retult = stats.mode(temp)[0][0]
        votes.append(retult)
print('vote', '\t', f1_score(truths, votes,average='macro'))   


final_ernie_saved 	 0.7618706372318993
final_ernie_epoch8_saved 	 0.7609359471499404
final_roberta_saved 	 0.764467757109265
final_roberta_epoch16_saved 	 0.7660353209646705
mask_final_ernie_saved 	 0.7491064837306081
mask_final_ernie_epoch8_saved 	 0.7506124828787996
mask_final_roberta_saved 	 0.7317627257159598
mask_final_roberta_epoch16_saved 	 0.7400643683468536
seqEnsemble1 	 0.783115017661839
maskEnsemble1 	 0.759238357322033
allEnsemble1 	 0.7844019167626832
allEnsemble2 	 0.7817128350934827
单模数量 = 8
vote 	 0.7838416106311322


### Case study

In [31]:
data = '../nlp_data/newTrain.txt'
lines = []
idx = 0
with open(data, 'r') as f:
    for line in f.readlines():
        lines.append([idx, line])
        idx += 1
lines  = pd.DataFrame(lines, columns=['idx', 'content'])
lines

Unnamed: 0,idx,content
0,0,"{""id"": 6000001, ""content"": ""看来大家对于《余生请多指教》还是信心..."
1,1,"{""id"": 6000002, ""content"": ""阳性感染者92（女，29岁），后经市..."
2,2,"{""id"": 6000003, ""content"": ""为进一步完善反洗钱监管制度，提高反洗..."
3,3,"{""id"": 6000004, ""content"": ""1、锅中水烧开，放入面条，煮至九成熟..."
4,4,"{""id"": 6000005, ""content"": ""导语：2022年，翻身财运旺，好运随..."
...,...,...
13288,13288,"{""id"": 6013289, ""content"": ""生肖羊的人外表会感觉忠厚老实，其实他..."
13289,13289,"{""id"": 6013290, ""content"": ""韩国女团APink组合的女星孙娜恩在..."
13290,13290,"{""id"": 6013291, ""content"": ""我现在是不是应该多给他鼓励，不要逼他..."
13291,13291,"{""id"": 6013292, ""content"": ""8、一个人要赢得另一个人很容易，那就..."


In [65]:
change1 = 0
gaizheng1 = 0
change2 = 0
gaizheng2 = 0
for item in lines.sample(100).itertuples():
# for item in lines.itertuples():
    dic = json.loads(item.content.strip())
    idx = item.idx
#     print(dic)
    id = dic['id']
    tempA = []
    tempB = []
    tempC = []
    flag = False
    for i,(entity,label) in enumerate(dic['entity'].items()):
        a = logits1[idx][i].argmax()-2
        b = logits2[idx][i].argmax()-2
        c = (0.7*logits1[idx][i]+0.3*logits2[idx][i]).argmax()-2
        tempA.append(a)
        tempB.append(b)
        tempC.append(c)
        if a != label:
            change1 += 1
            if b == label:
                gaizheng1 += 1
        if b != label:
            change2 += 1
            if a == label:
                gaizheng2 += 1
        if abs(a-label)>=1 and abs(b-label)==0:
            flag = True
    if flag :
        print(len(dic['content']), dic['content'], dic['entity'], tempA, tempB, tempC,sep='\n')
            
print(change1, gaizheng1)
print(change2, gaizheng2)


84
问及拥有38000名成员的山口组为何出现在许老的葬礼上，山口组则表示：“我们是来表达对蚊哥（许海清）的尊敬。”山口组在日本横行，在许海清面前却还是很恭敬的自称“小弟”。
{'许海清': 0}
[1]
[0]
[0]
457
孙卓的生日可谓是意义非凡，自从成为了打拐明星后，孙海洋一家的动态就格外受到网友们的关注，孙海洋和孙妈妈也很乐于分享家里的快乐，但很多时候，网友们过分的热情给一家人也带来了不小的困扰。近期，有网友因孙海洋一家不开直播打赏不带货，选择直接用转账的方式对孙海洋一家进行打赏，希望孙海洋能够快点换上一个大房子，给三个孩子更优质的生活。在陆续收到网友们的转账后，孙海洋吓得连夜紧急关闭转账功能。其实这段时间对于孙海洋来说，还有很多东西需要进行整理，官司还没开始，想要胜诉多年积累的资料还需要进一步的归纳提炼，这毕竟会是一场持久战。这次直播中，孙海洋回应孙卓在学校很适应，但现在对孙卓的态度已经有了360度的转变，不光给孙卓起了一个新的称呼，从原来的卓总到卓卓，小卓，现在已经成了憨卓，恭喜卓总喜提新称号。甚至搬家的新房中，只有他的屋没有空调，家庭地位一落千丈，可以见得孙卓现在是真的适应了。但孙海洋也是真的飘了，拿起了一家之主的风范，程孙卓现在一切都好，就是有点小的问题，治一下就好了，所有的问题就是孙卓现在还不喜欢做家务。
{'孙海洋': 0, '孙卓': 0}
[-1, -1]
[-1, 0]
[-1, 0]
418
2010年，姜文投资1.3亿拍《让子弹飞》，妻子周韵向他推荐祖峰，但姜文却不同意：他没有流量，找他演商业片，是要赔钱的。 面对姜文的拒绝，周韵依然不放弃，还是坚持想让祖峰来出演：我和他有过合作，他的演技我很清楚，是非常不错的，要不你再考虑考虑。 是的，周韵曾和祖峰合作过电视剧《金婚风雨情》，两人也因此相识，祖峰精湛的演技给周韵留下了深刻的印象。 所以在姜文筹拍《让子弹飞》时，周韵首先想到了祖峰，并向丈夫力荐他。站在周韵的角度，她首先考虑的是演技，但作为导演的姜文却首要考虑的是片子能不能赚到钱，演员有没有票房号召力。 最终姜文还是选择了既有实力又有流量的周润发和葛优，祖峰知道后，沮丧地称：感觉就像是失恋了。 其实并不是祖峰不好，只能说是姜文不敢轻易冒这个险，来启用祖峰。 在当下演艺圈，祖峰是大器晚成的演员，也是以精湛的演技和低调的品行著称的实力派演

## 输出

In [72]:
with open('./section1.txt', 'w') as fw:
    fw.write("id	result\n")
    with open('../nlp_data/newTest.txt', 'r') as f:
        lines = f.readlines()
        for i, line in enumerate(tqdm(lines)):
            dic = json.loads(line.strip())
            entityLen = len(dic['entity'])
            tmp_result = {}
            for j in range(entityLen):
                label = l3[i][j].argmax()-2
                tmp_result[dic['entity'][j]] = label
#             print(dic)
#             print(tmp_result)
            fw.write(str(dic['id']) + '	' + json.dumps(tmp_result, ensure_ascii=False, cls=NpEncoder) + '\n')
#             break

  0%|          | 0/12028 [00:00<?, ?it/s]