In [1]:
import os
import random
from typing import List
import jsonlines
import json
import pandas as pd
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import transformers
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig, BertTokenizer
from sklearn.metrics import mean_squared_error, classification_report, f1_score
from scipy.special import softmax

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm.notebook import tqdm
from deepctr_torch.inputs import SparseFeat, get_feature_names, VarLenSparseFeat, DenseFeat
from pprint import pprint

In [2]:
class CFG:
    apex=True
    num_workers=0
    test_file = '../nlp_data/newTest.txt'
    seq_model1 = "./nlp_final_model/final_ernie_saved/" 
    seq_model2 = "./nlp_final_model/final_roberta_epoch16_saved/" 
    mask_model1 = "./nlp_final_model/mask_final_ernie_epoch8_saved/" 
    mask_model2 = "./nlp_final_model/mask_final_roberta_epoch16_saved/" 
    max_len=512
    
    # rec
    rec_test = '../Recommendation/data/rec_data/newTest-dataset.csv'
    rec_model = './DIFM.h5'


os.environ["CUDA_VISIBLE_DEVICES"] = '0'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
device = 'cuda'
# DEVICE = torch.device('cpu') 
# device = 'cpu'

# NLP

## sample_context_by_list

In [3]:
def merge_idx(idxArr, span, content):
    assert len(idxArr) >= 1
    if len(idxArr)==1:
        return content[max(0,idxArr[0]-span) : min(len(content),idxArr[0]+span)]
    i = 0
    ret = []
    while True:
        if i>=len(idxArr):break
        temp_i = i
        for j in range(i+1,len(idxArr)):
            if idxArr[j]-idxArr[temp_i] > 2*span:
                temp_i = j-1
                break
            else:
                temp_i = j
        ret.append(content[max(0,idxArr[i]-span) : min(len(content),idxArr[temp_i]+span)])    
        i = temp_i+1
    return '#'.join(ret)
            
def sample_context_by_list(entitys:list, content:str, length:int):
    '''
    通过entity列表筛选content中对应每个实体位置的前后文
    '''
    cnt = 0
    for entity in entitys:
        cnt += content.count(entity)
    if cnt == 0 or len(content)<=length:
        return content
    span = int(length/cnt/2)
    idxArr = []
    for entity in entitys:
        idx = content.find(entity,0)
        while idx != -1:
            idxArr.append(idx)
            idx = content.find(entity,idx+1)
    idxArr = sorted(idxArr)
    result = merge_idx(idxArr, span, content)
    return result

## SeqLabeling Formater

In [4]:
def maxtch_token(token_ids:list, sentence_ids:list):
    # 得到实体的token list在句子的所有开始位置
    ret = []
    startId = token_ids[0]
    for idx, candId in enumerate(sentence_ids):
        if candId == startId and sentence_ids[idx:idx+len(token_ids)] == token_ids:
            ret.append(idx)
    assert len(ret) > 0
    return ret

def tag_entity_span(entity_startIndexs:list, entityLen:int, tag:int, targets:list):
    for index in entity_startIndexs:
        for span in range(entityLen):
            targets[index+span] = tag
    return targets

def getTrainEntityInfo(entityDic, TOKENIZER):
    # entityDic{实体名:标签}
    tagDic = {}  # {实体名:[实体ids, 实体情感标签, 实体在原字典中的次序, 实体ids长度]}
    for idx, (entity,label) in enumerate(entityDic.items()):
        entityIds = TOKENIZER(entity).input_ids[1:-1]
        tagDic[entity] = [
            entityIds,
            int(label)+3,  #-2~2 => 1~5
            idx+1, len(entityIds)
        ]
    '''
    按实体ids长度从短到长排序，后面标注时若出现嵌套实体，会先标注短实体，然后标注长实体覆盖短实体标签
    '''
    return sorted(tagDic.items(), key=lambda x:x[1][-1])

def getTestEntityInfo(entityArr, TOKENIZER):
    # entityArr[实体名]
    tagDic = {}  # {实体名:[实体ids, 实体在原字典中的次序, 实体ids长度]}
    for idx, entity in enumerate(entityArr):
        entityIds = TOKENIZER(entity).input_ids[1:-1]
        tagDic[entity] = [
            entityIds,
            idx+1, len(entityIds)
        ]
    '''
    按实体ids长度从短到长排序，后面标注时若出现嵌套实体，会先标注短实体，然后标注长实体覆盖短实体标签
    '''
    return sorted(tagDic.items(), key=lambda x:x[1][-1])


def SL_formater(entityArr, text, TOKENIZER):
    entity_content = '、'.join(entityArr)
    text = sample_context_by_list(entityArr, text, CFG.max_len-len(entity_content))
    inputs = TOKENIZER(text, entity_content, 
               add_special_tokens=True,
               truncation = True,
               max_length=CFG.max_len,
               padding="max_length",
               return_offsets_mapping=False)

    labels_ids = [0] * len(inputs.input_ids)
    entityInfoItems = getTestEntityInfo(entityArr, TOKENIZER)
    for entity, info in entityInfoItems:
#         print(1111, entity, info)
        entity_startIndexs = maxtch_token(info[0], inputs.input_ids)
        labels_ids = tag_entity_span(entity_startIndexs, info[-1], info[1], labels_ids) # 标注labels_ids序列

    assert len(inputs['input_ids']) == len(labels_ids)

    # 转换为tensor
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    labels_ids = torch.tensor(labels_ids, dtype=torch.long)

    return inputs, labels_ids

##  MaskSeqLabel Formater

In [5]:
def mask_SL_formater(entityArr, text, TOKENIZER):
    
    text = sample_context_by_list(entityArr, text, CFG.max_len)
    # 保证每个实体出现在文本中
    text = '你对%s怎么看？' % '、'.join(entityArr) + text
    
    entitys = []
    temp = {}
    for i,entity in enumerate(entityArr):
        key = '[et%d]' % i
        entitys.append(key)
        temp[entity] = len(entity)
    temp = sorted(temp.items(), key=lambda x:-x[1]) # 实体按长度排序，避免长词包含短词的情况

    for idx, item in enumerate(temp):
        key = '[et%d]' % idx
        text = text.replace(item[0], key) # 替换原实体
        
    inputs = TOKENIZER(text,
               add_special_tokens=True,
               truncation = True,
               max_length=CFG.max_len,
               padding="max_length",
               return_offsets_mapping=False)

    idDic = {}  # label_id
    label_ids = []
    for idx, entity in enumerate(entitys):
        idDic[TOKENIZER(entity).input_ids[1]] = idx+1
    for each in inputs.input_ids:
        if each in idDic:
            label_ids.append(idDic[each])  
        else:
            label_ids.append(0) 
#     print(text)
#     print(entitys)
#     print(inputs.input_ids)
#     en_cnt = 0
#     for each in label_ids:
#         if each :
#             en_cnt += 1
#     print(en_cnt, label_ids)
    label_ids = torch.tensor(label_ids, dtype=torch.long)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    
    return inputs, label_ids

## Model

In [6]:
class SeqLabeling(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = torch.load(config_path)
        print(f'load from {config_path}')
        self.model = AutoModel.from_config(self.config)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.drop1=nn.Dropout(0.1)
        self.drop2=nn.Dropout(0.2)
        self.drop3=nn.Dropout(0.3)
        self.drop4=nn.Dropout(0.4)
        self.drop5=nn.Dropout(0.5)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def loss(self,logits,labels,weights):
        loss_fnc = nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([0.1, 2, 1, 0.5, 1, 3])).float() ,
                                        size_average=True,
                                        reduction='none').cuda()
        loss = loss_fnc(logits, labels)
        loss = (loss * weights).mean()
        return loss

    def forward(self, inputs, labels=None, weights=None, training=True):
        feature = self.model(**inputs)[0]
        if  training:
            logits1 = self.fc(self.drop1(feature))
            logits2 = self.fc(self.drop2(feature))
            logits3 = self.fc(self.drop3(feature))
            logits4 = self.fc(self.drop4(feature))
            logits5 = self.fc(self.drop5(feature))
            _loss=0
            if labels is not None:
                loss1 = self.loss(logits1.permute(0, 2, 1), labels, weights)
                loss2 = self.loss(logits2.permute(0, 2, 1), labels, weights)
                loss3 = self.loss(logits3.permute(0, 2, 1), labels, weights)
                loss4 = self.loss(logits4.permute(0, 2, 1), labels, weights)
                loss5 = self.loss(logits5.permute(0, 2, 1), labels, weights)
                _loss = (loss1 + loss2 + loss3 + loss4 + loss5) / 5
                # _loss = loss3
            return _loss
        else:
            output = self.fc(feature)
            return output


## Load model

In [7]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

ernie_tokenizer = AutoTokenizer.from_pretrained('./nlp_final_model/ernie-gram-zh-tokenizer/')
roberta_tokenizer = AutoTokenizer.from_pretrained('./nlp_final_model/chinese-roberta-base-tokenizer/')

characters=[]
for i in range(30):
    characters.append('[et%d]' % i )
add_ernie_tokenizer = AutoTokenizer.from_pretrained('./nlp_final_model/ernie-gram-zh-tokenizer/')
add_ernie_tokenizer.add_tokens(characters)

add_roberta_tokenizer = AutoTokenizer.from_pretrained('./nlp_final_model/chinese-roberta-base-tokenizer/')
add_roberta_tokenizer.add_tokens(characters)

30

In [8]:
seq_model1 = SeqLabeling(CFG, config_path=CFG.seq_model1+'config.pth')
seq_model1.to(DEVICE)
model_path = os.path.join(CFG.seq_model1, f"model_fold0_best.bin")
seq_model1.load_state_dict(torch.load(model_path,map_location=torch.device(device)))
seq_model1.eval()

seq_model2 = SeqLabeling(CFG, config_path=CFG.seq_model2+'config.pth')
seq_model2.to(DEVICE)
model_path = os.path.join(CFG.seq_model2, f"model_fold0_best.bin")
seq_model2.load_state_dict(torch.load(model_path,map_location=torch.device(device)))
seq_model2.eval()

mask_model1 = SeqLabeling(CFG, config_path=CFG.mask_model1+'config.pth')
mask_model1.to(DEVICE)
model_path = os.path.join(CFG.mask_model1, f"model_fold0_best.bin")
# mask_model1.resize_token_embeddings(len(add_ernie_tokenizer)) 
mask_model1.load_state_dict(torch.load(model_path,map_location=torch.device(device)))
mask_model1.eval()


mask_model2 = SeqLabeling(CFG, config_path=CFG.mask_model2+'config.pth')
mask_model2.to(DEVICE)
model_path = os.path.join(CFG.mask_model2, f"model_fold0_best.bin")
# mask_model2.resize_token_embeddings(len(add_roberta_tokenizer)) 
mask_model2.load_state_dict(torch.load(model_path,map_location=torch.device(device)))
mask_model2.eval()

load from ./nlp_final_model/final_ernie_saved/config.pth
load from ./nlp_final_model/final_roberta_epoch16_saved/config.pth
load from ./nlp_final_model/mask_final_ernie_epoch8_saved/config.pth
load from ./nlp_final_model/mask_final_roberta_epoch16_saved/config.pth


SeqLabeling(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21158, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

## inference

In [9]:
pred = []
truth = []
with open('./section1.txt', 'w') as fw:
    with open(CFG.test_file, 'r') as f:
#     with open('../nlp_data/newTest.txt', 'r') as f:
#     with open('../Recommendation/data/rec_data/all_content.txt', 'r') as f:
        lines = f.readlines()
        for line in tqdm(lines):
            dic = json.loads(line.strip())
            entityLen = len(dic['entity'])
            if type(dic['entity']) == dict:
                entityArr = list(dic['entity'].keys())
                truth.extend(list(dic['entity'].values()))
            elif type(dic['entity']) == list:
                entityArr = dic['entity']
            else:
                print('type error!')
            index = 0
#             print(dic['entity'])
            inputs1, label_ids1 = SL_formater(dic['entity'], dic['content'], ernie_tokenizer)
            inputs2, label_ids2 = SL_formater(dic['entity'], dic['content'], roberta_tokenizer)
            inputs3, label_ids3 = mask_SL_formater(dic['entity'], dic['content'], add_ernie_tokenizer)
            inputs4, label_ids4 = mask_SL_formater(dic['entity'], dic['content'], add_roberta_tokenizer)
            
            for inputs in [inputs1,inputs2,inputs3,inputs4]:
                for k, v in inputs.items():
                    inputs[k] = v.unsqueeze(0).to(DEVICE)
            
                
            with torch.no_grad():
                with autocast():
                    pred_logits1 = seq_model1(inputs1,training=False)
                    pred_logits2 = seq_model2(inputs2,training=False)
                    pred_logits3 = mask_model1(inputs3,training=False)
                    pred_logits4 = mask_model2(inputs4,training=False)
            
            tmp_result = {}
            model_logits = []
            label_ids_arr = [label_ids1, label_ids2, label_ids3, label_ids4]
            for idx, pred_logits in enumerate([pred_logits1, pred_logits2, pred_logits3, pred_logits4]):
                pred_logits = pred_logits.squeeze(0).detach().cpu().numpy()
                label_ids = label_ids_arr[idx]
                
#                 print(f'model{idx} label = ', end='')
                temp = []  # [样本实体数, 5]
                for i in range(1, max(label_ids)+1):
                    ind = np.where(label_ids==i)
                    logits = pred_logits[ind]   # [实体字数, 6] 
                    logits = logits[:,1: ]      # [实体字数, 5] 
                    logits = softmax(logits, axis=-1) 
                    logits = np.mean(logits, axis=0)
                    temp.append(logits)
#                     print(logits.argmax()-2, end=' ')
#                 print()
                model_logits.append(temp)
            for i in range(len(model_logits[0])):
                final_logits = 0.9*(0.5*model_logits[0][i]+0.5*model_logits[1][i])+0.1*(0.5*model_logits[2][i]+0.5*model_logits[3][i])
                label = final_logits.argmax()-2
                tmp_result[entityArr[i]] = label
                pred.append(label)
#             print(1111, str(dic['id']), tmp_result)
            fw.write(str(dic['id']) + '	' + json.dumps(tmp_result, ensure_ascii=False, cls=NpEncoder) + '\n')

  0%|          | 0/12028 [00:00<?, ?it/s]

In [10]:
# f1_score(truth, pred, average='macro')

# Recommendation

In [11]:
newTest_all_feature = pd.read_csv('./newTest_all_feature.csv')
feature_names = ['province', 'city', 'deviceType', 'logTs_gap', 'Hour', 'bm25_mean', 'co_bm25_mean', 
                 'bm25', 'bm25Len', 'gapMeanEmotion', 'groupLen', 'historyHitSort', 'gongxian']
test_model_input = {name: newTest_all_feature[name] for name in feature_names}
newTest_all_feature.head()

Unnamed: 0,province,city,deviceType,logTs_gap,Hour,bm25_mean,co_bm25_mean,bm25,bm25Len,gapMeanEmotion,groupLen,historyHitSort,gongxian
0,2,9,0,0,23,3,7,6,19,32,14,1,3
1,2,9,0,0,23,22,7,44,19,15,14,8,27
2,2,9,0,0,23,19,7,37,19,24,14,11,25
3,2,9,0,0,23,47,7,94,19,12,14,2,7
4,2,9,0,0,23,20,7,39,19,9,14,7,9


In [12]:
model = torch.load(CFG.rec_model)
testSampleId = pd.read_csv(CFG.rec_test)['testSampleId']
pred_ans = model.predict(test_model_input, batch_size=1024)
pred = pd.Series(pred_ans.flatten('F'))
output_df = pd.concat([testSampleId, pred], axis=1)
output_path = os.path.join('./', "section2.txt")
output_df.to_csv(output_path, index=False,sep="\t", header=["Id", "result"])
output_df

Unnamed: 0,testSampleId,0
0,7000001,0.281802
1,7000002,0.314887
2,7000003,0.275916
3,7000004,0.287720
4,7000005,0.263973
...,...,...
1405333,8405334,0.241028
1405334,8405335,0.113012
1405335,8405336,0.238159
1405336,8405337,0.147870
