In [None]:
# File name of the fine-tuned checkpoint; it is concatenated onto `savemode_dir`
# when the state dict is loaded, hence the leading slash.
savedmodel_name="/seed42_e3_vs545_score0.926.pth"
# Directory that holds the tokenizer files, idlabel.json and the checkpoint above.
savemode_dir = "/kaggle/input/seed42_score0.991/pytorch/v41/1"

In [None]:
import numpy as np#进行矩阵运算的库
import random#提供了一些用于生成随机数的函数
import torch
from transformers import AutoTokenizer
import json
import os
import sys
import gc

import pandas as pd
from pathlib import Path
from datasets import Dataset
from torch.utils.data import DataLoader

import time
import numpy as np
from unidecode import unidecode
import copy
from tqdm import tqdm,tqdm_notebook


class Config:
    """Hyper-parameters and runtime settings shared by the notebook cells.

    All values are plain class attributes and are read as ``Config.<name>``.
    """

    num_proc=10
    seed=42 # random seed

    split_by_paragraph=True # False to split by sentence windows
    max_length=1024 # passed to the tokenizer as `max_length` (tokens per window)
    stride=256 # overlap count


    batch_size=3 # TODO:change batch size
    logging_steps=100
    epochs=3
    lr=2e-5
    weight_decay=0.01
    accumulation_steps=1 # batch sizes differ, so the loss cannot simply be divided; fix when time permits
    evaltimes=3
    num_warmup_steps=0

    resume_train_epoch=0

    # Probability cut-off for the "O" class. Declared here so that code reading
    # `Config.threshold` (e.g. `logit2truepredic`) works on a fresh kernel instead of
    # depending on a later cell having assigned it first.
    threshold=0.9875

 

# Seed every random number generator so that runs are reproducible
def seed_everything():
    """Seed Python, NumPy and PyTorch from ``Config.seed`` and set the cuDNN flags.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode with benchmarking disabled.
    """
    seed_value = Config.seed
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed_value)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
seed_everything()




# Batch collation: pad every sample of a batch to a common length
class Collate:
    """Collate function that pads tokenized samples and converts them to tensors.

    Args:
        tokenizer: object exposing ``padding_side`` and ``pad_token_id``.
        if_train: when true, the ``bertlabels`` field is collated as ``targets``
            (padded with -100).
    """

    def __init__(self, tokenizer,if_train=True):
        self.tokenizer = tokenizer
        self.if_train=if_train

    def __call__(self, batch):
        """Return a dict with ``ids``, ``type_ids``, ``mask`` (and ``targets``) as
        long tensors plus ``token_org_length``, the unpadded length of each sample."""
        # (output key, sample key, padding value)
        columns = [
            ("ids", "berttokenids", self.tokenizer.pad_token_id),
            ("type_ids", "berttokentoken_type_ids", 0),
            ("mask", "berttokenmask", 0),
        ]
        if self.if_train:
            columns.append(("targets", "bertlabels", -100))

        output = {out_key: [sample[src_key] for sample in batch] for out_key, src_key, _ in columns}
        output["token_org_length"] = [len(seq) for seq in output["ids"]]

        # Longest sequence in this batch decides the padded width.
        batch_max = max(output["token_org_length"])
        pad_on_right = self.tokenizer.padding_side == "right"

        for out_key, _, fill_value in columns:
            padded_rows = []
            for seq in output[out_key]:
                filler = (batch_max - len(seq)) * [fill_value]
                padded_rows.append(seq + filler if pad_on_right else filler + seq)
            output[out_key] = torch.tensor(padded_rows, dtype=torch.long)

        return output

def text_precessor(data):
    """Return a deep copy of ``data`` with its text transliterated and lower-cased.

    For every record, ``full_text`` and each entry of ``tokens`` are run through
    ``unidecode`` (untranslatable characters become '*') and lower-cased; tokens
    are additionally stripped of surrounding whitespace. The input is not modified.
    """
    def _ascii_lower(raw):
        return unidecode(raw,errors='replace',replace_str='*').lower()

    cleaned=copy.deepcopy(data)
    for row in tqdm(range(len(cleaned)),desc="preprocess texe"):
        record=cleaned[row]
        record['full_text']=_ascii_lower(record['full_text'])
        token_list=record['tokens']
        for pos,raw_token in enumerate(token_list):
            token_list[pos]=_ascii_lower(raw_token).strip()
    return cleaned

def split_by_paragraph(text):
    """Split ``text`` on blank lines and return the pieces, each re-terminated with '\\n\\n'."""
    paragraphs = []
    for chunk in text.split('\n\n'):
        paragraphs.append(chunk + '\n\n')
    return paragraphs

# Split a text into chunks
def split_token(text):
    """
        Split ``text`` into a list of string chunks.

        When ``Config.split_by_paragraph`` is true, the result of
        ``split_by_paragraph(text)`` is returned directly.

        Otherwise the text is sentence-tokenized and the sentences are joined into
        chunks of ``Config.max_sen_count`` sentences; every chunk after the first
        also starts with the preceding ``Config.stride`` sentences, and "[SEP]" is
        prefixed to the sentence at offset ``max_sen_count - stride`` of each
        full window.

        NOTE(review): ``sent_tokenize`` and ``Config.max_sen_count`` are not defined
        anywhere in the visible notebook, so the sentence-window branch cannot run
        as written; confirm the import and setting before switching
        ``Config.split_by_paragraph`` to False.
    """
    data_out=[]
    if Config.split_by_paragraph:
        doc=split_by_paragraph(text)
        return doc
    else:
        doc=sent_tokenize(text)
    split_sentence=doc
    length=len(split_sentence)
    idx=0
    # Emit one chunk per full window; the list is modified in place to insert "[SEP]".
    while idx+Config.max_sen_count<length:
        split_sentence[idx+Config.max_sen_count-Config.stride]="[SEP]"+split_sentence[idx+Config.max_sen_count-Config.stride]
        if idx==0:
            data_out.append("".join(split_sentence[idx:idx+Config.max_sen_count]))
        else:
            data_out.append("".join(split_sentence[idx-Config.stride:idx+Config.max_sen_count]))
        idx+=Config.max_sen_count
    # deal with the remainder
    if idx<Config.max_sen_count:# no full window was emitted: join the sentences directly
        data_out.append("".join(split_sentence[idx:length]))
    else:
        data_out.append("".join(split_sentence[idx-Config.stride:length]))
    return data_out

def train_preprocesss(example,tokenizer,label2id):
    """Tokenize one labelled document into windows with a label id per sub-token.

    The document text is rebuilt from ``tokens`` and ``trailing_whitespace`` together
    with a per-character label list, tokenized with overflow windows
    (``Config.max_length`` / ``Config.stride``), and each sub-token takes the label
    of the character it starts on.

    Args:
        example: mapping with ``tokens``, ``labels`` and ``trailing_whitespace``.
        tokenizer: callable tokenizer returning offsets and overflow windows.
        label2id: mapping from label string to integer id.

    Returns:
        ``example`` with ``bertlabels``, ``berttokenids``, ``berttokenmask`` and
        ``berttokentoken_type_ids`` set to one list per window.
    """
    for field in ('bertlabels','berttokenids','berttokenmask','berttokentoken_type_ids'):
        example[field]=[]

    # Rebuild the text and keep one label per character (spaces are "O").
    pieces = []
    char_labels = []
    for tok, tag, has_space in zip(
        example["tokens"], example["labels"], example["trailing_whitespace"]
    ):
        pieces.append(tok)
        char_labels += [tag] * len(tok)
        if has_space:
            pieces.append(" ")
            char_labels.append("O")
    text = "".join(pieces)

    encoded = tokenizer(text,
                        stride=Config.stride,
                        max_length=Config.max_length,
                        truncation=True ,
                        return_overflowing_tokens=True,
                        return_offsets_mapping=True,
                        )

    char_labels = np.array(char_labels)

    for window, window_ids in enumerate(encoded['input_ids']):
        window_labels = []
        for start_idx, end_idx in encoded['offset_mapping'][window]:
            if start_idx == 0 and end_idx == 0:
                # (0, 0) offset: treated as a special token such as CLS
                label_name = "O"
            else:
                # a sub-token that begins with whitespace is labelled by its next character
                if text[start_idx].isspace():
                    start_idx += 1
                label_name = char_labels[start_idx]
            window_labels.append(label2id[label_name])

        example['bertlabels'].append(window_labels)
        example['berttokenids'].append(window_ids)
        example['berttokenmask'].append(encoded['attention_mask'][window])
        example['berttokentoken_type_ids'].append(encoded['token_type_ids'][window])

    return example


def expanddataset(ds,if_train=True):
    """
        Expand the per-document window lists into one row per window.

        The columns ``berttokenids``, ``berttokenmask``, ``berttokentoken_type_ids``
        and either ``bertlabels`` (``if_train``) or ``offset_mapping`` are exploded;
        all other columns are repeated for every window. Returns a pandas DataFrame
        with a fresh 0..n-1 index.
    """
    frame = ds.to_pandas()
    last_column = 'bertlabels' if if_train else 'offset_mapping'
    nested_columns = ['berttokenids','berttokenmask','berttokentoken_type_ids',last_column]

    def _explode(column_name):
        # one column per window position, then stacked back into a single Series
        wide = pd.DataFrame(frame.pop(column_name).values.tolist(), index=frame.index)
        return wide.stack().rename(column_name).reset_index(level=1, drop=True)

    exploded = pd.concat([_explode(name) for name in nested_columns], axis=1)
    return frame.join(exploded).reset_index(drop=True)

def logit2truepredic(batch_predictions,batch_org_len,threshold=None):
    """
        Decode flattened batch logits into one list of class ids per sample.

        Args:
            batch_predictions: 2-D array of logits, indexed here as
                ``(batch * padded_len, num_labels)``; sample ``i`` occupies rows
                ``i*padded_len .. i*padded_len + batch_org_len[i]``.
            batch_org_len: unpadded token count of each sample, in batch order.
            threshold: probability below which the "O" class (index 12) is rejected
                in favour of the best of the first 12 classes. Defaults to
                ``Config.threshold``.

        Returns:
            A list (one entry per sample) of lists of predicted class ids.

        The "O" decision uses the softmax probability, matching the probability
        threshold applied in the post-processing cell; previously the raw logit was
        compared against the threshold. The softmax subtracts the row maximum first
        so large logits cannot overflow.
    """
    if threshold is None:
        threshold = Config.threshold
    if len(batch_org_len) == 0:
        return []

    preds_final=[]
    batch_len=max(batch_org_len)
    for i,l in enumerate(batch_org_len):
        predictions=np.asarray(batch_predictions[i*batch_len:i*batch_len+l])

        # numerically stable softmax over the label axis
        shifted=predictions-predictions.max(axis=1,keepdims=True)
        exp_pred=np.exp(shifted)
        softmaxed_pred=exp_pred/exp_pred.sum(axis=1,keepdims=True)

        preds = softmaxed_pred.argmax(-1)
        preds_without_O = softmaxed_pred[:,:12].argmax(-1)
        O_preds = softmaxed_pred[:,12]
        preds_final.append( list(np.where(O_preds < threshold, preds_without_O , preds)))

    return preds_final



In [None]:
# Read the competition test documents. The context manager closes the file handle
# as soon as parsing finishes (a bare `open(...)` left it to the garbage collector).
with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json", encoding="utf-8") as test_file:
    data = json.load(test_file)
# Keep only the fields used for inference; `document` is stored as a string.
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})
ds

In [None]:
# Tokenizer saved next to the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(savemode_dir)
# Label <-> id mappings written at training time; opened with a context manager so
# the file handle is closed deterministically.
with open(savemode_dir+"/idlabel.json", encoding="utf-8") as idlabel_file:
    idlabel= json.load(idlabel_file)
id2label=idlabel['id2label']
label2id=idlabel['label2id']

In [None]:
def inference_preprocesss(example,tokenizer,label2id):
    """Tokenize one unlabelled document into windows and record a char-to-token map.

    The text is rebuilt from ``tokens`` and ``trailing_whitespace``. ``token_map``
    holds, for every character of that text, the index of the original token it
    belongs to, or -1 for an inserted space.

    Args:
        example: mapping with ``tokens`` and ``trailing_whitespace``.
        tokenizer: callable tokenizer returning offsets and overflow windows.
        label2id: unused; accepted so the signature matches the training variant.

    Returns:
        ``example`` with ``token_map``, ``berttokenids``, ``berttokenmask``,
        ``berttokentoken_type_ids`` and ``offset_mapping`` filled in.
    """
    for field in ('token_map','berttokenids','berttokenmask','berttokentoken_type_ids','offset_mapping'):
        example[field]=[]

    pieces = []
    char_to_token = []
    for token_index, (tok, has_space) in enumerate(zip(example["tokens"], example["trailing_whitespace"])):
        pieces.append(tok)
        char_to_token += [token_index]*len(tok)
        if has_space:
            pieces.append(" ")
            char_to_token.append(-1)

    # windowed tokenization with character offsets
    encoded = tokenizer("".join(pieces),
                        stride=Config.stride,
                        max_length=Config.max_length,
                        truncation=True ,
                        return_overflowing_tokens=True,
                        return_offsets_mapping=True,
                        )

    example['berttokenids']=encoded['input_ids']
    example['berttokenmask']=encoded['attention_mask']
    example['berttokentoken_type_ids']=encoded['token_type_ids']
    example['offset_mapping']=encoded['offset_mapping']
    example['token_map']=char_to_token

    return example

In [None]:
# Run `inference_preprocesss` over every document with 10 worker processes.
# NOTE(review): `Config.num_proc` holds the same value but is not used here.
preprocesssed_ds=ds.map(inference_preprocesss, fn_kwargs={'tokenizer':tokenizer,'label2id':label2id},num_proc=10,desc="prepocessing data")

In [None]:
# Rebuild the dataset with one row per tokenizer window instead of one per document.
tmp_pd=expanddataset(preprocesssed_ds,if_train=False)
# Sanity check: window-length statistics and the total number of windows.
print(tmp_pd['berttokenids'].str.len().agg(['mean','max','std','min']))
print(len(tmp_pd))

In [None]:
# Convert the window-level frame back into a `datasets.Dataset` for the DataLoader.
full_ds=Dataset.from_pandas(tmp_pd)

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForTokenClassification, AutoModel

class NLPModel(nn.Module):
    """Token classifier: pretrained encoder -> bidirectional LSTM -> linear layer.

    The attribute names (`model`, `lstm`, `l`) are the keys of the saved state dict,
    so they must not be renamed.
    """
    def __init__(self,id2label,label2id,modelname,training=True):
        """Build the network.

        Args:
            id2label: id -> label mapping; its length sets the number of output classes.
            label2id: label -> id mapping, forwarded to ``from_pretrained``.
            modelname: name or path of the pretrained encoder.
            training: when true, the encoder is saved to ``Config.modelsavepath``.
                NOTE(review): ``Config.modelsavepath`` is not defined in the visible
                notebook, so only ``training=False`` is usable here.
        """
        super().__init__()
    #     self.model=AutoModelForTokenClassification.from_pretrained(modelname, num_labels=len(id2label),
    # id2label=id2label,label2id=label2id,ignore_mismatched_sizes=True)# map labels to ids
        self.model = AutoModel.from_pretrained(modelname, # num_labels=len(id2label),
    id2label=id2label,label2id=label2id,ignore_mismatched_sizes=True)
        self.lossfunc = nn.CrossEntropyLoss()
        self.num_labels=len(id2label)
        if training: self.model.save_pretrained(Config.modelsavepath)

        # LSTM input width; presumably the encoder's hidden size (TODO confirm it
        # matches the checkpoint being loaded).
        hidden_dim = 1024
        # Bidirectional with hidden_dim // 2 units per direction, so the output width
        # is again hidden_dim.
        self.lstm = nn.LSTM(input_size = hidden_dim ,
                            hidden_size = hidden_dim  // 2,
                            num_layers = 1,
                            batch_first=True,
                            bidirectional=True,
                            # dropout=0.1
                        )
        self.l = nn.Linear(hidden_dim, self.num_labels)

    def forward(self,input_ids, token_type_ids,attention_mask,labels=None):
        """Return ``(logits, loss)`` for a batch.

        ``logits`` is a NumPy array flattened to ``(batch * seq_len, num_labels)``;
        ``loss`` is the cross-entropy against ``labels`` or 0 when ``labels`` is None.

        NOTE(review): the three tensors are forwarded to the encoder positionally.
        Hugging Face's DeBERTa-v2 ``forward`` takes ``(input_ids, attention_mask,
        token_type_ids, ...)``, so the value bound to ``token_type_ids`` here reaches
        the encoder as its attention mask and vice versa. The inference cell calls
        ``model(ids, mask, tokentype)`` positionally, which makes the two swaps
        cancel; do not reorder one side without the other.
        """
        output = self.model(input_ids, token_type_ids, attention_mask)
        # batchsize*seq length
        # logit=output[0]

        self.lstm.flatten_parameters()
        hidden, (_, _) = self.lstm(output[0])
        logit = self.l(hidden)
        loss=0
        # collapse batch and sequence axes so each row is one token
        logit=logit.view(-1,logit.shape[-1])
        if labels is not None:
             labels=labels.view(-1)
             loss=self.lossfunc(logit,labels)
        
        
        # detached copy on the CPU as a NumPy array
        logit=logit.detach().cpu().numpy()
        return logit,loss


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build the architecture on top of the base encoder (training=False skips the save step).
model=NLPModel(id2label,label2id,"/kaggle/input/deberta-v3-large/pytorch/deberta-large/1",training=False).to(device)
# Overwrite all weights with the fine-tuned checkpoint.
# NOTE(review): `torch.load` unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(savemode_dir+savedmodel_name, map_location = device))

In [None]:
# NOTE(review): this statement repeats an identical earlier cell, so the whole
# preprocessing pass runs a second time with the same inputs; it looks redundant.
preprocesssed_ds=ds.map(inference_preprocesss, fn_kwargs={'tokenizer':tokenizer,'label2id':label2id},num_proc=10,desc="prepocessing data")

In [None]:
# Rebuild the dataset with one row per tokenizer window.
# NOTE(review): repeats an identical earlier cell; it looks redundant.
tmp_pd=expanddataset(preprocesssed_ds,if_train=False)
print(tmp_pd['berttokenids'].str.len().agg(['mean','max','std','min']))
print(len(tmp_pd))

In [None]:
# NOTE(review): repeats an identical earlier cell; it looks redundant.
full_ds=Dataset.from_pandas(tmp_pd)

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForTokenClassification, AutoModel

# NOTE(review): this class is defined identically in an earlier cell; this
# redefinition silently shadows it and looks redundant.
class NLPModel(nn.Module):
    """Token classifier: pretrained encoder -> bidirectional LSTM -> linear layer.

    The attribute names (`model`, `lstm`, `l`) are the keys of the saved state dict,
    so they must not be renamed.
    """
    def __init__(self,id2label,label2id,modelname,training=True):
        """Build the network.

        Args:
            id2label: id -> label mapping; its length sets the number of output classes.
            label2id: label -> id mapping, forwarded to ``from_pretrained``.
            modelname: name or path of the pretrained encoder.
            training: when true, the encoder is saved to ``Config.modelsavepath``.
                NOTE(review): ``Config.modelsavepath`` is not defined in the visible
                notebook, so only ``training=False`` is usable here.
        """
        super().__init__()
    #     self.model=AutoModelForTokenClassification.from_pretrained(modelname, num_labels=len(id2label),
    # id2label=id2label,label2id=label2id,ignore_mismatched_sizes=True)# map labels to ids
        self.model = AutoModel.from_pretrained(modelname, # num_labels=len(id2label),
    id2label=id2label,label2id=label2id,ignore_mismatched_sizes=True)
        self.lossfunc = nn.CrossEntropyLoss()
        self.num_labels=len(id2label)
        if training: self.model.save_pretrained(Config.modelsavepath)

        # LSTM input width; presumably the encoder's hidden size (TODO confirm it
        # matches the checkpoint being loaded).
        hidden_dim = 1024
        # Bidirectional with hidden_dim // 2 units per direction, so the output width
        # is again hidden_dim.
        self.lstm = nn.LSTM(input_size = hidden_dim ,
                            hidden_size = hidden_dim  // 2,
                            num_layers = 1,
                            batch_first=True,
                            bidirectional=True,
                            # dropout=0.1
                        )
        self.l = nn.Linear(hidden_dim, self.num_labels)

    def forward(self,input_ids, token_type_ids,attention_mask,labels=None):
        """Return ``(logits, loss)`` for a batch.

        ``logits`` is a NumPy array flattened to ``(batch * seq_len, num_labels)``;
        ``loss`` is the cross-entropy against ``labels`` or 0 when ``labels`` is None.

        NOTE(review): the three tensors are forwarded to the encoder positionally.
        Hugging Face's DeBERTa-v2 ``forward`` takes ``(input_ids, attention_mask,
        token_type_ids, ...)``, so the value bound to ``token_type_ids`` here reaches
        the encoder as its attention mask and vice versa. The inference cell calls
        ``model(ids, mask, tokentype)`` positionally, which makes the two swaps
        cancel; do not reorder one side without the other.
        """
        output = self.model(input_ids, token_type_ids, attention_mask)
        # batchsize*seq length
        # logit=output[0]

        self.lstm.flatten_parameters()
        hidden, (_, _) = self.lstm(output[0])
        logit = self.l(hidden)
        loss=0
        # collapse batch and sequence axes so each row is one token
        logit=logit.view(-1,logit.shape[-1])
        if labels is not None:
             labels=labels.view(-1)
             loss=self.lossfunc(logit,labels)
        
        
        # detached copy on the CPU as a NumPy array
        logit=logit.detach().cpu().numpy()
        return logit,loss


In [None]:
# NOTE(review): repeats an identical earlier cell, so the model is constructed and
# the checkpoint is loaded a second time; it looks redundant.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model=NLPModel(id2label,label2id,"/kaggle/input/deberta-v3-large/pytorch/deberta-large/1",training=False).to(device)
# NOTE(review): `torch.load` unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(savemode_dir+savedmodel_name, map_location = device))

In [None]:
# Inference collator: no `targets` field is produced.
data_collator = Collate(tokenizer=tokenizer,if_train=False)
# One window per batch, iterated in dataset order (no shuffling is requested) so the
# collected logits line up with the rows of `full_ds`.
val_dataloader=DataLoader(full_ds,batch_size=1,pin_memory=True,collate_fn=data_collator)
# Release unreferenced Python objects and cached CUDA memory before inference.
gc.collect()
torch.cuda.empty_cache()

In [None]:
def ret_before_pading_logit(batch_logits,batch_org_len):
    """
        Cut flattened batch logits back into per-sample pieces without padding rows.

        Sample ``k`` is assumed to start at row ``k * max(batch_org_len)`` of
        ``batch_logits``; its first ``batch_org_len[k]`` rows are kept. The pieces
        are returned as a list in batch order.
    """
    padded_len=max(batch_org_len)
    return [
        batch_logits[k*padded_len:k*padded_len+true_len]
        for k,true_len in enumerate(batch_org_len)
    ]

In [None]:
# Switch to evaluation mode.
model.eval()

# One logit array per window, appended in DataLoader order.
logits=[]
for step,dataset in enumerate(tqdm(val_dataloader)):  # `dataset` is one collated batch

    with torch.no_grad():
        ids = dataset["ids"].to(device,non_blocking=True)
        mask = dataset["mask"].to(device,non_blocking=True)
        tokentype = dataset["type_ids"].to(device,non_blocking=True)
        # NOTE(review): positional order here is (ids, mask, type_ids) while
        # `NLPModel.forward` names its parameters (input_ids, token_type_ids,
        # attention_mask); `forward` passes them on positionally again, so keep
        # both call sites in sync if either is changed.
        logit,loss = model(ids,mask,tokentype)
        # strip padding rows and keep the per-window logits
        logits+=(ret_before_pading_logit(logit,dataset['token_org_length']))
        del ids,mask,tokentype


In [None]:
# Maps (document, token index, token text) -> list of logit vectors predicted for
# that original token across sub-tokens and overlapping windows.
triplet_map = {}
# NOTE(review): these four lists are not used anywhere in the visible notebook.
document, token, label, token_str = [], [], [], []

# `logits` and the rows of `full_ds` are walked in lockstep, one window at a time.
for p,token_map, offsets,tokens, doc in zip(logits,full_ds["token_map"], full_ds['offset_mapping'],full_ds["tokens"], full_ds["document"]):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        
        # (0, 0) offsets carry no text span; skip them
        if start_idx + end_idx == 0: continue
        # sub-token begins on an inserted space: move to the next character
        if token_map[start_idx] == -1:
            start_idx += 1

         # skip over original tokens that are pure whitespace (e.g. "\n\n")
        while start_idx < len(token_map) and  tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # ran past the end of the text: stop processing this window
        if start_idx >= len(token_map): break

        token_id = token_map[start_idx]

        # skip positions that map to an inserted space (-1); all predictions,
        # including "O", are collected here and filtered later
        if token_id != -1:
            triplet_key = (doc,token_id,tokens[token_id])

            if triplet_key not in triplet_map:
                triplet_map[triplet_key]=[token_pred]
            else:
                triplet_map[triplet_key].append(token_pred)

In [None]:
# Probability above which a token is labelled "O" (class index 12).
Config.threshold=0.9875
# Replace each key's list of logit vectors with its final label string.
# NOTE: this rewrites `triplet_map` in place, so the cell is meant to run once per
# aggregation pass.
for key in triplet_map:
    averged_logit=np.mean(triplet_map[key],axis=0)
    # Softmax with the maximum subtracted first: mathematically identical, but large
    # float32 logits can no longer overflow `np.exp` to inf and produce nan.
    exp_logit=np.exp(averged_logit-np.max(averged_logit))
    pred_softmax=exp_logit/np.sum(exp_logit)
    if pred_softmax[12]> Config.threshold:
        true_predict='O'
    else:
        # otherwise take the most probable of the first 12 classes
        true_predict=id2label[str(pred_softmax[:12].argmax())]
    triplet_map[key]=true_predict

In [None]:
# One row per (document, token, token text) key with its decoded label.
df=pd.DataFrame(triplet_map.items(), columns=['triplet_map', 'label'])
# Keep only the rows whose label is not 'O'.
df=df.loc[df['label']!='O'].reset_index(drop=True)

In [None]:
# Split each (document, token index, token text) key into its own column.
# Built column by column so the cell also works when no rows survived the 'O'
# filter; `zip(*df.triplet_map)` on an empty frame yields nothing to unpack and
# raises ValueError.
triplet_keys = df['triplet_map'].tolist()
for position, column in enumerate(('document', 'token', 'token_str')):
    df[column] = [triplet[position] for triplet in triplet_keys]

In [None]:
# Sequential ids for the submission rows.
df["row_id"] = list(range(len(df)))
# Preview the first rows as the cell output.
(df.head(10))

In [None]:
# Write the four submission columns to submission.csv in the working directory.
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)