## Installing and importing libraries

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install demoji
!pip install neologdn

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 22.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 34.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [2]:
import pandas as pd
import numpy as np
import glob
import ast
import sentencepiece as spm
import multiprocessing as mp
from tqdm import tqdm

import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, Dataset

from transformers import BertConfig, BertModel

## Data loading

In [3]:
from google.colab import drive
drive.mount('/content/mnt')

Mounted at /content/mnt


In [4]:
%cd /content/mnt/MyDrive/incident/entity

/content/mnt/MyDrive/incident/entity


In [5]:
files = glob.glob("./20220217/*")
df = pd.DataFrame(columns=["Unnamed: 0", "id", "report", "tokenized_report", "diff_unk","IOB_report", "actual_ent"])
for f in files:
  df_ = pd.read_excel(f)
  df = pd.concat([df, df_], ignore_index=True)
df

Unnamed: 0.1,Unnamed: 0,id,report,tokenized_report,diff_unk,IOB_report,actual_ent
0,0,H00041ECB539DB2B1,SSSにて入院。本日11時にHCUより転入。昼食後薬準備しPCを持って患者の元で内服薬確認し...,"['▁s', 'ss', 'にて', '入院', '。', '本', '日', '11', ...",[],"['O', 'O', 'O', 'O', 'O', 'B-Date', 'I-Date', ...","['本日', '11時に', '昼食後', '内服薬', 'オパルモン', 'メコバラミン'..."
1,1,H000B05C61BD26562,11時10分、訪室し昼食後のベサノイドを渡した。11時50分、血液データによっては、ベサノイ...,"['▁11', '時', '10', '分', '、', '訪', '室', 'し', '昼...",[],"['B-Timing', 'I-Timing', 'I-Timing', 'I-Timing...","['▁11時10分', '昼食後の', 'ベサノイド', '11時50分', '血液', '..."
2,2,H001EE2CF377B2F30,調剤時にメバロチンとメジコン錠の調剤間違えをした。鑑査時発見され、患者の元には渡らなかった。,"['▁', '調', '剤', '時に', 'メ', 'バ', 'ロ', 'チン', 'と'...",[],"['O', 'O', 'O', 'O', 'B-Drug', 'I-Drug', 'I-Dr...","['メバロチン', 'メジコン', '錠']"
3,3,H00341976599F77E1,抗がん剤治療に定期的に入院している患者。病棟から抗腫瘍剤の調製可能の連絡が入り混合調剤を開始...,"['▁', '抗', 'がん', '剤', '治療', 'に', '定期的に', '入院',...",[],"['O', 'B-Drug', 'I-Drug', 'I-Drug', 'O', 'O', ...",['抗がん剤']
4,4,H004275492EEA4CDD,患者が入院後、病棟クラークが入院オリエンテーション時に身長・体重測定を実施し、測定した数値を...,"['▁', '患者', 'が', '入院', '後', '、', '病', '棟', 'クラ...",[],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['体重', '体重', '体重', '翌日', '体重', '2日目', '2回目', '..."
...,...,...,...,...,...,...,...
58702,4089,HFFCB8F9ED5BA8337,内服後、嚥下痛あり、腹部レントゲン、胃カメラ実施。タケプロンのシートが胃内から見つかった。,"['▁', '内', '服', '後', '、', '<unk>', '下', '痛', '...",[{'<unk>': '嚥'}],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",['タケプロン']
58703,4090,HFFD3518EEA063365,抗癌剤のランダ48mgを生食500mlに混合注射する際に、49mg(98ml)注入したところ...,"['▁', '抗', '癌', '剤', 'の', 'ラン', 'ダ', '48', 'mg...",[],"['O', 'B-Drug', 'I-Drug', 'I-Drug', 'O', 'B-Dr...","['抗癌剤', 'ランダ', '48mg', '生食', '500ml', '注射', '4..."
58704,4091,HFFD841D85493276D,当日から内服開始の抗凝固剤の与薬を忘れた。,"['▁当', '日から', '内', '服', '開始', 'の', '抗', '凝固', ...",[],"['O', 'O', 'O', 'O', 'O', 'O', 'B-Drug', 'I-Dr...",['抗凝固剤']
58705,4092,HFFE65CB2AABF2EDC,化学療法センターでの勤務を行っていた。患者の点滴が更新となるため、点滴とカルテを照合し、指さ...,"['▁', '化学', '療法', 'センター', 'での', '勤務', 'を行っていた'...",[],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['点滴', '点滴', '輸液', '1時間', '1時間', '点滴', '点滴']"


## Data cleaning

In [6]:
import html
import re
import demoji
import neologdn
URL = re.compile(
    r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]"
    + r"[a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\."
    + r"[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\."
    + r"[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\."
    + r"[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
)
MULTI_SPACE = re.compile(r"[\t ]+")
MULTI_LINE = re.compile(r"\n\s+")
LINE = re.compile(r"\n")

def clean_basic(text):
    text = text.lower()
    # remove urls
    text = URL.sub(" ", text)
    # replace multispaces to single
    text = MULTI_SPACE.sub(" ", text)
    text = MULTI_LINE.sub("\n", text)
    text = LINE.sub("。", text)
    # remove leading, trailing spaces
    text = text.strip()
    return text

In [7]:
df["report"] = df["report"].apply(lambda x: neologdn.normalize(x))
text = df.report.tolist()
with mp.Pool(mp.cpu_count()) as pool:
    text = pool.map(clean_basic, text)
df.report = text
df

Unnamed: 0.1,Unnamed: 0,id,report,tokenized_report,diff_unk,IOB_report,actual_ent
0,0,H00041ECB539DB2B1,sssにて入院。本日11時にhcuより転入。昼食後薬準備しpcを持って患者の元で内服薬確認し...,"['▁s', 'ss', 'にて', '入院', '。', '本', '日', '11', ...",[],"['O', 'O', 'O', 'O', 'O', 'B-Date', 'I-Date', ...","['本日', '11時に', '昼食後', '内服薬', 'オパルモン', 'メコバラミン'..."
1,1,H000B05C61BD26562,11時10分、訪室し昼食後のベサノイドを渡した。11時50分、血液データによっては、ベサノイ...,"['▁11', '時', '10', '分', '、', '訪', '室', 'し', '昼...",[],"['B-Timing', 'I-Timing', 'I-Timing', 'I-Timing...","['▁11時10分', '昼食後の', 'ベサノイド', '11時50分', '血液', '..."
2,2,H001EE2CF377B2F30,調剤時にメバロチンとメジコン錠の調剤間違えをした。鑑査時発見され、患者の元には渡らなかった。,"['▁', '調', '剤', '時に', 'メ', 'バ', 'ロ', 'チン', 'と'...",[],"['O', 'O', 'O', 'O', 'B-Drug', 'I-Drug', 'I-Dr...","['メバロチン', 'メジコン', '錠']"
3,3,H00341976599F77E1,抗がん剤治療に定期的に入院している患者。病棟から抗腫瘍剤の調製可能の連絡が入り混合調剤を開始...,"['▁', '抗', 'がん', '剤', '治療', 'に', '定期的に', '入院',...",[],"['O', 'B-Drug', 'I-Drug', 'I-Drug', 'O', 'O', ...",['抗がん剤']
4,4,H004275492EEA4CDD,患者が入院後、病棟クラークが入院オリエンテーション時に身長・体重測定を実施し、測定した数値を...,"['▁', '患者', 'が', '入院', '後', '、', '病', '棟', 'クラ...",[],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['体重', '体重', '体重', '翌日', '体重', '2日目', '2回目', '..."
...,...,...,...,...,...,...,...
58702,4089,HFFCB8F9ED5BA8337,内服後、嚥下痛あり、腹部レントゲン、胃カメラ実施。タケプロンのシートが胃内から見つかった。,"['▁', '内', '服', '後', '、', '<unk>', '下', '痛', '...",[{'<unk>': '嚥'}],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",['タケプロン']
58703,4090,HFFD3518EEA063365,抗癌剤のランダ48mgを生食500mlに混合注射する際に、49mg(98ml)注入したところ...,"['▁', '抗', '癌', '剤', 'の', 'ラン', 'ダ', '48', 'mg...",[],"['O', 'B-Drug', 'I-Drug', 'I-Drug', 'O', 'B-Dr...","['抗癌剤', 'ランダ', '48mg', '生食', '500ml', '注射', '4..."
58704,4091,HFFD841D85493276D,当日から内服開始の抗凝固剤の与薬を忘れた。,"['▁当', '日から', '内', '服', '開始', 'の', '抗', '凝固', ...",[],"['O', 'O', 'O', 'O', 'O', 'O', 'B-Drug', 'I-Dr...",['抗凝固剤']
58705,4092,HFFE65CB2AABF2EDC,化学療法センターでの勤務を行っていた。患者の点滴が更新となるため、点滴とカルテを照合し、指さ...,"['▁', '化学', '療法', 'センター', 'での', '勤務', 'を行っていた'...",[],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['点滴', '点滴', '輸液', '1時間', '1時間', '点滴', '点滴']"


## Define arguments and paths

In [19]:
MAX_LEN = 256
DEVICE = "cuda"

BERT_CONIFG_FILE = "/content/mnt/MyDrive/bert (1)/bert_pytorch/config.json"
BERT_TRAINED_MODEL = "/content/mnt/MyDrive/incident/entity/model/model_3_220310_2.bin"
TOKENIZER_MODEL = "/content/mnt/MyDrive/bert (1)/bert_pytorch/wiki-ja.model"
SAVE_PATH = "incident_inference_220314.xlsx"

## Prediction

In [13]:
class IncidentDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len, eos_token, pad_token):

        self.tokens, self.mask, self.id = self.process_text(texts, tokenizer, max_len, eos_token, pad_token)

    def process_text(self, data, tokenizer, max_len, eos_token, pad_token):
        tokenized_text = tokenizer.encode(data)

        for i, tokens in enumerate(tokenized_text):
            if len(tokens) >= max_len:
                tokens = tokens[:max_len]
                tokens[-1] = eos_token
            else:
                tokens.append(eos_token)
                n = max_len - len(tokens)
                paddings = [pad_token] * n
                tokens.extend(paddings)

            tokenized_text[i] = tokens
        
        attention_mask = np.ones((len(tokenized_text), max_len))

        attention_mask[np.array(tokenized_text) == 1] = 0
        token_type_ids = np.zeros((len(tokenized_text), max_len))

        return tokenized_text, attention_mask, token_type_ids

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, item):

        return {
            "input_ids" : torch.tensor(self.tokens[item], dtype=torch.long), 
            "attention_mask" : torch.tensor(self.mask[item], dtype=torch.long), 
            "token_type_ids" : torch.tensor(self.id[item], dtype=torch.long)
        }
        

In [14]:
class IncidentBert(nn.Module):
    def __init__(self, reinit_n_layers=0):
        super().__init__()
        config = BertConfig.from_pretrained(BERT_CONIFG_FILE)
        self.bert = BertModel(config)

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, 28)

        self._init_weights(self.fc)

        self.reinit_n_layers = reinit_n_layers
        if reinit_n_layers > 0: 
            self._do_reinit()  

    def _do_reinit(self):
        for n in range(self.reinit_n_layers):
            self.bert.encoder.layer[-(n+1)].apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.bert.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)  

    def forward(self, input_ids, attention_mask, token_type_ids):

        bert_out = self.bert(input_ids=input_ids, 
                             attention_mask=attention_mask, 
                             token_type_ids=token_type_ids)

        output = bert_out.last_hidden_state

        output = self.dropout(output)
        output = self.fc(output)

        return output # 256

In [15]:
class IncidentModel(nn.Module):
    def __init__(self, reinit_n_layers=0):
        super().__init__()
        self.bert = IncidentBert()

        self.fc_int = nn.Linear(28, 5) 
        self.fc_rel = nn.Linear(28, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):

        bert_out = self.bert(input_ids=input_ids, 
                             attention_mask=attention_mask, 
                             token_type_ids=token_type_ids)

        output_int = self.fc_int(bert_out)
        output_rel = self.fc_rel(bert_out)

        return bert_out, output_int, output_rel

In [17]:
model = IncidentModel()
model.to(DEVICE)
model.load_state_dict(torch.load(BERT_TRAINED_MODEL))
tokenizer = spm.SentencePieceProcessor(TOKENIZER_MODEL)
texts = df.report.tolist()

dataset = IncidentDataset(texts, tokenizer, MAX_LEN, 2, 1)
dataloader = DataLoader(dataset, batch_size=8, drop_last=False, shuffle=False)
model.eval()

outs_pos, outs_int, outs_rel = [], [], []

for i, batch in tqdm(enumerate(dataloader)):
  ids = batch["input_ids"].to("cuda")
  mask = batch["attention_mask"].to("cuda")
  types = batch["token_type_ids"].to("cuda")

  with torch.no_grad():
    out_pos, out_int, out_rel = model(input_ids=ids, attention_mask=mask, token_type_ids=types)

  out_pos = torch.argmax(out_pos, dim = -1)
  out_pos = out_pos.cpu().numpy().tolist()
  outs_pos.extend(out_pos)

  out_int = torch.argmax(out_int, dim = -1)
  out_int = out_int.cpu().numpy().tolist()
  outs_int.extend(out_int)

  out_rel = torch.argmax(out_rel, dim = -1)
  out_rel = out_rel.cpu().numpy().tolist()
  outs_rel.extend(out_rel)

7339it [08:39, 14.13it/s]


## Post process

In [18]:
map_rev = {
    0: "<mask>",
    1: 'O',
    2: 'B-Date',
    3: 'I-Date',
    4: 'B-Dosage',
    5: 'I-Dosage',
    6: 'B-Drug',
    7: 'I-Drug',
    8: 'B-Duration',
    9: 'I-Duration',
    10: 'B-Form_form',
    11: 'I-Form_form',
    12: 'B-Form_mode',
    13: 'I-Form_mode',
    14: 'B-Frequency',
    15: 'I-Frequency',
    16: 'B-Route',
    17: 'I-Route',
    18: 'B-Strength_amount',
    19: 'I-Strength_amount',
    20: 'B-Strength_concentration',
    21: 'I-Strength_concentration',
    22: 'B-Strength_rate',
    23: 'I-Strength_rate',
    24: 'B-Timing',
    25: 'I-Timing',
    26: 'B-Wrong_patient',
    27: 'I-Wrong_patient',
}

map_int = {0: "<pad>", 1: "O", 2: "IA", 3: "IN", 4: "NA"}
map_rel = {0: "<pad>", 1: "O", 2: "1", 3: "2"}

In [20]:
for i, text in enumerate(texts):
  length = len(tokenizer.encode(text))
  outs_pos[i] = list(map(lambda x: map_rev[x], outs_pos[i][:length]))
  outs_int[i] = list(map(lambda x: map_int[x], outs_int[i][:length]))
  outs_rel[i] = list(map(lambda x: map_rel[x], outs_rel[i][:length]))

In [21]:
df = df[["id", "report", "tokenized_report", "IOB_report"]]
df["outs_pos"] = outs_pos
df["outs_int"] = outs_int
df["outs_rel"] = outs_rel
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,report,tokenized_report,IOB_report,outs_pos,outs_int,outs_rel
0,H00041ECB539DB2B1,sssにて入院。本日11時にhcuより転入。昼食後薬準備しpcを持って患者の元で内服薬確認し...,"['▁s', 'ss', 'にて', '入院', '。', '本', '日', '11', ...","['O', 'O', 'O', 'O', 'O', 'B-Date', 'I-Date', ...","[O, O, O, O, O, B-Date, I-Date, B-Timing, I-Ti...","[O, O, O, O, O, IA, IA, IA, IA, O, O, O, O, O,...","[O, O, O, O, O, 1, 1, 1, 1, O, O, O, O, O, 1, ..."
1,H000B05C61BD26562,11時10分、訪室し昼食後のベサノイドを渡した。11時50分、血液データによっては、ベサノイ...,"['▁11', '時', '10', '分', '、', '訪', '室', 'し', '昼...","['B-Timing', 'I-Timing', 'I-Timing', 'I-Timing...","[B-Timing, I-Timing, I-Timing, I-Timing, O, O,...","[IA, IA, IA, IA, O, O, O, O, IA, IA, NA, IA, I...","[1, 1, 1, 1, O, O, O, O, 1, 1, 1, 1, 1, 1, O, ..."
2,H001EE2CF377B2F30,調剤時にメバロチンとメジコン錠の調剤間違えをした。鑑査時発見され、患者の元には渡らなかった。,"['▁', '調', '剤', '時に', 'メ', 'バ', 'ロ', 'チン', 'と'...","['O', 'O', 'O', 'O', 'B-Drug', 'I-Drug', 'I-Dr...","[O, O, O, O, B-Drug, I-Drug, I-Drug, I-Drug, O...","[O, O, O, O, IN, IN, IN, IN, O, NA, NA, NA, IA...","[O, O, O, O, 1, 1, 1, 1, O, 1, 1, 1, 1, O, O, ..."
3,H00341976599F77E1,抗がん剤治療に定期的に入院している患者。病棟から抗腫瘍剤の調製可能の連絡が入り混合調剤を開始...,"['▁', '抗', 'がん', '剤', '治療', 'に', '定期的に', '入院',...","['O', 'B-Drug', 'I-Drug', 'I-Drug', 'O', 'O', ...","[O, B-Drug, I-Drug, I-Drug, O, O, O, O, O, O, ...","[O, IA, IA, IA, O, O, O, O, O, O, O, O, O, O, ...","[O, 1, 1, 1, O, O, O, O, O, O, O, O, O, O, 1, ..."
4,H004275492EEA4CDD,患者が入院後、病棟クラークが入院オリエンテーション時に身長・体重測定を実施し、測定した数値を...,"['▁', '患者', 'が', '入院', '後', '、', '病', '棟', 'クラ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...,...,...
58702,HFFCB8F9ED5BA8337,内服後、嚥下痛あり、腹部レントゲン、胃カメラ実施。タケプロンのシートが胃内から見つかった。,"['▁', '内', '服', '後', '、', '<unk>', '下', '痛', '...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, IA, IA, IA, O, O, O, O, O, O, O, O, O, O, ...","[O, 1, 1, O, O, O, O, O, O, O, O, O, O, O, O, ..."
58703,HFFD3518EEA063365,抗癌剤のランダ48mgを生食500mlに混合注射する際に、49mg(98ml)注入したところ...,"['▁', '抗', '癌', '剤', 'の', 'ラン', 'ダ', '48', 'mg...","['O', 'B-Drug', 'I-Drug', 'I-Drug', 'O', 'B-Dr...","[O, B-Drug, I-Drug, I-Drug, O, B-Drug, I-Drug,...","[O, IA, IA, IA, O, IA, IA, IN, IN, O, IA, IA, ...","[O, 1, 1, 1, O, 1, 1, 1, 1, O, 1, 1, 1, 1, O, ..."
58704,HFFD841D85493276D,当日から内服開始の抗凝固剤の与薬を忘れた。,"['▁当', '日から', '内', '服', '開始', 'の', '抗', '凝固', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-Drug', 'I-Dr...","[B-Date, I-Date, O, O, O, O, B-Drug, I-Drug, I...","[IA, IA, IA, IA, O, O, IA, IA, IA, O, O, O, O,...","[1, 1, 1, 1, O, O, 1, 1, 1, O, O, O, O, O, O]"
58705,HFFE65CB2AABF2EDC,化学療法センターでの勤務を行っていた。患者の点滴が更新となるため、点滴とカルテを照合し、指さ...,"['▁', '化学', '療法', 'センター', 'での', '勤務', 'を行っていた'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[O, O, O, O, O, O, O, O, O, B-Route, I-Drug, O...","[O, O, O, O, O, O, O, O, O, IA, IA, O, O, O, O...","[O, O, O, O, O, O, O, O, O, 1, 1, O, O, O, O, ..."


## Save prediction as excel file

In [22]:
df.to_excel(SAVE_PATH, index=False)