# Readme
This is a machine annotator for Japanese free text incident reports of medication errors. This pipeline has been tested to be stable and works best on Google Colab. The machine annotator utilizes CUDA, so please set the Runtime type to 'GPU'.

### SETUP:
(1) Please mount the shared files (including "wiki-ja.model", "config.json", "model_entity_220309.bin", and "model_3_220310_2.bin") accordingly into "TOKENIZER_MODEL", "BERT_CONIFG_FILE", "BERT_PRETRAINED_MODEL", "MODEL_SAVE_PATH".

(2) Next please set up input/output file paths:
  - "in_dir": the input file path, it should be a xlsx file with two columns: "id" and "report", saved in the "freetext" sheet. 
  - "out_dir": the output file path, it will generate the entity-level predicted output from the trained machine-annotator.

### RUN:
Please run the code chunk by chunk and you will find the entity-level annotation in "out_dir".

For any inquiries, please email to Dr Zoie SY Wong (zoiesywong@gmail.com)

## Installing and importing libraries

In [13]:
!pip install transformers
!pip install sentencepiece
!pip install demoji
!pip install neologdn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
#!/usr/bin/python
# -*- coding: US-ASCII -*-

import pandas as pd
import numpy as np
import sentencepiece as spm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertConfig, AdamW, BertModel
import re
import neologdn
from tqdm import tqdm


In [15]:
from google.colab import drive
drive.mount('/content/mnt')



Drive already mounted at /content/mnt; to attempt to forcibly remount, call drive.mount("/content/mnt", force_remount=True).


In [16]:
in_dir = '/content/mnt/MyDrive/program/in/freetext.xlsx'
out_dir = '/content/mnt/MyDrive/program/out/annotation.xlsx'
TOKENIZER_MODEL = '/content/mnt/MyDrive/program/tokenizer/wiki-ja.model'
BERT_CONIFG_FILE = '/content/mnt/MyDrive/program/entity_model/config.json'
BERT_PRETRAINED_MODEL = '/content/mnt/MyDrive/program/entity_model/model_entity_220309.bin'
MODEL_SAVE_PATH = '/content/mnt/MyDrive/program/trained_model/model_3_220310_2.bin'

## Preprocessing

In [17]:
URL = re.compile(
    r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]"
    + r"[a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\."
    + r"[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\."
    + r"[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\."
    + r"[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
)
MULTI_SPACE = re.compile(r"[\t ]+")
MULTI_LINE = re.compile(r"\n\s+")
LINE = re.compile(r"\n")

def clean_basic(text):
    text = text.lower()
    # remove urls
    text = URL.sub(" ", text)
    # replace multispaces to single
    text = MULTI_SPACE.sub(" ", text)
    text = MULTI_LINE.sub("\n", text)
    text = LINE.sub("?", text)
    # remove leading, trailing spaces
    text = text.strip()
    return text

## BERT Model

In [18]:
LEARNING_RATE = 1e-5
MAX_LEN = 256
BATCH_SIZE = 8
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 0
EPOCHS = 20
EVAL_STEP = 50
ACCUMULATION_STEP = 1
DROPOUT_RATE = 0.3
DEVICE = "cuda"
POS_NUM_LABELS = 28
INT_NUM_LABELS = 5
REL_NUM_LABELS = 4

mapping = {
    'PAD': 0,
    'O': 1,
    'B-Date': 2,
    'I-Date': 3,
    'B-Dosage': 4,
    'I-Dosage': 5,
    'B-Drug': 6,
    'I-Drug': 7,
    'B-Duration': 8,
    'I-Duration': 9,
    'B-Form_form': 10,
    'I-Form_form': 11,
    'B-Form_mode': 12,
    'I-Form_mode': 13,
    'B-Frequency': 14,
    'I-Frequency': 15,
    'B-Route': 16,
    'I-Route': 17,
    'B-Strength_amount': 18,
    'I-Strength_amount': 19,
    'B-Strength_concentration': 20,
    'I-Strength_concentration': 21,
    'B-Strength_rate': 22,
    'I-Strength_rate': 23,
    'B-Timing': 24,
    'I-Timing': 25,
    'B-Wrong_patient': 26,
    'I-Wrong_patient': 27,
}

ids_to_labels = dict(zip(list(mapping.values()), list(mapping.keys())))
ids_to_labels_int = {1: 'O', 2: 'IA', 3: 'IN', 4: 'NA', 0: 'PAD'}
ids_to_labels_rel = {1: 'O', 2: '1', 3: '2', 0: 'PAD'}

In [19]:
class IncidentDataset(Dataset):
    def __init__(self, texts, labels_pos, labels_int, labels_rel, tokenizer, max_len, eos_token, pad_token):

        self.tokens, self.mask, self.id, self.labels_pos, self.labels_int, self.labels_rel = self.process_text(texts,
                                                                                                               labels_pos,
                                                                                                               labels_int,
                                                                                                               labels_rel,
                                                                                                               tokenizer,
                                                                                                               max_len,
                                                                                                               eos_token,
                                                                                                               pad_token)

    def process_text(self, data, labels_pos, labels_int, labels_rel, tokenizer, max_len, eos_token, pad_token):
        tokenized_text = tokenizer.encode(data)

        for i, tokens in enumerate(tokenized_text):
            if len(tokens) >= max_len:
                tokens = tokens[:max_len]
                tokens[-1] = eos_token
            else:
                tokens.append(eos_token)
                n = max_len - len(tokens)
                paddings = [pad_token] * n
                tokens.extend(paddings)

            tokenized_text[i] = tokens

        attention_mask = np.ones((len(tokenized_text), max_len))

        attention_mask[np.array(tokenized_text) == 1] = 0
        token_type_ids = np.zeros((len(tokenized_text), max_len))

        for i, label in enumerate(labels_pos):
            if len(label) >= max_len:
                label = label[:max_len]
                label[-1] = 0
            else:
                # label.append(0)
                n = max_len - len(label)
                paddings = [0] * n
                label.extend(paddings)

            labels_pos[i] = label

        for i, label in enumerate(labels_int):
            if len(label) >= max_len:
                label = label[:max_len]
                label[-1] = 0
            else:
                # label.append(0)
                n = max_len - len(label)
                paddings = [0] * n
                label.extend(paddings)

            labels_int[i] = label

        for i, label in enumerate(labels_rel):
            if len(label) >= max_len:
                label = label[:max_len]
                label[-1] = 0
            else:
                # label.append(0)
                n = max_len - len(label)
                paddings = [0] * n
                label.extend(paddings)

            labels_rel[i] = label

        return tokenized_text, attention_mask, token_type_ids, labels_pos, labels_int, labels_rel

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, item):

        return {
            "input_ids": torch.tensor(self.tokens[item], dtype=torch.long),
            "attention_mask": torch.tensor(self.mask[item], dtype=torch.long),
            "token_type_ids": torch.tensor(self.id[item], dtype=torch.long),
            "label_pos": torch.tensor(self.labels_pos[item], dtype=torch.long),
            "label_int": torch.tensor(self.labels_int[item], dtype=torch.long),
            "label_rel": torch.tensor(self.labels_rel[item], dtype=torch.long)
        }


class IncidentBert(nn.Module):
    def __init__(self, BERT_CONIFG_FILE, reinit_n_layers=0):
        super().__init__()
        config = BertConfig.from_pretrained(BERT_CONIFG_FILE)
        self.bert = BertModel(config)

        self.dropout = nn.Dropout(DROPOUT_RATE)
        self.fc = nn.Linear(768, POS_NUM_LABELS)

        self._init_weights(self.fc)

        self.reinit_n_layers = reinit_n_layers
        if reinit_n_layers > 0:
            self._do_reinit()

    def _do_reinit(self):
        for n in range(self.reinit_n_layers):
            self.bert.encoder.layer[-(n+1)].apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.bert.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask, token_type_ids):

        bert_out = self.bert(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        output = bert_out.last_hidden_state

        output = self.dropout(output)
        output = self.fc(output)

        return output


class IncidentModel(nn.Module):
    def __init__(self, BERT_CONIFG_FILE, BERT_PRETRAINED_MODEL, reinit_n_layers=0):
        super().__init__()
        self.bert = IncidentBert(BERT_CONIFG_FILE)
        print("Loading weights")
        self.bert.load_state_dict(torch.load(BERT_PRETRAINED_MODEL))
        print("Loaded weights")

        self.fc_int = nn.Linear(POS_NUM_LABELS, INT_NUM_LABELS)
        self.fc_rel = nn.Linear(POS_NUM_LABELS, REL_NUM_LABELS)

    def forward(self, input_ids, attention_mask, token_type_ids):

        bert_out = self.bert(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        output_int = self.fc_int(bert_out)
        output_rel = self.fc_rel(bert_out)

        return bert_out, output_int, output_rel


def loss_fn_pos(output, target, mask):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, POS_NUM_LABELS)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss


def loss_fn_int(output, target, mask):
    weights = [0.0001, 0.001, 0.6, 0.9, 0.9]
    class_weights = torch.FloatTensor(weights).cuda()
    lfn = nn.CrossEntropyLoss(weight=class_weights)
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, INT_NUM_LABELS)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss


def loss_fn_rel(output, target, mask):
    weights = [0.0001, 0.001, 0.1, 0.9]
    class_weights = torch.FloatTensor(weights).cuda()
    lfn = nn.CrossEntropyLoss(weight=class_weights)
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, REL_NUM_LABELS)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss


def create_optimizer(model):
    named_parameters = list(model.named_parameters())

    parameters = []

    for layer_num, (name, params) in enumerate(named_parameters):
        weight_decay = 0.0 if "bias" in name else WEIGHT_DECAY

        lr = LEARNING_RATE

        if layer_num >= 69:
            lr = LEARNING_RATE * 2

        if layer_num >= 133:
            lr = LEARNING_RATE * 5

        if layer_num >= 192:
            lr = LEARNING_RATE * 10

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)

## Predict from AI model


In [20]:
def report_predict(in_dir, out_dir, TOKENIZER_MODEL, BERT_CONIFG_FILE, BERT_PRETRAINED_MODEL, MODEL_SAVE_PATH):
    tokenizer = spm.SentencePieceProcessor(TOKENIZER_MODEL)
    model = IncidentModel(BERT_CONIFG_FILE, BERT_PRETRAINED_MODEL,)
    model.load_state_dict(torch.load(MODEL_SAVE_PATH))
    model.to(DEVICE)

    newdf = pd.read_excel(in_dir, sheet_name='freetext')

    newdf["report"] = newdf["report"].apply(lambda x: neologdn.normalize(x))
    text = newdf.report.tolist()
    cleaned_text = [clean_basic(i) for i in text]
    newdf.report = cleaned_text

    texts = newdf.report.apply(str).tolist()
    tokenized_texts = []
    labels_pos = []
    labels_int = []
    labels_rel = []

    for i in range(len(newdf)):
        tokenized_text = tokenizer.encode(newdf.report[i])
        t = len(tokenized_text)
        tokenized_texts.append(tokenized_text)
        labels_pos.append([1]*t)
        labels_int.append([1]*t)
        labels_rel.append([1]*t)

    texts = np.array(texts)
    labels_pos = np.array(labels_pos)
    labels_int = np.array(labels_int)
    labels_rel = np.array(labels_rel)

    dataset = IncidentDataset(texts.tolist(), labels_pos.tolist(), labels_int.tolist(), labels_rel.tolist(), tokenizer, MAX_LEN, 2, 1)
    dataloader = DataLoader(dataset, batch_size=1, drop_last=False, shuffle=False)
    model.eval()

    outs_pos, outs_int, outs_rel = [], [], []
    for i, batch in enumerate(dataloader):
        ids = batch["input_ids"].to("cuda")
        mask = batch["attention_mask"].to("cuda")
        types = batch["token_type_ids"].to("cuda")

        with torch.no_grad():
            out_pos, out_int, out_rel = model(input_ids=ids, attention_mask=mask, token_type_ids=types)

        out_pos = torch.argmax(out_pos, dim=-1)
        out_int = torch.argmax(out_int, dim=-1)
        out_rel = torch.argmax(out_rel, dim=-1)

        for s in range(out_pos.shape[0]):
            pos = torch.masked_select(out_pos[s], mask[s] > 0)
            pos = pos.cpu().numpy().tolist()[:-1]
            pos = [ids_to_labels[id] for id in pos]
            outs_pos.append(pos)

            intention = torch.masked_select(out_int[s], mask[s] > 0)
            intention = intention.cpu().numpy().tolist()[:-1]
            intention = [ids_to_labels_int[id] for id in intention]
            outs_int.append(intention)

            rel = torch.masked_select(out_rel[s], mask[s] > 0)
            rel = rel.cpu().numpy().tolist()[:-1]
            rel = [ids_to_labels_rel[id] for id in rel]
            outs_rel.append(rel)

    tokenized_texts=[]
    for i in range(len(texts.tolist())):
        tokenized_texts.append(tokenizer.encode(texts.tolist()[i], out_type=str))
    ss = pd.DataFrame({
        "report":texts.tolist(),
        "tokenized_report": tokenized_texts,
        "outs_pos": outs_pos,
        "outs_int": outs_int,
        "outs_rel": outs_rel})

    ss['id'] = newdf.id
    ss.to_excel(out_dir,index=False)

    return out_dir

## Post-processing - incident type prediction

In [21]:
def drug_incidents(data):
    drug_entities = data[data["entity"] == "Drug"]
    intentions = data["label"].values.tolist()
    if "IN" in intentions:
        if "NA" in intentions:
            return ["Wrong Drug"]
        else:
            return ["Drug Omission"]
    if "NA" in intentions:
        return ["Extra Drug"]

    return []


def other_incidents(data, entity_type):
    entities = data[((data["entity"] == entity_type) & ((data["label"] == "IN") | (data["label"] == "NA")))]
    if entities.shape[0] > 0:
        return ["Wrong " + entity_type]
    else:
        return []


def find_incidents(data):
    incidents = []
    # incidents += drug_incidents(data)
    for i in [
        "Strength_amount",
        "Strength_rate",
        "Strength_concentration",
        "Dosage",
        "Form_form",
        "Form_mode",
        "Route",
        "Date",
        "Duration",
        "Timing",
        "Frequency",
        "Drug"
    ]:
        incidents += other_incidents(data, i)

    if len(incidents) == 0:
        return "Others"
    else:
        return ",".join(incidents)


def entity_results(in_dir, out_dir):
    # path to the file/output of model
    ss = pd.read_excel(in_dir, index_col=0)
    ss = ss.reset_index()

    # convert list input to string in dataframe
    ss.outs_pos = ss.outs_pos.apply(eval)
    ss.outs_int = ss.outs_int.apply(eval)
    ss.outs_rel = ss.outs_rel.apply(eval)
    ss.tokenized_report = ss.tokenized_report.apply(eval)

    df_total = pd.DataFrame()
    for idx, row in tqdm(ss.loc[:].iterrows()):

        ans = []
        for idx2, ele in enumerate(row.outs_pos):

            mini = []
            if ele.startswith("B"):
                count = idx2
                while count <= len(row.outs_pos) - 2:
                    mini.append(count)
                    count += 1
                    if row.outs_pos[count] == "O" or row.outs_pos[count].startswith("B"):
                        break
            if mini != []:
                ans.append(mini)
        df_sub = pd.DataFrame(
            columns=["id", "reports", "entity_name", "start_idx", "end_idx", "entity", "label", "rel_index"])
        final_ans = []
        for a in ans:
            entity = "".join(row.tokenized_report[a[0]:a[-1] + 1])

            ent_e = row.outs_pos[a[0]:a[-1] + 1]
            ent_e = [ele.replace("I-", "") for ele in ent_e]
            ent_e = [ele.replace("B-", "") for ele in ent_e]
            ent_e = list(set(ent_e))

            for ele in reversed(ent_e):
                if ele in ["O", "<pad>"]:
                    ent_e.remove(ele)
            if len(ent_e) > 0:
                ent_ele = ent_e[0]
            else:
                ent_ele = None

            int_f = row.outs_int[a[0]:a[-1] + 1]
            int_f = list(set(int_f))
            for ele in reversed(int_f):
                if ele in ["O", "<pad>"]:
                    int_f.remove(ele)
            if len(int_f) > 0:
                int_fac_ele = int_f[0]
            else:
                int_fac_ele = None

            rel_a = row.outs_rel[a[0]:a[-1] + 1]

            rel_a = list(set(rel_a))
            for ele in reversed(rel_a):
                if ele in ["O", "<pad>"]:
                    rel_a.remove(ele)
            if len(rel_a) > 0:
                rel_a_ele = rel_a[0]
            else:
                rel_a_ele = None
            if len(row.tokenized_report) > 1:
                test = "".join(list(row.tokenized_report[0].replace("?", "")) + list(row.tokenized_report[1:a[0]]))
            else:
                print("error_occured")

            # start_idx = len(test)+1
            start_idx = len(test)
            end_idx = start_idx + len(entity)
            new_row = [row.id, row.report, entity, start_idx, end_idx, ent_ele, int_fac_ele, rel_a_ele]
            df_sub.loc[df_sub.shape[0]] = new_row
        df_total = pd.concat([df_total, df_sub])

    df_total.to_excel(out_dir, index=False)

    df_total = pd.read_excel(out_dir, engine='openpyxl', keep_default_na=False)
    output_df = pd.DataFrame(columns=list(df_total.columns) + ["incident_types"])
    groups = df_total.groupby(["id", "reports"]).groups

    for group in groups.items():
        report_id = group[0][0]
        entities = df_total.iloc[list(group[1])].sort_values(by='start_idx')
        entities["incident_types"] = find_incidents(entities)
        output_df = output_df.append(entities, ignore_index=True)

    output_df.to_excel(out_dir)

    return out_dir



## Output annotations

In [22]:
sent_out_dir = out_dir.replace('.xlsx','-predict.xlsx')
sent_out_dir = report_predict(in_dir, sent_out_dir,TOKENIZER_MODEL, BERT_CONIFG_FILE, BERT_PRETRAINED_MODEL, MODEL_SAVE_PATH)
out_dir = entity_results(in_dir = sent_out_dir, out_dir=out_dir)
print('Entity Level Output from AI Model:', out_dir)


Loading weights
Loaded weights


  labels_pos = np.array(labels_pos)
  labels_int = np.array(labels_int)
  labels_rel = np.array(labels_rel)
4it [00:00, 79.75it/s]
  output_df = output_df.append(entities, ignore_index=True)
  output_df = output_df.append(entities, ignore_index=True)
  output_df = output_df.append(entities, ignore_index=True)


Entity Level Output from AI Model: /content/mnt/MyDrive/program/out/annotation.xlsx
