In [1]:
from openprompt.data_utils import InputExample
import torch
import pandas as pd
import os
import json, csv
from abc import ABC, abstractmethod
from collections import defaultdict, Counter
from typing import List, Dict, Callable

from openprompt.utils.logging import logger

from openprompt.data_utils.utils import InputExample
from openprompt.data_utils.data_processor import DataProcessor

import pandas as pd
import numpy as np
from tqdm import tqdm

from torchnlp.encoders import LabelEncoder

In [9]:
# top 50 icd 9 data

# set a local pc directory if not on alejos machines
local_pc = True
if local_pc:
    mimic_data_dir = "C://Users/ntaylor/Documents/GitHub/Neural_Networks/DPhil_NLP/mimic-icd9-classification/clinical-longformer/data/intermediary-data/top_50_icd9"
else:

    mimic_data_dir = "/home/niallt/NLP_DPhil/DPhil_projects/mimic-icd9-classification/clinical-longformer/data/intermediary-data/"
mimic_data = pd.read_csv(f"{mimic_data_dir}/train.csv")

In [10]:
mimic_data.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,1,: : : Sex: F Service: CARDIOTHORACIC Allergies...,4240
1,3,: : : Sex: F Service: NEONATOLOGY HISTORY: wee...,V3001
2,6,: : : Sex: M Service: CARDIOTHORACIC Allergies...,41041
3,7,: : : Sex: F Service: MEDICINE Allergies: Peni...,51881
4,8,: : : Sex: F Service: CARDIOTHORACIC Allergies...,3962


In [12]:
class MimicProcessor(DataProcessor):
    # TODO Test needed
    def __init__(self):
        super().__init__()
#         self.labels = ["contradiction", "entailment", "neutral"]
        

    def get_examples(self, data_dir, set = "train"):
        path = f"{data_dir}/{set}.csv"
        print(f"loading {set} data")
        print(f"data path provided was: {path}")
        examples = []
        df = pd.read_csv(path)
        self.label_encoder = LabelEncoder(np.unique(df.label).tolist(), reserved_labels = [])
        
        for idx, row in tqdm(df.iterrows()):
#             print(row)
            _, body, label = row
            label = self.label_encoder.encode(label)
#             print(f"body : {body}")
#             print(f"label: {label}")
#             print(f"labels original: {self.label_encoder.index_to_token[label]}")
            
            text_a = body.replace('\\', ' ')

            example = InputExample(
                guid=str(idx), text_a=text_a, label=int(label)-1)
            examples.append(example)
            
                
        return examples

In [13]:
# get different splits
train_data = MimicProcessor().get_examples(data_dir = f"{mimic_data_dir}", set = "train")
valid_data = MimicProcessor().get_examples(data_dir = f"{mimic_data_dir}", set = "valid")
test_data = MimicProcessor().get_examples(data_dir = f"{mimic_data_dir}", set = "test")

loading train data
data path provided was: C://Users/ntaylor/Documents/GitHub/Neural_Networks/DPhil_NLP/mimic-icd9-classification/clinical-longformer/data/intermediary-data/top_50_icd9/train.csv


14360it [00:00, 16621.83it/s]


loading valid data
data path provided was: C://Users/ntaylor/Documents/GitHub/Neural_Networks/DPhil_NLP/mimic-icd9-classification/clinical-longformer/data/intermediary-data/top_50_icd9/valid.csv


4693it [00:00, 12144.25it/s]


loading test data
data path provided was: C://Users/ntaylor/Documents/GitHub/Neural_Networks/DPhil_NLP/mimic-icd9-classification/clinical-longformer/data/intermediary-data/top_50_icd9/test.csv


4754it [00:00, 16130.62it/s]


In [14]:
test_data

[{
   "guid": "0",
   "label": 29,
   "meta": {},
   "text_a": ": : Service: CCU HISTORY OF PRESENT ILLNESS: The patient is a year old female with a history of congestive heart failure, coronary artery disease, reported myocardial infarction, transferred to from after being intubated there for respiratory distress. The patient had been admitted to four weeks ago and had reportedly ruled in for myocardial infarction there and refused further intervention. After discharge, the patient had tried treatment with sublingual Nitroglycerin but had recurrent and more frequent anginal episodes. The patient also had been discharged on Bumex but had skipped doses. In the past day prior to admission, the patient had increasing symptoms of congestive heart failure including paroxysmal nocturnal dyspnea as well as increasing angina not relieved by sublingual Nitroglycerin. The patient was taken to . Initial blood pressure there was /. The patient received mg of Morphine, mg intravenous Lasix, intrave

# adapt below to work with mimic data

In [None]:
# load pretrained language model (plm)


from openprompt.plms import load_plm

# plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")
plm, tokenizer, model_config, WrapperClass = load_plm("roberta", "roberta-large")

In [None]:
# set up templates - either manual, knowledgeable or soft
from openprompt.prompts import ManualTemplate
# mytemplate = ManualTemplate(tokenizer=tokenizer, text='{"placeholder":"text_a"} {"placeholder":"text_b"} In this sentence, the topic is {"mask"}.')
mytemplate = ManualTemplate(tokenizer=tokenizer).from_file("scripts/TextClassification/agnews/manual_template.txt", choice=0)


wrapped_example = mytemplate.wrap_one_example(dataset['train'][0]) 
print(wrapped_example)

In [None]:



from openprompt.prompts import ManualTemplate
# mytemplate = ManualTemplate(tokenizer=tokenizer, text='{"placeholder":"text_a"} {"placeholder":"text_b"} In this sentence, the topic is {"mask"}.')
mytemplate = ManualTemplate(tokenizer=tokenizer).from_file("scripts/TextClassification/agnews/manual_template.txt", choice=0)


wrapped_example = mytemplate.wrap_one_example(dataset['train'][0]) 
print(wrapped_example)

from openprompt import PromptDataLoader

train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer, 
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3, 
    batch_size=2,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="tail")
# next(iter(train_dataloader))

# ## Define the verbalizer
# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. Let's have a look at the verbalizer details:

from openprompt.prompts import SoftVerbalizer, ManualVerbalizer
import torch

# for example the verbalizer contains multiple label words in each class
# myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=4,
#          label_words=["politics", "sports", "business", "technology"])
# or without label words
# myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=4)

# or manual
myverbalizer = ManualVerbalizer(tokenizer, num_classes=4).from_file("scripts/TextClassification/agnews/manual_verbalizer.txt")



from openprompt import PromptForClassification

use_cuda = True
prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model=  prompt_model.cuda()

# ## below is standard training


from transformers import  AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()

no_decay = ['bias', 'LayerNorm.weight']

# it's always good practice to set no decay to biase and LayerNorm parameters
optimizer_grouped_parameters1 = [
    {'params': [p for n, p in prompt_model.plm.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.plm.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Using different optimizer for prompt parameters and model parameters

# optimizer_grouped_parameters2 = [
#     {'params': prompt_model.verbalizer.group_parameters_1, "lr":3e-5},
#     {'params': prompt_model.verbalizer.group_parameters_2, "lr":3e-4},
# ]


optimizer1 = AdamW(optimizer_grouped_parameters1, lr=3e-5)
# optimizer2 = AdamW(optimizer_grouped_parameters2)


for epoch in range(5):
    print(f"On epoch: {epoch}")
    tot_loss = 0 
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer1.step()
        optimizer1.zero_grad()
        # optimizer2.step()
        # optimizer2.zero_grad()
        print(tot_loss/(step+1))
    
# ## evaluate

# %%

print("running validation!")
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer, 
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3, 
    batch_size=2,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

prompt_model.eval()

allpreds = []
alllabels = []
with torch.no_grad():
    for step, inputs in enumerate(validation_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print("validation:",acc)


test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, tokenizer=tokenizer, 
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3, 
    batch_size=2,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")
allpreds = []
alllabels = []
with torch.no_grad():
    for step, inputs in enumerate(test_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print("test:", acc)  # roughly ~0.85