In [5]:
!pwd

/home/debo/bs/biomedical/notebooks


In [8]:
import sys
sys.path.append('../../evaluation/')

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from jinja2 import Template
from torch.utils.data import Dataset
from tqdm import tqdm

from evaluation.tasks.auto_task import AutoTask

device = torch.device("cuda:0")

model_name_or_path = 'gpt2'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    pad_token_id=tokenizer.eos_token,
)
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer))
model.to(device)


TEMPLATE = Template(
    """
    Given that: {{context}}
    {{question}}
    Answer:
    """
)

In [10]:
dataset_l = load_dataset("pubmed_qa", "pqa_labeled")
pp = dataset_l['train'].select(range(10))

Reusing dataset pubmed_qa (/home/debo/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d)


In [14]:
class PubMedQADatset(Dataset):
    def __init__(self, tokenizer):
        super().__init__()
        assert tokenizer.pad_token == tokenizer.eos_token
        dataset_l = load_dataset("pubmed_qa", "pqa_labeled")
        self.items = []
        for sample in dataset_l['train']:
                # Filter out samples in languages that are not used during training
                prompt = TEMPLATE.render(
                    question=sample["question"],
                    context = " ".join(sample["context"]["contexts"])
                )
                prompt = prompt.strip()  # Remove trailing white space and newline

                # Tokenize and construct this sample
                inputs = tokenizer(
                    prompt,
                    padding=True,
                    return_tensors="pt",
                    truncation=True
                )
                self.items.append(
                    {
                        "prompt": prompt,
                        "input_ids": inputs["input_ids"],
                        "attention_mask": inputs["attention_mask"],
                        "input_len": inputs["attention_mask"].shape[1],
                        "target_answer": sample["final_decision"],
                    }
                )
    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        return self.items[index]

In [15]:
dsd = PubMedQADatset(tokenizer)

Reusing dataset pubmed_qa (/home/debo/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d)


In [76]:
# dsd[8:16]

In [17]:
substring_matches = 0
for sample in tqdm(dsd[:10], desc=f"Evaluating pubmedQA"):
    output = model.generate(
        input_ids=sample["input_ids"].to(device),
        attention_mask=sample["attention_mask"].to(device),
        max_length=min(sample["input_len"] * 2, model.config.n_positions),
    )

    prompt_len = len(sample["prompt"])
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    predicted_answer = decoded_output[prompt_len:]

    target_answers = sample["target_answer"]
    substring_match = any([target_answer in predicted_answer.lower() for target_answer in target_answers])
    substring_matches += substring_match

Evaluating pubmedQA: 100%|██████████| 10/10 [00:51<00:00,  5.16s/it]


In [18]:
substring_matches

9