In [None]:
# Show the GPU attached to this runtime (model, memory, driver and CUDA versions).
!nvidia-smi

Mon Sep 18 04:55:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    43W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers
!pip install pytorch_lightning
!pip install sentencepiece datasets seqeval

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
Inst

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
import csv
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    get_linear_schedule_with_warmup,
    Adafactor
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer

In [None]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch RNGs, plus every CUDA device when one is available."""
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
# Mount Google Drive; the pretrained checkpoint and the TSV data are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Model

Most of the code here is adapted from [this notebook](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb), which uses the PyTorch Lightning framework for training neural networks. T5 has been shown to achieve state-of-the-art results on many tasks, as long as the task can be cast as a text-to-text problem.

In [None]:
class T5FineTuner(pl.LightningModule):
    """Lightning wrapper that fine-tunes a pretrained seq2seq checkpoint (Clinical-T5-Large by default).

    The module owns the model, its tokenizer, the optimizer and the train/validation
    dataloaders, so ``trainer.fit(model)`` needs no extra arguments.

    Args:
        hparam: attribute-style hyperparameter container (an ``argparse.Namespace`` in this
            notebook). The class reads ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``train_batch_size``, ``eval_batch_size``, ``n_gpu``,
            ``gradient_accumulation_steps``, ``num_train_epochs`` and ``warmup_steps``.
            ``model_name_or_path`` and ``tokenizer_name_or_path`` are optional and fall back
            to ``DEFAULT_PRETRAINED_PATH``.
    """

    # Checkpoint used when `hparam` does not name a model/tokenizer path.
    DEFAULT_PRETRAINED_PATH = "/content/drive/MyDrive/clinical-t5-large-language-models-built-using-mimic-clinical-text-1.0.0/Clinical-T5-Large"

    def __init__(self, hparam):
        super(T5FineTuner, self).__init__()
        self.hparam = hparam
        # Honour the paths carried by the hyperparameters instead of ignoring them;
        # the tokenizer defaults to the model's own directory.
        model_path = getattr(hparam, "model_name_or_path", None) or self.DEFAULT_PRETRAINED_PATH
        tokenizer_path = getattr(hparam, "tokenizer_name_or_path", None) or model_path
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.save_hyperparameters()
        # Per-step losses collected during an epoch; averaged and cleared in the epoch-end hooks.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []

    def is_logger(self):
        """Return True; LoggingCallback checks this before reporting metrics."""
        return True

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        """Run the wrapped model; `lm_labels` is forwarded as the model's `labels` argument."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch of `source_ids`/`source_mask`/`target_ids`/`target_mask`."""
        # Work on a copy so the caller's batch keeps its original pad ids.
        lm_labels = batch["target_ids"].clone()
        # Padding positions are replaced by -100 before being used as labels.
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        # The first element of the model output is taken as the loss.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._step(batch)
        # Store a detached copy: the epoch average only needs the value, not the autograd graph.
        self.training_step_outputs.append(loss.detach())
        self.log("train_loss", loss, on_epoch=True, prog_bar=True)
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def on_train_epoch_end(self):
        """Log the mean of the training losses collected this epoch, then reset the buffer."""
        epoch_average = torch.stack(self.training_step_outputs).mean()
        self.log("training_epoch_average", epoch_average, on_epoch=True, prog_bar=True)
        self.training_step_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._step(batch)
        self.validation_step_outputs.append(loss.detach())
        self.log("val_loss", loss, on_epoch=True, prog_bar=True)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        """Log the mean of the validation losses collected this epoch, then reset the buffer."""
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    def configure_optimizers(self):
        """Build the AdamW optimizer; weight decay is skipped for bias and LayerNorm weights.

        NOTE(review): only the optimizer is returned. The linear schedule created in
        `train_dataloader` is stored on `self.lr_scheduler` but nothing in this class steps
        it, so the learning rate appears to stay constant. Confirm whether that is intended
        before wiring the scheduler in, since doing so changes the training run.
        """
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparam.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                         lr=self.hparam.learning_rate, eps=self.hparam.adam_epsilon)
        # Kept on the instance because train_dataloader builds the scheduler from it.
        self.opt = optimizer
        return [optimizer]


    def get_tqdm_dict(self):
        """Return the running loss and the last learning rate as a progress-bar dict.

        NOTE(review): reads `self.trainer.avg_loss`, an attribute that may not exist in the
        installed Lightning version; nothing in this notebook calls this method.
        """
        tqdm_dict = {"loss": "{:.3f}".format(
            self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the shuffled training DataLoader and create the linear warmup/decay schedule."""
        train_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="train", args=self.hparam)
        dataloader = DataLoader(train_dataset, batch_size=self.hparam.train_batch_size,
                                drop_last=True, shuffle=True, num_workers=2)
        # Total optimizer steps: batches per epoch, divided by the accumulation factor,
        # times the number of epochs.
        t_total = (
            (len(dataloader.dataset) //
             (self.hparam.train_batch_size * max(1, self.hparam.n_gpu)))
            // self.hparam.gradient_accumulation_steps
            * float(self.hparam.num_train_epochs)
        )
        # Requires self.opt, i.e. configure_optimizers must already have run.
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparam.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the (unshuffled) validation DataLoader."""
        val_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="validation", args=self.hparam)
        return DataLoader(val_dataset, batch_size=self.hparam.eval_batch_size, num_workers=2)

In [None]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
from sklearn.metrics import f1_score

class LoggingCallback(pl.Callback):
  """Lightning callback that reports `trainer.callback_metrics` after validation and test runs."""

  # Metric keys that are never reported.
  _SKIPPED_KEYS = ("log", "progress_bar")

  def _metric_lines(self, trainer):
    """Return one `key = value` line (newline-terminated) per reportable metric, sorted by key."""
    metrics = trainer.callback_metrics
    return [
        "{} = {}\n".format(key, str(metrics[key]))
        for key in sorted(metrics)
        if key not in self._SKIPPED_KEYS
    ]

  def on_validation_end(self, trainer, pl_module):
    """Log every callback metric once a validation run finishes."""
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      for line in self._metric_lines(trainer):
        logger.info(line)

  def on_test_end(self, trainer, pl_module):
    """Log every callback metric and also write them to `<output_dir>/test_results.txt`."""
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      output_dir = pl_module.hparams.output_dir
      # Create the directory first so a fresh runtime does not fail on open().
      os.makedirs(output_dir, exist_ok=True)
      output_test_results_file = os.path.join(output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for line in self._metric_lines(trainer):
          logger.info(line)
          writer.write(line)

In [None]:
# Hyperparameters for fine-tuning, collected in one place.
# Change log kept from earlier experiments:
#   - seq_length was reduced from 256 to 128; LR was changed from 3e-8 to 1e-8; best LR found: 3e-4.
#   - 30 Jul: adam_epsilon was changed from 1e-8 to 1e-6 and batch sizes from 8 to 16.
# NOTE(review): the values below use adam_epsilon=1e-8 and batch size 8, which does not match
# the 30 Jul entry -- confirm which settings produced the reported results.

args_dict = dict(
    data_dir="/content/", # path for data files
    output_dir="/content/drive/MyDrive/biocreative/checkpoints", # path to save the checkpoints
    model_name_or_path="/content/drive/MyDrive/clinical-t5-large-language-models-built-using-mimic-clinical-text-1.0.0/Clinical-T5-Large",
    tokenizer_name_or_path="/content/drive/MyDrive/clinical-t5-large-language-models-built-using-mimic-clinical-text-1.0.0/Clinical-T5-Large",
    max_seq_length=128,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=20,
    gradient_accumulation_steps=16, # 8 samples per batch x 16 accumulated batches = 128 samples per optimizer step
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

# Dataset

In [None]:
class Seq2SeqDataset(Dataset):
  """Map-style dataset of tokenized (Text, Spans) pairs for seq2seq fine-tuning.

  Args:
    tokenizer: object exposing `batch_encode_plus`; its `max_length` and
      `model_max_length` attributes are overwritten with `max_len`.
    dataset: sequence of mappings with a "Text" (input) and a "Spans" (target) string.
    max_len: length every input and target is padded/truncated to.

  Each item is a dict with `source_ids`, `source_mask`, `target_ids` and `target_mask`,
  each a 1-D tensor obtained by dropping the leading batch axis of the encoding.
  """

  def __init__(self, tokenizer, dataset, max_len=128):
    self.data = dataset
    self.max_len = max_len
    self.tokenizer = tokenizer
    # Side effect on the shared tokenizer object: later calls see this length too.
    self.tokenizer.max_length = max_len
    self.tokenizer.model_max_length = max_len
    self.inputs = []
    self.targets = []

    self._build()

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    source = self.inputs[index]
    target = self.targets[index]
    # squeeze(0) removes only the batch axis; a bare squeeze() would also collapse
    # the sequence axis when max_len == 1.
    return {
        "source_ids": source["input_ids"].squeeze(0),
        "source_mask": source["attention_mask"].squeeze(0),
        "target_ids": target["input_ids"].squeeze(0),
        "target_mask": target["attention_mask"].squeeze(0),
    }

  def _build(self):
    """Lower-case and tokenize every row once, storing the encodings for __getitem__."""
    for row in self.data:
      input_ = row["Text"].lower()
      target = row["Spans"].lower()

      # Inputs and targets are encoded separately, both padded to max_len.
      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
      )
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
      )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [None]:
# Load the Clinical-T5-Large tokenizer from Drive
# (per the original note, this path replaced the "t5-small" tokenizer).
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/clinical-t5-large-language-models-built-using-mimic-clinical-text-1.0.0/Clinical-T5-Large")

def csv_to_list(csv_file, output_file=None):
    """Read a tab-separated file with a header row into a list of per-row dicts.

    Args:
        csv_file: path of the TSV file to read (decoded as UTF-8).
        output_file: optional path; when given, the rows are written back out as TSV
            with the same columns.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    # newline='' lets the csv module handle line endings itself, as its documentation recommends.
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='\t')
        data_list = list(reader)
        header = reader.fieldnames or []

    if output_file:
        # Column names come from the first row when there is one; for a header-only file
        # fall back to the parsed header instead of failing on data_list[0].
        keys = data_list[0].keys() if data_list else header
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=keys, delimiter='\t')
            writer.writeheader()
            writer.writerows(data_list)
    return data_list
# Load the training split (a copy is written to /content/Training.tsv) and tokenize it
# so the following cells can inspect individual examples.
dataset =[]  # placeholder; overwritten by the next line
dataset = csv_to_list("/content/drive/MyDrive/biocreative/Preprocessed/Final_Training.tsv", output_file="/content/Training.tsv")

input_dataset = Seq2SeqDataset(tokenizer=tokenizer, dataset=dataset)
print(tokenizer)

Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


T5TokenizerFast(name_or_path='/content/drive/MyDrive/clinical-t5-large-language-models-built-using-mimic-clinical-text-1.0.0/Clinical-T5-Large', vocab_size=32000, model_max_length=128, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra

In [None]:
# Fetch every item once to confirm __getitem__ succeeds across the whole dataset.
for i in range(len(input_dataset)):
    _ = input_dataset[i]

In [None]:
# Decode the first example back to text, keeping special tokens visible (skip_special_tokens=False).
data = input_dataset[0]

print(tokenizer.decode(data["source_ids"], skip_special_tokens=False))
print(tokenizer.decode(data["target_ids"], skip_special_tokens=False))

mouth: high arched palate, superior lingular frenulum attaches low on gumline</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
keyf: high arched palate</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [None]:
# Create a local t5_ner directory; -p makes this a no-op when it already exists.
!mkdir -p t5_ner

In [None]:
# Weights & Biases logger for the 'Pheno' project (the stray extra ')' was a syntax error).
wandb_logger = WandbLogger(project='Pheno', log_model='all')

[34m[1mwandb[0m: Currently logged in as: [33malhassan-areej[0m ([33mareej[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Expose the hyperparameter dict through attribute access, then build the Lightning module.
args = argparse.Namespace(**args_dict)
model = T5FineTuner(args)

In [None]:
# Keep the five checkpoints with the lowest val_loss under args.output_dir.
# `dirpath` carries the directory; `filename` is only the name template (Lightning adds ".ckpt").
# NOTE: this callback is not listed in train_params["callbacks"], so it is currently inactive
# and the checkpoint reloaded later in the notebook comes from the default lightning_logs directory.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    filename="checkpoint-{epoch:02d}-{val_loss:.4f}",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)

# Keyword arguments for pl.Trainer. Arguments from the original tutorial that are not
# passed here are left commented out for reference.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    #gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    #amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    #checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
    #logger=wandb_logger
)

In [None]:
def get_dataset(tokenizer, type_path, args):
    """Load one data split from Drive and return it as a tokenized Seq2SeqDataset.

    Args:
        tokenizer: tokenizer handed to Seq2SeqDataset; its `max_length` and
            `model_max_length` are set to `args.max_seq_length`.
        type_path: "train" or "validation".
        args: object with a `max_seq_length` attribute.

    Raises:
        ValueError: if `type_path` is not one of the known splits.
    """
    # split name -> (source TSV on Drive, local copy written by csv_to_list)
    split_files = {
        "train": ("/content/drive/MyDrive/biocreative/Preprocessed/Final_Training.tsv", "/content/Training.tsv"),
        "validation": ("/content/drive/MyDrive/biocreative/Preprocessed/Final_Validation.tsv", "/content/Validation.tsv"),
    }
    if type_path not in split_files:
        raise ValueError(
            "Unknown type_path {!r}; expected one of {}".format(type_path, sorted(split_files))
        )

    tokenizer.max_length = args.max_seq_length
    tokenizer.model_max_length = args.max_seq_length
    source_file, local_copy = split_files[type_path]
    dataset = csv_to_list(source_file, output_file=local_copy)
    # Forward the configured length so the dataset does not silently fall back to its default.
    return Seq2SeqDataset(tokenizer=tokenizer, dataset=dataset, max_len=args.max_seq_length)

In [None]:
# Build the Trainer from the keyword arguments collected in train_params.
trainer = pl.Trainer(**train_params)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Run training; the module provides its own train and validation dataloaders.
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 737 M 
-----------------------------------------------------
737 M     Trainable params
0         Non-trainable params
737 M     Total params
2,950.558 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:__main__:***** Validation results *****
INFO:__main__:val_loss = tensor(6.1752, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(6.1752, device='cuda:0')

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(1.8073, device='cuda:0')

INFO:__main__:train_loss_step = tensor(1.8073, device='cuda:0')

INFO:__main__:val_loss = tensor(1.9034, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(1.9056, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(1.4889, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(3.2284, device='cuda:0')

INFO:__main__:train_loss_step = tensor(1.4889, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(3.2284, device='cuda:0')

INFO:__main__:val_loss = tensor(1.2109, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(1.2123, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.2253, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(1.7083, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.2253, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(1.7083, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4983, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.5001, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.3900, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.8120, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.3900, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.8120, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4431, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4445, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.1554, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.3843, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.1554, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.3843, device='cuda:0')

INFO:__main__:val_loss = tensor(0.3827, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.3836, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.1927, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.2621, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.1927, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.2621, device='cuda:0')

INFO:__main__:val_loss = tensor(0.3564, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.3577, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.1599, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.1792, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.1599, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.1792, device='cuda:0')

INFO:__main__:val_loss = tensor(0.3919, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.3922, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.1464, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.1291, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.1464, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.1291, device='cuda:0')

INFO:__main__:val_loss = tensor(0.3963, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.3966, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0637, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.1141, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0637, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.1141, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4132, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4133, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0924, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0970, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0924, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0970, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4316, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4318, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0409, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0875, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0409, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0875, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4487, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4482, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.1853, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0792, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.1853, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0792, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4435, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4434, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0179, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0743, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0179, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0743, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4687, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4683, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0358, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0671, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0358, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0671, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4547, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4543, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0814, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0623, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0814, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0623, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4644, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.4641, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0580, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0602, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0580, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0602, device='cuda:0')

INFO:__main__:val_loss = tensor(0.5264, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.5264, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0673, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0536, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0673, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0536, device='cuda:0')

INFO:__main__:val_loss = tensor(0.5097, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.5095, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0460, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0502, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0460, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0502, device='cuda:0')

INFO:__main__:val_loss = tensor(0.5161, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.5167, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0153, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0422, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0153, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0422, device='cuda:0')

INFO:__main__:val_loss = tensor(0.5149, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.5150, device='cuda:0')



Validation: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:train_loss = tensor(0.0329, device='cuda:0')

INFO:__main__:train_loss_epoch = tensor(0.0393, device='cuda:0')

INFO:__main__:train_loss_step = tensor(0.0329, device='cuda:0')

INFO:__main__:training_epoch_average = tensor(0.0393, device='cuda:0')

INFO:__main__:val_loss = tensor(0.5239, device='cuda:0')

INFO:__main__:validation_epoch_average = tensor(0.5240, device='cuda:0')

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


## Load the Stored Model and Evaluate

In [None]:
# load_from_checkpoint is a classmethod: call it on the class rather than on the trained instance.
model = T5FineTuner.load_from_checkpoint("/content/lightning_logs/version_0/checkpoints/epoch=19-step=280.ckpt")

In [None]:
import textwrap
# Re-tokenize the validation split and generate a prediction for every example.
Val_dataset = csv_to_list("/content/drive/MyDrive/biocreative/Preprocessed/Final_Validation.tsv", output_file="/content/Validation.tsv")
Val_dataset = Seq2SeqDataset(tokenizer=tokenizer, dataset=Val_dataset)
# No shuffling here: predictions are later zipped with the TSV rows, so the order must match the file.
dataloader = DataLoader(Val_dataset, batch_size=32, num_workers=2)
model.model.eval()
model = model.to("cpu")
outputs = []
targets = []
texts = []
for batch in dataloader:

    outs = model.model.generate(input_ids=batch['source_ids'],
                                attention_mask=batch['source_mask'],max_new_tokens=128)
    dec = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip() for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    text = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    texts.extend(text)
    outputs.extend(dec)
    targets.extend(target)

# Show up to ten examples; min() avoids an IndexError on splits with fewer than ten rows.
for i in range(min(10, len(texts))):
    c = texts[i]
    lines = textwrap.wrap("text:\n%s\n" % c, width=100)
    print("\n".join(lines))
    print("\nActual Entities: %s" % targets[i])
    print("Predicted Entities: %s" % outputs[i])
    print("=====================================================================\n")

text: mouth: mildly high arched palate. normal lips and tongue.

Actual Entities: keyf: high arched palate; normf: normal tongue; normf: normal lips
Predicted Entities: keyf: high arched palate; normf: normal lips; normf: normal tongue

text: eyes: prominent infraorbital creases.

Actual Entities: keyf: prominent infraorbital creases
Predicted Entities: keyf: infraorbital creases

text: eyes: mild epicanthus, mild up-slant

Actual Entities: keyf: epicanthus; keyf: eyes: mild up-slant
Predicted Entities: keyf: epicanthus; keyf: eyes: up-slant

text: neurologic: cooperative with exam and follows all instructions in both english and spanish.

Actual Entities: na
Predicted Entities: na

text: eyes: bluish hue to sclerae.

Actual Entities: keyf: bluish hue to sclerae
Predicted Entities: keyf: bluish hue to sclerae

text: ears: somewhat crumped appearance of the ears

Actual Entities: keyf: ears: crumped
Predicted Entities: keyf: crumped appearance of the ears

text: hands feet: pes planus



In [None]:
# Exact match: fraction of predictions that are string-identical to their reference.
total_examples = len(outputs)
exact_match_count = sum(predicted == reference for predicted, reference in zip(outputs, targets))

# Report 0.0 instead of dividing by zero when there are no predictions.
exact_match_score = exact_match_count / total_examples if total_examples else 0.0

print("Exact Match (EM) Score:", exact_match_score)

Exact Match (EM) Score: 0.552863436123348


In [None]:
# Read the first two columns (observation id, text) of every validation row, skipping the header.
with open("/content/drive/MyDrive/biocreative/Preprocessed/Final_Validation.tsv", 'r', encoding='utf-8') as tsv_file:
    validation_rows = csv.reader(tsv_file, delimiter='\t')
    next(validation_rows)
    id_text_pairs = [(fields[0], fields[1]) for fields in validation_rows]
Obs = [pair[0] for pair in id_text_pairs]
Txt = [pair[1] for pair in id_text_pairs]

# One record per example: id, text, prediction, reference.
data = [record for record in zip(Obs, Txt, outputs, targets)]

# Destination of the results table
file_path = "ValResults.tsv"

# Header row first, then every record
with open(file_path, "w", newline="", encoding="utf-8") as tsvfile:
    results_writer = csv.writer(tsvfile, delimiter="\t")
    results_writer.writerow(["ObservationID","Text", "Output", "Target"])
    results_writer.writerows(data)

In [None]:
def find_sub_list(sl, l):
    results = []
    sll = len(sl)
    for ind in (i for i, e in enumerate(l) if e == sl[0]):
        if l[ind:ind+sll] == sl:
            results.append((ind, ind+sll-1))
    return results

def generate_label(input: str, target: str) -> list:
    """Convert an entity string into token-level BIO tags for ``input``.

    ``input`` is tokenised on single spaces. ``target`` is split on ``"; "``
    into entities, and each entity is split on ``": "`` into an entity type
    and a span text (e.g. ``"keyf: high anterior hairline"``). The span text is
    tokenised on single spaces and searched for verbatim in the input tokens.

    An entity is skipped (contributes no tags) when:
      * it contains no ``": "`` separator (e.g. ``"na"``),
      * its type is not one of the known types (NORMF, KEYF, case-insensitive),
      * its span tokens do not occur contiguously in the input tokens.

    Only the first occurrence of a span is tagged. Later entities overwrite
    tags written by earlier ones if their spans overlap.

    Args:
        input: The source sentence.
        target: The entity string (reference or model prediction).

    Returns:
        A list with one tag per input token, each one of ``'O'``,
        ``'B-NORMF'``, ``'I-NORMF'``, ``'B-KEYF'`` or ``'I-KEYF'``.
    """
    known_tags = {'O', 'B-NORMF', 'I-NORMF', 'B-KEYF', 'I-KEYF'}

    tokens = input.split(" ")
    labels = ['O'] * len(tokens)

    for entity in target.split("; "):
        parts = entity.split(": ")
        # No "<type>: <span>" structure -> nothing to tag.
        # NOTE(review): for an entity with several ": " separators such as
        # "keyf: eyes: mild up-slant" only the second segment ("eyes") is used
        # as the span - confirm this is the intended handling.
        if len(parts) < 2:
            continue

        entity_type = parts[0].upper()
        begin_tag = f"B-{entity_type}"
        inside_tag = f"I-{entity_type}"
        if begin_tag not in known_tags:
            continue

        matches = find_sub_list(parts[1].split(" "), tokens)
        if not matches:
            continue

        # Tag the first occurrence only: B- on the first token, I- on the rest.
        start, end = matches[0]
        labels[start] = begin_tag
        for i in range(start + 1, end + 1):
            labels[i] = inside_tag

    return labels

In [None]:
from tqdm import tqdm
test_dataset = csv_to_list("/content/drive/MyDrive/biocreative/Preprocessed/Final_Validation.tsv", output_file="/content/Validation.tsv")

# test_dataset = <put the test split here>
test_dataset = Seq2SeqDataset(tokenizer=tokenizer, dataset=test_dataset)
# Do not shuffle at evaluation time: other cells zip `outputs`/`targets` with
# the ObservationID/Text columns read from the TSV in file order, so the
# predictions must come out in that same order.
test_loader = DataLoader(test_dataset, batch_size=32,
                         num_workers=2, shuffle=False)
model.model.eval()
model = model.to("cuda")

# Accumulators filled batch by batch.
outputs = []      # decoded model generations
targets = []      # decoded reference strings
all_text = []     # decoded source sentences
true_labels = []  # BIO tags derived from the references
pred_labels = []  # BIO tags derived from the generations
for batch in tqdm(test_loader):
    input_ids = batch['source_ids'].to("cuda")
    attention_mask = batch['source_mask'].to("cuda")
    # NOTE(review): no max_new_tokens is passed here, while the test-set cell
    # passes max_new_tokens=128 - confirm long predictions are not cut short.
    outs = model.model.generate(input_ids=input_ids,
                                attention_mask=attention_mask)
    dec = [tokenizer.decode(ids, skip_special_tokens=True,
                            clean_up_tokenization_spaces=False).strip() for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    texts = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    # A literal 'none' string means "no entities": tag every token as 'O'.
    # Anything else is aligned against the source tokens by generate_label.
    true_label = [generate_label(texts[i].strip(), target[i].strip()) if target[i].strip() != 'none' else [
        "O"]*len(texts[i].strip().split()) for i in range(len(texts))]
    pred_label = [generate_label(texts[i].strip(), dec[i].strip()) if dec[i].strip() != 'none' else [
        "O"]*len(texts[i].strip().split()) for i in range(len(texts))]

    outputs.extend(dec)
    targets.extend(target)
    true_labels.extend(true_label)
    pred_labels.extend(pred_label)
    all_text.extend(texts)

100%|██████████| 15/15 [00:15<00:00,  1.01s/it]


In [None]:
# Spot-check: display the decoded source text of the second collected example.
all_text[1]

'eyes: mild downslant to palpebral fissures'

In [None]:
# seqeval scores BIO-tagged token sequences, so it is fed the per-token tags in
# `pred_labels` / `true_labels` rather than the raw generated strings.
# Author's caveat: because of that BIO requirement a plain classification
# report was suggested as the alternative.
# NOTE(review): `load_metric` triggered a warning when this cell ran; newer
# `datasets` releases point to the `evaluate` package instead - confirm.
from datasets import load_metric

metric = load_metric("seqeval")

# Show the first ten examples with predicted and reference tags for inspection.
for idx in range(10):
    print(
        f"Text:  {all_text[idx]}",
        f"Predicted Token Class:  {pred_labels[idx]}",
        f"True Token Class:  {true_labels[idx]}",
        "=====================================================================\n",
        sep="\n",
    )

print(metric.compute(predictions=pred_labels, references=true_labels))

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Text:  mouth: tongue inside mouth
Predicted Token Class:  ['O', 'O', 'O', 'O']
True Token Class:  ['O', 'O', 'O', 'O']

Text:  eyes: mild downslant to palpebral fissures
Predicted Token Class:  ['O', 'O', 'B-KEYF', 'I-KEYF', 'I-KEYF', 'I-KEYF']
True Token Class:  ['O', 'O', 'B-KEYF', 'I-KEYF', 'I-KEYF', 'I-KEYF']

Text:  head: mild posterior plagiocephally.
Predicted Token Class:  ['O', 'O', 'O', 'O']
True Token Class:  ['O', 'O', 'O', 'O']

Text:  head: relative macrocephaly with high anterior hairline
Predicted Token Class:  ['O', 'B-KEYF', 'I-KEYF', 'O', 'B-KEYF', 'I-KEYF', 'I-KEYF']
True Token Class:  ['O', 'B-KEYF', 'I-KEYF', 'O', 'B-KEYF', 'I-KEYF', 'I-KEYF']

Text:  ears: posteriorly rotated, thickened helices
Predicted Token Class:  ['O', 'O', 'O', 'B-KEYF', 'I-KEYF']
True Token Class:  ['O', 'O', 'O', 'B-KEYF', 'I-KEYF']

Text:  extremities: right arm in blue cast, left arm in no-no. grossly normal limb lengths and porportions. no obvious joint laxity.
Predicted Token Class:  

In [None]:
####### Test set Block #########
import textwrap

import torch

Tst_dataset = csv_to_list("/content/drive/MyDrive/biocreative/Preprocessed/Test.tsv")
Tst_dataset = Seq2SeqDataset(tokenizer=tokenizer, dataset=Tst_dataset)
# The loader is not shuffled, so predictions come out in dataset order.
dataloader = DataLoader(Tst_dataset, batch_size=32, num_workers=2)
model.model.eval()
# Generate on the GPU when one is available (CPU generation over the whole
# test set is slow); fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
outputs = []  # decoded model generations
texts = []    # decoded source sentences
targets = []  # decoded reference strings
for batch in dataloader:
    # Inputs must live on the same device as the model.
    outs = model.model.generate(input_ids=batch['source_ids'].to(device),
                                attention_mask=batch['source_mask'].to(device), max_new_tokens=128)
    dec = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip() for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    text = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    texts.extend(text)
    outputs.extend(dec)
    targets.extend(target)

# Preview up to ten examples (fewer if the set is smaller).
for i in range(min(10, len(texts))):
    c = texts[i]
    # textwrap.wrap collapses the embedded newlines, so this prints "text: <sentence>".
    lines = textwrap.wrap("text:\n%s\n" % c, width=100)
    print("\n".join(lines))
    print("\nActual Entities: %s" % targets[i])
    print("Predicted Entities: %s" % outputs[i])
    print("=====================================================================\n")

KeyboardInterrupt: ignored

In [None]:
import warnings

# Read the observation IDs and raw text of the official test file so they can
# be written next to the generated output and the decoded target.
# NOTE(review): the predictions were generated from Preprocessed/Test.tsv, a
# different file than the one read here - confirm both have the same rows in
# the same order.
Obs = []
Txt = []
with open("/content/drive/MyDrive/biocreative/dataset/BioCreativeVIII3_TestSetWithDecoy.tsv", 'r', newline='', encoding='utf-8') as tsv_file:
    reader = csv.reader(tsv_file, delimiter='\t')
    next(reader, None)  # skip the header row (an empty file simply yields no rows)
    for row in reader:
        Obs.append(row[0])
        Txt.append(row[1])

# zip() stops at the shortest input, so a length mismatch (for example after an
# interrupted generation loop) would silently drop rows; surface it.
column_lengths = {"ObservationID": len(Obs), "Text": len(Txt),
                  "Output": len(outputs), "Target": len(targets)}
if len(set(column_lengths.values())) > 1:
    warnings.warn(
        f"Column lengths differ {column_lengths}; rows beyond the shortest column are dropped."
    )
data = list(zip(Obs, Txt, outputs, targets))

# Specify the file path and name
file_path = "TestResults.tsv"

# Write the data to the .tsv file
with open(file_path, "w", newline="", encoding="utf-8") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    writer.writerow(["ObservationID","Text", "Output", "Target"])  # Write the header row
    writer.writerows(data)  # Write the data rows