In [79]:
!nvidia-smi

Mon Nov 22 15:51:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:84:00.0 Off |                    0 |
| N/A   27C    P0    25W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
#!pip install --quiet transformers==4.1.1

In [3]:
#!pip install --quiet https://github.com/PyTorchLightning/pytorch-lightning/releases/download/1.2.6/pytorch-lightning-1.2.6.tar.gz

In [4]:
#!pip install --quiet tokenizers==0.9.4

In [5]:
#!pip install --quiet sentencepiece==0.1.94

In [6]:
#!pip install gdown

In [77]:
import gdown as gdown
import numpy as np
import pandas as pd
import math
import glob
import argparse
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import tensorflow as tf
from termcolor import colored
import textwrap

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [8]:
import pytorch_lightning as pl

In [78]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Number of GPU's available: ", len(physical_devices))

if(len(physical_devices)>0):
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

Number of GPU's available:  0


In [9]:
pl.seed_everything(0)

Global seed set to 0


0

In [10]:
#! unzip - bio-QA.zip

In [11]:
with Path("BioASQ/BioASQ-train-factoid-4b.json").open() as json_file:
    data = json.load(json_file)

In [12]:
data.keys()

dict_keys(['data', 'version'])

In [13]:
data['version']

'BioASQ6b'

In [14]:
len(data['data'])

1

In [15]:
data['data'][0].keys()

dict_keys(['paragraphs', 'title'])

In [16]:
data['data'][0]['title']

'BioASQ6b'

In [17]:
len(data['data'][0]['paragraphs'])

3266

In [18]:
questions = data['data'][0]['paragraphs']

In [19]:
questions[0]

{'qas': [{'id': '52bf208003868f1b06000019_002',
   'question': 'What is the inheritance pattern of Li–Fraumeni syndrome?',
   'answers': [{'text': 'autosomal dominant', 'answer_start': 213}]}],
 'context': 'Balanced t(11;15)(q23;q15) in a TP53+/+ breast cancer patient from a Li-Fraumeni syndrome family. Li-Fraumeni Syndrome (LFS) is characterized by early-onset carcinogenesis involving multiple tumor types and shows autosomal dominant inheritance. Approximately 70% of LFS cases are due to germline mutations in the TP53 gene on chromosome 17p13.1. Mutations have also been found in the CHEK2 gene on chromosome 22q11, and others have been mapped to chromosome 11q23. While characterizing an LFS family with a documented defect in TP53, we found one family member who developed bilateral breast cancer at age 37 yet was homozygous for wild-type TP53. Her mother also developed early-onset primary bilateral breast cancer, and a sister had unilateral breast cancer and a soft tissue sarcoma. Cytog

In [20]:
 def extract_questions_and_answers(factoid_path = Path):
    with factoid_path.open() as json_file:
        data = json.load(json_file)
        
    questions = data['data'][0]['paragraphs']
     
    data_rows = []
    
    for question in questions:
        context = question['context']

        for question_and_answers in question['qas']:
            question = question_and_answers['question']
            answers = question_and_answers['answers']

            for answer in answers:
                answer_text = answer['text']
                answer_start = answer['answer_start']
                answer_end = answer['answer_start'] + len(answer_text)
                    
                data_rows.append({
                     "question" : question,
                     "context"  : context,
                     "answer_text" : answer_text,
                     "answer_start" : answer_start,
                     "answer_end" : answer_end
                     })
    return pd.DataFrame(data_rows)

In [21]:
extract_questions_and_answers(Path("BioASQ/BioASQ-train-factoid-4b.json")).head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [22]:
factoid_paths = sorted(list(Path("BioASQ/").glob("BioASQ-train-*")))
factoid_paths

[PosixPath('BioASQ/BioASQ-train-factoid-4b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-5b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-6b.json')]

In [23]:
dfs = []

for factoid_path in factoid_paths:
    dfs.append(extract_questions_and_answers(factoid_path))
    
df = pd.concat(dfs)

In [24]:
df.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [25]:
df.shape

(12988, 5)

In [26]:
df = df.drop_duplicates(subset = ["context"]).reset_index(drop=True)

In [27]:
df.shape

(2582, 5)

In [28]:
len(df.question.unique())

441

In [29]:
len(df.context.unique())

2582

In [30]:
sample_question = df.iloc[240]
sample_question

question        What is the characteristic feature of the Dyke...
context         Left hemisphere and male sex dominance of cere...
answer_text                                  cerebral hemiatrophy
answer_start                                                  130
answer_end                                                    150
Name: 240, dtype: object

In [31]:
def color_answer(question):
    answer_start, answer_end = question["answer_start"], question["answer_end"]
    context = question["context"]
    
    return colored(context[:answer_start], "yellow") + \
            colored(context[answer_start:answer_end + 1], "green") + \
            colored(context[answer_end + 1:], "blue")

In [32]:
print(sample_question["question"])
print()
print("Answer : ")

for wrap in textwrap.wrap(color_answer(sample_question), width = 120):
    print(wrap)

What is the characteristic feature of the Dyke-Davidoff-Masson syndrome.

Answer : 
[33mLeft hemisphere and male sex dominance of cerebral hemiatrophy (Dyke-Davidoff-Masson Syndrome). Although
radiological findings of [0m[32mcerebral hemiatrophy [0m[34m(Dyke-Davidoff-Masson Syndrome) are well known, there is
no systematic study about the gender and the affected side in this syndrome. Brain images in 26 patients (mean aged 11)
with cerebral hemiatrophy were retrospectively reviewed. Nineteen patients (73.5%) were male and seven patients (26.5%)
were female. Left hemisphere involvement was seen in 18 patients (69.2%) and right hemisphere involvement was seen in
eight patients (30.8%). We conclude that male gender and left side involvement are frequent in cerebral hemiatrophy
disease.[0m


In [33]:
MODEL_NAME = 't5-base'

In [34]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [35]:
sample_encoding = tokenizer("Would I rather be loved or feared?","Easy. Both. I want people be afraid of how much they love me.")

In [36]:
print(sample_encoding.keys())

dict_keys(['input_ids', 'attention_mask'])


In [37]:
print(sample_encoding["input_ids"])

[5328, 27, 1066, 36, 1858, 42, 3, 27625, 58, 1, 6844, 5, 2867, 5, 27, 241, 151, 36, 7403, 13, 149, 231, 79, 333, 140, 5, 1]


In [38]:
print(sample_encoding["attention_mask"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [39]:
preds = [
    tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for input_id in sample_encoding["input_ids"]
]

In [40]:
" ".join(preds)

'Would I rather be loved or  feared ? </s> Easy . Both . I want people be afraid of how much they love me . </s>'

In [41]:
encoding = tokenizer(sample_question["question"],sample_question["context"], max_length = 396, padding = "max_length", 
          truncation = "only_second", return_attention_mask=True, add_special_tokens=True, return_tensors="pt")

In [42]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [43]:
tokenizer.special_tokens_map

{'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': "['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_i

In [44]:
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 1)

In [45]:
tokenizer.decode(encoding["input_ids"].squeeze())

'What is the characteristic feature of the Dyke-Davidoff-Masson syndrome.</s> Left hemisphere and male sex dominance of cerebral hemiatrophy (Dyke-Davidoff-Masson Syndrome). Although radiological findings of cerebral hemiatrophy (Dyke-Davidoff-Masson Syndrome) are well known, there is no systematic study about the gender and the affected side in this syndrome. Brain images in 26 patients (mean aged 11) with cerebral hemiatrophy were retrospectively reviewed. Nineteen patients (73.5%) were male and seven patients (26.5%) were female. Left hemisphere involvement was seen in 18 patients (69.2%) and right hemisphere involvement was seen in eight patients (30.8%). We conclude that male gender and left side involvement are frequent in cerebral hemiatrophy disease.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [46]:
answer_encoding = tokenizer(
     sample_question['answer_text'],
     max_length=32,
     padding='max_length',
     truncation=True,
     return_attention_mask=True,
     add_special_tokens=True,
     return_tensors="pt"
     )

In [47]:
tokenizer.decode(answer_encoding['input_ids'].squeeze())

'cerebral hemiatrophy</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [48]:
labels = answer_encoding["input_ids"]

In [49]:
labels

tensor([[24387,     3,   107, 11658,    17, 29006,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])

In [50]:
labels[labels == 0] = -100

In [51]:
labels

tensor([[24387,     3,   107, 11658,    17, 29006,     1,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100]])

In [52]:
class BioAQDataset(Dataset):
    
    def __init__(self, data:pd.DataFrame, tokenizer:T5Tokenizer, source_max_token_len: int =369, target_max_token_len: int =32):
        
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index:int):
        
        data_row = self.data.iloc[index]
        
        source_encoding = tokenizer(data_row['question'], data_row['context'], max_length=self.source_max_token_len,
                                        padding='max_length', truncation="only_second", return_attention_mask=True,
                                        add_special_tokens=True, return_tensors="pt")
        
        target_encoding = tokenizer(data_row['answer_text'], max_length=self.target_max_token_len, padding='max_length',
                                        truncation=True, return_attention_mask=True, add_special_tokens=True,
                                        return_tensors="pt")
        
        labels = target_encoding["input_ids"]
        labels[labels == 0] = -100
        
        return dict(question = data_row["question"], context = data_row["context"], answer_text = data_row["answer_text"],
                       input_ids = source_encoding["input_ids"].flatten(), 
                       attention_mask = source_encoding["attention_mask"].flatten(), labels = labels.flatten())

In [53]:
sample_dataset = BioAQDataset(df,tokenizer)

In [54]:
for data in sample_dataset:
    print("question: ", data["question"])
    print("answers: ",data["answer_text"])
    print("input_ids: ", data["input_ids"][:10])
    print("labels: ", data["labels"][:10])
    break

question:  What is the inheritance pattern of Li–Fraumeni syndrome?
answers:  autosomal dominant
input_ids:  tensor([  363,    19,     8, 28915,  3275,    13,  1414,   104,   371,  6340])
labels:  tensor([ 1510, 10348,   138, 12613,     1,  -100,  -100,  -100,  -100,  -100])


In [55]:
train_df, val_df = train_test_split(df, test_size=0.05)

In [56]:
print(train_df.shape)
print(val_df.shape)

(2452, 5)
(130, 5)


In [57]:
class BioDataModule(pl.LightningDataModule):
    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame, tokenizer:T5Tokenizer, batch_size: int = 8,
                    source_max_token_len: int = 396, target_max_token_len: int = 32):
        
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
   
    def setup(self):
        
        self.train_dataset = BioAQDataset(self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        
        self.test_dataset = BioAQDataset(self.test_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        
    def train_dataloader(self):
        
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)
    
    def val_dataloader(self):
        
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)
    
    def test_dataloader(self):
        
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

In [58]:
BATCH_SIZE = 4
N_EPOCHS = 6

data_module = BioDataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [59]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict =True)

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
output = model(input_ids=encoding["input_ids"], attention_mask=encoding["attention_mask"], labels=labels)

In [61]:
print(output.logits.shape)

torch.Size([1, 32, 32128])


In [62]:
output.loss

tensor(5.0636, grad_fn=<NllLossBackward>)

In [63]:
class BioQAModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
   
    def forward(self, input_ids, attention_mask, labels=None):
        
        output = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits
   
    def training_step(self, batch, batch_idx):
        
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions":outputs, "labels": labels}
       
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=0.0001)
        return optimizer      

In [64]:
model = BioQAModel()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [65]:
from keras.callbacks import ModelCheckpoint

In [66]:
checkpoint_callback = ModelCheckpoint(dirpath="checkpoints", filename="best-checkpoint",filepath="./", save_top_k=1, verbose=True,
                                        monitor="val_loss", mode="min")

In [67]:
trainer = pl.Trainer(checkpoint_callback=checkpoint_callback, max_epochs=N_EPOCHS, progress_bar_refresh_rate = 30)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores


In [68]:
%load_ext tensorboard

In [74]:
%tensorboard --logdir ./lightning_logs

Reusing TensorBoard on port 6006 (pid 2466), started 3:21:13 ago. (Use '!kill 2466' to kill it.)

In [73]:
!rm -rf lightning_logs

In [71]:
trainer.fit(model, data_module)

Set SLURM handle signals.

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [72]:
trainer.test()

AttributeError: 'NoneType' object has no attribute 'best_model_path'

In [None]:
trained_model = BioQAModel.load_from_checkpoint("checkpoints/best-checkpoint.ckpt")

In [None]:
trained_model.freeze()

In [None]:
def generate_answer(question):
    source_encoding=tokenizer(
            question["question"],
            question['context'],
            max_length = 396,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
       )
    
    generated_ids = trained_model.model.generate(
            input_ids=source_encoding["input_ids"],
            attention_mask=source_encoding["attention_mask"],
            num_beams=1,  # greedy search
            max_length=80,
            repetition_penalty=2.5,
            early_stopping=True,
            use_cache=True)
       
    preds = [
               tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
               for generated_id in generated_ids
            ]
    
    return "".join(preds)
     
    sample_question = val_df.iloc[20]
    sample_question["question"]  
    sample_question["answer_text"]  # Label Answer
    generate_answer(sample_question)  # Predicted answer
    sample_question = val_df.iloc[66]
    sample_question["answer_text"]
    generate_answer(sample_question) 

    #mkdir zip
    
    !zip -r /content.zip /content
    
    from google.colab import files
        files.download("/content.zip") 