In [1]:
import time
import logging
import os
import glob
import json
import re
import random
import textwrap
from termcolor import colored
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib as mlt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import tensorflow as tf

import torch
from torch.utils.data import Dataset,DataLoader
import pytorch_lightning as pl

from transformers import T5Tokenizer,T5ForConditionalGeneration,AdamW,get_linear_schedule_with_warmup

In [2]:
# Fix the global seed up front (Lightning echoes "Global seed set to 42")
pl.seed_everything(seed=42)

Global seed set to 42


42

In [3]:
def extract_data(path):
    """Flatten a SQuAD-style BioASQ JSON file into one row per answer.

    Parameters
    ----------
    path : pathlib.Path
        JSON file whose top-level ``'data'`` list holds entries with a
        ``'paragraphs'`` list. Each paragraph carries a ``'context'`` string
        and a ``'qas'`` list of ``{'question', 'answers'}`` records, and each
        answer has ``'text'`` and ``'answer_start'``.

    Returns
    -------
    pandas.DataFrame
        One row per answer with columns ``question``, ``context``,
        ``answer_text``, ``answer_start`` and ``answer_end``
        (``answer_start + len(answer_text)``). The columns are present even
        when the file contains no answers.
    """
    # Explicit UTF-8: the questions contain non-ASCII characters, and the
    # platform default encoding is not UTF-8 everywhere.
    with path.open(encoding='utf-8') as f:
        data=json.load(f)

    columns=['question','context','answer_text','answer_start','answer_end']
    rows=list()

    # Walk every entry under 'data' (not only the first one) so that no
    # paragraphs are silently dropped when a file holds several entries.
    for entry in data['data']:
        for paragraph in entry['paragraphs']:
            context=paragraph['context']
            for qa in paragraph['qas']:
                question_=qa['question']
                for answer in qa['answers']:
                    answer_text=answer['text']
                    answer_start=answer['answer_start']

                    rows.append({
                        'question':question_,
                        'context':context,
                        'answer_text':answer_text,
                        'answer_start':answer_start,
                        # End offset is derived from the start and answer length
                        'answer_end':answer_start+len(answer_text)
                    })
    # Passing columns keeps the schema stable for an empty result
    return pd.DataFrame(rows,columns=columns)

In [4]:
# Preview the first rows extracted from a single training file
sample_file = Path('data/BioASQ/BioASQ-train-factoid-4b.json')
extract_data(sample_file).head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [5]:
# Collect every BioASQ training file in a stable (sorted) order
paths=sorted(Path('data/BioASQ').glob("BioASQ-train-*"))
if not paths:
    # Fail with an actionable message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("No files matching 'BioASQ-train-*' found under data/BioASQ")

# One DataFrame per file, then a single concatenation
ds=[extract_data(path) for path in paths]
# ignore_index=True renumbers the rows 0..n-1; without it every file restarts
# at 0 and the combined frame carries duplicate index labels.
df=pd.concat(ds,ignore_index=True)
df.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [6]:
# Load the pretrained 't5-base' tokenizer, caching its files under cache/t5-base
tokenizer = T5Tokenizer.from_pretrained(
    't5-base',
    cache_dir='cache/t5-base',
)

In [7]:
class BioQADataset(Dataset):
    """Dataset that tokenizes (question, context) -> answer examples.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``question``, ``context`` and
        ``answer_text``; rows are accessed positionally via ``.iloc``.
    tokenizer : callable
        Tokenizer invoked as ``tokenizer(text[, text_pair], ...)`` that returns
        ``'input_ids'`` and ``'attention_mask'`` tensors and exposes
        ``pad_token_id``.
    encoder_maxlen : int
        Padded/truncated length of the encoded question + context.
    decoder_maxlen : int
        Padded/truncated length of the encoded answer.
    """

    def __init__(self,data,tokenizer,encoder_maxlen=396,decoder_maxlen=32):
        self.data=data
        self.tokenizer=tokenizer
        self.encoder_maxlen=encoder_maxlen
        self.decoder_maxlen=decoder_maxlen

    def __len__(self):
        """Return the number of examples (rows of ``data``)."""
        return len(self.data)

    def __getitem__(self,index):
        """Return the raw strings and 1-D model tensors for row ``index``.

        The result holds ``question``, ``context``, ``answer_text`` (strings)
        plus ``input_ids``, ``attention_mask`` and ``labels`` (flattened
        tensors).
        """
        instance=self.data.iloc[index]

        # Use the tokenizer passed to this dataset rather than a module-level
        # global, so the dataset works with whatever tokenizer it was given.
        source_encoding=self.tokenizer(
            instance['question'],
            instance['context'],
            max_length=self.encoder_maxlen,
            padding='max_length',
            # Only the context (second sequence) may be truncated, never the question
            truncation='only_second',
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt',
        )
        target_encoding=self.tokenizer(
            instance['answer_text'],
            max_length=self.decoder_maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt',
        )

        # Work on a copy so the encoding's own tensor is not modified, and
        # replace padding positions with -100 (the label value that the
        # Transformers loss ignores). The pad id comes from the tokenizer
        # instead of assuming it is 0.
        labels=target_encoding['input_ids'].clone()
        labels[labels==self.tokenizer.pad_token_id]=-100

        return dict(
            question=instance['question'],
            context=instance['context'],
            answer_text=instance['answer_text'],
            # return_tensors='pt' adds a leading batch axis of size 1; flatten
            # all three tensors consistently so a DataLoader stacks them to
            # (batch, seq_len) rather than (batch, 1, seq_len).
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
        )

In [8]:
# Hold out 5% of the rows for validation. random_state pins the split so it
# does not depend on the global RNG state or on the order cells were run in.
train_df,val_df=train_test_split(df,test_size=0.05,random_state=42)

In [9]:
class BioQADataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train / held-out BioQA datasets.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames handed to ``BioQADataset`` for training and for
        validation/testing respectively.
    tokenizer
        Tokenizer forwarded to ``BioQADataset``.
    encoder_maxlen, decoder_maxlen : int
        Sequence lengths forwarded to ``BioQADataset``.
    batch_size : int
        Batch size of the training dataloader.
    num_workers : int
        Worker processes used by every dataloader.
    """

    def __init__(self,train_df,test_df,tokenizer,encoder_maxlen=396,decoder_maxlen=32,batch_size=8,num_workers=4):
        super().__init__()
        self.batch_size=batch_size
        self.train_df=train_df
        self.test_df=test_df
        self.tokenizer=tokenizer
        self.encoder_maxlen=encoder_maxlen
        self.decoder_maxlen=decoder_maxlen
        self.num_workers=num_workers

    def setup(self,stage=None):
        """Build the datasets.

        ``stage`` is accepted (and ignored) because the Lightning trainer
        passes it when invoking this hook; calling ``setup()`` without
        arguments keeps working.
        """
        self.train_ds=BioQADataset(self.train_df,self.tokenizer,self.encoder_maxlen,self.decoder_maxlen)
        self.test_ds=BioQADataset(self.test_df,self.tokenizer,self.encoder_maxlen,self.decoder_maxlen)

    def train_dataloader(self):
        """Shuffled training loader honouring the configured ``batch_size``."""
        # Read the dataset from the instance (created in setup), not from a global
        return DataLoader(self.train_ds,batch_size=self.batch_size,shuffle=True,num_workers=self.num_workers)

    def val_dataloader(self):
        """Validation loader over the held-out set, one example per batch."""
        # No shuffling for evaluation: the order does not affect metrics and a
        # fixed order keeps runs comparable.
        return DataLoader(self.test_ds,batch_size=1,shuffle=False,num_workers=self.num_workers)

    def test_dataloader(self):
        """Test loader; uses the same held-out set as validation."""
        return DataLoader(self.test_ds,batch_size=1,shuffle=False,num_workers=self.num_workers)

In [10]:
# Training configuration
BATCH_SIZE = 2
EPOCHS = 6

# Wire the split frames into the data module; the validation frame is passed
# as the module's `test_df` argument.
data_module = BioQADataModule(
    train_df=train_df,
    test_df=val_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [11]:
# Load the pretrained 't5-base' model, caching its files under model_cache/t5-base
model = T5ForConditionalGeneration.from_pretrained(
    't5-base',
    cache_dir='model_cache/t5-base',
    return_dict=True,
)

# Test: sanity-check the pretrained T5 model (translation and summarization)

In [14]:
# Translation smoke test: encode the prompt, generate, and decode the result
input_ids = tokenizer("translate English to German: I am a boy.", return_tensors='pt').input_ids
generated_ids = model.generate(input_ids)
preds = [
    tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for token_ids in generated_ids  # one decoded string per generated sequence
]
translated_text = "".join(preds)
translated_text

'Ich bin ein Junge.'

In [15]:
# Summarization smoke test on a short passage about sunflowers
text = """
Sunflowers are thought to have been domesticated 3000–5000 years ago by Native Americans who would use them primarily as a source for edible seeds. They were then introduced to Europe in the early 16th century and made their way to Russia. In Russia, where oilseed cultivators were located, these flowers were developed and grown on an industrial scale. Russia then reintroduced this oilseed cultivation process to North America in the mid-20th century; North America began their commercial era of sunflower production and breeding.[12] New breeds of the Helianthus spp. began to become more prominent in new geographical areas.

This species' geographical history accounts for its evolutionary history, with its levels of genetic variation across its gene pool increasing as new hybrids are created both for commercial use and in the wild. Subsequent to this, sunflower species are also experiencing the bottle neck effect in their gene pool as a result of selective breeding for industrial use.[12]
"""
# Prepend the 'summarize: ' prefix before tokenizing
text = 'summarize: ' + text
input_ids = tokenizer(text, return_tensors='pt').input_ids
# NOTE(review): the recorded summary stops mid-sentence ("... they were"),
# presumably because generate() is called with its default length limit;
# consider passing an explicit max_length — TODO confirm.
generated_ids = model.generate(input_ids)
preds = [
    tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for token_ids in generated_ids  # one decoded string per generated sequence
]
summarized_text = "".join(preds)
summarized_text

'sunflowers were domesticated 3000–5000 years ago by Native Americans. they were'