#### T5  
https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html (https://youtu.be/r6XY80Z9eSA?t=348)  
https://huggingface.co/transformers/model_doc/t5.html  
https://arxiv.org/abs/1910.10683  


https://www.youtube.com/watch?v=_l2wJb3QPdk  
https://www.youtube.com/watch?v=r6XY80Z9eSA  

## Tutorial, Part 1
https://www.youtube.com/watch?v=_l2wJb3QPdk

In Google Colab, change the runtime type to GPU.

In [None]:
!nvidia-smi

In [None]:
!pip install --quiet transformers==4.1.1
# !pip install --quiet pytorch-lightning==1.1.1
!pip install --quiet torchtext==0.8.0 torch==1.7.1 pytorch-lightning==1.2.2
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94
!pip install --quiet pandas  
!pip install --quiet sklearn
!pip install --quiet keras
!pip install --quiet tensorflow
!pip install --quiet termcolor

In [None]:
import json
import pandas as pd
import numpy as py
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)


# model files are downloaded from https://huggingface.co/valhalla/t5-base-qa-qg-hl/tree/main
# if Internet access is available just use
# MODEL_FILES = "t5-base"
# instead of path to model files

#from sys import platform
#if "linux" in platform.lower():
#    MODEL_FILES = "/home/myuser/TransformerModels/t5-base-qa-qg-hl"
#    CHECKPOINT_PATH="/home/myuser/TransformerModels/_CheckPoints"
#else:
#    MODEL_FILES = "C:/TransformerModels/t5-base-qa-qg-hl"
#    CHECKPOINT_PATH="C:/TransformerModels/_CheckPoints"

MODEL_FILES = "t5-base"
CHECKPOINT_PATH="./CheckPoints"


N_GPUS = 1 # Change here if you have GPUs
N_WORKERS = 4 # 4 in the tutorial. 0 if running on windows without GPU...


In [None]:
pl.seed_everything(42)

In [None]:
#model = AutoModelWithLMHead.from_pretrained("deep-learning-analytics/triviaqa-t5-base")
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#model = model.to(device)

In [None]:
# https://github.com/dmis-lab/biobert#datasets
!gdown --id 19ft5q44W4SuptJgTwR84xZjsHg1jvjSZ

In [None]:
!unzip -q QA.zip

In [None]:
with Path("BioASQ/BioASQ-train-factoid-4b.json").open() as json_file:
    data = json.load(json_file)

In [None]:
questions = data["data"][0]["paragraphs"]
questions[0]

In [None]:
def extract_questions_and_answers(factoid_path: Path):
    """Flatten a BioASQ factoid JSON file into one row per answer.

    The file is expected to follow the SQuAD-like layout read below:
    ``data["data"][0]["paragraphs"]`` is a list of paragraphs, each with a
    ``"context"`` string and a ``"qas"`` list; every entry of ``"qas"`` has a
    ``"question"`` and a list of ``"answers"`` carrying ``"text"`` and
    ``"answer_start"``.

    Args:
        factoid_path: Path to the JSON file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``question``, ``context``,
        ``answer_text``, ``answer_start`` and ``answer_end`` (where
        ``answer_end = answer_start + len(answer_text)``). Every answer of
        every question of every paragraph yields exactly one row.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with factoid_path.open(encoding="utf-8") as json_file:
        data = json.load(json_file)

    paragraphs = data["data"][0]["paragraphs"]
    data_rows = []
    for paragraph in paragraphs:
        context = paragraph["context"]
        for question_and_answers in paragraph["qas"]:
            question = question_and_answers["question"]
            # The answer loop must live inside the question loop; otherwise
            # only the last question of each paragraph would be emitted.
            for answer in question_and_answers["answers"]:
                answer_text = answer["text"]
                answer_start = answer["answer_start"]
                answer_end = answer_start + len(answer_text)

                data_rows.append({
                    "question": question,
                    "context": context,
                    "answer_text": answer_text,
                    "answer_start": answer_start,
                    "answer_end": answer_end,
                })

    return pd.DataFrame(data_rows)

In [None]:
# Preview the first rows; `.head` without parentheses would only display the bound method.
extract_questions_and_answers(Path("BioASQ/BioASQ-train-factoid-4b.json")).head()

In [None]:
# Every BioASQ training file found in the extracted archive, sorted by path.
bioasq_dir = Path("BioASQ/")
factoid_paths = sorted(bioasq_dir.glob("BioASQ-train-*"))
factoid_paths

In [None]:
# Parse each training file into its own frame, then stack all frames into one.
dfs = [extract_questions_and_answers(factoid_path) for factoid_path in factoid_paths]

df = pd.concat(dfs)


In [None]:
print(len(df.question.unique()))
print(len(df.answer_text.unique()))
print(len(df.context.unique()))

In [None]:
df.head()

## Removing duplicates (the following cells are done and explained in the second video)

In [None]:
print(df.shape)
print(len(df.question.unique()))

In [None]:
df = df.drop_duplicates(subset=["context"]).reset_index(drop=True)

In [None]:
print(df.shape)
print(len(df.question.unique()))

In [None]:
sample_question = df.iloc[240]
sample_question

In [None]:
def color_answer(question):
    """Return the context of *question* with its answer span highlighted.

    The context is split at ``answer_start`` and ``answer_end``; the text
    before and after the answer is colored white and the answer itself green.
    """
    start = question["answer_start"]
    end = question["answer_end"]
    context = question["context"]

    segments = (
        (context[:start], "white"),
        (context[start:end], "green"),
        (context[end:], "white"),
    )
    return "".join(colored(text, color) for text, color in segments)

In [None]:
print(sample_question["question"])
print()
for wrap in textwrap.wrap(color_answer(sample_question), width = 130):
    print(wrap)

### Tokenization

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_FILES)

In [None]:
sample_encoding = tokenizer(
    "Would I rather be feared or loved?",
    "Easy. Both, I want both."
    )

In [None]:
sample_encoding.keys()

In [None]:
print(sample_encoding["input_ids"])

In [None]:
print(sample_encoding["attention_mask"])

In [None]:
preds = [
    tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for input_id in sample_encoding["input_ids"]
]

In [None]:
" ".join(preds)

In [None]:
encoding = tokenizer(
    sample_question["question"],
    sample_question["context"],
    max_length=396,
    padding="max_length",
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
    )
# truncation="only_second" because we do not want to truncate the question

In [None]:
encoding.keys()

In [None]:
tokenizer.special_tokens_map

In [None]:
tokenizer.eos_token, tokenizer.eos_token_id

In [None]:
tokenizer.decode(encoding["input_ids"].squeeze())

In [None]:
answer_encoding = tokenizer(
    sample_question["answer_text"],
    max_length=32,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
    )

In [None]:
tokenizer.decode(answer_encoding["input_ids"].squeeze())

In [None]:
labels = answer_encoding["input_ids"]
labels

In [None]:
# We need to convert the labels that are ignored or masked to -100
# 0 is presumably the tokenizer's pad token id (tokenizer.pad_token_id) — TODO confirm.
# NOTE: `labels` is the same tensor object as answer_encoding["input_ids"],
# so this assignment also modifies answer_encoding in place.
labels[labels == 0] = -100
labels

In [None]:
class BioQADataset(Dataset):
    """Torch dataset that tokenizes question/context/answer rows on access.

    Args:
        data: Frame with the columns ``question``, ``context`` and
            ``answer_text``.
        tokenizer: Tokenizer used for both the source and the target text.
        source_max_token_len: Length the (question, context) pair is padded
            or truncated to.
        target_max_token_len: Length the answer is padded or truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int = 396,
        target_max_token_len: int = 32
    ):

        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize the row at position *index*.

        Returns:
            A dict with the raw ``question``, ``context`` and ``answer_text``
            strings plus the flattened ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        """
        data_row = self.data.iloc[index]

        # Use the tokenizer given to this instance, not a notebook global.
        source_encoding = self.tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation="only_second",  # never truncate the question
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Answers are padded to the *target* length, not the source length.
        target_encoding = self.tokenizer(
            data_row["answer_text"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Padding positions are replaced by -100 so the loss ignores them.
        labels = target_encoding["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            question=data_row["question"],
            context=data_row["context"],
            answer_text=data_row["answer_text"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten()
        )

In [None]:
sample_dataset = BioQADataset(df, tokenizer)

In [None]:
for data in sample_dataset:
    print(data["question"])
    print(data["answer_text"])
    print(data["input_ids"][:20])
    print(data["labels"][:20])
    break

In [None]:
train_df, val_df = train_test_split(df, test_size=0.05)
train_df.shape, val_df.shape

In [None]:
class BioQADataModule(pl.LightningDataModule):
    """Lightning data module that serves the BioQA train and test frames.

    Args:
        train_df: Rows used for training.
        test_df: Rows used for both validation and testing.
        tokenizer: Tokenizer forwarded to every ``BioQADataset``.
        batch_size: Batch size of the training loader (the validation and
            test loaders always use a batch size of 1).
        source_max_token_len: Forwarded to ``BioQADataset``.
        target_max_token_len: Forwarded to ``BioQADataset``.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 396,
        target_max_token_len: int = 32
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        """Build the train and test datasets.

        Args:
            stage: Stage name supplied by the Lightning trainer when it
                invokes this hook. It is accepted for compatibility with the
                hook signature and otherwise unused: both datasets are
                always created.
        """
        self.train_dataset = BioQADataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

        self.test_dataset = BioQADataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=N_WORKERS
        )

    def val_dataloader(self):
        """Return the validation loader (reads the test dataset, one row per batch)."""
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=N_WORKERS
        )

    def test_dataloader(self):
        """Return the test loader (one row per batch)."""
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=N_WORKERS
        )

In [None]:
BATCH_SIZE = 8
N_EPOCHS = 6

data_module = BioQADataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

# Second video  
https://www.youtube.com/watch?t=348&v=r6XY80Z9eSA

In [None]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_FILES, return_dict=True)

# Translation  

In [None]:
input_ids = tokenizer(
    "translate English to German: I talk a lot, so I've learned to tune myself out",
    return_tensors="pt"
).input_ids

generated_ids = model.generate(input_ids=input_ids)
generated_ids

In [None]:
preds = [
    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for gen_id in generated_ids
]

preds

In [None]:
" ".join(preds)

### Back to English with Google Translate  
https://translate.google.com/?sl=auto&tl=en&text=Ich%20rede%20viel%2C%20also%20habe%20ich%20gelernt%2C%20mich%20auszuschalten&op=translate


# Summarization


How to generate text: using different decoding methods for language generation with Transformers  
https://huggingface.co/blog/how-to-generate

In [None]:
text = """
summarize: The FDA, an agency within the U.S. Department of Health and Human Services, protects the public health by assuring the safety, effectiveness, and security of human and veterinary drugs, vaccines and other biological products for human use, and medical devices.
The agency also is responsible for the safety and security of our nation’s food supply, cosmetics, dietary supplements, products that give off electronic radiation, and for regulating tobacco products.
The agency has updated its FDA COVID-19 Response At-A-Glance Summary, which provides a quick look at facts, figures, and highlights on the FDA's response efforts.
"""

In [None]:
input_ids = tokenizer(
    text,
    return_tensors="pt"
).input_ids

generated_ids = model.generate(input_ids=input_ids)

preds = [
    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for gen_id in generated_ids
]

" ".join(preds)

# Question answering

In [None]:
output = model(
 input_ids = encoding["input_ids"],
 attention_mask=encoding["attention_mask"],
 labels=labels
)

#### encoding was defined previously:
<pre>
encoding = tokenizer(
    sample_question["question"],
    sample_question["context"],
    max_length=396,
    padding="max_length",
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
    )
# truncation="only_second" because we do not want to truncate the question
</pre>

In [None]:
model.config

In [None]:
output.logits.shape # (batch_size, target_sequence_length, vocab_size); see model.config
# 1 is the batch size (a single example); 32 is the length of the labels (the answer was padded to max_length=32);
# 32102 (in the original run) is the vocabulary size, model.config.vocab_size.
# So for each of the 32 target positions we get one score per vocabulary entry.
# NOTE(review): 32 does not come from relative_attention_num_buckets — confirm by changing the answer max_length.

In [None]:
output.loss

### Modeling

In [None]:
class BioQAModel(pl.LightningModule):
    """Lightning module wrapping a pretrained T5 conditional-generation model."""

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_FILES, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``labels`` is optional because it is not supplied when testing.
        """
        result = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return result.loss, result.logits

    def _compute_and_log_loss(self, batch, metric_name):
        """Forward *batch*, log the loss under *metric_name* and return it."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the loss of one training batch as ``train_loss``."""
        return self._compute_and_log_loss(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Compute and log the loss of one validation batch as ``val_loss``."""
        return self._compute_and_log_loss(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Compute and log the loss of one test batch as ``test_loss``."""
        return self._compute_and_log_loss(batch, "test_loss")

    def configure_optimizers(self):
        """Optimize every parameter with AdamW at a fixed learning rate."""
        return AdamW(self.parameters(), lr=0.0001)

In [None]:
model = BioQAModel()

# Model Training using our dataset

In [None]:
# The arguments below (dirpath, filename, save_top_k) belong to the
# PyTorch Lightning callback, not to the Keras class of the same name.
from pytorch_lightning.callbacks import ModelCheckpoint

# Checkpoint callback to save the best model found during training.
# It writes to CHECKPOINT_PATH so that the prediction section, which loads
# CHECKPOINT_PATH + "/best-checkpoint.ckpt", finds the file.
checkpoint_callback = ModelCheckpoint(
    dirpath=CHECKPOINT_PATH,
    filename="best-checkpoint",
    save_top_k=1,  # just keep the best one
    verbose=True,
    monitor="val_loss",
    mode="min",  # save the one with minimum validation loss
)

In [None]:
# Callback instances are registered through `callbacks`; passing an instance
# via `checkpoint_callback=` is deprecated in the pinned Lightning release.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    gpus=N_GPUS,
    progress_bar_refresh_rate=30,
)

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./lightning_logs

In [None]:
trainer.fit(model, data_module)

# Predictions

In [None]:
trained_model = BioQAModel.load_from_checkpoint(CHECKPOINT_PATH + "/best-checkpoint.ckpt")
trained_model.freeze()

In [None]:
def generate_answer(question):
    """Generate an answer string for *question* with the trained model.

    *question* must provide the keys ``"question"`` and ``"context"``. The
    pair is tokenized with the notebook-level ``tokenizer``, passed to
    ``trained_model.model.generate`` and the generated sequences are decoded
    and joined with single spaces.
    """
    encoded = tokenizer(
        question["question"],
        question["context"],
        max_length=396,
        padding="max_length",
        truncation="only_second",  # do not truncate question
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    generation_options = dict(
        num_beams=1,
        max_length=80,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True,
    )
    output_ids = trained_model.model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_options,
    )

    decoded = []
    for sequence in output_ids:
        decoded.append(
            tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        )

    return " ".join(decoded)

In [None]:
sample_question = val_df.iloc[0]
print(sample_question["question"])
print(sample_question["answer_text"])

In [None]:
generate_answer(sample_question)