<a href="https://colab.research.google.com/github/ambideXtrous9/T5-FineTuned-Model-for-NewsQA/blob/main/NewsQA_FLAN_T5_model-METEOR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
!nvidia-smi

NotImplementedError: ignored

In [66]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet tokenizers

NotImplementedError: ignored

# **Import packages**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

import nltk
nltk.download('wordnet')
nltk.download('wordnet_ic')
nltk.download('punkt')
from nltk.translate.meteor_score import meteor_score as meteor
from nltk import word_tokenize

from transformers import (
    AdamW, AutoTokenizer as Tokenizer , AutoModelForSeq2SeqLM)

In [None]:
# Seed the run through Lightning's helper (seed value 42).
pl.seed_everything(42)

# **Dataset**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path = '/content/drive/MyDrive/MTP CODE/NewsQA_SPAN.feather'

In [None]:
# Load the dataset from the Feather file referenced by `path`.
df = pd.read_feather(path)
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Word counts (whitespace-separated tokens) for every answer and question.
# 'len_ans' is derived from the 'answer' text, mirroring 'len_ques', so the cell
# stays re-runnable: reading it back from 'len_ans' itself would call .split()
# on integers the second time the cell is executed.
df['len_ans'] = df['answer'].apply(lambda x: len(x.split()))
df['len_ques'] = df['question'].apply(lambda x: len(x.split()))

In [None]:
# Row with the longest answer. idxmax() returns an index *label*, so the row is
# fetched with .loc; .iloc is positional and only matches labels when the frame
# still has its default 0..n-1 index.
max_index = df['len_ans'].idxmax()
sample = df.loc[max_index]
print(sample['answer'])
print(sample['len_ans'])

In [None]:
# Row with the shortest answer. idxmin() returns an index *label*, so the row is
# fetched with .loc; .iloc is positional and only matches labels when the frame
# still has its default 0..n-1 index.
min_index = df['len_ans'].idxmin()
sample = df.loc[min_index]
print(sample['answer'])
print(sample['len_ans'])

In [8]:
# Histogram of answer lengths (the 'len_ans' column), 30 bins.
plt.hist(df['len_ans'], bins=30)
# The plotted quantity is the answer length, so label the axis accordingly.
plt.xlabel('Answer length (words)')
plt.ylabel('Count')
plt.title('Distribution of answer lengths')

# Show the plot
plt.show()

# **Tokenization**

In [9]:
MODEL_NAME = 'google/flan-t5-small'

In [10]:
tokenizer = Tokenizer.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [11]:
class NQADataset(Dataset):
  """Map-style dataset that tokenizes (question, paragraph) -> answer rows.

  Each item is a dict with:
    answer         -- the raw answer string from the row
    input_ids      -- 1-D tensor of the encoded question/paragraph pair
    attention_mask -- 1-D tensor matching ``input_ids``
    labels         -- 1-D tensor of the encoded answer, padding replaced by -100

  Args:
    data: frame with 'question', 'paragraph' and 'answer' columns.
    tokenizer: callable tokenizer used for both source and target text.
    source_max_token_len: length the source encoding is padded/truncated to.
    target_max_token_len: length the target encoding is padded/truncated to.
  """

  def __init__(self,data : pd.DataFrame,tokenizer : Tokenizer,source_max_token_len : int = 400,target_max_token_len : int = 32):

    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self,index : int):
    """Tokenize the row at positional ``index`` and return the model inputs."""
    data_row = self.data.iloc[index]

    # Use the tokenizer handed to this dataset, not a notebook-level global,
    # so the dataset works with whatever tokenizer it was constructed with.
    source_encoding = self.tokenizer(
        data_row['question'],
        data_row['paragraph'],
        max_length = self.source_max_token_len,
        padding = "max_length",
        # Only the paragraph may be truncated; the question is kept intact.
        truncation = "only_second",
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")

    target_encoding = self.tokenizer(
        data_row['answer'],
        max_length = self.target_max_token_len,
        padding = "max_length",
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")

    labels = target_encoding["input_ids"]
    # Replace padding positions with -100 (the value conventionally ignored by
    # the loss). The pad id is read from the tokenizer instead of assuming 0.
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        answer = data_row['answer'],
        input_ids = source_encoding['input_ids'].flatten(),
        attention_mask = source_encoding['attention_mask'].flatten(),
        labels = labels.flatten())

In [12]:
sample_dataset = NQADataset(df,tokenizer)

In [14]:
for data in sample_dataset:
  print(data['answer'])
  print(data['input_ids'][:10])
  print(data['labels'][:10])
  break


Vikram Dalal
tensor([2645,   19,    8, 5037, 2090,   13, 8951,   49,  397,   15])
tensor([ 1813,   157,  2375, 10729,   138,     1,  -100,  -100,  -100,  -100])


In [15]:
# Hold out 10% of the rows for validation. A fixed random_state makes the split
# identical on every run, independent of which cells were executed before it.
train_df, val_df = train_test_split(df,test_size=0.1,random_state=42)

In [16]:
train_df.shape, val_df.shape

((4500, 6), (500, 6))

In [17]:
class NQADataModule(pl.LightningDataModule):
  """Lightning data module serving ``NQADataset`` loaders for a train and a held-out frame.

  The held-out frame (``test_df``) backs both the validation and the test loader.
  """

  def __init__(self,train_df : pd.DataFrame,test_df : pd.DataFrame,tokenizer : Tokenizer,batch_size : int = 8,source_max_token_len : int = 400,target_max_token_len : int = 32):
    super().__init__()
    self.train_df, self.test_df = train_df, test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def _make_dataset(self,frame):
    # Wrap one frame with the tokenizer and length limits stored on this module.
    return NQADataset(frame,self.tokenizer,self.source_max_token_len,self.target_max_token_len)

  def _held_out_loader(self):
    # Unshuffled loader over the held-out dataset.
    return DataLoader(self.test_dataset,batch_size = self.batch_size,num_workers=4)

  def setup(self,stage=None):
    """Build the train and held-out datasets; ``stage`` is accepted but not used."""
    self.train_dataset = self._make_dataset(self.train_df)
    self.test_dataset = self._make_dataset(self.test_df)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(self.train_dataset,batch_size = self.batch_size,shuffle=True,num_workers=4)

  def val_dataloader(self):
    """Loader over the held-out dataset, used for validation."""
    return self._held_out_loader()

  def test_dataloader(self):
    """Loader over the same held-out dataset, used for testing."""
    return self._held_out_loader()

In [18]:
type(train_df)

pandas.core.frame.DataFrame

In [19]:
BATCH_SIZE = 4
N_EPOCHS = 2

data_module = NQADataModule(train_df,val_df,tokenizer,batch_size = BATCH_SIZE)
data_module.setup()

In [40]:
def compute_meteor_scores(predictions, answers):
    """Return the mean METEOR score over paired predictions and reference answers.

    Each prediction and its answer are tokenized with ``word_tokenize`` and
    scored with ``meteor`` using a single reference and ``gamma=0``.

    Args:
        predictions: iterable of predicted answer strings.
        answers: iterable of reference answer strings, paired by position
            (pairs are formed with ``zip``, so extra items on either side are ignored).

    Returns:
        The arithmetic mean of the per-pair scores, or ``0.0`` when there are
        no pairs to score.
    """
    scores = [
        meteor([word_tokenize(ans)], word_tokenize(pred), gamma=0)
        for pred, ans in zip(predictions, answers)
    ]
    # An empty batch would otherwise divide by zero.
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

In [41]:
class NQAModel(pl.LightningModule):
  """Lightning wrapper around the pretrained seq2seq model named by ``MODEL_NAME``.

  Training, validation and test steps log the loss the model returns for the
  batch labels. Validation additionally generates answers and logs their mean
  METEOR score against the reference answers.
  """

  # Upper bound on generated length during validation. Matches the default
  # target_max_token_len (32) used to tokenize reference answers and the
  # max_length used by generate_ans, so predictions are not cut shorter than
  # the references they are scored against.
  GENERATION_MAX_LENGTH = 32

  def __init__(self):
    super().__init__()
    self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME,return_dict=True)

  def forward(self,input_ids,attention_mask,labels=None):
    """Run the wrapped model and return ``(loss, logits)`` from its output."""
    output = self.model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        labels = labels)

    return output.loss, output.logits

  def _batch_loss(self,batch):
    # Shared by all three steps: forward pass on the batch tensors, keep the loss only.
    loss, _ = self(batch['input_ids'],batch['attention_mask'],batch['labels'])
    return loss

  def training_step(self,batch,batch_idx):
    """Compute and log the training loss for one batch."""
    loss = self._batch_loss(batch)
    self.log("train_loss",loss,prog_bar=True,logger=True)
    return loss

  def validation_step(self,batch,batch_idx):
    """Log validation loss and the batch-mean METEOR score of generated answers."""
    loss = self._batch_loss(batch)

    # Generate predictions from the model. An explicit max_length is passed
    # because generate() otherwise falls back to the generation config's
    # default length, which may be shorter than the reference answers.
    generated_ids = self.model.generate(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        max_length=self.GENERATION_MAX_LENGTH)

    # `tokenizer` is the notebook-level tokenizer instance.
    predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    score = compute_meteor_scores(predictions, batch['answer'])

    self.log_dict({"val_loss" : loss,"val_METEOR" : score},prog_bar=True,logger=True)

    return loss

  def test_step(self,batch,batch_idx):
    """Compute and log the test loss for one batch."""
    loss = self._batch_loss(batch)
    self.log("test_loss",loss,prog_bar=True,logger=True)
    return loss

  def configure_optimizers(self):
    """Optimise all parameters with AdamW at a learning rate of 1e-4."""
    # NOTE(review): this is the AdamW imported from `transformers`; newer
    # releases may no longer provide it -- confirm against the installed version.
    return AdamW(self.parameters(),lr = 0.0001)

In [42]:
model = NQAModel()

In [43]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'best_cp',
    save_top_k = 1,
    verbose = True,
    monitor = 'val_loss',
    mode = 'min'
)

In [44]:
trainer = pl.Trainer(devices=-1, accelerator="gpu",
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [45]:
trainer.fit(model,data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 77.0 M
-----------------------------------------------------
77.0 M    Trainable params
0         Non-trainable params
77.0 M    Total params
307.845   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 1125: 'val_loss' reached 0.14391 (best 0.14391), saving model to '/content/checkpoints/best_cp.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 2250: 'val_loss' reached 0.09822 (best 0.09822), saving model to '/content/checkpoints/best_cp.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [46]:
#trainer.test(model, data_module)

# **Load the Model from checkpoint**

In [48]:
# Prefer the path recorded by the checkpoint callback: the saved file name can
# differ from 'best_cp.ckpt' (e.g. a versioned name when a file already exists)
# and 'checkpoints' is relative to the working directory. Fall back to the
# Colab default location when the callback has not recorded a checkpoint.
cppath = checkpoint_callback.best_model_path or '/content/checkpoints/best_cp.ckpt'
trained_model = NQAModel.load_from_checkpoint(cppath)
# Inference only from here on: freeze the loaded model.
trained_model.freeze()

In [49]:
trainer.test(trained_model, data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.09821627289056778}]

## **Make Prediction on Sample**

In [57]:
def generate_ans(question, source_max_token_len=400, target_max_token_len=32):
    """Generate an answer string for one question/paragraph pair.

    Args:
        question: mapping-like row (e.g. a DataFrame row) with 'question' and
            'paragraph' entries.
        source_max_token_len: length the encoded input is padded/truncated to.
        target_max_token_len: maximum length passed to ``generate``.

    Returns:
        The decoded text of the first generated sequence, special tokens removed.

    Uses the notebook-level ``tokenizer`` and ``trained_model``.
    """
    source_encoding = tokenizer(
        question['question'],
        question['paragraph'],
        max_length = source_max_token_len,
        padding = "max_length",
        # Only the paragraph may be truncated; the question is kept intact.
        truncation = "only_second",
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")

    # The tokenizer output is not placed on the model's device automatically;
    # move it there so generation also works when the model sits on a GPU.
    device = trained_model.device

    generated_ids = trained_model.model.generate(
        input_ids = source_encoding['input_ids'].to(device),
        attention_mask = source_encoding['attention_mask'].to(device),
        num_beams = 1,
        max_length = target_max_token_len,
        repetition_penalty = 2.5,
        length_penalty = 1.0,
        early_stopping = True,
        use_cache = True)

    return tokenizer.decode(generated_ids[0],skip_special_tokens=True,clean_up_tokenization_spaces=True)
    

In [58]:
sample_question = val_df.iloc[1]

In [59]:
sample_question['question']

'When is the first developmental flight of the SSLV scheduled for?'

In [60]:
sample_question['answer']

'the fourth quarter of this year'

In [61]:
generate_ans(sample_question)

'the fourth quarter of this year'

In [62]:
data = {'question': ['What is the amount spent on Helicopter for VVIP?'],
        'paragraph' : ['The federal government in its written response informed the Senate that according to the details provided by the Cabinet Division, an amount of ₹ 946.3 million was spent on VVIP helicopter missions by 6 Aviation Squadron under instructions of the PMs Office from 2019 to 2021 when Imran Khan was the prime minister of the country.']
       }

d = pd.DataFrame(data)

In [63]:
generate_ans(d.iloc[0])

'946.3 million'