<a href="https://colab.research.google.com/github/ambideXtrous9/T5-FineTuned-Model-for-NewsQA/blob/main/NewsQA_T5_model-PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet transformers
!pip install --quiet tokenizers
!pip install --quiet torch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# **Import packages**

In [2]:
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer)

# **Dataset**

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Feather file holding the NewsQA span data, located on the mounted Drive.
path = '/content/drive/MyDrive/MTP CODE/NewsQA_SPAN.feather'

In [5]:
# Read the whole Feather file into a DataFrame; the bare `df` renders a preview of it.
df = pd.read_feather(path)
df

Unnamed: 0,question,answer,ans_pos,paragraph,answer_start,answer_end
0,Who is the managing director of Synergee Capital?,Vikram Dalal,"[133, 145]","""Investors can use a combination of governmen...",133,145
1,What is the yield of 30- and 40-year governmen...,7%,"[565, 567]","""Investors can use a combination of governmen...",565,567
2,What is the name of the ETF 2027 that a conser...,SDL,"[209, 212]","According to financial planners, an example o...",209,212
3,When would a conservative fixed income investo...,2027,"[217, 221]","According to financial planners, an example o...",217,221
4,What year would a conservative fixed income in...,2040,"[260, 264]","According to financial planners, an example o...",260,264
...,...,...,...,...,...,...
481753,When does Uncle Sam reopen for fully vaccinate...,November 8,"[295, 305]",NEW DELHI: This could be the last expansion of...,295,305
481754,When will there be three more weekly flights b...,from second week of November,"[116, 144]",It currently has 23 weekly flights to America....,116,144
481755,What type of 777s would have helped AI have mo...,Boeing,"[306, 312]",It currently has 23 weekly flights to America....,306,312
481756,What was the first wave of AI nonstops?,second,"[11, 17]","Before the second wave this summer, AI had abo...",11,17


In [6]:
# Keep only the first 1000 rows for everything that follows
# (presumably to keep the experiment quick — confirm before scaling up).
df = df.iloc[:1000]

# **Tokenization**

In [7]:
# Pretrained checkpoint name; used to load both the tokenizer and the model.
MODEL_NAME = 't5-base'

In [8]:
# `T5Tokenizer` is the import alias for `T5TokenizerFast` in this notebook.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [33]:
class NQADataset(Dataset):
  """Map-style dataset that converts question/paragraph/answer rows into model tensors.

  Every row of ``data`` must expose the fields ``'question'``, ``'paragraph'``
  and ``'answer'``. The question and paragraph are encoded together as the
  source sequence; the answer is encoded as the target sequence.

  Args:
    data: DataFrame holding the rows; rows are addressed by position.
    tokenizer: Callable tokenizer used for both source and target text. It must
      return a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors and
      expose ``pad_token_id``.
    source_max_token_len: Fixed length the source sequence is padded/truncated to.
    target_max_token_len: Fixed length the target sequence is padded/truncated to.
  """

  def __init__(self,
               data: pd.DataFrame,
               tokenizer: "T5Tokenizer",
               source_max_token_len: int = 400,
               target_max_token_len: int = 32):
    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Return the number of rows in the underlying DataFrame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Encode the row at position ``index``.

    Returns:
      dict with keys ``answer`` (raw answer text), ``input_ids`` and
      ``attention_mask`` (1-D source tensors) and ``labels`` (1-D target ids
      with padding positions replaced by -100).
    """
    data_row = self.data.iloc[index]

    # Always go through the tokenizer injected into this dataset; relying on a
    # module-level `tokenizer` silently ignores the constructor argument.
    source_encoding = self.tokenizer(
        data_row['question'],
        data_row['paragraph'],
        max_length=self.source_max_token_len,
        padding="max_length",
        # Only the paragraph may be shortened; the question is kept intact.
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt")

    target_encoding = self.tokenizer(
        data_row['answer'],
        max_length=self.target_max_token_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt")

    # Work on a copy so the tokenizer's output is not mutated, and mask padding
    # with -100 so those positions are excluded from the loss.
    labels = target_encoding["input_ids"].clone()
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        answer=data_row['answer'],
        input_ids=source_encoding['input_ids'].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten())

In [34]:
# Fixed seed so the train/validation/test partition is identical on every run.
SPLIT_SEED = 42

# 80% for training, 20% held out ...
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
# ... and the held-out part is halved into a test set and a validation set.
test_df, val_df = train_test_split(holdout_df, test_size=0.5, random_state=SPLIT_SEED)

In [35]:
train_df.shape, val_df.shape

((800, 6), (100, 6))

In [36]:
class NQADataModule:
    """Holds the three data splits and builds a ``DataLoader`` for each of them.

    Args:
        train_df: Rows used for training (loader is shuffled).
        val_df: Rows used for validation.
        test_df: Rows used for testing.
        tokenizer: Tokenizer forwarded to every ``NQADataset``.
        batch_size: Batch size shared by all loaders.
        source_max_token_len: Source sequence length forwarded to ``NQADataset``.
        target_max_token_len: Target sequence length forwarded to ``NQADataset``.
        num_workers: Worker processes per loader; ``0`` loads in the main process.
    """

    def __init__(self, train_df, val_df, test_df, tokenizer, batch_size=8,
                 source_max_token_len=400, target_max_token_len=32, num_workers=4):
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.val_df = val_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Do nothing; datasets are built lazily by the ``*_dataloader`` methods."""
        pass

    def _make_loader(self, frame, shuffle=False):
        """Wrap ``frame`` in an ``NQADataset`` and return a ``DataLoader`` over it."""
        dataset = NQADataset(frame, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=self.num_workers)

    def train_dataloader(self):
        """Return a shuffled loader over the training split."""
        return self._make_loader(self.train_df, shuffle=True)

    def val_dataloader(self):
        """Return an unshuffled loader over the validation split."""
        return self._make_loader(self.val_df)

    def test_dataloader(self):
        """Return an unshuffled loader over the test split."""
        return self._make_loader(self.test_df)

In [37]:
# Training hyper-parameters.
BATCH_SIZE = 4
N_EPOCHS = 2

# Token budgets: question + paragraph on the source side, answer on the target side.
SOURCE_MAX_TOKEN_LEN = 400
TARGET_MAX_TOKEN_LEN = 32

data_module = NQADataModule(
    train_df,
    val_df,
    test_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    source_max_token_len=SOURCE_MAX_TOKEN_LEN,
    target_max_token_len=TARGET_MAX_TOKEN_LEN,
)

In [38]:
# Number of batches the test DataLoader yields for the current batch size.
len(data_module.test_dataloader())



25

In [39]:
import torch.nn as nn
from transformers import AutoModelForSeq2SeqLM, AdamW

class NQAModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained sequence-to-sequence model."""

    def __init__(self, model_name):
        """Load the pretrained weights identified by ``model_name``."""
        super().__init__()
        pretrained = AutoModelForSeq2SeqLM.from_pretrained(model_name, return_dict=True)
        self.model = pretrained

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return the pair ``(loss, logits)``.

        ``labels`` is forwarded unchanged; the returned loss is whatever the
        wrapped model reports for it.
        """
        seq2seq_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
        result = self.model(**seq2seq_kwargs)
        return result.loss, result.logits

In [40]:
# Instantiate the wrapper; this loads the pretrained weights named by MODEL_NAME.
model = NQAModel(MODEL_NAME)

In [41]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [42]:
# Move the model's parameters to the selected device; the call's return value
# is what gets rendered as the cell output (the module repr).
model.to(device)

NQAModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_features=3

In [43]:
# Use PyTorch's AdamW: the `AdamW` class shipped with transformers is deprecated upstream.
# eps and weight_decay are spelled out with the values the transformers class used
# as defaults, so the optimisation set-up stays as close as possible to the previous one.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, eps=1e-6, weight_decay=0.0)



In [61]:
import nltk

# Download the NLTK resources before importing the helpers used for scoring.
for resource_name in ('wordnet', 'wordnet_ic', 'punkt'):
    nltk.download(resource_name)

from nltk.translate.meteor_score import meteor_score as meteor
from nltk import word_tokenize

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
# Sample text for a quick check of the METEOR helper.
sentence = "This is a very long sentence."


In [64]:
# Score the sentence against itself. Note the result is slightly below 1.0 even
# for identical text when gamma=1.
meteor([word_tokenize(sentence)], word_tokenize(sentence), gamma=1)

0.9970845481049563

In [67]:
# Build each DataLoader once instead of re-creating it (and its worker processes)
# on every epoch and on every len() call. The training loader still reshuffles
# each time it is iterated.
train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()

for epoch in range(N_EPOCHS):
    # ---- Training pass ----
    model.train()  # dropout on
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        loss, _ = model(input_ids, attention_mask, labels)
        loss.backward()
        optimizer.step()

    # ---- Validation pass ----
    model.eval()  # dropout off, so the loss and the generated text are not perturbed
    val_loss = 0.0
    meteor_score_total = 0.0
    n_val_samples = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            answer = batch['answer']
            loss, _ = model(input_ids, attention_mask, labels)
            val_loss += loss.item()

            generated_ids = model.model.generate(
                input_ids=input_ids,
                # Inputs are padded to a fixed length; the mask keeps padding out of attention.
                attention_mask=attention_mask,
                # Same budget the target answers were tokenized with.
                max_length=data_module.target_max_token_len,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2,
                num_return_sequences=1,
            )

            # Decode the whole batch at once, then score each prediction against its answer.
            gen_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            for target_text, gen_text in zip(answer, gen_texts):
                score = meteor([word_tokenize(target_text)], word_tokenize(gen_text), gamma=0)
                print(f"actual :  {target_text}, Pred : {gen_text} , score : {score}")
                meteor_score_total += score
                n_val_samples += 1

    # Loss is accumulated per batch, METEOR per sample: normalise each by its own count.
    # max(..., 1) guards against an empty validation split.
    val_loss /= max(len(val_loader), 1)
    meteor_score_avg = meteor_score_total / max(n_val_samples, 1)

    # Report the validation metrics for this epoch.
    print(f"Epoch {epoch+1}/{N_EPOCHS}, val Loss: {val_loss}, METEOR: {meteor_score_avg}")



actual :  Donald Trump, Pred : Donald Trump , score : 0.9375
actual :  Jammu, Pred : Jammu , score : 0.5
actual :  Congress, Pred : Congress , score : 0.5
actual :  2020, Pred : 2020 , score : 0.5
actual :  the Securities and Exchange Board of India, Pred : the Securities and Exchange Board , score : 0.7323529411764707
actual :  Bengaluru Urban, Pred : Bengaluru Urban , score : 0.9375
actual :  Mamunul, Pred : Mamunul , score : 0.5
actual :  Eicher Motors, Pred : Eicher Motors , score : 0.9375
actual :  the South China Sea, Pred : South China Sea , score : 0.754985754985755
actual :  Amazon India, Pred : Amazon India , score : 0.9375
actual :  June, Pred : June , score : 0.5
actual :  $140 million, Pred : $140 million , score : 0.9814814814814815
actual :  Jane Fraser, Pred : Jane Fraser , score : 0.9375
actual :  India, Pred : India , score : 0.5
actual :  Vigo Video, Pred : Vigo Video , score : 0.9375
actual :  the coming years, Pred : the coming years , score : 0.9814814814814815
ac

KeyboardInterrupt: ignored

# **Save Checkpoint to Gdrive**