In [2]:
!nvidia-smi

Sun Jan 29 18:10:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [3]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet tokenizers
!pip install --quiet torch

[0m

# **Import packages**

In [4]:
import json
import pandas as pd
import numpy as np
from pathlib import Path

from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap
from transformers import (
    AdamW,T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer)

In [5]:
# Fix the global random seed through Lightning's helper; the call echoes the seed it used.
pl.seed_everything(42)

42

# **Dataset**

In [6]:
path = '/kaggle/input/indian-newsqa/NewsQA_SPAN.feather'

In [7]:
df = pd.read_feather(path)
df

Unnamed: 0,question,answer,ans_pos,paragraph,answer_start,answer_end
0,Who is the managing director of Synergee Capital?,Vikram Dalal,"[133, 145]","""Investors can use a combination of governmen...",133,145
1,What is the yield of 30- and 40-year governmen...,7%,"[565, 567]","""Investors can use a combination of governmen...",565,567
2,What is the name of the ETF 2027 that a conser...,SDL,"[209, 212]","According to financial planners, an example o...",209,212
3,When would a conservative fixed income investo...,2027,"[217, 221]","According to financial planners, an example o...",217,221
4,What year would a conservative fixed income in...,2040,"[260, 264]","According to financial planners, an example o...",260,264
...,...,...,...,...,...,...
481753,When does Uncle Sam reopen for fully vaccinate...,November 8,"[295, 305]",NEW DELHI: This could be the last expansion of...,295,305
481754,When will there be three more weekly flights b...,from second week of November,"[116, 144]",It currently has 23 weekly flights to America....,116,144
481755,What type of 777s would have helped AI have mo...,Boeing,"[306, 312]",It currently has 23 weekly flights to America....,306,312
481756,What was the first wave of AI nonstops?,second,"[11, 17]","Before the second wave this summer, AI had abo...",11,17


In [8]:
df = df.iloc[:5000]

# **Tokenization**

In [9]:
MODEL_NAME = 't5-base'

In [10]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
class NQADataset(Dataset):
  """Torch dataset that turns question / paragraph / answer rows into T5 inputs.

  Every item encodes the question together with its paragraph as the source
  sequence and the answer as the target sequence. Both are padded (and, if
  needed, truncated) to fixed lengths so that items can be batched directly.

  Args:
    data: frame providing the 'question', 'paragraph' and 'answer' columns.
    tokenizer: callable tokenizer used for both source and target text; it
      must expose `pad_token_id`.
    source_max_token_len: fixed length of the encoded question + paragraph.
    target_max_token_len: fixed length of the encoded answer.
  """

  def __init__(self,data : pd.DataFrame,tokenizer : T5Tokenizer,source_max_token_len : int = 396,target_max_token_len : int = 32):

    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self,index : int):
    """Encode the row at positional `index`.

    Returns:
      dict with the raw 'question', 'context' and 'answer' values plus the
      flattened 'input_ids', 'attention_mask' and 'labels' tensors.
    """
    data_row = self.data.iloc[index]

    # Always go through the tokenizer given to this instance; relying on a
    # notebook-level global would silently ignore the constructor argument.
    source_encoding = self.tokenizer(
        data_row['question'],
        data_row['paragraph'],
        max_length = self.source_max_token_len,
        padding = "max_length",
        # Only the paragraph may be shortened; the question is kept intact.
        truncation = "only_second",
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")

    target_encoding = self.tokenizer(
        data_row['answer'],
        max_length = self.target_max_token_len,
        padding = "max_length",
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")

    # Replace padding positions in the target with -100 (the default
    # ignore index of PyTorch's cross-entropy loss). The pad id is read from
    # the tokenizer instead of assuming it is 0.
    labels = target_encoding["input_ids"]
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        question = data_row['question'],
        context = data_row['paragraph'],
        answer = data_row['answer'],
        input_ids = source_encoding['input_ids'].flatten(),
        attention_mask = source_encoding['attention_mask'].flatten(),
        labels = labels.flatten())

In [12]:
sample_dataset = NQADataset(df,tokenizer)

In [13]:
# Peek at the first encoded example only: raw text first, then the leading token ids.
for data in sample_dataset:
  print(data['question'], data['answer'], sep='\n')
  print(data['input_ids'][:10], data['labels'][:10], sep='\n')
  break


Who is the managing director of Synergee Capital?
Vikram Dalal
tensor([2645,   19,    8, 5037, 2090,   13, 8951,   49,  397,   15])
tensor([ 1813,   157,  2375, 10729,   138,     1,  -100,  -100,  -100,  -100])


In [14]:
# Pin the split's seed explicitly: without `random_state` the partition depends
# on the global NumPy RNG state, so re-running this cell out of order would
# produce a different train/validation split.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [15]:
train_df.shape, val_df.shape

((4500, 6), (500, 6))

In [16]:
class NQADataModule(pl.LightningDataModule):
  """Lightning data module that serves NQADataset loaders.

  The training loader shuffles and uses `batch_size`; the validation and test
  loaders both read the dataset built from `test_df`, one example per batch.
  """

  def __init__(self,train_df : pd.DataFrame,test_df : pd.DataFrame,tokenizer : T5Tokenizer,batch_size : int = 8,source_max_token_len : int = 396,target_max_token_len : int = 32):
    super().__init__()
    # Raw frames; the datasets themselves are only created in setup().
    self.train_df, self.test_df = train_df, test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    # Sequence-length limits forwarded unchanged to every NQADataset.
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def _make_dataset(self, frame):
    """Wrap `frame` in an NQADataset using this module's tokenizer and length limits."""
    return NQADataset(frame, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

  def _eval_loader(self):
    """Loader over the held-out dataset, shared by validation and test."""
    return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

  def setup(self,stage=None):
    """Build the training and held-out datasets from the stored frames."""
    self.train_dataset = self._make_dataset(self.train_df)
    self.test_dataset = self._make_dataset(self.test_df)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4)

  def val_dataloader(self):
    """Validation loader (held-out dataset, batch size 1)."""
    return self._eval_loader()

  def test_dataloader(self):
    """Test loader (same held-out dataset, batch size 1)."""
    return self._eval_loader()

In [17]:
type(train_df)

pandas.core.frame.DataFrame

In [18]:
# Training hyperparameters read by the cells below.
BATCH_SIZE, N_EPOCHS = 4, 2

# Wrap the train / validation frames in the data module and build its datasets right away.
data_module = NQADataModule(
    train_df=train_df,
    test_df=val_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [19]:
class NQAModel(pl.LightningModule):
  """Lightning wrapper around a pretrained T5 conditional-generation model.

  The train / validation / test steps all run the same forward pass and log
  the resulting loss under 'train_loss', 'val_loss' and 'test_loss'.
  """

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,return_dict=True)

  def forward(self,input_ids,attention_mask,labels=None):
    """Run the wrapped model and return its `(loss, logits)` pair."""
    output = self.model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        labels = labels)

    return output.loss, output.logits

  def _shared_step(self, batch, metric_name, sync_dist=False):
    """Forward one batch, log its loss as `metric_name`, and return the loss."""
    loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
    self.log(metric_name, loss, prog_bar=True, logger=True, sync_dist=sync_dist)
    return loss

  def training_step(self,batch,batch_idx):
    """Compute and log the training loss for one batch."""
    return self._shared_step(batch, "train_loss")

  def validation_step(self,batch,batch_idx):
    """Compute and log the validation loss for one batch."""
    # Reduce across processes so the 'val_loss' that checkpointing monitors
    # reflects the whole validation set when training on several devices.
    return self._shared_step(batch, "val_loss", sync_dist=True)

  def test_step(self,batch,batch_idx):
    """Compute and log the test loss for one batch."""
    # Same cross-process reduction as validation, for a consistent reported metric.
    return self._shared_step(batch, "test_loss", sync_dist=True)

  def configure_optimizers(self):
    """Optimise every parameter with AdamW at a fixed learning rate of 1e-4."""
    return AdamW(self.parameters(),lr = 0.0001)

In [20]:
model = NQAModel()

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [21]:
# Keep a single checkpoint file ('checkpoints/best_cp'): the one with the lowest logged 'val_loss'.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    dirpath='checkpoints',
    filename='best_cp',
    verbose=True,
)

In [22]:
# `Trainer(gpus=...)` is deprecated since Lightning v1.7; select the accelerator
# type and the device count explicitly instead.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=-1,  # -1 selects every available GPU
)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"


In [23]:
!rm -rf checkpoints
!rm -rf lightning_logs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
trainer.fit(model,data_module)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  cpuset_checked))


Sanity Checking: 0it [00:00, ?it/s]

  cpuset_checked))
  "num_workers>0, persistent_workers=False, and strategy=ddp_spawn"


Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [25]:
trainer.test(model, data_module)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  cpuset_checked))
  cpuset_checked))
  "num_workers>0, persistent_workers=False, and strategy=ddp_spawn"


Testing: 0it [00:00, ?it/s]



[{'test_loss': 0.12451238185167313}]

In [32]:
import os
os.chdir(r'/kaggle/working/checkpoints')

In [33]:
from IPython.display import FileLink
FileLink(r'best_cp.ckpt')

In [35]:
model.freeze()

In [57]:
def generate_ans(question):
    """Generate an answer string for one question / paragraph pair.

    Args:
        question: mapping-like row exposing 'question' and 'paragraph'
            (for example a row taken from the dataset frame).

    Returns:
        The decoded generated sequence(s), joined with single spaces.

    Relies on the notebook-level `tokenizer` and `model` objects.
    """
    source_encoding = tokenizer(
        question['question'],
        question['paragraph'],
        max_length = 396,
        padding = "max_length",
        truncation = "only_second",
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")

    # The tokenizer returns CPU tensors; move them to wherever the model's
    # weights live so generation also works when the model sits on a GPU.
    device = model.model.device

    generated_ids = model.model.generate(
        input_ids = source_encoding['input_ids'].to(device),
        attention_mask = source_encoding['attention_mask'].to(device),
        num_beams = 1,
        max_length = 80,
        repetition_penalty = 2.5,
        length_penalty = 1.0,
        early_stopping = True,
        use_cache = True)

    preds = [
        tokenizer.decode(generated_id,skip_special_tokens=True,clean_up_tokenization_spaces=True)
        for generated_id in generated_ids]

    return " ".join(preds)

In [58]:
sample_question = val_df.iloc[5]

In [59]:
sample_question['question']

'Who ordered the sale of spectrum to Reliance Communications?'

In [60]:
sample_question['answer']

'the National Company Law Appellate Tribunal'

In [61]:
generate_ans(sample_question)

'the National Company Law Appellate Tribunal'

In [63]:
# Hand-written question / paragraph pair, wrapped in a one-row frame for a manual check.
data = dict(
    question=['What is the amount spent on Helicopter?'],
    paragraph=['The federal government in its written response informed the Senate that according to the details provided by the Cabinet Division, an amount of ₹ 946.3 million was spent on VVIP helicopter missions by 6 Aviation Squadron under instructions of the PMs Office from 2019 to 2021 when Imran Khan was the prime minister of the country.'],
)

d = pd.DataFrame(data)

In [66]:
generate_ans(d.iloc[0])

' 946.3 million'