In [54]:
!nvidia-smi

Thu Feb  2 09:44:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    31W /  70W |   1318MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [55]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet tokenizers
!pip install --quiet torch
!pip install --quiet rank-bm25

# **Import packages**

In [56]:
import json
import textwrap
from pathlib import Path

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from rank_bm25 import BM25Okapi
from sklearn.model_selection import train_test_split
from termcolor import colored
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,GPT2LMHeadModel, GPT2Tokenizer as Tokenizer)

In [57]:
# Fix the global random seed up front so that runs are repeatable.
pl.seed_everything(seed=42)

INFO:lightning_fabric.utilities.seed:Global seed set to 42


RuntimeError: ignored

# **Dataset**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path = '/content/drive/MyDrive/MTP CODE/NewsQA_SPAN.feather'

In [None]:
df = pd.read_feather(path)
df

In [None]:
# Keep only the first 5000 rows. Every later cell that reads `df` (the BM25
# index, the train/validation split, context lookup) works on this subset.
df = df.iloc[:5000]

# **Tokenization**

In [None]:
# Build a BM25 index over every paragraph in `df`.
# Documents are lower-cased and split on single spaces; queries issued against
# `bm25` must be tokenized the same way for the scores to be meaningful.
corpus = df['paragraph'].tolist()
tokenized_corpus = [lowered.split(" ") for lowered in map(str.lower, corpus)]
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
MODEL_NAME = 'gpt2'

In [None]:
# Load the pretrained tokenizer that matches MODEL_NAME.
tokenizer = Tokenizer.from_pretrained(MODEL_NAME)
# Register a dedicated '[PAD]' token only when the tokenizer has no padding
# token of its own.
# NOTE(review): adding a token makes len(tokenizer) larger than the pretrained
# vocabulary, so any model fed with this tokenizer's pad id needs an embedding
# matrix of at least len(tokenizer) rows.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
class NQADataset(Dataset):
  """Question-answering examples laid out for causal-LM (GPT-2) fine-tuning.

  Every item is a single token sequence ``question + context + answer + EOS``
  followed by padding. The context is the paragraph that the module-level
  ``bm25`` index ranks highest for the question; it is looked up in the
  module-level ``df`` (the frame the index was built from).

  A decoder-only model computes its loss position by position, so ``labels``
  has exactly the same length as ``input_ids``. Prompt and padding positions
  are set to -100 so that only the answer tokens (and the closing EOS)
  contribute to the loss.

  Args:
    data: Frame providing the ``question`` and ``answer`` columns.
    tokenizer: Tokenizer used to encode both the prompt and the answer.
    source_max_token_len: Token budget for the question + context prompt.
    target_max_token_len: Token budget for the answer, including the EOS token.

  Raises:
    ValueError: If ``target_max_token_len`` leaves no room for the EOS token.
  """

  def __init__(self,data : pd.DataFrame,tokenizer : Tokenizer,source_max_token_len : int = 400,target_max_token_len : int = 32):
    if target_max_token_len < 1:
      raise ValueError("target_max_token_len must be >= 1 to leave room for the EOS token")

    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.data)

  def __getitem__(self,index : int):
    """Encode the example at ``index``.

    Returns:
      dict with 1-D LongTensors ``input_ids``, ``attention_mask`` and
      ``labels``, each of length
      ``source_max_token_len + target_max_token_len``.
    """
    data_row = self.data.iloc[index]

    # Retrieve the best-matching paragraph for the question. The query is
    # tokenized exactly like the corpus the index was built from.
    ques = data_row['question']
    scores = bm25.get_scores(ques.lower().split(" "))
    best_match = np.argmax(scores)
    context = df.iloc[best_match]['paragraph']

    # Prompt = question + context; only the context is truncated by the
    # tokenizer. The slice is a safety net for a question that alone exceeds
    # the budget (which "only_second" truncation cannot shorten).
    prompt_ids = self.tokenizer(
        ques,
        context,
        max_length = self.source_max_token_len,
        truncation = "only_second",
        add_special_tokens = True)["input_ids"]
    prompt_ids = prompt_ids[:self.source_max_token_len]

    # Answer tokens plus a terminating EOS so the model learns where to stop.
    answer_ids = self.tokenizer(
        data_row['answer'],
        add_special_tokens = True)["input_ids"]
    answer_ids = answer_ids[:self.target_max_token_len - 1] + [self.tokenizer.eos_token_id]

    # The filler id is never attended to nor scored; fall back to EOS when the
    # tokenizer defines no padding token.
    pad_id = self.tokenizer.pad_token_id
    if pad_id is None:
      pad_id = self.tokenizer.eos_token_id

    total_len = self.source_max_token_len + self.target_max_token_len
    n_real = len(prompt_ids) + len(answer_ids)
    n_pad = total_len - n_real

    input_ids = prompt_ids + answer_ids + [pad_id] * n_pad
    attention_mask = [1] * n_real + [0] * n_pad
    # Mask by position, not by token id: -100 is the ignore index of the
    # cross-entropy loss.
    labels = [-100] * len(prompt_ids) + answer_ids + [-100] * n_pad

    return dict(
        input_ids = torch.tensor(input_ids, dtype=torch.long),
        attention_mask = torch.tensor(attention_mask, dtype=torch.long),
        labels = torch.tensor(labels, dtype=torch.long))

In [None]:
sample_dataset = NQADataset(df,tokenizer)

In [None]:
# Sanity check: encode only the first example (the loop breaks immediately)
# and show the first ten input ids and label ids.
for data in sample_dataset:
  print(data['input_ids'][:10])
  print(data['labels'][:10])
  break


In [None]:
# Hold out 10% of the rows for validation. A fixed `random_state` keeps the
# split identical when this cell is re-run or executed out of order.
train_df, val_df = train_test_split(df,test_size=0.1,random_state=42)

In [None]:
train_df.shape, val_df.shape

In [None]:
class NQADataModule(pl.LightningDataModule):
  """Lightning data module that wraps two frames in ``NQADataset`` objects.

  ``train_df`` feeds the training loader; ``test_df`` feeds both the
  validation loader and the test loader.
  """

  def __init__(
      self,
      train_df : pd.DataFrame,
      test_df : pd.DataFrame,
      tokenizer : Tokenizer,
      batch_size : int = 8,
      source_max_token_len : int = 400,
      target_max_token_len : int = 32):
    super().__init__()
    self.train_df, self.test_df = train_df, test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def _build_dataset(self,frame):
    """Wrap ``frame`` in an NQADataset using this module's tokenizer and limits."""
    return NQADataset(
        frame,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len)

  def _held_out_loader(self):
    """Unshuffled loader over the held-out dataset."""
    return DataLoader(self.test_dataset,batch_size = self.batch_size,num_workers=4)

  def setup(self,stage=None):
    """Create the training and held-out datasets."""
    self.train_dataset = self._build_dataset(self.train_df)
    self.test_dataset = self._build_dataset(self.test_df)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle=True,
        num_workers=4)

  def val_dataloader(self):
    """Validation loader; reads the held-out dataset."""
    return self._held_out_loader()

  def test_dataloader(self):
    """Test loader; reads the same held-out dataset as validation."""
    return self._held_out_loader()

In [None]:
type(train_df)

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 2
N_EPOCHS = 3

# `val_df` is passed as the data module's `test_df`, so it backs both the
# validation and the test loaders.
data_module = NQADataModule(train_df,val_df,tokenizer,batch_size = BATCH_SIZE)
data_module.setup()

In [None]:
class NQAModel(pl.LightningModule):
  """Lightning wrapper around a pretrained GPT-2 language-model head.

  The loss is the one computed by the wrapped Hugging Face model from the
  ``labels`` passed to ``forward``.
  """

  def __init__(self):
    super().__init__()

    self.model = GPT2LMHeadModel.from_pretrained(MODEL_NAME,return_dict=True)
    # The notebook's tokenizer may contain tokens added after pretraining
    # (e.g. '[PAD]'); their ids would index past the end of the pretrained
    # embedding matrix. Resizing is a no-op when the sizes already match.
    self.model.resize_token_embeddings(len(tokenizer))
    # Kept for backward compatibility; the step methods use the loss returned
    # by `self.model`, not this criterion.
    self.criterion = CrossEntropyLoss()

  def forward(self,input_ids,attention_mask,labels=None):
    """Run the wrapped model and return ``(loss, logits)``.

    ``loss`` is ``None`` when no ``labels`` are given.
    """
    output = self.model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        labels = labels)

    return output.loss, output.logits

  def _shared_step(self,batch,log_name):
    """Compute the loss for ``batch`` and log it under ``log_name``."""
    # forward() returns a (loss, logits) tuple; only the scalar loss may be
    # logged and returned to Lightning.
    loss, _ = self(batch['input_ids'],batch['attention_mask'],batch['labels'])
    self.log(log_name,loss,prog_bar=True,logger=True)
    return loss

  def training_step(self,batch,batch_idx):
    """Return the training loss for one batch (logged as ``train_loss``)."""
    return self._shared_step(batch,"train_loss")

  def validation_step(self,batch,batch_idx):
    """Return the validation loss for one batch (logged as ``val_loss``)."""
    return self._shared_step(batch,"val_loss")

  def test_step(self,batch,batch_idx):
    """Return the test loss for one batch (logged as ``test_loss``)."""
    return self._shared_step(batch,"test_loss")

  def configure_optimizers(self):
    """Optimize all parameters with AdamW at a learning rate of 1e-4."""
    return AdamW(self.parameters(),lr = 0.0001)

In [None]:
model = NQAModel()

In [None]:
# Keep a single checkpoint: the one with the lowest logged `val_loss`,
# written to the `checkpoints` directory under the base name `best_cp`.
checkpoint_callback = ModelCheckpoint(
    monitor = 'val_loss',
    mode = 'min',
    save_top_k = 1,
    dirpath = 'checkpoints',
    filename = 'best_cp',
    verbose = True,
)

In [None]:
# Train on every visible GPU. `accelerator="gpu", devices=-1` is the
# replacement for the `gpus=-1` argument, which newer Lightning releases no
# longer accept.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS,
    accelerator = "gpu",
    devices = -1
)

In [None]:
!rm -rf checkpoints
!rm -rf lightning_logs

In [None]:
trainer.fit(model,data_module)

In [None]:
#trainer.test(model, data_module)

# **Save Checkpoint to Gdrive**

In [None]:
#!cp -r '/content/checkpoints' '/content/drive/MyDrive/SMDM'

# **Load the Model from checkpoint**

In [None]:
# Prefer the path recorded by the checkpoint callback: the file actually
# written may carry a version suffix (e.g. `best_cp-v1.ckpt`). Fall back to the
# conventional location when nothing was trained in this session.
cppath = checkpoint_callback.best_model_path or '/content/checkpoints/best_cp.ckpt'
trained_model = NQAModel.load_from_checkpoint(cppath)
# Inference only: switch to eval mode and disable gradients.
trained_model.freeze()

In [None]:
trainer.test(trained_model, data_module)

## **Make Prediction on Sample**

In [None]:
def generate_ans(ques):
    """Answer ``ques`` with the fine-tuned model.

    The paragraph that the module-level ``bm25`` index ranks highest for the
    question is used as context (and printed), the prompt
    ``question + context`` is fed to ``trained_model`` and up to 32 new tokens
    are generated greedily.

    Args:
        ques: The question as plain text.

    Returns:
        The decoded continuation only (the prompt is stripped).
    """
    # Tokenize the query the same way the BM25 corpus was tokenized.
    scores = bm25.get_scores(ques.lower().split(" "))
    best_match = np.argmax(scores)
    context = df.iloc[best_match]['paragraph']
    print("-----------------------------------------------------------------")
    print(context)
    print("-----------------------------------------------------------------")

    # No padding: a decoder-only model continues from the last input token, so
    # right-padding a single prompt would make it continue after pad tokens.
    source_encoding = tokenizer(
        ques,
        context,
        max_length = 400,
        truncation = "only_second",
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt").to(trained_model.device)
    prompt_len = source_encoding['input_ids'].shape[1]

    # `max_new_tokens` bounds the answer length; `max_length` would also count
    # the prompt tokens and leave no room to generate. Beam-search-only options
    # are omitted because decoding is greedy (num_beams = 1).
    generated_ids = trained_model.model.generate(
        input_ids = source_encoding['input_ids'],
        attention_mask = source_encoding['attention_mask'],
        num_beams = 1,
        max_new_tokens = 32,
        repetition_penalty = 2.5,
        pad_token_id = tokenizer.eos_token_id,
        use_cache = True)

    # generate() returns prompt + continuation; decode the continuation only.
    preds = [
        tokenizer.decode(generated_id[prompt_len:],skip_special_tokens=True,clean_up_tokenization_spaces=True)
        for generated_id in generated_ids]

    return " ".join(preds)

In [None]:
sample_question = val_df.iloc[10]

In [None]:
sample_question['question']

In [None]:
sample_question['answer']

In [None]:
sample_question['paragraph']

In [None]:
generate_ans(sample_question['question'])

In [None]:
strng = 'Who taught Jaar to turn her singing back inwards?'
generate_ans(strng)