In [1]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet tokenizers
!pip install --quiet torch
!pip install --quiet rank-bm25

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m826.2/826.2 KB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 KB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import pandas as pd
import numpy as np
from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap
from rank_bm25 import BM25Okapi
from transformers import (
    AdamW,T5ForConditionalGeneration, AutoTokenizer as Tokenizer)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = '/content/drive/MyDrive/MTP CODE/NewsQA_SPAN.feather'

In [5]:
df = pd.read_feather(path)
df = df.iloc[:5000]

In [None]:
train_df, val_df = train_test_split(df,test_size=0.1)

In [6]:
tokenizer = Tokenizer.from_pretrained('t5-small')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
class MyDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data_row = self.data.iloc[idx]
        question = data_row['question']
        context = data_row['paragraph']
        context = self.tokenizer(context, padding='max_length',truncation = True,return_attention_mask = True,add_special_tokens = True, max_length=512, return_tensors='pt')
        question = self.tokenizer(question, padding='max_length',truncation = True,return_attention_mask = True,add_special_tokens = True, max_length=64, return_tensors='pt')
        return {'context_ids': context['input_ids'].flatten(),
                'context_mask': context['attention_mask'].flatten(),
                'question_ids': question['input_ids'].flatten(),
                'question_mask': question['attention_mask'].flatten()}

In [8]:
class NQADataModule(pl.LightningDataModule):
  def __init__(self,train_df : pd.DataFrame,val_df : pd.DataFrame,tokenizer : Tokenizer,batch_size : int = 4):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.val_df = val_df
    self.tokenizer = tokenizer
  
  def setup(self,stage=None):
    self.train_dataset = MyDataset(self.train_df,self.tokenizer)
    self.val_dataset = MyDataset(self.val_df,self.tokenizer)
  
  def train_dataloader(self):
    return DataLoader(self.train_dataset,batch_size = self.batch_size,shuffle=True,num_workers=4)
  def val_dataloader(self):
    return DataLoader(self.val_dataset,batch_size = self.batch_size,num_workers=4)
  def test_dataloader(self):
    return DataLoader(self.val_dataset,batch_size = self.batch_size,num_workers=4)

In [16]:
class MyModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-small')
        
    def forward(self, input_ids, attention_mask,labels=None):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask,labels=labels,return_dict=True)
        return output.loss, output.encoder_last_hidden_state[:, 0, :]
    
    def training_step(self, batch, batch_idx):
        context_ids, context_mask,question_ids, question_mask = batch['context_ids'], batch['context_mask'], batch['question_ids'], batch['question_mask']
        loss1, question_embedded = self(context_ids, context_mask,question_ids)
        loss2, context_outputs = self(question_ids, question_mask,context_ids)
        mean_loss = (loss1 + loss2) / 2
        similarity_scores = torch.cosine_similarity(context_outputs, question_embedded, dim=1)
        top_k_scores, top_k_indices = torch.topk(similarity_scores, k=3)
        self.log("train_loss",mean_loss,prog_bar=True,logger=True)
        return {'loss': top_k_scores[0]}

    def validation_step(self, batch, batch_idx):
        context_ids, context_mask,question_ids, question_mask = batch['context_ids'], batch['context_mask'], batch['question_ids'], batch['question_mask']
        loss1, question_embedded = self(context_ids, context_mask,question_ids)
        loss2, context_outputs = self(question_ids, question_mask,context_ids)
        mean_loss = (loss1 + loss2) / 2
        similarity_scores = torch.cosine_similarity(context_outputs, question_embedded, dim=1)
        top_k_scores, top_k_indices = torch.topk(similarity_scores, k=3)
        self.log("val_loss",mean_loss,prog_bar=True,logger=True)
        return {'loss': top_k_scores[0]}

    def test_step(self, batch, batch_idx):
        context_ids, context_mask,question_ids, question_mask = batch['context_ids'], batch['context_mask'], batch['question_ids'], batch['question_mask']
        loss1, question_embedded = self(context_ids, context_mask,question_ids)
        loss2, context_outputs = self(question_ids, question_mask,context_ids)
        mean_loss = (loss1 + loss2) / 2
        similarity_scores = torch.cosine_similarity(context_outputs, question_embedded, dim=1)
        top_k_scores, top_k_indices = torch.topk(similarity_scores, k=3)
        self.log("test_loss",mean_loss,prog_bar=True,logger=True)
        return {'loss': top_k_scores[0]}

    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
        return optimizer

In [None]:
data_module = NQADataModule(df,tokenizer,batch_size = 8)
data_module.setup()

In [None]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'best_cp',
    save_top_k = 1,
    verbose = True,
    monitor = 'val_loss',
    mode = 'min'
)

In [None]:
model = MyModel()

In [None]:
trainer = pl.Trainer(gpus=1,
    callbacks=[checkpoint_callback],
    max_epochs = 5)

In [17]:
trainer.fit(model,data_module)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [None]:
trainer.test(model, data_module)

In [46]:
import numpy as np
model.freeze()

def predict_top_k_context(model, data_module, question, k=3):
    # Tokenize the question
    question = data_module.tokenizer(question, padding='max_length', truncation=True, max_length=64, return_tensors='pt')
    question_ids = question['input_ids'].flatten()
    question_mask = question['attention_mask'].flatten()

    # Forward pass through the model to get question embeddings
    question_embeddings = model.model.generate(question_ids.unsqueeze(0), attention_mask=question_mask.unsqueeze(0))
    question_embeddings = torch.transpose(question_embeddings, 0, 1)


    # Initialize list to store context paragraphs and cosine similarity scores
    context_list = []
    scores_list = []

    # Iterate over batches in the data loader
    cnt = 0
    for batch in data_module.train_dataloader():
        cnt += 1
        print("batch = ",cnt)
        # Extract inputs from batch
        context_ids = batch['context_ids']
        context_mask = batch['context_mask']

        # Forward pass through the model to get context embeddings
        context_embeddings = model.model.generate(context_ids, attention_mask=context_mask)

        # Compute cosine similarity between question and context embeddings
        similarity_scores = torch.nn.functional.cosine_similarity(context_embeddings.float(), question_embeddings.float(), dim=1)
        

        # Update context and scores lists with current batch
        for i in range(len(similarity_scores)):
            context_list.append(data_module.train_df.iloc[i]['paragraph'])
            scores_list.append(similarity_scores[i].item())

    # Get indices of top k scores
    top_k_indices = np.argsort(scores_list)[::-1][:k]

    # Get top k context paragraphs
    top_k_contexts = [context_list[i] for i in top_k_indices]

    return top_k_contexts

In [56]:
question = "Who is the managing director of Synergee Capital?"
top_k_contexts = predict_top_k_context(model, data_module, question, k=3)
print(top_k_contexts)

batch =  1


RuntimeError: ignored

In [48]:
top_k_contexts

[' According to financial planners, an example of a ladder strategy for a conservative fixed income investor could be to allocate 25% of her corpus each to a 1 year T-bill, Bharat bond ETF 2025 and AAA Bond plus SDL ETF 2027, 15% to 2031 GOI bonds and 5% each to 2040 and 2050 GOI bonds. "In a rising interest rate cycle that we are now in, investors could have a higher allocation to shorter maturity products typically in the 3-5-year bucket," says Dalal. He believes this allocation can change once the interest rate cycle changes and then investors could allocate more money beyond five-year buckets. Investors betting that interest rates are headed down over the next decade, could allocate some amount to long tenure bonds like 2040, 2050 and 2060. This allows them to lock in their investments into interest rates of 6.9-7% per annum in the long term.',
 ' "Investors can use a combination of government bonds and low expense passive debt funds to build a ladder for their portfolio," says Vik