In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
torch.set_float32_matmul_precision('high')
import random
import numpy as np
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer as Tokenizer,AutoModelForSeq2SeqLM
from torch.optim import AdamW



In [2]:
pl.seed_everything (42)

Global seed set to 42


42

In [3]:
MODEL_NAME = 'google/flan-t5-base'

In [4]:
tokenizer = Tokenizer.from_pretrained(MODEL_NAME)

In [5]:
path = 'NewsQA_SPAN.feather'

In [6]:
df = pd.read_feather(path)

In [7]:
df = df.iloc[:5000]

In [8]:
train_df, val_df = train_test_split(df,test_size=0.2)
val_df, test_df = train_test_split(val_df,test_size=0.5)

In [9]:
class QADataset(Dataset):
  def __init__(self,data : pd.DataFrame,tokenizer : Tokenizer,source_max_token_len : int = 200,
               target_max_token_len : int = 20):

    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    return len(self.data)
  
  def __getitem__(self,index : int):
    data_row = self.data.iloc[index]

    source_encoding = tokenizer(
        data_row['question'],
        data_row['paragraph'],
        max_length = self.source_max_token_len,
        padding = "max_length",
        truncation = "only_second",
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")
    
    target_encoding = tokenizer(
        data_row['answer'],
        max_length = self.target_max_token_len,
        padding = "max_length",
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt")
    
    labels = target_encoding["input_ids"]
    

    return dict(
        answer = data_row['answer'],
        input_ids = source_encoding['input_ids'].flatten(),
        attention_mask = source_encoding['attention_mask'].flatten(),
        labels = labels.flatten())


In [10]:
sample_dataset = QADataset(df,tokenizer)

In [11]:
tokenizer.vocab_size

32100

In [12]:
for data in sample_dataset:
  print(type(data['input_ids']))
  print(data['input_ids'])
  print(data['labels'])
  break

<class 'torch.Tensor'>
tensor([ 2645,    19,     8,  5037,  2090,    13,  8951,    49,   397,    15,
         5826,    58,     1,    96, 13898,   127,     7,    54,   169,     3,
            9,  2711,    13,   789, 13237,    11,   731,  8225, 14410,  2814,
         2731,    12,   918,     3,     9, 15812,    21,    70,  4833,   976,
          845,  1813,   157,  2375, 10729,   138,     6,  5037,  2090,     6,
         8951,    49,   397,    15,  5826,     5,  5421, 13015,     7,   243,
         4367,   228,  2153,   710,    18,  1987,     6,  2768,    18,  1987,
           11,   307,    18,  1987,   789, 13237,    28,   128,  8543,  3069,
          494,     5,     3, 17229,     6,     3,     9,   386,    18,  7393,
        20792,  3259,   133,   428,  4367,     3,     9,  6339,    13,  1877,
         2712,     6,     3,     9,    80,    18,  1201,   332,  2876,   133,
         6339,     3, 19708,  4704,    11,     3,     9,  9445,  1201,   789,
         1034,  6339,     7,     3, 27865

In [13]:
class QADataModule(pl.LightningDataModule):
  def __init__(self,train_df , val_df, test_df,tokenizer : Tokenizer,batch_size : int = 8,
               source_max_token_len : int = 200,target_max_token_len : int = 20):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.val_df = val_df
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def setup(self,stage=None):
    self.train_dataset = QADataset(self.train_df,self.tokenizer,self.source_max_token_len,self.target_max_token_len)
    self.val_dataset = QADataset(self.val_df,self.tokenizer,self.source_max_token_len,self.target_max_token_len)
    self.test_dataset = QADataset(self.test_df,self.tokenizer,self.source_max_token_len,self.target_max_token_len)
    

  def train_dataloader(self):
    return DataLoader(self.train_dataset,batch_size = self.batch_size,shuffle=True)

  def val_dataloader(self):
    return DataLoader(self.val_dataset,batch_size = self.batch_size)

  def test_dataloader(self):
    return DataLoader(self.test_dataset,batch_size = self.batch_size)   

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, dropout=dropout, batch_first=True,bidirectional=True)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        outputs, hidden = self.gru(embedded)
        #print("outputs = ",outputs.shape)
        #print("hidden = ",hidden.shape)
        return outputs, hidden
    

In [15]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        
        self.attn = nn.Linear(hidden_size * 3, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden shape: (batch_size, hidden_size)
        # encoder_outputs shape: (batch_size, seq_len, hidden_size)
        
        # Calculate attention energies
        seq_len = encoder_outputs.size(1)
        hidden_expanded = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        attn_inputs = torch.cat((hidden_expanded, encoder_outputs), dim=2)
        #print("attn_inputs = ",attn_inputs.shape)
        attn_energies = self.attn(attn_inputs)
        attn_energies = torch.tanh(attn_energies)
        
        # Calculate attention weights and context vector
        attn_weights = torch.softmax(self.v(attn_energies), dim=1)
        context = torch.bmm(attn_weights.transpose(1, 2), encoder_outputs)

        # The resulting context vector is a summary of the relevant parts of the input sequence that 
        # the decoder should use to generate the next output element. By attending to different parts 
        # of the input sequence at each time step, the Attention mechanism can help the model 
        # generate more accurate outputs.
        
        # Return context vector
        return context


In [16]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers, dropout):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size*3, hidden_size, num_layers, dropout=dropout, batch_first=True,bidirectional=True)
        self.fc_out = nn.Linear(hidden_size*2, output_size)
        self.attention = Attention(hidden_size)

    def forward(self, x, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(x))
        context = self.attention(hidden[-1], encoder_outputs)
        rnn_input = torch.cat((embedded, context), dim=2)
        #print("rnn_input = ",rnn_input.shape)
        output, hidden = self.gru(rnn_input, hidden)
        output = self.fc_out(output)
        return output, hidden

In [17]:
class Seq2Seq(pl.LightningModule):
    def __init__(self, encoder, decoder, pad_idx):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)
        

    def forward(self, src, attn,trg,teacher_forcing_ratio = 0.55):
        batch_size = src.size(0)
        max_len = trg.size(1)
        trg_vocab_size = self.decoder.fc_out.out_features

        encoder_outputs, hidden = self.encoder(src)


        teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        
        
        outputs = torch.zeros(batch_size, max_len, trg_vocab_size).to(self.device)
        output = trg[:, 0]



        if teacher_forcing : 
           for t in range(1, max_len):
                
                output, hidden = self.decoder(output.unsqueeze(1), hidden, encoder_outputs)
                outputs[:, t, :] = output.squeeze(1)
                output = trg[:, t]
        
        else:
            
            for t in range(1, max_len):
                output, hidden = self.decoder(output.unsqueeze(1), hidden, encoder_outputs)
                outputs[:, t, :] = output.squeeze(1)
                top1 = output.argmax(2)
                output = top1.squeeze(1)
        
        return outputs, hidden

    def training_step(self, batch, batch_idx):
        
        src = batch['input_ids']
        attn = batch['attention_mask']
        trg = batch['labels']

        trg_input = trg
        trg_output = trg

        output, hidden = self(src, attn,trg_input)


        output = output.reshape(-1, output.shape[-1])
        trg_output = trg_output.reshape(-1)

        train_loss = self.loss_fn(output, trg_output)

        self.log_dict({"train_loss" : train_loss,
                       },prog_bar=True,logger=True)

        return train_loss

    def validation_step(self, batch, batch_idx):
        src = batch['input_ids']
        attn = batch['attention_mask']
        trg = batch['labels']

        trg_input = trg
        trg_output = trg

        output, hidden = self(src, attn,trg_input)


        output = output.reshape(-1, output.shape[-1])
        trg_output = trg_output.reshape(-1)

        val_loss = self.loss_fn(output, trg_output)

        self.log_dict({"val_loss" : val_loss
                       },prog_bar=True,logger=True)


        return val_loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer


In [18]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'Seq2SeqBiGRUTF',
    save_top_k = 1,
    verbose = True,
    monitor = 'val_loss',
    mode = 'min'
)

In [19]:
encoder = Encoder(input_size=tokenizer.vocab_size, hidden_size=512, num_layers=2, dropout=0.2)

In [20]:
decoder = Decoder(output_size=tokenizer.vocab_size, hidden_size=512, num_layers=2, dropout=0.2)

In [21]:
model = Seq2Seq(encoder,decoder,pad_idx=0)

In [22]:
BATCH_SIZE = 12
N_EPOCHS = 30

data_module = QADataModule(train_df,val_df,test_df,tokenizer,batch_size = BATCH_SIZE)
data_module.setup()

In [23]:
trainer = pl.Trainer(devices=-1, accelerator="gpu",
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [24]:
trainer.fit(model,data_module)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | encoder | Encoder          | 24.3 M
1 | decoder | Decoder          | 61.1 M
2 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
85.5 M    Trainable params
0         Non-trainable params
85.5 M    Total params
341.837   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 376/376 [03:26<00:00,  1.82it/s, loss=5.87, v_num=1, train_loss=5.800, val_loss=5.820]

Epoch 0, global step 334: 'val_loss' reached 5.81502 (best 5.81502), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 1: 100%|██████████| 376/376 [03:21<00:00,  1.86it/s, loss=4.66, v_num=1, train_loss=5.040, val_loss=4.700]

Epoch 1, global step 668: 'val_loss' reached 4.70041 (best 4.70041), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 2: 100%|██████████| 376/376 [03:13<00:00,  1.94it/s, loss=3.98, v_num=1, train_loss=5.200, val_loss=4.310]

Epoch 2, global step 1002: 'val_loss' reached 4.31266 (best 4.31266), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 3: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=3.44, v_num=1, train_loss=4.190, val_loss=4.170]

Epoch 3, global step 1336: 'val_loss' reached 4.17260 (best 4.17260), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 4: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=3.43, v_num=1, train_loss=3.150, val_loss=4.130]

Epoch 4, global step 1670: 'val_loss' reached 4.13438 (best 4.13438), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 5: 100%|██████████| 376/376 [03:13<00:00,  1.94it/s, loss=3.16, v_num=1, train_loss=2.800, val_loss=3.950]

Epoch 5, global step 2004: 'val_loss' reached 3.94689 (best 3.94689), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 6: 100%|██████████| 376/376 [03:13<00:00,  1.94it/s, loss=3.03, v_num=1, train_loss=2.790, val_loss=4.010]

Epoch 6, global step 2338: 'val_loss' was not in top 1


Epoch 7: 100%|██████████| 376/376 [03:13<00:00,  1.94it/s, loss=3.06, v_num=1, train_loss=4.210, val_loss=3.850]

Epoch 7, global step 2672: 'val_loss' reached 3.84899 (best 3.84899), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 8: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=2.93, v_num=1, train_loss=2.320, val_loss=3.840]

Epoch 8, global step 3006: 'val_loss' reached 3.84105 (best 3.84105), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 9: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=2.88, v_num=1, train_loss=2.610, val_loss=3.840]

Epoch 9, global step 3340: 'val_loss' reached 3.83760 (best 3.83760), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 10: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=3, v_num=1, train_loss=5.220, val_loss=3.800]   

Epoch 10, global step 3674: 'val_loss' reached 3.80436 (best 3.80436), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 11: 100%|██████████| 376/376 [03:12<00:00,  1.95it/s, loss=2.91, v_num=1, train_loss=3.780, val_loss=3.850]

Epoch 11, global step 4008: 'val_loss' was not in top 1


Epoch 12: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=2.9, v_num=1, train_loss=2.610, val_loss=3.820] 

Epoch 12, global step 4342: 'val_loss' was not in top 1


Epoch 13: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=3.06, v_num=1, train_loss=3.880, val_loss=3.890]

Epoch 13, global step 4676: 'val_loss' was not in top 1


Epoch 14: 100%|██████████| 376/376 [03:12<00:00,  1.95it/s, loss=3.12, v_num=1, train_loss=3.850, val_loss=3.930]

Epoch 14, global step 5010: 'val_loss' was not in top 1


Epoch 15: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=2.9, v_num=1, train_loss=4.900, val_loss=3.900] 

Epoch 15, global step 5344: 'val_loss' was not in top 1


Epoch 16: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=3.01, v_num=1, train_loss=4.190, val_loss=3.970]

Epoch 16, global step 5678: 'val_loss' was not in top 1


Epoch 17: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=2.92, v_num=1, train_loss=2.200, val_loss=3.900]

Epoch 17, global step 6012: 'val_loss' was not in top 1


Epoch 18: 100%|██████████| 376/376 [03:13<00:00,  1.95it/s, loss=2.95, v_num=1, train_loss=1.730, val_loss=3.760]

Epoch 18, global step 6346: 'val_loss' reached 3.76360 (best 3.76360), saving model to '/home/sushovan/Seq2Seq/checkpoints/Seq2SeqBiGRUTF-v1.ckpt' as top 1


Epoch 19: 100%|██████████| 376/376 [03:12<00:00,  1.95it/s, loss=2.77, v_num=1, train_loss=2.800, val_loss=3.880]

Epoch 19, global step 6680: 'val_loss' was not in top 1


Epoch 20: 100%|██████████| 376/376 [03:16<00:00,  1.92it/s, loss=2.69, v_num=1, train_loss=3.120, val_loss=3.850]

Epoch 20, global step 7014: 'val_loss' was not in top 1


Epoch 21: 100%|██████████| 376/376 [03:31<00:00,  1.78it/s, loss=2.85, v_num=1, train_loss=3.460, val_loss=3.900]

Epoch 21, global step 7348: 'val_loss' was not in top 1


Epoch 22: 100%|██████████| 376/376 [03:24<00:00,  1.84it/s, loss=2.61, v_num=1, train_loss=2.320, val_loss=3.930]

Epoch 22, global step 7682: 'val_loss' was not in top 1


Epoch 23: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.71, v_num=1, train_loss=2.770, val_loss=3.950]

Epoch 23, global step 8016: 'val_loss' was not in top 1


Epoch 24: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.8, v_num=1, train_loss=3.770, val_loss=3.960] 

Epoch 24, global step 8350: 'val_loss' was not in top 1


Epoch 25: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.78, v_num=1, train_loss=2.730, val_loss=3.990]

Epoch 25, global step 8684: 'val_loss' was not in top 1


Epoch 26: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.82, v_num=1, train_loss=2.990, val_loss=4.040]

Epoch 26, global step 9018: 'val_loss' was not in top 1


Epoch 27: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.76, v_num=1, train_loss=2.980, val_loss=3.990]

Epoch 27, global step 9352: 'val_loss' was not in top 1


Epoch 28: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.9, v_num=1, train_loss=1.770, val_loss=3.930] 

Epoch 28, global step 9686: 'val_loss' was not in top 1


Epoch 29: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.88, v_num=1, train_loss=2.600, val_loss=3.980]

Epoch 29, global step 10020: 'val_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 376/376 [03:15<00:00,  1.92it/s, loss=2.88, v_num=1, train_loss=2.600, val_loss=3.980]


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [60]:
cppath = 'checkpoints/Seq2SeqBiGRUTF.ckpt'

In [61]:
trained_model = Seq2Seq.load_from_checkpoint(cppath,encoder=encoder,decoder=decoder,pad_idx=0)
trained_model.freeze()

In [62]:
def predict(question,model):
    
    model.eval()


    ques = question['question'],
    ans = question['answer']

    print("QUESTION : ",ques)
    print("ACTUAL ANS : ",ans)



    # Tokenize the source text
    source_tokens = tokenizer(
        question['question'],
        question['paragraph'],
        max_length=200,
        padding="max_length",
        truncation="only_second",
        add_special_tokens=True,
        return_tensors="pt")['input_ids'].flatten().to(device)

    # Reshape the source tokens to match the expected input shape of the encoder
    source_tokens = source_tokens.unsqueeze(0).to(device)

    model.to(device)

    # Encode the source text
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(source_tokens)

    max_length = 20
    # Initialize the predicted sentence
    outputs = [0]

    # Generate the output sequence token by token
    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).unsqueeze(0).to(device)

        # Decode the next token
        with torch.no_grad():
            output, hidden = model.decoder(previous_word, hidden, encoder_outputs)
            best_guess = output.argmax(2).item()

        # Add the predicted token to the predicted sentence
        outputs.append(best_guess)

        # If the predicted token is the end-of-sequence token, stop generating further tokens
        if best_guess == tokenizer.sep_token_id:
            break

    # Convert the predicted sentence back to text
    
    #print(outputs)
    predicted_text = tokenizer.decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print("PREDICTED ANS : ",predicted_text)

In [63]:
sample_question = test_df.iloc[11]
predict(sample_question,trained_model)

QUESTION :  ('How much did Mudrick Capital Management buy new shares of AMC in June?',)
ACTUAL ANS :  about $230 million
PREDICTED ANS :  $230 million
