# Zeroshot
- T5
- Flan-T5
- GPT2
- DistilBERT
- RoBERTa
- Llama2 (with PEFT & LoRA)

# Install Necessary Libraries

In [None]:
# !pip install -Uq transformers
# !pip install -Uq evaluate
# !pip install -Uq SentencePiece
# !pip install rouge-score

In [None]:
# !pip install accelerate>=0.20.1
# !pip install transformers[torch]
# # You need to restart the kernel after this step

# Load Libraries and Configurations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
# from tqdm import tqdm

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, RandomSampler
import torch.nn.functional as F
# import tensorflow as tf

# import nltk
# import spacy
# import string

from sklearn.model_selection import train_test_split

import transformers
import evaluate  # Bleu

import warnings
warnings.filterwarnings("ignore")

In [None]:
import configparser

# Location of the INI file that holds the data folders and the API keys, so
# that none of them has to be written into the notebook itself.
apipath = r'H:\\My Drive\\config\\hbqa.txt'
# apipath = r'/content/drive/MyDrive/config/hbqa-colab.txt'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list therefore means "config missing".
# Fail here with a clear message instead of a later KeyError on 'global'.
if not config.read(apipath):
    raise FileNotFoundError(f"Config file not found or unreadable: {apipath}")

# Folder locations (all read from the [global] section).
datapath = config['global']['DATA_FOLDER']
corpuspath = config['global']['CORPUS_FOLDER']
corpus_sectionpath = config['global']['CORPUS_FOLDER_SECTIONS']
# Service credentials (also from the [global] section).
OPENAI_KEY = config['global']['OPENAI_KEY']
PINECONE_KEY = config['global']['PINECONE_KEY']
PINECONE_ENV = config['global']['PINECONE_ENV']
CHATPDF_KEY = config['global']['CHATPDF_KEY']

In [None]:
# Load the question/answer table from the configured data folder.
# Later cells read the columns Question, Chunk, Ref_Answer and Ques_Id from it.

# Optional (Colab): fetch the config file from Google Drive before running the
# configuration cell. Uncomment the lines below to use it.
# !pip install -q gdown
# import gdown

# # Replace the shared link with the actual link to your file
# file_url = 'https://drive.google.com/uc?id=1Euvnmp8yJ2LGlL2uDvDjER87PYz9RVvS'
# output_path = '/content/hbqa-colab.txt'  # Specify the desired file name and path

# gdown.download(file_url, output_path, quiet=False)

df = pd.read_csv(datapath + '06-HBQA_Manual_with_Chunk.csv')


In [None]:
print(df.shape)
df.head(2)

(1104, 3)


Unnamed: 0,Ques_Id,Ref_Answer,T5_Pred_Answer
0,10000,The Muni wandered over the earth and wept loud...,The Muni wandered over the earth and weep loud...
1,10001,The Muni asked for a bride from the creatures ...,The Muni asked for a bride from the


In [None]:

# df['WordsInChunk'] = df.Chunk.str.split(' ').apply(len)
# df['CharInChunk'] = df.Chunk.apply(len)
# df.to_csv(r'H:\My Drive\HBQA\Data\06-HBQA_Manual_with_Chunk.csv')

In [None]:
# # Detect and initialize TPU
# tpu_available = tf.config.experimental.list_logical_devices("TPU")
# if tpu_available:
#     print("TPU available")
# else:
#     print("No TPU available")

In [None]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(DEVICE)

cpu


In [None]:
# sample code
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# TOKENIZER = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL = AutoModelForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL.to(DEVICE)

# Common Setting for Training

In [None]:
# Longest question and longest reference answer, measured in characters and
# divided by 4 to express the length as an approximate token count.
Question_Len = max(map(len, df.Question)) // 4
Answer_Len = max(map(len, df.Ref_Answer)) // 4
Question_Len,Answer_Len # in Tokens

(34, 222)

In [None]:
# Shared tokenization limits.
# Q_LEN is passed as max_length when a question is tokenized together with its
# context chunk; T_LEN is passed as max_length when an answer is tokenized.
# The trailing "#256" / "#32" are earlier values kept for reference.
Q_LEN =  150 #256   # Question Length
T_LEN =  500 #32  # Target Length
BATCH_SIZE = 4
# DEVICE = "cuda:0"

# Load Base Model for Finetuning

In [None]:
class QA_Dataset(Dataset):
  """Torch dataset that turns (question, context, answer) rows into model inputs.

  Args:
      tokenizer: callable tokenizer; called as ``tokenizer(question, context, ...)``
          and ``tokenizer(answer, ...)`` and expected to return a mapping with
          ``"input_ids"`` and ``"attention_mask"``.
      dataframe: pandas DataFrame with the columns ``Question``, ``Chunk`` and
          ``Ref_Answer``. Its index may be arbitrary (e.g. a shuffled split).
      q_len: max_length used for the question+context encoding.
      t_len: max_length used for the answer encoding.
  """

  def __init__(self, tokenizer, dataframe, q_len, t_len):
      self.tokenizer = tokenizer
      self.q_len = q_len
      self.t_len = t_len
      self.data = dataframe
      self.questions = self.data["Question"]
      self.context = self.data["Chunk"]
      self.answer = self.data['Ref_Answer']

  def __len__(self):
      return len(self.questions)

  def __getitem__(self, idx):
      """Return the tensors for the row at *position* ``idx``."""
      # Positional access: a DataLoader hands out positions 0..len-1, which do
      # not match the index labels once the frame has been split or filtered.
      question = self.questions.iloc[idx]
      context = self.context.iloc[idx]
      answer = self.answer.iloc[idx]

      # padding="max_length" already pads to max_length; the deprecated
      # pad_to_max_length flag is therefore not passed.
      ques_cont_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                           truncation=True, add_special_tokens=True)
      answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                        truncation=True, add_special_tokens=True)

      decoder_attention_mask = torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)

      labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
      # Padded positions are set to -100. The attention mask identifies them
      # independently of which id the tokenizer uses for padding.
      labels[decoder_attention_mask == 0] = -100

      return {
          "input_ids": torch.tensor(ques_cont_tokenized["input_ids"], dtype=torch.long),
          "attention_mask": torch.tensor(ques_cont_tokenized["attention_mask"], dtype=torch.long),
          "labels": labels,
          "decoder_attention_mask": decoder_attention_mask
      }

# T5 Model

In [None]:
# Zero shot Transformers/Models
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

In [None]:
# Load the pretrained "t5-base" checkpoint together with its fast tokenizer and
# move the model to the device selected earlier (DEVICE).
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
model.to(DEVICE)
# NOTE(review): an optimizer is created here, but no training step is visible
# in this part of the notebook — confirm whether it is still needed.
optimizer = Adam(model.parameters(), lr=0.00001)

## Function to predict Answers from t5 model

In [None]:
def predict_answer(context, question, max_answer_tokens=None):
    """Generate an answer for ``question`` given ``context`` with the loaded model.

    Uses the module-level ``tokenizer``, ``model``, ``DEVICE``, ``Q_LEN`` and
    ``T_LEN``.

    Args:
        context: passage text the answer should come from. A one-element
            tuple/list (as produced by a stray trailing comma at the call
            site) is unwrapped to its single string.
        question: question text.
        max_answer_tokens: upper bound on the generated sequence length.
            Defaults to ``T_LEN``. Without an explicit bound ``generate()``
            falls back to the library's short default limit, which cut the
            answers off mid-sentence.

    Returns:
        The decoded answer string with special tokens removed.
    """
    if max_answer_tokens is None:
        max_answer_tokens = T_LEN

    # Callers in this notebook accidentally passed `(chunk,)`; accept that form.
    if isinstance(context, (tuple, list)) and len(context) == 1:
        context = context[0]

    inputs = tokenizer(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    # Add the batch dimension expected by generate().
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                             max_length=max_answer_tokens)

    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)
    return predicted_answer

## Check Random Answers

In [None]:
import random

N=5

# Pick N random row positions. randrange excludes the upper bound, so every
# position is valid for df.iloc (randint(0, len) could return len itself).
qno = [random.randrange(df.shape[0]) for _ in range(N)]

pred_answers=[]
ref_answers=[]
question=[]
sample_rows=[]
for i in qno:
    row      = df.iloc[i]
    chunk    = row['Chunk']          # plain string (no trailing comma -> no tuple)
    ques     = row['Question']
    ref_ans  = row['Ref_Answer']

    pred_ans = predict_answer(chunk, ques)

    pred_answers.append(pred_ans)
    ref_answers.append(ref_ans)
    question.append(ques)
    sample_rows.append({'Ques_Id': row['Ques_Id'], 'Ref_Answer': ref_ans, 'T5Pred_Answer': pred_ans})

    print('Question  :', ques)
    print("Ref Answer:", ref_ans)
    print("Pred Ans  :", pred_ans)
    print('--------')

# Build the comparison table once from the collected records.
df1 = pd.DataFrame(sample_rows, columns=['Ques_Id','Ref_Answer','T5Pred_Answer'])

Question  : Who was the ruler of the kingdom of Maghadha that King Pandu defeated?
Ref Answer: King Pandu defeated Dhirga, the ruler of the kingdom of Maghadha.
Pred Ans  : King Pandu defeated the ruler of the kingdom of Maghadha, which was the kingdom of
--------
Question  : What did Krishna do with the alms they obtained during their eleemosynary visits?
Ref Answer: Krishna devoted a portion of the alms to the gods, gave another portion to Brahmanas as gifts, gave a portion to the venerable lady, and distributed the rest among the five foremost men.
Pred Ans  : Krishna disposed of the alms that he had given to the Pandavas during their
--------
Question  : Why did the Muni throw himself into the river Satadru (the river of a hundred courses)?
Ref Answer: The Muni threw himself into the river Satadru, mistaking it for a mass of unquenchable fire, seeking to end his life.
Pred Ans  : The Muni threw himself into the river Satadru because he was afraid of
--------
Question  : Whom did Ya

## Predict All Answers & Save Predictions

In [None]:
import random

# Run the model over every row and keep questions, references and predictions
# in parallel lists (same order as df).
pred_answers=[]
ref_answers=[]
questions=[]
for i in range(df.shape[0]):
    row      = df.iloc[i]
    chunk    = row['Chunk']          # plain string (no trailing comma -> no tuple)
    ques     = row['Question']
    ref_ans  = row['Ref_Answer']

    pred_ans = predict_answer(chunk, ques)

    pred_answers.append(pred_ans)
    ref_answers.append(ref_ans)
    questions.append(ques)

    print(f"Predicting ans for question {row['Ques_Id']}")

# Assign by position in one step. Rows were read by position (iloc), so the
# predictions line up with df regardless of its index labels.
df['T5Pred_Answer'] = pred_answers

Predicting ans for question 10000
Predicting ans for question 10001
Predicting ans for question 10002
Predicting ans for question 10003
Predicting ans for question 10004
Predicting ans for question 10005
Predicting ans for question 10006
Predicting ans for question 10007
Predicting ans for question 10008
Predicting ans for question 10009
Predicting ans for question 10010
Predicting ans for question 10011
Predicting ans for question 10012
Predicting ans for question 10013
Predicting ans for question 10014
Predicting ans for question 10015
Predicting ans for question 10016
Predicting ans for question 10017
Predicting ans for question 10018
Predicting ans for question 10019
Predicting ans for question 10020
Predicting ans for question 10021
Predicting ans for question 10022
Predicting ans for question 10023
Predicting ans for question 10024
Predicting ans for question 10025
Predicting ans for question 10026
Predicting ans for question 10027
Predicting ans for question 10028
Predicting ans

In [None]:
# Persist the predictions. index=False keeps the row index out of the file so
# that reloading it does not add an extra "Unnamed: 0" column.
df[['Ques_Id','Ref_Answer','T5Pred_Answer']].to_csv(datapath + '09.11-T5Predicted_Ans-Zeroshot.csv', index=False)
# df= pd.read_csv(datapath + '11.1-t5Predicted_ans.csv')

## Calculate and Save Metrics - 1

In [None]:
# Reload previously saved predictions. The embedding cell further down reads
# this frame under the name `df_t5Predicted_Ans`, so bind that name here.
# NOTE(review): this file name differs from the one written by the save cell
# ('09.11-T5Predicted_Ans-Zeroshot.csv') — confirm which file is intended.
df_t5Predicted_Ans = pd.read_csv(datapath + '09.11-T5Predicted_Ans_E1.csv')
df1 = df_t5Predicted_Ans  # original name kept for any cell that still reads df1


# Util Functions

In [None]:
#Select Model Function

# https://www.sbert.net/docs/pretrained_models.html

#250MB, multi-qa-distilbert-cos-v1',  Max Sequence Length:	512, Dimensions:768, Normalized Embeddings:	true
#80MB, all-MiniLM-L6-v2, Max Sequence Length:	256, Dimensions:	384, Normalized Embeddings:	true
#290MB, all-distilroberta-v1, Max Sequence Length:	512, Dimensions:	768, Normalized Embeddings:	true
#420MB, all-mpnet-base-v2, Max Sequence Length:	384, Dimensions:	768, Normalized Embeddings:	true
#1.36GB, all-roberta-large-v1, Max Sequence Length:	256, Dimensions: 1024, Normalized Embeddings:	true

def select_model(num):
    """Look up a sentence-embedding model by its position in the catalogue.

    Args:
        num: index into the five-entry catalogue below.

    Returns:
        ``(full_name, short_name, "_" + full_name)``; the same three values are
        also printed, tab-separated.
    """
    # (full model name, short label) pairs, in selection order.
    catalogue = [
        ('multi-qa-distilbert-cos-v1', 'distilbert'),
        ('all-MiniLM-L6-v2', 'minilm'),
        ('all-distilroberta-v1', 'distilroberta'),
        ('multi-qa-mpnet-base-dot-v1', 'mpnet'),
        ('all-roberta-large-v1', 'roberta'),
    ]

    full_name, short_name = catalogue[num]
    suffixed_name = "_" + full_name

    print (full_name,'\t',short_name,'\t', suffixed_name)
    return full_name, short_name, suffixed_name

In [None]:
# def calculate_score(ref_answer, predicted_answer):

#   pred_answer_ids = tokenizer.encode(predicted_answer, return_tensors='pt')[0][0]
#   # pred_answer_ids = pred_answer_ids.to(DEVICE)

#   ref_answer_ids = tokenizer.encode(ref_answer, return_tensors='pt')[0][0]
#   # ref_answer_ids = pred_answer_ids.to(DEVICE)

#   bleu = evaluate.load("google_bleu")
#   bleu_score1  = bleu.compute(predictions=[predicted_answer], references=[ref_answer])

#   # squad = evaluate.load("squad")
#   glue_qqp = evaluate.load('glue', 'qqp')

#   glue_qqp_score1 = glue_qqp.compute(predictions=[pred_answer_ids],
#                       references=[ref_answer_ids])

#   return bleu_score1, glue_qqp_score1 #squad_score1 #bleu_score1#, squad_score1, glue_score1

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import precision_score, recall_score
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
import numpy as np

smoother = SmoothingFunction()
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def get_nlp_metrics(ref_tokens, pred_tokens):
  """Score one predicted answer against its reference answer.

  Uses the module-level ``scorer`` (ROUGE) and ``smoother`` (BLEU smoothing).

  Args:
      ref_tokens: list of word tokens of the reference answer.
      pred_tokens: list of word tokens of the predicted answer.

  Returns:
      numpy array with nine values, in this order:
      BLEU, ROUGE-1 precision, ROUGE-1 recall, ROUGE-1 F1,
      ROUGE-L precision, ROUGE-L recall, ROUGE-L F1,
      token precision, token recall.
  """
  from collections import Counter  # local import keeps the block self-contained

  # ROUGE is computed from the arguments (joined back into text, since the
  # scorer takes strings), not from notebook globals left over by a caller.
  scores = scorer.score(' '.join(ref_tokens), ' '.join(pred_tokens))

  rouge_1_precision = scores['rouge1'].precision
  rouge_1_recall = scores['rouge1'].recall
  rouge_1_f1 = scores['rouge1'].fmeasure

  rouge_l_precision = scores['rougeL'].precision
  rouge_l_recall = scores['rougeL'].recall
  rouge_l_f1 = scores['rougeL'].fmeasure

  # Calculate BLEU score for a single sentence
  bleu_score = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoother.method2)

  # Token overlap counted with multiplicity, so numerator and denominators use
  # the same unit (a set-based count can never reach 1.0 when a token repeats).
  overlap = sum((Counter(ref_tokens) & Counter(pred_tokens)).values())
  # Empty inputs score 0 instead of raising ZeroDivisionError.
  precision = overlap / len(pred_tokens) if len(pred_tokens) else 0.0
  recall = overlap / len(ref_tokens) if len(ref_tokens) else 0.0

  return np.array([bleu_score, rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1, precision, recall])


In [None]:
# Select Model
embmodelname, embmodelshort, embmodelname1 = select_model(1)

all-MiniLM-L6-v2 	 minilm 	 _all-MiniLM-L6-v2


# Create Answer Embedding

In [None]:
# Load the embedding model picked by select_model() from the
# 'sentence-transformers/' namespace.
# NOTE(review): no visible cell imports SentenceTransformer — confirm that
# `from sentence_transformers import SentenceTransformer` runs before this cell.
emb_model = SentenceTransformer('sentence-transformers/'+embmodelname)

In [None]:
# Embed every predicted answer with the selected sentence-embedding model.
T5PredAns_Sentences = df_t5Predicted_Ans['T5Pred_Answer'].tolist()
T5PredAns_Embeddings = emb_model.encode(T5PredAns_Sentences)

# Keep a float tensor copy on DEVICE and a plain nested-list copy for storage.
T5PredAnsVectors = torch.tensor(T5PredAns_Embeddings, dtype=torch.float).to(DEVICE)
T5PredAnsVec_list = T5PredAnsVectors.tolist()

# One embedding (list of floats) per row, stored next to its prediction.
df_t5Predicted_Ans['T5_AnsVector'] = T5PredAnsVec_list

vector_csv = datapath+'09.11-T5Predicted_AnsVector_E2.csv'
df_t5Predicted_Ans.to_csv(vector_csv)

In [None]:
# Empty table for the per-question scores: the question id followed by the nine
# values returned by get_nlp_metrics, in the same order. The seventh metric is
# the ROUGE-L F1 (a second 'ROUGE1_F1' would create a duplicate column name).
df1 = pd.DataFrame(columns = ['Ques_Id','BLEU1', 'ROUGE1_P', 'ROUGE1_R', 'ROUGE1_F1',
                              'ROUGEL_P', 'ROUGEL_R', 'ROUGEL_F1', 'Precision', 'Recall'])


In [None]:
# Score every prediction against its reference answer and collect one row per
# question. The column order matches the array returned by get_nlp_metrics.
metric_columns = ['BLEU1', 'ROUGE1_P', 'ROUGE1_R', 'ROUGE1_F1',
                  'ROUGEL_P', 'ROUGEL_R', 'ROUGEL_F1', 'Precision', 'Recall']

N= df.shape[0]

metric_rows = []
for i in range(N):
  ques_id = df.loc[i,'Ques_Id']
  ref_ans  = str(df.loc[i,'Ref_Answer'])
  pred_ans = str(df.loc[i,'T5Pred_Answer'])

  # get_nlp_metrics works on word tokens, not raw strings.
  scores = get_nlp_metrics(word_tokenize(ref_ans), word_tokenize(pred_ans))
  metric_rows.append([ques_id, *scores])

# Build the result table once instead of growing it row by row.
df1 = pd.DataFrame(metric_rows, columns=['Ques_Id'] + metric_columns)

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'google_bleu': 0.3673469387755102} {'accuracy': 1.0, 'f1': 0.0} 10000
{'google_bleu': 0.23529411764705882} {'accuracy': 1.0, 'f1': 0.0} 10001
{'google_bleu': 0.14102564102564102} {'accuracy': 1.0, 'f1': 0.0} 10002
{'google_bleu': 0.0425531914893617} {'accuracy': 1.0, 'f1': 0.0} 10003
{'google_bleu': 0.0660377358490566} {'accuracy': 1.0, 'f1': 0.0} 10004
{'google_bleu': 0.5476190476190477} {'accuracy': 1.0, 'f1': 0.0} 10005
{'google_bleu': 0.3} {'accuracy': 1.0, 'f1': 0.0} 10006
{'google_bleu': 0.07407407407407407} {'accuracy': 1.0, 'f1': 0.0} 10007
{'google_bleu': 0.15306122448979592} {'accuracy': 1.0, 'f1': 0.0} 10008
{'google_bleu': 0.23636363636363636} {'accuracy': 1.0, 'f1': 0.0} 10009
{'google_bleu': 0.12222222222222222} {'accuracy': 1.0, 'f1': 0.0} 10010
{'google_bleu': 0.46938775510204084} {'accuracy': 1.0, 'f1': 0.0} 10011
{'google_bleu': 0.14935064935064934} {'accuracy': 1.0, 'f1': 0.0} 10012
{'google_bleu': 0.46938775510204084} {'accuracy': 1.0, 'f1': 0.0} 10013
{'google_ble

In [None]:
df1.sample(5)

Unnamed: 0,Ques_Id,BLEU_Score,GLUE_Acc,GLUE_F1
309,10309.0,0.030471,1.0,0.0
1041,11041.0,0.088889,1.0,0.0
381,10381.0,0.015385,1.0,0.0
497,10497.0,0.144444,1.0,0.0
551,10551.0,0.090909,1.0,0.0


In [None]:
df1.to_csv(datapath + '09.12-t5Predicted_Ans_Score_E2.csv')

In [None]:
# Mean of every metric column. Ques_Id is an identifier, not a score, so it is
# left out; the other columns are whatever metrics df1 currently holds.
df1.drop(columns=['Ques_Id']).mean()

BLEU_Score    0.206810
GLUE_Acc      1.000000
GLUE_F1       0.004529
dtype: float64

## Embedding/Vectorizing Predicted Answer

Unnamed: 0.1,Unnamed: 0,Ques_Id,Ref_Answer,T5Pred_Answer
0,0,10000,The Muni wandered over the earth and wept loud...,The Muni wandered over the earth and weeped lo...
1,1,10001,The Muni asked for a bride from the creatures ...,The Muni asked for a bride from his mother who...
2,2,10002,The snakes that had been set upon Jaratkaru's ...,The king of the sacrificial spheres informed V...
3,3,10003,"Vasuki took a maiden, who was decked with orna...",Vasuki smiled and thanked the Muni for his kin...
4,4,10004,The Rishi hesitated to accept the maiden becau...,The Rishi feared that the maiden would be beat...
...,...,...,...,...
1099,1099,11099,"Duryodhana asked for troops, while Arjuna requ...",Duryodhana asked Arjuna to provide assistance ...
1100,1100,11100,Sanjaya went as an envoy to the Pandavas from ...,"He was a virtuous man, a virtuous man, and"
1101,1101,11101,The name of the parva that describes the estab...,The name of the parva that describes the estab...
1102,1102,11102,The princes confined in the mountain-pass by J...,The princes confined in the mountain-pass by J...


# Calculate and Save Metrics -2

In [None]:

def calculate_cosine(row):
  """Cosine similarity between a row's predicted and reference answer vectors.

  Both ``row['T5_AnsVector']`` and ``row['AnsVector']`` are strings holding a
  bracketed, comma-separated list of numbers (e.g. ``"[0.1, 0.2]"``).

  Returns:
      The similarity as a Python float rounded to three decimals.
  """
  # Drop the surrounding brackets and cut each string at the commas.
  pred_fields = row['T5_AnsVector'].strip('[]').split(',')
  ref_fields = row['AnsVector'].strip('[]').split(',')

  # Text -> floats.
  pred_numbers = list(map(float, pred_fields))
  ref_numbers = list(map(float, ref_fields))

  # Shape (1, n) so cosine_similarity compares the two vectors along dim 1.
  pred_vec = torch.tensor(pred_numbers).reshape(1, -1)
  ref_vec = torch.tensor(ref_numbers).reshape(1, -1)

  similarity = F.cosine_similarity(pred_vec, ref_vec)
  return round(similarity.item(), 3)
