# Load Configurations & Libraries

In [None]:
# If we run this machine on GPU machine it will be faster.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

In [5]:
apipath = r'H:\\My Drive\\config\\hbqa.txt'
# apipath = r'/content/drive/MyDrive/config/hbqa-colab.txt'
import configparser

config = configparser.ConfigParser()
config.read(apipath)

datapath = config['global']['DATA_FOLDER']
corpuspath = config['global']['CORPUS_FOLDER']
corpus_sectionpath = config['global']['CORPUS_FOLDER_SECTIONS']
OPENAI_KEY = config['global']['OPENAI_KEY']
PINECONE_KEY = config['global']['PINECONE_KEY']
PINECONE_ENV = config['global']['PINECONE_ENV']
CHATPDF_KEY = config['global']['CHATPDF_KEY']

# PINECONE is Vector Database. To store the vector so that we can quickly search the vector space.
# https://app.pinecone.io
# get PINECONE_API_KEY key from app.pinecone.io
# find your PINECONE_ENVIRONMENT next to the api key in pinecone console

### Load Embedding Model. It will create 384 dim vector for the sentences.

In [None]:
# !pip install -Uq sentence-transformers

In [6]:
from sentence_transformers import SentenceTransformer

embmodel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Predict Document Id from Question

In [33]:
df_qa = pd.read_csv(datapath + '06-HBQA_Manual_with_Chunk.csv')

In [34]:
df_qa.head(2)

Unnamed: 0,Ques_Id,Chunk_Id,Section_Id,Question,Ref_Answer,Chunk,Reference,WordsInQues,WordsInAns,WordsInRef,WordsInChunk
0,0,389,Book03_002,What is the significance of performing the Agn...,Performing the Agnihotra is considered importa...,Even this is eternal morality. They that perfo...,The significance of the Agnihotra and the cons...,16,50,50,809
1,1,390,Book03_003,"What predicament does Yudhishthira face, and h...",Yudhishthira faces the predicament of being un...,"Section III\n""Vaisampayana said, 'Yudhishthira...",Yudhishthira's predicament and his consultatio...,14,41,53,852


In [35]:
import pinecone
index_name = 'hbqa'
# now connect to the index
index = pinecone.Index(index_name)

In [36]:
# for i in df_qa.index:
df_pred = pd.DataFrame(columns = ['Ques_Id','Chunk_Id','Pred_Chunk_Id'])

for i in df_qa.index:
  query = df_qa.loc[i,'Question']
  query_enc = embmodel.encode(query).tolist()
  topk=10
  query_results = index.query(query_enc, top_k=topk, include_metadata=True)

  predictedId=[]
  for predid in query_results['matches']:
    predictedId.append(predid['id'])

  df_pred.loc[i]= (df_qa.loc[i,'Ques_Id'], df_qa.loc[i,'Chunk_Id'], predictedId)
  print('Predicting: ',i)
  # break



Predicting:  0
Predicting:  1
Predicting:  2
Predicting:  3
Predicting:  4
Predicting:  5
Predicting:  6
Predicting:  7
Predicting:  8
Predicting:  9
Predicting:  10
Predicting:  11
Predicting:  12
Predicting:  13
Predicting:  14
Predicting:  15
Predicting:  16
Predicting:  17
Predicting:  18
Predicting:  19
Predicting:  20
Predicting:  21
Predicting:  22
Predicting:  23
Predicting:  24
Predicting:  25
Predicting:  26
Predicting:  27
Predicting:  28
Predicting:  29
Predicting:  30
Predicting:  31
Predicting:  32
Predicting:  33
Predicting:  34
Predicting:  35
Predicting:  36
Predicting:  37
Predicting:  38
Predicting:  39
Predicting:  40
Predicting:  41
Predicting:  42
Predicting:  43
Predicting:  44
Predicting:  45
Predicting:  46
Predicting:  47
Predicting:  48
Predicting:  49
Predicting:  50
Predicting:  51
Predicting:  52
Predicting:  53
Predicting:  54
Predicting:  55
Predicting:  56
Predicting:  57
Predicting:  58
Predicting:  59
Predicting:  60
Predicting:  61
Predicting:  62
Pr

In [37]:
df_pred.to_csv(datapath + '08.1-Predict-DocumentId-for-Ques.csv')

In [38]:
df_pred.head(3)

Unnamed: 0,Ques_Id,Chunk_Id,Pred_Chunk_Id
0,0,389,"[id500, id1338, id289, id511, id669, id586, id..."
1,1,390,"[id2825, id809, id390, id1855, id779, id726, i..."
2,2,390,"[id557, id783, id415, id1898, id2825, id390, i..."


In [39]:
def check_id_in_1pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:1])

def check_id_in_2pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:2])

def check_id_in_3pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:3])

def check_id_in_4pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:4])

def check_id_in_5pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:5])

def check_id_in_10pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'])


# Create a new column 'MRR' based on the condition
df_pred['FoundIn@1'] = df_pred.apply(check_id_in_1pred, axis=1)
df_pred['FoundIn@2'] = df_pred.apply(check_id_in_2pred, axis=1)
df_pred['FoundIn@3'] = df_pred.apply(check_id_in_3pred, axis=1)
df_pred['FoundIn@4'] = df_pred.apply(check_id_in_4pred, axis=1)
df_pred['FoundIn@5'] = df_pred.apply(check_id_in_5pred, axis=1)
df_pred['FoundIn@10'] = df_pred.apply(check_id_in_10pred, axis=1)


In [40]:
df_pred.sample(5)

Unnamed: 0,Ques_Id,Chunk_Id,Pred_Chunk_Id,FoundIn@1,FoundIn@2,FoundIn@3,FoundIn@4,FoundIn@5,FoundIn@10
191,191,467,"[id467, id1714, id1375, id814, id804, id847, i...",1,1,1,1,1,1
77,77,414,"[id332, id448, id387, id416, id415, id417, id7...",0,0,0,0,0,0
629,629,644,"[id1543, id690, id557, id605, id783, id602, id...",0,0,0,0,0,0
379,379,524,"[id766, id473, id524, id393, id736, id459, id2...",0,0,1,1,1,1
444,444,544,"[id544, id784, id471, id1108, id1756, id393, i...",1,1,1,1,1,1


In [41]:
df_pred[['FoundIn@1','FoundIn@2',	'FoundIn@3','FoundIn@4',	'FoundIn@5',	'FoundIn@10']].mean()

FoundIn@1     0.310345
FoundIn@2     0.404719
FoundIn@3     0.464610
FoundIn@4     0.501815
FoundIn@5     0.539020
FoundIn@10    0.624319
dtype: float64

In [42]:
# Number of relevant items in the top K/K

df_pred['P@1'] = df_pred['FoundIn@1']
df_pred['P@2'] = (df_pred['FoundIn@2']/2).apply(lambda x: round(x, 2))
df_pred['P@3'] = (df_pred['FoundIn@3']/3).apply(lambda x: round(x, 2))
df_pred['P@4'] = (df_pred['FoundIn@3']/4).apply(lambda x: round(x, 2))
df_pred['P@5'] = (df_pred['FoundIn@5']/5).apply(lambda x: round(x, 2))
df_pred['P@10'] = (df_pred['FoundIn@10']/10).apply(lambda x: round(x, 2))


In [43]:
# Number of Relevant Items in the Top K/ Total Number of Relevant Items in the Collection

df_pred['R@1'] = df_pred['FoundIn@1']/1
df_pred['R@2'] = df_pred['FoundIn@2']/1
df_pred['R@3'] = df_pred['FoundIn@3']/1
df_pred['R@4'] = df_pred['FoundIn@4']/1
df_pred['R@5'] = df_pred['FoundIn@5']/1
df_pred['R@10'] = df_pred['FoundIn@10']/1

In [44]:
df_pred['F1@1']   = ( (2*df_pred['P@1']*df_pred['R@1'])/ ( df_pred['P@1']+df_pred['R@1'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@2']   = ( (2*df_pred['P@2']*df_pred['R@2'])/ ( df_pred['P@2']+df_pred['R@2'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@3']   = ( (2*df_pred['P@3']*df_pred['R@3'])/ ( df_pred['P@3']+df_pred['R@3'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@4']   = ( (2*df_pred['P@4']*df_pred['R@4'])/ ( df_pred['P@4']+df_pred['R@4'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@5']   = ( (2*df_pred['P@5']*df_pred['R@5'])/ ( df_pred['P@5']+df_pred['R@5'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@10'] =  ( (2*df_pred['P@10']*df_pred['R@10'])/ ( df_pred['P@10']+df_pred['R@10'])).apply(lambda x: round(x, 2)).fillna(0)

In [45]:
# Mean Average Precision (MAP) is a metric used to evaluate the performance of information retrieval systems,
# particularly in scenarios where there are multiple queries and each query can have multiple relevant documents.
# It's a measure of how well a retrieval system ranks and retrieves relevant documents across different queries.
def calc_MAP(row):
  return round((row['P@1']+row['P@2']+row['P@3']+row['P@4']+row['P@5'])/5,2)

df_pred['MAP'] = df_pred.apply(calc_MAP, axis=1)


In [46]:
# Mean Reciprocal Rank (MRR) is a metric commonly used to evaluate the performance of ranking-based retrieval systems,
# such as search engines. MRR assesses how well a retrieval system ranks relevant items by considering the position of the
# first relevant item in the ranked list. It's calculated as the average of the reciprocal ranks across a set of queries.

def getReciprocalRank(row):
  K=1 # Number of relevent document
  try:
    i = row['Pred_Chunk_Id'].index('id'+str(row['Chunk_Id']) )
  except:
    i=-1
  if i==-1:
    RR=0
  else:
    RR = round(1/(i + 1),2)
  return RR

df_pred['RR'] = df_pred.apply(getReciprocalRank, axis=1)

In [47]:
df_pred[['Ques_Id',
    'P@1', 'P@2', 'P@3', 'P@4', 'P@5', 'P@10',
    'R@1', 'R@2', 'R@3', 'R@4', 'R@5', 'R@10',
    'F1@1', 'F1@2', 'F1@3', 'F1@4', 'F1@5', 'F1@10',
    'MAP', 'RR',]]

Unnamed: 0,Ques_Id,P@1,P@2,P@3,P@4,P@5,P@10,R@1,R@2,R@3,...,R@5,R@10,F1@1,F1@2,F1@3,F1@4,F1@5,F1@10,MAP,RR
0,0,0,0.0,0.00,0.00,0.0,0.1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.00,0.0,0.0,0.00,0.18,0.00,0.14
1,1,0,0.0,0.33,0.25,0.2,0.1,0.0,0.0,1.0,...,1.0,1.0,0.0,0.00,0.5,0.4,0.33,0.18,0.16,0.33
2,2,0,0.0,0.00,0.00,0.0,0.1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.00,0.0,0.0,0.00,0.18,0.00,0.17
3,3,1,0.5,0.33,0.25,0.2,0.1,1.0,1.0,1.0,...,1.0,1.0,1.0,0.67,0.5,0.4,0.33,0.18,0.46,1.00
4,4,0,0.0,0.00,0.00,0.2,0.1,0.0,0.0,0.0,...,1.0,1.0,0.0,0.00,0.0,0.0,0.33,0.18,0.04,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,1097,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00
1098,1098,0,0.5,0.33,0.25,0.2,0.1,0.0,1.0,1.0,...,1.0,1.0,0.0,0.67,0.5,0.4,0.33,0.18,0.26,0.50
1099,1099,0,0.5,0.33,0.25,0.2,0.1,0.0,1.0,1.0,...,1.0,1.0,0.0,0.67,0.5,0.4,0.33,0.18,0.26,0.50
1100,1100,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00


In [48]:
df_pred[[
    'P@1', 'P@2', 'P@3', 'P@4', 'P@5', 'P@10',
    'R@1', 'R@2', 'R@3', 'R@4', 'R@5', 'R@10',
    'F1@1', 'F1@2', 'F1@3', 'F1@4', 'F1@5', 'F1@10',
    'MAP', 'RR',]].mean()

P@1      0.310345
P@2      0.202359
P@3      0.153321
P@4      0.116152
P@5      0.107804
P@10     0.062432
R@1      0.310345
R@2      0.404719
R@3      0.464610
R@4      0.501815
R@5      0.539020
R@10     0.624319
F1@1     0.310345
F1@2     0.271162
F1@3     0.232305
F1@4     0.185844
F1@5     0.177877
F1@10    0.112377
MAP      0.179855
RR       0.405762
dtype: float64

In [30]:
df_pred.to_csv(datapath + '08.2-Predict-DocumentId-for-Ques-Metrics.csv')