# Load Configurations & Libraries

In [None]:
# If we run this machine on GPU machine it will be faster.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
apipath = r'H:\\My Drive\\config\\hbqa.txt'
apipath = r'/content/drive/MyDrive/config/hbqa-colab.txt'
import configparser

config = configparser.ConfigParser()
config.read(apipath)
secret_key = config['global']['OPENAI_API_KEY']
datapath = config['global']['DATA_FOLDER']
corpuspath = config['global']['CORPUS_FOLDER']
PINECONE_API_KEY = config['global']['PINECONE_API_KEY']
PINECONE_ENV = config['global']['PINECONE_ENV']

# PINECONE is Vector Database. To store the vector so that we can quickly search the vector space.
# https://app.pinecone.io
# get PINECONE_API_KEY key from app.pinecone.io
# find your PINECONE_ENVIRONMENT next to the api key in pinecone console

### Load Embedding Model. It will create 384 dim vector for the sentences.

In [None]:
!pip install -Uq sentence-transformers

In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Predict Document Id from Question

In [9]:
df_qa = pd.read_csv(datapath + '06-HBQA_Manual_with_Chunk.csv')

In [10]:
df_qa.head(2)

Unnamed: 0,Ques_Id,Chunk_Id,Section_Id,Question,Ref_Answer,Chunk,Reference,WordsInQues,WordsInAns
0,10000,10193,Book01_046,Why did the Muni wander over the earth and wee...,The Muni wandered over the earth and wept loud...,"Sauti continued, 'The Muni, having said so unt...","""'O ye, being directed by my ancestors, I am r...",14,25
1,10001,10193,Book01_046,Who did the Muni ask for a bride and why?,The Muni asked for a bride from the creatures ...,"Sauti continued, 'The Muni, having said so unt...","""My ancestors, afflicted with grief, have dire...",10,26


In [11]:
# for i in df_qa.index:
df_pred = pd.DataFrame(columns = ['Ques_Id','Chunk_Id','Pred_Chunk_Id'])

for i in df_qa.index:
  query = df_qa.loc[i,'Question']
  query_enc = model.encode(query).tolist()
  topk=10
  query_results = index.query(query_enc, top_k=topk, include_metadata=True)

  predictedId=[]
  for predid in query_results['matches']:
    predictedId.append(predid['id'])

  df_pred.loc[i]= (df_qa.loc[i,'Ques_Id'], df_qa.loc[i,'Chunk_Id'], predictedId)
  print('Predicting: ',i)
  # break



Predicting:  0
Predicting:  1
Predicting:  2
Predicting:  3
Predicting:  4
Predicting:  5
Predicting:  6
Predicting:  7
Predicting:  8
Predicting:  9
Predicting:  10
Predicting:  11
Predicting:  12
Predicting:  13
Predicting:  14
Predicting:  15
Predicting:  16
Predicting:  17
Predicting:  18
Predicting:  19
Predicting:  20
Predicting:  21
Predicting:  22
Predicting:  23
Predicting:  24
Predicting:  25
Predicting:  26
Predicting:  27
Predicting:  28
Predicting:  29
Predicting:  30
Predicting:  31
Predicting:  32
Predicting:  33
Predicting:  34
Predicting:  35
Predicting:  36
Predicting:  37
Predicting:  38
Predicting:  39
Predicting:  40
Predicting:  41
Predicting:  42
Predicting:  43
Predicting:  44
Predicting:  45
Predicting:  46
Predicting:  47
Predicting:  48
Predicting:  49
Predicting:  50
Predicting:  51
Predicting:  52
Predicting:  53
Predicting:  54
Predicting:  55
Predicting:  56
Predicting:  57
Predicting:  58
Predicting:  59
Predicting:  60
Predicting:  61
Predicting:  62
Pr

In [50]:
df_pred.to_csv(datapath + '08.1-Predict-DocumentId-for-Ques.csv')

In [12]:
df_pred.head(3)

Unnamed: 0,Ques_Id,Chunk_Id,Pred_Chunk_Id
0,10000,10193,"[id10140, id10143, id130, id60, id186, id10130..."
1,10001,10193,"[id10193, id89, id61, id90, id10396, id85, id3..."
2,10002,10193,"[id162, id40, id10435, id61, id10753, id216, i..."


In [14]:
def check_id_in_1pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:1])

def check_id_in_2pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:2])

def check_id_in_3pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:3])

def check_id_in_4pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:4])

def check_id_in_5pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'][0:5])

def check_id_in_10pred(row):
    return int('id' + str(row['Chunk_Id']) in row['Pred_Chunk_Id'])


# Create a new column 'MRR' based on the condition
df_pred['FoundIn@1'] = df_pred.apply(check_id_in_1pred, axis=1)
df_pred['FoundIn@2'] = df_pred.apply(check_id_in_2pred, axis=1)
df_pred['FoundIn@3'] = df_pred.apply(check_id_in_3pred, axis=1)
df_pred['FoundIn@4'] = df_pred.apply(check_id_in_4pred, axis=1)
df_pred['FoundIn@5'] = df_pred.apply(check_id_in_5pred, axis=1)
df_pred['FoundIn@10'] = df_pred.apply(check_id_in_10pred, axis=1)


In [15]:
df_pred.sample(5)

Unnamed: 0,Ques_Id,Chunk_Id,Pred_Chunk_Id,FoundIn@1,FoundIn@2,FoundIn@3,FoundIn@4,FoundIn@5,FoundIn@10
166,10166,10320,"[id10320, id67, id47, id79, id70, id10375, id1...",1,1,1,1,1,1
605,10605,10595,"[id10595, id127, id10235, id10237, id10602, id...",1,1,1,1,1,1
802,10802,10396,"[id10396, id85, id10400, id61, id86, id89, id1...",1,1,1,1,1,1
271,10271,10328,"[id119, id10557, id10315, id25, id105, id10270...",0,0,0,0,0,0
622,10622,10636,"[id292, id179, id26, id10133, id19, id10235, i...",0,0,0,0,0,0


In [16]:
df_pred[['FoundIn@1','FoundIn@2',	'FoundIn@3','FoundIn@4',	'FoundIn@5',	'FoundIn@10']].mean()

FoundIn@1     0.323370
FoundIn@2     0.443841
FoundIn@3     0.509964
FoundIn@4     0.550725
FoundIn@5     0.595109
FoundIn@10    0.687500
dtype: float64

In [22]:
# Number of relevant items in the top K/K

df_pred['P@1'] = df_pred['FoundIn@1']
df_pred['P@2'] = (df_pred['FoundIn@2']/2).apply(lambda x: round(x, 2))
df_pred['P@3'] = (df_pred['FoundIn@3']/3).apply(lambda x: round(x, 2))
df_pred['P@4'] = (df_pred['FoundIn@3']/4).apply(lambda x: round(x, 2))
df_pred['P@5'] = (df_pred['FoundIn@5']/5).apply(lambda x: round(x, 2))
df_pred['P@10'] = (df_pred['FoundIn@10']/10).apply(lambda x: round(x, 2))


In [23]:
# Number of Relevant Items in the Top K/ Total Number of Relevant Items in the Collection

df_pred['R@1'] = df_pred['FoundIn@1']/1
df_pred['R@2'] = df_pred['FoundIn@2']/1
df_pred['R@3'] = df_pred['FoundIn@3']/1
df_pred['R@4'] = df_pred['FoundIn@4']/1
df_pred['R@5'] = df_pred['FoundIn@5']/1
df_pred['R@10'] = df_pred['FoundIn@10']/1

In [45]:
df_pred['F1@1']   = ( (2*df_pred['P@1']*df_pred['R@1'])/ ( df_pred['P@1']+df_pred['R@1'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@2']   = ( (2*df_pred['P@2']*df_pred['R@2'])/ ( df_pred['P@2']+df_pred['R@2'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@3']   = ( (2*df_pred['P@3']*df_pred['R@3'])/ ( df_pred['P@3']+df_pred['R@3'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@4']   = ( (2*df_pred['P@4']*df_pred['R@4'])/ ( df_pred['P@4']+df_pred['R@4'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@5']   = ( (2*df_pred['P@5']*df_pred['R@5'])/ ( df_pred['P@5']+df_pred['R@5'])).apply(lambda x: round(x, 2)).fillna(0)
df_pred['F1@10'] =  ( (2*df_pred['P@10']*df_pred['R@10'])/ ( df_pred['P@10']+df_pred['R@10'])).apply(lambda x: round(x, 2)).fillna(0)

In [36]:
# Mean Average Precision (MAP) is a metric used to evaluate the performance of information retrieval systems,
# particularly in scenarios where there are multiple queries and each query can have multiple relevant documents.
# It's a measure of how well a retrieval system ranks and retrieves relevant documents across different queries.
def calc_MAP(row):
  return round((row['P@1']+row['P@2']+row['P@3']+row['P@4']+row['P@5'])/5,2)

df_pred['MAP'] = df_pred.apply(calc_MAP, axis=1)


In [51]:
# Mean Reciprocal Rank (MRR) is a metric commonly used to evaluate the performance of ranking-based retrieval systems,
# such as search engines. MRR assesses how well a retrieval system ranks relevant items by considering the position of the
# first relevant item in the ranked list. It's calculated as the average of the reciprocal ranks across a set of queries.

def getReciprocalRank(row):
  K=1 # Number of relevent document
  try:
    i = row['Pred_Chunk_Id'].index('id'+str(row['Chunk_Id']) )
  except:
    i=-1
  if i==-1:
    RR=0
  else:
    RR = round(1/(i + 1),2)
  return RR

df_pred['RR'] = df_pred.apply(getReciprocalRank, axis=1)

In [52]:
df_pred[['Ques_Id',
    'P@1', 'P@2', 'P@3', 'P@4', 'P@5', 'P@10',
    'R@1', 'R@2', 'R@3', 'R@4', 'R@5', 'R@10',
    'F1@1', 'F1@2', 'F1@3', 'F1@4', 'F1@5', 'F1@10',
    'MAP', 'RR',]]

Unnamed: 0,Ques_Id,P@1,P@2,P@3,P@4,P@5,P@10,R@1,R@2,R@3,...,R@5,R@10,F1@1,F1@2,F1@3,F1@4,F1@5,F1@10,MAP,RR
0,10000,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0
1,10001,1,0.5,0.33,0.25,0.2,0.1,1.0,1.0,1.0,...,1.0,1.0,1.0,0.67,0.5,0.4,0.33,0.18,0.46,1.0
2,10002,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0
3,10003,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0
4,10004,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,11099,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0
1100,11100,0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0
1101,11101,1,0.5,0.33,0.25,0.2,0.1,1.0,1.0,1.0,...,1.0,1.0,1.0,0.67,0.5,0.4,0.33,0.18,0.46,1.0
1102,11102,0,0.0,0.00,0.00,0.2,0.1,0.0,0.0,0.0,...,1.0,1.0,0.0,0.00,0.0,0.0,0.33,0.18,0.04,0.2


In [53]:
df_pred[[
    'P@1', 'P@2', 'P@3', 'P@4', 'P@5', 'P@10',
    'R@1', 'R@2', 'R@3', 'R@4', 'R@5', 'R@10',
    'F1@1', 'F1@2', 'F1@3', 'F1@4', 'F1@5', 'F1@10',
    'MAP', 'RR',]].mean()

P@1      0.323370
P@2      0.221920
P@3      0.168288
P@4      0.127491
P@5      0.119022
P@10     0.068750
R@1      0.323370
R@2      0.443841
R@3      0.509964
R@4      0.550725
R@5      0.595109
R@10     0.687500
F1@1     0.323370
F1@2     0.297373
F1@3     0.254982
F1@4     0.203986
F1@5     0.196386
F1@10    0.123750
MAP      0.194058
RR       0.436920
dtype: float64

In [55]:
df_pred.to_csv(datapath + '08.2-Predict-DocumentId-for-Ques-Metrics.csv')