In [1]:
import json
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [2]:
# 1. import testing Question dataset [Q]
with open("../math_qa_all_v1.json", "r") as qf:
    data = json.load(qf)
    page_content = [q['content'] for q in data]
    page_number = [q['page'] for q in data]
    questions: list = [q['Q'] for q in data]
    answers: list = [q['A'] for q in data]

In [3]:
model_name = 'intfloat/multilingual-e5-large-instruct'
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

In [4]:
page_embeddings: np.ndarray = model.encode(page_content) 
questions_embeddings: np.ndarray  = model.encode(questions)

In [5]:
for i in range(len(data)):
    data[i]['page_embedding'] = page_embeddings[i]
    data[i]['question_embedding'] = questions_embeddings[i]

In [6]:
# question_retrived_page_rank = []
for i in range(len(data)):
    similarity = [float(cos_sim(data[i]['question_embedding'], page_embed)) for page_embed in page_embeddings]
    question_retrived_page_rank = [{k: v} for k, v in zip(page_number, similarity)]
    data[i]['question_retrived_page_rank'] = question_retrived_page_rank

In [7]:
for i in range(len(data)):
    data[i]['question_retrived_page_rank'] = sorted(data[i]['question_retrived_page_rank'], key = lambda d: list(d.values())[0], reverse=True)

In [8]:
top1 = 0
top3 = 0
top5 = 0
top10 = 0
top20 = 0
top50 = 0
top100 = 0

page_distances = [] # the distances of the correct page and retrieved page
scores = [] # the score of the corresponding page and question
correct_rank = [] # the index of the correct page in the retrieved results.

for i in range(len(data)):
    page_number = data[i]['page']
    retrived_page_rank = [list(data[i]['question_retrived_page_rank'][d].keys())[0] for d in range(len(data[i]['question_retrived_page_rank']))]
    correct_page_rank = retrived_page_rank.index(page_number)
    correct_rank.append(correct_page_rank)
    page_distances.append(abs(page_number - retrived_page_rank[0]))
    scores.append(cos_sim(data[i]['question_embedding'], data[i]['page_embedding']))
    
    if page_number in retrived_page_rank[:1]: top1 += 1
    if page_number in retrived_page_rank[:3]: top3 += 1
    if page_number in retrived_page_rank[:5]: top5 += 1
    if page_number in retrived_page_rank[:10]: top10 += 1
    if page_number in retrived_page_rank[:20]: top20 += 1
    if page_number in retrived_page_rank[:50]: top50 += 1
    if page_number in retrived_page_rank[:100]: top100 += 1   

In [9]:
print(top1)
print(top3)
print(top5)
print(top10)
print(top20)
print(top50)
print(top100)


218
335
379
415
446
461
471


In [10]:
print("Avg score: " + str(float(sum(scores) / len(scores))))
print("Avg rank: " + str(sum(correct_rank) / len(correct_rank)))
print("Avg page distance: " + str(sum(page_distances)/len(page_distances)))

Avg score: 0.8831433057785034
Avg rank: 6.591194968553459
Avg page distance: 24.832285115303982


In [11]:
import pickle

with open('multilingual.pkl', 'wb') as f:
    pickle.dump(data, f)