In [1]:
import json
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from sentence_transformers.util import cos_sim
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# %pip install datasets
# %cd drive/MyDrive/LearnLabTest

Mounted at /content/drive


In [6]:
# 1. import testing Question dataset [Q]
with open("LearnLab/math_qa_all_v1.json", "r") as qf:
    data = json.load(qf)
    page_content = [q['content'] for q in data]
    page_number = [q['page'] for q in data]
    questions: list = [q['Q'] for q in data]
    answers: list = [q['A'] for q in data]

In [4]:
# model_name = 'nvidia/NV-Embed-v2'
# model = SentenceTransformer(model_name, trust_remote_code=True)
model = SentenceTransformer(model_name_or_path="./nvidia/NV-Embed-v2", local_files_only=True, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
def add_eos(input_examples):
  input_examples = [input_example + model.tokenizer.eos_token for input_example in input_examples]
  return input_examples

In [8]:
instruction = "Instruct: "+ "Given a question, retrieve page that answer the question" + "\nQuery: "
batch_size = 10
questions_embeddings = model.encode(add_eos(questions), batch_size=batch_size, prompt=instruction, normalize_embeddings=True)
page_embeddings = model.encode(add_eos(page_content), batch_size=batch_size, normalize_embeddings=True)

# page_embeddings: np.ndarray = model.encode(page_content)
# questions_embeddings: np.ndarray  = model.encode(questions, prompt_name=instruction)

  self.gen = func(*args, **kwds)


In [9]:
for i in range(len(data)):
    data[i]['page_embedding'] = page_embeddings[i]
    data[i]['question_embedding'] = questions_embeddings[i]

In [10]:
# question_retrived_page_rank = []
for i in range(len(data)):
    similarity = [float(cos_sim(data[i]['question_embedding'], page_embed)) for page_embed in page_embeddings]
    question_retrived_page_rank = [{k: v} for k, v in zip(page_number, similarity)]
    data[i]['question_retrived_page_rank'] = question_retrived_page_rank

In [11]:
for i in range(len(data)):
    data[i]['question_retrived_page_rank'] = sorted(data[i]['question_retrived_page_rank'], key = lambda d: list(d.values())[0], reverse=True)

In [12]:
top1 = 0
top3 = 0
top5 = 0
top10 = 0
top20 = 0
top50 = 0
top100 = 0

page_distances = [] # the distances of the correct page and retrieved page
scores = [] # the score of the corresponding page and question
correct_rank = [] # the index of the correct page in the retrieved results.

for i in range(len(data)):
    page_number = data[i]['page']
    retrived_page_rank = [list(data[i]['question_retrived_page_rank'][d].keys())[0] for d in range(len(data[i]['question_retrived_page_rank']))]
    correct_page_rank = retrived_page_rank.index(page_number)
    correct_rank.append(correct_page_rank)
    page_distances.append(abs(page_number - retrived_page_rank[0]))
    scores.append(cos_sim(data[i]['question_embedding'], data[i]['page_embedding']))

    if page_number in retrived_page_rank[:1]: top1 += 1
    if page_number in retrived_page_rank[:3]: top3 += 1
    if page_number in retrived_page_rank[:5]: top5 += 1
    if page_number in retrived_page_rank[:10]: top10 += 1
    if page_number in retrived_page_rank[:20]: top20 += 1
    if page_number in retrived_page_rank[:50]: top50 += 1
    if page_number in retrived_page_rank[:100]: top100 += 1

In [13]:
print(top1)
print(top3)
print(top5)
print(top10)
print(top20)
print(top50)
print(top100)

279
402
435
460
470
475
476


In [None]:
print("Avg score: " + str(float(sum(scores) / len(scores))))
print("Avg rank: " + str(sum(correct_rank) / len(correct_rank)))
print("Avg page distance: " + str(sum(page_distances)/len(page_distances)))

Avg score: 0.6062510013580322
Avg rank: 23.210843373493976
Avg page distance: 51.20281124497992


# Generation

In [1]:
import requests
from openai import OpenAI
import pandas as pd
import re

In [2]:
def calculate_f1(string1, string2):
    set1 = set(string1.split())
    set2 = set(string2.split())
    tp = len(set1 & set2)
    precision = tp / len(set2) if set2 else 0
    recall = tp / len(set1) if set1 else 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [3]:
# # data[0]['content']
# # data[0]['page']
# # page_num = list(data[0]['question_retrived_page_rank'][0].keys())[0]
# test = ""
# retrived_indexes = data[0]['question_retrived_page_rank'][:3]
# retrived_indexes = sorted([list(retrived_indexes[u].keys())[0] for u in range(len(retrived_indexes))])
# retrived_indexes

# for x in data:
#     if(int(x['page']) in retrived_indexes):
#         test += f"page {int(x['page'])}: " + x['content'] + "\n"

In [4]:
import pickle

# Save the object to a file
# with open('nvidia_NV-Embed-v2.pkl', 'wb') as f:
#     pickle.dump(data, f)

# # Load the object from the file
with open('nvidia_NV-Embed-v2.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
Error_L = []
F1_L = []
Errors = 0
page_check = 0
tokens = 0
top_k = 5

for d in data:
    query = d['Q']
    answer = d['A']
    retrived_pages = ""
    retrived_indexes = d['question_retrived_page_rank'][:top_k]
    retrived_indexes = sorted([list(retrived_indexes[u].keys())[0] for u in range(len(retrived_indexes))])
    
    for x in data:
        if(int(x['page']) in retrived_indexes):
            retrived_pages += f"page {int(x['page'])}: " + x['content'] + "\n"

    try:
        client = OpenAI(api_key='OPENAI_API_KEY_PLACEHOLDER')
        r = client.chat.completions.create(
            messages=[
                {
                "role": "system", "content": "You are a math expert",
                "content": "user", "content": "I am giving you a question and a text content. Provide a short answer for the question based on context I gave you. Say nothing else. Context:" + retrived_pages + ", Question:" + query
                }
            ],
            model="gpt-4o-2024-11-20",
        )
        res = r.choices[0].message.content
        f1 = calculate_f1(d['A'], res)
        F1_L.append(f1)
        usage = r.usage
        tokens += usage.total_tokens

        print(f"retrived pages: {retrived_indexes}")
        # print(f"retrived_pages: {retrived_pages[:30]}")
        print(f"f1: {f1}")
        print(f"tokens {tokens}")
        
    except Exception as e:
        Errors += 1
        # Error_L.append(i)
        print(e)
        
    # print(f"""Question for page {d['page']}""")
    # break

retrived pages: [19, 20, 44, 47, 48]
f1: 0.4081632653061225
tokens 2655
retrived pages: [19, 20, 44, 100, 101]
f1: 0.13953488372093023
tokens 5721
retrived pages: [104, 105, 106, 115, 129]
f1: 0.8947368421052632
tokens 8338
retrived pages: [21, 22, 23, 239, 501]
f1: 0.8571428571428572
tokens 11053
retrived pages: [23, 24, 26, 302, 302]
f1: 0.4383561643835616
tokens 14777
retrived pages: [24, 37, 301, 301, 400]
f1: 0.5454545454545454
tokens 18602
retrived pages: [24, 25, 37, 301, 301]
f1: 0.41379310344827586
tokens 22232
retrived pages: [26, 27, 46, 258, 260]
f1: 0.7567567567567568
tokens 25735
retrived pages: [27, 258, 282, 284, 296]
f1: 0.7058823529411764
tokens 29388
retrived pages: [25, 30, 31, 173, 399]
f1: 0.5423728813559322
tokens 32535
retrived pages: [30, 31, 32, 278, 352]
f1: 0.4418604651162791
tokens 35791
retrived pages: [31, 32, 198, 199, 418]
f1: 0.8076923076923077
tokens 38750
retrived pages: [32, 34, 36, 39, 198]
f1: 0.25
tokens 41620
retrived pages: [34, 35, 36, 37, 39]

In [12]:
# print("Total QAs:", len(questions))
# print("Percentage of page match:", page_check / len(questions))
print("Errors Percentage:", Errors / len(F1_L))
print("Average F1-Score", sum(F1_L) / len(F1_L))
print("Tokens Completion",tokens)

Errors Percentage: 0.0
Average F1-Score 0.5209160713311258
Tokens Completion 1785448
