In [1]:
import requests
from openai import OpenAI
import pandas as pd
import re

In [2]:
import json
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [3]:
# 1. import testing Question dataset [Q]
with open("math_qa_all.json", "r") as qf:
    data = json.load(qf)
    page_content = [q['content'] for q in data]
    page_number = [q['page'] for q in data]
    questions: list = [q['Q'] for q in data]
    answers: list = [q['A'] for q in data]

In [4]:
query_prompt_name = "s2p_query"
model_name = 'dunzhang/stella_en_1.5B_v5'
model = SentenceTransformer(model_name)

In [5]:
page_embeddings: np.ndarray = model.encode(page_content) 
questions_embeddings: np.ndarray  = model.encode(questions, prompt_name=query_prompt_name)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [6]:
for i in range(len(data)):
    data[i]['page_embedding'] = page_embeddings[i]
    data[i]['question_embedding'] = questions_embeddings[i]

In [7]:
for i in range(len(data)):
    similarity = [float(cos_sim(data[i]['question_embedding'], page_embed)) for page_embed in page_embeddings]
    question_retrived_page_rank = [{k: v} for k, v in zip(page_number, similarity)]
    data[i]['question_retrived_page_rank'] = question_retrived_page_rank

In [8]:
for i in range(len(data)):
    data[i]['question_retrived_page_rank'] = sorted(data[i]['question_retrived_page_rank'], key = lambda d: list(d.values())[0], reverse=True)

In [9]:
def calculate_f1(string1, string2):
    set1 = set(string1.split())
    set2 = set(string2.split())
    tp = len(set1 & set2)
    precision = tp / len(set2) if set2 else 0
    recall = tp / len(set1) if set1 else 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [10]:
Error_L = []
F1_L = []
Errors = 0
page_check = 0
tokens = 0
top_k = 3

for d in data:
    query = d['Q']
    answer = d['A']
    retrived_pages = ""
    retrived_indexes = d['question_retrived_page_rank'][:top_k]
    retrived_indexes = [list(retrived_indexes[u].keys())[0] for u in range(len(retrived_indexes))]
    
    for page_num in retrived_indexes:
        for d in data:
            if str(page_num) == str(d['page']):
                retrived_pages += d['content']
                break  # Stop checking other keys for this dictionary

    try:
        client = OpenAI(api_key='OPENAI_API_KEY_PLACEHOLDER')
        r = client.chat.completions.create(
            messages=[
                {
                "role": "system", "content": "You are a math expert",
                "content": "user", "content": "I am giving you a question and a text content. Provide a short answer for the question based on context I gave you. Say nothing else. Context:" + retrived_pages + ", Question:" + query
                }
            ],
            model="gpt-4o-mini",
        )
        res = r.choices[0].message.content
        f1 = calculate_f1(d['A'], res)
        print(f1)
        F1_L.append(f1)
        usage = r.usage
        tokens += usage.total_tokens
        print(tokens)
        
    except Exception as e:
        Errors += 1
        Error_L.append(i)
        print(e)
        
    print(f"""Question for page {d['page']}""")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0
128
Question for page 130
0
262
Question for page 130
0.25
1922
Question for page 286
0
2026
Question for page 422
0.13333333333333333
2198
Question for page 422
0.18181818181818182
2317
Question for page 422
0.07142857142857142
2444
Question for page 422
0.2857142857142857
4654
Question for page 27
0.17857142857142855
7080
Question for page 260
0.15384615384615383
9449
Question for page 362
0.08
11112
Question for page 199
0.19354838709677422
13314
Question for page 284
0.10909090909090909
13460
Question for page 40
0.34285714285714286
15346
Question for page 35
0.08695652173913043
17177
Question for page 39
0
17340
Question for page 422
0.1568627450980392
17533
Question for page 422
0.12765957446808512
19298
Question for page 374
0
19448
Question for page 130
0.06896551724137931
19558
Question for page 422
0.06451612903225808
19702
Question for page 422
0
19824
Question for page 130
0
19964
Question for page 130
0.21428571428571427
22177
Question for page 284
0.07843137254901962
24

In [14]:
len(F1_L)

498

In [15]:
print("Total QAs:", len(questions))
# print("Percentage of page match:", page_check / len(questions))
print("Errors Percentage:", Errors / len(questions))
print("Average F1-Score", sum(F1_L) / len(F1_L))
print("Tokens Completion",tokens)

Total QAs: 498
Errors Percentage: 0.0
Average F1-Score 0.1998151789023536
Tokens Completion 976744
