In [4]:
import requests
from openai import OpenAI
import pandas as pd
import re

In [8]:
import json
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [13]:
# 1. import testing Question dataset [Q]
with open("../math_qa_all_v1.json", "r") as qf:
    data = json.load(qf)
    page_content = [q['content'] for q in data]
    page_number = [q['page'] for q in data]
    questions: list = [q['Q'] for q in data]
    answers: list = [q['A'] for q in data]

In [6]:
model_name = 'thenlper/gte-large'
model = SentenceTransformer(model_name)

In [7]:
page_embeddings: np.ndarray = model.encode(page_content) 
questions_embeddings: np.ndarray  = model.encode(questions)

In [8]:
for i in range(len(data)):
    data[i]['page_embedding'] = page_embeddings[i]
    data[i]['question_embedding'] = questions_embeddings[i]

In [9]:
for i in range(len(data)):
    similarity = [float(cos_sim(data[i]['question_embedding'], page_embed)) for page_embed in page_embeddings]
    question_retrived_page_rank = [{k: v} for k, v in zip(page_number, similarity)]
    data[i]['question_retrived_page_rank'] = question_retrived_page_rank

In [10]:
for i in range(len(data)):
    data[i]['question_retrived_page_rank'] = sorted(data[i]['question_retrived_page_rank'], key = lambda d: list(d.values())[0], reverse=True)

In [17]:
import pickle
with open("gte_large.pkl", "rb") as f:
    data = pickle.load(f)

In [18]:
def calculate_f1(string1, string2):
    set1 = set(string1.split())
    set2 = set(string2.split())
    tp = len(set1 & set2)
    precision = tp / len(set2) if set2 else 0
    recall = tp / len(set1) if set1 else 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [22]:
Error_L = []
F1_L = []
Errors = 0
page_check = 0
tokens = 0
top_k = 5

for d in data:
    query = d['Q']
    answer = d['A']
    retrived_pages = ""
    retrived_indexes = d['question_retrived_page_rank'][:top_k]
    retrived_indexes = sorted([list(retrived_indexes[u].keys())[0] for u in range(len(retrived_indexes))])
    
    for x in data:
        if(int(x['page']) in retrived_indexes):
            retrived_pages += f"page {int(x['page'])}: " + x['content'] + "\n"
    
    try:
        client = OpenAI(api_key='OPENAI_API_KEY_PLACEHOLDER')
        r = client.chat.completions.create(
            messages=[
                {
                "role": "system", "content": "You are a math expert",
                "content": "user", "content": "I am giving you a question and a text content. Provide a short answer for the question based on context I gave you. Say nothing else. Context:" + retrived_pages + ", Question:" + query
                }
            ],
            model="gpt-4o-2024-11-20",
        )
        res = r.choices[0].message.content
        f1 = calculate_f1(d['A'], res)
        F1_L.append(f1)
        usage = r.usage
        tokens += usage.total_tokens

        print(f"retrived pages: {retrived_indexes}")
        # print(f"retrived_pages: {retrived_pages[:30]}")
        print(f"f1: {f1}")
        print(f"tokens {tokens}")
        
    except Exception as e:
        Errors += 1
        # Error_L.append(i)
        print(e)
        
    # print(f"""Question for page {d['page']}""")

retrived pages: [20, 48, 66, 67, 68]
f1: 0.5517241379310344
tokens 3131
retrived pages: [19, 20, 43, 44, 95]
f1: 0.14634146341463417
tokens 5593
retrived pages: [21, 105, 108, 129, 217]
f1: 0.4166666666666667
tokens 8059
retrived pages: [21, 22, 23, 425, 501]
f1: 0.9523809523809523
tokens 11282
retrived pages: [23, 24, 26, 302, 302]
f1: 0.4375
tokens 14992
retrived pages: [23, 24, 37, 400, 404]
f1: 0.27586206896551724
tokens 18808
retrived pages: [24, 25, 37, 61, 73]
f1: 0.4444444444444444
tokens 22163
retrived pages: [26, 45, 46, 258, 260]
f1: 0.7567567567567568
tokens 25653
retrived pages: [26, 27, 63, 260, 284]
f1: 0.8387096774193549
tokens 29469
retrived pages: [31, 61, 70, 260, 503]
f1: 0.5090909090909091
tokens 32679
retrived pages: [26, 31, 61, 116, 260]
f1: 0.5
tokens 35984
retrived pages: [31, 32, 62, 260, 373]
f1: 0.6808510638297872
tokens 39349
retrived pages: [34, 39, 73, 308, 308]
f1: 0.37500000000000006
tokens 42259
retrived pages: [35, 36, 39, 57, 60]
f1: 0.8965517241379

In [37]:
Errors

0

In [23]:
print("Total QAs:", 477)
# print("Percentage of page match:", page_check / len(questions))
print("Errors Percentage:", Errors / 477)
print("Average F1-Score", sum(F1_L) / len(F1_L))
print("Tokens Completion",tokens)

Total QAs: 477
Errors Percentage: 0.0
Average F1-Score 0.5095416556352046
Tokens Completion 1798895
