In [20]:
import requests
from openai import OpenAI
import pandas as pd
import re

In [21]:
df = pd.read_json('math_qa_all_v1.json')

questions = df['Q'].tolist()
answers = df['A'].tolist()
pages = df['page'].tolist()

print("Questions:", questions[0])
print("Answers:", answers[0])
print("page:", pages[0])
print(len(questions))

Questions: Q: What distinguishes a proposition from a non-proposition in the context of mathematical logic? 
Answers:  A: A proposition is a statement that can be assigned a truth value ('true' or 'false'), whereas a non-proposition cannot be assigned a truth value, such as sentences that are self-referential or nonsensical.
page: 19
477


In [22]:
from sklearn.metrics import f1_score

def calculate_f1(string1, string2):
    set1 = set(string1.split())
    set2 = set(string2.split())
    tp = len(set1 & set2)
    precision = tp / len(set2) if set2 else 0
    recall = tp / len(set1) if set1 else 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [None]:
url = "http://71.182.171.221:5021/search"

headers = {
    "Content-Type": "application/json",
}
Error_L = []
F1_L = []
Errors = 0
page_check = 0
tokens = 0
for i, q in enumerate(questions):
    queryN = q
    payload = {
    "query": queryN
    }
    response = requests.post(url, headers=headers, json=payload)
    print(response.status_code)
    pattern = r'"pages":\[(.*?)\]'
    matches = re.findall(pattern, response.text)
    contains_page = [match for match in matches if str(pages[i]) in match.split(', ') if match.strip()]
    if len(contains_page) != 0:
        page_check += 1
    print(pages[i], page_check)
    try:
        client = OpenAI(api_key='OPENAI_API_KEY_PLACEHOLDER')
        r = client.chat.completions.create(
            messages=[
                {
                "role": "system", "content": "You are a math expert",
                "content": "user", "content": "I am giving you a question and a text content. Provide a short answer for the question based on context I gave you. Say nothing else. Context:" + response.text + ", Question:" + q
                }
            ],
            model="gpt-4o-mini",
        )
        res = r.choices[0].message.content
        f1 = calculate_f1(answers[i], res)
        print(f1)
        F1_L.append(f1)
        usage = r.usage
        tokens += usage.total_tokens
        print(tokens)
        
    except Exception as e:
        Errors += 1
        Error_L.append(i)
        print(e)

    print("--------") 


200
19 1
0.5357142857142857
72203
--------
200
20 2
Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 144229 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
--------
200
21 3
0.8947368421052632
127875
--------
200
22 4
0.8571428571428572
211789
--------
200
23 5
0.5074626865671642
262900
--------
200
24 6
0.588235294117647
296535
--------
200
25 7
0.5581395348837209
335547
--------
200
26 8
0.6315789473684211
388107
--------
200
27 9
0.962962962962963
451856
--------
200
30 10
0.7076923076923077
487029
--------
200
31 11
0.4090909090909091
531212
--------
200
32 12
0.7755102040816326
578033
--------
200
34 13
0.16666666666666666
609303
--------
200
35 14
0.782608695652174
639007
--------
200
36 15
0.5416666666666666
663883
--------
200
37 15
0.6849315068493151
702377
--------
200
39 16
0.4210526315789473
740391
---

In [25]:
print("Total QAs:", len(questions))
print("Percentage of page match:", page_check / len(questions))
print("Errors Percentage:", Errors / len(questions))
print("Average F1-Score", sum(F1_L) / len(F1_L))
print("Tokens Completion",tokens)

Total QAs: 477
Percentage of page match: 0.9140461215932913
Errors Percentage: 0.0020964360587002098
Average F1-Score 0.5238022353572597
Tokens Completion 20788162
