In [13]:
import requests
from openai import OpenAI
import pandas as pd
import re

In [14]:
df = pd.read_json('math_qa_2.json')

questions = df['Q'].tolist()
answers = df['A'].tolist()
pages = df['page'].tolist()

print("Questions:", questions[0])
print("Answers:", answers[0])
print("page:", pages[0])
print(len(questions))

Questions: Q: What are the necessary and sufficient conditions on the moduli \( n_1, n_2, \ldots, n_r \) for which the system of congruences \( x \equiv a_1 \mod n_1, x \equiv a_2 \mod n_2, \ldots, x \equiv a_r \mod n_r \) has a solution, and how can we express the solution explicitly? 
Answers:  A: A solution exists if and only if for every pair of moduli \( n_i \) and \( n_j \), the greatest common divisor \( \gcd(n_i, n_j) \) divides \( a_j - a_i \). An explicit solution for \( x \) can be given using the formula: 

\[
x \equiv \sum_{i=1}^{r} a_i \cdot M_i \cdot y_i \mod n
\]

where \( M = n_1 n_2 \cdots n_r \), \( M_i = \frac{M}{n_i} \), and \( y_i \) is the modular inverse of \( M_i \) modulo \( n_i \). The modulus \( n \) can be chosen as the least common multiple \( \text{lcm}(n_1, n_2, \ldots, n_r) \).
page: 300
211


In [15]:
from sklearn.metrics import f1_score

def calculate_f1(string1, string2):
    set1 = set(string1.split())
    set2 = set(string2.split())
    tp = len(set1 & set2)
    precision = tp / len(set2) if set2 else 0
    recall = tp / len(set1) if set1 else 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [16]:
url = "http://i9ubuntu.eason.best:5020/search"

headers = {
    "Content-Type": "application/json",
}
Error_L = []
F1_L = []
Errors = 0
page_check = 0
tokens = 0
for i, q in enumerate(questions):
    queryN = q
    payload = {
    "query": queryN
    }
    response = requests.post(url, headers=headers, json=payload)
    print(response.status_code)
    pattern = r'"pages":\[(.*?)\]'
    matches = re.findall(pattern, response.text)
    contains_page = [match for match in matches if str(pages[i]) in match.split(', ') if match.strip()]
    if len(contains_page) != 0:
        page_check += 1
    print(pages[i], page_check)
    try:
        client = OpenAI(api_key='OPENAI_API_KEY_PLACEHOLDER')
        r = client.chat.completions.create(
            messages=[
                {
                "role": "system", "content": "You are a math expert",
                "content": "user", "content": "I am giving you a question and a text content. Provide a short answer for the question based on context I gave you. Say nothing else. Context:" + response.text + ", Question:" + q
                }
            ],
            model="gpt-4o-mini",
        )
        res = r.choices[0].message.content
        f1 = calculate_f1(answers[i], res)
        print(f1)
        F1_L.append(f1)
        usage = r.usage
        tokens += usage.total_tokens
        print(tokens)
        
    except Exception as e:
        Errors += 1
        Error_L.append(i)
        print(e)

    print("--------") 


200
300 0
0.3185840707964602
16961
--------
200
301 1
0.6764705882352942
42604
--------
200
302 2
0.3733333333333333
60528
--------
200
303 3
0.4943820224719101
149588
--------
200
304 4
0.7692307692307692
194892
--------
200
305 5
0.5277777777777778
222026
--------
200
306 6
0.8000000000000002
261898
--------
200
307 7
0.6153846153846154
299635
--------
200
308 7
0.32989690721649484
372167
--------
200
309 7
0.36666666666666664
452365
--------
200
310 7
0.4878048780487805
497140
--------
200
311 8
0.8148148148148148
546914
--------
200
312 8
0.25
561985
--------
200
313 9
0.28571428571428575
612619
--------
200
314 10
0.4444444444444445
624945
--------
200
316 11
0.13636363636363638
675941
--------
200
317 11
Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 181290 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
-

In [17]:
print("Total QAs:", len(questions))
print("Percentage of page match:", page_check / len(questions))
print("Errors Percentage:", Errors / len(questions))
print("Average F1-Score", sum(F1_L) / len(F1_L))
print("Tokens Completion",tokens)

Total QAs: 211
Percentage of page match: 0.8293838862559242
Errors Percentage: 0.0947867298578199
Average F1-Score 0.49517604109636026
Tokens Completion 8175679
