In [2]:

def ask_gpt(model, context, question_text, options):
    """Ask a chat model to solve one multiple-choice probability question.

    The user prompt consists of a fixed instruction header (with the context
    and question interpolated), one line per entry of ``options``, and a
    closing reminder to end with only the selected option number.

    Args:
        model: Model name forwarded to ``client.chat.completions.create``.
        context: Text inserted after ``Context:`` in the prompt.
        question_text: Text inserted after ``Question:`` in the prompt.
        options: Iterable of option strings; each is written on its own line.

    Returns:
        The message content of the first choice in the response, unmodified.
    """
    system_message = "You are an expert in probability theory. Explain each question step by step. On the last line, output ONLY the selected option number (for example: '2'), with no extra text."

    header = f"""You are an expert in probability theory. Here is a university-level probability exam question.
Please provide a step-by-step analysis  (no more than 100 words) and calculation.
At the end, output ONLY the final selected option number (for example: '2') on a new line.

Context: {context}
Question: {question_text}
Options:
"""
    # Each option goes on its own line directly under the "Options:" heading.
    option_block = "".join(opt + '\n' for opt in options)
    closing = (
        "\nFirst, write your detailed solution and reasoning. "
        "Finally, in the last line, write only the selected option number (for example: '2')."
    )
    user_message = header + option_block + closing

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        temperature=0.1,
        max_tokens=1000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Evaluate gpt-4o on every question: query the model, split its reply into
# (analysis, final option number), and save all records to JSON at the end.
results = []
for q in questions:
    print(f"Processing question {q['question_id']}")
    try:
        raw_response = ask_gpt('gpt-4o', q['context'], q['question'], q['options'])
        lines = [line.strip() for line in raw_response.strip().split('\n') if line.strip()]
        # The prompt's own example shows the number in quotes ('2'), so a reply
        # such as '2', **2**, `2` or "2." is a valid answer. Strip that wrapping
        # before the digit check instead of recording the answer as blank.
        final_token = lines[-1].strip("'\"`*.()[] ") if lines else ""
        # isascii() rejects Unicode digits (e.g. superscripts) that isdigit()
        # alone accepts but that can never equal an answer-key entry.
        if final_token.isascii() and final_token.isdigit():
            answer = final_token
            analysis = "\n".join(lines[:-1])
        else:
            # No recognisable option number: keep the full reply for inspection.
            answer = ""
            analysis = raw_response
    except Exception as e:
        # Best effort: one failed request must not abort the whole run; the
        # error text is stored in place of the analysis.
        answer = ""
        analysis = f"Error: {e}"
    print(f"AI selected answer: {answer}")
    results.append({
        "question_id": q['question_id'],
        "context": q['context'],
        "question": q['question'],
        "options": q['options'],
        "answer": answer,
        "analysis": analysis
    })
    time.sleep(1)  # pause between consecutive API requests

with open("result_gpt-4o.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)


Processing question 1
AI selected answer: 2
Processing question 2
AI selected answer: 4
Processing question 3
AI selected answer: 2
Processing question 4
AI selected answer: 2
Processing question 5
AI selected answer: 3
Processing question 6
AI selected answer: 4
Processing question 7
AI selected answer: 2
Processing question 8
AI selected answer: 4
Processing question 9
AI selected answer: 3
Processing question 10
AI selected answer: 4
Processing question 11
AI selected answer: 4
Processing question 12
AI selected answer: 1
Processing question 13
AI selected answer: 3
Processing question 14
AI selected answer: 1
Processing question 15
AI selected answer: 1
Processing question 16
AI selected answer: 3
Processing question 17
AI selected answer: 2
Processing question 18
AI selected answer: 2
Processing question 19
AI selected answer: 4
Processing question 20
AI selected answer: 5
Processing question 21
AI selected answer: 1
Processing question 22
AI selected answer: 3
Processing question

In [3]:

# Evaluate gpt-4.1 on every question: query the model, split its reply into
# (analysis, final option number), and save all records to JSON at the end.
results = []
for q in questions:
    print(f"Processing question {q['question_id']}")
    try:
        raw_response = ask_gpt('gpt-4.1', q['context'], q['question'], q['options'])
        lines = [line.strip() for line in raw_response.strip().split('\n') if line.strip()]
        # The prompt's own example shows the number in quotes ('2'), so a reply
        # such as '2', **2**, `2` or "2." is a valid answer. Strip that wrapping
        # before the digit check instead of recording the answer as blank.
        final_token = lines[-1].strip("'\"`*.()[] ") if lines else ""
        # isascii() rejects Unicode digits (e.g. superscripts) that isdigit()
        # alone accepts but that can never equal an answer-key entry.
        if final_token.isascii() and final_token.isdigit():
            answer = final_token
            analysis = "\n".join(lines[:-1])
        else:
            # No recognisable option number: keep the full reply for inspection.
            answer = ""
            analysis = raw_response
    except Exception as e:
        # Best effort: one failed request must not abort the whole run; the
        # error text is stored in place of the analysis.
        answer = ""
        analysis = f"Error: {e}"
    print(f"AI selected answer: {answer}")
    results.append({
        "question_id": q['question_id'],
        "context": q['context'],
        "question": q['question'],
        "options": q['options'],
        "answer": answer,
        "analysis": analysis
    })
    time.sleep(1)  # pause between consecutive API requests

with open("result_gpt-4.1.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

Processing question 1
AI selected answer: 2
Processing question 2
AI selected answer: 5
Processing question 3
AI selected answer: 4
Processing question 4
AI selected answer: 2
Processing question 5
AI selected answer: 4
Processing question 6
AI selected answer: 4
Processing question 7
AI selected answer: 2
Processing question 8
AI selected answer: 4
Processing question 9
AI selected answer: 3
Processing question 10
AI selected answer: 4
Processing question 11
AI selected answer: 4
Processing question 12
AI selected answer: 1
Processing question 13
AI selected answer: 3
Processing question 14
AI selected answer: 1
Processing question 15
AI selected answer: 1
Processing question 16
AI selected answer: 3
Processing question 17
AI selected answer: 2
Processing question 18
AI selected answer: 2
Processing question 19
AI selected answer: 4
Processing question 20
AI selected answer: 5
Processing question 21
AI selected answer: 1
Processing question 22
AI selected answer: 3
Processing question

In [4]:
# Answer key: question_id (int) -> correct option number (as a string, so it
# compares directly against the parsed model answers).
std_ans = {
    1: '2', 2: '5', 3: '3', 4: '1', 5: '4', 6: '4', 7: '2', 8: '4', 9: '3', 10: '4',
    11: '4', 12: '1', 13: '3', 14: '1', 15: '1', 16: '3', 17: '2', 18: '2', 19: '4', 20: '5',
    21: '1', 22: '3', 23: '5', 24: '2', 25: '5', 26: '3', 27: '4', 28: '2', 29: '5', 30: '5',
}


In [5]:
# Reload the saved per-question records of both runs from their JSON files.
with open("result_gpt-4.1.json", mode='r', encoding='utf-8') as f:
    res_41 = json.loads(f.read())

with open("result_gpt-4o.json", mode='r', encoding='utf-8') as f:
    res_4o = json.loads(f.read())

In [7]:
import pandas as pd

# One frame per model; the answer/analysis columns get a model suffix so the
# two frames can later sit side by side without name clashes.
df_41 = pd.DataFrame(res_41).rename(
    columns={'answer': 'answer_4.1', 'analysis': 'analysis_4.1'}
)
df_4o = pd.DataFrame(res_4o).rename(
    columns={'answer': 'answer_4o', 'analysis': 'analysis_4o'}
)


In [8]:
# Join the two runs on question_id (inner join: only questions present in both),
# bringing over just the gpt-4o answer and analysis columns.
gpt4o_columns = df_4o[['question_id', 'answer_4o', 'analysis_4o']]
df_all = df_41.merge(gpt4o_columns, on='question_id', how='inner')

# Attach the official key, then flag each model's answer as right or wrong.
df_all = df_all.assign(key=lambda frame: frame['question_id'].astype(int).map(std_ans))
df_all = df_all.assign(**{
    'correct_4.1': lambda frame: frame['answer_4.1'] == frame['key'],
    'correct_4o': lambda frame: frame['answer_4o'] == frame['key'],
})


In [9]:
# Headline accuracy per model over all merged questions.
total = len(df_all)
num_correct_41 = df_all['correct_4.1'].sum()
num_correct_4o = df_all['correct_4o'].sum()

for label, n_correct in (
    ("GPT‑4.1 Accuracy", num_correct_41),
    ("GPT‑4o   Accuracy", num_correct_4o),
):
    print(f"{label}: {n_correct}/{total} ({n_correct/total:.2%})")

# Questions on which the two models chose different options.
answers_differ = df_all['answer_4.1'].ne(df_all['answer_4o'])
diff = df_all.loc[answers_differ]
print(f"Different answers between models: {len(diff)}")
display(diff[['question_id', 'answer_4.1', 'answer_4o', 'key']])


GPT‑4.1 Accuracy: 28/30 (93.33%)
GPT‑4o   Accuracy: 26/30 (86.67%)
Different answers between models: 3


Unnamed: 0,question_id,answer_4.1,answer_4o,key
1,2,5,4,5
2,3,4,2,3
4,5,4,3,4


Question 20: Both models wrongly assumed that the round-trip time was normally distributed and applied a z-score cutoff, whereas the task actually required a distribution-free bound (Cantelli/Chebyshev), which gives the correct 30 ms threshold.

In [108]:
# Keep every question that at least one of the two models answered incorrectly.
any_model_wrong = ~df_all['correct_4.1'] | ~df_all['correct_4o']
wrong_df = df_all.loc[any_model_wrong].copy()


In [114]:
def compare_analysis_with_gpt41(
    q_text, options, analysis_41, answer_41, correct_41,
    analysis_4o, answer_4o, correct_4o, key
):
    """Ask gpt-4.1 for a structured comparison of two models' solutions.

    The prompt contains the question, one line per option, the official
    answer, and each model's answer (labelled correct/wrong) together with
    its analysis.

    Args:
        q_text: Question text.
        options: Iterable of option strings, written one per line.
        analysis_41: GPT-4.1's analysis text.
        answer_41: GPT-4.1's selected option.
        correct_41: Truthy if GPT-4.1's answer matches the key.
        analysis_4o: GPT-4o's analysis text.
        answer_4o: GPT-4o's selected option.
        correct_4o: Truthy if GPT-4o's answer matches the key.
        key: Official answer shown to the reviewer model.

    Returns:
        The first choice's message content with surrounding whitespace stripped.
    """
    verdict_41 = 'correct' if correct_41 else 'wrong'
    verdict_4o = 'correct' if correct_4o else 'wrong'

    header = f"""You are an expert probability exam reviewer.
Below is a university-level probability exam question and options. Then you will see analyses and answers from GPT-4.1 and GPT-4o, plus the official answer key.

**Your task:** Write a concise, structured comparison (4-5 bullet points) focusing only on:
- The main reasoning/approach of each model.
- Accuracy and any critical errors (e.g. domain, formula misuse, wrong option mapping).
- Which model is better and why.

**Format:**
- GPT-4.1: [summary, correctness]
- GPT-4o: [summary, correctness]
- Key difference or typical mistake: [one short sentence]
- Best analysis: [which model and one-line reason]

**Question:**
{q_text}

**Options:**
"""
    option_block = "".join(opt + "\n" for opt in options)
    footer = f"""
**Official answer:** {key}

**GPT-4.1 answer:** {answer_41} ({verdict_41})
Analysis:
{analysis_41}

**GPT-4o answer:** {answer_4o} ({verdict_4o})
Analysis:
{analysis_4o}

Please reply ONLY with the comparison in the format above. No extra introduction or conclusion."""
    prompt = header + option_block + footer

    system_message = {
        "role": "system",
        "content": "You are an expert reviewer for probability exam explanations. Output only concise, structured English bullet point comparisons as instructed."
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-4.1",
        messages=[system_message, user_message],
        temperature=0.1,
        max_tokens=700,
    )
    reply = completion.choices[0].message.content
    return reply.strip()


In [115]:
# For every question that at least one model got wrong, ask the reviewer model
# for a structured comparison and collect the text per question.
comparison_results = []
divider = '-' * 60
for _, row in wrong_df.iterrows():
    # Read the row fields before the try block so that a missing column still
    # raises instead of being recorded as an API error.
    call_args = (
        row['question'], row['options'],
        row['analysis_4.1'], row['answer_4.1'], row['correct_4.1'],
        row['analysis_4o'], row['answer_4o'], row['correct_4o'],
        row['key'],
    )
    try:
        cmp_text = compare_analysis_with_gpt41(*call_args)
    except Exception as e:
        # Best effort: store the error text rather than aborting the loop.
        cmp_text = f"Error: {e}"
    print(f"Question {row['question_id']}\n{cmp_text}\n{divider}")
    comparison_results.append({
        "question_id": row['question_id'],
        "Comparison": cmp_text
    })
    time.sleep(1)  # pause between consecutive API requests


Question 5
- GPT-4.1: Carefully derives the density using the given joint distribution, applies the correct change of variables and Jacobian, integrates properly, and matches the support, arriving at option 1 (correct).
- GPT-4o: Recognizes the need for a transformation but incorrectly assumes the ratio of independent exponentials applies directly, ignores the support restriction, and selects option 5 (incorrect).
- Key difference or typical mistake: GPT-4o misapplies a standard result for the ratio of independent exponentials without considering the specific joint distribution and support.
- Best analysis: GPT-4.1—because it correctly applies the transformation, integration, and support, matching the official answer.
------------------------------------------------------------
Question 13
- GPT-4.1: Uses the joint density, sets up the correct double integral, and applies a small-interval approximation to match the form of option 1; correct reasoning and answer.
- GPT-4o: Correctly set

In [116]:
# Tabulate the collected comparison records: one row per question.
cmp_df = pd.DataFrame(data=comparison_results)

In [117]:
# Bare expression as the cell's last line so the notebook renders cmp_df as its output.
cmp_df

Unnamed: 0,question_id,Comparison
0,5,"- GPT-4.1: Carefully derives the density using the given joint distribution, applies the correct change of variables and Jacobian, integrates properly, and matches the support, arriving at option 1 (correct).\n- GPT-4o: Recognizes the need for a transformation but incorrectly assumes the ratio of independent exponentials applies directly, ignores the support restriction, and selects option 5 (incorrect).\n- Key difference or typical mistake: GPT-4o misapplies a standard result for the ratio of independent exponentials without considering the specific joint distribution and support.\n- Best analysis: GPT-4.1—because it correctly applies the transformation, integration, and support, matching the official answer."
1,13,"- GPT-4.1: Uses the joint density, sets up the correct double integral, and applies a small-interval approximation to match the form of option 1; correct reasoning and answer.\n- GPT-4o: Correctly sets up and computes the double integral, but misinterprets the result as matching option 3 (the numerical value), rather than recognizing the form in option 1; makes a critical mapping error.\n- Key difference or typical mistake: GPT-4o confuses the computed numerical value with the required answer form, while GPT-4.1 correctly matches the symbolic expression.\n- Best analysis: GPT-4.1—because it correctly interprets both the probability calculation and the required answer format."
2,14,"- GPT-4.1: Correctly applies the change of variables formula, accounts for both preimages, computes the derivative properly, and selects option 2, matching the official answer.\n- GPT-4o: Uses the correct transformation method and recognizes both preimages, but miscalculates the density by omitting the factor of 2, leading to the selection of option 1 (incorrect).\n- Key difference or typical mistake: GPT-4o misses the doubling effect from both positive and negative roots, resulting in a density that's half the correct value.\n- Best analysis: GPT-4.1—because it accurately applies the transformation formula and selects the correct answer."
3,15,"- GPT-4.1: Correctly computes mean and variance, properly accounts for negative covariance, standardizes accurately, and selects option 4 (\(\Phi\left(\frac{1}{4}\right)\)).\n- GPT-4o: Computes mean correctly but incorrectly calculates variance by using the wrong sign for the covariance term, leading to the wrong standard deviation and selects option 1 (\(\Phi\left(\frac{1}{\sqrt{13}}\right)\)).\n- Key difference or typical mistake: GPT-4o misapplies the covariance formula, using a positive instead of a negative sign.\n- Best analysis: GPT-4.1—correct variance calculation, standardization, and answer selection."
4,20,"- GPT-4.1: Uses normal distribution quantile approach, correctly sets up the probability equation, but selects option 5, which does not match the official answer.\n- GPT-4o: Follows the same reasoning as GPT-4.1, clearly explains the probability transformation, and also selects option 5, which is incorrect.\n- Key difference or typical mistake: Both models misinterpret the problem, defaulting to a normal quantile formula when the correct answer is a fixed value (option 1).\n- Best analysis: Neither; both models make the same error by overcomplicating the problem and missing the intended simple answer."
5,21,"- GPT-4.1: Correctly applies the memoryless property of the exponential distribution, derives the conditional probability, and simplifies to \(\lambda \mathrm{d}t\) (option 1); fully accurate.\n- GPT-4o: Uses the PDF at time \(t\) without conditioning, selects \(\lambda e^{-\lambda t} \mathrm{d}t\) (option 2); misses the memoryless property and conditional aspect, leading to a critical error.\n- Key difference or typical mistake: GPT-4o confuses the unconditional PDF with the conditional probability required by the question.\n- Best analysis: GPT-4.1, because it correctly identifies and applies the memoryless property and conditional probability."
6,24,"- GPT-4.1: Correctly relates the slope to \(\tan\theta\), identifies the correct intervals, sums their lengths, and computes the probability as \(\frac{1}{2}\); matches the official answer.\n- GPT-4o: Uses the correct geometric reasoning and intervals, but miscalculates the total favorable angle (takes only one interval instead of both), leading to an incorrect probability of \(\frac{1}{4}\).\n- Key difference or typical mistake: GPT-4o miscounts the total favorable angle, using only half the correct measure.\n- Best analysis: GPT-4.1—because it applies the correct intervals, sums them properly, and matches the official answer."
7,25,"- GPT-4.1: Applies Bayes’ theorem correctly, computes all probabilities step-by-step, simplifies accurately to \(\frac{1}{4}\), and matches the correct option.\n- GPT-4o: Sets up Bayes’ theorem and the law of total probability correctly, but makes a calculation error in the denominator, leading to an incorrect probability (\(\frac{2}{5}\)), and thus chooses the wrong option (6).\n- Key difference or typical mistake: GPT-4o miscalculates the denominator in the law of total probability, leading to a wrong final answer and option.\n- Best analysis: GPT-4.1—correct computation, clear reasoning, and matches the official answer."
