In [12]:
import requests
import json

def ollama_chat(model, messages, think=False, stream=True, verbose=False, timeout=None):
    """Send a chat request to the local Ollama server and return the reply text.

    Parameters
    ----------
    model : str
        Model name, sent as the ``model`` field of the request.
    messages : list of dict
        Chat messages, sent as the ``messages`` field of the request.
    think : bool, optional
        Sent as the ``think`` field of the request.
    stream : bool, optional
        Sent as the ``stream`` field of the request and also used as the
        ``stream`` argument of ``requests.post``, so the server and the
        client agree on the transfer mode.
    verbose : bool, optional
        When true, each content chunk is printed as soon as it is received.
    timeout : float or tuple or None, optional
        Forwarded to ``requests.post``. ``None`` (the default) waits
        indefinitely.

    Returns
    -------
    str
        The concatenation of every ``message.content`` chunk received.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    RuntimeError
        If a response line carries an ``error`` field.
    """
    url = "http://localhost:11434/api/chat"
    payload = {
        "model": model,
        "messages": messages,
        "think": think,
        "stream": stream,
    }
    pieces = []
    # The context manager releases the connection even if iteration stops early.
    with requests.post(url, json=payload, stream=stream, timeout=timeout) as resp:
        resp.raise_for_status()
        # Ollama returns multiple lines of JSON in stream mode.
        for line in resp.iter_lines():
            if not line:
                continue
            try:
                data = json.loads(line.decode("utf-8"))
            except ValueError:
                # Covers both invalid UTF-8 and invalid JSON: skip the line.
                continue
            if not isinstance(data, dict):
                continue
            if "error" in data:
                # Surface server-side failures instead of returning "".
                raise RuntimeError(f"Ollama error: {data['error']}")
            message = data.get("message")
            chunk = message.get("content") if isinstance(message, dict) else None
            if not chunk:
                continue
            if verbose:
                print(chunk, end="", flush=True)
            pieces.append(chunk)
    return "".join(pieces)


In [13]:
def ask_ollama(model, context, question_text, options):
    """Ask ``model`` one multiple-choice question and return its raw reply.

    The user prompt is assembled from ``context``, ``question_text`` and one
    line per entry of ``options``; a system message and the closing
    instruction both request the option number alone on the final line.
    The reply is obtained from ``ollama_chat`` with ``verbose=True``.
    """
    system_text = "You are an expert in probability theory. Explain each question step by step. On the last line, output ONLY the selected option number (for example: '2'), with no extra text."

    header = (
        "You are an expert in probability theory. Here is a university-level probability exam question.\n"
        "Please provide a step-by-step analysis (no more than 100 words) and calculation.\n"
        "At the end, output ONLY the final selected option number (for example: '2') on a new line.\n"
        "\n"
        f"Context: {context}\n"
        f"Question: {question_text}\n"
        "Options:\n"
    )
    # One option per line, each terminated by a newline.
    option_block = "".join(choice + '\n' for choice in options)
    closing = (
        "\nFirst, write your detailed solution and reasoning. "
        "Finally, in the last line, write only the selected option number (for example: '2')."
    )

    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": header + option_block + closing},
    ]
    # Pass verbose=False here to suppress the real-time output.
    return ollama_chat(model, messages, verbose=True)


In [14]:
import time


def _parse_response(raw_response):
    """Split a model reply into ``(answer, analysis)``.

    The last non-empty line is taken as the answer when, after removing
    surrounding quotes, markdown emphasis, brackets and a trailing period,
    it consists only of ASCII digits. The prompt's own example is written
    as '2' (with quotes), so a reply that copies that format must still be
    recognised. When no such line exists the answer is "" and the whole
    reply is returned as the analysis.
    """
    lines = [line.strip() for line in raw_response.strip().split('\n') if line.strip()]
    if lines:
        candidate = lines[-1].strip("*_`'\"()[]. ")
        # isascii() rejects characters such as superscript digits that
        # str.isdigit() alone would accept.
        if candidate.isascii() and candidate.isdigit():
            return candidate, "\n".join(lines[:-1])
    return "", raw_response


with open('test.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

results = []
for q in questions:
    print(f"Processing question {q['question_id']}")
    try:
        raw_response = ask_ollama('qwen3:8b', q['context'], q['question'], q['options'])
        # The streamed reply is printed without a trailing newline; end the
        # line so the status message below starts on its own line.
        print()
        answer, analysis = _parse_response(raw_response)
    except Exception as e:
        # Best effort: record the failure and carry on with the next question.
        answer = ""
        analysis = f"Error: {e}"
    print(f"AI selected answer: {answer}")
    results.append({
        "question_id": q['question_id'],
        "context": q['context'],
        "question": q['question'],
        "options": q['options'],
        "answer": answer,
        "analysis": analysis
    })
    time.sleep(1)

# NOTE(review): the file name says "14b" while the model requested above is
# 'qwen3:8b'. The name is kept because a later cell reads this exact path.
with open("result_qwen3-14b.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

Processing question 1
The daily profit is given by $ P = 12.50X - 35 $, where $ X \sim \text{Poisson}(8.4) $. The variance of $ X $ is $ \text{Var}(X) = 8.4 $. Using the property of variance, $ \text{Var}(P) = (12.50)^2 \cdot \text{Var}(X) = (12.50)^2 \cdot 8.4 $. Therefore, the standard deviation is $ \sqrt{(12.50)^2 \cdot 8.4} = 12.50 \cdot \sqrt{8.4} $. The correct option is **2**.

2AI selected answer: 2
Processing question 2
The given formula for $P_i$ resembles the Poisson probability mass function, which is $\frac{A^i}{i!}e^{-A}$. However, the sum of probabilities over all $i$ must equal 1. Since the given formula sums only from $i=0$ to $8$, the normalization constant $c$ must be the sum of the terms from $i=0$ to $8$ to ensure the total probability is 1. Hence, $c = \sum_{i=0}^{8}\frac{A^{i}}{i!}$. 

5AI selected answer: 5
Processing question 3
Given that the probability a dim holds for more than $ t $ is 0.2, the probability it fails within $ t $ is $ 1 - 0.2 = 0.8 $. For the

In [15]:
# Answer key: question id -> expected option number, stored as a string.
std_ans = {
    1: '2', 2: '5', 3: '3', 4: '1', 5: '4', 6: '4',
    7: '2', 8: '4', 9: '3', 10: '4', 11: '4', 12: '1',
    13: '3', 14: '1', 15: '1', 16: '3', 17: '2', 18: '2',
    19: '4', 20: '5', 21: '1', 22: '3', 23: '5', 24: '2',
    25: '5', 26: '3', 27: '4', 28: '2', 29: '5', 30: '5',
}

# Load the saved model answers from the result file.
with open("result_qwen3-14b.json", mode='r', encoding='utf-8') as result_file:
    res_qwen = json.load(result_file)

In [16]:
import pandas as pd
# Compare every saved answer with the answer key, one record per question.
records = []
for entry in res_qwen:
    qid = int(entry['question_id'])
    # A missing or null answer is treated the same as an empty one.
    ans = (entry.get('answer') or "").strip()
    key = std_ans.get(qid, "")
    records.append({
        'question_id': qid,
        'answer_qwen3': ans,
        'key': key,
        # An empty answer is never correct; without this guard a question
        # absent from std_ans (key == "") would compare equal to a failed,
        # empty answer and be counted as a hit.
        'correct_qwen3': bool(ans) and ans == key
    })

# Naming the columns keeps sort_values working when there are no records.
df_qwen = (
    pd.DataFrame(records, columns=['question_id', 'answer_qwen3', 'key', 'correct_qwen3'])
    .sort_values('question_id')
    .reset_index(drop=True)
)


In [17]:
# Summarise the per-question correctness flags.
is_correct = df_qwen['correct_qwen3']
total = df_qwen.shape[0]
correct_count = is_correct.sum()
accuracy = is_correct.mean()

print(f"Qwen3 accuracy: {accuracy:.2%} ({correct_count}/{total})")


Qwen3 accuracy: 66.67% (20/30)


In [18]:
# Keep only the rows whose correctness flag is false.
incorrect_mask = ~df_qwen['correct_qwen3']
wrong = df_qwen.loc[incorrect_mask]
print("Incorrect answers:")
print(wrong.loc[:, ['question_id', 'answer_qwen3', 'key']])


Incorrect answers:
    question_id answer_qwen3 key
2             3            6   3
3             4            2   1
4             5            2   4
6             7                2
8             9            2   3
14           15            3   1
16           17                2
21           22            5   3
24           25            6   5
25           26            5   3
