In [15]:
import requests
import json

def ollama_chat(model, messages, think=False, stream=True, verbose=False):
    """Send a chat request to the local Ollama server and return the reply text.

    Args:
        model: Model name placed in the request payload.
        messages: List of chat message dicts placed in the request payload.
        think: Value forwarded as the payload's "think" field.
        stream: Forwarded both as the payload's "stream" field and to
            ``requests.post`` so the server and the HTTP client agree on
            whether the reply arrives incrementally.
        verbose: When true, print each content chunk as soon as it is read.

    Returns:
        The concatenation of every ``message.content`` chunk in the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If a response line carries an "error" field.
    """
    url = "http://localhost:11434/api/chat"
    payload = {
        "model": model,
        "messages": messages,
        "think": think,
        "stream": stream,
    }
    chunks = []
    # The context manager releases the connection even if parsing raises.
    with requests.post(url, json=payload, stream=stream) as resp:
        # Surface HTTP failures instead of returning an empty reply.
        resp.raise_for_status()
        # The body is one JSON object per line (a single line when not streaming).
        for line in resp.iter_lines():
            if not line:
                continue
            try:
                data = json.loads(line.decode())
            except ValueError:
                # Covers both invalid JSON and undecodable bytes; skip the line.
                continue
            if not isinstance(data, dict):
                continue
            if "error" in data:
                raise RuntimeError(f"Ollama error: {data['error']}")
            message = data.get("message")
            if not isinstance(message, dict):
                continue
            chunk = message.get("content")
            if not isinstance(chunk, str):
                continue
            if verbose:
                print(chunk, end="", flush=True)
            chunks.append(chunk)
    return "".join(chunks)


In [16]:
def ask_ollama(model, context, question_text, options, verbose=True):
    """Ask the model one multiple-choice question and return its raw reply.

    Args:
        model: Model name passed through to ``ollama_chat``.
        context: Text interpolated after "Context:" in the prompt.
        question_text: Text interpolated after "Question:" in the prompt.
        options: Iterable of option strings; each is written on its own line.
        verbose: Passed to ``ollama_chat``; when true the reply is printed as
            it is received. Defaults to True, the previously hard-coded value.

    Returns:
        Whatever ``ollama_chat`` returns for the built messages.
    """
    prompt = f"""You are an expert in probability theory. Here is a university-level probability exam question.
Please provide a step-by-step analysis (no more than 100 words) and calculation.
At the end, output ONLY the final selected option number (for example: '2') on a new line.

Context: {context}
Question: {question_text}
Options:
"""
    prompt += "".join(opt + "\n" for opt in options)
    prompt += (
        "\nFirst, write your detailed solution and reasoning. "
        "Finally, in the last line, write only the selected option number (for example: '2')."
    )
    messages = [
        {"role": "system", "content": "You are an expert in probability theory. Explain each question step by step. On the last line, output ONLY the selected option number (for example: '2'), with no extra text."},
        {"role": "user", "content": prompt}
    ]
    return ollama_chat(model, messages, verbose=verbose)


In [17]:
import time

# Load the question list; each entry is read below by its 'question_id',
# 'context', 'question' and 'options' keys.
with open('questions.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

results = []
for q in questions:
    print(f"Processing question {q['question_id']}")
    try:
        raw_response = ask_ollama('qwen3:14b', q['context'], q['question'], q['options'])
        # The reply is echoed chunk by chunk without a trailing newline, so end
        # that line here; otherwise the status line below is glued to it.
        print()
        lines = [line.strip() for line in raw_response.strip().split('\n') if line.strip()]
        if lines and lines[-1].isdigit():
            # Last non-empty line is a bare number: treat it as the chosen option.
            answer = lines[-1]
            analysis = "\n".join(lines[:-1])
        else:
            # No parsable option number; keep the whole reply for inspection.
            answer = ""
            analysis = raw_response
    except Exception as e:
        # Record the failure for this question and keep going with the rest.
        answer = ""
        analysis = f"Error: {e}"
    print(f"AI selected answer: {answer}")
    results.append({
        "question_id": q['question_id'],
        "context": q['context'],
        "question": q['question'],
        "options": q['options'],
        "answer": answer,
        "analysis": analysis
    })
    time.sleep(1)

with open("answers_result_qwen3-14b.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

Processing question 1
This is a binomial probability problem with $ n = 100 $ and $ p = 0.05 $. We are to find $ P(X \geq 7) $, which is $ 1 - P(X \leq 6) $. Since $ n $ is large and $ p $ is small, we approximate the binomial distribution with a Poisson distribution with $ \lambda = np = 5 $. The probability is $ 1 - \sum_{i=0}^{6} \frac{e^{-5}5^i}{i!} $. 

5AI selected answer: 5
Processing question 2
To find the probability that the target is destroyed, we compute the expected value of the destruction probability over the distribution of $R$. This is given by the integral $\int_{0}^{\infty} e^{-r^{2}} \cdot re^{-\frac{1}{2}r^{2}} dr = \int_{0}^{\infty} re^{-\frac{3}{2}r^{2}} dr$. Let $u = \frac{3}{2}r^{2}$, then $du = 3r dr$, and the integral becomes $\frac{1}{3} \int_{0}^{\infty} e^{-u} du = \frac{1}{3}$.  
2AI selected answer: 2
Processing question 3
To find the probability of exactly two butterfly specialists among four selected swimmers, we use the hypergeometric distribution. Th

In [18]:
# Reference answers keyed by integer question id; values are option numbers
# stored as strings so they compare directly with the saved model answers.
std_ans = {
    1: '4', 2: '2', 3: '4', 4: '2', 5: '1',
    6: '3', 7: '2', 8: '5', 9: '4', 10: '3',
    11: '5', 12: '4', 13: '1', 14: '2', 15: '4',
    16: '3', 17: '1', 18: '2', 19: '3', 20: '1',
    21: '1', 22: '5', 23: '5', 24: '5', 25: '1',
    26: '5', 27: '4', 28: '2', 29: '3', 30: '3',
}

# Read back the answers file written by the question loop.
with open("answers_result_qwen3-14b.json", 'r', encoding='utf-8') as f:
    res_qwen = json.load(f)

In [20]:
import pandas as pd
# One row per saved result: the model's answer, the reference answer, and
# whether they agree.
records = []
for entry in res_qwen:
    qid = int(entry['question_id'])
    # Treat a missing or null answer as empty instead of failing on .strip().
    ans = (entry.get('answer') or "").strip()
    key = std_ans.get(qid, "")
    records.append({
        'question_id': qid,
        'answer_qwen3': ans,
        'key': key,
        # An empty (unparsed) answer is never correct, even when the question
        # has no reference answer and `key` falls back to "".
        'correct_qwen3': bool(ans) and ans == key
    })

# Naming the columns keeps sort_values working when there are no records.
df_qwen = (
    pd.DataFrame(records, columns=['question_id', 'answer_qwen3', 'key', 'correct_qwen3'])
    .sort_values('question_id')
    .reset_index(drop=True)
)


In [21]:
# Overall score: number of rows, number marked correct, and their ratio.
correct_flags = df_qwen['correct_qwen3']
total = df_qwen.shape[0]
correct_count = correct_flags.sum()
accuracy = correct_flags.mean()

print(f"Qwen3 accuracy: {accuracy:.2%} ({correct_count}/{total})")


Qwen3 accuracy: 66.67% (20/30)


In [22]:
# Keep only the rows whose answer did not match the reference answer.
incorrect_mask = ~df_qwen['correct_qwen3']
wrong = df_qwen.loc[incorrect_mask]
print("Incorrect answers:")
print(wrong.loc[:, ['question_id', 'answer_qwen3', 'key']])


Incorrect answers:
    question_id answer_qwen3 key
0             1            5   4
8             9            3   4
11           12            1   4
12           13            2   1
16           17            2   1
18           19            1   3
19           20            5   1
20           21            5   1
23           24            3   5
24           25            3   1
