In [13]:
import json
from collections import defaultdict

path = "Llama-3.1-8B-Instruct__ctx131072__rep1.jsonl"


# Per-(difficulty, length) tallies: records seen and records judged correct.
counts = defaultdict(int)
correct = defaultdict(int)

with open(path, "r", encoding="utf-8") as f:
    for line in f:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON; skip it instead of letting json.loads raise.
        if not line.strip():
            continue
        x = json.loads(line)
        key = (x["difficulty"], x["length"])
        counts[key] += 1
        # Only a literal JSON `true` counts as correct; truthy non-bool values
        # (1, "yes", ...) deliberately do not.
        if x["judge"] is True:
            correct[key] += 1

# print table
for diff in ["easy", "hard"]:
    for length in ["short", "medium", "long"]:
        k = (diff, length)
        # An empty bucket has no defined accuracy; report NaN rather than divide by 0.
        acc = correct[k] / counts[k] if counts[k] else float("nan")
        print(f"difficulty={diff:4s} length={length:6s} "
              f"acc={acc:.4f} ({correct[k]}/{counts[k]})")

# overall
total = sum(counts.values())
total_correct = sum(correct.values())
# Same NaN convention as the per-bucket rows, so an empty file does not crash.
overall_acc = total_correct / total if total else float("nan")
print(f"\nOVERALL acc={overall_acc:.4f} ({total_correct}/{total})")


difficulty=easy length=short  acc=0.4333 (26/60)
difficulty=easy length=medium acc=0.2500 (22/88)
difficulty=easy length=long   acc=0.2889 (13/45)
difficulty=hard length=short  acc=0.3306 (40/121)
difficulty=hard length=medium acc=0.2756 (35/127)
difficulty=hard length=long   acc=0.3538 (23/65)

OVERALL acc=0.3142 (159/506)


In [14]:
# Load every record of `path` into `items`, tallying how many times each _id
# appears. While reading, echo the "response" of any record whose "pred"
# field is falsy.
items = []
id_counts = defaultdict(int)

with open(path, "r", encoding="utf-8") as handle:
    for raw_line in handle:
        obj = json.loads(raw_line)
        items.append(obj)
        if "_id" in obj:
            id_counts[obj["_id"]] += 1
        if obj["pred"]:
            continue
        print(obj["response"])

# Walk the loaded records again and print the _id of each record whose _id
# was seen more than once (records with a missing/falsy _id are ignored).
for obj in items:
    _id = obj.get("_id")
    if not _id:
        continue
    if id_counts[_id] > 1:
        print(_id)

To answer this question, we need to analyze the reports by the Diocesan Church Society of New Brunswick and identify the focus of the Church's activities during the late 19th century (1850-1900).

Let's examine the reports:

* Reports from 1850-1860: Many reports discuss the construction of new churches, repairs to existing buildings, and the expansion of the Church's physical infrastructure. For example, the 1851 report mentions the "Fifteenth report of the proceedings of the Diocesan Church Society of New Brunswick, during the year 1850" and the 1854 report mentions the

[REPROMPT]
A.
To find the Black-Scholes value of the option, we need to calculate the following:

1. d1 = ln(S0/X) + (r + (s^2)/2)T
2. d2 = d1 - s*sqrt(T)
3. N(d1) and N(d2) are the cumulative normal distribution functions
4. C = S0*N(d1) - X*e^(-r*T)*N(d2)

Given values:
S0 = $90
X = $85
T = 3 months = 0.25 years
r = 0.6%

[REPROMPT]
To find
66fcffd9bb02136c067c94c5
66f36490821e116aacb2cc22
66f94f9ebb02136c067c4fde


In [16]:
import json
from collections import defaultdict
import math

def get_acc(path, null_acc=0.25):
    """Print de-duplicated accuracy statistics for one JSONL results file.

    Each non-blank line of the file is parsed as a JSON object. Records whose
    ``_id`` occurs more than once in the file are dropped entirely (every
    copy, not just the extras); records with no ``_id`` or a null ``_id`` are
    always kept. Accuracy is then tallied per ``(difficulty, length)`` pair,
    where a record counts as correct only when its ``judge`` field is exactly
    ``True``.

    Args:
        path: Location of the JSONL file; passed straight to ``open``.
        null_acc: Accuracy of the reference baseline printed on the last
            line. The default 0.25 is random guessing on 4-way multiple
            choice.

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a kept record lacks ``difficulty``, ``length`` or
            ``judge``.
    """
    items = []
    id_counts = defaultdict(int)

    # First pass: read + count _id occurrences
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) is not
            # valid JSON; skip it instead of letting json.loads raise.
            if not line.strip():
                continue
            obj = json.loads(line)
            items.append(obj)
            if "_id" in obj:
                id_counts[obj["_id"]] += 1

    # Filter: remove ALL duplicated _id items (no copy of a repeated _id
    # survives); records without an _id cannot be matched and are kept.
    filtered = [
        obj
        for obj in items
        if obj.get("_id") is None or id_counts[obj["_id"]] == 1
    ]

    # Recompute counts
    counts = defaultdict(int)
    correct = defaultdict(int)

    for x in filtered:
        key = (x["difficulty"], x["length"])
        counts[key] += 1
        # Only a literal JSON `true` counts as correct.
        if x["judge"] is True:
            correct[key] += 1

    # Print table
    for diff in ["easy", "hard"]:
        for length in ["short", "medium", "long"]:
            k = (diff, length)
            # An empty bucket has no defined accuracy; report NaN.
            acc = correct[k] / counts[k] if counts[k] else float("nan")
            print(
                f"difficulty={diff:4s} length={length:6s} "
                f"acc={acc:.4f} ({correct[k]}/{counts[k]})"
            )

    # Overall accuracy. This sums every bucket, including any
    # (difficulty, length) pair that is not one of the six table rows.
    total = sum(counts.values())
    total_correct = sum(correct.values())
    # Same NaN convention as the table rows, so a file with no surviving
    # records does not raise ZeroDivisionError.
    overall_acc = total_correct / total if total else float("nan")

    print("\nAFTER DEDUPLICATION")
    print(f"OVERALL acc={overall_acc:.4f} ({total_correct}/{total})")

    # Null baseline: expected accuracy and expected number of correct answers.
    print(f"NULL baseline acc={null_acc:.4f} ({null_acc * total:.1f}/{total})")


In [20]:
from pathlib import Path

# Report accuracy for every JSONL results file in the current directory.
# sorted() pins the report order (directory listing order is otherwise
# filesystem-dependent), and the loop variable is deliberately not named
# `path` so it does not overwrite the notebook-level `path` string that
# earlier cells read from.
for jsonl_path in sorted(Path(".").glob("*.jsonl")):
    print(jsonl_path.name)
    get_acc(jsonl_path.name)
    print()



Qwen2.5-7B-Instruct__ctx131072__rep1 1.jsonl
difficulty=easy length=short  acc=0.4655 (27/58)
difficulty=easy length=medium acc=0.3068 (27/88)
difficulty=easy length=long   acc=0.2889 (13/45)
difficulty=hard length=short  acc=0.3802 (46/121)
difficulty=hard length=medium acc=0.2913 (37/127)
difficulty=hard length=long   acc=0.3226 (20/62)

AFTER DEDUPLICATION
OVERALL acc=0.3393 (170/501)
NULL baseline acc=0.2500 (125.2/501)

mamba-codestral-7b__ctx131072__rep1.jsonl
difficulty=easy length=short  acc=0.2414 (14/58)
difficulty=easy length=medium acc=0.2619 (22/84)
difficulty=easy length=long   acc=0.1190 (5/42)
difficulty=hard length=short  acc=0.2437 (29/119)
difficulty=hard length=medium acc=0.2459 (30/122)
difficulty=hard length=long   acc=0.2833 (17/60)

AFTER DEDUPLICATION
OVERALL acc=0.2412 (117/485)
NULL baseline acc=0.2500 (121.2/485)

Llama-3.1-8B-Instruct__ctx131072__rep1.jsonl
difficulty=easy length=short  acc=0.4138 (24/58)
difficulty=easy length=medium acc=0.2500 (22/88)
dif