In [39]:
from __future__ import annotations

import asyncio
import collections
import json
import os
import pathlib
import random
import re
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

import aiohttp
import openai
import sympy as sp
from openai import OpenAI

from google.colab import drive
drive.mount('/content/drive')

sys.path.append(os.path.abspath("/content/drive/MyDrive/Erdos/"))

import judge
import test_judge
import llm_judge

import linear_system_templates as lst


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


First we load the saved OpenAI API key from Drive.

In [40]:
# Load the OpenAI API key from a text file on Drive and expose it both through
# the legacy module attribute and the environment variable.
openai_key_file_path = '/content/drive/MyDrive/Erdos/openai_key.txt'
with open(openai_key_file_path, 'r', encoding='utf-8') as f:
    # Strip surrounding whitespace: text files usually end with a newline,
    # which would otherwise become part of the key sent to the API.
    openai_key = f.read().strip()

if not openai_key:
    raise ValueError(f"No API key found in {openai_key_file_path}")

openai.api_key = openai_key
os.environ["OPENAI_API_KEY"] = openai_key

# Sanity check only: show the key prefix, never the full key.
print(openai_key[:5])


sk-pr


# Step 1: generating student solutions.

Here we create the `generate_sample` function, which will generate incorrect student solutions.

We use several mistake templates. Each template gives the writer LLM a different system prompt describing the intended mistake. The function `build_prompt` creates these system prompts and augments them with a worked example of the given template.

The dictionary `FEW_SHOTS` contains these examples, so we define it first.

In [41]:
# -----------------------------------------------------------------------------
# Few‑shot examples for each error template (hand‑written)
# -----------------------------------------------------------------------------

# Teacher instruction that opens every few-shot user turn.
_SHOW_WORK_HEADER = "Solve the system and *show your work*:\n\n"


def _few_shot_pair(system_text: str, solution_lines: List[str]) -> List[Dict[str, str]]:
    """Build one user/assistant exchange that demonstrates an error template.

    `system_text` is the original system as shown to the student and
    `solution_lines` are the lines of the (deliberately wrong) worked answer.
    """
    return [
        {"role": "user", "content": _SHOW_WORK_HEADER + system_text},
        {"role": "assistant", "content": "\n".join(solution_lines)},
    ]


# One hand-written demonstration per error template, keyed by template name.
FEW_SHOTS: Dict[str, List[Dict[str, str]]] = {
    # Right-hand sides 5 and 7 are exchanged before solving.
    "SWAP_RHS": _few_shot_pair(
        "2x + y = 5\n 3x - 4y = 7",
        [
            "2x + y = 7",
            " 3x - 4y = 5",
            "→ From the first equation: y = 7 - 2x",
            "→ Substitute into the second: 3x - 4(7 - 2x) = 5",
            "  3x - 28 + 8x = 5 ⇒ 11x = 33 ⇒ x = 3",
            "→ y = 7 - 2·3 = 1",
            "Answer: (3, 1)",
        ],
    ),
    # The y-coefficient of the first equation changes sign (+2y becomes -2y).
    "SIGN_ERROR_EQ1": _few_shot_pair(
        "4x + 2y = 6\n -3x + y = 5",
        [
            "4x - 2y = 6",
            " -3x + y = 5",
            "→ From the second: y = 5 + 3x",
            "→ Substitute: 4x - 2(5 + 3x) = 6",
            "  4x - 10 - 6x = 6 ⇒ -2x = 16 ⇒ x = -8",
            "→ y = 5 + 3( -8 ) = -19",
            "Answer: (-8, -19)",
        ],
    ),
    # Both right-hand sides are negated before solving.
    "FLIP_SIGN": _few_shot_pair(
        "x + 3y = 4\n 2x - y = 1",
        [
            "x + 3y = -4",
            " 2x - y = -1",
            "→ From the first: x = -4 - 3y",
            "→ Substitute: 2(-4 - 3y) - y = -1",
            "  -8 - 6y - y = -1 ⇒ -7y = 7 ⇒ y = -1",
            "→ x = -4 - 3( -1 ) = -1",
            "Answer: (-1, -1)",
        ],
    ),
}


Now we write the `build_prompt` function.

In [42]:
# -----------------------------------------------------------------------------
# Prompt builder
# -----------------------------------------------------------------------------

# Output-format instruction appended to every writer system prompt.
# LaTeX is forbidden in the final answer because the regex parser cannot read
# it; a more sophisticated parser would make that restriction unnecessary.
LENGTH_HINT = "".join([
    "Keep your working to **no more than six short lines** and then finish ",
    "with exactly:  Answer: ( … ).  Do not write anything after that. ",
    "Do not use LaTeX in your final answer, even for fractions.",
])

# Natural-language description of each mistake, keyed by template name.
# These texts are inserted into the writer prompt and stored with each record.
TEMPLATE_DESCRIPTIONS = dict(
    SWAP_RHS=(
        "swap the independent coefficients of the equations "
        "(the coefficients after the = sign, which do not multiply x or y) before solving."
        "For example, if the original problem is 2x + 3y = 4, 5x + 6y = 7, "
        "then swapping we get 2x + 3y = 7, 5x + 6y =4"
    ),
    SIGN_ERROR_EQ1=(
        "flip the sign of the coefficient *b* (coefficient for y) in the first equation. "
        "For example if the first equation is 2x + 3y = 4, it transforms into 2x - 3y = 4"
    ),
    FLIP_SIGN=(
        "multiply both right-hand side constants by −1 before solving. "
        "the other coefficients remain the same. "
        "For example 2x + 3y = 4, 5x + 6y = 7 transforms into 2x + 3y = -4, 5x + 6y = -7"
    ),
    # add future templates here
)

# ---------------------------------------------------------------------
# build_prompt
# ---------------------------------------------------------------------
def build_prompt(coeffs: Tuple[int, int, int, int, int, int],  template: str,) -> List[Dict[str, str]]:
    """Assemble the writer chat prompt for one linear system.

    Args:
        coeffs: (a, b, c, d, e, f) for the system ax + by = e, cx + dy = f.
        template: key into TEMPLATE_DESCRIPTIONS and FEW_SHOTS naming the
            mistake the "student" must make.

    Returns:
        A list of role/content dicts: the system message, then the few-shot
        exchange for the template, then the live user task.

    Raises:
        KeyError: if `template` has no description or few-shot example.
    """
    a, b, c, d, e, f = coeffs
    mistake = TEMPLATE_DESCRIPTIONS[template]

    # Persona + the mistake to make + formatting rules.
    system_text = "".join([
        "You are a student solving 2×2 linear systems in algebra class. ",
        "You always make the following algebra mistake: ",
        f"{mistake}. Remember to make the mistake instead of solving the original problem correctly.",
        "Show your work line-by-line using exact fractions. Do not use LaTeX for fractions, write them as e.g. 4/5 or -6/7. ",
        LENGTH_HINT,
    ])

    # The task is phrased as an ordinary teacher request on the original system.
    task_text = "Solve the system and show your work:\n\n" + f"{a}x + {b}y = {e}\n{c}x + {d}y = {f}"

    return [
        {"role": "system", "content": system_text},
        *FEW_SHOTS[template],
        {"role": "user", "content": task_text},
    ]


The function `parse_final_xy` parses the final answer; it is used by the programmatic/numerical judge.

In [43]:
# -----------------------------------------------------------------------------
# Regex parser for final answer tuple
# -----------------------------------------------------------------------------

# Matches "Answer: ( <x> , <y> )". Group 1 is everything up to the first comma,
# group 2 everything up to the next closing parenthesis, so components that
# themselves contain commas or parentheses are not supported.
FINAL_PAIR_RE = re.compile(r"Answer:\s*\(\s*([^,]+?)\s*,\s*([^\)]+?)\s*\)")

def parse_final_xy(text: str) -> Tuple[str, str] | None:
    """Return the two string components of the final answer tuple.

    When the text contains several "Answer: (x, y)" occurrences, the LAST one
    is used, since that is the answer the solution ends on.

    Args:
        text: the full solution text; None or empty is accepted.

    Returns:
        (x, y) as whitespace-stripped strings, or None if no tuple is found.
    """
    if not text:
        # A completion may carry no text at all; treat that as "no answer".
        return None

    final_match = None
    for final_match in FINAL_PAIR_RE.finditer(text):
        pass  # keep iterating: only the last match is of interest

    if final_match is None:
        return None
    return final_match.group(1).strip(), final_match.group(2).strip()


Small test of the prompt and regex:

In [44]:
def _test_prompt_and_regex():
    """Smoke-test the prompt builder and the final-answer parser."""
    # The live task must be the last message of the prompt.
    messages = build_prompt((2, 1, -3, 4, 7, -5), "SWAP_RHS")
    assert messages[-1]["role"] == "user"

    # The parser must pull both fractions out of a well-formed answer line.
    parsed = parse_final_xy("Some text…\nAnswer: (1/2, -3/4)")
    assert parsed == ("1/2", "-3/4")


_test_prompt_and_regex()
print("Prompt builder and regex parser test passed.")


Prompt builder and regex parser test passed.


Now we write the `generate_sample` function, which generates incorrect student solutions according to a given error template.

The function `_has_unique_solution` verifies that the modified system corresponding to an incorrect student answer has a unique solution, so that the answer is never "(No solution)" or "(Infinitely many solutions)". This check can be removed later: it is only here because such answers break the current final-answer parser, and a better parser would handle them.

In [53]:
# -----------------------------------------------------------------------------
# Main generation function
# -----------------------------------------------------------------------------

# select model and temperature
# Writer LLM settings: which model produces the student solutions, how much
# sampling randomness it uses, and a hard cap on the completion length.
WRITER_MODEL_NAME = "gpt-4.1-mini"
WRITER_TEMPERATURE = 0.8
WRITER_MAX_TOKENS = 300

# Shared API client for the writer calls below.
client = OpenAI(api_key=openai.api_key)


# ------------------------------------------------------------------
# helper: unique‐solution test for the *wrong* system
# ------------------------------------------------------------------
def _has_unique_solution(coeffs: Tuple[int, int, int, int, int, int],
                         template: str) -> bool:
    a, b, c, d, e, f = coeffs

    # apply the template to get the wrong right-hand side
    wrong_x, wrong_y = lst.ERROR_TEMPLATES[template](*coeffs)

    # 2×2 determinant of the *coefficient* matrix after the template
    det = sp.Matrix([[a, b], [c, d]]).det()
    if det == 0:
        return False                     # infinite/none → reject

    # if det != 0 SymPy guarantees a unique solution exists
    return True

# ------------------------------------------------------------------
# generate_sample
# ------------------------------------------------------------------

# Limit on fresh coefficient draws, used when a draw yields a system without a
# unique solution or when the writer never produces a parseable answer.
MAX_RAND_RETRIES = 2

def generate_sample(template: str,
                    rand_retries: int = MAX_RAND_RETRIES,
                    llm_retries: int = 3) -> Dict[str, Any]:
    """Generate one incorrect student solution for the given error template.

    Draws random coefficients, asks the writer model for a solution that makes
    the templated mistake, and returns a record once the reply contains a
    parseable "Answer: (x, y)" line. The parsed answer is NOT compared with the
    expected wrong answer here; that is left to the numerical judge.

    Args:
        template: error template name (key of lst.ERROR_TEMPLATES and
            TEMPLATE_DESCRIPTIONS).
        rand_retries: maximum number of coefficient draws.
        llm_retries: maximum number of writer calls per coefficient draw.

    Returns:
        A JSON-serialisable dict describing the problem, the mistake, the
        writer's steps, the parsed answer and the expected wrong answer.

    Raises:
        RuntimeError: if no parseable sample was produced within the limits.
    """
    for _draw in range(rand_retries):
        coeffs = lst.random_coefficients()
        if not _has_unique_solution(coeffs, template):
            continue                       # redraw coefficients

        # Both depend only on the coefficients, so compute them once per draw
        # (and before spending any API calls).
        messages = build_prompt(coeffs, template)
        wrong_xy = lst.ERROR_TEMPLATES[template](*coeffs)

        for _ in range(llm_retries):
            resp = client.chat.completions.create(
                model=WRITER_MODEL_NAME,
                messages=messages,
                temperature=WRITER_TEMPERATURE,
                max_tokens=WRITER_MAX_TOKENS,
            )
            # The API may return no text content; treat that as an empty reply.
            assistant_text = resp.choices[0].message.content or ""

            parsed = parse_final_xy(assistant_text)
            if parsed is None:
                time.sleep(0.1)            # small back-off, then retry the LLM
                continue

            a, b, c, d, e, f = coeffs
            return {
                "problem": f"{a}x + {b}y = {e}; {c}x + {d}y = {f}",
                "coefficients": list(coeffs),
                "template": template,
                "template_description": TEMPLATE_DESCRIPTIONS[template],
                "answer_steps": assistant_text,
                "final_xy": [str(parsed[0]), str(parsed[1])],
                "wrong_xy": [str(wrong_xy[0]), str(wrong_xy[1])],
                "label": template,
                # Was `MODEL_NAME`, which is not defined anywhere in the notebook.
                "writer_model": WRITER_MODEL_NAME,
                "created_utc": int(time.time()),
            }
        # The LLM failed all retries for this coefficient set: redraw.

    raise RuntimeError(
        f"Could not generate a valid sample for {template} "
        f"after {rand_retries} coefficient draws."
    )


In [54]:
# Generate a single FLIP_SIGN sample and display it as the cell output.
record = generate_sample("FLIP_SIGN")
record

{'problem': '0x + 5y = 1; -1x + 2y = -4',
 'coefficients': [0, 5, -1, 2, 1, -4],
 'template': 'FLIP_SIGN',
 'template_description': 'multiply both right-hand side constants by −1 before solving. the other coefficients remain the same. For example 2x + 3y = 4, 5x + 6y = 7 transforms into 2x + 3y = -4, 5x + 6y = -7',
 'answer_steps': '0x + 5y = -1\n-1x + 2y = 4\n→ 5y = -1 ⇒ y = -1/5\n→ -x + 2(-1/5) = 4 ⇒ -x - 2/5 = 4 ⇒ -x = 4 + 2/5 = 22/5 ⇒ x = -22/5\nAnswer: (-22/5, -1/5)',
 'final_xy': ['-22/5', '-1/5'],
 'wrong_xy': ['-22/5', '-1/5'],
 'label': 'FLIP_SIGN',
 'writer_model': 'gpt-4.1-mini',
 'created_utc': 1750619084}

The function `write_jsonl` generates a number `n_rows` of samples, and writes them to a JSONL file stored in filepath `path`.

In [47]:
# -----------------------------------------------------------------------------
# dump N rows to a JSONL file
# -----------------------------------------------------------------------------

def write_jsonl(path: Path, n_rows: int = 100, verbose: bool = False):
    """Generate `n_rows` samples and write them to `path`, one JSON object per line.

    Each row uses an error template chosen uniformly at random from
    lst.ERROR_TEMPLATES. The parent directory is created if needed and an
    existing file is overwritten.

    Args:
        path: destination file; a `Path` or a plain string.
        n_rows: number of samples to generate.
        verbose: if True, print every record as it is generated.

    Raises:
        RuntimeError: propagated from generate_sample when a sample cannot be
            produced; rows written before the failure remain in the file.
    """
    path = Path(path)  # also accept string paths
    path.parent.mkdir(parents=True, exist_ok=True)

    templates = list(lst.ERROR_TEMPLATES)  # loop-invariant
    with path.open("w", encoding="utf-8") as f:
        for _ in range(n_rows):
            rec = generate_sample(random.choice(templates))
            if verbose:
                print(rec)
            # ensure_ascii=False keeps the math symbols (→, ⇒, −) readable.
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"Wrote {n_rows} rows ➜ {path}")

We generate student answers and write them to `sample_data_path`.

In [48]:
# Destination of the raw generated samples.
sample_data_path = "/content/drive/MyDrive/Erdos/sample_data/sample_data.jsonl"

write_jsonl(
    Path(sample_data_path),
    n_rows=15,
    verbose=True,
)


{'problem': '-2x + -5y = -4; 0x + 2y = -3', 'coefficients': [-2, -5, 0, 2, -4, -3], 'template': 'SIGN_ERROR_EQ1', 'template_description': 'flip the sign of the coefficient *b* (coefficient for y) in the first equation. For example if the first equation is 2x + 3y = 4, it transforms into 2x - 3y = 4', 'answer_steps': '-2x - (-5y) = -4  \n0x + 2y = -3  \n→ Second: 2y = -3 ⇒ y = -3/2  \n→ Substitute y: -2x + 5(-3/2) = -4  \n-2x - 15/2 = -4 ⇒ -2x = -4 + 15/2  \n-2x = -8/2 + 15/2 = 7/2 ⇒ x = -7/4  \nAnswer: (-7/4, -3/2)', 'final_xy': ['-7/4', '-3/2'], 'wrong_xy': ['-7/4', '-3/2'], 'label': 'SIGN_ERROR_EQ1', 'writer_model': 'gpt-4.1-mini', 'created_utc': 1750618730}
{'problem': '-1x + 2y = 0; 4x + 3y = 4', 'coefficients': [-1, 2, 4, 3, 0, 4], 'template': 'FLIP_SIGN', 'template_description': 'multiply both right-hand side constants by −1 before solving. the other coefficients remain the same. For example 2x + 3y = 4, 5x + 6y = 7 transforms into 2x + 3y = -4, 5x + 6y = -7', 'answer_steps': '-1

# Step 2: Programmatic/numerical judge.

Here we use the `judge` module to test if generated student solutions have a final answer which coincides with the final answer obtained when making the intended mistake. We also get the reason why solutions are rejected (answer does not match intended answer, or collides with correct problem answer).

This can be wrapped into a function later...

In [52]:
# ---------------------------------------------------------------------
# File with the generated samples to be judged.
sample_path = "/content/drive/MyDrive/Erdos/sample_data/sample_data.jsonl"

path = pathlib.Path(sample_path)
assert path.is_file(), f"{path} doesn’t exist"

results = collections.Counter()   # verdict reason -> number of rows
bad_rows = []                     # (line number, row, reason) of every reject

with path.open("r", encoding="utf-8") as f:
    for lineno, line in enumerate(f, 1):
        row = json.loads(line)
        verdict = judge.judge_sample(row)
        results[verdict.reason] += 1

        if not verdict.is_reject:
            continue
        # keep the failing row for inspection below
        bad_rows.append((lineno, row, verdict.reason))

# Summary: one line per verdict reason.
total = sum(results.values())
print(f"\nJudged {total} rows from {path.name}")
for reason, count in results.items():
    print(f"  {reason:15s} : {count:>6}")

# Preview of the first few failures (first 80 characters of the steps).
if bad_rows:
    print("\nFirst 3 rejects:")
    for ln, row, reason in bad_rows[:3]:
        print(f"  line {ln}: {reason}  →  {row['answer_steps'][:80]}…")




Judged 15 rows from sample_data.jsonl
  ok              :     15


Does the same as above, but saves good data to `clean_path` and bad data to `rejects_path`.

Good data is saved to a JSONL file with the same name as the original file, appending '_numerical_ok' at the end of the filename. For bad data we append '_numerical_reject' instead.

In [51]:
# paths
# Input file, plus the two sibling output files derived from its stem.
raw_path = Path("/content/drive/MyDrive/Erdos/sample_data/sample_data.jsonl")
clean_path = raw_path.with_name(f"{raw_path.stem}_numerical_ok.jsonl")
rejects_path = raw_path.with_name(f"{raw_path.stem}_numerical_reject.jsonl")

keep = toss = 0   # rows accepted / rejected by the numerical judge
with raw_path.open("r", encoding="utf-8") as inp, \
     clean_path.open("w", encoding="utf-8") as ok_out, \
     rejects_path.open("w", encoding="utf-8") as bad_out:

    for line in inp:
        row = json.loads(line)
        v = judge.judge_sample(row)
        if not v.ok:
            bad_out.write(line)   # raw line is copied through unchanged
            toss += 1
            continue
        ok_out.write(line)
        keep += 1

print(f"Kept {keep} rows  ➜  {clean_path.name}")
print(f"Rejected {toss} rows ➜  {rejects_path.name}")


Kept 15 rows  ➜  sample_data_numerical_ok.jsonl
Rejected 0 rows ➜  sample_data_numerical_reject.jsonl


# Step 3: LLM judge

Here we use an LLM to check whether the student solutions that passed the numerical filter actually contain the intended reasoning mistake. This step can also be wrapped into a function later, possibly using more than one judge LLM.



The prompt given to the LLM judge will contain:


1.   General instructions on what to do (see `JUDGE_SYSTEM_MSG`).
2.   A few examples of student solutions and a judge verdict (see `FEW_SHOT_llm_judge`).
3.   The problem statement, a description of the intended error, and the student's solution.






In [102]:
"""
LLM-based reasoning checker.
Reads a JSONL of rows that have ALREADY passed numeric judging and
outputs two files:
  *_reasoning_ok.jsonl
  *_reasoning_rejects.jsonl
"""

# Judge LLM settings: model, (low) sampling temperature and reply length cap.
JUDGE_MODEL = "gpt-4.1-mini"
JUDGE_TEMPERATURE = 0.1
JUDGE_MAX_TOKENS = 60

# System prompt for the judge; the sentences are concatenated exactly as written.
JUDGE_SYSTEM_MSG = "".join([
    "You are a strict grader. You will be given a problem together with: a student's solution and a description of a mathematical mistake.",
    "You should look at the student's reasoning, and determine if it matches the described mistake. ",
    "Answer YES if the solution matches the described mistake, otherwise answer NO.",
    "Before saying YES or NO, give your concise justification in at most two sentences.",
])



In [85]:
# Closing question shared by every few-shot user turn.
_JUDGE_QUESTION = (
    "Does the student’s reasoning follow exactly that template? "
    "Respond with your reasoning and finish with YES or NO on the last line."
)

# Short mistake descriptions used inside the judge examples.
_JUDGE_SWAP_RHS = "swap the independent coefficients of the equations"
_JUDGE_SIGN_ERROR_EQ1 = "flip the sign of the y-coefficient in the first equation only"
_JUDGE_FLIP_SIGN = "multiply both right-hand side constants by −1"


def _judge_example(problem, mistake, steps, verdict):
    """Return one user/assistant exchange for the judge's few-shot prompt."""
    question = (
        f"PROBLEM:\n{problem}\n\n"
        f"TEMPLATE (intended mistake): {mistake}\n\n"
        f"STUDENT STEPS:\n{steps}\n\n"
        f"{_JUDGE_QUESTION}"
    )
    return [
        {"role": "user", "content": question},
        {"role": "assistant", "content": verdict},
    ]


# One positive (YES) and one negative (NO) example per error template.
FEW_SHOT_llm_judge = [
    # SWAP_RHS, positive: only the right-hand constants are exchanged.
    *_judge_example(
        "2x + 3y = 4 ; 5x − y = 8",
        _JUDGE_SWAP_RHS,
        "2x + 3y = 8   ← swapped\n5x − y = 4    ← swapped",
        "The constants 4 and 8 have been exchanged while every other coefficient stays the same, "
        "matching the swap-RHS template precisely.\nYES",
    ),
    # SWAP_RHS, negative: swap plus an extra sign flip on one constant.
    *_judge_example(
        "2x + 3y = 4 ; 5x − y = 8",
        _JUDGE_SWAP_RHS,
        "2x + 3y = −8  ← swapped & sign-flipped\n5x − y = 4     ← unchanged sign",
        "The student did swap the constants, but also changed the sign of 8 to −8, adding an extra error. "
        "Therefore the work does **not** match the template exactly.\nNO",
    ),
    # SIGN_ERROR_EQ1, positive: only the y-coefficient of equation 1 flips.
    *_judge_example(
        "4x + 2y = 6 ; 3x − y = 7",
        _JUDGE_SIGN_ERROR_EQ1,
        "4x − 2y = 6   ← sign flipped\n3x − y = 7     ← unchanged",
        "Only the sign of 2y in the first equation changed; the second equation stayed the same. "
        "This matches the SIGN_ERROR_EQ1 template.\nYES",
    ),
    # SIGN_ERROR_EQ1, negative: the right-hand constant is negated as well.
    *_judge_example(
        "4x + 2y = 6 ; 3x − y = 7",
        _JUDGE_SIGN_ERROR_EQ1,
        "4x − 2y = −6  ← sign flipped *and* RHS sign flipped\n3x − y = 7      ← unchanged",
        "Besides flipping the y-coefficient’s sign, the student also negated the constant 6, "
        "introducing an extra deviation. So it doesn’t follow the template exactly.\nNO",
    ),
    # FLIP_SIGN, positive: both right-hand constants negated, nothing else.
    *_judge_example(
        "3x + y = 0 ; −x + 4y = −2",
        _JUDGE_FLIP_SIGN,
        "3x + y = 0     ← RHS flipped since -0 = 0\n−x + 4y = 2    ← RHS flipped",
        "Both right-hand constants were negated (recall -0 = 0) while all other coefficients stayed the same, "
        "exactly as the FLIP_SIGN template describes.\nYES",
    ),
    # FLIP_SIGN, negative: a left-hand coefficient changes sign too.
    *_judge_example(
        "3x + y = 9 ; −x + 4y = −2",
        _JUDGE_FLIP_SIGN,
        "3x + y = −9      ← RHS flipped correctly\nx + 4y = 2       ← flipped RHS *and* sign of x-coefficient",
        "The RHS constants were negated, but the x-coefficient in the second equation also changed "
        "from −1 to +1, adding a new error. Hence this does **not** match the template.\nNO",
    ),
]


The function `reasoning_ok` evaluates a single student solution.

In [96]:
def reasoning_ok(problem: str, template_desc: str, steps: str, client, verbose = False) -> bool:
    """Ask the judge LLM whether a student solution makes exactly the intended mistake.

    Args:
        problem: the problem statement shown to the judge.
        template_desc: natural-language description of the intended mistake.
        steps: the student's written solution.
        client: an OpenAI client exposing `chat.completions.create`.
        verbose: if True, print the judge's (upper-cased) reply.

    Returns:
        True only if the last word of the judge's reply is YES, ignoring case
        and surrounding punctuation/markdown. An empty or truncated reply
        without a verdict counts as a rejection.
    """
    msgs = (
        [{"role": "system", "content": JUDGE_SYSTEM_MSG}]
        + FEW_SHOT_llm_judge
        + [
            {
                "role": "user",
                "content": (
                    f"PROBLEM:\n{problem}\n\n"
                    f"TEMPLATE (intended mistake): {template_desc}\n\n"
                    f"STUDENT STEPS:\n{steps}\n\n"
                    "Does the student’s reasoning follow **exactly** that template? "
                    "Be very brief in your justification! One sentence only."
                    "Respond with your reasoning and finish with YES or NO on the last line."
                ),
            }
        ]
    )

    # 'o3' is called without a temperature and with `max_completion_tokens`;
    # every other model gets `temperature` and `max_tokens`.
    request = {"model": JUDGE_MODEL, "messages": msgs}
    if JUDGE_MODEL == 'o3':
        request["max_completion_tokens"] = JUDGE_MAX_TOKENS
    else:
        request["temperature"] = JUDGE_TEMPERATURE
        request["max_tokens"] = JUDGE_MAX_TOKENS
    resp = client.chat.completions.create(**request)

    # The reply text can be missing (e.g. the token budget ran out); treat that
    # as an empty reply instead of crashing.
    reply = (resp.choices[0].message.content or "").strip().upper()

    if verbose:
        print(reply)

    # Take the last word that still has content once punctuation/markdown is
    # removed, so "YES.", "**YES**" or "YES ." are all read as YES.
    for token in reversed(reply.split()):
        verdict = token.strip(".,;:!?*_`'\"()[]")
        if verdict:
            return verdict == "YES"
    return False


Given student solutions stored in a JSONL file in `src_path`, the function `run_llm_judge` evaluates whether they contain the intended mistakes.

It then saves the accepted and rejected entries to two JSONL files in the same directory as the source file. Their names are the original filename plus the suffix '_reasoning_ok' or '_reasoning_rejects', respectively.

For debugging (there are many false rejections with weaker models), we print the student's reasoning whenever a solution is rejected. We should also spot-check some of the accepted solutions in case errors are slipping through.

In [94]:
def run_llm_judge(src_path: Path, verbose = False):
    """Run the LLM reasoning judge over every row of a JSONL file.

    Accepted rows are copied to `<stem>_reasoning_ok.jsonl` and rejected rows
    to `<stem>_reasoning_rejects.jsonl`, both next to `src_path`. Rejected rows
    are always printed for inspection.

    Args:
        src_path: JSONL file of rows with "problem", "template_description"
            and "answer_steps" keys; a `Path` or a plain string.
        verbose: forwarded to `reasoning_ok` (prints each judge reply).
    """
    src_path = Path(src_path)  # also accept string paths
    ok_path  = src_path.with_name(src_path.stem + "_reasoning_ok.jsonl")
    bad_path = src_path.with_name(src_path.stem + "_reasoning_rejects.jsonl")

    client = openai.OpenAI()  # uses OPENAI_API_KEY from env

    keep = toss = 0
    # The rows contain non-ASCII math symbols and were written as UTF-8, so
    # read and write with an explicit encoding instead of the platform default.
    with src_path.open("r", encoding="utf-8") as src, \
         ok_path.open("w", encoding="utf-8") as okf, \
         bad_path.open("w", encoding="utf-8") as badf:
        for line in src:
            row = json.loads(line)
            if reasoning_ok(row["problem"], row["template_description"], row["answer_steps"], client, verbose = verbose):
                okf.write(line)
                keep += 1
            else:
                print("Problem statement:", row["problem"])
                print("Error template:", row["template_description"])
                print("Student solution:", row["answer_steps"])
                badf.write(line)
                toss += 1

    print(
        f"LLM judge done: {keep} OK  |  {toss} rejects "
        f"→  {ok_path.name}, {bad_path.name}"
    )

In [103]:
# Judge the rows that passed the numerical filter, echoing each judge reply.
run_llm_judge(
    Path("/content/drive/MyDrive/Erdos/sample_data/sample_data_numerical_ok.jsonl"),
    verbose=True,
)

THE STUDENT INCORRECTLY REWROTE -5Y AS -(-5Y) = +5Y, EFFECTIVELY FLIPPING THE SIGN OF THE Y-COEFFICIENT IN THE FIRST EQUATION AS THE TEMPLATE REQUIRES. YES
THE STUDENT ONLY NEGATED THE SECOND EQUATION'S RHS, NOT BOTH AS THE TEMPLATE REQUIRES.  
NO
Problem statement: -1x + 2y = 0; 4x + 3y = 4
Error template: multiply both right-hand side constants by −1 before solving. the other coefficients remain the same. For example 2x + 3y = 4, 5x + 6y = 7 transforms into 2x + 3y = -4, 5x + 6y = -7
Student solution: -1x + 2y = 0
4x + 3y = -4
→ From the first: -x = -2y ⇒ x = 2y
→ Substitute in second: 4(2y) + 3y = -4 ⇒ 8y + 3y = -4 ⇒ 11y = -4 ⇒ y = -4/11
→ x = 2(-4/11) = -8/11
Answer: (-8/11, -4/11)
THE STUDENT FLIPPED THE SIGN OF THE Y-COEFFICIENT IN THE FIRST EQUATION FROM -2Y TO +2Y, MATCHING THE TEMPLATE EXACTLY.  
YES
THE STUDENT SWAPPED THE CONSTANTS -2 AND 1 EXACTLY AS THE TEMPLATE INSTRUCTS BEFORE SOLVING.  
YES
THE STUDENT CORRECTLY FLIPPED THE SIGN OF THE Y-COEFFICIENT IN THE FIRST EQUATIO