In [23]:
import os
import json
from openai import OpenAI
from tqdm import tqdm

In [26]:
# Read the API key from the OPENAI_API_KEY environment variable instead of
# embedding it in the notebook, so the secret is never stored in the file.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# System prompt for the LLM judge used by classify_question().
# It defines the MATH / NOT_MATH categories and instructs the model to reply
# with exactly one of those two labels and nothing else.
SYSTEM_PROMPT = """
You are a classifier that decides whether a question is a MATH problem or NOT_MATH.

Definition of MATH:
- Pure or mostly pure mathematics: arithmetic, algebra, geometry, trigonometry, calculus,
  probability, statistics, linear algebra, number theory, combinatorics, etc.
- It's okay if the question includes some context (e.g., word problems), as long as the core task
  is to solve a math problem.

Definition of NOT_MATH:
- Programming / coding questions, even if they involve math.
- Algorithmic or complexity-analysis questions.
- Physics, engineering, or other sciences.
- General reasoning or logic puzzles not explicitly math-based.
- Any other domain.

Your ENTIRE response must be exactly one token from the set:
MATH
NOT_MATH

Do not output anything else.
"""


def classify_question(question: str) -> str:
    """
    Classify a question as "MATH" or "NOT_MATH" using gpt-4.1.

    Sends SYSTEM_PROMPT and the question to the Responses API through the
    module-level ``client`` and normalizes the model's reply.

    Args:
        question: The question text to classify.

    Returns:
        "MATH" or "NOT_MATH". Any reply that, after stripping whitespace and
        upper-casing, is not exactly one of those two labels is mapped to
        "NOT_MATH".
    """
    response = client.responses.create(
        model="gpt-4.1",
        # Deterministic decoding: a classifier should give the same label
        # when the same question is seen again.
        temperature=0,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Question:\n{question}\n\nClassify this question."},
        ],
    )

    # `output_text` is the SDK's aggregate of all text parts in the response.
    # Indexing `output[0].content[0].text` directly breaks when the first item
    # is not a message or its first part is a refusal (which has no `.text`).
    out_text = response.output_text.strip().upper()

    # Normalize & fallback
    if out_text in {"MATH", "NOT_MATH"}:
        return out_text
    return "NOT_MATH"     # safe fallback

def filter_math_questions_llm(path_in: str, path_out: str):
    """
    Filter a JSONL Arena log down to the entries an LLM judge labels as MATH.

    Reads path_in line by line, classifies each entry's "question" field with
    classify_question(), and writes the kept entries to path_out as a single
    JSON array (indent=2, non-ASCII characters written literally).

    Lines that are skipped (and not counted in the totals):
      - the first line of the file, unconditionally
        (NOTE(review): presumably a header/non-record line -- confirm);
      - blank lines;
      - lines that are not valid JSON or not a JSON object (a message is printed);
      - entries with a missing or empty "question".

    Args:
        path_in: Path to the input JSONL file (read as UTF-8).
        path_out: Path of the JSON file to write (written as UTF-8).

    Returns:
        None. A summary is printed when done.
    """
    kept = 0
    total = 0
    results = []   # MATH entries, in input order

    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(path_in, "r", encoding="utf-8") as f_in:
        for i, raw_line in tqdm(enumerate(f_in)):
            # skip first line
            if i == 0:
                continue

            line = raw_line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping malformed line {i+1}")
                continue

            # A line can be valid JSON without being an object (e.g. a list
            # or a bare string); `.get` below would raise on those.
            if not isinstance(entry, dict):
                print(f"Skipping non-object line {i+1}")
                continue

            question = entry.get("question")
            if not question:
                continue

            total += 1
            label = classify_question(question)

            if label == "MATH":
                results.append(entry)
                kept += 1

    # Write a single JSON array. Because ensure_ascii=False emits raw
    # non-ASCII characters, the file must be opened as UTF-8 explicitly;
    # a non-UTF-8 default encoding could fail or corrupt the output.
    with open(path_out, "w", encoding="utf-8") as f_out:
        json.dump(results, f_out, indent=2, ensure_ascii=False)

    print(f"Done. Processed {total} entries, kept {kept} MATH entries.")
    print(f"Saved to: {path_out}")



In [27]:
# Run the LLM filter over the Arena JSONL log and save the MATH-only subset.
# NOTE: this issues one classification request per entry; the recorded run
# below took roughly 43 minutes.
input_path = "./data/arena_140k_math.jsonl"
output_path = "./data/arena_140k_math_filtered.json"
filter_math_questions_llm(input_path, output_path)

3167it [42:46,  1.23it/s]

Skipping malformed line 3167
Done. Processed 3165 entries, kept 894 MATH entries.
Saved to: ./data/arena_140k_math_filtered.json





In [28]:
import json

def reencode_json_ascii_to_utf8(path_in, path_out):
    """
    Re-save a JSON file so that non-ASCII characters are stored literally.

    The document at path_in is parsed (which turns any \\uXXXX escapes into
    real characters) and then serialized to path_out as UTF-8 with
    indent=2 and ensure_ascii=False. A confirmation line is printed.

    Args:
        path_in: Path of the JSON file to read (UTF-8).
        path_out: Path of the JSON file to write (UTF-8).
    """
    with open(path_in, "r", encoding="utf-8") as src:
        payload = json.loads(src.read())

    # Serialize first, then write the finished text in one go.
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with open(path_out, "w", encoding="utf-8") as dst:
        dst.write(serialized)

    print(f"Converted JSON saved to {path_out}")

# Re-save the filtered output to a new file with non-ASCII characters written literally.
reencode_json_ascii_to_utf8("./data/arena_140k_math_filtered.json", "./data/arena_140k_math_filtered_new.json")

Converted JSON saved to ./data/arena_140k_math_filtered_new.json
