In [1]:
### Cell 1: Setup and Configuration

import os
import asyncio
from pathlib import Path
import pandas as pd
import google.generativeai as genai
from dotenv import load_dotenv

# --- Path Definitions ---
# The notebook is expected to run from inside the 'baseline' directory, so the
# current working directory doubles as the baseline directory and its parent
# is treated as the project root.
BASELINE_DIR = NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent
DATA_DIR = PROJECT_ROOT / "data"
RAW_OUTPUT_DIR = BASELINE_DIR / "results" / "raw_api_outputs"

# --- Create output directories ---
# One folder per (check type, test set) pair, all inside the 'baseline' directory.
for check_name in ("conceptual_check", "computational_check"):
    for set_name in ("sft_test_set", "final_test_set"):
        (RAW_OUTPUT_DIR / check_name / set_name).mkdir(parents=True, exist_ok=True)

print(f"Project Root identified at: {PROJECT_ROOT}")
print(f"Baseline directory is: {BASELINE_DIR}")
print(f"Raw API outputs will be saved to: {RAW_OUTPUT_DIR}")

# --- API Key Configuration ---
# The key is read from the environment after loading the .env file that is
# expected at the project root; setup aborts early when it is missing or empty.
load_dotenv(PROJECT_ROOT / ".env")
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise ValueError("GOOGLE_API_KEY not found in the .env file at the project root.")
genai.configure(api_key=API_KEY)

print("\n✅ Setup complete. Paths and API key are configured.")

Project Root identified at: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Baseline directory is: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/baseline
Raw API outputs will be saved to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/baseline/results/raw_api_outputs

✅ Setup complete. Paths and API key are configured.


In [2]:
### Cell 2: Load Test Data

SFT_TEST_SET_PATH = DATA_DIR / "final-datasets" / "error_detection_dataset.csv"
FINAL_TEST_SET_PATH = DATA_DIR / "final-datasets" / "final-test-with-wrong-answers.csv"


def _expand_final_test_row(row_id, record):
    """Return the two samples (correct first, then flawed) built from one raw final-test row."""
    # Only the value "concep" maps to a conceptual error; every other value is
    # labelled as a computational error.
    flawed_label = "computational_error"
    if record["error_type"] == "concep":
        flawed_label = "conceptual_error"
    return [
        {
            "index": row_id,  # use the test set index
            "question": record["question"],
            "answer": record["correct_answer"],
            "error_type": "correct",
        },
        {
            "index": row_id,
            "question": record["question"],
            "answer": record["wrong_answer"],
            "error_type": flawed_label,
        },
    ]


# --- 1. Process the Final Test Set (Identical to final-testing.md) ---
# Every raw row contributes two samples: its correct solution and its flawed one.
final_test_raw_df = pd.read_csv(FINAL_TEST_SET_PATH)
final_test_data = [
    sample
    for row_id, record in final_test_raw_df.iterrows()
    for sample in _expand_final_test_row(row_id, record)
]
final_test_df = pd.DataFrame(final_test_data)


# --- 2. Process the SFT Test Set (Identical to final-testing.md) ---
# Keep only the rows marked as the 'test' split; each row yields one sample whose
# answer is the correct or the wrong solution depending on its error type.
sft_full_df = pd.read_csv(SFT_TEST_SET_PATH)
sft_test_df_filtered = sft_full_df[sft_full_df['split'] == 'test']

sft_test_data = [
    {
        "index": record["index"],  # use the original gsm8k index
        "question": record["question"],
        "answer": record["correct_answer" if record["error_type"] == "correct" else "wrong_answer"],
        "error_type": record["error_type"],
    }
    for _, record in sft_test_df_filtered.iterrows()
]
sft_test_df = pd.DataFrame(sft_test_data)


# --- 3. Final Verification ---
print(f"Loaded {len(sft_test_df)} samples from the SFT test set.")
print(f"Loaded {len(final_test_df)} samples from the final test set.")

# Both frames should expose the same columns, including 'answer'.
print(f"\nSFT Test Set Columns: {sft_test_df.columns.tolist()}")
print(f"Final Test Set Columns: {final_test_df.columns.tolist()}")

Loaded 1214 samples from the SFT test set.
Loaded 302 samples from the final test set.

SFT Test Set Columns: ['index', 'question', 'answer', 'error_type']
Final Test Set Columns: ['index', 'question', 'answer', 'error_type']


In [3]:
### Cell 3: Prompt Templates

# System instruction for the conceptual check: the model is cast as a tutor that
# reads a word problem plus a student's solution and must answer with exactly one
# word, 'correct' or 'flawed'.
SYSTEM_PROMPT_CONCEPTUAL = """You are a mathematics tutor.
You will be given a math word problem and a solution written by a student.
Carefully analyze the problem and solution LINE-BY-LINE and determine whether there are any errors in the solution.

IMPORTANT: Your entire response should consist of ONLY the single word 'correct' or 'flawed'. Do NOT provide any explanation or surrounding text."""

def format_conceptual_prompt(row):
    """Build the user message for the conceptual check.

    Args:
        row: Mapping-like sample providing the 'question' and 'answer' entries.

    Returns:
        A string with the problem section, the student's solution section and a
        trailing '### Verdict:' header, separated by blank lines.
    """
    sections = (
        f"### Problem:\n{row['question']}",
        f"### Student's Solution:\n{row['answer']}",
        "### Verdict:",
    )
    return "\n\n".join(sections)

# System instruction for the computational check: the model receives ONE line of a
# solution and must transcribe the calculation it contains, verbatim (errors
# included), wrapped in <eq>...</eq> tags, or empty tags when there is none.
# NOTE(review): the few-shot examples below are framed with "### Input:" /
# "### Output:", whereas format_computational_prompt sends "### Solution Line:"
# with no output header. Confirm the mismatch is intentional before changing
# either side, since cached baseline outputs depend on the exact prompt text.
SYSTEM_PROMPT_COMPUTATIONAL = \
"""[ROLE]
You are an expert at parsing mathematical solutions.

[TASK]
You are given a single line from a mathematical solution. Your task is to extract the calculation from this line.

**This is a literal transcription task. Follow these rules with extreme precision:**
- **RULE 1: Transcribe EXACTLY.** Do not correct mathematical errors. If a line implies `2+2=5`, your output for that line must be `2+2=5`.
- **RULE 2: Isolate the Equation.** Your output must contain ONLY the equation, with no surrounding text, units, or currency symbols.

[RESPONSE FORMAT]
Your response must ONLY contain the extracted equation, wrapped in <eq> and </eq> tags.
If the line contains no calculation, respond with empty tags: <eq></eq>.

[EXAMPLE 1]
### Input:
First find how many liters of the seawater are salt: 2 liters * 20% = .4 liters
### Output:
<eq>2*20*.01=.4</eq>

[EXAMPLE 2]
### Input:
Therefore, there are 2 $10 bills.
### Output:
<eq></eq>

[EXAMPLE 3]
### Input:
There are 100-94 = 11 people in the theater with green eyes.
### Output:
<eq>100-94=11</eq>
"""

def format_computational_prompt(line):
    """Build the user message for the computational check of one solution line.

    Args:
        line: The single solution line to be sent to the model.

    Returns:
        The line placed under a '### Solution Line:' header.
    """
    template = "### Solution Line:\n{}"
    return template.format(line)

# Visible confirmation that the cell ran and the prompt templates are defined.
print("Prompt templates are defined.")

Prompt templates are defined.


In [4]:
### Cell 4: Asynchronous API Call Logic

async def call_gemini_api_with_retries(
    prompt: str,
    system_prompt: str,
    max_attempts: int = 5,
    model_name: str = "gemini-1.5-flash-latest",
    base_delay: float = 1,
) -> str:
    """Call the Gemini API, retrying failed attempts with exponential backoff.

    Args:
        prompt: User message sent to the model.
        system_prompt: System instruction the model is created with.
        max_attempts: Maximum number of attempts before giving up.
        model_name: Gemini model to query. Defaults to the model this notebook
            was originally written against.
        base_delay: Wait before the first retry, in seconds; the wait doubles
            after every further failed attempt.

    Returns:
        The response text on success. On failure no exception is propagated for
        errors derived from ``Exception``; instead a sentinel string starting
        with "--- ERROR" is returned, so callers must check for that prefix.
    """
    model = genai.GenerativeModel(
        model_name=model_name,
        system_instruction=system_prompt
    )
    for attempt in range(max_attempts):
        try:
            # temperature 0 asks for the least random decoding.
            response = await model.generate_content_async(prompt, generation_config={"temperature": 0})
            return response.text
        except Exception as e:
            if attempt == max_attempts - 1:
                print(f"API call failed after {max_attempts} attempts. Error: {e}")
                return f"--- ERROR: {e} ---"
            wait_time = base_delay * 2 ** attempt
            print(f"API call failed (attempt {attempt + 1}). Retrying in {wait_time}s...")
            await asyncio.sleep(wait_time)
    # Only reachable when max_attempts <= 0, i.e. no attempt was made at all.
    return "--- ERROR: Max retries exceeded ---"

In [5]:
### Cell 5: Main Execution Loop

def _needs_api_call(output_path):
    """Return True when ``output_path`` holds no usable cached API response."""
    if not output_path.exists():
        return True
    # call_gemini_api_with_retries reports a permanent failure by returning a
    # string that starts with "--- ERROR". A file holding that sentinel is a
    # failed attempt, not a result, so it must be retried on the next run.
    return output_path.read_text(encoding='utf-8').startswith("--- ERROR")


async def process_sample(index, row, test_set_name):
    """Process a single sample: one conceptual call and one computational call per solution line.

    Responses are cached as text files under RAW_OUTPUT_DIR. A call is skipped
    when its output file already exists, unless that file contains the error
    sentinel of a previously failed call, in which case the call is repeated
    and the file overwritten.

    Args:
        index: DataFrame row label of the sample; used as the file stem for
            every test set other than "sft_test_set".
        row: Sample providing 'answer' (and 'index' for the SFT test set), plus
            whatever format_conceptual_prompt reads.
        test_set_name: Sub-directory name of the test set, e.g. "sft_test_set"
            or "final_test_set".
    """
    # SFT samples are keyed by the 'index' value stored in the row; all other
    # samples are keyed by their DataFrame row label.
    file_index = row['index'] if test_set_name == "sft_test_set" else index

    # --- 1. Conceptual Check ---
    conceptual_output_path = RAW_OUTPUT_DIR / "conceptual_check" / test_set_name / f"{file_index}.txt"
    if _needs_api_call(conceptual_output_path):
        conceptual_prompt = format_conceptual_prompt(row)
        response = await call_gemini_api_with_retries(conceptual_prompt, SYSTEM_PROMPT_CONCEPTUAL)
        conceptual_output_path.write_text(response, encoding='utf-8')

    # --- 2. Computational Check ---
    # Blank lines are dropped before numbering, so line numbers in the file
    # names count non-empty lines only (1-based).
    solution_lines = [line.strip() for line in row['answer'].strip().split('\n') if line.strip()]
    for i, line in enumerate(solution_lines):
        computational_output_path = RAW_OUTPUT_DIR / "computational_check" / test_set_name / f"{file_index}_line_{i+1}.txt"
        if _needs_api_call(computational_output_path):
            computational_prompt = format_computational_prompt(line)
            response = await call_gemini_api_with_retries(computational_prompt, SYSTEM_PROMPT_COMPUTATIONAL)
            computational_output_path.write_text(response, encoding='utf-8')

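# NOTE: Superseded earlier draft of process_sample, kept commented out for reference only.
# It named every output file by the DataFrame index, for both test sets.
# async def process_sample(index, row, test_set_name):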
#     """Processes a single sample: one conceptual call and multiple computational calls."""
#     # --- 1. Conceptual Check ---
#     conceptual_output_path = RAW_OUTPUT_DIR / "conceptual_check" / test_set_name / f"{index}.txt"
#     if not conceptual_output_path.exists():
#         conceptual_prompt = format_conceptual_prompt(row)
#         response = await call_gemini_api_with_retries(conceptual_prompt, SYSTEM_PROMPT_CONCEPTUAL)
#         conceptual_output_path.write_text(response, encoding='utf-8')

#     # --- 2. Computational Check ---
#     solution_lines = [line.strip() for line in row['answer'].strip().split('\n') if line.strip()]
#     for i, line in enumerate(solution_lines):
#         computational_output_path = RAW_OUTPUT_DIR / "computational_check" / test_set_name / f"{index}_line_{i+1}.txt"
#         if not computational_output_path.exists():
#             computational_prompt = format_computational_prompt(line)
#             response = await call_gemini_api_with_retries(computational_prompt, SYSTEM_PROMPT_COMPUTATIONAL)
#             computational_output_path.write_text(response, encoding='utf-8')

In [6]:
# Testing

from tqdm.asyncio import tqdm_asyncio

# --- 1. Create Subsets for the Test Run ---
# Smoke-test size: only the first NUM_TEST_SAMPLES rows of each test set are used
# by the two batch functions defined in this cell.
NUM_TEST_SAMPLES = 30
sft_test_subset = sft_test_df.head(NUM_TEST_SAMPLES)
final_test_subset = final_test_df.head(NUM_TEST_SAMPLES)

# Report the actual subset sizes (smaller than NUM_TEST_SAMPLES if a set is short).
print(f"Created a test batch with {len(sft_test_subset)} samples from the SFT test set.")
print(f"Created a test batch with {len(final_test_subset)} samples from the final test set.")


# --- 2. Define Specific Functions for the Test Runs ---
async def run_sft_test_batch():
    """Run process_sample concurrently over every row of the SFT test subset.

    One coroutine is created per row of ``sft_test_subset``; already-cached
    outputs are skipped inside process_sample itself, not here.
    """
    print("\n--- Preparing tasks for the SFT Test Set (Subset) ---")
    pending = [
        process_sample(row_label, sample, "sft_test_set")
        for row_label, sample in sft_test_subset.iterrows()
    ]

    if not pending:
        # Reached only when the subset has no rows at all.
        print("All SFT test samples already have output files. No API calls needed.")
    else:
        print(f"Starting {len(pending)} sample processing tasks for the SFT test batch...")
        await tqdm_asyncio.gather(*pending)
    print("--- ✅ SFT test run complete. ---")


async def run_final_test_batch():
    """Run process_sample concurrently over every row of the final test subset.

    One coroutine is created per row of ``final_test_subset``; already-cached
    outputs are skipped inside process_sample itself, not here.
    """
    print("\n--- Preparing tasks for the Final Test Set (Subset) ---")
    pending = [
        process_sample(row_label, sample, "final_test_set")
        for row_label, sample in final_test_subset.iterrows()
    ]

    if not pending:
        # Reached only when the subset has no rows at all.
        print("All final test samples already have output files. No API calls needed.")
    else:
        print(f"Starting {len(pending)} sample processing tasks for the final test batch...")
        await tqdm_asyncio.gather(*pending)
    print("--- ✅ Final test run complete. ---")


# --- 3. Execute the Test Runs Sequentially ---
# The two batches run one after the other; samples inside each batch run concurrently.
# Top-level `await` is used here, which works in a notebook cell but not in a plain script.
print("\n🚀 Starting small test runs...")

# Run the SFT test batch
await run_sft_test_batch()

# Run the final test batch
await run_final_test_batch()

print("\n🎉 Both test runs are finished.")
print("Please check the 'results/raw_api_outputs' directory to verify the output files.")

Created a test batch with 30 samples from the SFT test set.
Created a test batch with 30 samples from the final test set.

🚀 Starting small test runs...

--- Preparing tasks for the SFT Test Set (Subset) ---
Starting 30 sample processing tasks for the SFT test batch...


  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:00<00:00, 4269.74it/s]


--- ✅ SFT test run complete. ---

--- Preparing tasks for the Final Test Set (Subset) ---
Starting 30 sample processing tasks for the final test batch...


100%|██████████| 30/30 [00:00<00:00, 8522.70it/s]

--- ✅ Final test run complete. ---

🎉 Both test runs are finished.
Please check the 'results/raw_api_outputs' directory to verify the output files.





In [None]:
async def final_test_main():
    """Process every sample of the final test set in a single concurrent batch.

    One process_sample coroutine is created per row of ``final_test_df`` and
    all of them are awaited together; no chunking or pause is applied here.
    """
    print("--- Preparing tasks for the Final Test Set ---")
    pending = [
        process_sample(row_label, sample, "final_test_set")
        for row_label, sample in final_test_df.iterrows()
    ]

    print(f"\nCreated a total of {len(pending)} sample processing tasks for the final test set.")
    print("Starting concurrent API calls...")

    await tqdm_asyncio.gather(*pending)

    print("\n--- ✅ Final Test Set processing complete. ---")

In [19]:
# Full run over the final test set. Outputs that are already cached on disk are
# skipped inside process_sample, so re-running this cell only fills in gaps.
print("\n🚀 Starting full baseline generation for the Final Test Set...")
await final_test_main()


🚀 Starting full baseline generation for the Final Test Set...
--- Preparing tasks for the Final Test Set ---

Created a total of 302 sample processing tasks for the final test set.
Starting concurrent API calls...


100%|██████████| 302/302 [00:09<00:00, 33.04it/s]


--- ✅ Final Test Set processing complete. ---





In [7]:
async def sft_test_main(chunk_size: int = 150, pause_seconds: float = 5):
    """Process all samples in the SFT test set in sequential chunks.

    Samples inside a chunk are processed concurrently; chunks run one after the
    other to avoid overwhelming the API with too many concurrent requests.

    Args:
        chunk_size: Number of samples processed concurrently in each chunk.
        pause_seconds: Delay, in seconds, inserted between consecutive chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    total_samples = len(sft_test_df)
    num_chunks = (total_samples + chunk_size - 1) // chunk_size  # Ceiling division

    print(f"--- Preparing to process {total_samples} SFT samples in {num_chunks} chunks of size {chunk_size} ---")

    for i in range(num_chunks):
        start_index = i * chunk_size
        # Clamp to the data length so the last chunk's log line reports the real
        # final sample position instead of running past the end of the data.
        end_index = min(start_index + chunk_size, total_samples)
        chunk_df = sft_test_df.iloc[start_index:end_index]

        print(f"\n--- Processing Chunk {i + 1} of {num_chunks} (Samples {start_index} to {end_index - 1}) ---")

        # start_index < total_samples for every chunk, so the chunk is never empty.
        tasks = [process_sample(index, row, "sft_test_set") for index, row in chunk_df.iterrows()]

        print(f"Starting {len(tasks)} sample processing tasks for this chunk...")
        await tqdm_asyncio.gather(*tasks)
        print(f"--- ✅ Chunk {i + 1} complete. ---")

        if (i + 1) < num_chunks:
            # Add a small delay between chunks as a courtesy to the API
            print(f"Waiting {pause_seconds} seconds before starting the next chunk...")
            await asyncio.sleep(pause_seconds)

    print("\n--- ✅ All SFT Test Set chunks have been processed. ---")

In [8]:
# Full run over the SFT test set, processed chunk by chunk by sft_test_main.
# Outputs already cached on disk are skipped inside process_sample.
print("🚀 Starting full baseline generation for the SFT Test Set...")
await sft_test_main()

🚀 Starting full baseline generation for the SFT Test Set...
--- Preparing to process 1214 SFT samples in 9 chunks of size 150 ---

--- Processing Chunk 1 of 9 (Samples 0 to 149) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 4683.97it/s]

--- ✅ Chunk 1 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 2 of 9 (Samples 150 to 299) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 5693.01it/s]

--- ✅ Chunk 2 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 3 of 9 (Samples 300 to 449) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 6194.76it/s]

--- ✅ Chunk 3 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 4 of 9 (Samples 450 to 599) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 5455.13it/s]

--- ✅ Chunk 4 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 5 of 9 (Samples 600 to 749) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 5763.25it/s]

--- ✅ Chunk 5 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 6 of 9 (Samples 750 to 899) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 6628.10it/s]

--- ✅ Chunk 6 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 7 of 9 (Samples 900 to 1049) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 6770.18it/s]

--- ✅ Chunk 7 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 8 of 9 (Samples 1050 to 1199) ---
Starting 150 sample processing tasks for this chunk...


100%|██████████| 150/150 [00:00<00:00, 5414.80it/s]

--- ✅ Chunk 8 complete. ---
Waiting 5 seconds before starting the next chunk...






--- Processing Chunk 9 of 9 (Samples 1200 to 1349) ---
Starting 14 sample processing tasks for this chunk...


100%|██████████| 14/14 [00:00<00:00, 4874.67it/s]

--- ✅ Chunk 9 complete. ---

--- ✅ All SFT Test Set chunks have been processed. ---



