### Basic imports

In [1]:
import json
import re
from pathlib import Path
from typing import Dict, Any, List
import pandas as pd
from IPython.display import display, Markdown
from datasets import load_dataset

### Directories and Paths

In [2]:
def find_project_root():
    """Traverse upwards to find the project root, marked by the .git folder."""
    current_path = Path.cwd()
    while current_path != current_path.parent:
        if (current_path / ".git").is_dir():
            return current_path
        current_path = current_path.parent
    raise FileNotFoundError("Could not find project root. Is this a git repository?")

PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / 'data'
MANIFEST_OUTPUT_DIR = DATA_DIR / "july-8-manifests-raw"
SAMPLE_MANIFEST_DIR = DATA_DIR / "july-7-sample-manifests"
TIER_OUTPUT_DIRS = {f"tier{i}": MANIFEST_OUTPUT_DIR / f"tier{i}" for i in range(1, 6)}
TIER_SAMPLE_DIRS = {f"tier{i}": SAMPLE_MANIFEST_DIR / f"tier{i}" for i in range(1, 6)}

# Make the directory for the tier if it doesn't exist
for tier_dir in TIER_OUTPUT_DIRS.values():
    tier_dir.mkdir(parents=True, exist_ok=True)

for tier_dir in TIER_SAMPLE_DIRS.values():
    tier_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root found at: {PROJECT_ROOT}")
print(f"Data directory found at: {DATA_DIR}")
print(f"Raw manifest output directory set to: {MANIFEST_OUTPUT_DIR}")

Project root found at: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Data directory found at: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data
Raw manifest output directory set to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/july-8-manifests-raw


### Loading the dataset

In [3]:
# Load the GSM8K dataset
GSM8K_TRAIN = load_dataset("gsm8k", "main", split="train")

### Generating tier lists

In [4]:
def has_computational_division(solution_text: str) -> bool:
    """Returns True if a '/' is followed by optional whitespace and then a digit."""
    pattern = re.compile(r'/\s*\d')
    return bool(pattern.search(solution_text))

def has_float(solution_text: str) -> bool:
    """Returns True if the solution text contains a floating-point number."""
    pattern = re.compile(r'(?<!\d)\.\d+|\d+\.\d+')
    return bool(pattern.search(solution_text))

def is_symbolic(solution_text: str) -> bool:
    """Returns True if the solution contains a symbolic reasoning line (Let @ ...)."""
    pattern = re.compile(r'^Let [a-zA-Z] ', re.MULTILINE)
    return bool(pattern.search(solution_text))

def mutually_disjoint_tiers(dataset):
    tiers = {}
    symbolic_set = set(
        idx for idx, sample in enumerate(dataset)
        if is_symbolic(sample.get("answer", ""))
    )
    non_symbolic_indices = [
        idx for idx in range(len(dataset)) if idx not in symbolic_set
    ]

    # Tier 1: Only integer arithmetic (no floats, no computational division)
    tiers["tier1"] = sorted([
        idx for idx in non_symbolic_indices
        if not has_float(dataset[idx].get("answer", "")) and not has_computational_division(dataset[idx].get("answer", ""))
    ])

    # Tier 2: Float arithmetic, no computational division
    tiers["tier2"] = sorted([
        idx for idx in non_symbolic_indices
        if has_float(dataset[idx].get("answer", "")) and not has_computational_division(dataset[idx].get("answer", ""))
    ])

    # Tier 3: Computational division, no floats
    tiers["tier3"] = sorted([
        idx for idx in non_symbolic_indices
        if not has_float(dataset[idx].get("answer", "")) and has_computational_division(dataset[idx].get("answer", ""))
    ])

    # Tier 4: Both floats and computational division
    tiers["tier4"] = sorted([
        idx for idx in non_symbolic_indices
        if has_float(dataset[idx].get("answer", "")) and has_computational_division(dataset[idx].get("answer", ""))
    ])

    # Tier 5: Symbolic reasoning (Let @ ...)
    tiers["tier5"] = sorted(symbolic_set)

    return tiers

TIER_LISTS = mutually_disjoint_tiers(GSM8K_TRAIN)

# Display the number of samples in each tier
for tier, indices in TIER_LISTS.items():
    print(f"{tier:<10}: {len(indices)} samples")
print(f"{'Total':<10}: {len(GSM8K_TRAIN)} samples")

tier1     : 2767 samples
tier2     : 837 samples
tier3     : 3113 samples
tier4     : 544 samples
tier5     : 212 samples
Total     : 7473 samples


### Load the system prompt and static prefixes for each tier's user prompt

In [5]:
SYSTEM_PROMPT = """You are a data formalization expert who excels in mathematical reasoning and writing python code. You will be presented with a math word problem accompanied by a step-by-step natural language solution. You goal is to carefully and meticulously analyze the given question and solution, and formalize it by converting it into a structured json object that deconstructs the logic of the solution.

You MUST follow all rules and formatting instructions provided in the user prompt without deviation. Your entire output MUST be a single JSON object wrapped in ```json ... ```. Do not include any text or explanation before or after the JSON object."""

STATIC_PREFIXES = {}
for tier in TIER_LISTS.keys():
    prefix_file = TIER_SAMPLE_DIRS[tier] / f"{tier}_user_prompt_prefix.txt"
    with open(prefix_file, 'r', encoding='utf-8') as f:
        STATIC_PREFIXES[tier] = f.read()

# Display the prefix for tier 1
# print("Static prefix for tier 1:")
# print("="*50,"\n")
# print(STATIC_PREFIXES["tier1"])

### A simple function to append a chosen sample to a user prompt

In [6]:
def build_solution_mapping(
        index: int, 
        dataset: 'datasets.Dataset' = GSM8K_TRAIN,
        exclude_FA: bool = True
    ):
    """
    Extracts the natural language solution for a given problem index,
    cleans it, and structures it into a line-numbered dictionary.
    """
    solution_mapping = {}
    solution_text = dataset[index]["answer"]
    lines = [ln.strip() for ln in solution_text.splitlines() if ln.strip()]

    # Improved regex to handle commas in the final answer
    if lines and re.match(r"^####\s*[\d\.,]+$", lines[-1]):
        solution_mapping["FA"] = lines.pop(-1).strip()

    for i, line in enumerate(lines, 1):
        solution_mapping[f"L{i}"] = line

    if exclude_FA and "FA" in solution_mapping:
        del solution_mapping["FA"]

    return solution_mapping

def append_sample_to_user_prompt(
        tier: str, 
        index: int, 
        dataset: 'datasets.Dataset' = GSM8K_TRAIN
    ):
    """
    Appends a chosen sample from the GSM8K dataset to the user prompt for a specific tier. Returns the complete user prompt, ready to be sent to the LLM for manifest generation.
    """
    if tier not in TIER_LISTS:
        raise ValueError(f"Invalid tier: {tier}. Must be one of {list(TIER_LISTS.keys())}.")

    sample = dataset[index]
    question = sample['question']
    answer = build_solution_mapping(index, dataset)

    task_block = f"""## Input

**Index:**:
{index}

**Question:**:
{question}

**Solution mapping:**:
{answer}

## Output

"""
    return STATIC_PREFIXES[tier] + task_block

# Example usage
idx = TIER_LISTS["tier1"][0]  # Get the first index from tier 1
user_prompt = append_sample_to_user_prompt("tier1", idx, GSM8K_TRAIN)
print("User prompt for tier 1, index", idx, ":\n")
print(user_prompt)

User prompt for tier 1, index 4 :

In the TASK below, you will be given a math problem and its corresponding step-by-step solution. Each step in the solution is numbered (e.g. "L1", "L2" and so on), and many of the steps include calculator annotations (e.g. "<<20*0.1=2>>"). Your goal is to convert this information into a structured JSON object according to the following schema and detailed instructions.

# JSON Schema Definition

Your output must adhere to the following JSON structure:

```json
{
  "function_code": "A single string containing a complete, self-contained Python function that constitutes an end-to-end formalization of the solution.",
  "logical_steps": [
    {
      "line_number": "The line number from the original solution (e.g., 'L1', 'L2').",
      "question_inputs": "A (possibly empty) list of variable names with values extracted from the question text, used for the first time in this step.",
      "WK_inputs": "A (possibly empty) list of variable names with values co

### API Clients, Concurrency Limits, Models

In [7]:
# Imports for API clients and related functionality
import os
import openai
import google.generativeai as genai
import anthropic
import asyncio
import nest_asyncio
from openai import AsyncOpenAI
from anthropic import AsyncClient
from dotenv import load_dotenv

# This must be done once per kernel to allow asyncio to run in a Jupyter notebook..
nest_asyncio.apply()

# Load API Keys from .env file
load_dotenv()
print("Loaded environment variables from .env file.")

# Initialize Asynchronous API Clients
try:
    openai_client_async = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    anthropic_client_async = AsyncClient(api_key=os.getenv("ANTHROPIC_API_KEY"))
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    print("API clients initialized successfully.")
except TypeError:
    print("API key not found for one or more services. Please check your .env file.")
    # Assign None to prevent errors in subsequent cells
    openai_client_async = None
    anthropic_client_async = None

# Define API Concurrency Limits to prevent 429 "Too Many Requests" errors.
API_CONCURRENCY_LIMITS = {
    "google": 2,    
    "anthropic": 2, 
    "openai": 2,    
}
print(f"API concurrency limits set to: {API_CONCURRENCY_LIMITS}")

MODEL_DICT = {
  "openai": "gpt-4.1",
  "google": "gemini-2.5-flash"
}

MODELS = [f"{provider}_{model}" for provider, model in MODEL_DICT.items()]
print(f"Available models: {MODELS}")

Loaded environment variables from .env file.
API clients initialized successfully.
API concurrency limits set to: {'google': 2, 'anthropic': 2, 'openai': 2}
Available models: ['openai_gpt-4.1', 'google_gemini-2.5-flash']


### Main code for making API calls

In [12]:
import time
import math
import random
import datetime
from tqdm.notebook import tqdm

# --- 1. Helper Functions ---
# These two helpers are generic and can be used by any provider's function.

_anthropic_bucket = {"tokens": 50_000, "reset_at": time.monotonic() + 60}

async def _anthropic_throttle(tokens_needed: int):
    # (Code for this function is unchanged)
    global _anthropic_bucket
    while True:
        now = time.monotonic()
        if now >= _anthropic_bucket["reset_at"]:
            _anthropic_bucket = {"tokens": 50_000, "reset_at": now + 60}
        if tokens_needed <= _anthropic_bucket["tokens"]:
            _anthropic_bucket["tokens"] -= tokens_needed
            return
        else:
            to_sleep = _anthropic_bucket["reset_at"] - now
            await asyncio.sleep(max(to_sleep, 0.01))

async def with_api_retries(
        send_coroutine_factory,
        *,
        model_info: str,  # For informative logging
        max_attempts: int = 10,
        base_wait_seconds: int = 10  # Increased from 5
    ):
    """A wrapper to handle API retries with exponential backoff."""
    for attempt in range(max_attempts):
        try:
            return await send_coroutine_factory()
        except (openai.RateLimitError, anthropic.RateLimitError, Exception) as e:
            # Check for specific rate limit error types or a 429 status code in the error string
            if isinstance(e, (openai.RateLimitError, anthropic.RateLimitError)) or "429" in str(e):
                if attempt == max_attempts - 1:
                    print(f"❌ Final attempt failed for {model_info}. Giving up.")
                    raise
                
                # Exponential backoff with jitter
                wait_time = base_wait_seconds * (2 ** attempt) + random.uniform(0, 1)
                
                # More informative error message
                print(f"🕒 Rate limit on {model_info}. Retrying in {wait_time:.2f}s... (Attempt {attempt + 1}/{max_attempts})")
                await asyncio.sleep(wait_time)
            else:
                # If it's not a rate limit error, re-raise immediately
                raise
    return None

# --- 2. Provider-Specific API Calling Functions ---

async def call_openai_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int  # Pass index for logging
    ):
    """Handles an API call to OpenAI."""
    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
    model_info = f"{model} (Index {index})" # Create info string for logger
    
    response = await with_api_retries(
        lambda: openai_client_async.chat.completions.create(
            model=model, 
            messages=messages, 
            temperature=0, 
            max_tokens=4000, 
            response_format={"type": "json_object"}
        ),
        model_info=model_info # Pass info to the retry wrapper
    )
    
    text_response = response.choices[0].message.content
    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    if response.usage:
        usage["input_tokens"] = response.usage.prompt_tokens
        usage["output_tokens"] = response.usage.completion_tokens
        if hasattr(response.usage, 'prompt_tokens_details') and response.usage.prompt_tokens_details:
             usage["cached_tokens"] = response.usage.prompt_tokens_details.get("cached_tokens", 0)
    return text_response, usage

async def call_google_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int  # Pass index for logging
    ):
    """Handles an API call to Google."""
    gemini = genai.GenerativeModel(model_name=model, system_instruction=system_prompt)
    cfg = genai.types.GenerationConfig(temperature=0, max_output_tokens=4000)
    model_info = f"{model} (Index {index})" # Create info string for logger

    response = await with_api_retries(
        lambda: gemini.generate_content_async(user_prompt, generation_config=cfg),
        model_info=model_info # Pass info to the retry wrapper
    )

    text_response = response.text
    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    if response.usage_metadata:
        usage["input_tokens"] = response.usage_metadata.prompt_token_count
        usage["output_tokens"] = response.usage_metadata.candidates_token_count
    return text_response, usage

async def call_anthropic_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int  # Pass index for logging
    ):
    """Handles an API call to Anthropic, including prompt caching."""
    system_block = {"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}}
    model_info = f"{model} (Index {index})" # Create info string for logger
    
    est_tokens = math.ceil(1.2 * len(system_prompt.split()))
    await _anthropic_throttle(est_tokens)

    response = await with_api_retries(
        lambda: anthropic_client_async.messages.create(
            model=model, max_tokens=4000, temperature=0,
            system=[system_block], messages=[{"role": "user", "content": user_prompt}],
        ),
        model_info=model_info # Pass info to the retry wrapper
    )

    text_response = response.content[0].text
    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    if response.usage:
        usage["input_tokens"] = response.usage.input_tokens
        usage["output_tokens"] = response.usage.output_tokens
        usage["cached_tokens"] = response.usage.cache_read_input_tokens if response.usage.cache_read_input_tokens else 0
    return text_response, usage


# # --- 2. Per-Problem Concurrent Runner ---
# # This function runs API calls for ONE problem concurrently across all models.

# async def run_one_problem_concurrently(
#     index: int,
#     tier: str,
#     dataset: 'datasets.Dataset',
#     system_prompt: str,
#     model_dict: Dict[str, str],
#     provider_sems: Dict[str, asyncio.Semaphore],
#     output_dir: Path
# ) -> List[Dict]:
#     """
#     Generates manifests for a single problem by calling all model APIs concurrently.
#     """
#     user_prompt = append_sample_to_user_prompt(
#         tier=tier,
#         index=index,
#         dataset=dataset
#     )
    
#     tasks = []
#     # Create concurrent tasks for each model
#     for provider, model in model_dict.items():
#         async with provider_sems[provider]: # Acquire semaphore before creating the task
#             if provider == "openai":
#                 coro = call_openai_async(model, system_prompt, user_prompt, index)
#             elif provider == "google":
#                 coro = call_google_async(model, system_prompt, user_prompt, index)
#             elif provider == "anthropic":
#                 coro = call_anthropic_async(model, system_prompt, user_prompt, index)
#             else:
#                 async def unknown_provider(): raise ValueError(f"Unknown provider: {provider}")
#                 coro = unknown_provider()
        
#             task = asyncio.create_task(coro)
#             task.meta = {
#                 "provider": provider, "model": model, "index": index, "start_time": time.time()
#             }
#             tasks.append(task)
            
#     # Wait for all models to finish for this single problem
#     task_results = await asyncio.gather(*tasks, return_exceptions=True)
    
#     problem_results = []
#     # Process the results
#     for task, result in zip(tasks, task_results):
#         meta = task.meta
#         elapsed = time.time() - meta["start_time"]
#         status = "Failed"
#         usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
        
#         output_path = output_dir / str(meta['index']) / f"{meta['provider']}_{meta['model']}.txt"
#         output_path.parent.mkdir(parents=True, exist_ok=True)

#         if isinstance(result, Exception):
#             error_message = f"--- ERROR ---\nIndex: {meta['index']}, Model: {meta['model']}\n{type(result).__name__}: {result}"
#             output_path.write_text(error_message, encoding='utf-8')
#             # The retry wrapper already logs, this is for the final failure
#             print(f"❌ FINAL Error for Index {meta['index']}, Model {meta['model']}: {type(result).__name__}")
#         else:
#             text_response, usage = result
#             output_path.write_text(text_response, encoding='utf-8')
#             status = "Success"
#             print(f"✅ Success for Index {meta['index']}, Model {meta['model']} in {elapsed:.2f}s")
        
#         problem_results.append({
#             "provider": meta["provider"], "model": meta["model"], "index": meta["index"],
#             "status": status, "time_s": round(elapsed, 2),
#             "input_tokens": usage["input_tokens"], "output_tokens": usage["output_tokens"],
#             "cached_tokens": usage["cached_tokens"],
#             "utc_completed": datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
#         })
    
#     return problem_results


# # --- 3. Main Hybrid Batch Generation Function ---
# # This function processes each problem in the list SEQUENTIALLY.

# async def generate_manifests_hybrid(
#     indices_to_generate: List[int],
#     tier: str = "tier1",
#     dataset: 'datasets.Dataset' = GSM8K_TRAIN,
#     model_dict: Dict[str, str] = MODEL_DICT,
#     system_prompt: str = SYSTEM_PROMPT,
#     output_dir: Path = TIER_OUTPUT_DIRS["tier1"],
#     concurrency_limits: Dict[str, int] = API_CONCURRENCY_LIMITS
# ):
#     """
#     Runs manifest generation by processing problems sequentially, but running
#     model API calls for each problem concurrently.
#     """
#     print("--- Starting Manifest Generation (Hybrid: Sequential Problems, Concurrent Models) ---")
#     start_time = time.time()
    
#     provider_semaphores = {prov: asyncio.Semaphore(limit) for prov, limit in concurrency_limits.items()}
#     all_results = []
    
#     # Process each problem sequentially using a simple for loop
#     for index in tqdm(indices_to_generate, desc=f"Generating Manifests for {tier}"):
#         problem_results = await run_one_problem_concurrently(
#             index=index,
#             tier=tier,
#             dataset=dataset,
#             system_prompt=system_prompt,
#             model_dict=model_dict,
#             provider_sems=provider_semaphores,
#             output_dir=output_dir
#         )
#         all_results.extend(problem_results)

#     # Create and save the performance DataFrame
#     df = pd.DataFrame(all_results)
#     run_ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
#     csv_path = output_dir / f"generation_performance_{run_ts}.csv"
#     df.to_csv(csv_path, index=False)
    
#     end_time = time.time()
#     print(f"\n--- Manifest Generation Complete ---")
#     print(f"Processed {len(indices_to_generate)} indices in {end_time - start_time:.2f} seconds.")
#     print(f"Performance log saved to: {csv_path}")
    
#     return df

# --- 3. Per-Problem Orchestration (Dispatcher Logic) ---

async def run_one_problem_async(
    index: int, 
    tier: str,
    dataset: 'datasets.Dataset',
    system_prompt: str,
    model_dict: Dict[str, str],
    provider_sems: Dict[str, asyncio.Semaphore], 
    output_dir: Path,
    pbar: tqdm
) -> List[Dict]:
    """
    Generates manifests for a single problem and returns a list of result dictionaries.
    """
    user_prompt = append_sample_to_user_prompt(
        tier=tier,
        index=index,
        dataset=dataset
    )
    
    problem_results = []
    
    tasks = []
    for provider, model in model_dict.items():
        async with provider_sems[provider]: # Acquire semaphore before creating the task
            if provider == "openai":
                coro = call_openai_async(model, system_prompt, user_prompt, index)
            elif provider == "google":
                coro = call_google_async(model, system_prompt, user_prompt, index)
            elif provider == "anthropic":
                coro = call_anthropic_async(model, system_prompt, user_prompt, index)
            else:
                # Create a coroutine that will immediately raise an error
                async def unknown_provider(): raise ValueError(f"Unknown provider: {provider}")
                coro = unknown_provider()
        
            task = asyncio.create_task(coro)
            task.meta = {
                "provider": provider, 
                "model": model, 
                "index": index, 
                "start_time": time.time()}
            tasks.append(task)
        
    task_results = await asyncio.gather(*tasks, return_exceptions=True)
    
    for task, result in zip(tasks, task_results):
        meta = task.meta
        elapsed = time.time() - meta["start_time"]
        status = "Failed"
        usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
        
        output_path = output_dir / str(meta['index']) / f"{meta['provider']}_{meta['model']}.txt"
        output_path.parent.mkdir(parents=True, exist_ok=True)

        if isinstance(result, Exception):
            error_message = f"--- ERROR ---\nIndex: {meta['index']}, Model: {meta['model']}\n{type(result).__name__}: {result}"
            output_path.write_text(error_message, encoding='utf-8')
            print(f"❌ Error for Index {meta['index']}, Model {meta['model']}: {type(result).__name__}")
        else:
            text_response, usage = result
            output_path.write_text(text_response, encoding='utf-8')
            status = "Success"
        
        # Append to the list of results for this problem
        problem_results.append({
            "provider": meta["provider"], 
            "model": meta["model"], 
            "index": meta["index"],
            "status": status, 
            "time_s": round(elapsed, 2),
            "input_tokens": usage["input_tokens"], "output_tokens": usage["output_tokens"],
            "cached_tokens": usage["cached_tokens"],
            "utc_completed": datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
        })
    
    pbar.update(1)
    # Return result for this problem
    return problem_results


# --- 4. Main Batch Generation Function ---

async def generate_manifests_parallel(
    indices_to_generate: List[int],
    tier: str = "tier1",
    dataset: 'datasets.Dataset' = GSM8K_TRAIN,
    model_dict: Dict[str, str] = MODEL_DICT,
    system_prompt: str = SYSTEM_PROMPT,
    concurrency_limits: Dict[str, int] = API_CONCURRENCY_LIMITS
):
    """
    Runs the manifest generation process and returns a DataFrame with performance stats.
    """
    print("--- Starting Manifest Generation ---")
    start_time = time.time()
    
    provider_semaphores = {prov: asyncio.Semaphore(limit) for prov, limit in concurrency_limits.items()}

    output_dir = TIER_OUTPUT_DIRS[tier]
    
    with tqdm(total=len(indices_to_generate), desc="Generating Manifests") as pbar:
        problem_tasks = [
            run_one_problem_async(
                index=idx, 
                tier=tier,
                dataset=dataset,
                system_prompt=system_prompt,
                model_dict=model_dict,
                provider_sems=provider_semaphores, 
                output_dir=output_dir,
                pbar=pbar
            )
            for idx in indices_to_generate
        ]
        # This will now be a list of lists, e.g., [[results_for_p0], [results_for_p1], ...]
        all_results = await asyncio.gather(*problem_tasks)

    # Flatten the list of lists into a single list of result dictionaries
    flat_results = [item for sublist in all_results for item in sublist]

    # Create and save the performance DataFrame
    df = pd.DataFrame(flat_results)
    run_ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    csv_path = output_dir / f"generation_performance_{run_ts}.csv"
    df.to_csv(csv_path, index=False)
    
    end_time = time.time()
    print(f"\n--- Manifest Generation Complete ---")
    print(f"Processed {len(indices_to_generate)} indices in {end_time - start_time:.2f} seconds.")
    print(f"Performance log saved to: {csv_path}")
    
    return df


### Running the manifest generation process

In [13]:
# Choose the list of indices to generate manifests for
UPPER_LIMIT = 500
INDICES_TO_GENERATE = [idx for idx in TIER_LISTS["tier1"] if idx <= UPPER_LIMIT]

# Run the manifest generation process
perf_df = await generate_manifests_parallel(indices_to_generate=INDICES_TO_GENERATE)

--- Starting Manifest Generation ---


Generating Manifests:   0%|          | 0/187 [00:00<?, ?it/s]

I0000 00:00:1751997005.543962 11806203 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


🕒 Rate limit on gpt-4.1 (Index 79). Retrying in 10.89s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 431). Retrying in 10.03s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 78). Retrying in 10.53s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 21). Retrying in 10.62s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 174). Retrying in 10.30s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 53). Retrying in 10.32s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 294). Retrying in 10.64s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 264). Retrying in 10.04s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 208). Retrying in 10.49s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 440). Retrying in 10.58s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 396). Retrying in 10.14s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 138). Retrying in 10.91s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 142). Retrying in 10.48s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 423). Retry

CancelledError: 

### Display the performance DataFrame

In [None]:
display(perf_df)

Unnamed: 0,provider,model,index,status,time_s,input_tokens,output_tokens,cached_tokens,utc_completed
0,openai,gpt-4.1,4,Success,35.76,4548,453,4352,2025-07-08T16:22:55+00:00
1,google,gemini-2.5-flash,4,Success,35.76,5115,636,0,2025-07-08T16:22:55+00:00
2,openai,gpt-4.1,6,Success,7.9,4585,473,4352,2025-07-08T16:22:27+00:00
3,google,gemini-2.5-flash,6,Success,7.9,5157,695,0,2025-07-08T16:22:27+00:00
4,openai,gpt-4.1,7,Success,9.82,4671,519,4352,2025-07-08T16:22:29+00:00
5,google,gemini-2.5-flash,7,Success,9.82,5229,697,0,2025-07-08T16:22:29+00:00
6,openai,gpt-4.1,17,Success,16.91,4755,879,4352,2025-07-08T16:22:36+00:00
7,google,gemini-2.5-flash,17,Success,16.91,5340,868,0,2025-07-08T16:22:36+00:00
8,openai,gpt-4.1,18,Success,10.56,4664,748,4352,2025-07-08T16:22:30+00:00
9,google,gemini-2.5-flash,18,Success,10.56,5224,1124,0,2025-07-08T16:22:30+00:00


### Helper function to concatenate manifests (useful for sharing in AI chat)

In [None]:
# def concatenate_complete_manifests(
#         indices: List[int], 
#         models: List[str], 
#         manifest_dir: Path, 
#         dataset: List[Dict[str, Any]]
#     ) -> str:
#     """
#     For each index, concatenate the wrapper output and manifest outputs for all models.
#     Returns the full text as a string.
#     """
#     def wrapper(index: int):
#         sample = dataset[index]
#         to_return = "=" * 50 + "\n\n"
#         to_return += f"**Index:** {index}\n\n"
#         to_return += "**Question:**\n"
#         to_return += f"{sample['question']}\n\n"
#         to_return += "**Solution mapping:**\n"
#         to_return += f"{build_solution_mapping(index, dataset)}\n\n"
#         return to_return

#     all_text = ""
#     for index in indices:
#         all_text += wrapper(index)
#         for model in models:
#             filepath = manifest_dir / f"{index}" / f"{model}.txt"
#             with open(filepath, 'r', encoding='utf-8') as f:
#                 content = f.read().strip()
#             all_text += f"--- {model} Output for Index {index} ---\n{content}\n\n"
#     return all_text.strip()

# # Usage and save to file
# initial_manifests_text = concatenate_complete_manifests(
#     indices=INDICES_TO_GENERATE, 
#     models=MODELS, 
#     manifest_dir=MANIFEST_OUTPUT_DIR, 
#     dataset=GSM8K_TRAIN
# )
# with open(MANIFEST_OUTPUT_DIR / "initial_manifests.txt", 'w', encoding='utf-8') as f:
#     f.write(initial_manifests_text)

In [None]:
import asyncio
import time
from collections import deque
import google.generativeai as genai
import threading # For thread-safe deque access in a multi-threaded context (optional, if using threads)

# Configure your API key (replace with your actual key)
# genai.configure(api_key="YOUR_API_KEY")

# --- Asynchronous Client Setup ---
# Initialize the asynchronous model
# Note: genai.GenerativeModel can be used asynchronously directly.
# The async client handles the underlying aiohttp requests.
model = genai.GenerativeModel('gemini-2.5-flash')

# --- Asynchronous Client-side RPM Tracker ---
# Use a deque for timestamps.
# In an async context, if multiple coroutines are modifying this, it's inherently single-threaded
# within the event loop, but if you were to spawn multiple processes or actual threads,
# you'd need synchronization (e.g., threading.Lock). For pure asyncio, it's fine.
request_timestamps = deque()
MAX_REQUESTS_PER_MINUTE = 60 # Example limit: adjust based on actual API limits

# An asyncio Lock to protect the deque if multiple tasks are modifying it concurrently
# (though for a single shared deque, careful design can avoid explicit locking).
# For simplicity in this pattern, we'll manage the deque in a way that minimizes race conditions.
# However, if you were dynamically creating many concurrent tasks, a lock would be safer.
deque_lock = asyncio.Lock()


async def make_api_request_with_rpm_check_async(prompt: str, task_id: int):
    """
    Makes an asynchronous API request, respecting RPM limits.
    """
    global request_timestamps

    while True:
        current_time = time.time() # Use time.time() for wall-clock time
        # Acquire lock to safely modify shared deque
        async with deque_lock:
            # Remove timestamps older than 60 seconds
            while request_timestamps and request_timestamps[0] < current_time - 60:
                request_timestamps.popleft()

            # Check if we are about to exceed the limit
            if len(request_timestamps) >= MAX_REQUESTS_PER_MINUTE:
                # Calculate how long to wait until the oldest request falls out of the window
                time_to_wait = (request_timestamps[0] + 60) - current_time
                if time_to_wait > 0:
                    print(f"Task {task_id}: RPM limit approaching. Waiting {time_to_wait:.2f} seconds...")
                    # Release lock before awaiting sleep
                    await asyncio.sleep(time_to_wait)
                    continue # Re-evaluate after waiting

            # If we reach here, we're allowed to make a request
            request_timestamps.append(current_time)
            current_estimated_rpm = len(request_timestamps)
            # Release lock before making the actual API call
            break

    try:
        # Make the actual asynchronous API call
        response = await model.generate_content_async(prompt)
        print(f"Task {task_id}: Request successful. Current Estimated RPM: {current_estimated_rpm}. Response: {response.text[:50]}...")
        return response
    except Exception as e:
        print(f"Task {task_id}: API Request Failed: {e}")
        # Implement specific error handling (e.g., retries for RESOURCE_EXHAUSTED)
        return None

async def main():
    test_prompts = [f"Tell me a short story about a cat named {i}." for i in range(1, 101)] # More prompts for concurrent testing

    # Create a list of tasks
    tasks = []
    for i, prompt in enumerate(test_prompts):
        task = asyncio.create_task(make_api_request_with_rpm_check_async(prompt, i + 1))
        tasks.append(task)

    # Await all tasks to complete
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    start_time = time.time()
    asyncio.run(main())
    end_time = time.time()
    print(f"\nAll tasks completed in {end_time - start_time:.2f} seconds.")


Task 61: RPM limit approaching. Waiting 60.00 seconds...
Task 51: Request successful. Current Estimated RPM: 51. Response: 51 wasn't like other cats. He appeared one Tuesday...
Task 38: Request successful. Current Estimated RPM: 38. Response: 38 was not a name for a cat. It was a serial numbe...
Task 48: Request successful. Current Estimated RPM: 48. Response: In the chaotic symphony of a busy animal shelter, ...
Task 3: Request successful. Current Estimated RPM: 3. Response: Three wasn't just a number; Three was a cat. A sle...
Task 58: Request successful. Current Estimated RPM: 58. Response: The first thing you noticed about 58 was his name....
Task 37: Request successful. Current Estimated RPM: 37. Response: In a facility of cool, clinical whites and humming...
Task 7: Request successful. Current Estimated RPM: 7. Response: The first light of dawn was always 7's favorite. N...
Task 13: Request successful. Current Estimated RPM: 13. Response: The name 13 wasn't given out of malice, b

KeyboardInterrupt: 

In [14]:
import time
import math
import random
import datetime
from tqdm.notebook import tqdm
from collections import deque

# --- 1. Helper Functions ---
# These helpers are generic and can be used by any provider's function.

_anthropic_bucket = {"tokens": 50_000, "reset_at": time.monotonic() + 60}

# RPM tracking for OpenAI
_openai_request_timestamps = deque()
_openai_rpm_lock = asyncio.Lock()
OPENAI_MAX_RPM = 300  # Stay below your 500 RPM limit

# RPM tracking for Google
_google_request_timestamps = deque()
_google_rpm_lock = asyncio.Lock()
GOOGLE_MAX_RPM = 300  # Google's free tier limit is typically 60 RPM, stay below it

async def _anthropic_throttle(tokens_needed: int):
    global _anthropic_bucket
    while True:
        now = time.monotonic()
        if now >= _anthropic_bucket["reset_at"]:
            _anthropic_bucket = {"tokens": 50_000, "reset_at": now + 60}
        if tokens_needed <= _anthropic_bucket["tokens"]:
            _anthropic_bucket["tokens"] -= tokens_needed
            return
        else:
            to_sleep = _anthropic_bucket["reset_at"] - now
            await asyncio.sleep(max(to_sleep, 0.01))

async def _openai_rpm_check():
    """Ensure we don't exceed OpenAI RPM limits"""
    global _openai_request_timestamps
    
    async with _openai_rpm_lock:
        current_time = time.time()
        
        # Remove timestamps older than 60 seconds
        while _openai_request_timestamps and _openai_request_timestamps[0] < current_time - 60:
            _openai_request_timestamps.popleft()
        
        # Check if we need to wait
        if len(_openai_request_timestamps) >= OPENAI_MAX_RPM:
            wait_time = (_openai_request_timestamps[0] + 60) - current_time
            if wait_time > 0:
                print(f"🕒 OpenAI RPM limit reached. Waiting {wait_time:.2f} seconds...")
                await asyncio.sleep(wait_time)
                # Re-check after waiting
                return await _openai_rpm_check()
        
        # Record this request
        _openai_request_timestamps.append(current_time)

async def _google_rpm_check():
    """Ensure we don't exceed Google RPM limits"""
    global _google_request_timestamps
    
    async with _google_rpm_lock:
        current_time = time.time()
        
        # Remove timestamps older than 60 seconds
        while _google_request_timestamps and _google_request_timestamps[0] < current_time - 60:
            _google_request_timestamps.popleft()
        
        # Check if we need to wait
        if len(_google_request_timestamps) >= GOOGLE_MAX_RPM:
            wait_time = (_google_request_timestamps[0] + 60) - current_time
            if wait_time > 0:
                print(f"🕒 Google RPM limit reached. Waiting {wait_time:.2f} seconds...")
                await asyncio.sleep(wait_time)
                # Re-check after waiting
                return await _google_rpm_check()
        
        # Record this request
        _google_request_timestamps.append(current_time)

async def with_api_retries(
        send_coroutine_factory,
        *,
        model_info: str,  # For informative logging
        max_attempts: int = 10,
        base_wait_seconds: int = 10
    ):
    """A wrapper to handle API retries with exponential backoff."""
    for attempt in range(max_attempts):
        try:
            return await send_coroutine_factory()
        except (openai.RateLimitError, anthropic.RateLimitError, Exception) as e:
            # Check for specific rate limit error types or a 429 status code in the error string
            if isinstance(e, (openai.RateLimitError, anthropic.RateLimitError)) or "429" in str(e):
                if attempt == max_attempts - 1:
                    print(f"❌ Final attempt failed for {model_info}. Giving up.")
                    raise
                
                # Exponential backoff with jitter
                wait_time = base_wait_seconds * (2 ** attempt) + random.uniform(0, 1)
                
                # More informative error message
                print(f"🕒 Rate limit on {model_info}. Retrying in {wait_time:.2f}s... (Attempt {attempt + 1}/{max_attempts})")
                await asyncio.sleep(wait_time)
            else:
                # If it's not a rate limit error, re-raise immediately
                raise
    return None

# --- 2. Provider-Specific API Calling Functions ---

async def call_openai_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int
    ):
    """Handles an API call to OpenAI with RPM limiting."""
    # Check RPM limits before making the request
    await _openai_rpm_check()
    
    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
    model_info = f"{model} (Index {index})"
    
    response = await with_api_retries(
        lambda: openai_client_async.chat.completions.create(
            model=model, 
            messages=messages, 
            temperature=0, 
            max_tokens=4000, 
            response_format={"type": "json_object"}
        ),
        model_info=model_info
    )
    
    text_response = response.choices[0].message.content
    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    if response.usage:
        usage["input_tokens"] = response.usage.prompt_tokens
        usage["output_tokens"] = response.usage.completion_tokens
        if hasattr(response.usage, 'prompt_tokens_details') and response.usage.prompt_tokens_details:
             usage["cached_tokens"] = response.usage.prompt_tokens_details.get("cached_tokens", 0)
    return text_response, usage

async def call_google_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int
    ):
    """Handles an API call to Google with RPM limiting."""
    # Check RPM limits before making the request
    await _google_rpm_check()
    
    gemini = genai.GenerativeModel(model_name=model, system_instruction=system_prompt)
    cfg = genai.types.GenerationConfig(temperature=0, max_output_tokens=4000)
    model_info = f"{model} (Index {index})"

    response = await with_api_retries(
        lambda: gemini.generate_content_async(user_prompt, generation_config=cfg),
        model_info=model_info
    )

    text_response = response.text
    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    if response.usage_metadata:
        usage["input_tokens"] = response.usage_metadata.prompt_token_count
        usage["output_tokens"] = response.usage_metadata.candidates_token_count
    return text_response, usage

async def call_anthropic_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int
    ):
    """Handles an API call to Anthropic, including prompt caching."""
    system_block = {"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}}
    model_info = f"{model} (Index {index})"
    
    est_tokens = math.ceil(1.2 * len(system_prompt.split()))
    await _anthropic_throttle(est_tokens)

    response = await with_api_retries(
        lambda: anthropic_client_async.messages.create(
            model=model, max_tokens=4000, temperature=0,
            system=[system_block], messages=[{"role": "user", "content": user_prompt}],
        ),
        model_info=model_info
    )

    text_response = response.content[0].text
    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    if response.usage:
        usage["input_tokens"] = response.usage.input_tokens
        usage["output_tokens"] = response.usage.output_tokens
        usage["cached_tokens"] = response.usage.cache_read_input_tokens if response.usage.cache_read_input_tokens else 0
    return text_response, usage



In [15]:
# --- 3. Fixed Single API Call Function ---

async def run_single_api_call(
    provider: str,
    model: str,
    index: int,
    tier: str,
    dataset: 'datasets.Dataset',
    system_prompt: str,
    output_dir: Path,
    provider_sem: asyncio.Semaphore
) -> Dict:
    """
    Runs a single API call for one provider/model/problem combination.
    Semaphore is acquired HERE, before any async operations.
    """
    async with provider_sem:  # Acquire semaphore at the start
        user_prompt = append_sample_to_user_prompt(
            tier=tier,
            index=index,
            dataset=dataset
        )
        
        start_time = time.time()
        status = "Failed"
        usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
        
        try:
            # Call the appropriate API function
            if provider == "openai":
                text_response, usage = await call_openai_async(model, system_prompt, user_prompt, index)
            elif provider == "google":
                text_response, usage = await call_google_async(model, system_prompt, user_prompt, index)
            elif provider == "anthropic":
                text_response, usage = await call_anthropic_async(model, system_prompt, user_prompt, index)
            else:
                raise ValueError(f"Unknown provider: {provider}")
            
            status = "Success"
            
        except Exception as e:
            text_response = f"--- ERROR ---\nIndex: {index}, Model: {model}\n{type(e).__name__}: {e}"
            print(f"❌ Error for Index {index}, Model {model}: {type(e).__name__}")
        
        # Save the output
        output_path = output_dir / str(index) / f"{provider}_{model}.txt"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(text_response, encoding='utf-8')
        
        elapsed = time.time() - start_time
        
        return {
            "provider": provider,
            "model": model,
            "index": index,
            "status": status,
            "time_s": round(elapsed, 2),
            "input_tokens": usage["input_tokens"],
            "output_tokens": usage["output_tokens"],
            "cached_tokens": usage["cached_tokens"],
            "utc_completed": datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
        }

# --- 4. Batch Processing Function ---

async def generate_manifests_batched(
    indices_to_generate: List[int],
    tier: str = "tier1",
    dataset: 'datasets.Dataset' = GSM8K_TRAIN,
    model_dict: Dict[str, str] = MODEL_DICT,
    system_prompt: str = SYSTEM_PROMPT,
    concurrency_limits: Dict[str, int] = API_CONCURRENCY_LIMITS,
    batch_size: int = 20  # Process 20 problems at a time
):
    """
    Runs manifest generation in batches to avoid overwhelming APIs.
    """
    print("--- Starting Manifest Generation (Batched) ---")
    start_time = time.time()
    
    provider_semaphores = {prov: asyncio.Semaphore(limit) for prov, limit in concurrency_limits.items()}
    output_dir = TIER_OUTPUT_DIRS[tier]
    all_results = []
    
    # Process in batches
    for batch_start in range(0, len(indices_to_generate), batch_size):
        batch_indices = indices_to_generate[batch_start:batch_start + batch_size]
        batch_num = batch_start // batch_size + 1
        total_batches = math.ceil(len(indices_to_generate) / batch_size)
        
        print(f"\n--- Processing Batch {batch_num}/{total_batches} (Indices {batch_indices[0]} to {batch_indices[-1]}) ---")
        
        # Create all tasks for this batch
        batch_tasks = []
        for index in batch_indices:
            for provider, model in model_dict.items():
                task = asyncio.create_task(
                    run_single_api_call(
                        provider=provider,
                        model=model,
                        index=index,
                        tier=tier,
                        dataset=dataset,
                        system_prompt=system_prompt,
                        output_dir=output_dir,
                        provider_sem=provider_semaphores[provider]
                    )
                )
                batch_tasks.append(task)
        
        # Wait for all tasks in this batch to complete
        with tqdm(total=len(batch_tasks), desc=f"Batch {batch_num}", leave=False) as pbar:
            batch_results = []
            for task in asyncio.as_completed(batch_tasks):
                result = await task
                batch_results.append(result)
                pbar.update(1)
        
        all_results.extend(batch_results)
        print(f"✅ Batch {batch_num} completed: {len(batch_results)} API calls")
        
        # Brief pause between batches to be nice to the APIs
        if batch_start + batch_size < len(indices_to_generate):
            await asyncio.sleep(2)
    
    # Create and save the performance DataFrame
    df = pd.DataFrame(all_results)
    run_ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    csv_path = output_dir / f"generation_performance_{run_ts}.csv"
    df.to_csv(csv_path, index=False)
    
    end_time = time.time()
    print(f"\n--- Manifest Generation Complete ---")
    print(f"Processed {len(indices_to_generate)} indices in {end_time - start_time:.2f} seconds.")
    print(f"Performance log saved to: {csv_path}")
    
    # Print summary statistics
    success_count = len(df[df['status'] == 'Success'])
    total_calls = len(df)
    print(f"Success rate: {success_count}/{total_calls} ({100*success_count/total_calls:.1f}%)")
    
    return df

# --- 5. Alternative: Fully Parallel with Better Semaphore Management ---

async def generate_manifests_parallel_fixed(
    indices_to_generate: List[int],
    tier: str = "tier1",
    dataset: 'datasets.Dataset' = GSM8K_TRAIN,
    model_dict: Dict[str, str] = MODEL_DICT,
    system_prompt: str = SYSTEM_PROMPT,
    concurrency_limits: Dict[str, int] = API_CONCURRENCY_LIMITS
):
    """
    Fully parallel version with proper semaphore management.
    """
    print("--- Starting Manifest Generation (Parallel Fixed) ---")
    start_time = time.time()
    
    provider_semaphores = {prov: asyncio.Semaphore(limit) for prov, limit in concurrency_limits.items()}
    output_dir = TIER_OUTPUT_DIRS[tier]
    
    # Create ALL tasks upfront
    all_tasks = []
    for index in indices_to_generate:
        for provider, model in model_dict.items():
            task = asyncio.create_task(
                run_single_api_call(
                    provider=provider,
                    model=model,
                    index=index,
                    tier=tier,
                    dataset=dataset,
                    system_prompt=system_prompt,
                    output_dir=output_dir,
                    provider_sem=provider_semaphores[provider]
                )
            )
            all_tasks.append(task)
    
    print(f"Created {len(all_tasks)} total API call tasks")
    
    # Execute all tasks with progress bar
    with tqdm(total=len(all_tasks), desc="API Calls") as pbar:
        results = []
        for task in asyncio.as_completed(all_tasks):
            result = await task
            results.append(result)
            pbar.update(1)
    
    # Create and save the performance DataFrame
    df = pd.DataFrame(results)
    run_ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    csv_path = output_dir / f"generation_performance_{run_ts}.csv"
    df.to_csv(csv_path, index=False)
    
    end_time = time.time()
    print(f"\n--- Manifest Generation Complete ---")
    print(f"Processed {len(indices_to_generate)} indices in {end_time - start_time:.2f} seconds.")
    print(f"Performance log saved to: {csv_path}")
    
    # Print summary statistics
    success_count = len(df[df['status'] == 'Success'])
    total_calls = len(df)
    print(f"Success rate: {success_count}/{total_calls} ({100*success_count/total_calls:.1f}%)")
    
    return df

In [16]:
# # Option 1: Batched (recommended for stability)
# perf_df = await generate_manifests_batched(indices_to_generate=INDICES_TO_GENERATE)

# Choose the list of indices to generate manifests for
UPPER_LIMIT = 3000
INDICES_TO_GENERATE = [idx for idx in TIER_LISTS["tier1"] if idx <= UPPER_LIMIT]

# Option 2: Fully parallel (faster but may still hit some rate limits)
perf_df = await generate_manifests_parallel_fixed(indices_to_generate=INDICES_TO_GENERATE)

--- Starting Manifest Generation (Parallel Fixed) ---
Created 2276 total API call tasks


API Calls:   0%|          | 0/2276 [00:00<?, ?it/s]

🕒 Rate limit on gpt-4.1 (Index 24). Retrying in 10.59s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 32). Retrying in 10.92s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 32). Retrying in 20.50s... (Attempt 2/10)
🕒 Rate limit on gpt-4.1 (Index 51). Retrying in 10.73s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 85). Retrying in 10.87s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 97). Retrying in 10.67s... (Attempt 1/10)
❌ Error for Index 216, Model gemini-2.5-flash: ValueError
🕒 Rate limit on gpt-4.1 (Index 103). Retrying in 10.00s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 131). Retrying in 10.12s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 131). Retrying in 20.31s... (Attempt 2/10)
🕒 Rate limit on gpt-4.1 (Index 146). Retrying in 10.94s... (Attempt 1/10)
🕒 Rate limit on gpt-4.1 (Index 146). Retrying in 20.66s... (Attempt 2/10)
🕒 Rate limit on gpt-4.1 (Index 146). Retrying in 40.38s... (Attempt 3/10)
🕒 Rate limit on gpt-4.1 (Index 190). Retrying in 10.79s... (