#### Cell 1: Paths

In [1]:
from pathlib import Path

def find_project_root():
    """Walk upwards from the current working directory to the git project root.

    A directory is treated as the project root when it contains a ``.git``
    entry. The entry may be a directory (regular clone) or a plain file
    (git worktrees and submodules store a ``.git`` *file* that points at the
    real git directory), so existence is checked rather than "is a directory".

    Returns:
        Path: The closest directory, starting at ``Path.cwd()`` and moving
        towards the filesystem root, that contains a ``.git`` entry.

    Raises:
        FileNotFoundError: If no directory up to and including the filesystem
            root contains a ``.git`` entry.
    """
    start = Path.cwd()
    # `start.parents` ends with the filesystem root, so the root itself is
    # inspected as well before giving up.
    for candidate in (start, *start.parents):
        if (candidate / ".git").exists():
            return candidate
    raise FileNotFoundError("Could not find project root. Is this a git repository?")

# All paths below are derived from the repository root.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT.joinpath('data')

# Generated manifest txt files are written here
MANIFEST_OUTPUT_DIR = DATA_DIR.joinpath("tier-manifests-gen-txt")

# Sample manifest json files, questions/answers, and user prompt prefixes live here
MANIFEST_EXAMPLES_DIR = DATA_DIR.joinpath("tier-manifests-examples-json")

# One sub-directory per tier ("tier1" ... "tier5") under each base directory
TIER_OUTPUT_DIRS = {
    name: MANIFEST_OUTPUT_DIR / name for name in map("tier{}".format, range(1, 6))
}
TIER_EXAMPLES_DIRS = {
    name: MANIFEST_EXAMPLES_DIR / name for name in map("tier{}".format, range(1, 6))
}

# Create every tier directory (outputs first, then examples); existing ones are kept
for tier_dir in [*TIER_OUTPUT_DIRS.values(), *TIER_EXAMPLES_DIRS.values()]:
    tier_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root found at: {PROJECT_ROOT}")
print(f"Data directory found at: {DATA_DIR}")
print(f"Raw manifest output directory set to: {MANIFEST_OUTPUT_DIR}")

Project root found at: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Data directory found at: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data
Raw manifest output directory set to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/tier-manifests-gen-txt


#### Cell 2: Dataset and tier definitions

In [None]:
import datasets
from datasets import load_dataset 
import re

# Training split of the "main" configuration of GSM8K, obtained through
# `datasets.load_dataset`. Used below as the default dataset for tiering and
# prompt construction.
GSM8K_TRAIN = load_dataset("gsm8k", "main", split="train")

def has_computational_division(solution_text: str):
    """Report whether the text contains a '/' that is followed by a digit.

    Any amount of whitespace (including none) may separate the slash from the
    digit, so both ``10/2`` and ``10 / 2`` count, while ``km/h`` does not.
    """
    return re.search(r'/\s*\d', solution_text) is not None


def has_float(solution_text: str):
    """Report whether the text contains a decimal number.

    Two spellings are recognised: digits on both sides of the point (``3.5``)
    and a leading point that is not preceded by a digit (``.5``).
    """
    decimal_match = re.search(r'(?<!\d)\.\d+|\d+\.\d+', solution_text)
    return decimal_match is not None


def is_symbolic(solution_text: str):
    """Report whether any line starts with ``Let <single letter> `` (e.g. "Let x be ...").

    The match is anchored at the start of a line (multi-line mode), and the
    single ASCII letter must be followed by a space.
    """
    return re.search(r'^Let [a-zA-Z] ', solution_text, flags=re.MULTILINE) is not None


def mutually_disjoint_tiers(dataset: datasets.Dataset):
    """Partition the dataset's row indices into five mutually disjoint tiers.

    Each sample's ``"answer"`` text (empty string if the key is missing) is
    classified exactly once:

    * ``tier5`` - symbolic reasoning (``is_symbolic`` is true); takes priority.
    * ``tier1`` - no floats and no computational division.
    * ``tier2`` - floats, but no computational division.
    * ``tier3`` - computational division, but no floats.
    * ``tier4`` - both floats and computational division.

    The dataset is traversed in a single pass, so every sample is read and
    pattern-matched once instead of once per tier.

    Args:
        dataset: Iterable of mapping-like samples that support ``.get("answer", "")``.

    Returns:
        dict[str, list[int]]: Keys ``"tier1"`` .. ``"tier5"`` (in that order),
        each mapped to an ascending list of row indices. Every index appears
        in exactly one list.
    """
    tiers = {f"tier{n}": [] for n in range(1, 6)}

    # (has_float, has_computational_division) -> tier name for non-symbolic samples
    tier_by_flags = {
        (False, False): "tier1",
        (True, False): "tier2",
        (False, True): "tier3",
        (True, True): "tier4",
    }

    # Indices are visited in increasing order, so each list is already sorted.
    for idx, sample in enumerate(dataset):
        answer = sample.get("answer", "")
        if is_symbolic(answer):
            tiers["tier5"].append(idx)
            continue
        flags = (has_float(answer), has_computational_division(answer))
        tiers[tier_by_flags[flags]].append(idx)

    return tiers


# Tier name -> list of GSM8K_TRAIN row indices belonging to that tier.
TIER_LISTS = mutually_disjoint_tiers(GSM8K_TRAIN)

# Display the number of samples in each tier, followed by the dataset size
# so the per-tier counts can be checked against the total.
for tier, indices in TIER_LISTS.items():
    print(f"{tier:<10}: {len(indices)} samples")
print(f"{'Total':<10}: {len(GSM8K_TRAIN)} samples")

tier1     : 2767 samples
tier2     : 837 samples
tier3     : 3113 samples
tier4     : 544 samples
tier5     : 212 samples
Total     : 7473 samples


#### **Selecting the current tier for this notebook**

In [None]:
### --- Select the tier ONCE for this notebook --- ###
# Must be one of the keys of TIER_LISTS ("tier1" ... "tier5").
CURRENT_TIER = "tier2"

# Everything tier-specific used by later cells is derived from CURRENT_TIER here.
CURRENT_INDICES = TIER_LISTS[CURRENT_TIER]
CURRENT_TIER_OUTPUT_DIR = TIER_OUTPUT_DIRS[CURRENT_TIER]
CURRENT_TIER_EXAMPLES_DIR = TIER_EXAMPLES_DIRS[CURRENT_TIER]

print(f"Tier for this notebook: {CURRENT_TIER}")
print(f"Number of samples in {CURRENT_TIER}: {len(CURRENT_INDICES)}")
print(f"Output directory for {CURRENT_TIER}: {CURRENT_TIER_OUTPUT_DIR}")
print(f"Examples directory for {CURRENT_TIER}: {CURRENT_TIER_EXAMPLES_DIR}")

Tier for this notebook: tier2
Number of samples in tier2: 837
Output directory for tier2: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/tier-manifests-gen-txt/tier2
Examples directory for tier2: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/tier-manifests-examples-json/tier2


#### Cell 3: Prompt creation

In [None]:
import json


def sanitize_text(text: str):
    """
    Map typographic / mathematical Unicode characters to plain ASCII.

    Covers the minus, multiplication, division and dot operators, curly
    quotes, en/em dashes, the horizontal ellipsis and the non-breaking space.
    All other characters are returned unchanged.
    """
    ascii_by_codepoint = {
        # Mathematical operators
        0x2212: "-",    # Minus Sign
        0x00D7: "*",    # Multiplication Sign
        0x00F7: "/",    # Division Sign
        0x22C5: "*",    # Dot Operator

        # Typographic quotes
        0x201C: '"',    # Left Double Quotation Mark
        0x201D: '"',    # Right Double Quotation Mark
        0x2018: "'",    # Left Single Quotation Mark
        0x2019: "'",    # Right Single Quotation Mark

        # Typographic dashes
        0x2014: "-",    # Em Dash
        0x2013: "-",    # En Dash

        # Other
        0x2026: "...",  # Horizontal Ellipsis
        0x00A0: " ",    # Non-breaking Space
    }
    # None of the ASCII replacements is itself a key, so a single translate
    # pass yields the same text as replacing the characters one after another.
    return text.translate(ascii_by_codepoint)


def build_solution_mapping(
        index: int, 
        dataset: 'datasets.Dataset' = GSM8K_TRAIN,
        exclude_FA: bool = True
    ):
    """
    Build a line-numbered mapping of a sample's natural-language solution.

    The sample's ``"answer"`` text is sanitized with ``sanitize_text``, split
    into non-empty stripped lines, and the lines are keyed ``"L1"``, ``"L2"``, ...
    A trailing final-answer line of the form ``#### <number>`` is removed from
    the numbered lines. The number may carry a leading minus sign, so a negative
    final answer is not mistaken for a solution step.

    Args:
        index: Row index of the sample inside ``dataset``.
        dataset: Dataset whose rows expose an ``"answer"`` string.
        exclude_FA: When False, the final-answer line (if present) is kept
            under the key ``"FA"`` (inserted before the numbered lines).
            When True it is dropped from the result.

    Returns:
        dict[str, str]: ``{"L1": ..., "L2": ..., ...}``, optionally preceded by ``"FA"``.
    """
    solution_mapping = {}
    sanitized_solution_text = sanitize_text(dataset[index]["answer"])

    lines = [ln.strip() for ln in sanitized_solution_text.splitlines() if ln.strip()]

    # `-?` accepts negative final answers (sanitize_text has already turned a
    # Unicode minus sign into an ASCII '-').
    if lines and re.match(r"^####\s*-?[\d\.,]+$", lines[-1]):
        final_answer = lines.pop()
        if not exclude_FA:
            solution_mapping["FA"] = final_answer

    for i, line in enumerate(lines, 1):
        solution_mapping[f"L{i}"] = line

    return solution_mapping


# JSON Schema for the manifest each model must return: one `function_code`
# string plus a `logical_steps` array with one object per solution line.
# It is the default `json_schema` of `generate_manifests_parallel_fixed`; the
# OpenAI helper deep-copies it and adds `additionalProperties: False` to every
# object before sending, so this base version stays provider-neutral.
BASE_MANIFEST_SCHEMA = {
    "type": "object",
    "properties": {
        "function_code": {
            "type": "string",
            "description": "A single string containing a complete, self-contained Python function that constitutes an end-to-end formalization of the solution."
        },
        "logical_steps": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "line_number": {"type": "string"},
                    "question_inputs": {"type": "array", "items": {"type": "string"}},
                    "WK_inputs": {"type": "array", "items": {"type": "string"}},
                    "output_variable": {"type": "string"},
                    "solution_line_template": {"type": "string"}
                },
                # Every per-step field is mandatory.
                "required": ["line_number", "question_inputs", "WK_inputs", "output_variable", "solution_line_template"]
            }
        }
    },
    "required": ["function_code", "logical_steps"]
}


# System prompt used as the default `system_prompt` for every provider call.
# NOTE(review): "You goal" is a typo for "Your goal". It is left untouched here
# because editing the prompt changes what is sent to the models.
# NOTE(review): the prompt asks for JSON wrapped in a ```json fence, while both
# API helpers request structured JSON output (`response_format` /
# `response_mime_type`) - confirm which output format is actually intended.
SYSTEM_PROMPT = """You are a data formalization expert who excels in mathematical reasoning and writing python code. You will be presented with a math word problem accompanied by a step-by-step natural language solution. You goal is to carefully and meticulously analyze the given question and solution, and formalize it by converting it into a structured json object that deconstructs the logic of the solution.

You MUST follow all rules and formatting instructions provided in the user prompt without deviation. Your entire output MUST be a single JSON object wrapped in ```json ... ```. Do not include any text or explanation before or after the JSON object."""

# Tier name -> contents of that tier's "<tier>_user_prompt_prefix.txt" file,
# read once from the tier's examples directory.
STATIC_PREFIXES = {}
for tier in TIER_LISTS:
    prefix_file = TIER_EXAMPLES_DIRS[tier].joinpath(f"{tier}_user_prompt_prefix.txt")
    STATIC_PREFIXES[tier] = prefix_file.read_text(encoding='utf-8')


def append_sample_to_user_prompt(
        tier: str, 
        index: int, 
        dataset: datasets.Dataset = GSM8K_TRAIN
    ):
    """
    Build the full user prompt for one dataset sample.

    The tier's static prefix (from STATIC_PREFIXES) is followed by an
    "## Input" section holding the sample index, its question text and the
    JSON-formatted solution mapping, and finally an empty "## Output" section.

    Raises ValueError when `tier` is not a key of TIER_LISTS.
    """
    if tier not in TIER_LISTS:
        raise ValueError(f"Invalid tier: {tier}. Must be one of {list(TIER_LISTS.keys())}.")

    question_text = dataset[index]['question']
    mapping_json = json.dumps(build_solution_mapping(index, dataset), indent=2)

    # Joined with "\n"; the two trailing empty entries produce the blank line
    # after "## Output" plus the final newline.
    task_lines = [
        "## Input",
        "",
        "**Index:**:",
        f"{index}",
        "",
        "**Question:**:",
        f"{question_text}",
        "",
        "**Solution mapping:**:",
        mapping_json,
        "",
        "## Output",
        "",
        "",
    ]
    return STATIC_PREFIXES[tier] + "\n".join(task_lines)


# Example usage: build and print the prompt for the first index of the
# current tier as a visual sanity check of the prefix + task block.
idx = CURRENT_INDICES[0]
user_prompt = append_sample_to_user_prompt(CURRENT_TIER, idx, GSM8K_TRAIN)
print(f"User prompt for index {idx} in {CURRENT_TIER} has {len(user_prompt)} characters:")
print("-"*80)
print(user_prompt)

User prompt for index 9 in tier2 has 18046 characters:
--------------------------------------------------------------------------------
In the TASK below, you will be given a math problem and its corresponding step-by-step solution. Each step in the solution is numbered (e.g. "L1", "L2" and so on), and many of the steps include calculator annotations (e.g. "<<20*0.1=2>>"). Your goal is to convert this information into a structured JSON object according to the following schema and detailed instructions.

# Detailed Field Instructions

## 1. "function_code"

This string must contain a Python function with the following characteristics:

*   **1.A. No Imports:** You should not have ANY imports. The very first line MUST be the function definition (i.e. `def solve():`).
*   **1.B. Function Naming & Docstring:** The function must be named `solve`, and it should not have any args. It must begin with a docstring that has exactly two lines:
    *   **1.B.i.** The first line must be: "Index: [In

#### Cell 4: API clients, concurrency limits, models

In [6]:
# Imports for API clients and related functionality
import os
import openai
import google.generativeai as genai
import anthropic
import asyncio
import nest_asyncio
from openai import AsyncOpenAI
from anthropic import AsyncClient
from dotenv import load_dotenv

# This must be done once per kernel to allow asyncio to run in a Jupyter notebook.
nest_asyncio.apply()

# Load API Keys from .env file
load_dotenv()
print("Loaded environment variables from .env file.")

# Initialize Asynchronous API Clients.
# A missing OpenAI key is reported by the SDK as `openai.OpenAIError` (not
# `TypeError`), so the SDKs' own base error types are caught as well;
# otherwise the fallback below would never run.
try:
    openai_client_async = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    anthropic_client_async = AsyncClient(api_key=os.getenv("ANTHROPIC_API_KEY"))
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    print("API clients initialized successfully.")
except (TypeError, openai.OpenAIError, anthropic.AnthropicError):
    print("API key not found for one or more services. Please check your .env file.")
    # Assign None so later cells can still reference the client names.
    openai_client_async = None
    anthropic_client_async = None

# Define API Concurrency Limits to prevent 429 "Too Many Requests" errors.
# Provider name -> maximum number of simultaneous in-flight calls; each value
# becomes the size of that provider's asyncio.Semaphore during generation.
API_CONCURRENCY_LIMITS = {
    "google": 30,    
    "anthropic": 2, 
    "openai": 2,    
}
print(f"API concurrency limits set to: {API_CONCURRENCY_LIMITS}")

# Provider name -> model identifier. Only providers listed here are called;
# "anthropic" has a concurrency limit above but no model entry.
MODEL_DICT = {
  "openai": "gpt-4.1",
  "google": "gemini-2.5-flash"
}

# "<provider>_<model>" labels; the same pattern is used for output file names.
MODELS = [f"{provider}_{model}" for provider, model in MODEL_DICT.items()]
print(f"Available models: {MODELS}")

Loaded environment variables from .env file.
API clients initialized successfully.
API concurrency limits set to: {'google': 30, 'anthropic': 2, 'openai': 2}
Available models: ['openai_gpt-4.1', 'google_gemini-2.5-flash']


#### Cell 5: Helper functions to avoid rate limits in API calls

In [None]:
import time
import random

import threading
import datetime


_log_lock = threading.Lock()
def log_event(
        level: str, 
        index: int, 
        model: str, 
        message: str
    ):
    """Print one timestamped, column-aligned log line; safe to call concurrently.

    The line contains a UTC timestamp (millisecond precision), the level, the
    dataset index, the model name, the name of the asyncio task that emitted
    the event, and the message.

    The task name is looked up defensively: when the function is called
    outside a running event loop, or from code that is not running inside a
    task, the placeholder ``"no-task"`` is printed instead of raising.

    Args:
        level: Short event label (e.g. "GRANT", "ERROR"), centred in 7 columns.
        index: Dataset index the event refers to.
        model: Model (or component) name the event refers to.
        message: Free-form text appended at the end of the line.
    """
    with _log_lock:
        ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="milliseconds")
        try:
            current = asyncio.current_task()
        except RuntimeError:
            # No event loop is running in this thread.
            current = None
        task_id = current.get_name() if current is not None else "no-task"
        print(f"{ts} [{level:^7s}] [Idx {index:<4}] [Mdl: {model:<15}] [Task {task_id:<8}] {message}")
 

async def with_api_retries(
        send_coroutine_factory,
        *,
        model_info: str,  # For informative logging
        max_attempts: int = 10,
        base_wait_seconds: int = 10
    ):
    """Await a freshly created API coroutine, retrying on rate-limit errors.

    `send_coroutine_factory` is called once per attempt and its result is
    awaited. An error counts as a rate limit when it is an OpenAI/Anthropic
    `RateLimitError` or its text contains "429". Such errors are retried after
    `base_wait_seconds * 2**attempt` seconds plus up to one second of jitter;
    the last attempt re-raises. Any other error is re-raised immediately.
    Returns the awaited result, or None when `max_attempts` allows no attempt.
    """
    for attempt_no in range(max_attempts):
        try:
            return await send_coroutine_factory()
        except Exception as err:
            rate_limited = (
                isinstance(err, (openai.RateLimitError, anthropic.RateLimitError))
                or "429" in str(err)
            )
            # Guard clause: anything that is not a rate limit propagates unchanged.
            if not rate_limited:
                raise

            if attempt_no == max_attempts - 1:
                print(f"❌ Final attempt failed for {model_info}. Giving up.")
                raise

            # Exponential backoff with jitter
            delay = base_wait_seconds * (2 ** attempt_no) + random.uniform(0, 1)
            print(f"🕒 Rate limit on {model_info}. Retrying in {delay:.2f}s... (Attempt {attempt_no + 1}/{max_attempts})")
            await asyncio.sleep(delay)
    return None


class RateLimitCoordinator:
    """Token-bucket budget shared by concurrent asyncio tasks.

    The bucket starts full at `max_tokens`. A background task adds
    `refill_rate_per_sec` tokens once per second, never exceeding `max_tokens`.
    Callers reserve tokens with `get_tokens` before an API call and give them
    back with `return_tokens` (call failed) or `refund_tokens` (the estimate
    was higher than the real usage). The balance is kept within capacity at
    all times, including when tokens are given back after a refill.
    """

    def __init__(self, refill_rate_per_sec: float, max_tokens: int):
        """Create a full bucket; the refill task is not started until `start()`."""
        self.refill_rate_per_sec = refill_rate_per_sec
        self.max_tokens = float(max_tokens)
        self.tokens = self.max_tokens
        self._lock = asyncio.Lock()
        self._refill_task = None
        
    async def _refill(self):
        """The background task that refills the token bucket at 1-second intervals."""
        while True:
            await asyncio.sleep(1)
            async with self._lock:
                self.tokens = min(self.max_tokens, self.tokens + self.refill_rate_per_sec)
                # Emit a budget line only on wall-clock seconds divisible by 10.
                if int(time.time()) % 10 == 0:
                    log_event("REFILL", -1, "Coordinator", f"Bucket budget at {int(self.tokens)} / {int(self.max_tokens)}")

    async def start(self):
        """Start the background refill task (no-op if it is already running)."""
        if self._refill_task is None:
            self._refill_task = asyncio.create_task(self._refill())

    async def stop(self):
        """Cancel the background refill task, if any."""
        if self._refill_task:
            self._refill_task.cancel()
            self._refill_task = None

    async def get_tokens(self, index: int, model: str, tokens_needed: int):
        """Wait until `tokens_needed` tokens are available, then reserve them.

        Raises:
            ValueError: If `tokens_needed` exceeds the bucket capacity. Such a
                request can never be satisfied and would otherwise wait forever.
        """
        if tokens_needed > self.max_tokens:
            raise ValueError(
                f"Requested {tokens_needed} tokens, but the bucket capacity is only "
                f"{int(self.max_tokens)}; the request can never be granted."
            )

        while True:
            async with self._lock:
                if self.tokens >= tokens_needed:
                    self.tokens -= tokens_needed
                    log_event("GRANT", index, model, f"Permission granted. Budget: {int(self.tokens + tokens_needed)} -> {int(self.tokens)}")
                    return
            
            deficit = tokens_needed - self.tokens
            wait_time = max(0.1, deficit / self.refill_rate_per_sec) # Ensure at least a small wait
            
            log_event("WAIT", index, model, f"Budget low. Needed: {tokens_needed}, Have: {int(self.tokens)}. Waiting ~{wait_time:.2f}s...")
            await asyncio.sleep(wait_time)

    async def return_tokens(self, index: int, model: str, tokens_returned: int):
        """Give back a full reservation after a failed call (capped at capacity)."""
        async with self._lock:
            # The bucket may have refilled while the call was in flight, so cap
            # the balance instead of letting it exceed the configured maximum.
            self.tokens = min(self.max_tokens, self.tokens + tokens_returned)
            log_event("RETURN", index, model, f"Tokens returned on failure. Budget restored to {int(self.tokens)}.")

    async def refund_tokens(self, index: int, model: str, tokens_refunded: int):
        """Refunds tokens to the budget after a successful call, correcting our estimate.

        The resulting balance is capped at the bucket capacity.
        """
        async with self._lock:
            self.tokens = min(self.max_tokens, self.tokens + tokens_refunded)
            log_event("REFUND", index, model, f"Correcting estimate. Budget restored by {tokens_refunded} -> {int(self.tokens)}.")

#### Cell 6: Provider-specific API calling functions

In [None]:
import math
import copy

async def call_openai_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int,
        json_schema: dict,
        openai_max_tokens: int 
    ) -> tuple[str, dict, dict]:
    """
    Prepares a provider-specific schema and handles an API call to OpenAI.

    A deep copy of `json_schema` gets `additionalProperties: False` on every
    object node and is sent as a strict `json_schema` response format. The
    request goes through `with_api_retries` and uses the raw-response API so
    the rate-limit headers can be returned alongside the parsed completion.

    Args:
        model: OpenAI model name.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        index: Dataset index, used only for log/error messages.
        json_schema: Base JSON schema of the expected output (not mutated).
        openai_max_tokens: Upper bound on generated tokens.

    Returns:
        (text_response, usage, rate_limit_info) where `usage` has the keys
        "input_tokens", "output_tokens", "cached_tokens" and `rate_limit_info`
        holds the `x-ratelimit-*` response header values (None when absent).

    Raises:
        ValueError: If the completion carries no message content (for example
            when the model refused), mirroring the empty-response check of the
            Google helper.
    """
    
    openai_schema = copy.deepcopy(json_schema)

    def add_additional_properties(schema_part):
        # Recursively mark every object schema as closed.
        if isinstance(schema_part, dict):
            if schema_part.get("type") == "object":
                schema_part["additionalProperties"] = False
            for value in schema_part.values():
                add_additional_properties(value)
        elif isinstance(schema_part, list):
            for item in schema_part:
                add_additional_properties(item)

    add_additional_properties(openai_schema)
    openai_schema_wrapper = {
        "name": "manifest_formalization", 
        "strict": True, 
        "schema": openai_schema
    }
    
    messages = [{"role": "system", "content": system_prompt}, 
                {"role": "user", "content": user_prompt}]
    
    model_info = f"{model} (Index {index})"
    
    response_with_headers = await with_api_retries(
        lambda: openai_client_async.chat.completions.with_raw_response.create(
            model=model, 
            messages=messages, 
            temperature=0, 
            max_tokens=openai_max_tokens, 
            response_format={"type": "json_schema", "json_schema": openai_schema_wrapper} # type: ignore
        ),
        model_info=model_info
    )
    
    response = response_with_headers.parse()
    message = response.choices[0].message
    text_response = message.content
    if text_response is None:
        refusal = getattr(message, "refusal", None)
        detail = f" Refusal: {refusal}" if refusal else ""
        raise ValueError(f"OpenAI API returned no message content for Index {index}.{detail}")
    
    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    if response.usage:
        usage["input_tokens"] = response.usage.prompt_tokens
        usage["output_tokens"] = response.usage.completion_tokens
        # `prompt_tokens_details` is an SDK model object (attribute access),
        # not a dict, so calling `.get` on it fails; accept both shapes.
        details = getattr(response.usage, "prompt_tokens_details", None)
        if details:
            if isinstance(details, dict):
                cached = details.get("cached_tokens")
            else:
                cached = getattr(details, "cached_tokens", None)
            usage["cached_tokens"] = cached or 0

    headers = response_with_headers.headers
    rate_limit_info = {
        "limit_requests": headers.get("x-ratelimit-limit-requests"),
        "limit_tokens": headers.get("x-ratelimit-limit-tokens"),
        "remaining_requests": headers.get("x-ratelimit-remaining-requests"),
        "remaining_tokens": headers.get("x-ratelimit-remaining-tokens"),
        "reset_requests": headers.get("x-ratelimit-reset-requests"),
        "reset_tokens": headers.get("x-ratelimit-reset-tokens"),
    }
             
    return text_response, usage, rate_limit_info


async def call_google_async(
        model: str,
        system_prompt: str,
        user_prompt: str,
        index: int,
        json_schema: dict
    ):
    """
    Send one schema-constrained generation request to the Google API.

    All four harm categories are set to BLOCK_NONE, the response is requested
    as JSON following `json_schema`, and output is bounded at 8192 tokens.
    The call goes through `with_api_retries`. Returns `(text_response, usage)`
    where `usage` has the keys "input_tokens", "output_tokens" and
    "cached_tokens" (always 0 here). Raises ValueError when the response
    contains no parts.
    """
    harm_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    safety_settings = {category: "BLOCK_NONE" for category in harm_categories}

    gemini = genai.GenerativeModel(
        model_name=model,
        system_instruction=system_prompt,
        safety_settings=safety_settings
    )

    generation_config = genai.types.GenerationConfig(
        temperature=0,
        max_output_tokens=8192,  # Safe upper bound
        response_mime_type="application/json",
        response_schema=json_schema
    )

    def _send():
        # A new coroutine per attempt, as required by the retry wrapper.
        return gemini.generate_content_async(user_prompt, generation_config=generation_config)

    response = await with_api_retries(_send, model_info=f"{model} (Index {index})")

    if not response.parts:
        raise ValueError(f"Google API returned an empty response for Index {index}.")

    text_response = response.text

    usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}
    metadata = response.usage_metadata
    if metadata:
        usage.update(
            input_tokens=metadata.prompt_token_count,
            output_tokens=metadata.candidates_token_count,
        )

    return text_response, usage


#### Cell 7: Single API call function

In [None]:

import datetime
import json

async def run_single_api_call(
    provider: str,
    model: str,
    index: int,
    tier: str,
    dataset: 'datasets.Dataset',
    system_prompt: str,
    output_dir: Path,
    provider_sem: asyncio.Semaphore,
    json_schema: dict,
    coordinator: RateLimitCoordinator,
    openai_max_tokens: int,
    ):
    """
    Runs a single API call, but first checks if the output file already
    exists to avoid re-generating completed work.

    The result is written to `<output_dir>/<index>/<provider>_<model>.txt`.
    When the call fails, an error report starting with "--- ERROR ---" is
    written to that same path. Such error reports are NOT treated as completed
    work: a later run regenerates them instead of skipping them.

    For the "openai" provider a token budget (estimated prompt tokens plus
    `openai_max_tokens`) is reserved from `coordinator` before the call. After
    a success the unused part is refunded; after a failure the whole
    reservation is returned.

    Returns:
        dict: One performance record with the keys provider, model, index,
        status ("Skipped", "Success" or "Failed"), time_s, input_tokens,
        output_tokens, cached_tokens and utc_completed.
    """
    error_marker = "--- ERROR ---"

    output_path = output_dir / str(index) / f"{provider}_{model}.txt"
    if output_path.exists() and output_path.stat().st_size > 0:
        # Only the first characters are needed to recognise an error report.
        with output_path.open('r', encoding='utf-8', errors='replace') as existing:
            previous_run_failed = existing.read(len(error_marker)) == error_marker
        if not previous_run_failed:
            log_event("SKIPPED", index, model, "Output file already exists.")
            return {
                "provider": provider, "model": model, "index": index,
                "status": "Skipped", "time_s": 0,
                "input_tokens": 0, "output_tokens": 0, "cached_tokens": 0,
                "utc_completed": datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
            }
        
    async with provider_sem:
        user_prompt = append_sample_to_user_prompt(tier=tier, index=index, dataset=dataset)

        tokens_needed = 0
        if provider == "openai":
            # Rough estimate: 1.25 tokens per whitespace-separated word, plus
            # the full output allowance.
            estimated_prompt_tokens = len(user_prompt.split()) * 1.25
            tokens_needed = int(estimated_prompt_tokens + openai_max_tokens)
            await coordinator.get_tokens(index, model, tokens_needed)

        start_time = time.time()
        status = "Failed"
        usage = {"input_tokens": 0, "output_tokens": 0, "cached_tokens": 0}

        try:
            if provider == "openai":
                text_response, usage, _ = await call_openai_async(
                    model, system_prompt, user_prompt, index, json_schema, openai_max_tokens
                )
            elif provider == "google":
                text_response, usage = await call_google_async(
                    model, system_prompt, user_prompt, index, json_schema
                )
            else:
                # Without this branch an unknown provider would be reported as
                # a success and then fail on an unbound `text_response`.
                raise ValueError(f"Unsupported provider: {provider!r}")

            if not isinstance(text_response, str):
                raise ValueError(f"{provider} returned no text content for Index {index}.")

            log_event("SUCCESS", index, model, f"Call OK. Usage: {json.dumps(usage)}")

            if provider == "openai":
                true_cost = usage['input_tokens'] + usage['output_tokens']
                refund_amount = tokens_needed - true_cost
                if refund_amount > 0:
                    await coordinator.refund_tokens(index, model, refund_amount)

            status = "Success"

        except Exception as e:
            # Some exceptions have an empty message; avoid indexing an empty list.
            first_line = (str(e).splitlines() or [""])[0]
            log_event("ERROR", index, model, f"{type(e).__name__}: {first_line}")
            text_response = f"{error_marker}\nIndex: {index}, Model: {model}\n{type(e).__name__}: {e}"
            
        finally:
            if status == "Failed" and provider == "openai" and tokens_needed > 0:
                await coordinator.return_tokens(index, model, tokens_needed)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(text_response, encoding='utf-8')
        
        elapsed = time.time() - start_time
        
        return {
            "provider": provider, "model": model, "index": index,
            "status": status, "time_s": round(elapsed, 2),
            "input_tokens": usage["input_tokens"], "output_tokens": usage["output_tokens"],
            "cached_tokens": usage["cached_tokens"],
            "utc_completed": datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
        }

#### Cell 8: Main manifest generation functions

In [None]:
from tqdm.notebook import tqdm
import pandas as pd

async def generate_manifests_parallel_fixed(
    indices_to_generate: list[int],
    tier: str = CURRENT_TIER,
    dataset: 'datasets.Dataset' = GSM8K_TRAIN,
    model_dict: dict[str, str] = MODEL_DICT,
    system_prompt: str = SYSTEM_PROMPT,
    concurrency_limits: dict[str, int] = API_CONCURRENCY_LIMITS,
    json_schema: dict = BASE_MANIFEST_SCHEMA,
    openai_max_tokens: int = 4000,
    openai_refill_rate_per_sec: float = 800,
    openai_bucket_capacity: int = 30000
    ):
    """
    Fully parallel version with a robust token bucket coordinator for OpenAI.

    One asyncio task is created per (index, provider) pair. Concurrency per
    provider is bounded by a semaphore sized from `concurrency_limits`, and
    OpenAI calls additionally reserve tokens from a shared token bucket.
    Per-call performance records are collected into a DataFrame, which is also
    written to `generation_performance_<UTC timestamp>.csv` in the tier's
    output directory.

    Args:
        indices_to_generate: Dataset indices to process.
        tier: Tier name; selects the prompt prefix and the output directory.
        dataset: Dataset the samples are read from.
        model_dict: Provider name -> model name; one call per entry and index.
        system_prompt: System prompt sent with every call.
        concurrency_limits: Provider name -> maximum simultaneous calls.
        json_schema: JSON schema of the expected manifest.
        openai_max_tokens: Output-token cap for OpenAI calls.
        openai_refill_rate_per_sec: Tokens added to the OpenAI bucket per second.
        openai_bucket_capacity: Maximum size of the OpenAI token bucket.

    Returns:
        pandas.DataFrame: One row per API call (empty when no calls were made).
    """
    print("--- Starting Manifest Generation (Token Bucket) ---")
    start_time = time.time()

    openai_coordinator = RateLimitCoordinator(
        refill_rate_per_sec=openai_refill_rate_per_sec,
        max_tokens=openai_bucket_capacity
    )
    await openai_coordinator.start()
    
    provider_semaphores = {prov: asyncio.Semaphore(limit) for prov, limit in concurrency_limits.items()}
    output_dir = TIER_OUTPUT_DIRS[tier]
    
    all_tasks = []
    for index in indices_to_generate:
        for provider, model in model_dict.items():
            task = asyncio.create_task(
                run_single_api_call(
                    provider=provider, model=model, index=index,
                    tier=tier, dataset=dataset, system_prompt=system_prompt,
                    output_dir=output_dir, provider_sem=provider_semaphores[provider],
                    json_schema=json_schema,
                    coordinator=openai_coordinator,
                    openai_max_tokens=openai_max_tokens
                )
            )
            all_tasks.append(task)
    
    print(f"Created {len(all_tasks)} total API call tasks")
    
    results = []
    try:
        with tqdm(total=len(all_tasks), desc="API Calls") as pbar:
            for finished in asyncio.as_completed(all_tasks):
                results.append(await finished)
                pbar.update(1)
    finally:
        # If the loop above was interrupted (error or cancellation), do not
        # leave the remaining calls running in the background.
        unfinished = [t for t in all_tasks if not t.done()]
        for t in unfinished:
            t.cancel()
        if unfinished:
            await asyncio.gather(*unfinished, return_exceptions=True)
        await openai_coordinator.stop()
    
    df = pd.DataFrame(results)
    run_ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    csv_path = output_dir / f"generation_performance_{run_ts}.csv"
    df.to_csv(csv_path, index=False)
    
    end_time = time.time()
    print(f"\n--- Manifest Generation Complete ---")
    print(f"Processed {len(indices_to_generate)} indices in {end_time - start_time:.2f} seconds.")
    print(f"Performance log saved to: {csv_path}")
    
    # Count from the raw records: an empty DataFrame has no 'status' column.
    total_calls = len(results)
    if total_calls > 0:
        success_count = sum(1 for record in results if record["status"] == "Success")
        print(f"Success rate: {success_count}/{total_calls} ({100*success_count/total_calls:.1f}%)")
    
    return df

#### Cell 9: Running the main manifest generation function

In [13]:
# --- Test Run Configuration ---
# Generate manifests for every index of the current tier that lies in the
# inclusive range [LOWER_LIMIT, UPPER_LIMIT].
LOWER_LIMIT = 200
UPPER_LIMIT = 3000
INDICES_TO_GENERATE = [idx for idx in CURRENT_INDICES if idx <= UPPER_LIMIT and idx >= LOWER_LIMIT]

print(f"Starting test run for {len(INDICES_TO_GENERATE)} indices in {CURRENT_TIER} between {LOWER_LIMIT} and {UPPER_LIMIT}.")

# Run the parallel generation function with the selected indices.
# This exercises the full pipeline, including the RateLimitCoordinator.
# Top-level `await` works here because the cell runs inside the notebook's event loop.
perf_df = await generate_manifests_parallel_fixed(indices_to_generate=INDICES_TO_GENERATE)

# Display the head of the resulting performance dataframe to verify execution.
print("\n--- Test Run Performance Summary ---")
perf_df.head()

Starting test run for 311 indices in tier2 between 200 and 3000.
--- Starting Manifest Generation (Token Bucket) ---
Created 622 total API call tasks


API Calls:   0%|          | 0/622 [00:00<?, ?it/s]

2025-07-10T18:57:25.806+00:00 [SKIPPED] [Idx 210 ] [Mdl: gpt-4.1        ] [Task Task-3  ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 210 ] [Mdl: gemini-2.5-flash] [Task Task-4  ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 211 ] [Mdl: gpt-4.1        ] [Task Task-5  ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 211 ] [Mdl: gemini-2.5-flash] [Task Task-6  ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 215 ] [Mdl: gpt-4.1        ] [Task Task-7  ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 215 ] [Mdl: gemini-2.5-flash] [Task Task-8  ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 228 ] [Mdl: gpt-4.1        ] [Task Task-9  ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 228 ] [Mdl: gemini-2.5-flash] [Task Task-10 ] Output file already exists.
2025-07-10T18:57:25.807+00:00 [SKIPPED] [Idx 242 ] [

Unnamed: 0,provider,model,index,status,time_s,input_tokens,output_tokens,cached_tokens,utc_completed
0,openai,gpt-4.1,210,Skipped,0.0,0,0,0,2025-07-10T18:57:25+00:00
1,google,gemini-2.5-flash,210,Skipped,0.0,0,0,0,2025-07-10T18:57:25+00:00
2,openai,gpt-4.1,211,Skipped,0.0,0,0,0,2025-07-10T18:57:25+00:00
3,google,gemini-2.5-flash,211,Skipped,0.0,0,0,0,2025-07-10T18:57:25+00:00
4,openai,gpt-4.1,215,Skipped,0.0,0,0,0,2025-07-10T18:57:25+00:00
