# Retrieve Batch Results from OpenAI API

This notebook retrieves completed batch results from the OpenAI API.

## Prerequisites
- Set the `OPENAI_API_KEY` environment variable
- Have a completed batch job from `create_batch_gpt.ipynb`
- Know the path to your batch log file (created during batch submission)

## Features
- Retrieves results from completed batches
- Tracks token usage (for o3/GPT-5, reasoning tokens are counted within the output-token totals)
- Lists failed or missing prompts and prints the steps for resubmitting them
- Merges with existing results for incremental collection
- Reports errors and missing results

## Output Format
```json
{
  "meta_data": {"file_name": "...", "inference_model": "..."},
  "token_stats": {"total_input_tokens": ..., "total_output_tokens": ..., "num_prompts": ..., "avg_input_tokens": ..., "avg_output_tokens": ...},
  "results": {"case_001": "response text", ...}
}
```

In [None]:
# ============================================================================
# CONFIGURATION - Edit these variables before running
# ============================================================================
from pathlib import Path

# --- Batch identity: must be identical to the values used at submission -----
MODEL_NAME = "gpt-4.1-2025-04-14"  # Model the batch was submitted with
REASONING_EFFORT = "medium"  # o3/gpt-5 only: "low", "medium", "high", or None
INPUT_FILE = "your_input_file"  # Input JSON name, without the .json extension
INPUT_DIR = "data"  # Directory that held the input JSON

# --- Locations (BASE_DIR should match create_batch_gpt.ipynb) ---------------
BASE_DIR = Path(".")  # Point this at your data directory
LOGS_DIR = BASE_DIR.joinpath("logs")  # Where the submission logs are read from
OUTPUT_DIR = BASE_DIR.joinpath("output")  # Where retrieved results are written

# ============================================================================

In [None]:
import json
import os
from openai import OpenAI

# Reasoning models (o3 / gpt-5) carry a "_thinking_<effort>" suffix in file names
is_reasoning_model = any(tag in MODEL_NAME for tag in ("o3", "gpt-5"))
thinking_effort = None
if is_reasoning_model:
    thinking_effort = REASONING_EFFORT
thinking_suffix = ""
if thinking_effort:
    thinking_suffix = f"_thinking_{thinking_effort}"

# Log and output files share the layout <root>/<input dir>/<model>/<file>.json
json_name = f"{INPUT_FILE}{thinking_suffix}.json"
log_path = LOGS_DIR.joinpath(INPUT_DIR, MODEL_NAME, json_name)
output_path = OUTPUT_DIR.joinpath(INPUT_DIR, MODEL_NAME, json_name)

# The submission log provides the batch id and the custom_id -> result-key map
with open(log_path, 'r') as log_file:
    data = json.load(log_file)

key_dict = data["key_dict"]
message_batch_id = data['message_batch']['id']

for summary_line in (
    f"Log file: {log_path}",
    f"Batch ID: {message_batch_id}",
    f"Model: {MODEL_NAME}",
    f"Expected results: {len(key_dict)}",
    f"Output path: {output_path}",
):
    print(summary_line)

# Pick up results saved by an earlier run, if the output file is already there
existing_dict = {}
if output_path.exists():
    with open(output_path, 'r') as results_file:
        existing_dict = json.load(results_file)
    print(f"Found existing results: {len(existing_dict.get('results', {}))}")

# API client; the key is read from the environment, never hard-coded
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
def merge_nested_dicts(dict1, dict2):
    """Return a new dict combining both inputs, with ``dict1`` winning conflicts.

    Keys found only in ``dict2`` are added. When a key exists in both and both
    values are dicts, the two values are merged recursively by the same rule;
    for any other clash the value from ``dict1`` is kept. ``dict1`` itself is
    not modified (it is shallow-copied first).
    """
    merged = dict1.copy()
    for name in dict2:
        incoming = dict2[name]
        if name in merged:
            current = merged[name]
            # Only dict-vs-dict clashes are descended into; otherwise keep current
            if isinstance(incoming, dict) and isinstance(current, dict):
                merged[name] = merge_nested_dicts(current, incoming)
            continue
        merged[name] = incoming
    return merged

In [None]:
# Check if we already have all results.
# The results file accumulates across batches, so it can legitimately hold
# MORE keys than this batch expects. Testing for "nothing expected is missing"
# (instead of exact set equality) recognises that case too and avoids
# downloading the same batch again.
if existing_dict:
    existing_keys = set(existing_dict.get("results", {}))
    expected_keys = set(key_dict.values())
    missing = expected_keys - existing_keys

    print(f"Expected results: {len(expected_keys)}")
    print(f"Existing results: {len(existing_keys)}")

    if not missing:
        print(f"\nAll results already collected!")
        print(f"Results file: {output_path}")
        # Stops this cell; nothing further needs to be retrieved
        raise SystemExit("All results already collected.")
    elif existing_keys:
        print(f"Missing {len(missing)} results from existing file")

In [None]:
# Retrieve batch and results
batch = client.batches.retrieve(message_batch_id)
print(f"Batch ID: {batch.id}")
print(f"Status: {batch.status}")

# request_counts is optional on the Batch object, so guard before printing
request_counts = batch.request_counts
if request_counts is not None:
    print(f"\nRequest counts:")
    print(f"  Completed: {request_counts.completed}")
    print(f"  Failed: {request_counts.failed}")
    print(f"  Total: {request_counts.total}")

if batch.status == "completed":
    print("\nBatch completed! Retrieving results...")
elif batch.status in ("expired", "cancelled") and batch.output_file_id:
    # Requests that finished before the batch expired / was cancelled are
    # still delivered in the output file, so collect what is there.
    print(f"\nBatch {batch.status} before finishing. Retrieving partial results...")
elif batch.status in ("failed", "expired", "cancelled"):
    # Terminal states with nothing to download: waiting will not help
    print(f"\nBatch ended without results. Status: {batch.status}")
    batch_errors = getattr(batch, "errors", None)
    if batch_errors:
        print(f"Batch errors: {batch_errors}")
    raise SystemExit(f"Batch {batch.status}. Resubmit with create_batch_gpt.ipynb.")
else:
    # validating / in_progress / finalizing / cancelling
    print(f"\nBatch not yet completed. Status: {batch.status}")
    raise SystemExit("Batch still processing. Try again later.")

In [None]:
# Download and parse results.
# The Batch API writes successful requests to the output file and failed
# requests to a separate error file. Either ID is None when that file does not
# exist, so both are read and absent files are skipped.
output_file_id = batch.output_file_id
error_file_id = getattr(batch, "error_file_id", None)

records = []
for file_id in (output_file_id, error_file_id):
    if not file_id:
        continue
    file_response = client.files.content(file_id)
    # One JSON document per line; blank lines are ignored
    records.extend(
        json.loads(raw) for raw in file_response.text.splitlines() if raw.strip()
    )

result_dict = {}
processed_ids = set()
errors_dict = {}

# Token counters
total_input_tokens = 0
total_output_tokens = 0
num_prompts = 0

# The save step adds this run's token totals to the totals already stored in
# the results file, and keeps the stored response when a key exists in both.
# Usage of results that are already stored is therefore skipped here, so that
# re-running the retrieval does not count the same tokens twice.
already_collected = set(existing_dict.get("results", {}))
skipped_usage = 0

for record in records:
    custom_id = record["custom_id"]
    processed_ids.add(custom_id)
    result_key = key_dict.get(custom_id)

    body = (record.get("response") or {}).get("body") or {}
    # A failure is reported either at the top level or inside the response body
    error = record.get("error") or body.get("error")
    if error:
        if result_key:
            errors_dict[result_key] = error
        print(f"Error for {custom_id}: {error}")
        continue

    if result_key is None:
        # Not part of this submission's log; there is no key to store it under
        print(f"Skipping result with unknown custom_id: {custom_id}")
        continue

    choices = body.get("choices") or []
    first_choice = choices[0] if choices else {}
    output = (first_choice.get("message") or {}).get("content")

    if not output:
        # Record empty responses as failures instead of dropping them silently
        finish_reason = first_choice.get("finish_reason")
        errors_dict[result_key] = {
            "message": "Empty response content",
            "finish_reason": finish_reason,
        }
        print(f"Empty response for {custom_id} (finish_reason={finish_reason})")
        continue

    result_dict[result_key] = output

    if result_key in already_collected:
        skipped_usage += 1
        continue

    # Extract token usage
    usage = body.get("usage")
    if usage:
        total_input_tokens += usage.get("prompt_tokens", 0)
        total_output_tokens += usage.get("completion_tokens", 0)
        num_prompts += 1

# Calculate statistics
token_stats = {
    "total_input_tokens": total_input_tokens,
    "total_output_tokens": total_output_tokens,
    "num_prompts": num_prompts,
    "avg_input_tokens": total_input_tokens / num_prompts if num_prompts > 0 else 0,
    "avg_output_tokens": total_output_tokens / num_prompts if num_prompts > 0 else 0
}

# Report results
missing_ids = set(key_dict.keys()) - processed_ids
print(f"\nTotal submitted: {len(key_dict)}")
print(f"Successful: {len(result_dict)}")
print(f"Errored: {len(errors_dict)}")
print(f"Missing: {len(missing_ids)}")

print(f"\n=== Token Statistics ===")
print(f"Total input tokens: {token_stats['total_input_tokens']:,}")
print(f"Total output tokens: {token_stats['total_output_tokens']:,}")
print(f"Average input tokens: {token_stats['avg_input_tokens']:.1f}")
print(f"Average output tokens: {token_stats['avg_output_tokens']:.1f}")
if skipped_usage:
    print(f"(Usage of {skipped_usage} already-collected results not counted again)")

In [None]:
# Handle failed prompts: report everything that still needs to be resubmitted.
# A prompt is unresolved when its result key never reached result_dict. That
# covers requests absent from the downloaded results (missing_ids) as well as
# requests that came back with an error or without usable content, which the
# missing_ids set alone does not include.
unresolved_ids = sorted(
    custom_id for custom_id, result_key in key_dict.items()
    if result_key not in result_dict
)
errored_ids = [custom_id for custom_id in unresolved_ids if custom_id not in missing_ids]

if unresolved_ids:
    print(f"\n{len(unresolved_ids)} prompts failed or missing.")
    if missing_ids:
        print(f"Missing custom_ids: {sorted(missing_ids)}")
    if errored_ids:
        print(f"Errored or empty custom_ids: {errored_ids}")
    print("\nTo resubmit failed prompts:")
    print("1. Filter your original input JSON to include only failed cases")
    print("2. Run create_batch_gpt.ipynb with the filtered input")
    print("3. Retrieve results and merge with existing results")
else:
    print("\nAll prompts completed successfully!")

In [None]:
# Merge with existing results and save.
# The combined token statistics are built in a separate dict so token_stats
# (this run's figures) is left untouched: re-running this cell then produces
# the same totals instead of adding the stored totals a second time.
merged_token_stats = dict(token_stats)

if existing_dict:
    # Stored responses win over freshly downloaded ones for the same key
    result_dict = merge_nested_dicts(existing_dict.get("results", {}), result_dict)

    # Merge token stats
    old_stats = existing_dict.get("token_stats")
    if old_stats:
        for field in ("total_input_tokens", "total_output_tokens", "num_prompts"):
            merged_token_stats[field] += old_stats.get(field, 0)
        merged_prompts = merged_token_stats["num_prompts"]
        if merged_prompts > 0:
            merged_token_stats["avg_input_tokens"] = merged_token_stats["total_input_tokens"] / merged_prompts
            merged_token_stats["avg_output_tokens"] = merged_token_stats["total_output_tokens"] / merged_prompts
    print("Merged with existing results")

# Create output directory and save
output_path.parent.mkdir(parents=True, exist_ok=True)

final_dict = {
    "meta_data": {
        "file_name": INPUT_FILE,
        "inference_model": MODEL_NAME
    },
    "token_stats": merged_token_stats,
    "results": result_dict
}

# Write to a sibling temp file and swap it in afterwards, so an interrupted or
# failed dump cannot truncate the results accumulated by earlier runs.
tmp_path = output_path.with_name(output_path.name + ".tmp")
with open(tmp_path, "w") as f:
    json.dump(final_dict, f, indent=4)
tmp_path.replace(output_path)

print(f"\nSaved results to: {output_path}")
print(f"Total cases with results: {len(result_dict)}")