In [3]:
"""
Download OpenAI Batch outputs and name them after their input file (+ '_out').

Requirements:
  - pip install openai
  - Set environment variable: OPENAI_API_KEY
Usage:
  - Edit OUTPUT_DIR below, or pass a path to download_batch_outputs(output_dir=...)
  - Optional: pass a list of batch IDs to download only those batches.
"""

import os
from pathlib import Path
from typing import Iterable, Optional

from openai import OpenAI
import apikey
# === CONFIG ===
OUTPUT_DIR = r"D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches"  # <-- change as you like

client = OpenAI( api_key=apikey.get_api_key())


# Set your API key

def safe_mkdir(path: str | Path) -> Path:
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)
    return p

def filename_from_input_file(input_file_id: str) -> str:
    """
    Retrieve the original filename of the uploaded input file.
    Fallback to the file ID if the name isn't available.
    """
    f = client.files.retrieve(input_file_id)
    return getattr(f, "filename", None) or input_file_id

def build_output_name(input_filename: str) -> str:
    """
    Turn 'batch_0.jsonl' -> 'batch_0_out.jsonl'
    If there is no extension, just append '_out.jsonl'.
    """
    stem = Path(input_filename).stem  # 'batch_0'
    return f"{stem}_out.jsonl"



def download_file(file_id: str, dest_path: Path) -> None:
    resp = client.files.content(file_id)   # returns a readable object
    data = resp.read()                     # just call .read(), no "with"
    dest_path.write_bytes(data)


def list_completed_batches(limit: int = 1000):
    """
    Generator over completed batches (status == 'completed').
    Paginates through all batches up to `limit`.
    """
    fetched = 0
    after = None
    while fetched < limit:
        resp = client.batches.list(limit=min(100, limit - fetched), after=after)
        if not resp.data:
            break
        for b in resp.data:
            yield b
        fetched += len(resp.data)
        # batches are returned sorted by created time desc; pagination token:
        after = resp.last_id
        if not after:
            break

def resolve_batches(batch_ids: Optional[Iterable[str]] = None):
    """
    Yield batch objects either by specific IDs or all completed batches.
    """
    if batch_ids:
        for bid in batch_ids:
            yield client.batches.retrieve(bid)
    else:
        yield from list_completed_batches()

def download_batch_outputs(
    output_dir: str | Path = OUTPUT_DIR,
    batch_ids: Optional[Iterable[str]] = None,
    skip_existing: bool = True,
) -> None:
    """
    Download outputs for completed batches, naming them after their input file.

    Parameters
    ----------
    output_dir : directory to save outputs
    batch_ids : iterable of batch IDs to restrict downloads; if None, process all completed batches
    skip_existing : if True, do not overwrite files already present
    """
    outdir = safe_mkdir(output_dir)

    count_ok = 0
    count_skip = 0
    count_waiting = 0
    count_error = 0

    for batch in resolve_batches(batch_ids):
        status = getattr(batch, "status", "unknown")
        input_file_id = getattr(batch, "input_file_id", None)
        output_file_id = getattr(batch, "output_file_id", None)
        error_file_id = getattr(batch, "error_file_id", None)

        # Only proceed if completed
        if status != "completed":
            print(f"↷ Batch {batch.id}: status={status} (not downloaded)")
            if status in {"finalizing", "in_progress"}:
                count_waiting += 1
            continue

        if not input_file_id:
            print(f"⚠️  Batch {batch.id}: missing input_file_id; skipping")
            count_error += 1
            continue

        if not output_file_id:
            # Batch completed but no output file (should be rare); surface error file if present
            msg = f"⚠️  Batch {batch.id}: completed but no output_file_id"
            if error_file_id:
                msg += f" (error_file_id={error_file_id})"
            print(msg)
            count_error += 1
            continue

        # Get the original input filename and build the destination name
        input_filename = filename_from_input_file(input_file_id)
        out_name = build_output_name(input_filename)
        dest = outdir / out_name

        if skip_existing and dest.exists():
            print(f"⏭️  {dest.name} already exists; skipping")
            count_skip += 1
            continue

        try:
            download_file(output_file_id, dest)
            print(f"✅  Saved: {dest}")
            count_ok += 1
        except Exception as e:
            print(f"❌  Failed to save {dest.name} from batch {batch.id}: {e}")
            count_error += 1

    print("\n=== Summary ===")
    print(f"Downloaded: {count_ok}")
    print(f"Skipped:    {count_skip}")
    print(f"Waiting:    {count_waiting} (not completed yet)")
    print(f"Errors:     {count_error}")

if __name__ == "__main__":
    # Example usage:
    #   - To process ALL completed batches: just run the script as-is.
    #   - To target specific batches, pass their IDs:
    #       download_batch_outputs(batch_ids=["batch_abc123", "batch_def456"])
    download_batch_outputs(output_dir=OUTPUT_DIR)


✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_59_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_58_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_57_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_56_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_55_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_54_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_53_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_52_G2_out.jsonl
✅  Saved: D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\batch_51_G2_out.jsonl
✅

NotFoundError: Error code: 404 - {'error': {'message': 'No such File object: file-3vkP14bcr33DcMLLiSi1oN', 'type': 'invalid_request_error', 'param': 'id', 'code': None}}