In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
from data_processing.gpt_processing import (
    set_api_client, 
    generate_messages, 
    create_jsonl_file_for_batch, 
    start_batch, 
    get_batch_response,
    get_completed_batches,
    set_model_settings,
    get_batch_status,
    get_active_batches,
    get_all_batch_info,
    token_count,
    run_immediate_chat_process,
    run_single_batch,
    get_last_batch_response,
    delete_old_files
)

from data_processing.xml_processing import ( 
    save_pages_to_xml,
    split_xml_pages
)

from data_processing.text_processing import (
    get_text_from_file,
    write_text_to_file
)
from pathlib import Path
%aimport time
%aimport json
%aimport datetime

from datetime import datetime

In [3]:
# Set up API client
# NOTE(review): set_api_client() is called with no arguments, so credentials
# presumably come from the environment — confirm in data_processing.gpt_processing.
# The start_and_poll_batch(test_file) call further down does not use this object.
client = set_api_client()

In [4]:
# Per-model settings, keyed by model name, handed to set_model_settings() below.
model_settings = {
    "gpt-4o": {
        "max_tokens": 5000,
        "context_limit": 20000,  # Total context limit for the model
        "temperature": 0.25
    },
    "gpt-3.5-turbo": {
        # NOTE(review): no "temperature" entry here, unlike gpt-4o — confirm that
        # set_model_settings tolerates or defaults the missing key.
        "max_tokens": 4096,  # Set conservatively to avoid errors
        "context_limit": 16384  # Lower than the 20000 configured for gpt-4o above
        }
    }

set_model_settings(model_settings)

In [5]:
# Filesystem layout for the journal being processed.
# NOTE(review): project_dir is an absolute, machine-specific path; every other
# path in this cell is derived from it.
project_dir = Path("/Users/phapman/Desktop/tnh-scholar/")
data_dir = project_dir.joinpath("data_processing")
journal_dir = data_dir.joinpath("processed_journal_data")
journal_name = "phat-giao-viet-nam-1956-01"
working_dir = journal_dir.joinpath(journal_name)

# XML documents for this journal.
input_xml = working_dir.joinpath(f"TEST2_full_cleaned_{journal_name}.xml")
# NOTE(review): unlike the other per-journal files, this one sits directly under
# journal_dir rather than working_dir — confirm that is intended.
translated_xml = journal_dir.joinpath(f"translation_{journal_name}.xml")

# JSONL and metadata files kept in the journal's working directory.
section_batch_jsonl = working_dir.joinpath("section_batch.jsonl")
translate_batch_jsonl = working_dir.joinpath("translation_batch.jsonl")
section_metadata = working_dir.joinpath("section_metadata.json")

# Log file kept under data_dir/gpt_processing.
logfile = data_dir.joinpath("gpt_processing", "processing_info.log")

In [6]:
# NOTE(review): the current time is passed as the only argument. If it is used
# as an age cutoff, every existing file would count as "old" — confirm against
# delete_old_files before re-running this cell.
delete_old_files(datetime.now())

In [8]:
# Collect batch records into `batches`; the next cell iterates over them and
# reads each record's 'id' key.
batches = get_all_batch_info()

In [9]:
# Display the 'id' of every record in `batches` as the cell output.
[batch['id'] for batch in batches]

['batch_674bd5dd2f3c8190afd4d37df164f59a',
 'batch_674bd56e785c8190871cfad1b187918b',
 'batch_674bc8ac60d88190aa8fe812abf5660d',
 'batch_674bc89be1c08190ac09a0e87069c59e',
 'batch_674bc88b2528819097a5cbd4d203db92',
 'batch_674bc87abd908190af8edc436b216424',
 'batch_674bc86aa0c4819096ad016c4582e3a3',
 'batch_674bc7565d7881909b3e0bb90bb709e0',
 'batch_674bc74574608190bd395b0b741d240e',
 'batch_674bc73468708190b033c2223e34600d',
 'batch_674bc71f33008190a19c2a25bce075e6',
 'batch_674bc303cd608190b3564a2d739cf875',
 'batch_674bc2f39d888190afd26cbb9e321599',
 'batch_674bc2c96fac8190b5bc1c11d3d5e43e',
 'batch_674bc2b844a08190990718e450aefb82',
 'batch_674bc2a768b08190a66b0623a60ca245',
 'batch_674b92c18e848190a9791449ab587afc',
 'batch_674b92b10e548190be2f43597dd8297d',
 'batch_674b92a026c8819099d515684337feac',
 'batch_674b7547d5b881908b831d791cd0be4f',
 'batch_674b73cd5bd0819083c3d983d71da271',
 'batch_674b68d18c608190880b6e86ada37ac5',
 'batch_674b68c0d2c8819081ec1a36400d2490',
 'batch_674

In [None]:
import os
import time
from openai import OpenAI
from pathlib import Path

# Batch input file for the polling experiment below. The path is relative, so
# it resolves against the kernel's current working directory.
test_file = Path("./temp_batch_run.jsonl")

def start_and_poll_batch(
    jsonl_file: Path,
    interval: int = 10,
    description="",
    max_attempts: int = 100,
    client=None,
):
    """
    Starts a batch process and polls its status until it completes, resubmitting
    the batch whenever it ends without completing.

    Each attempt uploads ``jsonl_file`` again, creates a new batch against
    ``/v1/chat/completions`` and then polls that batch every ``interval``
    seconds. A batch that reaches "failed", "expired" or "cancelled" will never
    complete, so any of those statuses ends the current attempt and triggers a
    fresh submission. All other statuses are treated as "still running".

    Args:
        jsonl_file (Path): Path to the .jsonl batch file.
        interval (int): Time interval in seconds to wait between polling attempts.
        description (str): Metadata description for the batch job.
        max_attempts (int): Maximum number of batch submissions before giving up.
        client: Object exposing ``files.create``, ``batches.create`` and
            ``batches.retrieve``. When None, a new OpenAI client is built from
            the OPENAI_API_KEY environment variable.

    Returns:
        bool: True as soon as one batch completes, False if no batch completed
        within ``max_attempts`` submissions.
    """
    # Terminal statuses other than "completed": polling further is pointless.
    retry_statuses = {"failed", "expired", "cancelled"}

    if client is None:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    for attempt in range(max_attempts):
        # Upload the input file and start a new batch for this attempt.
        with jsonl_file.open("rb") as file:
            batch_input_file = client.files.create(file=file, purpose="batch")

        batch = client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": description}
        )
        batch_id = batch.id
        print(f"Batch started successfully: {batch_id}")

        # Wait once before the first status check.
        time.sleep(interval)

        while True:
            batch_status = client.batches.retrieve(batch_id).status

            if batch_status == "completed":
                print("Batch completed successfully.")
                return True

            if batch_status in retry_statuses:
                # Prints "Batch failed on attempt N. Retrying..." for "failed".
                print(f"Batch {batch_status} on attempt {attempt + 1}. Retrying...")
                break  # leave the polling loop and submit a new batch

            print(f"batch status: {batch_status}")
            time.sleep(interval)

    print(f"Exceeded maximum attempts ({max_attempts}). Exiting.")
    return False

In [20]:
# Run the polling experiment on test_file with the function's defaults
# (10-second interval, empty description); the boolean result is the cell output.
start_and_poll_batch(test_file)

Batch started successfully: batch_674bdb63e2c88190a8ee013663b60dfa
Batch failed on attempt 1. Retrying...
Batch started successfully: batch_674bdb6fb724819089bedac2b018e2f7
Batch completed successfully.


True