In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Imports
from data_processing.gpt_processing import (
    generate_messages, 
    create_jsonl_file_for_batch, 
    start_batch,
    start_batch_with_retries, 
    get_batch_response,
    get_completed_batches,
    set_model_settings,
    get_batch_status,
    get_active_batches,
    get_all_batch_info,
    token_count,
    run_immediate_chat_process,
    run_single_batch,
    get_last_batch_response,
)

from data_processing.xml_processing import ( 
    save_pages_to_xml,
    split_xml_on_pagebreaks,
    join_xml_data_to_doc
)

from data_processing.text_processing import (
    get_text_from_file,
    write_text_to_file
)
%aimport time
%aimport json
%aimport datetime
%aimport logging
from pathlib import Path
from types import SimpleNamespace
from math import floor
from datetime import datetime
import json


In [None]:
# File paths
# NOTE(review): `project_dir` is an absolute, machine-specific location; every
# other path in the notebook is derived from it and from `journal_name`.
project_dir = Path("/Users/phapman/Desktop/tnh-scholar/")
data_dir = project_dir.joinpath("data_processing")
journal_dir = data_dir.joinpath("processed_journal_data")
journal_name = "phat-giao-viet-nam-1956-02"
working_dir = journal_dir.joinpath(journal_name)

# Inputs / outputs of the sectioning and translation steps for this journal.
input_xml = working_dir.joinpath(f"full_cleaned_{journal_name}.xml")
translation_xml_path = working_dir.joinpath(f"translation_{journal_name}.xml")
section_batch_jsonl = working_dir.joinpath("section_batch.jsonl")
translate_batch_jsonl = working_dir.joinpath("translation_batch.jsonl")
section_metadata_out = working_dir.joinpath("section_metadata.json")
raw_json_metadata_path = working_dir.joinpath("raw_metadata_response.txt")

# Log file handed to setup_logger.
logfile = data_dir.joinpath("gpt_processing", "processing_info.log")


In [None]:
# constants
# Accumulated per-section token budget at which translate_sections sends the
# sections collected so far as one batch.
MAX_TOKEN_LIMIT = 20000
# The two values below are the defaults of batch_translate's `max_retries` /
# `retry_delay` parameters.
MAX_BATCH_RETRIES = 20  # Number of retries
BATCH_RETRY_DELAY = 5  # seconds to wait before retry

In [None]:
# Set up the logger
def setup_logger(log_file_path):
    """
    Configure root logging to write to a log file and to the console.

    Existing root handlers are removed *and closed* first, so re-running the
    notebook cell neither duplicates log lines nor leaks the previously opened
    log file. The log file's parent directory is created when it is missing.

    Args:
        log_file_path (str | Path): Destination of the log file.

    Returns:
        logging.Logger: The logger named after the current module (`__name__`).
    """
    # Remove existing handlers and release their resources (e.g. open file streams).
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
        handler.close()

    # FileHandler opens its file on construction and fails if the directory is absent.
    Path(log_file_path).parent.mkdir(parents=True, exist_ok=True)

    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Include logger name
        handlers=[
            logging.FileHandler(log_file_path, encoding="utf-8"),
            logging.StreamHandler()  # Also echo log records to the console
        ]
    )

    # Suppress DEBUG/INFO logs for specific noisy modules
    modules_to_suppress = ["httpx", "httpcore", "urllib3", "openai"]
    for module in modules_to_suppress:
        logging.getLogger(module).setLevel(logging.WARNING)

    return logging.getLogger(__name__)

In [None]:
# Configure logging once; `logger` is the module-level logger the helper
# functions defined in later cells write to.
logger = setup_logger(logfile)

In [None]:
# Model settings in effect when the sectioning step runs (temperature 0.25).
# A later cell re-applies the same settings with a different temperature
# before the translation step.
model_settings = {
    "gpt-4o": {
        "max_tokens": 5000,
        "context_limit": 20000,  # Total context limit for the model
        "temperature": 0.25
    }
}
set_model_settings(model_settings)

In [None]:
# System prompt for the sectioning step. The field names quoted in instruction 3
# match the JSON schema shown in the prompt (and `journal_schema`), keywords are
# requested as a JSON array, and unavailable data is requested as JSON null.
system_message_section = """
You are a highly skilled assistant processing a Vietnamese Buddhist journal scanned from OCR. Use the title: "Journal of Vietnamese Buddhism."
You will be determining the journal sections by page number. You will also generate metadata for the full text and each section. 
You will return this metadata in JSON format.

Instructions:
1. Analyze the text and divide it into sections based on logical breaks, such as headings, topic changes, or clear shifts in content.
2. Ensure every page is part of a section, even if that section is titled "blank page" or "title page," for example.
3. For each section, provide:
   - The original title in Vietnamese (`title_vi`).
   - The translated title in English (`title_en`).
   - The author's name if it is available (`author`).
   - A one-paragraph summary of the section in English (`summary`).
   - A list of keywords for the section that are related to its content, these can be proper names, specific concepts, or contextual information (`keywords`).
   - The section's start and end page numbers (`start_page` and `end_page`).
   - Use null for any data that is not available (such as author name) for the section.

4. Return the output as a JSON object with the following schema:
{
    "journal_summary": "A one-page summary of the whole journal in English.",
    "sections": [
        {
            "title_vi": "Original title in Vietnamese",
            "title_en": "Translated title in English",
            "author": "Name of the author of the section, or null",
            "summary": "One-paragraph summary of the section in English",
            "keywords": ["keyword 1", "keyword 2", ...],
            "start_page":  X,
            "end_page":  Y
        },
        ...
    ]
}

5.  Ensure the JSON is well-formed and adheres strictly to the provided schema.
"""

In [None]:
# System prompt for the translation step: defines the translator persona, the
# allowed XML tags, and the output constraints (pure XML wrapped in <section>).
system_message_translate = """
You are Thich Nhat Hanh translating from Vietnamese to English for your experienced students. 
The text is based on an OCR scan of a journal you edited from 1956-1958. Use the title: "Journal of Vietnamese Buddhism" for the journal when it is referenced.
You will be translating a single section of the journal and will be provided with the section title in English. 
You want your students to understand the text in its larger historical context, in the context of Vietnamese Buddhism, and in the context of your own life.
Translate for the most meaningful, typical, and eloquent English interpretation that is simple, yet poetic. Translate literally, don't add any content. 
Notes on the text can be added in the <notes>.
Make corrections in the text only where necessary (for example if words are missing) to create logical flow. Note all corrections in the <translation-notes>. 
Do not change <pagebreak> tag positioning. Each translated page must match its original page source as pages will be studied side by side with the original Vietnamese.
Infer paragraphs and text structure from the text layout.
Add XML tags for clarity, using only the following tags: 

   <section> for major sections.
   <subsection> for subsections.
   <title> for main titles of sections and subsections. 
   <subtitle> for subtitles of sections and subsections. 
   <heading> for headings that do not mark titles or subtitles
   <p> for paragraphs.
   <br/> for linebreaks that add meaning such as in poems or other structures.
   <TOC> for tables of contents
   <author> for authors of sections or subsections
   <ol> <ul> <li> for lists
   <i> for italics. 
   <b> for bold.
   <notes>
   <translation-notes>

You may use <notes> at the end of the section for notes on historical, cultural, spiritual, or other interesting elements of the text.
You may add <translation-notes> at the end of the section as a commentary to summarize your translation choices. 
For <translation-notes>, you may include information on Sino-Vietnamese, complex, unusual, poetic, or other interesting terms, and significant corrections to the text. 
In the <translation-notes> include the original Vietnamese terms for reference.

IMPORTANT: All titles, XML sections, text, and terms should be translated. Do not however, translate names of people; leave names in Vietnamese with diacritics.
IMPORTANT: Return pure XML with no formatting marks such as xml or ```.
IMPORTANT: The returned XML should begin and end with <section> tags.
"""

In [None]:
import json

def deserialize_json(serialized_data: str):
    """
    Parse a serialized JSON string into the corresponding Python object.

    Args:
        serialized_data (str): The JSON text to parse.

    Returns:
        The value produced by `json.loads` (a dict for a JSON object).

    Raises:
        ValueError: If `serialized_data` is not a string.
        json.JSONDecodeError: If the text is not valid JSON (logged, then re-raised).
    """
    if isinstance(serialized_data, str):
        try:
            parsed = json.loads(serialized_data)
        except json.JSONDecodeError as decode_error:
            logger.error(f"Failed to deserialize JSON: {decode_error}")
            raise
        return parsed

    # Anything other than a string is rejected up front.
    logger.error(f"String input required for deserialize_json. Received: {type(serialized_data)}")
    raise ValueError("String input required.")

In [None]:
# Schema for the sectioning metadata: a journal-level summary plus a list of
# sections. It is passed to validate_and_clean_data, which uses the "type",
# "properties" and "items" entries to coerce the model's JSON response.
# `author` is the only nullable field.
journal_schema = {
    "type": "object",
    "properties": {
        "journal_summary": {"type": "string"},
        "sections": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title_vi": {"type": "string"},
                    "title_en": {"type": "string"},
                    "author": {"type": ["string", "null"]},
                    "summary": {"type": "string"},
                    "keywords": {"type": "array", "items": {"type": "string"}},
                    "start_page": {"type": "integer", "minimum": 1},
                    "end_page": {"type": "integer", "minimum": 1}
                },
                "required": [
                    "title_vi",
                    "title_en",
                    "summary",
                    "keywords",
                    "start_page",
                    "end_page"
                ]
            }
        }
    },
    "required": ["journal_summary", "sections"]
}

def validate_and_clean_data(data, schema):
    """
    Recursively coerce AI-generated data into the shape described by `schema`.

    Only keys listed under a schema's "properties" are kept; extra keys are
    dropped. Missing or uncoercible values are replaced by defaults: "unset"
    for strings, 0 for integers, [] for arrays, {} for objects and None for
    nullable fields. The schema's "required" and "minimum" entries are not
    enforced here.

    Args:
        data (dict): The AI-generated data to validate and clean.
        schema (dict): JSON-schema-like description; its top-level "type"
            must be "object".

    Returns:
        dict: The cleaned data. A non-dict `data` yields an empty dict.

    Raises:
        ValueError: If the top-level schema type is not "object".
    """
    log = logging.getLogger(__name__)

    def clean_value(value, field_schema):
        """Coerce a single value to the type named by `field_schema`."""
        field_type = field_schema.get("type")

        # Nullable types are declared as a list such as ["string", "null"];
        # accept the members in any order.
        if isinstance(field_type, list):
            if value is None and "null" in field_type:
                return None
            concrete_types = [t for t in field_type if t != "null"]
            if len(concrete_types) != 1:
                return "unset"
            field_type = concrete_types[0]

        if field_type == "string":
            if isinstance(value, str):
                return value
            if value is not None:
                return str(value)
            return "unset"

        if field_type == "integer":
            # bool is a subclass of int; a JSON true/false is not a usable integer.
            if isinstance(value, bool):
                return 0
            if isinstance(value, int):
                return value
            if isinstance(value, str):
                try:
                    return int(value)
                except ValueError:
                    pass  # fall through to the float route, e.g. "2.0"
            try:
                return int(float(value))
            except (ValueError, TypeError, OverflowError):
                # OverflowError covers infinities such as "inf" or 1e999.
                return 0

        if field_type == "array":
            if isinstance(value, str):
                # Split comma-separated strings into a list, dropping empty entries.
                value = [part.strip() for part in value.split(",") if part.strip()]
            if not isinstance(value, list):
                return []
            item_schema = field_schema.get("items")
            if not item_schema:
                # No item schema to clean against: keep the items unchanged.
                return list(value)
            return [clean_value(item, item_schema) for item in value]

        if field_type == "object":
            return clean_object(value, field_schema) if isinstance(value, dict) else {}

        # Default case for unknown or unsupported types
        return "unset"

    def clean_object(obj, obj_schema):
        """Keep only the schema's properties of `obj`, cleaning each value."""
        if not isinstance(obj, dict):
            log.warning(f"Expected dict but got: \n{type(obj)}: {obj}\nResetting to empty dict.")
            return {}
        properties = obj_schema.get("properties", {})
        # Missing keys come back as None from .get() and receive the type's default.
        return {
            key: clean_value(obj.get(key), field_schema)
            for key, field_schema in properties.items()
        }

    if schema.get("type") != "object":
        raise ValueError("Top-level schema must be of type 'object'.")
    return clean_object(data, schema)

def validate_and_save_metadata(output_file_path: Path, json_metadata_serial: str, schema):
    """
    Deserialize journal metadata, clean it against `schema`, and write it to a JSON file.

    Args:
        output_file_path (Path): Destination JSON file; opened in write mode.
        json_metadata_serial (str): The metadata as a serialized JSON string.
        schema (dict): The schema passed to `validate_and_clean_data`.

    Returns:
        bool: True once the file has been written.

    Raises:
        Exception: Any error raised while deserializing, cleaning or writing
            is logged and re-raised unchanged.
    """
    try:
        # Parse the raw response, then coerce it to the schema.
        cleaned_metadata = validate_and_clean_data(
            deserialize_json(json_metadata_serial),
            schema,
        )

        # Keep non-ASCII (Vietnamese) text readable in the output file.
        with open(output_file_path, mode="w", encoding="utf-8") as out_file:
            json.dump(cleaned_metadata, out_file, indent=4, ensure_ascii=False)

        logger.info(f"Parsed and validated metadata successfully written to {output_file_path}")
        return True
    except Exception as failure:
        logger.error(f"An error occurred during validation or writing: {failure}")
        raise

In [None]:
def extract_page_groups_from_metadata(metadata):
    """
    Extract (start_page, end_page) page ranges from the section metadata.

    Section entries that are not dicts, or whose `start_page` / `end_page` is
    not an integer, are skipped with a logged warning, so the result may be
    shorter than `metadata["sections"]`.

    Parameters:
        metadata (dict): The section metadata containing sections with start and end pages.

    Returns:
        List[Tuple[int, int]]: A list of tuples, each representing a page range (start_page, end_page).

    Raises:
        ValueError: If `metadata` has no "sections" key or its value is not a list.
    """
    # Ensure metadata contains sections
    if "sections" not in metadata or not isinstance(metadata["sections"], list):
        raise ValueError("Metadata does not contain a valid 'sections' key with a list of sections.")

    page_groups = []
    for section in metadata["sections"]:
        # A malformed (non-dict) entry has no .get(); skip it instead of crashing.
        if not isinstance(section, dict):
            logger.warning(f"Error processing section metadata: section is not a dict: {section!r}")
            continue

        start_page = section.get("start_page")
        end_page = section.get("end_page")

        # Both page numbers must be integers for the range to be usable.
        if not isinstance(start_page, int) or not isinstance(end_page, int):
            logger.warning(f"Error processing section metadata: Invalid page range in section: {section}")
            continue

        page_groups.append((start_page, end_page))

    logger.debug(f"page groups found: {page_groups}")

    return page_groups

In [None]:
def batch_sectioning(input_xml_path, journal_name):
    """
    Run the sectioning batch for a journal and return the model's response.

    The journal text is read from `input_xml_path`, wrapped into messages with
    `system_message_section`, written to `section_batch_jsonl`, and submitted
    through `start_batch_with_retries`.

    Args:
        input_xml_path (str): Path to the input XML file.
        journal_name (str): Name of the journal being processed.

    Returns:
        The first element of the batch response (expected to be the serialized
        JSON metadata), or "" when the batch returned nothing.

    Raises:
        RuntimeError: If reading the input or building the batch file fails.
    """
    def passthrough(text):
        # The journal text is sent as the user message without extra wording.
        return f"{text}"

    try:
        logger.info(f"Starting sectioning batch for {journal_name} with file:\n\t{input_xml_path}")
        journal_text = get_text_from_file(input_xml_path)
        section_messages = generate_messages(system_message_section, passthrough, [journal_text])
        batch_file = create_jsonl_file_for_batch(section_messages, section_batch_jsonl, json_mode=True)
    except Exception as init_error:
        logger.error(
            f"Failed to initialize batch sectioning data for journal '{journal_name}'.",
            extra={"input_xml_path": input_xml_path},
            exc_info=True
        )
        raise RuntimeError(f"Error initializing batch sectioning data for journal '{journal_name}'.") from init_error

    batch_description = f"Batch for sectioning journal: {journal_name} | input file: {input_xml_path}"
    response = start_batch_with_retries(batch_file, description=batch_description)

    if not response:
        logger.error("Section batch failed to get response.")
        return ""

    # Only one request was submitted, so the first response is the one we want.
    first_result = response[0]
    logger.info(f"Successfully batch sectioned journal '{journal_name}' with input file: {input_xml_path}.")
    return first_result

In [None]:
def save_sectioning_data(output_json_path: Path, raw_output_path: Path, serial_json: str):
    """
    Save the raw sectioning response, then validate it and save the cleaned metadata.

    Note: log and error messages read the notebook-level `journal_name`.

    Args:
        output_json_path (Path): Where the cleaned metadata JSON is written.
        raw_output_path (Path): Where the unmodified response text is written.
        serial_json (str): The serialized JSON response from the sectioning batch.

    Returns:
        Path: `output_json_path`, once both files have been written.

    Raises:
        RuntimeError: If writing the raw response fails, or if validating /
            saving the metadata fails.
    """
    # Step 1: keep the raw response on disk before attempting to parse it.
    try:
        write_text_to_file(raw_output_path, serial_json, force=True)
    except Exception as write_error:
        logger.error(
            f"Failed to write raw response file for journal '{journal_name}'.",
            extra={"raw_output_path": raw_output_path},
            exc_info=True
        )
        raise RuntimeError(f"Failed to write raw response file for journal '{journal_name}'.") from write_error

    # Step 2: validate against journal_schema and write the cleaned metadata.
    try:
        if not validate_and_save_metadata(output_json_path, serial_json, journal_schema):
            raise RuntimeError(f"Validation failed for metadata of journal '{journal_name}'.")
    except Exception as validation_error:
        logger.error(
            f"Error occurred while validating and saving metadata for journal '{journal_name}'.",
            extra={"output_json_path": output_json_path},
            exc_info=True
        )
        raise RuntimeError(f"Validation error for journal '{journal_name}'.") from validation_error

    return output_json_path

In [None]:
def send_data_for_tx_batch(section_data_to_send, max_token_list):
    """
    Build the translation batch file for the given sections and run the batch.

    Note: log messages and the batch description read the notebook-level
    `journal_name`.

    Args:
        section_data_to_send (list): Objects exposing `.title` and `.content`,
            one per section to translate.
        max_token_list (list[int]): Forwarded to `create_jsonl_file_for_batch`
            as `max_token_list`; the caller builds it in parallel with
            `section_data_to_send` (one token budget per section).

    Returns:
        The non-empty result of `start_batch_with_retries` for this batch.

    Raises:
        RuntimeError: If the batch file cannot be created, or if the batch
            returns no data.
    """
    # Build file for batch translation processing:
    try:
        # Create GPT messages for translation
        user_message_wrapper = lambda section_info: f"Translate this section with title '{section_info.title}':\n{section_info.content}"
        messages = generate_messages(system_message_translate, user_message_wrapper, section_data_to_send)

        # Create batch file
        jsonl_file = create_jsonl_file_for_batch(messages, translate_batch_jsonl, max_token_list=max_token_list)
        if not jsonl_file:
            raise RuntimeError("Failed to create JSONL file for translation batch.")

    except Exception as e:
        logger.error(f"Error creating JSONL file for journal '{journal_name}'.", exc_info=True)
        raise RuntimeError("Error creating JSONL file for translation batch.") from e

    translation_data = start_batch_with_retries(jsonl_file, description=f"Batch for translating journal '{journal_name}'")

    # Do not report success (or hand an empty result to the caller) when the
    # batch produced nothing.
    if not translation_data:
        logger.error(f"Translation batch for journal '{journal_name}' returned no data.")
        raise RuntimeError(f"Translation batch for journal '{journal_name}' returned no data.")

    logger.info("Successfully translated section batch.")

    return translation_data

In [None]:
def translate_sections(section_contents, section_metadata):
    """
    Translate all sections, grouping them into batches by token budget.

    Each section gets an output budget of floor(1.3 * its token count) + 1000.
    Sections are accumulated until the summed budgets reach MAX_TOKEN_LIMIT
    (or the last section is reached); the accumulated group is then sent with
    `send_data_for_tx_batch` and a new group is started.

    Args:
        section_contents (list): Content of each section, in metadata order.
        section_metadata (dict): Metadata whose 'sections' list supplies each
            section's `title_en`.

    Returns:
        list: The batch results of all groups, concatenated in order.

    Raises:
        RuntimeError: If the number of contents and metadata sections differ.
    """
    sections_meta = section_metadata['sections']
    if len(section_contents) != len(sections_meta):
        raise RuntimeError("Section length mismatch.")

    translations = []
    pending_sections = []
    pending_budgets = []
    pending_total = 0
    final_index = len(sections_meta) - 1

    for index, (meta, content) in enumerate(zip(sections_meta, section_contents)):
        # Token budget for this section's translation.
        budget = floor(token_count(content) * 1.3) + 1000
        pending_budgets.append(budget)
        pending_total += budget

        pending = SimpleNamespace(title=meta["title_en"], content=content)
        pending_sections.append(pending)
        logger.debug(f"section {index}: {pending.title} added for batch processing.")

        budget_reached = pending_total >= MAX_TOKEN_LIMIT
        if budget_reached or index == final_index:
            # Flush the current group, then start collecting the next one.
            translations.extend(send_data_for_tx_batch(pending_sections, pending_budgets))
            pending_sections, pending_budgets, pending_total = [], [], 0

    return translations


In [None]:
# Step 2: Translation
def batch_translate(input_xml_path, metadata_path, journal_name, max_retries=MAX_BATCH_RETRIES, retry_delay=BATCH_RETRY_DELAY):
    """
    Translate the journal section by section using the GPT model.

    The section metadata is loaded from `metadata_path`, the input XML is split
    into sections along the metadata's page ranges, and the sections are handed
    to `translate_sections`.

    Args:
        input_xml_path (str): Path to the input XML file.
        metadata_path (str): Path to the metadata JSON file.
        journal_name (str): Name of the journal.
        max_retries (int): Not used by this function body.
        retry_delay (int): Not used by this function body.

    Returns:
        The translation data returned by `translate_sections`.

    Raises:
        RuntimeError: If loading the metadata or splitting the XML fails.
    """
    logger.info(f"Starting translation batch for journal '{journal_name}':\n\twith file: {input_xml_path}\n\tmetadata: {metadata_path}")

    # Data initialization: metadata first, then the XML split by page ranges.
    try:
        section_metadata = deserialize_json(get_text_from_file(metadata_path))
        if not section_metadata:
            raise RuntimeError(f"Metadata could not be loaded from {metadata_path}.")

        page_groups = extract_page_groups_from_metadata(section_metadata)
        section_contents = split_xml_on_pagebreaks(get_text_from_file(input_xml_path), page_groups)
    except Exception as init_error:
        logger.error(f"Failed to initialize data for translation batching for journal '{journal_name}'.", exc_info=True)
        raise RuntimeError(f"Error during data initialization for journal '{journal_name}'.") from init_error

    return translate_sections(section_contents, section_metadata)
    

In [None]:
def save_translation_data(xml_output_path: Path, translation_data):
    """
    Save the translated content to an XML document.

    Note: log and error messages read the notebook-level `journal_name`.

    Args:
        xml_output_path (Path): Destination XML file; passed to
            `join_xml_data_to_doc` with overwrite=True.
        translation_data: The translated sections to join into the document.

    Raises:
        RuntimeError: If joining or writing the XML fails.
    """
    failure_message = f"Failed to save translation data for journal '{journal_name}'."
    try:
        logger.info(f"Saving translated content to XML for journal '{journal_name}'.")
        join_xml_data_to_doc(xml_output_path, translation_data, overwrite=True)
        logger.info(f"Translated journal saved successfully to:\n\t{xml_output_path}")
    except Exception as save_error:
        logger.error(
            failure_message,
            extra={"xml_output_path": xml_output_path},
            exc_info=True
        )
        raise RuntimeError(failure_message) from save_error

In [None]:
# Display the input XML path as a sanity check before running the pipeline.
input_xml

In [None]:
# Display the path the translated XML will be written to.
translation_xml_path

In [None]:
# Step 1: Sectioning
# Run the sectioning batch, then save both the raw response and the cleaned
# metadata; `metadata_path` is the cleaned metadata file used by Step 2.
metadata_serial_json = batch_sectioning(input_xml, journal_name)
metadata_path = save_sectioning_data(section_metadata_out, raw_json_metadata_path, metadata_serial_json)  

In [None]:
# Re-apply the model settings with temperature 0.75 before the translation
# step runs; the other values are unchanged from the earlier settings cell.
model_settings = {
    "gpt-4o": {
        "max_tokens": 5000,
        "context_limit": 20000,  # Total context limit for the model
        "temperature": 0.75
    }
}
set_model_settings(model_settings)

In [None]:
# Step 2: translate the journal using the metadata saved by the sectioning step.
# `translation_data` is only defined when `metadata_path` is truthy.
if metadata_path:
    translation_data = batch_translate(input_xml, metadata_path, journal_name)

In [None]:
# Inspect the translation results before saving them.
translation_data

In [None]:
# Write the translated sections to the translation XML file.
save_translation_data(translation_xml_path, translation_data)

In [None]:
# Manual recovery: fetch a batch response via get_last_batch_response
# (presumably the most recently completed batch — confirm in gpt_processing).
result = get_last_batch_response()

In [None]:
# Show the first response of the fetched batch for inspection.
print(result[0])

In [None]:
# Manual recovery: validate the fetched response and overwrite the section metadata file.
# NOTE(review): if the fetched batch was a translation batch, result[0] is not
# sectioning JSON and this cell will fail — confirm before running it.
validate_and_save_metadata(section_metadata_out, result[0], journal_schema)

In [None]:
# def batch_sectioning(input_xml_path, output_json_path, raw_output_path, journal_name, max_retries=MAX_BATCH_RETRIES, retry_delay=BATCH_RETRY_DELAY):
#     """
#     Splits the journal content into sections using GPT, with retries for both starting and completing the batch.
#     """
#     journal_pages = get_text_from_file(input_xml_path)

#     # Create GPT messages for sectioning
#     user_message_wrapper = lambda text: f"{text}"
#     messages = generate_messages(system_message_section, user_message_wrapper, [journal_pages])

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, section_batch_jsonl, json_mode=True)

#     for attempt in range(max_retries):
#         try:
#             # Try to start the batch
#             batch = start_batch(jsonl_file, description=f"Batch for sectioning journal: {journal_name} | input file: {input_xml_path}")
#             batch_id = batch.id
#             if not batch_id:
#                 raise RuntimeError("Batch started but no ID was returned.")

#             print(f"Batch for sectioning started successfully on attempt {attempt + 1}. ID: {batch_id}")

#             # Poll for batch completion
#             json_results = poll_batch_for_response(batch_id)
#             if json_results:
#                 break # exit retry loop
#             else:
#                 raise RuntimeError("Unknown error in polling for batch response.", exc_info=True)

#         except Exception as e:
#             print(f"Attempt {attempt + 1} failed: {e}. Retrying batch process in {retry_delay} seconds...")
#             time.sleep(retry_delay)
#     else:
#         logger.error("Failed to complete batch sectioning after maximum retries.")
#         raise RuntimeError("Error: Failed to complete batch sectioning after maximum retries.")

#     # save raw result
#     try:
#         write_text_to_file(raw_output_path, json_results, force=True)
#     except Exception as e:
#         logger.error(f"failed to write raw response file: {raw_output_path}")
#         raise

#     # If successful, try to validate and save metadata and exit loop
#     try:
#         valid = validate_and_save_metadata(output_json_path, json_results, journal_schema)
#     except Exception as e:
#         logger.error(f"Error occurred while validating and saving metadata for journal {journal_name}: '{output_json_path}' (batch ID: {batch_id}).", exc_info=True)
#         raise
    
#     if valid:
#         logger.info(f"Successfully processed {journal_name}: {input_xml_path} with batch: {batch_id} and saved metadata to {output_json_path} ")
#         return output_json_path
        
    


In [None]:
# # Step 1: Sectioning
# def batch_sectioning(input_xml_path, output_xml_path):
#     """
#     Splits the journal content into sections using the GPT model.
#     Saves the sectioned content back to XML.
#     """
#     # Load the input XML
#     journal_pages = load_xml(input_xml_path)
#     pages_content = [page.text for page in journal_pages]

#     # Create GPT messages for sectioning
#     user_message_wrapper = lambda text: f"Divide this content into sections:\n{text}"
#     messages = generate_messages(system_message_section, user_message_wrapper, pages_content)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, section_batch_jsonl)

#     # Start the batch
#     batch = start_batch(jsonl_file, description="Batch for sectioning journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for sectioning.")
#         return None

#     print(f"Batch for sectioning started with ID: {batch_id}")

#     # Poll for batch completion
#     results = poll_batch_status(batch_id)
#     if not results:
#         print("Error: Failed to retrieve sectioning batch results.")
#         return None

#     # Save sectioned content back to XML
#     for i, section_content in enumerate(results):
#         journal_pages[i].text = section_content  # Replace original content with sectioned content

#     save_xml(journal_pages, output_xml_path)
#     print(f"Sectioned journal saved to {output_xml_path}")

In [None]:
# import os
# import json
# from gpt_processing.gpt_interface import (
#     set_api_client, 
#     generate_messages, 
#     create_jsonl_file_for_batch, 
#     start_batch, 
#     get_batch_response
# )
# from data_processing.xml_processing import (
#     load_xml, 
#     save_xml, 
#     extract_sections_from_xml
# )

# # Initialize OpenAI client
# set_api_client()

# # File paths
# INPUT_XML = "input_journal.xml"
# SECTIONED_XML = "sectioned_journal.xml"
# TRANSLATED_XML = "translated_journal.xml"
# BATCH_SECTION_JSONL = "section_batch.jsonl"
# BATCH_TRANSLATE_JSONL = "translate_batch.jsonl"

# # System messages
# SYSTEM_MESSAGE_SECTION = """
# You are a helpful assistant. Divide the text into meaningful sections and add XML tags:
# <section> for major sections, <subsection> for subsections, <title> for titles, and <p> for paragraphs.
# """
# SYSTEM_MESSAGE_TRANSLATE = """
# You are Thich Nhat Hanh translating from Vietnamese to English. Provide meaningful translations with appropriate XML tags:
# <section>, <subsection>, <title>, <p>.
# """

# # Step 1: Sectioning
# def batch_sectioning(input_xml, output_xml):
#     # Load the input XML and extract pages or chunks
#     journal_pages = load_xml(input_xml)
#     pages_content = [page.text for page in journal_pages]  # Assuming .text contains the text of each page

#     # Create GPT messages for sectioning
#     user_message_wrapper = lambda text: f"Divide this content into sections:\n{text}"
#     messages = generate_messages(SYSTEM_MESSAGE_SECTION, user_message_wrapper, pages_content)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, BATCH_SECTION_JSONL)

#     # Start batch
#     batch = start_batch(jsonl_file, description="Batch for sectioning journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for sectioning.")
#         return None

#     print(f"Batch for sectioning started with ID: {batch_id}")

#     # Poll for batch completion and retrieve results
#     results = get_batch_response(batch_id)
#     if not results:
#         print("Error: Failed to retrieve sectioning batch results.")
#         return None

#     # Save the sectioned content back to XML
#     for i, section_content in enumerate(results):
#         journal_pages[i].text = section_content  # Replace original content with sectioned content

#     save_xml(journal_pages, output_xml)
#     print(f"Sectioned journal saved to {output_xml}")

# # Step 2: Translation
# def batch_translation(input_xml, output_xml):
#     # Load the sectioned XML and extract sections or chunks for translation
#     sections = extract_sections_from_xml(input_xml)

#     # Create GPT messages for translation
#     user_message_wrapper = lambda section: f"Translate this section:\n{section}"
#     messages = generate_messages(SYSTEM_MESSAGE_TRANSLATE, user_message_wrapper, sections)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, BATCH_TRANSLATE_JSONL)

#     # Start batch
#     batch = start_batch(jsonl_file, description="Batch for translating journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for translation.")
#         return None

#     print(f"Batch for translation started with ID: {batch_id}")

#     # Poll for batch completion and retrieve results
#     results = get_batch_response(batch_id)
#     if not results:
#         print("Error: Failed to retrieve translation batch results.")
#         return None

#     # Save the translated content back to XML
#     for i, translated_content in enumerate(results):
#         sections[i].text = translated_content  # Replace original content with translated content

#     save_xml(sections, output_xml)
#     print(f"Translated journal saved to {output_xml}")

# # Main process
# if __name__ == "__main__":
#     # Step 1: Sectioning
#     print("Starting batch sectioning...")
#     batch_sectioning(INPUT_XML, SECTIONED_XML)

#     # Step 2: Translation
#     print("Starting batch translation...")
#     batch_translation(SECTIONED_XML, TRANSLATED_XML)

In [None]:
# # Function schema for function calling
# function_schemas = [
#     {
#         "name": "save_processed_metadata",
#         "description": "Save metadata for a processed vietnamese journal, including sections and summaries, that will later be translated",
#         "parameters": {
#             "type": "object",
#             "properties": {
#                 "journal_summary": {"type": "string", "description": "A one-page summary of the journal in English."},
#                 "sections": {
#                     "type": "array",
#                     "items": {
#                         "type": "object",
#                         "properties": {
#                             "section_title_vi": {"type": "string", "description": "The original title of the section in Vietnamese."},
#                             "section_title_en": {"type": "string", "description": "The translated title of the section in English."},
#                             "section_summary": {"type": "string", "description": "A one paragraph summary of the section in English."},
#                             "page_range": {
#                                 "type": "array",
#                                 "items": {"type": "integer"},
#                                 "minItems": 2,
#                                 "maxItems": 2,
#                                 "description": "The start and end page numbers of the section."
#                             }
#                         },
#                         "required": ["section_title_en", "section_title_vi", "section_summary", "page_range"]
#                     }
#                 }
#             },
#             "required": ["journal_summary", "sections"]
#         }
#     }
# ]

In [None]:
# Step 2: Translation
# def batch_translate(input_xml_path, metadata_path):
#     """
#     Translates the journal sections using the GPT model.
#     Saves the translated content back to XML.
#     """
#     # Load the sectioned XML
#     section_metadata = #load json data from metadata_path and deserialize

#     # Use split_xml_pages (called on the next line) to get the sections for translation:
#     sections = split_xml_pages(...)

#     # Create GPT messages for translation
#     user_message_wrapper = lambda section: f"Translate this section:\n{section}"
#     messages = generate_messages(system_message_translate, user_message_wrapper, sections)

#     # convert the blocks below to a series of nested try blocks with multiple attempts as in batch_section():
#     # add appropriate logging to match batch_section():

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, translate_batch_jsonl)

#     # Start the batch
#     batch = start_batch(jsonl_file, description="Batch for translating journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for translation.")
#         return None

#     print(f"Batch for translation started with ID: {batch_id}")

#     # Poll for batch completion
#     results = poll_batch_for_response(batch_id)
#     if not results:
#         print("Error: Failed to retrieve translation batch results.")
#         return None

#     # Save translated content back to XML
#     translated_sections = []
#     for i, translated_content in enumerate(results):
#         translated_sections.append(translated_content)  # Collect each translated section into the output list

#     save_pages_to_xml(translated_sections, translated_xml)
#     print(f"Translated journal saved to {translated_xml}")

In [None]:
# # old system message

# system_message_section = """
# You are a highly skilled assistant processing a Vietnamese journal scanned from OCR. 
# You will be determining the journal sections by page number. You will also generate summaries for the full text and each section. 
# You will return this metadata in JSON format.

# Instructions:
# 1. Analyze the text and divide it into sections based on logical breaks, such as headings, topic changes, or clear shifts in content.
# 2. Ensure every page is part of  a section, even if that section is titled "blank page" or "title page," for example.
# 3. For each section, provide:
#    - The original title in Vietnamese (`section_title_vi`).
#    - The translated title in English (`section_title_en`).
#    - The author's name if it is available (`section_author`). 
#    - A one-paragraph summary of the section in English (`section_summary`).
#    - A list of keywords for the section that are related to its content, these can be proper names, specific concepts, or contextual information.
#    - The section's start and end page numbers (`start_page` and `end_page`).
#    - Use "null" for any data that is not available (such as author name) for the section.

# 4. Return the output as a JSON object with the following schema:
# {
#     "journal_summary": "A one-page summary of the whole journal in English.",
#     "sections": [
#         {
#             "section_title_vi": "Original title in Vietnamese",
#             "section_title_en": "Translated title in English",
#             "section_author": "Name of the author of the section",
#             "section_summary": "One-paragraph summary of the section in English",
#             "section_keywords": "A list of keywords for the section",
#             "start_page":  X,
#             "end_page":  Y
#         },
#         ...
#     ]
# }

# 5.  Ensure the JSON is well-formed and adheres strictly to the provided schema.
# """

In [None]:
# # Step 2: Translation
# def batch_translate(input_xml_path, output_xml_path):
#     """
#     Translates the journal sections using the GPT model.
#     Saves the translated content back to XML.
#     """
#     # Load the sectioned XML
#     section_metadata = 

#     # Create GPT messages for translation
#     user_message_wrapper = lambda section: f"Translate this section:\n{section}"
#     messages = generate_messages(system_message_translate, user_message_wrapper, sections)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, translate_batch_jsonl)

#     # Start the batch
#     batch = start_batch(jsonl_file, description="Batch for translating journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for translation.")
#         return None

#     print(f"Batch for translation started with ID: {batch_id}")

#     # Poll for batch completion
#     results = poll_batch_status(batch_id)
#     if not results:
#         print("Error: Failed to retrieve translation batch results.")
#         return None

#     # Save translated content back to XML
#     for i, translated_content in enumerate(results):
#         sections[i].text = translated_content  # Replace original content with translated content

#     save_pages_to_xml(sections, output_xml_path)
#     print(f"Translated journal saved to {output_xml_path}")

In [None]:
# # Step 2: Translation
# def batch_translate(input_xml_path, metadata_path, journal_name, xml_output_path, max_retries=MAX_BATCH_RETRIES, retry_delay=BATCH_RETRY_DELAY):
#     """
#     Translates the journal sections using the GPT model.
#     Saves the translated content back to XML.

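#     Args:
#         input_xml_path (str): Path to the input XML file.
#         metadata_path (str): Path to the metadata JSON file.
#         journal_name (str): Journal name used in the log messages.
#         xml_output_path (str): Path the translated XML is saved to.
#         max_retries (int): Maximum number of retries for batch operations.
#         retry_delay (int): Delay in seconds between retries.

#     Returns:
#         None. Failures are signalled by raising (RuntimeError for JSONL, batch, or save failures).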
#     """
#     logger.info(
#         f"starting translation batch {journal_name}...",
#         extra={
#             "input_xml": input_xml_path,
#             "metadata_path": metadata_path,
#             "journal_name": journal_name
#         }
#     )
#     try: # data initialization:
#         # get metadata
#         section_metadata = deserialize_json(metadata_path)
#         section_title = section_metadata.section_title_en

#         # Extract page groups and split XML content
#         page_groups = extract_page_groups_from_metadata(section_metadata)
#         xml_content = get_text_from_file(input_xml_path)
#         sections = split_xml_pages(xml_content, page_groups)

#     except Exception as e:
#         # Log the error with full traceback
#         logger.error(
#             "Could not initialize data for translation batching {journal_name}", exc_info=True)
#         raise  # Re-raise the exception to escalate

#     # Create GPT messages for translation

#     user_message_wrapper = lambda section: f"Translate this section with title {section_title}:\n{section}"
#     messages = generate_messages(system_message_translate, user_message_wrapper, sections)

#     # Create JSONL file for batch processing
    
#     jsonl_file = create_jsonl_file_for_batch(messages, translate_batch_jsonl)
#     if not jsonl_file:
#         logger.error(
#             "Failed to create JSONL file for translation batch.",
#             exc_info=True  # Logs the exception traceback if one exists
#         )
#         raise RuntimeError("Failed to create JSONL file for translation batch.")
    
#     for attempt in range(max_retries): # batching logic requires multiple retries due to issues with API:
#         try:
#             # Start the batch
#             batch = start_batch(jsonl_file, description="Batch for translating journal")
#             batch_id = batch.get("id")
#             if not batch_id:
#                 raise RuntimeError("Batch started but no ID was returned.")
            
#             print(f"Batch for translation started successfully on attempt {attempt + 1}. ID: {batch_id}")

#             # Poll for batch completion
#             print("Polling for batch completion...")
#             results = poll_batch_for_response(batch_id)

#             if results:
#                 break # exit the retry loop
#             else:
#                 raise RuntimeError("Unknown error. No results from batch polling.")
            
#         except Exception as e:
#             logger.error(
#                 f"Attempt {attempt + 1} failed during translation for journal '{input_xml_path}'. Retrying in {retry_delay} seconds...",
#                 exc_info=True
#             )
#             time.sleep(retry_delay)
#     else:
#         logger.error(f"Failed to complete translation after {max_retries} retries for journal '{input_xml_path}'.")
#         raise RuntimeError("Unable to run translate batch.")
        
#     # Save translated content back to XML
#     try: 
#         print("Saving translated content back to XML...")
#         translated_sections = []
#         for i, translated_content in enumerate(results):
#             translated_sections.append(translated_content)

#         save_pages_to_xml(translated_sections, xml_output_path, overwrite=True)
#         print(f"Translated journal saved to {xml_output_path}")
#     except Exception as e:
#         raise RuntimeError("Failed to save translation data.")


In [None]:
# Testing: build two chat messages and send the second one through
# run_immediate_chat_process.
# NOTE(review): set_api_client is not among the names imported in the notebook's
# import cell — confirm it is defined or imported before this cell runs.
set_api_client()

msgs = generate_messages(
    "you are assisting a software engineering/researcher looking to develop new AI platforms and processes.",
    lambda x: x,  # user-message wrapper: pass each question through unchanged
    [
        "why is AI suddenly successful?",
        "What is the (immediate) future of AI?",
    ],
)

# Only the second generated message is sent; as the cell's last expression,
# the call's return value is what the notebook displays.
run_immediate_chat_process(msgs[1])

In [None]:
# Testing: apply custom model settings, run a two-prompt batch, poll it, repeat
# the immediate-chat check, and finally fetch the last batch response.
model_settings = {
    "gpt-4o": dict(
        max_tokens=3000,
        context_limit=20000,  # Total context limit for the model
        temperature=1.3,
    ),
}
set_model_settings(model_settings)

# NOTE(review): run_single_oa_batch and poll_batch_for_response are not among the
# names imported in the notebook's import cell (which lists run_single_batch and
# get_batch_response) — confirm they are defined before this cell runs.
batch_id = run_single_oa_batch(
    [
        "what is the square root of 2?",
        "why is the sky blue?",
    ],
    # NOTE(review): "are are" in the prompt below looks like a typo — left as-is.
    "you are are explaining complex ideas to a 9 year old child.",
)

poll_batch_for_response(batch_id, 10)

# Same immediate-chat check as in the preceding testing cell.
msgs = generate_messages(
    "you are assisting a software engineering/researcher looking to develop new AI platforms and processes.",
    lambda x: x,  # user-message wrapper: pass each question through unchanged
    [
        "why is AI suddenly successful?",
        "What is the (immediate) future of AI?",
    ],
)
run_immediate_chat_process(msgs[1])

# Last expression of the cell, so its return value is what the notebook displays.
get_last_batch_response()