In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
# Imports
from data_processing.gpt_processing import (
    set_api_client, 
    generate_messages, 
    create_jsonl_file_for_batch, 
    start_batch, 
    get_batch_response,
    get_completed_batches,
    set_model_settings,
    get_batch_status,
    get_active_batches,
    get_all_batch_info,
    token_count,
    run_immediate_chat_process,
    run_single_oa_batch,
    get_last_batch_response
)

from data_processing.xml_processing import ( 
    save_pages_to_xml,
    split_xml_pages
)

from data_processing.text_processing import (
    get_text_from_file,
    write_text_to_file
)
from pathlib import Path
%aimport time
%aimport json
%aimport datetime
%aimport logging

In [59]:
# Set up API client
client = set_api_client()

In [60]:
model_settings = {
    "gpt-4o": {
        "max_tokens": 5000,
        "context_limit": 20000,  # Total context limit for the model
        "temperature": 0.25
    },
    "gpt-3.5-turbo": {
        "max_tokens": 4096,  # Set conservatively to avoid errors
        "context_limit": 16384  # Same as gpt-4o
        }
    }

set_model_settings(model_settings)

In [61]:
MAX_BATCH_RETRIES = 20  # Number of retries
BATCH_RETRY_DELAY = 5  # seconds to wait before retry

In [None]:
# File paths
project_dir = Path("/Users/phapman/Desktop/tnh-scholar/")
data_dir = project_dir / "data_processing"
journal_dir = data_dir / "processed_journal_data"
journal_name = "phat-giao-viet-nam-1956-01"
working_dir = journal_dir / journal_name
input_xml = working_dir / f"TEST2_full_cleaned_{journal_name}.xml"
translated_xml_path = journal_dir / f"translation_{journal_name}.xml"
section_batch_jsonl = working_dir / "section_batch.jsonl"
translate_batch_jsonl = working_dir / "translation_batch.jsonl"
section_metadata_out = working_dir / "section_metadata.json"
raw_json_metadata_path = working_dir / "raw_metadata_response.txt"
logfile = data_dir / "gpt_processing" / "processing_info.log"

In [43]:
# Set up the logger
def setup_logger(log_file_path):
    """
    Configures the logger to write to a log file and the console.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_file_path, encoding="utf-8"),
            logging.StreamHandler()  # Optional: to log to the console as well
        ]
    )
    return logging.getLogger(__name__)

In [44]:
logger = setup_logger(logfile)

In [None]:
system_message_section = """
You are a highly skilled assistant processing a Vietnamese Buddhist journal scanned from OCR. Use the title: "Journal of Vietnamese Buddhism."
You will be determining the journal sections by page number. You will also generate metadata for the full text and each section. 
You will return this metadata in JSON format.

Instructions:
1. Analyze the text and divide it into sections based on logical breaks, such as headings, topic changes, or clear shifts in content.
2. Ensure every page is part of  a section, even if that section is titled "blank page" or "title page," for example.
3. For each section, provide:
   - The original title in Vietnamese (`section_title_vi`).
   - The translated title in English (`section_title_en`).
   - The author's name if it is available (`section_author`). 
   - A one-paragraph summary of the section in English (`section_summary`).
   - A list of keywords for the section that are related to its content, these can be proper names, specific concepts, or contextual information.
   - The section's start and end page numbers (`start_page` and `end_page`).
   - Use "null" for any data that is not available (such as author name) for the section.

4. Return the output as a JSON object with the following schema:
{
    "journal_summary": "A one-page summary of the whole journal in English.",
    "sections": [
        {
            "section_title_vi": "Original title in Vietnamese",
            "section_title_en": "Translated title in English",
            "section_author": "Name of the author of the section",
            "section_summary": "One-paragraph summary of the section in English",
            "section_keywords": "A list of keywords for the section",
            "start_page":  X,
            "end_page":  Y
        },
        ...
    ]
}

5.  Ensure the JSON is well-formed and adheres strictly to the provided schema.
"""

In [None]:
system_message_translate = """
You are Thich Nhat Hanh translating from Vietnamese to English for your experienced students. 
The text is based on an OCR scan of a journal you edited from 1956-1958. Use the title: "Journal of Vietnamese Buddhism" for the journal when it is referenced.
You will be translating a single section of the journal and will be provided with the section title in English. 
You want your students to understand the text in its larger historical context, in the context of Vietnamese Buddhism, and in the context of your own life.
Translate for the most meaningful, typical, and eloquent English interpretation. 
Make corrections in the text only where necessary (for example if words are missing) to create logical flow . 
Keep pages together: each translated page must match its original page source as pages will be studied side by side with the original Vietnamese.
Infer paragraphs and text structure from the text layout.
Add XML tags for clarity. Use only the following tags: 

   <p> for paragraphs.
   <section> for major sections.
   <subsection> for subsections.
   <title> for main titles of sections and subsections. 
   <subtitle> for subtitles of sections and subsections. 
   <heading> for headings that do not mark titles or subtitles
   <TOC> for tables of contents
   <author> for authors of sections or subsections
   <ol> <ul> <li> for lists
   <i> for italics. 
   <b> for bold.
   <notes>
   <translation-notes>

You may use <notes> at the end of the section for notes that already exist in the text, or for interesting elements you wish to call attention to.
You may add <translation-notes> at the end of the section as a commentary to summarize your translation choices. 
For <translation-notes>, you may include information on Sino-Vietnamese, complex, unusual, poetic, or other interesting terms, and significant corrections to the text. 
In the <translation-notes> include the original Vietnamese terms for reference.

All titles, XML sections, text, and terms should be translated--do not leave any terms or expressions in Vietnamese, except names of Vietnamese people.
"""

In [71]:
import json

def deserialize_json(serialized_data):
    """
    Converts a serialized JSON string into a Python dictionary.

    Args:
        serialized_data (str): The JSON string to deserialize.

    Returns:
        dict: The deserialized Python dictionary.
    """
    try:
        # Convert the JSON string into a dictionary
        return json.loads(serialized_data)
    except json.JSONDecodeError as e:
        print(f"Failed to deserialize JSON: {e}")
        return None

In [None]:
import json
from copy import deepcopy

# Define the schema
journal_schema = {
    "type": "object",
    "properties": {
        "journal_summary": {"type": "string"},
        "sections": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "section_title_vi": {"type": "string"},
                    "section_title_en": {"type": "string"},
                    "section_author": {"type": ["string", "null"]},
                    "section_summary": {"type": "string"},
                    "section_keywords": {"type": "array", "items": {"type": "string"}},
                    "start_page": {"type": "integer", "minimum": 1},
                    "end_page": {"type": "integer", "minimum": 1}
                },
                "required": [
                    "section_title_vi",
                    "section_title_en",
                    "section_summary",
                    "section_keywords",
                    "start_page",
                    "end_page"
                ]
            }
        }
    },
    "required": ["journal_summary", "sections"]
}

def validate_and_clean_data(data, schema):
    """
    Recursively validate and clean AI-generated data to fit the given schema.
    Any missing fields are filled with defaults, and extra fields are ignored.

    Args:
        data (dict): The AI-generated data to validate and clean.
        schema (dict): The schema defining the required structure.

    Returns:
        dict: The cleaned data adhering to the schema.
    """
    def clean_value(value, field_schema):
        """
        Clean a single value based on its schema, attempting type conversions where necessary.
        """
        field_type = field_schema["type"]

        # Handle type: string
        if field_type == "string":
            if isinstance(value, str):
                return value
            elif value is not None:
                return str(value)
            return "unset"

        # Handle type: integer
        elif field_type == "integer":
            if isinstance(value, int):
                return value
            elif isinstance(value, str) and value.isdigit():
                return int(value)
            try:
                return int(float(value))  # Handle cases like "2.0"
            except (ValueError, TypeError):
                return 0

        # Handle type: array
        elif field_type == "array":
            if isinstance(value, list):
                item_schema = field_schema.get("items", {})
                return [clean_value(item, item_schema) for item in value]
            elif isinstance(value, str):
                # Try splitting comma-separated strings into a list
                return [v.strip() for v in value.split(",")]
            return []

        # Handle type: object
        elif field_type == "object":
            if isinstance(value, dict):
                return validate_and_clean_data(value, field_schema)
            return {}

        # Handle nullable strings
        elif field_type == ["string", "null"]:
            if value is None or isinstance(value, str):
                return value
            return str(value)

        # Default case for unknown or unsupported types
        return "unset"

    def clean_object(obj, obj_schema):
        """
        Clean a dictionary object based on its schema.
        """
        if not isinstance(obj, dict):
            print(f"Expected dict but got: \n{type(obj)}: {obj}\nResetting to empty dict.")
            return {}
        cleaned = {}
        properties = obj_schema.get("properties", {})
        for key, field_schema in properties.items():
            # Set default value for missing fields
            cleaned[key] = clean_value(obj.get(key), field_schema)
        return cleaned

    # Handle the top-level object
    if schema["type"] == "object":
        cleaned_data = clean_object(data, schema)
        return cleaned_data
    else:
        raise ValueError("Top-level schema must be of type 'object'.")

def validate_and_save_metadata(output_file_path: Path, json_metadata_serial: str, schema):
    """
    Validates and cleans journal data against the schema, then writes it to a JSON file.

    Args:
        data (str): The journal data as a serialized JSON string to validate and clean.
        schema (dict): The schema defining the required structure.
        output_file_path (str): Path to the output JSON file.

    Returns:
        bool: True if successfully written to the file, False otherwise.
    """
    try:
        # Clean the data to fit the schema
        data = deserialize_json(json_metadata_serial)
        cleaned_data = validate_and_clean_data(data, schema)

        # Write the cleaned data to the specified JSON file
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)
        print(f"Cleaned data successfully written to {output_file_path}")
        return True
    except Exception as e:
        logger.error(f"An error occurred during validation or writing: {e}")
        raise 

In [48]:
# Implementing the save_processed_journal_metadata function
def save_processed_metadata(metadata):
    """
    Save processed journal metadata, including title, summary, and sections, to a JSON file.
    """
    output_file = project_dir / "processed_journal_metadata.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)
    print(f"Processed journal metadata saved to {output_file}")

In [73]:
def extract_page_groups_from_metadata(metadata):
    """
    Extracts page groups from the section metadata for use with `split_xml_pages`.

    Parameters:
        metadata (dict): The section metadata containing sections with start and end pages.

    Returns:
        List[Tuple[int, int]]: A list of tuples, each representing a page range (start_page, end_page).
    """
    page_groups = []

    # Ensure metadata contains sections
    if "sections" not in metadata or not isinstance(metadata["sections"], list):
        raise ValueError("Metadata does not contain a valid 'sections' key with a list of sections.")

    for section in metadata["sections"]:
        try:
            # Extract start and end pages
            start_page = section.get("start_page")
            end_page = section.get("end_page")

            # Ensure both start_page and end_page are integers
            if not isinstance(start_page, int) or not isinstance(end_page, int):
                raise ValueError(f"Invalid page range in section: {section}")

            # Add the tuple to the page groups list
            page_groups.append((start_page, end_page))

        except KeyError as e:
            print(f"Missing key in section metadata: {e}")
        except ValueError as e:
            print(f"Error processing section metadata: {e}")

    return page_groups

In [49]:
def log_batch_info(batch, logger):
    """
    Log the batch object and its metadata using the logger.

    Args:
        batch: The Batch object returned by start_batch.
        logger: The logger instance for logging.
    """
    try:
        # Serialize batch object to a dictionary
        batch_data = batch.to_dict() if hasattr(batch, "to_dict") else batch.__dict__

        # Add metadata
        batch_data["timestamp"] = datetime.datetime.now().isoformat()

        # Log the batch data as JSON
        logger.info("Batch Info: %s", json.dumps(batch_data, indent=4))

    except Exception as e:
        logger.error("Failed to log batch: %s", e)

In [None]:
# Polling function for batch completion
def poll_batch_for_response(batch_id, interval=10):
    """
    Poll the batch status until it completes or fails.
    Raises an exception if the batch fails.
    Returns the batch response if successful.
    """
    print(f"Polling batch status for batch ID {batch_id} ...")
    while True:
        time.sleep(interval)
        batch_status = get_batch_status(batch_id)

        if not batch_status:
            raise RuntimeError(f"Batch ID {batch_id} not found.")

        if batch_status == "completed":
            print("Batch processing completed successfully.")
            return get_batch_response(batch_id)
        
        elif batch_status == "failed":
            raise RuntimeError(f"Batch ID {batch_id} failed during processing.")
        
        else:
            print(f"Batch status: {batch_status}. Retrying in {interval} seconds...")

In [None]:
def batch_sectioning(input_xml_path, output_json_path, raw_output_path, journal_name, max_retries=MAX_BATCH_RETRIES, retry_delay=BATCH_RETRY_DELAY):
    """
    Splits the journal content into sections using GPT, with retries for both starting and completing the batch.
    """
    journal_pages = get_text_from_file(input_xml_path)

    # Create GPT messages for sectioning
    user_message_wrapper = lambda text: f"{text}"
    messages = generate_messages(system_message_section, user_message_wrapper, [journal_pages])

    # Create JSONL file for batch processing
    jsonl_file = create_jsonl_file_for_batch(messages, section_batch_jsonl, json_mode=True)

    for attempt in range(max_retries):
        try:
            # Try to start the batch
            batch = start_batch(jsonl_file, description=f"Batch for sectioning journal: {journal_name} | input file: {input_xml_path}")
            batch_id = batch.id
            if not batch_id:
                raise RuntimeError("Batch started but no ID was returned.")

            print(f"Batch for sectioning started successfully on attempt {attempt + 1}. ID: {batch_id}")

            # Poll for batch completion
            json_results = poll_batch_for_response(batch_id)
            if json_results:
                break # exit retry loop
            else:
                raise RuntimeError("Unknown error in polling for batch response.", exc_info=True)

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}. Retrying batch process in {retry_delay} seconds...")
            time.sleep(retry_delay)
    else:
        logger.error("Failed to complete batch sectioning after maximum retries.")
        raise RuntimeError("Error: Failed to complete batch sectioning after maximum retries.")

    # save raw result
    try:
        write_text_to_file(raw_output_path, json_results, force=True)
    except Exception as e:
        logger.error(f"failed to write raw response file: {raw_output_path}")
        raise

    # If successful, try to validate and save metadata and exit loop
    try:
        valid = validate_and_save_metadata(output_json_path, json_results, journal_schema)
    except Exception as e:
        logger.error(f"Error occurred while validating and saving metadata for journal {journal_name}: '{output_json_path}' (batch ID: {batch_id}).", exc_info=True)
        raise
    
    if valid:
        logger.info(f"Successfully processed {journal_name}: {input_xml_path} with batch: {batch_id} and saved metadata to {output_json_path} ")
        return output_json_path
        
    


In [None]:
# Step 2: Translation
def batch_translate(input_xml_path, metadata_path, journal_name, xml_output_path, max_retries=MAX_BATCH_RETRIES, retry_delay=BATCH_RETRY_DELAY):
    """
    Translates the journal sections using the GPT model.
    Saves the translated content back to XML.

    Args:
        input_xml_path (str): Path to the input XML file.
        metadata_path (str): Path to the metadata JSON file.
        max_retries (int): Maximum number of retries for batch operations.
        retry_delay (int): Delay in seconds between retries.

    Returns:
        bool: True if the process succeeds, False otherwise.
    """
    logger.info(
        f"starting translation batch {journal_name}...",
        extra={
            "input_xml": input_xml_path,
            "metadata_path": metadata_path,
            "journal_name": journal_name
        }
    )
    try: # data initialization:
        # get metadata
        section_metadata = deserialize_json(metadata_path)
        section_title = section_metadata.section_title_en

        # Extract page groups and split XML content
        page_groups = extract_page_groups_from_metadata(section_metadata)
        xml_content = get_text_from_file(input_xml_path)
        sections = split_xml_pages(xml_content, page_groups)

    except Exception as e:
        # Log the error with full traceback
        logger.error(
            "Could not initialize data for translation batching {journal_name}", exc_info=True)
        raise  # Re-raise the exception to escalate

    # Create GPT messages for translation

    user_message_wrapper = lambda section: f"Translate this section with title {section_title}:\n{section}"
    messages = generate_messages(system_message_translate, user_message_wrapper, sections)

    # Create JSONL file for batch processing
    
    jsonl_file = create_jsonl_file_for_batch(messages, translate_batch_jsonl)
    if not jsonl_file:
        logger.error(
            "Failed to create JSONL file for translation batch.",
            exc_info=True  # Logs the exception traceback if one exists
        )
        raise RuntimeError("Failed to create JSONL file for translation batch.")
    
    for attempt in range(max_retries): # batching logic requires multiple retries due to issues with API:
        try:
            # Start the batch
            batch = start_batch(jsonl_file, description="Batch for translating journal")
            batch_id = batch.get("id")
            if not batch_id:
                raise RuntimeError("Batch started but no ID was returned.")
            
            print(f"Batch for translation started successfully on attempt {attempt + 1}. ID: {batch_id}")

            # Poll for batch completion
            print("Polling for batch completion...")
            results = poll_batch_for_response(batch_id)

            if results:
                break # exit the retry loop
            else:
                raise RuntimeError("Unknown error. No results from batch polling.")
            
        except Exception as e:
            logger.error(
                f"Attempt {attempt + 1} failed during translation for journal '{input_xml_path}'. Retrying in {retry_delay} seconds...",
                exc_info=True
            )
            time.sleep(retry_delay)
    else:
        logger.error(f"Failed to complete translation after {max_retries} retries for journal '{input_xml_path}'.")
        raise RuntimeError("Unable to run translate batch.")
        
    # Save translated content back to XML
    try: 
        print("Saving translated content back to XML...")
        translated_sections = []
        for i, translated_content in enumerate(results):
            translated_sections.append(translated_content)

        save_pages_to_xml(translated_sections, xml_output_path, overwrite=True)
        print(f"Translated journal saved to {xml_output_path}")
    except Exception as e:
        raise RuntimeError("Failed to save translation data.")


In [53]:
input_xml

PosixPath('/Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-01/TEST2_full_cleaned_phat-giao-viet-nam-1956-01.xml')

In [None]:
# Step 1: Sectioning
print("Starting batch sectioning...")
metadata_path = batch_sectioning(input_xml, section_metadata_out, raw_json_metadata_path, journal_name)
if metadata_path:
    batch_translate

# # Step 2: Translation
# print("Starting batch translation...")
# batch_translation(sectioned_xml, translated_xml)

Starting batch sectioning...
model: gpt-4o
max_tokens: 5000
temperature: 0.25
response_format: {'type': 'json_object'}
JSONL file created at: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-01/section_batch.jsonl


2024-11-28 19:33:06,483 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
2024-11-28 19:33:07,627 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


Batch Initiated: 11-28-2024 19:33:05 PST | section_batch.jsonl | Batch for sectioning journal: phat-giao-viet-nam-1956-01 | input file: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-01/TEST2_full_cleaned_phat-giao-viet-nam-1956-01.xml
Batch for sectioning started successfully on attempt 1. ID: batch_674935f36c408190b685bec894577e3c
Polling batch status for batch ID batch_674935f36c408190b685bec894577e3c ...


2024-11-28 19:33:17,887 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_674935f36c408190b685bec894577e3c "HTTP/1.1 200 OK"


Attempt 1 failed: Batch ID batch_674935f36c408190b685bec894577e3c failed during processing.. Retrying batch process in 5 seconds...


2024-11-28 19:33:23,397 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
2024-11-28 19:33:24,014 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


Batch Initiated: 11-28-2024 19:33:22 PST | section_batch.jsonl | Batch for sectioning journal: phat-giao-viet-nam-1956-01 | input file: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-01/TEST2_full_cleaned_phat-giao-viet-nam-1956-01.xml
Batch for sectioning started successfully on attempt 2. ID: batch_67493603c5748190b1b783e83805285a
Polling batch status for batch ID batch_67493603c5748190b1b783e83805285a ...


2024-11-28 19:33:34,201 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_67493603c5748190b1b783e83805285a "HTTP/1.1 200 OK"


Attempt 2 failed: Batch ID batch_67493603c5748190b1b783e83805285a failed during processing.. Retrying batch process in 5 seconds...


2024-11-28 19:33:39,832 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
2024-11-28 19:33:40,406 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


Batch Initiated: 11-28-2024 19:33:39 PST | section_batch.jsonl | Batch for sectioning journal: phat-giao-viet-nam-1956-01 | input file: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-01/TEST2_full_cleaned_phat-giao-viet-nam-1956-01.xml
Batch for sectioning started successfully on attempt 3. ID: batch_6749361435788190a9391a6839d845aa
Polling batch status for batch ID batch_6749361435788190a9391a6839d845aa ...


2024-11-28 19:33:50,606 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_6749361435788190a9391a6839d845aa "HTTP/1.1 200 OK"
2024-11-28 19:33:50,748 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_6749361435788190a9391a6839d845aa "HTTP/1.1 200 OK"


Batch processing completed successfully.


2024-11-28 19:33:51,252 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-9F8Ku2oW6NHdWfZp4DbBxH/content "HTTP/1.1 200 OK"


An error occurred during validation or writing: 'list' object has no attribute 'get'
Batch sectioning completed and metadata saved.


In [55]:
result = get_last_batch_response()

2024-11-28 21:15:49,819 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?limit=30 "HTTP/1.1 200 OK"
2024-11-28 21:15:50,287 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?limit=30&after=batch_6748f738f6948190b4b987ac65cf2a1d "HTTP/1.1 200 OK"
2024-11-28 21:15:50,778 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?limit=30&after=batch_6736d4d981f48190beda7f734d6a75d9 "HTTP/1.1 200 OK"
2024-11-28 21:15:50,976 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?limit=30&after=batch_672b85772df8819085957fa62c4e3020 "HTTP/1.1 200 OK"
2024-11-28 21:15:51,265 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?limit=30&after=batch_672b091c02088190b0d4e5d9275ec258 "HTTP/1.1 200 OK"
2024-11-28 21:15:51,588 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_6749361435788190a9391a6839d845aa "HTTP/1.1 200 OK"
2024-11-28 21:15:52,007 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-9F8Ku2oW6NHdWfZp4DbBxH/content

In [56]:
print(result[0])

{
    "journal_summary": "This Vietnamese journal explores the deep-rooted connection between Buddhism and Vietnamese culture, tracing its historical influence and emphasizing the importance of maintaining Buddhist values in modern society. It also discusses the proper path for Vietnamese Buddhists, emphasizing the need for clear understanding and awareness in navigating life's challenges.",
    "sections": [
        {
            "section_title_vi": "PHẬT GIÁO VIỆT NAM NGUYỆT SAN SỐ 1",
            "section_title_en": "Vietnamese Buddhism Monthly Issue 1",
            "section_author": null,
            "section_summary": "This is the title page of the journal, indicating the publication as the first issue of the Vietnamese Buddhism Monthly, released on the 15th of August in the year of the Monkey.",
            "section_keywords": ["Vietnamese Buddhism", "Monthly Issue", "Publication Date"],
            "start_page": 1,
            "end_page": 1
        },
        {
            "sect

In [72]:
validate_and_save_metadata(section_metadata_out, result[0], journal_schema)

Cleaned data successfully written to /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-01/section_metadata.json


True

In [None]:
# # Step 1: Sectioning
# def batch_sectioning(input_xml_path, output_xml_path):
#     """
#     Splits the journal content into sections using the GPT model.
#     Saves the sectioned content back to XML.
#     """
#     # Load the input XML
#     journal_pages = load_xml(input_xml_path)
#     pages_content = [page.text for page in journal_pages]

#     # Create GPT messages for sectioning
#     user_message_wrapper = lambda text: f"Divide this content into sections:\n{text}"
#     messages = generate_messages(system_message_section, user_message_wrapper, pages_content)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, section_batch_jsonl)

#     # Start the batch
#     batch = start_batch(jsonl_file, description="Batch for sectioning journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for sectioning.")
#         return None

#     print(f"Batch for sectioning started with ID: {batch_id}")

#     # Poll for batch completion
#     results = poll_batch_status(batch_id)
#     if not results:
#         print("Error: Failed to retrieve sectioning batch results.")
#         return None

#     # Save sectioned content back to XML
#     for i, section_content in enumerate(results):
#         journal_pages[i].text = section_content  # Replace original content with sectioned content

#     save_xml(journal_pages, output_xml_path)
#     print(f"Sectioned journal saved to {output_xml_path}")

In [None]:
# import os
# import json
# from gpt_processing.gpt_interface import (
#     set_api_client, 
#     generate_messages, 
#     create_jsonl_file_for_batch, 
#     start_batch, 
#     get_batch_response
# )
# from data_processing.xml_processing import (
#     load_xml, 
#     save_xml, 
#     extract_sections_from_xml
# )

# # Initialize OpenAI client
# set_api_client()

# # File paths
# INPUT_XML = "input_journal.xml"
# SECTIONED_XML = "sectioned_journal.xml"
# TRANSLATED_XML = "translated_journal.xml"
# BATCH_SECTION_JSONL = "section_batch.jsonl"
# BATCH_TRANSLATE_JSONL = "translate_batch.jsonl"

# # System messages
# SYSTEM_MESSAGE_SECTION = """
# You are a helpful assistant. Divide the text into meaningful sections and add XML tags:
# <section> for major sections, <subsection> for subsections, <title> for titles, and <p> for paragraphs.
# """
# SYSTEM_MESSAGE_TRANSLATE = """
# You are Thich Nhat Hanh translating from Vietnamese to English. Provide meaningful translations with appropriate XML tags:
# <section>, <subsection>, <title>, <p>.
# """

# # Step 1: Sectioning
# def batch_sectioning(input_xml, output_xml):
#     # Load the input XML and extract pages or chunks
#     journal_pages = load_xml(input_xml)
#     pages_content = [page.text for page in journal_pages]  # Assuming .text contains the text of each page

#     # Create GPT messages for sectioning
#     user_message_wrapper = lambda text: f"Divide this content into sections:\n{text}"
#     messages = generate_messages(SYSTEM_MESSAGE_SECTION, user_message_wrapper, pages_content)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, BATCH_SECTION_JSONL)

#     # Start batch
#     batch = start_batch(jsonl_file, description="Batch for sectioning journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for sectioning.")
#         return None

#     print(f"Batch for sectioning started with ID: {batch_id}")

#     # Poll for batch completion and retrieve results
#     results = get_batch_response(batch_id)
#     if not results:
#         print("Error: Failed to retrieve sectioning batch results.")
#         return None

#     # Save the sectioned content back to XML
#     for i, section_content in enumerate(results):
#         journal_pages[i].text = section_content  # Replace original content with sectioned content

#     save_xml(journal_pages, output_xml)
#     print(f"Sectioned journal saved to {output_xml}")

# # Step 2: Translation
# def batch_translation(input_xml, output_xml):
#     # Load the sectioned XML and extract sections or chunks for translation
#     sections = extract_sections_from_xml(input_xml)

#     # Create GPT messages for translation
#     user_message_wrapper = lambda section: f"Translate this section:\n{section}"
#     messages = generate_messages(SYSTEM_MESSAGE_TRANSLATE, user_message_wrapper, sections)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, BATCH_TRANSLATE_JSONL)

#     # Start batch
#     batch = start_batch(jsonl_file, description="Batch for translating journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for translation.")
#         return None

#     print(f"Batch for translation started with ID: {batch_id}")

#     # Poll for batch completion and retrieve results
#     results = get_batch_response(batch_id)
#     if not results:
#         print("Error: Failed to retrieve translation batch results.")
#         return None

#     # Save the translated content back to XML
#     for i, translated_content in enumerate(results):
#         sections[i].text = translated_content  # Replace original content with translated content

#     save_xml(sections, output_xml)
#     print(f"Translated journal saved to {output_xml}")

# # Main process
# if __name__ == "__main__":
#     # Step 1: Sectioning
#     print("Starting batch sectioning...")
#     batch_sectioning(INPUT_XML, SECTIONED_XML)

#     # Step 2: Translation
#     print("Starting batch translation...")
#     batch_translation(SECTIONED_XML, TRANSLATED_XML)

In [None]:
# # Function schema for function calling
# function_schemas = [
#     {
#         "name": "save_processed_metadata",
#         "description": "Save metadata for a processed vietnamese journal, including sections and summaries, that will later be translated",
#         "parameters": {
#             "type": "object",
#             "properties": {
#                 "journal_summary": {"type": "string", "description": "A one-page summary of the journal in English."},
#                 "sections": {
#                     "type": "array",
#                     "items": {
#                         "type": "object",
#                         "properties": {
#                             "section_title_vi": {"type": "string", "description": "The original title of the section in Vietnamese."},
#                             "section_title_en": {"type": "string", "description": "The translated title of the section in English."},
#                             "section_summary": {"type": "string", "description": "A one paragraph summary of the section in English."},
#                             "page_range": {
#                                 "type": "array",
#                                 "items": {"type": "integer"},
#                                 "minItems": 2,
#                                 "maxItems": 2,
#                                 "description": "The start and end page numbers of the section."
#                             }
#                         },
#                         "required": ["section_title_en", "section_title_vi", "section_summary", "page_range"]
#                     }
#                 }
#             },
#             "required": ["journal_summary", "sections"]
#         }
#     }
# ]

In [None]:
# Step 2: Translation
# def batch_translate(input_xml_path, metadata_path):
#     """
#     Translates the journal sections using the GPT model.
#     Saves the translated content back to XML.
#     """
#     # Load the sectioned XML
#     section_metadata = #load json data from metadata_path and deserialize

#     # use the function split_xml_to_pages to get sections for translation:
#     sections = split_xml_pages(...)

#     # Create GPT messages for translation
#     user_message_wrapper = lambda section: f"Translate this section:\n{section}"
#     messages = generate_messages(system_message_translate, user_message_wrapper, sections)

#     # convert the blocks below to a series of nested try blocks with multiple attempts as in batch_section():
#     # add appropriate logging to match batch_section():

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, translate_batch_jsonl)

#     # Start the batch
#     batch = start_batch(jsonl_file, description="Batch for translating journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for translation.")
#         return None

#     print(f"Batch for translation started with ID: {batch_id}")

#     # Poll for batch completion
#     results = poll_batch_for_response(batch_id)
#     if not results:
#         print("Error: Failed to retrieve translation batch results.")
#         return None

#     # Save translated content back to XML
#     translated_sections = []
#     for i, translated_content in enumerate(results):
#         translated_sections.append(translated_content)  # Replace original content with translated content

#     save_pages_to_xml(translated_sections, translated_xml)
#     print(f"Translated journal saved to {translated_xml}")

In [None]:
# # old system message

# system_message_section = """
# You are a highly skilled assistant processing a Vietnamese journal scanned from OCR. 
# You will be determining the journal sections by page number. You will also generate summaries for the full text and each section. 
# You will return this metadata in JSON format.

# Instructions:
# 1. Analyze the text and divide it into sections based on logical breaks, such as headings, topic changes, or clear shifts in content.
# 2. Ensure every page is part of  a section, even if that section is titled "blank page" or "title page," for example.
# 3. For each section, provide:
#    - The original title in Vietnamese (`section_title_vi`).
#    - The translated title in English (`section_title_en`).
#    - The author's name if it is available (`section_author`). 
#    - A one-paragraph summary of the section in English (`section_summary`).
#    - A list of keywords for the section that are related to its content, these can be proper names, specific concepts, or contextual information.
#    - The section's start and end page numbers (`start_page` and `end_page`).
#    - Use "null" for any data that is not available (such as author name) for the section.

# 4. Return the output as a JSON object with the following schema:
# {
#     "journal_summary": "A one-page summary of the whole journal in English.",
#     "sections": [
#         {
#             "section_title_vi": "Original title in Vietnamese",
#             "section_title_en": "Translated title in English",
#             "section_author": "Name of the author of the section",
#             "section_summary": "One-paragraph summary of the section in English",
#             "section_keywords": "A list of keywords for the section",
#             "start_page":  X,
#             "end_page":  Y
#         },
#         ...
#     ]
# }

# 5.  Ensure the JSON is well-formed and adheres strictly to the provided schema.
# """

In [None]:
# # Step 2: Translation
# def batch_translate(input_xml_path, output_xml_path):
#     """
#     Translates the journal sections using the GPT model.
#     Saves the translated content back to XML.
#     """
#     # Load the sectioned XML
#     section_metadata = 

#     # Create GPT messages for translation
#     user_message_wrapper = lambda section: f"Translate this section:\n{section}"
#     messages = generate_messages(system_message_translate, user_message_wrapper, sections)

#     # Create JSONL file for batch processing
#     jsonl_file = create_jsonl_file_for_batch(messages, translate_batch_jsonl)

#     # Start the batch
#     batch = start_batch(jsonl_file, description="Batch for translating journal")
#     batch_id = batch.get("id")
#     if not batch_id:
#         print("Error: Failed to start batch for translation.")
#         return None

#     print(f"Batch for translation started with ID: {batch_id}")

#     # Poll for batch completion
#     results = poll_batch_status(batch_id)
#     if not results:
#         print("Error: Failed to retrieve translation batch results.")
#         return None

#     # Save translated content back to XML
#     for i, translated_content in enumerate(results):
#         sections[i].text = translated_content  # Replace original content with translated content

#     save_pages_to_xml(sections, output_xml_path)
#     print(f"Translated journal saved to {output_xml_path}")

In [35]:
# testing
set_api_client()
msgs = generate_messages("you are assisting a software engineering/researcher looking to develop new AI platforms and processes.", lambda x: x, ["why is AI suddenly successful?", "What is the (immediate) future of AI?"])
run_immediate_chat_process(msgs[1])

2024-11-28 17:17:12,289 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


ChatCompletion(id='chatcmpl-AYjv0xkI0XcjvFgsIoqkv6onw62F9', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The immediate future of AI is poised to be characterized by several key trends and developments:\n\n1. **Continued Advancements in Machine Learning**: Expect ongoing improvements in algorithms, leading to more efficient, robust, and interpretable AI systems. Techniques like transfer learning, reinforcement learning, and unsupervised learning will gain prominence, making AI systems more adaptable and capable of complex tasks with less data.\n\n2. **Explainability and Transparency**: With increased AI integration in critical decision-making processes, there will be heightened demand for models that provide clear explanations of their outputs. This aligns with growing regulatory pressures and the need for trustworthiness in AI systems.\n\n3. **Ethical AI and Governance**: The focus will intensify on creating frameworks for ethical

In [None]:
# testing
model_settings = {
    "gpt-4o": {
        "max_tokens": 3000,
        "context_limit": 20000,  # Total context limit for the model
        "temperature": 1.3
    }}

set_model_settings(model_settings)
batch_id = run_single_oa_batch(["what is the square root of 2?", "why is the sky blue?"], "you are are explaining complex ideas to a 9 year old child.")

poll_batch_for_response(batch_id, 10)

msgs = generate_messages("you are assisting a software engineering/researcher looking to develop new AI platforms and processes.", lambda x: x, ["why is AI suddenly successful?", "What is the (immediate) future of AI?"])
run_immediate_chat_process(msgs[1])

get_last_batch_response()