## This notebook solidifies the journal cleaning process

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%aimport json
from pathlib import Path
%aimport re
from typing import List, Dict
%aimport os
from xml.sax.saxutils import escape
from pathlib import Path
import logging
from math import floor
from datetime import datetime
%aimport re

In [None]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

In [None]:
from data_processing.text_processing import write_text_to_file

In [None]:
from data_processing.gpt_processing import (
    token_count, get_active_batches,
    generate_messages, create_jsonl_file_for_batch, start_batch_with_retries,
    get_completed_batches, get_batch_response, set_model_settings, delete_old_files, run_immediate_chat_process,
    get_completion_content
)


In [None]:
from data_processing.xml_processing import split_xml_pages, split_xml_on_pagebreaks, save_pages_to_xml

In [None]:
# Directory layout for one journal issue; every path below hangs off project_dir.
project_dir = Path("/Users/phapman/Desktop/tnh-scholar/")
data_dir = project_dir.joinpath("data_processing")
journal_dir = data_dir.joinpath("processed_journal_data")

# Identifier of the journal issue handled by this notebook.
journal_name = "phat-giao-viet-nam-1956-02"
working_dir = journal_dir.joinpath(journal_name)

# Input: the full OCR XML for the issue.
ocr_file_to_process = journal_dir.joinpath(journal_name, f"full_OCR_{journal_name}.xml")

# Outputs: batch request files and the cleaned XML.
batch_job_dir = working_dir.joinpath("processing_batch_files")
batch_file_path = batch_job_dir.joinpath(f"clean_batch_{journal_name}.jsonl")
clean_batch_jsonl = working_dir.joinpath("clean_batch.jsonl")
cleaned_xml_path = working_dir.joinpath(f"full_cleaned_{journal_name}.xml")

# Log file handed to setup_logger.
logfile = data_dir.joinpath("gpt_processing", "processing_info.log")

In [None]:
# Set up the logger
def setup_logger(log_file_path):
    """
    Configures the root logger to write to a log file and the console.

    Handlers already attached to the root logger are removed AND closed first,
    so calling this function again (for example by re-running the notebook
    cell) neither duplicates output nor leaves the previously opened log file
    handle dangling.

    Parameters:
        log_file_path (str or Path): File that receives the log records
            (written with UTF-8 encoding).

    Returns:
        logging.Logger: The logger named after the current module.
    """
    # Remove existing handlers. basicConfig() does nothing while the root
    # logger still has handlers, and close() releases the resources (such as
    # an open log file) held by the handlers being discarded.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
        handler.close()

    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Include logger name
        handlers=[
            logging.FileHandler(log_file_path, encoding="utf-8"),
            logging.StreamHandler()  # Optional: to log to the console as well
        ]
    )

    # Suppress DEBUG/INFO logs for specific noisy modules
    modules_to_suppress = ["httpx", "httpcore", "urllib3", "openai"]
    for module in modules_to_suppress:
        logging.getLogger(module).setLevel(logging.WARNING)

    return logging.getLogger(__name__)

In [None]:
# Configure logging to `logfile` and the console; the functions below log through `logger`.
logger = setup_logger(logfile)

In [None]:
# User-message template: only a {text} placeholder, so the page text is sent with no extra framing.
user_message_string_clean = """{text}"""

In [None]:
def user_wrap_function_clean(text_block):
    """Return the user message for one page: the user template filled with *text_block*."""
    user_message = user_message_string_clean.format(text=text_block)
    return user_message

In [None]:
# System prompt for the cleaning pass. It fixes two conventions the rest of this
# notebook relies on: input lines arrive wrapped in <> (see wrap_lines), and a
# blank page is reported with the text "blank page", which unwrap_all_lines
# checks for.
system_message_clean = """You are a meticulous and consistent world expert at cleaning OCR-generated Vietnamese text. 
You are cleaning pages from a 1950's Buddhist Journal. 
Each line of scanned data will be enclosed in <> brackets. Leave <> brackets in place.
Your goal is to minimally modify the text to generate a cleaned version.
Do not remove any content from the main body of the text. 
Do not change the line formatting. 

You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate. 
Do not change any proper names, except to add missing diacritical marks or to fix orthographic errors if the context is clear.  

This particular text has a title marker in the footer, "Phat Giao Viet Nam," and also a publishing mark diagonally across the text.  
The publishing mark is "TU VIEN HUE QUANG"  and is faint so only parts of it may appear in some locations in the text. 
Text corresponding to these marks (or part thereof) and page numbers can be omitted.

IMPORTANT: If the page is blank return: blank page 
IMPORTANT: Output the corrected text only with no comments (including ``` xml)"""

In [None]:
def get_max_tokens_for_clean(data: str, factor: float=1, buffer: int=100):
    """
    Compute the completion token limit to request when cleaning *data*.

    The limit is the token count of *data* multiplied by *factor*, rounded
    down, plus *buffer* extra tokens.
    """
    scaled_tokens = token_count(data) * factor
    return buffer + floor(scaled_tokens)

In [None]:
# Load the OCR XML and split it on page breaks so individual pages can be inspected.
text = get_text_from_file(ocr_file_to_process)
pages = split_xml_on_pagebreaks(text)

In [None]:
# Spot-check: print the page at index 3 of the split OCR text.
print(pages[3])

In [None]:
def wrap_lines(text: str) -> str:
    r"""
    Enclose every line of the input text in angle brackets.

    The text is split on '\n' only; whitespace inside each line, including
    leading and trailing spaces, is kept exactly as it is.

    Args:
        text (str): The input string containing lines separated by '\n'.

    Returns:
        str: The same lines, each wrapped as '<line>', joined with '\n'.

    Example:
        >>> wrap_lines("This is a string with   \n   two lines.")
        '<This is a string with   >\n<   two lines.>'
    """
    bracketed_lines = []
    for raw_line in text.split("\n"):
        bracketed_lines.append("<" + raw_line + ">")
    return "\n".join(bracketed_lines)

In [None]:
def wrap_all_lines(pages):
    """Apply wrap_lines to each page and return the wrapped pages as a new list."""
    wrapped_pages = []
    for page in pages:
        wrapped_pages.append(wrap_lines(page))
    return wrapped_pages

In [None]:
def unwrap_lines(text: str) -> str:
    r"""
    Extract the content of every <...> group and join the pieces with newlines.

    Only text found between a '<' and the next '>' on the same line is kept;
    anything outside angle brackets is dropped.

    Parameters:
        text (str): The input string with encapsulated lines.

    Returns:
        str: A newline-separated string with the encapsulation removed.

    Example:
        >>> unwrap_lines("<Line 1> <Line 2> <Line 3>")
        'Line 1\nLine 2\nLine 3'
        >>> unwrap_lines("<Line 1>\n<Line 2>\n<Line 3>")
        'Line 1\nLine 2\nLine 3'
    """
    # Non-greedy capture of whatever sits between each '<' and the next '>'.
    bracketed = re.compile(r"<(.*?)>")
    contents = [match.group(1) for match in bracketed.finditer(text)]
    return "\n".join(contents)

In [None]:
def unwrap_all_lines(pages):
    """
    Remove the angle-bracket line wrapping from every page.

    A page that is exactly "blank page" is passed through untouched; every
    other page goes through unwrap_lines.
    """
    return [
        page if page == "blank page" else unwrap_lines(page)
        for page in pages
    ]

In [None]:
def generate_clean_batch(
    input_xml_file: str,
    output_file: str,
    system_message: str,
    user_wrap_function,
    immediate: bool = False
):
    """
    Generate a batch file for the OpenAI (OA) API using a single input XML file.

    The file is split on page breaks, every line of every page is wrapped in
    angle brackets, and one message sequence per page is written to a JSONL
    batch file together with a per-page token limit.

    Parameters:
        input_xml_file (str): Full path to the input XML file to process.
        output_file (str): Full path to the output batch JSONL file.
        system_message (str): System message template for batch processing.
        user_wrap_function (callable): Function to wrap user input for processing pages.
        immediate (bool): If True, each page is also sent through the
            immediate chat process before the batch file is written. The
            completions are not returned; a failed page is only logged.
            Use run_immediate_clean to collect the cleaned pages.

    Returns:
        str: Path to the created batch file.

    Raises:
        FileNotFoundError: If a file could not be found (logged, then re-raised).
        ValueError: If the input file yields no pages.
        Exception: Any other error during processing (logged, then re-raised).
    """

    try:
        # Read the OCR text from the input file
        text = get_text_from_file(input_xml_file)
        logger.info(f"Processing file: {input_xml_file}")

        # Split the text into pages for processing
        pages = split_xml_on_pagebreaks(text)
        if not pages:
            raise ValueError(f"No pages found in XML file: {input_xml_file}")
        pages = wrap_all_lines(pages)  # wrap lines with brackets.
        logger.info(f"Found {len(pages)} pages in {input_xml_file}.")

        # One token limit per page, index-aligned with the messages below.
        max_tokens = [get_max_tokens_for_clean(page) for page in pages]

        # Generate messages for the pages
        batch_message_seq = generate_messages(system_message, user_wrap_function, pages)

        if immediate:
            logger.info("Running immediate chat process for cleaning:")
            # Send one page at a time with its own token limit, using the same
            # call shape as run_immediate_clean.
            for i, message in enumerate(batch_message_seq):
                logger.info(f"Starting page {i+1}...")
                completion = run_immediate_chat_process(message, max_tokens=max_tokens[i])
                if not completion:
                    # Keep going so the batch file is still produced.
                    logger.error(f"Chat completion failed for page {i+1}.")

        # Save the batch file
        create_jsonl_file_for_batch(batch_message_seq, output_file, max_token_list=max_tokens)
        logger.info(f"Batch file created successfully: {output_file}")

        return output_file

    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        raise
    except ValueError as e:
        logger.error(f"Value error: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error while processing {input_xml_file}: {e}")
        raise

In [None]:
def run_immediate_clean(
    input_xml_file: str,
    system_message: str,
    user_wrap_function,
):
    """
    Clean every page of an OCR XML file through the immediate chat process.

    The file is read and split on page breaks, each line of each page is
    wrapped in angle brackets, and the pages are sent one at a time, each with
    its own token limit.

    Parameters:
        input_xml_file (str): Full path to the input XML file to process.
        system_message (str): System message used for every page.
        user_wrap_function (callable): Function that builds the user message for a page.

    Returns:
        list: The completion content of each page, in page order.

    Raises:
        FileNotFoundError: If a file could not be found (logged, then re-raised).
        ValueError: If the input file yields no pages.
        RuntimeError: If a chat completion comes back empty.
        Exception: Any other error during processing (logged, then re-raised).
    """
    try:
        # Load the OCR text from the input file
        ocr_text = get_text_from_file(input_xml_file)
        logger.info(f"Processing {input_xml_file} for immediate cleaning:")

        # Break the text into pages, then bracket every line of every page.
        pages = wrap_all_lines(split_xml_on_pagebreaks(ocr_text))
        if not pages:
            raise ValueError(f"No pages found in XML file: {input_xml_file}")
        logger.info(f"Found {len(pages)} pages in {input_xml_file}.")

        # Per-page token limits, index-aligned with the messages.
        max_tokens = list(map(get_max_tokens_for_clean, pages))

        # Build the message sequence for the pages
        batch_message_seq = generate_messages(system_message, user_wrap_function, pages)

        cleaned_pages = []
        for i, message in enumerate(batch_message_seq):
            logger.info(f"Starting page {i+1}...")
            completion = run_immediate_chat_process(message, max_tokens=max_tokens[i])
            if not completion:
                logger.error("Chat completion failed.")
                raise RuntimeError("Chat could not complete.")
            cleaned_pages.append(get_completion_content(completion))

        logger.info(f"Cleaning completed successfully.")
        return cleaned_pages

    except FileNotFoundError:
        logger.error(f"File not found.")
        raise
    except ValueError as e:
        logger.error(f"Value error: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error while processing {input_xml_file}: {e}")
        raise

In [None]:
# Settings for the cleaning run: gpt-4o at temperature 0 with max_tokens 1000.
# NOTE(review): per-page token limits are also passed when messages are sent;
# how they combine with this max_tokens value depends on gpt_processing — confirm.
model_settings_clean = {
    "gpt-4o": {
        "max_tokens": 1000,
        "temperature": 0
    }
}
set_model_settings(model_settings_clean)

In [None]:
# Sanity check: show the OCR input path and whether it exists on disk.
ocr_file_to_process, ocr_file_to_process.exists()

In [None]:
# clean_result = run_immediate_clean(ocr_file_to_process, system_message_clean, user_wrap_function_clean)
# cleaned_data = clean_result

In [None]:
# Write the JSONL batch file for the OCR input, then submit it as a batch job.
generate_clean_batch(ocr_file_to_process, batch_file_path, system_message_clean, user_wrap_function_clean)
job_description = f"cleaning test for {journal_name} on file: {ocr_file_to_process}"
# Later cells index and unwrap cleaned_data as a list of page strings —
# presumably one response per page; verify against start_batch_with_retries.
cleaned_data = start_batch_with_retries(batch_file_path, job_description)

In [None]:
# Clean up files using the current time as the cutoff.
# NOTE(review): presumably removes files older than the given datetime, which
# with now() would mean every existing file — confirm in gpt_processing.
delete_old_files(datetime.now())

In [None]:
# Spot-check the batch result at index 5 (still wrapped in <> at this point).
cleaned_data[5]

In [None]:
# Strip the per-line <> wrapping from each page; "blank page" entries are kept as-is.
cleaned_data = unwrap_all_lines(cleaned_data)

In [None]:
# Display the unwrapped pages.
cleaned_data

In [None]:
# Save the cleaned pages as XML at cleaned_xml_path (overwrite=True is passed).
save_pages_to_xml(cleaned_xml_path, cleaned_data, overwrite=True)

In [None]:
# Combine the cleaned pages into one text.
# NOTE(review): join_pages is not imported or defined in the visible part of
# this notebook; as written this cell raises NameError unless it is provided elsewhere.
full_cleaned_text = join_pages(cleaned_data)

In [None]:
# Print the combined cleaned text for review.
print(full_cleaned_text)

In [None]:
# Location of the full cleaned XML for this journal issue. The issue is
# identified by journal_name, the name the path settings at the top of the
# notebook define; no `basename` variable exists here.
full_cleaned_path = journal_dir / journal_name / f"full_cleaned_{journal_name}.xml"
full_cleaned_path

In [None]:
#write_text_to_file(full_cleaned_path, full_cleaned_text)

In [None]:
# Read the cleaned XML currently stored at full_cleaned_path.
full_cleaned_current = get_text_from_file(full_cleaned_path)

In [None]:
# Print the stored cleaned text for review.
print(full_cleaned_current)

In [None]:
# Token count of the full cleaned text.
token_count(full_cleaned_current)

In [None]:
# Split the cleaned XML back into pages.
# NOTE(review): presumably splits on <page> elements, since the <page> tags are stripped afterwards — confirm.
cleaned_pages = split_xml_pages(full_cleaned_current)

In [None]:
# Display the split pages.
cleaned_pages

In [None]:
import re

def remove_page_tags(text):
    """
    Strip <page ...> opening tags and </page> closing tags from a string.

    Only the tags themselves are removed; the text between them is kept.

    Parameters:
    - text (str): The input text containing <page> tags.

    Returns:
    - str: The text with <page> tags removed.
    """
    # Opening tags (with any attributes) first, then closing tags.
    for tag_pattern in (r"<page[^>]*>", r"</page>"):
        text = re.sub(tag_pattern, "", text)
    return text

In [None]:
# Drop the <page ...> / </page> tags from every page, keeping only the page content.
cleaned_pages = [remove_page_tags(page) for page in cleaned_pages]

In [None]:
# Spot-check the page at index 1 after tag removal.
cleaned_pages[1]

In [None]:
# Spot-check the page at index 8 after tag removal.
cleaned_pages[8]