In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from lxml import etree
%aimport json
from pathlib import Path
%aimport re
from typing import List, Dict
%aimport os
from xml.sax.saxutils import escape

In [None]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

In [None]:
from data_processing.text_processing import write_text_to_file

In [None]:
from data_processing.gpt_processing import token_count, set_api_client, get_api_client, get_active_batches
from data_processing.gpt_processing import generate_messages, run_immediate_chat_process, create_jsonl_file_for_batch, start_batch
from data_processing.gpt_processing import get_completed_batches, get_batch_response


In [None]:
user_message_string_clean = """{text}"""

In [None]:
def user_wrap_function_clean(text_block):
    """Build the user message for a cleaning request.

    Substitutes ``text_block`` for the ``{text}`` placeholder of the
    module-level ``user_message_string_clean`` template and returns the result.
    """
    template = user_message_string_clean
    return template.format(text=text_block)

In [None]:
# First draft of the system prompt for the OCR-cleaning pass.
# NOTE: `system_message_clean` is assigned again further down in this notebook, so on a
# top-to-bottom run this value is replaced before the cleaning batches are generated.
system_message_clean = """
- You are a meticulous and consistent world expert at cleaning OCR-generated Vietnamese text. 
- You are cleaning text from a 1950's Buddhist Journal. 
IMPORTANT: You will receive one page of text listed by lines: do not modify the line structure.
- You will minimally modify the text to fix any OCR errors.
- You will add <section>, <title>, <author>, <p>, <li>, <ul>, <ol> tags (only) where appropriate in text, based on text layout and content. 
- Do not remove any content from the main body of the text.   
- Use patterns in the text blocks (given by page) to infer patterns in the text.
- Use the semantic meaning of the text to infer corrections—but make no semantic changes. 
- Add diacritical marks if they are missing or clearly inaccurate. 
- Do not change any proper names, except to add missing diacritical marks if the context is clear.  
IMPORTANT: Output the corrected text only with no comments or additional formatting marks of any kind.
"""

In [None]:
# System prompt for the OCR-cleaning pass; this is the value passed to generate_messages
# when the cleaning batch files are built below.
# It reassigns `system_message_clean` (an earlier draft is defined above) and spells out
# the full list of XML tags the model is allowed to add.
system_message_clean = """
- You are a meticulous and consistent world expert at cleaning OCR-generated Vietnamese text. 
- You are cleaning text from a 1950's Buddhist Journal. 
IMPORTANT: You will receive one page of text listed by lines: do not modify the line structure.
- You will minimally modify the text to fix any OCR errors.
- Do not remove any content from the main body of the text.   
- Use patterns in the text blocks (given by page) to infer patterns in the text.
- Use the semantic meaning of the text to infer corrections—but make no semantic changes. 
- Add diacritical marks if they are missing or clearly inaccurate. 
- Do not change any proper names, except to add missing diacritical marks if the context is clear.  
IMPORTANT: Output the corrected text only with no comments or additional formatting marks of any kind.

You will add the following xml tags (only) where appropriate in text, based on text layout and content: 

   <p> for paragraphs.
   <section> for major sections.
   <subsection> for subsections.
   <title> for main titles of sections and subsections. 
   <subtitle> for subtitles of sections and subsections. 
   <heading> for headings that do not mark titles or subtitles
   <contents> for tables of contents
   <author> for authors of sections or subsections
   <blockquote> for block or pull quotes
   <note> for notes within the text
   <ol> <ul> <li> for lists
   <i> for italics. 
   <b> for bold.

"""

### testing gpt-4o-mini system message

You are an intelligent, meticulous, and consistent expert at cleaning OCR-generated Vietnamese text. 
The text will be given in XML, and the output text should be in matching XML. 
Your goal is to minimally modify the text to generate a cleaned version. 
Do not remove any text from the main content.  
Formatting markers at the beginning and end of the text can be adjusted or removed as needed for clarity. 
You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate and can be determined by context.
Do not change any proper names, except to add missing diacritical marks if the context is clear.  
This particular text has a title marker: "Phat Giao Viet Nam," and also a publishing mark near the end of each page of text. 
The publishing mark is something like "Tu Vien HUE QUANG" + "TRUNG TAM DICH THUAT HAN NOM" and is often incomplete.
Text corresponding to these marks (or part thereof) and page numbers can be omitted.
Output the corrected text only with no comments.

### Process files in sequence and generate cleaning batch JSON files:

In [None]:
def split_xml_pages(text, page_groups=None):
    """
    Splits an XML document into individual pages based on <page> tags.
    Optionally groups pages together based on page_groups.

    Parameters:
    - text (str): The XML document as a string.
    - page_groups (list of tuples, optional): A list of tuples defining page ranges to group together.
                                              Each tuple is of the form (start_page, end_page), inclusive.

    Returns:
    - List[str]: A list of strings, where each element is a single page (if no groups) or the
                 concatenation of all pages in a group. Pages are ordered by the integer value
                 of their "page" attribute. Groups that match no page are omitted.
                 An empty list is returned when the document cannot be parsed (the syntax
                 error is printed) or contains no <page> elements.

    Raises:
    - ValueError: If a <page> element has a missing or non-integer "page" attribute.
    """
    from lxml import etree

    # Parse the XML text into an element tree
    try:
        root = etree.fromstring(text.encode("utf-8"))
    except etree.XMLSyntaxError as e:
        # Handle parsing errors with helpful debugging information
        line_number = e.lineno
        column_number = e.offset
        lines = text.splitlines()
        # Only index into the text when the parser reported a usable 1-based line number;
        # a value of 0/None would otherwise pick the wrong line or raise inside this handler.
        if line_number and 1 <= line_number <= len(lines):
            error_line = lines[line_number - 1]
        else:
            error_line = "Unknown line"
        print(f"XMLSyntaxError: {e}")
        print(f"Offending line {line_number}, column {column_number}: {error_line}")
        return []  # Return an empty list if parsing fails

    # Collect (page_number, serialized_page) pairs, validating each page number
    pages = []
    for page in root.findall(".//page"):
        raw_number = page.get("page")
        try:
            page_number = int(raw_number)
        except (TypeError, ValueError):
            # Fail with a message that identifies the element instead of a bare int() error.
            raise ValueError(
                f"<page> element at source line {page.sourceline} has a missing or "
                f"non-integer 'page' attribute: {raw_number!r}"
            ) from None
        pages.append((page_number, etree.tostring(page, encoding="unicode")))

    # Sort pages by page number (stable, so equal numbers keep document order)
    pages.sort(key=lambda item: item[0])

    # If no page_groups, return individual pages
    if not page_groups:
        return [content for _, content in pages]

    # Group pages based on page_groups
    grouped_pages = []
    for start, end in page_groups:
        group_content = "".join(
            content for page_num, content in pages if start <= page_num <= end
        )
        if group_content:
            grouped_pages.append(group_content)

    return grouped_pages

In [None]:
import re

def clean_xml_keep_pages(xml_content):
    """
    Remove all XML tags except <page> and <document>, preserving all whitespace.

    Only the tags themselves are removed; the text between them is kept unchanged.
    A tag is kept only when its name is exactly "page" or "document" (opening,
    closing or self-closing form, with or without attributes).

    Args:
        xml_content (str): The original XML document as a string.

    Returns:
        str: The cleaned XML content with only <page> and <document> tags preserved.
    """
    # The tag name must be followed by whitespace, "/" or ">" to count as page/document.
    # A plain word boundary (\b) is not enough: "-" and ":" are non-word characters, so
    # names such as <page-break/> or <document-meta> would be treated as <page>/<document>.
    unwanted_tags_pattern = r"<(?!/?(?:page|document)(?=[\s/>]))[^>]+>"

    # Use re.sub to replace unwanted tags with an empty string
    return re.sub(unwanted_tags_pattern, "", xml_content)

# Example usage
xml_example = """
<document>
  <title>This is a title</title>
  <page page="1">
    <paragraph>This is some text in a paragraph.</paragraph>
    <section>Here is a section.</section>
  </page>
  <page page="2">
    <note>Some note text.</note>
  </page>
</document>
"""

cleaned_xml = clean_xml_keep_pages(xml_example)
print(cleaned_xml)

In [None]:
processed_journals = "../../processed_journal_data"
batch_ouput_prefix = "./journal_cleaning_batches/"

In [None]:
ocr_xml = get_text_from_file("phat-giao-viet-nam-1956-28/full_OCR_text_phat-giao-viet-nam-1956-28.xml", processed_journals)

In [None]:
pages = split_xml_pages(ocr_xml)

In [None]:
print(pages[3])

In [None]:
pages

In [None]:
print(system_message_clean)

In [None]:
# Generate one cleaning batch JSONL file per journal directory, built from the pages of
# each "full_OCR_*.xml" file found directly inside that directory.

processed_journals = "../../processed_journal_data"
batch_ouput_prefix = "./journal_cleaning_batches/"

# Compiled once instead of once per file. The trailing "$" keeps leftovers such as
# "full_OCR_x.xml.bak" from matching and overwriting the journal's batch file.
regex = re.compile(r"^full_OCR_.*\.xml$")

# Make sure the output directory exists before any batch file is written.
Path(batch_ouput_prefix).mkdir(parents=True, exist_ok=True)

# sorted() gives a reproducible processing order across runs and machines.
for path in sorted(Path(processed_journals).iterdir()):
    if not path.is_dir():
        continue
    filename = path.name  # the journal directory name is used in the batch file name
    for subpath in sorted(path.iterdir()):
        if not (subpath.is_file() and regex.search(subpath.name)):
            continue
        print(subpath.name)
        try:
            text = get_text_from_file(subpath.name, path)
            print(f"{text[:90]}...")
            chunks = split_xml_pages(text)
            if not chunks:
                # split_xml_pages returns [] when the XML cannot be parsed or has no
                # <page> elements; do not write a batch file with nothing in it.
                print(f"No pages found in {subpath}; skipping batch file.")
                continue
            clean_message_seq = generate_messages(system_message_clean, user_wrap_function_clean, chunks)
            create_jsonl_file_for_batch(clean_message_seq, batch_ouput_prefix + "clean_batch_" + filename + ".jsonl")
        except Exception as e:
            # Best-effort: report the failure and move on to the next file.
            print(f"{subpath}: {e}\nfailed.. \nContinuing.")
                    

In [None]:
import logging
from pathlib import Path

def generate_single_oa_batch_from_pages(
    input_xml_file: str,
    output_file: str,
    system_message: str,
    user_wrap_function,
):
    """
    Build a batch JSONL file for the OpenAI (OA) API from the pages of one XML file.

    The file is read with get_text_from_file, split with split_xml_pages, passed
    through generate_messages together with the system message and wrap function,
    and the result is written with create_jsonl_file_for_batch.

    Parameters:
        input_xml_file (str): Full path to the input XML file to process.
        output_file (str): Full path to the output batch JSONL file.
        system_message (str): System message template for batch processing.
        user_wrap_function (callable): Function to wrap user input for processing pages.

    Returns:
        str: ``output_file``, the path of the batch file that was written.

    Raises:
        ValueError: If split_xml_pages returns no pages for the file.
        Exception: Any other error raised while reading, splitting or writing; every
            error is logged and then re-raised unchanged.
    """
    log = logging.getLogger(__name__)

    try:
        # Load the raw XML text
        xml_text = get_text_from_file(input_xml_file)
        log.info(f"Processing file: {input_xml_file}")

        # One entry per page; an empty result is treated as an error
        page_list = split_xml_pages(xml_text)
        if not page_list:
            raise ValueError(f"No pages found in XML file: {input_xml_file}")
        log.info(f"Found {len(page_list)} pages in {input_xml_file}.")

        # Build the message sequence and write it out as the batch file
        message_seq = generate_messages(system_message, user_wrap_function, page_list)
        create_jsonl_file_for_batch(message_seq, output_file)
        log.info(f"Batch file created successfully: {output_file}")

        return output_file

    except Exception as err:
        # Pick the log message by error type, then propagate the original exception.
        if isinstance(err, FileNotFoundError):
            log.error(f"File not found: {input_xml_file}")
        elif isinstance(err, ValueError):
            log.error(f"Value error: {err}")
        else:
            log.error(f"Unexpected error while processing {input_xml_file}: {err}")
        raise

In [None]:
import logging
from pathlib import Path
import re

def generate_all_batches(
    processed_document_dir: str,
    system_message: str,
    user_wrap_function,
    file_regex: str = r".*\.xml",
):
    """
    Generate a batch file for every matching file directly inside a directory.

    Each regular file whose name matches ``file_regex`` is handed to
    generate_single_oa_batch_from_pages; the batch file is written next to the
    input with its suffix replaced by ".jsonl". A failure on one file is logged
    and does not stop the remaining files from being processed.

    Parameters:
        processed_document_dir (str): Path to the directory containing the files to process.
        system_message (str): System message template for batch processing.
        user_wrap_function (callable): Function to wrap user input for processing pages.
        file_regex (str): Regex pattern searched in each file name to identify
            target files (default: ".*\\.xml").

    Returns:
        None
    """
    log = logging.getLogger(__name__)
    source_dir = Path(processed_document_dir)
    name_pattern = re.compile(file_regex)

    for candidate in source_dir.iterdir():
        # Skip directories and anything whose name does not match the pattern
        if not candidate.is_file():
            continue
        if not name_pattern.search(candidate.name):
            continue

        try:
            # Output sits beside the input, same stem, ".jsonl" suffix
            target = candidate.with_suffix(".jsonl")
            log.info(f"Generating batch for {candidate}...")

            generate_single_oa_batch_from_pages(
                input_xml_file=str(candidate),
                output_file=str(target),
                system_message=system_message,
                user_wrap_function=user_wrap_function,
            )
        except Exception as err:
            log.error(f"Failed to process {candidate}: {err}")

    log.info("Batch generation completed.")

In [None]:
batch_client = set_api_client()

In [None]:
batch_job_dir = "./journal_cleaning_batches"
batch_files = os.listdir(batch_job_dir)
batch_files


In [None]:
file_path = os.path.join(batch_job_dir, 'clean_batch_phat-giao-viet-nam-1956-28.jsonl')
file_path

## completed batches:
10, 25-26

In [None]:
batch28 = start_batch(file_path)

In [None]:
# file_path

In [None]:
# batch_27 = start_batch(file_path)

In [None]:
# batch_05_06 = start_batch(file_path)

In [None]:
get_active_batches()

In [None]:
completed = get_completed_batches()

In [None]:
completed

In [None]:
cleaned_data = get_batch_response(completed[0]['id'])

In [None]:
len(cleaned_data)

In [None]:
print(cleaned_data[3])

In [None]:
def join_pages(data):
    """
    Wrap page strings in a single <document> root element.

    Parameters:
        data (iterable of str): Page strings, in output order. Any iterable is
            accepted (list, tuple, generator); a single string is treated as one page.

    Returns:
        str: "<document>", each page, and "</document>", joined by newlines.
    """
    # A bare string would otherwise be unpacked character by character.
    if isinstance(data, str):
        data = [data]

    # Unpacking (rather than list + data) also accepts tuples and generators.
    return "\n".join(["<document>", *data, "</document>"])


In [None]:
cleaned_data

In [None]:
full_cleaned_text = join_pages(cleaned_data)

In [None]:
print(full_cleaned_text)

In [None]:
#write_text_to_file("full_cleaned_journal_28.xml", full_cleaned_text)

In [None]:
# Root of the processed journal data and the directory of the journal being worked on.
# NOTE(review): this is an absolute, machine-specific path, whereas earlier cells use the
# relative "../../processed_journal_data"; the notebook will not run on another machine
# without editing it. Consider deriving it from a single configurable base directory.
processed_journal_path = Path("/Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data")
current_journal_dir = processed_journal_path / "phat-giao-viet-nam-1956-28"

In [None]:
full_cleaned_text = get_text_from_file(current_journal_dir / "full_cleaned_phat-giao-viet-nam-1956-28.xml")

In [None]:
token_count(full_cleaned_text)

In [None]:
cleaned_sections = split_xml_pages(full_cleaned_text, page_groups=[(1, 6), (7, 17),(18, 25), (26, 30), (31, 36), (37, 37), (38, 44), (45, 51)])

In [None]:
print(clean_xml_keep_pages(cleaned_sections[0]))

In [None]:
print(cleaned_sections[0])

In [None]:
[token_count(x) for x in cleaned_sections]

In [None]:
# System prompt for the Vietnamese-to-English translation pass; passed to
# generate_messages below together with the cleaned journal sections.
# NOTE(review): the numbered footnote rules in the prompt go 1, 2, 4, 5, 6, 7 (no item 3).
# Confirm whether a rule was dropped or the list should simply be renumbered.
system_message_translate = """
You are Thich Nhat Hanh translating from Vietnamese to English for your experienced students. 
The text is a section of a journal you edited from 1956-1958. Use the title: "Journal of Vietnamese Buddhism" 
You want your students to understand the material and its historical and cultural context—in particular, as it relates to your life and teachings.
Give a full English translation in the style of Thich Nhat Hanh. Translate for the most meaningful, typical, and eloquent English interpretation.
Keep pages together: each translated page must match its original page source as pages will be studied side by side with the original Vietnamese.
Add XML tags for clarity. Use only the following: 

   <p> for paragraphs.
   <section> for major sections.
   <subsection> for subsections.
   <title> for main titles of sections and subsections. 
   <subtitle> for subtitles of sections and subsections. 
   <heading> for headings that do not mark titles or subtitles
   <contents> for tables of contents
   <author> for authors of sections or subsections
   <ol> <ul> <li> for lists
   <i> for italics. 
   <b> for bold.
   <footnote> <footnote-section> for footnotes.

All titles, XML sections, text, and terms should be translated--do not leave any terms or expressions in Vietnamese, except names of Vietnamese people.
Add footnotes as follows:

1. Structure
   - Inline Reference: Use `<footnote number="X">[X]</footnote>` directly after the reference in the text.
   - Footnote Section: Include all footnote explanations in `<footnote-section>` at the end of the document. Example:
     A sentence with a footnote.<footnote number="1">[1]</footnote>
     <footnote-section>
         <footnote number="1">Explanation for footnote 1.</footnote>
     </footnote-section>

2. Numbering
   - Start numbering at 1 for each new section.
   - Increment sequentially for each new reference.

4. Placement
   - Inline `<footnote>` tags immediately follow the referenced text.
   - `<footnote-section>` appears at the end of each section.

5. Formatting
   - Inline footnote references use square brackets: `[X]`.
   - Explanations appear only in `<footnote-section>`.
   - Highlight complex terms (Sanskrit, Sino-Vietnamese, French) with `<i>` tags and explain in footnotes. 
   - Each footnote should always include the original text or term before translation.

6. Content 
   - Use footnotes liberally to explain:
     * Elements of Vietnamese Buddhism or Buddhism in general.
     * Vietnamese culture and history.
     * Life, teachings, and practices of Thich Nhat Hanh.
   - For footnoted terms, include the original Vietnamese, Sino-Vietnamese, Sanskrit, or French in the explanation.

7. Examples:
   Inline: Thich Nhat Hanh emphasized <i>mindfulness</i> <footnote number="1">[1]</footnote>.
   Section: 
   <footnote-section>
       <footnote number="1"><i>Mindfulness</i>: Original term is "Chánh niệm" (Vietnamese).</footnote>
   </footnote-section>
"""

In [None]:
token_count(system_message_translate)

In [None]:
user_message_string_translate = """{text}"""

In [None]:
def user_wrap_function_translate(text_block):
    """Build the user message for a translation request.

    Substitutes ``text_block`` for the ``{text}`` placeholder of the
    module-level ``user_message_string_translate`` template and returns the result.
    """
    template = user_message_string_translate
    return template.format(text=text_block)

In [None]:
translation_message_seq = generate_messages(system_message_translate, user_wrap_function_translate, cleaned_sections[1:2])

In [None]:
batch_job_dir = Path("../gpt_batch_files/journal_translate_batches")
journal_name = "phat-giao-viet-nam-1956-28"
batch_file_name = "translate_batch_" + journal_name + ".jsonl" 
batch_path = batch_job_dir / batch_file_name

In [None]:
create_jsonl_file_for_batch(translation_message_seq, batch_path)

In [None]:
tx_batch = start_batch(batch_path)
tx_batch

In [None]:
get_active_batches()

In [None]:
completed = get_completed_batches()

In [None]:
completed

In [None]:
translated_data = get_batch_response(completed[0]['id'])

In [None]:
len(translated_data)

In [None]:
print(translated_data[0])

In [None]:
full_translated_text = join_pages(translated_data)

In [None]:
print(full_translated_text)

In [None]:
token_count(full_translated_text)

In [None]:
write_text_to_file("full_tx_phat-giao-viet-nam-1956-28.xml", full_translated_text)