In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from lxml import etree
%aimport json
from pathlib import Path
%aimport re
from typing import List, Dict
%aimport os
from xml.sax.saxutils import escape
from pathlib import Path

In [None]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

In [None]:
from data_processing.text_processing import write_text_to_file

In [None]:
from data_processing.gpt_processing import token_count, set_api_client, get_api_client, get_active_batches
from data_processing.gpt_processing import generate_messages, run_immediate_chat_process, create_jsonl_file_for_batch, start_batch
from data_processing.gpt_processing import get_completed_batches, get_batch_response


In [None]:
from data_processing.xml_processing import split_xml_pages, save_pages_to_xml

In [None]:
# Project root directory. NOTE: hard-coded absolute path for one machine —
# adjust it when running the notebook elsewhere.
project_dir = Path("/Users/phapman/Desktop/tnh-scholar/")
# Directories derived from the project root; the later cells build every
# input and output path from these two.
data_dir = project_dir / "data_processing"
journal_dir = data_dir / "processed_journal_data"

In [None]:
user_message_string_clean = """{text}"""

In [None]:
def user_wrap_function_clean(text_block):
    """Return the user message for the cleaning prompt.

    `text_block` is substituted for the `{text}` placeholder of the
    module-level template `user_message_string_clean`.
    """
    wrapped_message = user_message_string_clean.format(text=text_block)
    return wrapped_message

In [None]:
system_message_clean = """
You are a meticulous and consistent world expert at cleaning OCR-generated Vietnamese text. 
You are cleaning text from a 1950's Buddhist Journal. 
The text will be given in XML, and the output text should be in matching XML. 
Your goal is to minimally modify the text to generate a cleaned version.
Do not remove any text from the main body of the text. 
Formatting markers (such as footers) at the end of the text can be adjusted or removed as needed for clarity. 
You can use patterns in the text blocks (given by page) to infer patterns in the text.
You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate. 
Do not change any proper names, except to add missing diacritical marks if the context is clear.  
This particular text has a title marker: "Phat Giao Viet Nam," and also a publishing mark near the end of each page of text. 
The publishing mark is something like "TU VIEN HUE QUANG"  and is very difficult for the OCR to process. 
Text corresponding to these marks (or part thereof) and page numbers can be omitted.
Output the corrected text only with no comments (including ``` xml).
"""

In [None]:
# Base name of the document being processed; reused below to build the
# input and output file names.
basename = "phat-giao-viet-nam-1956-01"
# OCR XML input: <journal_dir>/<basename>/full_OCR_<basename>.xml
ocr_file_to_process = journal_dir / basename / f"full_OCR_{basename}.xml"
ocr_file_to_process  # last expression: displayed as the cell output

In [None]:
print(system_message_clean)

In [None]:


def generate_single_oa_batch_from_pages(
    input_xml_file: str,
    output_file: str,
    system_message: str,
    user_wrap_function,
):
    """
    Generate a batch file for the OpenAI (OA) API using a single input XML file.

    The input file is read, split into pages with `split_xml_pages`, turned
    into one message sequence per page with `generate_messages`, and written
    out with `create_jsonl_file_for_batch`.

    Parameters:
        input_xml_file (str): Full path to the input XML file to process.
            (This notebook also passes `pathlib.Path` objects; the value is
            forwarded unchanged to `get_text_from_file`.)
        output_file (str): Full path to the output batch JSONL file.
        system_message (str): System message used for every page.
        user_wrap_function (callable): Function that wraps the text of one
            page into the user message.

    Returns:
        The `output_file` argument, exactly as it was passed in.

    Raises:
        FileNotFoundError: Logged and re-raised if raised during processing
            (presumably by `get_text_from_file` when the input is missing).
        ValueError: If `split_xml_pages` returns no pages, or if a helper
            raises it; logged and re-raised.
        Exception: Any other error is logged and re-raised unchanged.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell (the one that imports `logging`) having been executed first.
    import logging

    logger = logging.getLogger(__name__)

    try:
        # Read the OCR text from the input XML file
        text = get_text_from_file(input_xml_file)
        logger.info("Processing file: %s", input_xml_file)

        # Split the text into pages for processing
        pages = split_xml_pages(text)
        if not pages:
            raise ValueError(f"No pages found in XML file: {input_xml_file}")
        logger.info("Found %s pages in %s.", len(pages), input_xml_file)

        # Generate one message sequence per page
        batch_message_seq = generate_messages(system_message, user_wrap_function, pages)

        # Save the batch file
        create_jsonl_file_for_batch(batch_message_seq, output_file)
        logger.info("Batch file created successfully: %s", output_file)

        return output_file

    except FileNotFoundError:
        logger.error("File not found: %s", input_xml_file)
        raise
    except ValueError as e:
        logger.error("Value error: %s", e)
        raise
    except Exception as e:
        # Log at this boundary for context, then let the caller decide.
        logger.error("Unexpected error while processing %s: %s", input_xml_file, e)
        raise

In [None]:
import logging
from pathlib import Path
import re

def generate_all_batches(
    processed_document_dir: str,
    system_message: str,
    user_wrap_function,
    file_regex: str = r".*\.xml",
):
    """
    Generate cleaning batches for all matching files in the specified directory.

    Every regular file directly inside `processed_document_dir` whose name
    matches `file_regex` is passed to `generate_single_oa_batch_from_pages`.
    The batch file is written next to the input, with the suffix replaced
    by ".jsonl". A failure on one file is logged and does not stop the
    remaining files from being processed.

    Parameters:
        processed_document_dir (str): Path to the directory containing the files to process.
        system_message (str): System message template for batch processing.
        user_wrap_function (callable): Function to wrap user input for processing pages.
        file_regex (str): Regex pattern to identify target files (default: ".*\\.xml").
            The pattern is applied with `re.search` to the file name only, so
            it is unanchored: the default matches any name containing ".xml".

    Returns:
        None
    """
    logger = logging.getLogger(__name__)
    document_dir = Path(processed_document_dir)
    regex = re.compile(file_regex)

    # iterdir() yields entries in arbitrary order; sort so batches are
    # generated (and logged) in a reproducible order.
    for journal_file in sorted(document_dir.iterdir()):
        if not (journal_file.is_file() and regex.search(journal_file.name)):
            continue

        # Derive output file path
        output_file = journal_file.with_suffix(".jsonl")
        logger.info(f"Generating batch for {journal_file}...")

        try:
            # Call single batch function
            generate_single_oa_batch_from_pages(
                input_xml_file=str(journal_file),
                output_file=str(output_file),
                system_message=system_message,
                user_wrap_function=user_wrap_function,
            )
        except Exception as e:
            # Deliberate best-effort: record the failure and move on.
            logger.error(f"Failed to process {journal_file}: {e}")

    logger.info("Batch generation completed.")

In [None]:
batch_client = set_api_client()

In [None]:
# Directory that holds the JSONL batch files.
batch_job_dir = data_dir / "gpt_processing" / "gpt_batch_files"
# Batch file for the cleaning pass of the current `basename`.
# NOTE: `batch_file_path` is rebound further down for the translate batch.
batch_file_path = batch_job_dir / "journal_cleaning_batches" / f"clean_batch_{basename}.jsonl"
batch_file_path  # last expression: displayed as the cell output

In [None]:
generate_single_oa_batch_from_pages(ocr_file_to_process, batch_file_path, system_message_clean, user_wrap_function_clean)

## completed batches:
10, 25-26

In [None]:
#batch = start_batch(batch_file_path)

In [None]:
get_active_batches()

In [None]:
completed = get_completed_batches()
completed

In [None]:
# Fetch the responses of the first entry in `completed`.
# NOTE(review): assumes completed[0] is the cleaning batch for `basename` —
# confirm against the `completed` listing before relying on the result.
cleaned_data = get_batch_response(completed[0]['id'])

In [None]:
len(cleaned_data)

In [None]:
print(cleaned_data[20])

In [None]:
def join_pages(data):
    """Wrap page strings in a <document> element, one item per line.

    Parameters:
        data: Any iterable of strings (list, tuple, generator, ...).

    Returns:
        str: "<document>", each item of `data`, and "</document>", joined
        with newlines. An empty `data` gives "<document>\\n</document>".
    """
    # Unpacking accepts any iterable; plain list concatenation only
    # accepted lists.
    return "\n".join(["<document>", *data, "</document>"])


In [None]:
cleaned_data

In [None]:
full_cleaned_text = join_pages(cleaned_data)

In [None]:
print(full_cleaned_text)

In [None]:
full_cleaned_path = journal_dir / basename / f"full_cleaned_{basename}.xml"
full_cleaned_path

In [None]:
#write_text_to_file(full_cleaned_path, full_cleaned_text)

In [None]:
full_cleaned_current = get_text_from_file(full_cleaned_path)

In [None]:
print(full_cleaned_current)

In [None]:
token_count(full_cleaned_current)

In [None]:
cleaned_pages = split_xml_pages(full_cleaned_current)

In [None]:
cleaned_pages

In [None]:
import re

def remove_page_tags(text):
    """
    Removes <page ...> and </page> tags from a text string.

    Only tags whose element name is exactly "page" are removed. Tags that
    merely start with "page" (for example <pagebreak/> or <pages>) are left
    untouched. The text between the tags is kept as-is.

    Parameters:
    - text (str): The input text containing <page> tags.

    Returns:
    - str: The text with <page> tags removed.
    """
    # "</?page" matches an opening or closing tag; the optional group
    # requires whitespace right after the name before any attributes, which
    # is what keeps longer tag names from matching; "/?" covers the
    # self-closing form <page/>.
    return re.sub(r"</?page(?:\s[^>]*)?/?>", "", text)

In [None]:
cleaned_pages = [remove_page_tags(page) for page in cleaned_pages]

In [None]:
cleaned_pages[1]

In [None]:
cleaned_pages[8]

In [None]:
# cleaned_sections = split_xml_pages(full_cleaned_text, page_groups=[(1, 6), (7, 17),(18, 25), (26, 30), (31, 36), (37, 37), (38, 44), (45, 51)])
# print(clean_xml_keep_pages(cleaned_sections[0]))

In [None]:
# print(cleaned_sections[0])

In [None]:
system_message_translate = """
You are Thich Nhat Hanh translating from Vietnamese to English for your experienced students. 
The text is based on an OCR scan of a journal you edited from 1956-1958. Use the title: "Journal of Vietnamese Buddhism" 
You want your students to understand the material and its historical and cultural context—in particular, as it relates to your life and teachings.
You will be translating a single section of the journal and with the title
Translate for the most meaningful, typical, and eloquent English interpretation.
Keep pages together: each translated page must match its original page source as pages will be studied side by side with the original Vietnamese.
Infer paragraphs and text structure from the text layout.
Add XML tags for clarity. Use only the following: 

   <p> for paragraphs.
   <section> for major sections.
   <subsection> for subsections.
   <title> for main titles of sections and subsections. 
   <subtitle> for subtitles of sections and subsections. 
   <heading> for headings that do not mark titles or subtitles
   <contents> for tables of contents
   <author> for authors of sections or subsections
   <ol> <ul> <li> for lists
   <i> for italics. 
   <b> for bold.
   <footnote> <footnote-section> for footnotes (see below).

All titles, XML sections, text, and terms should be translated--do not leave any terms or expressions in Vietnamese, except names of Vietnamese people.

Add footnotes as follows:

1. Structure
   - Inline Reference: Use `<footnote number="X">[X]</footnote>` directly after the reference in the text.
   - Footnote Section: Include all footnote explanations in `<footnote-section>` at the end of the document. Example:
     A sentence with a footnote.<footnote number="1">[1]</footnote>
     <footnote-section>
         <footnote number="1">Explanation for footnote 1.</footnote>
     </footnote-section>

2. Numbering
   - Start numbering at 1 for each new section.
   - Increment sequentially for each new reference.

4. Placement
   - Inline `<footnote>` tags immediately follow the referenced text.
   - `<footnote-section>` appears at the end of each section.

5. Formatting
   - Inline footnote references use square brackets: `[X]`.
   - Explanations appear only in `<footnote-section>`.
   - Highlight complex terms (Sanskrit, Sino-Vietnamese, French) with `<i>` tags and explain in footnotes. 
   - Each footnote should always include the original text or term before translation.

6. Content 
   - Use footnotes liberally to explain:
     * Elements of Vietnamese Buddhism or Buddhism in general.
     * Vietnamese culture and history.
     * Life, teachings, and practices of Thich Nhat Hanh.
   - For footnoted terms, include the original Vietnamese, Sino-Vietnamese, Sanskrit, or French in the explanation.

7. Examples:
   Inline: Thich Nhat Hanh emphasized <i>mindfulness</i> <footnote number="1">[1]</footnote>.
   Section: 
   <footnote-section>
       <footnote number="1"><i>Mindfulness</i>: Original term is "Chánh niệm" (Vietnamese).</footnote>
   </footnote-section>
"""

In [None]:
system_message_base_tranlate = """
You are Thich Nhat Hanh translating pages from an OCR scanned journal you edited in the 1950's. If words or text are garbled you may correct.
Give a precise English translation of the page in xml format. Strive for phrase-level accuracy. 
Add <section> <p> and <title> tags to any parts of the text which indicate sections paragraphs or titles.
"""

In [None]:
system_message_tagging = """
This page is from a Vietnamese Buddhist Journal published in the 1950's. Add XML tags where appropriate:
<section> for major sections.
<subsection> for subsections.
<p> for paragraphs.
<title> for titles of sections or subsections
<subtitle> for subtitles
<author> for authors
<heading> for any other headings in the text
<ol> <ul> <li> for lists
<contents> for tables of contents
<note> for notes
"""

### gpt-3.5-turbo test system message:

```
Give a precise English translation of this initially cleaned OCR text in the style of Thich Nhat Hanh. 
Strive for phrase-level accuracy.
```

In [None]:
user_message_string_translate = """{text}"""

In [None]:
def user_wrap_function_translate(text_block):
    """Return the user message for the translate/tagging prompt.

    `text_block` is substituted for the `{text}` placeholder of the
    module-level template `user_message_string_translate`.
    """
    wrapped_message = user_message_string_translate.format(text=text_block)
    return wrapped_message

In [None]:
# Build one message sequence per cleaned page.
# NOTE(review): despite the "translation" name, the system prompt passed
# here is `system_message_tagging`, not one of the translate prompts —
# confirm this is the intended prompt for this run.
translation_message_seq = generate_messages(system_message_tagging, user_wrap_function_translate, cleaned_pages)

In [None]:

# Rebinds `batch_file_path` (previously the cleaning batch file) to the
# translate batch file for the current `basename`.
batch_file_path = batch_job_dir / "journal_translate_batches" / f"translate_base_batch_{basename}.jsonl" 
batch_file_path  # last expression: displayed as the cell output

In [None]:
create_jsonl_file_for_batch(translation_message_seq, batch_file_path, model="gpt-3.5-turbo")

In [None]:
tx_batch = start_batch(batch_file_path)
tx_batch

In [None]:
tx_batch

In [None]:
set_api_client()

In [None]:
get_active_batches()

In [None]:
completed = get_completed_batches()
completed

In [None]:
# Fetch the responses of the first entry in `completed`.
# NOTE(review): assumes completed[0] is the batch started from
# `batch_file_path` above — confirm against the `completed` listing.
processed_data = get_batch_response(completed[0]['id'])

In [None]:
len(processed_data)

In [None]:
print(processed_data[12])

In [None]:
# Join the fetched batch responses into a single <document> string and
# report its token count. `processed_data` is the list returned by
# get_batch_response; the name `translated_data` used here before is never
# defined in this notebook and raised NameError on a fresh run.
full_translated_text = join_pages(processed_data)
token_count(full_translated_text)

In [None]:
# Output file for the rough test run. The file name is derived from
# `basename`, like the other paths in this notebook, so it follows the
# document being processed instead of repeating the issue name by hand.
tx_output_xml_path = journal_dir / basename / f"full_rough_tx_test_{basename}.xml"

In [None]:
# Write the fetched batch responses to `tx_output_xml_path`, replacing any
# existing file (overwrite=True). `processed_data` is the list returned by
# get_batch_response; `translated_data` is never defined in this notebook.
save_pages_to_xml(tx_output_xml_path, processed_data, overwrite=True)