In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from lxml import etree
%aimport json
from pathlib import Path
%aimport re
from typing import List, Dict
%aimport os
from xml.sax.saxutils import escape

In [None]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

In [None]:
from data_processing.text_processing import write_text_to_file

In [None]:
from data_processing.gpt_processing import token_count, set_api_client, get_api_client, get_active_batches
from data_processing.gpt_processing import generate_messages, run_immediate_chat_process, create_jsonl_file_for_batch, start_batch
from data_processing.gpt_processing import get_completed_batches, get_batch_response


In [None]:
# Template for the user message of a cleaning request: the page text is
# substituted for {text} with nothing added around it.
user_message_string_clean = """{text}"""

In [None]:
def user_wrap_function_clean(text_block):
    """Build the user message for one cleaning request.

    Fills the ``{text}`` placeholder of the module-level
    ``user_message_string_clean`` template with ``text_block`` and returns
    the resulting string.
    """
    template = user_message_string_clean
    user_message = template.format(text=text_block)
    return user_message

In [None]:
# System prompt for the OCR-cleaning pass. It is sent together with one page
# of text per request (see the batch-generation cell that calls
# generate_messages with this prompt). The text is part of the program's
# behavior: edit it deliberately, not cosmetically.
system_message_clean = """
- You are a meticulous and consistent world expert at cleaning OCR-generated Vietnamese text. 
- You are cleaning text from a 1950's Buddhist Journal. 
IMPORTANT: You will receive one page of text listed by lines: do not modify the line structure.
- You will minimally modify the text to fix any OCR errors.
- You will add <section>, <title>, <author>, <p>, <li>, <ul>, <ol> tags (only) where appropriate in text, based on text layout and content. 
- Do not remove any content from the main body of the text.   
- Use patterns in the text blocks (given by page) to infer patterns in the text.
- Use the semantic meaning of the text to infer corrections—but make no semantic changes. 
- Add diacritical marks if they are missing or clearly inaccurate. 
- Do not change any proper names, except to add missing diacritical marks if the context is clear.  
IMPORTANT: Output the corrected text only with no comments or additional formatting marks of any kind.
"""

### testing gpt-4o-mini system message

You are an intelligent, meticulous, and consistent expert at cleaning OCR-generated Vietnamese text. 
The text will be given in XML, and the output text should be in matching XML. 
Your goal is to minimally modify the text to generate a cleaned version. 
Do not remove any text from the main content.  
Formatting markers at the beginning and end of the text can be adjusted or removed as needed for clarity. 
You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate and can be determined by context.
Do not change any proper names, except to add missing diacritical marks if the context is clear.  
This particular text has a title marker: "Phat Giao Viet Nam," and also a publishing mark near the end of each page of text. 
The publishing mark is something like "Tu Vien HUE QUANG" + "TRUNG TAM DICH THUAT HAN NOM" and is often incomplete.
Text corresponding to these marks (or part thereof) and page numbers can be omitted.
Output the corrected text only with no comments.

### Process files in sequence and generate cleaning batch JSONL files:

In [None]:
def split_xml_pages(text):
    """
    Splits an XML document into its individual <page> elements.

    Each page is returned as serialized XML, so every string INCLUDES the
    enclosing <page ...> tags of that page.

    NOTE: a later cell redefines ``split_xml_pages`` with an optional
    ``page_groups`` parameter; when the notebook is run top to bottom, that
    definition replaces this one.

    Parameters:
    - text (str): The XML document as a string.

    Returns:
    - List[str]: One serialized <page> element per entry, in the order
      returned by ``root.findall(".//page")``. An empty list is returned
      (after printing a diagnostic) when the document cannot be parsed.
    """
    try:
        # Parse from UTF-8 bytes rather than from the str itself: lxml rejects
        # str input that carries an XML encoding declaration.
        root = etree.fromstring(text.encode("utf-8"))
    except etree.XMLSyntaxError as e:
        # Report where the parser stopped, plus the offending source line.
        line_number = e.lineno
        column_number = e.offset

        # Only index into the text when the reported line really exists; a
        # missing (None) or zero line number must not pick an arbitrary line.
        lines = text.splitlines()
        if line_number and 0 < line_number <= len(lines):
            error_line = lines[line_number - 1]
        else:
            error_line = "Unknown line"

        print(f"XMLSyntaxError: {e}")
        print(f"Offending line {line_number}, column {column_number}: {error_line}")
        return []  # Best effort: callers receive "no pages" instead of an exception

    # Serialize every <page> element found anywhere below the root.
    return [etree.tostring(page, encoding="unicode") for page in root.findall(".//page")]

In [None]:
def split_xml_pages(text, page_groups=None):
    """
    Splits an XML document into individual pages based on <page> tags.
    Optionally groups pages together based on page_groups.

    Pages are ordered by the integer value of their ``page`` attribute, and
    each page is returned as serialized XML including its own <page ...> tags.

    Parameters:
    - text (str): The XML document as a string.
    - page_groups (list of tuples, optional): A list of tuples defining page ranges to group together.
                                              Each tuple is of the form (start_page, end_page), inclusive.
                                              Groups are emitted in the order given; a range that
                                              matches no page is skipped.

    Returns:
    - List[str]: A list of strings, where each element is a single page (if no groups) or the
      concatenation, in page order, of all pages in one group. An empty list is returned
      (after printing a diagnostic) when the document cannot be parsed.

    Raises:
    - TypeError: if a <page> element has no ``page`` attribute.
    - ValueError: if a ``page`` attribute is not an integer.
    """
    from lxml import etree

    try:
        # Parse from UTF-8 bytes rather than from the str itself: lxml rejects
        # str input that carries an XML encoding declaration.
        root = etree.fromstring(text.encode("utf-8"))
    except etree.XMLSyntaxError as e:
        # Handle parsing errors with helpful debugging information
        line_number = e.lineno
        column_number = e.offset

        # Only index into the text when the reported line really exists; a
        # missing (None) or zero line number must not pick an arbitrary line.
        lines = text.splitlines()
        if line_number and 0 < line_number <= len(lines):
            error_line = lines[line_number - 1]
        else:
            error_line = "Unknown line"

        print(f"XMLSyntaxError: {e}")
        print(f"Offending line {line_number}, column {column_number}: {error_line}")
        return []  # Return an empty list if parsing fails

    # Pair every page's number with its serialized XML.
    pages = [
        (int(page.get("page")), etree.tostring(page, encoding="unicode"))
        for page in root.findall(".//page")
    ]

    # Sort pages by page number (stable, so equal numbers keep document order)
    pages.sort(key=lambda x: x[0])

    # If no page_groups, return individual pages
    if not page_groups:
        return [content for _, content in pages]

    # Build each group with a single join instead of repeated string +=.
    grouped_pages = []
    for start, end in page_groups:
        group_content = "".join(
            content for page_num, content in pages if start <= page_num <= end
        )
        if group_content:
            grouped_pages.append(group_content)

    return grouped_pages

In [None]:
# Directory scanned for per-journal sub-directories of OCR output.
processed_journals = "../../processed_journal_data"
# Path prefix for the generated cleaning batch files. The "ouput" spelling is
# part of the variable name used by later cells, so it is kept as is.
batch_ouput_prefix = "./journal_cleaning_batches/"

In [None]:
ocr_xml = get_text_from_file("phat-giao-viet-nam-1956-28/full_OCR_text_phat-giao-viet-nam-1956-28.xml", processed_journals)

In [None]:
pages = split_xml_pages(ocr_xml)

In [None]:
pages

In [None]:
print(system_message_clean)

In [None]:
# Generate the cleaning batch files: for every journal directory, each
# full_OCR_*.xml file is split into pages and turned into one batch file
# named clean_batch_<journal>.jsonl.

processed_journals = "../../processed_journal_data"
batch_ouput_prefix = "./journal_cleaning_batches/"

# Compiled once, outside the loops. NOTE(review): the pattern is not anchored
# at the end, so names such as "full_OCR_x.xml.bak" also match — confirm that
# is intended.
regex = re.compile(r"^full_OCR_.*\.xml")

# sorted() makes the processing order (and the log output) reproducible;
# Path.iterdir() yields entries in arbitrary order.
for path in sorted(Path(processed_journals).iterdir()):
    if not path.is_dir():
        continue
    journal_name = path.name
    for subpath in sorted(path.iterdir()):
        if not (subpath.is_file() and regex.search(subpath.name)):
            continue
        print(subpath.name)
        try:
            ocr_text = get_text_from_file(subpath.name, path)
            print(f"{ocr_text[:90]}...")
            chunks = split_xml_pages(ocr_text)
            clean_message_seq = generate_messages(system_message_clean, user_wrap_function_clean, chunks)
            # NOTE(review): the output name depends only on the journal
            # directory, so several matching files in one directory write to
            # the same batch file.
            create_jsonl_file_for_batch(clean_message_seq, batch_ouput_prefix + "clean_batch_" + journal_name + ".jsonl")
        except Exception as e:
            # Deliberate best effort: report the failure and move on to the
            # next file so one bad journal does not stop the whole run.
            print(f"{e}\nfailed.. \nContinuing.")
                    

In [None]:
batch_client = set_api_client()

In [None]:
batch_job_dir = "./journal_cleaning_batches"
batch_files = os.listdir(batch_job_dir)
batch_files


In [None]:
file_path = os.path.join(batch_job_dir, 'clean_batch_phat-giao-viet-nam-1956-28.jsonl')
file_path

## completed batches:
10, 25-26

In [None]:
batch28 = start_batch(file_path)

In [None]:
# file_path

In [None]:
# batch_27 = start_batch(file_path)

In [None]:
# batch_05_06 = start_batch(file_path)

In [None]:
get_active_batches()

In [None]:
completed = get_completed_batches()

In [None]:
completed

In [None]:
cleaned_data = get_batch_response(completed[0]['id'])

In [None]:
len(cleaned_data)

In [None]:
print(cleaned_data[3])

In [None]:
# deprecated: <page> markers now stay in the cleaned text.
# def join_pages(data):
#     result = ["<document>"]
#     for i, page in enumerate(data):
#         result.append(f"<page page=\"{i+1}\">\n")
#         result.append(escape(page))
#         result.append("\n</page>\n")
#     result.append("</document>")

#     return "".join(result)

def join_pages(data):
    """Wrap page strings in a single <document> root element.

    Parameters:
    - data: An iterable of strings (list, tuple, generator, ...), one per
      page or section. The strings are used as given; nothing is escaped.

    Returns:
    - str: "<document>", every item of ``data`` and "</document>", joined
      with newlines.

    Raises:
    - TypeError: if ``data`` is a single string (it would otherwise be split
      into characters) or contains non-string items.
    """
    if isinstance(data, str):
        raise TypeError("join_pages expects an iterable of strings, not a single str")

    return "\n".join(["<document>", *data, "</document>"])


In [None]:
cleaned_data

In [None]:
full_cleaned_text = join_pages(cleaned_data)

In [None]:
print(full_cleaned_text)

In [None]:
#write_text_to_file("full_cleaned_journal_28.xml", full_cleaned_text)

In [None]:
full_cleaned_text = get_text_from_file("full_cleaned_journal_28.xml")

In [None]:
token_count(full_cleaned_text)

In [None]:
# Regroup the cleaned pages into sections using inclusive
# (start_page, end_page) ranges; each resulting string is later passed to
# generate_messages as one unit for translation.
cleaned_sections = split_xml_pages(full_cleaned_text, page_groups=[(1, 6), (7, 17),(18, 25), (26, 30), (31, 36), (37, 37), (38, 44), (45, 51)])

In [None]:
[token_count(x) for x in cleaned_sections]

In [None]:
[type(x) for x in cleaned_sections]

In [None]:
print(cleaned_sections[0])

In [None]:
# System prompt for the translation pass; it is sent with one section of
# cleaned pages per request. The XML example near the end must itself be
# well-formed (every <p> closed, no stray backticks), because the model is
# asked to imitate it and the translated output is handled as XML afterwards.
system_message_translate = """
You are translating as Thich Nhat Hanh from Vietnamese to English for your experienced students. 
The text is from 1956-1958. Use the title: "Vietnamese Buddhist Journal." 
You want experienced students to understand the material and its historical and cultural context—in particular, as it relates to the life and teaching of Thich Nhat Hanh.

- You will receive 1 section of the journal at a time. 
- Give a full English translation in the style of Thich Nhat Hanh. Precision and clarity are important.
- Format the text neatly.
- Keep pages together: each translated page must match it's original page source as pages will be studied side by side with the original Vietnamese.
- Exception: you can move a sentence if it is split between two pages.
- Pages will already have metadata tags that were generated page by page.
- Edit and adjust XML tags as needed to join page information and create clarity.
- Add additional XML tags for clarity.
- All titles and XML sections should also be translated.
- Do not make comments within the text itself. 
- Leave all names of people in original Vietnamese with diacritics. 
- Each section of the text should have its own dedicated `<footnotes>` section.
- Use footnotes generously to explain elements of Vietnamese Buddhism or Buddhism in general, elements of Vietnamese culture and history, or elements relative to the life, teachings, and practice of Thich Nhat Hanh.
- When translating complex terms (Sanskrit, Sino-Vietnamese, or French) mark these with <i></i> tags and give an explanation in the footnotes.  
- For any terms footnoted, give the original Vietnamese, Sino-Vietnamese, Sanskrit, or French in the footnote. 

Footnote Formatting:
Inline Footnote Markers:
   - In the main text, footnotes should be marked numerically in brackets, such as "[1]".
   - Enclose each inline footnote marker within a `<footnote number="n">` tag, where `n` is the footnote number.
   - Example of a footnoted paragraph:
     `<p>This paragraph has a footnote. This is a sentence in the paragraph.</p><footnote number="1">[1]</footnote>`

Footnote Section:
   - If a page has footnotes then it should have a `<footnotes>` section at the end of the page with the information for each footnote.
   - Each footnote in the `<footnotes>` section should use a `<footnote number="n">` tag, with `n` matching the inline reference.

Numbering:
   - Footnote numbers should restart at 1 for each new page of text.

Here's an example of a page containing two footnotes:

```
   <page> 
      <p>This is sentence has a footnote.<footnote number="1">[1]</footnote></p>
      <p>Here is another statement that needs a footnote.<footnote number="2">[2]</footnote></p>
      <p>The rest of the page follows ... </p>   
      ...
      <footnotes>
         <footnote number="1">This is the information for the first footnote</footnote>
         <footnote number="2">This is the information for the second footnote</footnote>
      </footnotes>
   </page>
```
"""

In [None]:
token_count(system_message_translate)

In [None]:
# Template for the user message of a translation request: the section text is
# substituted for {text} with nothing added around it.
user_message_string_translate = """{text}"""

In [None]:
def user_wrap_function_translate(text_block):
    """Build the user message for one translation request.

    Fills the ``{text}`` placeholder of the module-level
    ``user_message_string_translate`` template with ``text_block`` and
    returns the resulting string.
    """
    template = user_message_string_translate
    user_message = template.format(text=text_block)
    return user_message

In [None]:
translation_message_seq = generate_messages(system_message_translate, user_wrap_function_translate, cleaned_sections)

In [None]:
# Settings for the translation batch of a single journal issue.
processed_journals = "../../processed_journal_data"
# Path prefix for the translation batch file. The "ouput" spelling is part of
# the variable name used by the following cell, so it is kept as is.
batch_ouput_prefix = "./journal_translate_batches/"
# Journal identifier used to build the batch file name.
journal_name = "phat-giao-viet-nam-1956-28"

In [None]:
create_jsonl_file_for_batch(translation_message_seq, batch_ouput_prefix + "translate_batch_" + journal_name + ".jsonl")

In [None]:
batch_job_dir = "./journal_translate_batches"
batch_path = os.path.join(batch_job_dir, 'translate_batch_phat-giao-viet-nam-1956-28.jsonl')
batch_path

In [None]:
tx_batch = start_batch(batch_path)
tx_batch

In [None]:
get_active_batches()

In [None]:
completed = get_completed_batches()

In [None]:
completed

In [None]:
translated_data = get_batch_response(completed[0]['id'])

In [None]:
len(translated_data)

In [None]:
full_translated_text = join_pages(translated_data)

In [None]:
print(full_translated_text)

In [None]:
token_count(full_translated_text)

In [None]:
write_text_to_file("full_tx_phat-giao-viet-nam-1956-28.xml", full_translated_text)