In [1]:
from lxml import etree
import json

In [2]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

In [3]:
from data_processing.text_processing import write_text_to_file

In [4]:
from data_processing.gpt_processing import token_count, set_api_client, get_api_client
from data_processing.gpt_processing import generate_messages, run_immediate_chat_process, create_jsonl_file_for_batch


In [None]:
# Token budget per chunk. NOTE(review): no visible cell references this constant;
# the split calls further down pass literal limits (15000 and 11000) instead.
CHUNK_TOKEN_LIMIT = 15000

In [6]:
# Select the processed-journal data folder as the working directory.
# Presumably the relative paths given to get_text_from_file / write_text_to_file
# in later cells are resolved against it — TODO confirm in data_processing.text_processing.
set_working_directory("../../processed_journal_data")

In [32]:
# Load the full OCR output (XML) for journal issue 25-26.
raw_text_xml = get_text_from_file("phat-giao-viet-nam-1956-25-26/full_OCR_text_phat-giao-viet-nam-1956-25-26.xml")

In [21]:
# Token size of the raw OCR XML, for comparison with the chunk token limit.
token_count(raw_text_xml) 

14705

In [27]:
# Load the finished translation of issue 24. Only its token count is inspected
# (next cell); presumably it serves as a size reference — TODO confirm.
tx_comp = get_text_from_file("phat-giao-viet-nam-1956-24/journal_1956_24_translation_full.xml")

In [28]:
# Token size of the issue-24 translation.
token_count(tx_comp)

21557

Cleaning prompt drafts:

You are an intelligent, meticulous, and consistent world expert at cleaning OCR generated Vietnamese text. You are cleaning text from an old 1950's Buddhist Journal.  The text will be given in XML, and the output text should be matching XML. Your goal is to minimally modify the text to generate a cleaned version, removing extraneous text, incomplete words, and formatting markers, to get a clean text. You can use patterns in the text blocks (given in pages) to infer patterns in the text layout. You can use the semantic meaning of the text to infer corrections but strive to make minimal semantic changes. You can also add diacritical marks if they are missing or clearly inaccurate. It's important to note the degree of confidence you have in each cleaning step, and each cleaning step should be noted, with other possible interpretations listed.

You are an intelligent, meticulous, and consistent world expert at cleaning OCR-generated Vietnamese text. You are cleaning text from an old 1950's Buddhist Journal.  The text will be given in XML, and the output text should be in matching XML. Your goal is to minimally modify the text to generate a cleaned version, removing extraneous text, incomplete words, and formatting markers, to get a clean text. You can use patterns in the text blocks (given by page) to infer patterns in the text layout. You can use the semantic meaning of the text to infer corrections—but strive to make minimal semantic changes. You can also add diacritical marks if they are missing or clearly inaccurate. It's important to note the degree of confidence you have in each cleaning step, and each cleaning step should be noted as concisely as possible, also listing other possible interpretations of the text.

You are an intelligent, meticulous, and consistent world expert at cleaning OCR-generated Vietnamese text. You are cleaning text from a 1950's Buddhist Journal. The text will be given in XML, and the output text should be in matching XML. Your goal is to minimally modify the text to generate a cleaned version. Do not remove any text from the body of the text (unless it is clearly an extraneous mark). Any text that cannot be corrected from context or other means should still be included as bracketed [] text. However, formatting markers at the beginning and end of the text can be adjusted or removed as needed. You can use patterns in the text blocks (given by page) to infer patterns in the text layout. You can use the semantic meaning of the text to infer corrections—but strive to make no semantic changes. You can also add diacritical marks if they are missing or clearly inaccurate. It's important to note the degree of confidence you have in each cleaning step (high, medium or low). Each cleaning step should be noted as concisely as possible, also listing other possible interpretations of the text. This particular text has a publication marker "Phat Giao Viet Nam" and a publishing mark near the end of each page of text. The publishing mark is something like "Tu Vien HUE QUANG" and is very difficult for the OCR to process. Text corresponding to these marks can be omitted.

You are an intelligent, meticulous, and consistent world expert at cleaning OCR-generated Vietnamese text. You are cleaning text from a 1950's Buddhist Journal. The text will be given in XML, and the output text should be in matching XML. Your goal is to minimally modify the text to generate a cleaned version. Do not remove any text from the body of the text (unless it is clearly an extraneous mark). Any text that cannot be corrected from context or other means should still be included as bracketed [] text. However, formatting markers at the beginning and end of the text can be adjusted or removed as needed. You can use patterns in the text blocks (given by page) to infer patterns in the text layout. You can use the semantic meaning of the text to infer corrections—but strive to make no semantic changes. You can also add diacritical marks if they are missing or clearly inaccurate. It's important to note the degree of confidence you have in each cleaning step (high, medium or low). Each cleaning step should be noted as concisely as possible, also listing other possible interpretations of the text. This particular text has a title marker "Phat Giao Viet Nam" and a publishing mark near the end of each page of text. The publishing mark is something like "Tu Vien HUE QUANG" + "TRUNG TAM DICH THUAT HAN NOM" and is very difficult for the OCR to process. Text corresponding to these marks and page numbers can be omitted.

In [34]:
def split_xml_text(text, chunk_token_limit=10000):
    """
    Split an XML document into chunks of whole <page> elements, keeping each
    chunk within a token budget where possible.

    Pages are accumulated in document order. When adding the next page would
    push the running total past `chunk_token_limit`, the pages gathered so far
    are emitted as one chunk and the page starts a new one. Pages are never
    split, so a single page larger than the limit ends up in an oversized
    chunk of its own.

    Parameters:
    - text (str): The XML document as a string.
    - chunk_token_limit (int): The maximum number of tokens allowed per chunk,
      as measured by `token_count` on each serialized page.

    Returns:
    - List[str]: A list of XML strings, each wrapping a run of pages in a
      <document>...</document> element. The list is empty when `text` is not
      well-formed XML (the parse error is printed) or contains no <page>
      elements.
    """
    try:
        # lxml is given bytes rather than str
        root = etree.fromstring(text.encode("utf-8"))
    except etree.XMLSyntaxError as e:
        # Report where parsing failed, together with the offending source line
        line_number = e.lineno
        column_number = e.offset

        lines = text.splitlines()
        # Guard both ends of the range: a missing or zero line number must not
        # silently index the last line via lines[-1].
        if line_number is not None and 1 <= line_number <= len(lines):
            error_line = lines[line_number - 1]
        else:
            error_line = "Unknown line"

        print(f"XMLSyntaxError: {e}")
        print(f"Offending line {line_number}, column {column_number}: {error_line}")
        return []  # Nothing can be split from an unparseable document

    chunks = []
    current_chunk = []
    current_token_count = 0

    for page in root.findall(".//page"):
        # Serialize the page back to text so it can be measured and re-emitted
        page_text = etree.tostring(page, encoding="unicode")
        page_token_count = token_count(page_text)

        # Flush only when there is something to flush: if the very first page
        # is itself over the limit, flushing here would emit an empty
        # "<document></document>" chunk.
        if current_chunk and current_token_count + page_token_count > chunk_token_limit:
            chunks.append("<document>" + "".join(current_chunk) + "</document>")
            current_chunk = []
            current_token_count = 0

        current_chunk.append(page_text)
        current_token_count += page_token_count

    # Emit whatever pages remain after the loop
    if current_chunk:
        chunks.append("<document>" + "".join(current_chunk) + "</document>")

    return chunks

In [35]:
# Split the raw OCR XML into page-aligned chunks, using the notebook-level
# CHUNK_TOKEN_LIMIT (15000) instead of repeating the literal here.
chunks = split_xml_text(raw_text_xml, chunk_token_limit=CHUNK_TOKEN_LIMIT)

In [36]:
# Number of chunks produced by the split above.
len(chunks)

1

In [69]:
# System prompt for the OCR-cleaning pass: XML in, matching XML out, minimal edits,
# and the model is told to return only the corrected text with no commentary.
system_message_clean = """
You are an intelligent, meticulous, and consistent world expert at cleaning OCR-generated Vietnamese text. 
You are cleaning text from a 1950's Buddhist Journal. The text will be given in XML, and the output text should be in matching XML. 
Your goal is to minimally modify the text to generate a cleaned version. Do not remove any text from the main body of the text.  
However, formatting markers at the beginning and end of the text can be adjusted or removed as needed for clarity. 
You can use patterns in the text blocks (given by page) to infer patterns in the text.
You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate. 
Do not change any proper names, except to add missing diacritical marks if the context is clear.  
This particular text has a title marker: "Phat Giao Viet Nam," and also a publishing mark near the end of each page of text. 
The publishing mark is something like "Tu Vien HUE QUANG" + "TRUNG TAM DICH THUAT HAN NOM" and is very difficult for the OCR to process. 
Text corresponding to these marks (or part thereof) and page numbers can be omitted.
Output the corrected text only with no comments.
"""

In [38]:
# Token cost of the cleaning system prompt (it is sent with every chunk).
token_count(system_message_clean)

276

In [39]:
# Prompt size as a fraction of a 10000-token chunk (split_xml_text's default limit).
# NOTE(review): 306 is hard-coded and differs from the 276 reported by the cell
# above — presumably left over from an earlier prompt revision; confirm or recompute.
306 / 10000

0.0306

In [40]:
# Template for the user message of a cleaning request; {text} is filled with one chunk.
user_message_string_clean = """Clean this text: {text}"""

In [41]:
def user_wrap_function_clean(text_block):
    """
    Build the user message for one cleaning request.

    Parameters:
    - text_block: The chunk of text to be cleaned; substituted for the
      {text} placeholder of `user_message_string_clean`.

    Returns:
    - str: The filled-in user message.
    """
    user_message = user_message_string_clean.format(text=text_block)
    return user_message

In [44]:
# Build the cleaning requests from the system prompt, the user-message wrapper and the chunks
# (presumably one message set per chunk — TODO confirm in data_processing.gpt_processing).
message_sequence_clean = generate_messages(system_message_clean, user_wrap_function_clean, chunks)

In [46]:
# Initialise the API client used by the chat helpers; the recorded output shows an openai.OpenAI instance.
set_api_client()

<openai.OpenAI at 0x12662ee90>

In [68]:
# System prompt for the translation pass: Vietnamese -> English, XML output with
# added structural tags, and per-section <footnotes> blocks in the format shown below.
system_message_translate = """
You are translating as Thich Nhat Hanh from Vietnamese to English for your experienced students. 
The text is from 1956-1958. Use the title: "Vietnamese Buddhist Journal." 
You want experienced students to understand the material and its historical and cultural context—in particular, as it relates to the life and teaching of Thich Nhat Hanh.

- Give a full translation that flows well in English in the style of Thich Nhat Hanh.
- Format the text clearly and cleanly.
- Add additional standard XML metadata tags that will improve the readability and clarity of the content.
- Example tags: <p> <section> <title> <contents> <author> <li> <i> <b> etc.
- Titles and XML sections should also be translated.
- Do not make comments within the text itself. 
- Leave all names of people in original Vietnamese with diacritics. 
- Each section of the text should have its own dedicated `<footnotes>` section.
- Use Footnotes liberally to explain elements of Vietnamese Buddhism or Buddhism in general, elements of Vietnamese culture and history, or elements relative to the life, teachings, and practice of Thich Nhat Hanh.
- When translating complex terms (Sanskrit, Sino-Vietnamese, or French) mark these with <i></i> tags and give an explanation in the footnotes.  
- For any terms footnoted, give the original Vietnamese, Sino-Vietnamese, Sanskrit, or French in the footnote. 
- If a paragraph logically spans two pages, it can be moved to the most logical page to allow translation to flow seamlessly.
- Due to the size of the journal, you may be translating a piece of the document in the middle which will be split from its preceding pages or sections.

Footnote Formatting:
Inline Footnote Markers:
   - In the main text, footnotes should be marked numerically in brackets, such as “[1]”.
   - Enclose each inline footnote marker within a `<footnote number="n">` tag, where `n` is the footnote number.
   - Example of a footnoted paragraph:
     `<p>This paragraph has a footnote. This is a sentence in the paragraph.</p><footnote number="1">[1]</footnote>`

Footnote Section:
   - At the end of each logical section, include a `<footnotes>` section where explanations for each footnote of that section are listed.
   - Each footnote in the `<footnotes>` section should use a `<footnote number="n">` tag, with `n` matching the inline reference.

Numbering:
   - Footnote numbers should restart at 1 for each new section of text.

Here's an example of text containing two footnotes, followed by its `<footnotes>`:

`<p>This is the first sentence with a footnote.<footnote number="1">[1]</footnote> Here is another statement that needs a footnote.<footnote number="2">[2]</footnote></p>`

`<footnotes>`
    `<footnote number="1">This is the explanation of the first footnote</footnote>`
    `<footnote number="2">This is the explanation of the second footnote</footnote>`
`</footnotes>` 
"""

In [49]:
# Template for the user message of a translation request: the text itself, with no added instruction.
user_message_string_translate = """{text}"""

In [50]:
def user_wrap_function_translate(text_block):
    """
    Build the user message for one translation request.

    Parameters:
    - text_block: The text to translate; substituted for the {text}
      placeholder of `user_message_string_translate`.

    Returns:
    - str: The filled-in user message.
    """
    user_message = user_message_string_translate.format(text=text_block)
    return user_message

### Set up a batch process for all cleaning:

### Clean the chunks:

In [None]:
# Re-split the raw OCR XML with an 11000-token budget per chunk, then send each
# chunk through the cleaning prompt and collect the model's cleaned XML in order.
chunks = split_xml_text(raw_text_xml, chunk_token_limit=11000)
clean_message_seq = generate_messages(system_message_clean, user_wrap_function_clean, chunks)
cleaned_data = []
translated_chunks = []  # not used by any visible cell; kept so existing state is unchanged
for i, msgs in enumerate(clean_message_seq):
    print(f"cleaning chunk: {i}")
    completion_clean = run_immediate_chat_process(msgs)
    if completion_clean:
        clean_response = completion_clean.choices[0].message.content
        cleaned_data.append(clean_response)
    else:
        # Report the failure before leaving the loop; with the print placed
        # after `break` it could never run and a failed chunk went unreported.
        print("failed.")
        break



cleaning chunk: 0
cleaning chunk: 1


In [53]:
# Number of chunks produced by the 11000-token split.
len(chunks)

2

In [55]:
# Concatenate the cleaned chunks in order, one per line.
# NOTE(review): each input chunk was wrapped in its own <document> element, so with
# more than one chunk the joined text presumably contains several root elements and
# would not parse as a single XML document — confirm before re-parsing this file.
full_cleaned_text = "\n".join(cleaned_data)

In [56]:
# Eyeball the cleaned result (prints the entire cleaned text).
print(full_cleaned_text)

<document>
  <page page="1">
    PHẬT GIÁO VIỆT-NAM NGUYỆT-SAN 
    SỐ 25 VÀ 26 ĐẶC SAN KỶ NIỆM MỞ ĐẦU NĂM THỨ BA 
  </page>
  <page page="2">
    MỤC LỤC 
    Phật-giáo Việt Nam bước sang năm thứ ba 
    Chân lý không nằm trong văn tự, danh ngôn 
    Ngài Huyền-Trang đi Ấn Độ thỉnh Kinh (tiếp theo) 
    Bản-ngã là gì? 
    Vì sao cần thống nhất Phật-giáo Việt Nam 
    Cuộc phỏng vấn của Nguyệt-san P.G.V.N. về vấn đề thống nhất 
    Kế hoạch thống nhất nghi lễ 
    Chia gia-lài (Mẩu chuyện Đạo) 
    Tài liệu Gia đình Phật-tử (tiếp theo) 
    Cần hiểu đúng đắn ý-nghĩa Gia đình Phật tử 
    Hai ngọn bền (Truyện ngắn) 
    Thích Đức-Nhuận 
    Thích Tâm Châu 
    Thích Tâm-Thọ 
    Nguyễn-Khắc-Từ 
    Hùng-Khanh 
    Minh-Hữu 
    Đạo Phật tại Mỹ 
    Viên-Minh 
    Cội Phúc (Truyện ngắn được giải khuyến khích) 
    Lê-Văn 
  </page>
  <page page="4">
    PHẬT-GIÁO VIỆT-NAM BƯỚC SANG NĂM THỨ BA 
    Với số 25 và 26 Phật giáo Việt Nam Nguyệt san bước sang năm thứ ba 
    Trước khi bước nhữ

In [57]:
# Token size of the cleaned text, i.e. the size of the upcoming translation input.
token_count(full_cleaned_text)

12714

In [59]:
# Persist the cleaned XML inside this issue's folder.
write_text_to_file("phat-giao-viet-nam-1956-25-26/full_cleaned_text.xml", full_cleaned_text)

In [60]:
# Re-read the cleaned XML from disk as the input for translation.
to_tx_text = get_text_from_file("phat-giao-viet-nam-1956-25-26/full_cleaned_text.xml")

In [62]:
# Inspect the text that will be sent for translation. The cell above binds it to
# `to_tx_text`; `input_text` is not defined anywhere in this notebook and would
# raise NameError on a fresh kernel.
to_tx_text

'<document>\n  <page page="1">\n    PHẬT GIÁO VIỆT-NAM NGUYỆT-SAN \n    SỐ 25 VÀ 26 ĐẶC SAN KỶ NIỆM MỞ ĐẦU NĂM THỨ BA \n  </page>\n  <page page="2">\n    MỤC LỤC \n    Phật-giáo Việt Nam bước sang năm thứ ba \n    Chân lý không nằm trong văn tự, danh ngôn \n    Ngài Huyền-Trang đi Ấn Độ thỉnh Kinh (tiếp theo) \n    Bản-ngã là gì? \n    Vì sao cần thống nhất Phật-giáo Việt Nam \n    Cuộc phỏng vấn của Nguyệt-san P.G.V.N. về vấn đề thống nhất \n    Kế hoạch thống nhất nghi lễ \n    Chia gia-lài (Mẩu chuyện Đạo) \n    Tài liệu Gia đình Phật-tử (tiếp theo) \n    Cần hiểu đúng đắn ý-nghĩa Gia đình Phật tử \n    Hai ngọn bền (Truyện ngắn) \n    Thích Đức-Nhuận \n    Thích Tâm Châu \n    Thích Tâm-Thọ \n    Nguyễn-Khắc-Từ \n    Hùng-Khanh \n    Minh-Hữu \n    Đạo Phật tại Mỹ \n    Viên-Minh \n    Cội Phúc (Truyện ngắn được giải khuyến khích) \n    Lê-Văn \n  </page>\n  <page page="4">\n    PHẬT-GIÁO VIỆT-NAM BƯỚC SANG NĂM THỨ BA \n    Với số 25 và 26 Phật giáo Việt Nam Nguyệt san bước sang nă

In [64]:
# Translate the cleaned text. It is submitted as a single chunk; the loop stops
# at the first request that returns no completion.
translation_data = []
input_text_chunks = [to_tx_text]
translation_message_seq = generate_messages(
    system_message_translate,
    user_wrap_function_translate,
    input_text_chunks,
)
for i, tx_msgs in enumerate(translation_message_seq):
    print(f"translating chunk: {i}")
    completion_tx = run_immediate_chat_process(tx_msgs)
    if not completion_tx:
        # No completion came back: report it and give up on the remaining chunks
        print("failed.")
        break
    tx_response = completion_tx.choices[0].message.content
    translation_data.append(tx_response)

translating chunk: 0


In [65]:
# Eyeball the translation of the first (and here only) chunk.
print(translation_data[0])

<document>
  <page page="1">
    <title>Journal of Buddhism in Vietnam</title>
    <contents>Số 25 và 26 - Special Edition to Commence the Third Year</contents>
    <author>Thích Nhất Hạnh</author>
  </page>
  <page page="2">
    <title>Table of Contents</title>
    <p>The Vietnamese Buddhism enters its third year</p>
    <p>Truth does not lie in language and words</p>
    <p>Venerable Huyền Trang's journey to India to seek scriptures (continued)</p>
    <p>What is the self?</p>
    <p>Why it's necessary to unify Vietnamese Buddhism</p>
    <p>The Vietnamese Buddhism Monthly's interview on the issue of unification</p>
    <p>Plan to unify rituals</p>
    <p><i>Chia gia-lài</i> (A Dharma story)</p><footnote number="1">[1]</footnote>
    <p>Documents on the Buddhist Family (continued)</p>
    <p>Proper understanding of the significance of the Buddhist Family</p>
    <p>Two enduring peaks (A short story)</p>
    <p>Thích Đức-Nhuận</p>
    <p>Thích Tâm Châu</p>
    <p>Thích Tâm-Thọ</p>
   

In [66]:
# Concatenate the translated chunks in order, one per line.
full_translated_text = "\n".join(translation_data)

In [67]:
# Save the translation inside this issue's folder, alongside the OCR and cleaned
# files and matching the layout used for issue 24
# ("phat-giao-viet-nam-1956-24/journal_1956_24_translation_full.xml").
write_text_to_file("phat-giao-viet-nam-1956-25-26/journal_1956_25_26_translation_full.xml", full_translated_text)

In [None]:
# response_schema = {
#   "name": "text_cleaning",
#   "strict": True,
#   "schema": {
#     "type": "object",
#     "properties": {
#       "cleaned_text": {
#         "type": "string",
#         "description": "The resulting cleaned text after applying all cleaning steps."
#       },
#       "cleaning_steps": {
#         "type": "array",
#         "description": "A series of notes that explain each step taken to clean the input text.",
#         "items": {
#           "type": "object",
#           "properties": {
#             "step_description": {
#               "type": "string",
#               "description": "A detailed description of the cleaning step."
#             },
#             "text_example": {
#               "type": "string",
#               "description": "an example fragment of text that was corrected."
#             },
#             "confidence_level": {
#               "type": "integer",
#               "description": "3 for high confidence, 2 for medium, 1 for low confidence corrections."
#             }
#           },
#           "required": [
#             "step_description",
#             "text_example",
#             "confidence_level"
#           ],
#           "additionalProperties": False
#         }
#       }
#     },
#     "required": [
#       "cleaned_text",
#       "cleaning_steps"
#     ],
#     "additionalProperties": False
#   }
# }