In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from lxml import etree
%aimport json
from pathlib import Path
%aimport re
from typing import List, Dict
%aimport os

In [3]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

In [4]:
from data_processing.text_processing import write_text_to_file

In [42]:
from data_processing.gpt_processing import token_count, set_api_client, get_api_client, get_active_batches
from data_processing.gpt_processing import generate_messages, run_immediate_chat_process, create_jsonl_file_for_batch, start_batch
from data_processing.gpt_processing import get_completed_batches, get_batch_response


In [6]:
CHUNK_TOKEN_LIMIT = 15000

In [7]:
def split_xml_text(text, chunk_token_limit=CHUNK_TOKEN_LIMIT):
    """
    Splits an XML document into chunks based on <page> tags, ensuring that no chunk
    exceeds the specified token limit.
    
    Parameters:
    - text (str): The XML document as a string.
    - chunk_token_limit (int): The maximum number of tokens allowed per chunk.
    
    Returns:
    - List[str]: A list of XML strings, each representing a chunk of pages.
    """
    
    # Convert text to bytes
    text_bytes = text.encode("utf-8")
    
    try:
        # Parse the XML text into an element tree
        root = etree.fromstring(text_bytes)
    except etree.XMLSyntaxError as e:
        # Extract the line number and column number of the error
        line_number = e.lineno
        column_number = e.offset
        
        # Find the offending line in the original text
        lines = text.splitlines()
        error_line = lines[line_number - 1] if line_number - 1 < len(lines) else "Unknown line"
        
        print(f"XMLSyntaxError: {e}")
        print(f"Offending line {line_number}, column {column_number}: {error_line}")
        return []  # Return an empty list or handle the error as needed
    
    # Initialize variables for splitting the document
    chunks = []
    current_chunk = []
    current_token_count = 0
    
    # Iterate over each <page> element in the document
    for page in root.findall(".//page"):
        # Convert the current page element back to a string for token counting
        page_text = etree.tostring(page, encoding="unicode")
        
        # Count the tokens in the page
        page_token_count = token_count(page_text)
        
        # Check if adding this page would exceed the token limit for the current chunk
        if current_token_count + page_token_count > chunk_token_limit:
            # If so, finalize the current chunk and start a new one
            chunks.append("<document>" + "".join(current_chunk) + "</document>")
            current_chunk = []
            current_token_count = 0
            
        # Add the current page to the chunk
        current_chunk.append(page_text)
        current_token_count += page_token_count
    
    # Add the last chunk if there are remaining pages
    if current_chunk:
        chunks.append("<document>" + "".join(current_chunk) + "</document>")
    
    return chunks

In [8]:
user_message_string_clean = """Clean this text: {text}"""

In [9]:
def user_wrap_function_clean(text_block):
    return user_message_string_clean.format(text=text_block)

In [None]:
system_message_clean = """
You are a meticulous and consistent world expert at cleaning OCR-generated Vietnamese text. 
You are cleaning text from a 1950's Buddhist Journal. 
The text will be given in XML, and the output text should be in matching XML. 
Your goal is to minimally modify the text to generate a cleaned version.
IMPORTANT: Do not move any text from one page to another.  
Do not remove any text from the main body of the text. 
Formatting markers (such as footers) at the end of the text can be adjusted or removed as needed for clarity. 
You can use patterns in the text blocks (given by page) to infer patterns in the text.
You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate. 
Do not change any proper names, except to add missing diacritical marks if the context is clear.  
This particular text has a title marker: "Phat Giao Viet Nam," and also a publishing mark near the end of each page of text. 
The publishing mark is something like "Tu Vien HUE QUANG" + "TRUNG TAM DICH THUAT HAN NOM" and is very difficult for the OCR to process. 
Text corresponding to these marks (or part thereof) and page numbers can be omitted.
Output the corrected text only with no comments.
"""

### testing gpt-4o-mini system message

You are an intelligent, meticulous, and consistent expert at cleaning OCR-generated Vietnamese text. 
The text will be given in XML, and the output text should be in matching XML. 
Your goal is to minimally modify the text to generate a cleaned version. 
Do not remove any text from the main content.  
Formatting markers at the beginning and end of the text can be adjusted or removed as needed for clarity. 
You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate and can be determined by context.
Do not change any proper names, except to add missing diacritical marks if the context is clear.  
This particular text has a title marker: "Phat Giao Viet Nam," and also a publishing mark near the end of each page of text. 
The publishing mark is something like "Tu Vien HUE QUANG" + "TRUNG TAM DICH THUAT HAN NOM" and is often incomplete.
Text corresponding to these marks (or part thereof) and page numbers can be omitted.
Output the corrected text only with no comments.

In [11]:
# from unittest.mock import Mock
# from datetime import datetime

# # Create a mock client with files.create and batches.create methods
# mock_client = Mock()

# # Mock the behavior of files.create to return a mock file object with an id
# mock_client.files.create.return_value.id = "mock_file_id"

# # Mock the behavior of batches.create to return a dictionary as if it were a batch object
# mock_client.batches.create.return_value = {"id": "mock_batch_id", "status": "created"}

# # Test the function
# jsonl_file = "journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-01.jsonl"
# result = start_batch(mock_client, jsonl_file)

# print(result)  # Expected output: {'id': 'mock_batch_id', 'status': 'created'}

### Testing:

In [12]:
# batch = start_batch(my_client, "journal_cleaning_batches/test_batch.jsonl")

In [13]:
# batch = start_batch(my_client, "journal_cleaning_batches/TEST_clean_batch_phat-giao-viet-nam-1956-01.jsonl")

### Process files in sequence and generate cleaning batch JSON files:

In [14]:

# processed_journals = "../../processed_journal_data"
# batch_ouput_prefix = "./journal_cleaning_batches/"

# for path in Path(processed_journals).iterdir():
#     if path.is_dir():
#         journal_name = path.name
#         for subpath in Path(path).iterdir():
#             regex = re.compile(r"^full_OCR_.*\.xml")
#             if subpath.is_file() and regex.search(subpath.name):
#                 print(subpath.name)
#                 try:
#                     ocr_text = get_text_from_file(subpath.name, path)
#                     print(f"{ocr_text[:90]}...")
#                     chunks = split_xml_text(ocr_text)
#                     clean_message_seq = generate_messages(system_message_clean, user_wrap_function_clean, chunks)
#                     create_jsonl_file_for_batch(clean_message_seq, batch_ouput_prefix + "clean_batch_" + journal_name + ".jsonl")
#                 except Exception as e:
#                     print(f"{e}\nfailed.. \nContinuing.")
                    

### Send batches to API client:

In [15]:
batch_client = set_api_client()

In [16]:
batch_job_dir = "./journal_cleaning_batches"
batch_files = os.listdir(batch_job_dir)
batch_files


['clean_batch_phat-giao-viet-nam-1956-12.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-14.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-09-10.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-16.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-11.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-25-26.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-13.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-17-18.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-15.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-28.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-24.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-19.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-02.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-22.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-04.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-03.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-27.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-01.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-05-06.jsonl',
 'clean_batch_phat-giao-viet-nam-1956-23.jsonl',
 'clean_

In [17]:
file_path = os.path.join(batch_job_dir, 'clean_batch_phat-giao-viet-nam-1956-05-06.jsonl')
file_path

'./journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-05-06.jsonl'

## completed batches:
10, 25-26

In [18]:
# start_batch(file_path)

In [19]:
# file_path

In [20]:
# batch_27 = start_batch(file_path)

In [21]:
# batch_05_06 = start_batch(file_path)

In [34]:
set_api_client()

<openai.OpenAI at 0x11b253690>

In [38]:
get_active_batches()

[]

In [35]:
completed = get_completed_batches()

In [36]:
completed

[{'id': 'batch_6737cad8c280819083f11d54277c8d0e',
  'status': 'completed',
  'created_at': 1731709656,
  'output_file_id': 'file-zcYd2bYdUG7XNr4ydEnhqHJV',
  'metadata': {'description': '11-15-2024 14:27:34 PST | ./journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-05-06.jsonl'}},
 {'id': 'batch_6737c99efe508190b9859fe37a8832eb',
  'status': 'completed',
  'created_at': 1731709343,
  'output_file_id': 'file-VxlfOfbcw2NopmK0Mufnsmi2',
  'metadata': {'description': '11-15-2024 14:22:21 PST | ./journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-27.jsonl'}},
 {'id': 'batch_6737b1c194f08190bafa64d6f7567eb8',
  'status': 'completed',
  'created_at': 1731703233,
  'output_file_id': 'file-eGiVuuuk7mB36zHDd3C15nKd',
  'metadata': {'description': '11-15-2024 12:40:32 PST | ./journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-12.jsonl'}},
 {'id': 'batch_6736d4cc63208190ac93b6a0f5b64e3d',
  'status': 'completed',
  'created_at': 1731646668,
  'output_file_id': 'file-P

In [37]:
def get_response_from_batch_info(batch_info):
    output_id = batch_info['output_file_id']
    file = batch_client.files.content(output_id)
    return file

In [39]:
out = get_batch_response(completed[0]['id'])

In [49]:
len(out)

5

In [47]:
full_cleaned_text = "\n".join(out)

In [None]:
set_working_directory()

In [48]:
write_text_to_file("full_cleaned_phat-giao-viet-nam-1956-05-06", full_cleaned_text)

In [50]:
cleaned_chunks = out

In [41]:
print(out[0])

<document>
  <page page="1">
    PHẬT GIÁO VIỆT NAM
    HUỆ QUANG
  </page>
  <page page="2">
    PHẬT-GIÁO VIỆT-NAM
    MỤC LỤC SỐ 5 VÀ
    XUÂN ĐINH DẬU (P.L. 2501)
    PHẬT GIÁO VIỆT NAM CHÚC TẾT
    MÙA XUÂN LÝ TƯỞNG
    NGUYÊN LÝ PHẬT HỌC
    GIAO CẢM
    GHÉT TẾT
    PHẬT GIÁO BỊ LỢI DỤNG
    TẾT THÔNG CẢM
    ĐẠO PHẬT NGÀY NAY
    P.G.V.N
    MINH HẠNH
    THÁI HƯ PHÁP SƯ
    NHẤT HẠNH
    THIỀU CHI
    PHẠM BÌNH
    HOÀNG HOA
    CHRISTMAS HUMPREY
    XUÂN DÂN TỘC VỚI NỤ CƯỜI ĐỨC DI LẶC TÂM NGUYÊN
    ĐỨC DI LẶC BỒ TÁT
    ĐẠI SỨ KHUÔNG VIỆT
    THỊ CHÚNG
    THIỆN HOA
    NGUYỄN-VĂN-HẬU
    MÃN GIÁC THIỀN SƯ
    TAMA HANA
  </page>
  <page page="3">
    BỨC TRANH XUÂN
    MÙA XUÂN MỚI
    CẢI HÓA
    TIÊU CHUẨN CỦA THIỆN ÁC
    BẠN NGHÈO
    PHẬT GIÁO VỚI KHOA HỌC
    TIẾNG MÕ ĐÊM BA MƯƠI
    HIỂU BIẾT CHÂN CHÍNH
    AI MẠNH HƠN CẢ?
    BÀ CHẲNG TINH
    VẤN ĐỀ GIẢI THOÁT
    XUNG QUANH ĐẠI LỄ GIỚI ĐÀN
    HUYỀN-KHÔNG
    PHÙNG KHÁNH
    KHÁC HUẤN
    TUỆ UYỀN
    THÍCH TÁC PH

In [None]:
# batch_job_path = Path("./journal_cleaning_batches")

# batch_info = []
# for path_obj in batch_job_path.iterdir():
#     regex = re.compile(r"^clean_batch_.*\.jsonl")
#     if path_obj.is_file() and regex.search(path_obj.name):
#         batch_file = batch_job_path / path_obj.name
#         print(batch_file)
#         try:
#             batch = start_batch(batch_file)
#             batch_info.append(batch)
#         except Exception as e:
#             print(f"{e}\nfailed.. \nContinuing.")

journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-12.jsonl
Batch Initiated: 11-14-2024 20:57:39 PST | journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-12.jsonl
journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-14.jsonl
Batch Initiated: 11-14-2024 20:57:41 PST | journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-14.jsonl
journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-09-10.jsonl
Batch Initiated: 11-14-2024 20:57:43 PST | journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-09-10.jsonl
journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-16.jsonl
Batch Initiated: 11-14-2024 20:57:45 PST | journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-16.jsonl
journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-11.jsonl
Batch Initiated: 11-14-2024 20:57:46 PST | journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-11.jsonl
journal_cleaning_batches/clean_batch_phat-giao-viet-nam-1956-25-26.jsonl
Batch 

In [51]:
system_message_translate = """
You are translating as Thich Nhat Hanh from Vietnamese to English for your experienced students. 
The text is from 1956-1958. Use the title: "Vietnamese Buddhist Journal." 
You want experienced students to understand the material and its historical and cultural context—in particular, as it relates to the life and teaching of Thich Nhat Hanh.

- Give a full translation that flows well in English in the style of Thich Nhat Hanh.
- Format the text clearly and cleanly.
- Add standard XML metadata tags that will improve the readability and clarity of the content.
- Example tags: <p> <section> <title> <subtitle> <contents> <author> <li> <i> <b> etc.
- Titles and XML sections should also be translated.
- Do not make comments within the text itself. 
- Leave all names of people in original Vietnamese with diacritics. 
- Each section of the text should have its own dedicated `<footnotes>` section.
- Use Footnotes liberally to explain elements of Vietnamese Buddhism or Buddhism in general, elements of Vietnamese culture and history, or elements relative to the life, teachings, and practice of Thich Nhat Hanh.
- When translating complex terms (Sanskrit, Sino-Vietnamese, or French) mark these with <i></i> tags and give an explanation in the footnotes.  
- For any terms footnoted, give the original Vietnamese, Sino-Vietnamese, Sanskrit, or French in the footnote. 
- If a paragraph logically spans two pages, it can be moved to the most logical page to allow translation to flow seamlessly.
- Due to the size of the journal, you may be translating a piece of the document in the middle which will be split from its preceding pages or sections.

Footnote Formatting:
Inline Footnote Markers:
   - In the main text, footnotes should be marked numerically in brackets, such as “[1]”.
   - Enclose each inline footnote marker within a `<footnote number="n">` tag, where `n` is the footnote number.
   - Example of a footnoted paragraph:
     `<p>This paragraph has a footnote. This is a sentence in the paragraph.</p><footnote number="1">[1]</footnote>`

Footnote Section:
   - At the end of each logical section, include a `<footnotes>` section where explanations for each footnote of that section are listed.
   - Each footnote in the `<footnotes>` section should use a `<footnote number="n">` tag, with `n` matching the inline reference.

Numbering:
   - Footnote numbers should restart at 1 for each new section of text.

Here's an example of text containing two footnotes, followed by its `<footnotes>`:

`<p>This is the first sentence with a footnote.<footnote number="1">[1]</footnote> Here is another statement that needs a footnote.<footnote number="2">[2]</footnote></p>`

`<footnotes>`
    `<footnote number="1">This is the explanation of the first footnote</footnote>`
    `<footnote number="2">This is the explanation of the second footnote</footnote>`
`</footnotes>` 
"""

In [52]:
user_message_string_translate = """{text}"""

In [53]:
def user_wrap_function_translate(text_block):
    return user_message_string_translate.format(text=text_block)

### clean the the chunks:

In [54]:
# chunks = split_xml_text(raw_text_xml, chunk_token_limit=11000)
# clean_message_seq = generate_messages(system_message_clean, user_wrap_function_clean, chunks)
# cleaned_data = []
# translated_chunks = []
# for i, msgs in enumerate(clean_message_seq):
#     print(f"cleaning chunk: {i}")
#     completion_clean = run_immediate_chat_process(msgs)
#     if completion_clean:
#         clean_response = completion_clean.choices[0].message.content
#         cleaned_data.append(clean_response)
#     else:
#         break
#         print("failed.")



In [57]:
token_count(full_cleaned_text)

12714

In [None]:
# write_text_to_file("phat-giao-viet-nam-1956-25-26/full_cleaned_text.xml", full_cleaned_text)

In [None]:
# to_tx_text = get_text_from_file("phat-giao-viet-nam-1956-25-26/full_cleaned_text.xml")

In [55]:
len(cleaned_chunks)

5

In [None]:
translation_message_seq = generate_messages(system_message_translate, user_wrap_function_translate, cleaned_chunks)
for i, tx_msgs in enumerate(translation_message_seq):
    print(f"translating chunk: {i}")
    
    if completion_tx:
        tx_response = completion_tx.choices[0].message.content
        translation_data.append(tx_response)
    else:
        print("failed.")
        break

In [None]:
translation_message_seq = generate_messages(system_message_translate, user_wrap_function_translate, cleaned_chunks)
for i, tx_msgs in enumerate(translation_message_seq):
    print(f"translating chunk: {i}")
    
    if completion_tx:
        tx_response = completion_tx.choices[0].message.content
        translation_data.append(tx_response)
    else:
        print("failed.")
        break

In [None]:
# translation_data = []
# input_text_chunks = [to_tx_text]
# translation_message_seq = generate_messages(system_message_translate, user_wrap_function_translate, input_text_chunks)
# for i, tx_msgs in enumerate(translation_message_seq):
#     print(f"translating chunk: {i}")
#     completion_tx = run_immediate_chat_process(tx_msgs)
#     if completion_tx:
#         tx_response = completion_tx.choices[0].message.content
#         translation_data.append(tx_response)
#     else:
#         print("failed.")
#         break

translating chunk: 0


In [65]:
print(translation_data[0])

<document>
  <page page="1">
    <title>Journal of Buddhism in Vietnam</title>
    <contents>Số 25 và 26 - Special Edition to Commence the Third Year</contents>
    <author>Thích Nhất Hạnh</author>
  </page>
  <page page="2">
    <title>Table of Contents</title>
    <p>The Vietnamese Buddhism enters its third year</p>
    <p>Truth does not lie in language and words</p>
    <p>Venerable Huyền Trang's journey to India to seek scriptures (continued)</p>
    <p>What is the self?</p>
    <p>Why it's necessary to unify Vietnamese Buddhism</p>
    <p>The Vietnamese Buddhism Monthly's interview on the issue of unification</p>
    <p>Plan to unify rituals</p>
    <p><i>Chia gia-lài</i> (A Dharma story)</p><footnote number="1">[1]</footnote>
    <p>Documents on the Buddhist Family (continued)</p>
    <p>Proper understanding of the significance of the Buddhist Family</p>
    <p>Two enduring peaks (A short story)</p>
    <p>Thích Đức-Nhuận</p>
    <p>Thích Tâm Châu</p>
    <p>Thích Tâm-Thọ</p>
   

In [66]:
full_translated_text = "\n".join(translation_data)

In [67]:
write_text_to_file("journal_1956_25_26_translation_full.xml", full_translated_text)

In [None]:
# response_schema = {
#   "name": "text_cleaning",
#   "strict": True,
#   "schema": {
#     "type": "object",
#     "properties": {
#       "cleaned_text": {
#         "type": "string",
#         "description": "The resulting cleaned text after applying all cleaning steps."
#       },
#       "cleaning_steps": {
#         "type": "array",
#         "description": "A series of notes that explain each step taken to clean the input text.",
#         "items": {
#           "type": "object",
#           "properties": {
#             "step_description": {
#               "type": "string",
#               "description": "A detailed description of the cleaning step."
#             },
#             "text_example": {
#               "type": "string",
#               "description": "an example fragment of text that was corrected."
#             },
#             "confidence_level": {
#               "type": "integer",
#               "description": "3 for high confidence, 2 for medium, 1 for low confidence corrections."
#             }
#           },
#           "required": [
#             "step_description",
#             "text_example",
#             "confidence_level"
#           ],
#           "additionalProperties": False
#         }
#       }
#     },
#     "required": [
#       "cleaned_text",
#       "cleaning_steps"
#     ],
#     "additionalProperties": False
#   }
# }