## Full Journal Processing Workbook

### Module Incudes

In [1]:
# General
import os
import io
from pathlib import Path
from typing import List, Dict
import json
import re
import time
from datetime import datetime
import logging
from math import floor

# OCR processing
from google.cloud import vision
from pdf2image import convert_from_path
from PIL import Image, ImageDraw, ImageFont
from pathlib import Path
import fitz  # for pdf work: PyMuPDF
from xml.sax.saxutils import escape


# XML
from xml.sax.saxutils import escape

# Local modules
from data_processing.ocr_processing import (
    make_image_preprocess_mask,
    build_processed_pdf, 
    save_processed_pdf_data, 
    load_processed_PDF_data
)

from data_processing.gpt_processing import (
    start_batch_with_retries, 
    set_model_settings,
    delete_api_files
)

from data_processing.xml_processing import split_xml_pages, split_xml_on_pagebreaks, save_pages_to_xml

from data_processing.text_processing import get_text_from_file

from data_processing.gpt_processing.pdf_journal_process import (
    setup_logger,
    generate_clean_batch, batch_section, batch_translate, 
    save_sectioning_data, save_translation_data, save_cleaned_data
)


## Parameters

In [2]:
# files
project_dir = Path("/Users/phapman/Desktop/tnh-scholar/")
data_dir = project_dir / "data_processing"
pdf_dir = data_dir / "PDF" / "Phat_Giao_journals" # directory to read pdfs from
journal_dir = data_dir / "processed_journal_data"
journal_name = "phat-giao-viet-nam-1956-11"
pdf_to_process = pdf_dir / f"{journal_name}.pdf"
working_dir = journal_dir / journal_name
ocr_data_dir = journal_dir / "ocr_data"
cleaned_xml_path = working_dir / f"full_cleaned_{journal_name}.xml"
batch_job_dir = working_dir / "processing_batch_files"
clean_batch_jsonl = batch_job_dir / f"clean_batch_{journal_name}.jsonl"
ocr_file = journal_dir / journal_name / f"full_OCR_{journal_name}.xml"
translation_xml_path = working_dir / f"translation_{journal_name}.xml"
section_batch_jsonl = batch_job_dir / "section_batch.jsonl"
translate_batch_jsonl = batch_job_dir / "translation_batch.jsonl"
section_metadata_path = working_dir / "section_metadata.json"
raw_json_metadata_path = working_dir / "raw_metadata_response.txt"
logfile = data_dir / "gpt_processing" / "pdf_journal_process" / "processing_info.log"

### Settings for cleaning OCR generated text

In [None]:
# setup for OCR cleaning
def user_wrap_function_clean(text_block):   # Function to wrap user message sent to model. Currently no wrapping.
    return text_block  

model_settings_clean = {
    "gpt-4o": {
        "max_tokens": 1000, # default value
        "temperature": 0
    }
}
system_message_clean = """You are a meticulous and consistent world expert at cleaning OCR-generated Vietnamese text. 
You are cleaning pages from a 1950's Buddhist Journal. 
Each line of scanned data will be enclosed in <> brackets. Leave <> brackets in place.
Your goal is to minimally modify the text to generate a cleaned version.
Do not remove any content from the main body of the text. 
Do not change the line formatting. 

You can use the semantic meaning of the text to infer corrections—but make no semantic changes. 
You can also add diacritical marks if they are missing or clearly inaccurate. 
Do not change any proper names, except to add missing diacritical marks or to fix orthographic errors if the context is clear.  

This particular text has a title marker in the footer, "Phat Giao Viet Nam," and also a publishing mark diagonally across the text.  
The publishing watermark is "TU VIEN HUE QUANG"  and is faint so only parts of it may appear in some locations in the text. Remove all text corresponding to the watermark.
Text corresponding to the footer, the publishing watermak (or part thereof), and page numbers can be omitted.

IMPORTANT: If the page is blank return: blank page 
IMPORTANT: Output the corrected text only with no comments (including ``` xml)"""

### Settings for sectioning a journal

In [4]:
# finding journal sections
model_settings_section = {
    "gpt-4o": {
        "max_tokens": 5000,
        "temperature": 0.25
    }
}

system_message_section = """You are a highly skilled assistant processing a Vietnamese Buddhist journal scanned from OCR. 
Use the title: "Journal of Vietnamese Buddhism."
You will be determining the journal sections by page number. You will also generate metadata for the full text and each section. 
You will return this metadata in JSON format.

Instructions:
1. Analyze the text and divide it into sections based on logical breaks, such as headings, topic changes, or clear shifts in content.
2. Ensure every page is part of a section. The first title page should always be its own section. Blank pages should be titled "blank page".
3. For each section, provide:
   - The original title in Vietnamese (`section_title_vi`).
   - The translated title in English (`section_title_en`).
   - The author's name if it is available (`section_author`). 
   - A one-paragraph summary of the section in English (`section_summary`).
   - A list of keywords for the section that are related to its content, these can be proper names, specific concepts, or contextual information.
   - The section's start and end page numbers (`start_page` and `end_page`).
   - Use "null" for any data that is not available (such as author name) for the section.

4. Return the output as a JSON object with the following schema:
{
    "journal_summary": "A one-page summary of the whole journal in English.",
    "sections": [
        {
            "title_vi": "Original title in Vietnamese",
            "title_en": "Translated title in English",
            "author": "Name of the author of the section",
            "summary": "One-paragraph summary of the section in English",
            "keywords": "A list of keywords for the section",
            "start_page":  X,
            "end_page":  Y
        },
        ...
    ]
}

5.  Ensure the JSON is well-formed and adheres strictly to the provided schema.
6.  IMPORTANT: ensure every page is part of a section and sections appear in order of pagination."""

### Settings for translation

In [5]:
# translation settings
model_settings_translate = {
    "gpt-4o": {
        "max_tokens": 5000,  # a default value, updated per batch
        "temperature": 0.75
    }
}

system_message_translate = """You are the world's foremost translator of Zen Master Thich Nhat Hanh's Vietnamese writing into English, following the language style of the plumvillage.org website.
The text is based on an OCR scan of a journal you edited from 1956-1958. Use the title: "Journal of Vietnamese Buddhism" for the journal when it is referenced.
You will be translating a single section of the journal and will be provided with the section title in English. 
Translate for the most meaningful, typical, and eloquent English interpretation that is simple, yet poetic. 
Translate precisely; do not add change the text or add commentary.  
Notes on the text can be added in the <notes>.
Make corrections in the text only where necessary (for example if words are missing) to create logical flow. Note all corrections in the <translation-notes>. 
Do not change <pagebreak> tag postioning. Each translated page must match its original page source as pages will be studied side by side with the original Vietnamese.
Infer paragraphs and text structure from the text layout.
Add XML tags for clarity, using only the following tags: 

   <section> for major sections.
   <subsection> for subsections.
   <title> for main titles of sections and subsections. 
   <subtitle> for subtitles of sections and subsections. 
   <heading> for headings that do not mark titles or subtitles
   <p> for paragraphs.
   <br/> for linebreaks that add meaning such as in poems or other structures.
   <TOC> for tables of contents
   <author> for named authors of sections (only)
   <i> for italics. 
   <b> for bold.
   <notes>
   <translation-notes>

You may use <notes> at the end of the section for notes on historical, cultural, spiritual, or other interesting elements of the text.
You want advanced students of Thay to understand the text in its larger historical context, in the context of Vietnamese Buddhism, and in the context of his life.
You may add <translation-notes> at the end of the section as a commentary to summarize your translation choices. 
For <translation-notes>, you may include information on Sino-Vietnamese, complex, unusual, poetic, or other interesting terms, and significant corrections to the text. 
In the <translation-notes> include the original Vietnamese terms for reference.

IMPORTANT: All titles, XML sections, text, poetry, quotations, and terms MUST BE TRANSLATED TO ENGLISH. Do not however, translate names of people; leave names in Vietnamese with diacritics.
IMPORTANT: Return pure XML with no formatting marks such as xml or ```.
IMPORTANT: The returned XML should begin and end with <section> tags."""

## Process execution pipeline

### start logger

In [6]:
logger = setup_logger(logfile)

### OCR Scan

In [52]:
# process pdf with OCR through google vision
client = vision.ImageAnnotatorClient()
pre_mask1 = make_image_preprocess_mask(0.1)  #this masks the bottom 10% of the image where the publishing mark is located
text_pages, word_locations_list, annotated_images, unannotated_images = build_processed_pdf(pdf_to_process, client, pre_mask1) 


2024-12-03 19:19:39,371 - google.auth._default - DEBUG - Checking None for explicit credentials as part of auth process...
2024-12-03 19:19:39,372 - google.auth._default - DEBUG - Checking Cloud SDK credentials as part of auth process...
I0000 00:00:1733282379.839355  366393 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
2024-12-03 19:19:39,842 - ocr_processing - INFO - Processing file with 99 pages:
	/Users/phapman/Desktop/tnh-scholar/data_processing/PDF/Phat_Giao_journals/phat-giao-viet-nam-1956-09-10.pdf
2024-12-03 19:19:39,843 - ocr_processing - INFO - Processing page 1/99...
2024-12-03 19:19:42,760 - google.auth.transport.requests - DEBUG - Making request: POST https://oauth2.googleapis.com/token
2024-12-03 19:19:45,136 - ocr_processing - INFO - Processing page 2/99...
2024-12-03 19:19:46,863 - ocr_processing - INFO - Processing page 3/99...
2024-12-03 19:19:48,463 - ocr_processing - INFO - Processing page 4/99...
2024-12-03 19:19

page dimensions: {'width_in': 10.720472547743055, 'height_in': 16.08268059624566, 'width_px': 1024, 'height_px': 1536}


In [None]:
# save processed info
save_processed_pdf_data(ocr_data_dir, journal_name, text_pages, word_locations_list, annotated_images, unannotated_images)
save_pages_to_xml(working_dir / f"full_OCR_{journal_name}.xml", text_pages, overwrite=True)

Processed data saved in: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10
XML file successfully saved at /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/full_OCR_phat-giao-viet-nam-1956-09-10.xml


### Data Cleaning

In [54]:
generate_clean_batch(ocr_file, clean_batch_jsonl, system_message_clean, user_wrap_function_clean)
job_description = f"cleaning for {journal_name} on {ocr_file}"
cleaned_data = start_batch_with_retries(clean_batch_jsonl, job_description) # run the clean process
save_cleaned_data(cleaned_xml_path, cleaned_data, journal_name)

2024-12-03 19:24:54,497 - journal_process - INFO - Processing file: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/full_OCR_phat-giao-viet-nam-1956-09-10.xml
2024-12-03 19:24:54,500 - journal_process - INFO - Found 99 pages in /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/full_OCR_phat-giao-viet-nam-1956-09-10.xml.
2024-12-03 19:24:54,541 - gpt_interface - INFO - Creating JSONL batch file with [91m75598[0m requested tokens:
	/Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/processing_batch_files/clean_batch_phat-giao-viet-nam-1956-09-10.jsonl
2024-12-03 19:24:54,541 - gpt_interface - DEBUG - Batch request details: Method=POST, URL=/v1/chat/completions
2024-12-03 19:24:54,542 - gpt_interface - DEBUG - Batch parameters:
    model: gpt-4o
    max_tokens: 145
    temperature: 0.75
2024-12-03 19:24:54,547 - gpt_interface 

XML file successfully saved at /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/full_cleaned_phat-giao-viet-nam-1956-09-10.xml


### Sectioning

In [55]:
set_model_settings(model_settings_section)
metadata_serial_json = batch_section(cleaned_xml_path, section_batch_jsonl, system_message_section, journal_name) # run the section process
save_sectioning_data(section_metadata_path, raw_json_metadata_path, metadata_serial_json, journal_name)  

2024-12-03 19:26:28,231 - journal_process - INFO - Starting sectioning batch for phat-giao-viet-nam-1956-09-10 with file:
	/Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/full_cleaned_phat-giao-viet-nam-1956-09-10.xml
2024-12-03 19:26:28,234 - gpt_interface - INFO - Creating JSONL batch file with [91m5000[0m requested tokens:
	/Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/processing_batch_files/section_batch.jsonl
2024-12-03 19:26:28,234 - gpt_interface - DEBUG - Batch request details: Method=POST, URL=/v1/chat/completions
2024-12-03 19:26:28,234 - gpt_interface - DEBUG - Batch parameters:
    model: gpt-4o
    max_tokens: 5000
    temperature: 0.25
    response_format: {'type': 'json_object'}
2024-12-03 19:26:28,237 - gpt_interface - INFO - JSONL file created at: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/proce

PosixPath('/Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-09-10/section_metadata.json')

### Translating

In [7]:
set_model_settings(model_settings_translate)
translation_data = batch_translate(cleaned_xml_path, translate_batch_jsonl, section_metadata_path, system_message_translate, journal_name)
save_translation_data(translation_xml_path, translation_data, journal_name)

2024-12-04 11:33:21,268 - journal_process - INFO - Starting translation batch for journal 'phat-giao-viet-nam-1956-11':
	with file: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-11/full_cleaned_phat-giao-viet-nam-1956-11.xml
	metadata: /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-11/section_metadata.json
2024-12-04 11:33:21,271 - journal_process - DEBUG - page groups found: [(1, 1), (2, 2), (3, 4), (5, 7), (8, 14), (15, 19), (20, 20), (21, 25), (26, 28), (29, 35), (36, 40), (41, 43), (44, 45), (46, 49), (50, 50), (51, 51), (52, 52)]
2024-12-04 11:33:21,273 - journal_process - DEBUG - section_contents[0]:
PHẬT GIÁO
NAM
HUỆ QUANG
AM DỊCH
2
THUẬT HẠN
SỐ II RA NGÀY 15 THÁNG 6 ĐINH - DẬU
HUẾ CÙNG NG-HỘI PHẬT GIÁO VIỆT NAM XUẤT-BẢN
NGUYỆT-SAN
<pagebreak page='1' />
2024-12-04 11:33:21,275 - journal_process - DEBUG - section 0: VIETNAMESE BUDDHISM NAM HUỆ QUANG TRANSLATION added for batch 

### Cleanup

In [21]:
confirmation = input("Are you sure you want to delete API files? 'y' to confirm: ").strip().lower()
if confirmation == 'y':
    delete_api_files(datetime.now())
    print("Files deleted successfully.")
else:
    print("Deletion canceled.")

Deleted file: file-DNemfvfXHVd4NQRJ3juN63 (created on 2024-12-03 15:30:42)
Deleted file: file-Lw3LPdUTfVhxKWXufsGRSF (created on 2024-12-03 15:29:49)
Deleted file: file-VHdECWSKXUyH9PWgU5XNem (created on 2024-12-03 15:24:02)
Deleted file: file-X1efeiL6iRFwo1Hi2XzZsm (created on 2024-12-03 15:23:45)
Deleted file: file-Qu4EEAEh7BdqGM3AtHgRZq (created on 2024-12-03 15:23:28)
Deleted file: file-Scs1FgJoDAscTyx8dNJwza (created on 2024-12-03 15:23:11)
Deleted file: file-JmAoAUNQzvPBd2k6xtGt6j (created on 2024-12-03 15:22:55)
Deleted file: file-H8qi8GxrvtCdEyHK54bxFH (created on 2024-12-03 15:22:38)
Deleted file: file-4k94obuu1tCAtYqXwPidYX (created on 2024-12-03 15:21:45)
Deleted file: file-GkeUQV9243TKpDyAtF7EKU (created on 2024-12-03 13:36:39)
Deleted file: file-TcH8VchQ5gogkDk2aAhaH9 (created on 2024-12-03 13:36:23)
Deleted file: file-1fmZy7TR7yNRovkzfycZn1 (created on 2024-12-03 13:36:02)
Deleted file: file-PeYciMgS1yMQsL28YTZiUs (created on 2024-12-03 13:34:22)
Deleted file: file-3abTXq