In [None]:
import html
import os
import xml.etree.ElementTree as ET
from typing import List, Union

import fitz  # PyMuPDF for PDF processing

In [None]:
# Switch the kernel's working directory to this issue's processed-data folder.
# The relative XML, CSS and output paths used in the final cell resolve against it.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before running the notebook anywhere else.
%cd /Users/phapman/Desktop/tnh-scholar/data_processing/processed_journal_data/phat-giao-viet-nam-1956-28

In [None]:
def convert_xml_to_html(xml_string: str) -> str:
    """
    Converts XML content to HTML, mapping XML tags to appropriate HTML tags with CSS classes.
    Handles XML parsing errors gracefully by returning a placeholder or error message.

    Text content (both the text inside an element and the "tail" text that
    follows a child element) is HTML-escaped and emitted in document order, so
    mixed content such as ``<p>Hello <i>world</i> again</p>`` round-trips intact.
    Whitespace-only text nodes (e.g. pretty-printing indentation) are dropped.

    Parameters:
        xml_string (str): The XML content as a string.

    Returns:
        str: The converted HTML content or an error placeholder
        (``<div class="error">...</div>``) when the input cannot be processed.
    """
    # XML tag -> (HTML tag, CSS class). Unmapped tags fall back to a plain <div>.
    tag_mapping = {
        "author": ("div", "author"),
        "contents": ("div", "contents"),
        "title": ("div", "title"),
        "subtitle": ("div", "subtitle"),
        "footnote": ("span", "footnote"),
        "footnotes": ("div", "footnotes"),
        "document": ("div", "document"),
        "page": ("div", "page"),
        "p": ("p", ""),
        "i": ("i", ""),
        "section": ("div", "section"),  # Added mapping for <section>
        "li": ("li", ""),              # List item, no specific class
    }

    def render_text(raw, keep_leading, keep_trailing):
        """
        Escapes a text node for HTML and normalizes its edge whitespace.

        Parameters:
            raw (str | None): The element's ``text`` or ``tail`` value.
            keep_leading (bool): Keep one space if ``raw`` starts with whitespace.
            keep_trailing (bool): Keep one space if ``raw`` ends with whitespace.

        Returns:
            str: The escaped text, or "" for missing / whitespace-only input.
        """
        if not raw or not raw.strip():
            return ""
        # quote=False: this is element content, so quotes need no escaping.
        core = html.escape(raw.strip(), quote=False)
        lead = " " if keep_leading and raw[0].isspace() else ""
        trail = " " if keep_trailing and raw[-1].isspace() else ""
        return f"{lead}{core}{trail}"

    def process_element(element):
        """
        Recursively processes an XML element and converts it to an HTML string.

        Parameters:
            element (xml.etree.ElementTree.Element): The XML element to process.

        Returns:
            str: The HTML string corresponding to the element.
        """
        # Get the mapped HTML tag and class
        html_tag, css_class = tag_mapping.get(element.tag, ("div", ""))
        css_class_attr = f' class="{css_class}"' if css_class else ""

        parts = []

        # A <section title="..."> gets its title emitted as a sibling title
        # block placed immediately before the section's own <div>.
        if element.tag == "section" and "title" in element.attrib:
            section_title = html.escape(element.attrib["title"], quote=False)
            parts.append(f'<div class="title">{section_title}</div>')

        parts.append(f"<{html_tag}{css_class_attr}>")

        children = list(element)

        # Leading text: trim the start, but keep a separating space before an
        # inline child so "Hello <i>world</i>" does not become "Hello<i>world</i>".
        parts.append(render_text(element.text, False, bool(children)))

        for index, child in enumerate(children):
            parts.append(process_element(child))
            # The tail is the text between this child's end tag and the next
            # sibling (or the parent's end tag); it belongs to the parent's content.
            has_next_sibling = index + 1 < len(children)
            parts.append(render_text(child.tail, True, has_next_sibling))

        parts.append(f"</{html_tag}>")
        return "".join(parts)

    try:
        # Attempt to parse the XML string
        root = ET.fromstring(xml_string)
        return process_element(root)

    except ET.ParseError as e:
        # Handle XML parsing errors gracefully
        print(f"XML parsing error: {e}")
        return '<div class="error">[Error: Malformed XML content]</div>'

    except Exception as e:
        # Deliberate best-effort: one bad page must not abort a whole batch,
        # so any other failure is reported and replaced by a placeholder.
        print(f"Unexpected error: {e}")
        return '<div class="error">[Error: Unexpected error occurred]</div>'

In [None]:
def generate_html_template(
    page_num: Union[int, str],
    relative_image_path: str,
    final_text: str,
    cleaned_text: str,
    uncleaned_text: str,
    css_file_path: Union[str, None],
) -> str:
    """
    Generates the HTML template for a single page of the journal.

    Parameters:
        page_num (int | str): The page number, used in the title and heading.
        relative_image_path (str): The relative path to the scanned image.
            It is attribute-escaped before being placed in ``src``.
        final_text (str): The final translated text (HTML format), inserted as-is.
        cleaned_text (str): The cleaned text, inserted as-is inside a <p>.
        uncleaned_text (str): The uncleaned OCR text, inserted as-is inside a <p>.
        css_file_path (str | None): The path to the external CSS file. When it is
            None or empty, no stylesheet <link> is emitted.

    Returns:
        str: The HTML content for the page.
    """
    # Only emit the stylesheet link when a path was actually supplied; formatting
    # None into the template would otherwise produce href="None".
    if css_file_path:
        stylesheet_link = (
            f'<link rel="stylesheet" href="{html.escape(css_file_path, quote=True)}">'
        )
    else:
        stylesheet_link = ""

    # Attribute values must be escaped so quotes or ampersands in a path cannot
    # break out of the attribute.
    image_src = html.escape(relative_image_path, quote=True)

    html_template = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Journal Translation - Page {page_num}</title>
        {stylesheet_link}
    </head>
    <body>
        <div class="container">
            <h1 class="title">Journal Translation - Page {page_num}</h1>
            <div class="flex-row">
                <!-- Left Panel for Scanned Page -->
                <div class="panel" id="original-image-panel">
                    <img src="{image_src}" alt="Original Scanned Page" loading="lazy">
                </div>

                <!-- Right Panel for Final Translated Text -->
                <div class="panel" id="translated-text-panel">
                    <div class="translated-content">
                        {final_text}
                    </div>
                </div>
            </div>
            <div class="underbar">
                <div class="content" id="cleaned-text">
                    <h3>Cleaned Text</h3>
                    <p>{cleaned_text}</p>
                </div>
                <div class="content" id="original-text">
                    <h3>Original Uncleaned Text</h3>
                    <p>{uncleaned_text}</p>
                </div>
            </div>
        </div>
    </body>
    </html>
    """
    return html_template

In [None]:
def generate_journal_html(
    pdf_filename: str,
    uncleaned_xml_filename: str,
    cleaned_xml_filename: str,
    final_xml_filename: str,
    output_dir: str,
    css_filename: Union[str, None] = None,
    build_images=False
):
    """
    Generates an HTML file for each page of the journal.

    Each page image is paired with the uncleaned, cleaned and final text whose
    <page page="N"> attribute matches that image's page number, and the result
    is written to ``<output_dir>/page_N.html``.

    Parameters:
        pdf_filename (str): Path to the PDF file.
        uncleaned_xml_filename (str): Path to the uncleaned XML file.
        cleaned_xml_filename (str): Path to the cleaned XML file.
        final_xml_filename (str): Path to the final XML file.
        output_dir (str): Directory to save the output files.
        css_filename (str | None): Path to the CSS file, passed through to
            generate_html_template as the stylesheet path.
        build_images (bool): If True, render every PDF page to
            ``<output_dir>/images``. If False, reuse the images already there.

    Returns:
        None

    Raises:
        FileNotFoundError: If build_images is False and none of the expected
            page images exist in the images directory.
    """
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")

    def extract_images_from_pdf(pdf_path, image_dir):
        """
        Renders every page of the PDF to ``page_<N>.jpg`` in image_dir and
        returns a list of (page_number, system_path, filename) tuples.
        """
        images = []
        # Context manager ensures the document is closed even if rendering fails.
        with fitz.open(pdf_path) as pdf:
            for page_index in range(len(pdf)):
                page = pdf.load_page(page_index)
                pix = page.get_pixmap()
                page_number = page_index + 1  # files and XML pages are 1-based
                image_filename = f"page_{page_number}.jpg"
                image_path = os.path.join(image_dir, image_filename)
                pix.save(image_path)
                images.append((page_number, image_path, image_filename))
        return images

    def get_existing_images(image_dir, page_count):
        """
        Retrieves existing images from the directory and returns them as a list
        of (page_number, system_path, filename) tuples. Pages whose image file
        is missing are reported and left out.
        """
        images = []
        for page_number in range(1, page_count + 1):
            image_filename = f"page_{page_number}.jpg"
            image_path = os.path.join(image_dir, image_filename)
            if os.path.exists(image_path):
                images.append((page_number, image_path, image_filename))
            else:
                print(f"Warning: Missing expected image file: {image_filename}")

        return images

    def parse_xml(file_path):
        """
        Parses an XML file and returns a dict mapping each <page> element's
        "page" attribute to that element serialized as a string.
        """
        tree = ET.parse(file_path)
        root = tree.getroot()
        pages = root.findall(".//page")
        return {page.get("page"): ET.tostring(page, encoding="unicode") for page in pages}

    if build_images:
        os.makedirs(images_dir, exist_ok=True)
        images = extract_images_from_pdf(pdf_filename, images_dir)
    else:
        # The PDF is only consulted for its page count when reusing images.
        with fitz.open(pdf_filename) as pdf:
            page_count = pdf.page_count
        images = get_existing_images(images_dir, page_count)
        if not images:
            raise FileNotFoundError(f"No images found in {images_dir}. Please enable build_images or check the directory.")

    uncleaned_pages = parse_xml(uncleaned_xml_filename)
    cleaned_pages = parse_xml(cleaned_xml_filename)
    final_pages = parse_xml(final_xml_filename)

    # The page number travels with each image rather than being derived from
    # the list position, so a missing image cannot shift every later page onto
    # the wrong text and output filename.
    for page_number, _system_image_path, relative_image_filename in images:
        page_key = str(page_number)  # XML "page" attributes are strings

        # Process uncleaned_text to add <br> tags at the end of each line
        uncleaned_text = uncleaned_pages.get(page_key, "No uncleaned text available.")
        uncleaned_text = uncleaned_text.replace("\n", "<br>\n")  # Add <br> after each newline
        cleaned_text = cleaned_pages.get(page_key, "No cleaned text available.")

        # Only real XML goes through the converter; the plain-text placeholder
        # is not XML and would otherwise be reported as malformed content.
        final_text_raw = final_pages.get(page_key)
        if final_text_raw is None:
            final_text = "No translated text available."
        else:
            final_text = convert_xml_to_html(final_text_raw)

        html_content = generate_html_template(
            page_num=page_key,
            relative_image_path=f"images/{relative_image_filename}",
            final_text=final_text,
            cleaned_text=cleaned_text,
            uncleaned_text=uncleaned_text,
            css_file_path=css_filename,
        )

        output_path = os.path.join(output_dir, f"page_{page_key}.html")
        # The template declares charset UTF-8, so write UTF-8 explicitly
        # instead of relying on the platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(html_content)

    print(f"Generated HTML pages in {output_dir}")

In [None]:
# Source PDF for this journal issue (absolute path on the author's machine).
journal_source = "/Users/phapman/Desktop/tnh-scholar/data_processing/PDF/Phat_Giao_journals/phat-giao-viet-nam-1956-28.pdf"
# Stylesheet referenced by every generated page.
css_file = "tx_viewer.css"

# The XML inputs and the output directory are relative paths, so they resolve
# against the kernel's current working directory.
journal_html_options = {
    "pdf_filename": journal_source,
    "uncleaned_xml_filename": "full_OCR_text_phat-giao-viet-nam-1956-28.xml",
    "cleaned_xml_filename": "full_cleaned_phat-giao-viet-nam-1956-28.xml",
    "final_xml_filename": "full_tx_phat-giao-viet-nam-1956-28.xml",
    "output_dir": "./html_view",
    "css_filename": css_file,
    # Reuse page images already present under <output_dir>/images.
    "build_images": False,
}

generate_journal_html(**journal_html_options)