In [None]:
import os
from data_processing.text_processing import get_text_from_file, set_working_directory


In [None]:
from lxml import etree

def extract_unique_tags(xml_file):
    """
    Extract all unique element tags from an XML file using lxml.

    Comments, processing instructions and entities are skipped: lxml reports
    their ``tag`` as a factory function rather than a string, which would
    pollute the result and make it impossible to sort alongside real tag names.

    Parameters:
        xml_file (str): Path to the XML file.

    Returns:
        set: A set of unique element tag names (strings) in the XML document.
    """
    # Parse the XML file
    tree = etree.parse(xml_file)

    # Keep only real elements; non-element nodes have a non-string tag
    return {
        element.tag
        for element in tree.iter()
        if isinstance(element.tag, str)
    }

# # Example usage
# if __name__ == "__main__":
#     # Replace 'path/to/your/file.xml' with your actual file path
#     xml_file_path = "path/to/your/file.xml"
#     tags = extract_unique_tags(xml_file_path)
    
#     # Print all unique tags
#     print("Unique Tags Found:")
#     for tag in sorted(tags):
#         print(tag)

In [None]:
# Point the session at the processed journal data directory.
# NOTE(review): presumably this changes the process working directory — confirm
# in data_processing.text_processing. If it does, the relative path means that
# re-running this cell may not be idempotent.
set_working_directory("../processed_journal_data")

In [None]:
# Relative path to the XML file used as the test input by the cells below.
test_file = "../processed_journal_data/phat-giao-viet-nam-1956-24/journal_1956_24_translation_full.xml"

In [None]:
# Display the current working directory as a sanity check for the relative paths.
os.getcwd()

In [None]:
# Load the journal file through the project helper.
# NOTE(review): presumably returns the file's text; `j24` is not used in the
# cells visible here — confirm whether it is still needed.
j24 = get_text_from_file("phat-giao-viet-nam-1956-24/journal_1956_24_translation_full.xml")

In [None]:
# List every distinct tag in the test document. Reuses `test_file` instead of
# repeating the same path literal, so the path only has to change in one place.
extract_unique_tags(test_file)

In [None]:
import xml.etree.ElementTree as ET

def extract_structure(xml_file):
    """
    Summarise the element hierarchy of an XML file as nested dictionaries.

    Parameters:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        dict: ``{"tag": <root tag>, "children": [...]}`` where every child is
        a dict of the same shape, listed in document order.
    """
    def describe(node):
        # Walk the direct children first, then wrap them with this node's tag
        nested = []
        for sub in node:
            nested.append(describe(sub))
        return {"tag": node.tag, "children": nested}

    return describe(ET.parse(xml_file).getroot())


In [None]:
# Build the nested {"tag", "children"} outline of the test document and print it.
# NOTE: the name `xml_structure` is rebound further down the notebook.
xml_structure = extract_structure(test_file)
print(xml_structure)

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
from reportlab.lib.styles import getSampleStyleSheet
import xml.etree.ElementTree as ET
from reportlab.lib.units import inch


def parse_xml_to_pdf(xml_file, output_pdf, xml_structure):
    """
    Parses an XML file, walks it alongside the given structure, and generates a PDF.

    Parameters:
        xml_file: Path to the XML file to render.
        output_pdf: Destination passed to ``SimpleDocTemplate``.
        xml_structure (dict): Nested ``{'tag': ..., 'children': [...]}``
            description of the document. It is walked in step with the XML
            tree (each child is matched by tag name), but the rendering
            decisions below depend only on each element's own tag.

    Rendering rules:
        * title, subtitle, author, p, footer, footnote, section and subsection
          become a single paragraph built from the element's leading text,
          with a placeholder when that text is missing.
        * ul / ol become bulleted / numbered lists with one item per child.
        * Every other tag renders nothing itself, but its children are visited.
        * A 0.2 inch spacer is appended after each visited element.
    """
    # Local import: only this function needs it, and ReportLab parses
    # Paragraph text as XML-like markup, so raw document text containing
    # '<', '>' or '&' must be escaped before it is handed over.
    from xml.sax.saxutils import escape

    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Set up PDF document and styles
    doc = SimpleDocTemplate(output_pdf, pagesize=letter)
    elements = []
    styles = getSampleStyleSheet()

    # tag -> (placeholder text, sample-stylesheet style name)
    text_tags = {
        'title': ("Title Placeholder", 'Title'),
        'subtitle': ("Subtitle Placeholder", 'Heading2'),
        'author': ("Author Placeholder", 'Italic'),
        'p': ("Paragraph Placeholder", 'BodyText'),
        'footer': ("Footer Placeholder", 'Italic'),
        'footnote': ("Footnote Placeholder", 'Italic'),
        'section': ("Section Placeholder", 'Heading1'),
        'subsection': ("Subsection Placeholder", 'Heading1'),
    }
    # list tag -> ReportLab bulletType
    list_bullets = {'ul': 'bullet', 'ol': '1'}

    def make_paragraph(text, placeholder, style_name):
        # Fall back to the placeholder when the element has no text at all
        return Paragraph(escape(text or placeholder), styles[style_name])

    def render_element(element, structure):
        tag = element.tag
        children = list(element)

        if tag in text_tags:
            placeholder, style_name = text_tags[tag]
            elements.append(make_paragraph(element.text, placeholder, style_name))
        elif tag in list_bullets:
            items = [
                ListItem(make_paragraph(child.text, "List Item Placeholder", 'BodyText'))
                for child in children
            ]
            elements.append(ListFlowable(items, bulletType=list_bullets[tag]))

        # Recursively process child elements, pairing each with the first
        # structure entry of the same tag (or an empty dict when none matches)
        for child in children:
            child_structure = next(
                (s for s in structure.get('children', []) if s['tag'] == child.tag),
                {},
            )
            render_element(child, child_structure)

        # Add spacing after processing each element
        elements.append(Spacer(1, 0.2 * inch))

    # Start rendering from the root
    render_element(root, xml_structure)

    # Build the PDF
    doc.build(elements)


# Hand-written example of a document hierarchy, using the same nested
# {'tag': ..., 'children': [...]} shape that parse_xml_to_pdf takes as its
# `xml_structure` argument: a document holding one page with a title, an
# author and a footer (which itself contains `i` and `footnote` nodes).
xml_structure = {
    'tag': 'document',
    'children': [
        {
            'tag': 'page',
            'children': [
                {'tag': 'title', 'children': []},
                {'tag': 'author', 'children': []},
                {
                    'tag': 'footer',
                    'children': [
                        {'tag': 'i', 'children': []},
                        {'tag': 'footnote', 'children': []},
                    ],
                },
            ],
        },
    ],
}

# Example usage with an XML file


In [None]:
# Render the test document to "test_output.pdf" (a relative path, so it is
# resolved against the current working directory).
parse_xml_to_pdf(test_file, "test_output.pdf", xml_structure)