In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%aimport bs4, ebooklib, spacy, html.parser, lxml, re, warnings

In [8]:
from lxml import etree
from bs4 import BeautifulSoup


In [5]:
from clean_parse_tag import set_working_directory, get_text_from_file, write_text_to_file
from clean_parse_tag import normalize_quotes

set_working_directory("../books/private_books")

In [23]:
import re

def tag_chapters(text):
    """
    Adds XML tags to chapter headers in the format:
    -Chapter <number_in_words>-\n\n(title)\n\n
    
    Args:
        text (str): The input text containing chapter headers.
        
    Returns:
        str: The text with chapter headers tagged.
    
    Example:
        >>> text = "-Chapter Two-\n\nA Proposal for Peace\n\nSome content here."
        >>> print(tag_chapters(text))
        <section level="2" type="chapter">
        <chapter-number>Chapter Two</chapter-number>
        <title>A Proposal for Peace</title>
        
        Some content here.
    """
    # Updated regex pattern to match chapter number in words and title
    pattern = r"^-Chapter ([A-Za-z]+)-\n\n([^\n]+)\n\n"
    
    # Replacement string with tags
    replacement = r"""<section level="2" type="chapter">
<chapter-number>Chapter \1</chapter-number>
<title>\2</title>

"""
    
    # Perform the substitution and return the result
    tagged_text = re.sub(pattern, replacement, text, flags=re.M)
    return tagged_text

# Example usage
text = """
some text

-Chapter Two-

A Proposal for Peace

Some content here.

"""
print(tag_chapters(text))



some text

<section level="2" type="chapter">
<chapter-number>Chapter Two</chapter-number>
<title>A Proposal for Peace</title>

Some content here.




In [25]:
work_text = get_text_from_file("working_LiA.txt")


In [26]:
print(work_text)

<metadata>
    <title>Love in Action</title>
    <subtitle>Writtings on Nonviolent Social Change</subtitle>
    <author>Thich Nhat Hanh</author>
    <language>en</language>
    <publisher>Parallax Press</publisher>
    <date></date>
</metadata>

<section level="0" type="title-page">
LOVE IN ACTION

<i>Writings on</i>

<i>Nonviolent Social Change</i>

Thich Nhat Hanh

Foreword by

Daniel Berrigan

Parallax Press

Berkeley, California
</section>

<section level="1" type="play">
<title>
The Path of Return

Continues the Journey
</title>

<subtitle><i>A Play in One Act</i></subtitle>

<section level="2" type="foreward">
<title>-Foreword-</title>

<epigraph>
Their Speech Is

All of Forgiveness
</epigraph>

<author><i>Daniel Berrigan, S.J.</i></author>

A tone poem, a snatch of gentle, celebratory  music. A verse about death, which is really about life. A transmutation of death; the emptying of the veins of death, the letting in of light and air to those fearful spaces. How does one cope wit

In [27]:
text_sub = tag_chapters(work_text)

In [28]:
print(text_sub)

<metadata>
    <title>Love in Action</title>
    <subtitle>Writtings on Nonviolent Social Change</subtitle>
    <author>Thich Nhat Hanh</author>
    <language>en</language>
    <publisher>Parallax Press</publisher>
    <date></date>
</metadata>

<section level="0" type="title-page">
LOVE IN ACTION

<i>Writings on</i>

<i>Nonviolent Social Change</i>

Thich Nhat Hanh

Foreword by

Daniel Berrigan

Parallax Press

Berkeley, California
</section>

<section level="1" type="play">
<title>
The Path of Return

Continues the Journey
</title>

<subtitle><i>A Play in One Act</i></subtitle>

<section level="2" type="foreward">
<title>-Foreword-</title>

<epigraph>
Their Speech Is

All of Forgiveness
</epigraph>

<author><i>Daniel Berrigan, S.J.</i></author>

A tone poem, a snatch of gentle, celebratory  music. A verse about death, which is really about life. A transmutation of death; the emptying of the veins of death, the letting in of light and air to those fearful spaces. How does one cope wit

In [None]:
# write_text_to_file("working_LiA2.txt", text_sub)

In [77]:
work_text2 = get_text_from_file("TH_working_metadata_tagged.txt")

In [78]:
work_text2 = normalize_quotes(work_text2)

In [90]:
import warnings

# ANSI code for red text
RED = "\033[91m"
RESET = "\033[0m"

# Custom function to format and color warning output
def custom_showwarning(message, category, filename, lineno, file=None, line=None):
    # Print the warning message in red, excluding file path and line info
    print(f"{RED}{category.__name__}: {message}{RESET}")

# Define a custom warning class
class LineFormatWarning(UserWarning):
    """Warning for issues with line formatting in text parsing."""
    pass

# Override the default warning output behavior
warnings.showwarning = custom_showwarning

def process_text_string(text):
    output = []
    inside_list = False  # State to track if we're inside a <list>

    # Regular expressions for detecting patterns
    tag_pattern = re.compile(r'^<.*>$')
    list_item_pattern = re.compile(r'^\d+\.\s')
    list_start_pattern = re.compile(r'^1\.\s')
    paragraph_start_pattern = re.compile(r"^[\u0022\u201C\u201D'A-Z0-9\-\#\*]")

    # Split the input text into lines
    for idx, line in enumerate(text.splitlines()):
        stripped_line = line.strip()
        
        # print(f"line: {stripped_line} | match: {paragraph_start_pattern.match(stripped_line)}")

        # Ignore lines that start with a tag
        if tag_pattern.match(stripped_line):
            
            if inside_list:
                output.append('</list>')
                inside_list = False

            output.append(stripped_line)
            continue

        # Ignore all blank lines as these are not considered significant to text structure
        if not stripped_line:
            output.append("")
            continue
        
        # Check if the line is a numbered list item
        elif list_item_pattern.match(stripped_line):
            if not inside_list and list_start_pattern.match(stripped_line):
                output.append('<list>')  # Start the list if not already inside one
                inside_list = True
            if inside_list:
                output.append(f'    <item>{stripped_line}</item>')
            else: # treat as a paragraph
                warnings.warn(f"Out of order numbered line detected at line {idx}:\n\t'{stripped_line}'", LineFormatWarning)
        
        # Check if the line should be a paragraph
        elif paragraph_start_pattern.match(stripped_line):
            # If we're inside a list and encounter a paragraph, close the list
            if inside_list:
                output.append('</list>')
                inside_list = False
            output.append(f'<p>{stripped_line}</p>')
        
        elif stripped_line[0].islower():
            # flag this line as improper:
            warnings.warn(f"Uncapitalized line detected at line {idx}:\n\t'{stripped_line}'\n\tParagraph <p> tags added.", LineFormatWarning)
            # still make a paragraph:
            output.append(f'<p>{stripped_line}</p>')
            # still close a list in this case:
            if inside_list:
                output.append('</list>')
                inside_list = False    
        
        else:
            warnings.warn(f"Unrecognized line format at line {idx}:\n'{stripped_line}'\n\tParagraph <p> tags added.", LineFormatWarning)
            output.append(f'<p>{stripped_line}</p>')


    # Close any remaining open list at the end of the text
    if inside_list:
        output.append('</list>')

    return '\n'.join(output)

# Usage
text = """
<section level="2"><sutra-section>I.</sutra-section></section>

Capitalized 
"What are the Four Establishments?

"""

processed_text = process_text_string(text)
print(processed_text)


<section level="2"><sutra-section>I.</sutra-section></section>

<p>Capitalized</p>
<p>"What are the Four Establishments?</p>



In [91]:
update_text2 = process_text_string(work_text2)

	'is like sitting under the Bodhi tree.'
	Paragraph <p> tags added.[0m
	'entirely free from distraction.'
	Paragraph <p> tags added.[0m
	'the afflictions are lifted.'
	Paragraph <p> tags added.[0m
	'and the awakened mind is born.'
	Paragraph <p> tags added.[0m
	'will die.'
	Paragraph <p> tags added.[0m
	'www.parallax.org'
	Paragraph <p> tags added.[0m
	'info@plumvillage.org'
	Paragraph <p> tags added.[0m
	'mfmaster@vermontel.net'
	Paragraph <p> tags added.[0m
	'deerpark@plumvillage.org'
	Paragraph <p> tags added.[0m
	'www.parallax.org'
	Paragraph <p> tags added.[0m
	'eISBN : 978-1-935-20929-4'
	Paragraph <p> tags added.[0m


In [92]:
print(update_text2)

<book>
<metadata>
<title>Transformation and Healing: Sutra on the Four Establishments of Mindfulness</title>
<author>Thich Nhat Hanh</author>
<language>en</language>
<publisher>Parallax Press</publisher>
<date>2006-08-16T15:00:00+00:00</date>
</metadata>

<toc>
<section level="1"><note>A NOTE ON THE TEXT</note></section>
<section level="1"><introduction>Introduction</introduction></section>
<section level="1"><sutra>Sutra on the Four Establishments of Mindfulness</sutra></section>
<section level="1"><summary>Summary of the Sutra</summary></section>

<section level="1"><exercise_group>Mindfulness Exercises</exercise_group></section>
<section level="2"><exercise_group>EXERCISES FOR OBSERVING THE BODY</exercise_group></section>
<section level="2"><note>REMARKS ON THE FIRST NINE EXERCISES</note></section>
<section level="2"><exercise_group>EXERCISES FOR OBSERVING THE FEELINGS</exercise_group></section>
<section level="2"><exercise_group>EXERCISES FOR OBSERVING THE MIND</exercise_group></se

In [93]:
def transform_section_tags(xml_text):
    # Parse the XML text
    root = etree.fromstring(xml_text)
    
    # Find all sections to process them
    sections = root.findall(".//section")
    
    for section in sections:
        # Retrieve the first child (descriptor tag, e.g., <note> or <introduction>)
        descriptor_tag = section[0]
        
        # Get the tag type and content
        tag_type = descriptor_tag.tag
        tag_content = descriptor_tag.text
        
        # Update the section tag with type attribute and title element
        section.set("type", tag_type)  # Add `type` attribute to section
        title_element = etree.Element("title")
        title_element.text = tag_content  # Add title element text
        
        # Replace the descriptor tag with the title element
        section.replace(descriptor_tag, title_element)
        
        # Move subsequent paragraphs into the current section
        next_sibling = section.getnext()
        while next_sibling is not None and next_sibling.tag != "section":
            # Append the sibling paragraph to the current section
            section.append(next_sibling)
            next_sibling = section.getnext()  # Update to next sibling

    # Return the modified XML as a string
    return etree.tostring(root, pretty_print=True).decode()

# Sample input XML text
xml_text = '''
<root>
    <section level="1"><note>A NOTE ON THE TEXT</note></section>
    <p>The word for a Buddhist scripture, the teachings of the Buddha...</p>
    <p>The word satipatthana (Sanskrit: smrityupasthana) is a compound...</p>
    <p>For ease of use, the text of the original sutra has been kept...</p>
    <section level="1"><introduction>Introduction: What Is Mindfulness?</introduction></section>
    <p>Mindfulness is the foundation of Buddhist practice...</p>
</root>
'''

# Run the transformation
transformed_xml = transform_section_tags(xml_text)
print(transformed_xml)

<root>
    <section level="1" type="note"><title>A NOTE ON THE TEXT</title><p>The word for a Buddhist scripture, the teachings of the Buddha...</p>
    <p>The word satipatthana (Sanskrit: smrityupasthana) is a compound...</p>
    <p>For ease of use, the text of the original sutra has been kept...</p>
    </section>
    <section level="1" type="introduction"><title>Introduction: What Is Mindfulness?</title><p>Mindfulness is the foundation of Buddhist practice...</p>
</section>
    </root>



In [None]:
# write_text_to_file("TH_working2.txt", update_text2)

In [133]:
work_text3 = get_text_from_file("TH_working3.xml")

In [134]:
def validate_xml(xml_text):
    errors = []  # List to collect error messages

    # First pass to collect all errors without recovery
    parser_no_recover = etree.XMLParser(recover=False)
    try:
        etree.fromstring(xml_text, parser_no_recover)
    except etree.XMLSyntaxError as e:
        errors.extend(e.error_log)  
    
    return errors

In [135]:
root = etree.fromstring(work_text3)

In [136]:
validate_xml(work_text3)

[]

In [110]:
from lxml import etree

def validate_and_repair_xml(xml_text):
    try:
        # Attempt to parse the XML text
        parser = etree.XMLParser(recover=True)  # recover=True allows minor repairs
        root = etree.fromstring(xml_text, parser)
        print("XML is well-formed and any minor errors have been repaired.")
        return etree.tostring(root, pretty_print=True).decode()  # Return repaired XML
    except etree.XMLSyntaxError as e:
        print("XML Syntax Error detected:", e)
        return None

# Sample XML input with potential issues
xml_text = '''
<root>
    <section level="1"><note>A NOTE ON THE TEXT</note></section>
    The word for a Buddhist scripture, the teachings of the Buddha...
    <section level="1"><introduction>Introduction: What Is Mindfulness?</introduction>
    Mindfulness is the foundation of Buddhist practice...
</root>
'''

# Run validation and repair
repaired_xml = validate_and_repair_xml(xml_text)
if repaired_xml:
    print(repaired_xml)
else:
    print("The XML could not be repaired and requires manual correction.")

XML is well-formed and any minor errors have been repaired.
<root>
    <section level="1"><note>A NOTE ON THE TEXT</note></section>
    The word for a Buddhist scripture, the teachings of the Buddha...
    <section level="1"><introduction>Introduction: What Is Mindfulness?</introduction>
    Mindfulness is the foundation of Buddhist practice...
</section>
</root>



In [113]:
work_text3 = validate_and_repair_xml(work_text3)

XML is well-formed and any minor errors have been repaired.


In [114]:
tag_fix_text2 = transform_section_tags(work_text3)

In [115]:
write_text_to_file("TH_working3.txt", tag_fix_text2)

In [7]:
current_xml = get_text_from_file("TH_working3.xml")

In [9]:
from lxml import etree

def transform_exercise_quotes(xml_content):
    """
    Parses an XML document to replace <p> tags with <sutra-quote> tags 
    within <section> elements of level 3 and type 'exercise', specifically 
    for paragraphs immediately following the title.
    
    Parameters:
        xml_content (str): XML content as a string.
        
    Returns:
        str: Modified XML as a string.
    
    Example:
        transformed_xml = transform_exercise_quotes(xml_content)
    """
    # Parse the XML content
    root = etree.fromstring(xml_content)
    
    # Find all level 3 exercise sections
    for section in root.xpath('//section[@level="3" and @type="exercise"]'):
        # Locate the <title> and then check the next element
        title = section.find('title')
        if title is not None:
            # Get the next element after <title> and check if it's <p>
            next_element = title.getnext()
            if next_element is not None and next_element.tag == 'p':
                # Create <sutra-quote> element and copy the text from <p>
                sutra_quote = etree.Element("sutra-quote")
                sutra_quote.text = next_element.text
                
                # Replace <p> with <sutra-quote>
                section.replace(next_element, sutra_quote)
    
    # Convert the modified XML tree back to string
    return etree.tostring(root, pretty_print=True, encoding='unicode')

# Example usage
xml_content = '''<root>
    <section level="3" type="exercise">
        <title>Exercise 5 | Positions of the Body</title>
        <p>Moreover, when a practitioner walks, he is aware, 'I am walking.' When he is
            standing, he is aware, 'I am standing.' When he is sitting, he is aware, 'I am
            sitting.' When he is lying down, he is aware, 'I am lying down.' In whatever
            position his body happens to be, he is aware of the position of his body.</p>
    </section>
</root>'''

transformed_xml = transform_exercise_quotes(xml_content)
print(transformed_xml)

<root>
    <section level="3" type="exercise">
        <title>Exercise 5 | Positions of the Body</title>
        <sutra-quote>Moreover, when a practitioner walks, he is aware, 'I am walking.' When he is
            standing, he is aware, 'I am standing.' When he is sitting, he is aware, 'I am
            sitting.' When he is lying down, he is aware, 'I am lying down.' In whatever
            position his body happens to be, he is aware of the position of his body.</sutra-quote></section>
</root>



In [10]:
tx_quotes = transform_exercise_quotes(current_xml)

In [12]:
print(tx_quotes)

<book>
    <metadata>
        <title>Transformation and Healing: Sutra on the Four Establishments of Mindfulness</title>
        <author>Thich Nhat Hanh</author>
        <language>en</language>
        <publisher>Parallax Press</publisher>
        <date>2006-08-16T15:00:00+00:00</date>
    </metadata>

    <toc>
        <section level="1" type="note">
            <title>A NOTE ON THE TEXT</title>
        </section>
        <section level="1" type="introduction">
            <title>Introduction</title>
        </section>
        <section level="1" type="sutra">
            <title>Sutra on the Four Establishments of Mindfulness</title>
        </section>
        <section level="1" type="summary">
            <title>Summary of the Sutra</title>
        </section>

        <section level="1" type="exercise_group">
            <title>Mindfulness Exercises</title>
        </section>
        <section level="2" type="exercise_group">
            <title>EXERCISES FOR OBSERVING THE BODY</title>


In [13]:
write_text_to_file("TH_working4.xml", tx_quotes)