## Testing automated text tagging based on a TOC in text

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%aimport xml, re

In [None]:
import xml.etree.ElementTree as ET
import re

In [None]:
from clean_parse_tag import get_text_from_file, set_working_directory, write_text_to_file

In [None]:
def add_toc_tags(text):
    """
    Wrap heading lines in XML section tags taken from the text's TOC.

    The text must contain a ``<toc>...</toc>`` block whose ``<section>``
    elements carry a ``level`` attribute and one child element per heading,
    for example ``<section level="1"><chapter>Intro</chapter></section>``.
    Every line of the text that consists of exactly one of those heading
    titles (optionally followed by whitespace) is replaced by
    ``<section level="L"><tag>title</tag></section>``, where ``L`` and
    ``tag`` come from that title's own TOC entry.  Matching is
    case-sensitive.

    Args:
        text (str): The input text containing the TOC and text body.

    Returns:
        str: Modified text with metadata tags added.  Each tagged heading is
        preceded by a single newline and followed by two newlines; any
        whitespace or blank lines that directly followed the heading are
        consumed.  If no heading line matches, the text is returned
        unchanged.

    Raises:
        ValueError: If no ``<toc>...</toc>`` block is found in the text.
        xml.etree.ElementTree.ParseError: If the TOC block is not
            well-formed XML.

    Example:
        With the TOC entry ``<section level="1"><chapter>Intro</chapter>
        </section>``, a body line ``Intro`` becomes
        ``<section level="1"><chapter>Intro</chapter></section>``.
    """
    # Step 1: Extract TOC content
    toc_match = re.search(r'<toc>\s*(.*?)\s*</toc>', text, re.DOTALL)
    if not toc_match:
        raise ValueError("TOC section not found in the text.")

    # Step 2: Parse TOC XML and create a metadata map
    toc_root = ET.fromstring(f"<toc>{toc_match.group(1)}</toc>")

    titles = []           # TOC order, duplicates kept so each entry gets a pass
    entry_for_title = {}  # title -> (level, tag) of its first TOC entry
    for section in toc_root.findall(".//section"):
        level = section.get("level")
        for elem in section:
            # elem.text is None for empty elements such as <title/>.
            title = (elem.text or "").strip()
            if not title:
                # An empty title would compile to a pattern that matches
                # every newline in the text, so it is skipped.
                continue
            titles.append(title)
            entry_for_title.setdefault(title, (level, elem.tag))

    # Step 3: Build the replacement for a matched heading line
    def wrap_heading(match):
        heading = match.group(1)
        # Exact lookup: a substring test would tag "Part One" with the
        # metadata of an earlier, shorter entry such as "Part".
        level, tag = entry_for_title[heading]
        return f'\n<section level="{level}"><{tag}>{heading}</{tag}></section>\n\n'

    # Step 4: Process the text, applying tags where TOC titles are matched
    tagged_text = text
    for title in titles:
        # The title must start at the beginning of the string or right after
        # a newline, and be followed only by whitespace up to a newline.
        pattern = re.compile(rf"(?:^|\n)({re.escape(title)})\s*\n")
        tagged_text = pattern.sub(wrap_heading, tagged_text)

    return tagged_text

In [None]:
# Load the sample input for the tagging experiment.
# NOTE(review): set_working_directory presumably makes the relative file name
# below resolve inside ../books/private_books — confirm in clean_parse_tag.
set_working_directory("../books/private_books")
# test_text is reused by later cells; add_toc_tags needs it to contain a
# <toc>...</toc> block.
test_text = get_text_from_file('test_th_output.txt')

In [None]:
# Show the raw input so the TOC and the heading lines can be checked by eye.
print(test_text)

In [None]:
# Tag the heading lines listed in the TOC; raises ValueError if test_text
# contains no <toc>...</toc> block.
result = add_toc_tags(test_text)

In [None]:
# Save the tagged text for inspection.
# NOTE(review): presumably written relative to the working directory set
# earlier — confirm against write_text_to_file in clean_parse_tag.
write_text_to_file("toc_tagged_test.txt", result)

In [None]:
import re

# Scratch check of the heading pattern built in add_toc_tags: the title must
# sit on its own line, i.e. start at the beginning of the string or right
# after a newline, and be followed only by whitespace up to a newline.
text = "\nA NOTE ON THE TEXT\nAnother line"

pattern = re.compile(r"(?:^|\n)(A NOTE ON THE TEXT)\s*\n")

match = pattern.search(text)
if match:
    # The pattern has a single capturing group ((?:...) does not capture),
    # so the title is group 1; group(2) raises "IndexError: no such group".
    print("Matched:", match.group(1))  # Should print "A NOTE ON THE TEXT"
else:
    print("No match found")

In [None]:
# Variant of the heading pattern anchored with a bare `^` and compiled without
# re.MULTILINE, so it can only match at the very start of the searched string.
# Backslashes are doubled because this is not a raw string literal.
# NOTE(review): the escaped spaces look like pasted re.escape() output — confirm.
pattern = re.compile('^(A\\ NOTE\\ ON\\ THE\\ TEXT)\\s*\\n')
print(pattern)

In [None]:
# Assuming the cells ran in order, `text` is the sample string that begins
# with a newline; the start-anchored pattern then cannot match and `match`
# is None.
match = pattern.search(text)

In [None]:
# Display the search result: a Match object, or None when nothing matched.
print(match)