In [None]:
from bs4 import BeautifulSoup
from ebooklib import epub
import spacy

#from initial_cleaning_scripts import extract_text_and_metadata, clean_and_tag_text

In [None]:
import spacy

# spaCy's small English pipeline provides the part-of-speech tags used below
nlp = spacy.load("en_core_web_sm")

def is_title_case(text):
    """
    Decide whether a string is title-cased, using spaCy part-of-speech tags.

    The first token must be title-cased, and so must every token tagged as a
    noun, proper noun, adjective, verb, adverb or pronoun. "Title-cased" is
    judged with str.istitle() on the token text. Tokens carrying any other
    tag (prepositions, determiners, conjunctions, punctuation, ...) are not
    inspected, so they may appear in either case.

    Args:
        text (str): The string to check.

    Returns:
        bool: True if the string passes the checks above; False otherwise,
        including for an empty string.

    Example use cases (results depend on the tagger's output):
        print(is_title_case("My Love Is Kind"))  # True
        print(is_title_case("my Love Is Kind"))  # False (first word not capitalized)
        print(is_title_case("The Quick Brown Fox Jumps Over the Lazy Dog"))  # True
        print(is_title_case("The quick Brown Fox Jumps Over the Lazy Dog"))  # False
        print(is_title_case("A Tale of Two Cities"))  # True
    """
    if not text:
        return False

    tokens = nlp(text)

    # The opening token must be capitalized whatever its part of speech
    if not tokens[0].text.istitle():
        return False

    # Content words plus pronouns: these must be capitalized
    must_capitalize = {"NOUN", "PROPN", "ADJ", "VERB", "ADV", "PRON"}

    # Function words are skipped on purpose: a capitalized one is tolerated,
    # since it may simply be a typo.
    return all(
        token.text.istitle()
        for token in tokens
        if token.pos_ in must_capitalize
    )



In [None]:
# Articles, conjunctions and prepositions that a title may leave in lowercase.
_TITLE_MINOR_WORDS = frozenset((
    "a", "an", "the", "and", "but", "or", "nor", "for", "so", "yet", "as", "if", "because", "than",
    "that", "though", "when", "where", "while", "although", "at", "by", "from", "in", "of",
    "on", "to", "up", "with", "off", "out", "over", "into", "near", "upon", "onto", "down", "about",
    "across", "after", "along", "around", "before", "behind", "below", "beneath", "beside", "between",
    "beyond", "during", "except", "inside", "outside", "through", "under", "until", "within", "without",
    "once", "like", "now", "since", "till",
))


def is_title_case_fast(text):
    """
    Check if the given text follows proper title case rules:
    - First and last word are capitalized.
    - All other words, except members of a predefined list of minor words
      (articles, conjunctions, prepositions), are capitalized.
    - The second character of a checked word must not be uppercase. A
      non-letter there (apostrophe, period, hyphen, digit) is accepted, so
      "O'Brien" passes while "IV." does not.

    Args:
    text (str): The input string (title) to check.

    Returns:
    bool: True if the text follows title case rules, False otherwise
    (including for an empty or whitespace-only string).

    # Example usage:
    print(is_title_case_fast("The Fault in Our Stars"))  # True
    print(is_title_case_fast("Gone with the Wind"))      # True
    print(is_title_case_fast("IV."))                     # False (second letter is capitalized, not valid)
    print(is_title_case_fast("To Kill a Mockingbird"))   # True
    print(is_title_case_fast("The O'Brien Family"))      # True
    """
    words = text.split()

    if not words:
        return False  # Empty string case

    # First and last word must start with an uppercase letter, even if they are minor words
    if not (words[0][0].isupper() and words[-1][0].isupper()):
        return False

    for word in words:
        # Minor words may appear in any case
        if word.lower() in _TITLE_MINOR_WORDS:
            continue
        if not word[0].isupper():
            return False
        # Reject only an uppercase second character; punctuation and digits
        # are not letters and must not disqualify the word.
        if len(word) > 1 and word[1].isupper():
            return False

    return True



In [None]:
# Segment rules used with the TH book. Each entry lists the tags to scan, the
# regex patterns searched in a tag's text, and the predicate tests tried when
# no pattern matches.
segment_rules_TH = dict(
    part=dict(
        tags=['p'],  # only <p> for now; extend (e.g. with 'span') if headings live in other tags
        patterns=[r"Part \d+", r"Book \d+"],
        tests=[],
    ),
    chapter=dict(
        tags=['p'],
        patterns=[r"Chapter \d+", r"Ch\. \d+"],
        tests=[],
    ),
    section=dict(
        tags=['p'],
        patterns=[r"Section \d+", r"Sec\. \d+", "Table of Contents", "Introduction", "Forward"],
        tests=[is_title_case_fast],
    ),
    sub_section=dict(
        tags=['p', 'span'],
        patterns=["Summary", "Notes", "Sutra", "Exercise"],
        tests=[str.isupper],  # all-caps text counts as a sub-section heading
    ),
)

In [None]:
# Segment rules used with the LiA book. Same layout as the TH rules, but
# <blockquote> is scanned as well and headings may be wrapped in dashes
# (e.g. "- Chapter One -").
segment_rules_LiA = {
    "part": {
        "tags": ['p', 'blockquote'],
        "patterns": [r"Part \d+", r"Book \d+"],
        "tests": []
    },
    "chapter": {
        "tags": ['p', 'blockquote'],
        "patterns": [
            r"Chapter \d+",
            # The comma after the next entry matters: without it Python joins
            # this literal with the following one into a single pattern.
            r"Ch\. \d+",
            r"-\s*Chapter\s+\d+\s*-",        # Matches "-Chapter 1-", "- Chapter 1 -" with spaces around "-"
            r"-\s*Chapter\s+[A-Za-z]+\s*-",  # Matches "-Chapter One-", "- Chapter One -" with spaces around "-"
            r"-\s*Ch\.\s*\d+\s*-",           # Matches "-Ch. 1-", "- Ch. 1 -" with spaces around "-"
            r"-\s*Ch\.\s*[A-Za-z]+\s*-",     # Matches "-Ch. One-", "- Ch. One -" with spaces around "-"
        ],
        "tests": []
    },
    "section": {
        "tags": ['p', 'blockquote'],
        "patterns": [r"Section \d+", r"Sec\. \d+", "Table of Contents", r"-\s*Introduction\s*-", r"-\s*Forward\s*-"],
        "tests": [is_title_case_fast]
    },
    "sub_section": {
        "tags": ['p', 'span', 'blockquote'],
        "patterns": ["Summary", "Notes", "Sutra", "Exercise"],
        "tests": [lambda text: text.isupper()]  # all-caps text counts as a sub-section heading
    }
}

In [None]:
# LiA_paragraph_rules = {
#     "paragraph": {
#         "tags": ['blockquote'],  # list of tags
#         "attributes": []  # list of attributes
#     },
#     "quote": [],
#     "emphasized": ['bold']
# }

In [None]:
def upper_test(text):
    """Return True when every cased character in `text` is uppercase (str.isupper)."""
    return text.isupper()

upper_test("THIS IS AN ALL CAPS SENTENCE.")

In [None]:
def clean_text(text):
    """
    Normalize whitespace-like characters in a string.

    Every newline, tab and non-breaking space (U+00A0) is replaced by one
    regular space. The replacement is one for one, so runs of such
    characters are not collapsed. The result is then stripped of leading
    and trailing whitespace.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.

    Example:
        >>> clean_text("This is\\nan example\\ttext with\\xa0extra spaces.")
        'This is an example text with extra spaces.'

    """
    # Translation table: each unwanted character maps to a plain space.
    # Add more entries here as needed.
    to_space = str.maketrans({
        '\n': ' ',
        '\t': ' ',
        '\xa0': ' ',
    })

    return text.translate(to_space).strip()

In [None]:
# Quick check: the newline, tab and non-breaking space should each come back as a plain space
text = "This is\nan example\ttext with\xa0extra spaces."
clean_text(text)

In [None]:
import re

def sent_with_punctuation(text):
    """
    Check if a sentence ends with valid English sentence-ending punctuation.

    A sentence qualifies when, after surrounding whitespace is stripped, it
    ends with '.', '?' or '!', optionally followed by one closing quotation
    mark. Both straight quotes (" and ') and typographic closing quotes
    (U+201D and U+2019) are accepted.

    Args:
        text (str): The sentence to check.

    Returns:
        bool: True if the sentence ends with a valid punctuation mark, False otherwise.
    """
    # Terminal mark, then at most one closing quote (straight or typographic)
    pattern = r'[.?!]["\'\u201d\u2019]?$'

    return bool(re.search(pattern, text.strip()))

# Example use cases:
print(sent_with_punctuation('This is a valid sentence.'))  # True
print(sent_with_punctuation('Is this a question?'))  # True
print(sent_with_punctuation('He exclaimed, "Amazing!"'))  # True
print(sent_with_punctuation('It\'s a test.'))  # True
print(sent_with_punctuation('This sentence has no punctuation'))  # False

In [None]:
import re

def is_valid_heading(text, tag):
    """
    Heuristically decide whether a piece of text looks like a heading or chapter title.

    The checks run in this order, and the first one that applies decides:
      1. Starts with Roman-numeral letters followed by a period -> heading.
      2. Fewer than 3 characters after stripping -> not a heading.
      3. All upper case -> heading.
      4. More than 10 whitespace-separated words -> not a heading.
      5. Passes is_title_case -> heading.
      6. Contains one of a few common heading words -> heading.

    Args:
        text (str): The text to check.
        tag (Tag): The BeautifulSoup tag that contains the text. It is accepted
            for the caller's convenience but is not inspected by the current checks.

    Returns:
        bool: True if the text is likely a heading, False otherwise.
    """
    candidate = text.strip()

    # Chapter markers such as "IV." are accepted before the length filter,
    # so short markers like "I." still count.
    if re.match(r'^[IVXLCDM]+\.', candidate):
        return True

    # Too short to be meaningful
    if len(candidate) < 3:
        return False

    # All-caps lines are treated as headings regardless of length
    if candidate.isupper():
        return True

    # Long lines are body text, not headings
    if len(candidate.split()) > 10:
        return False

    # Title-style capitalization
    looks_like_title = is_title_case(candidate)
    if looks_like_title:
        return True

    # Fallback: a well-known heading keyword anywhere in the text
    heading_keywords = ("Chapter", "Exercise", "Sutra", "Introduction", "Appendix")
    return any(keyword in candidate for keyword in heading_keywords)

In [None]:
def extract_content_hierarchy(soup, segment_rules, paragraph_rules):
    """
    Extracts headings (parts, chapters, sections, sub-sections, ...) from a
    BeautifulSoup object according to the specified rules, then collects the
    remaining paragraphs, blockquotes and italic spans.

    For every segment type, each tag named in its rule is examined. The tag's
    cleaned text becomes a heading when it matches one of the rule's regex
    patterns (or, failing that, one of its predicate tests) and also passes
    is_valid_heading. A given text is recorded only once across the whole
    result: whatever was claimed as a heading is not repeated as a paragraph,
    quote or emphasized span.

    Args:
        soup (BeautifulSoup): The parsed HTML content from the EPUB.
        segment_rules (dict): Maps a segment name to a dict with a required
            'tags' list and optional 'patterns' (regexes) and 'tests'
            (callables taking the text) lists. Recognized segment names are
            "part", "chapter", "section", "sub_section", "poem" and
            "exercises"; any other name is filed under "other".
        paragraph_rules: Currently unused; kept so existing callers keep working.

    Returns:
        dict: Lists of extracted text keyed by "parts", "chapters", "sections",
        "sutras", "sub_sections", "poems", "excersizes", "paragraphs",
        "quotes", "emphasized" and "other", plus "segment_index" (always 0
        here; callers may overwrite it).
    """
    hierarchy = {
        "parts": [],
        "chapters": [],
        "sections": [],
        "sutras": [],
        "sub_sections": [],
        "poems": [],
        "excersizes": [],
        "paragraphs": [],  # Collect paragraphs and text content
        "quotes": [],
        "emphasized": [],
        "other": [],  # unrecognized headings
        "segment_index": 0
    }

    # Segment name -> bucket in `hierarchy`. The misspelled "excersizes" key is
    # kept as-is because it is part of the returned structure.
    bucket_for_segment = {
        "part": "parts",
        "chapter": "chapters",
        "section": "sections",
        "sub_section": "sub_sections",
        "poem": "poems",
        "exercises": "excersizes",
    }

    seen_text = set()
    for segment, rules in segment_rules.items():
        patterns = rules.get('patterns', [])
        tests = rules.get('tests', [])
        bucket = bucket_for_segment.get(segment, "other")

        for tag in soup.find_all(rules['tags']):
            # Clean exactly as the paragraph pass below does, so the
            # seen_text de-duplication compares like with like.
            text = clean_text(tag.get_text())

            if not text or text in seen_text:
                continue

            # Patterns are tried first; the tests run only when none matched
            matched = (
                any(re.search(pattern, text) for pattern in patterns)
                or any(test(text) for test in tests)
            )

            if matched and is_valid_heading(text, tag):
                hierarchy[bucket].append(text)
                seen_text.add(text)

    # Body content: paragraphs first, then blockquotes, then italic spans.
    # The order matters because each text is kept by the first bucket that sees it.
    body_sources = (
        ("paragraphs", soup.find_all('p')),
        ("quotes", soup.find_all('blockquote')),
        ("emphasized", soup.find_all('span', {'class': 'italic'})),
    )
    for bucket, tags in body_sources:
        for tag in tags:
            text = clean_text(tag.get_text())
            if text and text not in seen_text:  # Avoid adding empty or duplicate text
                hierarchy[bucket].append(text)
                seen_text.add(text)

    return hierarchy

In [None]:
def extract_text_and_metadata(file_path, segment_rules, paragraph_rules=None):
    """
    Read an EPUB file and return its metadata together with the content
    hierarchy extracted from each of its XHTML documents.

    Args:
        file_path (str): The path to the EPUB file to be processed.
        segment_rules (dict): Dictionary defining tags, patterns and tests for each segment type.
        paragraph_rules (dict, optional): Passed through unchanged to
            extract_content_hierarchy.

    Returns:
        dict: {"metadata": {...}, "content_hierarchy": [...]}.
        "metadata" may hold 'title', 'author', 'language', 'publisher' and
        'date', taken from the 'DC' namespace; a key is absent when the book
        provides no value for it. "content_hierarchy" holds one hierarchy
        dict per XHTML document that produced at least one chapter, section
        or sub-section. Each carries 'segment_index', the document's position
        among all XHTML items (skipped documents still count).
    """
    book = epub.read_epub(file_path)
    book_metadata = {}
    hierarchies = []

    # (result key, 'DC' field name, store None when the lookup raises KeyError)
    dc_lookups = (
        ('title', 'title', False),
        ('author', 'creator', False),
        ('language', 'language', True),
        ('publisher', 'publisher', True),
        ('date', 'date', True),
    )
    for key, field, none_on_error in dc_lookups:
        try:
            values = book.get_metadata('DC', field)
            if values:
                # First entry, first element: the value itself
                book_metadata[key] = values[0][0]
        except KeyError:
            print(f"{key.capitalize()} not found in 'DC' namespace.")
            if none_on_error:
                book_metadata[key] = None  # Handle missing metadata gracefully

    # Walk the HTML documents in the order the book yields them
    xhtml_items = book.get_items_of_media_type('application/xhtml+xml')
    for segment_number, item in enumerate(xhtml_items):
        soup = BeautifulSoup(item.get_body_content(), 'html.parser')
        hierarchy = extract_content_hierarchy(soup, segment_rules, paragraph_rules)

        # Keep only documents with some recognizable structure
        if any(hierarchy.get(name) for name in ("chapters", "sections", "sub_sections")):
            hierarchy['segment_index'] = segment_number
            hierarchies.append(hierarchy)

    return {
        "metadata": book_metadata,
        "content_hierarchy": hierarchies
    }

In [None]:
# Relative locations of the two source EPUBs
TH_book_path = "../books/private_books/Transformation_and_Healing-Thich_Nhat_Hanh.epub"
LiA_book_path = "../books/private_books/Love_in_Action-Thich_Nhat_Hanh.epub"
# Also load both books directly so their items can be inspected in later cells
TH_epub_book = epub.read_epub(TH_book_path)
LiA_epub_book = epub.read_epub(LiA_book_path)

In [None]:
# Run the full extraction on the TH book with its own rule set
mdata = extract_text_and_metadata(TH_book_path, segment_rules_TH)

In [None]:
# Inspect the extraction result for the TH book
mdata

In [None]:
# Same extraction for the LiA book; the result is only displayed, not stored
extract_text_and_metadata(LiA_book_path, segment_rules_LiA)

In [None]:
# Spot-check is_title_case on a heading that mixes parentheses, a lowercase "from" and a number
x = "Satipatthana Sutta (Theravada) from Majjhima Nikaya, 10."
is_title_case(x)

In [None]:
from clean_parse_tag import reduced_tags_and_text

In [None]:
# Parse the body of every XHTML item of the TH book into its own BeautifulSoup tree
soups = [BeautifulSoup(item.get_body_content(), 'html.parser') for item in TH_epub_book.get_items_of_media_type('application/xhtml+xml')]

In [None]:
# soups = [BeautifulSoup(item.get_body_content(), 'html.parser') for item in LiA_epub_book.get_items_of_media_type('application/xhtml+xml')]

In [None]:
import re

# Remove class attributes whose value matches `calibre.*`, mutating the trees in `soups` in place.
# NOTE: `del tag['class']` drops the whole attribute, so any other class on the same tag goes too.
for soup in soups:
    for tag in soup.find_all(True, class_=re.compile(r'calibre.*')):
        del tag['class']  # Removes the class attribute if it matches

In [None]:
# Apply the project helper to one sample document (index 3); the result is inspected in the next cell
sp = reduced_tags_and_text(soups[3])

In [None]:
# Show the reduced sample document as indented markup
print(sp.prettify())

In [None]:
from clean_parse_tag import extract_tags_by_attributes, get_all_tag_names, get_all_attributes_for_tag

In [None]:
# Union of the tag names used across every parsed document.
# The set is created once, before the loop, so that names from all documents
# accumulate and `tagset` exists even when `soups` is empty.
tagset = set()
for soup in soups:
    tagset.update(get_all_tag_names(soup))

print(tagset)

In [None]:
# Look at one sample document (index 4)
soups[4]

In [None]:
# Print the attribute dict of every <span> in the sample document (index 4)
for element in soups[4].find_all('span'):
    print(element.attrs)

In [None]:
# For each document, print what get_all_attributes_for_tag reports for every tag name in `tagset`
for soup in soups:
    for tag in tagset:
        print(get_all_attributes_for_tag(soup, tag))

In [None]:
# Attribute filter handed to extract_tags_by_attributes: <span> tags with class 'italic'.
# It has its own name so the set of tag names in `tagset` is not overwritten,
# and the result is not called `dict`, which would shadow the builtin.
italic_span_filter = {
    'span': {'class': 'italic'}
}
for soup in soups:
    italic_matches = extract_tags_by_attributes(soup, italic_span_filter)
    print(italic_matches)
    

In [None]:
# Attribute filter handed to extract_tags_by_attributes: <span> tags with an empty class value.
# It has its own name so the set of tag names in `tagset` is not overwritten,
# and the result is not called `dict`, which would shadow the builtin.
empty_class_span_filter = {
    'span': {'class': ''}
}
for soup in soups:
    empty_class_matches = extract_tags_by_attributes(soup, empty_class_span_filter)
    print(empty_class_matches)
    

In [None]:
def find_tag_content(soups, tag_name):
    """
    Collect every occurrence of a tag across several parsed documents.

    :param soups: List of BeautifulSoup objects representing parsed HTML documents
    :param tag_name: The name of the tag to search for (e.g., 'blockquote', 'div', etc.)
    :return: A flat list of the matching tag objects, in document order and
             then in the order `find_all` returns them within each document
    """
    return [
        match
        for document in soups
        for match in document.find_all(tag_name)
    ]

In [None]:
def find_tag_content_with_attr(soups, tag_name, attrs=None):
    """
    Collect every occurrence of a tag, optionally filtered by attributes,
    across several parsed documents.

    :param soups: List of BeautifulSoup objects representing parsed HTML documents
    :param tag_name: The name of the tag to search for (e.g., 'blockquote', 'div', etc.)
    :param attrs: Dictionary of tag attributes to match (e.g., {'class': 'italic'}).
                  None (the default) means "no attribute filter".
    :return: A flat list of the matching tag objects, in document order
    """
    # find_all expects a dict here; forwarding a bare None can be read as an
    # attribute filter rather than "no filter", so normalize it to an empty dict.
    attr_filter = {} if attrs is None else attrs

    return [
        match
        for document in soups
        for match in document.find_all(tag_name, attrs=attr_filter)
    ]

In [None]:
# Every <blockquote> across the parsed documents
find_tag_content(soups, "blockquote")

In [None]:
# Every <span class="italic"> across the parsed documents
find_tag_content_with_attr(soups, 'span', {'class': 'italic'})

In [None]:
# Flatten the markup in place: drop the class attribute from every tag and dissolve links.
# This mutates the trees held in `soups`, so class-based lookups in earlier
# cells give different results if re-run after this cell.
for soup in soups:
    for tag in soup.find_all(True):
        del tag['class']
    for a_tag in soup.find_all("a"):
        a_tag.unwrap()  # This removes the <a> tag but keeps the text inside


In [None]:
# Number of parsed documents
len(soups)

In [None]:
# First ten parsed documents, after the clean-up above
soups[0:10]

In [None]:
# NOTE(review): same file name as TH_book_path but under ../books/ rather than
# ../books/private_books/ — confirm which location is current.
trans_healing_book = epub.read_epub("../books/Transformation_and_Healing-Thich_Nhat_Hanh.epub")

In [None]:
# NOTE(review): same file name as LiA_book_path but under ../books/ rather than
# ../books/private_books/ — confirm which location is current.
love_in_action_book = epub.read_epub("../books/Love_in_Action-Thich_Nhat_Hanh.epub")

In [None]:
# Short aliases used by the comparison cells below
book1 = love_in_action_book
book2 = trans_healing_book

In [None]:
# Materialize the XHTML items of each book as lists so they can be indexed and counted
L1 = [x for x in book1.get_items_of_media_type('application/xhtml+xml')]
L2 = [x for x in book2.get_items_of_media_type('application/xhtml+xml')]

In [None]:
# How many XHTML items each book contains
len(L1), len(L2)

In [None]:
# Length of the body content of one sample item (index 20) from book1
y = L1[20].get_body_content()
len(y)

In [None]:
# Parse every item body of both books into BeautifulSoup trees
S1 = [BeautifulSoup(element.get_body_content(), 'html.parser') for element in L1]
S2 = [BeautifulSoup(element.get_body_content(), 'html.parser') for element in L2]

In [None]:
# Two sample documents (indices 8 and 9) from book2
S2[8:10]

In [None]:
def extract_content(soup):
    """
    Pull a chapter title and the body text out of one parsed document.

    The chapter title is the stripped text of the first <blockquote> or <span>
    whose text contains "Chapter", or None when there is none. The body is the
    stripped text of every <p> and <blockquote> that is not empty.

    Returns:
        tuple: (chapter_title, paragraphs), where chapter_title is a str or
        None and paragraphs is a list of str.
    """
    # Title: first blockquote/span mentioning "Chapter" (basic check)
    title_candidates = (
        node.get_text().strip()
        for node in soup.find_all(['blockquote', 'span'])
    )
    chapter_title = next(
        (candidate for candidate in title_candidates if "Chapter" in candidate),
        None,
    )

    # Body: paragraph and blockquote text, dropping empties
    stripped_blocks = (
        node.get_text().strip()
        for node in soup.find_all(['p', 'blockquote'])
    )
    paragraphs = [block for block in stripped_blocks if block]

    return chapter_title, paragraphs


In [None]:
# NOTE(review): `soup` is not assigned in this cell; it is whatever the last
# `for soup in soups` loop left bound — confirm this is the intended document.
extract_content(soup)

In [None]:
# Loads the same spaCy model name that is loaded next to is_title_case, rebinding `nlp`
nlp = spacy.load("en_core_web_sm")

In [None]:
# Run the pipeline on an empty string (the input that is_title_case guards against before indexing doc[0])
doc = nlp("")
