In [None]:
from v2_cleaning_scripts import process_book

In [None]:
# Read the raw book text. The encoding is pinned so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the .txt file is UTF-8 — confirm against the source file.
with open('../books/transformation_and_healing-thich_nhat_hanh.txt', 'r', encoding='utf-8') as f:
    book_text = f.read()

In [None]:
import spacy
from spacy import displacy

# Load spaCy's 'en_core_web_sm' pipeline. Its sentence boundaries (doc.sents) and
# entities (doc.ents) are what the cells below inspect; chapter detection in
# process_book is a plain keyword check applied to those sentences.
nlp = spacy.load('en_core_web_sm')

def process_book(book_text, nlp_model=None):
    """
    Split a book into chapters using spaCy sentence segmentation.

    A sentence is treated as a chapter heading only when it *starts* with the
    word "chapter" (case-insensitive, ignoring surrounding whitespace).
    Sentences that merely mention the word, e.g. "as we saw in the last
    chapter", stay in the body of the current chapter instead of opening a
    new one.

    NOTE: this definition shadows the ``process_book`` imported from
    ``v2_cleaning_scripts`` at the top of the notebook.

    Args:
        book_text (str): Raw text of the book.
        nlp_model (callable, optional): Callable that takes the text and
            returns an object exposing ``.sents`` (items with a ``.text``
            attribute). Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        list: ``(chapter_title, chapter_content)`` tuples in document order,
        where the content is the chapter's sentences joined by single spaces.
        Text that appears before the first heading is discarded, and an empty
        list is returned when no heading is found.
    """
    import re  # local import so this cell does not depend on editing the import cells

    # "chapter" at the very start, not followed by another letter
    # (accepts "Chapter 1", "CHAPTER ONE", "Chapter:"; rejects "Chapters ...").
    heading_pattern = re.compile(r"chapter(?![a-z])", re.IGNORECASE)
    parse = nlp if nlp_model is None else nlp_model

    chapters = []
    chapter_title = ""
    chapter_content = []

    for sent in parse(book_text).sents:
        sentence = sent.text.strip()
        if heading_pattern.match(sentence):
            # A new heading closes the chapter collected so far, if there is one.
            if chapter_title:
                chapters.append((chapter_title, " ".join(chapter_content)))
            chapter_title = sentence
            chapter_content = []
        else:
            chapter_content.append(sentence)

    # Flush the final chapter, which has no following heading to close it.
    if chapter_title:
        chapters.append((chapter_title, " ".join(chapter_content)))

    return chapters

In [None]:
# Parse the whole book once; the resulting Doc is reused by the inspection cells below.
doc = nlp(book_text)

In [None]:
# Materialise the sentence spans into a list so they can be counted and sliced.
sents = [s for s in doc.sents]

In [None]:
len(sents)

In [None]:
# Print the first 200 sentences between >> << markers so that whitespace and line
# breaks inside a detected "sentence" are visible.
for s in sents[:200]:
    print(f">>{s}<<")

In [None]:
# Same inspection for the first 200 entries of doc.ents, with the label of each.
entlist = list(doc.ents)
for ent in entlist[:200]:
    print(f">>{ent.text}, {ent.label_}<<")

In [None]:
# Split the book into chapters and preview the first 200 characters of each.
# The process_book defined in this notebook calls nlp(book_text) itself, so the
# book is parsed a second time here.
processed_chapters = process_book(book_text)
for title, content in processed_chapters:
    print(f"{title}: {content[:200]}...")

In [None]:
from v2_cleaning_scripts import classify_heading

In [None]:
import torch
# Check if PyTorch's MPS backend is available for GPU computations on macOS.
print(torch.backends.mps.is_available())

In [None]:
# Smoke-test classify_heading on an all-caps sample line. When the notebook is run
# top to bottom this is the version imported from v2_cleaning_scripts; the
# notebook-local definition only appears in a later cell.
sample_text = "EXERCISES FOR OBSERVING THE BODY"
if classify_heading(sample_text):
    print(f"'{sample_text}' is classified as a heading.")
else:
    print(f"'{sample_text}' is classified as body text.")

In [None]:
from transformers import pipeline
import torch

# Use the MPS backend when PyTorch reports it as available, otherwise stay on the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the pre-trained BART-large MNLI model for zero-shot classification.
# The device string is passed through as-is: an integer such as 0 is a GPU ordinal
# and does not reliably select the MPS device.
classifier = pipeline('zero-shot-classification', model="facebook/bart-large-mnli", device=device)



In [None]:
# Example: score one sentence against an ad-hoc mix of topical labels ("mind",
# "presence", ...) and structural labels ("sentence", "paragraph").
text = "Mindfulness is the practice of being fully aware of the present moment."
candidate_labels = ["mind", "presence", "direct", "awareness", "sentence", "paragraph"]

# Run the zero-shot pipeline and print its raw output.
# NOTE(review): `result` is reassigned by later cells (line classification, NER output).
result = classifier(text, candidate_labels)
print(result)

In [None]:
def classify_heading(text, zero_shot_classifier=None):
    """
    Classify a piece of text as a heading or not using a zero-shot classifier.

    The text is scored against the labels "heading" and "paragraph"; it counts
    as a heading when "heading" is the first entry of the returned labels.

    NOTE: this definition shadows the ``classify_heading`` imported from
    ``v2_cleaning_scripts`` earlier in the notebook.

    Args:
        text (str): The text to classify.
        zero_shot_classifier (callable, optional): Called as
            ``zero_shot_classifier(text, candidate_labels)``; must return a
            mapping whose ``'labels'`` entry lists the candidate labels with
            the best match first. Defaults to the notebook-level
            ``classifier`` pipeline.

    Returns:
        bool: True if the text is classified as a heading, otherwise False.
        Empty or whitespace-only text is never a heading and is not sent to
        the classifier.
    """
    if not text or not text.strip():
        return False

    classify = classifier if zero_shot_classifier is None else zero_shot_classifier

    # "heading" has to be one of the candidates: the comparison below can only
    # succeed for a label that was actually offered to the classifier.
    candidate_labels = ["heading", "paragraph"]

    result = classify(text, candidate_labels)
    return result['labels'][0] == "heading"

In [None]:
# Try the classify_heading defined in the cell above on an all-caps sample line.
classify_heading("EXERCISES FOR OBSERVING THE BODY")

In [None]:
def classify_text_lines(text, classifier):
    """
    Label every non-blank line of ``text`` as "heading" or "non-heading".

    Each line is handed, unmodified, to ``classifier`` together with the
    candidate labels "heading" and "paragraph". A line is labelled "heading"
    when that is the first label in the classifier's result and "non-heading"
    otherwise. Lines that are empty or whitespace-only are skipped.

    Args:
        text (str): The full text to process line by line.
        classifier (pipeline): Hugging Face classifier pipeline.

    Returns:
        list: ``(line, classification)`` tuples, in the order the lines appear.

    Example:
        sample_text = (
            "EXERCISES FOR OBSERVING THE BODY\\n"
            "\\n"
            "The First Establishment of Mindfulness is the body, ..."
        )
        for line, classification in classify_text_lines(sample_text, classifier):
            print(f"Line: {line}\\nClassification: {classification}\\n")
    """

    def label_for(raw_line):
        # One classifier call per line; only the top-ranked label matters.
        top_label = classifier(raw_line, ["heading", "paragraph"])['labels'][0]
        return "heading" if top_label == "heading" else "non-heading"

    return [
        (raw_line, label_for(raw_line))
        for raw_line in text.splitlines()
        if raw_line.strip()
    ]



In [None]:
# Classify every non-blank line of the book. classify_text_lines invokes the
# classifier once per line, which can be slow on a full book.
result = classify_text_lines(book_text, classifier)

In [None]:
# First 20 (line, classification) pairs.
result[:20]

In [None]:
# The next 10 pairs.
result[20:30]

In [None]:
from transformers import pipeline
# NER pipeline. Keep GPU ordinal 0 when CUDA is present; otherwise reuse the
# `device` string selected in the earlier cell ("mps" or "cpu") instead of a
# hard-coded ordinal that fails on machines without a matching accelerator.
ner_device = 0 if torch.cuda.is_available() else device
ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", device=ner_device)
result = ner("EXERCISES FOR OBSERVING THE BODY")
print(result)

In [None]:
# Probe the NER pipeline with short single-phrase inputs: a place name, ...
ner("Paris")

In [None]:
# ... a person's name, ...
ner("Thich Nhat Hanh")

In [None]:
# ... and a single common word, to see whether it gets tagged at all.
ner("Mindfulness")