In [1]:
from v2_cleaning_scripts import process_book

In [25]:
with open('../books/transformation_and_healing-thich_nhat_hanh.txt', 'r') as f:
    book_text = f.read()

In [26]:
import spacy
from spacy import displacy

# Load a pretrained model, e.g., for sentence and chapter recognition
nlp = spacy.load('en_core_web_sm')

def process_book(book_text):
    """
    Process text to extract chapters and paragraphs using spaCy NLP model.
    
    Args:
        book_text (str): Raw text of the book.
    
    Returns:
        list: A list of (chapter_title, chapter_content) tuples.
    """
    doc = nlp(book_text)
    chapters = []
    chapter_title = ""
    chapter_content = []
    
    for sent in doc.sents:
        if "chapter" in sent.text.lower():
            # Store previous chapter if any
            if chapter_title:
                chapters.append((chapter_title, " ".join(chapter_content)))
            chapter_title = sent.text.strip()
            chapter_content = []
        else:
            chapter_content.append(sent.text.strip())
    
    if chapter_title:
        chapters.append((chapter_title, " ".join(chapter_content)))
    
    return chapters

In [27]:
doc = nlp(book_text)

In [30]:
sents = [s for s in doc.sents]

In [44]:
len(sents)

2278

In [48]:
for s in sents[:200]:
    print(f">>{s}<<")

>>Table of Contents





Title Page

A NOTE ON THE TEXT

Introduction

Sutra on the Four Establishments of Mindfulness

Summary of the Sutra



Mindfulness Exercises

EXERCISES FOR OBSERVING THE BODY

REMARKS ON THE FIRST NINE EXERCISES

EXERCISES FOR OBSERVING THE FEELINGS

EXERCISES FOR OBSERVING THE MIND

EXERCISES FOR OBSERVING THE OBJECTS OF MIND



Principles for the Practice of Mindfulness

DHARMAS ARE MIND

BE ONE WITH THE OBJECT OF OBSERVATION

TRUE MIND AND DELUDED MIND ARE ONE

THE WAY OF NO-CONFLICT

OBSERVATION IS NOT INDOCTRINATION



Conclusion

Appendix Three Versions of the Sutra

Copyright Page





A NOTE ON THE TEXT



<<
>>The word for a Buddhist scripture, the teachings of the Buddha, is sutta in Pali and sutra in Sanskrit.<<
>>Because a number of texts, Pali, Sanskrit, and Chinese, are cited throughout the commentary, we use the word sutra as if it were an English word and use the word sutta only when it is part of the proper name of a Pali sutta, such as Satipat

In [51]:
entlist = list(doc.ents)
for ent in entlist[:200]:
    print(f">>{ent.text}, {ent.label_}<<")

>>Four, CARDINAL<<
>>FIRST, ORDINAL<<
>>NINE, CARDINAL<<
>>the Practice of Mindfulness

DHARMAS ARE MIND

BE ONE, ORG<<
>>DELUDED MIND, PERSON<<
>>Copyright Page, PERSON<<
>>Buddhist, NORP<<
>>Buddha, PERSON<<
>>Pali, NORP<<
>>Sanskrit, GPE<<
>>Pali, PERSON<<
>>Sanskrit, GPE<<
>>Chinese, NORP<<
>>English, LANGUAGE<<
>>Satipatthana Sutta, GPE<<
>>Anapanasati Sutta, FAC<<
>>Sanskrit, GPE<<
>>smrityupasthana, GPE<<
>>sati, GPE<<
>>Chinese, NORP<<
>>Nian Chu, PERSON<<
>>Chu, PERSON<<
>>Nian Chu, PERSON<<
>>ten or fifteen years, DATE<<
>>first, ORDINAL<<
>>second, ORDINAL<<
>>Sanskrit, ORG<<
>>vipassana, PERSON<<
>>Sanskrit, ORG<<
>>Sutra, NORP<<
>>the Four Establishments of Mindfulness, ORG<<
>>Buddha, PERSON<<
>>Sutra, NORP<<
>>the Four Establishments of Mindfulness, ORG<<
>>Four, CARDINAL<<
>>Buddha, PERSON<<
>>twenty-five centuries ago, DATE<<
>>Sutra, NORP<<
>>the Four Establishments of Mindfulness, ORG<<
>>today, DATE<<
>>four, CARDINAL<<
>>four, CARDINAL<<
>>four, CARDINAL<<
>>The Fi

In [None]:
processed_chapters = process_book(book_text)
for title, content in processed_chapters:
    print(f"{title}: {content[:200]}...")

In [1]:
from v2_cleaning_scripts import classify_heading

  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <253997FD-685F-34A9-B3D7-4AF6DAE96CDF> /opt/anaconda3/envs/tnh-scholar/lib/python3.11/site-packages/torchvision/image.so
  warn(
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [1]:
import torch
print(torch.backends.mps.is_available())  # Check if MPS is available for GPU computations on macOS

True


In [2]:
# Test the classifier
sample_text = "EXERCISES FOR OBSERVING THE BODY"
if classify_heading(sample_text):
    print(f"'{sample_text}' is classified as a heading.")
else:
    print(f"'{sample_text}' is classified as body text.")

: 

In [None]:
from transformers import pipeline
import torch

# Check if MPS is available and set the device accordingly
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the pre-trained BERT model for classification
classifier = pipeline('zero-shot-classification', model="facebook/bart-large-mnli", device=0 if device == "mps" else -1)



In [18]:
# Example classification
text = "Mindfulness is the practice of being fully aware of the present moment."
candidate_labels = ["mind", "presence", "direct", "awareness", "sentence", "paragraph"]

# Perform classification
result = classifier(text, candidate_labels)
print(result)

{'sequence': 'Mindfulness is the practice of being fully aware of the present moment.', 'labels': ['mind', 'awareness', 'presence', 'direct', 'paragraph', 'sentence'], 'scores': [0.5224279761314392, 0.31954264640808105, 0.07851850241422653, 0.04390231892466545, 0.02076886221766472, 0.014839688315987587]}


In [3]:
def classify_heading(text):
    """
    Classifies text as heading or not using a Hugging Face model.
    
    Args:
        text (str): The text to classify.
    
    Returns:
        bool: True if the text is classified as a heading, otherwise False.
    """
    # Define candidate labels for zero-shot classification
    candidate_labels = ["label", "paragraph"]

    result = classifier(text, candidate_labels)
    return result['labels'][0] == "heading"

In [4]:
classify_heading("EXERCISES FOR OBSERVING THE BODY")

True

In [7]:
def classify_text_lines(text, classifier):
    """
    Iterate through each line of the input text and classify it as a heading or not.
    
    Args:
        text (str): The full text to process line by line.
        classifier (pipeline): Hugging Face classifier pipeline.

    Returns:
        list: A list of tuples with (line, classification).

    # Example usage
        sample_text = 
        
            EXERCISES FOR OBSERVING THE BODY

            The First Establishment of Mindfulness is the body, which includes the breath, the positions of the body,

    result = classify_text_lines(sample_text, classifier)
    for line, classification in result:
    print(f"Line: {line}\nClassification: {classification}\n")
    """

    
    classified_lines = []
    for line in text.splitlines():
        if line.strip():  # Skip empty lines
            result = classifier(line, ["heading", "paragraph"])
            classification = "heading" if result['labels'][0] == "heading" else "non-heading"
            classified_lines.append((line, classification))
    
    return classified_lines



In [10]:
result = classify_text_lines(book_text, classifier)

In [11]:
result[:20]

[('Table of Contents', 'heading'),
 ('Title Page', 'heading'),
 ('A NOTE ON THE TEXT', 'heading'),
 ('Introduction', 'heading'),
 ('Sutra on the Four Establishments of Mindfulness', 'heading'),
 ('Summary of the Sutra', 'non-heading'),
 ('Mindfulness Exercises', 'heading'),
 ('EXERCISES FOR OBSERVING THE BODY', 'heading'),
 ('REMARKS ON THE FIRST NINE EXERCISES', 'heading'),
 ('EXERCISES FOR OBSERVING THE FEELINGS', 'heading'),
 ('EXERCISES FOR OBSERVING THE MIND', 'heading'),
 ('EXERCISES FOR OBSERVING THE OBJECTS OF MIND', 'heading'),
 ('Principles for the Practice of Mindfulness', 'heading'),
 ('DHARMAS ARE MIND', 'heading'),
 ('BE ONE WITH THE OBJECT OF OBSERVATION', 'heading'),
 ('TRUE MIND AND DELUDED MIND ARE ONE', 'heading'),
 ('THE WAY OF NO-CONFLICT', 'heading'),
 ('OBSERVATION IS NOT INDOCTRINATION', 'heading'),
 ('Conclusion', 'heading'),
 ('Appendix Three Versions of the Sutra', 'heading')]

In [19]:
result[20:30]

TypeError: unhashable type: 'slice'

In [21]:
from transformers import pipeline
ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", device=0)
result = ner("EXERCISES FOR OBSERVING THE BODY")
print(result)

model.safetensors:  11%|#1        | 147M/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[]


In [23]:
ner("Paris")

[{'entity': 'I-LOC',
  'score': 0.96201396,
  'index': 1,
  'word': 'Paris',
  'start': 0,
  'end': 5}]

In [24]:
ner("Thich Nhat Hanh")

[{'entity': 'I-PER',
  'score': 0.7270181,
  'index': 1,
  'word': 'T',
  'start': 0,
  'end': 1},
 {'entity': 'I-PER',
  'score': 0.45529315,
  'index': 2,
  'word': '##hic',
  'start': 1,
  'end': 4},
 {'entity': 'I-PER',
  'score': 0.46189806,
  'index': 3,
  'word': '##h',
  'start': 4,
  'end': 5},
 {'entity': 'I-PER',
  'score': 0.6301262,
  'index': 4,
  'word': 'N',
  'start': 6,
  'end': 7},
 {'entity': 'I-PER',
  'score': 0.80088836,
  'index': 5,
  'word': '##hat',
  'start': 7,
  'end': 10},
 {'entity': 'I-PER',
  'score': 0.66048324,
  'index': 6,
  'word': 'Han',
  'start': 11,
  'end': 14}]

In [35]:
ner("Mindfulness")

[]