In [None]:
import spacy
import nltk
from nltk.tokenize import sent_tokenize
import time

In [None]:
def ensure_nltk_data():
    """Ensure the NLTK Punkt sentence-tokenizer data is actually usable.

    The pickled ``punkt`` resource is what older NLTK releases load, while
    newer releases load ``punkt_tab`` instead. Checking only for
    ``tokenizers/punkt`` can therefore succeed even though ``sent_tokenize``
    would still fail. This function probes the tokenizer itself and, if the
    probe fails, tries downloading each known resource name until the probe
    succeeds.

    Returns:
        None. Returns silently once the tokenizer can be loaded.

    Raises:
        RuntimeError: If the tokenizer still cannot be loaded after the
            download attempts. The message includes the manual install
            command.
    """
    def _tokenizer_loads():
        # Tokenizing a tiny probe makes NLTK load whichever Punkt resource
        # the installed version needs, so the check is version-independent.
        try:
            nltk.tokenize.sent_tokenize("Probe one. Probe two.")
        except LookupError:
            return False
        return True

    if _tokenizer_loads():
        return

    last_error = None
    for resource in ("punkt_tab", "punkt"):
        try:
            # The downloader may signal failure through its return value
            # instead of raising, so success is judged by re-probing below.
            nltk.download(resource, quiet=True)
        except Exception as e:
            last_error = e
            continue
        if _tokenizer_loads():
            return

    if last_error is not None:
        detail = str(last_error)
    else:
        detail = "tokenizer data still unavailable after download"
    raise RuntimeError(
        "Failed to download required NLTK data. "
        "Please run 'python -m nltk.downloader punkt punkt_tab' "
        f"to install manually. Error: {detail}"
    ) from last_error

# Run the check at cell execution time so the Punkt tokenizer data is in place
# before any later cell calls sent_tokenize; raises RuntimeError on failure.
ensure_nltk_data()

In [None]:
# Smoke test: confirm NLTK can split a short English string into sentences.
import nltk
from nltk.tokenize import sent_tokenize

# Four short sentences, each ending with a different terminator.
text = "This is sentence one. This is sentence two! Is this sentence three? Yes it is."

# Default (English) Punkt model.
sentences = sent_tokenize(text)

# Show each detected sentence on its own numbered line, counting from 1.
print("Tokenized sentences:")
for position, sentence in enumerate(sentences, start=1):
    print(f"{position}. {sentence}")

In [None]:


# Compare spaCy and NLTK sentence segmentation on a short Vietnamese passage.

# Sample Vietnamese text
text = """
Thầy thường dạy chúng ta phải biết lắng nghe. Lắng nghe là một nghệ thuật. 
Khi lắng nghe với tâm từ bi, ta có thể hiểu được nỗi khổ của người khác. 
Ta phải tập ngồi yên và thở. Hơi thở là cây cầu nối liền thân và tâm.
"""

# Test Spacy
# The model is loaded outside the timed region so only segmentation is measured.
nlp_vi = spacy.load("xx_sent_ud_sm")  # Universal dependencies model
start = time.perf_counter()  # perf_counter is the clock meant for short intervals
doc = nlp_vi(text)
spacy_sents = list(doc.sents)
spacy_time = time.perf_counter() - start

# Test NLTK
# Punkt does not ship a Vietnamese model, so language='vietnamese' raises
# LookupError. Resolve a usable language first, falling back to the English
# model. The zero-length calls also load the model before the timer starts,
# mirroring how spacy.load is kept outside the spaCy timing.
nltk_language = "vietnamese"
try:
    sent_tokenize("", language=nltk_language)
except LookupError:
    nltk_language = "english"
    print("NLTK has no Punkt model for 'vietnamese'; falling back to 'english'.")
    sent_tokenize("", language=nltk_language)

start = time.perf_counter()
nltk_sents = sent_tokenize(text, language=nltk_language)
nltk_time = time.perf_counter() - start

print("Spacy sentences:", len(spacy_sents))
for sent in spacy_sents:
    # spaCy spans keep surrounding newlines from the source text; strip for display only.
    print(f"- {sent.text.strip()}")

print("\nNLTK sentences:", len(nltk_sents))
for sent in nltk_sents:
    print(f"- {sent}")

print(f"\nTiming - Spacy: {spacy_time:.3f}s, NLTK: {nltk_time:.3f}s")