In [1]:
# Non-English tokenization using NLTK and spaCy
# Fetch a Hindi page from Wikipedia
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Hindi Wikipedia article on Steve Jobs and parse its HTML.
url = "https://hi.wikipedia.org/wiki/स्टीव_जॉब्स"
# requests has no default timeout, so without one a stalled connection
# would block this cell indefinitely.
res = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page as the article.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

In [3]:
# Join the text of every <p> element into one space-separated string
text = ' '.join(paragraph.get_text() for paragraph in soup.select('p'))

# Show a heading followed by the first 500 characters as a sanity check
print("Preview of extracted text : ", text[:500], sep="\n")

Preview of extracted text : 
स्टीवन पॉल "स्टीव" जॉब्स (अंग्रेज़ी: Steven Paul "Steve" Jobs) (जन्म: २४ फरवरी, १९५५ - अक्टूबर ५, २०११) एक अमेरिकी बिजनेस टाईकून और आविष्कारक थे। वे एप्पल इंक के सह-संस्थापक और मुख्य कार्यकारी अधिकारी थे। अगस्त २०११ में उन्होने इस पद से त्यागपत्र दे दिया। जॉब्स पिक्सर एनीमेशन स्टूडियोज के मुख्य कार्यकारी अधिकारी भी रहे। सन् २००६ में वह दि वाल्ट डिज्नी कम्पनी के निदेशक मंडल के सदस्य भी रहे, जिसके बाद डिज्नी ने पिक्सर का अधिग्रहण कर लिया था। १९९५ में आई फिल्म टॉय स्टोरी के  वह  कार्यकारी निर्माता 


In [4]:
# -----------------------
# NLTK tokenizer setup
# -----------------------
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the Punkt tokenizer data used by the tokenizers imported above.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead -
# confirm which resource the installed version needs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dai.STUDENTSDC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Split the article into sentences with NLTK.
# 'english' is the default language of sent_tokenize; it is spelled out here
# so it is visible that no Hindi-specific model is being used.
nltk_sentence = sent_tokenize(text, language='english')
print(f"\n NLTK Sentence Count: {len(nltk_sentence)}")


 NLTK Sentence Count: 11


In [7]:
# Split the article into word-level tokens with NLTK
# ('english' is word_tokenize's default language, written out explicitly).
nltk_words = word_tokenize(text, language='english')
print(f'NLTK Word Count  : {len(nltk_words)}')

NLTK Word Count  : 1834


In [9]:
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl (11.1 MB)
     ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
     --------------------------------------  11.0/11.1 MB 69.9 MB/s eta 0:00:01
     --------------------------------------- 11.1/11.1 MB 37.1 MB/s eta 0:00:00
Installing collected packages: xx-ent-wiki-sm
Successfully installed xx-ent-wiki-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')


In [None]:
# ------------------------
# spaCy pipeline setup
# ------------------------
import spacy

# xx_ent_wiki_sm doesn't include a parser, so sentence boundaries
# must come from the sentencizer component added to the pipeline here.
nlp = spacy.load('xx_ent_wiki_sm')
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x1ec60ff4910>

In [11]:
# Run the spaCy pipeline once over the whole article
doc = nlp(text)

# Collect the text of each sentence span exposed by the processed document
spacy_sentences = [span.text for span in doc.sents]
print(f"\n Spacy Sentence Count : {len(spacy_sentences)}")


 Spacy Sentence Count : 88


In [13]:
# Collect the text of every token in the processed document
spacy_words = [tok.text for tok in doc]
print(f"\n Spacy Word Count : {len(spacy_words)}")


 Spacy Word Count : 1952


In [None]:
# NLTK's sent_tokenize uses a Punkt model trained primarily on English.
# It does not handle Hindi sentence boundaries (such as the danda "।") very well, which is why it returned only 11 sentences.

# spaCy (with xx_ent_wiki_sm + sentencizer) handles Unicode-aware punctuation better, including Hindi sentence boundaries (the danda "।"), leading to more accurate sentence splits.