In [9]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer, TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [10]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
text = "The Mahakumbh, held every 12 years, is a grand spiritual gathering in India attracting millions of devotees and sadhus. #Mahakumbh #spirituality #tradition"

whitespace_tokenized = text.split()
print("Whitespace Tokenization:", whitespace_tokenized)

Whitespace Tokenization: ['The', 'Mahakumbh,', 'held', 'every', '12', 'years,', 'is', 'a', 'grand', 'spiritual', 'gathering', 'in', 'India', 'attracting', 'millions', 'of', 'devotees', 'and', 'sadhus.', '#Mahakumbh', '#spirituality', '#tradition']


In [12]:
wordpunct_tokenizer = WordPunctTokenizer()
wordpunct_tokenized = wordpunct_tokenizer.tokenize(text)
print("Punctuation-based Tokenization:", wordpunct_tokenized)

Punctuation-based Tokenization: ['The', 'Mahakumbh', ',', 'held', 'every', '12', 'years', ',', 'is', 'a', 'grand', 'spiritual', 'gathering', 'in', 'India', 'attracting', 'millions', 'of', 'devotees', 'and', 'sadhus', '.', '#', 'Mahakumbh', '#', 'spirituality', '#', 'tradition']


In [13]:
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokenized = treebank_tokenizer.tokenize(text)
print("Treebank Tokenization:", treebank_tokenized)

Treebank Tokenization: ['The', 'Mahakumbh', ',', 'held', 'every', '12', 'years', ',', 'is', 'a', 'grand', 'spiritual', 'gathering', 'in', 'India', 'attracting', 'millions', 'of', 'devotees', 'and', 'sadhus.', '#', 'Mahakumbh', '#', 'spirituality', '#', 'tradition']


In [14]:
tweet_tokenizer = TweetTokenizer()
tweet_tokenized = tweet_tokenizer.tokenize(text)
print("Tweet Tokenization:", tweet_tokenized)

Tweet Tokenization: ['The', 'Mahakumbh', ',', 'held', 'every', '12', 'years', ',', 'is', 'a', 'grand', 'spiritual', 'gathering', 'in', 'India', 'attracting', 'millions', 'of', 'devotees', 'and', 'sadhus', '.', '#Mahakumbh', '#spirituality', '#tradition']


In [15]:
mwe_tokenized = text.split()
print("MWE Tokenization:", mwe_tokenized)

MWE Tokenization: ['The', 'Mahakumbh,', 'held', 'every', '12', 'years,', 'is', 'a', 'grand', 'spiritual', 'gathering', 'in', 'India', 'attracting', 'millions', 'of', 'devotees', 'and', 'sadhus.', '#Mahakumbh', '#spirituality', '#tradition']


In [16]:
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer('english')

In [17]:
porter_stemmed = [porter_stemmer.stem(word) for word in wordpunct_tokenized]
print("Porter Stemming:", porter_stemmed)

Porter Stemming: ['the', 'mahakumbh', ',', 'held', 'everi', '12', 'year', ',', 'is', 'a', 'grand', 'spiritu', 'gather', 'in', 'india', 'attract', 'million', 'of', 'devote', 'and', 'sadhu', '.', '#', 'mahakumbh', '#', 'spiritu', '#', 'tradit']


In [18]:
snowball_stemmed = [snowball_stemmer.stem(word) for word in wordpunct_tokenized]
print("Snowball Stemming:", snowball_stemmed)

Snowball Stemming: ['the', 'mahakumbh', ',', 'held', 'everi', '12', 'year', ',', 'is', 'a', 'grand', 'spiritu', 'gather', 'in', 'india', 'attract', 'million', 'of', 'devote', 'and', 'sadhus', '.', '#', 'mahakumbh', '#', 'spiritu', '#', 'tradit']


In [19]:
lemmatizer = WordNetLemmatizer()

lemmatized = [lemmatizer.lemmatize(word) for word in wordpunct_tokenized]
print("Lemmatization:", lemmatized)

Lemmatization: ['The', 'Mahakumbh', ',', 'held', 'every', '12', 'year', ',', 'is', 'a', 'grand', 'spiritual', 'gathering', 'in', 'India', 'attracting', 'million', 'of', 'devotee', 'and', 'sadhu', '.', '#', 'Mahakumbh', '#', 'spirituality', '#', 'tradition']
