In [1]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import nltk
# Fetch the NLTK data packages for this notebook, one after another in a
# fixed order.
for resource_name in (
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource_name)

from nltk.tokenize import WhitespaceTokenizer, TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Sample sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenization
# Whitespace Tokenizer: splits on whitespace only, so punctuation stays
# attached to the neighbouring word (e.g. 'dog.').
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(sentence)

# Punctuation-based Tokenizer: NLTK's WordPunct regex tokenizer, which splits
# text into runs of word characters and runs of punctuation.
# nltk.word_tokenize is NOT used here because it is NLTK's Punkt + Treebank
# pipeline and would simply duplicate the Treebank result below.
punctuation_tokens = nltk.wordpunct_tokenize(sentence)

# Treebank Tokenizer
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(sentence)

# Tweet Tokenizer
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(sentence)

# Stemming
# Both stemmers are applied to the Treebank tokens, so the two result lists
# line up position-by-position with `treebank_tokens`.

# Porter Stemmer
porter_stemmer = PorterStemmer()
porter_stems = list(map(porter_stemmer.stem, treebank_tokens))

# Snowball Stemmer
snowball_stemmer = SnowballStemmer("english")
snowball_stems = list(map(snowball_stemmer.stem, treebank_tokens))

# Lemmatization
lemmatizer = WordNetLemmatizer()


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant for lemmatize().

    Only the first letter of the tag is inspected: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb. Any other tag (including an empty one)
    falls back to noun.
    """
    return {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }.get(treebank_tag[:1], wordnet.NOUN)


# Tag the tokens first so each one is lemmatized under its own part of speech
# rather than forcing every token through the verb rules.
# nltk.pos_tag needs a tagger model; this relies on the
# 'averaged_perceptron_tagger' download at the top of the cell.
# NOTE(review): newer NLTK releases may look up a differently named tagger
# resource -- confirm against the installed version.
lemmas = [
    lemmatizer.lemmatize(token, pos=_wordnet_pos(tag))
    for token, tag in nltk.pos_tag(treebank_tokens)
]

# Print the results
# One "<label> <list>" line per technique, in the order they were computed.
results_by_label = (
    ("Whitespace Tokenizer:", whitespace_tokens),
    ("Punctuation-based Tokenizer:", punctuation_tokens),
    ("Treebank Tokenizer:", treebank_tokens),
    ("Tweet Tokenizer:", tweet_tokens),
    ("Porter Stemmer:", porter_stems),
    ("Snowball Stemmer:", snowball_stems),
    ("Lemmas:", lemmas),
)
for label, values in results_by_label:
    print(label, values)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Whitespace Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']
Punctuation-based Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
Treebank Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
Tweet Tokenizer: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
Porter Stemmer: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']
Snowball Stemmer: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']
Lemmas: ['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.']
