In [1]:
import nltk
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer, TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Fetch the WordNet corpus into the local nltk_data directory. The call is the
# cell's last expression, so its boolean result is what the cell displays.
# Presumably needed by the WordNetLemmatizer cells later in the notebook.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yash1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Fetch the 'omw-1.4' package into the local nltk_data directory; the boolean
# result is displayed as the cell output.
# NOTE(review): presumably fetched because some NLTK releases need it alongside
# 'wordnet' for lemmatization — confirm against the installed NLTK version.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yash1\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
from nltk.corpus import wordnet

In [5]:
# Two-sentence sample passage that every tokenizer demo in this notebook reads.
text = (
    "Tokenization is a key task in NLP. "
    "It breaks text into tokens, which can be words, phrases, or symbols."
)

In [6]:
# --- Tokenization ---
# Whitespace tokenizer: cuts the text at whitespace only, so punctuation stays
# attached to the neighbouring word (see 'NLP.' and 'tokens,' in the output).
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(text)
print(
    "Whitespace Tokenizer:",
    whitespace_tokens,
)

Whitespace Tokenizer: ['Tokenization', 'is', 'a', 'key', 'task', 'in', 'NLP.', 'It', 'breaks', 'text', 'into', 'tokens,', 'which', 'can', 'be', 'words,', 'phrases,', 'or', 'symbols.']


In [7]:
# Word/punctuation tokenizer: every punctuation mark in the sample comes out
# as its own token, separate from the word it followed.
punct_tokenizer = WordPunctTokenizer()
punct_tokens = punct_tokenizer.tokenize(text)
print(
    "Punctuation-based Tokenizer:",
    punct_tokens,
)

Punctuation-based Tokenizer: ['Tokenization', 'is', 'a', 'key', 'task', 'in', 'NLP', '.', 'It', 'breaks', 'text', 'into', 'tokens', ',', 'which', 'can', 'be', 'words', ',', 'phrases', ',', 'or', 'symbols', '.']


In [8]:
# Penn Treebank-style tokenizer. Its result feeds the stemming and
# lemmatization cells further down.
# NOTE(review): the whole two-sentence string is passed in one go, and the
# mid-text 'NLP.' keeps its period in the output while the final '.' is split
# off. The tokenizer is documented as expecting one sentence at a time —
# consider sentence-splitting first if 'NLP' should be a clean token.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
print(
    "Treebank Tokenizer:",
    treebank_tokens,
)

Treebank Tokenizer: ['Tokenization', 'is', 'a', 'key', 'task', 'in', 'NLP.', 'It', 'breaks', 'text', 'into', 'tokens', ',', 'which', 'can', 'be', 'words', ',', 'phrases', ',', 'or', 'symbols', '.']


In [9]:
# Tweet tokenizer run on the same sample; for this plain text the printed
# tokens match the punctuation-based tokenizer's output.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print(
    "Tweet Tokenizer:",
    tweet_tokens,
)

Tweet Tokenizer: ['Tokenization', 'is', 'a', 'key', 'task', 'in', 'NLP', '.', 'It', 'breaks', 'text', 'into', 'tokens', ',', 'which', 'can', 'be', 'words', ',', 'phrases', ',', 'or', 'symbols', '.']


In [10]:
# Multi-word-expression tokenizer: re-joins registered word sequences in an
# already-split token list. Here ("key", "task") is registered up front, and
# the output shows the pair merged into the single token 'key_task'.
mwe_tokenizer = MWETokenizer([("key", "task")])
# The tokenizer works on a token list, so the text is split on whitespace first.
mwe_tokens = mwe_tokenizer.tokenize(text.split())
print(
    "MWE Tokenizer:",
    mwe_tokens,
)

MWE Tokenizer: ['Tokenization', 'is', 'a', 'key_task', 'in', 'NLP.', 'It', 'breaks', 'text', 'into', 'tokens,', 'which', 'can', 'be', 'words,', 'phrases,', 'or', 'symbols.']


In [11]:
# --- Stemming ---
# Build both stemmers once; the following cells apply them to the Treebank tokens.
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer(language="english")

In [12]:
# Stem every Treebank token (punctuation tokens included) with the Porter stemmer.
porter_stems = list(map(porter_stemmer.stem, treebank_tokens))
print(
    "Porter Stemmer:",
    porter_stems,
)

Porter Stemmer: ['token', 'is', 'a', 'key', 'task', 'in', 'nlp.', 'it', 'break', 'text', 'into', 'token', ',', 'which', 'can', 'be', 'word', ',', 'phrase', ',', 'or', 'symbol', '.']


In [13]:
# Same token list through the Snowball (English) stemmer, for comparison with
# the Porter result.
snowball_stems = list(map(snowball_stemmer.stem, treebank_tokens))
print(
    "Snowball Stemmer:",
    snowball_stems,
)

Snowball Stemmer: ['token', 'is', 'a', 'key', 'task', 'in', 'nlp.', 'it', 'break', 'text', 'into', 'token', ',', 'which', 'can', 'be', 'word', ',', 'phrase', ',', 'or', 'symbol', '.']


In [14]:
# --- Lemmatization ---
# WordNet-based lemmatizer (imported from nltk.stem.wordnet); presumably relies
# on the 'wordnet' data downloaded at the top of the notebook.
lemmatizer = WordNetLemmatizer()

In [15]:
# Lemmatize each Treebank token. lemmatize() is called with the token only, so
# the lemmatizer's default part of speech (noun, per the NLTK docs) is used for
# every token; that is why a verb form such as 'is' comes back unchanged.
lemmatized_tokens = list(map(lemmatizer.lemmatize, treebank_tokens))

In [16]:
# Show the lemmatized tokens; compare with the stemmer outputs above, where
# tokens were also lower-cased and 'Tokenization' was cut down to 'token'.
print(
    "Lemmatization:",
    lemmatized_tokens,
)

Lemmatization: ['Tokenization', 'is', 'a', 'key', 'task', 'in', 'NLP.', 'It', 'break', 'text', 'into', 'token', ',', 'which', 'can', 'be', 'word', ',', 'phrase', ',', 'or', 'symbol', '.']
