In [7]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# data ('punkt', 'punkt_tab') and the WordNet corpora ('wordnet', 'omw-1.4').
nltk_resources = ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab')
download_results = [nltk.download(resource) for resource in nltk_resources]

# Leave the last download's return value as the cell's displayed result.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## 1. Tokenization
Tokenization is the process of breaking down a text into smaller units called tokens. Let's explore different tokenizers.

In [8]:
from nltk.tokenize import word_tokenize, WhitespaceTokenizer, PunktTokenizer, TreebankWordTokenizer, TweetTokenizer, MWETokenizer

def _demo_tokenizer(title, tokenize_fn, sample):
    """Print a banner, tokenize `sample` with `tokenize_fn`, print and return the tokens."""
    print(f"--- {title} ---")
    tokens = tokenize_fn(sample)
    print(f"Tokens: {tokens}\n")
    return tokens


text = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The user doesn't like MWEs!"
print(f"Original Text: {text}\n")

# Whitespace Tokenizer: splits on whitespace only, so punctuation stays
# attached to the neighbouring word.
wt = WhitespaceTokenizer()
ws_tokens = _demo_tokenizer("Whitespace Tokenizer", wt.tokenize, text)

# "Punctuation" section: this uses word_tokenize, NLTK's general-purpose word
# tokenizer. Its output differs from the plain Treebank tokenizer below in that
# the period ending a sentence in the middle of the text is split off as its
# own token.
# NOTE(review): NLTK's regex-based punctuation splitter is WordPunctTokenizer;
# confirm whether that was the tokenizer intended for this section.
punct_tokens = _demo_tokenizer("Punctuation Tokenizer", word_tokenize, text)

# Treebank Tokenizer
tt = TreebankWordTokenizer()
treebank_tokens = _demo_tokenizer("Treebank Tokenizer", tt.tokenize, text)

# Tweet Tokenizer: demonstrated on a tweet-like sample containing a hashtag,
# a mention, an emoticon and a URL.
tw = TweetTokenizer()
tweet_tokens = _demo_tokenizer(
    "Tweet Tokenizer",
    tw.tokenize,
    "This is a #tweet with a @mention and a smiley :) - check it out at http://example.com/ #NLP",
)

# MWE Tokenizer (Multi-Word Expression Tokenizer): takes an already-tokenized
# list (here a plain whitespace split) and is configured with the multi-word
# expressions listed below.
mwe_tokenizer = MWETokenizer([('natural', 'language'), ('New', 'York')])
mwe_text = "I am learning about natural language processing in New York."
mwe_tokens = _demo_tokenizer("MWE Tokenizer", mwe_tokenizer.tokenize, mwe_text.split())

Original Text: Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The user doesn't like MWEs!

--- Whitespace Tokenizer ---
Tokens: ['Hello', 'Mr.', 'Smith,', 'how', 'are', 'you', 'doing', 'today?', 'The', 'weather', 'is', 'great,', 'and', 'Python', 'is', 'awesome.', 'The', 'user', "doesn't", 'like', 'MWEs!']

--- Punctuation Tokenizer ---
Tokens: ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'user', 'does', "n't", 'like', 'MWEs', '!']

--- Treebank Tokenizer ---
Tokens: ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome.', 'The', 'user', 'does', "n't", 'like', 'MWEs', '!']

--- Tweet Tokenizer ---
Tokens: ['This', 'is', 'a', '#tweet', 'with', 'a', '@mention', 'and', 'a', 'smiley', ':)', '-', 'check', 'it', 'out', 'at', 'http://example.com/', '#NL

## 2. Stemming
Stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. The stem is not necessarily a linguistic root of the word, or even a valid word (e.g. "easily" becomes "easili"); it is simply what remains after common affixes, such as suffixes, have been stripped.

Let's apply Porter and Snowball Stemmers.

In [3]:
from nltk.stem import PorterStemmer, SnowballStemmer

def _print_stems(title, stemmer, words):
    """Print a banner, one `word -> stem` line per word, then a blank gap."""
    print(f"--- {title} ---")
    for token in words:
        print(f"{token} -> {stemmer.stem(token)}")
    print("\n")


words_to_stem = ["running", "runner", "runs", "easily", "fairly", "universal", "universally"]

# Porter Stemmer
ps = PorterStemmer()
_print_stems("Porter Stemmer", ps, words_to_stem)

# Snowball Stemmer (English), run on the same word list for comparison
sbs = SnowballStemmer("english")
_print_stems("Snowball Stemmer (English)", sbs, words_to_stem)

# Snowball Stemmer for another language (French), with its own word list
sbs_fr = SnowballStemmer("french")
french_words = ["manger", "mangeant", "mangé"]
_print_stems("Snowball Stemmer (French) Example", sbs_fr, french_words)

--- Porter Stemmer ---
running -> run
runner -> runner
runs -> run
easily -> easili
fairly -> fairli
universal -> univers
universally -> univers


--- Snowball Stemmer (English) ---
running -> run
runner -> runner
runs -> run
easily -> easili
fairly -> fair
universal -> univers
universally -> univers


--- Snowball Stemmer (French) Example ---
manger -> mang
mangeant -> mang
mangé -> mang




## 3. Lemmatization
Lemmatization is the process of grouping together the inflected forms of a word so they can be analyzed as a single item, identified by the word's lemma, or dictionary form. Unlike stemming, which strips affixes by rule, lemmatization looks the word up in a vocabulary and takes its part of speech into account, so the result is a meaningful base form that is itself a valid word.

Let's use the WordNet Lemmatizer.

In [4]:
from nltk.stem import WordNetLemmatizer

words_to_lemmatize = ["running", "runner", "runs", "ran", "better", "good", "geese", "cacti"]

# (word, POS tag) pairs for the second demo: 'a' = adjective, 'v' = verb.
pos_examples = [("better", "a"), ("running", "v"), ("runs", "v")]

# WordNet Lemmatizer, first called without a POS argument
print("--- WordNet Lemmatizer ---")
wnl = WordNetLemmatizer()
print("\n".join(f"{token} -> {wnl.lemmatize(token)}" for token in words_to_lemmatize))

# Same lemmatizer, now given an explicit POS tag for each word
print("\nNote: Lemmatizer performs better with part-of-speech (POS) tags. Let's see an example with POS tags.")
for token, pos_tag in pos_examples:
    print(f"{token:<10} -> {wnl.lemmatize(token, pos=pos_tag)}")

--- WordNet Lemmatizer ---
running -> running
runner -> runner
runs -> run
ran -> ran
better -> better
good -> good
geese -> goose
cacti -> cactus

Note: Lemmatizer performs better with part-of-speech (POS) tags. Let's see an example with POS tags.
better     -> good
running    -> run
runs       -> run
