In [1]:
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer, WhitespaceTokenizer, RegexpTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Download necessary resources.
# 'punkt' backs word_tokenize on older NLTK releases, while recent releases
# look up 'punkt_tab' instead; fetching both keeps the script runnable on
# either. 'wordnet' and 'omw-1.4' are fetched for the WordNet lemmatizer.
for resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(resource)

# Sample sentence run through every tokenizer below.
text = "Hello there! I'm testing various tokenization methods: whitespace, punctuation-based, treebank, tweet & MWE."

# Multi-word-expression tokenizer configured with the two word pairs to merge.
mwe_tokenizer = MWETokenizer([('testing', 'various'), ('tokenization', 'methods')])

# Display label -> tokenizer instance, in the order results are printed.
tokenizers = {
    "Whitespace": WhitespaceTokenizer(),
    "Punctuation-based": RegexpTokenizer(r'\w+'),
    "Treebank": TreebankWordTokenizer(),
    "Tweet": TweetTokenizer(),
    "MWE": mwe_tokenizer,
}

print("\n--- Tokenization ---")
tokens = {}
for label, tok in tokenizers.items():
    # The MWE tokenizer is handed the word_tokenize output rather than the
    # raw string; every other tokenizer receives the raw text directly.
    payload = word_tokenize(text) if label == "MWE" else text
    tokens[label] = tok.tokenize(payload)
    print(f"{label} Tokenization: {tokens[label]}")

# The punctuation-based tokens are the shared input for stemming and
# lemmatization.
base_tokens = tokens["Punctuation-based"]

print("\n--- Stemming ---")
# Display label -> stemmer instance; each is applied to every base token.
stemmers = {
    "Porter": PorterStemmer(),
    "Snowball": SnowballStemmer("english"),
}
for stemmer_name, stemmer in stemmers.items():
    stems = list(map(stemmer.stem, base_tokens))
    print(f"{stemmer_name} Stemming: {stems}")

print("\n--- Lemmatization ---")
# Lemmatize each base token with the lemmatizer's default settings
# (no part-of-speech argument is supplied).
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, base_tokens))
print(f"Lemmatization: {lemmatized}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ameyp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ameyp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ameyp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



--- Tokenization ---
Whitespace Tokenization: ['Hello', 'there!', "I'm", 'testing', 'various', 'tokenization', 'methods:', 'whitespace,', 'punctuation-based,', 'treebank,', 'tweet', '&', 'MWE.']
Punctuation-based Tokenization: ['Hello', 'there', 'I', 'm', 'testing', 'various', 'tokenization', 'methods', 'whitespace', 'punctuation', 'based', 'treebank', 'tweet', 'MWE']
Treebank Tokenization: ['Hello', 'there', '!', 'I', "'m", 'testing', 'various', 'tokenization', 'methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']
Tweet Tokenization: ['Hello', 'there', '!', "I'm", 'testing', 'various', 'tokenization', 'methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']
MWE Tokenization: ['Hello', 'there', '!', 'I', "'m", 'testing_various', 'tokenization_methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']

--- Stemming ---
Porter Stemming: ['hello', '