In [1]:
import nltk
from nltk.tokenize import (
    WhitespaceTokenizer,
    WordPunctTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

In [2]:
# Download the NLTK data packages this notebook relies on.
# - 'wordnet' / 'omw-1.4': presumably the corpora behind WordNetLemmatizer — TODO confirm.
# - 'punkt': not referenced by any tokenizer visible in this part of the notebook;
#   NOTE(review): confirm a later cell needs it before keeping the download.
# The calls are bare expressions, so the cell displays the value of the last one
# (the recorded output shows `True`).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [5]:
def perform_nlp_tasks(sample_text):
    """Print a three-part NLTK walkthrough: tokenization, stemming, lemmatization.

    Args:
        sample_text: String passed to every tokenizer in section 1. The
            stemming and lemmatization sections run on fixed word lists and
            do not read it.

    Returns:
        None. Every result is written to stdout.
    """
    _demo_tokenizers(sample_text)
    _demo_stemmers()
    _demo_lemmatizer()


def _demo_tokenizers(sample_text):
    """Print how each of five NLTK tokenizers splits ``sample_text``."""
    print("--- 1. TOKENIZATION ---")

    # These four tokenizers all take the raw string, so they share one loop.
    raw_text_tokenizers = (
        ("Whitespace", WhitespaceTokenizer()),
        ("Punctuation-based", WordPunctTokenizer()),
        ("Treebank", TreebankWordTokenizer()),
        ("Tweet", TweetTokenizer()),
    )
    for label, tokenizer in raw_text_tokenizers:
        print(f"{label}: {tokenizer.tokenize(sample_text)}")

    # The MWE tokenizer is fed an already-split token list rather than a
    # string. Because str.split() leaves punctuation attached to the word,
    # the registered expression has to carry the trailing period
    # ('programs.') in order to match.
    mwe_tokenizer = MWETokenizer()
    mwe_tokenizer.add_mwe(('Python', 'programs.'))
    mwe_tokens = mwe_tokenizer.tokenize(sample_text.split())
    print(f"MWE (Python programs.): {mwe_tokens}")


def _demo_stemmers():
    """Print a side-by-side table of Porter and Snowball stems for fixed words."""
    print("\n--- 2. STEMMING ---")
    words = ["running", "flies", "happily", "denied", "better"]

    porter = PorterStemmer()
    snowball = SnowballStemmer(language='english')

    print(f"{'Word':<10} | {'Porter':<12} | {'Snowball'}")
    print("-" * 35)
    for word in words:
        print(f"{word:<10} | {porter.stem(word):<12} | {snowball.stem(word)}")


def _demo_lemmatizer():
    """Print the WordNet lemma of a few fixed words, each with its own POS tag."""
    print("\n--- 3. LEMMATIZATION ---")
    lemmatizer = WordNetLemmatizer()

    # Each word is paired with the POS tag it is lemmatized under
    # ('n' = noun, 'a' = adjective; 'v' would be verb). 'better' is the only
    # word treated as an adjective.
    tagged_words = [
        ("feet", "n"),
        ("corpora", "n"),
        ("rocks", "n"),
        ("better", "a"),
    ]
    for word, pos in tagged_words:
        print(f"{word} -> {lemmatizer.lemmatize(word, pos=pos)}")

In [6]:
# Driver cell: run the full walkthrough on one sentence that mixes ordinary
# prose, a contraction, a hashtag and an @-mention.
if __name__ == "__main__":
    sample_text = (
        "NLTK is a leading platform for building Python programs. "
        "It's great for #NLP! @NLTK_org"
    )
    perform_nlp_tasks(sample_text)

--- 1. TOKENIZATION ---
Whitespace: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs.', "It's", 'great', 'for', '#NLP!', '@NLTK_org']
Punctuation-based: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', '.', 'It', "'", 's', 'great', 'for', '#', 'NLP', '!', '@', 'NLTK_org']
Treebank: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs.', 'It', "'s", 'great', 'for', '#', 'NLP', '!', '@', 'NLTK_org']
Tweet: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', '.', "It's", 'great', 'for', '#NLP', '!', '@NLTK_org']
MWE (Python programs.): ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python_programs.', "It's", 'great', 'for', '#NLP!', '@NLTK_org']

--- 2. STEMMING ---
Word       | Porter       | Snowball
-----------------------------------
running    | run          | run
flies      | fli          | fli
happily    | happili      | happili
denied    