In [None]:
!pip install nltk
#Natural Language ToolKit



In [None]:
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer, MWETokenizer
from nltk.tokenize import TreebankWordTokenizer
import string

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Sample sentence for the tokenizer comparison: it contains a contraction,
# sentence punctuation, a hashtag and an @-mention.
text = "Don't underestimate natural language processing! NLP is used in tweets like #AI and mentions @OpenAI."
print("Original Text:")
print(text)


Original Text:
Don't underestimate natural language processing! NLP is used in tweets like #AI and mentions @OpenAI.


**1. Whitespace Tokenizer**: Splits text purely on whitespace (spaces, tabs, newlines); punctuation stays attached to the adjacent word

In [None]:
def whitespace_tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    Punctuation is left attached to the neighbouring word; leading and
    trailing whitespace produces no empty tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-separated substrings of ``text``.
    """
    tokens = text.split()
    return tokens

# Tokenize the sample sentence. Only whitespace separates tokens, so in the
# recorded output punctuation stays attached (e.g. 'processing!', '@OpenAI.').
whitespace_tokens = whitespace_tokenizer(text)
print("\nWhitespace Tokenization:")
print(whitespace_tokens)



Whitespace Tokenization:
["Don't", 'underestimate', 'natural', 'language', 'processing!', 'NLP', 'is', 'used', 'in', 'tweets', 'like', '#AI', 'and', 'mentions', '@OpenAI.']


**2. Punctuation Tokenizer**: Splits text at punctuation marks

In [None]:
def punctuation_tokenizer(text):
    """Split ``text`` at every punctuation character.

    Each character found in ``string.punctuation`` becomes its own token.
    The characters between two punctuation marks are kept together as one
    token, including any spaces, because whitespace is not a delimiter here.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in their original order.
    """
    pieces = []
    buffer = []  # characters seen since the last punctuation mark
    for ch in text:
        if ch not in string.punctuation:
            buffer.append(ch)
            continue
        # Punctuation ends the current run: emit the run (if any), then the mark.
        if buffer:
            pieces.append("".join(buffer))
            buffer = []
        pieces.append(ch)
    # Emit whatever follows the final punctuation mark.
    if buffer:
        pieces.append("".join(buffer))
    return pieces

# Tokenize the sample sentence. Spaces are not delimiters for this tokenizer,
# so the recorded output contains multi-word tokens such as ' NLP is used in tweets like '.
punctuation_tokens = punctuation_tokenizer(text)
print("\nPunctuation Tokenization:")
print(punctuation_tokens)



Punctuation Tokenization:
['Don', "'", 't underestimate natural language processing', '!', ' NLP is used in tweets like ', '#', 'AI and mentions ', '@', 'OpenAI', '.']


**3. Treebank Tokenizer**: Follows Penn Treebank conventions: splits contractions (e.g. "Don't" → "Do", "n't") and separates punctuation from words

In [None]:
# Tokenize the sample sentence with NLTK's TreebankWordTokenizer.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)

# In the recorded output "Don't" becomes 'Do' + "n't", and '#' / '@' are
# split off from the words that follow them.
print("\nTreebank Tokenization:")
print(treebank_tokens)



Treebank Tokenization:
['Do', "n't", 'underestimate', 'natural', 'language', 'processing', '!', 'NLP', 'is', 'used', 'in', 'tweets', 'like', '#', 'AI', 'and', 'mentions', '@', 'OpenAI', '.']


**4. Tweet Tokenizer**: Optimized for social media text (hashtags, mentions, emojis)

In [None]:
# Tokenize the sample sentence with NLTK's TweetTokenizer (default options).
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)

# In the recorded output "Don't", '#AI' and '@OpenAI' each stay a single token,
# while '!' and the final '.' are separated.
print("\nTweet Tokenization:")
print(tweet_tokens)



Tweet Tokenization:
["Don't", 'underestimate', 'natural', 'language', 'processing', '!', 'NLP', 'is', 'used', 'in', 'tweets', 'like', '#AI', 'and', 'mentions', '@OpenAI', '.']


**5. MWE (Multi-Word Expression) Tokenizer**: Identifies and tokenizes multi-word expressions as a single unit

In [None]:
# Fetch the 'punkt_tab' resource before word_tokenize is called below
# (presumably required by word_tokenize in the installed NLTK version — verify).
nltk.download('punkt_tab')
# Register the two-word expressions to merge; merged parts are joined with '_'.
mwe_tokenizer = MWETokenizer([('natural', 'language'), ('language', 'processing')], separator='_')

# MWETokenizer.tokenize receives the token list produced by word_tokenize.
# The two registered expressions overlap on 'language': in the recorded output
# only 'natural_language' is merged and 'processing' remains a separate token.
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))
print("\nMWE Tokenization:")
print(mwe_tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



MWE Tokenization:
['Do', "n't", 'underestimate', 'natural_language', 'processing', '!', 'NLP', 'is', 'used', 'in', 'tweets', 'like', '#', 'AI', 'and', 'mentions', '@', 'OpenAI', '.']


## 2. What is Stemming?

Stemming is a text normalization technique that removes suffixes (and occasionally prefixes) from words to obtain their stem. The resulting stem may not be a valid dictionary word, but it represents a common linguistic root shared across word variants.

In [12]:
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# NOTE: this rebinds `text`, which held the tokenizer sample sentence earlier
# in the notebook. The new sentence contains several inflections of "run".
text = "Stemming algorithms are useful for reducing words like running, runs, and runner into a common root."
print("Original Text:")
print(text)


Original Text:
Stemming algorithms are useful for reducing words like running, runs, and runner into a common root.


In [16]:
# Ensure the 'punkt_tab' resource is available before calling word_tokenize.
nltk.download('punkt_tab')
# `tokens` is shared notebook state: the Porter, Snowball and WordNet
# lemmatization cells below all read it, so this cell must run before them.
tokens = word_tokenize(text)
print("\nTokens:")
print(tokens)


Tokens:
['Stemming', 'algorithms', 'are', 'useful', 'for', 'reducing', 'words', 'like', 'running', ',', 'runs', ',', 'and', 'runner', 'into', 'a', 'common', 'root', '.']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


**1. Porter Stemmer**: The Porter Stemmer is a rule-based stemming algorithm. It applies a sequence of deterministic rules to remove common English suffixes.

In [17]:
# Stem every token with the rule-based Porter algorithm.
porter = PorterStemmer()
porter_stems = list(map(porter.stem, tokens))

print("\nPorter Stemmer Output:")
print(porter_stems)



Porter Stemmer Output:
['stem', 'algorithm', 'are', 'use', 'for', 'reduc', 'word', 'like', 'run', ',', 'run', ',', 'and', 'runner', 'into', 'a', 'common', 'root', '.']


**2. Snowball Stemmer**: The Snowball Stemmer is an improved version of the Porter algorithm with refined rules and support for multiple languages.

In [18]:
# Stem every token with the English Snowball stemmer.
snowball = SnowballStemmer("english")
snowball_stems = list(map(snowball.stem, tokens))

print("\nSnowball Stemmer Output:")
print(snowball_stems)



Snowball Stemmer Output:
['stem', 'algorithm', 'are', 'use', 'for', 'reduc', 'word', 'like', 'run', ',', 'run', ',', 'and', 'runner', 'into', 'a', 'common', 'root', '.']


## 3. Lemmatization (WordNet Lemmatizer)


Lemmatization is a linguistically informed text normalization technique that reduces words to their base dictionary form, known as a lemma, by considering the word's part of speech and contextual meaning.

WordNet Lemmatizer

WordNet is a large lexical database of English that groups words into sets of synonyms (synsets) and defines semantic relationships among them.

In [19]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [20]:
lemmatizer = WordNetLemmatizer()


In [21]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' and 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to ``wordnet.NOUN``.

    Args:
        treebank_tag: POS tag string, e.g. as produced by ``nltk.pos_tag``.

    Returns:
        One of the ``wordnet`` POS constants.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags (punctuation, determiners, ...) default to noun.
    return wordnet.NOUN


In [25]:
# Download the tagger model used by nltk.pos_tag below.
nltk.download('averaged_perceptron_tagger_eng')

# Tag the tokens produced in the stemming section; each item is a (word, tag) pair.
pos_tags = nltk.pos_tag(tokens)

# Lemmatize each word using the WordNet POS derived from its tag, so that
# verbs are reduced as verbs (recorded output: 'are' -> 'be', 'reducing' -> 'reduce').
lemmatized_words = [
    lemmatizer.lemmatize(word, get_wordnet_pos(pos))
    for word, pos in pos_tags
]

print("\nWordNet Lemmatization (With POS Tags):")
print(lemmatized_words)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.



WordNet Lemmatization (With POS Tags):
['Stemming', 'algorithm', 'be', 'useful', 'for', 'reduce', 'word', 'like', 'run', ',', 'run', ',', 'and', 'runner', 'into', 'a', 'common', 'root', '.']
