In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
# Different tokenizers: the cells below run several NLTK tokenizers on the same sample sentence

In [3]:
# Sample sentence fed to every tokenizer below. It mixes commas, contractions,
# a hashtag, an @-mention and a date, and it starts and ends with a space.
# The adjacent literals are joined by Python into one single string.
sentence = (
    " I love eating apples, oranges, and bananas. "
    "Don't you? better happily can't "
    "#beautiful new york city @ask 12/25/2022 "
)

In [4]:
# Whitespace tokenization

In [5]:
from nltk.tokenize import WhitespaceTokenizer

# Split on whitespace only; punctuation stays attached to the neighbouring word.
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokenized = whitespace_tokenizer.tokenize(sentence)
print(whitespace_tokenized)

['I', 'love', 'eating', 'apples,', 'oranges,', 'and', 'bananas.', "Don't", 'you?', 'better', 'happily', "can't", '#beautiful', 'new', 'york', 'city', '@ask', '12/25/2022']


In [6]:
# Treebank word tokenization: separates most punctuation from words and
# splits contractions into two tokens (e.g. "Don't" -> 'Do', "n't").

from nltk.tokenize import TreebankWordTokenizer

treebank_tokenizer = TreebankWordTokenizer()
treebank_tokenized = treebank_tokenizer.tokenize(sentence)
print(treebank_tokenized)

['I', 'love', 'eating', 'apples', ',', 'oranges', ',', 'and', 'bananas.', 'Do', "n't", 'you', '?', 'better', 'happily', 'ca', "n't", '#', 'beautiful', 'new', 'york', 'city', '@', 'ask', '12/25/2022']


In [7]:
# Multi-word expression (MWE) tokenization: the listed word sequences are
# merged into a single token (joined with "_" in the printed result).

from nltk.tokenize import MWETokenizer

# Expressions to merge, each written as a tuple of consecutive tokens.
s1 = [("I", "love")]
mwe_tokenizer = MWETokenizer(s1)

# The tokenizer is given a list of tokens rather than raw text,
# so the sentence is split on whitespace first.
pre_split_tokens = sentence.split()
mwe_tokens = mwe_tokenizer.tokenize(pre_split_tokens)
print(mwe_tokens)

['I_love', 'eating', 'apples,', 'oranges,', 'and', 'bananas.', "Don't", 'you?', 'better', 'happily', "can't", '#beautiful', 'new', 'york', 'city', '@ask', '12/25/2022']


In [8]:
# Tweet tokenization: hashtags, @-mentions and contractions are kept as
# single tokens, while ordinary punctuation is split off.

from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()
# This token list is the input for the stemming and lemmatization cells.
tweet_tokenized = tweet_tokenizer.tokenize(sentence)
print(tweet_tokenized)

['I', 'love', 'eating', 'apples', ',', 'oranges', ',', 'and', 'bananas', '.', "Don't", 'you', '?', 'better', 'happily', "can't", '#beautiful', 'new', 'york', 'city', '@ask', '12/25', '/', '2022']


In [9]:
# Punctuation-based word tokenization: runs of word characters and runs of
# punctuation become separate tokens (e.g. "Don't" -> 'Don', "'", 't').

from nltk.tokenize import wordpunct_tokenize

wordpunct_tokenized = wordpunct_tokenize(sentence)
print(wordpunct_tokenized)

['I', 'love', 'eating', 'apples', ',', 'oranges', ',', 'and', 'bananas', '.', 'Don', "'", 't', 'you', '?', 'better', 'happily', 'can', "'", 't', '#', 'beautiful', 'new', 'york', 'city', '@', 'ask', '12', '/', '25', '/', '2022']


In [10]:
# Different stemming techniques
# Snowball stemming (English) applied to the tweet-tokenized sentence.

from nltk.stem.snowball import SnowballStemmer

snowballstemmer = SnowballStemmer(language='english')

# Stem every token, keeping the original token order.
stem_words = [snowballstemmer.stem(token) for token in tweet_tokenized]

print('Before Snowball Stemming')
print(tweet_tokenized, '\n')  # the extra '\n' leaves a blank line between the two listings
print('After Snowball Stemming')
print(stem_words)

Before Snowball Stemming
['I', 'love', 'eating', 'apples', ',', 'oranges', ',', 'and', 'bananas', '.', "Don't", 'you', '?', 'better', 'happily', "can't", '#beautiful', 'new', 'york', 'city', '@ask', '12/25', '/', '2022'] 

After Snowball Stemming
['i', 'love', 'eat', 'appl', ',', 'orang', ',', 'and', 'banana', '.', "don't", 'you', '?', 'better', 'happili', "can't", '#beauti', 'new', 'york', 'citi', '@ask', '12/25', '/', '2022']


In [11]:
# Porter stemming applied to the tweet-tokenized sentence.
from nltk.stem import PorterStemmer

porterstemmer = PorterStemmer()

# Stem every token, keeping the original token order.
stem_words = [porterstemmer.stem(token) for token in tweet_tokenized]

# Headings name the algorithm correctly: it is the Porter stemmer, not "Potter".
print('Before Porter Stemming')
print(tweet_tokenized, '\n')  # the extra '\n' leaves a blank line between the two listings
print('After Porter Stemming')
print(stem_words)

Before Potter Stemming
['I', 'love', 'eating', 'apples', ',', 'oranges', ',', 'and', 'bananas', '.', "Don't", 'you', '?', 'better', 'happily', "can't", '#beautiful', 'new', 'york', 'city', '@ask', '12/25', '/', '2022'] 

After Potter Stemming
['i', 'love', 'eat', 'appl', ',', 'orang', ',', 'and', 'banana', '.', "don't", 'you', '?', 'better', 'happili', "can't", '#beauti', 'new', 'york', 'citi', '@ask', '12/25', '/', '2022']


In [12]:
# Lemmatization with WordNet, applied to the tweet-tokenized sentence.
# lemmatize() is called without a part-of-speech argument, so every token is
# looked up with the lemmatizer's default POS (noun, per the NLTK docs);
# that is why verb forms such as 'eating' come back unchanged.
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer needs the WordNet corpus on disk. Fetch it here so this
# cell also works on a fresh kernel or machine where it was never downloaded;
# when the corpus is already present the call does not download it again.
nltk.download('wordnet', quiet=True)

wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize every token, keeping the original token order.
lemmatized_words = [wordnet_lemmatizer.lemmatize(token) for token in tweet_tokenized]
# Earlier name for this list; kept as an alias in case another cell still reads it.
stem_words = lemmatized_words

print('Before Lemmatization')
print(tweet_tokenized, '\n')  # the extra '\n' leaves a blank line between the two listings
print('After Lemmatization')
print(lemmatized_words)


Before Lemmatization
['I', 'love', 'eating', 'apples', ',', 'oranges', ',', 'and', 'bananas', '.', "Don't", 'you', '?', 'better', 'happily', "can't", '#beautiful', 'new', 'york', 'city', '@ask', '12/25', '/', '2022'] 

After Lemmatization
['I', 'love', 'eating', 'apple', ',', 'orange', ',', 'and', 'banana', '.', "Don't", 'you', '?', 'better', 'happily', "can't", '#beautiful', 'new', 'york', 'city', '@ask', '12/25', '/', '2022']


In [13]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet corpus used by the lemmatizer
# (NLTK reports it as up-to-date when it is already installed).
nltk.download('wordnet')

# Word to lemmatize, together with an explicit part-of-speech hint.
word = "running"

lemmatizer = WordNetLemmatizer()
lemma = lemmatizer.lemmatize(word, pos='v')  # 'v' marks the word as a verb
print(f"The lemma of '{word}' when it's a verb is '{lemma}'")


The lemma of 'running' when it's a verb is 'run'


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
