**Perform tokenization (whitespace, punctuation-based, Treebank, Tweet, MWE) using the NLTK library. Use the Porter stemmer and the Snowball stemmer for stemming. Use any technique for lemmatization.**

In [None]:
import nltk
from nltk.tokenize import word_tokenize, regexp_tokenize, TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer

# Ensure you have the necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
text = "The phone is great! Battery life is excellent, and the camera quality is amazing. Definitely worth the price."

# Whitespace Tokenization
whitespace_tokens = text.split()
print("Whitespace Tokenization:", whitespace_tokens)


Whitespace Tokenization: ['The', 'phone', 'is', 'great!', 'Battery', 'life', 'is', 'excellent,', 'and', 'the', 'camera', 'quality', 'is', 'amazing.', 'Definitely', 'worth', 'the', 'price.']


In [None]:
# Punctuation-based Tokenization using regexp_tokenize
punctuation_tokens = regexp_tokenize(text, pattern=r'\s|[\.,;?!"]')
print("Punctuation-based Tokenization:", punctuation_tokens)


Punctuation-based Tokenization: [' ', ' ', ' ', '!', ' ', ' ', ' ', ' ', ',', ' ', ' ', ' ', ' ', ' ', ' ', '.', ' ', ' ', ' ', ' ', '.']


In [None]:
# Treebank Tokenization
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
print("Treebank Tokenization:", treebank_tokens)


Treebank Tokenization: ['The', 'phone', 'is', 'great', '!', 'Battery', 'life', 'is', 'excellent', ',', 'and', 'the', 'camera', 'quality', 'is', 'amazing.', 'Definitely', 'worth', 'the', 'price', '.']


In [None]:
# Tweet Tokenization
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print("Tweet Tokenization:", tweet_tokens)


Tweet Tokenization: ['The', 'phone', 'is', 'great', '!', 'Battery', 'life', 'is', 'excellent', ',', 'and', 'the', 'camera', 'quality', 'is', 'amazing', '.', 'Definitely', 'worth', 'the', 'price', '.']


In [None]:
# Multi-Word Expression Tokenization (MWE)
mwe_phrases = [('New York', 'New York')]
mwe_tokenizer = MWETokenizer(mwe_phrases)
mwe_tokens = mwe_tokenizer.tokenize(text.split())
print("MWE Tokenization:", mwe_tokens)


MWE Tokenization: ['The', 'phone', 'is', 'great!', 'Battery', 'life', 'is', 'excellent,', 'and', 'the', 'camera', 'quality', 'is', 'amazing.', 'Definitely', 'worth', 'the', 'price.']


In [None]:
# Porter Stemmer
porter_stemmer = PorterStemmer()
porter_stems = [porter_stemmer.stem(word) for word in treebank_tokens]
print("Porter Stemmer:", porter_stems)


Porter Stemmer: ['the', 'phone', 'is', 'great', '!', 'batteri', 'life', 'is', 'excel', ',', 'and', 'the', 'camera', 'qualiti', 'is', 'amazing.', 'definit', 'worth', 'the', 'price', '.']


In [None]:
# Snowball Stemmer
snowball_stemmer = SnowballStemmer('english')
snowball_stems = [snowball_stemmer.stem(word) for word in treebank_tokens]
print("Snowball Stemmer:", snowball_stems)


Snowball Stemmer: ['the', 'phone', 'is', 'great', '!', 'batteri', 'life', 'is', 'excel', ',', 'and', 'the', 'camera', 'qualiti', 'is', 'amazing.', 'definit', 'worth', 'the', 'price', '.']


In [None]:
# Lemmatization using WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize with POS tagging
pos_tags = nltk.pos_tag(treebank_tokens)
lemmatized_words = []

for word, tag in pos_tags:
    if tag.startswith('V'):  # If the word is a verb
        lemma = lemmatizer.lemmatize(word, pos='v')
    elif tag.startswith('N'):  # If the word is a noun
        lemma = lemmatizer.lemmatize(word, pos='n')
    else:
        lemma = lemmatizer.lemmatize(word)
    lemmatized_words.append(lemma)

print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['The', 'phone', 'be', 'great', '!', 'Battery', 'life', 'be', 'excellent', ',', 'and', 'the', 'camera', 'quality', 'be', 'amazing.', 'Definitely', 'worth', 'the', 'price', '.']
