In [1]:
# Stemming and Lemmatization Techniques in Text Preprocessing
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Download required resources.
# NLTK 3.9 and later look up the tokenizer and tagger data under new names
# ('punkt_tab', 'averaged_perceptron_tagger_eng'), while older releases use
# the original names. Requesting both keeps the script runnable on either;
# nltk.download() reports an unavailable package instead of raising.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Initialize stemmer and lemmatizer (shared by the helper functions below)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function to get WordNet POS tags for more accurate lemmatization
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first
    letter of the resulting tag selects the WordNet category:
    ``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb, ``N`` -> noun.
    Any other tag falls back to noun.
    """
    tagged_pairs = nltk.pos_tag([word])
    pos_label = tagged_pairs[0][1]
    first_letter = pos_label[0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to NOUN.
    return wordnet.NOUN

# Function for stemming
def stem_words(text):
    """Tokenize *text* and return the stem of every token, in order.

    Tokenization uses ``word_tokenize`` and stemming uses the
    module-level ``stemmer``.
    """
    return list(map(stemmer.stem, word_tokenize(text)))

# Function for lemmatization (with POS tagging)
def lemmatize_words(text):
    """Tokenize *text* and return the lemma of every token, in order.

    The full token sequence is POS-tagged in a single ``nltk.pos_tag`` call
    so the tagger can use the surrounding words as context. Tagging each
    token in isolation discards that context and invokes the tagger once per
    token. The first letter of each tag is then mapped to the matching
    WordNet category (adjective, noun, verb, adverb), defaulting to noun.

    Returns a list with exactly one lemma per token produced by
    ``word_tokenize``; an empty *text* yields an empty list.
    """
    tokens = word_tokenize(text)
    tag_to_wordnet = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return [
        # tag[:1] (not tag[0]) so an unexpected empty tag falls back to NOUN.
        lemmatizer.lemmatize(token, tag_to_wordnet.get(tag[:1].upper(), wordnet.NOUN))
        for token, tag in nltk.pos_tag(tokens)
    ]

# Sample passage used to demonstrate both techniques
text = """Formula 1 is the pinnacle of racing culture, where speed, strategy, and precision come together in a high-stakes blend of engineering and raw skill.
 With cars that push the limits of physics and drivers that embody relentless competitiveness, F1 captures the fascination of millions worldwide."""

# Run the stemmer over the passage and show the result
stemmed_words = stem_words(text)
print("Stemmed Words:", stemmed_words)

# Run the lemmatizer over the passage and show the result
lemmatized_words = lemmatize_words(text)
print("Lemmatized Words:", lemmatized_words)

# BONUS: print each original token next to its stem and its lemma.
# All three sequences come from the same tokenization of `text`, so they
# line up one-to-one.
print("\nComparison of Original, Stemmed, and Lemmatized Words:")
tokens = word_tokenize(text)
for token, stemmed, lemmatized in zip(tokens, stemmed_words, lemmatized_words):
    print(f"Original: {token} \t\t\t| Stemmed: {stemmed} \t\t\t| Lemmatized: {lemmatized}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Stemmed Words: ['formula', '1', 'is', 'the', 'pinnacl', 'of', 'race', 'cultur', ',', 'where', 'speed', ',', 'strategi', ',', 'and', 'precis', 'come', 'togeth', 'in', 'a', 'high-stak', 'blend', 'of', 'engin', 'and', 'raw', 'skill', '.', 'with', 'car', 'that', 'push', 'the', 'limit', 'of', 'physic', 'and', 'driver', 'that', 'embodi', 'relentless', 'competit', ',', 'f1', 'captur', 'the', 'fascin', 'of', 'million', 'worldwid', '.']
Lemmatized Words: ['Formula', '1', 'be', 'the', 'pinnacle', 'of', 'race', 'culture', ',', 'where', 'speed', ',', 'strategy', ',', 'and', 'precision', 'come', 'together', 'in', 'a', 'high-stakes', 'blend', 'of', 'engineering', 'and', 'raw', 'skill', '.', 'With', 'car', 'that', 'push', 'the', 'limit', 'of', 'physic', 'and', 'driver', 'that', 'embody', 'relentless', 'competitiveness', ',', 'F1', 'capture', 'the', 'fascination', 'of', 'million', 'worldwide', '.']

Comparison of Original, Stemmed, and Lemmatized Words:
Original: Formula 			| Stemmed: formula 			| Lem