In [3]:
#BAI-7B
#22K-4080
import re
import string
import unicodedata

import nltk

# Fetch the NLTK data packages used below: the tokenizer models
# ('punkt', 'punkt_tab'), the stop-word lists and the WordNet corpus.
# quiet=True suppresses the downloader's progress messages.
for resource_name in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource_name, quiet=True)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def _print_banner(title):
    """Print ``title`` framed above and below by an 80-character '=' rule."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


def _print_step(label, value):
    """Print a step label, that step's result, and a trailing blank line."""
    print(label)
    print(value)
    print()


def _strip_punctuation(text):
    """Return ``text`` with every punctuation character deleted.

    A character is dropped when it is in ``string.punctuation`` (the ASCII
    set, which also contains symbols such as ``$`` and ``+``) or when its
    Unicode general category starts with 'P'. The second test is what removes
    typographic marks such as the en dash or curly quotes, which
    ``string.punctuation`` does not contain.
    """
    return ''.join(
        ch for ch in text
        if ch not in string.punctuation
        and not unicodedata.category(ch).startswith('P')
    )


def process_text(text):
    """Run a step-by-step text normalisation pipeline and print each stage.

    The stages are: lowercase, remove digits, remove punctuation, collapse
    whitespace, tokenize, remove English stop words, stem (Porter) and
    lemmatize (WordNet). Stemming and lemmatization are both applied to the
    stop-word-filtered tokens, not to each other's output, so the two results
    can be compared side by side.

    Args:
        text: The raw input string.

    Returns:
        A dict with three lists of strings:
            'cleaned_tokens'    - tokens after stop-word removal,
            'stemmed_tokens'    - Porter stems of those tokens,
            'lemmatized_tokens' - WordNet lemmas of those tokens.
    """
    _print_banner("ORIGINAL TEXT:")
    print(text)
    print()

    # Step 1: Convert to lowercase
    text_lower = text.lower()
    _print_step("Step 1 - Lowercase:", text_lower)

    # Step 2: Remove digits
    text_no_digits = re.sub(r'\d+', '', text_lower)
    _print_step("Step 2 - Digits Removed:", text_no_digits)

    # Step 3: Remove punctuation (ASCII and Unicode). Stripping only
    # string.punctuation would leave marks like the en dash behind, and they
    # would then show up as standalone tokens in every later step.
    text_no_punct = _strip_punctuation(text_no_digits)
    _print_step("Step 3 - Punctuation Removed:", text_no_punct)

    # Step 4: Trim extra whitespace (split/join collapses runs of whitespace,
    # including newlines, into single spaces)
    text_cleaned = ' '.join(text_no_punct.split())
    _print_step("Step 4 - Whitespace Trimmed:", text_cleaned)

    # Step 5: Tokenize
    tokens = word_tokenize(text_cleaned)
    _print_step("Step 5 - Tokenized:", tokens)

    # Step 6: Remove stop words. Punctuation was already stripped in step 3,
    # so a contraction such as "don't" reaches this point as "dont"; add the
    # punctuation-free spelling of every stop word so those are still caught.
    stop_words = set(stopwords.words('english'))
    stop_words |= {_strip_punctuation(word) for word in stop_words}
    tokens_no_stop = [word for word in tokens if word not in stop_words]
    _print_step("Step 6 - Stop Words Removed:", tokens_no_stop)

    # Step 7: Apply stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens_no_stop]
    _print_step("Step 7 - Stemmed Tokens:", stemmed_tokens)

    # Step 8: Apply lemmatization. lemmatize() is called without a
    # part-of-speech argument, so NLTK's default POS (noun) is used for
    # every token.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens_no_stop]
    _print_step("Step 8 - Lemmatized Tokens:", lemmatized_tokens)

    # Step 9: Compare stemming vs lemmatization
    _print_banner("COMPARISON: STEMMING VS LEMMATIZATION")
    print(f"{'Original':<20} {'Stemmed':<20} {'Lemmatized':<20}")
    print("-" * 60)
    for orig, stem, lem in zip(tokens_no_stop, stemmed_tokens, lemmatized_tokens):
        # Flag rows where the two techniques disagree
        marker = " *" if stem != lem else ""
        print(f"{orig:<20} {stem:<20} {lem:<20}{marker}")
    print()
    print("* indicates different results between stemming and lemmatization")
    print()

    # Final output summary
    _print_banner("FINAL OUTPUT SUMMARY")
    print("\nCleaned & Tokenized:")
    print(tokens_no_stop)
    print("\nStemmed Tokens:")
    print(stemmed_tokens)
    print("\nLemmatized Tokens:")
    print(lemmatized_tokens)
    print()

    return {
        'cleaned_tokens': tokens_no_stop,
        'stemmed_tokens': stemmed_tokens,
        'lemmatized_tokens': lemmatized_tokens
    }


# Sample paragraph fed to the pipeline. It contains capital letters, ASCII
# punctuation (parentheses, quotes, commas, '!'), a non-ASCII en dash and the
# contraction "Let's", and it spans several lines, so the newlines are part
# of the string.
input_text = """Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence! In this lecture, we
explored tokenization – splitting text into words or subwords. We also discussed removing
punctuation, converting to lowercase, and eliminating stop words like 'is', 'the', and 'an'. Let's
normalize text properly before passing it to a model."""

# Run the pipeline; process_text prints every stage and returns a dict with
# the keys 'cleaned_tokens', 'stemmed_tokens' and 'lemmatized_tokens'.
result = process_text(input_text)

ORIGINAL TEXT:
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence! In this lecture, we
explored tokenization – splitting text into words or subwords. We also discussed removing
punctuation, converting to lowercase, and eliminating stop words like 'is', 'the', and 'an'. Let's
normalize text properly before passing it to a model.

Step 1 - Lowercase:
natural language processing (nlp) is a fascinating field of artificial intelligence! in this lecture, we
explored tokenization – splitting text into words or subwords. we also discussed removing
punctuation, converting to lowercase, and eliminating stop words like 'is', 'the', and 'an'. let's
normalize text properly before passing it to a model.

Step 2 - Digits Removed:
natural language processing (nlp) is a fascinating field of artificial intelligence! in this lecture, we
explored tokenization – splitting text into words or subwords. we also discussed removing
punctuation, converting to lowercase, and elimi

In [1]:
!pip install nltk

