In [7]:
# setup.py - Step 1 & 2: Install and download NLTK resources
import nltk
import subprocess
import sys

def install_nltk():
    """Step 1: Install NLTK using pip"""
    print("=" * 60)
    print("STEP 1: Installing NLTK")
    print("=" * 60)

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
        print("✓ NLTK installed successfully!")
    except:
        print("⚠ Could not install NLTK via pip")
        print("Please run manually: pip install nltk")
    print()

def download_datasets():
    """Step 2: Download required NLTK datasets"""
    print("=" * 60)
    print("STEP 2: Downloading NLTK datasets")
    print("=" * 60)

    datasets = {
        'punkt': 'Tokenizer models',
        'stopwords': 'Stopwords corpus',
        'wordnet': 'WordNet lexical database'
    }

    for dataset, description in datasets.items():
        try:
            nltk.download(dataset, quiet=True)
            print(f"✓ Downloaded '{dataset}' - {description}")
        except:
            print(f"⚠ Error downloading '{dataset}'")
    print()

if __name__ == "__main__":
    install_nltk()
    download_datasets()
    print("Setup complete! Now run the other scripts.")

STEP 1: Installing NLTK
✓ NLTK installed successfully!

STEP 2: Downloading NLTK datasets
✓ Downloaded 'punkt' - Tokenizer models
✓ Downloaded 'stopwords' - Stopwords corpus
✓ Downloaded 'wordnet' - WordNet lexical database

Setup complete! Now run the other scripts.


In [9]:
# preprocessing.py - Steps 3-8: Complete preprocessing pipeline
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download punkt_tab resource specifically
try:
    nltk.download('punkt_tab', quiet=True)
    print("✓ Downloaded 'punkt_tab' resource.")
except:
    print("⚠ Error downloading 'punkt_tab'.")

def load_sample_corpus():
    """Step 3: Load a sample text corpus"""
    sample_text = """Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
    It helps computers understand, interpret, and manipulate human language.
    NLP is used in many applications like chatbots, sentiment analysis, and machine translation.
    Researchers are constantly working on improving NLP techniques to make them more accurate."""

    print("=" * 60)
    print("STEP 3: SAMPLE TEXT CORPUS")
    print("=" * 60)
    print(sample_text)
    print()
    return sample_text

def tokenize_text(text):
    """Step 4: Perform tokenization"""
    print("=" * 60)
    print("STEP 4: TOKENIZATION")
    print("=" * 60)

    tokens = word_tokenize(text)
    print(f"Total tokens: {len(tokens)}")
    print("Tokens:", tokens)
    print()
    return tokens

def remove_stopwords(tokens):
    """Step 5: Remove stopwords"""
    print("=" * 60)
    print("STEP 5: STOPWORD REMOVAL")
    print("=" * 60)

    stop_words = set(stopwords.words('english'))
    print(f"Total stopwords in NLTK: {len(stop_words)}")
    print(f"Sample stopwords: {list(stop_words)[:10]}...")

    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Also remove punctuation
    filtered_tokens = [word for word in filtered_tokens if word not in string.punctuation]

    print(f"\nTokens after stopword removal: {len(filtered_tokens)}")
    print("Filtered tokens:", filtered_tokens)
    print()
    return filtered_tokens

def apply_stemming(tokens):
    """Step 6: Apply stemming"""
    print("=" * 60)
    print("STEP 6: STEMMING")
    print("=" * 60)

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    print("Original → Stemmed")
    print("-" * 30)
    for original, stemmed in zip(tokens[:15], stemmed_tokens[:15]):
        if original != stemmed:
            print(f"{original:<15} → {stemmed}")

    print(f"\nAll stemmed tokens: {stemmed_tokens}")
    print()
    return stemmed_tokens

def apply_lemmatization(tokens):
    """Step 7: Apply lemmatization"""
    print("=" * 60)
    print("STEP 7: LEMMATIZATION")
    print("=" * 60)

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    print("Original → Lemmatized")
    print("-" * 30)
    for original, lemma in zip(tokens[:15], lemmatized_tokens[:15]):
        if original != lemma:
            print(f"{original:<15} → {lemma}")

    print(f"\nAll lemmatized tokens: {lemmatized_tokens}")
    print()
    return lemmatized_tokens

def main():
    """Main function to run the complete preprocessing pipeline"""
    print("TEXT PREPROCESSING PRACTICAL")
    print("=" * 60)

    # Run the complete pipeline
    text = load_sample_corpus()
    tokens = tokenize_text(text)
    filtered_tokens = remove_stopwords(tokens)
    stemmed_tokens = apply_stemming(filtered_tokens)
    lemmatized_tokens = apply_lemmatization(filtered_tokens)

    # Step 8: Display final comparison
    print("=" * 60)
    print("STEP 8: FINAL RESULTS COMPARISON")
    print("=" * 60)

    print("\nCOMPARISON TABLE:")
    print("-" * 50)
    print(f"{'Stage':<20} | {'Token Count':<15} | {'Sample'}")
    print("-" * 50)
    print(f"{'Original Text':<20} | {len(text.split()):<15} | {text[:30]}...")
    print(f"{'After Tokenization':<20} | {len(tokens):<15} | {tokens[:5]}...")
    print(f"{'After Stopword Removal':<20} | {len(filtered_tokens):<15} | {filtered_tokens[:5]}...")
    print(f"{'After Stemming':<20} | {len(stemmed_tokens):<15} | {stemmed_tokens[:5]}...")
    print(f"{'After Lemmatization':<20} | {len(lemmatized_tokens):<15} | {lemmatized_tokens[:5]}...")

    print("\n" + "=" * 60)
    print("PROCESSING COMPLETE!")
    print("=" * 60)

if __name__ == "__main__":
    main()


✓ Downloaded 'punkt_tab' resource.
TEXT PREPROCESSING PRACTICAL
STEP 3: SAMPLE TEXT CORPUS
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
    It helps computers understand, interpret, and manipulate human language.
    NLP is used in many applications like chatbots, sentiment analysis, and machine translation.
    Researchers are constantly working on improving NLP techniques to make them more accurate.

STEP 4: TOKENIZATION
Total tokens: 56
Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '.', 'It', 'helps', 'computers', 'understand', ',', 'interpret', ',', 'and', 'manipulate', 'human', 'language', '.', 'NLP', 'is', 'used', 'in', 'many', 'applications', 'like', 'chatbots', ',', 'sentiment', 'analysis', ',', 'and', 'machine', 'translation', '.', 'Researchers', 'are', 'constantly', 'working', 'on', 'improving', 'NLP', 'techniques', 'to', 'make', 'them', 'more', 'acc

In [10]:
# advanced_preprocessing.py - Additional preprocessing techniques
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet

def show_advanced_techniques():
    """Show additional text preprocessing techniques"""
    print("ADVANCED TEXT PREPROCESSING TECHNIQUES")
    print("=" * 60)

    text = "Natural Language Processing is amazing! It's changing how we interact with computers. The researchers' work on NLP models is revolutionary."

    # 1. Sentence Tokenization
    print("\n1. SENTENCE TOKENIZATION:")
    sentences = sent_tokenize(text)
    for i, sentence in enumerate(sentences, 1):
        print(f"Sentence {i}: {sentence}")

    # 2. Different Stemmers
    print("\n2. DIFFERENT STEMMERS COMPARISON:")
    words = ["running", "happily", "better", "went", "studies", "interesting"]

    porter = PorterStemmer()
    snowball = SnowballStemmer("english")

    print(f"{'Word':<15} {'Porter':<15} {'Snowball':<15}")
    print("-" * 45)
    for word in words:
        print(f"{word:<15} {porter.stem(word):<15} {snowball.stem(word):<15}")

    # 3. Lemmatization with POS tags
    print("\n3. LEMMATIZATION WITH POS TAGS:")
    from nltk import pos_tag
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()

    # Sample words with different POS
    test_words = [("running", "v"), ("better", "a"), ("dogs", "n"), ("running", "n")]

    print(f"{'Word':<10} {'POS':<5} {'Lemma':<15}")
    print("-" * 30)
    for word, pos in test_words:
        # Convert POS to WordNet format
        if pos.startswith('v'):
            pos_wordnet = wordnet.VERB
        elif pos.startswith('a'):
            pos_wordnet = wordnet.ADJ
        elif pos.startswith('n'):
            pos_wordnet = wordnet.NOUN
        else:
            pos_wordnet = wordnet.NOUN

        lemma = lemmatizer.lemmatize(word, pos=pos_wordnet)
        print(f"{word:<10} {pos:<5} {lemma:<15}")

    # 4. Custom stopwords
    print("\n4. CUSTOM STOPWORDS EXAMPLE:")
    custom_stopwords = set(stopwords.words('english'))
    custom_stopwords.update(['natural', 'language', 'processing'])

    sample = "Natural Language Processing helps in understanding natural language patterns."
    tokens = word_tokenize(sample.lower())
    filtered = [word for word in tokens if word not in custom_stopwords and word not in string.punctuation]

    print(f"Original: {sample}")
    print(f"Filtered: {filtered}")

if __name__ == "__main__":
    show_advanced_techniques()

ADVANCED TEXT PREPROCESSING TECHNIQUES

1. SENTENCE TOKENIZATION:
Sentence 1: Natural Language Processing is amazing!
Sentence 2: It's changing how we interact with computers.
Sentence 3: The researchers' work on NLP models is revolutionary.

2. DIFFERENT STEMMERS COMPARISON:
Word            Porter          Snowball       
---------------------------------------------
running         run             run            
happily         happili         happili        
better          better          better         
went            went            went           
studies         studi           studi          
interesting     interest        interest       

3. LEMMATIZATION WITH POS TAGS:
Word       POS   Lemma          
------------------------------
running    v     run            
better     a     good           
dogs       n     dog            
running    n     running        

4. CUSTOM STOPWORDS EXAMPLE:
Original: Natural Language Processing helps in understanding natural language patt

In [12]:
# run_all.py - Run all preprocessing steps
import subprocess
import sys

def run_script(script_name, description):
    """Run a Python script"""
    print(f"\n{'='*60}")
    print(f"RUNNING: {description}")
    print(f"{'='*60}")

    try:
        subprocess.run([sys.executable, script_name], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running {script_name}: {e}")
    except FileNotFoundError:
        print(f"Script {script_name} not found!")

def main():
    print("TEXT PREPROCESSING PRACTICAL - COMPLETE EXECUTION")

    # Run setup first
    run_script("setup.py", "Setup and Installation")

    # Run main preprocessing
    run_script("preprocessing.py", "Main Preprocessing Pipeline")

    # Run advanced techniques
    response = input("\nDo you want to see advanced techniques? (y/n): ")
    if response.lower() == 'y':
        run_script("advanced_preprocessing.py", "Advanced Preprocessing Techniques")

    print("\n" + "="*60)
    print("PRACTICAL COMPLETED SUCCESSFULLY!")
    print("="*60)

if __name__ == "__main__":
    main()

TEXT PREPROCESSING PRACTICAL - COMPLETE EXECUTION

RUNNING: Setup and Installation
Error running setup.py: Command '['/usr/bin/python3', 'setup.py']' returned non-zero exit status 2.

RUNNING: Main Preprocessing Pipeline
Error running preprocessing.py: Command '['/usr/bin/python3', 'preprocessing.py']' returned non-zero exit status 2.

Do you want to see advanced techniques? (y/n): nnn

PRACTICAL COMPLETED SUCCESSFULLY!
