<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Likhitha/2_text_cleaning_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import string
import nltk
import spacy
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


True

In [2]:
try:
    nlp = spacy.load("en_core_web_sm")
    print("✅ spaCy model loaded successfully!")
except OSError:
    print("❌ Please install spaCy English model: python -m spacy download en_core_web_sm")
    nlp = None

# Sample messy text data (emojis removed)
messy_texts = [
    "Hey there!!! Check out this AMAZING deal at https://example.com/sale",
    "EMAIL me at john.doe@company.com for more INFO!!!",
    "   So much    extra    whitespace   everywhere   ",
    "HTML content: <p>This is a paragraph</p> with <b>bold</b> text",
    "Phone: +1-555-123-4567 or call (555) 987-6543 today!",
    "Social media: Follow @username #trending #viral #amazing",
    "Mixed CASE and 123 numbers and $pecial ch@r@cters!!!",
    "Line breaks\nand\ttabs\teverywhere\r\n",
    "Contractions like don't, won't, I'm, and we're are common",
    "Numbers: 1st, 2nd, 3rd places and dates like 01/15/2024"
]

print("Sample Messy Texts:")
print("=" * 50)
for i, text in enumerate(messy_texts, 1):
    print(f"{i:2d}. {repr(text)}")

✅ spaCy model loaded successfully!
Sample Messy Texts:
 1. 'Hey there!!! Check out this AMAZING deal at https://example.com/sale'
 2. 'EMAIL me at john.doe@company.com for more INFO!!!'
 3. '   So much    extra    whitespace   everywhere   '
 4. 'HTML content: <p>This is a paragraph</p> with <b>bold</b> text'
 5. 'Phone: +1-555-123-4567 or call (555) 987-6543 today!'
 6. 'Social media: Follow @username #trending #viral #amazing'
 7. 'Mixed CASE and 123 numbers and $pecial ch@r@cters!!!'
 8. 'Line breaks\nand\ttabs\teverywhere\r\n'
 9. "Contractions like don't, won't, I'm, and we're are common"
10. 'Numbers: 1st, 2nd, 3rd places and dates like 01/15/2024'


In [4]:
def demonstrate_cleaning_steps(text):
    """Show each cleaning step with before/after examples"""
    print(f"Original: {repr(text)}")
    print(f"Display:  {text}")
    print("-" * 80)

    # Step 1: Remove URLs
    url_pattern = r'https?://\S+'
    step1 = re.sub(url_pattern, '', text)
    print(f"After URL removal: {step1}")

    # Step 2: Remove email addresses
    email_pattern = r'\b[\w.-]+?@\w+?\.\w+?\b'
    step2 = re.sub(email_pattern, '', step1)
    print(f"After email removal: {step2}")

    # Step 3: Remove HTML tags
    html_pattern = r'<[^>]+>'
    step3 = re.sub(html_pattern, '', step2)
    print(f"After HTML removal: {step3}")

    # Step 4: Remove phone numbers
    phone_pattern = r'\+?\d{1,2}?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    step4 = re.sub(phone_pattern, '', step3)
    print(f"After phone removal: {step4}")

    # Step 5: Remove hashtags and mentions
    step5 = re.sub(r'#\w+', '', step4)
    step5 = re.sub(r'@\w+', '', step5)
    print(f"After social media removal: {step5}")

    # Step 6: Remove emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    step6 = emoji_pattern.sub(r'', step5)
    print(f"After emoji removal: {step6}")

    # Step 7: Normalize whitespace
    step7 = re.sub(r'\s+', ' ', step6).strip()
    print(f"After whitespace normalization: {step7}")

    # Step 8: Convert to lowercase
    cleaned = step7.lower()
    print(f"After lowercasing: {cleaned}")

    print("=" * 80)
    return cleaned

print("\nTEXT CLEANING DEMONSTRATION")
print("=" * 80)
for i, text in enumerate(messy_texts[:3], 1):
    print(f"\nExample {i}:")
    cleaned = demonstrate_cleaning_steps(text)
    print(f"\nFinal result: '{cleaned}'")
    print("=" * 80)


TEXT CLEANING DEMONSTRATION

Example 1:
Original: 'Hey there!!! Check out this AMAZING deal at https://example.com/sale'
Display:  Hey there!!! Check out this AMAZING deal at https://example.com/sale
--------------------------------------------------------------------------------
After URL removal: Hey there!!! Check out this AMAZING deal at 
After email removal: Hey there!!! Check out this AMAZING deal at 
After HTML removal: Hey there!!! Check out this AMAZING deal at 
After phone removal: Hey there!!! Check out this AMAZING deal at 
After social media removal: Hey there!!! Check out this AMAZING deal at 
After emoji removal: Hey there!!! Check out this AMAZING deal at 
After whitespace normalization: Hey there!!! Check out this AMAZING deal at
After lowercasing: hey there!!! check out this amazing deal at

Final result: 'hey there!!! check out this amazing deal at'

Example 2:
Original: 'EMAIL me at john.doe@company.com for more INFO!!!'
Display:  EMAIL me at john.doe@company.com f

In [5]:
sample_text = """
Dr. Smith's research on AI (published in 2024) shows that machine learning can't
solve every problem. However, it's incredibly useful! Visit https://example.com
for more details. What do you think?
"""

print("\nTOKENIZATION COMPARISON")
print("=" * 60)
print(f"Sample text: {sample_text.strip()}")
print("-" * 60)


TOKENIZATION COMPARISON
Sample text: Dr. Smith's research on AI (published in 2024) shows that machine learning can't
solve every problem. However, it's incredibly useful! Visit https://example.com
for more details. What do you think?
------------------------------------------------------------


In [5]:
simple_tokens = sample_text.split()
print(f"\n1. SIMPLE SPLIT ({len(simple_tokens)} tokens):")
print(simple_tokens)

# Method 2: NLTK word tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
nltk_word_tokens = word_tokenize(sample_text)
print(f"\n2. NLTK WORD TOKENIZE ({len(nltk_word_tokens)} tokens):")
print(nltk_word_tokens)

# Method 3: NLTK sentence tokenization
nltk_sent_tokens = sent_tokenize(sample_text)
print(f"\n3. NLTK SENTENCE TOKENIZE ({len(nltk_sent_tokens)} sentences):")
for i, sent in enumerate(nltk_sent_tokens, 1):
    print(f"   {i}. {sent.strip()}")

# Method 4: spaCy tokenization (if available)
if nlp:
    doc = nlp(sample_text)
    spacy_tokens = [token.text for token in doc]
    print(f"\n4. SPACY TOKENIZE ({len(spacy_tokens)} tokens):")
    print(spacy_tokens)

    spacy_sents = [sent.text.strip() for sent in doc.sents]
    print(f"\n5. SPACY SENTENCES ({len(spacy_sents)} sentences):")
    for i, sent in enumerate(spacy_sents, 1):
        print(f"   {i}. {sent}")

# Method 5: Regex tokenization
regex_tokens = re.findall(r'\b\w+\b', sample_text)
print(f"\n6. REGEX TOKENIZE (\\b\\w+\\b) ({len(regex_tokens)} tokens):")
print(regex_tokens)

print("\n" + "=" * 60)
print("Key Differences:")
print("• Simple split: Fast but crude")
print("• NLTK: Handles punctuation & contractions")
print("• spaCy: Most advanced, includes linguistic context")
print("• Regex: Custom and pattern-based")


1. SIMPLE SPLIT (29 tokens):
['Dr.', "Smith's", 'research', 'on', 'AI', '(published', 'in', '2024)', 'shows', 'that', 'machine', 'learning', "can't", 'solve', 'every', 'problem.', 'However,', "it's", 'incredibly', 'useful!', 'Visit', 'https://example.com', 'for', 'more', 'details.', 'What', 'do', 'you', 'think?']


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [6]:
import shutil

# Remove all known NLTK directories to clear corrupted data
shutil.rmtree('/root/nltk_data', ignore_errors=True)
shutil.rmtree('/usr/share/nltk_data', ignore_errors=True)
shutil.rmtree('/usr/local/share/nltk_data', ignore_errors=True)
shutil.rmtree('/usr/local/lib/nltk_data', ignore_errors=True)
shutil.rmtree('/usr/lib/nltk_data', ignore_errors=True)
!pip uninstall -y nltk
!pip install --no-cache-dir nltk
import nltk
nltk.download('punkt')        # Tokenizer model
nltk.download('stopwords')    # Optional
nltk.download('wordnet')      # Optional
from nltk.tokenize import word_tokenize, sent_tokenize

sample_text = "Dr. Smith's research in 2024 shows that AI can't solve everything."
print("Word tokens:", word_tokenize(sample_text))
print("Sentence tokens:", sent_tokenize(sample_text))


Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
