# Solution: Cleaning and Tokenization of Customer Reviews

In [None]:
# Install and import libraries
!pip install --quiet nltk spacy matplotlib seaborn
import re
import nltk
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
nltk.download('punkt', quiet=True)
nlp = spacy.load("en_core_web_sm")

# Sample messy product reviews
# Each entry carries a different kind of noise (noted above the entry).
reviews = [
    # Upper-case emphasis, repeated '!', an emoji and a URL
    "Absolutely LOVED this product!!! Will buy again 😊 Visit http://shop.example.com",
    # Ellipsis and an @mention
    "Worst purchase ever... arrived broken, no response from support @helpdesk",
    # Semicolon, a "5/5" rating and a #hashtag
    "Ok quality; does the job. 5/5 stars! #satisfied",
    # Email address
    "Email me at user@example.org for details about bulk order!!!",
    # Currency amount and trailing ellipsis
    "Super overpriced!! Paid $299 but performance is meh...",
    # HTML markup with no whitespace between adjacent elements
    "<div>Great build quality</div><p>But shipping was slow</p>",
    # Two phone-number formats (international with spaces, parenthesised prefix)
    "Contact: +44 20 7946 0958 or (020)79460958",
    # Mixed casing, letters fused with digits, and bare symbols
    "MixedCASE and random123numbers and symbols %^&*",
    # Newline, tab and carriage-return characters
    "Line1\nLine2\tTabbed text\r\nEnd of review",
    # Em dashes and contractions written with straight apostrophes
    "Contractions—can't, won't, shouldn't—are common here."
]

# Cleaning function
def clean_text(text):
    """Strip noise from a raw review and return it lowercased.

    The steps run in a fixed order because later patterns would otherwise
    damage earlier ones: URLs and emails are removed before mentions (so the
    '@' in an address is not treated as a mention), and all structured spans
    are removed before the generic symbol filter.

    Every removed span is replaced by a single space rather than the empty
    string, so that text on either side of it (for example the words around
    ``</div><p>``) is not fused into one token. Surplus spaces are collapsed
    at the end.

    Args:
        text: The raw review string.

    Returns:
        The cleaned, lowercased string containing only ASCII letters, digits,
        straight apostrophes and single spaces.
    """
    # Typographic apostrophes would be wiped by the symbol filter below and
    # split contractions ("can’t" -> "can t"); map them to the ASCII form.
    text = text.replace('\u2019', "'").replace('\u2018', "'")
    text = re.sub(r'http[s]?://\S+', ' ', text)                         # URLs
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' ', text)             # Emails
    text = re.sub(r'<[^>]+>', ' ', text)                                # HTML tags
    text = re.sub(r'\+?\d[\d\s\-\(\)]{7,}\d', ' ', text)                # Phone numbers
    text = re.sub(r'[@#]\w+', ' ', text)                                # Mentions/hashtags
    text = re.sub(r'[^a-zA-Z0-9\s\']', ' ', text)                       # Non-alphanumeric except apostrophes
    text = re.sub(r'\s+', ' ', text).strip()                            # Whitespace normalization
    return text.lower()

# Clean all reviews and keep them next to the originals for comparison
cleaned = [clean_text(r) for r in reviews]
df = pd.DataFrame({'original': reviews, 'cleaned': cleaned})
# A bare `df.head(3)` is only rendered when it is the last expression of a
# notebook cell; here more code follows, so print the preview explicitly.
print(df.head(3))

# Compare four tokenizers on the first cleaned review
sample = cleaned[0]
print(f"Sample cleaned review: {sample} \n")

# 1. Whitespace split: no linguistic knowledge, just separators
tokens_split = str.split(sample)
print(f"Split tokens: {tokens_split}")

# 2. NLTK word_tokenize
from nltk.tokenize import word_tokenize
tokens_nltk = word_tokenize(sample)
print(f"NLTK tokens : {tokens_nltk}")

# 3. spaCy: run the loaded pipeline and read each token's surface text
doc = nlp(sample)
tokens_spacy = [token.text for token in doc]
print(f"spaCy tokens: {tokens_spacy}")

# 4. Regex: runs of word characters and apostrophes between word boundaries
tokens_regex = re.compile(r"\b[\w']+\b").findall(sample)
print(f"Regex tokens: {tokens_regex}")

# Gather the regex tokens of every cleaned review into one flat list
all_tokens = [
    token
    for review_text in cleaned
    for token in re.findall(r"\b[\w']+\b", review_text)
]

# Count occurrences, then split the ten most common (token, count) pairs
# into two parallel tuples for plotting
freq = Counter(all_tokens)
top10 = freq.most_common(10)
tokens, counts = zip(*top10)

# Plot top 10 tokens as a horizontal bar chart (one bar per token)
plt.figure(figsize=(8,5))
# seaborn 0.13 deprecated passing `palette` without `hue` (to be removed in
# 0.14). Mapping hue to the same variable as y keeps one colour per bar, and
# legend=False suppresses the redundant legend that hue would otherwise add.
sns.barplot(
    x=list(counts),
    y=list(tokens),
    hue=list(tokens),
    palette="viridis",
    legend=False,
)
plt.title("Top 10 Most Frequent Tokens")
plt.xlabel("Count")
plt.ylabel("Token")
plt.tight_layout()
plt.show()
