In [2]:
"""
02b - Text Preprocessing
Clean and prepare review text for feature engineering
"""

# %%
# Imports
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import re
from pathlib import Path

# %%
# Load the review dataset from the processed-data directory
reviews_csv = '../data/processed/reviews_cleaned.csv'
df = pd.read_csv(reviews_csv)

# Report the size and schema of what was read
row_total = len(df)
column_names = list(df.columns)
print(f" Loaded {row_total} reviews")
print(f" Columns: {column_names}")

# %%
# Text cleaning function
# Patterns are compiled once at module level because clean_text runs once per row.
# All of them assume the input has already been lowercased.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')        # real tags only; leaves "<3" alone
_URL_RE = re.compile(r'https?://\S+|\bwww\.\S+')     # anchored so "httpd"/"awwww" survive
_EMAIL_RE = re.compile(r'\S+@\S+')
_DISALLOWED_CHARS_RE = re.compile(r"[^a-z0-9\s.!?,']")
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """
    Clean review text for NLP processing.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip e-mail
    addresses, replace every character other than ASCII letters, digits,
    whitespace and ``. ! ? , '`` with a space, then collapse whitespace.

    Removed spans are replaced with a space (not the empty string) so that
    words on either side of a tag such as ``<br />`` are not fused together.

    Args:
        text: The raw review text. Any non-missing value is converted with
            ``str()``.

    Returns:
        The cleaned, lowercased string; ``""`` when ``text`` is missing
        (``None``/NaN) or nothing survives cleaning.
    """
    if pd.isna(text):
        return ""

    # Convert to string and lowercase
    text = str(text).lower()

    # Remove HTML tags first: a URL inside an attribute (href="http://...")
    # would otherwise make the URL pattern swallow the closing quote, the
    # '>' and the link text up to the next whitespace.
    text = _HTML_TAG_RE.sub(' ', text)

    # Remove URLs
    text = _URL_RE.sub(' ', text)

    # Remove emails
    text = _EMAIL_RE.sub(' ', text)

    # Remove special characters but keep basic punctuation
    text = _DISALLOWED_CHARS_RE.sub(' ', text)

    # Remove extra whitespace
    return _WHITESPACE_RE.sub(' ', text).strip()

# %%
# Run clean_text over every raw review and store the result in a new column
print("\n Cleaning text...")
print(" This may take 1-2 minutes...")

raw_reviews = df['text_']
df['text_clean'] = raw_reviews.map(clean_text)

print(" Text cleaned!")

# %%
# Show a few before/after pairs as a sanity check on the cleaning step
print("\n CLEANING EXAMPLES")
print("="*70)

# head(3) tolerates frames with fewer than three rows, and str() keeps a
# missing (NaN) raw text from raising when it is sliced.
preview = df.head(3)
pairs = zip(preview['text_'], preview['text_clean'])
for example_no, (original, cleaned) in enumerate(pairs, start=1):
    print(f"\n--- Example {example_no} ---")
    print(f"Original: {str(original)[:150]}...")
    print(f"Cleaned:  {cleaned[:150]}...")
    print("-"*70)

# %%
# Check for reviews whose text became empty after cleaning, and drop them
is_empty = df['text_clean'] == ""
empty_count = is_empty.sum()
print(f"\n Empty texts after cleaning: {empty_count}")

if empty_count > 0:
    print(f"  Removing {empty_count} empty reviews...")
    # .copy() makes the filtered frame independent of the original, so
    # adding columns to it later does not trigger pandas'
    # SettingWithCopyWarning.
    df = df[~is_empty].copy()
    print(f" Remaining reviews: {len(df)}")

# %%
# Derive simple size features from the cleaned text
print("\n Adding text statistics...")

cleaned_text = df['text_clean']
tokens = cleaned_text.str.split()          # whitespace-separated tokens per review
df['text_length'] = cleaned_text.str.len() # characters per review
df['word_count'] = tokens.str.len()        # tokens per review

print(" Statistics added")

# Summarise the two new columns
stat_columns = ['text_length', 'word_count']
print("\nText Statistics:")
print(df[stat_columns].describe())

# %%
# Write the processed frame back to disk (without the index column)
# NOTE(review): this is the same path the notebook reads its input from, so
# the input file is overwritten in place — confirm that is intended.
print("\n Saving processed data...")

output_path = Path('../data/processed') / 'reviews_cleaned.csv'
df.to_csv(output_path, index=False)

# Final summary of what was written
final_row_total = len(df)
final_columns = list(df.columns)
print(f" Saved to: {output_path}")
print(f" Total reviews: {final_row_total}")
print(f" Columns: {final_columns}")


 Loaded 40432 reviews
 Columns: ['category', 'rating', 'label', 'text_', 'text_length', 'word_count', 'is_fake', 'text_clean']

 Cleaning text...
 This may take 1-2 minutes...
 Text cleaned!

 CLEANING EXAMPLES

--- Example 1 ---
Original: Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty...
Cleaned:  love this! well made, sturdy, and very comfortable. i love it!very pretty...
----------------------------------------------------------------------

--- Example 2 ---
Original: love it, a great upgrade from the original.  I've had mine for a couple of years...
Cleaned:  love it, a great upgrade from the original. i've had mine for a couple of years...
----------------------------------------------------------------------

--- Example 3 ---
Original: This pillow saved my back. I love the look and feel of this pillow....
Cleaned:  this pillow saved my back. i love the look and feel of this pillow....
-----------------------------------------------------------------