# 🧹 Text Cleaning & Preprocessing

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Fetch the NLTK data packages this notebook depends on. Each call is made in
# the same order as before: tokenizer data, stop-word lists, then WordNet.
for resource_name in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

In [None]:
# Read the review dataset from the working directory into a DataFrame and
# report the number of rows that were loaded.
df = pd.read_csv(filepath_or_buffer='IMDB_Dataset.csv')
print(f"Loaded {len(df)} rows.")

In [None]:
# Patterns used by clean_text, compiled once and reused for every review.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_URL_RE = re.compile(r'http\S+|www\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise raw review text to lowercase letters separated by single spaces.

    Steps, in order: lowercase, drop HTML tags, drop URLs, drop every
    character that is not an ASCII letter or whitespace, then collapse
    whitespace runs and trim the ends.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example a
            missing value such as ``None`` or NaN) is treated as empty.

    Returns:
        The cleaned string; ``''`` when nothing is left or the input was
        not a string.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on .lower().
        return ''

    text = text.lower()
    # Tags and URLs are replaced with a space, not removed outright, so the
    # words on either side do not fuse together
    # ("great.<br /><br />loved" -> "great loved", not "greatloved").
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    # Punctuation and digits are deleted in place, so "it's" becomes "its".
    text = _NON_LETTER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Lazily-filled cache so the stop-word set and lemmatizer are built once
# instead of once per review.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        raw_stop_words = stopwords.words('english')
        # Tokens reach the stop-word check only after clean_text has run, so
        # also store each stop word in its cleaned form (e.g. "don't" ->
        # "dont"); otherwise contractions from the list are never matched.
        cleaned_stop_words = (clean_text(word) for word in raw_stop_words)
        _PREPROCESS_CACHE['stop_words'] = (
            frozenset(raw_stop_words) | frozenset(cleaned_stop_words)
        )
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess(text):
    """Clean, tokenize, filter and lemmatize one piece of text.

    The text is passed through ``clean_text`` and split with
    ``word_tokenize``. Tokens that are English stop words or have two or
    fewer characters are dropped, and the remaining tokens are lemmatized
    with ``WordNetLemmatizer``.

    Args:
        text: The raw text to process.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    stop_words, lemmatizer = _preprocess_resources()
    tokens = word_tokenize(clean_text(text))
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    )

In [None]:
# Run the preprocessing pipeline over every review, keep the result in a new
# 'processed' column, and write the whole frame (without the index) to CSV.
processed_reviews = df['review'].apply(preprocess)
df['processed'] = processed_reviews
df.to_csv('processed_reviews.csv', index=False)
print("✅ Preprocessing complete. Saved as 'processed_reviews.csv'")

In [None]:
def create_wordcloud(text, title):
    """Build a word cloud from *text* and display it with *title*.

    Args:
        text: The text handed to ``WordCloud.generate``.
        title: Heading drawn above the image.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    cloud_options = {
        'width': 800,
        'height': 400,
        'background_color': 'white',
        'max_words': 100,
        'colormap': 'viridis',
    }
    cloud_image = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.title(title, fontsize=16)
    # Hide the axes; only the rendered image is of interest.
    plt.axis('off')
    plt.show()

# Join every processed review labelled 'positive' into one string and plot it.
is_positive = df['sentiment'] == 'positive'
positive_text = ' '.join(df.loc[is_positive, 'processed'])
create_wordcloud(positive_text, '☀️ Most Common Words in Positive Reviews')

# Do the same for the reviews labelled 'negative'.
is_negative = df['sentiment'] == 'negative'
negative_text = ' '.join(df.loc[is_negative, 'processed'])
create_wordcloud(negative_text, '⛈️ Most Common Words in Negative Reviews')
