In [86]:
# Import necessary libraries
import pandas as pd
import re
import matplotlib.pyplot as plt
from collections import Counter
from textblob import TextBlob

re - Regular expression operations 
This module provides regular expression matching operations. The re module is versatile and very useful for text processing tasks, enabling you to quickly identify and manipulate patterns in text.

Counter (from collections)
Counter is a subclass of dict in the collections module. It counts how often each element occurs in an iterable, such as a list or a string, and returns a dictionary-like object whose keys are the elements and whose values are their counts.

TextBlob is an external (third-party) Python library for processing textual data. It provides a simple API for performing common natural language processing (NLP) tasks; this notebook uses it for spelling correction.

In [88]:
# Load the data
# Each record pairs a review ID with its raw, deliberately messy review text
# (mixed case, stray punctuation, emoticons, padding whitespace).
review_records = [
    (201, "!!!ThIs PrOdUct Is A-ma-zing!! Really gooood QUALITY!! :):):)"),
    (202, "Not wOrth the $$$! Too much hype... Didn't last long :( :( "),
    (203, "    Perfect!!! Couldn't ask for BETTER!   :)"),
    (204, "Just o.k., nothing special.  Over-Priced for sureee!!!"),
    (205, "Th!s item???? Is very disappointing. poor #quality!!"),
    (206, "ABSOLUTELY **LOVE** IT!!! Even better than expected...!!!    "),
]

# Column-oriented view of the records, ready to be handed to pandas.
data = {
    'Review_ID': [review_id for review_id, _ in review_records],
    'Customer_Review': [text for _, text in review_records],
}

In [90]:
# Step 1: Load data into a DataFrame
# Each key of `data` becomes a column; the list under it supplies the rows.
df = pd.DataFrame.from_dict(data)

In [None]:
# Show the untouched reviews before any cleaning is applied.
print("Original DataFrame:\n")
# A bare expression on the last line of a notebook cell is rendered as the cell's output.
df

In [None]:
# Step 2: Lowercase conversion
# The cleaned text lives in its own column so 'Customer_Review' keeps the raw input.
lowercased = df['Customer_Review'].str.lower()
df['Cleaned_Review'] = lowercased

print("After Lowercase Conversion:\n")
preview_columns = ['Review_ID', 'Cleaned_Review']
print(df[preview_columns])

In [None]:
# Step 3: Remove leading and trailing whitespace
# Only the padding at both ends is trimmed; spacing inside the text is untouched.
trimmed = df['Cleaned_Review'].str.strip()
df['Cleaned_Review'] = trimmed

print("\nAfter Removing Leading and Trailing Whitespace:")
preview_columns = ['Review_ID', 'Cleaned_Review']
print(df[preview_columns])

In [None]:
# Step 4: Remove punctuation and special characters using regex
# Every character that is neither a word character (letter, digit, underscore)
# nor whitespace is deleted outright, so "a-ma-zing" becomes "amazing".
non_word_chars = re.compile(r'[^\w\s]')


def strip_punctuation(text):
    """Return `text` with all non-word, non-whitespace characters removed."""
    return non_word_chars.sub('', text)


df['Cleaned_Review'] = df['Cleaned_Review'].apply(strip_punctuation)
print("After Removing Punctuation and Special Characters:\n")
df

In [None]:
# Step 5: Remove common stopwords (e.g., 'the', 'is', 'it') using a small hand-picked set
stopwords = {'the', 'is', 'it', 'be', 'but', 'could', 'for'}


def drop_stopwords(text):
    """Split `text` on whitespace, discard tokens found in `stopwords`, and re-join with single spaces."""
    kept_tokens = [token for token in text.split() if token not in stopwords]
    return ' '.join(kept_tokens)


df['Cleaned_Review'] = df['Cleaned_Review'].apply(drop_stopwords)
print("\nAfter Removing Stopwords:")
df

In [None]:
# Step 5b: Correct elongated words using regex
# A word character repeated three or more times in a row is collapsed to exactly
# two occurrences (e.g. 'goooood' -> 'good', 'sureee' -> 'suree'). Genuine double
# letters survive; this step does not attempt to fix a leftover extra letter.
repeated_char_run = re.compile(r'(\w)\1{2,}')


def squeeze_repeats(text):
    """Return `text` with every run of 3+ identical word characters cut down to 2."""
    return repeated_char_run.sub(r'\1\1', text)


df['Cleaned_Review'] = df['Cleaned_Review'].apply(squeeze_repeats)
print("\nAfter Correcting Elongated Words:\n")
preview_columns = ['Review_ID', 'Cleaned_Review']
print(df[preview_columns])

In [None]:
# Step 6: Spelling Correction
def correct_spelling(text):
    """Run TextBlob's spelling corrector over `text` and return the result as a plain str."""
    blob = TextBlob(text)
    return str(blob.correct())


df['Cleaned_Review'] = df['Cleaned_Review'].apply(correct_spelling)
print("\nAfter Spelling Correction:")
preview_columns = ['Review_ID', 'Cleaned_Review']
print(df[preview_columns])

In [None]:
# Step 6b: Tokenize reviews to prepare for visualization
# Splitting on runs of whitespace turns each cleaned review into a list of words.
tokenized = df['Cleaned_Review'].apply(str.split)
df['Tokenized_Review'] = tokenized

print("\nTokenized Reviews:")
token_columns = ['Review_ID', 'Tokenized_Review']
print(df[token_columns])

In [None]:
# Step 7: Flatten the list of all words for word frequency analysis
# Concatenate every review's token list, in row order, into one flat list.
all_words = []
for review_tokens in df['Tokenized_Review']:
    all_words.extend(review_tokens)

# Tally how many times each word appears across all reviews.
word_counts = Counter(all_words)

In [None]:
# Step 8: Plotting the frequency of the most common words
# TOP_N drives both the number of bars and the chart title, so the two cannot drift apart.
TOP_N = 10

top_words = word_counts.most_common(TOP_N)
# Unpacking zip(*[]) raises ValueError because there is nothing to unpack, so
# fall back to empty sequences when no words were counted at all.
words, frequencies = zip(*top_words) if top_words else ((), ())

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(words, frequencies, color='skyblue')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title(f'Top {TOP_N} Most Frequent Words in Reviews')
plt.show()