In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv('cleaned_data.csv')
print(f"Loaded {len(df)} reviews")

Loaded 7895 reviews


In [3]:
#Clean Text function
def clean_text(text):
        
    if pd.isna(text):  
        return ""

    text = str(text).lower()  
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()

    return text

sample = df['Review text'].iloc[0]
print("BEFORE cleaning:")
print(sample)
print("\nAFTER cleaning:")
print(clean_text(sample))    

BEFORE cleaning:
Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE

AFTER cleaning:
nice product good quality but price is now rising which is a bad sign was an affordable price especially when we play everyday so kindly help us out in terms of the price thank youread more


In [4]:
#Clean All reviews
# Apply cleaning to ALL reviews
print("Cleaning all reviews... (this takes a minute)")

df['Cleaned_Text'] = df['Review text'].apply(clean_text)

print("Done!")


for i in range(3):
    print(f"\n--- Example {i+1} ---")
    print(f"ORIGINAL: {df['Review text'].iloc[i][:80]}...")
    print(f"CLEANED:  {df['Cleaned_Text'].iloc[i][:80]}...")

Cleaning all reviews... (this takes a minute)
Done!

--- Example 1 ---
ORIGINAL: Nice product, good quality, but price is now rising which is a bad sign. 800-850...
CLEANED:  nice product good quality but price is now rising which is a bad sign was an aff...

--- Example 2 ---
ORIGINAL: They didn't supplied Yonex Mavis 350. Outside cover was Yonex Ad inside was a ch...
CLEANED:  they didnt supplied yonex mavis outside cover was yonex ad inside was a cheapest...

--- Example 3 ---
ORIGINAL: Worst product. Damaged shuttlecocks packed in new box. It's not a original yonex...
CLEANED:  worst product damaged shuttlecocks packed in new box its not a original yonex pr...


In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rriya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords

In [11]:
# Remove Stopwords
stop_words = set(stopwords.words('english'))


stop_words.update(['flipkart', 'product', 'read', 'more'])

def remove_stopwords(text):
    words = text.lower().split()
    
    filtered = [word for word in words if word not in stop_words]
    return ' '.join(filtered)

print(" Removing stopwords...")
df['No_Stopwords'] = df['Cleaned_Text'].apply(remove_stopwords)

# Show difference
print("\nExample:")
print(f"WITH stopwords:    {df['Cleaned_Text'].iloc[0]}")
print(f"WITHOUT stopwords: {df['No_Stopwords'].iloc[0]}")

 Removing stopwords...

Example:
WITH stopwords:    nice product good quality but price is now rising which is a bad sign was an affordable price especially when we play everyday so kindly help us out in terms of the price thank youread more
WITHOUT stopwords: nice good quality price rising bad sign affordable price especially play everyday kindly help us terms price thank youread


In [13]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rriya\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rriya\AppData\Roaming\nltk_data...


True

In [16]:
from nltk.stem import WordNetLemmatizer

In [17]:
# Lemmatiztion
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized)

print("Lemmatizing... (this takes a minute)")
df['Final_Text'] = df['No_Stopwords'].apply(lemmatize_text)

print("Done!")


print("\nFull transformation example:")
print(f"ORIGINAL:     {df['Review text'].iloc[0]}")
print(f"CLEANED:      {df['Cleaned_Text'].iloc[0]}")
print(f"NO STOPWORDS: {df['No_Stopwords'].iloc[0]}")
print(f"LEMMATIZED:   {df['Final_Text'].iloc[0]}")

Lemmatizing... (this takes a minute)
Done!

Full transformation example:
ORIGINAL:     Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE
CLEANED:      nice product good quality but price is now rising which is a bad sign was an affordable price especially when we play everyday so kindly help us out in terms of the price thank youread more
NO STOPWORDS: nice good quality price rising bad sign affordable price especially play everyday kindly help us terms price thank youread
LEMMATIZED:   nice good quality price rising bad sign affordable price especially play everyday kindly help u term price thank youread


In [18]:
#Remove Short Reviews
df['Word_Count'] = df['Final_Text'].apply(lambda x: len(x.split()))
df_final = df[df['Word_Count'] >= 3].copy()

print(f"Removed {len(df) - len(df_final)} very short reviews")
print(f"Remaining: {len(df_final)} reviews")

Removed 4983 very short reviews
Remaining: 2912 reviews


In [20]:
# final clean dataset

final_df = df_final[['Final_Text', 'Sentiment']].copy()
final_df.columns = ['Review', 'Sentiment']

final_df['Label'] = final_df['Sentiment'].map({'Positive': 1, 'Negative': 0})

print("Final dataset ready!")
print(final_df.head())


final_df.to_csv('preprocessed_data.csv', index=False)
print(" Saved to 'preprocessed_data.csv'")

Final dataset ready!
                                              Review Sentiment  Label
0  nice good quality price rising bad sign afford...  Positive      1
1  didnt supplied yonex mavis outside cover yonex...  Negative      0
2  worst damaged shuttlecock packed new box origi...  Negative      0
3  pricedjust retaileri didnt understand wat adva...  Negative      0
4                    good quality delivered timeread  Positive      1
 Saved to 'preprocessed_data.csv'
