Imports

In [1]:
import re
from pathlib import Path

import pandas as pd


Load Dataset

In [2]:
# Raw review dump; read below with lines=True, i.e. one JSON record per line.
DATA_PATH = "../data/raw/AMAZON_FASHION.json"

df = pd.read_json(
    path_or_buf=DATA_PATH,
    lines=True,
)

# Report how many rows were parsed from the file.
print(f"Dataset loaded with {df.shape[0]} reviews")


Dataset loaded with 883636 reviews


Keep ONLY Required Columns

In [3]:
# Columns retained for the rest of the notebook; every other column is dropped.
required_columns = [
    "reviewerID",
    "asin",
    "overall",
    "reviewText",
    "summary",
    "verified",
    "unixReviewTime",
]

# Selecting with a list raises KeyError if any of the columns is absent.
df = df[required_columns]


Remove Reviews with Missing Text

In [4]:
# Discard rows whose review text is missing, then renumber the index from 0.
# Only null values are removed here; empty-string reviews are kept.
df = df.dropna(subset=["reviewText"]).reset_index(drop=True)

print(f"Remaining reviews after removing null text: {df.shape[0]}")


Remaining reviews after removing null text: 882403


Define Minimal Cleaning Function

In [5]:
def basic_clean(text):
    """Lowercase a review, strip HTML tags, and collapse whitespace.

    Parameters
    ----------
    text : str or None
        Raw review text. ``None`` and float NaN are treated as missing;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    # Missing values would otherwise raise AttributeError on .lower().
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove HTML tags. Unlike `.*?`, the class `[^>]*` also matches newlines,
    # so a tag that is broken across lines is removed as well.
    text = re.sub(r"<[^>]*>", " ", text)

    # Remove extra spaces (multiple → single)
    text = re.sub(r"\s+", " ", text).strip()

    return text


Apply Cleaning to Review Text

In [6]:
# Run basic_clean on every review and store the result in a new column;
# the original "reviewText" column is left unchanged.
df["clean_review_text"] = df["reviewText"].map(basic_clean)


Verify Before vs After

In [7]:
# Show three random rows side by side (raw vs. cleaned). The fixed seed makes
# the preview show the same rows on every Restart & Run All.
df[["reviewText", "clean_review_text"]].sample(3, random_state=42)


Unnamed: 0,reviewText,clean_review_text
865840,As for all three shirts after laundering I fou...,as for all three shirts after laundering i fou...
283467,7 year old got it for her BFF. Look very nice ...,7 year old got it for her bff. look very nice ...
622250,This is so beautiful on. Looks sexy. Great fit.,this is so beautiful on. looks sexy. great fit.


Save Cleaned Dataset

In [8]:
OUTPUT_PATH = "../data/processed/reviews_clean.csv"

# to_csv does not create missing directories, so make sure the target folder
# exists first (e.g. on a fresh checkout where data/processed is absent).
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written as a column.
# NOTE(review): reviews whose cleaned text is "" are written as empty fields,
# which pandas.read_csv parses as NaN by default — confirm downstream readers
# handle this.
df.to_csv(OUTPUT_PATH, index=False)

print("Cleaned dataset saved to data/processed/reviews_clean.csv")


Cleaned dataset saved to data/processed/reviews_clean.csv
