# 2.2 Clean Product Data
This notebook cleans the product data in preparation for modeling. The ultimate goal is to match cleaned articles to the products based on their cleaned product data.

In [21]:
import json
import os
import re

from bs4 import BeautifulSoup

In [22]:
# Read the raw product records from disk into `data`.
# The file is decoded as UTF-8 and parsed as JSON.
with open("./data/product-data-raw.json", mode="r", encoding="utf-8") as raw_file:
    data = json.loads(raw_file.read())

def clean_html(raw_html):
    """Strip HTML markup from a value and return only its text content.

    Args:
        raw_html: The markup to clean. Non-string values are converted with
            ``str()`` before parsing; ``None`` is treated as "no content".

    Returns:
        The text extracted by BeautifulSoup's ``get_text()``, or ``''`` when
        ``raw_html`` is ``None``.
    """
    # str(None) would yield the literal text "None", which would then end up
    # in the cleaned output as the word "none" — treat a missing value as empty.
    if raw_html is None:
        return ''
    return BeautifulSoup(str(raw_html), "html.parser").get_text()

def normalize_text(text):
    """Lowercase text, remove punctuation, and collapse whitespace.

    Args:
        text: The string to normalize. Any non-string value is treated as
            missing.

    Returns:
        The normalized string with single spaces between tokens and no
        leading/trailing whitespace, or ``''`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Punctuation is removed BEFORE whitespace is collapsed: deleting a symbol
    # that stood between two spaces ("a - b") would otherwise leave a double
    # space behind in the supposedly normalized result.
    text = re.sub(r'[^\w\s]', '', text)         # remove punctuation
    text = re.sub(r'\s+', ' ', text)            # normalize whitespace
    return text.strip()

def remove_navigation_noise(text):
    """Strip recurring site-navigation/menu phrases from a piece of text.

    Each phrase is removed only where it appears as a whole word or word
    sequence (regex word boundaries), and matching is case-sensitive. The
    remaining text has runs of whitespace collapsed to a single space and is
    trimmed at both ends.
    """
    noise_phrases = (
        "expand submenu", "collapse submenu", "site navigation", "create account",
        "your cart", "close cart", "log in", "info", "contact", "dealer locator",
        "who we are", "ordering information", "ambassadors", "journal",
    )
    stripped = text
    for phrase in noise_phrases:
        # Escape the phrase so it is matched literally between word boundaries.
        phrase_pattern = r'\b' + re.escape(phrase) + r'\b'
        stripped = re.sub(phrase_pattern, '', stripped)
    # Removing phrases leaves gaps; squeeze them back down to single spaces.
    collapsed = re.sub(r'\s+', ' ', stripped)
    return collapsed.strip()

# Clean each product
# Key under which each raw product record stores its scraped text/HTML.
# NOTE(review): the original loop read a `raw_text` variable that was never
# assigned in this notebook (NameError on a fresh kernel, or the same stale
# value reused for every product). The key name below is an assumption —
# confirm it against the schema of product-data-raw.json.
RAW_TEXT_KEY = "raw_text"

for product in data:
    # Index directly (rather than .get) so a wrong key name fails loudly with
    # a KeyError instead of silently writing empty clean_text everywhere.
    raw_text = product[RAW_TEXT_KEY]
    # A null value in the JSON would otherwise be stringified to "None".
    clean_text = clean_html(raw_text if raw_text is not None else "")
    clean_text = normalize_text(clean_text)
    clean_text = remove_navigation_noise(clean_text)
    product["clean_text"] = clean_text  # new field for cleaned version


## Save Cleaned Data
We save the cleaned product data as a JSON file (`./intermediate_data/product-data-clean.json`) for the next steps.

In [23]:
# Persist the cleaned product records (including the new "clean_text" field)
# as pretty-printed JSON for the next notebook in the workflow.
output_path = "./intermediate_data/product-data-clean.json"
# Create the output folder if it is missing so that a fresh checkout does not
# fail with FileNotFoundError when opening the file for writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

## Next step
After you have saved the dataset here, run the next step in the workflow, [03-01-AnalyzeArticles.ipynb](./03-01-AnalyzeArticles.ipynb), or go back to [00-Workflow.ipynb](./00-Workflow.ipynb).

---

**Authors:**
[Salah Mohamoud](mailto:salah.mohamoud.dev@gmail.com),
[Sai Keertana Lakku](mailto:saikeertana005@gmail.com),
[Zhen Zhuang](mailto:zhuangzhen17cs@gmail.com),
[Nick Capaldini](mailto:nick.capaldini@ridethenextwave.com), Ride The Next Wave, May 19, 2025

---