# 2.1 Clean Articles
This notebook cleans the article data in preparation for modeling. The ultimate goal is to match cleaned articles to the products based on their provided data.

In [58]:
import pandas as pd
import re
import json
from bs4 import BeautifulSoup

In [59]:

# Read the raw article export from disk and decode it into Python objects
with open("./data/article-data-raw.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Define text cleaning functions
def clean_html(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    Args:
        text: HTML (or plain) content. Values that are not strings are
            converted with ``str()`` before parsing; ``None`` is treated as
            empty input.

    Returns:
        str: The text extracted by BeautifulSoup's ``html.parser``, or ``""``
        when ``text`` is ``None``.
    """
    # Without this guard, str(None) would be parsed as the literal text "None".
    if text is None:
        return ""
    return BeautifulSoup(str(text), "html.parser").get_text()

def normalize_text(text):
    """Lowercase ``text``, remove punctuation and collapse whitespace.

    Args:
        text: The string to normalize. Any non-string value yields ``""``.

    Returns:
        str: Lowercased text containing only word characters (letters,
        digits, underscore) separated by single spaces, with no leading or
        trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Punctuation is removed *before* whitespace is collapsed: deleting a
    # standalone symbol afterwards would leave a double space behind
    # (e.g. "a - b" -> "a  b").
    text = re.sub(r'[^\w\s]', '', text)        # remove punctuation
    text = re.sub(r'\s+', ' ', text)           # normalize whitespace
    return text.strip()

# Parse and clean each article while preserving metadata
def parse_article(text, metadata):
    """Build a single article record from a document and its metadata.

    Only whitespace trimming is applied to the text here.

    Args:
        text: Article body. Leading and trailing whitespace is stripped; a
            non-string value (for example ``None``) yields empty content.
        metadata: Mapping that may provide ``title``, ``author``, ``date``
            and ``url``. ``None`` is treated as an empty mapping.

    Returns:
        dict: A record with the keys ``title``, ``author``, ``date``, ``url``
        and ``content``. Metadata fields that are absent default to ``""``.
    """
    # A missing metadata entry or document should produce an empty field
    # rather than abort the whole run with an AttributeError.
    metadata = metadata or {}
    content = text.strip() if isinstance(text, str) else ""

    # Combine text and metadata into one dictionary
    return {
        "title": metadata.get("title", ""),
        "author": metadata.get("author", ""),
        "date": metadata.get("date", ""),
        "url": metadata.get("url", ""),
        "content": content
    }

# Pair every document with the metadata entry at the same position
if "documents" not in data or not isinstance(data["documents"], list):
    raise ValueError("JSON does not contain 'documents' key with a list")

documents = data["documents"]
metadatas = data.get("metadatas", [])

# zip() stops at the shorter input, so unequal lengths are rejected up front
if len(documents) != len(metadatas):
    raise ValueError("Mismatch between number of documents and metadata entries")

cleaned_articles = [parse_article(*pair) for pair in zip(documents, metadatas)]



## Save Cleaned Data
We save the cleaned article data as a JSON file for the next steps.

In [60]:
# Serialize the article records as indented, non-ASCII-escaped JSON and write them out
with open("./intermediate_data/article-data-clean.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(cleaned_articles, ensure_ascii=False, indent=4))

## Next step
After you have saved the dataset here, run the next step in the workflow [02-02-CleanProducts.ipynb](./02-02-CleanProducts.ipynb) or go back to [00-Workflow.ipynb](./00-Workflow.ipynb).

---

**Authors:**
[Salah Mohamoud](mailto:salah.mohamoud.dev@gmail.com),
[Sai Keertana Lakku](mailto:saikeertana005@gmail.com),
[Zhen Zhuang](mailto:zhuangzhen17cs@gmail.com),
[Nick Capaldini](mailto:nick.capaldini@ridethenextwave.com), Ride The Next Wave, May 19, 2025

---