# 2.1 Clean Products List

This notebook cleans the raw product data in preparation for modeling. The ultimate goal is to match cleaned articles to products based on the provided data.

In [6]:
import json
import time
import requests
from bs4 import BeautifulSoup
import re

In [7]:
# Load the raw category list. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's locale default.
with open('./intermediate_data/Products_List_Raw.json', 'r', encoding='utf-8') as f:
    categories = json.load(f)

def extract_product_info(element):
    """Build a minimal product record from one parsed HTML element.

    Searches ``element`` for the first ``<a>``, ``<span>`` or ``<div>`` match
    that carries a string and uses its stripped text as both the product name
    and the description.

    Args:
        element: Object exposing ``find(...)``; the caller in this notebook
            passes elements returned by ``BeautifulSoup.find_all``.

    Returns:
        A dict with keys ``name``, ``description``, ``part_number`` and
        ``quantity_available`` (the last two are always ``None``), or ``None``
        when no matching tag is found, the text is 5 characters or shorter,
        or the lookup raises an ordinary exception.
    """
    try:
        # `string=` is the current name of the filter formerly spelled `text=`.
        name_elem = element.find(['a', 'span', 'div'], string=True)
        if not name_elem:
            return None

        name = name_elem.get_text(strip=True)
        # Very short strings are skipped; only longer text is kept as a name.
        if len(name) <= 5:
            return None

        return {
            'name': name,
            'description': name,
            'part_number': None,
            'quantity_available': None
        }
    except Exception:
        # Best-effort extraction: a malformed element yields no product.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be interrupted.
        return None

def scrape_category_products(category_url, timeout=30, max_elements=100):
    """Download one category page and extract product records from it.

    Args:
        category_url: URL of the category page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the notebook.
        max_elements: Maximum number of candidate elements to inspect per page.

    Returns:
        A list of product dicts as produced by ``extract_product_info``.
        Returns an empty list (after printing the reason) when the request or
        the parsing fails.
    """
    # HEADERS is not defined in the visible cells of this notebook. Looking it
    # up defensively means a missing global falls back to requests' default
    # headers instead of raising a NameError that would be reported as
    # "no products" for every category.
    headers = globals().get('HEADERS')

    try:
        response = requests.get(category_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        elements = soup.find_all(['div', 'tr'], class_=re.compile(r'product|item|row', re.I))

        products = []
        for el in elements[:max_elements]:
            product = extract_product_info(el)
            if product:
                products.append(product)

        print(f"Found {len(products)} products")
        return products
    except Exception as exc:
        # Stay best-effort (one bad page must not abort the whole crawl), but
        # report the failure so an empty result is distinguishable from an error.
        print(f"Failed to scrape {category_url}: {type(exc).__name__}: {exc}")
        return []

# Crawl limits, named so they are easy to find and tune.
MAX_CATEGORIES = 20          # only the first N categories are scraped
REQUEST_DELAY_SECONDS = 2    # pause between category requests

all_products = []
for cat in categories[:MAX_CATEGORIES]:
    products = scrape_category_products(cat['url'])
    for product in products:
        # Tag every product with the category it was scraped from.
        product['category'] = cat['name']
        product['main_category'] = cat['category']
        all_products.append(product)
    time.sleep(REQUEST_DELAY_SECONDS)

results = {
    'categories': categories,
    'products': all_products,
    'summary': {
        # NOTE: counts every loaded category, not only the ones scraped above.
        'total_categories': len(categories),
        'total_products': len(all_products)
    }
}


### Save Cleaned Product List

In [8]:

# Serialize before opening the output file: mode 'w' truncates on open, so a
# serialization error inside the `with` block would otherwise leave an empty
# or partially written file behind.
serialized_results = json.dumps(results, indent=2)
with open('./intermediate_data/Products_List_Clean.json', 'w', encoding='utf-8') as f:
    f.write(serialized_results)