## 1.1 Collect Products 

We begin by collecting the necessary product text data from [DigiKey](https://www.digikey.com/) website.

Run the following notebook to generate a product list with their respective url's and categories.

In [27]:
import requests
from bs4 import BeautifulSoup
import re
import time
import json

In [28]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Referer": "https://www.digikey.com/",
}

BASE_URL = "https://www.digikey.com"

def clean_category_name(text):
    cleaned = re.sub(r'\s*[\d,]+\s+?.*$', '', text, flags=re.IGNORECASE)
    return cleaned.strip()

def determine_main_category(name):
    name_lower = name.lower()
    if any(term in name_lower for term in ['anti-static', 'esd', 'clean room']):
        return 'Anti-Static, ESD, Clean Room Products'
    elif any(term in name_lower for term in ['audio', 'microphone', 'speaker', 'amplifier']):
        return 'Audio Products'
    elif any(term in name_lower for term in ['battery', 'batteries']):
        return 'Battery Products'
    elif any(term in name_lower for term in ['cable', 'wire', 'connector']):
        return 'Cables & Connectors'
    elif any(term in name_lower for term in ['capacitor']):
        return 'Capacitors'
    else:
        return 'Other'

def parse_category(text):
    pattern = re.compile(r'^(.*?)([\d,]+)\s*Items$', re.IGNORECASE)
    match = pattern.match(text)
    if match:
        name = match.group(1).strip()
        count_str = match.group(2).replace(',', '')
        count = int(count_str)
        return name, count
    else:
        return text.strip(), 0

def scrape_main_categories():
    try:
        url = "https://www.digikey.com/en/products"
        print(f"Scraping main categories from: {url}")
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        categories = []

        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(" ",strip=True)
            if '/en/products/' in href and text:
                name, item_count = parse_category(text)
                print(f"Parsed name: '{name}', item count: {item_count}")
                match = re.search(r'([\d,]+)\s*Items?', text, flags=re.IGNORECASE)
                item_count = int(match.group(1).replace(',', '')) if match else 0
                name = clean_category_name(text)
                if name and len(name) > 3:
                    categories.append({
                        'name': name,
                        'url': BASE_URL + href if not href.startswith('http') else href,
                        'category': determine_main_category(name),
                        'items': item_count 
                    })
        
        # Remove duplicates
        seen = set()
        unique_categories = []
        for cat in categories:
            if cat['url'] not in seen:
                seen.add(cat['url'])
                unique_categories.append(cat)
        
        print(f"Found {len(unique_categories)} unique categories")
        return unique_categories
    except Exception as e:
        print(f"Error: {e}")
        return []

### Save Raw Data

In [29]:
categories = scrape_main_categories()

with open('./intermediate_data/Products_List_Raw.json', 'w') as f:
    json.dump(categories, f, indent=2)


Scraping main categories from: https://www.digikey.com/en/products
Parsed name: 'Automation & Control', item count: 0
Parsed name: 'Accessories', item count: 0
Parsed name: 'Controllers - Accessories', item count: 0
Parsed name: 'Controllers - PLC Modules', item count: 0
Parsed name: 'Controllers - Process, Temperature', item count: 0
Parsed name: 'Controllers - Programmable Logic (PLC)', item count: 0
Parsed name: 'Human Machine Interface (HMI)', item count: 0
Parsed name: 'Industrial Equipment', item count: 0
Parsed name: 'Machine Vision - Cameras/Sensors', item count: 0
Parsed name: 'Monitor - Current/Voltage Transducer', item count: 0
Parsed name: 'Panel Meters', item count: 0
Parsed name: 'Panel Meters - Counters, Hour Meters', item count: 0
Parsed name: 'Pneumatics, Hydraulics', item count: 0
Parsed name: 'Time Delay Relays', item count: 0
Parsed name: 'See All', item count: 0
Parsed name: 'PLC Modules', item count: 0
Parsed name: 'Cables, Wires', item count: 0
Parsed name: 'Cabl