In [18]:
import json
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [19]:

class DigiKeyProductScraper:
    """Scrape category links, and optionally product rows, from digikey.com.

    Results accumulate in ``self.product_data`` and are written to a JSON
    file by :meth:`save_results`. Progress and errors are reported with
    ``print`` so the class can be driven from a script or a notebook cell.
    """

    # Seconds to wait for the server before a request is abandoned; without a
    # timeout ``requests`` may block forever on an unresponsive connection.
    REQUEST_TIMEOUT = 30

    # Where save_results() writes when no explicit path is given.
    DEFAULT_OUTPUT_PATH = './intermediate_data/digikey_product_data.json'

    # Matches a trailing item count such as " 1,234 Items" in link text.
    _ITEM_COUNT_RE = re.compile(r'\s*[\d,]+\s+Items?.*$', flags=re.IGNORECASE)

    # (main category label, substrings that select it); checked in order,
    # first match wins.
    _MAIN_CATEGORY_TERMS = (
        ('Anti-Static, ESD, Clean Room Products', ('anti-static', 'esd', 'clean room')),
        ('Audio Products', ('audio', 'microphone', 'speaker', 'amplifier')),
        ('Battery Products', ('battery', 'batteries')),
        ('Cables & Connectors', ('cable', 'wire', 'connector')),
        ('Capacitors', ('capacitor',)),
    )

    def __init__(self):
        """Set up browser-like request headers and an empty product list."""
        # NOTE(review): "br" is advertised below; requests only decodes
        # Brotli responses when an optional brotli package is installed --
        # confirm it is available in the runtime environment.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Connection": "keep-alive",
            "Referer": "https://www.digikey.com/",
        }
        self.base_url = "https://www.digikey.com"
        self.product_data = []

    def scrape_main_categories(self):
        """Scrape the main product index page for category links.

        Returns:
            A list of dicts with keys ``name``, ``url`` and ``category``,
            de-duplicated by URL (first occurrence wins). An empty list is
            returned when the page cannot be fetched.
        """
        url = f"{self.base_url}/en/products"
        print(f"Scraping main categories from: {url}")

        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping main categories: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        unique_categories = []
        seen_urls = set()
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True)

            # Only links that point below the product index and carry text
            if '/en/products/' not in href or not text:
                continue

            clean_name = self.clean_category_name(text)
            if len(clean_name) <= 3:  # Filter out short/invalid names
                continue

            # urljoin handles absolute, root-relative, relative and
            # protocol-relative hrefs; plain concatenation does not.
            full_url = urljoin(self.base_url, href)
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)

            unique_categories.append({
                'name': clean_name,
                'url': full_url,
                'category': self.determine_main_category(clean_name),
            })

        print(f"Found {len(unique_categories)} unique categories")
        return unique_categories

    def clean_category_name(self, text):
        """Return ``text`` with a trailing item count removed and whitespace stripped.

        Everything from the first "<digits/commas> Item(s)" occurrence to the
        end of the string is dropped (case-insensitive).
        """
        return self._ITEM_COUNT_RE.sub('', text).strip()

    def determine_main_category(self, name):
        """Map a category name to a main-category label by substring match.

        The match is case-insensitive and purely substring based, so a term
        also matches inside longer words. Returns ``'Other'`` when no term
        matches.
        """
        name_lower = name.lower()
        for label, terms in self._MAIN_CATEGORY_TERMS:
            if any(term in name_lower for term in terms):
                return label
        return 'Other'

    def scrape_category_products(self, category_url, max_pages=5, max_elements=100):
        """Scrape product entries from a single category page.

        Args:
            category_url: Absolute URL of the category page to fetch.
            max_pages: Currently unused; only the first page is fetched.
                Kept so existing callers keep working.
            max_elements: Maximum number of candidate HTML elements that are
                inspected on the page (``None`` inspects all of them).

        Returns:
            A list of product dicts as produced by
            :meth:`extract_product_info`; empty when the request fails.
        """
        print(f"Scraping products from: {category_url}")

        try:
            response = requests.get(
                category_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping category {category_url}: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        # Candidate containers: any div/tr whose class mentions
        # product, item or row (case-insensitive).
        product_elements = soup.find_all(
            ['div', 'tr'], class_=re.compile(r'product|item|row', re.I)
        )

        products = []
        for element in product_elements[:max_elements]:
            product_info = self.extract_product_info(element)
            if product_info:
                products.append(product_info)

        print(f"Found {len(products)} products in this category")
        return products

    def extract_product_info(self, element):
        """Extract product information from one HTML element.

        The first ``a``/``span``/``div`` descendant that directly holds a
        string supplies the name. Extraction is best-effort: elements that do
        not behave like a BeautifulSoup tag, have no such descendant, or
        yield a name of five characters or fewer produce ``None``.

        Returns:
            A dict with keys ``name``, ``description``, ``part_number`` and
            ``quantity_available``, or ``None``.
        """
        try:
            # ``string=`` is the current spelling of the deprecated ``text=``
            name_elem = element.find(['a', 'span', 'div'], string=True)
            if not name_elem:
                return None
            name = name_elem.get_text(strip=True)
        except (AttributeError, TypeError):
            return None

        if len(name) <= 5:
            return None

        return {
            'name': name,
            'description': name,
            'part_number': None,
            'quantity_available': None,
        }

    def run_full_scrape(self, include_products=False, max_categories=20, delay_seconds=2):
        """Run the complete scraping process and save the results.

        Args:
            include_products: When true, also scrape product entries from the
                category pages and append them to ``self.product_data``.
            max_categories: Number of categories (from the start of the list)
                whose products are scraped; ``None`` scrapes all of them.
            delay_seconds: Pause between consecutive category requests.

        Returns:
            The list of categories found. It is empty, and nothing is saved,
            when no categories could be scraped.
        """
        print("Starting Digi-Key product scraping...")

        categories = self.scrape_main_categories()
        if not categories:
            print("No categories found. Exiting.")
            return categories

        if include_products:
            selected = categories[:max_categories]
            total = len(selected)
            for i, category in enumerate(selected, start=1):
                print(f"\nScraping category {i}/{total}: {category['name']}")

                for product in self.scrape_category_products(category['url']):
                    product['category'] = category['name']
                    product['main_category'] = category['category']
                    self.product_data.append(product)

                # Be polite between requests; no need to wait after the last
                if i < total:
                    time.sleep(delay_seconds)

        self.save_results(categories)

        return categories

    def save_results(self, categories, output_path=None):
        """Write categories, collected products and a summary to a JSON file.

        Args:
            categories: Category dicts to store under the ``categories`` key.
            output_path: Destination file; defaults to
                ``DEFAULT_OUTPUT_PATH``. Missing parent directories are
                created.
        """
        if output_path is None:
            output_path = self.DEFAULT_OUTPUT_PATH

        results = {
            'categories': categories,
            'products': self.product_data,
            'summary': {
                'total_categories': len(categories),
                'total_products': len(self.product_data),
            },
        }

        # open() does not create directories, so make sure the target exists
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f"\nResults saved to: {output_path}")

# Usage: runs only when executed as a script / notebook cell, not on import
if __name__ == "__main__":
    scraper = DigiKeyProductScraper()
    
    # Scrape the category links only (include_products=False skips the
    # per-category product pages); run_full_scrape also saves them as JSON.
    categories = scraper.run_full_scrape(include_products=False)
    


Starting Digi-Key product scraping...
Scraping main categories from: https://www.digikey.com/en/products
Found 681 unique categories

Results saved to: digikey_product_data.json
