In [1]:
# load pd df from csv of jlc_parts.csv

import pandas as pd
import numpy as np

# load csv
df = pd.read_csv('jlc_parts.csv', sep=',', header=0, index_col=0)

In [1]:
import requests
from bs4 import BeautifulSoup

def get_lcsc_data(lcsc_id):
    # Construct the URL
    url = f'https://jlcpcb.com/partdetail/{lcsc_id}'
    
    # Get the HTML content
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        return None  # Or some error handling

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html5lib')
    
    # Dictionary to store the data
    part_details = {}

    # Extracting general details
    for dl in soup.find_all('dl', class_='flex dl-list'):
        dt = dl.find('dt').get_text(strip=True)
        dd = dl.find('dd').get_text(strip=True)
        part_details[dt] = dd

    # Extracting stock information
    stock_info = soup.find("div", class_="text-16 font-bold")
    stock = int(stock_info.text.strip().split(': ')[1]) if stock_info else None
    part_details['Stock'] = stock

    # Extracting price information
    price_divs = soup.find_all("div", class_="flex items-center justify-between mt-14")
    price = []
    for div in price_divs:
        qty = div.find("span", class_="w-120 inline-block").text.strip()
        pr = div.find("span", class_="w-120 text-left").text.strip().replace('$', '')
        price.append((qty, float(pr)))
    part_details['Price'] = price

    # Check if 'JLCPCB Part #' is empty
    if not part_details.get('JLCPCB Part #'):
        return None

    return part_details

# Test the function with a specific lcsc_id
lcsc_id = "C10050"
data = get_lcsc_data(lcsc_id)
# print(data)
data


{'Manufacturer': 'STMicroelectronics',
 'MFR.Part #': 'VIPER53-DIP-E',
 'JLCPCB Part #': 'C10050',
 'Package': 'DIP-8',
 'Description': '300kHz 620V 8.4V~19V DIP-8  AC-DC Controllers & Regulators ROHS',
 'Datasheet': 'Download',
 'Source': 'JLCPCB',
 'Assembly Type': 'Wave SolderingA PCB assembly fixture is needed to protect and support this part during the assembly process.',
 'CAD Model': 'PCB Footprint or Symbol',
 'Stock': 0,
 'Price': [('1+', 0.3444),
  ('10+', 0.2814),
  ('30+', 0.2544),
  ('100+', 0.2207),
  ('500+', 0.2057),
  ('1000+', 0.1967)]}

In [2]:
# get_lcsc_data('C1529')

# put in firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Use a service account.
cred = credentials.Certificate('atopile-880ca67acfe2.json')

app = firebase_admin.initialize_app(cred)

db = firestore.client()

# Function to add data to Firestore
# Function to add data to Firestore
# Function to add data to Firestore with the 'JLCPCB Part #' as the document name
def add_to_firestore(collection_name, data_dict):
    # Extract the 'JLCPCB Part #' to use as the document ID
    doc_id = data_dict.get('JLCPCB Part #')
    if not doc_id:
        raise ValueError("Missing 'JLCPCB Part #' in the data dictionary")

    # Transform nested arrays if necessary (e.g., 'Price' field)
    if 'Price' in data_dict:
        data_dict['Price'] = [{'quantity': p[0], 'price': p[1]} for p in data_dict['Price']]
    
    # Specify the document ID and set the data
    collection_ref = db.collection(collection_name).document(doc_id)
    collection_ref.set(data_dict)



# Use the function
# part_data = get_lcsc_data('C1529')
# add_to_firestore('unparsed-parts', part_data)
    
def scrape_range(start, end):
    for x in range(start, end):
        part_data = get_lcsc_data(f'C{x}')
        if part_data:
            add_to_firestore('unparsed-parts', part_data)
            print(f'C{x} added')
        else:
            print(f'C{x} not found')

# Create threads to scrape the parts in parallel
from threading import Thread

# Define the number of threads
num_threads = 300
min_part_id = 10000
max_part_id = 200000

# Create the threads
threads = []
for i in range(num_threads):
    start = min_part_id + i * (max_part_id - min_part_id) // num_threads
    end = min_part_id + (i + 1) * (max_part_id - min_part_id) // num_threads
    threads.append(Thread(target=scrape_range, args=(start, end)))

# Start the threads
for thread in threads:
    thread.start()

# Wait for the threads to finish
for thread in threads:
    thread.join()



C11266 not found


Exception in thread Thread-5 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 42, in scrape_range
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 29, in add_to_firestore
NameError: name 'db' is not defined


C14433 not found


Exception in thread Thread-29 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 42, in scrape_range
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 29, in add_to_firestore
NameError: name 'db' is not defined


C11900 not found
C35333 not found


Exception in thread Thread-27 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 42, in scrape_range
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 29, in add_to_firestore
NameError: name 'db' is not defined


C20766 not foundC10000 not found

C21400 not found
C26466 not found
C12533 not found
C13166 not found
C13800 not found
C27100 not found


Exception in thread Thread-35 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 42, in scrape_range
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 29, in add_to_firestore
NameError: name 'db' is not defined


C111333 not foundC97400 not found

C93600 not found


Exception in thread Exception in thread Thread-194 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
Exception in thread Thread-135 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
Exception in thread Thread-168 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
Thread-148 (scrape_range):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.f

C179733 not foundC77133 not found

C120833 not found
C16333 not found
C48633 not found
C30266 not found
C35966 not found
C45466 not found
C194300 not found
C197466 not found
C58133 not found
C31533 not found
C133500 not found
C138566 not found
C54333 not found
C34066 not found
C100566 not found
C191766 not found
C68266 not found
C44200 not found
C39133 not found
C89166 not found
C158833 not found
C32166 not found
C15700 not found
C42300 not found
C49900 not found
C47366 not found
C32800 not found
C64466 not found
C106266 not found
C182900 not found
C40400 not found
C67000 not found
C129700 not found
C142366 not found
C86633 not found
C81566 not found
C28366 not found
C121466 not found
C108166 not found
C69533 not found
C132233 not found
C165166 not found
C50533 not found
C94866 not found
C58766 not found
C95500 not found
C149333 not found
C79033 not found
C56866 not found
C39766 not found
C85366 not found
C37233 not found
C51166 not found
C49266 not found
C37866 not found
C189866 not f

    self._target(*self._args, **self._kwargs)
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 42, in scrape_range
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 29, in add_to_firestore
    self._target(*self._args, **self._kwargs)
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 42, in scrape_range
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.6/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
NameError: name 'db' is not defined
NameError: name 'db' is not defined
    self._target(*self._args, **self._kwargs)
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 42, in scrape_range
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipykernel_28524/1362866264.py", line 29, in add_to_firestore
  File "/var/folders/xl/r_y6qlm504l70lmz99q_ndfw0000gn/T/ipy

TypeError: WebDriver.__init__() got an unexpected keyword argument 'executable_path'

In [98]:
import requests
from bs4 import BeautifulSoup

def get_lcsc_data(lcsc_id):
    # Construct the URL
    url = f'https://jlcpcb.com/partdetail/{lcsc_id}'
    
    # Get the HTML content
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        return None  # Or some error handling

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Dictionary to store the data
    part_details = {}

    # Extracting general details
    for dl in soup.find_all('dl', class_='flex dl-list'):
        dt = dl.find('dt').get_text(strip=True)
        dd = dl.find('dd').get_text(strip=True)
        part_details[dt] = dd

    # Extracting stock information
    stock_info = soup.find("div", class_="text-16 font-bold")
    stock = int(stock_info.text.strip().split(': ')[1]) if stock_info else None
    part_details['Stock'] = stock

    # Extracting price information
    price_divs = soup.find_all("div", class_="flex items-center justify-between mt-14")
    price = []
    for div in price_divs:
        qty = div.find("span", class_="w-120 inline-block").text.strip()
        pr = div.find("span", class_="w-120 text-left").text.strip().replace('$', '')
        price.append((qty, float(pr)))
    part_details['Price'] = price

    # Extracting specific attributes from the specifications table
    # First find the 'Specifications' header
    specs_header = soup.find('nav', text='Specifications')
    if specs_header:
        # Then find the table following this header
        spec_tables = specs_header.find_all('div', class_='el-table__body-wrapper').find('table')
        for spec_table in spec_tables:
            for row in spec_table.find_all('tr', class_='el-table__row'):
                cells = row.find_all('div', class_='cell')
                if len(cells) == 2:
                    attr_name = cells[0].text.strip()
                    attr_value = cells[1].text.strip()
                    part_details[attr_name] = attr_value

    # Check if 'JLCPCB Part #' is empty
    if not part_details.get('JLCPCB Part #'):
        return None

    return part_details

# Test the function with a specific lcsc_id
lcsc_id = "C25744"
data = get_lcsc_data(lcsc_id)
data


  specs_header = soup.find('nav', text='Specifications')


{'Manufacturer': 'UNI-ROYAL(Uniroyal Elec)',
 'MFR.Part #': '0402WGF1002TCE',
 'JLCPCB Part #': 'C25744',
 'Package': '0402',
 'Description': '62.5mW Thick Film Resistors ±100ppm/℃ ±1% 10kΩ 0402  Chip Resistor - Surface Mount ROHS',
 'Datasheet': 'Download',
 'Source': 'JLCPCB',
 'Assembly Type': 'SMT AssemblyA PCB assembly fixture is needed to protect and support this part during the assembly process.',
 'CAD Model': 'PCB Footprint or Symbol',
 'Stock': 9655804,
 'Price': [('1+', 0.0005),
  ('1000+', 0.0004),
  ('3000+', 0.0003),
  ('10000+', 0.0003),
  ('50000+', 0.0002),
  ('50200+', 0.0002)]}

In [94]:
import requests
from bs4 import BeautifulSoup

def get_lcsc_data(lcsc_id):

    url = f'https://jlcpcb.com/partdetail/{lcsc_id}'

    # Get the HTML content
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        return None  # Or some error handling

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the section with the specifications or attributes
    # This will depend on the actual structure of your HTML, but let's assume it's within a div with a specific class
    spec_div = soup.find('div', class_='spec-comp-box')

    # If the div is not found, try another method or print an error message
    if not spec_div:
        print("Specification section not found.")
    else:
        # Dictionary to store the attributes
        attributes = {}

        # Assuming each attribute is in a separate 'div' within the 'spec-comp-box' div
        attribute_divs = spec_div.find_all('div', recursive=False)

        for div in attribute_divs:
            # The structure within each 'div' might contain the attribute name and value
            # Extract the attribute name and value based on your HTML structure
            attribute_name = div.find('div', class_='attribute-name-class').text  # Replace with actual class or tag
            attribute_value = div.find('div', class_='attribute-value-class').text  # Replace with actual class or tag
            attributes[attribute_name] = attribute_value

        # Print the extracted attributes
        for attr_name, attr_value in attributes.items():
            print(f"{attr_name}: {attr_value}")

In [95]:
get_lcsc_data('C25744')

AttributeError: 'NoneType' object has no attribute 'text'