In [None]:
import os
import logging
import sys
import env
from dotenv import load_dotenv

# Path to a user-specific CA bundle, read from the environment (None when CERT_PATH is unset).
# The disabled line below used to export a certificate for `requests` to avoid issues with
# the company-issued certificate; as written, CERT_PATH is only read here, not applied.
# os.environ['REQUESTS_CA_BUNDLE'] = env.cert
CERT_PATH = os.getenv("CERT_PATH")
# NOTE(review): load_dotenv is imported above but never called in this cell -- confirm the
# .env values are loaded somewhere else (possibly by `env`) before relying on os.getenv.


def install_handler(target, handler, name):
    """Attach ``handler`` to ``target`` under ``name``, replacing any earlier one.

    Notebook cells get re-run. A plain ``addHandler`` call would stack one more
    handler on the logger every time, so each message would be written once per
    stale handler and every stale FileHandler would keep its file open.

    Args:
        target: the ``logging.Logger`` to attach to.
        handler: the freshly built ``logging.Handler``.
        name: stable identifier used to find the handler left by a previous run.

    Returns:
        The handler that was attached (the ``handler`` argument).
    """
    for stale in [h for h in target.handlers if h.get_name() == name]:
        target.removeHandler(stale)
        stale.close()  # releases the log file; a StreamHandler leaves its stream open
    handler.set_name(name)
    target.addHandler(handler)
    return handler


# Logger config: one shared format (HH:MM:SS [LEVEL] message) for console and file.
log_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s', datefmt='%H:%M:%S')

# Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(log_formatter)

# File handler (mode='w' starts scrape.log from scratch each time this cell runs)
file_handler = logging.FileHandler("scrape.log", mode='w', encoding='utf-8')
file_handler.setFormatter(log_formatter)

# Combine them on the root logger; fixed names keep re-runs from duplicating output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
install_handler(logger, console_handler, "scrape-console")
install_handler(logger, file_handler, "scrape-file")

In [None]:
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_core.documents import Document
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
from urllib.parse import urljoin, urlparse
import time
import random

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import UDFS

# --- Configuration (read from the environment) --------------------------------
# BASE_URL = env.BASE_URL
BASE_URL = os.getenv("BASE_URL")
# DOMAIN_FILTER = env.DOMAIN_FILTER
DOMAIN_FILTER = os.getenv("DOMAIN_FILTER")
if not BASE_URL:
    # Fail before a browser is launched rather than deep inside the first page load.
    raise RuntimeError("BASE_URL is not set; define it in the environment before running the crawl.")

CRAWL_DELAY_RANGE = (1.0, 2.5)  # seconds; random pause after each crawled page

CRAWLED = set()  # URLs whose links have already been collected
start_time = time.time()
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)


def crawl_level(urls, level):
    """Collect the internal links of every not-yet-crawled URL in ``urls``.

    Link discovery only -- page content is not scraped here. Each visited URL is
    added to the module-level ``CRAWLED`` set so later levels skip it, and a
    random pause within ``CRAWL_DELAY_RANGE`` follows every page.

    Args:
        urls: iterable of URLs to visit.
        level: crawl depth, used only in log messages.

    Returns:
        set of everything ``UDFS.get_internal_links`` returned for the visited
        pages (presumably URL strings -- confirm against UDFS).
    """
    found = set()
    for url in list(urls):
        if url in CRAWLED:
            continue
        logger.info(f"Crawling Level {level}: {url}")
        found.update(UDFS.get_internal_links(url, driver, DOMAIN_FILTER, logger))
        CRAWLED.add(url)  # mark only after the page was actually crawled

        delay = random.uniform(*CRAWL_DELAY_RANGE)
        logger.info(f"Sleeping for {delay:.2f} seconds")
        time.sleep(delay)
    logger.info(f"Level {level}: Found {len(found)} additional links")
    return found


try:
    # Step 1: Crawl base page. FOR LINKS, NOT FOR CONTENT
    logger.info(f"Crawling Level 1: {BASE_URL}")
    level1_links = UDFS.get_internal_links(BASE_URL, driver, DOMAIN_FILTER, logger)
    # Only the base page has been crawled so far. Do NOT add level1_links to CRAWLED
    # here: that would make level 2 skip every one of them before it is visited.
    CRAWLED.add(BASE_URL)
    print(f"Level 1: Found {len(level1_links)} links")
    logger.info(f"Level 1: Found {len(level1_links)} links.")
    logger.info(f"Example links: {list(level1_links)[:3]}")

    # Step 2: Crawl one level deeper. AGAIN, FOR LINKS--NOT CONTENT
    level2_links = crawl_level(level1_links, 2)
    print(f"Level 2: Found {len(level2_links)} additional links")

    # 3rd level scrape. ALSO FOR LINKS, NOT FOR CONTENT
    level3_links = crawl_level(level2_links, 3)

    # Combine and deduplicate all links. CRAWLED alone is not enough because it
    # never contains the level 3 links (they are discovered but not crawled).
    all_links = sorted(set().union(level1_links, level2_links, level3_links))
    logger.info(f"Total unique URLs to load: {len(all_links)}")
    print(f"Total unique URLs to load: {len(all_links)}")

    # Step 3: Load all pages with Selenium. THIS IS WHERE THE ACTUAL SCRAPING OCCURS
    # (UDFS.get_page_text_with_retry replaces the original SeleniumURLLoader(...).load().)
    logger.info(f"Loading {len(all_links)} pages with Selenium...")

    docs = []
    for url in all_links:
        logger.info(f"Loading: {url}")
        content = UDFS.get_page_text_with_retry(driver, url, retries=2, wait_time=10, wait_for_selector="main")

        if content:
            docs.append(Document(page_content=content, metadata={"source": url}))
        else:
            logger.warning(f"⚠️ Skipped empty or failed page: {url}")
    logger.info(f"Successfully loaded {len(docs)} documents with meaningful content.")
finally:
    # Always shut the headless browser down, even when a crawl step raises;
    # otherwise every (re-)run of this cell leaves a Chrome process behind.
    driver.quit()

elapsed = time.time() - start_time
logger.info(f"✅ Scraping completed in {elapsed:.2f} seconds.")


print(f"Successfully loaded {len(docs)} documents.")

10:06:53 [INFO] Crawling Level 1: https://docs.arduino.cc/programming/
10:07:02 [INFO] Links added for tracking for this level: ['https://docs.arduino.cc/built-in-examples/arduino-isp/ArduinoISP/', 'https://docs.arduino.cc/certifications/', 'https://docs.arduino.cc/', 'https://docs.arduino.cc/language-reference/en/functions/digital-io/pinMode/', 'https://docs.arduino.cc/micropython/basics/micropython-basics/', 'https://docs.arduino.cc/libraries/liquidcrystal-i2c/', 'https://docs.arduino.cc/language-reference/en/functions/communication/print/', 'https://docs.arduino.cc/libraries/wifi/', 'https://docs.arduino.cc/libraries/liquidcrystal/', 'https://docs.arduino.cc/libraries/', 'https://docs.arduino.cc/micropython/basics/digital-analog-pins/', 'https://docs.arduino.cc/libraries/stepper/', 'https://docs.arduino.cc/language-reference/', 'https://docs.arduino.cc/micropython/basics/board-examples/', 'https://docs.arduino.cc/micropython/', 'https://docs.arduino.cc/cloud-api/dashboards-api/', 'h

In [None]:
# Hand the scraped documents to the UDFS helper for local storage.
# The folder suffix looks like a run date (2025-05-05) -- TODO confirm and update per run.
OUTPUT_FOLDER = "scraped_output20250505"
UDFS.save_documents_locally(docs, folder_name=OUTPUT_FOLDER)