In [None]:
import os
import logging
import sys
import env

#set certificate to user-specific (avoids issues with company-issued certificate)
os.environ['REQUESTS_CA_BUNDLE'] = env.cert

#logger config
log_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s', datefmt='%H:%M:%S')
# Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(log_formatter)

# File handler
file_handler = logging.FileHandler("scrape.log", mode='w', encoding='utf-8')
file_handler.setFormatter(log_formatter)

# Combine them
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(console_handler)
logger.addHandler(file_handler)

In [None]:
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_core.documents import Document
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
from urllib.parse import urljoin, urlparse
import time
import random

BASE_URL = "https://docs.arduino.cc/programming/"
DOMAIN_FILTER = "docs.arduino.cc"
CRAWLED = set()
start_time = time.time()
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# ########################################################################################################################################################
# Helper: Get internal links from a page. THE HELPER IS CREATING OUR LIST OF URLS TO SCRAPE
# ########################################################################################################################################################
def get_internal_links(url):
    time.sleep(random.uniform(1.0,2.5)) #Sleep betwen 1.0 and 2.5 seconds
    links = set()
    try:
        # response = requests.get(url, timeout=10) #this was the requests functions that partially failed b/c the base_url is set up with Gatsby/React--so requests.get() only fetches the raw HTML without JavaScript execution. (IE only a placeholder page with no <a> tags or links)
        driver.get(url) #selenium scraper instead of requests.get()
        time.sleep(2) # wait for JS to load
        # soup = BeautifulSoup(response.text, 'html.parser') # changing this to reflect the new Selenium driver instead of response.text
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            full_url = urljoin(url, href)
            parsed = urlparse(full_url)
            if parsed.netloc.endswith(DOMAIN_FILTER) and parsed.scheme.startswith("http"):
                clean_url = full_url.split('#')[0]
                links.add(clean_url)
        logger.info(f"Links added for tracking for this level: {list(links)}")
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        logger.warning(f"Response Exception: failed to fetch {url}: {e}")
    return links


# #######################################################################################################################################################
# A BLOCK OF CODE TO REDO THE ACTUAL SCRAPING, THIS TIME WITH SELENIUM'S WEBDRIVEWAIT FUNCTION TO MORE CAREFULLY SCRAPE AND ONLY GRAB "MAIN" CONTENT WRAPPERS.
# #######################################################################################################################################################

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import logging

def get_page_text_with_retry(driver, url, retries=2, wait_time=10, wait_for_selector="main"):
    """
    Load a URL using Selenium and retry if no content appears.
    Uses WebDriverWait to wait for a specific DOM element.
    """
    for attempt in range(1, retries + 2):  # first try + retries
        try:
            driver.get(url)

            # Wait for the main content to load
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_selector))
            )

            soup = BeautifulSoup(driver.page_source, "html.parser")
            text = soup.get_text(separator="\n").strip()

            if text:
                return text

            logging.warning(f"[Attempt {attempt}] No meaningful content at {url}")

        except TimeoutException:
            logging.warning(f"[Attempt {attempt}] Timeout waiting for selector '{wait_for_selector}' on {url}")
        except Exception as e:
            logging.error(f"[Attempt {attempt}] Error loading {url}: {e}")

        time.sleep(2 + attempt)  # Backoff before retry

    logging.error(f"❌ Final failure for {url}")
    return ""

# Step 1: Crawl base page. FOR LINKS, NOT FOR CONTENT
logger.info(f"Crawling Level 1: {BASE_URL}") #logging for level 1
level1_links = get_internal_links(BASE_URL)
CRAWLED.update(level1_links)  # this is the culprit for section 2 failing. We are adding the tagged links to the CRAWLED list before they are even getting crawled.  
print(f"Level 1: Found {len(level1_links)} links")
logger.info(f"Level 1: Found {len(level1_links)} links.")
logger.info(f"Example links: {list(level1_links)[:3]}")

# Step 2: Crawl one level deeper. AGAIN, FOR LINKS--NOT CONTENT
level2_links = set()
for url in list(level1_links):
    if url not in CRAWLED:
        logger.info(f"Crawling Level 2: {url}")  #logging for level 2
        child_links = get_internal_links(url)
        level2_links.update(child_links)
        CRAWLED.add(url)

        delay = random.uniform(1.0, 2.5)
        logger.info(f"Sleeping for {delay:.2f} seconds")
        time.sleep(delay) #Sleep betwen 1.0 and 2.5 seconds

print(f"Level 2: Found {len(level2_links)} additional links")

# 3rd level scrape. ALSO FOR LINKS, NOT FOR CONTENT
level3_links = set()
for url in level2_links:
    if url not in CRAWLED:
        logger.info(f"Crawling Level 3: {url}") # level 3 logging
        child_links = get_internal_links(url)
        level3_links.update(child_links)
        CRAWLED.add(url) #add link to the set of crawled links--avoids uuplicating the scrape

        delay = random.uniform(1.0, 2.5)
        logger.info(f"Sleeping for {delay:.2f} seconds")
        time.sleep(delay)

logger.info(f"Level 3: Found {len(level3_links)} additional links")


# Combine and deduplicate all links
all_links = sorted(CRAWLED)
logger.info(f"Total unique URLs to load: {len(all_links)}") #logger for all unique links
print(f"Total unique URLs to load: {len(all_links)}")

# Step 3: Load all pages with Selenium. THIS IS WHERE THE ACTUAL SCRAPING OCCURS
logger.info(f"Loading {len(all_links)} pages with Selenium...")

docs = []
for url in all_links:
    logger.info(f"Loading: {url}")
    content = get_page_text_with_retry(driver, url, retries=2, wait_time=10, wait_for_selector="main")              # loader = SeleniumURLLoader(urls=all_links)  # THE ORIGIAL LOADER, WHICH WE ARE REPLACING WITH THE NEW UDF

    if content:
        docs.append(Document(page_content=content, metadata={"source": url}))                                       # docs = loader.load()
    else:
        logger.warning(f"⚠️ Skipped empty or failed page: {url}")
logger.info(f"Successfully loaded {len(docs)} documents with meaningful content.")

elapsed = time.time() - start_time
logger.info(f"✅ Scraping completed in {elapsed:.2f} seconds.")


print(f"Successfully loaded {len(docs)} documents.")


12:32:08 [INFO] Crawling Level 1: https://docs.arduino.cc/programming/
12:32:12 [INFO] Links added for tracking for this level: ['https://docs.arduino.cc/built-in-examples/', 'https://docs.arduino.cc/libraries/liquidcrystal-i2c/', 'https://docs.arduino.cc/language-reference/en/functions/communication/print/', 'https://docs.arduino.cc/micropython/basics/micropython-basics/', 'https://docs.arduino.cc/language-reference/en/functions/communication/serial/', 'https://docs.arduino.cc/language-reference/', 'https://docs.arduino.cc/libraries/wifi/', 'https://docs.arduino.cc/language-reference/en/variables/data-types/stringObject/', 'https://docs.arduino.cc/micropython/basics/board-installation/', 'https://docs.arduino.cc/cloud-api/dashboards-api/', 'https://docs.arduino.cc/language-reference/en/functions/external-interrupts/attachInterrupt/', 'https://docs.arduino.cc/built-in-examples/control-structures/ifStatementConditional/', 'https://docs.arduino.cc/micropython/basics/code-editors/', 'http

In [3]:
print(docs[0].page_content)

/
 
ARDUINO.CC
 
 
Hardware
 
Software
 
Cloud
 
Programming
 
Learn
 
Tutorials
Rapid Access
 
Libraries
 
Language Reference
 
Built-in Examples
Arduino Documentation
Browse through all our documentation to learn everything for your Arduino journey.
Hardware
The vital pieces of hardware documentation you need to start your Arduino projects.
BROWSE HARDWARE
 
Cloud
Arduino Cloud is a online platform that allows you to create, deploy and monitor IoT projects.
DISCOVER CLOUD
 
Software
Learn about the IDEs, Web Editor, CLI and all the software tools that you need to get your hands dirty.
DISCOVER SOFTWARE
 
Programming
All you need to know to program with Arduino, including library documentation.
START CODING
 
Set Up Your Arduino Board
Set up your Arduino board with this interactive tutorial. Don’t have any Arduino hardware yet? Have a look at our
 
Store
!
Type to Search
GET STARTED
Discover
Learn Arduino
 
Knowledge, principles and techniques behind the Arduino ecosystem.
Tutorials
 

In [4]:
docs_backup = docs.copy()

In [5]:
import os
import json
import re
from urllib.parse import urlparse

def sanitize_url_for_filename(url):
    # Strip scheme and convert slashes to underscores
    parsed = urlparse(url)
    path = parsed.path.strip("/")
    path = re.sub(r"[^\w\-]+", "_", path)  # Keep it filename-safe
    if not path:
        path = "index"
    return path[:50]  # limit length to avoid overly long filenames

def save_documents_locally(docs, folder_name="scraped_output"):
    output_dir = os.path.join(os.getcwd(), folder_name)
    os.makedirs(output_dir, exist_ok=True)

    for i, doc in enumerate(docs):
        url = doc.metadata.get("source", f"no_url_{i}")
        safe_name = sanitize_url_for_filename(url)
        base_name = f"{i:03}_{safe_name}"

        # Save content
        txt_path = os.path.join(output_dir, f"{base_name}.txt")
        with open(txt_path, "w", encoding="utf-8") as f_txt:
            f_txt.write(doc.page_content)

        # Save metadata
        json_path = os.path.join(output_dir, f"{base_name}.json")
        with open(json_path, "w", encoding="utf-8") as f_json:
            json.dump(doc.metadata, f_json, indent=2)

    print(f"✅ Saved {len(docs)} documents to: {output_dir}")


In [6]:
save_documents_locally(docs, folder_name="scrape_test_20250504")

✅ Saved 27 documents to: c:\Users\brian.clements\Documents\RAG_POC\Doc_Scraping\Func_App\scrape_test_20250504
