In [1]:
import os
import logging
import sys
import env

# Point REQUESTS_CA_BUNDLE at the user-specific certificate taken from the
# `env` module (avoids issues with the company-issued certificate).
os.environ.update(REQUESTS_CA_BUNDLE=env.cert)

# Logger config: every record goes to both the cell output (stdout) and scrape.log.
logger = logging.getLogger()  # root logger, so records from any module are captured
logger.setLevel(logging.INFO)

# Re-running this cell must not stack a second pair of handlers on the root
# logger (that would print every message twice and leave the previous
# scrape.log handle open). Handlers created by this cell are tagged with the
# names below, so only those are removed and closed before new ones are added;
# handlers installed by anything else are left alone.
NOTEBOOK_HANDLER_NAMES = ("scrape_console", "scrape_file")
for stale_handler in [h for h in logger.handlers if h.get_name() in NOTEBOOK_HANDLER_NAMES]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

log_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s', datefmt='%H:%M:%S')

# Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.set_name("scrape_console")
console_handler.setFormatter(log_formatter)

# File handler (mode='w' truncates scrape.log each time this cell runs)
file_handler = logging.FileHandler("scrape.log", mode='w', encoding='utf-8')
file_handler.set_name("scrape_file")
file_handler.setFormatter(log_formatter)

# Combine them
logger.addHandler(console_handler)
logger.addHandler(file_handler)

In [2]:
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_core.documents import Document
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
from urllib.parse import urljoin, urlparse
import time
import random

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import UDFS

# Crawl configuration comes from the project-local `env` module.
BASE_URL = env.BASE_URL
DOMAIN_FILTER = env.DOMAIN_FILTER
CRAWLED = set()  # URLs queued for loading; rebuilt from scratch on every run of this cell
start_time = time.time()

# One headless Chrome session is shared by link discovery and page loading.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

try:
    # Step 1: Crawl base page. FOR LINKS, NOT FOR CONTENT
    logger.info(f"Crawling Level 1: {BASE_URL}")
    level1_links = UDFS.get_internal_links(BASE_URL, driver, DOMAIN_FILTER, logger)
    # NOTE(review): links are added to CRAWLED before they have been visited, so
    # CRAWLED means "queued", not "already scraped". The original author flagged
    # this as the cause of a later crawl section ("section 2") failing — keep it
    # in mind if a deeper crawl level is reintroduced.
    CRAWLED.update(level1_links)
    print(f"Level 1: Found {len(level1_links)} links")
    logger.info(f"Level 1: Found {len(level1_links)} links.")
    logger.info(f"Example links: {list(level1_links)[:3]}")

    # Combine and deduplicate all links (sorted so the load order is deterministic)
    all_links = sorted(CRAWLED)
    logger.info(f"Total unique URLs to load: {len(all_links)}")
    print(f"Total unique URLs to load: {len(all_links)}")

    # Step 2: Load all pages with Selenium. THIS IS WHERE THE ACTUAL SCRAPING OCCURS
    # (UDFS.get_page_text_with_retry replaces the former
    # SeleniumURLLoader(urls=all_links).load() call.)
    logger.info(f"Loading {len(all_links)} pages with Selenium...")

    docs = []
    for url in all_links:
        logger.info(f"Loading: {url}")
        # presumably returns a falsy value when the page is empty or every retry
        # failed — confirm against UDFS
        content = UDFS.get_page_text_with_retry(driver, url, retries=2, wait_time=10, wait_for_selector="main")

        if content:
            docs.append(Document(page_content=content, metadata={"source": url}))
        else:
            logger.warning(f"⚠️ Skipped empty or failed page: {url}")
    logger.info(f"Successfully loaded {len(docs)} documents with meaningful content.")

    elapsed = time.time() - start_time
    logger.info(f"✅ Scraping completed in {elapsed:.2f} seconds.")

    print(f"Successfully loaded {len(docs)} documents.")
finally:
    # Always shut the browser down, including when the crawl raises; otherwise
    # each run of this cell leaves a headless Chrome process behind.
    driver.quit()

14:11:45 [INFO] Crawling Level 1: https://docs.arduino.cc/programming/
14:11:49 [INFO] Links added for tracking for this level: ['https://docs.arduino.cc/micropython/basics/board-installation/', 'https://docs.arduino.cc/micropython/basics/digital-analog-pins/', 'https://docs.arduino.cc/cloud-api', 'https://docs.arduino.cc/certifications/', 'https://docs.arduino.cc/language-reference/en/variables/data-types/stringObject/', 'https://docs.arduino.cc/micropython/basics/code-editors/', 'https://docs.arduino.cc/language-reference/en/functions/communication/serial/', 'https://docs.arduino.cc/libraries/wifi/', 'https://docs.arduino.cc/', 'https://docs.arduino.cc/built-in-examples/basics/Blink/', 'https://docs.arduino.cc/language-reference/en/functions/digital-io/pinMode/', 'https://docs.arduino.cc/built-in-examples/digital/Button/', 'https://docs.arduino.cc/language-reference/en/functions/communication/wire/', 'https://docs.arduino.cc/micropython/', 'https://docs.arduino.cc/libraries/liquidcry

In [3]:
# Hand the loaded documents to the project helper that writes them to disk.
OUTPUT_FOLDER_NAME = "scrape_test_20250504"
UDFS.save_documents_locally(docs, folder_name=OUTPUT_FOLDER_NAME)

✅ Saved 27 documents to: c:\Users\brian.clements\Documents\RAG_POC\Doc_Scraping\Func_App\scrape_test_20250504
