In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

# CDC Diabetes root URL
BASE_URL = "https://www.cdc.gov/diabetes/"
VISITED = set()
CRAWLED_TEXT = []

def clean_text(soup):
    # Remove navigation, footer, scripts, etc.
    for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
        tag.decompose()

    # Extract visible text only
    text = soup.get_text(separator="\n")
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join([line for line in lines if line])

def crawl(url, max_pages=20):
    global VISITED
    if url in VISITED or len(VISITED) >= max_pages:
        return

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        VISITED.add(url)

        print(f"✅ Crawled: {url}")
        page_text = clean_text(soup)
        CRAWLED_TEXT.append(f"\n==== SOURCE: {url} ====\n{page_text}\n")

        # Crawl internal CDC diabetes links
        for link_tag in soup.find_all("a", href=True):
            href = link_tag["href"]
            full_url = urljoin(url, href)
            if full_url.startswith(BASE_URL):
                crawl(full_url)

        time.sleep(1)

    except Exception as e:
        print(f"⚠️ Error crawling {url}: {e}")

# Start crawling from base
crawl(BASE_URL)

# Save to file
with open("cdc_diabetes_guidelines.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(CRAWLED_TEXT))
    
print("All content saved to: cdc_diabetes_guidelines.txt")


✅ Crawled: https://www.cdc.gov/diabetes/
✅ Crawled: https://www.cdc.gov/diabetes/#content
✅ Crawled: https://www.cdc.gov/diabetes/#cdc-search
✅ Crawled: https://www.cdc.gov/diabetes/#gov-notice
✅ Crawled: https://www.cdc.gov/diabetes/prevention-type-2/building-a-healthy-habit.html
✅ Crawled: https://www.cdc.gov/diabetes/prevention-type-2/building-a-healthy-habit.html#content
✅ Crawled: https://www.cdc.gov/diabetes/prevention-type-2/building-a-healthy-habit.html#cdc-search
✅ Crawled: https://www.cdc.gov/diabetes/prevention-type-2/building-a-healthy-habit.html#gov-notice
✅ Crawled: https://www.cdc.gov/diabetes/education-support-programs/index.html
✅ Crawled: https://www.cdc.gov/diabetes/education-support-programs/index.html#content
✅ Crawled: https://www.cdc.gov/diabetes/education-support-programs/index.html#cdc-search
✅ Crawled: https://www.cdc.gov/diabetes/education-support-programs/index.html#gov-notice
✅ Crawled: https://www.cdc.gov/diabetes/diabetes-tv/diabetes-kickstart.html
✅ Craw