In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

BASE_URL = "https://www.who.int/health-topics/diabetes"
VISITED = set()
CRAWLED_TEXT = []

def clean_text(soup):
    for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join([line for line in lines if line])

def crawl(url, max_pages=20):
    global VISITED
    if url in VISITED or len(VISITED) >= max_pages:
        return

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        VISITED.add(url)

        print(f"Crawled: {url}")
        page_text = clean_text(soup)
        CRAWLED_TEXT.append(f"\n==== SOURCE: {url} ====\n{page_text}\n")

        # Find all internal WHO links related to diabetes
        for link_tag in soup.find_all("a", href=True):
            href = link_tag["href"]
            full_url = urljoin(url, href)
            parsed = urlparse(full_url)
            if "who.int" in parsed.netloc and "/diabetes" in parsed.path:
                crawl(full_url)

        time.sleep(1)

    except Exception as e:
        print(f"Error crawling {url}: {e}")

# Start crawling WHO
crawl(BASE_URL)

# Save to text file
with open("who_diabetes_guidelines.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(CRAWLED_TEXT))

print("WHO diabetes content saved to: who_diabetes_guidelines.txt")


Crawled: https://www.who.int/health-topics/diabetes
Crawled: https://www.who.int/health-topics/diabetes#content
Crawled: https://www.who.int/news-room/fact-sheets/detail/diabetes
Crawled: https://www.who.int/news-room/fact-sheets/detail/diabetes#content
Crawled: https://www.who.int/news-room/facts-in-pictures/detail/diabetes
Crawled: https://www.who.int/news-room/facts-in-pictures/detail/diabetes#content
WHO diabetes content saved to: who_diabetes_guidelines.txt
