# Web Scraping the UChicago MSADS Website

Disclaimer: I used ChatGPT for some parts and marked prompts where I used it.

## Libraries

In [1]:
!pip install requests beautifulsoup4 tqdm



In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
import json
import os

## Scraping

AI prompt: "I want to scrape a web page starting from the home directory (give url) and I want to scrape all embedded links as well. How can I do this?"

In [3]:
# Main page URL
BASE_URL = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"

# Headers to simulate a browser visit
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}

In [4]:
# Get all internal links from a soup object
def get_internal_links(soup, base_url):
    internal_links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        full_url = urljoin(base_url, href)
        parsed_url = urlparse(full_url)
        base_domain = urlparse(base_url).netloc

        if parsed_url.netloc == base_domain and "mailto" not in href and "#" not in href:
            internal_links.add(full_url)
    return internal_links

In [5]:
# Extract clean text from a given URL
def extract_text_from_url(url):
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, "html.parser")

        # Remove unwanted tags
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # Extract text content
        text = soup.get_text(separator=" ", strip=True)
        return text
    except Exception as e:
        print(f"Failed to process {url}: {e}")
        return None

In [6]:
# Main scraping function
def scrape_site(base_url):
    scraped_data = {}

    print("🔎 Fetching main page...")
    response = requests.get(base_url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")

    internal_links = get_internal_links(soup, base_url)
    internal_links.add(base_url)  # Include main page

    print(f"📄 Found {len(internal_links)} internal pages to scrape...\n")

    for url in tqdm(internal_links):
        page_text = extract_text_from_url(url)
        if page_text:
            scraped_data[url] = page_text

    return scraped_data


In [7]:
# scraping run
scraped_data = scrape_site(BASE_URL)

🔎 Fetching main page...
📄 Found 66 internal pages to scrape...



100%|██████████| 66/66 [00:24<00:00,  2.67it/s]


In [10]:
# check scraping
for url, text in scraped_data.items():
    print(f"URL: {url}\nContent Preview:\n{text[:500]}...\n{'-'*80}\n")

URL: https://datascience.uchicago.edu/news/can-a-doctors-notes-reveal-when-theyre-tired-new-research-illuminates-the-hidden-signals-of-physician-fatigue-and-raises-questions-about-ai-in-healthcare/
Content Preview:
Can a Doctor’s Notes Reveal When They’re Tired? New Research Illuminates the Hidden Signals of Physician Fatigue—And Raises Questions About AI in Healthcare – DSI Skip to main content About About the Data Science Institute The Data Science Institute (DSI) executes the University of Chicago’s bold, innovative vision of Data Science as a new discipline. Jobs & Opportunities Open faculty, postdoctoral, staff, and student roles with the UChicago Data Science Institute and our partners. Visiting DSI ...
--------------------------------------------------------------------------------

URL: https://datascience.uchicago.edu/outreach/
Content Preview:
Outreach – DSI Skip to main content About About the Data Science Institute The Data Science Institute (DSI) executes the University of

In [9]:
output_file = "/content/uchicago_ads_scraped.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=2, ensure_ascii=False)