# UChicago MS in Applied Data Science Website - Link Discovery

This notebook handles Phase 1 of the scraping process: discovering all links on the UChicago Data Science Institute website and saving them for later processing.

In [1]:
# Import required libraries
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import logging
from datetime import datetime

In [2]:
# Configuration
BASE_URL = "https://datascience.uchicago.edu/"
START_URL = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"
OUTPUT_DIR = "../data"
LOGS_DIR = "../logs"
DELAY = 0.1  # Reduced delay between requests (in seconds)
LINKS_FILE = os.path.join(OUTPUT_DIR, "discovered_links.json")

# Create output directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGS_DIR, exist_ok=True)

In [3]:
# Setup logging
log_file = os.path.join(LOGS_DIR, f"link_discovery_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()  # Also log to console
    ]
)
logger = logging.getLogger()

In [4]:
def is_internal_link(url):
    """
    Check if a URL is internal to the datascience.uchicago.edu domain
    """
    parsed_url = urlparse(url)
    return (not parsed_url.netloc or 
            'datascience.uchicago.edu' in parsed_url.netloc) and \
           not url.endswith(('.pdf', '.jpg', '.png', '.zip', '.doc', '.docx', '.webp'))

In [5]:
def normalize_url(url):
    """
    Normalize URL by removing trailing slashes, fragments, and query parameters
    """
    parsed = urlparse(url)
    normalized = parsed._replace(fragment='', query='').geturl()
    return normalized.rstrip('/')

In [6]:
def get_page_content(url):
    """
    Fetch the content of a page
    """
    try:
        response = requests.get(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
        })
        response.raise_for_status()
        return response.text
    except Exception as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

In [7]:
def extract_links(soup, base_url):
    """
    Extract internal links from a BeautifulSoup object
    """
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Skip empty links and javascript
        if not href or href.startswith('javascript:') or href == '#':
            continue
            
        # Resolve relative URLs
        full_url = urljoin(base_url, href)
        
        # Only include internal links
        if is_internal_link(full_url):
            links.append(normalize_url(full_url))
    
    return links

In [8]:
def discover_all_urls(start_url):
    """
    Discover all URLs on the website
    """
    queue = [start_url]
    discovered = set()
    
    while queue:
        # Get the next URL from the queue
        url = queue.pop(0)
        
        # Skip if already discovered
        if url in discovered:
            continue
        
        logger.info(f"Discovering links from: {url}")
        
        # Fetch the page content
        html_content = get_page_content(url)
        if not html_content:
            discovered.add(url)
            continue
        
        # Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Extract links from the page
        links = extract_links(soup, url)
        
        # Add new links to the queue
        for link in links:
            if link not in discovered and link not in queue:
                queue.append(link)
        
        # Mark as discovered
        discovered.add(url)
        
        # Sleep to avoid overloading the server
        time.sleep(DELAY)  
    
    return list(discovered)

In [9]:
def save_discovered_links(links):
    """
    Save the discovered links to a JSON file
    """
    data = {
        'discovery_date': datetime.now().isoformat(),
        'count': len(links),
        'links': links
    }
    
    with open(LINKS_FILE, 'w') as f:
        json.dump(data, f, indent=2)
    
    logger.info(f"Saved {len(links)} links to {LINKS_FILE}")
    return LINKS_FILE

In [10]:
# Run the link discovery process
logger.info("Starting link discovery process...")
start_time = time.time()

# Discover all URLs
all_urls = discover_all_urls(START_URL)
logger.info(f"Found {len(all_urls)} URLs")

# Save the links
links_file = save_discovered_links(all_urls)

end_time = time.time()
elapsed_time = end_time - start_time
logger.info(f"Link discovery completed in {elapsed_time:.2f} seconds")

2025-05-04 16:21:33,844 - INFO - Starting link discovery process...
2025-05-04 16:21:33,845 - INFO - Discovering links from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/
2025-05-04 16:21:52,087 - INFO - Discovering links from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science
2025-05-04 16:21:53,244 - INFO - Discovering links from: https://datascience.uchicago.edu
2025-05-04 16:21:54,724 - INFO - Discovering links from: https://datascience.uchicago.edu/about
2025-05-04 16:21:56,022 - INFO - Discovering links from: https://datascience.uchicago.edu/about/about-dsi
2025-05-04 16:21:57,330 - INFO - Discovering links from: https://datascience.uchicago.edu/about/jobs
2025-05-04 16:21:58,499 - INFO - Discovering links from: https://datascience.uchicago.edu/about/visiting-dsi
2025-05-04 16:21:59,813 - INFO - Discovering links from: https://datascience.uchicago.edu/about/contact
2025-05-04 16:22:01,214 - INFO - Disc

In [11]:
# Preview the links
if os.path.exists(LINKS_FILE):
    with open(LINKS_FILE, 'r') as f:
        data = json.load(f)
    
    print(f"Discovery date: {data['discovery_date']}")
    print(f"Total links: {data['count']}")
    print("\nSample links:")
    for link in data['links'][:10]:  # Show first 10 links
        print(f"  - {link}")
    print("...")

Discovery date: 2025-05-04T16:53:47.334212
Total links: 1413

Sample links:
  - https://datascience.uchicago.edu/research/ai-science/people
  - https://datascience.uchicago.edu/people/rajvi-shah
  - https://datascience.uchicago.edu/people/olivia-morkved
  - https://datascience.uchicago.edu/people/mengzhan-jhan-liufu
  - https://datascience.uchicago.edu/people/jennifer-wei-mem
  - https://datascience.uchicago.edu/research/data-democracy-initiative
  - https://datascience.uchicago.edu/people/elva-lu
  - https://datascience.uchicago.edu/people/oscar-leong
  - https://datascience.uchicago.edu/people/kristin-haddadin-sheher
  - https://datascience.uchicago.edu/people/ramanujan-srinath
...
