# UChicago MS in Applied Data Science Website - Link Discovery

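This notebook handles Phase 1 of the scraping process: starting from the MS in Applied Data Science page, it discovers the internal links reachable on the UChicago Data Science Institute website (`datascience.uchicago.edu`) and saves them to a JSON file for later processing.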

In [2]:
import json
import logging
import os
import time
from collections import deque
from datetime import datetime
from pathlib import Path  # For more robust path handling
from urllib.parse import urljoin, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup
from robotexclusionrulesparser import RobotExclusionRulesParser

In [3]:
BASE_URL = "https://datascience.uchicago.edu/"
# Start URL for discovery
START_URL = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"
# Output directory for data files
OUTPUT_DIR = "../data"
# Output directory for log files
LOGS_DIR = "../logs"
# Delay between requests to be polite to the server (in seconds)
REQUEST_DELAY = 0.1  # Adjusted for politeness and performance
# Timeout for requests (in seconds)
REQUEST_TIMEOUT = 15
# File to save discovered links
LINKS_FILE = os.path.join(OUTPUT_DIR, "discovered_links.json")
# User-Agent for requests
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 GenAICodeReviewBot/1.0'
# List of file extensions to ignore during link discovery
EXCLUDED_FILE_EXTENSIONS = ('.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.mp3', '.mp4', '.avi', '.mov', '.webp', '.svg')
# Schemes to ignore
EXCLUDED_SCHEMES = ('mailto', 'tel', 'javascript')

In [4]:
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(LOGS_DIR).mkdir(parents=True, exist_ok=True)

In [5]:
log_file = Path(LOGS_DIR) / f"link_discovery_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(module)s - %(funcName)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()  # Also log to console
    ]
)
logger = logging.getLogger(__name__)

In [6]:
session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})

In [7]:
robots_parser = RobotExclusionRulesParser()
try:
    robots_url = urljoin(BASE_URL, "robots.txt")
    robots_response = session.get(robots_url, timeout=REQUEST_TIMEOUT)
    robots_response.raise_for_status()
    robots_parser.parse(robots_response.text)
    logger.info(f"Successfully fetched and parsed robots.txt from {robots_url}")
except requests.RequestException as e:
    logger.warning(f"Could not fetch robots.txt from {robots_url}: {e}. Proceeding without robots.txt rules.")
except Exception as e:
    logger.warning(f"Error parsing robots.txt: {e}. Proceeding without robots.txt rules.")

2025-05-12 10:59:45,468 - INFO - 2998126080 - <module> - Successfully fetched and parsed robots.txt from https://datascience.uchicago.edu/robots.txt


In [8]:
def is_allowed_by_robots(url):
    """Check if a URL is allowed by robots.txt."""
    if robots_parser: # Check if parser was initialized
        return robots_parser.is_allowed(USER_AGENT, url)
    return True # Default to allowed if robots.txt could not be parsed

In [9]:
def is_relevant_link(url, base_domain):
    """
    Check if a URL is internal to the specified base_domain, not a file,
    and not an excluded scheme.
    """
    parsed_url = urlparse(url)
    
    # Check scheme
    if parsed_url.scheme in EXCLUDED_SCHEMES:
        logger.debug(f"Skipping URL with excluded scheme: {url}")
        return False
        
    # Check if it's an HTTP/HTTPS URL
    if parsed_url.scheme not in ('http', 'https'):
        logger.debug(f"Skipping non-HTTP/HTTPS URL: {url}")
        return False

    # Check domain
    if not parsed_url.netloc or base_domain not in parsed_url.netloc:
        logger.debug(f"Skipping external URL: {url}")
        return False
        
    # Check file extension
    path = parsed_url.path
    if any(path.lower().endswith(ext) for ext in EXCLUDED_FILE_EXTENSIONS):
        logger.debug(f"Skipping URL with excluded file extension: {url}")
        return False
        
    return True

In [10]:
def normalize_url(url):
    """
    Normalize URL by removing trailing slashes, fragments, and common tracking query parameters.
    Keeps scheme and netloc case-insensitive as per RFC 3986.
    """
    parsed = urlparse(url)
    # Normalize scheme and netloc to lowercase
    normalized_scheme = parsed.scheme.lower()
    normalized_netloc = parsed.netloc.lower()
    
    # Reconstruct URL without fragment and query parameters (can be made more selective if needed)
    # For this specific site, removing all query params seems fine, but for others, you might want to be selective.
    normalized_path = parsed.path.rstrip('/') if parsed.path else ''
    if not normalized_path and (parsed.query or parsed.fragment): # if it was just "domain.com/"
        normalized_path = '/'

    # Ensure path starts with '/' if netloc is present
    if normalized_netloc and not normalized_path.startswith('/') and normalized_path:
        normalized_path = '/' + normalized_path
    elif not normalized_path and normalized_netloc: # Handle base URL like https://domain.com
        normalized_path = '/'


    # Reconstruct without fragment and query
    # For more general scraping, one might want to selectively keep some query parameters.
    # For this project, removing them simplifies duplicate detection.
    reconstructed_url = urlunparse((normalized_scheme, normalized_netloc, normalized_path, '', '', ''))
    
    return reconstructed_url

In [11]:
def get_page_content_and_type(url):
    """
    Fetch the content and content type of a page.
    Returns (content_text, content_type_main) or (None, None) on error.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise HTTPError for bad responses (4XX or 5XX)
        content_type = response.headers.get('Content-Type', '').lower()
        content_type_main = content_type.split(';')[0]
        logger.debug(f"Fetched {url} with Content-Type: {content_type}")
        return response.text, content_type_main
    except requests.exceptions.Timeout:
        logger.error(f"Timeout while fetching {url}")
    except requests.exceptions.HTTPError as e:
        logger.error(f"HTTP error fetching {url}: {e}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
    except Exception as e: # Catch any other unexpected errors
        logger.error(f"Unexpected error fetching {url}: {e}", exc_info=True)
    return None, None

In [12]:
def extract_links_from_html(html_content, current_url, base_domain):
    """
    Extract and filter relevant links from HTML content.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    extracted_links = set() # Use a set to store unique links from this page

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        
        if not href or href.startswith('#'): # Skip empty or fragment-only links
            logger.debug(f"Skipping empty or fragment link: {href}")
            continue
            
        full_url = urljoin(current_url, href)
        normalized_full_url = normalize_url(full_url)
        
        if is_relevant_link(normalized_full_url, base_domain) and is_allowed_by_robots(normalized_full_url):
            extracted_links.add(normalized_full_url)
        else:
            logger.debug(f"Skipping irrelevant or disallowed link: {normalized_full_url}")
            
    return list(extracted_links)

In [13]:
def discover_all_urls(start_url):
    """
    Discover all relevant and allowed URLs on the website starting from start_url.
    """
    base_domain = urlparse(start_url).netloc
    queue = [normalize_url(start_url)]
    discovered_urls = set(queue) # Initialize with the start URL
    processed_urls_count = 0

    logger.info(f"Starting URL discovery from: {start_url} (Domain: {base_domain})")

    while queue:
        current_url = queue.pop(0)
        processed_urls_count += 1
        logger.info(f"Processing {processed_urls_count}/{len(discovered_urls) + len(queue)}: {current_url}")

        html_content, content_type = get_page_content_and_type(current_url)

        if not html_content:
            logger.warning(f"No content fetched for {current_url}. Skipping link extraction.")
            continue
        
        if 'text/html' not in content_type:
            logger.info(f"Skipping link extraction for non-HTML content type ({content_type}) at {current_url}")
            continue

        try:
            links_on_page = extract_links_from_html(html_content, current_url, base_domain)
            new_links_found = 0
            for link in links_on_page:
                if link not in discovered_urls:
                    discovered_urls.add(link)
                    queue.append(link)
                    new_links_found += 1
            if new_links_found > 0:
                logger.info(f"Found {new_links_found} new links on {current_url}. Queue size: {len(queue)}")
            else:
                logger.debug(f"No new links found on {current_url}.")

        except Exception as e:
            logger.error(f"Error parsing or extracting links from {current_url}: {e}", exc_info=True)
        
        time.sleep(REQUEST_DELAY)
    
    logger.info(f"Discovery finished. Total unique relevant URLs found: {len(discovered_urls)}")
    return sorted(list(discovered_urls))

In [14]:
def save_discovered_links(links, filepath):
    """
    Save the discovered links to a JSON file.
    """
    data = {
        'discovery_date': datetime.now().isoformat(),
        'start_url': START_URL,
        'base_url_domain': urlparse(BASE_URL).netloc,
        'count': len(links),
        'links': links
    }
    
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(links)} links to {filepath}")
    except IOError as e:
        logger.error(f"Failed to save links to {filepath}: {e}")
    return filepath

In [15]:
logger.info("--- Starting Link Discovery Process ---")
discovery_start_time = time.time()

# Discover all URLs
all_discovered_urls = discover_all_urls(START_URL)

if all_discovered_urls:
    # Save the links
    save_discovered_links(all_discovered_urls, LINKS_FILE)
    
    # Preview the links
    logger.info(f"\n--- Link Discovery Summary ---")
    logger.info(f"Discovery date: {datetime.now().isoformat()}")
    logger.info(f"Total links found: {len(all_discovered_urls)}")
    logger.info("\nSample links (first 10):")
    for link in all_discovered_urls[:10]:
        logger.info(f"  - {link}")
    if len(all_discovered_urls) > 10:
        logger.info("  ...")
else:
    logger.warning("No URLs were discovered. Check logs for errors.")

discovery_end_time = time.time()
elapsed_time = discovery_end_time - discovery_start_time
logger.info(f"--- Link Discovery Completed in {elapsed_time:.2f} seconds ---")

2025-05-12 11:01:21,864 - INFO - 583492747 - <module> - --- Starting Link Discovery Process ---
2025-05-12 11:01:21,865 - INFO - 1551423106 - discover_all_urls - Starting URL discovery from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/ (Domain: datascience.uchicago.edu)
2025-05-12 11:01:21,866 - INFO - 1551423106 - discover_all_urls - Processing 1/1: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science
2025-05-12 11:01:22,800 - INFO - 1551423106 - discover_all_urls - Found 64 new links on https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science. Queue size: 64
2025-05-12 11:01:22,901 - INFO - 1551423106 - discover_all_urls - Processing 2/128: https://datascience.uchicago.edu/insights/federal-budget-cuts-threaten-to-decimate-americas-ai-superiority-and-other-countries-are-watching
2025-05-12 11:01:23,174 - INFO - 1551423106 - discover_all_urls - Found 3 new links on https://datasci