# UChicago MS in Applied Data Science Website - Content Processor

This notebook handles Phase 2 of the scraping process: processing the previously discovered links and converting their content to Markdown format.

In [1]:
# Import required libraries
import os
import re
import json
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urlparse
import time
import logging
import pandas as pd
from datetime import datetime

In [2]:
# --- Configuration ---------------------------------------------------------
# Root URL of the site whose pages are being processed.
BASE_URL = "https://datascience.uchicago.edu/"

# Pause between consecutive requests, in seconds.
DELAY = 1

# Directory layout, relative to the notebook's working directory.
DATA_DIR = "../data"
LOGS_DIR = "../logs"
MARKDOWN_DIR = os.path.join(DATA_DIR, "markdown")

# JSON files: the discovered-links input and the processing-status output.
LINKS_FILE = os.path.join(DATA_DIR, "discovered_links.json")
PROCESSED_FILE = os.path.join(DATA_DIR, "processed_links.json")

# Make sure every output directory exists before anything is written to it.
for _output_dir in (MARKDOWN_DIR, LOGS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [3]:
# Setup logging
# Each run of this cell gets its own timestamped log file inside LOGS_DIR.
log_file = os.path.join(LOGS_DIR, f"content_processor_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")

logger = logging.getLogger()

# logging.basicConfig() is a no-op when the root logger already has handlers,
# which is exactly the situation when this cell is re-run in a live kernel:
# `log_file` would name a file that nothing ever writes to. Drop (and close)
# any handlers from a previous run so the configuration below takes effect.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so log lines with non-ASCII URLs or paths are not
        # subject to the platform's default encoding.
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()  # Also log to console
    ]
)

In [4]:
def load_discovered_links():
    """
    Read the list of links stored in LINKS_FILE.

    Returns:
        The value stored under the ``links`` key of the JSON document, or an
        empty list (after logging an error) when LINKS_FILE does not exist.
    """
    if os.path.exists(LINKS_FILE):
        with open(LINKS_FILE, 'r') as links_fh:
            discovered = json.load(links_fh)

        # The document is expected to carry both 'links' and 'discovery_date'.
        logger.info(f"Loaded {len(discovered['links'])} links discovered on {discovered['discovery_date']}")
        return discovered['links']

    logger.error(f"Links file not found: {LINKS_FILE}")
    return []

In [5]:
def get_page_content(url):
    """
    Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or None when the request fails for any reason
        (including a non-success HTTP status); the failure is logged.
    """
    # A browser-like User-Agent is sent with every request.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }

    try:
        reply = requests.get(url, headers=browser_headers, timeout=10)
        reply.raise_for_status()
        page_text = reply.text
    except Exception as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

    return page_text

In [6]:
def get_title(soup):
    """
    Extract a page title from a BeautifulSoup object.

    The ``<title>`` string is preferred, then the text of the first ``<h1>``.
    A candidate that is empty after stripping whitespace is skipped, so a
    blank ``<title>`` falls through to the ``<h1>`` instead of producing an
    empty title.

    Args:
        soup: Parsed document exposing ``.title`` and ``.find('h1')``.

    Returns:
        The stripped title text, or "No Title" when neither source has any.
    """
    title_tag = soup.title
    if title_tag and title_tag.string:
        title_text = title_tag.string.strip()
        if title_text:
            return title_text

    # Fallback to h1
    h1 = soup.find('h1')
    if h1:
        heading_text = h1.get_text().strip()
        if heading_text:
            return heading_text

    return "No Title"

In [7]:
def get_main_content(soup):
    """
    Pick the element that most likely holds a page's main content.

    Selectors are tried in priority order and the first match wins. Each
    lookup runs only if all earlier ones found nothing, so a page with a
    ``<main>`` element costs a single search rather than seven.

    Args:
        soup: Parsed document exposing ``.find(...)`` and ``.body``.

    Returns:
        The first matching element; otherwise ``soup.body`` when present,
        otherwise ``soup`` itself.
    """
    # Common HTML5 containers first, then class/id based fallbacks.
    selectors = (
        ('main',),
        ('article',),
        ('div', {'class': 'entry-content'}),
        ('div', {'class': 'content'}),
        ('div', {'id': 'content'}),
        ('div', {'class': 'container'}),
        ('div', {'class': re.compile(r'main-content')}),
    )

    for find_args in selectors:
        candidate = soup.find(*find_args)
        if candidate:
            return candidate

    # If no candidates found, return the body
    return soup.body if soup.body else soup

In [8]:
def extract_category(url):
    """
    Derive a category name from a URL for better organization.

    The category is the first segment of the URL path. A URL whose path is
    empty (or consists only of slashes) is the homepage.

    Args:
        url: Absolute or relative URL string.

    Returns:
        'homepage' for an empty path, otherwise the first path segment.
    """
    path = urlparse(url).path.strip('/')

    # If the path is empty, it's the homepage
    if not path:
        return 'homepage'

    # `path` is non-empty and has no leading slash here, so the first segment
    # always exists and is non-empty; no further fallback is needed.
    return path.split('/', 1)[0]

In [9]:
def _table_row_to_markdown(cells):
    """Render a list of <th>/<td> tags as a single Markdown table row."""
    texts = []
    for cell in cells:
        # Collapse all whitespace (a raw newline would split the row) and
        # escape pipes so cell text is not read as a column separator.
        text = ' '.join(cell.get_text().split())
        texts.append(text.replace('|', '\\|'))
    return '| ' + ' | '.join(texts) + ' |'


def html_to_markdown(html_content, url, preserve_tables=True, extract_img_alt=True):
    """
    Convert HTML to Markdown, with special handling for tables and images.

    Args:
        html_content: HTML source to convert.
        url: URL the HTML came from. Not used by the conversion; kept so
            existing callers keep working.
        preserve_tables: When true, each <table> is rewritten as a pipe table
            before conversion, and <div> elements whose class contains
            "table" or "grid" are preceded by a marker comment.
        extract_img_alt: When true, an "[Image: <alt>]" paragraph is added
            after every image that has non-empty alt text.

    Returns:
        The Markdown text. If a breadcrumb element with links is found, a
        "Breadcrumb: a > b > c" line is prepended.
    """
    options = {
        'strip': ['script', 'style'],
        'heading_style': 'atx',
        'bullets': '*',
        'code_language': 'python'
    }

    soup = BeautifulSoup(html_content, 'html.parser')

    # markdownify's `strip` option drops only the tag markup and keeps the
    # enclosed text, so JavaScript/CSS source would end up in the Markdown.
    # Remove these elements, contents included, before converting.
    for non_content in soup.find_all(['script', 'style']):
        non_content.decompose()

    if extract_img_alt:
        # Extract alt text from images to preserve information
        for img in soup.find_all('img'):
            alt_text = img.get('alt', '').strip()
            if alt_text:
                # Create a text node with image description
                img_desc = soup.new_tag('p')
                img_desc.string = f"[Image: {alt_text}]"
                img.insert_after(img_desc)

    if preserve_tables:
        # Process all tables
        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if not rows:
                continue

            markdown_table = []

            # The first row is treated as the header row.
            header_cells = rows[0].find_all(['th', 'td'])
            if header_cells:
                markdown_table.append(_table_row_to_markdown(header_cells))
                markdown_table.append('|' + '---|' * len(header_cells))

            # Data rows. <th> cells are included as well: row-header cells
            # would otherwise be dropped and shift every column to the left.
            for row in rows[1:] if header_cells else rows:
                cells = row.find_all(['th', 'td'])
                if cells:
                    markdown_table.append(_table_row_to_markdown(cells))

            # Replace the table with our markdown version
            if markdown_table:
                new_tag = soup.new_tag('div')
                new_tag.string = '\n'.join(markdown_table)
                table.replace_with(new_tag)

        # Also look for div-based tables (common in modern sites)
        # This is a simplified approach - might need customization for specific layouts
        for div in soup.find_all('div', class_=re.compile(r'(table|grid)')):
            # Add a comment to mark this as a table-like structure
            comment = soup.new_string('\n\n<!-- Table-like structure detected -->\n\n')
            div.insert_before(comment)

    # Extract breadcrumb navigation if present: the first breadcrumb element
    # that contains links supplies the trail.
    breadcrumb_text = ''
    for bc in soup.find_all(['nav', 'div'], class_=re.compile(r'(breadcrumb)')):
        links = bc.find_all('a')
        if links:
            breadcrumb_text = ' > '.join(link.get_text().strip() for link in links)
            break

    # Convert to markdown
    markdown = md(str(soup), **options)

    # Clean up the markdown
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)  # Remove excess newlines
    # Remove empty links. The optional leading "!" also removes images with
    # no alt text ("![](src)"), which would otherwise leave a stray "!".
    markdown = re.sub(r'!?\[\s*\]\(([^)]+)\)', '', markdown)

    # Add breadcrumbs metadata if found
    if breadcrumb_text:
        breadcrumb_meta = f"\n\nBreadcrumb: {breadcrumb_text}\n\n"
        markdown = breadcrumb_meta + markdown

    return markdown

In [10]:
def save_markdown(title, content, url, category):
    """
    Save markdown content to a file with useful metadata.

    The file name is derived from the URL path ('/' becomes '_', an empty
    path becomes 'index') and the file is written to MARKDOWN_DIR with a
    front-matter header holding the title, original URL, category and
    today's date.

    Args:
        title: Page title for the front matter.
        content: Markdown body.
        url: URL the page was fetched from.
        category: Category label for the front matter.

    Returns:
        Path of the file that was written.
    """
    # Create a valid filename from the URL
    parsed_url = urlparse(url)
    path = parsed_url.path.strip('/').replace('/', '_')

    # Replace characters that are not allowed in Windows file names (and
    # control characters) so a path such as "events/a:b" cannot make the
    # write fail or land under an unintended name.
    path = re.sub(r'[<>:"\\|?*\x00-\x1f]', '_', path)

    if not path:
        path = 'index'

    filename = f"{path}.md"
    filepath = os.path.join(MARKDOWN_DIR, filename)

    # The front matter is line-based ("key: value" per line), so a title with
    # embedded newlines would corrupt it; collapse all whitespace runs.
    clean_title = ' '.join(str(title).split())

    # Add frontmatter with metadata
    frontmatter = f"---\ntitle: {clean_title}\noriginal_url: {url}\ncategory: {category}\ndate: {datetime.now().strftime('%Y-%m-%d')}\n---\n\n"

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(frontmatter + content)

    logger.info(f"Saved {filepath}")
    return filepath

In [11]:
def save_processing_status(processed_urls, failed_urls):
    """
    Write a snapshot of the run's progress to PROCESSED_FILE as JSON.

    Args:
        processed_urls: URLs that were processed successfully.
        failed_urls: URLs that could not be fetched or processed.

    Returns:
        PROCESSED_FILE, the path that was written.
    """
    snapshot = {
        'processing_date': datetime.now().isoformat(),
        'processed_count': len(processed_urls),
        'failed_count': len(failed_urls),
        'processed_urls': processed_urls,
        'failed_urls': failed_urls
    }

    # The file is rewritten in full on every call.
    with open(PROCESSED_FILE, 'w') as status_fh:
        json.dump(snapshot, status_fh, indent=2)

    logger.info(f"Saved processing status to {PROCESSED_FILE}")
    return PROCESSED_FILE

In [12]:
def _convert_page(url, html_content):
    """Parse one fetched page, convert it to Markdown, save it and return the file path."""
    soup = BeautifulSoup(html_content, 'html.parser')
    title = get_title(soup)
    category = extract_category(url)
    main_content = get_main_content(soup)
    markdown_content = html_to_markdown(str(main_content), url)
    return save_markdown(title, markdown_content, url, category)


def process_content(checkpoint_every=10):
    """
    Process the content of the discovered links.

    Every link is fetched, converted to Markdown and saved. Links that cannot
    be fetched or converted are recorded as failed and processing continues.

    Args:
        checkpoint_every: Save the processing status after every this many
            links, counting attempts whether they succeeded or failed.
            A falsy value disables the periodic save; the final save always
            happens.

    Returns:
        None when there are no links to process; otherwise a dict with
        'processed_count', 'failed_count' and 'url_to_file' (URL -> path of
        the saved Markdown file).
    """
    # Load discovered links
    links = load_discovered_links()
    if not links:
        logger.error("No links to process")
        return None

    processed_urls = []
    failed_urls = []
    url_to_file = {}

    total_links = len(links)
    logger.info(f"Starting to process {total_links} links")

    for i, url in enumerate(links):
        logger.info(f"Processing {i+1}/{total_links}: {url}")

        # Fetch the page content
        html_content = get_page_content(url)
        if not html_content:
            logger.warning(f"Failed to get content for {url}")
            failed_urls.append(url)
        else:
            try:
                url_to_file[url] = _convert_page(url, html_content)
                processed_urls.append(url)
            except Exception as e:
                logger.error(f"Error processing {url}: {e}")
                failed_urls.append(url)

        # Checkpoint and delay run for every link, successful or not. A
        # failed fetch must still be followed by the pause (failures are when
        # backing off matters most) and must not cause a checkpoint to be
        # skipped.
        if checkpoint_every and (i + 1) % checkpoint_every == 0:
            save_processing_status(processed_urls, failed_urls)

        # Sleep to avoid overloading the server; nothing follows the last link.
        if i + 1 < total_links:
            time.sleep(DELAY)

    # Save final processing status
    save_processing_status(processed_urls, failed_urls)

    return {
        'processed_count': len(processed_urls),
        'failed_count': len(failed_urls),
        'url_to_file': url_to_file
    }

In [13]:
# Run the whole content-processing pass and report how long it took
logger.info("Starting content processing...")

start_time = time.time()
results = process_content()
end_time = time.time()

elapsed_time = end_time - start_time
logger.info(f"Content processing completed in {elapsed_time:.2f} seconds")

# `results` is falsy when process_content() had nothing to process;
# in that case there are no counts to report.
if results:
    logger.info(f"Processed: {results['processed_count']} pages")
    logger.info(f"Failed: {results['failed_count']} pages")

2025-05-04 17:48:17,483 - INFO - Starting content processing...
2025-05-04 17:48:17,485 - INFO - Loaded 1414 links discovered on 2025-05-04T16:53:47.334212
2025-05-04 17:48:17,486 - INFO - Starting to process 1414 links
2025-05-04 17:48:17,487 - INFO - Processing 1/1414: https://datascience.uchicago.edu/research/ai-science/people
2025-05-04 17:48:18,575 - INFO - Saved ../data\markdown\research_ai-science_people.md
2025-05-04 17:48:19,576 - INFO - Processing 2/1414: https://datascience.uchicago.edu/people/rajvi-shah
2025-05-04 17:48:20,483 - INFO - Saved ../data\markdown\people_rajvi-shah.md
2025-05-04 17:48:21,486 - INFO - Processing 3/1414: https://datascience.uchicago.edu/people/olivia-morkved
2025-05-04 17:48:22,416 - INFO - Saved ../data\markdown\people_olivia-morkved.md
2025-05-04 17:48:23,418 - INFO - Processing 4/1414: https://datascience.uchicago.edu/people/mengzhan-jhan-liufu
2025-05-04 17:48:24,430 - INFO - Saved ../data\markdown\people_mengzhan-jhan-liufu.md
2025-05-04 17:48

In [14]:
# Create a summary of the processed pages by category
if os.path.exists(PROCESSED_FILE):
    with open(PROCESSED_FILE, 'r') as f:
        processed_data = json.load(f)

    # Get all markdown files
    markdown_files = os.listdir(MARKDOWN_DIR)

    # Match the front-matter "category:" line only. The pattern is anchored
    # to the start of a line so that "category:" appearing inside the title
    # or URL line is not mistaken for the category field.
    category_pattern = re.compile(r'^category:[ \t]*([^\n]+)', re.MULTILINE)

    # Extract categories from frontmatter: category -> number of files
    categories = {}
    for md_file in markdown_files:
        if not md_file.endswith('.md'):
            continue
        filepath = os.path.join(MARKDOWN_DIR, md_file)
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        match = category_pattern.search(content)
        if match:
            category = match.group(1).strip()
            categories[category] = categories.get(category, 0) + 1

    # Create a DataFrame, largest category first
    category_df = pd.DataFrame({
        'Category': list(categories.keys()),
        'Count': list(categories.values())
    }).sort_values('Count', ascending=False)

    display(category_df)

    # Save summary to CSV
    summary_path = os.path.join(DATA_DIR, 'category_summary.csv')
    category_df.to_csv(summary_path, index=False)
    print(f"\nCategory summary saved to {summary_path}")

Unnamed: 0,Category,Count
30,people,637
27,news,177
17,events,165
33,research,84
22,insights,60
15,education,33
29,outreach,28
2,about,8
16,engage,6
13,curriculum,4



Category summary saved to ../data\category_summary.csv


In [15]:
# Analysis of the content (word count, etc.)
def count_words_in_file(filepath):
    """
    Return how many words a markdown file contains, ignoring its frontmatter.

    A word is any run of ``\\w`` characters. A leading block delimited by
    ``---`` lines at the very start of the file is dropped before counting.
    """
    with open(filepath, 'r', encoding='utf-8') as md_fh:
        text = md_fh.read()

    # Strip the leading frontmatter block, if the file starts with one
    body = re.sub(r'^---\n.*?\n---\n', '', text, flags=re.DOTALL)

    # Tally the word-like tokens one match at a time
    return sum(1 for _ in re.finditer(r'\w+', body))

In [16]:
# Calculate word counts for each file
markdown_files = [f for f in os.listdir(MARKDOWN_DIR) if f.endswith('.md')]
word_counts = []

# Front-matter fields to extract, keyed by output column. Each pattern is
# anchored to the start of a line so a field name appearing inside another
# field's value is not matched, and uses [ \t]* (not \s*) so an empty value
# cannot swallow the following line.
frontmatter_patterns = {
    'Title': re.compile(r'^title:[ \t]*([^\n]*)', re.MULTILINE),
    'URL': re.compile(r'^original_url:[ \t]*([^\n]*)', re.MULTILINE),
    'Category': re.compile(r'^category:[ \t]*([^\n]*)', re.MULTILINE),
}

for md_file in markdown_files:
    filepath = os.path.join(MARKDOWN_DIR, md_file)
    count = count_words_in_file(filepath)

    # Extract URL, title and category from frontmatter
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    record = {'Filename': md_file}
    for column, pattern in frontmatter_patterns.items():
        match = pattern.search(content)
        record[column] = match.group(1).strip() if match else ""
    record['Word Count'] = count
    word_counts.append(record)

# Explicit columns keep the frame well-formed (and sortable) even when the
# markdown directory contains no files.
word_count_df = pd.DataFrame(
    word_counts,
    columns=['Filename', 'Title', 'URL', 'Category', 'Word Count']
)
word_count_df = word_count_df.sort_values('Word Count', ascending=False)

# Show top 10 pages by word count
print("Top 10 pages by word count:")
display(word_count_df.head(10))

# Save word count data
word_count_path = os.path.join(DATA_DIR, 'word_count_summary.csv')
word_count_df.to_csv(word_count_path, index=False)
print(f"\nWord count summary saved to {word_count_path}")

Top 10 pages by word count:


Unnamed: 0,Filename,Title,URL,Category,Word Count
1213,research_postdoctoral-programs_rising-stars_20...,2021 Rising Stars – DSI,https://datascience.uchicago.edu/research/post...,research,21991
1229,rising-stars.md,2021 Rising Stars – DSI,https://datascience.uchicago.edu/rising-stars,rising-stars,21991
1211,research_postdoctoral-programs_rising-stars-in...,2023 Rising Stars – DSI,https://datascience.uchicago.edu/research/post...,research,11908
1214,research_postdoctoral-programs_rising-stars_20...,2022 Rising Stars – DSI,https://datascience.uchicago.edu/research/post...,research,11371
41,education_masters-programs_ms-in-applied-data-...,"Faculty, Instructors, Staff – DSI",https://datascience.uchicago.edu/education/mas...,education,9993
1,about-us.md,"Faculty, Instructors, Staff – DSI",https://datascience.uchicago.edu/about-us,about-us,9993
1233,the-university-of-chicago-and-caltech-conferen...,The University of Chicago and Caltech Conferen...,https://datascience.uchicago.edu/the-universit...,the-university-of-chicago-and-caltech-conferen...,8723
50,education_summerlab_2020-cohort.md,2020 Project Profiles – DSI,https://datascience.uchicago.edu/education/sum...,education,6901
72,events_ai-science-summer-school-2024.md,AI+Science Summer School 2024 – DSI,https://datascience.uchicago.edu/events/ai-sci...,events,6607
51,education_summerlab_2021-cohort.md,2021 Project Profiles – DSI,https://datascience.uchicago.edu/education/sum...,education,5727



Word count summary saved to ../data\word_count_summary.csv
