In [1]:
import os
import json
import re
import requests
import sys
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
from datetime import datetime
from urllib.parse import urljoin

sys.path.append('../src')
from old_program_scraper import get_soup, extract_program_details, clean_text

In [2]:
url = 'https://calendar.camosun.ca/preview_program.php?catoid=25&poid=3954'
soup = get_soup(url)

In [3]:
content_div = soup.select_one('.block_content') or soup.select_one('#gateway_container') or soup.select_one('div.main') or soup

In [4]:
content_div

<td class="block_content" colspan="2">
<table class="table_default">
<tr>
<td colspan="4">
<table class="table_default">
<tr>
<td>
<div class="help_block"><a class="help acalog-highlight-ignore" href="help.php" onclick="acalogPopup('help.php', 'help', 770, 530, 'yes');return false;" target="_blank"><strong>HELP</strong></a></div>
<span class="acalog_catalog_name">Academic Calendar 2025-26</span> <br/>
<h1 id="acalog-content">Athletic and Exercise Therapy (Bachelor’s Degree)</h1>
<div style="float: right"><a alt="Text Version" class="print_link acalog-highlight-ignore" href="/preview_program.php?catoid=25&amp;poid=3954&amp;print" onclick="acalogPopup('/preview_program.php?catoid=25&amp;poid=3954&amp;print', 'print_preview', 770, 530, 'yes');return false;" rel="nofollow" target="_blank" title="Print-Friendly Page (opens a new window)"><span class="sr-only">Print-Friendly Page (opens a new window)</span></a></div><div class="acalog-social-media-links float_right">
<div class="gateway-tool

In [5]:
description_table = content_div.select_one('.program_description').select_one('table')
description_table

<table border="0" cellpadding="1" cellspacing="1" style="width:100%">
<tbody>
<tr>
<td style="width:20%">Total Credits:</td>
<td style="width:80%">132</td>
</tr>
<tr>
<td>Credential:</td>
<td>Bachelor of Athletic and Exercise Therapy</td>
</tr>
<tr>
<td>Program Code:</td>
<td>AET.BDEG</td>
</tr>
<tr>
<td>CIP:</td>
<td>51.0913</td>
</tr>
</tbody>
</table>

In [6]:
rows = description_table.select('tr')
for row in rows:
    cells = row.select('td')
    if len(cells) >= 2:
        header = clean_text(cells[0].text).lower()
        value = clean_text(cells[1].text)
        print(f"{header}: {value}")

total credits:: 132
credential:: Bachelor of Athletic and Exercise Therapy
program code:: AET.BDEG
cip:: 51.0913


In [7]:
def extract_courses(url):
    """
    Extract all courses from a program curriculum page.
    
    Args:
        url (str): URL of the program page
        
    Returns:
        list: List of course information dictionaries with code, title, and credits if available
    """
    soup = get_soup(url)
    
    # Find the main content area
    content_div = soup.select_one('.block_content') or soup.select_one('#gateway_container') or soup.select_one('div.main') or soup
    
    # Initialize empty list to store course information
    courses = []
    
    # Step 1: Find the curriculum section
    curriculum_headings = ['Curriculum', 'Program Content', 'Courses', 'Required Courses']
    curriculum_section = None
    
    for heading in curriculum_headings:
        section = content_div.find(['h1', 'h2', 'h3', 'h4'], text=re.compile(f'{heading}', re.IGNORECASE))
        if section:
            curriculum_section = section
            break
    
    if not curriculum_section:
        print("No curriculum section found.")
        return courses
    
    # Step 2: Extract course information at various levels of nesting
    # This approach checks multiple levels of nesting until it finds courses
    
    # Variables to track elements we've checked
    checked_elements = set()
    
    # Function to extract course information from tables
    def extract_from_table(table):
        table_courses = []
        for row in table.select('tr'):
            cells = row.select('td')
            if len(cells) >= 2:
                # Check if first cell looks like a course code (usually format: ABCD 123)
                if re.match(r'[A-Z]{2,5}\s*\d{3,4}[A-Z]?', clean_text(cells[0].text), re.IGNORECASE):
                    course = {
                        'code': clean_text(cells[0].text),
                        'title': clean_text(cells[1].text),
                    }
                    # Credits might be in the third column
                    if len(cells) > 2:
                        credits_text = clean_text(cells[2].text)
                        # Try to extract just the number
                        credit_match = re.search(r'(\d+\.?\d*)', credits_text)
                        course['credits'] = credit_match.group(1) if credit_match else credits_text
                    
                    table_courses.append(course)
        return table_courses
    
    # Function to extract course information from lists
    def extract_from_list(list_elem):
        list_courses = []
        for li in list_elem.select('li'):
            text = clean_text(li.text)
            # Look for patterns like "COURSE 101 - Course Title (3 credits)"
            course_match = re.match(r'([A-Z]{2,5}\s*\d{3,4}[A-Z]?)\s*-?\s*(.*?)(?:\s*\((\d+\.?\d*)\s*credits?\))?$', text, re.IGNORECASE)
            if course_match:
                course = {
                    'code': course_match.group(1).strip(),
                    'title': course_match.group(2).strip(),
                    'credits': course_match.group(3) if course_match.group(3) else ''
                }
                list_courses.append(course)
        return list_courses
    
    # Function to search elements recursively for course information
    def search_for_courses(element, depth=0, max_depth=5):
        nonlocal courses
        
        if depth > max_depth or element is None or id(element) in checked_elements:
            return
        
        # Mark this element as checked
        checked_elements.add(id(element))
        
        # Check if element is a table
        if element.name == 'table':
            table_courses = extract_from_table(element)
            if table_courses:
                courses.extend(table_courses)
                return
        
        # Check if element is a list
        if element.name in ['ul', 'ol']:
            list_courses = extract_from_list(element)
            if list_courses:
                courses.extend(list_courses)
                return
        
        # Recursively check child elements
        for child in element.children:
            if hasattr(child, 'name') and child.name is not None:
                search_for_courses(child, depth + 1, max_depth)
    
    # Start searching from the curriculum section
    next_elem = curriculum_section.next_sibling
    
    # Check siblings until we find another heading or run out of siblings
    while next_elem:
        if hasattr(next_elem, 'name') and next_elem.name in ['h1', 'h2', 'h3', 'h4'] and not re.search('Year', next_elem.text, re.IGNORECASE):
            # Stop if we hit another main heading (that's not a Year heading)
            break
        
        # Only process element nodes (not string nodes)
        if hasattr(next_elem, 'name') and next_elem.name is not None:
            search_for_courses(next_elem, 0, 5)  # Maximum depth of 5 levels
        
        next_elem = next_elem.next_sibling
    
    # If no courses found with the first approach, try a different approach
    if not courses:
        # Try to find divs containing curriculum info after the curriculum heading
        section_div = curriculum_section.find_parent('div')
        if section_div:
            search_for_courses(section_div, 0, 5)
    
    # If still no courses, search more broadly for tables with course-like content
    if not courses:
        for table in content_div.select('table'):
            table_courses = extract_from_table(table)
            if table_courses:
                courses.extend(table_courses)
    
    return courses

In [8]:
extract_courses('https://calendar.camosun.ca/preview_program.php?catoid=25&poid=3954')

No curriculum section found.


  section = content_div.find(['h1', 'h2', 'h3', 'h4'], text=re.compile(f'{heading}', re.IGNORECASE))


[]

In [41]:
soup = get_soup('https://calendar.camosun.ca/preview_program.php?catoid=25&poid=3954')

# Find the "Curriculum" header (usually an h2 or h3 tag)
curriculum_header = None
for header_tag in ['h2', 'h3', 'h4']:
    headers = soup.find_all(header_tag)
    for header in headers:
        if 'curriculum' in header.get_text(strip=True).lower():
            curriculum_header = header
            break
    if curriculum_header:
        break

In [42]:
#courses_list = soup.select('ul .acalog-course')
courses_list = soup.select('.acalog-core ul li')
courses_list

[<li>C+ in <a aria-label="English 12 opens a new window" href="content.php?catoid=25&amp;navoid=2207" onclick="acalogPopup('content.php?catoid=25&amp;navoid=2207'+((location.search.match(/&amp;print/i)) ? '&amp;print' : '')+'', '16', 770, 530, 'yes');return false;" target="_blank">English 12</a><span style="display: none !important"> </span> </li>,
 <li>C in <a aria-label="English 12 Camosun Alternative opens a new window" href="content.php?catoid=25&amp;navoid=2230#course-alternatives" onclick="acalogPopup('content.php?catoid=25&amp;navoid=2230'+((location.search.match(/&amp;print/i)) ? '&amp;print' : '')+'#course-alternatives', '16', 770, 530, 'yes');return false;" target="_blank">English 12 Camosun Alternative</a><span style="display: none !important"> </span>   </li>,
 <li>C+ in <a aria-label="Anatomy and Physiology 12 opens a new window" href="content.php?catoid=25&amp;navoid=2210" onclick="acalogPopup('content.php?catoid=25&amp;navoid=2210'+((location.search.match(/&amp;print/i))

In [32]:
len(courses_list)

0

In [49]:
def get_program_courses(url):
    """
    Get all courses from a program page.
    
    Args:
        url (str): URL of the program page
        
    Returns:
        list: List of course information dictionaries with code, title, and credits if available
    """
    soup = get_soup(url)
    
    courses_list = soup.select('.acalog-core ul li')

    courses = []
    for element in courses_list:
    # Check if the element is a list item
        if element.name == 'li':
            # Extract the text content
            text = clean_text(element.text)
            # Look for patterns like "COURSE 101 - Course Title (3 credits)"
            course_match = re.match(r'([A-Z]{2,5}\s*\d{3,4}[A-Z]?)\s*-?\s*(.*?)(?:\s*\((\d+\.?\d*)\s*credits?\))?$', text, re.IGNORECASE)
            if course_match:
                course_code = course_match.group(1).strip()
                course_title = course_match.group(2).strip()
                courses.append(f'{course_code} - {course_title}')
    return courses

In [50]:
get_program_courses('https://calendar.camosun.ca/preview_program.php?catoid=25&poid=3954')

['BIOL 150 - Human Anatomy Credits: 3',
 'ENGL 151 - Academic Writing Strategies Credits: 3 *',
 'KIN 220 - Resistance Training & Group Exercise Instruction Credits: 3',
 'PHYS 160 - Biomechanics of Sport Credits: 3',
 'PSYC 160 - Sport & Exercise Psychology 1 Credits: 3',
 'AET 201 - Placement 1 Credits: 3',
 'AET 260 - Emergency Conditions 1 Credits: 3',
 'AET 272 - Field Prevention/Injury Care 1 Credits: 3',
 'BIOL 151 - Human Physiology Credits: 3',
 'KIN 230 - Behavioural Fitness Credits: 3',
 'AET 265 - Musculoskeletal Anatomy Credits: 3',
 'AET 320 - Human Motor Control Credits: 3',
 'KIN 210 - Exercise Physiology Credits: 3',
 'KIN 240 - Fitness & Health Assessment Credits: 3',
 'KIN 310 - Research Methods Credits: 3',
 'AET 202 - Placement 2 Credits: 3',
 'AET 310 - Pathophysiology Credits: 3',
 'CHEM 214 - Nutrition for Fitness Credits: 3',
 'KIN 241 - Exercise Prescription & Design Credits: 3',
 'KIN 341 - Training for Performance Credits: 3',
 'AET 261 - Emergency Condition

In [48]:
for element in courses_list:
    # Check if the element is a list item
    if element.name == 'li':
        # Extract the text content
        text = clean_text(element.text)
        # Look for patterns like "COURSE 101 - Course Title (3 credits)"
        course_match = re.match(r'([A-Z]{2,5}\s*\d{3,4}[A-Z]?)\s*-?\s*(.*?)(?:\s*\((\d+\.?\d*)\s*credits?\))?$', text, re.IGNORECASE)
        if course_match:
            course_code = course_match.group(1).strip()
            course_title = course_match.group(2).strip()
            print(f"Course Code: {course_code}, Title: {course_title}")

Course Code: BIOL 150, Title: Human Anatomy Credits: 3
Course Code: ENGL 151, Title: Academic Writing Strategies Credits: 3 *
Course Code: KIN 220, Title: Resistance Training & Group Exercise Instruction Credits: 3
Course Code: PHYS 160, Title: Biomechanics of Sport Credits: 3
Course Code: PSYC 160, Title: Sport & Exercise Psychology 1 Credits: 3
Course Code: AET 201, Title: Placement 1 Credits: 3
Course Code: AET 260, Title: Emergency Conditions 1 Credits: 3
Course Code: AET 272, Title: Field Prevention/Injury Care 1 Credits: 3
Course Code: BIOL 151, Title: Human Physiology Credits: 3
Course Code: KIN 230, Title: Behavioural Fitness Credits: 3
Course Code: AET 265, Title: Musculoskeletal Anatomy Credits: 3
Course Code: AET 320, Title: Human Motor Control Credits: 3
Course Code: KIN 210, Title: Exercise Physiology Credits: 3
Course Code: KIN 240, Title: Fitness & Health Assessment Credits: 3
Course Code: KIN 310, Title: Research Methods Credits: 3
Course Code: AET 202, Title: Placement

In [16]:
curriculum_header.find_parent('div').find_next_sibling('div')

<div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="YearOne"></a><a id="core_79678" name="yearone"></a>Year One</h3><hr/></div><div class="custom_leftpad_20"><div class="acalog-core"><h4><a name="AcademicTermOne"></a><a id="core_79680" name="academictermone"></a>Academic Term One</h4><hr/></div><div class="custom_leftpad_20"><div class="acalog-core"><h5><a name="AllOf"></a><a id="core_79679" name="allof"></a>All of:</h5><hr/><ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('25', '44733',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~79679~;}'); return false;">BIOL 150 - Human Anatomy</a> <strong>Credits:</strong> 3</span></li><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('25', '45228',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~79679~;}'); return false;">ENGL 151 - Academic Writing Strategies</a> <strong>Credits:</strong> 3 *</span></li><li class="acalog-course"><spa

In [None]:
import requests
from bs4 import BeautifulSoup
import re

def extract_courses_from_program(url):
    soup = get_soup(url)

    # Find the "Curriculum" header (usually an h2 or h3 tag)
    curriculum_header = None
    for header_tag in ['h2', 'h3', 'h4']:
        headers = soup.find_all(header_tag)
        for header in headers:
            if 'curriculum' in header.get_text(strip=True).lower():
                curriculum_header = header
                break
        if curriculum_header:
            break

    if not curriculum_header:
        raise Exception("Curriculum section not found.")

    # Get the next sibling that contains curriculum content
    # We'll collect all course codes that look like AAAA 000
    curriculum_content = []
    current = curriculum_header.find_next_sibling()
    while current:
        # Stop if we hit another section header
        if current.name in ['h2', 'h3', 'h4']:
            break

        text = current.get_text(separator=' ', strip=True)
        # Regex pattern for course codes (e.g., ENGL 151, MATH 100)
        courses = re.findall(r'\b[A-Z]{4}\s*\d{3}[A-Z]?\b', text)
        curriculum_content.extend(courses)

        current = current.find_next_sibling()

    # Remove duplicates and sort
    return sorted(set(curriculum_content))


In [10]:
extract_courses_from_program('https://calendar.camosun.ca/preview_program.php?catoid=25&poid=3954')

[]

## Explanation of Course Extraction Function

The `extract_courses` function is designed to be flexible and handle different HTML structures for program curriculum pages. It works through the following steps:

1. **Locate the Curriculum Section**: Identifies the section containing course information by searching for common headings like "Curriculum", "Program Content", etc.

2. **Recursive Parsing**: The function uses a recursive approach to handle different levels of nesting:
   - Level 2 nesting: Courses are 2 levels deep from the curriculum heading
   - Level 3 nesting: Courses are 3 levels deep
   - Level 4 nesting: Courses are 4 levels deep or more

3. **Multiple Extraction Methods**: The function can extract course information from:
   - HTML tables (most common format)
   - Unordered or ordered lists
   - Specifically formatted text

4. **Pattern Recognition**: Uses regular expressions to identify course codes and extract components like:
   - Course code (e.g., "ENGL 151")
   - Course title (e.g., "Academic Writing Strategies")
   - Credit values when available

5. **Fallback Mechanisms**: If the primary approach doesn't find any courses, it tries additional strategies:
   - Looking at parent containers of the curriculum heading
   - Scanning all tables in the document for course-like content
   
This approach ensures that courses can be extracted regardless of the HTML structure, which varies between different programs and credential types.