In [1]:
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


In [9]:
# Course-catalog page that is downloaded to discover the individual course links.
CATALOG_URL = "https://www.africa.engineering.cmu.edu/academics/course-catalog.html"
# Scheme + host of the site that CATALOG_URL lives on.
BASE_URL = "https://www.africa.engineering.cmu.edu"

def get_available_course_links(timeout=30):
    """Collect the absolute URLs of the courses listed under "Available courses".

    Downloads ``CATALOG_URL``, finds the ``<h2>`` whose text is
    "Available courses" (compared case-insensitively, ignoring surrounding
    whitespace), and reads the first ``<table>`` that follows it. For every
    row, the first ``<td>`` is checked for a link whose ``href`` ends in
    ``.html``; each such link is resolved against ``CATALOG_URL``.

    Args:
        timeout: Seconds to wait for the catalog request before giving up.

    Returns:
        list[str]: Absolute course-page URLs, in table order.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        Exception: If the header or the table after it cannot be found.
    """
    resp = requests.get(CATALOG_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and reporting a
    # misleading "could not find section" problem.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Compare on the rendered heading text so stray whitespace, casing, or
    # inline markup inside the <h2> does not defeat the lookup.
    available_header = next(
        (
            h2
            for h2 in soup.find_all("h2")
            if h2.get_text(" ", strip=True).lower() == "available courses"
        ),
        None,
    )
    if available_header is None:
        raise Exception("Could not find Available Courses section")

    # The course list is the first table that appears after the header.
    table = available_header.find_next("table")
    if table is None:
        raise Exception("Could not find the courses table after the header")

    links = []
    for row in table.find_all("tr"):
        first_td = row.find("td")
        if first_td is None:
            # Header rows (<th> only) and empty rows carry no course link.
            continue
        a_tag = first_td.find("a", href=True)
        if a_tag is None:
            continue
        href = a_tag["href"]
        if not href.endswith(".html"):
            continue
        # urljoin handles root-relative ("/x"), page-relative ("courses/x"),
        # parent-relative ("../x") and already-absolute hrefs uniformly.
        links.append(urljoin(CATALOG_URL, href))
    return links

In [21]:
def _split_course_heading(heading):
    """Split a heading like "04-330 Some Title" into ``(course_code, title)``.

    Runs of whitespace, including non-breaking spaces, are collapsed to a
    single space first. A heading that contains no space is treated as a bare
    title and the course code is returned as ``""``.
    """
    # str.split() with no argument also splits on "\xa0", so non-breaking
    # spaces become ordinary separators instead of being deleted.
    normalized = " ".join(heading.split())
    if " " not in normalized:
        return "", normalized
    course_code, title = normalized.split(" ", 1)
    return course_code, title


def _parse_info_block(text):
    """Extract ``(semester, units, location)`` from the info paragraph text.

    Each line is tested for the labels "Semester Offered", "Units" and
    "Location", in that order; the value is everything after the first colon
    on the line. Fields that never appear are returned as ``None``.
    """
    semester = units = location = None
    for line in text.splitlines():
        line = line.strip()
        _, colon, value = line.partition(":")
        if not colon:
            # Not a "label: value" line; indexing the split result here would
            # raise IndexError and discard the whole course.
            continue
        value = value.strip()
        if "Semester Offered" in line:
            semester = value
        elif "Units" in line:
            units = value
        elif "Location" in line:
            location = value
    return semester, units, location


def parse_course_page(url, timeout=30):
    """Download one course page and extract its fields into a dict.

    Args:
        url: Absolute URL of the course page.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        dict with keys ``url``, ``course_code``, ``title``, ``semester``,
        ``units``, ``location`` (the last three are ``None`` when absent),
        ``description``, ``objectives``, ``content`` (paragraph text joined by
        single spaces, ``""`` when absent), ``outcomes`` (list of ``<li>``
        texts) and ``prerequisites`` (list of comma-separated items).

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
    """
    resp = requests.get(url, timeout=timeout)
    # Without this, a 404/500 page would be stored as an all-empty record.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Join nested text nodes with a space so inline markup inside the <h1>
    # does not glue words together.
    h1 = soup.find("h1")
    title_raw = h1.get_text(" ", strip=True) if h1 is not None else ""
    course_code, title = _split_course_heading(title_raw)

    # Extract course info block
    info_block = soup.find("p", class_="mb-month")
    if info_block is not None:
        semester, units, location = _parse_info_block(info_block.get_text())
    else:
        semester = units = location = None

    # Heading keyword -> section name; the first keyword found in the heading
    # wins, so the order here is significant.
    section_keywords = (
        ("description", "description"),
        ("objective", "objectives"),
        ("outcome", "outcomes"),
        ("content", "content"),
        ("prerequisite", "prerequisites"),
    )
    # Free-text sections are gathered paragraph by paragraph and joined once.
    text_parts = {"description": [], "objectives": [], "content": []}
    outcomes = []
    prerequisites = []

    # Section detection: walk headings, paragraphs and lists in document
    # order, attributing each paragraph/list to the most recent <h2>.
    content_wrapper = soup.find("section", class_="innerSection")
    if content_wrapper is not None:
        current_section = None
        for tag in content_wrapper.find_all(["h2", "p", "ul"]):
            if tag.name == "h2":
                heading = tag.get_text(strip=True).lower()
                # Unrecognised headings reset the section so their paragraphs
                # are not attributed to the previous one.
                current_section = next(
                    (name for keyword, name in section_keywords if keyword in heading),
                    None,
                )
            elif tag.name == "p":
                text = tag.get_text(" ", strip=True)
                if current_section in text_parts:
                    if text:
                        text_parts[current_section].append(text)
                elif current_section == "prerequisites":
                    # Prerequisites are written as a comma-separated list.
                    prerequisites += [
                        item.strip() for item in text.split(",") if item.strip()
                    ]
            elif tag.name == "ul" and current_section == "outcomes":
                outcomes += [
                    li.get_text(" ", strip=True) for li in tag.find_all("li")
                ]

    return {
        "url": url,
        "course_code": course_code,
        "title": title,
        "semester": semester,
        "units": units,
        "location": location,
        "description": " ".join(text_parts["description"]),
        "objectives": " ".join(text_parts["objectives"]),
        "outcomes": outcomes,
        "content": " ".join(text_parts["content"]),
        "prerequisites": prerequisites,
    }


In [None]:
def main():
    """Scrape every available course and save the results as JSON.

    Fetches the course links, parses each page in turn, and writes the list
    of successfully parsed courses to ``cmuafrica_courses.json`` (UTF-8,
    indented, non-ASCII characters kept as-is). A page that fails to scrape is
    reported and skipped; it does not abort the run.
    """
    links = get_available_course_links()
    print(f"Found {len(links)} available courses.")

    all_courses = []
    failed_links = []
    for i, link in enumerate(links):
        print(f"[{i+1}/{len(links)}] Scraping {link}")
        try:
            all_courses.append(parse_course_page(link))
        except Exception as e:
            # Best effort: one broken page must not lose the rest of the run.
            failed_links.append(link)
            print(f"⚠️ Error scraping {link}: {e}")
        # Be nice to the server. The pause sits outside the try so it also
        # applies after a failure, and it is skipped once the last page is done.
        if i + 1 < len(links):
            time.sleep(1)

    with open("cmuafrica_courses.json", "w", encoding="utf-8") as f:
        json.dump(all_courses, f, indent=2, ensure_ascii=False)

    if failed_links:
        print(f"⚠️ {len(failed_links)} of {len(links)} pages could not be scraped.")
    print("✅ Scraping completed. Saved to cmuafrica_courses.json")

In [24]:
# Run the full scrape; progress is printed per page and the results are
# written to cmuafrica_courses.json.
main()

Found 105 available courses.
[1/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-330.html
[2/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-601.html
[3/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-603.html
[4/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-605.html
[5/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-606.html
[6/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-607.html
[7/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-608.html
[8/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-609.html
[9/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-611.html
[10/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-612.html
[11/105] Scraping https://www.africa.engineering.cmu.edu/academics/courses/04-613.html
[12/105] Scraping https

In [25]:
# Sanity check: read the saved file back and report how many course records it holds.
with open("cmuafrica_courses.json", mode="r", encoding="utf-8") as f:
    courses = json.loads(f.read())

print(f"Number of courses in cmuafrica_courses.json: {len(courses)}")


Number of courses in cmuafrica_courses.json: 105
