# Scraping the UCSD course catalog
The UCSD course catalog contains 6,000+ course descriptions across 80+ pages. The HTML is messy, inconsistent, and full of surprises, so this was lots of fun.

First, I get the links to the individual program pages (e.g. CSE, LANG, MATH) from the "cover" of the course catalog. I just grab every link whose `href` starts with `../courses`.

In [28]:
import requests
import bs4
from bs4 import BeautifulSoup
import json
from collections import defaultdict
from tabulate import tabulate

# Fetch the catalog "cover" page. Fail loudly on an HTTP error (or a hung
# connection) instead of silently parsing an error page that has no course links.
page = requests.get('https://catalog.ucsd.edu/front/courses.html', timeout=30)
page.raise_for_status()
cover = BeautifulSoup(page.text, 'html.parser')

# Keep only anchors whose href is relative to the cover page and points under ../courses
links = cover.find_all('a', href=lambda href: href and href.startswith('../courses'))

# Drop the leading '..' so the remaining '/courses...' path can be appended to the host
urls = ['https://catalog.ucsd.edu' + link['href'][2:] for link in links]

def get_dept(url):
    """Return the upper-cased program code encoded in a catalog page URL.

    For 'https://catalog.ucsd.edu/courses/CSE.html' this returns 'CSE'.

    The code is taken from whatever follows '/courses/' (minus a trailing
    '.html') rather than from fixed character offsets, so the result does not
    depend on the exact scheme or host in front of the path.
    """
    _, marker, tail = url.partition('/courses/')
    if not marker:
        # No '/courses/' segment: fall back to the last path component.
        tail = url.rsplit('/', 1)[-1]
    if tail.endswith('.html'):
        tail = tail[:-len('.html')]
    return tail.upper()

# Save the program-page URLs, one per line.
with open('urls.txt', 'w') as url_file:
    for url in urls:
        print(url, file=url_file)

In [5]:
# fetch all the pages (the one-connection-per-request version took like 20-30 seconds)
# A shared Session reuses the connection to the catalog host between requests,
# and raise_for_status() stops the run on a failed download instead of
# parsing an error page as if it were a program page.
with requests.Session() as session:
    pages = []
    for url in urls:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        pages.append(BeautifulSoup(response.content, 'html.parser'))

I used the code below to tabulate every combination of tag name and classes, sorted from least to most common. It helped me find all sorts of oddities and edge cases.

In [9]:
def key(tag):
    """Return a hashable (tag name, classes tuple) pair identifying the tag's kind.

    A missing or empty 'class' attribute yields an empty tuple.
    """
    class_names = tag.attrs.get('class')
    if not class_names:
        class_names = []
    return (tag.name, tuple(class_names))

# Tally every (tag name, classes) combination found among the direct children
# of each page's content div, remembering which pages each one appeared on.
tag_types = {}  # key(tag) -> [occurrence count, set of page URLs]

for url, page in zip(urls, pages):
    content = page.find(class_="col-md-12 blank-slate").children
    for tag in content:
        # children that are not elements (e.g. text nodes) have no tag name/classes
        if not isinstance(tag, bs4.element.Tag):
            continue

        entry = tag_types.setdefault(key(tag), [0, set()])
        entry[0] += 1
        entry[1].add(url)

# One row per combination. The URLs are sorted so the report is identical from
# run to run; iterating the set directly gives no guaranteed order.
data = [[v[0], k[0], ' '.join(k[1]), ' '.join(sorted(v[1]))] for k, v in tag_types.items()]
data.sort()  # ascending by count, so the rarest combinations come first

# Open the report only once there is something to write.
with open('edge-cases.txt', 'w') as f:
    f.write(tabulate(data, headers=["Count", "Tag", "Classes", "URLs"]))

    

The HTML is messy, but at least every page is structured similarly: all of the info I care about is in a `div` with class `"col-md-12 blank-slate"`. After that, I go off of the tag names and classes of that `div`'s direct children.

In [22]:
# CSS classes that are always skipped (nothing I care about lives in them)
ignored_classes = {
    'course-head',
    'basic-offset-top-only',
    'faculty-staff-subhead',
    'note',
    'course-note',
    'alphabreak',
    'sectionNav',
    'program-contact-info',
    'anchor-parent',
    'courseFacLink',
    'course-list-overview',
    'course-prerequisite-paragraph',
    'course-list-courses',
    'course-disclaimer',
}

# tag names that are always skipped
ignored_tag_names = {'ul', 'table', 'a'}

def get_class(tag: bs4.element.Tag) -> str:
    """Return the tag's single CSS class, or '' when it has none.

    Both 'course-description' and 'course-descriptions' are reported as
    'desc' (the singular form is a typo of the plural in the catalog HTML).

    Raises:
        Exception: if the tag carries more than one class.
    """
    class_list = tag.attrs.get('class')
    if class_list:
        if len(class_list) > 1:
            raise Exception('multiple classes')

        only_class = class_list[0]
        # collapse the typo and the correct spelling into one short name
        # (both end up ignored anyway)
        is_description = only_class in ('course-description', 'course-descriptions')
        return 'desc' if is_description else only_class

    return ''

def ignored(tag):
    """Return True if the tag should be skipped, based on its class or its tag name."""
    # The class is checked first, so get_class() still raises on a multi-class tag
    # even when its tag name alone would have been enough to ignore it.
    if get_class(tag) in ignored_classes:
        return True
    return tag.name in ignored_tag_names

def get_tags(page: BeautifulSoup) -> list[bs4.element.Tag]:
    """Return the direct child elements of the page's content div that are not ignored.

    The content div is the element with class "col-md-12 blank-slate".
    Children that are not elements (e.g. text nodes) are dropped, as are tags
    rejected by ignored().

    Raises:
        AttributeError: if the page has no such div (find() returned None).
        Exception: propagated from get_class() when a child has multiple classes.
    """
    container = page.find(class_="col-md-12 blank-slate")
    # isinstance, matching the check used in the edge-case survey cell,
    # instead of an exact type() comparison
    return [
        tag for tag in container.children
        if isinstance(tag, bs4.element.Tag) and not ignored(tag)
    ]

Now, I go through the tags I might care about (`get_tags(page)`) and extract just the courses & categories for now - we'll process each course more afterwards. The most important thing I rely on is that each course title is a paragraph with class "course-name". I use the headers to give the courses categories. I write all the content into a file in the form of JSON so I can manually fix typos/formatting and add/remove content. My JSON will be a list of the 82 pages, each of which is a list of courses (title and description) and headers (h2-h4).

In [29]:
def get_clean_data(page: BeautifulSoup):
    """Extract the headers and courses of one catalog page, in page order.

    Returns a list of dicts, each one of:
        {'type': <heading tag name>, 'content': <heading text>}
        {'type': 'course', 'title': <title text>, 'desc': <description text>}

    Output starts at the first course title, or at the first heading that
    follows a tag whose text is exactly 'Courses', whichever comes first.
    A course description is the text of every tag after its title, joined by
    newlines, up to the next heading or course title.
    """
    heading_names = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def is_header(tag: bs4.element.Tag):
        # Match real heading elements only. Testing just the first letter of
        # the name would also treat <hr> or <header> as a heading.
        return tag.name in heading_names

    def is_title(tag: bs4.element.Tag):
        return get_class(tag) == 'course-name'

    seen_courses_header = False
    started = False

    # instead of relying on any other classes/tags, just add all text to the course
    # description unless it is between an h tag and a <p class="course-name">
    adding_text = False

    ret = []  # list of courses and headers in JSON format
    for tag in get_tags(page):
        if not started:
            # start at either the first course-name or the first header AFTER "Courses"
            if tag.text == 'Courses':
                seen_courses_header = True
                continue
            elif is_title(tag):
                started = True
            elif is_header(tag) and seen_courses_header:
                started = True
            else:
                continue

        if is_header(tag):
            ret.append({
                'type': tag.name,
                'content': tag.text
            })
            adding_text = False  # don't add text between headers and course-names
        elif is_title(tag):
            adding_text = True
            ret.append({
                'type': 'course',
                'title': tag.text,
                'desc': ''
            })
        elif adding_text:
            # adding_text is only True right after a course was appended,
            # so ret[-1] is always that course here
            if ret[-1]['desc'] != '':
                ret[-1]['desc'] += '\n'
            ret[-1]['desc'] += tag.text

    return ret

# One record per program page: its URL, its department code, and its cleaned content.
pages_data = []
for url, page in zip(urls, pages):
    record = {
        'url': url,
        'dept': get_dept(url),
        'content': get_clean_data(page),
    }
    pages_data.append(record)

# Dump everything to disk so it can be fixed up by hand.
with open('pages-data.json', 'w') as out:
    out.write(json.dumps(pages_data))



Now, I get the subject codes (e.g. CSE, LANG) and the major codes (e.g. CS27, UNHA) from their respective pages. I don't worry about minor codes because the catalog doesn't really use them.

In [None]:
# major codes
# Fail loudly on an HTTP error rather than parsing an error page into an empty table.
response = requests.get('https://blink.ucsd.edu/instructors/academic-info/majors/major-codes.html', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

major_codes = {}  # text of the second-to-last cell -> text of the last cell
for row in soup.find_all('tr'):
    cells = row.find_all('td')
    # Need at least two <td> cells (rows with fewer are skipped), and the
    # key cell must hold at least 3 characters - presumably to skip blank or
    # filler rows; TODO confirm against the page.
    if len(cells) < 2 or len(cells[-2].text) < 3:
        continue
    # With two or more cells, indices -2 and -1 always exist, so no IndexError
    # handling is needed here.
    major_codes[cells[-2].text] = cells[-1].text


# subject codes
# Fail loudly on an HTTP error rather than parsing an error page into an empty table.
response = requests.get('https://blink.ucsd.edu/instructors/courses/schedule-of-classes/subject-codes.html', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

subject_codes = {}  # text of the first cell -> text of the second cell
for row in soup.find_all('tr'):
    cells = row.find_all('td')
    # rows with fewer than two <td> cells carry no code/name pair
    if len(cells) < 2:
        continue
    subject_codes[cells[0].text] = cells[1].text

# print(tabulate([[k, v] for k, v in major_codes.items()], headers=['Code', 'Major']))
# print(tabulate([[k, v] for k, v in subject_codes.items()], headers=['Subject', 'Major']))
