# Scraping the UCSD course catalog

The UCSD course catalog contains 6,000+ course descriptions across 80+ pages. The HTML is messy, inconsistent, and full of surprises, so this was lots of fun.


First, I get the links to the individual program pages (e.g. CSE, LANG, MATH) from the "cover" of the course catalog. I just get every link named "courses".


In [32]:
import requests
import bs4
from bs4 import BeautifulSoup
import json
from collections import defaultdict
from tabulate import tabulate
import re

page = requests.get("https://catalog.ucsd.edu/front/courses.html")
cover = BeautifulSoup(page.text, "html.parser")
links = cover.find_all("a", href=lambda href: href and href.startswith("../courses"))
urls = ["https://catalog.ucsd.edu" + link["href"][2:] for link in links]


def get_dept_abbr(url):
    return url[33:-5].upper()


with open("urls.txt", "w") as f:
    for url in urls:
        f.write(url + "\n")


In [4]:
# fetch all the pages (takes like 20-30 seconds)
pages = [BeautifulSoup(requests.get(url).content, "html.parser") for url in urls]


I used the code below to show me all the most uncommon combinations of tag names and classes. It helped me find all sorts of oddities and edge cases.


In [5]:
def key(tag):
    return (tag.name, tuple(tag.attrs.get("class") or []))


tag_types = {}

with open("edge-cases.txt", "w") as f:
    for url, page in zip(urls, pages):
        # f.write('\n\n==========================================\n\n' + url + '\n\n==========================================\n\n')
        content = page.find(class_="col-md-12 blank-slate").children
        for tag in content:
            if not isinstance(tag, bs4.element.Tag):
                continue

            k = key(tag)
            if k not in tag_types:
                tag_types[k] = [0, set()]
            tag_types[k][0] += 1
            tag_types[k][1].add(url)

    data = [[v[0], k[0], " ".join(k[1]), " ".join(v[1])] for k, v in tag_types.items()]
    data.sort()

    f.write(tabulate(data, headers=["Count", "Tag", "Classes", "URLs"]))


The HTML is messy, but at least every page is structured similarly: all of the info I care about is in a `div` with class `"col-md-12 blank-slate"`. After that, I classify each element as either a header (if so, which one), a course name (e.g. CSE 100. Advanced Data Structures), a course number (e.g. `cse100`), or text. Text is assigned to the last seen course/header. Text that is assigned to a header must be between the header and its corresponding courses, so it must contain notes that apply to all the following courses.


In [24]:
# # classes i definitely dont care about
# ignored_classes = set([
#     'course-head',
#     'basic-offset-top-only',
#     'faculty-staff-subhead',
#     'note',
#     'course-note',
#     'alphabreak',
#     'sectionNav',
#     'program-contact-info',
#     'anchor-parent',
#     'courseFacLink',
#     'course-list-overview',
#     'course-prerequisite-paragraph',
#     'course-list-courses',
#     'course-disclaimer',
# ])

# # tag names i definitely dont care about
# ignored_tag_names = set([
#     'ul', 'table', 'a'
# ])
    

def get_type(tag) -> str:
    if type(tag) is bs4.element.NavigableString:
        return None if tag.string.strip() == '' else 'plain-string'
    
    if type(tag) is not bs4.element.Tag:
        return None
    
    if tag.name[0] == "h":
        return tag.name

    classes = tag.attrs.get("class")
    if not classes:
        return "text"

    if classes[0] == "course-name":
        return "name"

    if classes[0] == "anchor-parent":
        return "num"

    return "text"


def get_tags(page: BeautifulSoup):
    tags = page.find(class_="col-md-12 blank-slate").children
    return [(tag, get_type(tag)) for tag in tags if get_type(tag) is not None]


Now, I go through the tags I might care about (`get_tags(page)`) and extract the just the courses & categories for now - we'll process each course more afterwards. The most important thing I rely on is that each course title is a paragraph with class "course-name". I use the headers to give the courses categories. I write all the content into a file in the form of JSON so I can manually fix typos/formatting and add/remove content. My JSON will be a list of the 82 pages, each of which is a list of courses (title and description) and headers (h2-h4).


In [38]:
def get_clean_data(page: BeautifulSoup):
    def get_backlinks(title):
        first_period = title.find('.')
        if first_period != -1:
            ret = re.split(', |\/', title[:first_period])
            if all(re.fullmatch('[A-Z]+ \d+[A-Z]*', s) for s in ret):
                return ret
        
        print('add backlinks: ' + title)
        return []

    def id_to_backlink(id: str):
        id = id.upper()
        if id.find('-') != -1:
            return id.replace('-', ' ')
        
        for i, char in enumerate(id):
            if char.isdigit():
                return id[:i] + " " + id[i:]
        
        print('fix id: ' + id)
        return id
    
    # if this tag (or any tag it contains) has a matching id and name, return the id
    def get_course_num(tag):
        def first_child(tag): # returns first child of a tag that is also a tag
            for c in tag.children:
                if type(c) is bs4.element.Tag:
                    return c

        if type(tag) is not bs4.element.Tag:
            return
        while tag:
            id = tag.get('id')
            name = tag.get('name')
            if id == '': id = None
            if name == '': name = None
            if tag.attrs.get('class') == ['anchor']:
                return id or name
            if id and name and id == name:
                return id
            tag = first_child(tag)


    # list of course numbers
    # cleared out after each header/course name
    # used to label courses
    cur_backlinks = []

    ret = [] # list of courses and headers in JSON format

    seen_courses_header = False
    started = False
    for tag, tag_type in get_tags(page):
        
        if not started:
            # start at either at the first .course-name, .anchor-parent, or the first header after "Courses"
            if type(tag) is not bs4.element.Tag:
                continue
            if tag.text == 'Courses':
                seen_courses_header = True
                continue

            if tag_type in ('name', 'num'):
                started = True
            elif tag_type[0] == 'h' and seen_courses_header:
                started = True
            else:
                continue
        
        if tag_type[0] == 'h':
            ret.append({
                'type': tag_type,
                'content': tag.text.strip(),
                'desc': '',
            })
            cur_backlinks = []
        elif tag_type == 'name':
            title = tag.text.strip()
            if title == title.upper():
                title = title.title()
            if '.' not in title:
                print('add period: ' + title)
            ret.append({
                'type': 'course',
                'title': title,
                'desc': '',
                'backlinks': cur_backlinks[:] if cur_backlinks else get_backlinks(title)
            })
            cur_backlinks = []
        elif get_course_num(tag):
            cur_backlinks.append(id_to_backlink(get_course_num(tag)))
        elif ret:
            text = tag.text if tag_type == 'text' else tag.string
            text = text.strip()
            if text:
                if ret[-1]['desc'] != '':
                    ret[-1]['desc'] += '\n'
                ret[-1]['desc'] += text
    
    return ret

def get_h1(page: BeautifulSoup):
    return page.find('h1').text

pages_data = [{
    'url': url,
    'dept': get_h1(page),
    'dept_abbr': get_dept_abbr(url),
    'content': get_clean_data(page)
} for url, page in zip(urls, pages)]

with open('pages-data.json', 'w') as f:
    json.dump(pages_data, f)



add period: COMM 114M CSI: Communication and the Law (4)
add period: GPCO 468: Evaluating Technological Innovation (4)
add period: GPPS 481: The Political Economy of Authoritarian Regimes (4)
add period: HIUS 178/278 The Atlantic World, 1400–1800 (4)
fix id: HLP
fix id: DS
fix id: LITWORLD
fix id: SPACER
add backlinks: Electives. Varies (12)
add backlinks: HIGR 236A-B. Seminar in History of Science (4-4)
add backlinks: HISC 163/263. History, Science, and Politics of Climate Change (4)
add backlinks: HISC 167/267. Gender and Science (4)
add backlinks: HISC 173/273. Seminar on Darwin and Darwinisms (4)
add backlinks: HISC 180/280. Science and Public Policy (4)
add period: SOCI 123 Japanese Culture Inside/Out: A Transnational Perspective (4)


I manually did the following cleanup:
- Add periods in the 5 titles where they were missing with the regex `"type": "course",\n.*"title": [^.]*$`
- Removed the 2 instances of `"Not offered until"`
- Replace ` [Pp]rerequisites: none\.?"` with `"`
- 

In [None]:

with open("pages-data.json", "r") as f:
    pages_data = json.load(f)

# courses: list[str] = []
# for page in pages_data:
#     for course in page['content']:
#         if course['type'] == 'course':
#             courses.append(course['desc'])

# courses.sort(key=lambda x: -len(x))

# for course in courses:
#     print(course)
#     print('\n' * 4 + '-'*100 + '\n' * 5)

for i, page_data in enumerate(pages_data):
    print("\n\n" + page_data["url"] + "\n")
    for j, obj in enumerate(page_data["content"]):
        if obj["type"][0] == "h":
            print("#" * int(obj["type"][1]) + " " + obj["content"])
            # if obj['content'].isupper():
            #     pages_data[i]['content'][j]['content'] = obj['content'].title() # mutate actual data

with open('temp.json', 'w') as f:
    json.dump(pages_data, f)




https://catalog.ucsd.edu/courses/AIP.html



https://catalog.ucsd.edu/courses/AASM.html

## Lower Division
## Upper Division


https://catalog.ucsd.edu/courses/AWP.html

## Lower Division
## Upper Division
## Graduate


https://catalog.ucsd.edu/courses/ANTH.html

## Lower Division
## Upper Division
## Anthropology: Archaeology
## Anthropology: Biological Anthropology
## Anthropology: Sociocultural Anthropology
## Graduate


https://catalog.ucsd.edu/courses/AUDL.html



https://catalog.ucsd.edu/courses/BIOI.html



https://catalog.ucsd.edu/courses/BIOL.html

## Lower Division
## Upper Division
### Biochemistry
### Genetics, Cellular and Developmental Biology of Plants and Animals
### Ecology, Behavior, and Evolution
### Molecular Biology, Microbiology
### Physiology and Neuroscience
### Special Courses
## Graduate


https://catalog.ucsd.edu/courses/BIOM.html



https://catalog.ucsd.edu/courses/CHEM.html

## Lower Division
## Upper Division
## Graduate


https://catalog.ucsd.edu/course

Now, I factor out all of the following metadata about each course:

- short/long name
- units
- sub-category
- department
- subject
- link

I will still group the courses by department, like the original course catalog does.


In [None]:
with open("pages-data.json", "r") as f:
    pages_data = json.load(f)

courses_data = []

cur_path = []  # [h2] or [h2, h3] or [h2, h3, h4]
for page_data in pages_data:
    courses_data.append({"dept": page_data["dept"], "courses": []})
    for obj in page_data["content"]:
        if obj["type"][0] == "h":
            lvl = int(obj["type"][1])
            title = obj["content"]
            if lvl == 2:
                cur_path = [title]
            elif lvl == 3:
                cur_path = [cur_path[0], title]
            else:
                cur_path = [cur_path[0], cur_path[1], title]
        else:
            title = obj["title"]
            i = 0
            while "A" <= title[i] <= "Z":
                i += 1
            subject = title[:i]
            while not title[i].isdigit():
                i += 1
            j = i
            while title[j].isdigit() or "A" <= title[j] <= "Z":
                j += 1
            short_name = subject + " " + title[i:j]
            while title[j] != ".":
                j += 1
            long_name = title[:j]
            k = j
            while title[k] != "(":
                k += 1
            cleaned_title = title[j + 1 : k]
            units = title[k + 1 : -1]


IndexError: string index out of range

Now, it's time to link the courses together.

- I get a set of all the course names
- I get the subject codes (e.g. CSE, LANG) and the major codes (e.g. CS27, UNHA) from their respective pages
- I add `a` tags every time I see any of them in a description


In [29]:
# major codes
soup = BeautifulSoup(
    requests.get("https://blink.ucsd.edu/instructors/academic-info/majors/major-codes.html").content, "html.parser"
)
major_codes = {}
for row in soup.find_all("tr"):
    cells = row.find_all("td")
    if not cells or len(cells) < 2 or len(cells[-2].text) < 3:
        continue
    try:
        major_codes[cells[-2].text] = cells[-1].text
    except IndexError:
        print(cells)


# subject codes
soup = BeautifulSoup(
    requests.get("https://blink.ucsd.edu/instructors/courses/schedule-of-classes/subject-codes.html").content,
    "html.parser",
)
subject_codes = {}
for row in soup.find_all("tr"):
    cells = row.find_all("td")
    if not cells or len(cells) < 2:
        continue
    subject_codes[cells[0].text] = cells[1].text

print(tabulate([[k, v] for k, v in major_codes.items()], headers=["Code", "Major"]))
print(tabulate([[k, v] for k, v in subject_codes.items()], headers=["Subject", "Major"]))


Code    Major
------  ----------------------------------------------------------------------------------------
AN27    Anthropology (Concentration in Archaeology)
AN26    Anthropology (Concentration in Biological Anthropology)
AN28    Anthropology (Concentration in Sociocultural Anthropology)
AN30    Anthropology with a Concentration in Climate Change and Human Solutions
AN29    Biological Anthropology
BE25    Bioengineering
BE28    Bioengineering (Bioinformatics)
BE29    Bioengineering: BioSystems
BE27    Bioengineering (Biotechnology)
BI34    Biology with Specialization in Bioinformatics
BI30    Ecology, Behavior, and Evolution
BI31    General Biology
BI35    Human Biology
BI32    Microbiology
BI37    Molecular and Cell Biology
BI38    Neurobiology
BI29    Biochemistry and Cell Biology (see also Chemistry and Biochemistry CH31)
BI33    Molecular Biology
BI36    Physiology and Neuroscience
AA25    Black Diaspora and African American Studies
CE25    Chemical Engineering
CH38    Biochem