In [82]:
import requests
import bs4
from bs4 import BeautifulSoup
import json
from collections import defaultdict
from tabulate import tabulate
import re
import os

First, I get the links to the individual program pages (e.g. CSE, LANG, MATH) from the "cover" of the course catalog. I just get every link named "courses", i.e. every link whose `href` starts with `../courses`.


In [3]:
# Download the catalog "cover" page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the catalog.
page = requests.get("https://catalog.ucsd.edu/front/courses.html", timeout=30)
page.raise_for_status()
cover = BeautifulSoup(page.text, "html.parser")

# Keep only anchors whose href starts with "../courses"; dropping the leading ".."
# turns each relative href into a path that can be appended to the site root.
links = cover.find_all("a", href=lambda href: href and href.startswith("../courses"))
urls = ["https://catalog.ucsd.edu" + link["href"][2:] for link in links]


def get_dept_abbr(url):
    """Return the upper-cased department abbreviation encoded in a catalog URL.

    The abbreviation is the file name of the URL path without its extension,
    e.g. "https://catalog.ucsd.edu/courses/CSE.html" -> "CSE". Parsing the path
    (instead of slicing at fixed offsets) keeps this correct if the scheme or
    host changes, or if the URL carries a query string or fragment.
    """
    # Function-scope imports so the cell does not depend on the notebook's import cell.
    import posixpath
    from urllib.parse import urlsplit

    filename = posixpath.basename(urlsplit(url).path)
    return posixpath.splitext(filename)[0].upper()


# Save the department page URLs, one per line.
with open("urls.txt", "w") as urls_file:
    urls_file.writelines(page_url + "\n" for page_url in urls)


In [4]:
# fetch all the pages (takes like 20-30 seconds)
# One Session reuses the connection across requests; the timeout prevents a single
# stalled request from hanging the cell, and raise_for_status() fails loudly instead
# of parsing an error page as a department page.
with requests.Session() as session:
    pages = []
    for url in urls:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        pages.append(BeautifulSoup(response.content, "html.parser"))


KeyboardInterrupt: 

I used the code below to show me all the most uncommon combinations of tag names and classes. It helped me find all sorts of oddities and edge cases.


In [4]:
def key(tag):
    """Return a hashable (tag name, tuple of class names) signature for `tag`.

    A missing or empty "class" attribute yields an empty tuple.
    """
    class_names = tag.attrs.get("class")
    if not class_names:
        class_names = []
    return tag.name, tuple(class_names)


# signature -> [occurrence count, set of page URLs where it occurs]
tag_types = {}

with open("edge-cases.txt", "w") as f:
    for url, page in zip(urls, pages):
        # f.write('\n\n==========================================\n\n' + url + '\n\n==========================================\n\n')
        container = page.find(class_="col-md-12 blank-slate")
        for child in container.children:
            # Only element nodes have a name/classes signature; skip strings etc.
            if isinstance(child, bs4.element.Tag):
                entry = tag_types.setdefault(key(child), [0, set()])
                entry[0] += 1
                entry[1].add(url)

    # One row per signature, sorted ascending so the rarest combinations come first.
    data = sorted(
        [count, name, " ".join(classes), " ".join(page_urls)]
        for (name, classes), (count, page_urls) in tag_types.items()
    )

    f.write(tabulate(data, headers=["Count", "Tag", "Classes", "URLs"]))


The HTML is messy, but at least every page is structured similarly: all of the info I care about is in a `div` with class `"col-md-12 blank-slate"`. After that, I classify each element as either a header (if so, which one), a course name (e.g. CSE 100. Advanced Data Structures), a course number (e.g. `cse100`, usually found in an `a` tag), or text. Text is assigned to the last seen course/header. Text that is assigned to a header must be between the header and its corresponding courses, so it will end up containing notes that apply to all the following courses.


In [5]:
def get_type(tag) -> "str | None":
    """Classify one child node of the catalog content container.

    Returns:
        None            for whitespace-only strings and for anything that is
                        neither exactly a NavigableString nor exactly a Tag
        'plain-string'  for a bare text node with visible content
        'h1' .. 'h6'    for a heading tag (the tag name itself)
        'name'          for a tag whose first class is "course-name"
        'num'           for a tag whose first class is "anchor-parent"
        'text'          for every other tag
    """
    if type(tag) is bs4.element.NavigableString:
        return None if tag.string.strip() == '' else 'plain-string'

    if type(tag) is not bs4.element.Tag:
        return None

    # Only real headings count. Testing just the first letter would also label
    # <hr>, <header>, ... as headers, and callers index into the heading level.
    if tag.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        return tag.name

    classes = tag.attrs.get("class")
    if not classes:
        return "text"

    if classes[0] == "course-name":
        return "name"

    if classes[0] == "anchor-parent":
        return "num"

    return "text"


def get_tags(page: BeautifulSoup):
    """Return (node, type) pairs for the children of the page's content container.

    Children that get_type() classifies as None are left out.
    """
    container = page.find(class_="col-md-12 blank-slate")
    typed_children = []
    for child in container.children:
        label = get_type(child)
        if label is not None:
            typed_children.append((child, label))
    return typed_children


Now, I extract and clean the raw data and process the IDs; we'll process each course further afterwards. The most important thing I rely on is that each course title is a paragraph with class "course-name". I use the headers to give the courses categories. I write all the content to a file as JSON so I can manually fix typos/formatting and add/remove content. My JSON will be a list of the 82 pages, each of which is a list of headers and courses.

Remember to output to `pages-data-2.json` first and diff it with `pages-data.json`.


In [6]:
def get_clean_data(page: BeautifulSoup):
    """Flatten one department page into an ordered list of header and course records.

    Header records look like  {'type': 'h2', 'content': <title>, 'desc': <notes>}.
    Course records look like  {'type': 'course', 'title': <title>, 'desc': <text>,
    'backlinks': [<course codes such as "CSE 100">]}.

    Text nodes are appended (newline-separated) to the `desc` of the most recent
    record. Anchor ids seen since the last record become the next course's
    backlinks; when there are none, the codes are parsed from the course title.
    Anything that needs manual attention is reported with print().
    """
    # Raw strings: `\d` must reach the regex engine, it is not a valid Python
    # string escape (newer Pythons warn about it in a plain string literal).
    course_code = re.compile(r'[A-Z]+ \d+[A-Z]*')
    # Only h1..h6 are headers; a first-letter test would also accept e.g. "hr".
    heading = re.compile(r'h[1-6]')

    def get_backlinks(title):
        """Parse the course codes that precede the first period of `title`."""
        first_period = title.find('.')
        if first_period != -1:
            ret = re.split(r', |/', title[:first_period])
            if all(course_code.fullmatch(s) for s in ret):
                return ret

        print('add backlinks: ' + title)
        return []

    def id_to_backlink(anchor_id: str):
        """Turn an anchor id such as "cse100" or "cse-100" into "CSE 100"."""
        code = anchor_id.upper()
        if '-' in code:
            code = code.replace('-', ' ')
        else:
            # No separator: insert a space in front of the first digit.
            for i, char in enumerate(code):
                if char.isdigit():
                    code = code[:i] + " " + code[i:]
                    break

        if not course_code.fullmatch(code):
            print('fix id: ' + code)
        return code

    # if this tag (or any tag it contains) has a matching id and name, return the id
    def get_course_num(tag):
        def first_child(tag):  # returns first child of a tag that is also a tag
            for c in tag.children:
                if type(c) is bs4.element.Tag:
                    return c

        if type(tag) is not bs4.element.Tag:
            return
        # Walk down the chain of first element children until an anchor is found.
        while tag:
            anchor_id = tag.get('id')
            anchor_name = tag.get('name')
            if anchor_id == '': anchor_id = None
            if anchor_name == '': anchor_name = None
            if tag.attrs.get('class') == ['anchor']:
                return anchor_id or anchor_name
            if anchor_id and anchor_name and anchor_id == anchor_name:
                return anchor_id
            tag = first_child(tag)

    # list of course numbers
    # cleared out after each header/course name
    # used to label courses
    cur_backlinks = []

    ret = []  # list of courses and headers in JSON format

    seen_courses_header = False
    started = False
    for tag, tag_type in get_tags(page):
        is_header = heading.fullmatch(tag_type) is not None

        if not started:
            # start at either at the first .course-name, .anchor-parent, or the first header after "Courses"
            if type(tag) is not bs4.element.Tag:
                continue
            if tag.text == 'Courses':
                seen_courses_header = True
                continue

            if tag_type in ('name', 'num'):
                started = True
            elif is_header and seen_courses_header:
                started = True
            else:
                continue

        if is_header:
            title = tag.text.strip()
            if title == title.upper():
                title = title.title()
            ret.append({
                'type': tag_type,
                'content': title,
                'desc': '',
            })
            cur_backlinks = []
        elif tag_type == 'name':
            title = tag.text.strip()
            if title == title.upper():
                title = title.title()
            if '.' not in title:
                print('add period: ' + title)
            ret.append({
                'type': 'course',
                'title': title,
                'desc': '',
                'backlinks': cur_backlinks[:] if cur_backlinks else get_backlinks(title)
            })
            cur_backlinks = []
        else:
            # Look the anchor up once instead of once to test and once to use.
            course_num = get_course_num(tag)
            if course_num:
                cur_backlinks.append(id_to_backlink(course_num))
            elif ret:
                # Tag.string is None when a tag has several children (possible for
                # an "anchor-parent" tag without a usable id), so only bare strings
                # use .string; every tag goes through get_text().
                text = tag.string if tag_type == 'plain-string' else tag.get_text()
                text = text.strip()
                if text:
                    if ret[-1]['desc'] != '':
                        ret[-1]['desc'] += '\n'
                    ret[-1]['desc'] += text

    return ret

def get_h1(page: BeautifulSoup):
    """Return the text of the first <h1> element found on `page`."""
    first_heading = page.find('h1')
    return first_heading.text

# One record per department page: where it came from, its title and
# abbreviation, and its cleaned list of headers/courses.
page_records = []
for page_url, soup_page in zip(urls, pages):
    page_records.append({
        'url': page_url,
        'dept': get_h1(soup_page),
        'deptAbbr': get_dept_abbr(page_url),
        'content': get_clean_data(soup_page),
    })
pages_data = page_records

with open('pages-data-2.json', 'w') as out_file:
    json.dump(pages_data, out_file)



fix id: CLAS 196A B
add period: COMM 114M CSI: Communication and the Law (4)
fix id: EDS128 A
add period: GPCO 468: Evaluating Technological Innovation (4)
add period: GPPS 481: The Political Economy of Authoritarian Regimes (4)
add period: HIUS 178/278 The Atlantic World, 1400–1800 (4)
fix id: HLP
fix id: DS
fix id: LITWORLD
fix id: SPACER
add backlinks: Electives. Varies (12)
add backlinks: HIGR 236A-B. Seminar in History of Science (4-4)
add backlinks: HISC 163/263. History, Science, and Politics of Climate Change (4)
add backlinks: HISC 167/267. Gender and Science (4)
add backlinks: HISC 173/273. Seminar on Darwin and Darwinisms (4)
add backlinks: HISC 180/280. Science and Public Policy (4)
fix id: SIO 182B2
fix id: SIOB 273A2
add period: SOCI 123 Japanese Culture Inside/Out: A Transnational Perspective (4)


I manually did the following cleanup:
- Add periods in the 5 titles where they were missing with the regex `"type": "course",\n.*"title": [^.]*$`
- Remove the 2 instances of `"Not offered until"`
- Replace the regex ` [Pp]rerequisites: none\.?"` with `"`
- Replace the regex `(\\n){2,}` with `\n`
- Fix periods with no space after them: `\.[A-Z][a-z]`
- Replace colon-space-space with colon-space
- Fix a page that had h3's instead of h2's at the root level
- Change PSYCH to PSYC

In [9]:

# Work from pages-data.json from here on.
with open("pages-data.json", "r") as in_file:
    pages_data = json.load(in_file)

# Print every page's heading outline in markdown style: an h2 becomes "## ...",
# an h3 becomes "### ...", and so on.
for page_data in pages_data:
    print("\n\n" + page_data["url"] + "\n")
    for obj in page_data["content"]:
        kind = obj["type"]
        if kind[0] != "h":
            continue
        print("#" * int(kind[1]) + " " + obj["content"])

with open('temp.json', 'w') as out_file:
    json.dump(pages_data, out_file)




https://catalog.ucsd.edu/courses/AIP.html



https://catalog.ucsd.edu/courses/AASM.html

## Lower Division
## Upper Division


https://catalog.ucsd.edu/courses/AWP.html

## Lower Division
## Upper Division
## Graduate


https://catalog.ucsd.edu/courses/ANTH.html

## Lower Division
## Upper Division
## Anthropology: Archaeology
## Anthropology: Biological Anthropology
## Anthropology: Sociocultural Anthropology
## Graduate


https://catalog.ucsd.edu/courses/AUDL.html



https://catalog.ucsd.edu/courses/BIOI.html



https://catalog.ucsd.edu/courses/BIOL.html

## Lower Division
## Upper Division
### Biochemistry
### Genetics, Cellular and Developmental Biology of Plants and Animals
### Ecology, Behavior, and Evolution
### Molecular Biology, Microbiology
### Physiology and Neuroscience
### Special Courses
## Graduate


https://catalog.ucsd.edu/courses/BIOM.html



https://catalog.ucsd.edu/courses/CHEM.html

## Lower Division
## Upper Division
## Graduate


https://catalog.ucsd.edu/course

Now, I factor out all of the following metadata about each course:

- short/long name
- units
- sub-category (maybe later)
- department
- url
- description (everything before "Prerequisites: ")
- details (everything after)

I will still group the courses by department, like the original course catalog does. `parsed-data.json` will contain a lot of redundant info, because it is just an intermediate step to make sure everything looks correct.


In [95]:
# Reload pages-data.json; every course record below is rewritten in place.
with open("pages-data.json", "r") as in_file:
    parsed_data = json.load(in_file)

for dept in parsed_data:
    del dept['url']
    for course in dept['content']:
        if course['type'] != 'course':
            continue

        # Title layout: "<short name>. <long name> (<units>)".
        title = course.pop('title')
        dot_at = title.find('.')
        paren_at = title.rfind('(')
        if paren_at == -1:
            paren_at = len(title)
        course['shortName'] = title[:dot_at].strip()
        course['longName'] = title[dot_at + 1:paren_at].strip()
        course['units'] = title[paren_at + 1:-1]

        # Everything from "Prerequisites:" onward is kept separately as details.
        summary, marker, remainder = course['desc'].partition('Prerequisites:')
        course['desc'] = summary.strip()
        course['details'] = (marker + remainder).strip()

        # Pick the code that becomes this course's id in the url:
        #  - several backlinks from different departments: the first one after
        #    the leading entry that belongs to this department, else the leading entry
        #  - exactly one backlink: that backlink
        #  - otherwise: the short name
        links = course['backlinks']
        if len(links) >= 2 and links[0].split()[0] != links[1].split()[0]:
            html_id = next(
                (link for link in links[1:] if link.split()[0] == dept['deptAbbr']),
                links[0],
            )
        elif len(links) == 1:
            html_id = links[0]
        else:
            html_id = course['shortName']

        # Drop this department's own prefix, then keep only letters and digits.
        id_words = html_id.split()
        if id_words[0] == dept['deptAbbr']:
            html_id = ''.join(id_words[1:])

        course['deptAbbr'] = dept['deptAbbr']
        course['htmlId'] = ''.join(ch for ch in html_id if ch.isalnum())


In [96]:
# Snapshot the parsed records to parsed-data-2.json.
with open('parsed-data-2.json', 'w') as out_file:
    out_file.write(json.dumps(parsed_data))

In [97]:
# Continue from parsed-data.json.
with open('parsed-data.json', 'r') as in_file:
    parsed_data = json.loads(in_file.read())

Now, I put all the courses in an array, so I can refer to them by their indices. I'll put this in a Redis db later.

In [98]:
# Flat list of every course across all departments; a course's position in this
# list is its numeric id. Each entry is a copy of the record without its 'type' key.
course_db = [
    {field: value for field, value in entry.items() if field != 'type'}
    for dept in parsed_data
    for entry in dept['content']
    if entry['type'] == 'course'
]

with open('course-db.json', 'w') as out_file:
    json.dump(course_db, out_file)

Now, it's time to link everything together.

First, I get the subject codes (e.g. CSE, LANG) and the major codes (e.g. CS27, UNHA) from their respective pages.

In [50]:
# major codes
# The timeout keeps a stalled request from hanging the cell; raise_for_status()
# prevents an HTTP error page from being parsed as the code table.
response = requests.get(
    "https://blink.ucsd.edu/instructors/academic-info/majors/major-codes.html", timeout=30
)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# code (second-to-last cell of a row) -> major name (last cell of the row)
major_codes = {}
for row in soup.find_all("tr"):
    cells = row.find_all("td")
    # Skip rows without two data cells and rows whose code cell is shorter than
    # 3 characters (presumably filler rows -- verify against the page).
    if len(cells) < 2 or len(cells[-2].text) < 3:
        continue
    major_codes[cells[-2].text] = cells[-1].text

with open('major-codes.json', 'w') as f:
    json.dump(major_codes, f)


# # subject codes
# soup = BeautifulSoup(
#     requests.get("https://blink.ucsd.edu/instructors/courses/schedule-of-classes/subject-codes.html").content,
#     "html.parser",
# )
# subject_codes = []
# for row in soup.find_all("tr"):
#     cells = row.find_all("td")
#     if not cells or len(cells) < 2:
#         continue
#     subject_codes.append([cells[0].text, cells[1].text])
#
# with open('subject-codes.json', 'w') as f:
#     json.dump(subject_codes, f)

In [51]:
# manually remove "(see ...)" from major-codes.json, then reload it here

with open('major-codes.json', 'r') as codes_file:
    major_codes = json.loads(codes_file.read())

# with open('subject-codes.json', 'r') as f:
#     subject_codes = json.load(f)

Then, I index the backlinks. If there are multiple courses with the same backlink, use the course that has the fewest backlinks.

In [13]:
# For every backlink keep the best (number of backlinks, course index) pair seen
# so far. Tuples compare element-wise, so the course with the fewest backlinks
# wins and ties go to the lower index.
best_owner = {}
for course_idx, course in enumerate(course_db):
    rank = (len(course['backlinks']), course_idx)
    for link in course['backlinks']:
        if link not in best_owner or rank < best_owner[link]:
            best_owner[link] = rank

# backlink -> index into course_db
backlinks = {link: rank[1] for link, rank in best_owner.items()}

In [8]:
# see courses that have 1 backlink that doesn't match the shortName

for dept in parsed_data:
    for entry in dept['content']:
        if entry['type'] != 'course':
            continue

        links = entry['backlinks']
        if len(links) != 1:
            continue
        only_link = links[0]
        if only_link != entry['shortName']:
            print(entry['shortName'], only_link)

BIBC 140 BIBC 130
CHEM 167 CHEM 166
CLASSIC 200A CLAS 200A
CLASSIC 200C CLAS 200C
CLASSIC 205 CLAS 205
CLASSIC 220 CLAS 220
CLASSIC 280 CLAS 280
CLASSIC 290 CLAS 290
CLASSIC 299 CLAS 299
CLASSIC 399 CLAS 399
COMM 111T COMM 111P
COMM 131 COMM 130
CAT 75 CAT 87
EDS 30/MATH 95 EDS 30
EDS 31/CHEM 96 EDS 31
EDS 105/PHYS 180 EDS 105
EDS 115GS EDS 115S
EDS 121A/MATH 121A EDS 121A
EDS 121B/MATH 121B EDS 121B
EDS 122/CHEM 187 EDS 122
EDS 123/CHEM 188 EDS 123
EDS 164R EDS 165R
EDS 278/COGR 278 EDS 278
BENG/BIMM/CSE 181 BENG 181
BENG/BIMM/CSE/CHEM 182 BENG 182
BENG/BIMM/CSE/CHEM 184 BENG 184
BENG 202/CSE 282 BENG 202
BENG 203/CSE 283 BENG 203
BENG 238/MED 238 BENG 238
BENG 242/MATS 257/NANO 257 BENG 242
BENG 247A/ECE 247A/NANO 247A BENG 247A
BENG 247B/ECE 247B/NANO 247B BENG 247B
BENG 247C/ECE 247C/NANO 247C BENG 247C
BENG 260/BGGN 260/PHYS 279 BENG 260
BENG 276/CHEM 276/MATH 276/SPPS 276 BENG 276
BENG 277/BIOM 287 BENG 277
BENG 278/RAD 278 BENG 278
BENG 283/CHEM 283/BIOM 283 BENG 283
BENG 285/BN

In [24]:
# see duplicate ids

for dept in parsed_data:
    ids_on_page = set()
    for entry in dept['content']:
        if entry['type'] == 'course':
            html_id = entry['htmlId']
            # Reported once for every repeat after the first occurrence.
            if html_id in ids_on_page:
                print(f'dept: {dept["dept"]}, id: {html_id}')
            ids_on_page.add(html_id)

dept: Communication, id: 111P
dept: Culture, Art, and Technology, id: 87
dept: Ethnic Studies, id: 101
dept: School of Global Policy and Strategy (GPS), id: GPEC435
dept: School of Global Policy and Strategy (GPS), id: GPIM422
dept: School of Global Policy and Strategy (GPS), id: GPIM436
dept: School of Global Policy and Strategy (GPS), id: GPIM436
dept: School of Global Policy and Strategy (GPS), id: GPIM436
dept: School of Global Policy and Strategy (GPS), id: GPIM470
dept: School of Global Policy and Strategy (GPS), id: GPIM470
dept: School of Global Policy and Strategy (GPS), id: GPIM470
dept: School of Global Policy and Strategy (GPS), id: GPPA405
dept: School of Global Policy and Strategy (GPS), id: GPPA420
dept: School of Global Policy and Strategy (GPS), id: GPPA420
dept: School of Global Policy and Strategy (GPS), id: GPPA450
dept: School of Global Policy and Strategy (GPS), id: GPPA450
dept: School of Global Policy and Strategy (GPS), id: GPPA467
dept: School of Global Policy

Finally, I add Svelte components (e.g. `<CourseLink id={id} url={url} />`) every time I see a course or major name.


In [2]:
def add_markup(desc):
    """Wrap major codes and known course codes found in `desc` in Svelte components.

    Major codes (keys of the module-level `major_codes`) become
    `<MajorLink name="...">CODE</MajorLink>`. Course codes such as "CSE 100"
    that are keys of the module-level `backlinks` become
    `<CourseLink id={N} href="/DEPT#htmlId">CODE</CourseLink>`, where N indexes
    `course_db`. Course codes without a backlink entry are left untouched.
    """
    # All major codes are substituted in a single regex pass, longest code first.
    # Replacing them one after another would rescan markup that was just inserted,
    # so a code that is a prefix of another code (or that occurs inside a major's
    # name) would corrupt the output.
    codes = sorted((code for code in major_codes if code), key=len, reverse=True)
    if codes:  # an empty alternation would match at every position
        major_pattern = re.compile('|'.join(re.escape(code) for code in codes))
        desc = major_pattern.sub(
            lambda m: f'<MajorLink name="{major_codes[m.group()]}">{m.group()}</MajorLink>',
            desc,
        )

    def link_course(match):
        link = match.group()
        if link not in backlinks:
            return link
        idx = backlinks[link]
        url = f"/{course_db[idx]['deptAbbr']}#{course_db[idx]['htmlId']}"
        return f'<CourseLink id={{{idx}}} href="{url}">{link}</CourseLink>'

    # Raw string: `\d` is a regex escape, not a valid Python string escape.
    return re.sub(r'[A-Z]{2,} \d+[A-Z]*', link_course, desc)

In [103]:
# [abbreviation, full department name] pairs, in catalog order.
dept_list = []
for dept in parsed_data:
    dept_list.append([dept['deptAbbr'], dept['dept']])

with open('dept-list.json', 'w') as out_file:
    json.dump(dept_list, out_file)

In [1]:
dept_dir = '../ucsdcourses/src/routes/(dept)/'

# Write one +page.svelte per department: headers become <hN> elements (plus an
# optional note paragraph) and courses become <CourseCard> components.
# NOTE(review): titles, names and descriptions are interpolated unescaped, so a
# `"`, `<` or `{` in the data would end up verbatim in the Svelte markup.
for dept in parsed_data:
    out_dir = os.path.join(dept_dir, dept['deptAbbr'])
    # exist_ok tolerates re-runs without hiding real failures (e.g. permissions),
    # which a bare `except: pass` would swallow.
    os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8 so non-ASCII characters in titles do not depend on the platform default.
    with open(os.path.join(out_dir, '+page.svelte'), 'w', encoding='utf-8') as f:
        f.write('<script>import CourseCard from "$lib/CourseCard.svelte";import CourseLink from "$lib/CourseLink.svelte";import MajorLink from "$lib/MajorLink.svelte";</script>\n')
        # f.write(f"<h1>{dept['dept']}</h1>\n")
        for o in dept['content']:
            # A record without a type is malformed: show it and stop this page.
            if 'type' not in o:
                print(o)
                break
            if o['type'][0] == 'h':
                title = o['content']
                note = add_markup(o['desc'])
                tag = o['type']

                f.write(f"<{tag}>{title}</{tag}>\n")
                if note:
                    f.write(f'<p class="note">{note}</p>\n')

            elif o['type'] == 'course':
                desc = add_markup(o['desc'])
                details = add_markup(o['details'])
                shortName = o['shortName']
                longName = o['longName']
                units = o['units']
                html_id = o['htmlId']
                url = f"/{o['deptAbbr']}#{html_id}"

                f.write(f'<CourseCard id="{html_id}" shortName="{shortName}" longName="{longName}" units="{units}" url="{url}">\n\t<p>{desc}</p>\n')
                if details:
                    f.write(f'\n\t<p>{details}</p>')
                f.write('\n</CourseCard>\n')

            else:
                print(f"error: unexpected type: {o['type']}")



NameError: name 'parsed_data' is not defined