In [1]:
import requests, json
from bs4 import BeautifulSoup as bs
from pprint import pprint

In [2]:
# Retrieve the general-education course listing and isolate the page's main content <div>.
GPAC_URL = 'https://advising.columbian.gwu.edu/general-education-courses'
REQUEST_TIMEOUT_SECONDS = 30  # requests has no default timeout; without one a stalled server hangs the cell

page = requests.get(GPAC_URL, timeout=REQUEST_TIMEOUT_SECONDS)
page.raise_for_status()  # stop on 4xx/5xx rather than parsing an error page as if it were course data
soup = bs(page.content, 'html.parser') # parse the HTML
main_content = soup.find('div',{"id":"main_content"})
if main_content is None:
    # find() returns None when nothing matches; fail here with a clear message instead of
    # an AttributeError in a later cell.
    raise RuntimeError('No <div id="main_content"> found at ' + GPAC_URL + '; the page layout may have changed.')

In [3]:
# Convert the scraped page into a dict mapping each GPAC category name to the list of
# course strings found under it.

# Category names: the text after the first ": " in every <option> of the quick-dropdown <select>.
category_select = main_content.find('select', {"class": "quick-dropdown form-select"})
category_names = [
    option.get_text().split(": ")[1]
    for option in category_select.find_all('option')
]

# Course lists: one list per "node-content clearfix" <div>, holding the text of each <li>
# with non-breaking spaces (\xa0) replaced by ordinary spaces.
course_lists = [
    [item.get_text().replace('\xa0', ' ') for item in section.find_all('li')]
    for section in main_content.find_all('div', {"class": "node-content clearfix"})
]

# Names and course lists are paired purely by position; zip() stops at the shorter sequence.
# NOTE(review): this assumes the dropdown options and the content divs appear in the same
# order and number on the page -- confirm against the live markup.
gpac_dict = dict(zip(category_names, course_lists))
# pprint(gpac_dict)  # uncomment to inspect

In [4]:
# Flatten the category -> courses mapping into a single list of course strings.
# A course listed under several categories appears once per listing.
gpac_courses = [
    course
    for courses_in_category in gpac_dict.values()
    for course in courses_in_category
]
# pprint(gpac_courses)  # uncomment to inspect

In [5]:
# Invert the dict so it's keyed by course rather than by category:
# course string -> list of the GPAC categories that list it (in gpac_dict order).
# Built in one pass over gpac_dict instead of rescanning every category list for every
# course occurrence; key order and values match the rescanning approach.
lookup = {}
for category, courses in gpac_dict.items():
    for course in courses:
        applicable_categories = lookup.setdefault(course, [])
        # A course repeated inside one category's list still counts that category only once.
        if category not in applicable_categories:
            applicable_categories.append(category)
# pprint(lookup)  # uncomment to inspect

In [6]:
# Filter for courses that apply to more than one GPAC category, keyed by a short code made
# of the first token of the course string plus the first four characters of the second
# token (e.g. "DEPT 1234").
multicourses = {}
for course, categories in lookup.items():
    if len(categories) < 2:
        continue
    tokens = course.split()
    if not tokens:
        # Blank list item: there is nothing to build a key from.
        continue
    # Slice tokens[1:2] so a one-token entry yields just that token instead of an IndexError.
    code = " ".join([tokens[0]] + [token[:4] for token in tokens[1:2]])
    # Different course strings can shorten to the same code (the number is cut to four
    # characters); merge their categories rather than letting the later entry overwrite
    # the earlier one. A fresh list is used so lookup's lists are never mutated.
    merged_categories = multicourses.setdefault(code, [])
    for category in categories:
        if category not in merged_categories:
            merged_categories.append(category)
# pprint(multicourses)  # uncomment to inspect

In [7]:
# Serialise the multi-category courses as pretty-printed JSON (keys sorted, 2-space indent)
# and save the text to multi_GPACs.json in the current working directory.
jsondata = json.dumps(
    multicourses,
    indent=2,
    sort_keys=True,
)
with open('multi_GPACs.json', mode='w+') as output_file:
    output_file.write(jsondata)