In [1]:
term = '20091'

In [2]:
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import re
import sys
import time

At first, I couldn't figure out why BeautifulSoup couldn't find the content that I was seeing in the HTML. Then I realized it was because the page was dynamic, so it wasn't all loading when I was retrieving it through `urllib`. I found a solution [here](https://stackoverflow.com/questions/17597424/how-to-retrieve-the-values-of-dynamic-html-content-using-python), with more details [here](https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path). I also had to install `geckodriver` using [this](https://anaconda.org/conda-forge/geckodriver).

# Functions

In [3]:
def get_term(archive_url):
    '''Used by None. Uses get_subject_list(), get_subject_tables(), and clean_arlt().
    Begins process for scraping whole term. Saves each subject as CSV file.'''
    ge_list = get_subject_list(archive_url)
    df = get_subject_tables(ge_list)
    df = clean_arlt(df)
    filename = 'data/rosters-pre-2014/' + archive_url.rsplit('/', 1)[-1].replace('.html', '') + '.csv'
    df.to_csv(filename, index = False)

def get_subject_list(archive_url):
    '''Used by get_term(). Uses get_dynamic_webpage().
    Receives URL for Schedule of Classes. Returns list of links to each subject.'''
    soup = get_dynamic_webpage(archive_url)
    print(soup)
    page = soup.find('ul', id="programs").find_all('li')
    ge_list = []
    for link in page:
        url = link.find('a')['href']
        full_url = urljoin(archive_url, url)
        ge_list.append(full_url)
    ge_list = sorted(set(ge_list))
    return ge_list

def get_dynamic_webpage(url):
    '''Take URL for Schedule of Classes. Returns soup object.
    Used in get_course_info() and get_subject_list().'''
    try:
        driver.get(url)
        time.sleep(.5)
        html = driver.page_source
        soup = BeautifulSoup(html)
        return soup
    except:
        print('Error in get_dynamic_webpage():', url)

def get_subject_tables(ge_list):
    '''Used by get_term(). Uses get_course_info().
    Receives list of all subjects. Returns DataFrame with all courses for all subjects.'''
    frames = []
    for page in ge_list:
        print('get_subject_tables()', page)
        try:
            df = get_course_info(page)
        except ValueError:
            df = get_course_info(page)
        frames.append(df)
    df = pd.concat(frames, sort = True).reset_index()
    
    return df        
        
def get_course_info(url):
    '''Used by get_subject_tables(). Uses get_course_name().
    Receives URL of course table for subject. Collects data in course table.
    Returns DataFrame of course table.'''
    # Sometimes function doesn't get course info on first try.
    # This makes it retry until it has the course info.
    got_page_info = False
    
    while got_page_info == False:
        soup = get_dynamic_webpage(url)
        table_list = soup.find_all('div', class_ = re.compile('course_info ex.*'))
        
        # If there is no course info, table_list is None. So this returns empty DataFrame.
        if not table_list:
            got_page_info = True
            return pd.DataFrame()

        # Reads each table collected into a list of DataFrames.
        frames = []

        for table in table_list:
            match = table.find('table')
            try:
                df = pd.read_html(str(match), header = 0)[0]
                course_code = table.find('strong').text.replace(':', '').replace(' ', '-')
                course_name = get_course_name(table)
                df['course_code'] = course_code
                df['course_name'] = course_name
                frames.append(df)
            except ValueError:
                pass

        # Asserts that DataFrames have been collected.
        if len(frames) > 0:
            got_page_info = True
            df = pd.concat(frames, sort = True).reset_index()
            return df

def get_course_name(table):
    '''Used by get_course_info(). Uses None.
    Receives table soup for a course. Returns course name.'''
    match = table.find('h3').contents
    for x in match:
        if isinstance(x, NavigableString):
            return x
            break


def clean_arlt(df):
    '''Used by get_term(). Uses None.
    For rows that don\'t have Type (like ARLT), backfills course_name.'''
    for lab, row in df.iterrows():
        if df.loc[lab, 'Type'] is np.nan:
            course_name = df.loc[lab, 'Section']
            df.loc[lab + 1, 'course_name'] = course_name
    df = df.dropna(subset = ['Type'])
    
    return df

# Begin Scrape

In [4]:
driver = webdriver.Firefox()
archive_url = 'https://web-app.usc.edu/ws/soc_archive/soc/term_' + term + '.html'
get_term(archive_url)
driver.close()

<html><head>
<title>Spring 2009 | Schedule of Classes</title>
<link href="dev/style/A.primary.css.pagespeed.cf.2ga5Yu_DUr.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="dev/style/spring.php?y=2009" media="screen" rel="stylesheet" type="text/css"/>
<link href="dev/style/A.print.css.pagespeed.cf.SwDQBXsDdx.css" media="print" rel="stylesheet" type="text/css"/>
<script language="javascript" src="dev/scripts/jquery.pack.js.pagespeed.jm.FMG7C4C0sA.js" type="text/javascript"></script>
<script language="javascript" type="text/javascript">var term='spring';var deptsrc='api/departments/20091';function showdepartment(dept,deptcode,depttype){return'<li><a href="20091/'+deptcode.toLowerCase()+'.html">'+dept+'</a> ('+deptcode+')</li>';}$(document).ready(function(){$.getJSON(deptsrc,function(json){var programs='';for(i=0;i<json.department.length;i++){if(json.department[i].type!='Y'){programs+=showdepartment(json.department[i].name,json.department[i].code,json.department[i].type);}

get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/acmd.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/aest.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/ahis.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/ali.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/ame.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/amst.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/anst.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/anth.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/arch.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/arlt.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/aste.html
get_subject_tables() https://web-app.usc.edu/ws/soc_archive/soc/20091/astr.html
get_subject_tables() https://web-app.usc.e

ImportError: html5lib not found, please install it