In [1]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def parse_ul(ul, df):
    """
    Function to parse one of the tags labeled 'ul'. Every anchor ('a') tag
    found inside it contributes two pieces of information: (i) the name of
    the committee (the anchor text) and (ii) the link to the committee page
    (the anchor's 'href' attribute).

    Parameters:
        ul: a parsed tag exposing ``find_all`` (e.g. a BeautifulSoup Tag).
        df: the dataframe collected so far; it is not modified.

    Returns: a dataframe holding the rows of ``df`` followed by one row per
    anchor, with the columns 'name' and 'link' filled in and a fresh
    0..n-1 index. When no usable anchor is found, ``df`` itself is returned.
    """
    # Collect all rows first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = [
        {'name': anchor.text, 'link': anchor['href']}
        for anchor in ul.find_all('a')
        # Anchors without an href carry no link to follow, so skip them
        # instead of failing with a KeyError.
        if anchor.get('href') is not None
    ]
    if not rows:
        return df
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

def follow_links(df, base_url, timeout=30):
    """
    Follows the links given in the 'link' column of the dataframe given as
    argument. Scrapes the tables from the committee pages and records them
    as dataframes.

    Only links ending in '.shtml' are followed; each one is appended to
    ``base_url`` to form the page address.

    Parameters:
        df: dataframe with the columns 'name' and 'link'.
        base_url: string every followed link is appended to.
        timeout: seconds to wait for each page before requests gives up.

    Returns: a dictionary of key:value pairs, where keys are committee names
    and values are lists of the tables read from that committee's page. The
    list is empty when the page has no readable table.
    """
    tables = {}
    for i in df.index:
        link = df.loc[i, 'link']
        # Non-string cells (e.g. NaN) cannot be links; skip them too.
        if not isinstance(link, str) or not link.endswith('.shtml'):
            continue

        name = df.loc[i, 'name']
        r = requests.get(base_url + link, timeout=timeout)
        soup = BeautifulSoup(r.content, 'html5lib')
        page_tables = soup.find_all('table')

        # 'Faculty Senate' has multiple tables and a different form (the
        # header sits on the second row of its tables); every other page,
        # and any single-table page, uses the first row as header.
        if name == 'Faculty Senate' and len(page_tables) > 1:
            header_row = 1
        else:
            header_row = 0

        frames = []
        for tab in page_tables:
            # Parse each table exactly once, from its own markup. Wrapping
            # the markup in StringIO avoids the deprecated "literal html"
            # input form of read_html.
            try:
                parsed = pd.read_html(StringIO(str(tab)), header=header_row)
            except (ValueError, IndexError):
                # read_html could not build a frame from this table;
                # keep going with the remaining tables (best effort).
                continue
            if parsed:
                frames.append(parsed[0])
        tables[name] = frames

    return tables

In [3]:
# Download the committees landing page
url = "https://www.niu.edu/u_council/committees/"
r = requests.get(url)

# Parse the downloaded markup
soup = BeautifulSoup(r.content, 'html5lib')

# Every tag labeled 'ul' on the page is a candidate list of committee
# links, so gather all of them
links = soup.find_all('ul')

# Start from an empty dataframe with the two columns parse_ul() fills in
df = pd.DataFrame(columns=['name', 'link'])

# Feed each list through parse_ul(), accumulating the names and links
for ul in links:
    df = parse_ul(ul, df)

In [4]:
# Now we follow those links and scrape the tables from each page
# using our follow_links() function. The result is a dictionary that
# maps each committee name to the list of tables read from its page.
tables = follow_links(df, url)

In [5]:
# We're interested in sorting by specific members on committees, so
# let's transform the data into a dictionary of lists, where the keys
# are committee member names and the values are lists of the committees
# the members are a part of

# Cells in which two names are combined without a space between them,
# mapped to the individual names they contain
joined_names = {
    "Carmorroa SiggersDevohn Hall": ["Carmorroa Siggers", "Devohn Hall"],
    "Ed KlonoskiMichelle Pickett": ["Ed Klonoski", "Michelle Pickett"],
    "Anne EdwardsJoseph Flynn": ["Anne Edwards", "Joseph Flynn"],
    "Katharine WhitelawNicole Wilson": ["Katharine Whitelaw", "Nicole Wilson"],
    "Prashanth GurralaElson Smith": ["Prashanth Gurrala", "Elson Smith"],
}

# Cell values that are not people: headers of a table included in another
# table, a cross-reference note, and the TBD placeholder
non_members = {
    "2019-2020",
    "FACULTY SENATEOTHER MEMBERS AND CONSULTANTS",
    "See Marketing and Communications with shared appointments",
    "TBD",
}

com_members = {}
for committee, committee_tables in tables.items():
    for table in committee_tables:
        # The membership column is labelled '2019-2020' on some tables
        # and '2019-20' on the others
        if '2019-2020' in table.columns.values:
            year_col = '2019-2020'
        else:
            year_col = '2019-20'
        # Make sure to drop nan values before adding people
        members = table[year_col].dropna().values

        for member in members:
            # CLEAN THE MEMBER STRINGS (the order of these steps matters)
            # Remove " (...)" groups without inner whitespace, asterisks,
            # '#' characters, and a leading token that starts with 'x'
            member = re.sub(r" \(\S*\)|\**|#|^x\S*\s*|\Ax", "", member)
            # Replace non-breaking spaces with regular spaces
            member = re.sub(r"\xa0", " ", member)
            # Remove a lone x left at the beginning of the string
            member = re.sub(r"^x", "", member)
            # Remove " (for ...)" / " for ..." (committee members who
            # stand in for someone else)
            member = re.sub(r" \(for \S*\)| for \S*", "", member)

            if member in joined_names:
                # Known run-together pair: use the hand-written split
                people = joined_names[member]
            else:
                # Split FALL and SPRING members (and "/"-separated names)
                # into separate members, dropping empty pieces
                pieces = re.split(r"Fall-|Spring-|\s?/\s?", member)
                people = list(filter(None, pieces))

            for mem in people:
                # Remove titles after a comma (everything from the first
                # comma to the end of the string)
                mem = re.sub(r",[\S*\s*]*", "", mem)
                # Skip headers, placeholders and the "spokesperson" label
                if mem in non_members or mem.lower() == "spokesperson":
                    continue
                # Add the committee to this member's list, creating the
                # list the first time the member is seen
                com_members.setdefault(mem, []).append(committee)

In [6]:
# List every committee member name collected above, one per line.
# (Use print(member, com_members[member], end='\n\n') instead to also
# show the committees each member sits on.)
for member in com_members:
    print(member)

Devaki Rau
James Burton
Christine Mooney
William Penrod
Melanie Koss
Vicki Collins
So-Yeun Kim
Kevin Martin
Gary Chen
Reinaldo Moraga
Jenn-Terng Gau
Lin Shi
Hamid Bateni
Jie Chen
Therese Arado
Sharon Nelson
Anne Hanley
Sean Farrell
Scot Schraufnagel
Amy Newman
Simón Weffer
Reed Scherer
Jim Wilson
Alicia Schatteman
Amanda Littauer
Kendall Thu
Omar Chmaissem
Alan Polansky
Jim Millhorn
Kryssi Staikidis
Richard Siegesmund
Patricia Skarbinski
Greg Beyer
Mark Riley
Nan Qin
Mahesh Subramony
Elisa Fredericks
John Pendergrass
Katy Jaekel
Jodi Lampi
Todd Reeves
Laura Johnson
Peter Chomentowski
Benjamin Creed
Sarah Johnston-Rodriguez
Veysel Demir
Sahar Vahabzadeh
Bob Tatara
Jamie Mayer
Florensia Surjadi
Arlene Keddie
Amanda McCarthy
Nancy Petges
Heidi Kuehl
Mark Schuller
Jozef Bujarski
Chong Zheng
Karen Whedbee
Kirk Duffin
George Slotsve
Doris Macdonald
Mike Konen
Ross Powell
Ismael Montana
Gleb Sirotkin
Jason Hanna
Dhiman Chakraborty
Kheang Un
April Clark
Keith Millis
Jaehee Jong
Shane Sharp
Lei