In [154]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [106]:
def parse_ul(ul, df):
    """
    Collect the committee links found inside one 'ul' tag.

    Every anchor ('a') below `ul` contributes one row holding (i) the
    committee name (the anchor text) and (ii) the link to the committee
    page (the anchor's 'href'). Anchors without an 'href' attribute are
    skipped, since there is no page to follow for them.

    Parameters:
        ul: a parsed tag (e.g. a BeautifulSoup Tag) exposing find_all().
        df: dataframe with the columns 'name' and 'link' to extend.

    Returns: a dataframe with the rows of `df` followed by the new rows,
    re-indexed from 0. `df` itself is not modified; when `ul` holds no usable
    anchor, `df` is returned unchanged.
    """
    rows = [
        {'name': anchor.text, 'link': anchor.get('href')}
        for anchor in ul.find_all('a')
        if anchor.get('href') is not None
    ]
    if not rows:
        return df
    # DataFrame.append() no longer exists in pandas >= 2.0; build all new rows
    # first and concatenate once instead of growing the frame row by row.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

def follow_links(df, base_url, timeout=30):
    """
    Follows the links in the 'link' column of `df` and scrapes the tables
    from each committee page into dataframes.

    Only links ending in '.shtml' are followed; the page URL is built as
    `base_url + link`. Rows whose link is not a string are skipped.

    Parameters:
        df: dataframe with the columns 'name' and 'link'.
        base_url: prefix prepended to every link.
        timeout: seconds to wait for each HTTP request (default 30).

    Returns: a dictionary of key:value pairs, where keys are committee names
    and values are lists of dataframes, one per table read from that
    committee's page. The list is empty when the page has no table or no
    table could be parsed.
    """
    tables = {}
    # Iterate over the column values rather than index labels, so duplicate
    # index labels cannot turn a cell lookup into a Series.
    for name, link in zip(df['name'], df['link']):
        if not isinstance(link, str) or not link.endswith('.shtml'):
            continue

        url = base_url + link
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.content, 'html5lib')
        html_tables = soup.find_all('table')

        tables[name] = []
        if not html_tables:
            continue

        # 'Faculty Senate' has multiple tables and a different form (the
        # header sits on the second row of its tables); everything else uses
        # the first row as header.
        multiple = len(html_tables) > 1
        header_row = 1 if (multiple and name == 'Faculty Senate') else 0

        try:
            # Parse the markup once per page (not once per table), wrapped in
            # StringIO because passing literal HTML strings is deprecated.
            parsed = pd.read_html(StringIO(str(html_tables)), header=header_row)
        except ValueError:
            # read_html raises ValueError when it finds no parseable table;
            # other errors (e.g. a missing parser library) are left to surface.
            continue

        # Keep at most one dataframe per table tag found on the page.
        tables[name].extend(parsed[:len(html_tables)])

    return tables

In [107]:
# Request html for the committee's homepage. A timeout keeps the cell from
# hanging indefinitely, and raise_for_status() stops the notebook early on an
# HTTP error instead of parsing an error page further down.
url = "https://www.niu.edu/u_council/committees/"
r = requests.get(url, timeout=30)
r.raise_for_status()

In [108]:
# Parse the body of the homepage response with the html5lib parser
soup = BeautifulSoup(r.content, 'html5lib')
# Uncomment to inspect the parsed document structure:
# print(soup.prettify())

In [109]:
# Walk down the nested containers of the page to reach the 'ul' tags,
# which hold the links to the committee pages we want to scrape
main_content = soup.find('div', attrs={'id':'mainContent'})
left_nav_content = main_content.find('div', attrs={'id':'contentwithleftnav'})
content_text = left_nav_content.find('contenttext')
links = content_text.find_all('ul')

# Empty dataframe that will collect the committee names and links
df = pd.DataFrame(columns=['name', 'link'])

# Let parse_ul() add the rows of each 'ul' tag in turn
for ul_tag in links:
    df = parse_ul(ul_tag, df)

In [110]:
# Now we follow those links and scrape the tables from each page
# using our follow_links() function. The result maps each committee
# name to the list of dataframes read from that committee's page.
tables = follow_links(df, url)

In [113]:
# tables['Faculty Senate'][0]

In [203]:
# Map each cleaned member name to the list of committees they appear on
com_members = {}

# The membership column is labelled differently across tables; the first
# label that exists in a table is used.
term_columns = ('2019-2020', '2019-20')

# Markers to strip from a raw entry: a parenthesised annotation without
# whitespace (preceded by a space), runs of '*', and '#'.
# The earlier pattern used `\**` as an alternative; because it also matches the
# empty string, every alternative listed after it (including '#') was never
# tried. `\*+` fixes that.
# NOTE(review): annotations containing spaces, e.g. " (for Irwin)", are kept;
# confirm whether they should be removed as well.
marker_pattern = re.compile(r" \(\S*\)|\*+|#")

for committee, committee_tables in tables.items():
    for table in committee_tables:
        term_column = next(
            (label for label in term_columns if label in table.columns), None
        )
        if term_column is None:
            # No membership column for this term in the table: nothing to add
            continue

        # Make sure to drop nan values before adding people
        members = table[term_column].dropna().values

        for member in members:
            member = marker_pattern.sub("", member)
            # Non-breaking spaces become regular spaces
            member = member.replace("\xa0", " ")
            # Drop a single leading "x" marker in front of the name
            member = re.sub(r"^x", "", member)
            # A repeated header row is not a person
            if member == "2019-2020":
                continue
            com_members.setdefault(member, []).append(committee)

In [204]:
# List every cleaned member name, one per line
for member_name in com_members:
    print(member_name)

Nan Qin
Peter Chomentowski, Chair
Veysel Demir
Amanda McCarthy
Mark Schuller (for Irwin)
Rich Grund
Beth Ingram
Geoffrey Gordon
Brad Cripe
Chad McEvoy
Cynthia Campbell
Donald Zinger
Mahdi Vaezi
Hamid Bateni
Bette Montgomery for Ysasi
Marc Falkoff
Brad Peters
Kurt Thurmaier for Matuszewich
Evgueni Nesterov for Stephen
Judith Chitwood
John Siblik
Omar Ghrayeb
Gerald Blazey
Carolinda Douglass
Sue Mini
Fred Barnhart
Brad Bond
J. Daniel House
Ritu Subramony
Jeff Reynolds
James Burton
Vicki Collins, Chair
Jie Chen
Amy Newman
Omar Chmaissem
Jim Wilson
Alan Polansky
Judy Ledgerwood
Derryl Block
Omar Ghrayeb, Vice Provost for Undergraduate Academic Affairs
Amari White
Megan Rooney
Mark Groza
Tim Aurand
Erika Pinter
Joe Bittorf
Josephine Umoren
Michael Oswalt
Mike Konen
TBD for Ladell
Thomas Bough
Kamron Smith
Amberly Rodriguez
Pete Garrity
Glenn Roby
Lesley Gilbert for Nay
TBD for Salmon
Sean Frazier
Laura Sala
Aidan Shields
Amanda Ferguson
Sina Ethsani
Laura Hedin
Rod Caughron
Ji-Chul Ryu
Lipi

Evelyn Comber
Jeff Salmon for Olson
Eric Armstrong
Alex Pitner
Andrea Drott for Gotto
Bill Penrod
Melanie Koss
Hamid Bateni, Chair
Sarah Garner, Ethics and Compliance Officer
Shantez Branch
Andrew Setterstrom
Adam Carter
Hasan Ferdowsi
TBD for Hogan
Hyun-Mee Joung
Sheila Barrett for Abendroth
Dennis Brain
Alecia Santuzzi
Brandon Lagana
David Ballantine
Tawanda Gipson
Crystal Doyle
Carrie Zack
Therese Arado for Beamer
Varsie Geisler
Karen Smith for Calderala
Deb Boughton
Celeste Latham
Toni Tollerud
Liz Guess
Lisa Freeman
Kelly Wesener Michael
Sol Jensen
Naomoi Bolden
Sanchez Branch
Ashley Hines
Victor Owoeye
Lindsey David
Delaney Drew
Victoria Lang
Stephen Binderup
James Burton, MGMT
Fall-So-Yeun Kim, KNPESpring-TBD
Gary Chen, ISYE
Hamid Bateni, AHCD
Fall-Anne Hanley, HISTSpring-Sean Farrell, HIST
Simon Weffer, SOCI, for Thu
Richard Siegesmund, ARTD
Charles Petersen, OMIS
Fall-Myoungwhon Jung, SEEDSpring-James Cohen, CURR
Robert Tatara, TECH
Bryan Dallas, SIHP
Robert Jones
Alastair Fle

In [202]:
# remove "Web Team"
# NOTE(review): the line above reads like an open to-do; nothing in this cell acts on it.

# Quick check that the pattern strips a single leading "x" from a name
re.sub(r"^x", "", "xFred Barnhart")

'Fred Barnhart'