# Scrape BOE agendas

This notebook scrapes the agendas for the Shelby County Board of Education and pulls out 
the motions that were voted on.

In [1]:
import os
from dataclasses import dataclass
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (StaleElementReferenceException,
                                        TimeoutException)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [2]:
# My chromium driver isn't in PATH, for some reason

os.environ['PATH'] += ':/opt/homebrew/bin'

In [13]:
@dataclass
class Motion:
    """A dataclass to store a motion."""
    date: str
    meeting_title: str
    action: str
    motion: str


motions = []
meetings_scanned = set()  # Save the meetings we've already scanned
# so that when we encounter an error, we don't have to start from the beginning

In [9]:
# Start a Selenium driver and navigate to the BOE page
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")

driver = webdriver.Chrome(options=options)
driver.get('https://go.boarddocs.com/tn/scsk12/Board.nsf/Public')
wait = WebDriverWait(driver, 4)

  options.headless = True


In [19]:
# Wait for the BOE page to load and navigate to the Meetings tab
wait.until(EC.visibility_of_element_located((By.LINK_TEXT, 'MEETINGS')))
meetings_button = driver.find_element(By.LINK_TEXT, 'MEETINGS')
meetings_button.click()

In [28]:
# Iterate through the years of meetings
meetings_button.click()

for year in '2018', '2019', '2020', '2021', '2022':
    # For each year, identify and interate through the meetings
    driver.find_element(By.LINK_TEXT, year).click()
    sleep(0.5)
    meeting_titles = []
    for link in driver.find_elements(By.TAG_NAME, 'a'):
        if 'Meeting' in link.text and link.text not in meetings_scanned:
            meeting_titles.append(link.text)

    for meeting_name in tqdm(meeting_titles, desc=year):
        # Navigate to the meeting
        link = driver.find_element(By.LINK_TEXT, meeting_name)
        link.location_once_scrolled_into_view
        link.click()
        date, meeting_title = meeting_name.split('\n')
        if 'Committee' in meeting_title:
            continue
        # wait until the View Agenda button is visible
        sleep(0.1)
        wait.until(
            EC.visibility_of_element_located(
                (By.LINK_TEXT, 'View the\nAgenda')))
        sleep(0.1)
        try:
            driver.find_element(By.LINK_TEXT, 'View the\nAgenda').click()
        except StaleElementReferenceException:
            sleep(0.1)
            driver.find_element(By.LINK_TEXT, 'View the\nAgenda').click()

        try:
            wait.until(
                EC.visibility_of_element_located(
                    (By.CLASS_NAME, "type-action")))
        except TimeoutException:  # no actions in this meeting
            meetings_button.click()
            meetings_scanned.add(meeting_name)
            continue
        sleep(0.1)

        # Iterate through the meeeting items with actions and save the motions
        for action in driver.find_elements(By.CLASS_NAME, "type-action"):
            action.location_once_scrolled_into_view
            if 'REMOVED' in action.text:
                continue
            action.click()
            try:
                while motion.is_displayed():
                    sleep(0.01)  # wait for an old motion to go away
            except:
                pass
            try:
                wait.until(
                    EC.visibility_of_element_located(
                        (By.CLASS_NAME, 'motions')))
            except TimeoutException:  # no motion this action
                continue

            motion = driver.find_element(By.CLASS_NAME, 'motions')
            motions.append(
                Motion(date, meeting_title, action.text, motion.text))
        # Record that a meeting has been scanned so that we can skip it if we have to restart
        meetings_scanned.add(meeting_name)
        meetings_button.click()

2018: 100%|██████████| 65/65 [00:10<00:00,  6.49it/s]
2019: 100%|██████████| 62/62 [00:09<00:00,  6.69it/s]
2020: 100%|██████████| 84/84 [02:35<00:00,  1.85s/it]
2021: 100%|██████████| 80/80 [02:32<00:00,  1.91s/it]
2022: 100%|██████████| 77/77 [03:31<00:00,  2.75s/it]


In [31]:
# Save the motions to a CSV
pd.DataFrame(motions).drop_duplicates().to_csv('boe-motions.csv', index=False)