Search parameters

In [33]:
# Search term; request_search sends it as the "q" query parameter.
keyword = "accounts"
# Date window sent as the "from"/"to" query parameters. Both values are checked
# by is_valid_date_format before any request is made.
start_date = "02/10/2000" # MM/DD/YYYY
end_date = "02/10/2026" # MM/DD/YYYY

In [None]:
# loop for all schools
import csv

# The "has_search_index" column is appended by the index-check cell, which
# reads "boardbook_schools.csv" and writes the augmented copy named below.
# Reading the raw export here would raise KeyError on that column (unless the
# raw file happens to carry it already), so the loop uses the augmented file.
SCHOOLS_CSV = "premier_schools_with_search_index.csv"

with open(SCHOOLS_CSV, newline='') as csvfile:
    reader = csv.DictReader(csvfile)

    for row in reader:
        school_name = row["School Name"]
        index = row["Index"]
        has_search = row["has_search_index"]

        print(f"🔍 Searching for: {school_name} (Index: {index})")

        # The flag is stored as the literal strings "True" / "False".
        if has_search == "True":
            search_school(index)

Get search results from BoardBook's search function

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Site root; the search, agenda and document URLs below are all built from it.
BASE_URL = "https://meetings.boardbook.org"

def is_valid_date_format(date_str: str) -> bool:
    """Return True when ``date_str`` is written exactly as MM/DD/YYYY.

    Only the shape is checked: a two-digit month 01-12, a two-digit day 01-31
    and a four-digit year. Calendar validity is not verified, so a string such
    as "02/31/2000" is still accepted.

    Args:
        date_str: Candidate date string.

    Returns:
        True if the whole string has the MM/DD/YYYY shape, False otherwise.
    """
    pattern = r"(0[1-9]|1[0-2])/(0[1-9]|[12][0-9]|3[01])/\d{4}"
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, which would let "02/10/2000\n" through.
    return re.fullmatch(pattern, date_str) is not None

# Step 1: Search request

def request_search(school_id):
    """Run the keyword search for one school and publish the parsed result page.

    Reads the notebook-level ``keyword``, ``start_date`` and ``end_date``.
    The parsed page is stored in the module-level ``soup`` (the name
    ``parse_search`` reads) and is also returned.

    When a date is malformed or the HTTP request fails, a message is printed
    and an empty document is published instead, so results from a previously
    searched school are never reused.

    Args:
        school_id: Organisation identifier placed in the search URL path.

    Returns:
        The BeautifulSoup document for the result page (empty on failure).
    """
    # parse_search looks up `soup` as a global; assigning it as a plain local
    # here would leave that name undefined (or stale) for the parsing step.
    global soup
    soup = BeautifulSoup("", "html.parser")

    if not is_valid_date_format(start_date):
        print(f"Invalid start date format: {start_date}. Expected MM/DD/YYYY.")
        return soup
    if not is_valid_date_format(end_date):
        print(f"Invalid end date format: {end_date}. Expected MM/DD/YYYY.")
        return soup

    search_url = f"{BASE_URL}/Search/AjaxSearch/{school_id}"
    params = {
        "q": keyword,
        "returnUrl": "",
        # A list value is sent as a repeated query parameter (i=4&i=8&i=9).
        # NOTE(review): presumably selects which indexes to search - confirm.
        "i": [4, 8, 9],
        "from": start_date,
        "to": end_date,
        # "_": "1743820823754"
    }
    try:
        # Bounded wait so one unresponsive host cannot stall a batch run.
        response = requests.get(search_url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Search request failed for {school_id}: {exc}")
        return soup

    soup = BeautifulSoup(response.text, "html.parser")
    return soup

Parse the search results to download the files

In [None]:
import os
import requests
from urllib.parse import urljoin
import re

def download_pdf(url, filename, school_id):
    """Fetch ``url`` and save the response body under ``downloads/<school_id>/``.

    Args:
        url: Address of the file to download.
        filename: Name to save the file as. Only its final path component is
            used, so a name containing directory parts cannot escape the
            school's download folder.
        school_id: Organisation identifier used as the sub-folder name; may be
            a string or an integer.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems or a timeout.
    """
    # str() so integer ids work too; plain "+" would raise TypeError for them.
    target_dir = "downloads/" + str(school_id)
    os.makedirs(target_dir, exist_ok=True)
    # Parts of the filename are extracted from remote URLs; strip any
    # directory components before joining.
    file_path = os.path.join(target_dir, os.path.basename(filename))
    # Bounded wait so a stalled transfer cannot hang the whole run.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(response.content)
    print(f"Downloaded: {file_path}")

# Step 2: Parse results
def parse_search(school_id, results=None):
    """Download every agenda and document linked from a search-result page.

    Each ``/Search/GoToResult/`` link is classified by the ``Index=`` value in
    its href:

    * agenda items: the link is followed, the meeting id is read from the
      redirected URL, and the agenda PDF is downloaded;
    * documents: the linked page is fetched, a ``file=<uuid>`` reference is
      located in its HTML, and that PDF is downloaded;
    * anything else is reported and skipped.

    A failed request for one result is reported and the loop moves on to the
    next result instead of aborting the whole run.

    Args:
        school_id: Organisation identifier used in the download URLs and as
            the download sub-folder.
        results: Parsed result page; any object with a BeautifulSoup-style
            ``select`` method. When omitted, the module-level ``soup`` is used.
    """
    if results is None:
        try:
            results = soup
        except NameError:
            print("No search results to parse; run request_search first.")
            return

    for a in results.select("a[href^='/Search/GoToResult/']"):
        href = a["href"]
        full_url = urljoin(BASE_URL, href)
        index_type = href.split("Index=")[-1].split("&")[0]

        try:
            if "sparqmeetingsagendaitems" in index_type:
                print(f"Agenda item: {full_url}")
                r = requests.get(full_url, allow_redirects=True, timeout=30)

                # The meeting id is only available from the redirect target.
                if r.history and "meeting=" in r.url:
                    # Cut at "&" so later query parameters do not end up in
                    # the id (and therefore in the URL and the file name).
                    meeting_id = r.url.split("meeting=")[-1].split("&")[0]
                    agenda_url = f"{BASE_URL}/Public/DownloadAgenda/{school_id}?meeting={meeting_id}"
                    file_name = f"Agenda_{meeting_id}.pdf"
                    download_pdf(agenda_url, file_name, school_id)
                else:
                    print("No meeting ID found.")

            elif "sparqmeetingsdocuments" in index_type:
                print(f"Document: {full_url}")
                r = requests.get(full_url, timeout=30)
                html = r.text

                # Use regex to find file ID
                match = re.search(r'file=([a-f0-9\-]{36})', html)
                if match:
                    file_id = match.group(1)
                    direct_url = f"{BASE_URL}/Documents/DownloadPDF/{file_id}?org={school_id}"
                    file_name = f"Doc_{file_id}.pdf"
                    download_pdf(direct_url, file_name, school_id)
                else:
                    print("No file ID found via regex.")

            else:
                print(f"Skipping unrecognized index type: {index_type}")
        except requests.RequestException as exc:
            # Covers HTTP errors raised by download_pdf as well as timeouts
            # and connection failures; keep going with the remaining results.
            print(f"Request failed for {full_url}: {exc}")

def search_school(school_id):
    """Run the search and download pipeline for one school.

    Calls request_search and then parse_search, passing the same
    ``school_id`` to both.
    """
    # Fetch the search page first, then parse it and download the results.
    request_search(school_id)
    parse_search(school_id)


In [58]:
import csv
import requests

# Same site root as in the search cell; presumably repeated so this cell can
# run on its own.
BASE_URL = "https://meetings.boardbook.org"

# Function to check if a school has a valid search index (i.e., not redirected)
def check_search_index(school_index):
    """Report whether a school's search endpoint answers without redirecting.

    Args:
        school_index: Organisation identifier placed in the search URL path.

    Returns:
        True when the request completed without any redirect; False when it
        was redirected or when the request failed for any reason (the error
        is printed).
    """
    search_url = f"{BASE_URL}/Search/AjaxSearch/{school_index}"
    try:
        # Redirects are followed so they are recorded in response.history.
        # The timeout keeps one unresponsive host from stalling the scan; a
        # timeout is then handled like any other failure below.
        response = requests.get(search_url, allow_redirects=True, timeout=30)
    except Exception as e:
        print(f"Error checking search index for {school_index}: {e}")
        return False  # If there's any error, assume invalid search index

    # A non-empty history means the request was redirected.
    return not response.history

# Read the original CSV file and write a new CSV with the 'has_search_index' column
input_csv_file = "boardbook_schools.csv"
output_csv_file = "premier_schools_with_search_index.csv"

# Open the input CSV file and the output CSV file
with open(input_csv_file, newline='') as infile, open(output_csv_file, mode='w', newline='') as outfile:
    reader = csv.DictReader(infile)

    # reader.fieldnames is None for an empty input file, and the input may
    # already contain the column; copy the header and append only if needed
    # so neither case crashes or duplicates the column.
    fieldnames = list(reader.fieldnames or [])
    if "has_search_index" not in fieldnames:
        fieldnames.append("has_search_index")
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    writer.writeheader()  # Write the header to the new file

    # Loop through each row and check if the school has a valid search index
    for row in reader:
        school_index = row["Index"]

        # One HTTP request per school; see check_search_index
        has_search_index = check_search_index(school_index)

        # Stored as the literal strings "True" / "False"
        row["has_search_index"] = "True" if has_search_index else "False"

        # Write the row to the output CSV
        writer.writerow(row)

print(f"New CSV file with search index information created: {output_csv_file}")

New CSV file with search index information created: premier_schools_with_search_index.csv


In [None]:
import csv

# Tally the rows of the generated CSV whose 'has_search_index' cell holds the
# string 'True'.
with open("premier_schools_with_search_index.csv", newline='') as infile:
    valid_search_index_count = sum(
        1
        for record in csv.DictReader(infile)
        if record["has_search_index"] == "True"
    )

# Report how many schools were flagged
print(f"{valid_search_index_count} schools have a valid search index.")


In [None]:
import csv

def get_true_schools(file_path):
    """Collect the names of schools flagged "True" in a search-index CSV.

    Args:
        file_path: Path to a CSV file with "School Name" and
            "has_search_index" columns.

    Returns:
        A set of the stripped, non-empty "School Name" values from rows whose
        "has_search_index" cell is exactly the string "True".
    """
    true_schools = set()
    with open(file_path, newline='') as f:
        for row in csv.DictReader(f):
            if row.get("has_search_index") != "True":
                continue
            # DictReader stores None for cells missing from a short row, so a
            # .get() default is never used there; "or" covers that case too.
            school_name = (row.get("School Name") or "").strip()
            if school_name:
                true_schools.add(school_name)
    return true_schools

# Gather the flagged school names from each of the two result files
true_schools_1 = get_true_schools("schools_with_search_index.csv")
true_schools_2 = get_true_schools("premier_schools_with_search_index.csv")

# Keep only the names that are flagged in both files
overlap = true_schools_1.intersection(true_schools_2)

# Report the size of the overlap
print(f"{len(overlap)} schools have 'True' in both files.")