This notebook extracts every school district, together with its state, from `school-district-list.html`. The HTML file was downloaded from [https://ballotpedia.org/List_of_school_districts_in_the_United_States](https://ballotpedia.org/List_of_school_districts_in_the_United_States)

Input:
- `school-district-list.html`

Output:
- `school_districts.csv`

In [19]:
from bs4 import BeautifulSoup
import pandas as pd

# Parse the saved Ballotpedia page into a BeautifulSoup tree
with open("school-district-list.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# Accumulator for the scraped rows; a later cell appends one dict per district
data = []

# Locate the element with id "headertabs"; its direct child <div>s are
# iterated (one per state) by the cells that follow
headertabs = soup.find(id="headertabs")

# soup.find returns None when nothing matches. Fail here with a clear message
# instead of an AttributeError on `headertabs.find_all` in a later cell.
if headertabs is None:
    raise ValueError(
        'No element with id "headertabs" found in school-district-list.html; '
        "the page layout may have changed or the wrong file was loaded"
    )

In [20]:
# Collect the id of every direct child <div> of "headertabs", then list them
state_ids = [
    child_div.get("id")
    for child_div in headertabs.find_all("div", recursive=False)
]
for state_id in state_ids:
    print(state_id)

Alabama
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
District_of_Columbia
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
New_Hampshire
New_Jersey
New_Mexico
New_York
North_Carolina
North_Dakota
Ohio
Oklahoma
Oregon
Pennsylvania
Rhode_Island
South_Carolina
South_Dakota
Tennessee
Texas
Utah
Vermont
Virginia
Washington
West_Virginia
Wisconsin
Wyoming


In [21]:
import re

# Reset the accumulator in this cell so that re-running it does not append a
# second copy of every row to a list left over from a previous execution.
data = []

# Each direct child <div> of "headertabs" carries a state name as its id
for state_div in headertabs.find_all("div", recursive=False):
    # The id is stored as-is, so multi-word names keep their underscores
    # (e.g. "New_Hampshire")
    state_name = state_div.get("id")
    assert state_name, "state <div> inside #headertabs has no id attribute"

    # Only the first <ul> inside the state's div is searched for links
    ul = state_div.find("ul")
    if ul is None:
        # Report and skip divs that have no district list
        # (District_of_Columbia in the saved page)
        print(f"Schools not found for {state_name}")
        continue

    for a_tag in ul.find_all("a"):
        school_district = a_tag.get_text(strip=True)
        # Collapse any run of whitespace (double spaces, newlines) to one space
        school_district = re.sub(r'\s+', ' ', school_district).strip()
        # One record per link: the district label plus the state it was listed under
        data.append({"school_district": school_district, "state": state_name})

# Build the DataFrame once from the accumulated records
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

Schools not found for District_of_Columbia
                                         school_district    state
0      Alabama Department of Youth Services Schools, ...  Alabama
1                           Dothan City Schools, Alabama  Alabama
2                 Coffee County School District, Alabama  Alabama
3                       Haleyville City Schools, Alabama  Alabama
4                 Geneva County School District, Alabama  Alabama
...                                                  ...      ...
11752     Uinta County School District Number 6, Wyoming  Wyoming
11753  Washakie County School District Number 2, Wyoming  Wyoming
11754    Weston County School District Number 1, Wyoming  Wyoming
11755    Weston County School District Number 7, Wyoming  Wyoming
11756                    Natrona County Schools, Wyoming  Wyoming

[11757 rows x 2 columns]


In [27]:
# How many distinct values appear in the "state" column
# (dropna=False counts a missing value, if any, as its own entry)
df["state"].nunique(dropna=False)

50

In [23]:
# Write the district/state table to CSV, leaving out the DataFrame index
df.to_csv(path_or_buf="school_districts.csv", index=False)

Alabama Department of Youth Services Schools, Alabama
