This notebook takes a look at the scrapped results from the BoardDocs website and gets the correct addresses for each row.

Input:
- `prelim_results.csv`

In [559]:
import pandas as pd

df = pd.read_csv("prelim_results.csv")
df.head()

Unnamed: 0,URL,title_1,title_2,home_website
0,https://go.boarddocs.com/pa/shun/Board.nsf/Public,School Board Policy Manual,Southern Huntingdon County School District,http://www.shcsd.org
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,School Board Policy Manual,,www.calsd.org
2,https://go.boarddocs.com/ca/mbusd/Board.nsf/Public,"325 S. Peck Ave | Manhattan Beach, CA 90266 | (310) 318-7345 | f (310) 303-3822",Manhattan Beach Unified School District,http://www.mbusd.org
3,https://go.boarddocs.com/il/bhsd228/Board.nsf/Public,Bremen High School District 228,"15233 S. Pulaski Rd, Midlothian, IL 60445 | 708-389-1175",https://www.bhsd228.com/
4,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,Board Policies and Guidelines,St. Joseph Public Schools,https://www.sjschools.org/


In [560]:
# consider the 5-digit zip code approach
# check which rows have 5-digit codes in Title1, and those with them in Title2.
# Hopefully this will be a partition

# Define a regex pattern to match a 5-digit zip code
zip_code_pattern = r'\b\d{5}\b'

# Find rows where Title1 has a 5-digit zip code
df['Title1_has_zipcode'] = df['title_1'].str.contains(zip_code_pattern, na=False)

# Find rows where Title2 has a 5-digit zip code
df['Title2_has_zipcode'] = df['title_2'].str.contains(zip_code_pattern, na=False)

In [561]:
# check if it is a partition
# first check if they add up

import numpy as np

print(f"Num of rows where title 1 has zipcode {df['Title1_has_zipcode'].sum()}")
print(f"Num of rows where title 2 has zipcode {df['Title2_has_zipcode'].sum()}")
print(f"Num of rows where title 1 or title 2 has zipcode {np.sum(df['Title1_has_zipcode'] | df['Title2_has_zipcode'])}")
print(f"Num of rows where title 1 and title 2 have zipcode {np.sum(df['Title1_has_zipcode'] & df['Title2_has_zipcode'])}")
print(f"Num of total rows {df.shape[0]}")

Num of rows where title 1 has zipcode 1325
Num of rows where title 2 has zipcode 1298
Num of rows where title 1 or title 2 has zipcode 2623
Num of rows where title 1 and title 2 have zipcode 0
Num of total rows 3523


In [562]:
# ok great, there are no rows where you can find zipcodes on both cols
# but some rows don't have zipcodes in either
# let's check them out

no_zipcode_df = df[~(df['Title1_has_zipcode'] | df['Title2_has_zipcode'])]
print(f"Num of rows without zipcode {no_zipcode_df.shape[0]}")
no_zipcode_df.sample(5)

Num of rows without zipcode 900


Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
480,https://go.boarddocs.com/ny/greece/Board.nsf/Public,GREECE CENTRAL SCHOOL DISTRICT,Board of Education,http://web001.greece.k12.ny.us/,False,False
299,https://go.boarddocs.com/oh/bri/Board.nsf/Public,Board Policies and By-Laws,Bridgeport Exempted Village School District,https://www.bevs.k12.oh.us,False,False
1168,https://go.boarddocs.com/oh/pdys/Board.nsf/Public,Policy Manual,Pike-Delta-York Local School District,http://pdys.org/,False,False
1668,https://go.boarddocs.com/pa/seli/Board.nsf/Public,School Board Policy Manual,Selinsgrove Area School District,http://www.seal-pa.org,False,False
1111,https://go.boarddocs.com/pa/shan/Board.nsf/Public,Shanksville-Stonycreek School District,School Board Information and Policy Manual,https://www.sssd.com/,False,False


In [563]:
# some NaNs, some "Policy Manual", "School Board Policy and Guidelines", etc
# let's check the most common values

no_zipcode_df["title_1"].value_counts().head()

title_1
School Board Policy Manual              100
Policy Manual                            48
School Board Policies and Guidelines     35
School Board Policies                    18
BoardDocs PL                              8
Name: count, dtype: int64

In [564]:
no_zipcode_df["title_2"].value_counts().head()

title_2
                                    20
Board Policies                      18
School Board Policies               15
Board of Education                   9
Board of Education Policy Manual     8
Name: count, dtype: int64

In [565]:
# ok, now let's check the website col
sum(df["home_website"].isna())

57

In [566]:
# ok, unfortunately there are boarddocs without the home website linked
print(f"Percent of websites without links to official: {sum(df["home_website"].isna())/df.shape[0]*100:.3}%")

Percent of websites without links to official: 1.62%


In [567]:
# but thankfully this number is small
# let's take a look at these websites

# do not truncate the col values in display
pd.set_option('display.max_colwidth', None)

df[df["home_website"].isna()].head()

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
217,https://go.boarddocs.com/pa/camb/Board.nsf/Public,,,,False,False
399,https://go.boarddocs.com/oh/oakhil/Board.nsf/Public,,,,False,False
414,https://go.boarddocs.com/oh/warrenoh/Board.nsf/Public,,,,False,False
416,https://go.boarddocs.com/pa/marp/Board.nsf/Public,School Board Policy Manual,Marple Newtown School District,,False,False
481,https://go.boarddocs.com/oh/meigs/Board.nsf/Public,,,,False,False


In [568]:
# after inspecting a few, it seems like they will usually write their school district as the h1 tag at least.

In [569]:
# back to the address
# an observation is that I don't think those that don't contain zip codes will have addresses on the website
# let's check if a single number exists in them

number_pattern = r'\d'
print("Number of no-zipcode rows that contain a number")
sum(no_zipcode_df["title_1"].str.contains(number_pattern, na=False))

Number of no-zipcode rows that contain a number


44

In [570]:
no_zipcode_df[no_zipcode_df["title_1"].str.contains(number_pattern, na=False)].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
2679,https://go.boarddocs.com/il/rlasd/Board.nsf/Public,Round Lake Area School District CUSD 116,,http://www.rlas-116.org,False,False
648,https://go.boarddocs.com/il/thsd211/Board.nsf/Public,"G.A. McElroy Administration Center, 1750 South Roselle Road, Palatine, Illinois",Board of Education,http://www.adc.d211.org,False,False
1926,https://go.boarddocs.com/oh/polaris/Board.nsf/Public,"7285 Old Oak Blvd., | Middleburg Heights, OH | 440-891-7600",Polaris Career Center,http://www.polaris.edu/,False,False
990,https://go.boarddocs.com/la/pcpsb/Board.nsf/Public,"337 Napoleon Street ● New Roads, Louisiana ● p 225-638-8674 ● f 225-638-3237",Pointe Coupee Parish School System,http://www.pcpsb.net/,False,False
2239,https://go.boarddocs.com/in/brem/Board.nsf/Public,Bremen Public Schools | Phone: (574) 546-3929 | Fax: (574) 546-6303 | School Board Policies and Guidelines,,https://www.bps.k12.in.us,False,False


In [571]:
no_zipcode_df[no_zipcode_df["title_2"].str.contains(number_pattern, na=False)].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
2491,https://go.boarddocs.com/pa/iu12/Board.nsf/Public,Board Policy Manual,Lincoln Intermediate Unit 12,https://www.iu12.org/Page/9,False,False
926,https://go.boarddocs.com/mi/wake/Board.nsf/Public,Wakefield-Marenisco School District,(906) 224-7211,http://www.wmschools.org,False,False
641,https://go.boarddocs.com/mi/brand/Board.nsf/Public,Brandywine Community Schools,269-684-7150,http://www.brandywinebobcats.org,False,False
3046,https://go.boarddocs.com/pa/iu07/Board.nsf/Public,Policy Manual,Westmoreland Intermediate Unit 7,https://go.boarddocs.com/pa/iu07/,False,False
2069,https://go.boarddocs.com/mi/clark/Board.nsf/Public,Clarkston Community Schools,248-623-5400,http://www.clarkston.k12.mi.us,False,False


In [572]:
# it turns out that they could either be an address with a missing zip code, or phone numbers
# there are also misc cases

# let's get a conservative (high) bound on the number of addresses that we will miss

num_no_zipcode_with_number = no_zipcode_df[no_zipcode_df["title_1"].str.contains(number_pattern, na=False) | no_zipcode_df["title_2"].str.contains(number_pattern, na=False)].shape[0]
num_no_zipcode_with_number

97

In [573]:
num_zipcode = df.shape[0] - no_zipcode_df.shape[0]

In [574]:
print(f"Worst case proportion of addresses that we will miss {num_no_zipcode_with_number/(num_no_zipcode_with_number+num_zipcode)*100:.2}%")

Worst case proportion of addresses that we will miss 3.6%


In [575]:
# we put the ones we know are correct at a new address field

df["address"] = None
df.loc[df['Title1_has_zipcode'], "address"] = df["title_1"]
df.loc[df['Title2_has_zipcode'], "address"] = df["title_2"]
df.sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address
379,https://go.boarddocs.com/mo/nwr1/Board.nsf/Public,Northwest School District,"4290 Gravois Road, House Springs, MO 63051",www.northwestschools.net,False,True,"4290 Gravois Road, House Springs, MO 63051"
211,https://go.boarddocs.com/ny/oriskany/Board.nsf/Public,Oriskany Central School District,"1313 Utica Street Oriskany, NY 13424 | 315.768.2058",https://www.oriskanycsd.org,False,True,"1313 Utica Street Oriskany, NY 13424 | 315.768.2058"
1832,https://go.boarddocs.com/il/ipsd/Board.nsf/Public,"780 Shoreline Drive |Aurora, IL 60504 | 630.375.3000","Indian Prairie School District 204, serving Naperville, Aurora, Bolingbrook, & Plainfield, IL.",http://www.ipsd.org,True,False,"780 Shoreline Drive |Aurora, IL 60504 | 630.375.3000"
1576,https://go.boarddocs.com/in/cps/Board.nsf/Public,Crown Point Community School Corporation,,http://www.cps.k12.in.us,False,False,
494,https://go.boarddocs.com/il/vose/Board.nsf/Public,Village of South Elgin,"10 N. Water Street South Elgin, IL 60177-1602 | 847-742-5780",https://www.southelgin.com,False,True,"10 N. Water Street South Elgin, IL 60177-1602 | 847-742-5780"


In [576]:
# now, let's try to get the school district name.
# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

In [577]:
# number of rows with schools in at least one col
df[df["title_1_has_school"] | df["title_2_has_school"]].shape[0]

2885

In [578]:
# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

325

In [579]:
# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
1261,https://go.boarddocs.com/mi/rave/Board.nsf/Public,School Board Policies and Guidelines,Ravenna Public Schools,http://www.ravennaschools.org,False,False,,True,True
1354,https://go.boarddocs.com/in/ncsc/Board.nsf/Public,School Board Policies,Nettle Creek School Corporation,https://www.nettlecreek.k12.in.us,False,False,,True,True
3029,https://go.boarddocs.com/ny/brcsny/Board.nsf/Public,Bolivar-Richburg Central School District,"100 School Street, Bolivar, NY 14715 Phone: 585-928-2561",http://www.brcs.wnyric.org,False,True,"100 School Street, Bolivar, NY 14715 Phone: 585-928-2561",True,True
3103,https://go.boarddocs.com/pa/stlv/Board.nsf/Public,School Board Policy Manual,Steel Valley School District,http://steelvalleysd.org/,False,False,,True,True
2138,https://go.boarddocs.com/pa/blkv/Board.nsf/Public,School Board Policy Manual,Blacklick Valley School District,http://www.bvsd.k12.pa.us,False,False,,True,True


In [580]:
# there is boilerplate text like School Board Policies
# let's try to get the top few popular ones and remove them

df["title_1"].value_counts().head(10)

title_1
School Board Policy Manual              103
Policy Manual                            49
School Board Policies and Guidelines     35
School Board Policies                    18
Board Policy and Guidelines               8
BoardDocs PL                              8
School Board Policies & Bylaws            6
School Board Policies and Bylaws          6
Board Policy                              5
Board Policies and Guidelines             5
Name: count, dtype: int64

In [581]:
df["title_2"].value_counts().head(10)

title_2
                                    22
Board Policies                      19
School Board Policies               16
Board of Education                  12
NEOLA Board Policies                 8
Board of Education Policies          8
Board of Education Policy Manual     8
School Board Policy Manual           5
Policy Manual                        4
eGovernance Site                     4
Name: count, dtype: int64

In [582]:
# let's remove these

remove_title_1_list = df["title_1"].value_counts().head(10).index
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
School District Policies                               5
Board Policies and Bylaws                              4
Board Policies                                         4
School Board Policies & Guidelines                     4
eGovernance Site                                       4
Board Policy Manual                                    4
Board Policies & Bylaws                                3
School Board Policy                                    3
School Board Policies and Administrative Guidelines    3
Board of Education Policies and Guidelines             3
Name: count, dtype: int64

In [583]:
# let's remove these
remove_title_1_list = df["title_1"].value_counts().head(10).index
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
                                                                     3
School District Policy Manual                                        3
School Board Policy and Guidelines                                   3
School Board Bylaws and Policies                                     2
Cleveland Metropolitan School District                               2
Board of Education                                                   2
Board Policies & Guidelines                                          2
1300 Sherman Street Ste 222 | Sturgis, SD  57785                     2
Policy Manual and Agendas                                            2
60 Jefferson Street, Suite 3• Monticello, NY 12701 • 845-794-7700    2
Name: count, dtype: int64

In [584]:
# let's remove the first few
remove_title_1_list = df["title_1"].value_counts().head(7).index
remove_title_1_list

Index([' ', 'School District Policy Manual',
       'School Board Policy and Guidelines',
       'School Board Bylaws and Policies',
       'Cleveland Metropolitan School District', 'Board of Education',
       'Board Policies & Guidelines'],
      dtype='object', name='title_1')

In [585]:
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
School Board By-Laws and Policies                                                              2
1300 Sherman Street Ste 222 | Sturgis, SD  57785                                               2
60 Jefferson Street, Suite 3• Monticello, NY 12701 • 845-794-7700                              2
Policy Manual and Agendas                                                                      2
1725 North Dodge St. |  Iowa City, IA 52245 | p (319) 688-1000 | f (319) 688-1009              2
Board of Education Policies                                                                    2
1290 Ridder Park Drive | San Jose, CA 95131-2304 | (408) 453-6500                              2
6301 Springside Avenue | Downers Grove, IL 60516 | Ph: (630) 795-7100  | Fx: (630) 795-7199    2
School Board Policy and Bylaws                                                                 2
200 Reid Street | Palatka, FL 32177 | (386) 329-0602                                           2
Name: count, dtype: in

In [586]:
# pick those
remove_title_1_list = [
    "School Board By-Laws and Policies",
    "Policy Manual and Agendas",
    "Board of Education Policies",
    "School Board Policy and Bylaws",
    "Board Policy and Bylaws",
    "Board Policies and By-Laws",
    "School Board Policies "
]
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts()

title_1
60 Jefferson Street, Suite 3• Monticello, NY 12701 • 845-794-7700                              2
200 Reid Street | Palatka, FL 32177 | (386) 329-0602                                           2
6301 Springside Avenue | Downers Grove, IL 60516 | Ph: (630) 795-7100  | Fx: (630) 795-7199    2
2680 West County Road 476 | Bushnell, Florida 33513 | Ph: 352-793-2315  Fx: 352-793-4180       2
1300 Sherman Street Ste 222 | Sturgis, SD  57785                                               2
                                                                                              ..
East Valley School District #361                                                               1
Moraga School District                                                                         1
1255 Superior Ave | Calumet City, IL 60409 | (708) 868-7500                                    1
Hawthorn School District 73                                                                    1
41770 Baldridge Street

In [587]:
# do this for title_2
# let's remove these
# NOTE: NEOLOA Board Policies might provide coarse information on whether the school could be located

remove_title_2_list = df["title_2"].value_counts().head(10).index
remove_title_2_list

Index([' ', 'Board Policies', 'School Board Policies', 'Board of Education',
       'NEOLA Board Policies', 'Board of Education Policies',
       'Board of Education Policy Manual', 'School Board Policy Manual',
       'Policy Manual', 'eGovernance Site'],
      dtype='object', name='title_2')

In [588]:
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().head(10)

title_2
Board of Education Meetings                              3
Board of Directors                                       3
Board of Education Policy                                3
Bylaws & Policies                                        3
Central Valley School District                           2
Iowa City Community School District                      2
Arlington Public Schools                                 2
Meetings and Information                                 2
8485 Homestead, Zeeland, MI 49464 Phone: 616-748-5637    2
Meade County, South Dakota                               2
Name: count, dtype: int64

In [589]:
remove_title_2_list = df["title_2"].value_counts().head(4).index
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().head(10)

title_2
Green Local Schools                    2
Putnam County School District          2
Iowa City Community School District    2
Meetings and Information               2
Arlington Public Schools               2
Neola Board Policies                   2
Meade County, South Dakota             2
Central Valley School District         2
Meetings, Agendas and Information      2
Board Policy Manual                    2
Name: count, dtype: int64

In [590]:
remove_title_2_list = [
    "Meetings and Information",
    "Meetings, Agendas and Information",
    "Board Policy Manual"
]
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().head(10)

title_2
Community High School District 99                        2
NEOLA Board Policy                                       2
Iowa City Community School District                      2
Meade County, South Dakota                               2
Neola Board Policies                                     2
Board of Education Policies                              2
Central Valley School District                           2
Green Local Schools                                      2
8485 Homestead, Zeeland, MI 49464 Phone: 616-748-5637    2
Sumter District Schools                                  2
Name: count, dtype: int64

In [None]:
# now, we check for the intersections again