This notebook takes a look at the scrapped results from the BoardDocs website and gets the correct addresses for each row.

Input:
- `prelim_results.csv`

In [2149]:
import pandas as pd

df = pd.read_csv("prelim_results.csv")
df.head()

Unnamed: 0,URL,title_1,title_2,home_website
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,Board Policies and Guidelines,St. Joseph Public Schools,https://www.sjschools.org/
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,School Board Policy Manual,,www.calsd.org
2,https://go.boarddocs.com/oh/mapleheights/Board.nsf/Public,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",http://www.mapleschools.com
3,https://go.boarddocs.com/oh/rlsd/Board.nsf/Public,"585 Riverside Drive | Painesville, Ohio 44077 | 440.352.0668 | f 440.639.1959",Riverside Local School District,https://www.riversidelocalschools.com/
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,School Board Policy Manual,Southern Huntingdon County School District,http://www.shcsd.org


In [2150]:
# we remove redundant info
df.loc[df["title_1"]==df["title_2"], "title_1"] = None

In [2151]:
# trim whitespace
df.loc[:, "title_1"] = df["title_1"].str.strip()
df.loc[:, "title_2"] = df["title_2"].str.strip()

In [2152]:
# consider the 5-digit zip code approach
# check which rows have 5-digit codes in Title1, and those with them in Title2.
# Hopefully this will be a partition

# Define a regex pattern to match a 5-digit zip code
zip_code_pattern = r'\b\d{5}\b'

# Find rows where Title1 has a 5-digit zip code
df['Title1_has_zipcode'] = df['title_1'].str.contains(zip_code_pattern, na=False)

# Find rows where Title2 has a 5-digit zip code
df['Title2_has_zipcode'] = df['title_2'].str.contains(zip_code_pattern, na=False)

In [2153]:
# check if it is a partition
# first check if they add up

import numpy as np

print(f"Num of rows where title 1 has zipcode {df['Title1_has_zipcode'].sum()}")
print(f"Num of rows where title 2 has zipcode {df['Title2_has_zipcode'].sum()}")
print(f"Num of rows where title 1 or title 2 has zipcode {np.sum(df['Title1_has_zipcode'] | df['Title2_has_zipcode'])}")
print(f"Num of rows where title 1 and title 2 have zipcode {np.sum(df['Title1_has_zipcode'] & df['Title2_has_zipcode'])}")
print(f"Num of total rows {df.shape[0]}")

Num of rows where title 1 has zipcode 1440
Num of rows where title 2 has zipcode 1442
Num of rows where title 1 or title 2 has zipcode 2882
Num of rows where title 1 and title 2 have zipcode 0
Num of total rows 3906


In [2154]:
# ok great, there are no rows where you can find zipcodes on both cols
# but some rows don't have zipcodes in either
# let's check them out

no_zipcode_df = df[~(df['Title1_has_zipcode'] | df['Title2_has_zipcode'])]
print(f"Num of rows without zipcode {no_zipcode_df.shape[0]}")
no_zipcode_df.sample(5)

Num of rows without zipcode 1024


Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
2359,https://go.boarddocs.com/wi/thawk/Board.nsf/Public,School Board Policies and Guidelines,School District of Tomahawk,https://www.tomahawk.k12.wi.us,False,False
1760,https://go.boarddocs.com/oh/sthrn/Board.nsf/Public,School board policies and manuals,Southern Local Schools,http://www.southern.k12.oh.us,False,False
1370,https://go.boarddocs.com/wi/hnrsd/Board.nsf/Public,Herman-Neosho-Rubicon School District,,www.hnrschools.org,False,False
536,https://go.boarddocs.com/mi/shep/Board.nsf/Public,Shepherd Public Schools,School Board Policies,http://www.shepherdschools.net,False,False
1904,https://go.boarddocs.com/in/lawrenc/Board.nsf/Public,Lawrenceburg Community School Corporation,Corporation Policies,http://www.lburg.k12.in.us,False,False


In [2155]:
# let's get the proportions


print(f"Percentage of rows where title 1 has zipcode {df['Title1_has_zipcode'].sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows where title 2 has zipcode {df['Title2_has_zipcode'].sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows where title 1 or title 2 has zipcode {np.sum(df['Title1_has_zipcode'] | df['Title2_has_zipcode']).sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows where title 1 and title 2 have zipcode {np.sum(df['Title1_has_zipcode'] & df['Title2_has_zipcode']).sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows with no zipcodes {no_zipcode_df.shape[0]/df.shape[0]*100:.2f}%")


Percentage of rows where title 1 has zipcode 36.87%
Percentage of rows where title 2 has zipcode 36.92%
Percentage of rows where title 1 or title 2 has zipcode 73.78%
Percentage of rows where title 1 and title 2 have zipcode 0.00%
Percentage of rows with no zipcodes 26.22%


In [2156]:
# some NaNs, some "Policy Manual", "School Board Policy and Guidelines", etc
# let's check the most common values

no_zipcode_df["title_1"].value_counts().head()

title_1
School Board Policy Manual              115
Policy Manual                            54
School Board Policies and Guidelines     41
School Board Policies                    22
BoardDocs PL                              9
Name: count, dtype: int64

In [2157]:
no_zipcode_df["title_2"].value_counts().head()

title_2
Board Policies                 24
                               22
School Board Policies          17
Board of Education             11
Board of Education Policies    11
Name: count, dtype: int64

In [2158]:
# ok, now let's check the website col
sum(df["home_website"].isna())

53

In [2159]:
# ok, unfortunately there are boarddocs without the home website linked
print(f"Percent of websites without links to official: {sum(df["home_website"].isna())/df.shape[0]*100:.3}%")

Percent of websites without links to official: 1.36%


In [2160]:
# but thankfully this number is small
# let's take a look at these websites

# do not truncate the col values in display
pd.set_option('display.max_colwidth', None)

df[df["home_website"].isna()].head()

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
232,https://go.boarddocs.com/pa/camb/Board.nsf/Public,,,,False,False
443,https://go.boarddocs.com/oh/oakhil/Board.nsf/Public,,,,False,False
465,https://go.boarddocs.com/oh/warrenoh/Board.nsf/Public,,,,False,False
468,https://go.boarddocs.com/pa/marp/Board.nsf/Public,School Board Policy Manual,Marple Newtown School District,,False,False
533,https://go.boarddocs.com/oh/meigs/Board.nsf/Public,,,,False,False


In [2161]:
# after inspecting a few, it seems like they will usually write their school district as the h1 tag at least.

In [2162]:
# back to the address
# an observation is that I don't think those that don't contain zip codes will have addresses on the website
# let's check if a single number exists in them

number_pattern = r'\d'
print("Number of no-zipcode rows that contain a number")
sum(no_zipcode_df["title_1"].str.contains(number_pattern, na=False))

Number of no-zipcode rows that contain a number


46

In [2163]:
no_zipcode_df[no_zipcode_df["title_1"].str.contains(number_pattern, na=False)].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
2270,https://go.boarddocs.com/pa/ewes/Board.nsf/Public,4904 Route 982,,http://www.ewctc.net,False,False
66,https://go.boarddocs.com/il/wilmette39/Board.nsf/Public,Wilmette Public School District 39,,https://www.wilmette39.org,False,False
3296,https://go.boarddocs.com/oh/poland/Board.nsf/Public,"3199 Dobbins Avenue, Poland, OH | Phone: 330-757-7000","Poland Local School District - Poland, OH",www.polandbulldogs.com,False,False
3150,https://go.boarddocs.com/co/asd20/Board.nsf/Public,Academy District 20 Board of Education,,http://www.asd20.org,False,False
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,POLICY,http://www.hartlandschools.us,False,False


In [2164]:
no_zipcode_df[no_zipcode_df["title_2"].str.contains(number_pattern, na=False)].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
1203,https://go.boarddocs.com/in/brownsburg/Board.nsf/Public,Brownsburg Community School Corporation,"310 Stadium Drive Brownsburg, IN",https://www.brownsburg.k12.in.us/,False,False
2779,https://go.boarddocs.com/pa/iu12/Board.nsf/Public,Board Policy Manual,Lincoln Intermediate Unit 12,https://www.iu12.org/Page/9,False,False
2995,https://go.boarddocs.com/mi/ishp/Board.nsf/Public,School Board Policies & Bylaws,Ishpeming Public School District No. 1,http://www.ishpemingschools.org,False,False
2525,https://go.boarddocs.com/nj/burlingtontwp/Board.nsf/Public,Burlington Township School District,(609) 387-3955,https://burltwpsch.org/,False,False
479,https://go.boarddocs.com/pa/iu06/Board.nsf/Public,"Meetings, Agendas, Policy Manual",Riverview Intermediate Unit 6,http://www.riu6.org,False,False


In [2165]:
# it turns out that they could either be an address with a missing zip code, or phone numbers
# there are also misc cases

# let's get a conservative (high) bound on the number of addresses that we will miss

num_no_zipcode_with_number = no_zipcode_df[no_zipcode_df["title_1"].str.contains(number_pattern, na=False) | no_zipcode_df["title_2"].str.contains(number_pattern, na=False)].shape[0]
num_no_zipcode_with_number

105

In [2166]:
num_zipcode = df.shape[0] - no_zipcode_df.shape[0]

In [2167]:
print(f"Worst case proportion of addresses that we will miss {num_no_zipcode_with_number/(num_no_zipcode_with_number+num_zipcode)*100:.2}%")

Worst case proportion of addresses that we will miss 3.5%


In [2168]:
# we put the ones we know are correct at a new address field

df["address"] = None
df.loc[df['Title1_has_zipcode'], "address"] = df["title_1"]
df.loc[df['Title2_has_zipcode'], "address"] = df["title_2"]
df.sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address
2667,https://go.boarddocs.com/mi/compk/Board.nsf/Public,Comstock Park Public Schools,"101 School Street NE Comstock Park, MI 49321",http://www.cppschools.com/,False,True,"101 School Street NE Comstock Park, MI 49321"
2012,https://go.boarddocs.com/in/ensc/Board.nsf/Public,School Board Policies,East Noble School Corporation,http://www.eastnoble.net,False,False,
3084,https://go.boarddocs.com/nc/cmsnc/Board.nsf/Public,Charlotte-Mecklenburg Schools,"P.O. Box 30035, Charlotte, NC 28230",http://www.cms.k12.nc.us,False,True,"P.O. Box 30035, Charlotte, NC 28230"
1850,https://go.boarddocs.com/ca/bpusd/Board.nsf/Public,Baldwin Park Unified School District CA,"3699 North Holly Avenue Baldwin Park, CA 91706 | (626) 962-3311",https://www.bpusd.net/,False,True,"3699 North Holly Avenue Baldwin Park, CA 91706 | (626) 962-3311"
2159,https://go.boarddocs.com/wi/waus/Board.nsf/Public,Wausau School District,Board of Education Policy Manual,http://www.wausauschools.org,False,False,


In [2169]:
# now, let's try to get the school district name.
# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

In [2170]:
# number of rows with schools in at least one col
df[df["title_1_has_school"] | df["title_2_has_school"]].shape[0]

3206

In [2171]:
# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

356

In [2172]:
# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
325,https://go.boarddocs.com/mi/jacks/Board.nsf/Public,School Board Policies and Administrative Guidelines,Jackson Public Schools,https://www.jpsk12.org,False,False,,True,True
2869,https://go.boarddocs.com/pa/hamb/Board.nsf/Public,School Board Policy Manual,Hamburg Area School District,http://www.hasdhawks.org,False,False,,True,True
842,https://go.boarddocs.com/pa/blur/Board.nsf/Public,School Board Policy Manual,Blue Ridge School District,http://www.brsd.org,False,False,,True,True
200,https://go.boarddocs.com/oh/beaverls/Board.nsf/Public,School Board Policies,Beaver Local Schools,http://www.beaver.k12.oh.us,False,False,,True,True
3714,https://go.boarddocs.com/mi/glads/Board.nsf/Public,School Board By-Laws and Policies,Gladstone Area Schools,http://www.gladstoneschools.com,False,False,,True,True


In [2173]:
# there is boilerplate text like School Board Policies
# let's try to get the top few popular ones and remove them

df["title_1"].value_counts().head(10)

title_1
School Board Policy Manual              118
Policy Manual                            55
School Board Policies and Guidelines     41
School Board Policies                    22
BoardDocs PL                              9
Board Policy and Guidelines               8
Board Policies                            7
School Board Policies and Bylaws          7
School Board Policies & Bylaws            6
Board Policies and Guidelines             6
Name: count, dtype: int64

In [2174]:
df["title_2"].value_counts().head(10)

title_2
Board Policies                      25
                                    25
School Board Policies               18
Board of Education                  14
Board of Education Policies         12
Board of Education Policy Manual    10
NEOLA Board Policies                10
Policy Manual                        6
School Board Policy Manual           6
Board Policy Manual                  5
Name: count, dtype: int64

In [2175]:
# let's remove these

remove_title_1_list = df["title_1"].value_counts().head(10).index
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
Board Policy                          6
School District Policies              6
Board Policy Manual                   5
Board Policies and Bylaws             5
School Board Policies & Guidelines    4
eGovernance Site                      4
School Board Policy                   4
School District Policy Manual         3
Board Policies & Bylaws               3
Board Policy and Bylaws               3
Name: count, dtype: int64

In [2176]:
# let's remove these
remove_title_1_list = df["title_1"].value_counts().head(10).index
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
Board of Education Policies and Guidelines                                                  3
School Board Policy and Guidelines                                                          3
School Board Policies and Administrative Guidelines                                         3
                                                                                            3
Board of Education Policies                                                                 3
2680 West County Road 476 | Bushnell, Florida 33513 | Ph: 352-793-2315  Fx: 352-793-4180    2
Bloomfield School District                                                                  2
1290 Ridder Park Drive | San Jose, CA 95131-2304 | (408) 453-6500                           2
School Board Policy & Guidelines                                                            2
1725 North Dodge St. |  Iowa City, IA 52245 | p (319) 688-1000 | f (319) 688-1009           2
Name: count, dtype: int64

In [2177]:
# let's remove the first few
remove_title_1_list = df["title_1"].value_counts().head(5).index
remove_title_1_list

Index(['Board of Education Policies and Guidelines',
       'School Board Policy and Guidelines',
       'School Board Policies and Administrative Guidelines', '',
       'Board of Education Policies'],
      dtype='object', name='title_1')

In [2178]:
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
2680 West County Road 476 | Bushnell, Florida 33513 | Ph: 352-793-2315  Fx: 352-793-4180    2
1300 Sherman Street Ste 222 | Sturgis, SD  57785                                            2
Bloomfield School District                                                                  2
1290 Ridder Park Drive | San Jose, CA 95131-2304 | (408) 453-6500                           2
School Board Policy & Guidelines                                                            2
1725 North Dodge St. |  Iowa City, IA 52245 | p (319) 688-1000 | f (319) 688-1009           2
Electronic Governance System                                                                2
Policy Manual and Agendas                                                                   2
Board of Education                                                                          2
60 Jefferson Street, Suite 3• Monticello, NY 12701 • 845-794-7700                           2
Name: count, dtype: int64

In [2179]:
# pick those
remove_title_1_list = [
    "School Board By-Laws and Policies",
    "School Board policies and guidelines",
    "Policy Manual and Agendas",
    "Board of Education Policies",
    "School Board Policy and Bylaws",
    "Board Policy and Bylaws",
    "Board Policies and By-Laws",
    "School Board Policies ",
    "Board of Education",
    "School Board Policies and Guidlines",
    "School Board Bylaws and Policies",
    "SCHOOL BOARD POLICIES",
    "Board Policies & Guidelines",
    "School Board Policy & Guidelines",
    "Electronic Governance System",
    "School Board Agendas and Minutes",
    "School Board Policy and Administrative Guidelines",
    "School District Policies / Policy Manual",
    "School Policy Manual",
    "School Board Meetings",
    "School Board Policies & Ad Guidelines",
    "School Board meetings, agendas and policies",
    "School Board Policies and By Laws",
    "School board policies and manuals",
    "Board of School Directors",
    "School Board Agendas, Minutes, Policies and Guidelines",
    "Board Approved School Policy",
    "School Board Policies & Administrative Guidelines",
    "School Board Policy",
    "School Board Agendas, Policies and Guidelines",
    "School Board Bylaws/Policies/Guidelines",
    "SchoolBoard Policy Manual",
    "School Board Policy and Guidlines",
    "School Board Policy",
    "Governing Board Policy",
    "Board policy and guidelines",
    "Policies & By Laws",
    "Board of Trustees Policies and Administrative Guidelines",
    "Board of Education Policy and Bylaws"
    
]
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
print(df["title_1"].value_counts().index[:10])

Index(['2680 West County Road 476 | Bushnell, Florida 33513 | Ph: 352-793-2315  Fx: 352-793-4180',
       '200 Reid Street | Palatka, FL 32177 | (386) 329-0602',
       '60 Jefferson Street, Suite 3• Monticello, NY 12701 • 845-794-7700',
       'Bloomfield School District',
       '1290 Ridder Park Drive | San Jose, CA 95131-2304 | (408) 453-6500',
       '1300 Sherman Street Ste 222 | Sturgis, SD  57785',
       '315 N. French Avenue | Arlington  WA 98223 | 360.618.6200 | f 360.618.6221',
       '1725 North Dodge St. |  Iowa City, IA 52245 | p (319) 688-1000 | f (319) 688-1009',
       'Cleveland Metropolitan School District',
       '6301 Springside Avenue | Downers Grove, IL 60516 | Ph: (630) 795-7100  | Fx: (630) 795-7199'],
      dtype='object', name='title_1')


In [2180]:
# do this for title_2
# let's remove these
# NOTE: NEOLOA Board Policies might provide coarse information on whether the school could be located

remove_title_2_list = df["title_2"].value_counts().head(10).index
remove_title_2_list

Index(['Board Policies', '', 'School Board Policies', 'Board of Education',
       'Board of Education Policies', 'Board of Education Policy Manual',
       'NEOLA Board Policies', 'Policy Manual', 'School Board Policy Manual',
       'Board Policy Manual'],
      dtype='object', name='title_2')

In [2181]:
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().head(10)

title_2
eGovernance Site                 4
Bylaws & Policies                4
Board of Education Policy        4
Board of Education Meetings      3
Board Policy                     3
Neola Board Policies             3
Board of Directors               3
Arlington Public Schools         3
Putnam County School District    2
BoardDocs - Meeting Agendas      2
Name: count, dtype: int64

In [2182]:
remove_title_2_list = df["title_2"].value_counts().head(4).index
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().head(10)

title_2
Arlington Public Schools                  3
Neola Board Policies                      3
Board of Directors                        3
Board Policy                              3
Board Agendas                             2
Monticello Central School District        2
Meetings, Agendas and Information         2
Green Local Schools                       2
Santa Clara County Office of Education    2
Meade County, South Dakota                2
Name: count, dtype: int64

In [2183]:
remove_title_2_list = [
    "Meetings and Information",
    "Meetings, Agendas and Information",
    "Board Policy Manual",
    "Board of Education Meetings",
    "Board of Directors",
    "Policies",
    "BoardDocs - Meeting Agendas",
    "Board of Education Policies ",
    'Board Agendas',
    "NEOLA Board Policy",
    'NEOLA Board of Education Policies',
    'Neola Board Policies', 
    'NEOLA Board of Education Policy Manual',
    'NEOLA Policies', 
    'Meetings, Agendas, Information',
    'School Board Policies and Guidelines',
    "School Board Policies and Bylaws",
    "NEOLA School Board Policies",
    "Board of School Trustees Policy Manual",
    "School Board Policy	",
    "School Board Policies & Administrative Regulations	",
    "School Board Policy",
    "Board of School Trustees",
    "SCHOOL POLICIES AND GUIDELINES"
]
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().index[:20]

Index(['Arlington Public Schools', 'Board Policy',
       'Central Valley School District', 'Meade County, South Dakota',
       'Sumter District Schools', 'Santa Clara County Office of Education',
       'Green Local Schools', 'eGovernance System',
       'Monticello Central School District',
       'Community High School District 99', 'Putnam County School District',
       '8485 Homestead, Zeeland, MI 49464 Phone: 616-748-5637',
       'Iowa City Community School District',
       '2045 School Street North Collins, NY 14111', 'Barry ISD',
       'Adena Local Schools',
       '801 Corporate Centre Drive | O'Fallon, MO 63368 | Phone: 636-851-4000',
       '1323 E. 7th Street, Lockport, IL 60441',
       'Phone: 330-627-2181, - Fax: 330-627-2182',
       'Johnson County School District #1'],
      dtype='object', name='title_2')

In [2184]:
# now, we check for the intersections again

# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

89

In [2185]:
# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
3186,https://go.boarddocs.com/vsba/roacps/Board.nsf/Public,"Roanoke City Public Schools 40 Douglass Avenue, NW Roanoke VA 24012 540-853-2502",City of Roanoke School Board,http://rcps.info/,True,False,"Roanoke City Public Schools 40 Douglass Avenue, NW Roanoke VA 24012 540-853-2502",True,True
2419,https://go.boarddocs.com/va/surry/Board.nsf/Public,"45 School Street | Surry, VA 23883 | 757.294.5229 | f 757.294.5263",Surry County Public Schools,http://www.surryschools.net/,True,False,"45 School Street | Surry, VA 23883 | 757.294.5229 | f 757.294.5263",True,True
3369,https://go.boarddocs.com/ny/brcsny/Board.nsf/Public,Bolivar-Richburg Central School District,"100 School Street, Bolivar, NY 14715 Phone: 585-928-2561",http://www.brcs.wnyric.org,False,True,"100 School Street, Bolivar, NY 14715 Phone: 585-928-2561",True,True
1824,https://go.boarddocs.com/pa/moha/Board.nsf/Public,Mohawk Area School District,"385 Mohawk School Road, New Castle, PA 16102",www.mohawk.k12.pa.us,False,True,"385 Mohawk School Road, New Castle, PA 16102",True,True
673,https://go.boarddocs.com/pa/laur/Board.nsf/Public,Laurel School Board Agendas and Policy Manual,Laurel School District,http://www.laurel.k12.pa.us,False,False,,True,True


In [2186]:
# I asked ChatGPT to inspect and find more titles I can remove

remove_title_1_list = [
    "School Board Agendas, Minutes, and Policies",
    "School Board Policies, Bylaws, and Guidelines",
    "School Policies and Guidelines",
    "School Board Policy Manual",
    "Board of School Trustees Policy",
    "School Board Agendas and Policies",
    "SchoolBoard Policy Manual",
    "School Board Agendas, Policies, Rules and Exhibits",
    "School Board Policy",
    "SchoolBoard Policy Manual ",
    "School District Policies ",
    "Board Meetings and Policies",
    "Board Management System",
    "Board of Education Policy and Guidelines",
    "Board Policy and By-Laws",
    "Policy and Bylaws",
    "Board Bylaws & Policies",
    "Bylaws & Policies",
    "Board of Trustees Policies",
    "Policies - Bylaws",
    "Bylaws and Policies",
    "NEOLA Policies",
    "Board of Education Bylaws and Policies",
    "School Board Policy Manual",
    "Board Administrative Guidelines , Bylaws, Forms and Policies",
    "Board Policies, Administrative Guidelines, and Forms",
    "Board Policies, Bylaws, Administrative Guidelines, Forms",
    "Board Agendas and Policies",
    "Policies of the Board of Education",
    "School Board Policies and Guidelines",
    "Corporation Board Policies and Guidelines",
    "NEOLA Board Policies and By-Laws",
    "Policies & Bylaws",
    "Board of Education Policies and Guidelines",
    "Policies - Bylaws",
    "Policies of the Board of Education",
    "Board Policy Handbook",
    "Board of Education Policies",
    "Board Bylaws and Policies",
    "Board Policy Manual and Administrative Guidelines",
    "Board Policies and Administrative Guidelines",
    "Board of Education NEOLA Policy",
    "Board of Education Policies and Administrative Guidelines",
    "Board of School Trustees Policy Manual",
    "School Board Policies and Guidelines",
    "Board Policy",
    "Policies and Administrative Guidelines",
    "Board of Education Policy and Administrative Guidelines",
    "Policies & Administrative Guidelines",
    "Meetings, Agendas, Policy Manual",
    "Joint Operating Committee (JOC) Policy Manual",
    "Board Policy & Guidelines",
    "BOARD OF TRUSTEES",
    "School Board Policies, Meeting Agendas and Minutes",
    "Board Policies and By-laws",
    "Board of Regents",
    "Board of Education Policies and By-Laws",
    "Board of Education Bylaws and Policies/Administrative Guidelines",
    "BoardDocs LT",
    "Fairless District Policy Manual",
    "Board of Education Bylaws & Policies",
    "Board Policies, Administrative Guidelines and Forms",
    "Board Policy & Bylaws",
    "Success for all in the 21st Century . . ."
]

df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None

In [2187]:
# I asked ChatGPT to inspect and find more titles I can remove

remove_title_2_list = [
    "POLICY",
    "Neola Board Policies & Guidelines",
    "Board of Education Policy and Guidelines",
    "NEOLA Policy Manual",
    "Board Policy",
    "Board Policies and Guidelines",
    "Policies And Administrative Guidelines",
    "Board Policies, Administrative Guidelines, and Forms",
    "Board of Education Policy",
    "Board Bylaws and Policies",
    "Policies - Bylaws",
    "Board of Education Bylaws and Policies",
    "Neola Board Policy & Administrative Guidelines",
    "Board Policy Handbook",
    "Policies of the Board of Education",
    "NEOLA Board Policy Manual",
    "Policies & Administrative Guidelines",
    "Policies & Bylaws",
    "Board Bylaws and Policies",
    "Board Policies",
    "Board of Education Policies",
    "Board of Education Policy and Administrative Guidelines",
    "Board Policy Manual",
    "Board of Education Policies and Administrative Guidelines",
    "BoardDocs LT",
    "Policies - Bylaws",
    "Policies of the Board of Education",
    "Board Policy and By-Laws",
    "Policies and Administrative Guidelines",
    "Board of Education Bylaws & Policies",
    "Board Policy",
    "Policies & Administrative Guidelines",
    "Board of Education School Policies",
    "School Board",
    "School Board Policies & Administrative Regulations",
    "School Board Policy",
    "Board of School Trustees Policy Manual",
    "School Board Policies and Guidelines",
    "School Board Policy ",
    "Board Policy",
    "Bylaws and Policies",
    "Meetings, Agenda and Information",
    "eGovernance System",
    "Meeting Packets",
    "Opportunity. Equity. Social Justice.",
    "Neola Board Policies & Guidelines",
    "Board Of Education Policy",
    "Board of Education NEOLA Policy",
    "Board Of Education Policies",
    "NEOLA - Board of Education Policies",
    "Board Policy Manual and Administrative Guidelines",
    "Policies & By-Laws",
    "Providing today's students opportunities to become tomorrow's leaders",
    "Board of School Trustees Policy",
    "Corporation Policies",
    "Board of Education Neola Polcies",
    "NEOLA Board Policies and By-Laws",
    "Board of Education Policy /Administrative Guidelines",
    "Board of Education Policies and Bylaws",
    "Board of Education Policies and Guidelines",
    "Neola Board of Education Policy Manual",
    "Neola - Board Policy",
    "Neola Board Policy",
    "Creating the Greatest Opportunities for Our Students"
]

df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None

In [2188]:
# now, we check for the intersections again

# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

79

In [2189]:
# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
1022,https://go.boarddocs.com/wi/tlsd/Board.nsf/Public,Twin Lakes School District #4,Lakewood School Board Policies,http://www.twinlakes.k12.wi.us,False,False,,True,True
1365,https://go.boarddocs.com/oh/cwlsdoh/Board.nsf/Public,Crestwood Local Schools,"Crestwood Local School District 10880 John Edward Drive Mantua, OH 44255 | 330-357-8206",http://www.crestwoodschools.org,False,True,"Crestwood Local School District 10880 John Edward Drive Mantua, OH 44255 | 330-357-8206",True,True
2484,https://go.boarddocs.com/pa/pmsd/Board.nsf/Public,Pocono Mountain School District,"135 Pocono Mountain School Road, PO Box 200, Swiftwater, PA 18370",http://www.pmsd.org,False,True,"135 Pocono Mountain School Road, PO Box 200, Swiftwater, PA 18370",True,True
1578,https://go.boarddocs.com/oh/amanda/Board.nsf/Public,Amanda-Clearcreek Local Schools,Amanda-Clearcreek Local School District,http://www.amanda.k12.oh.us/,False,False,,True,True
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,False,False,,True,True


In [2190]:
# since a lot of it is from addresses,
# I will remove those where we have ported to the address field

df.loc[df["title_1"] == df["address"], "title_1"] = None
df.loc[df["title_2"] == df["address"], "title_2"] = None

In [2191]:
# now, we check for the intersections again

# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

20

In [2192]:
# from 97 to 38, pretty good

# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
1022,https://go.boarddocs.com/wi/tlsd/Board.nsf/Public,Twin Lakes School District #4,Lakewood School Board Policies,http://www.twinlakes.k12.wi.us,False,False,,True,True
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,False,False,,True,True
3257,https://go.boarddocs.com/in/fcsc/Board.nsf/Public,Franklin Township Community School Corp.,Franklin Township Community School Corporation,http://www.ftcsc.k12.in.us/,False,False,,True,True
769,https://go.boarddocs.com/pa/epns/Board.nsf/Public,EAST PENNSBORO AREA SCHOOL DISTRICT,East Pennsboro Area School District,http://www.epasd.org,False,False,,True,True
1593,https://go.boarddocs.com/oh/labr/Board.nsf/Public,Policies of the LaBrae Local School District,LaBrae Local School District,https://labrae.school,False,False,,True,True


In [2193]:
# now, let's check those with both titles still intact

n = df[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]
print(f"Number of rows with both cols intact: {n}")

Number of rows with both cols intact: 156


In [2194]:
# let's make a school district column
# only do those to rows where EXACTLY one column has the word school district

df["title_1_has_school_district"] = df["title_1"].str.contains("school district",case=False,na=False)
df["title_2_has_school_district"] = df["title_2"].str.contains("school district",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school_district"] & df["title_2_has_school_district"]].shape[0]

4

In [2195]:
# check out these with both
df[df["title_1_has_school_district"] & df["title_2_has_school_district"]]

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school,title_1_has_school_district,title_2_has_school_district
538,https://go.boarddocs.com/ny/trivc/Board.nsf/Public,Tri-Valley Central School District Board of Education,Tri-Valley Central School District,http://www.trivalleycsd.org/,False,False,,True,True,True,True
769,https://go.boarddocs.com/pa/epns/Board.nsf/Public,EAST PENNSBORO AREA SCHOOL DISTRICT,East Pennsboro Area School District,http://www.epasd.org,False,False,,True,True,True,True
1593,https://go.boarddocs.com/oh/labr/Board.nsf/Public,Policies of the LaBrae Local School District,LaBrae Local School District,https://labrae.school,False,False,,True,True,True,True
2031,https://go.boarddocs.com/wi/vasd/Board.nsf/Public,Verona Area School District,Verona Area School District,http://www.verona.k12.wi.us,False,False,,True,True,True,True


In [2196]:
# we can remove all in title_1
df.loc[df["title_1_has_school_district"] & df["title_2_has_school_district"], "title_1"] = None

In [2197]:
# let's make a school district column
# only do those to rows where EXACTLY one column has the word school district

df["title_1_has_school_district"] = df["title_1"].str.contains("school district",case=False,na=False)
df["title_2_has_school_district"] = df["title_2"].str.contains("school district",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school_district"] & df["title_2_has_school_district"]].shape[0]

0

In [2198]:
# now we can assign the school_district column

df.loc[df["title_1_has_school_district"],"school_district"] = df["title_1"]
df.loc[df["title_1_has_school_district"],"title_1"] = None

df.loc[df["title_2_has_school_district"],"school_district"] = df["title_2"]
df.loc[df["title_2_has_school_district"],"title_2"] = None

In [2199]:
# now let's look at the remaining columns
# first check those where both cols are still intact

df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]

70

In [2200]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school,title_1_has_school_district,title_2_has_school_district,school_district
3350,https://go.boarddocs.com/in/evsc/Board.nsf/Public,EVSC School Board Meetings,Evansville Vanderburgh School Corporation,www.evscschools.com,False,False,,True,True,False,False,
2781,https://go.boarddocs.com/ca/dlinorthcounty/Board.nsf/Public,Dual Language Immersion North County,PHONE: (760) 203-5140,https://www.dlinorthcounty.org/,False,False,,False,False,False,False,
624,https://go.boarddocs.com/in/triton/Board.nsf/Public,Triton School Corporation,Home of the Trojans/574-342-2255/www.triton.k12.in.us,http://www.triton.k12.in.us,False,False,,True,False,False,False,
2301,https://go.boarddocs.com/vsba/louisa/Board.nsf/Public,Louisa County Public Schools,Learners' Community,http://www.lcps.k12.va.us/education/components/scrapbook/default.php?sectiondetailid=1308&PHPSESSID=00a5e9972c66fd8162a098aed2356931,False,False,,True,False,False,False,
3525,https://go.boarddocs.com/pa/vnang/Board.nsf/Public,Venango Technology Center Policy Manual,Venango Technology Center,http://www.vtc1.org,False,False,,False,False,False,False,


In [2201]:
# now, let's check those with policy or policies in their titles

df["title_1_has_policy"] = df["title_1"].str.contains("policy",case=False,na=False) | df["title_1"].str.contains("policies",case=False,na=False)
df["title_2_has_policy"] = df["title_2"].str.contains("policy",case=False,na=False) | df["title_2"].str.contains("policies",case=False,na=False)

# number of rows with policy in any cols
df[df["title_1_has_policy"] | df["title_2_has_policy"]].shape[0]

50

In [2202]:
# declutter columns
df = df.loc[:,["URL","title_1","title_2","home_website","address","school_district"]]

In [2203]:
df.head()

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,,St. Joseph Public Schools,https://www.sjschools.org/,,
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,,,www.calsd.org,,
2,https://go.boarddocs.com/oh/mapleheights/Board.nsf/Public,Maple Heights City Schools,,http://www.mapleschools.com,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",
3,https://go.boarddocs.com/oh/rlsd/Board.nsf/Public,,,https://www.riversidelocalschools.com/,"585 Riverside Drive | Painesville, Ohio 44077 | 440.352.0668 | f 440.639.1959",Riverside Local School District
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,,,http://www.shcsd.org,,Southern Huntingdon County School District


In [2204]:
# let's try to common title_1 and title_2
# first standardize None and NaN

print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_1"])
print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_1"].isna())

1    None
Name: title_1, dtype: object
1    True
Name: title_1, dtype: bool


In [2205]:
print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_2"])
print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_2"].isna())

1    NaN
Name: title_2, dtype: object
1    True
Name: title_2, dtype: bool


In [2206]:
# set them to None
df.loc[df["title_1"].isna(), "title_1"] = None
df.loc[df["title_2"].isna(), "title_2"] = None

In [2207]:
# check those with both not None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]

70

In [2208]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district
3500,https://go.boarddocs.com/mi/carain/Board.nsf/Public,Carman-Ainsworth Community Schools,810-591-3700,http://www.carmanainsworth.org,,
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,
2697,https://go.boarddocs.com/mi/bics/Board.nsf/Public,Beaver Island Community School,(231) 448-2744,http://www.beaverisland.k12.mi.us,,
3350,https://go.boarddocs.com/in/evsc/Board.nsf/Public,EVSC School Board Meetings,Evansville Vanderburgh School Corporation,www.evscschools.com,,
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,


In [2209]:
# get those with phone numbers out
import re

# Function to check if a string contains a phone number
def contains_phone_number(value):
    # Regular expression for phone numbers
    phone_pattern = re.compile(r'\(?\b\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b')
    if pd.isnull(value):
        return False
    return bool(phone_pattern.search(str(value)))

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df['contains_phone_number_title_2'] = df['title_2'].apply(contains_phone_number)

In [2210]:
both_col_non_na_df = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]
both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2
1112,https://go.boarddocs.com/la/pcpsb/Board.nsf/Public,"337 Napoleon Street ● New Roads, Louisiana ● p 225-638-8674 ● f 225-638-3237",Pointe Coupee Parish School System,http://www.pcpsb.net/,,,True,False
1576,https://go.boarddocs.com/vsba/fairfax/Board.nsf/Public,"8115 Gatehouse Road, Suite 5400 | Falls Church, VA | 571-423-1075",Fairfax County School Board,http://www.fcps.edu,,,True,False
1987,https://go.boarddocs.com/oh/nls/Board.nsf/Public,600 Lemoyne Rd | Northwood OH | P: (419) 691-3888 | F: (419) 697-2470,Northwood Local Schools,http://www.northwoodschools.org/site/default.aspx?PageID=1,,,True,False
2150,https://go.boarddocs.com/oh/polaris/Board.nsf/Public,"7285 Old Oak Blvd., | Middleburg Heights, OH | 440-891-7600",Polaris Career Center,http://www.polaris.edu/,,,True,False
3356,https://go.boarddocs.com/wa/cowa/Board.nsf/Public,City Council Chambers ~ 500 E. Main Street ~ 509-488-5686 ~ www.othellowa.gov,Othello Washington ~ City Council ~ Serving The Community,http://www.othellowa.gov,,,True,False


In [2211]:
# all the above are address
move_these = both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_1"]]["URL"]
df.loc[df["URL"].isin(move_these), "address"] = df[df["URL"].isin(move_these)]["title_1"]
df.loc[df["URL"].isin(move_these), "title_1"] = None

In [2212]:
both_col_non_na_df = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]
both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_2"]]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2
159,https://go.boarddocs.com/in/resc/Board.nsf/Public,Randolph Eastern School Corporation,7659644994,http://www.resc.k12.in.us,,,False,True
440,https://go.boarddocs.com/mi/mionline/Board.nsf/Public,Michigan Online School,Phone: (269) 216-6972,https://www.michiganonlineschool.com/,,,False,True
624,https://go.boarddocs.com/in/triton/Board.nsf/Public,Triton School Corporation,Home of the Trojans/574-342-2255/www.triton.k12.in.us,http://www.triton.k12.in.us,,,False,True
715,https://go.boarddocs.com/mi/brand/Board.nsf/Public,Brandywine Community Schools,269-684-7150,http://www.brandywinebobcats.org,,,False,True
1608,https://go.boarddocs.com/oh/sclsd/Board.nsf/Public,South Central Local Schools,3305 Greenwich Angling Rd | 419-752-3815,http://www.south-central.org,,,False,True
1611,https://go.boarddocs.com/mi/whitec/Board.nsf/Public,White Cloud Public Schools,231-689-6591,www.whitecloud.net,,,False,True
1997,https://go.boarddocs.com/mi/chip/Board.nsf/Public,Chippewa Valley Schools,586-723-2004,http://www.cvs.k12.mi.us,,,False,True
2306,https://go.boarddocs.com/mi/clark/Board.nsf/Public,Clarkston Community Schools,248-623-5400,http://www.clarkston.k12.mi.us,,,False,True
2383,https://go.boarddocs.com/mi/white/Board.nsf/Public,Whitefish Township Community Schools,(906) 492-3353,http://whitefish.eupschools.org,,,False,True
2621,https://go.boarddocs.com/in/sgib/Board.nsf/Public,South Gibson School Corporation,812-753-4230,http://www.sgibson.k12.in.us,,,False,True


In [2213]:
# move those two that are address
move_these = ["https://go.boarddocs.com/oh/sclsd/Board.nsf/Public", "https://go.boarddocs.com/oh/sidn/Board.nsf/Public"]
df.loc[df["URL"].isin(move_these), "address"] = df["title_2"]
df.loc[df["URL"].isin(move_these), "title_2"] = None

In [2214]:
both_col_non_na_df = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]
both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_2"]]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2
159,https://go.boarddocs.com/in/resc/Board.nsf/Public,Randolph Eastern School Corporation,7659644994,http://www.resc.k12.in.us,,,False,True
440,https://go.boarddocs.com/mi/mionline/Board.nsf/Public,Michigan Online School,Phone: (269) 216-6972,https://www.michiganonlineschool.com/,,,False,True
624,https://go.boarddocs.com/in/triton/Board.nsf/Public,Triton School Corporation,Home of the Trojans/574-342-2255/www.triton.k12.in.us,http://www.triton.k12.in.us,,,False,True
715,https://go.boarddocs.com/mi/brand/Board.nsf/Public,Brandywine Community Schools,269-684-7150,http://www.brandywinebobcats.org,,,False,True
1611,https://go.boarddocs.com/mi/whitec/Board.nsf/Public,White Cloud Public Schools,231-689-6591,www.whitecloud.net,,,False,True
1997,https://go.boarddocs.com/mi/chip/Board.nsf/Public,Chippewa Valley Schools,586-723-2004,http://www.cvs.k12.mi.us,,,False,True
2306,https://go.boarddocs.com/mi/clark/Board.nsf/Public,Clarkston Community Schools,248-623-5400,http://www.clarkston.k12.mi.us,,,False,True
2383,https://go.boarddocs.com/mi/white/Board.nsf/Public,Whitefish Township Community Schools,(906) 492-3353,http://whitefish.eupschools.org,,,False,True
2621,https://go.boarddocs.com/in/sgib/Board.nsf/Public,South Gibson School Corporation,812-753-4230,http://www.sgibson.k12.in.us,,,False,True
2683,https://go.boarddocs.com/mi/slps/Board.nsf/Public,Spring Lake Public Schools,616-846-5500,http://www.springlakeschools.org,,,False,True


In [2215]:
# move the others to phone numbers
df["phone"] = None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["contains_phone_number_title_2"], "phone"] = df["title_2"]
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["contains_phone_number_title_2"], "title_2"] = None

In [2216]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
42,https://go.boarddocs.com/oh/rid/Board.nsf/Public,Ridgemont Local Schools,"560 W. Taylor Street Mount Victory, OH",http://www.ridgemont.k12.oh.us,,,False,False,
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
356,https://go.boarddocs.com/ca/mendocino/Board.nsf/Public,Mendocino-Lake Community College District,"1000 Hensley Creek Road, Ukiah, CA",https://www.mendocino.edu,,,False,False,
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,Ripley-Ohio-Dearborn Special Education Cooperative,ROD Board Policy,http://www.rodspecialeducation.org,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,


In [2217]:
# check those with numbers
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
1386,https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public,LSR-7,Learning for Life,http://www.lsr7.org,,,False,False,
2878,https://go.boarddocs.com/pa/daup/Board.nsf/Public,"School Board Policy Manual 6001 Locust Lane, Harrisburg, PA",Dauphin County Technical School,http://www.dcts.org,,,False,False,
3260,https://go.boarddocs.com/wi/chilsd/Board.nsf/Public,530 Main Street,Chilton Public Schools,www.chilton.k12.wi.us,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [2218]:
# edit the outlier
df.loc[df["URL"]=="https://go.boarddocs.com/pa/daup/Board.nsf/Public", "title_1"] = "6001 Locust Lane, Harrisburg, PA"

In [2219]:
# check those with numbers
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
1386,https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public,LSR-7,Learning for Life,http://www.lsr7.org,,,False,False,
2878,https://go.boarddocs.com/pa/daup/Board.nsf/Public,"6001 Locust Lane, Harrisburg, PA",Dauphin County Technical School,http://www.dcts.org,,,False,False,
3260,https://go.boarddocs.com/wi/chilsd/Board.nsf/Public,530 Main Street,Chilton Public Schools,www.chilton.k12.wi.us,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [2220]:
# move these two that are addresses
move_these = ["https://go.boarddocs.com/pa/daup/Board.nsf/Public", "https://go.boarddocs.com/wi/chilsd/Board.nsf/Public"]
df.loc[df["URL"].isin(move_these), "address"] = df["title_1"]
df.loc[df["URL"].isin(move_these), "title_1"] = None

# check
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
1386,https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public,LSR-7,Learning for Life,http://www.lsr7.org,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [2221]:
# move the second one to district and delete the two others
df.loc[df["URL"]=="https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public", "title_1"] = None

# check
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [2222]:
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False), "title_1"] = None

In [2223]:
# do the same for title_2
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_2"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
42,https://go.boarddocs.com/oh/rid/Board.nsf/Public,Ridgemont Local Schools,"560 W. Taylor Street Mount Victory, OH",http://www.ridgemont.k12.oh.us,,,False,False,
356,https://go.boarddocs.com/ca/mendocino/Board.nsf/Public,Mendocino-Lake Community College District,"1000 Hensley Creek Road, Ukiah, CA",https://www.mendocino.edu,,,False,False,
1203,https://go.boarddocs.com/in/brownsburg/Board.nsf/Public,Brownsburg Community School Corporation,"310 Stadium Drive Brownsburg, IN",https://www.brownsburg.k12.in.us/,,,False,False,


In [2224]:
# move them to address
df.loc[idx & df["title_2"].str.contains(number_pattern, na=False),"address"] = df["title_2"]
df.loc[idx & df["title_2"].str.contains(number_pattern, na=False),"title_2"] = None

In [2225]:
# check the rest
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx,:]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,Ripley-Ohio-Dearborn Special Education Cooperative,ROD Board Policy,http://www.rodspecialeducation.org,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1214,https://go.boarddocs.com/vsba/scs/Board.nsf/Public,Together - We Prepare Our Students for Their Future,Spotsylvania County Public Schools,http://www.spotsylvania.k12.va.us/,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,


In [2226]:
# check those with schools in one but not in the other
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_1"].str.contains("school",case=False,na=False) & ~df["title_2"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1448,https://go.boarddocs.com/mi/csps/Board.nsf/Public,Cedar Springs Public Schools,Board of Education Board Policies,http://www.csredhawks.org,,,False,False,
1548,https://go.boarddocs.com/mi/lfen/Board.nsf/Public,Lake Fenton Community Schools,Home of the Lake Fenton Blue Devils,http://www.lakefentonschools.org,,,False,False,
1980,https://go.boarddocs.com/mi/fwlv/Board.nsf/Public,Fowlerville Community Schools,Home of the Gladiators,http://www.fowlervilleschools.org,,,False,False,
2047,https://go.boarddocs.com/ok/okcps/Board.nsf/Public,Oklahoma City Public Schools,Ignite Passion. Instill Pride.,https://go.boarddocs.com/ok/okcps/Board.nsf/,,,False,False,
2108,https://go.boarddocs.com/oh/maplecc/Board.nsf/Public,SCHOOL POLICIES AND GUIDELINES,MAPLEWOOD CAREER CENTER,http://www.mwood.cc/,,,False,False,
2301,https://go.boarddocs.com/vsba/louisa/Board.nsf/Public,Louisa County Public Schools,Learners' Community,http://www.lcps.k12.va.us/education/components/scrapbook/default.php?sectiondetailid=1308&PHPSESSID=00a5e9972c66fd8162a098aed2356931,,,False,False,
2819,https://go.boarddocs.com/in/sssc/Board.nsf/Public,Southwest School Corporation,By-Laws and Policies,http://www.swest.k12.in.us,,,False,False,
3125,https://go.boarddocs.com/in/valp/Board.nsf/Public,Valparaiso Community Schools,Home of the Vikings,http://www.valpo.k12.in.us,,,False,False,
3126,https://go.boarddocs.com/oh/zville/Board.nsf/Public,Zanesville City Schools District,Home of the Blue Devils,http://www.zanesville.k12.oh.us,,,False,False,


In [2227]:
# take out the outlier
df.loc[df["URL"] == "https://go.boarddocs.com/oh/maplecc/Board.nsf/Public", "title_1"] = None

In [2228]:
# check those with schools in one but not in the other
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_1"].str.contains("school",case=False,na=False) & ~df["title_2"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1448,https://go.boarddocs.com/mi/csps/Board.nsf/Public,Cedar Springs Public Schools,Board of Education Board Policies,http://www.csredhawks.org,,,False,False,
1548,https://go.boarddocs.com/mi/lfen/Board.nsf/Public,Lake Fenton Community Schools,Home of the Lake Fenton Blue Devils,http://www.lakefentonschools.org,,,False,False,
1980,https://go.boarddocs.com/mi/fwlv/Board.nsf/Public,Fowlerville Community Schools,Home of the Gladiators,http://www.fowlervilleschools.org,,,False,False,
2047,https://go.boarddocs.com/ok/okcps/Board.nsf/Public,Oklahoma City Public Schools,Ignite Passion. Instill Pride.,https://go.boarddocs.com/ok/okcps/Board.nsf/,,,False,False,
2301,https://go.boarddocs.com/vsba/louisa/Board.nsf/Public,Louisa County Public Schools,Learners' Community,http://www.lcps.k12.va.us/education/components/scrapbook/default.php?sectiondetailid=1308&PHPSESSID=00a5e9972c66fd8162a098aed2356931,,,False,False,
2819,https://go.boarddocs.com/in/sssc/Board.nsf/Public,Southwest School Corporation,By-Laws and Policies,http://www.swest.k12.in.us,,,False,False,
3125,https://go.boarddocs.com/in/valp/Board.nsf/Public,Valparaiso Community Schools,Home of the Vikings,http://www.valpo.k12.in.us,,,False,False,
3126,https://go.boarddocs.com/oh/zville/Board.nsf/Public,Zanesville City Schools District,Home of the Blue Devils,http://www.zanesville.k12.oh.us,,,False,False,
3164,https://go.boarddocs.com/mi/clio/Board.nsf/Public,Clio Area Schools,Home of the Mustangs,http://www.clioschools.org,,,False,False,


In [2229]:
# all slogans in second col, delete
df.loc[idx, "title_2"] = None

In [2230]:
# do it for the second col
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_2"].str.contains("school",case=False,na=False) & ~df["title_1"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,
1214,https://go.boarddocs.com/vsba/scs/Board.nsf/Public,Together - We Prepare Our Students for Their Future,Spotsylvania County Public Schools,http://www.spotsylvania.k12.va.us/,,,False,False,
3308,https://go.boarddocs.com/in/hses/Board.nsf/Public,For additional information click the HOUSE icon to view streamed/archived board meeting videos.,Hamilton Southeastern Schools,https://www.hseschools.org/meet-hse/board,,,False,False,
3393,https://go.boarddocs.com/mo/nixa/Board.nsf/Public,Board of Education Meeting Information,Nixa Public Schools,http://www.nixapublicschools.net,,,False,False,
3430,https://go.boarddocs.com/mn/d196/Board.nsf/Public,"Educating, developing, and inspiring our students for lifelong success.",Rosemount - Apple Valley - Eagan Public Schools,http://www.district196.org/,,,False,False,
3443,https://go.boarddocs.com/mi/tda/Board.nsf/Public,THE DEARBORN ACADEMY,Public Charter School,http://www.thedearbornacademy.org,,,False,False,


In [2231]:
# edit the outlier
df.loc[df["URL"] == "https://go.boarddocs.com/mi/tda/Board.nsf/Public", "title_2"] = None
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_2"].str.contains("school",case=False,na=False) & ~df["title_1"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,
1214,https://go.boarddocs.com/vsba/scs/Board.nsf/Public,Together - We Prepare Our Students for Their Future,Spotsylvania County Public Schools,http://www.spotsylvania.k12.va.us/,,,False,False,
3308,https://go.boarddocs.com/in/hses/Board.nsf/Public,For additional information click the HOUSE icon to view streamed/archived board meeting videos.,Hamilton Southeastern Schools,https://www.hseschools.org/meet-hse/board,,,False,False,
3393,https://go.boarddocs.com/mo/nixa/Board.nsf/Public,Board of Education Meeting Information,Nixa Public Schools,http://www.nixapublicschools.net,,,False,False,
3430,https://go.boarddocs.com/mn/d196/Board.nsf/Public,"Educating, developing, and inspiring our students for lifelong success.",Rosemount - Apple Valley - Eagan Public Schools,http://www.district196.org/,,,False,False,


In [2232]:
# delete the first col
df.loc[idx,"title_1"] = None

In [2233]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,Ripley-Ohio-Dearborn Special Education Cooperative,ROD Board Policy,http://www.rodspecialeducation.org,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,
1537,https://go.boarddocs.com/oh/moesc/Board.nsf/Public,Mid-Ohio Policies,Mid-Ohio ESC,www.moesc.net,,,False,False,
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,


In [2234]:
# for those with the word policy, make it the first col
policy_pattern = r'polic(y|ies)'
idx = df["title_2"].str.contains(policy_pattern, case=False,na=False)
df.loc[idx, ["title_1", "title_2"]] = df.loc[idx,["title_2","title_1"]].values

  idx = df["title_2"].str.contains(policy_pattern, case=False,na=False)


In [2235]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,ROD Board Policy,Ripley-Ohio-Dearborn Special Education Cooperative,http://www.rodspecialeducation.org,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,
1537,https://go.boarddocs.com/oh/moesc/Board.nsf/Public,Mid-Ohio Policies,Mid-Ohio ESC,www.moesc.net,,,False,False,
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,


In [2236]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False)]

  df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False)]


Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,ROD Board Policy,Ripley-Ohio-Dearborn Special Education Cooperative,http://www.rodspecialeducation.org,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,
1537,https://go.boarddocs.com/oh/moesc/Board.nsf/Public,Mid-Ohio Policies,Mid-Ohio ESC,www.moesc.net,,,False,False,
2720,https://go.boarddocs.com/mi/giresa/Board.nsf/Public,Board Policies and Bylaws,Gratiot-Isabella Regional Education Service District,http://www.giresd.net,,,False,False,
3326,https://go.boarddocs.com/mi/trav/Board.nsf/Public,TCAPS School Board Policies & Guidelines,Traverse City Area Public Schools,http://www.tcaps.net/board,,,False,False,
3407,https://go.boarddocs.com/oh/tructc/Board.nsf/Public,TCTC Board Policies,Trumbull Career and Technical Center,http://www.tctchome.com,,,False,False,
3525,https://go.boarddocs.com/pa/vnang/Board.nsf/Public,Venango Technology Center Policy Manual,Venango Technology Center,http://www.vtc1.org,,,False,False,


In [2237]:
# can remove title_1 for these
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False), "title_1"] = None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

  df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False), "title_1"] = None


Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,
2889,https://go.boarddocs.com/ut/uen/Board.nsf/Public,Utah Education and Telehealth Network,UETN Governing Board,http://www.uetn.org,,,False,False,
2963,https://go.boarddocs.com/fl/jcsd/Board.nsf/Public,Jefferson County Schools,Jefferson County School Board,https://www.jeffersonschools.net,,,False,False,
3046,https://go.boarddocs.com/mi/badax/Board.nsf/Public,BAPS,Home of the Hatchets,www.badaxeps.org,,,False,False,
3257,https://go.boarddocs.com/in/fcsc/Board.nsf/Public,Franklin Township Community School Corp.,Franklin Township Community School Corporation,http://www.ftcsc.k12.in.us/,,,False,False,
3350,https://go.boarddocs.com/in/evsc/Board.nsf/Public,EVSC School Board Meetings,Evansville Vanderburgh School Corporation,www.evscschools.com,,,False,False,


In [2238]:
# do the same for board
idx = df["title_2"].str.contains("board", case=False,na=False)
df.loc[idx, ["title_1", "title_2"]] = df.loc[idx,["title_2","title_1"]].values

In [2239]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains("board",case=False,na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City School Board,Falls Church City Public Schools,https://www.fccps.org/page/school-board,,,False,False,
2889,https://go.boarddocs.com/ut/uen/Board.nsf/Public,UETN Governing Board,Utah Education and Telehealth Network,http://www.uetn.org,,,False,False,
2963,https://go.boarddocs.com/fl/jcsd/Board.nsf/Public,Jefferson County School Board,Jefferson County Schools,https://www.jeffersonschools.net,,,False,False,
3350,https://go.boarddocs.com/in/evsc/Board.nsf/Public,EVSC School Board Meetings,Evansville Vanderburgh School Corporation,www.evscschools.com,,,False,False,


In [2240]:
# can remove title_1 for these
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains("board",case=False,na=False), "title_1"] = None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,
3046,https://go.boarddocs.com/mi/badax/Board.nsf/Public,BAPS,Home of the Hatchets,www.badaxeps.org,,,False,False,
3257,https://go.boarddocs.com/in/fcsc/Board.nsf/Public,Franklin Township Community School Corp.,Franklin Township Community School Corporation,http://www.ftcsc.k12.in.us/,,,False,False,


In [2241]:
# just do case by case
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "title_1"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/ca/sjccd/Board.nsf/Public", "title_1"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/ca/hawking/Board.nsf/Public", "title_1"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/mi/badax/Board.nsf/Public", "title_2"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/in/fcsc/Board.nsf/Public", "title_1"] = None

df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone


In [2242]:
# check those with both not None
n = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]
print(n)
assert n == 0

0


In [2243]:
# now we can combine both
df.loc[df["title_1"].isna(), "title_1"] = df["title_2"]
df = df.drop("title_2", axis=1)

In [2244]:
# see what is left
df.loc[~df["title_1"].isna()].shape[0]

1819

In [2245]:
df.loc[~df["title_1"].isna()].sample(5)

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
3129,https://go.boarddocs.com/ny/mexicocsd/Board.nsf/Public,16 Fravor Road Suite A,http://www.mexico.cnyric.org/,"Mexico, NY 13114",,False,False,
994,https://go.boarddocs.com/ks/hpsks/Board.nsf/Public,USD 210 Board of Education,http://www.usd210.org,529 S. Main Street | Hugoton KS 67951 | 620-544-4397,,False,False,
2083,https://go.boarddocs.com/ny/techvalleyhigh/Board.nsf/Public,Tech Valley High School,https://www.techvalleyhigh.org/,246 Tricentennial Drive Albany NY 12203,,False,False,
2764,https://go.boarddocs.com/in/smadison/Board.nsf/Public,South Madison Community School Corporation,http://www.smadison.k12.in.us/education/district/district.php?sectiondetailid=1&,"203 S. Heritage Way | Pendleton, IN 46064 | Ph: (765) 778-2152 | Fx: (765) 778-8207",,False,False,
1821,https://go.boarddocs.com/oh/arlingtonls/Board.nsf/Public,Arlington Local Schools,http://arlingtonlocalschools.com,,,False,False,


In [2246]:
# check those with numbers
df.loc[df["title_1"].str.contains(number_pattern, na=False)].shape[0]

134

In [2247]:
df.loc[df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
38,https://go.boarddocs.com/mo/mcr1/Board.nsf/Public,Macon County R-1 Schools,http://www.macon.k12.mo.us/,"702 North Missouri  Macon, Missouri 63552  (660) 395-6164",,False,False,
49,https://go.boarddocs.com/ks/susdks/Board.nsf/Public,Seaman USD 345,http://www.seamanschools.org,"901 NW Lyman Road | Topeka, KS 66608 | Ph: (785) 575-8600",,False,False,
76,https://go.boarddocs.com/pa/iu11/Board.nsf/Public,Tuscarora Intermediate Unit 11,https://www.tiu11.org/,,,False,False,
125,https://go.boarddocs.com/wi/pesh/Board.nsf/Public,341 NORTH EMERY AVENUE,www.peshtigo.k12.wi.us,,PESHTIGO SCHOOL DISTRICT,False,False,
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
...,...,...,...,...,...,...,...,...
3806,https://go.boarddocs.com/ks/usd315/Board.nsf/Public,Colby Public Schools USD 315,http://www.colbyeagles.org/,600 W 3rd St. | Colby KS 67701-2000 | p 785-460-5000 | f 785-460-5050,,False,False,
3816,https://go.boarddocs.com/fl/semi/Board.nsf/Public,Phone - 407-320-0000,www.scps.k12.fl.us,"400 E. Lake Mary Boulevard - Sanford, FL - 32773",,False,True,
3820,https://go.boarddocs.com/ks/usd311/Board.nsf/Public,Pretty Prairie USD 311 KS,https://www.usd311.com/,"206 E Main, P.O. Box 218 Pretty Prairie, Kansas 67570",,False,False,
3877,https://go.boarddocs.com/pa/neiu/Board.nsf/Public,Northeastern Educational Intermediate Unit 19,http://www.iu19.org,1200 Line Street Archbald PA 18403,,False,False,


In [2248]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]].shape[0]

38

In [2249]:
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
680,https://go.boarddocs.com/pa/prtg/Board.nsf/Public,(814) 736-9636,www.portageareasd.org,"84 Mountain Avenue, Portage, PA 15946",,True,True,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1029,https://go.boarddocs.com/mi/wake/Board.nsf/Public,(906) 224-7211,http://www.wmschools.org,,Wakefield-Marenisco School District,True,True,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,


In [2250]:
# move those without alphabets to the phone col

alphabet_pattern = r'[a-zA-Z]'
df.loc[df["contains_phone_number_title_1"] & (~df["title_1"].str.contains(alphabet_pattern,na=False)) ]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
680,https://go.boarddocs.com/pa/prtg/Board.nsf/Public,(814) 736-9636,www.portageareasd.org,"84 Mountain Avenue, Portage, PA 15946",,True,True,
1029,https://go.boarddocs.com/mi/wake/Board.nsf/Public,(906) 224-7211,http://www.wmschools.org,,Wakefield-Marenisco School District,True,True,
1765,https://go.boarddocs.com/pa/iu14/Board.nsf/Public,610-987-2248,http://www.berksiu.org/,"1111 Commons Boulevard, PO Box 16050, 19612-6050",,True,True,
1924,https://go.boarddocs.com/wi/shorewood/Board.nsf/Public,(414) 963-6901,https://www.shorewood.k12.wi.us,"Shorewood School District | 1701 East Capitol Drive | Shorewood, Wisconsin 53211",,True,True,
1985,https://go.boarddocs.com/pa/roch/Board.nsf/Public,724-775-7500,http://www.rasd.org,"540 Reno Street, Rochester, PA 15074",,True,True,
2028,https://go.boarddocs.com/mi/fowler/Board.nsf/Public,(989)593-2250,http://www.fowlerschools.net,"700 S Main Street, Fowler MI 48835",,True,True,
2525,https://go.boarddocs.com/nj/burlingtontwp/Board.nsf/Public,(609) 387-3955,https://burltwpsch.org/,,Burlington Township School District,True,True,
2767,https://go.boarddocs.com/in/jcdc/Board.nsf/Public,812-689-4114,http://www.jaccendel.k12.in.us,"723 N Buckeye St, Osgood IN 47037",,True,True,
2874,https://go.boarddocs.com/wi/hfj1/Board.nsf/Public,(262) 673-3155,www.hjt1.org,"School District of Hartford Jt. #1, 402 W. Sumner St. Hartford, WI 53027",,True,True,
3159,https://go.boarddocs.com/wi/wawmsd/Board.nsf/Public,414-604-3000,http://www.wawmsd.org,"West Allis-West Milwaukee School District, 9333 W. Lincoln Avenue, West Allis, WI 53227",,True,True,


In [2251]:
df.loc[df["contains_phone_number_title_1"] & (~df["title_1"].str.contains(alphabet_pattern,na=False)), "phone"] = df["title_1"]
df.loc[df["contains_phone_number_title_1"] & (~df["title_1"].str.contains(alphabet_pattern,na=False)), "title_1"] = None


In [2252]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]].shape[0]

28

In [2253]:
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
1574,https://go.boarddocs.com/oh/swissohio/Board.nsf/Public,Phone: 740-472-5801,https://swissohio.k12.oh.us,"304 Mill Street Woodsfield, OH 43793",,True,True,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,


In [2254]:
# the relevant info in the column is either phone, address or both
# the row with the website already has the website col filled
# first get those that are phone only

In [2255]:
# Define a pattern to match the words "phone" or "fax" only, allowing other non-alphabet characters
phone_fax_pattern = r'^[^a-zA-Z]*[Pp](hone)[^a-zA-Z]*$'

# Filter the rows
df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]

  df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1574,https://go.boarddocs.com/oh/swissohio/Board.nsf/Public,Phone: 740-472-5801,https://swissohio.k12.oh.us,"304 Mill Street Woodsfield, OH 43793",,True,True,
2140,https://go.boarddocs.com/wi/solon/Board.nsf/Public,Phone: (715) 378-2263,https://www.solonk12.net,"School District of Solon Springs, 8993 E Baldwin Ave, Solon Springs, WI 54873",,True,True,
2160,https://go.boarddocs.com/oh/galionoh/Board.nsf/Public,Phone - 419-468-3432,https://www.galionschools.org/,"Galion City Schools - 470 Portland Way North - Galion, OH 44833",,True,True,
3816,https://go.boarddocs.com/fl/semi/Board.nsf/Public,Phone - 407-320-0000,www.scps.k12.fl.us,"400 E. Lake Mary Boulevard - Sanford, FL - 32773",,True,True,


In [2256]:
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None

  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None


In [2257]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,
2173,https://go.boarddocs.com/ks/usd230/Board.nsf/Public,Info: (913) 592-7272 | T: (913) 592-7200 | F: (913) 592-7270,http://www.usd230.org,"Spring Hill School District | 17640 W. 199th Street, Spring Hill, Kansas 66083",,True,True,


In [2258]:
# Define a pattern to match the words "phone" or "fax" only, allowing other non-alphabet characters
phone_fax_pattern = r'^[^a-zA-Z]*([Pp]hone|PH|MAIN PHONE)?[^a-zA-Z]*([Ff]ax)?[^a-zA-Z]*$'

# Filter the rows
df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]

  df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
2538,https://go.boarddocs.com/wi/campsd/Board.nsf/Public,(920) 533-8381 | Fax (920) 533 -5726,www.csd.k12.wi.us,"327 N. Fond du Lac Ave. Campbellsport, WI 53010",,True,True,
2586,https://go.boarddocs.com/oh/cevsdoh/Board.nsf/Public,"Phone: 330-627-2181, - Fax: 330-627-2182",www.carrollton.k12.oh.us,"Carrollton Exempted Village School District, - 205 Scio Road S.W., - Carrollton, OH 44615",,True,True,
2811,https://go.boarddocs.com/id/nsd131/Board.nsf/Public,(208) 468-4600 Fax: (208) 468-4638,http://www.nsd131.org,"619 S. Canyon St | Nampa, ID 83686",,True,True,
3001,https://go.boarddocs.com/fl/highlfl/Board.nsf/Public,PH: 863-471-5555,www.highlands.k12.fl.us,"School Board of Highlands County, 426 School St., Sebring, FL 33870",,True,True,
3758,https://go.boarddocs.com/fl/brevco/Board.nsf/Public,* MAIN PHONE: (321) 633-1000,https://www.brevardschools.org/,"2700 JUDGE FRAN JAMIESON WAY, VIERA, FL 32940",,True,True,


In [2259]:
# Define a pattern to match the words "phone" or "fax" only, allowing other non-alphabet characters
phone_fax_pattern = r'^[^a-zA-Z]*([Pp]hone|PH|MAIN PHONE)?[^a-zA-Z]*([Ff]ax)?[^a-zA-Z]*$'

# Filter the rows
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None

  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None


In [2260]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,
2173,https://go.boarddocs.com/ks/usd230/Board.nsf/Public,Info: (913) 592-7272 | T: (913) 592-7200 | F: (913) 592-7270,http://www.usd230.org,"Spring Hill School District | 17640 W. 199th Street, Spring Hill, Kansas 66083",,True,True,
2366,https://go.boarddocs.com/oh/naps/Board.nsf/Public,www.napoleonareaschools.org | 419-599-7015,www.napoleon.k12.oh.us,,Napoleon Area City School District,True,False,


In [2261]:
# handle special cases
move_these = ["https://go.boarddocs.com/ks/usd230/Board.nsf/Public","https://go.boarddocs.com/oh/naps/Board.nsf/Public"]
df.loc[df["URL"].isin(move_these), "phone"] = df["title_1"]
df.loc[df["URL"].isin(move_these), "title_1"] = None

In [2262]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,
2495,https://go.boarddocs.com/in/brem/Board.nsf/Public,Bremen Public Schools | Phone: (574) 546-3929 | Fax: (574) 546-6303 | School Board Policies and Guidelines,https://www.bps.k12.in.us,,,True,False,
2808,https://go.boarddocs.com/wi/ashland/Board.nsf/Public,"District Office - 2000 Beaser Avenue; Ashland, WI; (715) 682-7080",http://www.ashland.k12.wi.us/,,School District of Ashland Board of Education,True,False,


In [2263]:
# these are all addresses, move them
df.loc[df["contains_phone_number_title_1"], "address"] = df["title_1"]
df.loc[df["contains_phone_number_title_1"], "title_1"] = None

In [2264]:
# now check the remaining ones
print(df[~df["title_1"].isna()].shape[0])
df[~df["title_1"].isna()].sample(5)

1781


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1679,https://go.boarddocs.com/va/martinsville/Board.nsf/Public,Martinsville City Public Schools,https://www.martinsville.k12.va.us/,"746 Indian Trail P.O. Box 5548 Martinsville, Virginia 24115 | (276) 403-5820",,False,False,
46,https://go.boarddocs.com/wv/berkeley/Board.nsf/Public,Berkeley County Schools,https://www.berkeleycountyschools.org/page/boe,"1453 Winchester Avenue | Martinsburg, WV 25405 | 304.267.3500",,False,False,
502,https://go.boarddocs.com/oh/hevs/Board.nsf/Public,Hubbard Eagles,www.hubbard.k12.oh.us,"108 Orchard Avenue | Hubbard, Ohio 44425 | 330-534-1921",,False,False,
93,https://go.boarddocs.com/va/nvcc/Board.nsf/Public,Northern Virginia Community College,https://www.nvcc.edu/,,,False,False,
854,https://go.boarddocs.com/oh/bristol/Board.nsf/Public,Bristol Local SD,http://www.bristol.k12.oh.us/,,,False,False,


In [2265]:
# check those with commas
# could be addresses
# check those without addresses
df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
723,https://go.boarddocs.com/il/thsd211/Board.nsf/Public,"G.A. McElroy Administration Center, 1750 South Roselle Road, Palatine, Illinois",http://www.adc.d211.org,,,False,False,
882,https://go.boarddocs.com/il/asd4/Board.nsf/Public,"222 N. Kennedy Drive, Addison, IL",http://www.asd4.org,,Addison School District 4,False,False,
901,https://go.boarddocs.com/nd/bsd7/Board.nsf/Public,"Agendas, Minutes, District Information, Policies and Events",http://www.belcourt.k12.nd.us/education/components/scrapbook/default.php?sectiondetailid=1376&,,Turtle Mountain Community Schools/Belcourt School District #7,False,False,
977,https://go.boarddocs.com/mo/rockport/Board.nsf/Public,"600 S Nebraska St | Rock Port, MO",https://rockport.k12.mo.us/,,Rock Port R-II School District,False,False,
1736,https://go.boarddocs.com/pa/uncf/Board.nsf/Public,"Meetings, Agendas, and Information",http://www.ucfsd.org,,Unionville-Chadds Ford School District,False,False,
1800,https://go.boarddocs.com/pa/boyr/Board.nsf/Public,"Meetings, Agendas, Information",http://www.boyertownasd.org,,,False,False,
2077,https://go.boarddocs.com/mo/unionrxi/Board.nsf/Public,"Meetings, Agendas, and Information",https://www.unionrxi.org,,Union R-XI School District,False,False,
2134,https://go.boarddocs.com/in/pike/Board.nsf/Public,"Administrative Services Center - 6901 Zionsville Road, Indianapolis, IN & via Live Streaming at: https://www.youtube.com/channel/UCsnM2UOzNfDocPPLZzktSOg",http://www.pike.k12.in.us/,,Metropolitan School District of Pike Township,False,False,
2505,https://go.boarddocs.com/wv/brooke/Board.nsf/Public,"Brooke County Schools - Excellence, Tradition and Bruin Pride!",https://www.brooke.k12.wv.us,,,False,False,
2521,https://go.boarddocs.com/oh/brightoh/Board.nsf/Public,"Meeting Agendas, Minutes and Policies",https://www.blsd.us,,Bright Local School District,False,False,


In [2266]:
# the agenda value keep coming up
# let's check for all rows on this
df.loc[df["title_1"].str.contains("agenda",case=False,na=False)]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
157,https://go.boarddocs.com/il/sd163/Board.nsf/Public,Board Meeting Agendas,http://www.sd163.com,,,False,False,
673,https://go.boarddocs.com/pa/laur/Board.nsf/Public,Laurel School Board Agendas and Policy Manual,http://www.laurel.k12.pa.us,,Laurel School District,False,False,
901,https://go.boarddocs.com/nd/bsd7/Board.nsf/Public,"Agendas, Minutes, District Information, Policies and Events",http://www.belcourt.k12.nd.us/education/components/scrapbook/default.php?sectiondetailid=1376&,,Turtle Mountain Community Schools/Belcourt School District #7,False,False,
1163,https://go.boarddocs.com/ca/vibrantminds/Board.nsf/Public,"Board Agendas, Minutes, and Policies",https://www.vibrantminds.us/,"412 W. CARL KARCHER WAY ANAHEIM, CA 92801 | ​714-563-2390",,False,False,
1736,https://go.boarddocs.com/pa/uncf/Board.nsf/Public,"Meetings, Agendas, and Information",http://www.ucfsd.org,,Unionville-Chadds Ford School District,False,False,
1800,https://go.boarddocs.com/pa/boyr/Board.nsf/Public,"Meetings, Agendas, Information",http://www.boyertownasd.org,,,False,False,
2077,https://go.boarddocs.com/mo/unionrxi/Board.nsf/Public,"Meetings, Agendas, and Information",https://www.unionrxi.org,,Union R-XI School District,False,False,
2472,https://go.boarddocs.com/pa/iu24/Board.nsf/Public,Board Agendas and Policy Manual,http://www.cciu.org,"455 Boot Road, Downingtown, PA 19335 | 484-237-5000",,False,False,
2521,https://go.boarddocs.com/oh/brightoh/Board.nsf/Public,"Meeting Agendas, Minutes and Policies",https://www.blsd.us,,Bright Local School District,False,False,
2646,https://go.boarddocs.com/oh/mayoh/Board.nsf/Public,Meeting Agendas & Minutes,http://www.mayfieldschools.org,,Mayfield City School District,False,False,


In [2267]:
# remove them

df.loc[df["title_1"].str.contains("agenda",case=False,na=False), "title_1"] = None

In [2268]:
# check those with commas
# could be addresses
# check those without addresses
df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
723,https://go.boarddocs.com/il/thsd211/Board.nsf/Public,"G.A. McElroy Administration Center, 1750 South Roselle Road, Palatine, Illinois",http://www.adc.d211.org,,,False,False,
882,https://go.boarddocs.com/il/asd4/Board.nsf/Public,"222 N. Kennedy Drive, Addison, IL",http://www.asd4.org,,Addison School District 4,False,False,
977,https://go.boarddocs.com/mo/rockport/Board.nsf/Public,"600 S Nebraska St | Rock Port, MO",https://rockport.k12.mo.us/,,Rock Port R-II School District,False,False,
2134,https://go.boarddocs.com/in/pike/Board.nsf/Public,"Administrative Services Center - 6901 Zionsville Road, Indianapolis, IN & via Live Streaming at: https://www.youtube.com/channel/UCsnM2UOzNfDocPPLZzktSOg",http://www.pike.k12.in.us/,,Metropolitan School District of Pike Township,False,False,
2505,https://go.boarddocs.com/wv/brooke/Board.nsf/Public,"Brooke County Schools - Excellence, Tradition and Bruin Pride!",https://www.brooke.k12.wv.us,,,False,False,
2627,https://go.boarddocs.com/nj/hhboe/Board.nsf/Public,"316-A Seventh Ave Haddon Heights, New Jersey",http://gogarnets.com/,,Haddon Heights School District,False,False,
2710,https://go.boarddocs.com/ak/swrsdak/Board.nsf/Public,"... educating our future, guided by our past",http://www.swrsd.org,,Southwest Region School District,False,False,
2729,https://go.boarddocs.com/ca/cvesd/Board.nsf/Public,"84 East J Street , Chula Vista , CA91910",https://www.cvesd.org/,,Chula Vista Elementary School District,False,False,
2824,https://go.boarddocs.com/mo/wpr7sd/Board.nsf/Public,"Excellence in Education, Service, Life.",https://www.zizzers.org,,West Plains School District,False,False,
3287,https://go.boarddocs.com/wi/afasd/Board.nsf/Public,"201 W. 6th Street, Friendship, WI",https://www.afasd.net/,,ADAMS-FRIENDSHIP AREA SCHOOL DISTRICT,False,False,


In [2269]:
# get those with numbers first

df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna() & df["title_1"].str.contains(number_pattern),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
723,https://go.boarddocs.com/il/thsd211/Board.nsf/Public,"G.A. McElroy Administration Center, 1750 South Roselle Road, Palatine, Illinois",http://www.adc.d211.org,,,False,False,
882,https://go.boarddocs.com/il/asd4/Board.nsf/Public,"222 N. Kennedy Drive, Addison, IL",http://www.asd4.org,,Addison School District 4,False,False,
977,https://go.boarddocs.com/mo/rockport/Board.nsf/Public,"600 S Nebraska St | Rock Port, MO",https://rockport.k12.mo.us/,,Rock Port R-II School District,False,False,
2134,https://go.boarddocs.com/in/pike/Board.nsf/Public,"Administrative Services Center - 6901 Zionsville Road, Indianapolis, IN & via Live Streaming at: https://www.youtube.com/channel/UCsnM2UOzNfDocPPLZzktSOg",http://www.pike.k12.in.us/,,Metropolitan School District of Pike Township,False,False,
2627,https://go.boarddocs.com/nj/hhboe/Board.nsf/Public,"316-A Seventh Ave Haddon Heights, New Jersey",http://gogarnets.com/,,Haddon Heights School District,False,False,
2729,https://go.boarddocs.com/ca/cvesd/Board.nsf/Public,"84 East J Street , Chula Vista , CA91910",https://www.cvesd.org/,,Chula Vista Elementary School District,False,False,
3287,https://go.boarddocs.com/wi/afasd/Board.nsf/Public,"201 W. 6th Street, Friendship, WI",https://www.afasd.net/,,ADAMS-FRIENDSHIP AREA SCHOOL DISTRICT,False,False,


In [2270]:
# move them to the address
idx = df["title_1"].str.contains(",",na=False) & df["address"].isna() & df["title_1"].str.contains(number_pattern)
df.loc[idx,"address"] = df["title_1"]
df.loc[idx,"title_1"] = None

In [2271]:
# check those with commas
# could be addresses
# check those without addresses
df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2505,https://go.boarddocs.com/wv/brooke/Board.nsf/Public,"Brooke County Schools - Excellence, Tradition and Bruin Pride!",https://www.brooke.k12.wv.us,,,False,False,
2710,https://go.boarddocs.com/ak/swrsdak/Board.nsf/Public,"... educating our future, guided by our past",http://www.swrsd.org,,Southwest Region School District,False,False,
2824,https://go.boarddocs.com/mo/wpr7sd/Board.nsf/Public,"Excellence in Education, Service, Life.",https://www.zizzers.org,,West Plains School District,False,False,


In [2272]:
# move the first one to address and remove the rest, which are slogans
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "title_1"] = None

df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),"title_1"] = None

In [2273]:
# check the remaining ones
print(df.loc[~df["title_1"].isna()].shape[0])
df.loc[~df["title_1"].isna()].sample(5)

1757


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
784,https://go.boarddocs.com/wi/sbschools/Board.nsf/Public,Stanley-Boyd Area Schools,,,,False,False,
3582,https://go.boarddocs.com/in/ltschools/Board.nsf/Public,MSD of Lawrence Township,https://www.ltschools.org,"6501 Sunnyside Road | Indianapolis, IN 46236 | p (317) 423-8200 | f (317) 543-3534",,False,False,
1807,https://go.boarddocs.com/il/bpd/Board.nsf/Public,Bensenville Park District,http://www.bvilleparks.org/pages/index.html,"1000 W. Wood Ave | Bensenville, Illinois 60106 | 630-766-7015",,False,False,
2606,https://go.boarddocs.com/mi/fiacad/Board.nsf/Public,Frontier International Academy,https://frontierdetroit.geeacademies.net/,"13200 Conant St., Detroit, MI 48212 Phone: 313-462-6300",,False,False,
2187,https://go.boarddocs.com/vsba/waynesboro/Board.nsf/Public,Waynesboro Public Schools,http://www.waynesboro.k12.va.us,"301 Pine Ave., Waynesboro, Virginia 22980",,False,False,


In [2274]:
# check those with address na too
print(df.loc[~df["title_1"].isna() & df["address"].isna()].shape[0])
df.loc[~df["title_1"].isna() & df["address"].isna()].sample(5)

559


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1942,https://go.boarddocs.com/in/madis/Board.nsf/Public,Madison Consolidated Schools,http://www.madison.k12.in.us,,,False,False,
1055,https://go.boarddocs.com/mi/alpena/Board.nsf/Public,Alpena Public Schools,https://www.alpenaschools.com/,,,False,False,
1528,https://go.boarddocs.com/oh/pett/Board.nsf/Public,Pettisville Local Schools,www.pettisvilleschools.org,,,False,False,
2964,https://go.boarddocs.com/mi/buca/Board.nsf/Public,Buchanan Community Schools,https://www.buchananschools.com/,,,False,False,
1753,https://go.boarddocs.com/mi/dew/Board.nsf/Public,DeWitt Public Schools,http://www.dewittschools.net,,,False,False,


In [2275]:
# check those with school_district na too
print(df.loc[~df["title_1"].isna() & df["school_district"].isna()].shape[0])
df.loc[~df["title_1"].isna() & df["school_district"].isna()].sample(5)

1704


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2771,https://go.boarddocs.com/nc/lexcs/Board.nsf/Public,Lexington City Schools NC,http://www.lexcs.org,"1010 Fair St., Lexington, NC 27292 | 336-242-1527",,False,False,
2200,https://go.boarddocs.com/fla/vcsfl/Board.nsf/Public,Volusia County Schools,https://www.vcsedu.org/,"200 North Clara Ave | DeLand, FL 32720 | Ph: 386.734.7190 Fx: 386.734.2842",,False,False,
1484,https://go.boarddocs.com/oh/tricountycc/Board.nsf/Public,Tri-County Career Center,https://www.tricountyhightech.com/,"15676 State Rt 691 | Nelsonville, OH 45764 | 740-753-3511",,False,False,
1462,https://go.boarddocs.com/ar/rpsd30/Board.nsf/Public,Rogers Public Schools,http://www.rogersschools.net/,"500 W. Walnut Street | Rogers, AR 72756 | (479) 636-3910",,False,False,
499,https://go.boarddocs.com/in/ecesc/Board.nsf/Public,East Central Educational Service Center,http://www.ecesc.k12.in.us,"705 West 21st Street | Connersville, IN 47331 | Ph: (765) 825-1247 Fx: (765) 825-2532",,False,False,


In [2276]:
# check those with non-alphabets in title_1
non_alpha_pattern = r'[^A-Za-z ]'
print(df.loc[df["title_1"].str.contains(non_alpha_pattern, na=False)].shape[0])
df.loc[df["title_1"].str.contains(non_alpha_pattern, na=False)].sample(5)

319


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
229,https://go.boarddocs.com/il/cusd5/Board.nsf/Public,Ball-Chatham Board of Education,http://www.chathamschools.org/,"1475 E. Plummer Blvd Chatham, Illinois 62629",,False,False,
1397,https://go.boarddocs.com/in/risun/Board.nsf/Public,Rising Sun-Ohio County Community School Corporation,http://www.risingsunschools.com,,,False,False,
3156,https://go.boarddocs.com/il/elmhurst/Board.nsf/Public,Elmhurst Community Unit School Dist. 205,http://www.elmhurst205.org,,,False,False,
1484,https://go.boarddocs.com/oh/tricountycc/Board.nsf/Public,Tri-County Career Center,https://www.tricountyhightech.com/,"15676 State Rt 691 | Nelsonville, OH 45764 | 740-753-3511",,False,False,


In [2277]:
# for those that have addresses set, we can move them to school_district
df.loc[~df["address"].isna() & ~df["title_1"].isna()]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2,https://go.boarddocs.com/oh/mapleheights/Board.nsf/Public,Maple Heights City Schools,http://www.mapleschools.com,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",,False,False,
10,https://go.boarddocs.com/in/centergrove/Board.nsf/Public,Center Grove Community School Corporation,https://www.centergrove.k12.in.us/,"4800 West Stones Crossing Road | Greenwood, IN 46143 | (317) 881-9326",,False,False,
12,https://go.boarddocs.com/oh/pcc/Board.nsf/Public,Penta Career Center,http://www.pentacareercenter.org/,"9301 Buck Rd. | Perrysburg, Ohio 43551 | High School: 419-666-1120 Adult Education: 419-661-6555",,False,False,
15,https://go.boarddocs.com/oh/westholmes/Board.nsf/Public,West Holmes Local Schools,https://westholmes.org/,"28 W Jackson St. | Millersburg, OH 44654 | 330-674-3546",,False,False,
16,https://go.boarddocs.com/mabe/carps/Board.nsf/Public,Caroline County Public Schools,carolineschools.org,"Address: 204 Franklin Street | Denton, MD 21629 | Phone: (410) 479-1460",,False,False,
...,...,...,...,...,...,...,...,...
3897,https://go.boarddocs.com/co/jeffco/Board.nsf/Public,Jeffco Public Schools Board of Education,http://www.jeffcopublicschools.org/,1829 Denver West Drive | Golden. CO 80401 | (303) 982-6800,,False,False,
3902,https://go.boarddocs.com/il/cowil/Board.nsf/Public,City of Waukegan,http://www.waukeganil.gov/,"100 N. Martin Luther King Jr. Ave. | Waukegan, IL 60085",,False,False,
3903,https://go.boarddocs.com/ca/laccd/Board.nsf/Public,Los Angeles Community College District,http://laccd.edu,"770 Wilshire Boulevard, Los Angeles, CA 90017 | (213) 891-2000",,False,False,
3904,https://go.boarddocs.com/md/stmarysco/Board.nsf/Public,"St. Mary's County, Maryland",http://www.stmarysmd.com,"41770 Baldridge Street | Leonardtown, MD 20650 | 301-475-4200",,False,False,


In [2278]:
# check if there are any where all three are set
df.loc[~df["address"].isna() & ~df["title_1"].isna() & ~df["school_district"].isna()]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone


In [2279]:
# not really sure if it will be entirely correct
# check for those without address set
print(df.loc[df["address"].isna() & ~df["title_1"].isna()].shape[0])
df.loc[df["address"].isna() & ~df["title_1"].isna()].head()

559


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,St. Joseph Public Schools,https://www.sjschools.org/,,,False,False,
6,https://go.boarddocs.com/pa/fcctc/Board.nsf/Public,Franklin County Career and Technology Center,www.franklinctc.com,,,False,False,
9,https://go.boarddocs.com/ca/voc/Board.nsf/Public,Vista Oaks Charter School,https://www.vistaoaks.net,,,False,False,
20,https://go.boarddocs.com/mi/man/Board.nsf/Public,Manton Consolidated Schools,http://mantonmi.apptegy.us/o/mcs?mode=edit,,,False,False,
21,https://go.boarddocs.com/pa/cdsd/Board.nsf/Public,Quality to the Core,http://www.cdschools.org,,Central Dauphin School District,False,False,


In [2280]:
# not really sure if it will be entirely correct
# check for those with both unset
print(df.loc[df["address"].isna() & df["school_district"].isna() & ~df["title_1"].isna()].shape[0])
df.loc[df["address"].isna() & df["school_district"].isna() & ~df["title_1"].isna()].sample(5)

506


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,St. Joseph Public Schools,https://www.sjschools.org/,,,False,False,
9,https://go.boarddocs.com/ca/voc/Board.nsf/Public,Vista Oaks Charter School,https://www.vistaoaks.net,,,False,False,
54,https://go.boarddocs.com/mi/rogers/Board.nsf/Public,Rogers City Area Schools,http://www.rcashurons.org,,,False,False,
44,https://go.boarddocs.com/mi/repuf/Board.nsf/Public,Reeths-Puffer Schools,http://www.reeths-puffer.org,,,False,False,
3361,https://go.boarddocs.com/pa/fcvts/Board.nsf/Public,Fayette County Career and Technical Institute,http://www.fayettecti.org,,,False,False,


In [2281]:
# check how many we have left
sum(~df["title_1"].isna())

1757

In [2282]:
# send to GPT to deal with it
df.to_csv("test.csv", columns=["title_1","address","school_district"], index=False)

In [2283]:
# I gave GPT-4o the following prompt
"""
This csv has three columns: title_1, address, and school_district. Many values are empty.

Your goal is to move the value from title_1 to either address, or school_district, or simply discard it. Do not overwrite any values.

For example, for the row

Christopher Newport University,"1 Avenue of the Arts Newport News, VA 23606 | (757) 594-7000",

You should put Christopher Newport University as the school_district, which is third column, that is

,"1 Avenue of the Arts Newport News, VA 23606 | (757) 594-7000","Christopher Newport University"

Another example

Quality to the Core,,Central Dauphin School District

You should just drop the title_1, since it is a slogan

,,Central Dauphin School District

Another example

75 Chenango Ave Clinton NY13323,,Clinton Central School District

You should put this in the address column

,75 Chenango Ave Clinton NY13323,Clinton Central School District

Return a csv
"""

# followed by 

"""
Ok. Now instead of moving, add a fourth column that says "address", "school_district", "None" or "drop"
"""

# it returns the following csv

'\nOk. Now instead of moving, add a fourth column that says "address", "school_district", "None" or "drop"\n'

In [2284]:
gpt_df = pd.read_csv("classified_test_by_gpt.csv")
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,classification
31,,,Commodore Perry School District,
907,,"135 West Crooked Hill Road | Pearl River, NY 10965 | Tel: 845.620.3900 | Fx: 845.620.3927",Pearl River School District,
1279,Fort Leavenworth USD 207,"207 Education Way | Fort Leavenworth, KS 66027 | Ph: 913-651-7373 Fx: 913-758-6010",,school_district
3533,Butler County Mental Health and Addiction Recovery Services Board,"5963 Boymel Drive | Fairfield, OH 45014-5541 | p (513) 860-9240 | f (513) 860-9241",,school_district
2109,,"5 Jardine Place North Babylon, NY 11703",North Babylon School District,
74,,,Ridley School District Policy Manual,
2786,,,Wright City R II School District | Dedicated to our students' success,
515,Scarsdale Public Schools,"2 Brewster Road | Scarsdale, NY 10583 | 914-721-2400",,school_district
2133,ISANA Academies,"3580 Wilshire Blvd. Suite 1130 Los Angeles, CA 90010 | 323.291.1211",,school_district
1336,The Bronxville School,"177 Pondfield Road | Bronxville, New York 10708 | Phone (914) 395-0500",,school_district


In [2285]:
# let's check its distribution
gpt_df["classification"].value_counts(dropna=False)

classification
NaN                2149
school_district    1704
address              53
Name: count, dtype: int64

In [2286]:
# interestingly, there are no drops
# let's look at the slogan row
gpt_df[gpt_df["title_1"]=="Quality to the Core"]

Unnamed: 0,title_1,address,school_district,classification
21,Quality to the Core,,Central Dauphin School District,address


In [2287]:
# this is a pretty bad classification
# turns out GPT was using Python under the hood
# I asked GPT to classify manually instead
# here is its results
gpt_df = pd.read_csv("manual_classified_test.csv")
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,classification
1617,Avon Lake City Schools,"175 Avon Belden Road | Avon Lake, OH 44012 | 440-933-6210",,school_district
2550,,"65 Challenger Road, Ste. 360, Ridgefield Park, NJ 07660",Ridgefield Park School District,
1583,,"7448 Fox Road, PO Box 189 | Hughson, CA 95326 | p (209) 883-4428 | f (209) 883-4639",Hughson Unified School District,
2506,Alcona Community Schools,,,school_district
1371,"Town of Hadley, Massachusetts","100 Middle Street | Hadley, MA 01035 | (413) 586-0221 | f 413-586-5661",,school_district
264,,"Birmingham Public Schools - 31301 Evergreen Road, Beverly Hills, MI 48025",,
3712,,"850 Maple Street | Glenwood City, WI 54013 • (715) 265-4757 • fax (715) 265-4214",Glenwood City School District,
3169,Auglaize County Educational Service Center,,,school_district
137,,"6028 Broadway | West New York, NJ 07093 | (201) 553-4000",West New York School District,
861,Bay-Arenac ISD,,,school_district


In [2288]:
# still terrible, and I realized it still uses python under the hood
# I will start a new session and emphasize not to program
# let's check a few stats first

# number of NA title_1
print(df[df["title_1"].isna()].shape[0])

# number of none NA title_1
print(df[~df["title_1"].isna()].shape[0])

2149
1757


In [2289]:
df.columns

Index(['URL', 'title_1', 'home_website', 'address', 'school_district',
       'contains_phone_number_title_1', 'contains_phone_number_title_2',
       'phone'],
      dtype='object')

In [2290]:
df = df.drop(columns=["contains_phone_number_title_1","contains_phone_number_title_2"])

In [2291]:
# I have used a new prompt; see prompts/prompt_1.txt
# now let's read the reply
gpt_df = pd.read_csv("gpt_results/rows_classify.csv")
gpt_df.shape

(227, 1)

In [2292]:
gpt_df = pd.concat([df.iloc[:221, :], gpt_df], axis=1)
gpt_df.sample(10)

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,action
220,https://go.boarddocs.com/nj/longbranch/Board.nsf/Public,,https://www.longbranch.k12.nj.us/,"540 Broadway Long Branch, NJ 07740",Long Branch Public School District,,school_district
69,https://go.boarddocs.com/pa/cvsdpa/Board.nsf/Public,Chartiers Valley Board of School Directors,https://www.cvsd.net/about-cvsd/board-of-directors,,,,address
201,https://go.boarddocs.com/wa/mpswa/Board.nsf/Public,,http://www.monroe.wednet.edu,"14692 179th AVE SE |Monroe, WA 98272 | Phone: 360-804-2501",Monroe School District,,school_district
217,https://go.boarddocs.com/sc/kcsd/Board.nsf/Public,,http://www.kcsdschools.net,"2029 West DeKalb Street | Camden, SC 29020 | P: (803) 432-8416 | F: (803) 425-8918",Kershaw County School District,,school_district
163,https://go.boarddocs.com/mi/ros/Board.nsf/Public,Royal Oak Schools,http://www.royaloakschools.org,,,,school_district
41,https://go.boarddocs.com/ia/cojia/Board.nsf/Public,"City of Johnston, Iowa",http://www.cityofjohnston.com/,"6221 Merle Hay Road | Johnston, IA 50131 | 515-278-2344 | f 515-278-2033",,,address
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,,http://www.hartlandschools.us,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,,,school_district
113,https://go.boarddocs.com/ca/khsd/Board.nsf/Public,,http://www.kernhigh.org,"5801 Sundale Ave.| Bakersfield, CA 93309 | Tel: (661) 827-3100 | Fax:(661) 827-3301",Kern High School District,,school_district
144,https://go.boarddocs.com/il/bbchs/Board.nsf/Public,Bradley-Bourbonnais CHSD #307,http://www.bbchs.org/,"700 West North Street, Bradley, IL 60915 | p (815) 937-3707 | f (815) 937-0156",,,school_district
143,https://go.boarddocs.com/fl/sbmon/Board.nsf/Public,,www.keysschools.com,"241 Trumbo Road Key West, FL 33040 - Phone 305-293-1400 ext. 53323",Monroe County School District,,school_district


In [2293]:
gpt_df.sample(10)

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,action
98,https://go.boarddocs.com/pa/tama/Board.nsf/Public,,http://www.tamaqua.k12.pa.us,,Tamaqua Area School District,,school_district
197,https://go.boarddocs.com/ca/guerneville/Board.nsf/Public,,https://guernevilleschool.org/,"14630 Armstrong Woods Road, Guerneville, CA 95446 | 707.869.2864",Guerneville School District,,school_district
112,https://go.boarddocs.com/mi/eaglesnest/Board.nsf/Public,Eagle's Nest Academy,http://eaglesnestflint.org/,"5005 Cloverlawn Dr, Flint, MI 48504 Phone: (810) 869-6495",,,school_district
100,https://go.boarddocs.com/mi/grand/Board.nsf/Public,Grand Rapids Public Schools,https://www.grps.org/,"1331 Martin Luther King Jr. St. SE | Grand Rapids, MI 49506 | 616.819.2000",,,school_district
82,https://go.boarddocs.com/mi/riverside/Board.nsf/Public,Riverside Academy,https://riversidewest.geeacademies.net/,"6409 Schaefer Rd., Dearborn, MI 48126 (West) Phone: (313) 624-3600",,,school_district
80,https://go.boarddocs.com/oh/lcjvs/Board.nsf/Public,Lorain County JVS,https://www.lcjvs.com,"15181 State Route 58, Oberlin, OH 44074",,,school_district
76,https://go.boarddocs.com/pa/iu11/Board.nsf/Public,Tuscarora Intermediate Unit 11,https://www.tiu11.org/,,,,address
214,https://go.boarddocs.com/ks/mzufsd266/Board.nsf/Public,,http://www.usd266.com,"905 W Academy Avenue • Maize, KS 67101 • Phone (316) 722-0614",Maize Unified School District 266,,school_district
211,https://go.boarddocs.com/ca/esuhsd/Board.nsf/Public,,www.esuhsd.org,"830 N. Capitol Avenue | San Jose, California 95133 | 408-347-5000",EAST SIDE UNION HIGH SCHOOL DISTRICT,,school_district
10,https://go.boarddocs.com/in/centergrove/Board.nsf/Public,Center Grove Community School Corporation,https://www.centergrove.k12.in.us/,"4800 West Stones Crossing Road | Greenwood, IN 46143 | (317) 881-9326",,,school_district


In [2294]:
# still bad
# try again with index
# also only include none-NAs
df[~df["title_1"].isna()].to_csv("test.csv", columns=["title_1","address","school_district"])

In [2295]:
df["index"] = df.index

In [2296]:
# get the results
gpt_df = pd.read_csv("gpt_results/row_classify_2.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df.sample(10)

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action
128,https://go.boarddocs.com/oh/minls/Board.nsf/Public,Minford Local Schools,www.minford.k12.oh.us,,,,262,school_district
73,https://go.boarddocs.com/ks/kta/Board.nsf/Public,Kansas Turnpike Authority,http://www.ksturnpike.com/,"9401 E Kellogg | Wichita, KS 67207 | (316) 682-4537",,,152,
15,https://go.boarddocs.com/mi/ioni/Board.nsf/Public,Ionia Public Schools,http://www.ioniaschools.org,,,,36,school_district
139,https://go.boarddocs.com/co/bromley/Board.nsf/Public,Bromley East Charter School,https://www.bromleyeastcs.org,"356 Longspur Dr Brighton, CO 80601 | 720-685-3297",,,285,
195,https://go.boarddocs.com/vsba/kwcps/Board.nsf/Public,King William County Public Schools,http://www.kwcps.k12.va.us/,"18548 King William Road | King William, VA 23086 | p (804) 769-3434 | f (804) 769-3312",,,420,
161,https://go.boarddocs.com/in/baugo/Board.nsf/Public,Baugo Community Schools,http://www.baugo.org,,,,336,school_district
14,https://go.boarddocs.com/wa/cdhy/Board.nsf/Public,Center for Deaf and Hard of Hearing Youth,https://www.cdhy.wa.gov/,,,,35,school_district
57,https://go.boarddocs.com/mi/trentonmi/Board.nsf/Public,"City of Trenton, Michigan",http://www.trentonmi.org/,"2800 Third Street | Trenton, MI 48183 | (734) 675-8600",,,127,
152,https://go.boarddocs.com/oh/sugar/Board.nsf/Public,Bellbrook-Sugarcreek Local Schools,https://www.sugarcreek.k12.oh.us/,"St.Pierre Education Center, 3757 Upper Bellbrook Rd, Bellbrook OH 45305; 937-848-5001",,,316,
236,https://go.boarddocs.com/ca/gaen/Board.nsf/Public,Gateway Adult Education Network,http://www.mygaen.org,,,,506,school_district


In [2297]:
# let's look at the relevent cols for now
gpt_df = gpt_df[["title_1","address","school_district","action"]]

In [2298]:
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
201,Kalamazoo RESA,"1819 E. Milham Ave., Portage, MI 49002 (269) 250-9200",,
7,Caroline County Public Schools,"Address: 204 Franklin Street | Denton, MD 21629 | Phone: (410) 479-1460",,
155,Peninsula Health Care District,"1819 Trousdale Drive | Burlingame, CA 94010 | (650) 697-6900",,
27,Chesapeake Public Schools,"1421 Kristina Way, Chesapeake, VA 23320",,
19,Ridgemont Local Schools,"560 W. Taylor Street Mount Victory, OH",,
106,Giles County Public Schools,"151 School Road, Pearisburg, VA 24134",,
11,"Meade County, South Dakota","1300 Sherman Street Ste 222 | Sturgis, SD 57785",,
35,Tuscarora Intermediate Unit 11,,,school_district
71,Holland Public Schools,,,school_district
176,Autism Model of Toledo,"3020 Tremainsville Rd, Toledo, OH 43613",,


In [2299]:
# get the results
gpt_df = pd.read_csv("gpt_results/row_classify_3.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
196,Olmsted Falls City Schools,"26937 Bagley Road | Olmsted Falls, Ohio 44138 | p (440) 427-6000 | f (440) 427-6010",,school_district
218,Barstow Community College,"2700 Barstow Road • Barstow, California 92311 • 760.252.2411 • FAX 760.252.1875 • TTY 760. 252.6759",,address
220,Riverview Intermediate Unit 6,,,school_district
116,Ball-Chatham Board of Education,"1475 E. Plummer Blvd Chatham, Illinois 62629",,address
113,Kent County Public Schools Board of Education,"5608 Boundary Ave | Rock Hall, MD 21661 | 410-778-1595",,school_district
74,Old Mission Peninsula School,"2699 Island View Rd | Traverse City, MI 49686 | (231) 252-0225",,address
60,Monroe Central School Corporation,,,school_district
42,Rancocas Valley Regional High School,"520 Jacksonville Road | Mt. Holly, NJ 08060 | (609) 267-0830",,school_district
147,Kingsport City Schools eGovernance,,,school_district
45,The Greenspire School,"1026 Red Drive, Traverse City, Michigan 49684 Phone: (231) 421-5905",,school_district


In [2300]:
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
25,Clermont Northeastern Schools,2792 U.S. 50 | Batavia OH 45103 | (513) 625-1211,,school_district
169,Mendocino-Lake Community College District,"1000 Hensley Creek Road, Ukiah, CA",,address
114,Home of the Tigers,"403 Audubon Road Howards Grove, WI 53083",,drop
198,Goshen Local Schools,"6694 Goshen Rd. | Goshen, OH 45122 | 513-722-2222",,school_district
219,Tri County Area Schools,,,school_district
12,Hanley International Academy,"2400 Denton St., Hamtramck, MI 48212 Phone: (313) 875-8888",,school_district
3,Vista Oaks Charter School,,,school_district
182,Sparta Board of Education,"18 Mohawk Avenue Sparta, NJ 07871 | 973-729-2155",,school_district
162,Lakeland School Corporation,,,school_district
225,EHOVE Career Center,"316 W. Mason Rd Milan, OH 44846 Ph 419-499-4663",,school_district


In [2301]:
# the address for "Kansas Turnpike Authority" is overwritten
# but the GPT is getting there

In [2302]:
# get the results
gpt_df = pd.read_csv("gpt_results/row_classify_4.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
217,Colorado Charter School Institute,"1525 Sherman St | B76 | Denver, CO 80203 | Ph:303-866-3299 | Fx:303-866-2530",,
25,Clermont Northeastern Schools,2792 U.S. 50 | Batavia OH 45103 | (513) 625-1211,,
141,Dalton Local Schools,,,school_district
75,75 Chenango Ave Clinton NY13323,,Clinton Central School District,
124,Lakewood Board of Education,"200 Ramsey Ave., Lakewood, NJ 08701 - 732-364-2400",,
238,Kensington Woods Schools,"9501 Pettys Road, Lakeland, MI 48143 Phone: 517-545-0828",,
16,Macon County R-1 Schools,"702 North Missouri  Macon, Missouri 63552  (660) 395-6164",,
171,Madison District Public Schools,,,school_district
166,Rappahannock County Public Schools,"6 School House Road, Washington, VA 22747 Phone: 540-227-0023",,
201,Kalamazoo RESA,"1819 E. Milham Ave., Portage, MI 49002 (269) 250-9200",,


In [2303]:
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
145,Eaton Rapids Public Schools,,,school_district
221,St. Louis Public Schools,"801 N. 11th Street • St. Louis, MO, 63101 • P: 314-231-3720",,
230,Argos Community Schools,,,school_district
70,West Jefferson Local Schools,"906 West Main Street | West Jefferson, OH 43162 | 614-879-7654",,
246,Erie County Technical School,"8500 Oliver Road, Erie, PA 16509 / 814-464-8600 Phone",,
131,Tuscarora S D,,Tuscarora School District,drop
68,Washington County Public Schools,"812 Thompson Drive | Abingdon, VA 24210 | Phone: (276)739-3000",,
243,Union County Public Schools,"400 N. Church Street, Monroe, NC 28112 | 704-296-9898",,
148,Imperial County Office of Education,"1398 Sperber Road | El Centro, CA 92243 | p (760) 312-6464 | f (760) 312-6565",,
97,Shamong Township Schools,"295 Indian Mills Road | Shamong, NJ 08088 | (609) 268-0316 | f (609) 268-1229",,


In [2304]:
# get the results
gpt_df = pd.read_csv("gpt_results/results_5.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
201,Kalamazoo RESA,"1819 E. Milham Ave., Portage, MI 49002 (269) 250-9200",,address
100,BOARD OF EDUCATION,,Park City School District,drop
47,Edgar Excellence,,Edgar School District 203 East Birch Street 715-352-2351,drop
225,EHOVE Career Center,"316 W. Mason Rd Milan, OH 44846 Ph 419-499-4663",,address
26,Andover Public Schools,"1432 N. Andover Rd., Andover, KS 67002",,address
178,Northview Public Schools,"4365 Hunsberger Ave, NE | Grand Rapids, MI 49525 | 616.365.6861",,address
147,Kingsport City Schools eGovernance,,,drop
97,Shamong Township Schools,"295 Indian Mills Road | Shamong, NJ 08088 | (609) 268-0316 | f (609) 268-1229",,address
3,Vista Oaks Charter School,,,school_district
9,Quality to the Core,,Central Dauphin School District,drop


In [2305]:
# get the results
gpt_df = pd.read_csv("gpt_results/results_6.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
74,Old Mission Peninsula School,"2699 Island View Rd | Traverse City, MI 49686 | (231) 252-0225",,address
14,Center for Deaf and Hard of Hearing Youth,,,drop
194,Calvert County Public Schools,,,school_district
134,Gwinnett County Public Schools,,,school_district
156,Bergen County Technical Schools,"540 Farview Ave. , Paramus , NJ 07652",,address
50,San Benito CISD,"240 N. Crockett St., San Benito, TX 78586 | 956-361-6100",,address
36,LaSalle Parish School Board,"3012 N First Street Jena, Louisiana 71342",,address
136,Leipsic Local Schools,,,school_district
64,Board Meeting Documents and Policies,,IROQUOIS SCHOOL DISTRICT,drop
197,Montgomery County Intermediate Unit #23,"2 West Lafayette Street, Norristown, PA 19401",,address


In [2306]:
# get the results
gpt_df = pd.read_csv("gpt_results/results_7.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
153,Southington Local Schools,"2482 St. Rt. 534 | Southington, OH 44470 | 330-898-7480",,address
115,Lebanon County Career & Technology Center,,,school_district
18,"City of Johnston, Iowa","6221 Merle Hay Road | Johnston, IA 50131 | 515-278-2344 | f 515-278-2033",,address
60,Monroe Central School Corporation,,,school_district
94,SER YouthBuild Learning Academy,"9215 Michigan Ave, Detroit, MI 48210",,address
27,Chesapeake Public Schools,"1421 Kristina Way, Chesapeake, VA 23320",,address
236,Gateway Adult Education Network,,,school_district
174,Illinois Mathematics and Science Academy,"1500 Sullivan Road | Aurora, IL 60506-1000 | 630-907-5000 |",,address
62,Englewood Schools - Building a New Tradition of Excellence,"4101 South Bannock Street | Englewood, CO 80110 | Ph: 303-761-7050",,address
157,Jackson City Schools,"450 Vaughn Street | Jackson, Ohio 45640 | 740-286-6442 | f 740-286-6445​",,address


In [2307]:
# I decided to ask o1-mini
# get the results
gpt_df = pd.read_csv("gpt_results/results_8.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action
34,Huntington County Community School Corporation,"1063 E 900 S | Warren, IN 46792 | p (260) 356-8312 | f (260) 358-2222",,school_district
430,Texas Alcoholic Beverage Commission,"5806 Mesa Drive, Austin, Texas 78731 | 512-206-3333",,drop
175,Warren County Educational Service Center,,,school_district
998,DINWIDDIE COUNTY SCHOOL BOARD,,,school_district
164,New Kent County Public Schools,"12003 New Kent HWY | New Kent, VA 23124 | Tel: 804.966.9650",,school_district
387,Bristol Local SD,,,school_district
625,Learning for Life,LSR-7,,drop
1147,Ivywood Classical Academy,"14356 Genoa Ct., Plymouth, MI 48170 Phone: (248) 207-1757",,school_district
69,Bradley-Bourbonnais CHSD #307,"700 West North Street, Bradley, IL 60915 | p (815) 937-3707 | f (815) 937-0156",,school_district
705,Lincolnview Local Schools,"15945 Middle Point Rd. | Van Wert, OH 45891 | p (419) 968-2226 | f (419) 968-2227",,school_district


In [2308]:
# this looks very promising.
# I will concat the rest in
gpt_df = pd.read_csv("gpt_results/results_8.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action","index"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action,index
1225,Avon Community School Corporation,"7203 E US Hwy 36 Avon, IN 46123 | 317-544-6000",,school_district,2741
479,Oxford Public Schools,"50 Great Oak Road, Suite A, Oxford, CT 06478",,school_district,1057
1331,North West Hendricks School Corporation IN,"104 N. Church Street, PO Box 70, Lizton, IN 46149 | 317-994-4100",,school_district,2904
485,Lafayette Elementary,"3477 School Street, Lafayette, California 94549 | (925) 927-3500",,school_district,1074
1501,Beaver Valley Intermediate Unit 27,,,school_district,3249
1185,Cloverdale Community Schools,,,drop,2666
5,Penta Career Center,"9301 Buck Rd. | Perrysburg, Ohio 43551 | High School: 419-666-1120 Adult Education: 419-661-6555",,school_district,12
776,Arlington Public Schools,"2110 Washington Blvd | Arlington, VA 22204 | Phone: 703-228-6015",,school_district,1737
396,Van Wert City Schools,,,school_district,884
1868,City of Waukegan,"100 N. Martin Luther King Jr. Ave. | Waukegan, IL 60085",,school_district,3902


In [2309]:
# check those with drop
drop = gpt_df[gpt_df["action"]=="drop"]
drop.shape

(303, 5)

In [2310]:
drop.sample(10)

Unnamed: 0,title_1,address,school_district,action,index
1023,4904 Route 982,,,drop,2270
381,"Silver Spring Township, Pennsylvania","8 Flowers Drive Mechanicsburg, PA 17050 Ph: 717-766-0178 FX: 717-766-1696",,drop,843
1848,,"N11941 Hwy 141, PO Box 258, Wausaukee, Wisconsin 54177 | 715 856-5151 Phone | 715 856-6592 Fax",School District of Wausaukee,drop,3869
1351,,"91 West Valley Road - PO BOX 278 Chimacum, WA 98325 | 360.302.5890",Chimacum School District,drop,2937
1839,Centreville Public Schools,,,drop,3853
1721,Mesick Consolidated Schools,,,drop,3632
1804,Colorado Board of Education,,,drop,3783
1363,,"290 Norwood Avenue, Port Jefferson Station, NY 11776 | 631-474-8100",Comsewogue School District,drop,2956
47,Edgar Excellence,,Edgar School District 203 East Birch Street 715-352-2351,drop,106
1773,Menominee Area Public Schools,,,drop,3721


In [2311]:
# check those with na
drop = gpt_df[gpt_df["action"]=="None"]
drop.shape

(0, 5)

In [2312]:
# check those with address
addr = gpt_df[gpt_df["action"]=="address"]
addr.shape

(3, 5)

In [2313]:
addr

Unnamed: 0,title_1,address,school_district,action,index
56,341 NORTH EMERY AVENUE,,PESHTIGO SCHOOL DISTRICT,address,125
75,75 Chenango Ave Clinton NY13323,,Clinton Central School District,address,155
607,1446 Kittanning Pike,"Karns City, PA 16041",,address,1343


In [2314]:
# try again
df["index"] = df.index
df[~df["title_1"].isna()].to_csv("test.csv", columns=["title_1","address","school_district","index"],index=False)

In [2315]:
# TODO INCORPORATE THE O1 HERE
gpt_df = pd.read_csv("gpt_results/results_9.csv")
gpt_df = pd.merge(df, gpt_df, how="inner", on="index")
gpt_df = gpt_df[["title_1","address","school_district","action","index"]]
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action,index
578,Goodrich Area Schools,"8029 South Gale Road, Goodrich, Michigan 48438",,school_district,1267
1449,Home of The Raymond Panthers,"2659 76th Street, Franksville, WI 53126 ~ (262) 835-2929",,school_district,3244
422,Troy City Schools,"500 N. Market St., Troy, OH 45373 937-332-6700",,school_district,936
1408,Lake Park Audubon Public School Distict,,,school_district,3160
240,Fairview Park City Schools,"21620 Mastick Road | Fairview Park, Oh 44126 | 440-331-5500",,school_district,514
1068,Greenon Local Schools,"120 South Xenia Drive, Enon, Ohio 45323",,school_district,2357
607,Gloucester City Board of Education NJ,"1300 Market Street Gloucester City, NJ 08030",,school_district,1338
623,Buchanan County Public Schools,"4447 Slate Creek Road, Suite 100, Grundy, Virginia 24614 | (276)935-4551 | (276)935-7150",,school_district,1376
1674,Gladstone Area Schools,,,school_district,3714
834,City of Carbondale,"200 South Illinois Avenue | Carbondale, Illinois 62901 | (618) 549-5302 | f 618-457-3283",,school_district,1861


In [2316]:
gpt_df["action"].value_counts(dropna=False)

action
school_district    1508
drop                229
NaN                  24
address               3
Name: count, dtype: int64

In [2317]:
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,action,index
1382,Liberty-Benton Local Schools,,,school_district,3116
1426,South Central Ohio Educational Service Center,,,drop,3196
866,"City of Effingham, Illinois","201 E. Jefferson Avenue | Effingham, IL 62401 | 217.342.5300",,school_district,1913
752,Willard City Schools,,,school_district,1691
1552,Jennings County School Corporation,,,school_district,3450
968,Confluence Charter Schools,"611 North Tenth Street, Suite 525 | St. Louis, MO 63101 | 314-588-8554 | f 314-588-1343",,school_district,2137
687,Pettisville Local Schools,,,school_district,1528
775,Town of Foxborough,"40 South Street | Foxborough, MA 02035",,school_district,1735
1635,Muskegon Heights Public School Academy System,"2441 Sanford Street, Muskegon Heights, MI 49444 Phone: 231-830-3703",,school_district,3623
40,Northern Virginia Community College,,,school_district,93


In [2318]:
gpt_df["action"].value_counts()

action
school_district    1508
drop                229
address               3
Name: count, dtype: int64

In [2319]:
gpt_df[gpt_df["action"]=="address"]

Unnamed: 0,title_1,address,school_district,action,index
75,75 Chenango Ave Clinton NY13323,,Clinton Central School District,address,155
119,Southwest Licking LSD OH,"Address: 927-A South Street, Pataskala, Ohio 43062 | Phone: 740-927-3941",,address,242
608,1446 Kittanning Pike,"Karns City, PA 16041",,address,1343


In [2320]:
gpt_df[gpt_df["action"]=="drop"][:50]

Unnamed: 0,title_1,address,school_district,action,index
9,Quality to the Core,,Central Dauphin School District,drop,21
73,Kansas Turnpike Authority,"9401 E Kellogg | Wichita, KS 67207 | (316) 682-4537",,drop,152
210,#AP_EveryStudentEveryDay,,Welcome to Averill Park Central School District,drop,451
255,Village of South Elgin,"10 N. Water Street South Elgin, IL 60177-1602 | 847-742-5780",,drop,544
263,NEOLA policies,,Ontonagon Area School District,drop,570
269,Board of Supervisors,"755 Roanoke St. Ste. 2E | Christiansburg, VA 24073 | (540) 382-6954 | f (540) 382-6943",,drop,582
279,Public School Teachers' Pension and Retirement Fund of Chicago,"425 S Financial Place, Suite 1400, Chicago, IL 60605 | 312-604-1400",,drop,609
284,Bay-Arenac Community High School,"805 Langstaff St, Essexville, MI 48732",,drop,621
286,Port of Los Angeles High School,"250 West 5th St., San Pedro, CA 90731",,drop,626
291,McComb Local School,,,drop,634


In [2321]:
# pass the drops through o1 mini again
drops = gpt_df.loc[gpt_df["action"]=="drop", ["title_1","school_district", "address","index"]]
drops.to_csv("drops.csv",index=False)

In [2322]:
gpt_df["action"].value_counts(dropna=False)

action
school_district    1508
drop                229
NaN                  24
address               3
Name: count, dtype: int64

In [2323]:
drop_edits = pd.read_csv("gpt_results/drops-edits.csv")
gpt_df = pd.merge(gpt_df, drop_edits, how="outer", on="index")

In [2324]:
gpt_df.head()

Unnamed: 0,title_1,address,school_district,action_x,index,action_y
0,St. Joseph Public Schools,,,school_district,0,
1,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",,,2,
2,Franklin County Career and Technology Center,,,school_district,6,
3,Vista Oaks Charter School,,,school_district,9,
4,Center Grove Community School Corporation,"4800 West Stones Crossing Road | Greenwood, IN 46143 | (317) 881-9326",,,10,


In [2325]:
gpt_df.loc[~gpt_df["action_y"].isna(), "action_x"] = gpt_df["action_y"]
gpt_df = gpt_df.rename(columns={"action_x":"action"})
gpt_df = gpt_df.drop("action_y",axis=1)
gpt_df.head()

Unnamed: 0,title_1,address,school_district,action,index
0,St. Joseph Public Schools,,,school_district,0
1,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",,,2
2,Franklin County Career and Technology Center,,,school_district,6
3,Vista Oaks Charter School,,,school_district,9
4,Center Grove Community School Corporation,"4800 West Stones Crossing Road | Greenwood, IN 46143 | (317) 881-9326",,,10


In [2326]:
gpt_df.to_csv("inspect.csv")

In [2327]:
gpt_df["action"].value_counts(dropna=False)

action
school_district    1551
drop                188
NaN                  24
address               3
Name: count, dtype: int64

In [2328]:
drops = gpt_df.loc[gpt_df["action"]=="drop",:]
drops.sample(20)

Unnamed: 0,title_1,address,school_district,action,index
1341,Little Silver Board of Education,"124 Willow Drive, Little Silver, NJ 07739",,drop,3018
1756,Tacoma Community College,"6501 S. 19th Street, Tacoma, WA 98466",,drop,3894
1205,Sacramento City College,"3835 Freeport Boulevard • Sacramento, CA 95822 | 916-558-2111",,drop,2699
1218,Gratiot-Isabella Regional Education Service District,,,drop,2720
1595,Venango Technology Center,,,drop,3525
466,The Township of Randolph,"502 Millbrook Ave | Randolph, NJ 07869-3799 | Ph: 973.989.7100 | Fx: 973.989.7076",,drop,1030
419,Landmark Academy,"4864 Lapeer Rd., Kimball, MI 48074 Phone: 810-982-7210",,drop,932
1695,,,,drop,3755
303,Southeast Florida Behavioral Health Network,"8895 N Military Trail, Suite E-102 | Palm Beach Gardens, FL 33410 | (561) 203-2485",,drop,653
549,Home of the Bluejays!,"1111 North Sales Street, Merrill WI 54452",,drop,1213


In [2329]:
# well we tried our best
# TODO further improve the GPT classifications

# implement the actions
# first we merge
gpt_df = gpt_df[["index","action"]]
df = pd.merge(df, gpt_df, how="left", on="index")
df.shape

(3906, 8)

In [2330]:
# check that action is only for non-NA titles
df.loc[df["action"]=="None","action"] = None
df[(df["title_1"].isna()) & (~df["action"].isna())]

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action
1882,https://go.boarddocs.com/ks/usd373/Board.nsf/Public,,http://usd373.org,"308 East 1st Street, Newton, KS 67114 | (316) 284-6200",Newton Public Schools Unified School District #373 KS,,1882,school_district
1883,https://go.boarddocs.com/ca/aadusd/Board.nsf/Public,,https://www.aadusd.k12.ca.us,"32248 Crown Valley Road Acton, CA 93510 | 661-269-0750",Acton Agua Dulce Unified School District,,1883,school_district
1884,https://go.boarddocs.com/nj/frhsd/Board.nsf/Public,,http://www.frhsd.com/,"11 Pine Street | Englishtown, NJ 07726 | 732-792-7300",Freehold Regional High School District,,1884,school_district
1985,https://go.boarddocs.com/pa/roch/Board.nsf/Public,,http://www.rasd.org,"540 Reno Street, Rochester, PA 15074",,724-775-7500,1985,school_district
2206,https://go.boarddocs.com/ca/alisal/Board.nsf/Public,,https://www.alisal.org,"155 Bardin Road, Salinas, CA 93905 | 831-753-5700",Alisal Union School District,,2206,school_district
3088,https://go.boarddocs.com/il/besd61/Board.nsf/Public,,http://www.bradleyschools.com,"111 N. Croswell Avenue | Bradley, Illinois | 60915 | 815.933.3371",Bradley School District 61,,3088,school_district
3355,https://go.boarddocs.com/oh/nmlsd/Board.nsf/Public,,http://www.new-miami.k12.oh.us/,"600 Seven Mile Avenue | Hamilton, OH 45011",New Miami Local School District,,3355,school_district
3519,https://go.boarddocs.com/ia/bfcsd/Board.nsf/Public,,http://www.bfschools.org/,"300 Garfield St. SW | Bondurant, IA 50035 | Ph: 515-967-7819 Fx: 515-967-7847",Bondurant-Farrar Community School District,,3519,drop
3755,https://go.boarddocs.com/il/sd925/Board.nsf/Public,,http://www.sd925.org/,"9981 Canterbury Street | Westchester, IL 60154 | (708) 450-2700 | (708) 450-2718",Westchester School District 92 1/2,,3755,drop


In [2331]:
# do the address
df.loc[df["action"]=="address"]

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action
155,https://go.boarddocs.com/ny/clintoncsd/Board.nsf/Public,75 Chenango Ave Clinton NY13323,https://www.ccs.edu/,,Clinton Central School District,,155,address
242,https://go.boarddocs.com/oh/swl/Board.nsf/Public,Southwest Licking LSD OH,http://www.swl.k12.oh.us,"Address: 927-A South Street, Pataskala, Ohio 43062 | Phone: 740-927-3941",,,242,address
1343,https://go.boarddocs.com/pa/karn/Board.nsf/Public,1446 Kittanning Pike,https://www.kcasdk12.org/,"Karns City, PA 16041",,,1343,address


In [2332]:
# handle them manually
df.loc[df["URL"]=="https://go.boarddocs.com/ny/clintoncsd/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/ny/clintoncsd/Board.nsf/Public", "title_1"] = None

In [2333]:
# drop the rest
df.loc[df["action"]=="address", "title_1"] = None

In [2334]:
# handle the drops
df.loc[df["action"]=="drop", "title_1"] = None

In [2335]:
# handle the school_district
# first check for collision
df.loc[(df["action"]=="school_district") & (~df["school_district"].isna())]

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action
21,https://go.boarddocs.com/pa/cdsd/Board.nsf/Public,Quality to the Core,http://www.cdschools.org,,Central Dauphin School District,,21,school_district
106,https://go.boarddocs.com/wi/edsd/Board.nsf/Public,Edgar Excellence,www.edgar.k12.wi.us,,Edgar School District 203 East Birch Street 715-352-2351,,106,school_district
125,https://go.boarddocs.com/wi/pesh/Board.nsf/Public,341 NORTH EMERY AVENUE,www.peshtigo.k12.wi.us,,PESHTIGO SCHOOL DISTRICT,,125,school_district
136,https://go.boarddocs.com/pa/iroq/Board.nsf/Public,Board Meeting Documents and Policies,http://www.iroquoissd.org,,IROQUOIS SCHOOL DISTRICT,,136,school_district
202,https://go.boarddocs.com/ut/pcsd/Board.nsf/Public,BOARD OF EDUCATION,http://www.pcschools.us,,Park City School District,,202,school_district
244,https://go.boarddocs.com/pa/mlvlsd/Board.nsf/Public,Meetings and Information,http://www.millville.k12.pa.us/millville,,Millville Area School District,,244,school_district
273,https://go.boarddocs.com/pa/tusc/Board.nsf/Public,Tuscarora S D,http://www.tsdrockets.org,,Tuscarora School District,,273,school_district
281,https://go.boarddocs.com/ms/psd/Board.nsf/Public,Striving for a century of excellence,http://www.pgsd.ms,,The Pascagoula-Gautier School District,,281,school_district
314,https://go.boarddocs.com/mo/hancock/Board.nsf/Public,Inspiring Excellence,https://sd.hancock.k12.mo.us/,,Hancock Place School District,,314,school_district
432,https://go.boarddocs.com/oh/ehlsoh/Board.nsf/Public,District Policies,www.eastholmes.k12.oh.us,,East Holmes Local School District,,432,school_district


In [2336]:
# as we wait for o1-mini
# let's try to see if there are any websites
# URL regex from https://stackoverflow.com/questions/69242989/detect-presence-of-a-url-in-pandas

url_regex = r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'

df[df["title_1"].str.contains(url_regex,regex=True,na=False)]

  df[df["title_1"].str.contains(url_regex,regex=True,na=False)]
Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action
2195,https://go.boarddocs.com/oh/iron/Board.nsf/Public,http://www.tigertown.com,http://www.tigertown.com,"Ironton City School District | 105 S. Fifth Street | Ironton, OH 45638 | Tel: (740) 532-4133 | Fax: (740) 532-2314",,,2195,school_district
3751,https://go.boarddocs.com/mi/aaps/Board.nsf/Public,Ann Arbor Public Schools - Board of Education ~ www.a2schools.org,http://www.a2schools.org,"2555 S. State Street ~ Ann Arbor, MI ~ 48104 ~ 734.994.2232",,,3751,school_district


In [2337]:
# handle these
df.loc[df['URL']=="https://go.boarddocs.com/mi/aaps/Board.nsf/Public","school_district"] = df["title_1"]
df.loc[df['URL']=="https://go.boarddocs.com/mi/aaps/Board.nsf/Public","title_1"] = None

df.loc[df['URL']=="https://go.boarddocs.com/oh/iron/Board.nsf/Public","title_1"] = None

In [2338]:
# we also try to get the phone number from the address
df['address_contains_phone_number'] = df['address'].apply(contains_phone_number)
df[df["address_contains_phone_number"]].sample(5)

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action,address_contains_phone_number
1602,https://go.boarddocs.com/mi/eshoreleaders/Board.nsf/Public,East Shore Leadership Academy,https://www.eastshoreleaders.com/,"1403 7th St, Port Huron, MI 48060 Phone: (810) 294-8040",,,1602,school_district,True
1019,https://go.boarddocs.com/ny/mccsd/Board.nsf/Public,,http://www.mccsd.net,"Middle Country CSD, 8-43rd St., Centereach, NY 11720 (631) 285-8000;",Middle Country Central School District,,1019,,True
958,https://go.boarddocs.com/il/lw210/Board.nsf/Public,,http://www.lw210.org/,"1801 E. Lincoln HighWay | New Lenox, IL 60451 P: (815) 462-2345",Lincoln-Way Community School District 210,,958,,True
3309,https://go.boarddocs.com/ca/burton/Board.nsf/Public,,https://www.burtonschools.org,"264 N Westwood St | Porterville, CA 93257 | 559-781-8020",Burton School District,,3309,,True
3108,https://go.boarddocs.com/ny/hpcsd/Board.nsf/Public,,http://www.hpcsd.org/,"P. O. Box 2033 Hyde Park, NY 12538 (845) 229-4000",Hyde Park Central School District,,3108,,True


In [2339]:
df[df["address_contains_phone_number"]].shape

(2185, 9)

In [2340]:
# extract it
# but before that, check if conflicts
df[df["address_contains_phone_number"] & (~df["phone"].isna())].shape

(0, 9)

In [2341]:
# great! no conflicts
phone_pattern = re.compile(r'(\(?\b\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b)')
df["phone_extracted_from_address"] = df["address"].str.extract(phone_pattern)

In [2342]:
df[~df["phone_extracted_from_address"].isna()].sample(5)

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action,address_contains_phone_number,phone_extracted_from_address
3133,https://go.boarddocs.com/mi/scacad/Board.nsf/Public,Saginaw Covenant Academy,https://covenantacademies.org/saginaw/,"508 S. Washington Avenue | Saginaw, MI 48607 | 989-596-1100",,,3133,school_district,True,989-596-1100
1940,https://go.boarddocs.com/ny/duanesburg/Board.nsf/Public,Duanesburg Central School,https://www.duanesburg.org/,"133 School Drive, Delanson, NY 12053 | (518) 895-2279",,,1940,school_district,True,(518) 895-2279
498,https://go.boarddocs.com/ny/ppcsd/Board.nsf/Public,,http://www.ppcsd.org/,"2829 Church Street :: Pine Plains, NY 12567 :: Phone: 518-398-7181 x1408",Pine Plains Central School District,,498,,True,518-398-7181
2732,https://go.boarddocs.com/wi/fpsd/Board.nsf/Public,,http://www.franklin.k12.wi.us/,"8255 West Forest Hill Avenue, Franklin, Wisconsin 53132 Ph: (414) 529-8220",Franklin Public School District,,2732,,True,(414) 529-8220
116,https://go.boarddocs.com/oh/wglsd/Board.nsf/Public,Waynesfield Goshen Local Schools,http://www.wgschools.org/,"500 N. Westminster St. | Waynesfield, OH 45896 | (419) 568-9100 | (419) 568-8024",,,116,school_district,True,(419) 568-9100


In [2343]:
# double check no clashes
df[(~df["phone_extracted_from_address"].isna()) & (~df["phone"].isna())]

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action,address_contains_phone_number,phone_extracted_from_address


In [2344]:
# write in
df.loc[~df["phone_extracted_from_address"].isna(), "phone"] = df["phone_extracted_from_address"]
df.loc[~df["phone_extracted_from_address"].isna(), "phone_extracted_from_address"] = None

In [2345]:
# let's clean it
# extract all numbers and make sure there are 10

df['extracted_phone_numbers'] = None
df['extracted_phone_numbers'] = df['phone'].astype(str).str.replace(r'[^\d]', '', regex=True)

In [2346]:
# check their length
df["phone_length"] = df["extracted_phone_numbers"].apply(lambda x: len(x))
df["phone_length"].value_counts()

phone_length
10    2216
0     1684
20       4
12       1
30       1
Name: count, dtype: int64

In [2347]:
# check those that are not 10 or 0
df.loc[(df["phone_length"]!=10) & (df["phone_length"]!=0)]

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,action,address_contains_phone_number,phone_extracted_from_address,extracted_phone_numbers,phone_length
624,https://go.boarddocs.com/in/triton/Board.nsf/Public,Triton School Corporation,http://www.triton.k12.in.us,,,Home of the Trojans/574-342-2255/www.triton.k12.in.us,624,school_district,False,,574342225512,12
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,Phone: 715-257-7511 Fax: 715-257-7502,1199,,False,,71525775117152577502,20
2173,https://go.boarddocs.com/ks/usd230/Board.nsf/Public,,http://www.usd230.org,"Spring Hill School District | 17640 W. 199th Street, Spring Hill, Kansas 66083",,Info: (913) 592-7272 | T: (913) 592-7200 | F: (913) 592-7270,2173,,False,,913592727291359272009135927270,30
2538,https://go.boarddocs.com/wi/campsd/Board.nsf/Public,,www.csd.k12.wi.us,"327 N. Fond du Lac Ave. Campbellsport, WI 53010",,(920) 533-8381 | Fax (920) 533 -5726,2538,,False,,92053383819205335726,20
2586,https://go.boarddocs.com/oh/cevsdoh/Board.nsf/Public,,www.carrollton.k12.oh.us,"Carrollton Exempted Village School District, - 205 Scio Road S.W., - Carrollton, OH 44615",,"Phone: 330-627-2181, - Fax: 330-627-2182",2586,,False,,33062721813306272182,20
2811,https://go.boarddocs.com/id/nsd131/Board.nsf/Public,,http://www.nsd131.org,"619 S. Canyon St | Nampa, ID 83686",,(208) 468-4600 Fax: (208) 468-4638,2811,,False,,20846846002084684638,20


In [2348]:
# ok, we can just take the top 10 numbers
df["extracted_phone_numbers"] = df["extracted_phone_numbers"].apply(lambda x: x[:10] if x else None)