This notebook takes a look at the scrapped results from the BoardDocs website and gets the correct addresses for each row.

Input:
- `prelim_results.csv`

In [8524]:
import pandas as pd

df = pd.read_csv("prelim_results.csv")
df.head()

Unnamed: 0,URL,title_1,title_2,home_website
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,Board Policies and Guidelines,St. Joseph Public Schools,https://www.sjschools.org/
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,School Board Policy Manual,,www.calsd.org
2,https://go.boarddocs.com/oh/mapleheights/Board.nsf/Public,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",http://www.mapleschools.com
3,https://go.boarddocs.com/oh/rlsd/Board.nsf/Public,"585 Riverside Drive | Painesville, Ohio 44077 | 440.352.0668 | f 440.639.1959",Riverside Local School District,https://www.riversidelocalschools.com/
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,School Board Policy Manual,Southern Huntingdon County School District,http://www.shcsd.org


In [8525]:
# we remove redundant info
df.loc[df["title_1"]==df["title_2"], "title_1"] = None

In [8526]:
# trim whitespace
df.loc[:, "title_1"] = df["title_1"].str.strip()
df.loc[:, "title_2"] = df["title_2"].str.strip()

In [8527]:
# consider the 5-digit zip code approach
# check which rows have 5-digit codes in Title1, and those with them in Title2.
# Hopefully this will be a partition

# Define a regex pattern to match a 5-digit zip code
zip_code_pattern = r'\b\d{5}\b'

# Find rows where Title1 has a 5-digit zip code
df['Title1_has_zipcode'] = df['title_1'].str.contains(zip_code_pattern, na=False)

# Find rows where Title2 has a 5-digit zip code
df['Title2_has_zipcode'] = df['title_2'].str.contains(zip_code_pattern, na=False)

In [8528]:
# check if it is a partition
# first check if they add up

import numpy as np

print(f"Num of rows where title 1 has zipcode {df['Title1_has_zipcode'].sum()}")
print(f"Num of rows where title 2 has zipcode {df['Title2_has_zipcode'].sum()}")
print(f"Num of rows where title 1 or title 2 has zipcode {np.sum(df['Title1_has_zipcode'] | df['Title2_has_zipcode'])}")
print(f"Num of rows where title 1 and title 2 have zipcode {np.sum(df['Title1_has_zipcode'] & df['Title2_has_zipcode'])}")
print(f"Num of total rows {df.shape[0]}")

Num of rows where title 1 has zipcode 1440
Num of rows where title 2 has zipcode 1442
Num of rows where title 1 or title 2 has zipcode 2882
Num of rows where title 1 and title 2 have zipcode 0
Num of total rows 3906


In [8529]:
# ok great, there are no rows where you can find zipcodes on both cols
# but some rows don't have zipcodes in either
# let's check them out

no_zipcode_df = df[~(df['Title1_has_zipcode'] | df['Title2_has_zipcode'])]
print(f"Num of rows without zipcode {no_zipcode_df.shape[0]}")
no_zipcode_df.sample(5)

Num of rows without zipcode 1024


Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
1029,https://go.boarddocs.com/mi/wake/Board.nsf/Public,Wakefield-Marenisco School District,(906) 224-7211,http://www.wmschools.org,False,False
2540,https://go.boarddocs.com/mi/lapisd/Board.nsf/Public,District Policies and Administrative Guidelines,Lapeer County Intermediate School District,https://www.lapeerisd.org,False,False
1946,https://go.boarddocs.com/wv/nicholas/Board.nsf/Public,School Board Policies and Guidelines,Nicholas County Schools,https://boe.nich.k12.wv.us/,False,False
2284,https://go.boarddocs.com/mi/mps/Board.nsf/Public,City of Muskegon Public Schools,School Board Policies,https://muskegonpublicschools.org/,False,False
2506,https://go.boarddocs.com/mi/alcona/Board.nsf/Public,School Board Policies and Guidelines,Alcona Community Schools,http://www.alconaschools.net,False,False


In [8530]:
# let's get the proportions


print(f"Percentage of rows where title 1 has zipcode {df['Title1_has_zipcode'].sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows where title 2 has zipcode {df['Title2_has_zipcode'].sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows where title 1 or title 2 has zipcode {np.sum(df['Title1_has_zipcode'] | df['Title2_has_zipcode']).sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows where title 1 and title 2 have zipcode {np.sum(df['Title1_has_zipcode'] & df['Title2_has_zipcode']).sum()/df.shape[0]*100:.2f}%")
print(f"Percentage of rows with no zipcodes {no_zipcode_df.shape[0]/df.shape[0]*100:.2f}%")


Percentage of rows where title 1 has zipcode 36.87%
Percentage of rows where title 2 has zipcode 36.92%
Percentage of rows where title 1 or title 2 has zipcode 73.78%
Percentage of rows where title 1 and title 2 have zipcode 0.00%
Percentage of rows with no zipcodes 26.22%


In [8531]:
# some NaNs, some "Policy Manual", "School Board Policy and Guidelines", etc
# let's check the most common values

no_zipcode_df["title_1"].value_counts().head()

title_1
School Board Policy Manual              115
Policy Manual                            54
School Board Policies and Guidelines     41
School Board Policies                    22
BoardDocs PL                              9
Name: count, dtype: int64

In [8532]:
no_zipcode_df["title_2"].value_counts().head()

title_2
Board Policies                 24
                               22
School Board Policies          17
Board of Education             11
Board of Education Policies    11
Name: count, dtype: int64

In [8533]:
# ok, now let's check the website col
sum(df["home_website"].isna())

53

In [8534]:
# ok, unfortunately there are boarddocs without the home website linked
print(f"Percent of websites without links to official: {sum(df["home_website"].isna())/df.shape[0]*100:.3}%")

Percent of websites without links to official: 1.36%


In [8535]:
# but thankfully this number is small
# let's take a look at these websites

# do not truncate the col values in display
pd.set_option('display.max_colwidth', None)

df[df["home_website"].isna()].head()

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
232,https://go.boarddocs.com/pa/camb/Board.nsf/Public,,,,False,False
443,https://go.boarddocs.com/oh/oakhil/Board.nsf/Public,,,,False,False
465,https://go.boarddocs.com/oh/warrenoh/Board.nsf/Public,,,,False,False
468,https://go.boarddocs.com/pa/marp/Board.nsf/Public,School Board Policy Manual,Marple Newtown School District,,False,False
533,https://go.boarddocs.com/oh/meigs/Board.nsf/Public,,,,False,False


In [8536]:
# after inspecting a few, it seems like they will usually write their school district as the h1 tag at least.

In [8537]:
# back to the address
# an observation is that I don't think those that don't contain zip codes will have addresses on the website
# let's check if a single number exists in them

number_pattern = r'\d'
print("Number of no-zipcode rows that contain a number")
sum(no_zipcode_df["title_1"].str.contains(number_pattern, na=False))

Number of no-zipcode rows that contain a number


46

In [8538]:
no_zipcode_df[no_zipcode_df["title_1"].str.contains(number_pattern, na=False)].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
882,https://go.boarddocs.com/il/asd4/Board.nsf/Public,"222 N. Kennedy Drive, Addison, IL",Addison School District 4,http://www.asd4.org,False,False
2495,https://go.boarddocs.com/in/brem/Board.nsf/Public,Bremen Public Schools | Phone: (574) 546-3929 | Fax: (574) 546-6303 | School Board Policies and Guidelines,,https://www.bps.k12.in.us,False,False
3260,https://go.boarddocs.com/wi/chilsd/Board.nsf/Public,530 Main Street,Chilton Public Schools,www.chilton.k12.wi.us,False,False
2570,https://go.boarddocs.com/il/ccsd146/Board.nsf/Public,CCSD 146,Community Consolidated School District #146,http://www.district146.org,False,False
106,https://go.boarddocs.com/wi/edsd/Board.nsf/Public,Edgar School District 203 East Birch Street 715-352-2351,Edgar Excellence,www.edgar.k12.wi.us,False,False


In [8539]:
no_zipcode_df[no_zipcode_df["title_2"].str.contains(number_pattern, na=False)].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode
1798,https://go.boarddocs.com/il/lz95/Board.nsf/Public,,Lake Zurich Community Unit School District 95,http://www.lz95.org/,False,False
1997,https://go.boarddocs.com/mi/chip/Board.nsf/Public,Chippewa Valley Schools,586-723-2004,http://www.cvs.k12.mi.us,False,False
2918,https://go.boarddocs.com/oh/sidn/Board.nsf/Public,Sidney City Schools,"750 S. Fourth Ave., Sidney, OH 937-497-2200",http://www.sidneycityschools.org,False,False
3522,https://go.boarddocs.com/pa/iu10/Board.nsf/Public,Board Agendas and Policies,Central Intermediate Unit 10,http://www.ciu10.org,False,False
3287,https://go.boarddocs.com/wi/afasd/Board.nsf/Public,ADAMS-FRIENDSHIP AREA SCHOOL DISTRICT,"201 W. 6th Street, Friendship, WI",https://www.afasd.net/,False,False


In [8540]:
# it turns out that they could either be an address with a missing zip code, or phone numbers
# there are also misc cases

# let's get a conservative (high) bound on the number of addresses that we will miss

num_no_zipcode_with_number = no_zipcode_df[no_zipcode_df["title_1"].str.contains(number_pattern, na=False) | no_zipcode_df["title_2"].str.contains(number_pattern, na=False)].shape[0]
num_no_zipcode_with_number

105

In [8541]:
num_zipcode = df.shape[0] - no_zipcode_df.shape[0]

In [8542]:
print(f"Worst case proportion of addresses that we will miss {num_no_zipcode_with_number/(num_no_zipcode_with_number+num_zipcode)*100:.2}%")

Worst case proportion of addresses that we will miss 3.5%


In [8543]:
# we put the ones we know are correct at a new address field

df["address"] = None
df.loc[df['Title1_has_zipcode'], "address"] = df["title_1"]
df.loc[df['Title2_has_zipcode'], "address"] = df["title_2"]
df.sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address
1742,https://go.boarddocs.com/il/scusd/Board.nsf/Public,,Sherrard Community Unit School District,http://www.sherrard.us,False,False,
1060,https://go.boarddocs.com/oh/wooster/Board.nsf/Public,"144 North Market Street | Wooster, OH 44691 | Phone: (330) 988-1111",Wooster City School District,http://www.woostercityschools.org,True,False,"144 North Market Street | Wooster, OH 44691 | Phone: (330) 988-1111"
2278,https://go.boarddocs.com/oh/finlsd/Board.nsf/Public,,,www.finneytown.org,False,False,
260,https://go.boarddocs.com/in/nmont/Board.nsf/Public,North Montgomery School Corporation,"480 W. 580 N. Crawfordsville , IN 47933 | 765-359-2112",http://www.nm.k12.in.us,False,True,"480 W. 580 N. Crawfordsville , IN 47933 | 765-359-2112"
3350,https://go.boarddocs.com/in/evsc/Board.nsf/Public,EVSC School Board Meetings,Evansville Vanderburgh School Corporation,www.evscschools.com,False,False,


In [8544]:
# now, let's try to get the school district name.
# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

In [8545]:
# number of rows with schools in at least one col
df[df["title_1_has_school"] | df["title_2_has_school"]].shape[0]

3206

In [8546]:
# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

356

In [8547]:
# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
3485,https://go.boarddocs.com/wi/bowlsd/Board.nsf/Public,School District Policies,School District of Bowler,www.bowler.k12.wi.us,False,False,,True,True
1132,https://go.boarddocs.com/oh/pgl/Board.nsf/Public,School District Policy Manual,Pandora-Gilboa Local Schools,www.pgrockets.org,False,False,,True,True
3632,https://go.boarddocs.com/mi/misi/Board.nsf/Public,School Board Policies and Guidelines,Mesick Consolidated Schools,https://www.mesick.org,False,False,,True,True
1648,https://go.boarddocs.com/wi/linn/Board.nsf/Public,School Board Policies,Linn J4 School District,https://www.traverschool.org,False,False,,True,True
611,https://go.boarddocs.com/pa/bgbf/Board.nsf/Public,School Board Policy Manual,Big Beaver Falls School District,http://www.tigerweb.org/,False,False,,True,True


In [8548]:
# there is boilerplate text like School Board Policies
# let's try to get the top few popular ones and remove them

df["title_1"].value_counts().head(10)

title_1
School Board Policy Manual              118
Policy Manual                            55
School Board Policies and Guidelines     41
School Board Policies                    22
BoardDocs PL                              9
Board Policy and Guidelines               8
Board Policies                            7
School Board Policies and Bylaws          7
School Board Policies & Bylaws            6
Board Policies and Guidelines             6
Name: count, dtype: int64

In [8549]:
df["title_2"].value_counts().head(10)

title_2
Board Policies                      25
                                    25
School Board Policies               18
Board of Education                  14
Board of Education Policies         12
Board of Education Policy Manual    10
NEOLA Board Policies                10
Policy Manual                        6
School Board Policy Manual           6
Board Policy Manual                  5
Name: count, dtype: int64

In [8550]:
# let's remove these

remove_title_1_list = df["title_1"].value_counts().head(10).index
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
Board Policy                          6
School District Policies              6
Board Policy Manual                   5
Board Policies and Bylaws             5
School Board Policies & Guidelines    4
eGovernance Site                      4
School Board Policy                   4
School District Policy Manual         3
Board Policies & Bylaws               3
Board Policy and Bylaws               3
Name: count, dtype: int64

In [8551]:
# let's remove these
remove_title_1_list = df["title_1"].value_counts().head(10).index
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
Board of Education Policies and Guidelines                                                  3
School Board Policy and Guidelines                                                          3
School Board Policies and Administrative Guidelines                                         3
                                                                                            3
Board of Education Policies                                                                 3
2680 West County Road 476 | Bushnell, Florida 33513 | Ph: 352-793-2315  Fx: 352-793-4180    2
Bloomfield School District                                                                  2
1290 Ridder Park Drive | San Jose, CA 95131-2304 | (408) 453-6500                           2
School Board Policy & Guidelines                                                            2
1725 North Dodge St. |  Iowa City, IA 52245 | p (319) 688-1000 | f (319) 688-1009           2
Name: count, dtype: int64

In [8552]:
# let's remove the first few
remove_title_1_list = df["title_1"].value_counts().head(5).index
remove_title_1_list

Index(['Board of Education Policies and Guidelines',
       'School Board Policy and Guidelines',
       'School Board Policies and Administrative Guidelines', '',
       'Board of Education Policies'],
      dtype='object', name='title_1')

In [8553]:
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
df["title_1"].value_counts().head(10)

title_1
2680 West County Road 476 | Bushnell, Florida 33513 | Ph: 352-793-2315  Fx: 352-793-4180    2
1300 Sherman Street Ste 222 | Sturgis, SD  57785                                            2
Bloomfield School District                                                                  2
1290 Ridder Park Drive | San Jose, CA 95131-2304 | (408) 453-6500                           2
School Board Policy & Guidelines                                                            2
1725 North Dodge St. |  Iowa City, IA 52245 | p (319) 688-1000 | f (319) 688-1009           2
Electronic Governance System                                                                2
Policy Manual and Agendas                                                                   2
Board of Education                                                                          2
60 Jefferson Street, Suite 3• Monticello, NY 12701 • 845-794-7700                           2
Name: count, dtype: int64

In [8554]:
# pick those
remove_title_1_list = [
    "School Board By-Laws and Policies",
    "School Board policies and guidelines",
    "Policy Manual and Agendas",
    "Board of Education Policies",
    "School Board Policy and Bylaws",
    "Board Policy and Bylaws",
    "Board Policies and By-Laws",
    "School Board Policies ",
    "Board of Education",
    "School Board Policies and Guidlines",
    "School Board Bylaws and Policies",
    "SCHOOL BOARD POLICIES",
    "Board Policies & Guidelines",
    "School Board Policy & Guidelines",
    "Electronic Governance System",
    "School Board Agendas and Minutes",
    "School Board Policy and Administrative Guidelines",
    "School District Policies / Policy Manual",
    "School Policy Manual",
    "School Board Meetings",
    "School Board Policies & Ad Guidelines",
    "School Board meetings, agendas and policies",
    "School Board Policies and By Laws",
    "School board policies and manuals",
    "Board of School Directors",
    "School Board Agendas, Minutes, Policies and Guidelines",
    "Board Approved School Policy",
    "School Board Policies & Administrative Guidelines",
    "School Board Policy",
    "School Board Agendas, Policies and Guidelines",
    "School Board Bylaws/Policies/Guidelines",
    "SchoolBoard Policy Manual",
    "School Board Policy and Guidlines",
    "School Board Policy",
    "Governing Board Policy",
    "Board policy and guidelines",
    "Policies & By Laws",
    "Board of Trustees Policies and Administrative Guidelines",
    "Board of Education Policy and Bylaws"
    
]
df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None
# let's check what's left
print(df["title_1"].value_counts().index[:10])

Index(['2680 West County Road 476 | Bushnell, Florida 33513 | Ph: 352-793-2315  Fx: 352-793-4180',
       '200 Reid Street | Palatka, FL 32177 | (386) 329-0602',
       '60 Jefferson Street, Suite 3• Monticello, NY 12701 • 845-794-7700',
       'Bloomfield School District',
       '1290 Ridder Park Drive | San Jose, CA 95131-2304 | (408) 453-6500',
       '1300 Sherman Street Ste 222 | Sturgis, SD  57785',
       '315 N. French Avenue | Arlington  WA 98223 | 360.618.6200 | f 360.618.6221',
       '1725 North Dodge St. |  Iowa City, IA 52245 | p (319) 688-1000 | f (319) 688-1009',
       'Cleveland Metropolitan School District',
       '6301 Springside Avenue | Downers Grove, IL 60516 | Ph: (630) 795-7100  | Fx: (630) 795-7199'],
      dtype='object', name='title_1')


In [8555]:
# do this for title_2
# let's remove these
# NOTE: NEOLOA Board Policies might provide coarse information on whether the school could be located

remove_title_2_list = df["title_2"].value_counts().head(10).index
remove_title_2_list

Index(['Board Policies', '', 'School Board Policies', 'Board of Education',
       'Board of Education Policies', 'Board of Education Policy Manual',
       'NEOLA Board Policies', 'Policy Manual', 'School Board Policy Manual',
       'Board Policy Manual'],
      dtype='object', name='title_2')

In [8556]:
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().head(10)

title_2
eGovernance Site                 4
Bylaws & Policies                4
Board of Education Policy        4
Board of Education Meetings      3
Board Policy                     3
Neola Board Policies             3
Board of Directors               3
Arlington Public Schools         3
Putnam County School District    2
BoardDocs - Meeting Agendas      2
Name: count, dtype: int64

In [8557]:
remove_title_2_list = df["title_2"].value_counts().head(4).index
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().head(10)

title_2
Arlington Public Schools                  3
Neola Board Policies                      3
Board of Directors                        3
Board Policy                              3
Board Agendas                             2
Monticello Central School District        2
Meetings, Agendas and Information         2
Green Local Schools                       2
Santa Clara County Office of Education    2
Meade County, South Dakota                2
Name: count, dtype: int64

In [8558]:
remove_title_2_list = [
    "Meetings and Information",
    "Meetings, Agendas and Information",
    "Board Policy Manual",
    "Board of Education Meetings",
    "Board of Directors",
    "Policies",
    "BoardDocs - Meeting Agendas",
    "Board of Education Policies ",
    'Board Agendas',
    "NEOLA Board Policy",
    'NEOLA Board of Education Policies',
    'Neola Board Policies', 
    'NEOLA Board of Education Policy Manual',
    'NEOLA Policies', 
    'Meetings, Agendas, Information',
    'School Board Policies and Guidelines',
    "School Board Policies and Bylaws",
    "NEOLA School Board Policies",
    "Board of School Trustees Policy Manual",
    "School Board Policy	",
    "School Board Policies & Administrative Regulations	",
    "School Board Policy",
    "Board of School Trustees",
    "SCHOOL POLICIES AND GUIDELINES"
]
df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None
# let's check what's left
df["title_2"].value_counts().index[:20]

Index(['Arlington Public Schools', 'Board Policy',
       'Central Valley School District', 'Meade County, South Dakota',
       'Sumter District Schools', 'Santa Clara County Office of Education',
       'Green Local Schools', 'eGovernance System',
       'Monticello Central School District',
       'Community High School District 99', 'Putnam County School District',
       '8485 Homestead, Zeeland, MI 49464 Phone: 616-748-5637',
       'Iowa City Community School District',
       '2045 School Street North Collins, NY 14111', 'Barry ISD',
       'Adena Local Schools',
       '801 Corporate Centre Drive | O'Fallon, MO 63368 | Phone: 636-851-4000',
       '1323 E. 7th Street, Lockport, IL 60441',
       'Phone: 330-627-2181, - Fax: 330-627-2182',
       'Johnson County School District #1'],
      dtype='object', name='title_2')

In [8559]:
# now, we check for the intersections again

# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

89

In [8560]:
# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
349,https://go.boarddocs.com/vsba/rcpsva/Board.nsf/Public,"6 School House Road, Washington, VA 22747 Phone: 540-227-0023",Rappahannock County Public Schools,http://www.rappahannockschools.us,True,False,"6 School House Road, Washington, VA 22747 Phone: 540-227-0023",True,True
115,https://go.boarddocs.com/wi/hustsd/Board.nsf/Public,"Hustisford School District, 845 South Lake Street, Hustisford, WI 53034",Hustisford School District,www.hutisford.k12.wi.us,True,False,"Hustisford School District, 845 South Lake Street, Hustisford, WI 53034",True,True
3860,https://go.boarddocs.com/oh/spencerville/Board.nsf/Public,Spencerville Local School,"600 School Street | Spencerville, OH 45887 | 419-647-4111",https://www.spencervillebearcats.com/,False,True,"600 School Street | Spencerville, OH 45887 | 419-647-4111",True,True
2025,https://go.boarddocs.com/nj/rtboe/Board.nsf/Public,Randolph Township Schools,"25 School House Road, Randolph, NJ 07869 | (973) 361-0808",https://www.rtnj.org/,False,True,"25 School House Road, Randolph, NJ 07869 | (973) 361-0808",True,True
3071,https://go.boarddocs.com/pa/rasd/Board.nsf/Public,"62 School Drive | Ridgway, PA 15853 | p 814-773-3146 | f 814-776-4299",Ridgway Area School District,http://www.rasd.us,True,False,"62 School Drive | Ridgway, PA 15853 | p 814-773-3146 | f 814-776-4299",True,True


In [8561]:
# I asked ChatGPT to inspect and find more titles I can remove

remove_title_1_list = [
    "School Board Agendas, Minutes, and Policies",
    "School Board Policies, Bylaws, and Guidelines",
    "School Policies and Guidelines",
    "School Board Policy Manual",
    "Board of School Trustees Policy",
    "School Board Agendas and Policies",
    "SchoolBoard Policy Manual",
    "School Board Agendas, Policies, Rules and Exhibits",
    "School Board Policy",
    "SchoolBoard Policy Manual ",
    "School District Policies ",
    "Board Meetings and Policies",
    "Board Management System",
    "Board of Education Policy and Guidelines",
    "Board Policy and By-Laws",
    "Policy and Bylaws",
    "Board Bylaws & Policies",
    "Bylaws & Policies",
    "Board of Trustees Policies",
    "Policies - Bylaws",
    "Bylaws and Policies",
    "NEOLA Policies",
    "Board of Education Bylaws and Policies",
    "School Board Policy Manual",
    "Board Administrative Guidelines , Bylaws, Forms and Policies",
    "Board Policies, Administrative Guidelines, and Forms",
    "Board Policies, Bylaws, Administrative Guidelines, Forms",
    "Board Agendas and Policies",
    "Policies of the Board of Education",
    "School Board Policies and Guidelines",
    "Corporation Board Policies and Guidelines",
    "NEOLA Board Policies and By-Laws",
    "Policies & Bylaws",
    "Board of Education Policies and Guidelines",
    "Policies - Bylaws",
    "Policies of the Board of Education",
    "Board Policy Handbook",
    "Board of Education Policies",
    "Board Bylaws and Policies",
    "Board Policy Manual and Administrative Guidelines",
    "Board Policies and Administrative Guidelines",
    "Board of Education NEOLA Policy",
    "Board of Education Policies and Administrative Guidelines",
    "Board of School Trustees Policy Manual",
    "School Board Policies and Guidelines",
    "Board Policy",
    "Policies and Administrative Guidelines",
    "Board of Education Policy and Administrative Guidelines",
    "Policies & Administrative Guidelines",
    "Meetings, Agendas, Policy Manual",
    "Joint Operating Committee (JOC) Policy Manual",
    "Board Policy & Guidelines",
    "BOARD OF TRUSTEES",
    "School Board Policies, Meeting Agendas and Minutes",
    "Board Policies and By-laws",
    "Board of Regents",
    "Board of Education Policies and By-Laws",
    "Board of Education Bylaws and Policies/Administrative Guidelines",
    "BoardDocs LT",
    "Fairless District Policy Manual",
    "Board of Education Bylaws & Policies",
    "Board Policies, Administrative Guidelines and Forms",
    "Board Policy & Bylaws",
    "Success for all in the 21st Century . . ."
]

df.loc[df["title_1"].isin(remove_title_1_list), "title_1"] = None

In [8562]:
# I asked ChatGPT to inspect and find more titles I can remove

remove_title_2_list = [
    "POLICY",
    "Neola Board Policies & Guidelines",
    "Board of Education Policy and Guidelines",
    "NEOLA Policy Manual",
    "Board Policy",
    "Board Policies and Guidelines",
    "Policies And Administrative Guidelines",
    "Board Policies, Administrative Guidelines, and Forms",
    "Board of Education Policy",
    "Board Bylaws and Policies",
    "Policies - Bylaws",
    "Board of Education Bylaws and Policies",
    "Neola Board Policy & Administrative Guidelines",
    "Board Policy Handbook",
    "Policies of the Board of Education",
    "NEOLA Board Policy Manual",
    "Policies & Administrative Guidelines",
    "Policies & Bylaws",
    "Board Bylaws and Policies",
    "Board Policies",
    "Board of Education Policies",
    "Board of Education Policy and Administrative Guidelines",
    "Board Policy Manual",
    "Board of Education Policies and Administrative Guidelines",
    "BoardDocs LT",
    "Policies - Bylaws",
    "Policies of the Board of Education",
    "Board Policy and By-Laws",
    "Policies and Administrative Guidelines",
    "Board of Education Bylaws & Policies",
    "Board Policy",
    "Policies & Administrative Guidelines",
    "Board of Education School Policies",
    "School Board",
    "School Board Policies & Administrative Regulations",
    "School Board Policy",
    "Board of School Trustees Policy Manual",
    "School Board Policies and Guidelines",
    "School Board Policy ",
    "Board Policy",
    "Bylaws and Policies",
    "Meetings, Agenda and Information",
    "eGovernance System",
    "Meeting Packets",
    "Opportunity. Equity. Social Justice.",
    "Neola Board Policies & Guidelines",
    "Board Of Education Policy",
    "Board of Education NEOLA Policy",
    "Board Of Education Policies",
    "NEOLA - Board of Education Policies",
    "Board Policy Manual and Administrative Guidelines",
    "Policies & By-Laws",
    "Providing today's students opportunities to become tomorrow's leaders",
    "Board of School Trustees Policy",
    "Corporation Policies",
    "Board of Education Neola Polcies",
    "NEOLA Board Policies and By-Laws",
    "Board of Education Policy /Administrative Guidelines",
    "Board of Education Policies and Bylaws",
    "Board of Education Policies and Guidelines",
    "Neola Board of Education Policy Manual",
    "Neola - Board Policy",
    "Neola Board Policy",
    "Creating the Greatest Opportunities for Our Students"
]

df.loc[df["title_2"].isin(remove_title_2_list), "title_2"] = None

In [8563]:
# now, we check for the intersections again

# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

79

In [8564]:
# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
3549,https://go.boarddocs.com/ny/wvalley/Board.nsf/Public,West Valley Central School District,"5359 School Street West Valley, NY 14171",https://www.wvalley.org/,False,True,"5359 School Street West Valley, NY 14171",True,True
365,https://go.boarddocs.com/ks/usd290/Board.nsf/Public,Ottawa School District,"Ottawa School District 1404 South Ash| Ottawa, Kansas 66067 | Phone: 785.229.8010 Fax: 785.229.8019",http://www.usd290.org,False,True,"Ottawa School District 1404 South Ash| Ottawa, Kansas 66067 | Phone: 785.229.8010 Fax: 785.229.8019",True,True
3326,https://go.boarddocs.com/mi/trav/Board.nsf/Public,TCAPS School Board Policies & Guidelines,Traverse City Area Public Schools,http://www.tcaps.net/board,False,False,,True,True
3071,https://go.boarddocs.com/pa/rasd/Board.nsf/Public,"62 School Drive | Ridgway, PA 15853 | p 814-773-3146 | f 814-776-4299",Ridgway Area School District,http://www.rasd.us,True,False,"62 School Drive | Ridgway, PA 15853 | p 814-773-3146 | f 814-776-4299",True,True
2588,https://go.boarddocs.com/in/wayne/Board.nsf/Public,Metropolitan School District of Wayne Township,"1220 South High School Road | Indianapolis, Indiana 46241 | Ph: 317-988-8600",http://www.wayne.k12.in.us,False,True,"1220 South High School Road | Indianapolis, Indiana 46241 | Ph: 317-988-8600",True,True


In [8565]:
# since a lot of it is from addresses,
# I will remove those where we have ported to the address field

df.loc[df["title_1"] == df["address"], "title_1"] = None
df.loc[df["title_2"] == df["address"], "title_2"] = None

In [8566]:
# now, we check for the intersections again

# let's check if they have the word school

df["title_1_has_school"] = df["title_1"].str.contains("school",case=False,na=False)
df["title_2_has_school"] = df["title_2"].str.contains("school",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school"] & df["title_2_has_school"]].shape[0]

20

In [8567]:
# from 97 to 38, pretty good

# check out these rows with both cols having schools
df[df["title_1_has_school"] & df["title_2_has_school"]].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,False,False,,True,True
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,False,False,,True,True
2878,https://go.boarddocs.com/pa/daup/Board.nsf/Public,"School Board Policy Manual 6001 Locust Lane, Harrisburg, PA",Dauphin County Technical School,http://www.dcts.org,False,False,,True,True
673,https://go.boarddocs.com/pa/laur/Board.nsf/Public,Laurel School Board Agendas and Policy Manual,Laurel School District,http://www.laurel.k12.pa.us,False,False,,True,True
3326,https://go.boarddocs.com/mi/trav/Board.nsf/Public,TCAPS School Board Policies & Guidelines,Traverse City Area Public Schools,http://www.tcaps.net/board,False,False,,True,True


In [8568]:
# now, let's check those with both titles still intact

n = df[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]
print(f"Number of rows with both cols intact: {n}")

Number of rows with both cols intact: 156


In [8569]:
# let's make a school district column
# only do those to rows where EXACTLY one column has the word school district

df["title_1_has_school_district"] = df["title_1"].str.contains("school district",case=False,na=False)
df["title_2_has_school_district"] = df["title_2"].str.contains("school district",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school_district"] & df["title_2_has_school_district"]].shape[0]

4

In [8570]:
# check out these with both
df[df["title_1_has_school_district"] & df["title_2_has_school_district"]]

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school,title_1_has_school_district,title_2_has_school_district
538,https://go.boarddocs.com/ny/trivc/Board.nsf/Public,Tri-Valley Central School District Board of Education,Tri-Valley Central School District,http://www.trivalleycsd.org/,False,False,,True,True,True,True
769,https://go.boarddocs.com/pa/epns/Board.nsf/Public,EAST PENNSBORO AREA SCHOOL DISTRICT,East Pennsboro Area School District,http://www.epasd.org,False,False,,True,True,True,True
1593,https://go.boarddocs.com/oh/labr/Board.nsf/Public,Policies of the LaBrae Local School District,LaBrae Local School District,https://labrae.school,False,False,,True,True,True,True
2031,https://go.boarddocs.com/wi/vasd/Board.nsf/Public,Verona Area School District,Verona Area School District,http://www.verona.k12.wi.us,False,False,,True,True,True,True


In [8571]:
# we can remove all in title_1
df.loc[df["title_1_has_school_district"] & df["title_2_has_school_district"], "title_1"] = None

In [8572]:
# let's make a school district column
# only do those to rows where EXACTLY one column has the word school district

df["title_1_has_school_district"] = df["title_1"].str.contains("school district",case=False,na=False)
df["title_2_has_school_district"] = df["title_2"].str.contains("school district",case=False,na=False)

# number of rows with schools in both cols
df[df["title_1_has_school_district"] & df["title_2_has_school_district"]].shape[0]

0

In [8573]:
# now we can assign the school_district column

df.loc[df["title_1_has_school_district"],"school_district"] = df["title_1"]
df.loc[df["title_1_has_school_district"],"title_1"] = None

df.loc[df["title_2_has_school_district"],"school_district"] = df["title_2"]
df.loc[df["title_2_has_school_district"],"title_2"] = None

In [8574]:
# now let's look at the remaining columns
# first check those where both cols are still intact

df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]

70

In [8575]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,Title1_has_zipcode,Title2_has_zipcode,address,title_1_has_school,title_2_has_school,title_1_has_school_district,title_2_has_school_district,school_district
3500,https://go.boarddocs.com/mi/carain/Board.nsf/Public,Carman-Ainsworth Community Schools,810-591-3700,http://www.carmanainsworth.org,False,False,,True,False,False,False,
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,False,False,,False,False,False,False,
2918,https://go.boarddocs.com/oh/sidn/Board.nsf/Public,Sidney City Schools,"750 S. Fourth Ave., Sidney, OH 937-497-2200",http://www.sidneycityschools.org,False,False,,True,False,False,False,
1537,https://go.boarddocs.com/oh/moesc/Board.nsf/Public,Mid-Ohio Policies,Mid-Ohio ESC,www.moesc.net,False,False,,False,False,False,False,
3046,https://go.boarddocs.com/mi/badax/Board.nsf/Public,BAPS,Home of the Hatchets,www.badaxeps.org,False,False,,False,False,False,False,


In [8576]:
# now, let's check those with policy or policies in their titles

df["title_1_has_policy"] = df["title_1"].str.contains("policy",case=False,na=False) | df["title_1"].str.contains("policies",case=False,na=False)
df["title_2_has_policy"] = df["title_2"].str.contains("policy",case=False,na=False) | df["title_2"].str.contains("policies",case=False,na=False)

# number of rows with policy in any cols
df[df["title_1_has_policy"] | df["title_2_has_policy"]].shape[0]

50

In [8577]:
# declutter columns
df = df.loc[:,["URL","title_1","title_2","home_website","address","school_district"]]

In [8578]:
df.head()

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,,St. Joseph Public Schools,https://www.sjschools.org/,,
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,,,www.calsd.org,,
2,https://go.boarddocs.com/oh/mapleheights/Board.nsf/Public,Maple Heights City Schools,,http://www.mapleschools.com,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",
3,https://go.boarddocs.com/oh/rlsd/Board.nsf/Public,,,https://www.riversidelocalschools.com/,"585 Riverside Drive | Painesville, Ohio 44077 | 440.352.0668 | f 440.639.1959",Riverside Local School District
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,,,http://www.shcsd.org,,Southern Huntingdon County School District


In [8579]:
# let's try to common title_1 and title_2
# first standardize None and NaN

print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_1"])
print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_1"].isna())

1    None
Name: title_1, dtype: object
1    True
Name: title_1, dtype: bool


In [8580]:
print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_2"])
print(df.loc[df["URL"]=="https://go.boarddocs.com/pa/cali/Board.nsf/Public", "title_2"].isna())

1    NaN
Name: title_2, dtype: object
1    True
Name: title_2, dtype: bool


In [8581]:
# set them to None
df.loc[df["title_1"].isna(), "title_1"] = None
df.loc[df["title_2"].isna(), "title_2"] = None

In [8582]:
# check those with both not None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]

70

In [8583]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].sample(5)

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district
1987,https://go.boarddocs.com/oh/nls/Board.nsf/Public,600 Lemoyne Rd | Northwood OH | P: (419) 691-3888 | F: (419) 697-2470,Northwood Local Schools,http://www.northwoodschools.org/site/default.aspx?PageID=1,,
3260,https://go.boarddocs.com/wi/chilsd/Board.nsf/Public,530 Main Street,Chilton Public Schools,www.chilton.k12.wi.us,,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,
2683,https://go.boarddocs.com/mi/slps/Board.nsf/Public,Spring Lake Public Schools,616-846-5500,http://www.springlakeschools.org,,
2047,https://go.boarddocs.com/ok/okcps/Board.nsf/Public,Oklahoma City Public Schools,Ignite Passion. Instill Pride.,https://go.boarddocs.com/ok/okcps/Board.nsf/,,


In [8584]:
# get those with phone numbers out
import re

# Function to check if a string contains a phone number
def contains_phone_number(value):
    # Regular expression for phone numbers
    phone_pattern = re.compile(r'\(?\b\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b')
    if pd.isnull(value):
        return False
    return bool(phone_pattern.search(str(value)))

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df['contains_phone_number_title_2'] = df['title_2'].apply(contains_phone_number)

In [8585]:
both_col_non_na_df = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]
both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2
1112,https://go.boarddocs.com/la/pcpsb/Board.nsf/Public,"337 Napoleon Street ● New Roads, Louisiana ● p 225-638-8674 ● f 225-638-3237",Pointe Coupee Parish School System,http://www.pcpsb.net/,,,True,False
1576,https://go.boarddocs.com/vsba/fairfax/Board.nsf/Public,"8115 Gatehouse Road, Suite 5400 | Falls Church, VA | 571-423-1075",Fairfax County School Board,http://www.fcps.edu,,,True,False
1987,https://go.boarddocs.com/oh/nls/Board.nsf/Public,600 Lemoyne Rd | Northwood OH | P: (419) 691-3888 | F: (419) 697-2470,Northwood Local Schools,http://www.northwoodschools.org/site/default.aspx?PageID=1,,,True,False
2150,https://go.boarddocs.com/oh/polaris/Board.nsf/Public,"7285 Old Oak Blvd., | Middleburg Heights, OH | 440-891-7600",Polaris Career Center,http://www.polaris.edu/,,,True,False
3356,https://go.boarddocs.com/wa/cowa/Board.nsf/Public,City Council Chambers ~ 500 E. Main Street ~ 509-488-5686 ~ www.othellowa.gov,Othello Washington ~ City Council ~ Serving The Community,http://www.othellowa.gov,,,True,False


In [8586]:
# all the above are address
move_these = both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_1"]]["URL"]
df.loc[df["URL"].isin(move_these), "address"] = df[df["URL"].isin(move_these)]["title_1"]
df.loc[df["URL"].isin(move_these), "title_1"] = None

In [8587]:
both_col_non_na_df = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]
both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_2"]]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2
159,https://go.boarddocs.com/in/resc/Board.nsf/Public,Randolph Eastern School Corporation,7659644994,http://www.resc.k12.in.us,,,False,True
440,https://go.boarddocs.com/mi/mionline/Board.nsf/Public,Michigan Online School,Phone: (269) 216-6972,https://www.michiganonlineschool.com/,,,False,True
624,https://go.boarddocs.com/in/triton/Board.nsf/Public,Triton School Corporation,Home of the Trojans/574-342-2255/www.triton.k12.in.us,http://www.triton.k12.in.us,,,False,True
715,https://go.boarddocs.com/mi/brand/Board.nsf/Public,Brandywine Community Schools,269-684-7150,http://www.brandywinebobcats.org,,,False,True
1608,https://go.boarddocs.com/oh/sclsd/Board.nsf/Public,South Central Local Schools,3305 Greenwich Angling Rd | 419-752-3815,http://www.south-central.org,,,False,True
1611,https://go.boarddocs.com/mi/whitec/Board.nsf/Public,White Cloud Public Schools,231-689-6591,www.whitecloud.net,,,False,True
1997,https://go.boarddocs.com/mi/chip/Board.nsf/Public,Chippewa Valley Schools,586-723-2004,http://www.cvs.k12.mi.us,,,False,True
2306,https://go.boarddocs.com/mi/clark/Board.nsf/Public,Clarkston Community Schools,248-623-5400,http://www.clarkston.k12.mi.us,,,False,True
2383,https://go.boarddocs.com/mi/white/Board.nsf/Public,Whitefish Township Community Schools,(906) 492-3353,http://whitefish.eupschools.org,,,False,True
2621,https://go.boarddocs.com/in/sgib/Board.nsf/Public,South Gibson School Corporation,812-753-4230,http://www.sgibson.k12.in.us,,,False,True


In [8588]:
# move those two that are address
move_these = ["https://go.boarddocs.com/oh/sclsd/Board.nsf/Public", "https://go.boarddocs.com/oh/sidn/Board.nsf/Public"]
df.loc[df["URL"].isin(move_these), "address"] = df["title_2"]
df.loc[df["URL"].isin(move_these), "title_2"] = None

In [8589]:
both_col_non_na_df = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]
both_col_non_na_df[both_col_non_na_df["contains_phone_number_title_2"]]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2
159,https://go.boarddocs.com/in/resc/Board.nsf/Public,Randolph Eastern School Corporation,7659644994,http://www.resc.k12.in.us,,,False,True
440,https://go.boarddocs.com/mi/mionline/Board.nsf/Public,Michigan Online School,Phone: (269) 216-6972,https://www.michiganonlineschool.com/,,,False,True
624,https://go.boarddocs.com/in/triton/Board.nsf/Public,Triton School Corporation,Home of the Trojans/574-342-2255/www.triton.k12.in.us,http://www.triton.k12.in.us,,,False,True
715,https://go.boarddocs.com/mi/brand/Board.nsf/Public,Brandywine Community Schools,269-684-7150,http://www.brandywinebobcats.org,,,False,True
1611,https://go.boarddocs.com/mi/whitec/Board.nsf/Public,White Cloud Public Schools,231-689-6591,www.whitecloud.net,,,False,True
1997,https://go.boarddocs.com/mi/chip/Board.nsf/Public,Chippewa Valley Schools,586-723-2004,http://www.cvs.k12.mi.us,,,False,True
2306,https://go.boarddocs.com/mi/clark/Board.nsf/Public,Clarkston Community Schools,248-623-5400,http://www.clarkston.k12.mi.us,,,False,True
2383,https://go.boarddocs.com/mi/white/Board.nsf/Public,Whitefish Township Community Schools,(906) 492-3353,http://whitefish.eupschools.org,,,False,True
2621,https://go.boarddocs.com/in/sgib/Board.nsf/Public,South Gibson School Corporation,812-753-4230,http://www.sgibson.k12.in.us,,,False,True
2683,https://go.boarddocs.com/mi/slps/Board.nsf/Public,Spring Lake Public Schools,616-846-5500,http://www.springlakeschools.org,,,False,True


In [8590]:
# move the others to phone numbers
df["phone"] = None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["contains_phone_number_title_2"], "phone"] = df["title_2"]
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["contains_phone_number_title_2"], "title_2"] = None

In [8591]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
42,https://go.boarddocs.com/oh/rid/Board.nsf/Public,Ridgemont Local Schools,"560 W. Taylor Street Mount Victory, OH",http://www.ridgemont.k12.oh.us,,,False,False,
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
356,https://go.boarddocs.com/ca/mendocino/Board.nsf/Public,Mendocino-Lake Community College District,"1000 Hensley Creek Road, Ukiah, CA",https://www.mendocino.edu,,,False,False,
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,Ripley-Ohio-Dearborn Special Education Cooperative,ROD Board Policy,http://www.rodspecialeducation.org,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,


In [8592]:
# check those with numbers
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
1386,https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public,LSR-7,Learning for Life,http://www.lsr7.org,,,False,False,
2878,https://go.boarddocs.com/pa/daup/Board.nsf/Public,"School Board Policy Manual 6001 Locust Lane, Harrisburg, PA",Dauphin County Technical School,http://www.dcts.org,,,False,False,
3260,https://go.boarddocs.com/wi/chilsd/Board.nsf/Public,530 Main Street,Chilton Public Schools,www.chilton.k12.wi.us,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [8593]:
# edit the outlier
df.loc[df["URL"]=="https://go.boarddocs.com/pa/daup/Board.nsf/Public", "title_1"] = "6001 Locust Lane, Harrisburg, PA"

In [8594]:
# check those with numbers
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
1386,https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public,LSR-7,Learning for Life,http://www.lsr7.org,,,False,False,
2878,https://go.boarddocs.com/pa/daup/Board.nsf/Public,"6001 Locust Lane, Harrisburg, PA",Dauphin County Technical School,http://www.dcts.org,,,False,False,
3260,https://go.boarddocs.com/wi/chilsd/Board.nsf/Public,530 Main Street,Chilton Public Schools,www.chilton.k12.wi.us,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [8595]:
# move these two that are addresses
move_these = ["https://go.boarddocs.com/pa/daup/Board.nsf/Public", "https://go.boarddocs.com/wi/chilsd/Board.nsf/Public"]
df.loc[df["URL"].isin(move_these), "address"] = df["title_1"]
df.loc[df["URL"].isin(move_these), "title_1"] = None

# check
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
1386,https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public,LSR-7,Learning for Life,http://www.lsr7.org,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [8596]:
# move the second one to district and delete the two others
df.loc[df["URL"]=="https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/mo/lsr7sd/Board.nsf/Public", "title_1"] = None

# check
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
735,https://go.boarddocs.com/pa/iu29/Board.nsf/Public,Schuylkill Intermediate Unit 29 & Schuylkill Technology Center Boards of Directors,Schuylkill Intermediate Unit 29,http://www.iu29.org,,,False,False,
3456,https://go.boarddocs.com/vsba/vhsl/Board.nsf/Public,Serving Youth Since 1913,Virginia High School League,http://www.vhsl.org/about_vhsl/executive_committee,,,False,False,


In [8597]:
df.loc[idx & df["title_1"].str.contains(number_pattern, na=False), "title_1"] = None

In [8598]:
# do the same for title_2
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx & df["title_2"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
42,https://go.boarddocs.com/oh/rid/Board.nsf/Public,Ridgemont Local Schools,"560 W. Taylor Street Mount Victory, OH",http://www.ridgemont.k12.oh.us,,,False,False,
356,https://go.boarddocs.com/ca/mendocino/Board.nsf/Public,Mendocino-Lake Community College District,"1000 Hensley Creek Road, Ukiah, CA",https://www.mendocino.edu,,,False,False,
1203,https://go.boarddocs.com/in/brownsburg/Board.nsf/Public,Brownsburg Community School Corporation,"310 Stadium Drive Brownsburg, IN",https://www.brownsburg.k12.in.us/,,,False,False,


In [8599]:
# move them to address
df.loc[idx & df["title_2"].str.contains(number_pattern, na=False),"address"] = df["title_2"]
df.loc[idx & df["title_2"].str.contains(number_pattern, na=False),"title_2"] = None

In [8600]:
# check the rest
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
df.loc[idx,:]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,Ripley-Ohio-Dearborn Special Education Cooperative,ROD Board Policy,http://www.rodspecialeducation.org,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1214,https://go.boarddocs.com/vsba/scs/Board.nsf/Public,Together - We Prepare Our Students for Their Future,Spotsylvania County Public Schools,http://www.spotsylvania.k12.va.us/,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,


In [8601]:
# check those with schools in one but not in the other
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_1"].str.contains("school",case=False,na=False) & ~df["title_2"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1448,https://go.boarddocs.com/mi/csps/Board.nsf/Public,Cedar Springs Public Schools,Board of Education Board Policies,http://www.csredhawks.org,,,False,False,
1548,https://go.boarddocs.com/mi/lfen/Board.nsf/Public,Lake Fenton Community Schools,Home of the Lake Fenton Blue Devils,http://www.lakefentonschools.org,,,False,False,
1980,https://go.boarddocs.com/mi/fwlv/Board.nsf/Public,Fowlerville Community Schools,Home of the Gladiators,http://www.fowlervilleschools.org,,,False,False,
2047,https://go.boarddocs.com/ok/okcps/Board.nsf/Public,Oklahoma City Public Schools,Ignite Passion. Instill Pride.,https://go.boarddocs.com/ok/okcps/Board.nsf/,,,False,False,
2108,https://go.boarddocs.com/oh/maplecc/Board.nsf/Public,SCHOOL POLICIES AND GUIDELINES,MAPLEWOOD CAREER CENTER,http://www.mwood.cc/,,,False,False,
2301,https://go.boarddocs.com/vsba/louisa/Board.nsf/Public,Louisa County Public Schools,Learners' Community,http://www.lcps.k12.va.us/education/components/scrapbook/default.php?sectiondetailid=1308&PHPSESSID=00a5e9972c66fd8162a098aed2356931,,,False,False,
2819,https://go.boarddocs.com/in/sssc/Board.nsf/Public,Southwest School Corporation,By-Laws and Policies,http://www.swest.k12.in.us,,,False,False,
3125,https://go.boarddocs.com/in/valp/Board.nsf/Public,Valparaiso Community Schools,Home of the Vikings,http://www.valpo.k12.in.us,,,False,False,
3126,https://go.boarddocs.com/oh/zville/Board.nsf/Public,Zanesville City Schools District,Home of the Blue Devils,http://www.zanesville.k12.oh.us,,,False,False,


In [8602]:
# take out the outlier
df.loc[df["URL"] == "https://go.boarddocs.com/oh/maplecc/Board.nsf/Public", "title_1"] = None

In [8603]:
# check those with schools in one but not in the other
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_1"].str.contains("school",case=False,na=False) & ~df["title_2"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
985,https://go.boarddocs.com/mi/byr/Board.nsf/Public,Byron Area Schools,Home of the Eagles,http://www.byron.k12.mi.us,,,False,False,
1448,https://go.boarddocs.com/mi/csps/Board.nsf/Public,Cedar Springs Public Schools,Board of Education Board Policies,http://www.csredhawks.org,,,False,False,
1548,https://go.boarddocs.com/mi/lfen/Board.nsf/Public,Lake Fenton Community Schools,Home of the Lake Fenton Blue Devils,http://www.lakefentonschools.org,,,False,False,
1980,https://go.boarddocs.com/mi/fwlv/Board.nsf/Public,Fowlerville Community Schools,Home of the Gladiators,http://www.fowlervilleschools.org,,,False,False,
2047,https://go.boarddocs.com/ok/okcps/Board.nsf/Public,Oklahoma City Public Schools,Ignite Passion. Instill Pride.,https://go.boarddocs.com/ok/okcps/Board.nsf/,,,False,False,
2301,https://go.boarddocs.com/vsba/louisa/Board.nsf/Public,Louisa County Public Schools,Learners' Community,http://www.lcps.k12.va.us/education/components/scrapbook/default.php?sectiondetailid=1308&PHPSESSID=00a5e9972c66fd8162a098aed2356931,,,False,False,
2819,https://go.boarddocs.com/in/sssc/Board.nsf/Public,Southwest School Corporation,By-Laws and Policies,http://www.swest.k12.in.us,,,False,False,
3125,https://go.boarddocs.com/in/valp/Board.nsf/Public,Valparaiso Community Schools,Home of the Vikings,http://www.valpo.k12.in.us,,,False,False,
3126,https://go.boarddocs.com/oh/zville/Board.nsf/Public,Zanesville City Schools District,Home of the Blue Devils,http://www.zanesville.k12.oh.us,,,False,False,
3164,https://go.boarddocs.com/mi/clio/Board.nsf/Public,Clio Area Schools,Home of the Mustangs,http://www.clioschools.org,,,False,False,


In [8604]:
# all slogans in second col, delete
df.loc[idx, "title_2"] = None

In [8605]:
# do it for the second col
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_2"].str.contains("school",case=False,na=False) & ~df["title_1"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,
1214,https://go.boarddocs.com/vsba/scs/Board.nsf/Public,Together - We Prepare Our Students for Their Future,Spotsylvania County Public Schools,http://www.spotsylvania.k12.va.us/,,,False,False,
3308,https://go.boarddocs.com/in/hses/Board.nsf/Public,For additional information click the HOUSE icon to view streamed/archived board meeting videos.,Hamilton Southeastern Schools,https://www.hseschools.org/meet-hse/board,,,False,False,
3393,https://go.boarddocs.com/mo/nixa/Board.nsf/Public,Board of Education Meeting Information,Nixa Public Schools,http://www.nixapublicschools.net,,,False,False,
3430,https://go.boarddocs.com/mn/d196/Board.nsf/Public,"Educating, developing, and inspiring our students for lifelong success.",Rosemount - Apple Valley - Eagan Public Schools,http://www.district196.org/,,,False,False,
3443,https://go.boarddocs.com/mi/tda/Board.nsf/Public,THE DEARBORN ACADEMY,Public Charter School,http://www.thedearbornacademy.org,,,False,False,


In [8606]:
# edit the outlier
df.loc[df["URL"] == "https://go.boarddocs.com/mi/tda/Board.nsf/Public", "title_2"] = None
idx = (~df["title_1"].isna()) & (~df["title_2"].isna())
idx &= df["title_2"].str.contains("school",case=False,na=False) & ~df["title_1"].str.contains("school",case=False,na=False)
df.loc[idx]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
415,https://go.boarddocs.com/mabe/calvert/Board.nsf/Public,Calvert County Board of Education,Calvert County Public Schools,http://www.calvertnet.k12.md.us,,,False,False,
532,https://go.boarddocs.com/vsba/pwcs/Board.nsf/Public,Launching Thriving Futures,Prince William County Public Schools,https://www.pwcs.edu/,,,False,False,
1054,https://go.boarddocs.com/ga/fcss/Board.nsf/Public,Where Students Come First,Fulton County Schools,https://portal.fultonschools.org/Pages/default.aspx,,,False,False,
1214,https://go.boarddocs.com/vsba/scs/Board.nsf/Public,Together - We Prepare Our Students for Their Future,Spotsylvania County Public Schools,http://www.spotsylvania.k12.va.us/,,,False,False,
3308,https://go.boarddocs.com/in/hses/Board.nsf/Public,For additional information click the HOUSE icon to view streamed/archived board meeting videos.,Hamilton Southeastern Schools,https://www.hseschools.org/meet-hse/board,,,False,False,
3393,https://go.boarddocs.com/mo/nixa/Board.nsf/Public,Board of Education Meeting Information,Nixa Public Schools,http://www.nixapublicschools.net,,,False,False,
3430,https://go.boarddocs.com/mn/d196/Board.nsf/Public,"Educating, developing, and inspiring our students for lifelong success.",Rosemount - Apple Valley - Eagan Public Schools,http://www.district196.org/,,,False,False,


In [8607]:
# delete the first col
df.loc[idx,"title_1"] = None

In [8608]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,Ripley-Ohio-Dearborn Special Education Cooperative,ROD Board Policy,http://www.rodspecialeducation.org,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,
1537,https://go.boarddocs.com/oh/moesc/Board.nsf/Public,Mid-Ohio Policies,Mid-Ohio ESC,www.moesc.net,,,False,False,
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,


In [8609]:
# for those with the word policy, make it the first col
policy_pattern = r'polic(y|ies)'
idx = df["title_2"].str.contains(policy_pattern, case=False,na=False)
df.loc[idx, ["title_1", "title_2"]] = df.loc[idx,["title_2","title_1"]].values

  idx = df["title_2"].str.contains(policy_pattern, case=False,na=False)


In [8610]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,ROD Board Policy,Ripley-Ohio-Dearborn Special Education Cooperative,http://www.rodspecialeducation.org,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,
1537,https://go.boarddocs.com/oh/moesc/Board.nsf/Public,Mid-Ohio Policies,Mid-Ohio ESC,www.moesc.net,,,False,False,
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,


In [8611]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False)]

  df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False)]


Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
283,https://go.boarddocs.com/oh/putesc/Board.nsf/Public,Putnam County Educational Services Center District Policy Manual,Putnam County Educational Service Center,www.putnamcountyesc.org,,,False,False,
628,https://go.boarddocs.com/oh/cvcc/Board.nsf/Public,Cuyahoga Valley Career Center Bylaws and Policies,Cuyahoga Valley Career Center,http://www.cvccworks.edu/Default.aspx,,,False,False,
1050,https://go.boarddocs.com/in/rodsped/Board.nsf/Public,ROD Board Policy,Ripley-Ohio-Dearborn Special Education Cooperative,http://www.rodspecialeducation.org,,,False,False,
1161,https://go.boarddocs.com/mi/jon/Board.nsf/Public,JCS School Board Policies and Guidelines,Jonesville Community Schools,,,,False,False,
1292,https://go.boarddocs.com/oh/nwoesc/Board.nsf/Public,"NwOESC Board Agendas, Minutes, By-Laws and Policy Manual",Northwest Ohio Educational Service Center,https://www.nwoesc.org/,,,False,False,
1537,https://go.boarddocs.com/oh/moesc/Board.nsf/Public,Mid-Ohio Policies,Mid-Ohio ESC,www.moesc.net,,,False,False,
2720,https://go.boarddocs.com/mi/giresa/Board.nsf/Public,Board Policies and Bylaws,Gratiot-Isabella Regional Education Service District,http://www.giresd.net,,,False,False,
3326,https://go.boarddocs.com/mi/trav/Board.nsf/Public,TCAPS School Board Policies & Guidelines,Traverse City Area Public Schools,http://www.tcaps.net/board,,,False,False,
3407,https://go.boarddocs.com/oh/tructc/Board.nsf/Public,TCTC Board Policies,Trumbull Career and Technical Center,http://www.tctchome.com,,,False,False,
3525,https://go.boarddocs.com/pa/vnang/Board.nsf/Public,Venango Technology Center Policy Manual,Venango Technology Center,http://www.vtc1.org,,,False,False,


In [8612]:
# can remove title_1 for these
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False), "title_1"] = None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

  df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains(policy_pattern,case=False,na=False), "title_1"] = None


Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City Public Schools,Falls Church City School Board,https://www.fccps.org/page/school-board,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,
2889,https://go.boarddocs.com/ut/uen/Board.nsf/Public,Utah Education and Telehealth Network,UETN Governing Board,http://www.uetn.org,,,False,False,
2963,https://go.boarddocs.com/fl/jcsd/Board.nsf/Public,Jefferson County Schools,Jefferson County School Board,https://www.jeffersonschools.net,,,False,False,
3046,https://go.boarddocs.com/mi/badax/Board.nsf/Public,BAPS,Home of the Hatchets,www.badaxeps.org,,,False,False,
3257,https://go.boarddocs.com/in/fcsc/Board.nsf/Public,Franklin Township Community School Corp.,Franklin Township Community School Corporation,http://www.ftcsc.k12.in.us/,,,False,False,
3350,https://go.boarddocs.com/in/evsc/Board.nsf/Public,EVSC School Board Meetings,Evansville Vanderburgh School Corporation,www.evscschools.com,,,False,False,


In [8613]:
# do the same for board
idx = df["title_2"].str.contains("board", case=False,na=False)
df.loc[idx, ["title_1", "title_2"]] = df.loc[idx,["title_2","title_1"]].values

In [8614]:
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains("board",case=False,na=False)]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2060,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/Public,Falls Church City School Board,Falls Church City Public Schools,https://www.fccps.org/page/school-board,,,False,False,
2889,https://go.boarddocs.com/ut/uen/Board.nsf/Public,UETN Governing Board,Utah Education and Telehealth Network,http://www.uetn.org,,,False,False,
2963,https://go.boarddocs.com/fl/jcsd/Board.nsf/Public,Jefferson County School Board,Jefferson County Schools,https://www.jeffersonschools.net,,,False,False,
3350,https://go.boarddocs.com/in/evsc/Board.nsf/Public,EVSC School Board Meetings,Evansville Vanderburgh School Corporation,www.evscschools.com,,,False,False,


In [8615]:
# can remove title_1 for these
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna()) & df["title_1"].str.contains("board",case=False,na=False), "title_1"] = None
df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1729,https://go.boarddocs.com/co/eepto/Board.nsf/Public,"Wellington, CO",Eyestone Elementary PTO,https://eye.psdschools.org/,,,False,False,
2229,https://go.boarddocs.com/ca/sjccd/Board.nsf/Public,College of the Siskiyous,Siskiyou Joint Community College District,http://www.siskiyous.edu/,,,False,False,
2478,https://go.boarddocs.com/ca/hawking/Board.nsf/Public,Hawking STEAM Charter School,Hawking STEAM Charter Schools,https://www.hawkingschools.org/,,,False,False,
3046,https://go.boarddocs.com/mi/badax/Board.nsf/Public,BAPS,Home of the Hatchets,www.badaxeps.org,,,False,False,
3257,https://go.boarddocs.com/in/fcsc/Board.nsf/Public,Franklin Township Community School Corp.,Franklin Township Community School Corporation,http://www.ftcsc.k12.in.us/,,,False,False,


In [8616]:
# just do case by case
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "title_1"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/ca/sjccd/Board.nsf/Public", "title_1"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/ca/hawking/Board.nsf/Public", "title_1"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/mi/badax/Board.nsf/Public", "title_2"] = None

df.loc[df["URL"]=="https://go.boarddocs.com/in/fcsc/Board.nsf/Public", "title_1"] = None

df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())]

Unnamed: 0,URL,title_1,title_2,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone


In [8617]:
# check those with both not None
n = df.loc[(~df["title_1"].isna()) & (~df["title_2"].isna())].shape[0]
print(n)
assert n == 0

0


In [8618]:
# now we can combine both
df.loc[df["title_1"].isna(), "title_1"] = df["title_2"]
df = df.drop("title_2", axis=1)

In [8619]:
# see what is left
df.loc[~df["title_1"].isna()].shape[0]

1819

In [8620]:
df.loc[~df["title_1"].isna()].sample(5)

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2261,https://go.boarddocs.com/ny/nyack/Board.nsf/Public,Nyack Public Schools,http://www.nyackschools.org/,"13A Dickinson Avenue | Nyack, NY 10960 | Ph: 845-353-7000 | Fx: 845-353-7019",,False,False,
2811,https://go.boarddocs.com/id/nsd131/Board.nsf/Public,(208) 468-4600 Fax: (208) 468-4638,http://www.nsd131.org,"619 S. Canyon St | Nampa, ID 83686",,False,True,
570,https://go.boarddocs.com/mi/oasd/Board.nsf/Public,NEOLA policies,http://www.oasd.k12.mi.us,,Ontonagon Area School District,False,False,
3709,https://go.boarddocs.com/oh/ceda/Board.nsf/Public,Cedar Cliff Local Schools,https://www.cedarcliffschools.net/,,,False,False,
2897,https://go.boarddocs.com/nj/middletownk12/Board.nsf/Public,Middletown Township Public Schools,https://www.middletownk12.org/,"834 LEONARDVILLE RD. 2ND FL LEONARDO, NJ 07737",,False,False,


In [8621]:
# check those with numbers
df.loc[df["title_1"].str.contains(number_pattern, na=False)].shape[0]

134

In [8622]:
df.loc[df["title_1"].str.contains(number_pattern, na=False)]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
38,https://go.boarddocs.com/mo/mcr1/Board.nsf/Public,Macon County R-1 Schools,http://www.macon.k12.mo.us/,"702 North Missouri  Macon, Missouri 63552  (660) 395-6164",,False,False,
49,https://go.boarddocs.com/ks/susdks/Board.nsf/Public,Seaman USD 345,http://www.seamanschools.org,"901 NW Lyman Road | Topeka, KS 66608 | Ph: (785) 575-8600",,False,False,
76,https://go.boarddocs.com/pa/iu11/Board.nsf/Public,Tuscarora Intermediate Unit 11,https://www.tiu11.org/,,,False,False,
125,https://go.boarddocs.com/wi/pesh/Board.nsf/Public,341 NORTH EMERY AVENUE,www.peshtigo.k12.wi.us,,PESHTIGO SCHOOL DISTRICT,False,False,
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
...,...,...,...,...,...,...,...,...
3806,https://go.boarddocs.com/ks/usd315/Board.nsf/Public,Colby Public Schools USD 315,http://www.colbyeagles.org/,600 W 3rd St. | Colby KS 67701-2000 | p 785-460-5000 | f 785-460-5050,,False,False,
3816,https://go.boarddocs.com/fl/semi/Board.nsf/Public,Phone - 407-320-0000,www.scps.k12.fl.us,"400 E. Lake Mary Boulevard - Sanford, FL - 32773",,False,True,
3820,https://go.boarddocs.com/ks/usd311/Board.nsf/Public,Pretty Prairie USD 311 KS,https://www.usd311.com/,"206 E Main, P.O. Box 218 Pretty Prairie, Kansas 67570",,False,False,
3877,https://go.boarddocs.com/pa/neiu/Board.nsf/Public,Northeastern Educational Intermediate Unit 19,http://www.iu19.org,1200 Line Street Archbald PA 18403,,False,False,


In [8623]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]].shape[0]

38

In [8624]:
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
680,https://go.boarddocs.com/pa/prtg/Board.nsf/Public,(814) 736-9636,www.portageareasd.org,"84 Mountain Avenue, Portage, PA 15946",,True,True,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1029,https://go.boarddocs.com/mi/wake/Board.nsf/Public,(906) 224-7211,http://www.wmschools.org,,Wakefield-Marenisco School District,True,True,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,


In [8625]:
# move those without alphabets to the phone col

alphabet_pattern = r'[a-zA-Z]'
df.loc[df["contains_phone_number_title_1"] & (~df["title_1"].str.contains(alphabet_pattern,na=False)) ]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
680,https://go.boarddocs.com/pa/prtg/Board.nsf/Public,(814) 736-9636,www.portageareasd.org,"84 Mountain Avenue, Portage, PA 15946",,True,True,
1029,https://go.boarddocs.com/mi/wake/Board.nsf/Public,(906) 224-7211,http://www.wmschools.org,,Wakefield-Marenisco School District,True,True,
1765,https://go.boarddocs.com/pa/iu14/Board.nsf/Public,610-987-2248,http://www.berksiu.org/,"1111 Commons Boulevard, PO Box 16050, 19612-6050",,True,True,
1924,https://go.boarddocs.com/wi/shorewood/Board.nsf/Public,(414) 963-6901,https://www.shorewood.k12.wi.us,"Shorewood School District | 1701 East Capitol Drive | Shorewood, Wisconsin 53211",,True,True,
1985,https://go.boarddocs.com/pa/roch/Board.nsf/Public,724-775-7500,http://www.rasd.org,"540 Reno Street, Rochester, PA 15074",,True,True,
2028,https://go.boarddocs.com/mi/fowler/Board.nsf/Public,(989)593-2250,http://www.fowlerschools.net,"700 S Main Street, Fowler MI 48835",,True,True,
2525,https://go.boarddocs.com/nj/burlingtontwp/Board.nsf/Public,(609) 387-3955,https://burltwpsch.org/,,Burlington Township School District,True,True,
2767,https://go.boarddocs.com/in/jcdc/Board.nsf/Public,812-689-4114,http://www.jaccendel.k12.in.us,"723 N Buckeye St, Osgood IN 47037",,True,True,
2874,https://go.boarddocs.com/wi/hfj1/Board.nsf/Public,(262) 673-3155,www.hjt1.org,"School District of Hartford Jt. #1, 402 W. Sumner St. Hartford, WI 53027",,True,True,
3159,https://go.boarddocs.com/wi/wawmsd/Board.nsf/Public,414-604-3000,http://www.wawmsd.org,"West Allis-West Milwaukee School District, 9333 W. Lincoln Avenue, West Allis, WI 53227",,True,True,


In [8626]:
df.loc[df["contains_phone_number_title_1"] & (~df["title_1"].str.contains(alphabet_pattern,na=False)), "phone"] = df["title_1"]
df.loc[df["contains_phone_number_title_1"] & (~df["title_1"].str.contains(alphabet_pattern,na=False)), "title_1"] = None


In [8627]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]].shape[0]

28

In [8628]:
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
1574,https://go.boarddocs.com/oh/swissohio/Board.nsf/Public,Phone: 740-472-5801,https://swissohio.k12.oh.us,"304 Mill Street Woodsfield, OH 43793",,True,True,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,


In [8629]:
# the relevant info in the column is either phone, address or both
# the row with the website already has the website col filled
# first get those that are phone only

In [8630]:
# Define a pattern to match the words "phone" or "fax" only, allowing other non-alphabet characters
phone_fax_pattern = r'^[^a-zA-Z]*[Pp](hone)[^a-zA-Z]*$'

# Filter the rows
df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]

  df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1574,https://go.boarddocs.com/oh/swissohio/Board.nsf/Public,Phone: 740-472-5801,https://swissohio.k12.oh.us,"304 Mill Street Woodsfield, OH 43793",,True,True,
2140,https://go.boarddocs.com/wi/solon/Board.nsf/Public,Phone: (715) 378-2263,https://www.solonk12.net,"School District of Solon Springs, 8993 E Baldwin Ave, Solon Springs, WI 54873",,True,True,
2160,https://go.boarddocs.com/oh/galionoh/Board.nsf/Public,Phone - 419-468-3432,https://www.galionschools.org/,"Galion City Schools - 470 Portland Way North - Galion, OH 44833",,True,True,
3816,https://go.boarddocs.com/fl/semi/Board.nsf/Public,Phone - 407-320-0000,www.scps.k12.fl.us,"400 E. Lake Mary Boulevard - Sanford, FL - 32773",,True,True,


In [8631]:
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None

  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None


In [8632]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,
2173,https://go.boarddocs.com/ks/usd230/Board.nsf/Public,Info: (913) 592-7272 | T: (913) 592-7200 | F: (913) 592-7270,http://www.usd230.org,"Spring Hill School District | 17640 W. 199th Street, Spring Hill, Kansas 66083",,True,True,


In [8633]:
# Define a pattern to match the words "phone" or "fax" only, allowing other non-alphabet characters
phone_fax_pattern = r'^[^a-zA-Z]*([Pp]hone|PH|MAIN PHONE)?[^a-zA-Z]*([Ff]ax)?[^a-zA-Z]*$'

# Filter the rows
df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]

  df[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False)]


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,Phone: 715-257-7511 Fax: 715-257-7502,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,True,True,
2538,https://go.boarddocs.com/wi/campsd/Board.nsf/Public,(920) 533-8381 | Fax (920) 533 -5726,www.csd.k12.wi.us,"327 N. Fond du Lac Ave. Campbellsport, WI 53010",,True,True,
2586,https://go.boarddocs.com/oh/cevsdoh/Board.nsf/Public,"Phone: 330-627-2181, - Fax: 330-627-2182",www.carrollton.k12.oh.us,"Carrollton Exempted Village School District, - 205 Scio Road S.W., - Carrollton, OH 44615",,True,True,
2811,https://go.boarddocs.com/id/nsd131/Board.nsf/Public,(208) 468-4600 Fax: (208) 468-4638,http://www.nsd131.org,"619 S. Canyon St | Nampa, ID 83686",,True,True,
3001,https://go.boarddocs.com/fl/highlfl/Board.nsf/Public,PH: 863-471-5555,www.highlands.k12.fl.us,"School Board of Highlands County, 426 School St., Sebring, FL 33870",,True,True,
3758,https://go.boarddocs.com/fl/brevco/Board.nsf/Public,* MAIN PHONE: (321) 633-1000,https://www.brevardschools.org/,"2700 JUDGE FRAN JAMIESON WAY, VIERA, FL 32940",,True,True,


In [8634]:
# Define a pattern to match the words "phone" or "fax" only, allowing other non-alphabet characters
phone_fax_pattern = r'^[^a-zA-Z]*([Pp]hone|PH|MAIN PHONE)?[^a-zA-Z]*([Ff]ax)?[^a-zA-Z]*$'

# Filter the rows
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None

  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "phone"] = df["title_1"]
  df.loc[df['contains_phone_number_title_1'] & df['title_1'].str.contains(phone_fax_pattern, case=False, na=False), "title_1"] = None


In [8635]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,
2173,https://go.boarddocs.com/ks/usd230/Board.nsf/Public,Info: (913) 592-7272 | T: (913) 592-7200 | F: (913) 592-7270,http://www.usd230.org,"Spring Hill School District | 17640 W. 199th Street, Spring Hill, Kansas 66083",,True,True,
2366,https://go.boarddocs.com/oh/naps/Board.nsf/Public,www.napoleonareaschools.org | 419-599-7015,www.napoleon.k12.oh.us,,Napoleon Area City School District,True,False,


In [8636]:
# handle special cases
move_these = ["https://go.boarddocs.com/ks/usd230/Board.nsf/Public","https://go.boarddocs.com/oh/naps/Board.nsf/Public"]
df.loc[df["URL"].isin(move_these), "phone"] = df["title_1"]
df.loc[df["URL"].isin(move_these), "title_1"] = None

In [8637]:
# get the phone numbers

# Apply the function to a specific column (e.g., "title_2")
df['contains_phone_number_title_1'] = df['title_1'].apply(contains_phone_number)
df.loc[df["contains_phone_number_title_1"]]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
134,https://go.boarddocs.com/mi/hartl/Board.nsf/Public,Hartland Consolidated Schools | phone: 810.626.2105 | fax: 810.626.2101,http://www.hartlandschools.us,,,True,False,
252,https://go.boarddocs.com/wi/colesd/Board.nsf/Public,"347 Business Highway 141 North, Coleman WI | 920-897-4011",www.coleman.k12.wi.us,,Coleman School District,True,False,
547,https://go.boarddocs.com/wi/nfdl/Board.nsf/Public,"1115 Thurke Ave - North Fond du Lac, WI - (920) 929-3750",www.nfdlschools.org,,,True,False,
767,https://go.boarddocs.com/mi/cmps/Board.nsf/Public,Central Montcalm Public School | Office: 989-831-2001 | Fax: 989-831-2010,http://www.central-montcalm.org,,,True,False,
856,https://go.boarddocs.com/mi/elk/Board.nsf/Public,"Elk Rapids Central Office • 308 Meguzee Point Rd • Elk Rapids, Michigan • Phone: (231) 264-8692 Fax: (231) 264-6538",www.erschools.com,,,True,False,
1064,https://go.boarddocs.com/wi/spartan/Board.nsf/Public,"900 E. Montgomery St. Sparta, Wisconsin | (608) 366-3400",https://www.spartan.org,,Sparta Area School District,True,True,
1445,https://go.boarddocs.com/oh/hunt/Board.nsf/Public,"188 Huntsman Road | Chillicothe, OH | P: 740.663.5892 | F: 740.663.6078",http://www.huntsmen.org/,,Huntington Local School District,True,False,
2128,https://go.boarddocs.com/ut/nebo/Board.nsf/Public,"350 S. Main | Spanish Fork, Utah | 801-354-7400",http://www.nebo.edu,,Nebo School District Board of Education,True,False,
2495,https://go.boarddocs.com/in/brem/Board.nsf/Public,Bremen Public Schools | Phone: (574) 546-3929 | Fax: (574) 546-6303 | School Board Policies and Guidelines,https://www.bps.k12.in.us,,,True,False,
2808,https://go.boarddocs.com/wi/ashland/Board.nsf/Public,"District Office - 2000 Beaser Avenue; Ashland, WI; (715) 682-7080",http://www.ashland.k12.wi.us/,,School District of Ashland Board of Education,True,False,


In [8638]:
# these are all addresses, move them
df.loc[df["contains_phone_number_title_1"], "address"] = df["title_1"]
df.loc[df["contains_phone_number_title_1"], "title_1"] = None

In [8639]:
# now check the remaining ones
print(df[~df["title_1"].isna()].shape[0])
df[~df["title_1"].isna()].sample(5)

1781


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
3531,https://go.boarddocs.com/ca/avc/Board.nsf/Public,Antelope Valley Community College District,http://www.avc.edu/,3041 West Avenue K | Lancaster CA 93536 | 661.722.6300,,False,False,
2788,https://go.boarddocs.com/mi/fent/Board.nsf/Public,FENTON AREA PUBLIC SCHOOLS,https://www.fentonschools.org,,,False,False,
2236,https://go.boarddocs.com/oh/ashtech/Board.nsf/Public,Ashtabula County Technical & Career Center,http://www.atech.edu,,,False,False,
162,https://go.boarddocs.com/ca/sbcc/Board.nsf/Public,Santa Barbara City College,http://www.sbcc.cc.ca.us/,"721 Cliff Drive | Santa Barbara, CA 93109-2394 | Main Campus: 805.965.0581",,False,False,
3905,https://go.boarddocs.com/ca/gjccd/Board.nsf/Public,Gavilan Joint Community College District,http://www.gavilan.edu/,"5055 Santa Teresa Blvd | Gilroy, CA 95020 | (408) 848 - 4800",,False,False,


In [8640]:
# check those with commas
# could be addresses
# check those without addresses
df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
723,https://go.boarddocs.com/il/thsd211/Board.nsf/Public,"G.A. McElroy Administration Center, 1750 South Roselle Road, Palatine, Illinois",http://www.adc.d211.org,,,False,False,
882,https://go.boarddocs.com/il/asd4/Board.nsf/Public,"222 N. Kennedy Drive, Addison, IL",http://www.asd4.org,,Addison School District 4,False,False,
901,https://go.boarddocs.com/nd/bsd7/Board.nsf/Public,"Agendas, Minutes, District Information, Policies and Events",http://www.belcourt.k12.nd.us/education/components/scrapbook/default.php?sectiondetailid=1376&,,Turtle Mountain Community Schools/Belcourt School District #7,False,False,
977,https://go.boarddocs.com/mo/rockport/Board.nsf/Public,"600 S Nebraska St | Rock Port, MO",https://rockport.k12.mo.us/,,Rock Port R-II School District,False,False,
1736,https://go.boarddocs.com/pa/uncf/Board.nsf/Public,"Meetings, Agendas, and Information",http://www.ucfsd.org,,Unionville-Chadds Ford School District,False,False,
1800,https://go.boarddocs.com/pa/boyr/Board.nsf/Public,"Meetings, Agendas, Information",http://www.boyertownasd.org,,,False,False,
2077,https://go.boarddocs.com/mo/unionrxi/Board.nsf/Public,"Meetings, Agendas, and Information",https://www.unionrxi.org,,Union R-XI School District,False,False,
2134,https://go.boarddocs.com/in/pike/Board.nsf/Public,"Administrative Services Center - 6901 Zionsville Road, Indianapolis, IN & via Live Streaming at: https://www.youtube.com/channel/UCsnM2UOzNfDocPPLZzktSOg",http://www.pike.k12.in.us/,,Metropolitan School District of Pike Township,False,False,
2505,https://go.boarddocs.com/wv/brooke/Board.nsf/Public,"Brooke County Schools - Excellence, Tradition and Bruin Pride!",https://www.brooke.k12.wv.us,,,False,False,
2521,https://go.boarddocs.com/oh/brightoh/Board.nsf/Public,"Meeting Agendas, Minutes and Policies",https://www.blsd.us,,Bright Local School District,False,False,


In [8641]:
# the agenda value keep coming up
# let's check for all rows on this
df.loc[df["title_1"].str.contains("agenda",case=False,na=False)]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
157,https://go.boarddocs.com/il/sd163/Board.nsf/Public,Board Meeting Agendas,http://www.sd163.com,,,False,False,
673,https://go.boarddocs.com/pa/laur/Board.nsf/Public,Laurel School Board Agendas and Policy Manual,http://www.laurel.k12.pa.us,,Laurel School District,False,False,
901,https://go.boarddocs.com/nd/bsd7/Board.nsf/Public,"Agendas, Minutes, District Information, Policies and Events",http://www.belcourt.k12.nd.us/education/components/scrapbook/default.php?sectiondetailid=1376&,,Turtle Mountain Community Schools/Belcourt School District #7,False,False,
1163,https://go.boarddocs.com/ca/vibrantminds/Board.nsf/Public,"Board Agendas, Minutes, and Policies",https://www.vibrantminds.us/,"412 W. CARL KARCHER WAY ANAHEIM, CA 92801 | ​714-563-2390",,False,False,
1736,https://go.boarddocs.com/pa/uncf/Board.nsf/Public,"Meetings, Agendas, and Information",http://www.ucfsd.org,,Unionville-Chadds Ford School District,False,False,
1800,https://go.boarddocs.com/pa/boyr/Board.nsf/Public,"Meetings, Agendas, Information",http://www.boyertownasd.org,,,False,False,
2077,https://go.boarddocs.com/mo/unionrxi/Board.nsf/Public,"Meetings, Agendas, and Information",https://www.unionrxi.org,,Union R-XI School District,False,False,
2472,https://go.boarddocs.com/pa/iu24/Board.nsf/Public,Board Agendas and Policy Manual,http://www.cciu.org,"455 Boot Road, Downingtown, PA 19335 | 484-237-5000",,False,False,
2521,https://go.boarddocs.com/oh/brightoh/Board.nsf/Public,"Meeting Agendas, Minutes and Policies",https://www.blsd.us,,Bright Local School District,False,False,
2646,https://go.boarddocs.com/oh/mayoh/Board.nsf/Public,Meeting Agendas & Minutes,http://www.mayfieldschools.org,,Mayfield City School District,False,False,


In [8642]:
# remove them

df.loc[df["title_1"].str.contains("agenda",case=False,na=False), "title_1"] = None

In [8643]:
# check those with commas
# could be addresses
# check those without addresses
df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
723,https://go.boarddocs.com/il/thsd211/Board.nsf/Public,"G.A. McElroy Administration Center, 1750 South Roselle Road, Palatine, Illinois",http://www.adc.d211.org,,,False,False,
882,https://go.boarddocs.com/il/asd4/Board.nsf/Public,"222 N. Kennedy Drive, Addison, IL",http://www.asd4.org,,Addison School District 4,False,False,
977,https://go.boarddocs.com/mo/rockport/Board.nsf/Public,"600 S Nebraska St | Rock Port, MO",https://rockport.k12.mo.us/,,Rock Port R-II School District,False,False,
2134,https://go.boarddocs.com/in/pike/Board.nsf/Public,"Administrative Services Center - 6901 Zionsville Road, Indianapolis, IN & via Live Streaming at: https://www.youtube.com/channel/UCsnM2UOzNfDocPPLZzktSOg",http://www.pike.k12.in.us/,,Metropolitan School District of Pike Township,False,False,
2505,https://go.boarddocs.com/wv/brooke/Board.nsf/Public,"Brooke County Schools - Excellence, Tradition and Bruin Pride!",https://www.brooke.k12.wv.us,,,False,False,
2627,https://go.boarddocs.com/nj/hhboe/Board.nsf/Public,"316-A Seventh Ave Haddon Heights, New Jersey",http://gogarnets.com/,,Haddon Heights School District,False,False,
2710,https://go.boarddocs.com/ak/swrsdak/Board.nsf/Public,"... educating our future, guided by our past",http://www.swrsd.org,,Southwest Region School District,False,False,
2729,https://go.boarddocs.com/ca/cvesd/Board.nsf/Public,"84 East J Street , Chula Vista , CA91910",https://www.cvesd.org/,,Chula Vista Elementary School District,False,False,
2824,https://go.boarddocs.com/mo/wpr7sd/Board.nsf/Public,"Excellence in Education, Service, Life.",https://www.zizzers.org,,West Plains School District,False,False,
3287,https://go.boarddocs.com/wi/afasd/Board.nsf/Public,"201 W. 6th Street, Friendship, WI",https://www.afasd.net/,,ADAMS-FRIENDSHIP AREA SCHOOL DISTRICT,False,False,


In [8644]:
# get those with numbers first

df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna() & df["title_1"].str.contains(number_pattern),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
723,https://go.boarddocs.com/il/thsd211/Board.nsf/Public,"G.A. McElroy Administration Center, 1750 South Roselle Road, Palatine, Illinois",http://www.adc.d211.org,,,False,False,
882,https://go.boarddocs.com/il/asd4/Board.nsf/Public,"222 N. Kennedy Drive, Addison, IL",http://www.asd4.org,,Addison School District 4,False,False,
977,https://go.boarddocs.com/mo/rockport/Board.nsf/Public,"600 S Nebraska St | Rock Port, MO",https://rockport.k12.mo.us/,,Rock Port R-II School District,False,False,
2134,https://go.boarddocs.com/in/pike/Board.nsf/Public,"Administrative Services Center - 6901 Zionsville Road, Indianapolis, IN & via Live Streaming at: https://www.youtube.com/channel/UCsnM2UOzNfDocPPLZzktSOg",http://www.pike.k12.in.us/,,Metropolitan School District of Pike Township,False,False,
2627,https://go.boarddocs.com/nj/hhboe/Board.nsf/Public,"316-A Seventh Ave Haddon Heights, New Jersey",http://gogarnets.com/,,Haddon Heights School District,False,False,
2729,https://go.boarddocs.com/ca/cvesd/Board.nsf/Public,"84 East J Street , Chula Vista , CA91910",https://www.cvesd.org/,,Chula Vista Elementary School District,False,False,
3287,https://go.boarddocs.com/wi/afasd/Board.nsf/Public,"201 W. 6th Street, Friendship, WI",https://www.afasd.net/,,ADAMS-FRIENDSHIP AREA SCHOOL DISTRICT,False,False,


In [8645]:
# move them to the address
idx = df["title_1"].str.contains(",",na=False) & df["address"].isna() & df["title_1"].str.contains(number_pattern)
df.loc[idx,"address"] = df["title_1"]
df.loc[idx,"title_1"] = None

In [8646]:
# check those with commas
# could be addresses
# check those without addresses
df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),:]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2505,https://go.boarddocs.com/wv/brooke/Board.nsf/Public,"Brooke County Schools - Excellence, Tradition and Bruin Pride!",https://www.brooke.k12.wv.us,,,False,False,
2710,https://go.boarddocs.com/ak/swrsdak/Board.nsf/Public,"... educating our future, guided by our past",http://www.swrsd.org,,Southwest Region School District,False,False,
2824,https://go.boarddocs.com/mo/wpr7sd/Board.nsf/Public,"Excellence in Education, Service, Life.",https://www.zizzers.org,,West Plains School District,False,False,


In [8647]:
# move the first one to address and remove the rest, which are slogans
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/co/eepto/Board.nsf/Public", "title_1"] = None

df.loc[df["title_1"].str.contains(",",na=False) & df["address"].isna(),"title_1"] = None

In [8648]:
# check the remaining ones
print(df.loc[~df["title_1"].isna()].shape[0])
df.loc[~df["title_1"].isna()].sample(5)

1757


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2713,https://go.boarddocs.com/ct/norwichpublic/Board.nsf/Public,Norwich Public Schools,www.norwichpublicschools.org,"90 Town Street Norwich, CT 06360 | 860-823-4200",,False,False,
3457,https://go.boarddocs.com/vsba/madison/Board.nsf/Public,Madison County Public Schools,http://www2.madisonschools.k12.va.us/,"60 School Board Court | Madison, VA 22727 | Phone: 540.948.3780",,False,False,
630,https://go.boarddocs.com/mi/lapr/Board.nsf/Public,Lapeer Community Schools,https://lcs.sharpschool.net/home,,,False,False,
3324,https://go.boarddocs.com/ny/oesj/Board.nsf/Public,Oppenheim-Ephratah-St. Johnsville CSD,http://www.oesj.org/,"44 Center Street St. | Johnsville, NY 13452 | (518) 568-7280",,False,False,
2706,https://go.boarddocs.com/ks/usd430/Board.nsf/Public,"USD 430, South Brown County",http://www.usd430.org,"522 Central Avenue | Horton, KS 66439 | 785.486.2611",,False,False,


In [8649]:
# check those with address na too
print(df.loc[~df["title_1"].isna() & df["address"].isna()].shape[0])
df.loc[~df["title_1"].isna() & df["address"].isna()].sample(5)

559


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
3383,https://go.boarddocs.com/pa/cmvt/Board.nsf/Public,Columbia-Montour Area Vocational Technical School,http://www.cmvt.us,,,False,False,
2519,https://go.boarddocs.com/mi/nbra/Board.nsf/Public,North Branch Area Schools,http://www.nbbroncos.net,,,False,False,
933,https://go.boarddocs.com/oh/benj/Board.nsf/Public,Benjamin Logan Local Schools,http://www.benjaminlogan.org,,,False,False,
3371,https://go.boarddocs.com/oh/urbana/Board.nsf/Public,Urbana City Schools,http://www.urbanacityschools.org,,,False,False,
149,https://go.boarddocs.com/mi/hollan/Board.nsf/Public,Holland Public Schools,http://www.hollandpublicschools.org/,,,False,False,


In [8650]:
# check those with school_district na too
print(df.loc[~df["title_1"].isna() & df["school_district"].isna()].shape[0])
df.loc[~df["title_1"].isna() & df["school_district"].isna()].sample(5)

1704


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
3058,https://go.boarddocs.com/mo/sps/Board.nsf/Public,Springfield Public Schools,www.sps.org/,,,False,False,
1907,https://go.boarddocs.com/mi/honeycreek/Board.nsf/Public,Honey Creek Community School,https://www.honeycreekschool.org/,"1735 S. Wagner Road, Ann Arbor, MI 48103",,False,False,
489,https://go.boarddocs.com/oh/ehovecareercenter/Board.nsf/Public,EHOVE Career Center,https://www.ehove.net/,"316 W. Mason Rd Milan, OH 44846 Ph 419-499-4663",,False,False,
3377,https://go.boarddocs.com/oh/wau/Board.nsf/Public,Wauseon Exempted Village Schools,www.wauseonindians.org,,,False,False,
3383,https://go.boarddocs.com/pa/cmvt/Board.nsf/Public,Columbia-Montour Area Vocational Technical School,http://www.cmvt.us,,,False,False,


In [8651]:
# check those with non-alphabets in title_1
non_alpha_pattern = r'[^A-Za-z ]'
print(df.loc[df["title_1"].str.contains(non_alpha_pattern, na=False)].shape[0])
df.loc[df["title_1"].str.contains(non_alpha_pattern, na=False)].sample(5)

319


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2548,https://go.boarddocs.com/mn/rcmn/Board.nsf/Public,"Rock County, Minnesota",http://www.co.rock.mn.us/,"204 East Brown Street, Luverne, MN 56156",,False,False,
2264,https://go.boarddocs.com/il/vowil/Board.nsf/Public,"Village of Wheeling, Illinois",http://www.wheelingil.gov/,"2 Community Boulevard | Wheeling, IL 60090 | (847) 459-2600 | f (847) 459-9692",,False,False,
44,https://go.boarddocs.com/mi/repuf/Board.nsf/Public,Reeths-Puffer Schools,http://www.reeths-puffer.org,,,False,False,
746,https://go.boarddocs.com/ks/usd418/Board.nsf/Public,McPherson USD 418,www.mcpherson.com,"514 North Main | McPherson, KS 67460 | 620-241-9400 | f 620-241-9410",,False,False,
26,https://go.boarddocs.com/sd/meade/Board.nsf/Public,"Meade County, South Dakota",http://www.meadecounty.org/,"1300 Sherman Street Ste 222 | Sturgis, SD 57785",,False,False,


In [8652]:
# for those that have addresses set, we can move them to school_district
df.loc[~df["address"].isna() & ~df["title_1"].isna()]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
2,https://go.boarddocs.com/oh/mapleheights/Board.nsf/Public,Maple Heights City Schools,http://www.mapleschools.com,"5740 Lawn Avenue | Maple Heights, OH 44137 | 216-587-6100",,False,False,
10,https://go.boarddocs.com/in/centergrove/Board.nsf/Public,Center Grove Community School Corporation,https://www.centergrove.k12.in.us/,"4800 West Stones Crossing Road | Greenwood, IN 46143 | (317) 881-9326",,False,False,
12,https://go.boarddocs.com/oh/pcc/Board.nsf/Public,Penta Career Center,http://www.pentacareercenter.org/,"9301 Buck Rd. | Perrysburg, Ohio 43551 | High School: 419-666-1120 Adult Education: 419-661-6555",,False,False,
15,https://go.boarddocs.com/oh/westholmes/Board.nsf/Public,West Holmes Local Schools,https://westholmes.org/,"28 W Jackson St. | Millersburg, OH 44654 | 330-674-3546",,False,False,
16,https://go.boarddocs.com/mabe/carps/Board.nsf/Public,Caroline County Public Schools,carolineschools.org,"Address: 204 Franklin Street | Denton, MD 21629 | Phone: (410) 479-1460",,False,False,
...,...,...,...,...,...,...,...,...
3897,https://go.boarddocs.com/co/jeffco/Board.nsf/Public,Jeffco Public Schools Board of Education,http://www.jeffcopublicschools.org/,1829 Denver West Drive | Golden. CO 80401 | (303) 982-6800,,False,False,
3902,https://go.boarddocs.com/il/cowil/Board.nsf/Public,City of Waukegan,http://www.waukeganil.gov/,"100 N. Martin Luther King Jr. Ave. | Waukegan, IL 60085",,False,False,
3903,https://go.boarddocs.com/ca/laccd/Board.nsf/Public,Los Angeles Community College District,http://laccd.edu,"770 Wilshire Boulevard, Los Angeles, CA 90017 | (213) 891-2000",,False,False,
3904,https://go.boarddocs.com/md/stmarysco/Board.nsf/Public,"St. Mary's County, Maryland",http://www.stmarysmd.com,"41770 Baldridge Street | Leonardtown, MD 20650 | 301-475-4200",,False,False,


In [8653]:
# check if there are any where all three are set
df.loc[~df["address"].isna() & ~df["title_1"].isna() & ~df["school_district"].isna()]

Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone


In [8654]:
# not really sure if it will be entirely correct
# check for those without address set
print(df.loc[df["address"].isna() & ~df["title_1"].isna()].shape[0])
df.loc[df["address"].isna() & ~df["title_1"].isna()].head()

559


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,St. Joseph Public Schools,https://www.sjschools.org/,,,False,False,
6,https://go.boarddocs.com/pa/fcctc/Board.nsf/Public,Franklin County Career and Technology Center,www.franklinctc.com,,,False,False,
9,https://go.boarddocs.com/ca/voc/Board.nsf/Public,Vista Oaks Charter School,https://www.vistaoaks.net,,,False,False,
20,https://go.boarddocs.com/mi/man/Board.nsf/Public,Manton Consolidated Schools,http://mantonmi.apptegy.us/o/mcs?mode=edit,,,False,False,
21,https://go.boarddocs.com/pa/cdsd/Board.nsf/Public,Quality to the Core,http://www.cdschools.org,,Central Dauphin School District,False,False,


In [8655]:
# not really sure if it will be entirely correct
# check for those with both unset
print(df.loc[df["address"].isna() & df["school_district"].isna() & ~df["title_1"].isna()].shape[0])
df.loc[df["address"].isna() & df["school_district"].isna() & ~df["title_1"].isna()].sample(5)

506


Unnamed: 0,URL,title_1,home_website,address,school_district,contains_phone_number_title_1,contains_phone_number_title_2,phone
163,https://go.boarddocs.com/mi/ros/Board.nsf/Public,Royal Oak Schools,http://www.royaloakschools.org,,,False,False,
3407,https://go.boarddocs.com/oh/tructc/Board.nsf/Public,Trumbull Career and Technical Center,http://www.tctchome.com,,,False,False,
3323,https://go.boarddocs.com/wv/tuck/Board.nsf/Public,Tucker County Schools,https://www.tuckercountyschools.com,,,False,False,
1362,https://go.boarddocs.com/mi/caro/Board.nsf/Public,Caro Community Schools,http://www.carok12.org,,,False,False,
1432,https://go.boarddocs.com/mi/flush/Board.nsf/Public,Flushing Community Schools,https://www.flushingschools.org/,,,False,False,


In [8656]:
# check how many we have left
sum(~df["title_1"].isna())

1757

In [8657]:
# send to GPT to deal with it
df.to_csv("test.csv", columns=["title_1","address","school_district"], index=False)

In [8658]:
# I gave GPT-4o the following prompt
"""
This csv has three columns: title_1, address, and school_district. Many values are empty.

Your goal is to move the value from title_1 to either address, or school_district, or simply discard it. Do not overwrite any values.

For example, for the row

Christopher Newport University,"1 Avenue of the Arts Newport News, VA 23606 | (757) 594-7000",

You should put Christopher Newport University as the school_district, which is third column, that is

,"1 Avenue of the Arts Newport News, VA 23606 | (757) 594-7000","Christopher Newport University"

Another example

Quality to the Core,,Central Dauphin School District

You should just drop the title_1, since it is a slogan

,,Central Dauphin School District

Another example

75 Chenango Ave Clinton NY13323,,Clinton Central School District

You should put this in the address column

,75 Chenango Ave Clinton NY13323,Clinton Central School District

Return a csv
"""

# followed by 

"""
Ok. Now instead of moving, add a fourth column that says "address", "school_district", "None" or "drop"
"""

# it returns the following csv

'\nOk. Now instead of moving, add a fourth column that says "address", "school_district", "None" or "drop"\n'

In [8659]:
gpt_df = pd.read_csv("classified_test_by_gpt.csv")
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,classification
959,,"South Central Community School Corporation | 9808 South 600 West Union Mills, IN 46382 | (219) 767-2263 | Fax: (219) 767-2260",,
2338,Davison Community Schools,,,school_district
3230,,"200 Elm Street | Reading, PA 19606 | 610-779-0700 | f 610-779-7104",Exeter Township School District,
1144,,"1702 School Street | Freedom, PA 15042 | (724) 775-5464",Freedom Area School District,
991,,"1191 NY Route 79 | Windsor, NY 13865",Windsor Central School District,
995,Rockford Public Schools,"501 7th Street | Rockford, IL 61104 | 815-966-3000",,school_district
801,Huntley Project Schools,"1477 Ash St. | Worden, MT 59088 |(406) 967-2540 | f (406) 967-3059",,school_district
2073,,"Lansing School District | 519 West Kalamazoo Street, Lansing, MI 48933 | Phone: (517) 755-1000",,
2801,,,Panther Valley School District,
3547,,"212 West 3rd Street | PO Box 38 | Wapato, WA 98951",,


In [8660]:
# let's check its distribution
gpt_df["classification"].value_counts(dropna=False)

classification
NaN                2149
school_district    1704
address              53
Name: count, dtype: int64

In [8661]:
# interestingly, there are no drops
# let's look at the slogan row
gpt_df[gpt_df["title_1"]=="Quality to the Core"]

Unnamed: 0,title_1,address,school_district,classification
21,Quality to the Core,,Central Dauphin School District,address


In [8662]:
# this is a pretty bad classification
# turns out GPT was using Python under the hood
# I asked GPT to classify manually instead
# here is its results
gpt_df = pd.read_csv("manual_classified_test.csv")
gpt_df.sample(10)

Unnamed: 0,title_1,address,school_district,classification
1758,,"701 W. Gregory Mount Prospect, IL 60056",Mt. Prospect School District 57,
2667,Comstock Park Public Schools,"101 School Street NE Comstock Park, MI 49321",,school_district
387,West Michigan Academy of Environmental Science,"4463 Leonard St. NW, Walker, MI 49534 Phone: (616) 791-7454",,school_district
1979,,"PO Box 980, 299 Bury Dr., Syracuse, NY 13209 Ph: (315) 468-1111 Fx: (315) 468-2755",Solvay Union Free School District,
3069,,"P.O. Box 1927 (mailing address) | 500 South Navajo Drive | Page, AZ 86040 | Tel: (928) 608-4100 | Fax: (928) 645-0067",Page Unified School District #8,
949,,102 S. Jackson St. Bluffton OH 45817,Bluffton Exempted Village School District,
1600,Littleton Public Schools,"5776 S Crocker St | Littleton, CO 80120 | ph: 303-347-3391 | fax: 303-347-4350",,school_district
1383,,"200 East St. Bernard Hwy. Chalmette, LA 70043 | (504) 301-2000",St. Bernard Parish School District,
1855,City of McPherson Kansas,"400 E Kansas Avenue | P.O. Box 1008 | McPherson, KS 67460 | 620-245-2535 | 620-245-2549",,school_district
314,Inspiring Excellence,,Hancock Place School District,address


In [8663]:
# still terrible, and I realized it still uses python under the hood
# I will start a new session and emphasize not to program
# let's check a few stats first

# number of NA title_1
print(df[df["title_1"].isna()].shape[0])

# number of none NA title_1
print(df[~df["title_1"].isna()].shape[0])

2149
1757


In [8664]:
df.columns

Index(['URL', 'title_1', 'home_website', 'address', 'school_district',
       'contains_phone_number_title_1', 'contains_phone_number_title_2',
       'phone'],
      dtype='object')

In [8665]:
df = df.drop(columns=["contains_phone_number_title_1","contains_phone_number_title_2"])

In [8666]:
# try again
df["index"] = df.index

In [8667]:
# o1 identified these as addresses
print(df.loc[df["index"].isin([155,242, 1343])])
# handle them manually
df.loc[df["URL"]=="https://go.boarddocs.com/ny/clintoncsd/Board.nsf/Public", "address"] = df["title_1"]
df.loc[df["URL"]=="https://go.boarddocs.com/ny/clintoncsd/Board.nsf/Public", "title_1"] = None

# drop the rest
df.loc[df["index"].isin([242, 1343]), "title_1"] = None

                                                          URL  \
155   https://go.boarddocs.com/ny/clintoncsd/Board.nsf/Public   
242          https://go.boarddocs.com/oh/swl/Board.nsf/Public   
1343        https://go.boarddocs.com/pa/karn/Board.nsf/Public   

                              title_1               home_website  \
155   75 Chenango Ave Clinton NY13323       https://www.ccs.edu/   
242          Southwest Licking LSD OH   http://www.swl.k12.oh.us   
1343             1446 Kittanning Pike  https://www.kcasdk12.org/   

                                                                       address  \
155                                                                       None   
242   Address: 927-A South Street, Pataskala, Ohio 43062 | Phone: 740-927-3941   
1343                                                     Karns City, PA  16041   

                      school_district phone  index  
155   Clinton Central School District  None    155  
242                            

In [8668]:
# output the rest
# use prompts/prompt_3.txt
df[~df["title_1"].isna()].to_csv("test.csv", columns=["title_1","index"],index=False)

In [8669]:
# get the o1 classification
gpt_df = pd.read_csv("gpt_results/results_10.csv")

# merge them
df = pd.merge(df, gpt_df, how="left", on="index")


In [8670]:
# get the missing ones
df[(~df["title_1"].isna()) & (df["classification"].isna()) ].to_csv("test-2.csv", columns=["title_1","index"],index=False)

In [8671]:
# sanity check
# all NAs should not have classifications
assert df[(df["title_1"].isna()) & (~df["classification"].isna()) ].shape[0] == 0

# all non-NAs should have classifications
assert df[(~df["title_1"].isna()) & (df["classification"].isna()) ].shape[0] == 0

# the number of classification should be the number of rows on the right
assert gpt_df.shape[0] == df[~df["classification"].isna()].shape[0]

In [8672]:
df["classification"].value_counts()

classification
school_district    1617
drop                 86
unsure               51
Name: count, dtype: int64

In [8673]:
# check the ones that are unsure
df[df["classification"]=="unsure"]

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,classification
26,https://go.boarddocs.com/sd/meade/Board.nsf/Public,"Meade County, South Dakota",http://www.meadecounty.org/,"1300 Sherman Street Ste 222 | Sturgis, SD 57785",,,26,unsure
152,https://go.boarddocs.com/ks/kta/Board.nsf/Public,Kansas Turnpike Authority,http://www.ksturnpike.com/,"9401 E Kellogg | Wichita, KS 67207 | (316) 682-4537",,,152,unsure
165,https://go.boarddocs.com/il/vobil/Board.nsf/Public,"The Village of Bourbonnais, Illinois",http://www.villageofbourbonnais.com/,"600 Main St N.W. | Bourbonnais, IL 60914",,,165,unsure
269,https://go.boarddocs.com/il/naperville/Board.nsf/Public,Naperville Park District,https://www.napervilleparks.org/,"320. W. Jackson Ave. Naperville, IL 60540 | 630-848-5000",,,269,unsure
303,https://go.boarddocs.com/mn/mboa/Board.nsf/Public,Minnesota Board on Aging,http://www.mn.gov/board-on-aging,"540 Cedar Street St. Paul, MN 55155 | 1-800-882-6262",,,303,unsure
320,https://go.boarddocs.com/ca/phcd/Board.nsf/Public,Peninsula Health Care District,http://www.peninsulahealthcaredistrict.org,"1819 Trousdale Drive | Burlingame, CA 94010 | (650) 697-6900",,,320,unsure
609,https://go.boarddocs.com/il/ctpf/Board.nsf/Public,Public School Teachers' Pension and Retirement Fund of Chicago,http://www.ctpf.org,"425 S Financial Place, Suite 1400, Chicago, IL 60605 | 312-604-1400",,,609,unsure
620,https://go.boarddocs.com/mi/jackson/Board.nsf/Public,"Jackson County, Michigan",http://www.mijackson.org,"120 West Michigan | Jackson, MI 49201 | P: (517) 788-4335 | F: (517) 780-4755",,,620,unsure
653,https://go.boarddocs.com/fl/sefbhn/Board.nsf/Public,Southeast Florida Behavioral Health Network,http://sefbhn.org/,"8895 N Military Trail, Suite E-102 | Palm Beach Gardens, FL 33410 | (561) 203-2485",,,653,unsure
697,https://go.boarddocs.com/ok/plib/Board.nsf/Public,Pioneer Library System,http://www.pioneerlibrarysystem.org,"300 Norman Center Court | Norman, OK 73072 | 405.801.4500",,,697,unsure


In [8674]:
# the unsures all look like valid entities
# and their school_district field is empty
# we will classify them as school_district

df.loc[df["classification"]=="unsure", "classification"] = "school_district"

In [8675]:
# look at those with drops
df.loc[df["classification"]=="drop",:].sample(5)

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,classification
1455,https://go.boarddocs.com/fl/madcofl/Board.nsf/Public,"Preparing Students for Career, College and Community",http://www.madison.k12.fl.us/,"210 NE Duval Avenue Madison, Florida 32340 - Phone: 850-973-5022",,,1455,drop
2417,https://go.boarddocs.com/mi/pincon/Board.nsf/Public,Board Policies and Administrative Guidelines and Forms,http://www.pasd.org,,Pinconning Area School District,,2417,drop
1947,https://go.boarddocs.com/oh/nrthn/Board.nsf/Public,The Board of Education Policies,http://www.nlsd.k12.oh.us,,Northern Local School District,,1947,drop
570,https://go.boarddocs.com/mi/oasd/Board.nsf/Public,NEOLA policies,http://www.oasd.k12.mi.us,,Ontonagon Area School District,,570,drop
106,https://go.boarddocs.com/wi/edsd/Board.nsf/Public,Edgar Excellence,www.edgar.k12.wi.us,,Edgar School District 203 East Birch Street 715-352-2351,,106,drop


In [8676]:
# drop the ones classified as drops
df.loc[df["classification"]=="drop", "title_1"] = None
df.loc[df["classification"]=="drop", "classification"] = None

In [8677]:
# look at those classified as school_district
df.loc[df["classification"]=="school_district",:].sample(5)

Unnamed: 0,URL,title_1,home_website,address,school_district,phone,index,classification
2523,https://go.boarddocs.com/pa/sctc/Board.nsf/Public,Somerset County Technology Center,www.sctc.net,,,,2523,school_district
382,https://go.boarddocs.com/nj/spartaboe/Board.nsf/Public,Sparta Board of Education,https://www.sparta.org/,"18 Mohawk Avenue Sparta, NJ 07871 | 973-729-2155",,,382,school_district
3533,https://go.boarddocs.com/oh/bcmhb/Board.nsf/Public,Butler County Mental Health and Addiction Recovery Services Board,http://bcmhars.org/,"5963 Boymel Drive | Fairfield, OH 45014-5541 | p (513) 860-9240 | f (513) 860-9241",,,3533,school_district
203,https://go.boarddocs.com/wa/wssda/Board.nsf/Public,Washington State School Directors' Association,https://www.wssda.org/,"Mailing Address: P. O. Box 5248 • Lacey, Washington 98609 • 360-493-9231",,,203,school_district
3216,https://go.boarddocs.com/ca/santarosa/Board.nsf/Public,Sonoma County Junior College District,http://www.santarosa.edu/,,,,3216,school_district


In [8678]:
# move the others to school_district
df.loc[df["classification"]=="school_district", "school_district"] = df["title_1"]
df.loc[df["classification"]=="school_district", "title_1"] = None
df.loc[df["classification"]=="school_district", "classification"] = None

# assert no others
assert df[~df["classification"].isna()].shape[0] == 0
assert df[~df["title_1"].isna()].shape[0] == 0

In [8679]:
# delete these columns
df = df.drop(["classification","title_1"], axis=1)

In [8680]:
# we also try to get the phone number from the address
df['address_contains_phone_number'] = df['address'].apply(contains_phone_number)
df[df["address_contains_phone_number"]].sample(5)

Unnamed: 0,URL,home_website,address,school_district,phone,index,address_contains_phone_number
403,https://go.boarddocs.com/nm/cisd/Board.nsf/Public,http://www.cuba.k12.nm.us,"PO Box 70, Cuba, NM 87013 | (575) 289-3211",Cuba Independent School District,,403,True
2558,https://go.boarddocs.com/ca/banning/Board.nsf/Public,https://www.banning.k12.ca.us,"161 W Williams St. Banning, CA 92220 | (951) 922-0200",Banning Unified School District,,2558,True
2292,https://go.boarddocs.com/ca/scoe/Board.nsf/Public,https://www.scoe.net/events/,"Mailing Address: PO Box 269003, Sacramento, CA 95826-9003 | (916) 228-2500",Sacramento County Office of Education,,2292,True
1890,https://go.boarddocs.com/fl/wcsb/Board.nsf/Public,http://www.wakullaschooldistrict.org/,"69 Arran Road, P.O. Box 100 | Crawfordville, FL 32326 | 850.926.0065",Wakulla County School District,,1890,True
3342,https://go.boarddocs.com/fl/citrus/Board.nsf/Public,http://www.citrusschools.org,"1007 W Main St, Inverness, FL 34450 | 352-726-1931",Citrus County School Board,,3342,True


In [8681]:
df[df["address_contains_phone_number"]].shape

(2185, 7)

In [8682]:
# extract it
# but before that, check if conflicts
df[df["address_contains_phone_number"] & (~df["phone"].isna())].shape

(0, 7)

In [8683]:
# great! no conflicts
phone_pattern = re.compile(r'(\(?\b\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b)')
df["phone_extracted_from_address"] = df["address"].str.extract(phone_pattern)

In [8684]:
df[~df["phone_extracted_from_address"].isna()].sample(5)

Unnamed: 0,URL,home_website,address,school_district,phone,index,address_contains_phone_number,phone_extracted_from_address
2769,https://go.boarddocs.com/wa/othello/Board.nsf/Public,http://www.othelloschools.org,1025 South 1st Avenue ~ Othello WA 99344 | Phone 509-488-2659 | Fax 509-488-5876,Othello School District,,2769,True,509-488-2659
1400,https://go.boarddocs.com/oh/tolles/Board.nsf/Public,http://www.tollestech.com,"7877 US Highway 42 S, Plain City OH USA 43064 | 614-873-4666",Tolles Career & Technical Center,,1400,True,614-873-4666
1581,https://go.boarddocs.com/ca/smcsd/Board.nsf/Public,https://www.smcsd.org,"200 PHILLIPS DRIVE, SAUSALITO (MARIN CITY), CA 94965 | 415-332-3190",Sausalito Marin City School District,,1581,True,415-332-3190
2808,https://go.boarddocs.com/wi/ashland/Board.nsf/Public,http://www.ashland.k12.wi.us/,"District Office - 2000 Beaser Avenue; Ashland, WI; (715) 682-7080",School District of Ashland Board of Education,,2808,True,(715) 682-7080
2551,https://go.boarddocs.com/nj/mtls/Board.nsf/Public,http://www.mtlaurelschools.org/,"330 Mount Laurel Road | Mount Laurel, NJ 08054 | 856.235.3387",Mount Laurel Schools,,2551,True,856.235.3387


In [8685]:
# double check no clashes
df[(~df["phone_extracted_from_address"].isna()) & (~df["phone"].isna())]

Unnamed: 0,URL,home_website,address,school_district,phone,index,address_contains_phone_number,phone_extracted_from_address


In [8686]:
# write in
df.loc[~df["phone_extracted_from_address"].isna(), "phone"] = df["phone_extracted_from_address"]
df.loc[~df["phone_extracted_from_address"].isna(), "phone_extracted_from_address"] = None

In [8687]:
# let's clean it
# extract all numbers and make sure there are 10

df['extracted_phone_numbers'] = None
df['extracted_phone_numbers'] = df['phone'].astype(str).str.replace(r'[^\d]', '', regex=True)

In [8688]:
# check their length
df["phone_length"] = df["extracted_phone_numbers"].apply(lambda x: len(x))
df["phone_length"].value_counts()

phone_length
10    2216
0     1684
20       4
12       1
30       1
Name: count, dtype: int64

In [8689]:
# check those that are not 10 or 0
df.loc[(df["phone_length"]!=10) & (df["phone_length"]!=0)]

Unnamed: 0,URL,home_website,address,school_district,phone,index,address_contains_phone_number,phone_extracted_from_address,extracted_phone_numbers,phone_length
624,https://go.boarddocs.com/in/triton/Board.nsf/Public,http://www.triton.k12.in.us,,Triton School Corporation,Home of the Trojans/574-342-2255/www.triton.k12.in.us,624,False,,574342225512,12
1199,https://go.boarddocs.com/wi/sdathen/Board.nsf/Public,https://www.athens1.org,"School District of Athens, 601 West Limits Road, Athens, WI 54411",,Phone: 715-257-7511 Fax: 715-257-7502,1199,False,,71525775117152577502,20
2173,https://go.boarddocs.com/ks/usd230/Board.nsf/Public,http://www.usd230.org,"Spring Hill School District | 17640 W. 199th Street, Spring Hill, Kansas 66083",,Info: (913) 592-7272 | T: (913) 592-7200 | F: (913) 592-7270,2173,False,,913592727291359272009135927270,30
2538,https://go.boarddocs.com/wi/campsd/Board.nsf/Public,www.csd.k12.wi.us,"327 N. Fond du Lac Ave. Campbellsport, WI 53010",,(920) 533-8381 | Fax (920) 533 -5726,2538,False,,92053383819205335726,20
2586,https://go.boarddocs.com/oh/cevsdoh/Board.nsf/Public,www.carrollton.k12.oh.us,"Carrollton Exempted Village School District, - 205 Scio Road S.W., - Carrollton, OH 44615",,"Phone: 330-627-2181, - Fax: 330-627-2182",2586,False,,33062721813306272182,20
2811,https://go.boarddocs.com/id/nsd131/Board.nsf/Public,http://www.nsd131.org,"619 S. Canyon St | Nampa, ID 83686",,(208) 468-4600 Fax: (208) 468-4638,2811,False,,20846846002084684638,20


In [8690]:
# ok, we can just take the top 10 numbers
df["extracted_phone_numbers"] = df["extracted_phone_numbers"].apply(lambda x: x[:10] if x else None)

In [8691]:
# put them in a nice format
def format_phone_numbers(number):
    if not number:
        return None
    # Convert number to string for easy slicing
    num_str = str(number)
    if len(num_str) == 10:  # Ensure it's a 10-digit number
        formatted = f"({num_str[:3]}) {num_str[3:6]}-{num_str[6:]}"
        return formatted
    else:
        raise Exception("Invalid number")

df["phone"] = df["extracted_phone_numbers"].apply(format_phone_numbers)

In [8692]:
# clean up the columns
df = df.loc[:,["URL","home_website","address","school_district","phone"]]
df.sample(5)

Unnamed: 0,URL,home_website,address,school_district,phone
3626,https://go.boarddocs.com/pa/blwd/Board.nsf/Public,https://www.blwd.k12.pa.us,"300 Martin Street, Bellwood, PA 16617",Bellwood-Antis School District,
3315,https://go.boarddocs.com/oh/mansoh/Board.nsf/Public,http://www.tygerpride.com,"856 W Cook Road | Mansfield, OH 44907 | 419.525.6400",Mansfield City School District,(419) 525-6400
805,https://go.boarddocs.com/ny/phoenixcsd/Board.nsf/Public,https://www.phoenixcsd.org,"116 Volney Street Phoenix, NY 13135 | 315-695-1573",Phoenix Central School District,(315) 695-1573
38,https://go.boarddocs.com/mo/mcr1/Board.nsf/Public,http://www.macon.k12.mo.us/,"702 North Missouri  Macon, Missouri 63552  (660) 395-6164",Macon County R-1 Schools,(660) 395-6164
3633,https://go.boarddocs.com/pa/tunk/Board.nsf/Public,www.tasd.net,"41 Philadelphia Ave Tunkhannock, PA, 18657",Tunkhannock Area School District,


In [8693]:
# finally, add a column indicate if the row is likely an actual school_district
# here is Tom's quote
"""
However, please add a column to the spreadsheet noting whether the words “school”, “academy”, “district” or “education” appear in the “address” or “school_district” columns.   That way, we can go through later and weed out the other organizations.
"""

'\nHowever, please add a column to the spreadsheet noting whether the words “school”, “academy”, “district” or “education” appear in the “address” or “school_district” columns.   That way, we can go through later and weed out the other organizations.\n'

In [8694]:
likely_school_district_pattern = r"school|academy|district|education"
df["contain_school_district_keywords"] = df["address"].str.contains(likely_school_district_pattern, case=False, regex=True, na=False) | df["school_district"].str.contains(likely_school_district_pattern, case=False, regex=True, na=False) 

In [8695]:
import validators

df["valid_home_website"] = df["home_website"].apply(lambda x: validators.url(x) == True or validators.url("http://" + str(x)) == True)
df["valid_home_website"].value_counts()

valid_home_website
True     3792
False     114
Name: count, dtype: int64

In [8696]:
# non-null but invalid home website
weird_websites = df[(~df["valid_home_website"]) & (~df["home_website"].isna())]
weird_websites.shape

(61, 7)

In [8697]:
weird_websites

Unnamed: 0,URL,home_website,address,school_district,phone,contain_school_district_keywords,valid_home_website
118,https://go.boarddocs.com/nj/pctvs/Board.nsf/Public,https://go.boarddocs.com/nj/pctvs/Board.nsf/Private?open&login#,"45 Reinhardt Road, Wayne, NJ 07470",Passaic County Technical-Vocational Institute,,False,False
173,https://go.boarddocs.com/in/wesdel/Board.nsf/Public,http://www.wes-del.k12.in us,,Wes-Del Community Schools,,True,False
199,https://go.boarddocs.com/pa/bthl/Board.nsf/Public,http://www.bpsd.org,"301 Church Road, Bethel Park, PA 15102",Bethel Park School District,,True,False
210,https://go.boarddocs.com/ny/elmont/Board.nsf/Public,http://www.elmontschools.org/,"135 Elmont Road | Elmont, NY 11003-1635 | 516-326-5500 | f 516-326-5574",Elmont Union Free School District,(516) 326-5500,True,False
301,https://go.boarddocs.com/wi/lakelanduhs/Board.nsf/Public,https://www.boarddocs.com/wi/lakelanduhs/ Board.nsf/Public?open&id=policies,,Lakeland Union High School District,,True,False
...,...,...,...,...,...,...,...
3679,https://go.boarddocs.com/pa/iu08/Board.nsf/Public,http://www.iu08.org,,Appalachia Intermediate Unit 8,,False,False
3741,https://go.boarddocs.com/mi/wpcs/Board.nsf/Public,http:/www.wpcschools.org,,White Pigeon Community Schools,,True,False
3832,https://go.boarddocs.com/pa/fleb/Board.nsf/Public,http://www.fortleboeuf.net,,Fort LeBoeuf School District,,True,False
3876,https://go.boarddocs.com/pa/avnw/Board.nsf/Public,http://www.avonworth.k12.pa.us/,"258 Josephs Lane Pittsburgh, PA 15237",Avonworth School District,,True,False


In [8698]:
# replace \\
idx = (~df["valid_home_website"]) & (~df["home_website"].isna())
df.loc[idx,"home_website"] = df.loc[:,"home_website"].str.replace("\\","/")

In [8699]:
# non-null but invalid home website
df["valid_home_website"] = df["home_website"].apply(lambda x: validators.url(x) == True or validators.url("http://" + str(x)) == True)
df[(~df["valid_home_website"]) & (~df["home_website"].isna())]

Unnamed: 0,URL,home_website,address,school_district,phone,contain_school_district_keywords,valid_home_website
118,https://go.boarddocs.com/nj/pctvs/Board.nsf/Public,https://go.boarddocs.com/nj/pctvs/Board.nsf/Private?open&login#,"45 Reinhardt Road, Wayne, NJ 07470",Passaic County Technical-Vocational Institute,,False,False
173,https://go.boarddocs.com/in/wesdel/Board.nsf/Public,http://www.wes-del.k12.in us,,Wes-Del Community Schools,,True,False
199,https://go.boarddocs.com/pa/bthl/Board.nsf/Public,http://www.bpsd.org,"301 Church Road, Bethel Park, PA 15102",Bethel Park School District,,True,False
210,https://go.boarddocs.com/ny/elmont/Board.nsf/Public,http://www.elmontschools.org/,"135 Elmont Road | Elmont, NY 11003-1635 | 516-326-5500 | f 516-326-5574",Elmont Union Free School District,(516) 326-5500,True,False
301,https://go.boarddocs.com/wi/lakelanduhs/Board.nsf/Public,https://www.boarddocs.com/wi/lakelanduhs/ Board.nsf/Public?open&id=policies,,Lakeland Union High School District,,True,False
434,https://go.boarddocs.com/wi/ncraw/Board.nsf/Public,www.northcrawford.com,"47050 County Road X | Soldiers Grove, WI 54655 | (608) 735-4318",North Crawford School District,(608) 735-4318,True,False
473,https://go.boarddocs.com/pa/bldw/Board.nsf/Public,http://www.bwschools.net/,,Baldwin-Whitehall School District,,True,False
569,https://go.boarddocs.com/wi/fond/Board.nsf/Public,http:/www.fonddulac.k12.wi.us,"72 West Ninth Street, Fond du Lac, WI 54935 (920) 906-6500",Fond du Lac School District,(920) 906-6500,True,False
611,https://go.boarddocs.com/pa/bgbf/Board.nsf/Public,http://www.tigerweb.org/,,Big Beaver Falls School District,,True,False
639,https://go.boarddocs.com/oh/young/Board.nsf/Public,http://www.ycsd.org,,Youngstown City Schools District,,True,False


In [8700]:
# fix the typos
pattern = r"https?:?/?/(?=[A-Za-z])"
idx = (~df["valid_home_website"]) & (~df["home_website"].isna())
df.loc[idx,"home_website"] = df.loc[:,"home_website"].str.replace(pattern,"http://", regex=True)

In [8701]:
# non-null but invalid home website
df["valid_home_website"] = df["home_website"].apply(lambda x: validators.url(x) == True or validators.url("http://" + str(x)) == True)
df[(~df["valid_home_website"]) & (~df["home_website"].isna())]

Unnamed: 0,URL,home_website,address,school_district,phone,contain_school_district_keywords,valid_home_website
118,https://go.boarddocs.com/nj/pctvs/Board.nsf/Public,http://go.boarddocs.com/nj/pctvs/Board.nsf/Private?open&login#,"45 Reinhardt Road, Wayne, NJ 07470",Passaic County Technical-Vocational Institute,,False,False
173,https://go.boarddocs.com/in/wesdel/Board.nsf/Public,http://www.wes-del.k12.in us,,Wes-Del Community Schools,,True,False
199,https://go.boarddocs.com/pa/bthl/Board.nsf/Public,http://www.bpsd.org,"301 Church Road, Bethel Park, PA 15102",Bethel Park School District,,True,False
210,https://go.boarddocs.com/ny/elmont/Board.nsf/Public,http://www.elmontschools.org/,"135 Elmont Road | Elmont, NY 11003-1635 | 516-326-5500 | f 516-326-5574",Elmont Union Free School District,(516) 326-5500,True,False
301,https://go.boarddocs.com/wi/lakelanduhs/Board.nsf/Public,http://www.boarddocs.com/wi/lakelanduhs/ Board.nsf/Public?open&id=policies,,Lakeland Union High School District,,True,False
434,https://go.boarddocs.com/wi/ncraw/Board.nsf/Public,www.northcrawford.com,"47050 County Road X | Soldiers Grove, WI 54655 | (608) 735-4318",North Crawford School District,(608) 735-4318,True,False
473,https://go.boarddocs.com/pa/bldw/Board.nsf/Public,http://www.bwschools.net/,,Baldwin-Whitehall School District,,True,False
611,https://go.boarddocs.com/pa/bgbf/Board.nsf/Public,http://www.tigerweb.org/,,Big Beaver Falls School District,,True,False
639,https://go.boarddocs.com/oh/young/Board.nsf/Public,http://www.ycsd.org,,Youngstown City Schools District,,True,False
673,https://go.boarddocs.com/pa/laur/Board.nsf/Public,http://www.laurel.k12.pa.us,,Laurel School District,,True,False


In [8702]:
# fix one
df.loc[df["URL"]=="https://go.boarddocs.com/in/wesdel/Board.nsf/Public", "home_website"] = "http://www.wes-del.k12.in.us"

# non-null but invalid home website
df["valid_home_website"] = df["home_website"].apply(lambda x: validators.url(x) == True or validators.url("http://" + str(x)) == True)
df[(~df["valid_home_website"]) & (~df["home_website"].isna())]

Unnamed: 0,URL,home_website,address,school_district,phone,contain_school_district_keywords,valid_home_website
118,https://go.boarddocs.com/nj/pctvs/Board.nsf/Public,http://go.boarddocs.com/nj/pctvs/Board.nsf/Private?open&login#,"45 Reinhardt Road, Wayne, NJ 07470",Passaic County Technical-Vocational Institute,,False,False
199,https://go.boarddocs.com/pa/bthl/Board.nsf/Public,http://www.bpsd.org,"301 Church Road, Bethel Park, PA 15102",Bethel Park School District,,True,False
210,https://go.boarddocs.com/ny/elmont/Board.nsf/Public,http://www.elmontschools.org/,"135 Elmont Road | Elmont, NY 11003-1635 | 516-326-5500 | f 516-326-5574",Elmont Union Free School District,(516) 326-5500,True,False
301,https://go.boarddocs.com/wi/lakelanduhs/Board.nsf/Public,http://www.boarddocs.com/wi/lakelanduhs/ Board.nsf/Public?open&id=policies,,Lakeland Union High School District,,True,False
434,https://go.boarddocs.com/wi/ncraw/Board.nsf/Public,www.northcrawford.com,"47050 County Road X | Soldiers Grove, WI 54655 | (608) 735-4318",North Crawford School District,(608) 735-4318,True,False
473,https://go.boarddocs.com/pa/bldw/Board.nsf/Public,http://www.bwschools.net/,,Baldwin-Whitehall School District,,True,False
611,https://go.boarddocs.com/pa/bgbf/Board.nsf/Public,http://www.tigerweb.org/,,Big Beaver Falls School District,,True,False
639,https://go.boarddocs.com/oh/young/Board.nsf/Public,http://www.ycsd.org,,Youngstown City Schools District,,True,False
673,https://go.boarddocs.com/pa/laur/Board.nsf/Public,http://www.laurel.k12.pa.us,,Laurel School District,,True,False
694,https://go.boarddocs.com/pa/eriesd/Board.nsf/Public,http://www.boarddocs.com/pa/eriesd/Board.nsf/Private?open&login,"148 West 21st Street | Erie, Pennsylvania 16502 | 814-874-6000","The School District of the City of Erie, Pennsylvania",(814) 874-6000,True,False


In [8703]:
# we don't care about the rest of the URL after the first singular /
# and we also don't care about spaces
# their spaces should be after / though

df["spaces_before_slash"] = df["home_website"].apply(lambda x: ' ' in str(x).replace("//","").split('/', 1)[0] if x else False)
df["spaces_before_slash"].value_counts()

spaces_before_slash
False    3884
True       22
Name: count, dtype: int64

In [8704]:
# chop off the rest of the URL if they have spaces
df.loc[:,"home_website"] = df["home_website"].apply(lambda x: str(x).split(' ', 1)[0] if x else None)

# also just keep the things before the slash
single_slash = r"(?<!/)/(?!/)"
df.loc[:,"home_website"] = df["home_website"].apply(lambda x: re.split(single_slash, str(x))[0] if x else None)

In [8705]:
# non-null but invalid home website
df["valid_home_website"] = df["home_website"].apply(lambda x: validators.url(x) == True or validators.url("http://" + str(x)) == True)
df[(~df["valid_home_website"]) & (~df["home_website"].isna())]

Unnamed: 0,URL,home_website,address,school_district,phone,contain_school_district_keywords,valid_home_website,spaces_before_slash
232,https://go.boarddocs.com/pa/camb/Board.nsf/Public,,,,,False,False,False
443,https://go.boarddocs.com/oh/oakhil/Board.nsf/Public,,,,,False,False,False
465,https://go.boarddocs.com/oh/warrenoh/Board.nsf/Public,,,,,False,False,False
468,https://go.boarddocs.com/pa/marp/Board.nsf/Public,,,Marple Newtown School District,,True,False,False
533,https://go.boarddocs.com/oh/meigs/Board.nsf/Public,,,,,False,False,False
552,https://go.boarddocs.com/pa/sola/Board.nsf/Public,,,,,False,False,False
590,https://go.boarddocs.com/in/lanesville/Board.nsf/Public,,,Lanesville Comunity School Corporation,,True,False,False
784,https://go.boarddocs.com/wi/sbschools/Board.nsf/Public,,,Stanley-Boyd Area Schools,,True,False,False
884,https://go.boarddocs.com/oh/vwcs/Board.nsf/Public,,,Van Wert City Schools,,True,False,False
996,https://go.boarddocs.com/ca/lmsv/Board.nsf/Public,,,,,False,False,False


In [8706]:
df.loc[df["home_website"] == "nan", "home_website"] = None

In [8707]:
# fix manual
df.loc[df["URL"]=="https://go.boarddocs.com/pa/ojrsd/Board.nsf/Public", "home_website"] = "http://www.ojrsd.com"
df.loc[df["URL"]=="https://go.boarddocs.com/mi/sutt/Board.nsf/Public", "home_website"] = "http://www.suttonsbayschools.com"

In [8708]:
# non-null but invalid home website
df["valid_home_website"] = df["home_website"].apply(lambda x: validators.url(x) == True or validators.url("http://" + str(x)) == True)
df[(~df["valid_home_website"]) & (~df["home_website"].isna())]

Unnamed: 0,URL,home_website,address,school_district,phone,contain_school_district_keywords,valid_home_website,spaces_before_slash


In [8709]:
# add http
df["valid_home_website"] = df["home_website"].apply(lambda x: validators.url(x) == True)
df.loc[~df["valid_home_website"], "home_website"] = df["home_website"] + "http://"

In [8710]:
# non-null but invalid home website
df["valid_URL"] = df["URL"].apply(lambda x: validators.url(x) == True)
df["valid_URL"].value_counts()

valid_URL
True     3905
False       1
Name: count, dtype: int64

In [8711]:
df[~df["valid_URL"]]

Unnamed: 0,URL,home_website,address,school_district,phone,contain_school_district_keywords,valid_home_website,spaces_before_slash,valid_URL
3553,,,,,,False,False,False,False


In [8712]:
idx = df[~df["valid_URL"]].index
df = df.drop(idx,axis=0)

In [8713]:
# non-null but invalid home website
df["valid_URL"] = df["URL"].apply(lambda x: validators.url(x) == True)
df["valid_URL"].value_counts()

valid_URL
True    3905
Name: count, dtype: int64

In [8714]:
# output the result
df.to_csv("release/deliverable_1.csv", index=False, columns=["URL","school_district","address","home_website","phone"])

In [8715]:
# get the stats
df["contain_school_district_keywords"].value_counts()

contain_school_district_keywords
True     3407
False     498
Name: count, dtype: int64

In [8716]:
# trim all whitespace before output
df["URL"] = df["URL"].str.strip()
df["school_district"] = df["school_district"].str.strip()
df["address"] = df["address"].str.strip()
df["home_website"] = df["home_website"].str.strip()
df["phone"] = df["phone"].str.strip()

In [8717]:
# get more stats
df.shape

(3905, 9)

In [8718]:
len(df["URL"].unique())

3905

In [8719]:
# home website
(~df["home_website"].isna()).value_counts()

home_website
True     3852
False      53
Name: count, dtype: int64

In [8720]:
# home website (in percentages)
(~df["home_website"].isna()).value_counts() / df.shape[0] * 100

home_website
True     98.642766
False     1.357234
Name: count, dtype: float64

In [8721]:
# phone numbers
(~df["phone"].isna()).value_counts()

phone
True     2222
False    1683
Name: count, dtype: int64

In [8722]:
# phone (in percentages)
(~df["phone"].isna()).value_counts() / df.shape[0] * 100

phone
True     56.901408
False    43.098592
Name: count, dtype: float64

In [8723]:
# phone or website
((~df["phone"].isna()) | (~df["home_website"].isna())).value_counts()

True     3857
False      48
Name: count, dtype: int64

In [8724]:
# phone or website (percentages)
((~df["phone"].isna()) | (~df["home_website"].isna())).value_counts()/ df.shape[0] * 100

True     98.770807
False     1.229193
Name: count, dtype: float64

In [8725]:
# school_district
(~df["school_district"].isna()).value_counts()

school_district
True     3641
False     264
Name: count, dtype: int64

In [8726]:
# school_district (in percentages)
(~df["school_district"].isna()).value_counts() / df.shape[0] * 100

school_district
True     93.239437
False     6.760563
Name: count, dtype: float64

In [8727]:
# address
(~df["address"].isna()).value_counts()

address
True     2919
False     986
Name: count, dtype: int64

In [8728]:
# address (in percentages)
(~df["address"].isna()).value_counts() / df.shape[0] * 100

address
True     74.75032
False    25.24968
Name: count, dtype: float64

In [8729]:
# get the stats
df["contain_school_district_keywords"].value_counts()

contain_school_district_keywords
True     3407
False     498
Name: count, dtype: int64

In [8730]:
# address (in percentages)
df["contain_school_district_keywords"].value_counts() / df.shape[0] * 100

contain_school_district_keywords
True     87.247119
False    12.752881
Name: count, dtype: float64