This notebook merges the NCES data with the BoardDocs URLs scrapped from deliverable 1.

Input
- `../deliverable1/release/deliverable_1.csv`
- `../nces2324.csv`

Output
-  `release/deliverable_2.csv`

In [707]:
import pandas as pd

boarddocs_df = pd.read_csv("../deliverable1/release/deliverable_1.csv")
nces_df = pd.read_csv("../nces2324.csv")

  nces_df = pd.read_csv("../nces2324.csv")


In [708]:

# select relevant columns
nces_df = nces_df[["LEA_NAME", "LEAID", "LSTREET1", "LSTREET2","LSTREET3","LCITY","LSTATE", "LZIP","PHONE","WEBSITE"]]
nces_df["LEAID"] = nces_df["LEAID"].astype(int)
nces_df["LZIP"] = nces_df["LZIP"].astype(int)

In [709]:
# match by phone
# then hopefully that can tell us what is up with the website
nces_df = nces_df.rename(columns={
    "PHONE": "phone"
})

In [710]:
# do some edits for phones
# check if all is in same format
phone_pattern = r"^\(\d{3}\)\d{3}\-\d{4}$"
nces_df["phone_good_pattern"] = nces_df["phone"].str.contains(phone_pattern, regex=True, na=False)

In [711]:
nces_df["phone_good_pattern"].value_counts()

phone_good_pattern
True    19637
Name: count, dtype: int64

In [712]:
# let's just add space
nces_df["phone"] = nces_df["phone"].str.replace(")",") ")

In [713]:
# all schools in nces have phone
sum(nces_df["phone"].isna())

0

In [714]:
# make sure both are unique
sum(nces_df["phone"].duplicated(keep=False))

1453

In [715]:
nces_df.shape

(19637, 11)

In [716]:
# seems like almost 10% of the phone numbers in nces is duplicated
nces_df[nces_df["phone"].duplicated(keep=False)].sort_values(by="phone").head()

Unnamed: 0,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone,WEBSITE,phone_good_pattern
11110,Bergen County Vocational Technical School Dist...,3401470,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcts.bergen.org,True
11109,Bergen County Special Services School District,3401450,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcss.bergen.org,True
11070,Hudson Arts and Science Charter School,3400787,131 Midland Ave,,,Kearny,NJ,7032,(201) 773-9140,http://www.hudsoncharter.org,True
11023,Bergen Arts and Science Charter School,3400715,200 MacArthur Ave,,,Garfield,NJ,7026,(201) 773-9140,http://www.bergencharter.org,True
3802,Regional School District 09,903780,654 Morehouse Road,,,Easton,CT,6612,(203) 261-2513,http://www.er9.org,True


In [717]:
# since the most coverage is by the website, let's do a merge there first
# our goal is to match as much boarddocs as possible
# so boarddocs_df is on the left
nces_df = nces_df.rename(columns={"WEBSITE":"home_website"})

In [718]:
nces_df["home_website"].isna().value_counts()

home_website
False    16957
True      2680
Name: count, dtype: int64

In [719]:
# before merge, let's clean
# remove https and www
url_prefix_pattern = r"https?://(www.)?"
nces_df["home_website"] = nces_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)

# also get the first stub
single_slash_pattern = r"[?<!/]/[?!/]"
nces_df["home_website"] = nces_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]

# also replace trailing /
trailing_slash_pattern = r"/$"
nces_df["home_website"] = nces_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)

In [720]:
# check if the websites are duplicated
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / nces_df.shape[0] * 100

7.170138004786882

In [721]:
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False))

1408

In [722]:
# check if the websites are duplicated
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / boarddocs_df.shape[0] * 100

0.4876796714579056

In [723]:
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False))

19

In [724]:
boarddocs_df.loc[(~boarddocs_df["home_website"].isna()) & (boarddocs_df["home_website"].duplicated(keep=False)),:].sort_values(by="home_website")

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone
322,https://go.boarddocs.com/nj/bergen/Board.nsf/P...,Bergen County Technical Schools,"540 Farview Ave. , Paramus , NJ 07652",bergen.org,
1781,https://go.boarddocs.com/nj/bergencss/Board.ns...,Bergen County Special Services,"540 Farview Avenue, Paramus, NJ 07652",bergen.org,
3552,https://go.boarddocs.com/wa/bethel/Board.nsf/P...,Bethel School District,"516 176th St E | Spanaway, WA 98387 | Phone: 2...",bethelsd.org,(253) 800-2010
3455,https://go.boarddocs.com/wa/pierce/Board.nsf/P...,Pierce County Skills Center,"16117 Canyon Rd. E Puyallup, WA 98375 Phone:...",bethelsd.org,(253) 800-4800
3768,https://go.boarddocs.com/mi/kcacad/Board.nsf/P...,Kalamazoo Covenant Academy,"400 W Crosstown Pkwy | Kalamazoo, MI 49001 | 2...",covenantacademies.org,(269) 888-2700
483,https://go.boarddocs.com/mi/mcacad/Board.nsf/P...,Muskegon Covenant Academy,"125 Catherine Avenue | Muskegon, MI 49442 | 23...",covenantacademies.org,(231) 720-3100
3128,https://go.boarddocs.com/mi/scacad/Board.nsf/P...,Saginaw Covenant Academy,"508 S. Washington Avenue | Saginaw, MI 48607 |...",covenantacademies.org,(989) 596-1100
2367,https://go.boarddocs.com/mi/engadine/Board.nsf...,Engadine Consolidated Schools,,eupschools.org,
1875,https://go.boarddocs.com/mi/macki/Board.nsf/Pu...,,,eupschools.org,
1620,https://go.boarddocs.com/ca/ecscn/Board.nsf/Pu...,Excelsior Charter School Corona-Norco,"1400 Fullerton Ave. Corona, CA 92879 | 951.547...",excelsior.com,(951) 547-7540


In [725]:
# these are quite the pain so let's ignore them for now

In [726]:
df = pd.merge(boarddocs_df[~boarddocs_df["home_website"].duplicated(keep=False)], nces_df[~nces_df["home_website"].isna()], how="left", on="home_website", suffixes=["_boarddocs", "_nces"])

In [683]:
# let's check how good is the match
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / boarddocs_df.shape[0] * 100

78.10574948665298

In [684]:
# ok we have matched 61% of them
# check those that weren't matched but has website

df[(~df["home_website"].isna()) & (df["LEAID"].isna())].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern
1073,https://go.boarddocs.com/pa/bocpa/Board.nsf/Pu...,Borough Of Chambersburg,"100 South 2nd Street Chambersburg, PA 17201 Ph...",chambersburgpa.gov,(717) 264-5151,,,,,,,,,,
3593,https://go.boarddocs.com/az/rcscw/Board.nsf/Pu...,Recreation Centers of Sun City West,19803 R.H. Johnson Blvd. (Bldg. G3)| Sun City ...,suncitywest.com,(623) 544-6115,,,,,,,,,,
1066,https://go.boarddocs.com/mi/bbdn/Board.nsf/Public,Big Bay de Noc School,,baydenoc.k12.mi.us,,,,,,,,,,,
3218,https://go.boarddocs.com/ut/spanishfork/Board....,"City of Spanish Fork, Utah","80 S. Main St. | Spanish Fork, UT 84660 | (8...",spanishfork.org,(801) 804-4530,,,,,,,,,,
219,https://go.boarddocs.com/oh/fairfieldul/Board....,Fairfield Union Local School District,"6417 Cincinnati-Zanesville Road NE, Lancaster,...",fairfield-union.k12.oh.us,(740) 536-7384,,,,,,,,,,


In [685]:
# let's take out the ones matched first
matched = df[~df["LEAID"].isna()]
boarddocs_df = boarddocs_df[~boarddocs_df["boarddocs_url"].isin(matched["boarddocs_url"])]

In [686]:
# sanity check the matches so far
# check whether the phone matches
# for those with both set
df = matched[(~matched["phone_boarddocs"].isna()) & (~matched["phone_nces"].isna()) ].copy()

In [687]:
df.shape

(1722, 15)

In [688]:
df.loc[:,"phones_agree"] = (df["phone_boarddocs"] == df["phone_nces"])
df["phones_agree"].value_counts()

phones_agree
True     1362
False     360
Name: count, dtype: int64

In [689]:
# check those that differs
df[~df["phones_agree"]].sample(10)

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,phones_agree
325,https://go.boarddocs.com/ny/gardencity/Board.n...,Garden City Union Free School District,"56 Cathedral Ave Garden City, NY 11530 | 516-...",gardencity.k12.ny.us,(516) 478-1000,GARDEN CITY UNION FREE SCHOOL DISTRICT,3611760.0,56 CATHEDRAL AVE,,,GARDEN CITY,NY,11530.0,(516) 478-1010,True,False
2724,https://go.boarddocs.com/mabe/fcps/Board.nsf/P...,Frederick County Public Schools,"191 South East Street | Frederick, MD 21701 | ...",fcps.org,(227) 203-3277,Frederick County Public Schools,2400330.0,191 South East Street,,,Frederick,MD,21701.0,(301) 644-5000,True,False
1716,https://go.boarddocs.com/ca/aspire/Board.nsf/P...,Aspire Public Schools,"1001 22nd Avenue Oakland, CA 94606 | (510) 434...",aspirepublicschools.org,(510) 434-5000,Aspire College Academy District,602112.0,8030 Atherton St.,,,Oakland,CA,94605.0,(510) 562-8030,True,False
3083,https://go.boarddocs.com/wi/mcfsd/Board.nsf/Pu...,School District of McFarland,"5101 Farwell Street McFarland, WI 53558 | 608-...",mcfarland.k12.wi.us,(608) 838-4550,McFarland School District,5508910.0,5101 Farwell St,,,McFarland,WI,53558.0,(608) 838-3169,True,False
1182,https://go.boarddocs.com/ny/nrcsd/Board.nsf/Pu...,North Rockland Central School District,"65 Chapel Street | Garnerville, NY 10923 | Pho...",nrcsd.org,(845) 942-3000,HAVERSTRAW-STONY POINT CSD (NORTH ROCKLAND),3614010.0,65 CHAPEL ST,,,GARNERVILLE,NY,10923.0,(845) 942-3002,True,False
1720,https://go.boarddocs.com/ca/aspire/Board.nsf/P...,Aspire Public Schools,"1001 22nd Avenue Oakland, CA 94606 | (510) 434...",aspirepublicschools.org,(510) 434-5000,Aspire Benjamin Holt College Preparatory Acade...,602376.0,3201 E. Morada Ln.,,,Stockton,CA,95212.0,(209) 955-1477,True,False
3618,https://go.boarddocs.com/oh/lowellville/Board....,Lowellville School District,"52 Rocket Place | Lowellville, OH 44436 | 330-...",lowellville.k12.oh.us,(330) 536-8426,Lowellville Local,3904833.0,52 Rocket Place,,,Lowellville,OH,44436.0,(330) 536-6318,True,False
1043,https://go.boarddocs.com/ks/usd253/Board.nsf/P...,Emporia Unified School District #253,"1700 W. 7th | P.O. Box 1008 | Emporia, KS 668...",usd253.org,(620) 341-2201,Flint Hills Special Ed. Cooperative,2000363.0,1700 W. 7th Avenue,,,Emporia,KS,66801.0,(620) 341-2225,True,False
762,https://go.boarddocs.com/co/hsd2/Board.nsf/Public,Harrison School District Two,"1060 Harrison Road | Colorado Springs, CO 8090...",hsd2.org,(719) 579-2000,El Paso 2 Harrison AU,800236.0,2883 South Circle Drive,,,Colorado Springs,CO,80906.0,(719) 579-3240,True,False
1501,https://go.boarddocs.com/wi/mjsd/Board.nsf/Public,Menasha Joint School District,"100 MAIN STREET • MENASHA, WI 54952 • P...",mjsd.k12.wi.us,(920) 967-1403,Menasha Joint School District,5509030.0,100 Main St,3rd Floor,,Menasha,WI,54952.0,(920) 967-1401,True,False


In [690]:
# it seems like both phone numbers are valid
# I will just keep both for now

In [691]:
# check the situation at boarddocs
sum(boarddocs_df[~boarddocs_df["phone"].isna()]["phone"].duplicated(keep=False))

2

In [692]:
boarddocs_df[(~boarddocs_df["phone"].isna()) & boarddocs_df["phone"].duplicated(keep=False)]

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone
3166,https://go.boarddocs.com/ca/empirescs/Board.ns...,Empire Springs Charter School,"27740 Jefferson Ave | Temecula, CA 92590 | (95...",springscharterschools.org,(951) 252-8800
3645,https://go.boarddocs.com/ca/harborscs/Board.ns...,Harbor Springs Charter School,"27740 Jefferson Avenue | Temecula, CA 92590 | ...",springscharterschools.org,(951) 252-8800


In [693]:
# again, ignore this for now

In [694]:
# merge it
df = pd.merge(boarddocs_df[~boarddocs_df["phone"].duplicated(keep=False)], nces_df, how="left", on="phone", suffixes=["_boarddocs", "_nces"])

In [695]:
# check matches
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / df.shape[0] * 100

62.19512195121951

In [696]:
df[~df["LEAID"].isna()].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,phone_good_pattern
525,https://go.boarddocs.com/oh/westfall/Board.nsf...,Westfall Local School District,"19463 Pherson Pike | Williamsport, OH 43164 | ...",westfall.k12.oh.us,(740) 986-3671,Westfall Local,3904910.0,19463 Pherson Pike,,,Williamsport,OH,43164.0,westfallschools.com,True
308,https://go.boarddocs.com/ca/wuhsd/Board.nsf/Pu...,Whittier Union High School District,"9401 S. Painter Avenue | Whittier, California ...",wuhsd.org,(562) 698-8121,Whittier Union High,642480.0,9401 South Painter Ave.,,,Whittier,CA,90605.0,wuhsd.k12.ca.us,True
155,https://go.boarddocs.com/la/ipsb/Board.nsf/Public,Iberville Parish Schools,"58060 Plaquemine Street | Plaquemine, LA 70764...",ipsb.net,(225) 687-4341,Iberville Parish,2200750.0,58030 Plaquemine Street,,,Plaquemine,LA,70764.0,,True
364,https://go.boarddocs.com/la/lpsb/Board.nsf/Public,Lafourche Parish Public Schools,"805 E 7th Street, Thibodaux, LA 70301 | p (985...",lpsd.k12.la.us,(985) 446-5631,Lafourche Parish,2200900.0,805 East Seventh Street,,,Thibodaux,LA,70301.0,,True
476,https://go.boarddocs.com/pa/exeter/Board.nsf/P...,Exeter Township School District,"200 Elm Street | Reading, PA 19606 | 610-779-0...",exeter.k12.pa.us,(610) 779-0700,Exeter Township SD,4209480.0,200 Elm St,,,Reading,PA,19606.0,exetersd.org,True


In [697]:
# add them to the match doc
df = df.drop("phone_good_pattern", axis=1)
matched = pd.concat([matched, df], axis=0)

In [698]:
# check if they have the same websites
df = df[(~df["home_website_boarddocs"].isna()) & (~df["home_website_nces"].isna()) ].copy()
df.shape

(216, 14)

In [699]:
df.loc[:,"website_agree"] = (df["home_website_boarddocs"] == df["home_website_nces"])
df["website_agree"].value_counts()

website_agree
False    213
True       3
Name: count, dtype: int64

In [700]:
df[~df["website_agree"]].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,website_agree
452,https://go.boarddocs.com/oh/hlsdoh/Board.nsf/P...,Hamilton Local School District,"775 Rathmell Road | Columbus, Ohio 43207 | (61...",hamiltonrangers.org,(614) 491-8044,Hamilton Local,3904695.0,775 Rathmell Rd,,,Columbus,OH,43207.0,hamilton-local.k12.oh.us,False
296,https://go.boarddocs.com/ny/nisk/Board.nsf/Public,Niskayuna Central School District,"1239 Van Antwerp Road | Niskayuna, NY 12309 | ...",niskayunaschools.org,(518) 377-4666,NISKAYUNA CENTRAL SCHOOL DISTRICT,3620880.0,1239 VAN ANTWERP RD,,,SCHENECTADY,NY,12309.0,niskyschools.org,False
194,https://go.boarddocs.com/oh/bcsdoh/Board.nsf/P...,Brunswick City School District,"3643 Center Road | Brunswick, Ohio 44212 | P...",brunswickschools.org,(330) 225-7731,Brunswick City,3904366.0,3643 Center Rd,,,Brunswick,OH,44212.0,bcsoh.org,False
247,https://go.boarddocs.com/oh/swissohio/Board.ns...,,"304 Mill Street Woodsfield, OH 43793",swissohio.k12.oh.us,(740) 472-5801,Switzerland of Ohio Local,3904865.0,304 Mill St,,,Woodsfield,OH,43793.0,sk12.org,False
428,https://go.boarddocs.com/pa/jean/Board.nsf/Public,Jeannette City School District,"1000 Lowry Avenue, Jeannette, PA 15644 | 724-5...",jeannetteschooldistrict.org,(724) 523-5497,Jeannette City SD,4212330.0,1000 Lowry Ave,,,Jeannette,PA,15644.0,jeannette.k12.pa.us,False


In [701]:
matched.head()

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,home_website_boarddocs,phone,home_website_nces
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,St. Joseph Public Schools,,sjschools.org,,St. Joseph Public Schools,2632850.0,2580 S CLEVELAND AVE,,,SAINT JOSEPH,MI,49085.0,(269) 926-3100,True,,,
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,,,calsd.org,,California Area SD,4204710.0,40 Trojan Way,,,Coal Center,PA,15423.0,(724) 785-5800,True,,,
2,https://go.boarddocs.com/oh/mapleheights/Board...,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 2...",mapleschools.com,(216) 587-6100,Maple Heights City,3904430.0,5740 Lawn Ave,,,Maple Heights,OH,44137.0,(216) 587-6100,True,,,
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,Southern Huntingdon County School District,,shcsd.org,,Southern Huntingdon County SD,4222320.0,10339 Pogue Road,,,Three Springs,PA,17264.0,(814) 447-5529,True,,,
5,https://go.boarddocs.com/de/sussexvt/Board.nsf...,Sussex Technical School District,17099 County Seat Hwy | Georgetown DE 19947 | ...,sussexvt.org,(302) 856-0961,Sussex Technical School District,1001680.0,17137 County Seat Highway,,,Georgetown,DE,19947.0,(302) 856-2541,True,,,


In [702]:
matched.shape

(3617, 18)

In [731]:
# create a sample for Tom first
matched = matched.rename(columns={
    "school_district": "school_district_from_boarddocs",
    "address": "address_from_boarddocs",
    "home_website": "home_website_from_boarddocs",
    "phone_nces": "phone_from_nces"
})

matched["LEAID"] = matched["LEAID"].astype("Int64")

matched.to_csv("release/sample-deliverable-2.csv", index=False, columns=["LEAID", "boarddocs_url", "school_district_from_boarddocs", "LEA_NAME", "address_from_boarddocs", "home_website_from_boarddocs", "phone_from_nces"])