This notebook merges the NCES data with the BoardDocs URLs scrapped from deliverable 1.

Input
- `../deliverable1/release/deliverable_1.csv`
- `../nces2324.csv`

Output
-  `release/deliverable_2.csv`

In [199]:
import pandas as pd

boarddocs_df = pd.read_csv("../deliverable1/release/deliverable_1.csv")
nces_df = pd.read_csv("../nces2324.csv")

  nces_df = pd.read_csv("../nces2324.csv")


In [200]:

# select relevant columns
nces_df = nces_df[["LEA_NAME", "LEAID", "LSTREET1", "LSTREET2","LSTREET3","LCITY","LSTATE", "LZIP","PHONE","WEBSITE"]]
nces_df["LEAID"] = nces_df["LEAID"].astype(int)
nces_df["LZIP"] = nces_df["LZIP"].astype(int)

In [201]:
# match by phone
# then hopefully that can tell us what is up with the website
nces_df = nces_df.rename(columns={
    "PHONE": "phone"
})

In [202]:
# do some edits for phones
# check if all is in same format
phone_pattern = r"^\(\d{3}\)\d{3}\-\d{4}$"
nces_df["phone_good_pattern"] = nces_df["phone"].str.contains(phone_pattern, regex=True, na=False)

In [203]:
nces_df["phone_good_pattern"].value_counts()

phone_good_pattern
True    19637
Name: count, dtype: int64

In [204]:
# let's just add space
nces_df["phone"] = nces_df["phone"].str.replace(")",") ")

In [205]:
# all schools in nces have phone
sum(nces_df["phone"].isna())

0

In [206]:
# make sure both are unique
sum(nces_df["phone"].duplicated(keep=False))

1453

In [207]:
nces_df.shape

(19637, 11)

In [208]:
# seems like almost 10% of the phone numbers in nces is duplicated
nces_df[nces_df["phone"].duplicated(keep=False)].sort_values(by="phone").head()

Unnamed: 0,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone,WEBSITE,phone_good_pattern
11110,Bergen County Vocational Technical School Dist...,3401470,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcts.bergen.org,True
11109,Bergen County Special Services School District,3401450,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcss.bergen.org,True
11070,Hudson Arts and Science Charter School,3400787,131 Midland Ave,,,Kearny,NJ,7032,(201) 773-9140,http://www.hudsoncharter.org,True
11023,Bergen Arts and Science Charter School,3400715,200 MacArthur Ave,,,Garfield,NJ,7026,(201) 773-9140,http://www.bergencharter.org,True
3802,Regional School District 09,903780,654 Morehouse Road,,,Easton,CT,6612,(203) 261-2513,http://www.er9.org,True


In [209]:
# since the most coverage is by the website, let's do a merge there first
# our goal is to match as much boarddocs as possible
# so boarddocs_df is on the left
nces_df = nces_df.rename(columns={"WEBSITE":"home_website"})

In [210]:
nces_df["home_website"].isna().value_counts()

home_website
False    16957
True      2680
Name: count, dtype: int64

In [211]:
# before merge, let's clean
# remove https and www
url_prefix_pattern = r"https?://(www.)?"
nces_df["home_website"] = nces_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)

# also get the first stub
single_slash_pattern = r"[?<!/]/[?!/]"
nces_df["home_website"] = nces_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]

# also replace trailing /
trailing_slash_pattern = r"/$"
nces_df["home_website"] = nces_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)

In [212]:
# check if the websites are duplicated
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / nces_df.shape[0] * 100

7.170138004786882

In [213]:
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False))

1408

In [214]:
# check if the websites are duplicated
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / boarddocs_df.shape[0] * 100

0.4876796714579056

In [215]:
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False))

19

In [216]:
boarddocs_df.loc[(~boarddocs_df["home_website"].isna()) & (boarddocs_df["home_website"].duplicated(keep=False)),:].sort_values(by="home_website")

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone
322,https://go.boarddocs.com/nj/bergen/Board.nsf/P...,Bergen County Technical Schools,"540 Farview Ave. , Paramus , NJ 07652",bergen.org,
1781,https://go.boarddocs.com/nj/bergencss/Board.ns...,Bergen County Special Services,"540 Farview Avenue, Paramus, NJ 07652",bergen.org,
3552,https://go.boarddocs.com/wa/bethel/Board.nsf/P...,Bethel School District,"516 176th St E | Spanaway, WA 98387 | Phone: 2...",bethelsd.org,(253) 800-2010
3455,https://go.boarddocs.com/wa/pierce/Board.nsf/P...,Pierce County Skills Center,"16117 Canyon Rd. E Puyallup, WA 98375 Phone:...",bethelsd.org,(253) 800-4800
3768,https://go.boarddocs.com/mi/kcacad/Board.nsf/P...,Kalamazoo Covenant Academy,"400 W Crosstown Pkwy | Kalamazoo, MI 49001 | 2...",covenantacademies.org,(269) 888-2700
483,https://go.boarddocs.com/mi/mcacad/Board.nsf/P...,Muskegon Covenant Academy,"125 Catherine Avenue | Muskegon, MI 49442 | 23...",covenantacademies.org,(231) 720-3100
3128,https://go.boarddocs.com/mi/scacad/Board.nsf/P...,Saginaw Covenant Academy,"508 S. Washington Avenue | Saginaw, MI 48607 |...",covenantacademies.org,(989) 596-1100
2367,https://go.boarddocs.com/mi/engadine/Board.nsf...,Engadine Consolidated Schools,,eupschools.org,
1875,https://go.boarddocs.com/mi/macki/Board.nsf/Pu...,,,eupschools.org,
1620,https://go.boarddocs.com/ca/ecscn/Board.nsf/Pu...,Excelsior Charter School Corona-Norco,"1400 Fullerton Ave. Corona, CA 92879 | 951.547...",excelsior.com,(951) 547-7540


In [217]:
# these are quite the pain so let's ignore them for now

In [218]:
df = pd.merge(boarddocs_df[~boarddocs_df["home_website"].duplicated(keep=False)], nces_df[~nces_df["home_website"].isna()], how="left", on="home_website", suffixes=["_boarddocs", "_nces"])

In [219]:
# let's check how good is the match
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / boarddocs_df.shape[0] * 100

78.10574948665298

In [220]:
# ok we have matched 61% of them
# check those that weren't matched but has website

df[(~df["home_website"].isna()) & (df["LEAID"].isna())].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern
3345,https://go.boarddocs.com/ca/scccd/Board.nsf/Pu...,State Center Community College District,1171 Fulton Street Fresno CA 93721 | (559) 243...,scccd.edu,(559) 243-7100,,,,,,,,,,
1424,https://go.boarddocs.com/oh/jclocal/Board.nsf/...,Jackson Center Local Schools,"204 S. Linden Street | Jackson Center, OH 4533...",jctigers.org,(937) 596-6053,,,,,,,,,,
11,https://go.boarddocs.com/pa/stlh/Board.nsf/Public,Steelton-Highspire School District,,shsd.k12.pa.us,,,,,,,,,,,
1997,https://go.boarddocs.com/oh/bedford/Board.nsf/...,Bedford City School District,"475 Northfield Road | Bedford, OH 44146 | 440-...",bedford.k12.oh.us,(440) 439-1500,,,,,,,,,,
3647,https://go.boarddocs.com/mi/rrs/Board.nsf/Public,School District of the City of River Rouge,,riverrougeschools.org,,,,,,,,,,,


In [221]:
# let's take out the ones matched first
matched = df[~df["LEAID"].isna()]
boarddocs_df = boarddocs_df[~boarddocs_df["boarddocs_url"].isin(matched["boarddocs_url"])]

In [222]:
# sanity check the matches so far
# check whether the phone matches
# for those with both set
df = matched[(~matched["phone_boarddocs"].isna()) & (~matched["phone_nces"].isna()) ].copy()

In [223]:
df.shape

(1722, 15)

In [224]:
df.loc[:,"phones_agree"] = (df["phone_boarddocs"] == df["phone_nces"])
df["phones_agree"].value_counts()

phones_agree
True     1362
False     360
Name: count, dtype: int64

In [225]:
# check those that differs
df[~df["phones_agree"]].sample(10)

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,phones_agree
2578,https://go.boarddocs.com/ny/ouboces/Board.nsf/...,Orange-Ulster BOCES,"53 Gibson Road | Goshen, NY 10924 | (845) 291-...",ouboces.org,(845) 291-0100,ORANGE-ULSTER BOCES,3680620.0,53 GIBSON RD,,,GOSHEN,NY,10924.0,(845) 291-0145,True,False
1822,https://go.boarddocs.com/oh/midview/Board.nsf/...,Midview Local Schools,"13050 Durkee Rd Grafton, OH 44044 | 440-748-5353",midviewk12.org,(440) 748-5353,Midview Local,3904817.0,13050 Durkee Rd,,,Grafton,OH,44044.0,(877) 644-6338,True,False
3095,https://go.boarddocs.com/ny/rufsd/Board.nsf/Pu...,Roosevelt Union Free School District,"240 Denton Place | Roosevelt, NY 11575 | (516)...",rooseveltufsd.org,(516) 345-7005,ROOSEVELT UNION FREE SCHOOL DISTRICT,3624990.0,240 DENTON PL,,,ROOSEVELT,NY,11575.0,(516) 345-7001,True,False
1706,https://go.boarddocs.com/ca/aspire/Board.nsf/P...,Aspire Public Schools,"1001 22nd Avenue Oakland, CA 94606 | (510) 434...",aspirepublicschools.org,(510) 434-5000,Aspire Centennial College Preparatory Academy ...,601553.0,2079 Saturn Ave.,,,Huntington Park,CA,90255.0,(323) 826-9616,True,False
2395,https://go.boarddocs.com/ny/iufsd/Board.nsf/Pu...,Irvington Union Free School District,"6 Dows Lane | Irvington, NY 10533 | Ph: 91...",irvingtonschools.org,(914) 591-8500,IRVINGTON UNION FREE SCHOOL DISTRICT,3615450.0,6 DOWS LN,,,IRVINGTON,NY,10533.0,(914) 591-8501,True,False
397,https://go.boarddocs.com/ny/mufsd/Board.nsf/Pu...,Mamaroneck Union Free School District,"1000 W Boston Post Rd. | Mamaroneck, NY 10543...",mamkschools.org,(914) 220-3000,MAMARONECK UNION FREE SCHOOL DISTRICT,3618240.0,1000 W BOSTON POST RD,,,MAMARONECK,NY,10543.0,(914) 220-3005,True,False
3693,https://go.boarddocs.com/ny/sachem/Board.nsf/P...,Sachem Central School District,"51 School Street Lake Ronkonkoma, New York 117...",sachem.edu,(631) 471-1300,SACHEM CENTRAL SCHOOL DISTRICT,3625350.0,51 SCHOOL ST,,,LAKE RONKONKOMA,NY,11779.0,(631) 471-1336,True,False
3851,https://go.boarddocs.com/va/fcps/Board.nsf/Public,Fauquier County Public Schools VA,"320 HOSPITAL DRIVE, SUITE 40 WARRENTON, VA 201...",fcps1.org,(540) 422-7005,Fauquier County Public Schools,5101320.0,320 Hospital Drive,Suite 40,,Warrenton,VA,20186.0,(540) 422-7000,True,False
684,https://go.boarddocs.com/ny/frewsburg/Board.ns...,Frewsburg Central School District NY,"26 Institute Street, Frewsburg, NY 14738 | 716...",frewsburgcsd.org,(716) 569-7000,FREWSBURG CENTRAL SCHOOL DISTRICT,3611610.0,26 INSTITUTE ST,,,FREWSBURG,NY,14738.0,(716) 569-7041,True,False
2816,https://go.boarddocs.com/ak/yksd/Board.nsf/Public,Yukon-Koyukuk School District,"4762 Old Airport Way Fairbanks, AK 99709 | 907...",yksd.com,(907) 374-9400,Yukon-Koyukuk School District,200862.0,4762 Old Airport Way,,,Fairbanks,AK,99709.0,(907) 374-9416,True,False


In [226]:
# it seems like both phone numbers are valid
# I will just keep both for now

In [227]:
# check the situation at boarddocs
sum(boarddocs_df[~boarddocs_df["phone"].isna()]["phone"].duplicated(keep=False))

2

In [228]:
boarddocs_df[(~boarddocs_df["phone"].isna()) & boarddocs_df["phone"].duplicated(keep=False)]

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone
3166,https://go.boarddocs.com/ca/empirescs/Board.ns...,Empire Springs Charter School,"27740 Jefferson Ave | Temecula, CA 92590 | (95...",springscharterschools.org,(951) 252-8800
3645,https://go.boarddocs.com/ca/harborscs/Board.ns...,Harbor Springs Charter School,"27740 Jefferson Avenue | Temecula, CA 92590 | ...",springscharterschools.org,(951) 252-8800


In [229]:
# again, ignore this for now

In [230]:
# merge it
df = pd.merge(boarddocs_df[~boarddocs_df["phone"].duplicated(keep=False)], nces_df, how="left", on="phone", suffixes=["_boarddocs", "_nces"])

In [231]:
# check matches
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / df.shape[0] * 100

62.19512195121951

In [232]:
df[~df["LEAID"].isna()].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,phone_good_pattern
208,https://go.boarddocs.com/in/lakerid/Board.nsf/...,,Lake Ridge Schools | 6111 West Ridge Road | Ga...,lakeridgeschools.net,(219) 838-1819,Lake Ridge New Tech Schools,1805460.0,6111 W Ridge Rd,,,Gary,IN,46408.0,lakeridge.k12.in.us,True
566,https://go.boarddocs.com/in/ccsin/Board.nsf/Pu...,Carmel Clay Schools,"5201 E. Main Street Carmel, IN 46033 | (317) 8...",.ccs.k12.in.us,(317) 844-9961,Carmel Clay Schools,1801200.0,5201 E Main St,,,Carmel,IN,46033.0,ccs.k12.in.us,True
417,https://go.boarddocs.com/oh/bellairelocal/Boar...,Bellaire Local Schools,"340 34th St. | Bellaire, OH 43906 | 740-676-1826",bellairesd.org,(740) 676-1826,Bellaire Local,3904357.0,340 34th St,,,Bellaire,OH,43906.0,bellaire.k12.oh.us,True
513,https://go.boarddocs.com/in/fremon/Board.nsf/P...,,Fremont Community Schools | 1100 West Toledo S...,fcs.k12.in.us,(260) 495-5005,Fremont Community Schools,1803780.0,1100 W Toledo St,,,Fremont,IN,46737.0,fremontschoolsin.com,True
77,https://go.boarddocs.com/wa/psd/Board.nsf/Public,Puyallup School District,"302 Second Street SE | Puyallup, WA 98372 | ...",puyallupsd.org,(253) 841-1301,Puyallup School District,5306960.0,302 2ND ST SE,,,PUYALLUP,WA,98372.0,,True


In [233]:
# add them to the match doc
df = df.drop("phone_good_pattern", axis=1)
matched = pd.concat([matched, df], axis=0)

In [234]:
# check if they have the same websites
df = df[(~df["home_website_boarddocs"].isna()) & (~df["home_website_nces"].isna()) ].copy()
df.shape

(216, 14)

In [235]:
df.loc[:,"website_agree"] = (df["home_website_boarddocs"] == df["home_website_nces"])
df["website_agree"].value_counts()

website_agree
False    213
True       3
Name: count, dtype: int64

In [236]:
df[~df["website_agree"]].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,website_agree
352,https://go.boarddocs.com/pa/bssd/Board.nsf/Public,River Valley School District,"102 School Lane | Blairsville, PA 15717 | Ph: ...",rivervalleysd.org,(724) 459-5500,River Valley SD,4203750.0,102 School Lane,,,Blairsville,PA,15717.0,rvsdpa.org,False
324,https://go.boarddocs.com/vsba/nnps/Board.nsf/P...,Newport News Public Schools,"12465 Warwick Boulevard • Newport News, VA ...",nnadmin.sbo.nn.k12.va.us,(757) 591-4500,Newport News City Public Schools,5102640.0,12465 Warwick Blvd,,,Newport News,VA,23606.0,sbo.nn.k12.va.us,False
235,https://go.boarddocs.com/oh/eastliverpool/Boar...,East Liverpool City Schools,"810 W 8th Street | East Liverpool, OH 43920 | ...",elpotters.school,(330) 385-7132,East Liverpool City,3904391.0,810 W 8th St,,,East Liverpool,OH,43920.0,elcsd.k12.oh.us,False
208,https://go.boarddocs.com/in/lakerid/Board.nsf/...,,Lake Ridge Schools | 6111 West Ridge Road | Ga...,lakeridgeschools.net,(219) 838-1819,Lake Ridge New Tech Schools,1805460.0,6111 W Ridge Rd,,,Gary,IN,46408.0,lakeridge.k12.in.us,False
383,https://go.boarddocs.com/ks/usd262/Board.nsf/P...,USD 262 Valley Center School District,"143 S. Meridian • Valley Center, Kansas 67147 ...",usd262.com,(316) 755-7000,Valley Center Pub Sch,2012510.0,143 S. Meridian,,,Valley Center,KS,67147.0,usd262.net,False


In [237]:
matched.head()

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,home_website_boarddocs,phone,home_website_nces
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,St. Joseph Public Schools,,sjschools.org,,St. Joseph Public Schools,2632850.0,2580 S CLEVELAND AVE,,,SAINT JOSEPH,MI,49085.0,(269) 926-3100,True,,,
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,,,calsd.org,,California Area SD,4204710.0,40 Trojan Way,,,Coal Center,PA,15423.0,(724) 785-5800,True,,,
2,https://go.boarddocs.com/oh/mapleheights/Board...,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 2...",mapleschools.com,(216) 587-6100,Maple Heights City,3904430.0,5740 Lawn Ave,,,Maple Heights,OH,44137.0,(216) 587-6100,True,,,
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,Southern Huntingdon County School District,,shcsd.org,,Southern Huntingdon County SD,4222320.0,10339 Pogue Road,,,Three Springs,PA,17264.0,(814) 447-5529,True,,,
5,https://go.boarddocs.com/de/sussexvt/Board.nsf...,Sussex Technical School District,17099 County Seat Hwy | Georgetown DE 19947 | ...,sussexvt.org,(302) 856-0961,Sussex Technical School District,1001680.0,17137 County Seat Highway,,,Georgetown,DE,19947.0,(302) 856-2541,True,,,


In [238]:
matched.shape

(3617, 18)

In [239]:
# match percentage
matched.shape[0] / pd.read_csv("../deliverable1/release/deliverable_1.csv").shape[0] * 100

92.8388090349076

In [240]:
# now match by zipcode
# zipcode is 5 digits standalone
zipcode_pattern = r"\b(\d{5})\b"
boarddocs_df["zipcode_from_boarddocs"] = boarddocs_df["address"].str.extract(zipcode_pattern)

In [241]:
boarddocs_df[["address", "zipcode_from_boarddocs"]].sample(5)

Unnamed: 0,address,zipcode_from_boarddocs
3032,,
3513,"1414 East Cedar Street | Allentown, PA 18109",18109.0
1285,"101 Edgeway Road | Dover, PA 17315 | 717-292-3671",17315.0
2306,"625 S. Yearling Rd. | Whitehall, Ohio 43213 | ...",43213.0
453,"4502 N. Central Ave. | Phoenix, AZ 85012 | p 6...",85012.0


In [242]:
# get the remaining boarddocs and nces
boarddocs_df = boarddocs_df[~boarddocs_df["boarddocs_url"].isin(matched["boarddocs_url"])]
nces_df = nces_df[~nces_df["LEAID"].isin(matched["LEAID"])]

In [243]:
print(f"{boarddocs_df.shape=}")
print(f"{nces_df.shape=}")

boarddocs_df.shape=(387, 6)
nces_df.shape=(16240, 11)


In [244]:
# get unique zipcodes
boarddocs_unique_zipcode = boarddocs_df[~boarddocs_df["zipcode_from_boarddocs"].duplicated(keep=False)]
nces_unique_zipcode = nces_df[~nces_df["LZIP"].duplicated(keep=False)]

In [245]:
print(f"{boarddocs_unique_zipcode.shape=}")
print(f"{nces_unique_zipcode.shape=}")

boarddocs_unique_zipcode.shape=(138, 6)
nces_unique_zipcode.shape=(8446, 11)


In [246]:
nces_unique_zipcode = nces_unique_zipcode.rename(columns={
    "LZIP": "zipcode"
})

boarddocs_unique_zipcode = boarddocs_unique_zipcode.rename(columns={
    "zipcode_from_boarddocs": "zipcode"
})

In [247]:
nces_unique_zipcode["zipcode"] = nces_unique_zipcode["zipcode"].astype(int)
boarddocs_unique_zipcode["zipcode"] = boarddocs_unique_zipcode["zipcode"].astype(int)

In [248]:
# try matching
df = pd.merge(boarddocs_unique_zipcode, nces_unique_zipcode, how="inner", on="zipcode", suffixes=["_boarddocs", "_nces"])

In [249]:
df.shape

(73, 16)

In [250]:
df.sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone_boarddocs,zipcode,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,phone_nces,home_website_nces,phone_good_pattern
27,https://go.boarddocs.com/wi/elcho/Board.nsf/Pu...,Elcho School District,"N11268 Antigo St PO BOX 800 Elcho, WI 54428",elchoschool.org,,54428,Elcho School District,5504170,N11268 Antigo St,,,Elcho,WI,(715) 275-3225,elcho.k12.wi.us,True
53,https://go.boarddocs.com/wi/mhasd/Board.nsf/Pu...,Mount Horeb Area School District,"1304 East Lincoln Street Mount Horeb, WI 53572",mhasd.k12.wi.us,,53572,Mount Horeb Area School District,5509990,1304 E Lincoln St,,,Mount Horeb,WI,(608) 437-2400,mounthorebschools.org,True
12,https://go.boarddocs.com/pa/albg/Board.nsf/Public,Albert Gallatin School District,"2625 Morgantown Road, Uniontown, PA 15401",albertgallatin.k12.pa.us,,15401,Albert Gallatin Area SD,4202100,2625 Morgantown Rd,,,Uniontown,PA,(724) 564-7190,agasd.org,True
61,https://go.boarddocs.com/wv/pleasants/Board.ns...,Pleasants County Schools,"202 Fairview Dr. St. Marys, WV 26170",pleasantscountyschools.com,,26170,PLEASANTS COUNTY SCHOOLS,5401110,202 FAIRVIEW DR.,,,ST. MARYS,WV,(304) 684-2215,,True
43,https://go.boarddocs.com/nj/middlesex/Board.ns...,Middlesex Public School District,"300 John F Kennedy Dr, Middlesex, NJ 08846",mbschools.org,,8846,Middlesex Borough School District,3410050,300 John F. Kennedy Drive,,,Middlesex,NJ,(732) 317-6000,middlesex.k12.nj.us,True


In [251]:
# the matches are not exactly correctly
# discard

In [252]:
# now merge by name
SD_pattern = r"\b(SD)\b"
school_district_pattern = r"\b(school district)\b"
space_pattern = r"\s"

# remove SD
boarddocs_df["cleaned_name"] = boarddocs_df["school_district"].str.replace(SD_pattern, '', regex=True)
nces_df["cleaned_name"] = nces_df["LEA_NAME"].str.replace(SD_pattern, '', regex=True)

# lower case it
boarddocs_df["cleaned_name"] = boarddocs_df["cleaned_name"].str.lower()
nces_df["cleaned_name"] = nces_df["cleaned_name"].str.lower()

# remove school district
boarddocs_df["cleaned_name"] = boarddocs_df["cleaned_name"].str.replace(school_district_pattern, '', regex=True)
nces_df["cleaned_name"] = nces_df["cleaned_name"].str.replace(school_district_pattern, '', regex=True)

# remove spaces
boarddocs_df["cleaned_name"] = boarddocs_df["cleaned_name"].str.replace(space_pattern, '', regex=True)
nces_df["cleaned_name"] = nces_df["cleaned_name"].str.replace(space_pattern, '', regex=True)

In [253]:
# get unique ones
print(f"{boarddocs_df.shape=}")
print(f"{nces_df.shape=}")
boarddocs_df = boarddocs_df[~boarddocs_df["cleaned_name"].duplicated(keep=False)]
nces_df = nces_df[~nces_df["cleaned_name"].duplicated(keep=False)]

boarddocs_df.shape=(387, 7)
nces_df.shape=(16240, 12)


In [254]:
# try merging
print(f"{boarddocs_df.shape=}")
print(f"{nces_df.shape=}")
df = pd.merge(boarddocs_df, nces_df, how="inner", on="cleaned_name", suffixes=["_boarddocs", "_nces"])
print(f"{df.shape=}")

boarddocs_df.shape=(348, 7)
nces_df.shape=(15362, 12)
df.shape=(165, 18)


In [255]:
# check it
df.sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone_boarddocs,zipcode_from_boarddocs,cleaned_name,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,home_website_nces,phone_good_pattern
60,https://go.boarddocs.com/pa/cval/Board.nsf/Public,Central Valley School District,,centralvalleysd.net,,,centralvalley,Central Valley SD,4200824,160 Baker Road Extension,,,Monaca,PA,15061,(724) 775-5600,centralvalleysd.org/District,True
35,https://go.boarddocs.com/wi/oas/Board.nsf/Public,Oconomowoc Area School District,"915 E. Summit Avenue Oconomowoc, WI 53066",oasd.k12.wi.us,,53066.0,oconomowocarea,Oconomowoc Area School District,5510890,915 East Summit Avenue,,,Oconomowoc,WI,53066,(262) 560-2111,oasd.org,True
92,https://go.boarddocs.com/wi/mosi/Board.nsf/Public,Mosinee School District,,mosineeschools.com,,,mosinee,Mosinee School District,5509960,146001 State Highway 153,,,Mosinee,WI,54455,(715) 693-2530,mosineeschools.org,True
163,https://go.boarddocs.com/wi/iola/Board.nsf/Public,Iola-Scandinavia School District,,,,,iola-scandinavia,Iola-Scandinavia School District,5506840,450 Division St,,,Iola,WI,54945,(715) 445-2411,iola.k12.wi.us,True
18,https://go.boarddocs.com/pa/marp/Board.nsf/Public,Marple Newtown School District,,,,,marplenewtown,Marple Newtown SD,4214760,40 Media Line Rd Ste 204,,,Newtown Square,PA,19073,(610) 359-4200,mnsd.org,True


In [257]:
matched = matched.drop(['home_website', 'phone'], axis=1)

In [259]:
df = df[list(matched.columns)]

In [260]:
matched = pd.concat([matched, df])
matched.shape

(3782, 16)

In [261]:
# match percentage
matched.shape[0] / pd.read_csv("../deliverable1/release/deliverable_1.csv").shape[0] * 100

97.07392197125256

In [264]:
# output the deliverable
matched = matched.rename(columns={
    "school_district": "school_district_from_boarddocs",
    "address": "address_from_boarddocs",
    "home_website": "home_website_from_boarddocs",
    "phone_nces": "phone_from_nces"
})

matched["LEAID"] = matched["LEAID"].astype("Int64")

matched.to_csv("release/deliverable_2.csv", index=False, columns=["LEAID",  "LEA_NAME", "boarddocs_url", "school_district_from_boarddocs", "address_from_boarddocs"])