This notebook merges the NCES data with the BoardDocs URLs scrapped from deliverable 1.

Input
- `../deliverable1/release/deliverable_1.csv`
- `../nces2324.csv`

Output
-  `release/deliverable_2.csv`

In [519]:
import pandas as pd

boarddocs_df = pd.read_csv("../deliverable1/release/deliverable_1.csv")
nces_df = pd.read_csv("../nces2324.csv")

  nces_df = pd.read_csv("../nces2324.csv")


In [520]:
# select relevant columns
nces_df = nces_df[["LEA_NAME", "LEAID", "LSTREET1", "LSTREET2","LSTREET3","LCITY","LSTATE", "LZIP","PHONE","WEBSITE"]]

In [521]:
# match by phone
# then hopefully that can tell us what is up with the website
nces_df = nces_df.rename(columns={
    "PHONE": "phone"
})

In [522]:
# do some edits for phones
# check if all is in same format
phone_pattern = r"^\(\d{3}\)\d{3}\-\d{4}$"
nces_df["phone_good_pattern"] = nces_df["phone"].str.contains(phone_pattern, regex=True, na=False)

In [523]:
nces_df["phone_good_pattern"].value_counts()

phone_good_pattern
True    19637
Name: count, dtype: int64

In [524]:
# let's just add space
nces_df["phone"] = nces_df["phone"].str.replace(")",") ")

In [525]:
# all schools in nces have phone
sum(nces_df["phone"].isna())

0

In [526]:
# make sure both are unique
sum(nces_df["phone"].duplicated(keep=False))

1453

In [527]:
nces_df.shape

(19637, 11)

In [528]:
# seems like almost 10% of the phone numbers in nces is duplicated
nces_df[nces_df["phone"].duplicated(keep=False)].sort_values(by="phone").head()

Unnamed: 0,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone,WEBSITE,phone_good_pattern
11110,Bergen County Vocational Technical School Dist...,3401470,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcts.bergen.org,True
11109,Bergen County Special Services School District,3401450,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcss.bergen.org,True
11070,Hudson Arts and Science Charter School,3400787,131 Midland Ave,,,Kearny,NJ,7032,(201) 773-9140,http://www.hudsoncharter.org,True
11023,Bergen Arts and Science Charter School,3400715,200 MacArthur Ave,,,Garfield,NJ,7026,(201) 773-9140,http://www.bergencharter.org,True
3802,Regional School District 09,903780,654 Morehouse Road,,,Easton,CT,6612,(203) 261-2513,http://www.er9.org,True


In [529]:
# since the most coverage is by the website, let's do a merge there first
# our goal is to match as much boarddocs as possible
# so boarddocs_df is on the left
nces_df = nces_df.rename(columns={"WEBSITE":"home_website"})

In [530]:
nces_df["home_website"].isna().value_counts()

home_website
False    16957
True      2680
Name: count, dtype: int64

In [531]:
# before merge, let's clean
# remove https and www
url_prefix_pattern = r"https?://(www.)?"
nces_df["home_website"] = nces_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)

# also get the first stub
single_slash_pattern = r"[?<!/]/[?!/]"
nces_df["home_website"] = nces_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]

# also replace trailing /
trailing_slash_pattern = r"/$"
nces_df["home_website"] = nces_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)

In [532]:
# check if the websites are duplicated
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / nces_df.shape[0] * 100

7.170138004786882

In [533]:
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False))

1408

In [534]:
# check if the websites are duplicated
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / boarddocs_df.shape[0] * 100

0.4876796714579056

In [535]:
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False))

19

In [536]:
boarddocs_df.loc[(~boarddocs_df["home_website"].isna()) & (boarddocs_df["home_website"].duplicated(keep=False)),:].sort_values(by="home_website")

Unnamed: 0,URL,school_district,address,home_website,phone
322,https://go.boarddocs.com/nj/bergen/Board.nsf/P...,Bergen County Technical Schools,"540 Farview Ave. , Paramus , NJ 07652",bergen.org,
1781,https://go.boarddocs.com/nj/bergencss/Board.ns...,Bergen County Special Services,"540 Farview Avenue, Paramus, NJ 07652",bergen.org,
3552,https://go.boarddocs.com/wa/bethel/Board.nsf/P...,Bethel School District,"516 176th St E | Spanaway, WA 98387 | Phone: 2...",bethelsd.org,(253) 800-2010
3455,https://go.boarddocs.com/wa/pierce/Board.nsf/P...,Pierce County Skills Center,"16117 Canyon Rd. E Puyallup, WA 98375 Phone:...",bethelsd.org,(253) 800-4800
3768,https://go.boarddocs.com/mi/kcacad/Board.nsf/P...,Kalamazoo Covenant Academy,"400 W Crosstown Pkwy | Kalamazoo, MI 49001 | 2...",covenantacademies.org,(269) 888-2700
483,https://go.boarddocs.com/mi/mcacad/Board.nsf/P...,Muskegon Covenant Academy,"125 Catherine Avenue | Muskegon, MI 49442 | 23...",covenantacademies.org,(231) 720-3100
3128,https://go.boarddocs.com/mi/scacad/Board.nsf/P...,Saginaw Covenant Academy,"508 S. Washington Avenue | Saginaw, MI 48607 |...",covenantacademies.org,(989) 596-1100
2367,https://go.boarddocs.com/mi/engadine/Board.nsf...,Engadine Consolidated Schools,,eupschools.org,
1875,https://go.boarddocs.com/mi/macki/Board.nsf/Pu...,,,eupschools.org,
1620,https://go.boarddocs.com/ca/ecscn/Board.nsf/Pu...,Excelsior Charter School Corona-Norco,"1400 Fullerton Ave. Corona, CA 92879 | 951.547...",excelsior.com,(951) 547-7540


In [537]:
# these are quite the pain so let's ignore them for now

In [538]:
df = pd.merge(boarddocs_df[~boarddocs_df["home_website"].duplicated(keep=False)], nces_df[~nces_df["home_website"].isna()], how="left", on="home_website", suffixes=["_boarddocs", "_nces"])

In [539]:
# let's check how good is the match
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / boarddocs_df.shape[0] * 100

78.02874743326488

In [540]:
# ok we have matched 61% of them
# check those that weren't matched but has website

df[(~df["home_website"].isna()) & (df["LEAID"].isna())].sample(5)

Unnamed: 0,URL,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern
279,https://go.boarddocs.com/ga/gcps/Board.nsf/Public,Gwinnett County Public Schools,,gcpsk12.org,,,,,,,,,,,
1877,https://go.boarddocs.com/in/kokomo/Board.nsf/P...,Kokomo School Corporation,P.O. Box 2188 | 1500 S. Washington Street | Ko...,kokomoschools.com,(765) 455-8000,,,,,,,,,,
643,https://go.boarddocs.com/ok/francistuttle/Boar...,Francis Tuttle Technology Center,"12777 N. Rockwell Ave Oklahoma City, OK 73142 ...",francistuttle.edu,(405) 717-7799,,,,,,,,,,
2231,https://go.boarddocs.com/wa/cocp/Board.nsf/Public,"City of College Place, Washington","625 S. College Ave. | College Place, WA 99324 ...",cpwa.us,(509) 529-1200,,,,,,,,,,
1025,https://go.boarddocs.com/ca/pomona/Board.nsf/P...,Pomona Unified School District,"800 South Garey Avenue | Pomona, California 91...",pusd.org,(909) 397-4800,,,,,,,,,,


In [541]:
# let's take out the ones matched first
matched = df[~df["LEAID"].isna()]
boarddocs_df = boarddocs_df[~boarddocs_df["URL"].isin(matched["URL"])]

In [542]:
# sanity check the matches so far
# check whether the phone matches
# for those with both set
df = matched[(~matched["phone_boarddocs"].isna()) & (~matched["phone_nces"].isna()) ].copy()

In [543]:
df.shape

(1720, 15)

In [544]:
df.loc[:,"phones_agree"] = (df["phone_boarddocs"] == df["phone_nces"])
df["phones_agree"].value_counts()

phones_agree
True     1360
False     360
Name: count, dtype: int64

In [545]:
# check those that differs
df[~df["phones_agree"]].sample(10)

Unnamed: 0,URL,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,phones_agree
645,https://go.boarddocs.com/ca/slcusd/Board.nsf/P...,San Luis Coastal Unified School District,"| 1500 Lizzie Street | San Luis Obispo, CA 934...",slcusd.org,(805) 549-1202,San Luis Coastal Unified,634800.0,1500 Lizzie St.,,,San Luis Obispo,CA,93401.0,(805) 549-1200,True,False
1904,https://go.boarddocs.com/pa/agora/Board.nsf/Pu...,Agora Cyber Charter School,"1018 West 8th Ave | King of Prussia, PA 19406 ...",agora.org,(844) 402-4672,Agora Cyber CS,4200140.0,1018 West 8th Avenue,,,King of Prussia,PA,19406.0,(610) 230-0775,True,False
429,https://go.boarddocs.com/mi/kresa/Board.nsf/Pu...,Kalamazoo RESA,"1819 E. Milham Ave., Portage, MI 49002 (26...",kresa.org,(269) 250-9200,Kalamazoo RESA,2680600.0,1819 E MILHAM AVE,,,PORTAGE,MI,49002.0,(269) 250-9202,True,False
744,https://go.boarddocs.com/nj/colps/Board.nsf/Pu...,Collingswood Public Schools,"100 Lees Ave | Collingswood, NJ 08108 | 856-96...",collsk12.org,(856) 962-5702,Collingswood Public School District,3403420.0,200 Lees Avenue,,,Collingswood,NJ,8108.0,(856) 962-5700,True,False
1847,https://go.boarddocs.com/in/nwscd/Board.nsf/Pu...,Northwestern School Corporation,"3075 N Washington St Kokomo, IN 46901 | 765-45...",nwsc.k12.in.us,(765) 457-8101,Northwestern School Corp,1802040.0,3075 N Washington St,,,Kokomo,IN,46901.0,(765) 452-3060,True,False
327,https://go.boarddocs.com/ak/matsu/Board.nsf/Pu...,Matanuska-Susitna Borough School District,"501 N. Gulkana | Palmer, Alaska 99645-6147 | 9...",matsuk12.us,(907) 746-9272,Matanuska-Susitna Borough School District,200510.0,501 N Gulkana St,,,Palmer,AK,99645.0,(907) 746-9200,True,False
2293,https://go.boarddocs.com/ny/ewufsd/Board.nsf/P...,East Williston Union Free School District NY,"11 Bacon Road Old Westbury, N.Y. 11568 | 516.3...",ewsdonline.org,(516) 333-1630,EAST WILLISTON UNION FREE SCHOOL DISTRICT,3610050.0,11 BACON RD,,,OLD WESTBURY,NY,11568.0,(516) 333-3758,True,False
1182,https://go.boarddocs.com/ny/nrcsd/Board.nsf/Pu...,North Rockland Central School District,"65 Chapel Street | Garnerville, NY 10923 | Pho...",nrcsd.org,(845) 942-3000,HAVERSTRAW-STONY POINT CSD (NORTH ROCKLAND),3614010.0,65 CHAPEL ST,,,GARNERVILLE,NY,10923.0,(845) 942-3002,True,False
1845,https://go.boarddocs.com/co/d11/Board.nsf/Public,Colorado Springs School District 11,"1115 N. El Paso Street | Colorado Springs, CO ...",d11.org,(719) 520-2000,El Paso 11 Colo Springs AU,800279.0,1115 North El Paso Street,,,Colorado Springs,CO,80903.0,(719) 520-2148,True,False
1496,https://go.boarddocs.com/ny/freeport/Board.nsf...,Freeport Union Free School District,"235 North Ocean Ave | Freeport, New York 11520...",freeportschools.org,(516) 867-5200,FREEPORT UNION FREE SCHOOL DISTRICT,3611550.0,235 N OCEAN AVE,,,FREEPORT,NY,11520.0,(516) 867-5205,True,False


In [546]:
# it seems like both phone numbers are valid
# I will just keep both for now

In [547]:
# check the situation at boarddocs
sum(boarddocs_df[~boarddocs_df["phone"].isna()]["phone"].duplicated(keep=False))

2

In [548]:
boarddocs_df[(~boarddocs_df["phone"].isna()) & boarddocs_df["phone"].duplicated(keep=False)]

Unnamed: 0,URL,school_district,address,home_website,phone
3166,https://go.boarddocs.com/ca/empirescs/Board.ns...,Empire Springs Charter School,"27740 Jefferson Ave | Temecula, CA 92590 | (95...",springscharterschools.org,(951) 252-8800
3645,https://go.boarddocs.com/ca/harborscs/Board.ns...,Harbor Springs Charter School,"27740 Jefferson Avenue | Temecula, CA 92590 | ...",springscharterschools.org,(951) 252-8800


In [549]:
# again, ignore this for now

In [550]:
# merge it
df = pd.merge(boarddocs_df[~boarddocs_df["phone"].duplicated(keep=False)], nces_df, how="left", on="phone", suffixes=["_boarddocs", "_nces"])

In [551]:
# check matches
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / df.shape[0] * 100

62.326388888888886

In [552]:
df[~df["LEAID"].isna()].sample(5)

Unnamed: 0,URL,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,phone_good_pattern
387,https://go.boarddocs.com/mo/rpsd/Board.nsf/Public,Raymore-Peculiar School District,"21005 S School Rd. Peculiar, MO 64078 Phone:...",raypec.k12.mo.us,(816) 892-1300,RAYMORE-PECULIAR R-II,2923730.0,21005 S SCHOOL RD,,,PECULIAR,MO,64078.0,raypec.org,True
100,https://go.boarddocs.com/wv/jac/Board.nsf/Public,Jackson County Schools,"1 School Street, Ripley, WV 25271 |(304) 372-7300",boe.jack.k12.wv.us,(304) 372-7300,JACKSON COUNTY SCHOOLS,5400540.0,1 SCHOOL STREET,,,RIPLEY,WV,25271.0,,True
429,https://go.boarddocs.com/oh/loganelm/Board.nsf...,Logan Elm Local School District,"9579 Tarlton Road | Circleville, OH 43113 | 74...",loganelm.org,(740) 474-7501,Logan Elm Local,3904908.0,9579 Tarlton Rd,,,Circleville,OH,43113.0,loganelmschools.com,True
14,https://go.boarddocs.com/ca/cayucos/Board.nsf/...,Cayucos Elementary School District,"301 Cayucos Drive, Cayucos, CA 93430 | (805) 9...",cayucosschool.org,(805) 995-3694,Cayucos Elementary,607840.0,301 Cayucos Dr.,,,Cayucos,CA,93430.0,sites.google.com/a/cayucosschool.org/cayucos-e...,True
482,https://go.boarddocs.com/co/mcsdre3/Board.nsf/...,Morgan County School District Re-3,"715 West Platte Avenue, Fort Morgan, Colorado ...",fortmorgank12.com,(970) 867-5633,School District No. Re-3 Fort Morgan,804050.0,715 W. PLATTE AVENUE,,,FORT MORGAN,CO,80701.0,morgan.k12.co.us,True


In [553]:
# add them to the match doc
df = df.drop("phone_good_pattern", axis=1)
matched = pd.concat([matched, df], axis=0)

In [554]:
# check if they have the same websites
df = df[(~df["home_website_boarddocs"].isna()) & (~df["home_website_nces"].isna()) ].copy()
df.shape

(218, 14)

In [555]:
df.loc[:,"website_agree"] = (df["home_website_boarddocs"] == df["home_website_nces"])
df["website_agree"].value_counts()

website_agree
False    215
True       3
Name: count, dtype: int64

In [558]:
df[~df["website_agree"]].sample(5)

Unnamed: 0,URL,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,website_agree
501,https://go.boarddocs.com/nj/ewrsd/Board.nsf/Pu...,East Windsor Regional School District,"25A Leshin Lane | Hightstown, New Jersey 08520...",eastwindsorregionalschools.com,(609) 443-7717,East Windsor Regional School District,3404320.0,25A LESHIN LANE,,,HIGHTSTOWN,NJ,8520.0,ewrsd.org,False
510,https://go.boarddocs.com/mo/winfield/Board.nsf...,Winfield R-IV School District,"100 Eighth Street | Winfield, MO 63389 | 636-6...",winfield.k12.mo.us,(636) 668-8188,WINFIELD R-IV,2932190.0,100 8th Street,,,Winfield,MO,63389.0,WWW.WINFIELD.K12.MO.US,False
5,https://go.boarddocs.com/mo/mcr1/Board.nsf/Public,Macon County R-1 Schools,"702 North Missouri  Macon, Missouri 63552 ...",macon.k12.mo.us,(660) 395-6164,MACON CO. R-I,2919410.0,702 N MISSOURI,,,MACON,MO,63552.0,WWW.MACON.K12.MO.US,False
294,https://go.boarddocs.com/oh/clearview/Board.ns...,Clearview Local Schools,"4700 Broadway Avenue | Lorain, OH 44052 | 440-...",clearviewschools.org,(440) 233-5412,Clearview Local,3904813.0,4700 Broadway,,,Lorain,OH,44052.0,clearview.k12.oh.us,False
546,https://go.boarddocs.com/ca/cryrop/Board.nsf/P...,Colton Redlands Yucaipa ROP,"1214 Indiana Court Redlands, CA 92374 | (909) ...",cryrop.org,(909) 793-3115,Colton-Redlands-Yucaipa ROP JPA,601375.0,1214 Indiana Ct.,,,Redlands,CA,92374.0,cryrop.edu,False


In [556]:
matched.head()

Unnamed: 0,URL,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,home_website_boarddocs,phone,home_website_nces
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,St. Joseph Public Schools,,sjschools.org,,St. Joseph Public Schools,2632850.0,2580 S CLEVELAND AVE,,,SAINT JOSEPH,MI,49085.0,(269) 926-3100,True,,,
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,,,calsd.org,,California Area SD,4204710.0,40 Trojan Way,,,Coal Center,PA,15423.0,(724) 785-5800,True,,,
2,https://go.boarddocs.com/oh/mapleheights/Board...,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 2...",mapleschools.com,(216) 587-6100,Maple Heights City,3904430.0,5740 Lawn Ave,,,Maple Heights,OH,44137.0,(216) 587-6100,True,,,
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,Southern Huntingdon County School District,,shcsd.org,,Southern Huntingdon County SD,4222320.0,10339 Pogue Road,,,Three Springs,PA,17264.0,(814) 447-5529,True,,,
5,https://go.boarddocs.com/de/sussexvt/Board.nsf...,Sussex Technical School District,17099 County Seat Hwy | Georgetown DE 19947 | ...,sussexvt.org,(302) 856-0961,Sussex Technical School District,1001680.0,17137 County Seat Highway,,,Georgetown,DE,19947.0,(302) 856-2541,True,,,


In [557]:
matched.shape

(3616, 18)