This notebook merges the NCES data with the BoardDocs URLs scrapped from deliverable 1.

Input
- `../deliverable1/release/deliverable_1.csv`
- `../nces2324.csv`

Output
-  `release/deliverable_2.csv`

In [1]:
import pandas as pd

boarddocs_df = pd.read_csv("../deliverable1/release/deliverable_1.csv")
nces_df = pd.read_csv("../nces2324.csv")

  nces_df = pd.read_csv("../nces2324.csv")


In [2]:

# select relevant columns
nces_df = nces_df[["LEA_NAME", "LEAID", "LSTREET1", "LSTREET2","LSTREET3","LCITY","LSTATE", "LZIP","PHONE","WEBSITE"]]
nces_df["LEAID"] = nces_df["LEAID"].astype(int)
nces_df["LZIP"] = nces_df["LZIP"].astype(int)

In [3]:
# match by phone
# then hopefully that can tell us what is up with the website
nces_df = nces_df.rename(columns={
    "PHONE": "phone"
})

In [4]:
# do some edits for phones
# check if all is in same format
phone_pattern = r"^\(\d{3}\)\d{3}\-\d{4}$"
nces_df["phone_good_pattern"] = nces_df["phone"].str.contains(phone_pattern, regex=True, na=False)

In [5]:
nces_df["phone_good_pattern"].value_counts()

phone_good_pattern
True    19637
Name: count, dtype: int64

In [6]:
# let's just add space
nces_df["phone"] = nces_df["phone"].str.replace(")",") ")

In [7]:
# all schools in nces have phone
sum(nces_df["phone"].isna())

0

In [8]:
# make sure both are unique
sum(nces_df["phone"].duplicated(keep=False))

1453

In [9]:
nces_df.shape

(19637, 11)

In [10]:
# seems like almost 10% of the phone numbers in nces is duplicated
nces_df[nces_df["phone"].duplicated(keep=False)].sort_values(by="phone").head()

Unnamed: 0,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone,WEBSITE,phone_good_pattern
11110,Bergen County Vocational Technical School Dist...,3401470,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcts.bergen.org,True
11109,Bergen County Special Services School District,3401450,540 Farview Avenue,,,Paramus,NJ,7652,(201) 343-6000,http://bcss.bergen.org,True
11070,Hudson Arts and Science Charter School,3400787,131 Midland Ave,,,Kearny,NJ,7032,(201) 773-9140,http://www.hudsoncharter.org,True
11023,Bergen Arts and Science Charter School,3400715,200 MacArthur Ave,,,Garfield,NJ,7026,(201) 773-9140,http://www.bergencharter.org,True
3802,Regional School District 09,903780,654 Morehouse Road,,,Easton,CT,6612,(203) 261-2513,http://www.er9.org,True


In [11]:
# since the most coverage is by the website, let's do a merge there first
# our goal is to match as much boarddocs as possible
# so boarddocs_df is on the left
nces_df = nces_df.rename(columns={"WEBSITE":"home_website"})

In [12]:
nces_df["home_website"].isna().value_counts()

home_website
False    16957
True      2680
Name: count, dtype: int64

In [13]:
# before merge, let's clean
# remove https and www
url_prefix_pattern = r"https?://(www.)?"
nces_df["home_website"] = nces_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(url_prefix_pattern, '', regex=True)

# also get the first stub
single_slash_pattern = r"[?<!/]/[?!/]"
nces_df["home_website"] = nces_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.split(single_slash_pattern,n=1, regex=True).str[0]

# also replace trailing /
trailing_slash_pattern = r"/$"
nces_df["home_website"] = nces_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)
boarddocs_df["home_website"] = boarddocs_df["home_website"].str.replace(trailing_slash_pattern, '', regex=True)

In [14]:
# check if the websites are duplicated
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / nces_df.shape[0] * 100

7.170138004786882

In [15]:
sum(nces_df[~nces_df["home_website"].isna()]["home_website"].duplicated(keep=False))

1408

In [16]:
# check if the websites are duplicated
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False)) / boarddocs_df.shape[0] * 100

0.4876796714579056

In [17]:
sum(boarddocs_df[~boarddocs_df["home_website"].isna()]["home_website"].duplicated(keep=False))

19

In [18]:
boarddocs_df.loc[(~boarddocs_df["home_website"].isna()) & (boarddocs_df["home_website"].duplicated(keep=False)),:].sort_values(by="home_website")

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone
322,https://go.boarddocs.com/nj/bergen/Board.nsf/P...,Bergen County Technical Schools,"540 Farview Ave. , Paramus , NJ 07652",bergen.org,
1781,https://go.boarddocs.com/nj/bergencss/Board.ns...,Bergen County Special Services,"540 Farview Avenue, Paramus, NJ 07652",bergen.org,
3552,https://go.boarddocs.com/wa/bethel/Board.nsf/P...,Bethel School District,"516 176th St E | Spanaway, WA 98387 | Phone: 2...",bethelsd.org,(253) 800-2010
3455,https://go.boarddocs.com/wa/pierce/Board.nsf/P...,Pierce County Skills Center,"16117 Canyon Rd. E Puyallup, WA 98375 Phone:...",bethelsd.org,(253) 800-4800
3768,https://go.boarddocs.com/mi/kcacad/Board.nsf/P...,Kalamazoo Covenant Academy,"400 W Crosstown Pkwy | Kalamazoo, MI 49001 | 2...",covenantacademies.org,(269) 888-2700
483,https://go.boarddocs.com/mi/mcacad/Board.nsf/P...,Muskegon Covenant Academy,"125 Catherine Avenue | Muskegon, MI 49442 | 23...",covenantacademies.org,(231) 720-3100
3128,https://go.boarddocs.com/mi/scacad/Board.nsf/P...,Saginaw Covenant Academy,"508 S. Washington Avenue | Saginaw, MI 48607 |...",covenantacademies.org,(989) 596-1100
2367,https://go.boarddocs.com/mi/engadine/Board.nsf...,Engadine Consolidated Schools,,eupschools.org,
1875,https://go.boarddocs.com/mi/macki/Board.nsf/Pu...,,,eupschools.org,
1620,https://go.boarddocs.com/ca/ecscn/Board.nsf/Pu...,Excelsior Charter School Corona-Norco,"1400 Fullerton Ave. Corona, CA 92879 | 951.547...",excelsior.com,(951) 547-7540


In [19]:
# these are quite the pain so let's ignore them for now

In [20]:
df = pd.merge(boarddocs_df[~boarddocs_df["home_website"].duplicated(keep=False)], nces_df[~nces_df["home_website"].isna()], how="left", on="home_website", suffixes=["_boarddocs", "_nces"])

In [21]:
# let's check how good is the match
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / boarddocs_df.shape[0] * 100

78.10574948665298

In [22]:
# ok we have matched 61% of them
# check those that weren't matched but has website

df[(~df["home_website"].isna()) & (df["LEAID"].isna())].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern
3218,https://go.boarddocs.com/ut/spanishfork/Board....,"City of Spanish Fork, Utah","80 S. Main St. | Spanish Fork, UT 84660 | (8...",spanishfork.org,(801) 804-4530,,,,,,,,,,
8,https://go.boarddocs.com/il/bhsd228/Board.nsf/...,Bremen High School District 228,"15233 S. Pulaski Rd, Midlothian, IL 60445 | 70...",bhsd228.com,(708) 389-1175,,,,,,,,,,
758,https://go.boarddocs.com/wy/wyos/Board.nsf/Public,Wyoming State Board of Education,"122 W. 25th St. Suite E200 Cheyenne, WY 82002 ...",edu.wyoming.gov,(307) 777-6213,,,,,,,,,,
2678,https://go.boarddocs.com/ca/scc/Board.nsf/Public,Sacramento City College,"3835 Freeport Boulevard • Sacramento, CA 95822...",scc.losrios.edu,(916) 558-2111,,,,,,,,,,
2903,https://go.boarddocs.com/mi/aaesa/Board.nsf/Pu...,Allegan Area Educational Service Agency,"310 Thomas Street, Allegan, MI 49010",alleganaesa.org,,,,,,,,,,,


In [23]:
# let's take out the ones matched first
matched = df[~df["LEAID"].isna()]
boarddocs_df = boarddocs_df[~boarddocs_df["boarddocs_url"].isin(matched["boarddocs_url"])]

In [24]:
# sanity check the matches so far
# check whether the phone matches
# for those with both set
df = matched[(~matched["phone_boarddocs"].isna()) & (~matched["phone_nces"].isna()) ].copy()

In [25]:
df.shape

(1722, 15)

In [26]:
df.loc[:,"phones_agree"] = (df["phone_boarddocs"] == df["phone_nces"])
df["phones_agree"].value_counts()

phones_agree
True     1362
False     360
Name: count, dtype: int64

In [27]:
# check those that differs
df[~df["phones_agree"]].sample(10)

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,phones_agree
1668,https://go.boarddocs.com/va/mcpsva/Board.nsf/P...,Manassas City Public Schools,"8700 Centreville Rd. Suite 400 | Manassas, VA ...",mcpsva.org,(571) 377-6000,Manassas City Public Schools,5102360.0,8700 Centreville Rd Suite 400,,,Manassas,VA,20110.0,(571) 377-6008,True,False
684,https://go.boarddocs.com/ny/frewsburg/Board.ns...,Frewsburg Central School District NY,"26 Institute Street, Frewsburg, NY 14738 | 716...",frewsburgcsd.org,(716) 569-7000,FREWSBURG CENTRAL SCHOOL DISTRICT,3611610.0,26 INSTITUTE ST,,,FREWSBURG,NY,14738.0,(716) 569-7041,True,False
1931,https://go.boarddocs.com/oh/trlsd/Board.nsf/Pu...,Three Rivers Local School District,"401 N. Miami Ave. | Cleves, OH 45002 | 513-941...",threeriversschools.org,(513) 941-6400,Three Rivers Local,3904739.0,401 N Miami Ave,,,Cleves,OH,45002.0,(877) 644-6338,True,False
214,https://go.boarddocs.com/ks/mzufsd266/Board.ns...,Maize Unified School District 266,"905 W Academy Avenue • Maize, KS 67101 • Phone...",usd266.com,(316) 722-0614,Maize Special Education Cooperative,2000381.0,905 W Academy,,,Maize,KS,67101.0,(316) 350-2041,True,False
2764,https://go.boarddocs.com/ny/lscsd/Board.nsf/Pu...,Lake Shore Central Schools (Evans-Brant Centra...,"959 Beach Road | Angola, NY 14006 | 716-549-2300",lakeshorecsd.org,(716) 549-2300,EVANS-BRANT CENTRAL SCHOOL DISTRICT (LAKE SHORE),3616560.0,959 BEACH RD,,,ANGOLA,NY,14006.0,(716) 926-2201,True,False
229,https://go.boarddocs.com/ca/cvusd/Board.nsf/Pu...,Cajon Valley Union School District,"750 East Main Street | El Cajon, CA 92020 | p ...",cajonvalley.net,(619) 588-3000,Cajon Valley Union,606810.0,750 East Main St.,,,El Cajon,CA,92020.0,(619) 588-3005,True,False
1526,https://go.boarddocs.com/ny/chufsd/Board.nsf/P...,Croton-Harmon Union Free School District,"10 Gerstein Street | Croton-On-Hudson, NY 1052...",chufsd.org,(914) 271-4713,CROTON-HARMON UNION FREE SCHOOL DISTRICT,3608580.0,10 GERSTEIN ST,,,CROTON ON HUDSON,NY,10520.0,(914) 271-4793,True,False
1366,https://go.boarddocs.com/mi/wops/Board.nsf/Public,,West Ottawa Public Schools | 1138 136th Ave. |...,westottawa.net,(616) 786-2050,West Ottawa Public School District,2635910.0,1138 136TH AVE,,,HOLLAND,MI,49424.0,(616) 786-2099,True,False
397,https://go.boarddocs.com/ny/mufsd/Board.nsf/Pu...,Mamaroneck Union Free School District,"1000 W Boston Post Rd. | Mamaroneck, NY 10543...",mamkschools.org,(914) 220-3000,MAMARONECK UNION FREE SCHOOL DISTRICT,3618240.0,1000 W BOSTON POST RD,,,MAMARONECK,NY,10543.0,(914) 220-3005,True,False
3721,https://go.boarddocs.com/ny/waynecsd/Board.nsf...,Wayne Central School District,"6200 Ontario Center Rd. Ontario Center, NY 145...",waynecsd.org,(315) 524-1000,WAYNE CENTRAL SCHOOL DISTRICT,3630330.0,6200 ONTARIO CTR RD,,,ONTARIO CENTER,NY,14520.0,(315) 524-1001,True,False


In [28]:
# it seems like both phone numbers are valid
# I will just keep both for now

In [29]:
# check the situation at boarddocs
sum(boarddocs_df[~boarddocs_df["phone"].isna()]["phone"].duplicated(keep=False))

2

In [30]:
boarddocs_df[(~boarddocs_df["phone"].isna()) & boarddocs_df["phone"].duplicated(keep=False)]

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone
3166,https://go.boarddocs.com/ca/empirescs/Board.ns...,Empire Springs Charter School,"27740 Jefferson Ave | Temecula, CA 92590 | (95...",springscharterschools.org,(951) 252-8800
3645,https://go.boarddocs.com/ca/harborscs/Board.ns...,Harbor Springs Charter School,"27740 Jefferson Avenue | Temecula, CA 92590 | ...",springscharterschools.org,(951) 252-8800


In [31]:
# again, ignore this for now

In [32]:
# merge it
df = pd.merge(boarddocs_df[~boarddocs_df["phone"].duplicated(keep=False)], nces_df, how="left", on="phone", suffixes=["_boarddocs", "_nces"])

In [33]:
# check matches
# check how many rows have LEAID
df[~df["LEAID"].isna()].shape[0] / df.shape[0] * 100

62.19512195121951

In [34]:
df[~df["LEAID"].isna()].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,phone_good_pattern
486,https://go.boarddocs.com/wv/dcschools/Board.ns...,Doddridge County Schools,"268 Bulldog Drive | West Union, WV 26456 | 304...",dcschools.us,(304) 873-2300,DODDRIDGE COUNTY SCHOOLS,5400270.0,268 BULLDOG DRIVE,,,WEST UNION,WV,26456.0,,True
354,https://go.boarddocs.com/la/einsteincharter/Bo...,Einstein Charter Schools,"4801 Maid Marion, New Orleans, LA 70128 | 504-...",einsteincharterschool.com,(504) 503-0109,Einstein Charter School at Village De L'Est,2200283.0,5316 Michoud Blvd,,,New Orleans,LA,70129.0,,True
464,https://go.boarddocs.com/wv/lin/Board.nsf/Public,,Lincoln County School District | 10 Marland A...,lcsdwv.com,(304) 824-3033,LINCOLN COUNTY SCHOOLS,5400660.0,10 MARLAND AVENUE,,,HAMLIN,WV,25523.0,,True
309,https://go.boarddocs.com/mo/stjs/Board.nsf/Public,St. James R-1 School District,"122 E. Scioto Street | St. James, MO 65559 | 5...",,(573) 265-2300,ST. JAMES R-I,2929250.0,122 E SCIOTO ST,,,ST JAMES,MO,65559.0,stjschools.org,True
117,https://go.boarddocs.com/id/csd151/Board.nsf/P...,Cassia School District 151,"3650 Overland | Burley, ID 83318 | (208) 8...",cassiaschools.org,(208) 878-6600,CASSIA COUNTY JOINT DISTRICT,1600660.0,3650 OVERLAND AVENUE,,,BURLEY,ID,83318.0,,True


In [35]:
# add them to the match doc
df = df.drop("phone_good_pattern", axis=1)
matched = pd.concat([matched, df], axis=0)

In [36]:
# check if they have the same websites
df = df[(~df["home_website_boarddocs"].isna()) & (~df["home_website_nces"].isna()) ].copy()
df.shape

(216, 14)

In [37]:
df.loc[:,"website_agree"] = (df["home_website_boarddocs"] == df["home_website_nces"])
df["website_agree"].value_counts()

website_agree
False    213
True       3
Name: count, dtype: int64

In [38]:
df[~df["website_agree"]].sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,home_website_nces,website_agree
221,https://go.boarddocs.com/oh/jclocal/Board.nsf/...,Jackson Center Local Schools,"204 S. Linden Street | Jackson Center, OH 4533...",jctigers.org,(937) 596-6053,Jackson Center Local,3910005.0,204 S Linden St,,,Jackson Center,OH,45334.0,jackson-center.k12.oh.us,False
308,https://go.boarddocs.com/ca/wuhsd/Board.nsf/Pu...,Whittier Union High School District,"9401 S. Painter Avenue | Whittier, California ...",wuhsd.org,(562) 698-8121,Whittier Union High,642480.0,9401 South Painter Ave.,,,Whittier,CA,90605.0,wuhsd.k12.ca.us,False
14,https://go.boarddocs.com/ca/cayucos/Board.nsf/...,Cayucos Elementary School District,"301 Cayucos Drive, Cayucos, CA 93430 | (805) 9...",cayucosschool.org,(805) 995-3694,Cayucos Elementary,607840.0,301 Cayucos Dr.,,,Cayucos,CA,93430.0,sites.google.com/a/cayucosschool.org/cayucos-e...,False
114,https://go.boarddocs.com/ny/rnufsd/Board.nsf/P...,Rye Neck Board of Education,"310 Hornidge Road Mamaroneck, NY 10543 \t914-...",ryeneck.org,(914) 777-5200,RYE NECK UNION FREE SCHOOL DISTRICT,3625290.0,310 HORNIDGE RD,,,MAMARONECK,NY,10543.0,ryeneck.k12.ny.us,False
333,https://go.boarddocs.com/mo/webster/Board.nsf/...,Webster Groves School District,"400 E. Lockwood Ave. | Webster Groves, MO 6311...",webster.k12.mo.us,(314) 961-1233,WEBSTER GROVES,2931530.0,400 E LOCKWOOD AVE,,,WEBSTER GROVES,MO,63119.0,WWW.WEBSTER.K12.MO.US,False


In [39]:
matched.head()

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone_boarddocs,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,phone_good_pattern,home_website_boarddocs,phone,home_website_nces
0,https://go.boarddocs.com/mi/sjs/Board.nsf/Public,St. Joseph Public Schools,,sjschools.org,,St. Joseph Public Schools,2632850.0,2580 S CLEVELAND AVE,,,SAINT JOSEPH,MI,49085.0,(269) 926-3100,True,,,
1,https://go.boarddocs.com/pa/cali/Board.nsf/Public,,,calsd.org,,California Area SD,4204710.0,40 Trojan Way,,,Coal Center,PA,15423.0,(724) 785-5800,True,,,
2,https://go.boarddocs.com/oh/mapleheights/Board...,Maple Heights City Schools,"5740 Lawn Avenue | Maple Heights, OH 44137 | 2...",mapleschools.com,(216) 587-6100,Maple Heights City,3904430.0,5740 Lawn Ave,,,Maple Heights,OH,44137.0,(216) 587-6100,True,,,
4,https://go.boarddocs.com/pa/shun/Board.nsf/Public,Southern Huntingdon County School District,,shcsd.org,,Southern Huntingdon County SD,4222320.0,10339 Pogue Road,,,Three Springs,PA,17264.0,(814) 447-5529,True,,,
5,https://go.boarddocs.com/de/sussexvt/Board.nsf...,Sussex Technical School District,17099 County Seat Hwy | Georgetown DE 19947 | ...,sussexvt.org,(302) 856-0961,Sussex Technical School District,1001680.0,17137 County Seat Highway,,,Georgetown,DE,19947.0,(302) 856-2541,True,,,


In [40]:
matched.shape

(3617, 18)

In [41]:
# match percentage
matched.shape[0] / pd.read_csv("../deliverable1/release/deliverable_1.csv").shape[0] * 100

92.8388090349076

In [42]:
# now match by zipcode
# zipcode is 5 digits standalone
zipcode_pattern = r"\b(\d{5})\b"
boarddocs_df["zipcode_from_boarddocs"] = boarddocs_df["address"].str.extract(zipcode_pattern)

In [43]:
boarddocs_df[["address", "zipcode_from_boarddocs"]].sample(5)

Unnamed: 0,address,zipcode_from_boarddocs
724,"3200 Broadway | Kansas City, MO 64111 | 816.60...",64111.0
1537,,
756,"2900 Mink Point Boulevard | Beaufort, SC 29902...",29902.0
2941,"4166 State Route 28 | Boiceville, NY 12412 | P...",12412.0
1556,"3321 Georgia Street, Louisiana, MO 63353 | 573...",63353.0


In [44]:
# get the remaining boarddocs and nces
boarddocs_df = boarddocs_df[~boarddocs_df["boarddocs_url"].isin(matched["boarddocs_url"])]
nces_df = nces_df[~nces_df["LEAID"].isin(matched["LEAID"])]

In [45]:
print(f"{boarddocs_df.shape=}")
print(f"{nces_df.shape=}")

boarddocs_df.shape=(387, 6)
nces_df.shape=(16240, 11)


In [46]:
# get unique zipcodes
boarddocs_unique_zipcode = boarddocs_df[~boarddocs_df["zipcode_from_boarddocs"].duplicated(keep=False)]
nces_unique_zipcode = nces_df[~nces_df["LZIP"].duplicated(keep=False)]

In [47]:
print(f"{boarddocs_unique_zipcode.shape=}")
print(f"{nces_unique_zipcode.shape=}")

boarddocs_unique_zipcode.shape=(138, 6)
nces_unique_zipcode.shape=(8446, 11)


In [48]:
nces_unique_zipcode = nces_unique_zipcode.rename(columns={
    "LZIP": "zipcode"
})

boarddocs_unique_zipcode = boarddocs_unique_zipcode.rename(columns={
    "zipcode_from_boarddocs": "zipcode"
})

In [49]:
nces_unique_zipcode["zipcode"] = nces_unique_zipcode["zipcode"].astype(int)
boarddocs_unique_zipcode["zipcode"] = boarddocs_unique_zipcode["zipcode"].astype(int)

In [50]:
# try matching
df = pd.merge(boarddocs_unique_zipcode, nces_unique_zipcode, how="inner", on="zipcode", suffixes=["_boarddocs", "_nces"])

In [51]:
df.shape

(73, 16)

In [52]:
df.sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone_boarddocs,zipcode,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,phone_nces,home_website_nces,phone_good_pattern
16,https://go.boarddocs.com/nh/lyme/Board.nsf/Public,Lyme School District,"35 Union Street; Lyme, New Hampshire NH 03768",lymeschool.org,,3768,Lyme School District,3304500,38 Union Street,,,Lyme,NH,(603) 795-4431,lymeschool.org/page/sau-76-business-office,True
32,https://go.boarddocs.com/in/whitley/Board.nsf/...,Whitley County Consolidated Schools,"107 N Walnut St, Columbia City, IN 46725",wccsonline.com,,46725,Whitley County Con Schools,1802280,107 N Walnut St,,,Columbia City,IN,(260) 244-5771,wccs.k12.in.us,True
63,https://go.boarddocs.com/wi/pwssd/Board.nsf/Pu...,Port Washington-Saukville School District,"100 West Monroe Street, Port Washington, WI 53074",pwssd.org,,53074,Port Washington-Saukville School District,5512000,100 W Monroe St,,,Port Washington,WI,(262) 268-6000,pwssd.k12.wi.us,True
46,https://go.boarddocs.com/ma/triton/Board.nsf/P...,Triton Regional School District,"112 Elm Street Byfield, MA 01922",,,1922,Triton,2511740,112 Elm Street,,,Byfield,MA,(978) 465-2397,tritonschools.org,True
27,https://go.boarddocs.com/wi/elcho/Board.nsf/Pu...,Elcho School District,"N11268 Antigo St PO BOX 800 Elcho, WI 54428",elchoschool.org,,54428,Elcho School District,5504170,N11268 Antigo St,,,Elcho,WI,(715) 275-3225,elcho.k12.wi.us,True


In [53]:
# the matches are not exactly correctly
# discard

In [54]:
# now merge by name
SD_pattern = r"\b(SD)\b"
school_district_pattern = r"\b(school district)\b"
space_pattern = r"\s"

# remove SD
boarddocs_df["cleaned_name"] = boarddocs_df["school_district"].str.replace(SD_pattern, '', regex=True)
nces_df["cleaned_name"] = nces_df["LEA_NAME"].str.replace(SD_pattern, '', regex=True)

# lower case it
boarddocs_df["cleaned_name"] = boarddocs_df["cleaned_name"].str.lower()
nces_df["cleaned_name"] = nces_df["cleaned_name"].str.lower()

# remove school district
boarddocs_df["cleaned_name"] = boarddocs_df["cleaned_name"].str.replace(school_district_pattern, '', regex=True)
nces_df["cleaned_name"] = nces_df["cleaned_name"].str.replace(school_district_pattern, '', regex=True)

# remove spaces
boarddocs_df["cleaned_name"] = boarddocs_df["cleaned_name"].str.replace(space_pattern, '', regex=True)
nces_df["cleaned_name"] = nces_df["cleaned_name"].str.replace(space_pattern, '', regex=True)

In [55]:
# get unique ones
print(f"{boarddocs_df.shape=}")
print(f"{nces_df.shape=}")
boarddocs_df = boarddocs_df[~boarddocs_df["cleaned_name"].duplicated(keep=False)]
nces_df = nces_df[~nces_df["cleaned_name"].duplicated(keep=False)]

boarddocs_df.shape=(387, 7)
nces_df.shape=(16240, 12)


In [56]:
# try merging
print(f"{boarddocs_df.shape=}")
print(f"{nces_df.shape=}")
df = pd.merge(boarddocs_df, nces_df, how="inner", on="cleaned_name", suffixes=["_boarddocs", "_nces"])
print(f"{df.shape=}")

boarddocs_df.shape=(348, 7)
nces_df.shape=(15362, 12)
df.shape=(165, 18)


In [57]:
# check it
df.sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website_boarddocs,phone_boarddocs,zipcode_from_boarddocs,cleaned_name,LEA_NAME,LEAID,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,phone_nces,home_website_nces,phone_good_pattern
139,https://go.boarddocs.com/pa/wyvw/Board.nsf/Public,Wyoming Valley West School District,,wvwsd.org,,,wyomingvalleywest,Wyoming Valley West SD,4225950,450 N Maple Ave,,,Kingston,PA,18704,(570) 288-6551,wvwspartans.org,True
48,https://go.boarddocs.com/pa/dngl/Board.nsf/Public,Donegal School District,,,,,donegal,Donegal SD,4207650,1051 Koser Rd,,,Mount Joy,PA,17552,(717) 492-1302,donegalsd.org,True
41,https://go.boarddocs.com/mi/lakev/Board.nsf/Pu...,Lakeview Community Schools,,lakeviewschools.net,,,lakeviewcommunityschools,LAKEVIEW COMMUNITY SCHOOLS,3100112,3744 83RD ST,,,COLUMBUS,NE,68601,(402) 563-2345,,True
122,https://go.boarddocs.com/pa/nyor/Board.nsf/Public,Northern York County School District,,nycsd.k12.pa.us,,,northernyorkcounty,Northern York County SD,4217760,650 S Baltimore St,,,Dillsburg,PA,17019,(717) 432-8691,northernpolarbears.com,True
45,https://go.boarddocs.com/mi/oscoda/Board.nsf/P...,Oscoda Area Schools,,oscodaschools.org,,,oscodaareaschools,Oscoda Area Schools,2626970,3550 E RIVER RD,,,OSCODA,MI,48750,(989) 739-2033,,True


In [58]:
matched = matched.drop(['home_website', 'phone'], axis=1)

In [59]:
df = df[list(matched.columns)]

In [60]:
matched = pd.concat([matched, df])
matched.shape

(3782, 16)

In [61]:
# match percentage
matched.shape[0] / pd.read_csv("../deliverable1/release/deliverable_1.csv").shape[0] * 100

97.07392197125256

In [62]:
# remove to get unmatched
boarddocs_df = boarddocs_df[~boarddocs_df["boarddocs_url"].isin(matched["boarddocs_url"])]
nces_df = nces_df[~nces_df["LEAID"].isin(matched["LEAID"])]
print(f"{boarddocs_df.shape=}")
print(f"{nces_df.shape=}")

boarddocs_df.shape=(183, 7)
nces_df.shape=(15197, 12)


In [63]:
boarddocs_df.sample(5)

Unnamed: 0,boarddocs_url,school_district,address,home_website,phone,zipcode_from_boarddocs,cleaned_name
1865,https://go.boarddocs.com/mi/sagpub/Board.nsf/P...,Saginaw Public Schools,,spsd.net,,,saginawpublicschools
2926,https://go.boarddocs.com/ca/imagineca/Board.ns...,Imagine Schools CA Region,,imagineschoolssouthwest.org,,,imagineschoolscaregion
2251,https://go.boarddocs.com/ak/alaska/Board.nsf/P...,University of Alaska,,alaska.edu,,,universityofalaska
614,https://go.boarddocs.com/oh/coshc/Board.nsf/Pu...,Coshocton City Schools,,coshoctonredskins.com,,,coshoctoncityschools
3252,https://go.boarddocs.com/id/gem/Board.nsf/Public,Gem Innovation Schools,,gemprep.org,,,geminnovationschools


In [72]:
old_nces_df.columns

Index(['SCHOOL_YEAR', 'FIPST', 'STATENAME', 'ST', 'LEA_NAME',
       'STATE_AGENCY_NO', 'UNION', 'ST_LEAID', 'LEAID', 'MSTREET1', 'MSTREET2',
       'MSTREET3', 'MCITY', 'MSTATE', 'MZIP', 'MZIP4', 'LSTREET1', 'LSTREET2',
       'LSTREET3', 'LCITY', 'LSTATE', 'LZIP', 'LZIP4', 'PHONE', 'WEBSITE',
       'SY_STATUS', 'SY_STATUS_TEXT', 'UPDATED_STATUS', 'UPDATED_STATUS_TEXT',
       'EFFECTIVE_DATE', 'LEA_TYPE', 'LEA_TYPE_TEXT', 'OUT_OF_STATE_FLAG',
       'CHARTER_LEA', 'CHARTER_LEA_TEXT', 'NOGRADES', 'G_PK_OFFERED',
       'G_KG_OFFERED', 'G_1_OFFERED', 'G_2_OFFERED', 'G_3_OFFERED',
       'G_4_OFFERED', 'G_5_OFFERED', 'G_6_OFFERED', 'G_7_OFFERED',
       'G_8_OFFERED', 'G_9_OFFERED', 'G_10_OFFERED', 'G_11_OFFERED',
       'G_12_OFFERED', 'G_13_OFFERED', 'G_UG_OFFERED', 'G_AE_OFFERED', 'GSLO',
       'GSHI', 'LEVEL', 'IGOFFERED', 'OPERATIONAL_SCHOOLS'],
      dtype='object')

In [80]:
old_nces_df = pd.read_csv("../nces2324.csv")

# Filter the NCES dataframe for Texas
texas_districts = old_nces_df[old_nces_df["LSTATE"] == "TX"]

# Assuming there is a column 'NUM_SCHOOLS' that indicates the number of operational schools
# Sort the dataframe by the number of operational schools in descending order
largest_texas_districts = texas_districts.sort_values(by="OPERATIONAL_SCHOOLS", ascending=False)

# Display the top 10 largest districts
largest_texas_districts.head(10)

  old_nces_df = pd.read_csv("../nces2324.csv")


Unnamed: 0,SCHOOL_YEAR,FIPST,STATENAME,ST,LEA_NAME,STATE_AGENCY_NO,UNION,ST_LEAID,LEAID,MSTREET1,...,G_11_OFFERED,G_12_OFFERED,G_13_OFFERED,G_UG_OFFERED,G_AE_OFFERED,GSLO,GSHI,LEVEL,IGOFFERED,OPERATIONAL_SCHOOLS
17355,2023-2024,48,TEXAS,TX,HOUSTON ISD,1,,TX-101912,4823640,4400 W 18TH ST,...,Yes,Yes,No,No,No,PK,12,Other,As reported,274
17163,2023-2024,48,TEXAS,TX,DALLAS ISD,1,,TX-057905,4816230,9400 N CENTRAL EXPY,...,Yes,Yes,No,No,No,PK,12,Other,As reported,240
17248,2023-2024,48,TEXAS,TX,FORT WORTH ISD,1,,TX-220905,4819700,7060 CAMP BOWIE BLVD,...,Yes,Yes,No,No,No,PK,12,Other,As reported,138
17599,2023-2024,48,TEXAS,TX,NORTHSIDE ISD,1,,TX-015915,4833120,5900 EVERS RD,...,Yes,Yes,No,No,No,PK,12,Other,As reported,126
16975,2023-2024,48,TEXAS,TX,AUSTIN ISD,1,,TX-227901,4808940,4000 S IH 35 FRONTAGE RD,...,Yes,Yes,No,No,No,PK,12,Other,As reported,123
16824,2023-2024,48,TEXAS,TX,IDEA PUBLIC SCHOOLS,1,,TX-108807,4800211,2115 W PIKE BLVD,...,Yes,Yes,No,No,No,KG,12,Other,As reported,123
17732,2023-2024,48,TEXAS,TX,SAN ANTONIO ISD,1,,TX-015907,4838730,514 QUINCY ST,...,Yes,Yes,No,No,No,PK,12,Other,As reported,98
17159,2023-2024,48,TEXAS,TX,CYPRESS-FAIRBANKS ISD,1,,TX-101907,4816110,P O BOX 692003,...,Yes,Yes,No,No,No,PK,12,Other,As reported,93
17247,2023-2024,48,TEXAS,TX,FORT BEND ISD,1,,TX-079907,4819650,16431 LEXINGTON BLVD,...,Yes,Yes,No,No,No,PK,12,Other,As reported,83
16945,2023-2024,48,TEXAS,TX,ALDINE ISD,1,,TX-101902,4807710,2520 W W THORNE DR,...,Yes,Yes,No,No,No,PK,12,Other,As reported,80


In [None]:
df = pd.read_csv("../deliverable1/working_school_districts_with_boarddocs_scraped.csv")
df[df["LEAID"].isin(largest_texas_districts["LEAID"].head(10))]["LEA_NAME"]

6665               HOUSTON ISD
9113             NORTHSIDE ISD
9408                DALLAS ISD
11136               ALDINE ISD
11854    CYPRESS-FAIRBANKS ISD
13982      IDEA PUBLIC SCHOOLS
14148           FORT WORTH ISD
14813            FORT BEND ISD
15074          SAN ANTONIO ISD
17394               AUSTIN ISD
Name: LEA_NAME, dtype: object

here is a manual compilation
HOUSTON ISD,https://houstonisd.legistar.com/Calendar.aspx
NORTHSIDE ISD,https://www.nisd.net/board/minutes
DALLAS ISD,https://go.boarddocs.com/tx/disd/Board.nsf/vpublic?open
ALDINE ISD,https://meetings.boardbook.org/public/Organization/1722
CYPRESS-FAIRBANKS ISD,https://meetings.boardbook.org/Public/Organization/668
IDEA PUBLIC SCHOOLS,https://ideapublicschools.org/our-story/national-board-of-directors/
FORT WORTH ISD,https://fortworthisd.granicus.com/ViewPublisher.php?view_id=2
FORT BEND ISD,https://meetings.boardbook.org/Public/Organization/649
SAN ANTONIO ISD,https://meetings.boardbook.org/public/Organization/1982
AUSTIN ISD,https://go.boarddocs.com/tx/austinisd/Board.nsf/Public

PLANO ISD,https://pisd.diligent.community/Portal/MeetingTypeList.aspx
FRISCO ISD,https://meetings.boardbook.org/Public/Organization/2014
GARLAND ISD,https://meetings.boardbook.org/public/Organization/1084


In [264]:
# output the deliverable
matched = matched.rename(columns={
    "school_district": "school_district_from_boarddocs",
    "address": "address_from_boarddocs",
    "home_website": "home_website_from_boarddocs",
    "phone_nces": "phone_from_nces"
})

matched["LEAID"] = matched["LEAID"].astype("Int64")

matched.to_csv("release/deliverable_2.csv", index=False, columns=["LEAID",  "LEA_NAME", "boarddocs_url", "school_district_from_boarddocs", "address_from_boarddocs"])