This notebook visits the scraped links and get the name and address fields.

Input:
- `working_school_districts_with_boarddocs_scraped.csv`

Output:
- `boarddocs_url_cleaned.csv`

In [102]:
import pandas as pd

input_filename = "working_school_districts_with_boarddocs_scraped.csv"
boarddocs_df = pd.read_csv(input_filename)
boarddocs_df.sample(10)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url
19270,Northern Lehigh SD,4217670,PENNSYLVANIA,https://go.boarddocs.com/pa/nleh/Board.nsf/goto?open&id=CEKR2L68BCD8
993,COALGATE,4008130,OKLAHOMA,
10715,HANCOCK PUBLIC SCHOOL DISTRICT,2713380,MINNESOTA,https://go.boarddocs.com/mi/hancps/Board.nsf/Public?open&id=policies
13008,Harrisonburg City Public Schools,5101860,VIRGINIA,https://go.boarddocs.com/vsba/hcsva/Board.nsf/Public
6549,Folsom Borough School District,3405280,NEW JERSEY,https://go.boarddocs.com/nj/folsom/Board.nsf/goto?open&id=D8MJR94E66D0
11978,iLearn Academy Charter School,6600005,GUAM,https://go.boarddocs.com/in/indps/Board.nsf/files/DB3TBR769BA3/$file/PLA%20103%20Renewal%20Data%20Report%20-%20November%202024.pdf
8466,RSU 38,2314788,MAINE,https://go.boarddocs.com/ca/sandi/Board.nsf/files/A452PU032F75/$file/King%20Chavez%20Preparatory%20Academy%20CHARTER%2007%2001%202016%20to%2006%2030%202021.pdf
620,East Alton-Wood River CHSD 14,1712990,ILLINOIS,
15592,QUEEN CITY ISD,4836210,TEXAS,https://go.boarddocs.com/la/ebrp/Board.nsf/files/BVZFDJ7C92A9/$file/Kristi%20Mickelson%20Resume_Redacted.pdf
15840,Satsop School District,5307680,WASHINGTON,https://go.boarddocs.com/wa/osd111/Board.nsf/vpublic?open


In [103]:
# get the url
urls = set(boarddocs_df["url"])

In [104]:
# number of urls
len(urls)

11186

In [105]:
from urllib.parse import urlparse

def extract_domain(url):
    try:
        domain = urlparse(url).netloc
        return domain
    except Exception as e:
        return None

# Example usage:
domains = set([extract_domain(url) for url in urls])
domains

{None, 'cdn.boarddocs.com', 'go.boarddocs.com', 'www.boarddocs.com'}

In [106]:
# let's check out the cdn
boarddocs_df['domain'] = boarddocs_df["url"].apply(extract_domain)

In [107]:
# Set the option to display all columns
pd.set_option('display.max_columns', None)

# (Optional) Set the maximum width of each column to prevent truncation
pd.set_option('display.max_colwidth', None)

In [108]:
boarddocs_df[boarddocs_df['domain'] == "cdn.boarddocs.com"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain
1322,Shonto Preparatory School,5900128,BUREAU OF INDIAN EDUCATION,https://cdn.boarddocs.com/sites/fl/pcsfl/2010/Attachments/Attachment-3349.pdf,cdn.boarddocs.com
2046,St. Coletta Special Education PCS,1100064,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Agendas/Documents/Agenda-1291.pdf,cdn.boarddocs.com
2152,Cedar Tree Academy PCS,1100029,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Attachments/Attachment-7964.pdf,cdn.boarddocs.com
2805,TEACH Prep Mildred S. Cunningham & Edith H. Morris Elem DIST,601575,CALIFORNIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-2245.pdf,cdn.boarddocs.com
2837,Imagine Prep Coolidge Inc. (90034),400769,ARIZONA,https://cdn.boarddocs.com/sites/fl/pcsfl/2010/Attachments/Attachment-3349.pdf,cdn.boarddocs.com
3353,Skyview School Inc. (4496),400093,ARIZONA,https://cdn.boarddocs.com/sites/fl/pcsfl/2014/Resolutions/ResolutionsByMeeting.html,cdn.boarddocs.com
6051,HARBOR SCIENCE AND ARTS CHARTER SCHOOL,3600047,NEW YORK,https://cdn.boarddocs.com/sites/fl/pcsfl/2014/Attachments/Attachment-5954.pdf,cdn.boarddocs.com
6071,STOREFRONT ACADEMY CHARTER SCHOOL,3601137,NEW YORK,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Attachments/Attachment-7964.pdf,cdn.boarddocs.com
6637,Curlew School District,5301890,WASHINGTON,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-1556.pdf,cdn.boarddocs.com
7231,Valley Academy for Career and Technology Education (79397),400341,ARIZONA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Agendas/Documents/Agenda-1045.pdf,cdn.boarddocs.com


In [109]:
# we will keep the cdn in mind and parse the state and lea agency after
boarddocs_df[boarddocs_df['domain'] == "www.boarddocs.com"].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain
1242,Dallas County,101110,ALABAMA,https://www.boarddocs.com/tx/disd/Board.nsf/files/BFQT4W707EF4/$file/Resolution%20Approving%20Dallas%20County%20Elections%20Voting%20Equipment.pdf,www.boarddocs.com
1244,Stanislaus County Office of Education,691041,CALIFORNIA,https://www.boarddocs.com/ca/sccoe/Board.nsf/files/BPMMV65CCD1E/$file/CSBA_PGSBrochure_County.pdf,www.boarddocs.com
1256,Rootstown Local,3904921,OHIO,https://www.boarddocs.com/oh/root/Board.nsf/Public?open&id=policies,www.boarddocs.com
1261,Griffin RESA,1300012,GEORGIA,https://www.boarddocs.com/ga/fcss/Board.nsf/files/8TFTCR76C17B/%24file/CCGPS%20Revised%20Board%20presentationFINAL.pdf,www.boarddocs.com
1262,CONNETQUOT CENTRAL SCHOOL DISTRICT,3608160,NEW YORK,https://www.boarddocs.com/ny/ccsdli/Board.nsf/vpublic?open,www.boarddocs.com


In [110]:
boarddocs_df[boarddocs_df['domain'] == "go.boarddocs.com"].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain
1222,RICHFIELD PUBLIC SCHOOL DISTRICT,2731750,MINNESOTA,https://go.boarddocs.com/ar/nlrsd/Board.nsf/files/CDDUUN7DC4F2/$file/2019-07-11%20Special%20Meeting%20Agenda.pdf,go.boarddocs.com
1223,Impact | Puget Sound Elementary,5300345,WASHINGTON,https://go.boarddocs.com/nc/cart/Board.nsf/files/AY4GCU433316/$file/BSES_Items%20for%20Discussion%20with%20Board%20of%20Education-May%202018.pdf,go.boarddocs.com
1224,Coatesville Area SD,4206240,PENNSYLVANIA,https://go.boarddocs.com/pa/coat/Board.nsf/vpublic?open,go.boarddocs.com
1225,DONIPHAN R-I,2910920,MISSOURI,https://go.boarddocs.com/mo/foxc6/Board.nsf/files/CLX4GL0B7F3E/$file/01%20MUSIC%20Member%20Map%202022.pdf,go.boarddocs.com
1226,Rapides Parish,2201290,LOUISIANA,https://go.boarddocs.com/la/rpsb/Board.nsf/vpublic?open,go.boarddocs.com


In [111]:
def get_url_parts(url):
    try:
        path = urlparse(url).path
        parts = path.strip('/').split('/')
        return parts
    except Exception as e:
        return [None, None, None]


boarddocs_df["boarddocs_state"] = boarddocs_df["url"].apply(lambda row: get_url_parts(row)[0])
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state
16874,Propel CS-Northside,4200849,PENNSYLVANIA,https://go.boarddocs.com/pa/phsd/Board.nsf/files/CWNJPF4D9AF6/$file/LIST%20OF%20PAYMENTS%202023-2024%20GF.PDF,go.boarddocs.com,pa
15355,TEC Connections Academy Commonwealth Virtual School District,2500556,MASSACHUSETTS,https://www.boarddocs.com/mabe/mcpsmd/Board.nsf/files/CZB3G37DAC6D/$file/ELD%20Update%20Attachment%20C.pdf,www.boarddocs.com,mabe
625,Forrestville Valley CUSD 221,1715490,ILLINOIS,,,
7927,Greenville Preparatory Academy,3705076,NORTH CAROLINA,https://go.boarddocs.com/sc/beau/Board.nsf/files/BYZRA66BD42D/$file/ATSI%20Schools.pdf,go.boarddocs.com,sc
12878,Silver Fork Elementary,636810,CALIFORNIA,https://go.boarddocs.com/ut/spanishfork/Board.nsf/files/CKQKQN52FD54/$file/draft%2005%20CC%20Minutes%20May%2017%2C%202022.pdf,go.boarddocs.com,ut


In [112]:
# for cdn its the second argument
boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com', "boarddocs_state"] = boarddocs_df[boarddocs_df['domain']=='cdn.boarddocs.com']["url"].apply(lambda row: get_url_parts(row)[1])
boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com',].sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state
9355,Ann Jerkins-Harris Academy of Excellence,3901502,OHIO,https://cdn.boarddocs.com/sites/fl/pcsfl/2016/Attachments/Attachment-7580.pdf,cdn.boarddocs.com,fl
17401,Community School of Digital and Visual A,3700054,NORTH CAROLINA,https://cdn.boarddocs.com/sites/fl/pcsfl/2012/Attachments/Attachment-5174.pdf,cdn.boarddocs.com,fl
6051,HARBOR SCIENCE AND ARTS CHARTER SCHOOL,3600047,NEW YORK,https://cdn.boarddocs.com/sites/fl/pcsfl/2014/Attachments/Attachment-5954.pdf,cdn.boarddocs.com,fl
2805,TEACH Prep Mildred S. Cunningham & Edith H. Morris Elem DIST,601575,CALIFORNIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-2245.pdf,cdn.boarddocs.com,fl
2046,St. Coletta Special Education PCS,1100064,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Agendas/Documents/Agenda-1291.pdf,cdn.boarddocs.com,fl


In [113]:
# number of URL parts
boarddocs_df["num_url_parts"] = boarddocs_df["url"].apply(lambda row: len(get_url_parts(row)))
boarddocs_df["num_url_parts"].value_counts()

num_url_parts
7    12575
4     5788
3     1231
6       30
8        9
5        2
2        1
1        1
Name: count, dtype: int64

In [114]:
boarddocs_df[boarddocs_df["num_url_parts"]==1]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts
17093,TRUMANN SCHOOL DISTRICT,500047,ARKANSAS,https://go.boarddocs.com/,go.boarddocs.com,,1


In [115]:
# delete this
boarddocs_df.loc[boarddocs_df["num_url_parts"]==1, ["url", "domain"]] = None

In [116]:
boarddocs_df["boarddocs_lea"] = boarddocs_df["url"].apply(lambda row: get_url_parts(row)[1])
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
8236,Pickens County,102730,ALABAMA,https://go.boarddocs.com/sc/pickens/Board.nsf/files/A9H4Y8055B25/$file/11-24-14%20Minutes.pdf,go.boarddocs.com,sc,7,pickens
1523,Perry Local,3904790,OHIO,https://www.boarddocs.com/oh/perrysoh/Board.nsf/Public,www.boarddocs.com,oh,4,perrysoh
7969,West Warwick,4401140,RHODE ISLAND,https://go.boarddocs.com/ri/soki/Board.nsf/files/BXP3QT07FD71/$file/FY-21-local-tuition-rates-6-2-20.pdf,go.boarddocs.com,ri,7,soki
1324,GREENVILLE R-II,2913380,MISSOURI,https://go.boarddocs.com/mo/foxc6/Board.nsf/files/CLX4GL0B7F3E/$file/01%20MUSIC%20Member%20Map%202022.pdf,go.boarddocs.com,mo,7,foxc6
14042,Joliet PSD 86,1720580,ILLINOIS,https://go.boarddocs.com/il/joliet86/Board.nsf/Public,go.boarddocs.com,il,4,joliet86


In [117]:
# for cdn its the third argument
def func1(row):
    parts = get_url_parts(row)
    if len(parts) < 3:
        print(row)
        return None
    return parts[2]

boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com', "boarddocs_lea"] = boarddocs_df["url"].apply(func1)
boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com',].sample(5)

https://www.boarddocs.com/Home.nsf/blog


Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
15461,Da Vinci Schools,2600088,MICHIGAN,https://cdn.boarddocs.com/sites/fl/pcsfl/2010/Attachments/Attachment-2843.pdf,cdn.boarddocs.com,fl,6,pcsfl
16654,Judith Gap H S,3015390,MONTANA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Attachments/Attachment-7818.pdf,cdn.boarddocs.com,fl,6,pcsfl
13164,Judith Gap Elem,3015360,MONTANA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Attachments/Attachment-7818.pdf,cdn.boarddocs.com,fl,6,pcsfl
7231,Valley Academy for Career and Technology Education (79397),400341,ARIZONA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Agendas/Documents/Agenda-1045.pdf,cdn.boarddocs.com,fl,7,pcsfl
2046,St. Coletta Special Education PCS,1100064,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Agendas/Documents/Agenda-1291.pdf,cdn.boarddocs.com,fl,7,pcsfl


In [118]:
# the https://www.boarddocs.com/Home.nsf/blog is a flop
# delete that url
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "boarddocs_state"] = None
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "domain"] = None
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "boarddocs_lea"] = None
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "url"] = None

In [119]:
# get the states parsed and their counts as a dictionary
state_counts_dict = boarddocs_df["boarddocs_state"].value_counts().to_dict()
print(state_counts_dict)

{'ca': 2510, 'ny': 1356, 'oh': 1240, 'pa': 1201, 'il': 921, 'co': 842, 'tx': 826, 'mo': 725, 'mi': 720, 'wi': 679, 'vsba': 666, 'nj': 641, 'fl': 535, 'in': 466, 'la': 439, 'wa': 423, 'mabe': 348, 'ks': 325, 'ia': 278, 'ga': 250, 'nc': 246, 'nv': 216, 'az': 216, 'ma': 162, 'ut': 161, 'sc': 149, 'id': 148, 'mt': 141, 'ct': 134, 'wy': 125, 'mn': 123, 'tn': 122, 'ak': 117, 'fla': 109, 'ar': 101, 'va': 87, 'wv': 84, 'nm': 82, 'me': 75, 'nh': 69, 'ok': 62, 'or': 54, 'ri': 45, 'sd': 39, 'de': 35, 'md': 21, 'ms': 20, 'nd': 20, 'vt': 13, 'OH': 10, 'WA': 6, 'butler': 5, 'can': 5, 'psba': 4, 'wcps': 3, 'support': 2, 'genoa': 2, 'NJ': 2, 'MI': 2, 'CA': 1, 'FLA': 1, 'IL': 1, 'sufsd': 1, '': 1, 'dc': 1}


In [120]:
# here are ones that stood out
weird_state_names = ['vsba', 'mabe', 'fla', 'can', 'psba', 'butler', 'wcps', 'support', 'genoa', 'FLA','Home.nsf','sufsd']

In [121]:
pd.set_option('display.max_colwidth',None)

In [122]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='vsba',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
1295,Staunton City Public Schools,5103690,VIRGINIA,https://go.boarddocs.com/vsba/waynesboro/Board.nsf/files/D64LP6577715/$file/SAW%20School%20Boards%20UWSAW%20MOU.pdf,go.boarddocs.com,vsba,7,waynesboro
1329,NORFOLK PUBLIC SCHOOLS,3174430,NEBRASKA,https://www.boarddocs.com/vsba/nps/Board.nsf/goto?open&id=89476K57C109,www.boarddocs.com,vsba,4,nps
1347,Tazewell County Public Schools,5103810,VIRGINIA,https://go.boarddocs.com/vsba/roecnty/Board.nsf/files/C62MPK59A63D/$file/163-21b.xlsx,go.boarddocs.com,vsba,7,roecnty
1371,Cumberland County Public Schools,5101080,VIRGINIA,https://go.boarddocs.com/vsba/ccpsva/Board.nsf/Public,go.boarddocs.com,vsba,4,ccpsva
1373,Central State Hospital,5100045,VIRGINIA,https://go.boarddocs.com/vsba/pitpsva/Board.nsf/files/CL2QSS6ABFB1/$file/2-16-22%20Minutes%20-%20PCPS%20Board%20Regular%20Meeting%20(Approved%20and%20Signed).pdf,go.boarddocs.com,vsba,7,pitpsva


In [123]:
# vsba is ok, probably means Virginia School Board Association
weird_state_names.remove("vsba")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'mabe'

In [124]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='mabe',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
1274,Greater Brunswick Charter School,3400047,NEW JERSEY,https://www.boarddocs.com/mabe/fcps/Board.nsf/files/CAUQ5L65A834/$file/01.26.22%20MVMPCS%20Application%20for%20Expansion_backup%202.pdf,www.boarddocs.com,mabe,7,fcps
1286,Hopkins County,2102860,KENTUCKY,https://www.boarddocs.com/mabe/mcpsmd/Board.nsf/files/AX5HMA48778D/$file/Curr%20Review%20JHU%20Rpt%20Exec%20Summary.pdf,www.boarddocs.com,mabe,7,mcpsmd
1300,Denton H S,3008850,MONTANA,https://go.boarddocs.com/mabe/carps/Board.nsf/goto?open&id=BCESGR72A800,go.boarddocs.com,mabe,4,carps
1391,Kenton County,2103090,KENTUCKY,https://go.boarddocs.com/mabe/pgcps/Board.nsf/files/C82HSJ49E063/$file/CCAP%20-%20Zero%20Energy%20and%20Carbon%20Schools%20in%20MD.pdf,go.boarddocs.com,mabe,7,pgcps
1406,OAKLAND CRAIG PUBLIC SCHOOLS,3174640,NEBRASKA,https://www.boarddocs.com/mabe/mcpsmd/Board.nsf/files/BR9MQ25C1961/$file/Retirement%20MCPS%20Personnel.pdf,www.boarddocs.com,mabe,7,mcpsmd


In [125]:
# mabe is fine
# probably maryland board of education
weird_state_names.remove("mabe")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'fla'

In [126]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='fla',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
1722,Pine Ridge Elementary,630390,CALIFORNIA,https://go.boarddocs.com/fla/lake/Board.nsf/files/CNURQN680B02/$file/Board%20List%2002.13.2023.pdf,go.boarddocs.com,fla,7,lake
1841,CHAMPS - Charter HS of Arts-MULmedia & Perf DIST,601580,CALIFORNIA,https://www.boarddocs.com/fla/vcsfl/Board.nsf/files/A2DHTW48DBE4/$file/Application%20-%20Big%20Engine.pdf,www.boarddocs.com,fla,7,vcsfl
1845,Copeland,2005190,KANSAS,https://go.boarddocs.com/fla/vcsfl/Board.nsf/files/9FP3JQ717E5A/$file/Copeland%20Bio%2012-3-2013.pdf,go.boarddocs.com,fla,7,vcsfl
2227,Orange County Public Schools,5102820,VIRGINIA,https://go.boarddocs.com/fla/orcpsfl/Board.nsf/files/CETM9F588798/$file/2024-25%20OCPS%20District%20Calendar.pdf,go.boarddocs.com,fla,7,orcpsfl
2412,Moore Elem,3018870,MONTANA,https://go.boarddocs.com/fla/leon/Board.nsf/files/CNLJ894B2752/$file/WT%20Moore%20Phase%20III%20Documents.pdf,go.boarddocs.com,fla,7,leon


In [127]:
# fla is fine, they are florida schools
weird_state_names.remove("fla")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'can'

In [128]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
2343,CANADIAN VALLEY,4000052,OKLAHOMA,https://go.boarddocs.com/can/fvrl/Board.nsf/goto?open&id=BKMQMQ64A488,go.boarddocs.com,can,4,fvrl
7138,CANADIAN VALLEY,4000062,OKLAHOMA,https://go.boarddocs.com/can/fvrl/Board.nsf/goto?open&id=BKMQMQ64A488,go.boarddocs.com,can,4,fvrl
8371,Highwood K-12,3014070,MONTANA,https://go.boarddocs.com/can/fsd38/Board.nsf/files/CCDNCK5F4A28/$file/Foothills%20School%20Division%20Capital%20Plan%202022.pdf,go.boarddocs.com,can,7,fsd38
8609,THOUSAND ISLANDS CENTRAL SCHOOL DISTRICT,3607650,NEW YORK,https://go.boarddocs.com/can/ucdsb/Board.nsf/files/DAGM9L5A2A92/$file/PR432.1_AppendixD_VideoSurveillance_Locations_2024Oct.pdf,go.boarddocs.com,can,7,ucdsb
14251,LITTLE HEART 4,3811540,NORTH DAKOTA,https://go.boarddocs.com/can/fsd38/Board.nsf/files/CKAMNR59FAEE/$file/C%20I%20McLaren%20School%20Ed%20Plan.pdf,go.boarddocs.com,can,7,fsd38


In [129]:
# can is canadian schools
# we should remove them since we don't care about canadian schools in this project
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "boarddocs_lea"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "url"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "domain"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "boarddocs_state"] = None

In [130]:
weird_state_names.remove("can")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'psba'

In [131]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='psba',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
2210,Deer Lakes SD,4207540,PENNSYLVANIA,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/Public,go.boarddocs.com,psba,4,dlsdpa
5603,NURSERY ISD,4833270,TEXAS,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/files/CZDNJ260153D/$file/200-AR-1-Enrollment%20Form.pdf,go.boarddocs.com,psba,7,dlsdpa
9224,East Union Comm School District,1910350,IOWA,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/goto?open&id=CHZKKR52499D,go.boarddocs.com,psba,4,dlsdpa
10703,Ligonier Valley SD,4213710,PENNSYLVANIA,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/files/CZDR6M6BA9ED/$file/808%20-%20AR%20-%201.pdf,go.boarddocs.com,psba,7,dlsdpa


In [132]:
# psba probably means Pennsylvania School Boards Association
weird_state_names.remove("psba")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'butler'

In [133]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='butler',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
3126,TRI-COUNTY TECHNOLOGY CENTER,4000093,OKLAHOMA,https://www.boarddocs.com/butler/Board.nsf/legacy-content/85BMP21D4F62/$FILE/Facilities%20Strategic%20Plan.pdf,www.boarddocs.com,butler,6,Board.nsf
5905,Knox County ESC,3904781,OHIO,https://go.boarddocs.com/butler/Board.nsf/files/8HTP64630762/$file/March%2018%20Regular%20Meeting%20and%20March%2025%2C%201998%20Minutes.pdf,go.boarddocs.com,butler,6,Board.nsf
7534,Butler Technology & Career Development Schools,3905088,OHIO,https://www.boarddocs.com/butler/Board.nsf/goto?open&id=ABULPD570215,www.boarddocs.com,butler,3,Board.nsf
7992,WES WATKINS TECHNOLOGY CENTER,4000065,OKLAHOMA,https://go.boarddocs.com/butler/Board.nsf/files/8HESY56FFC3C/$file/October%2011%2C%202000%20Minutes.pdf,go.boarddocs.com,butler,6,Board.nsf
16375,Miami Valley Career Tech,3905128,OHIO,https://www.boarddocs.com/butler/Board.nsf/legacy-content/85BMP21D4F62/$FILE/Facilities%20Strategic%20Plan.pdf,www.boarddocs.com,butler,6,Board.nsf


In [134]:
# butler is fine. it is a school board association in ohio
weird_state_names.remove("butler")
weird_state_names

['wcps', 'support', 'genoa', 'FLA', 'Home.nsf', 'sufsd']

In [135]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='wcps',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
3620,Screven County,1304500,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87Y65T65066A/$FILE/Min021103.pdf,www.boarddocs.com,wcps,6,Board.nsf
8090,Northeast Georgia RESA,1300010,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,7,Board.nsf
14353,Heart of Georgia RESA,1300017,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,7,Board.nsf


In [136]:
# wcps is fine. Walton County Public Schools
weird_state_names.remove("wcps")
weird_state_names

['support', 'genoa', 'FLA', 'Home.nsf', 'sufsd']

In [137]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
3768,Carmen High School of Science and Technology Inc,5500076,WISCONSIN,https://go.boarddocs.com/support/plus/Board.nsf/files/CGAMSU5AD0EB/$file/Personnel%20Consent%20Agenda%206.21.22.pdf,go.boarddocs.com,support,7,plus
11825,Pickford Public Schools,2628020,MICHIGAN,https://go.boarddocs.com/support/plus/Board.nsf/files/CGAMSU5AD0EB/$file/Personnel%20Consent%20Agenda%206.21.22.pdf,go.boarddocs.com,support,7,plus


In [138]:
# support is a demo website
# remove it
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "boarddocs_lea"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "url"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "domain"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "boarddocs_state"] = None

In [139]:
weird_state_names.remove("support")
weird_state_names

['genoa', 'FLA', 'Home.nsf', 'sufsd']

In [140]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='genoa',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
6086,Genoa Kingston CUSD 424,1716410,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/Public,go.boarddocs.com,genoa,3,Board.nsf
8721,Kishwaukee Educ Consortium,1700258,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/legacy-content/85WJVP121BB0/$FILE/082206%20%20%20Board%20of%20Education%20Minutes.pdf,go.boarddocs.com,genoa,6,Board.nsf


In [141]:
# genoa points towards a school board in Genoa, IL
# ok
weird_state_names.remove("genoa")
weird_state_names

['FLA', 'Home.nsf', 'sufsd']

In [142]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='FLA',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
5389,North Wildwood School District,3411670,NEW JERSEY,https://go.boarddocs.com/FLA/scsfl/Board.nsf/files/9P9LB4559728/$file/2014%2015%20Work%20Program.pdf,go.boarddocs.com,FLA,7,scsfl


In [143]:
# FLA is ok, florida
weird_state_names.remove("FLA")
weird_state_names

['Home.nsf', 'sufsd']

In [144]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='Home.nsf',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea


In [145]:
# probably previously removed
weird_state_names.remove("Home.nsf")
weird_state_names

['sufsd']

In [146]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='sufsd',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
13641,SPACKENKILL UNION FREE SCHOOL DISTRICT,3606500,NEW YORK,https://go.boarddocs.com/sufsd/Board.nsf/Public,go.boarddocs.com,sufsd,3,Board.nsf


In [147]:
# Spackenkill Union Free School District in New York
# OK

In [148]:
# we will now standardize the URLs to go.boarddocs.com/XX/XXXX/Board.nsf/Public for scrapping
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
13265,Bartonville SD 66,1705190,ILLINOIS,https://www.boarddocs.com/tx/disd/Board.nsf/files/BF8SS47220D0/$file/Joyce%20Foreman%207.41j%20Response.pdf,www.boarddocs.com,tx,7,disd
15868,Piedmont City Unified,630330,CALIFORNIA,https://go.boarddocs.com/ca/orinda/Board.nsf/files/BHFSDT72392E/$file/2018-2019%20CAASPP%20presentation%20for%20OUSD%20Board%20Meeting.pdf,go.boarddocs.com,ca,7,orinda
1485,Long Valley District,602296,CALIFORNIA,https://go.boarddocs.com/pa/cmdvsd/Board.nsf/files/9G9NA8559C65/$file/CVPD%20Long%20Range%20Lease%20Figure%20Out%205%20Year%20Plan%20(lease).pdf,go.boarddocs.com,pa,7,cmdvsd
10342,Little Falls Community Schools,2718240,MINNESOTA,https://go.boarddocs.com/vsba/fccpsva/Board.nsf/files/BZWTNG782D8D/$file/FCCPS%20HIGH%20SCHOOL%20RENAMING%20COMMITTEE%20FINAL%20REPORT.pdf,go.boarddocs.com,vsba,7,fccpsva
8820,Monte Rio Union Elementary,625410,CALIFORNIA,https://go.boarddocs.com/ca/santarosa/Board.nsf/files/ADML2Z54536F/$file/SRJC%20Option%20A.pdf,go.boarddocs.com,ca,7,santarosa


In [149]:
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea
110,Bell Canyon Charter School Inc (79983),400374,ARIZONA,,,,3,
4610,Sacramento County ROP,600095,CALIFORNIA,https://go.boarddocs.com/ca/cottonwood/Board.nsf/files/CE9VK98114C9/$file/ROP%20MOU%202022%20Participating%20District.pdf,go.boarddocs.com,ca,7,cottonwood
13699,McDowell County Schools,3702940,NORTH CAROLINA,https://go.boarddocs.com/wv/preston/Board.nsf/files/D98TTC78E4EB/$file/News%20Story%20on%20Friends%20with%20Paws%20Program.pdf,go.boarddocs.com,wv,7,preston
5695,Principle Academy Charter School,3400783,NEW JERSEY,https://go.boarddocs.com/ak/matsu/Board.nsf/files/CCXTS378B45E/$file/The%20Charter%20AMENDMENT%20WITH%20ADDENDUMS%203.20.22.pdf,go.boarddocs.com,ak,7,matsu
7623,GREENE COUNTY TECH SCHOOL DISTRICT,513080,ARKANSAS,https://go.boarddocs.com/ny/green/Board.nsf/files/CSZGE443640F/$file/23-24%20Greenville%20District%20Wide%20Plan%20-%20Final.pdf,go.boarddocs.com,ny,7,green


In [150]:
boarddocs_df.loc[~boarddocs_df["boarddocs_state"].isna(), "query_url"] = "https://go.boarddocs.com/" + boarddocs_df["boarddocs_state"] + '/' + boarddocs_df["boarddocs_lea"] + "/Board.nsf/Public"

In [151]:
# informed by later work in `visit_boarddocs.ipynb`
# we look at those with boarddocs_lea set to Board.nsf
boarddocs_df.loc[boarddocs_df['boarddocs_lea'] == "Board.nsf"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea,query_url
3126,TRI-COUNTY TECHNOLOGY CENTER,4000093,OKLAHOMA,https://www.boarddocs.com/butler/Board.nsf/legacy-content/85BMP21D4F62/$FILE/Facilities%20Strategic%20Plan.pdf,www.boarddocs.com,butler,6,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
3620,Screven County,1304500,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87Y65T65066A/$FILE/Min021103.pdf,www.boarddocs.com,wcps,6,Board.nsf,https://go.boarddocs.com/wcps/Board.nsf/Board.nsf/Public
5905,Knox County ESC,3904781,OHIO,https://go.boarddocs.com/butler/Board.nsf/files/8HTP64630762/$file/March%2018%20Regular%20Meeting%20and%20March%2025%2C%201998%20Minutes.pdf,go.boarddocs.com,butler,6,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
6086,Genoa Kingston CUSD 424,1716410,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/Public,go.boarddocs.com,genoa,3,Board.nsf,https://go.boarddocs.com/genoa/Board.nsf/Board.nsf/Public
7534,Butler Technology & Career Development Schools,3905088,OHIO,https://www.boarddocs.com/butler/Board.nsf/goto?open&id=ABULPD570215,www.boarddocs.com,butler,3,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
7992,WES WATKINS TECHNOLOGY CENTER,4000065,OKLAHOMA,https://go.boarddocs.com/butler/Board.nsf/files/8HESY56FFC3C/$file/October%2011%2C%202000%20Minutes.pdf,go.boarddocs.com,butler,6,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
8090,Northeast Georgia RESA,1300010,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,7,Board.nsf,https://go.boarddocs.com/wcps/Board.nsf/Board.nsf/Public
8721,Kishwaukee Educ Consortium,1700258,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/legacy-content/85WJVP121BB0/$FILE/082206%20%20%20Board%20of%20Education%20Minutes.pdf,go.boarddocs.com,genoa,6,Board.nsf,https://go.boarddocs.com/genoa/Board.nsf/Board.nsf/Public
13641,SPACKENKILL UNION FREE SCHOOL DISTRICT,3606500,NEW YORK,https://go.boarddocs.com/sufsd/Board.nsf/Public,go.boarddocs.com,sufsd,3,Board.nsf,https://go.boarddocs.com/sufsd/Board.nsf/Board.nsf/Public
14353,Heart of Georgia RESA,1300017,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,7,Board.nsf,https://go.boarddocs.com/wcps/Board.nsf/Board.nsf/Public


In [152]:
# in these cases we omit the second title
boarddocs_df.loc[boarddocs_df['boarddocs_lea'] == "Board.nsf", "query_url"] = "https://go.boarddocs.com/" + boarddocs_df["boarddocs_state"] + "/Board.nsf/Public"

In [153]:
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea,query_url
18170,Plainville,2010740,KANSAS,https://www.boarddocs.com/ma/plainville/Board.nsf/vpublic?open,www.boarddocs.com,ma,4,plainville,https://go.boarddocs.com/ma/plainville/Board.nsf/Public
13049,Providence,4400900,RHODE ISLAND,https://go.boarddocs.com/ca/vcoe/Board.nsf/files/CN92JX02757A/$file/J4%20Providence%20SARC%202022.pdf,go.boarddocs.com,ca,7,vcoe,https://go.boarddocs.com/ca/vcoe/Board.nsf/Public
16365,Magnolia Union Elementary,623460,CALIFORNIA,https://go.boarddocs.com/ca/icoe/Board.nsf/files/BQE2H602366A/$file/Imperial%20CountySELPA.Attachments2020.2021ASP.ABP.pdf,go.boarddocs.com,ca,7,icoe,https://go.boarddocs.com/ca/icoe/Board.nsf/Public
18082,Monroe Central School Corp,1807080,INDIANA,https://go.boarddocs.com/in/monro/Board.nsf/Public?open&id=policies,go.boarddocs.com,in,4,monro,https://go.boarddocs.com/in/monro/Board.nsf/Public
7858,ZUMBRO EDUCATION DISTRICT,2700054,MINNESOTA,https://go.boarddocs.com/mn/rps535/Board.nsf/files/BGJJSC4E7733/$file/Attachment%20A%20-%20Review%20and%20Comment%208-19-19.pdf,go.boarddocs.com,mn,7,rps535,https://go.boarddocs.com/mn/rps535/Board.nsf/Public


In [154]:
# percent of duplicated queries
boarddocs_df['query_url'].duplicated().sum() / boarddocs_df.shape[0] * 100

np.float64(80.10897794978867)

In [155]:
# code from here is informed by later work at `visit_boarddocs.ipynb`

boarddocs_df[boarddocs_df["query_url"] == "https://go.boarddocs.com/mi/wpas/Board.nsf/Public"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea,query_url
2635,Whittemore-Prescott Area Schools,2636390,MICHIGAN,https://go.boarddocs.com/mi/wpas/Board.nsf/goto?open&id=C5JRT75DE77E,go.boarddocs.com,mi,4,wpas,https://go.boarddocs.com/mi/wpas/Board.nsf/Public


In [156]:
boarddocs_df[boarddocs_df["query_url"] == "https://go.boarddocs.com/il/tfd215/Board.nsf/Public"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea,query_url
15671,Thornton School District,3306660,NEW HAMPSHIRE,https://go.boarddocs.com/il/tfd215/Board.nsf/goto?open&id=CDLHZS4AED7D,go.boarddocs.com,il,4,tfd215,https://go.boarddocs.com/il/tfd215/Board.nsf/Public


In [157]:
boarddocs_df[boarddocs_df["query_url"] == "https://go.boarddocs.com/mi/oxf/Board.nsf/Public"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,num_url_parts,boarddocs_lea,query_url
10701,Oxford Community Schools,2627240,MICHIGAN,https://go.boarddocs.com/mi/oxf/Board.nsf/Public,go.boarddocs.com,mi,4,oxf,https://go.boarddocs.com/mi/oxf/Board.nsf/Public


In [158]:
# choose useful columns and save it
boarddocs_df = boarddocs_df.loc[:, ['LEA_NAME','LEAID','STATENAME','query_url']]
output_filename = "boarddocs_url_cleaned.csv"
boarddocs_df.to_csv(output_filename, index=False)