This notebook visits the scraped links and get the name and address fields.

Input:
- `working_school_districts_with_boarddocs_scraped.csv`

Output:
- `boarddocs_url_cleaned.csv`

In [105]:
import pandas as pd

input_filename = "working_school_districts_with_boarddocs_scraped.csv"
boarddocs_df = pd.read_csv(input_filename)
boarddocs_df.sample(10)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url
8959,Vista Condor Global Academy District,601760,CALIFORNIA,https://go.boarddocs.com/ca/rcoe/Board.nsf/files/D2U2PK0326F5/$file/RCOE%20Staff%20Report-Vista%20Lago%20Global%20Academy.pdf
19208,Holyoke,2506270,MASSACHUSETTS,https://go.boarddocs.com/co/cde/Board.nsf/files/9WSAFU6689EC/$file/Holyoke%20Innovation%20Application.Final.pdf
7838,North Orange County ROP JPA,601364,CALIFORNIA,https://go.boarddocs.com/ca/tcrop/Board.nsf/goto?open&id=BGCVDZ80512F
6980,EDUCATIONAL SERVICE UNIT 02,3180020,NEBRASKA,https://go.boarddocs.com/co/cde/Board.nsf/files/CB7SXS736699/$file/1%20CCR%20301-8%20Clean%20Version.pdf
18292,LACLEDE CO. R-I,2910110,MISSOURI,https://go.boarddocs.com/mo/foxc6/Board.nsf/files/CLX4GL0B7F3E/$file/01%20MUSIC%20Member%20Map%202022.pdf
11165,Enterprise Academy/Newport News City,5100073,VIRGINIA,https://go.boarddocs.com/vsba/nnps/Board.nsf/legacy-content/8963QR76FDA8/$FILE/Regular%20Session%2005.21.07.doc
13837,Hardin County CUSD 1,1718200,ILLINOIS,https://go.boarddocs.com/il/oths/Board.nsf/files/DAXJKQ4D7813/$file/2024.11%202024%20Tax%20Levy%20Packet.pdf
4807,Pioneer School District,5306750,WASHINGTON,https://go.boarddocs.com/ny/pioneerschools/Board.nsf/Public
17430,High Tech High Media Arts District,601515,CALIFORNIA,"https://www.boarddocs.com/ca/sandi/Board.nsf/files/AJ7VB87F6071/$file/Attach%202%20Disclosure%20of%20District%20Investments%20Ending%20December%2031,%202016.pdf"
225,Yavapai County Juvenile Justice Center (79533),400460,ARIZONA,


In [106]:
# get the url
urls = set(boarddocs_df["url"])

In [107]:
# number of urls
len(urls)

9472

In [108]:
from urllib.parse import urlparse

def extract_domain(url):
    try:
        domain = urlparse(url).netloc
        return domain
    except Exception as e:
        return None

# Example usage:
domains = set([extract_domain(url) for url in urls])
domains

{None, 'cdn.boarddocs.com', 'go.boarddocs.com', 'www.boarddocs.com'}

In [109]:
# let's check out the cdn
boarddocs_df['domain'] = boarddocs_df["url"].apply(extract_domain)

In [110]:
# Set the option to display all columns
pd.set_option('display.max_columns', None)

# (Optional) Set the maximum width of each column to prevent truncation
pd.set_option('display.max_colwidth', None)

In [111]:
boarddocs_df[boarddocs_df['domain'] == "cdn.boarddocs.com"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain
4874,Shonto Preparatory School,5900128,BUREAU OF INDIAN EDUCATION,https://cdn.boarddocs.com/sites/fl/pcsfl/2010/Attachments/Attachment-3349.pdf,cdn.boarddocs.com
5598,St. Coletta Special Education PCS,1100064,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Agendas/Documents/Agenda-1291.pdf,cdn.boarddocs.com
5704,Cedar Tree Academy PCS,1100029,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Attachments/Attachment-7964.pdf,cdn.boarddocs.com
6357,TEACH Prep Mildred S. Cunningham & Edith H. Morris Elem DIST,601575,CALIFORNIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-2245.pdf,cdn.boarddocs.com
6389,Imagine Prep Coolidge Inc. (90034),400769,ARIZONA,https://cdn.boarddocs.com/sites/fl/pcsfl/2010/Attachments/Attachment-3349.pdf,cdn.boarddocs.com
6905,Skyview School Inc. (4496),400093,ARIZONA,https://cdn.boarddocs.com/sites/fl/pcsfl/2014/Resolutions/ResolutionsByMeeting.html,cdn.boarddocs.com
9603,HARBOR SCIENCE AND ARTS CHARTER SCHOOL,3600047,NEW YORK,https://cdn.boarddocs.com/sites/fl/pcsfl/2014/Attachments/Attachment-5954.pdf,cdn.boarddocs.com
9623,STOREFRONT ACADEMY CHARTER SCHOOL,3601137,NEW YORK,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Attachments/Attachment-7964.pdf,cdn.boarddocs.com
10189,Curlew School District,5301890,WASHINGTON,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-1556.pdf,cdn.boarddocs.com
10783,Valley Academy for Career and Technology Education (79397),400341,ARIZONA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Agendas/Documents/Agenda-1045.pdf,cdn.boarddocs.com


In [112]:
# we will keep the cdn in mind and parse the state and lea agency after
boarddocs_df[boarddocs_df['domain'] == "www.boarddocs.com"].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain
4794,Dallas County,101110,ALABAMA,https://www.boarddocs.com/tx/disd/Board.nsf/files/BFQT4W707EF4/$file/Resolution%20Approving%20Dallas%20County%20Elections%20Voting%20Equipment.pdf,www.boarddocs.com
4796,Stanislaus County Office of Education,691041,CALIFORNIA,https://www.boarddocs.com/ca/sccoe/Board.nsf/files/BPMMV65CCD1E/$file/CSBA_PGSBrochure_County.pdf,www.boarddocs.com
4808,Rootstown Local,3904921,OHIO,https://www.boarddocs.com/oh/root/Board.nsf/Public?open&id=policies,www.boarddocs.com
4813,Griffin RESA,1300012,GEORGIA,https://www.boarddocs.com/ga/fcss/Board.nsf/files/8TFTCR76C17B/%24file/CCGPS%20Revised%20Board%20presentationFINAL.pdf,www.boarddocs.com
4814,CONNETQUOT CENTRAL SCHOOL DISTRICT,3608160,NEW YORK,https://www.boarddocs.com/ny/ccsdli/Board.nsf/vpublic?open,www.boarddocs.com


In [113]:
boarddocs_df[boarddocs_df['domain'] == "go.boarddocs.com"].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain
4774,RICHFIELD PUBLIC SCHOOL DISTRICT,2731750,MINNESOTA,https://go.boarddocs.com/ar/nlrsd/Board.nsf/files/CDDUUN7DC4F2/$file/2019-07-11%20Special%20Meeting%20Agenda.pdf,go.boarddocs.com
4775,Impact | Puget Sound Elementary,5300345,WASHINGTON,https://go.boarddocs.com/nc/cart/Board.nsf/files/AY4GCU433316/$file/BSES_Items%20for%20Discussion%20with%20Board%20of%20Education-May%202018.pdf,go.boarddocs.com
4776,Coatesville Area SD,4206240,PENNSYLVANIA,https://go.boarddocs.com/pa/coat/Board.nsf/vpublic?open,go.boarddocs.com
4777,DONIPHAN R-I,2910920,MISSOURI,https://go.boarddocs.com/mo/foxc6/Board.nsf/files/CLX4GL0B7F3E/$file/01%20MUSIC%20Member%20Map%202022.pdf,go.boarddocs.com
4778,Rapides Parish,2201290,LOUISIANA,https://go.boarddocs.com/la/rpsb/Board.nsf/vpublic?open,go.boarddocs.com


In [114]:
def get_url_parts(url):
    try:
        path = urlparse(url).path
        parts = path.strip('/').split('/')
        return parts
    except Exception as e:
        return [None, None, None]


boarddocs_df["boarddocs_state"] = boarddocs_df["url"].apply(lambda row: get_url_parts(row)[0])
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state
12756,LEANDER ISD,4827030,TEXAS,https://www.boarddocs.com/tx/rrisd/Board.nsf/files/AM9K224FA451/$file/D2.Board%20Meeting%20051117_HRS%20slides.pdf,www.boarddocs.com,tx
6911,Dept of Corrections-Youth,3000091,MONTANA,https://go.boarddocs.com/me/sport/Board.nsf/files/CC7JP24E131C/$file/LD%20756%20Morales%20amend%203-3.pdf,go.boarddocs.com,me
11736,Canton College Preparatory School,3901470,OHIO,https://go.boarddocs.com/co/cde/Board.nsf/files/D3U8SD1FB028/$file/Appendices%20REVISED%20Submission%20with%20highlighted%20changes.pdf,go.boarddocs.com,co
2411,LA CRESCENT-HOKAH SCHOOL DISTRICT,2717250,MINNESOTA,,,
3626,Northwood Local Schools,3905071,OHIO,,,


In [115]:
# for cdn its the second argument
boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com', "boarddocs_state"] = boarddocs_df["url"].apply(lambda row: get_url_parts(row)[1])
boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com',].sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state
5704,Cedar Tree Academy PCS,1100029,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Attachments/Attachment-7964.pdf,cdn.boarddocs.com,fl
13562,Riverside SD,4223250,PENNSYLVANIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-1900.pdf,cdn.boarddocs.com,fl
19013,Da Vinci Schools,2600088,MICHIGAN,https://cdn.boarddocs.com/sites/fl/pcsfl/2010/Attachments/Attachment-2843.pdf,cdn.boarddocs.com,fl
12430,Ridgecrest Elem Acad for Language Music and Sci District,601931,CALIFORNIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2012/Attachments/Attachment-5174.pdf,cdn.boarddocs.com,fl
17443,Paul R Brown Leadership Academy,3700351,NORTH CAROLINA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-1880.pdf,cdn.boarddocs.com,fl


In [116]:
boarddocs_df["boarddocs_lea"] = boarddocs_df["url"].apply(lambda row: get_url_parts(row)[1])
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
9718,TWO DIMENSIONS PREPARATORY ACADEMY,4800069,TEXAS,https://go.boarddocs.com/pa/wood/Board.nsf/files/CBMJ3S4B1BF6/$file/Curriculum%20Guide%2022-23.DRAFT.v.3.pdf,go.boarddocs.com,pa,wood
10092,California Virtual Academy at Fresno District,601616,CALIFORNIA,https://go.boarddocs.com/ca/socccd/Board.nsf/files/CF7PST6547A0/$file/EXH%20A%20SOCCCD%20SC%20CCAP%20SVUSD%20Appendix%20Fall%202022.pdf,go.boarddocs.com,ca,socccd
9655,South Shore School District,5512030,WISCONSIN,https://go.boarddocs.com/wi/sshore/Board.nsf/Public,go.boarddocs.com,wi,sshore
3284,SCHUYLER-STEUBEN-CHEMUNG-TIOGA-ALLEGANY BOCES,3680780,NEW YORK,,,,
17217,S AND S CISD,4838400,TEXAS,https://go.boarddocs.com/ca/smusd/Board.nsf/files/CKUNSK615503/$file/Calendar%20%20Draft%202024-2025.pdf,go.boarddocs.com,ca,smusd


In [117]:
# for cdn its the third argument
def func1(row):
    parts = get_url_parts(row)
    if len(parts) < 3:
        print(row)
        return None
    return parts[2]

boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com', "boarddocs_lea"] = boarddocs_df["url"].apply(func1)
boarddocs_df.loc[boarddocs_df['domain']=='cdn.boarddocs.com',].sample(5)

https://www.boarddocs.com/Home.nsf/blog


Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
5598,St. Coletta Special Education PCS,1100064,DISTRICT OF COLUMBIA,https://cdn.boarddocs.com/sites/fl/pcsfl/2017/Agendas/Documents/Agenda-1291.pdf,cdn.boarddocs.com,fl,pcsfl
17443,Paul R Brown Leadership Academy,3700351,NORTH CAROLINA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-1880.pdf,cdn.boarddocs.com,fl,pcsfl
12907,Ann Jerkins-Harris Academy of Excellence,3901502,OHIO,https://cdn.boarddocs.com/sites/fl/pcsfl/2016/Attachments/Attachment-7580.pdf,cdn.boarddocs.com,fl,pcsfl
14550,SOUTHSIDE ACADEMY CHARTER SCHOOL,3600070,NEW YORK,https://cdn.boarddocs.com/sites/fl/pcsfl/2013/Media/2511/VideoMain-2511.html,cdn.boarddocs.com,fl,pcsfl
18644,Northeast Dubois Co Sch Corp,1808120,INDIANA,https://cdn.boarddocs.com/sites/fl/pcsfl/2009/Attachments/Attachment-1675.pdf,cdn.boarddocs.com,fl,pcsfl


In [118]:
# the https://www.boarddocs.com/Home.nsf/blog is a flop
# delete that url
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "boarddocs_state"] = None
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "domain"] = None
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "boarddocs_lea"] = None
boarddocs_df.loc[boarddocs_df['url']=='https://www.boarddocs.com/Home.nsf/blog', "url"] = None

In [119]:
# get the states parsed and their counts as a dictionary
state_counts_dict = boarddocs_df["boarddocs_state"].value_counts().to_dict()
print(state_counts_dict)

{'ca': 2046, 'ny': 1077, 'oh': 1000, 'pa': 976, 'il': 749, 'co': 685, 'tx': 679, 'mo': 589, 'mi': 572, 'wi': 556, 'vsba': 545, 'nj': 505, 'fl': 421, 'in': 360, 'la': 358, 'wa': 342, 'mabe': 285, 'ks': 273, 'ia': 218, 'ga': 211, 'nc': 186, 'az': 174, 'nv': 173, 'ut': 131, 'ma': 130, 'id': 127, 'mt': 116, 'sc': 111, 'wy': 102, 'ct': 101, 'tn': 100, 'mn': 99, 'ak': 92, 'fla': 85, 'ar': 82, 'va': 69, 'wv': 67, 'nm': 67, 'me': 60, 'nh': 57, 'ok': 50, 'or': 42, 'ri': 39, 'sd': 32, 'de': 29, 'ms': 15, 'nd': 15, 'md': 14, 'OH': 10, 'vt': 8, 'WA': 5, 'can': 5, 'butler': 4, 'psba': 4, 'wcps': 3, 'support': 2, 'genoa': 2, 'MI': 2, 'CA': 1, 'FLA': 1, 'NJ': 1, 'IL': 1, 'sufsd': 1}


In [120]:
# here are ones that stood out
weird_state_names = ['vsba', 'mabe', 'fla', 'can', 'psba', 'butler', 'wcps', 'support', 'genoa', 'FLA','Home.nsf','sufsd']

In [121]:
pd.set_option('display.max_colwidth',None)

In [122]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='vsba',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
4847,Staunton City Public Schools,5103690,VIRGINIA,https://go.boarddocs.com/vsba/waynesboro/Board.nsf/files/D64LP6577715/$file/SAW%20School%20Boards%20UWSAW%20MOU.pdf,go.boarddocs.com,vsba,waynesboro
4881,NORFOLK PUBLIC SCHOOLS,3174430,NEBRASKA,https://www.boarddocs.com/vsba/nps/Board.nsf/goto?open&id=89476K57C109,www.boarddocs.com,vsba,nps
4899,Tazewell County Public Schools,5103810,VIRGINIA,https://go.boarddocs.com/vsba/roecnty/Board.nsf/files/C62MPK59A63D/$file/163-21b.xlsx,go.boarddocs.com,vsba,roecnty
4923,Cumberland County Public Schools,5101080,VIRGINIA,https://go.boarddocs.com/vsba/ccpsva/Board.nsf/Public,go.boarddocs.com,vsba,ccpsva
4925,Central State Hospital,5100045,VIRGINIA,https://go.boarddocs.com/vsba/pitpsva/Board.nsf/files/CL2QSS6ABFB1/$file/2-16-22%20Minutes%20-%20PCPS%20Board%20Regular%20Meeting%20(Approved%20and%20Signed).pdf,go.boarddocs.com,vsba,pitpsva


In [123]:
# vsba is ok, probably means Virginia School Board Association
weird_state_names.remove("vsba")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'mabe'

In [124]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='mabe',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
4826,Greater Brunswick Charter School,3400047,NEW JERSEY,https://www.boarddocs.com/mabe/fcps/Board.nsf/files/CAUQ5L65A834/$file/01.26.22%20MVMPCS%20Application%20for%20Expansion_backup%202.pdf,www.boarddocs.com,mabe,fcps
4838,Hopkins County,2102860,KENTUCKY,https://www.boarddocs.com/mabe/mcpsmd/Board.nsf/files/AX5HMA48778D/$file/Curr%20Review%20JHU%20Rpt%20Exec%20Summary.pdf,www.boarddocs.com,mabe,mcpsmd
4852,Denton H S,3008850,MONTANA,https://go.boarddocs.com/mabe/carps/Board.nsf/goto?open&id=BCESGR72A800,go.boarddocs.com,mabe,carps
4943,Kenton County,2103090,KENTUCKY,https://go.boarddocs.com/mabe/pgcps/Board.nsf/files/C82HSJ49E063/$file/CCAP%20-%20Zero%20Energy%20and%20Carbon%20Schools%20in%20MD.pdf,go.boarddocs.com,mabe,pgcps
4958,OAKLAND CRAIG PUBLIC SCHOOLS,3174640,NEBRASKA,https://www.boarddocs.com/mabe/mcpsmd/Board.nsf/files/BR9MQ25C1961/$file/Retirement%20MCPS%20Personnel.pdf,www.boarddocs.com,mabe,mcpsmd


In [125]:
# mabe is fine
# probably maryland board of education
weird_state_names.remove("mabe")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'fla'

In [126]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='fla',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
5274,Pine Ridge Elementary,630390,CALIFORNIA,https://go.boarddocs.com/fla/lake/Board.nsf/files/CNURQN680B02/$file/Board%20List%2002.13.2023.pdf,go.boarddocs.com,fla,lake
5393,CHAMPS - Charter HS of Arts-MULmedia & Perf DIST,601580,CALIFORNIA,https://www.boarddocs.com/fla/vcsfl/Board.nsf/files/A2DHTW48DBE4/$file/Application%20-%20Big%20Engine.pdf,www.boarddocs.com,fla,vcsfl
5397,Copeland,2005190,KANSAS,https://go.boarddocs.com/fla/vcsfl/Board.nsf/files/9FP3JQ717E5A/$file/Copeland%20Bio%2012-3-2013.pdf,go.boarddocs.com,fla,vcsfl
5779,Orange County Public Schools,5102820,VIRGINIA,https://go.boarddocs.com/fla/orcpsfl/Board.nsf/files/CETM9F588798/$file/2024-25%20OCPS%20District%20Calendar.pdf,go.boarddocs.com,fla,orcpsfl
5964,Moore Elem,3018870,MONTANA,https://go.boarddocs.com/fla/leon/Board.nsf/files/CNLJ894B2752/$file/WT%20Moore%20Phase%20III%20Documents.pdf,go.boarddocs.com,fla,leon


In [127]:
# fla is fine, they are florida schools
weird_state_names.remove("fla")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'can'

In [128]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
5895,CANADIAN VALLEY,4000052,OKLAHOMA,https://go.boarddocs.com/can/fvrl/Board.nsf/goto?open&id=BKMQMQ64A488,go.boarddocs.com,can,fvrl
10690,CANADIAN VALLEY,4000062,OKLAHOMA,https://go.boarddocs.com/can/fvrl/Board.nsf/goto?open&id=BKMQMQ64A488,go.boarddocs.com,can,fvrl
11923,Highwood K-12,3014070,MONTANA,https://go.boarddocs.com/can/fsd38/Board.nsf/files/CCDNCK5F4A28/$file/Foothills%20School%20Division%20Capital%20Plan%202022.pdf,go.boarddocs.com,can,fsd38
12161,THOUSAND ISLANDS CENTRAL SCHOOL DISTRICT,3607650,NEW YORK,https://go.boarddocs.com/can/ucdsb/Board.nsf/files/DAGM9L5A2A92/$file/PR432.1_AppendixD_VideoSurveillance_Locations_2024Oct.pdf,go.boarddocs.com,can,ucdsb
17803,LITTLE HEART 4,3811540,NORTH DAKOTA,https://go.boarddocs.com/can/fsd38/Board.nsf/files/CKAMNR59FAEE/$file/C%20I%20McLaren%20School%20Ed%20Plan.pdf,go.boarddocs.com,can,fsd38


In [129]:
# can is canadian schools
# we should remove them since we don't care about canadian schools in this project
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "boarddocs_lea"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "url"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "domain"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='can', "boarddocs_state"] = None

In [130]:
weird_state_names.remove("can")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'psba'

In [131]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='psba',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
5762,Deer Lakes SD,4207540,PENNSYLVANIA,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/Public,go.boarddocs.com,psba,dlsdpa
9155,NURSERY ISD,4833270,TEXAS,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/files/CZDNJ260153D/$file/200-AR-1-Enrollment%20Form.pdf,go.boarddocs.com,psba,dlsdpa
12776,East Union Comm School District,1910350,IOWA,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/goto?open&id=CHZKKR52499D,go.boarddocs.com,psba,dlsdpa
14255,Ligonier Valley SD,4213710,PENNSYLVANIA,https://go.boarddocs.com/psba/dlsdpa/Board.nsf/files/CZDR6M6BA9ED/$file/808%20-%20AR%20-%201.pdf,go.boarddocs.com,psba,dlsdpa


In [132]:
# psba probably means Pennsylvania School Boards Association
weird_state_names.remove("psba")
next_weird_state_name = weird_state_names[0]
next_weird_state_name

'butler'

In [133]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='butler',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
6678,TRI-COUNTY TECHNOLOGY CENTER,4000093,OKLAHOMA,https://www.boarddocs.com/butler/Board.nsf/legacy-content/85BMP21D4F62/$FILE/Facilities%20Strategic%20Plan.pdf,www.boarddocs.com,butler,Board.nsf
9457,Knox County ESC,3904781,OHIO,https://go.boarddocs.com/butler/Board.nsf/files/8HTP64630762/$file/March%2018%20Regular%20Meeting%20and%20March%2025%2C%201998%20Minutes.pdf,go.boarddocs.com,butler,Board.nsf
11086,Butler Technology & Career Development Schools,3905088,OHIO,https://www.boarddocs.com/butler/Board.nsf/goto?open&id=ABULPD570215,www.boarddocs.com,butler,Board.nsf
11544,WES WATKINS TECHNOLOGY CENTER,4000065,OKLAHOMA,https://go.boarddocs.com/butler/Board.nsf/files/8HESY56FFC3C/$file/October%2011%2C%202000%20Minutes.pdf,go.boarddocs.com,butler,Board.nsf


In [134]:
# butler is fine. it is a school board association in ohio
weird_state_names.remove("butler")
weird_state_names

['wcps', 'support', 'genoa', 'FLA', 'Home.nsf', 'sufsd']

In [135]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='wcps',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
7172,Screven County,1304500,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87Y65T65066A/$FILE/Min021103.pdf,www.boarddocs.com,wcps,Board.nsf
11642,Northeast Georgia RESA,1300010,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,Board.nsf
17905,Heart of Georgia RESA,1300017,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,Board.nsf


In [136]:
# wcps is fine. Walton County Public Schools
weird_state_names.remove("wcps")
weird_state_names

['support', 'genoa', 'FLA', 'Home.nsf', 'sufsd']

In [137]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
7320,Carmen High School of Science and Technology Inc,5500076,WISCONSIN,https://go.boarddocs.com/support/plus/Board.nsf/files/CGAMSU5AD0EB/$file/Personnel%20Consent%20Agenda%206.21.22.pdf,go.boarddocs.com,support,plus
15377,Pickford Public Schools,2628020,MICHIGAN,https://go.boarddocs.com/support/plus/Board.nsf/files/CGAMSU5AD0EB/$file/Personnel%20Consent%20Agenda%206.21.22.pdf,go.boarddocs.com,support,plus


In [138]:
# support is a demo website
# remove it
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "boarddocs_lea"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "url"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "domain"] = None
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='support', "boarddocs_state"] = None

In [139]:
weird_state_names.remove("support")
weird_state_names

['genoa', 'FLA', 'Home.nsf', 'sufsd']

In [140]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='genoa',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
9638,Genoa Kingston CUSD 424,1716410,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/Public,go.boarddocs.com,genoa,Board.nsf
12273,Kishwaukee Educ Consortium,1700258,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/legacy-content/85WJVP121BB0/$FILE/082206%20%20%20Board%20of%20Education%20Minutes.pdf,go.boarddocs.com,genoa,Board.nsf


In [141]:
# genoa points towards a school board in Genoa, IL
# ok
weird_state_names.remove("genoa")
weird_state_names

['FLA', 'Home.nsf', 'sufsd']

In [142]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='FLA',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
8941,North Wildwood School District,3411670,NEW JERSEY,https://go.boarddocs.com/FLA/scsfl/Board.nsf/files/9P9LB4559728/$file/2014%2015%20Work%20Program.pdf,go.boarddocs.com,FLA,scsfl


In [143]:
# FLA is ok, florida
weird_state_names.remove("FLA")
weird_state_names

['Home.nsf', 'sufsd']

In [144]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='Home.nsf',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea


In [145]:
# probably previously removed
weird_state_names.remove("Home.nsf")
weird_state_names

['sufsd']

In [146]:
boarddocs_df.loc[boarddocs_df['boarddocs_state']=='sufsd',].head()

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
17193,SPACKENKILL UNION FREE SCHOOL DISTRICT,3606500,NEW YORK,https://go.boarddocs.com/sufsd/Board.nsf/Public,go.boarddocs.com,sufsd,Board.nsf


In [147]:
# Spackenkill Union Free School District in New York
# OK

In [148]:
# we will now standardize the URLs to go.boarddocs.com/XX/XXXX/Board.nsf/Public for scrapping
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
2470,FRANKLIN CO SCHOOL DIST,2801530,MISSISSIPPI,,,,
13171,RENAISSANCE CHARTER SCHOOL,3600059,NEW YORK,https://www.boarddocs.com/fl/palmbeach/Board.nsf/files/A3RJW64DBECA/$file/RenCen%20HS%20of%20PB%20-%20Board%20Notification%20-%2010-28-15.pdf,www.boarddocs.com,fl,palmbeach
1298,Saunemin CCSD 438,1700065,ILLINOIS,,,,
13834,MaST Community CS III,4200903,PENNSYLVANIA,https://go.boarddocs.com/mt/kalispell/Board.nsf/files/CWWVTB824194/$file/Kalispell%20Community%20Partnerships%20Charter%20School%20App.pdf,go.boarddocs.com,mt,kalispell
19000,KONAWA,4016710,OKLAHOMA,https://go.boarddocs.com/fl/hendry/Board.nsf/files/CV8S7P70FFBC/$file/5.C.%20School_District_Final_Altria_Allocation.pdf,go.boarddocs.com,fl,hendry


In [149]:
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea
5397,Copeland,2005190,KANSAS,https://go.boarddocs.com/fla/vcsfl/Board.nsf/files/9FP3JQ717E5A/$file/Copeland%20Bio%2012-3-2013.pdf,go.boarddocs.com,fla,vcsfl
18615,Stanly County Schools,3704320,NORTH CAROLINA,https://go.boarddocs.com/nc/stanlync/Board.nsf/Public,go.boarddocs.com,nc,stanlync
5248,Oregon City SD 62,4109330,OREGON,https://go.boarddocs.com/or/ocsd62/Board.nsf/files/CLU3SX0854FB/$file/OREGON%20CITY%20ADDITIONAL%20QUOTE%20FORM%20Stepping%20Stones%20SHC%20Procare%20Therapy%202022.pdf,go.boarddocs.com,or,ocsd62
18957,Seeley Lake Elem,3023730,MONTANA,https://go.boarddocs.com/wa/lwsd/Board.nsf/files/BV4U6S7A90FB/$file/Minutes050100.pdf,go.boarddocs.com,wa,lwsd
5224,Northern Burlington County Regional School District,3411700,NEW JERSEY,https://go.boarddocs.com/nj/nbcrsd/board.nsf/public,go.boarddocs.com,nj,nbcrsd


In [150]:
boarddocs_df.loc[~boarddocs_df["boarddocs_state"].isna(), "query_url"] = "https://go.boarddocs.com/" + boarddocs_df["boarddocs_state"] + '/' + boarddocs_df["boarddocs_lea"] + "/Board.nsf/Public"

In [151]:
# informed by later work in `visit_boarddocs.ipynb`
# we look at those with boarddocs_lea set to Board.nsf
boarddocs_df.loc[boarddocs_df['boarddocs_lea'] == "Board.nsf"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea,query_url
6678,TRI-COUNTY TECHNOLOGY CENTER,4000093,OKLAHOMA,https://www.boarddocs.com/butler/Board.nsf/legacy-content/85BMP21D4F62/$FILE/Facilities%20Strategic%20Plan.pdf,www.boarddocs.com,butler,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
7172,Screven County,1304500,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87Y65T65066A/$FILE/Min021103.pdf,www.boarddocs.com,wcps,Board.nsf,https://go.boarddocs.com/wcps/Board.nsf/Board.nsf/Public
9457,Knox County ESC,3904781,OHIO,https://go.boarddocs.com/butler/Board.nsf/files/8HTP64630762/$file/March%2018%20Regular%20Meeting%20and%20March%2025%2C%201998%20Minutes.pdf,go.boarddocs.com,butler,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
9638,Genoa Kingston CUSD 424,1716410,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/Public,go.boarddocs.com,genoa,Board.nsf,https://go.boarddocs.com/genoa/Board.nsf/Board.nsf/Public
11086,Butler Technology & Career Development Schools,3905088,OHIO,https://www.boarddocs.com/butler/Board.nsf/goto?open&id=ABULPD570215,www.boarddocs.com,butler,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
11544,WES WATKINS TECHNOLOGY CENTER,4000065,OKLAHOMA,https://go.boarddocs.com/butler/Board.nsf/files/8HESY56FFC3C/$file/October%2011%2C%202000%20Minutes.pdf,go.boarddocs.com,butler,Board.nsf,https://go.boarddocs.com/butler/Board.nsf/Board.nsf/Public
11642,Northeast Georgia RESA,1300010,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,Board.nsf,https://go.boarddocs.com/wcps/Board.nsf/Board.nsf/Public
12273,Kishwaukee Educ Consortium,1700258,ILLINOIS,https://go.boarddocs.com/genoa/Board.nsf/legacy-content/85WJVP121BB0/$FILE/082206%20%20%20Board%20of%20Education%20Minutes.pdf,go.boarddocs.com,genoa,Board.nsf,https://go.boarddocs.com/genoa/Board.nsf/Board.nsf/Public
17193,SPACKENKILL UNION FREE SCHOOL DISTRICT,3606500,NEW YORK,https://go.boarddocs.com/sufsd/Board.nsf/Public,go.boarddocs.com,sufsd,Board.nsf,https://go.boarddocs.com/sufsd/Board.nsf/Board.nsf/Public
17905,Heart of Georgia RESA,1300017,GEORGIA,https://www.boarddocs.com/wcps/Board.nsf/legacy-content/87XUQU642EAA/$FILE/ATT4CLAW/Min041806.pdf,www.boarddocs.com,wcps,Board.nsf,https://go.boarddocs.com/wcps/Board.nsf/Board.nsf/Public


In [152]:
# in these cases we omit the second title
boarddocs_df.loc[boarddocs_df['boarddocs_lea'] == "Board.nsf", "query_url"] = "https://go.boarddocs.com/" + boarddocs_df["boarddocs_state"] + "/Board.nsf/Public"

In [153]:
boarddocs_df.sample(5)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea,query_url
14266,Menominee Tribal School,5900144,BUREAU OF INDIAN EDUCATION,https://go.boarddocs.com/wi/menind/Board.nsf/Public?open&id=policies,go.boarddocs.com,wi,menind,https://go.boarddocs.com/wi/menind/Board.nsf/Public
18351,Columbus Grove Local,3904931,OHIO,https://www.boarddocs.com/oh/colum/Board.nsf/Public?open&id=policies,www.boarddocs.com,oh,colum,https://go.boarddocs.com/oh/colum/Board.nsf/Public
14752,Treeside Charter School,4900192,UTAH,https://go.boarddocs.com/ut/uen/Board.nsf/files/D29QKJ699BC2/$file/24_UETN_Technology_Report.pdf,go.boarddocs.com,ut,uen,https://go.boarddocs.com/ut/uen/Board.nsf/Public
9264,Yolo County ROP,600149,CALIFORNIA,https://go.boarddocs.com/ca/cottonwood/Board.nsf/files/CE9VK98114C9/$file/ROP%20MOU%202022%20Participating%20District.pdf,go.boarddocs.com,ca,cottonwood,https://go.boarddocs.com/ca/cottonwood/Board.nsf/Public
18576,IN Math & Science Academy,1800067,INDIANA,https://go.boarddocs.com/nc/tmsa/Board.nsf/goto?open&id=B92RJ96DB9B2,go.boarddocs.com,nc,tmsa,https://go.boarddocs.com/nc/tmsa/Board.nsf/Public


In [154]:
# percent of duplicated queries
boarddocs_df['query_url'].duplicated().sum() / boarddocs_df.shape[0] * 100

np.float64(82.05937770535215)

In [155]:
# code from here is informed by later work at `visit_boarddocs.ipynb`

boarddocs_df[boarddocs_df["query_url"] == "https://go.boarddocs.com/mi/wpas/Board.nsf/Public"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea,query_url
6187,Whittemore-Prescott Area Schools,2636390,MICHIGAN,https://go.boarddocs.com/mi/wpas/Board.nsf/goto?open&id=C5JRT75DE77E,go.boarddocs.com,mi,wpas,https://go.boarddocs.com/mi/wpas/Board.nsf/Public


In [156]:
boarddocs_df[boarddocs_df["query_url"] == "https://go.boarddocs.com/il/tfd215/Board.nsf/Public"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea,query_url
19223,Thornton School District,3306660,NEW HAMPSHIRE,https://go.boarddocs.com/il/tfd215/Board.nsf/goto?open&id=CDLHZS4AED7D,go.boarddocs.com,il,tfd215,https://go.boarddocs.com/il/tfd215/Board.nsf/Public


In [157]:
boarddocs_df[boarddocs_df["query_url"] == "https://go.boarddocs.com/mi/oxf/Board.nsf/Public"]

Unnamed: 0,LEA_NAME,LEAID,STATENAME,url,domain,boarddocs_state,boarddocs_lea,query_url
14253,Oxford Community Schools,2627240,MICHIGAN,https://go.boarddocs.com/mi/oxf/Board.nsf/Public,go.boarddocs.com,mi,oxf,https://go.boarddocs.com/mi/oxf/Board.nsf/Public


In [103]:
# choose useful columns and save it
boarddocs_df = boarddocs_df.loc[:, ['LEA_NAME','LEAID','STATENAME','query_url']]
output_filename = "boarddocs_url_cleaned.csv"
boarddocs_df.to_csv(output_filename, index=False)

Unnamed: 0,LEA_NAME,LEAID,STATENAME,query_url
6187,Whittemore-Prescott Area Schools,2636390,MICHIGAN,https://go.boarddocs.com/mi/wpas/Board.nsf/Public
