In [1]:
# autoreload libraries
%load_ext autoreload
%autoreload 2

import pandas as pd

import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import numpy as np
import folium
from folium.map import Element
import seaborn as sns
from IPython.display import Markdown as md
from IPython.display import display, HTML

import xyzservices.providers as xyz

from nycschools import schools, geo, ui, snapshot as snap
from nycschools.dataloader import load
from nycschools import config

import warnings
warnings.filterwarnings('ignore')

In [None]:
# load the most reent schools from the DOE website
url = "https://infohub.nyced.org/docs/default-source/default-document-library/demographic-snapshot-2019-20-to-2023-24-public.xlsx"
xls = pd.read_excel(url, sheet_name=None)
all_schools = xls["School"]
all_schools.columns


Index(['DBN', 'School Name', 'Year', 'Total Enrollment', 'Grade 3K',
       'Grade PK (Half Day & Full Day)', 'Grade K', 'Grade 1', 'Grade 2',
       'Grade 3', 'Grade 4', 'Grade 5', 'Grade 6', 'Grade 7', 'Grade 8',
       'Grade 9', 'Grade 10', 'Grade 11', 'Grade 12', '# Female', '% Female',
       '# Male', '% Male', '# Neither Female nor Male',
       '% Neither Female nor Male', '# Asian', '% Asian', '# Black', '% Black',
       '# Hispanic', '% Hispanic', '# Multi-Racial', '% Multi-Racial',
       '# Native American', '% Native American', '# White', '% White',
       '# Missing Race/Ethnicity Data', '% Missing Race/Ethnicity Data',
       '# Students with Disabilities', '% Students with Disabilities',
       '# English Language Learners', '% English Language Learners',
       '# Poverty', '% Poverty', 'Economic Need Index'],
      dtype='object')

In [79]:

cols = {'DBN':'dbn',
    'School Name':'school_name',
    'Year':'year',
    'Total Enrollment':'total_enrollment',
    'Grade 3K':'grade_3k',
    'Grade PK (Half Day & Full Day)':'grade_pk',
    'Grade K':'grade_k',
    'Grade 1':'grade_1',
    'Grade 2':'grade_2',
    'Grade 3':'grade_3',
    'Grade 4':'grade_4',
    'Grade 5':'grade_5',
    'Grade 6':'grade_6',
    'Grade 7':'grade_7',
    'Grade 8':'grade_8',
    'Grade 9':'grade_9',
    'Grade 10':'grade_10',
    'Grade 11':'grade_11',
    'Grade 12':'grade_12',
    '# Female':'female_n',
    '# Male':'male_n',
    '# Neither Female nor Male':'non_binary_n',
    '# Asian':'asian_n',
    '# Black':'black_n',
    '# Hispanic':'hispanic_n',
    '# White':'white_n',
    '# Students with Disabilities':'swd_n',
    '# English Language Learners':'ell_n',
    '# Poverty':'povery_n',
    'Economic Need Index': "eni"}
df = all_schools[cols.keys()]
df.rename(columns=cols, inplace=True)
df["ay"] = df.year.apply(lambda x: int(x.split("-")[0]))
df = df[df.ay == df.ay.max()]
df

Unnamed: 0,dbn,school_name,year,total_enrollment,grade_3k,grade_pk,grade_k,grade_1,grade_2,grade_3,...,non_binary_n,asian_n,black_n,hispanic_n,white_n,swd_n,ell_n,povery_n,eni,ay
4,01M015,P.S. 015 Roberto Clemente,2023-24,189,0,17,29,35,25,30,...,0.0,19,44,107,9,44,24,163,0.875365,2023
9,01M020,P.S. 020 Anna Silver,2023-24,380,0,28,47,56,55,73,...,0.0,60,57,221,26,63,119,326,0.85575,2023
14,01M034,P.S. 034 Franklin D. Roosevelt,2023-24,217,13,15,15,17,13,25,...,0.0,5,81,122,7,57,39,Above 95%,0.949327,2023
19,01M063,The STAR Academy - P.S.63,2023-24,206,14,15,32,29,27,31,...,0.0,2,30,142,26,64,31,164,0.801063,2023
24,01M064,P.S. 064 Robert Simon,2023-24,186,14,16,19,28,25,22,...,0.0,2,36,137,9,56,25,168,0.896446,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9342,84X705,Family Life Academy Charter School,2023-24,409,0,0,75,68,53,60,...,0.0,2,65,337,0,50,100,Above 95%,Above 95%,2023
9347,84X706,Harriet Tubman Charter School,2023-24,491,0,0,49,52,42,57,...,0.0,0,290,186,5,57,63,431,0.932961,2023
9352,84X717,Icahn Charter School 1,2023-24,337,0,0,35,41,37,42,...,0.0,8,184,138,1,26,13,309,0.900442,2023
9357,84X718,Bronx Charter School for Better Learning,2023-24,550,0,18,71,101,102,97,...,0.0,9,476,45,3,54,6,464,0.722634,2023


In [None]:
url = "https://data.cityofnewyork.us/download/jfju-ynrr/application%2Fx-zip-compressed"
school_locations = gpd.read_file(url)

In [None]:
loc = school_locations.copy()
loc.rename(columns={"ATS": "dbn", "Geographic":"geo_district"}, inplace=True)
loc = loc[["dbn", "geo_district", "geometry"]]
loc

Unnamed: 0,dbn,geo_district,geometry
0,01M015,1,POINT (-8235276.446 4971433.816)
1,01M020,1,POINT (-8236118.578 4971320.718)
2,01M034,1,POINT (-8234865.788 4972011.521)
3,01M063,1,POINT (-8236107.668 4971781.199)
4,01M064,1,POINT (-8235593.706 4971588.778)
...,...,...,...
1945,88X966,8,POINT (-8225428.122 4985312.134)
1946,88X994,12,POINT (-8225646.864 4987196.846)
1947,88X995,10,POINT (-8226862.696 4991308.524)
1948,88X996,8,POINT (-8221604.520 4986007.017)


In [87]:
# schools not in loc
display("Schools not in location")
not_found = all_schools[ (~all_schools.DBN.isin(loc.dbn)) & (all_schools.Year == '2023-24')]
display(not_found)

# those 10 schools will not be in our data set, we will have to look up their point based on address
df = loc.merge(df, on="dbn", how="inner")

'Schools not in location'

Unnamed: 0,DBN,School Name,Year,Total Enrollment,Grade 3K,Grade PK (Half Day & Full Day),Grade K,Grade 1,Grade 2,Grade 3,...,% White,# Missing Race/Ethnicity Data,% Missing Race/Ethnicity Data,# Students with Disabilities,% Students with Disabilities,# English Language Learners,% English Language Learners,# Poverty,% Poverty,Economic Need Index
725,02M933,City Knoll Middle School,2023-24,171,0,0,0,0,0,0,...,0.040936,0,0.0,44,0.25731,76,0.444444,158,0.923977,0.936971
845,03M299,The Maxine Greene HS for Imaginative Inquiry,2023-24,93,0,0,0,0,0,0,...,0.075269,1,0.010753,31,0.333333,14,0.150538,85,0.913979,0.901
2172,09X404,School for Excellence,2023-24,103,0,0,0,0,0,0,...,0.009709,0,0.0,47,0.456311,23,0.223301,96,0.932039,Above 95%
2567,10X382,"Elementary School for Math, Science, and Techn...",2023-24,226,0,0,30,41,27,42,...,0.0,0,0.0,59,0.261062,72,0.318584,Above 95%,Above 95%,Above 95%
2832,11X253,Bronx High School for Writing and Communicatio...,2023-24,203,0,0,0,0,0,0,...,0.039409,1,0.004926,52,0.256158,28,0.137931,Above 95%,Above 95%,0.90503
3022,12X098,J.H.S. 098 Herman Ridder,2023-24,110,0,0,0,0,0,0,...,0.009091,0,0.0,22,0.2,48,0.436364,Above 95%,Above 95%,0.9075
4145,17K531,"School for Human Rights, The",2023-24,143,0,0,0,0,0,0,...,0.034965,2,0.013986,43,0.300699,13,0.090909,124,0.867133,0.859755
4511,19K364,I.S. 364 Gateway,2023-24,130,0,0,0,0,0,0,...,0.007692,0,0.0,46,0.353846,1,0.007692,116,0.892308,0.903669
4541,19K504,High School for Civil Rights,2023-24,204,0,0,0,0,0,0,...,0.009804,15,0.073529,49,0.240196,56,0.27451,178,0.872549,0.944
7873,75Q256,P.S. Q256,2023-24,553,3,5,38,50,34,34,...,0.099458,0,0.0,512,0.925859,83,0.15009,492,0.889693,0.936984


In [92]:
# let's find all the colocated schools by matching schools with the same point
locations = pd.DataFrame()
# give each location an id
locations["geometry"] = df.geometry.unique()
locations["loc_id"] = locations.index + 1
# join the loc_id back into df using geometry
df = df.merge(locations, on="geometry", how="inner")
df["loc_id"]

0          1
1          2
2          3
3          4
4          4
        ... 
1877    1345
1878    1346
1879    1347
1880    1348
1881    1349
Name: loc_id, Length: 1882, dtype: int64

In [105]:
# create a dataframe for location campuses that have an ID a name, a total number of students, and number of schools
agg = {
    "dbn":"count",
    "total_enrollment":"sum",
    "school_name":"first", # this will be the largest school, b/c of how we sort the df
    "geometry":"first"
}
df.sort_values(by=["loc_id", "total_enrollment"], ascending=False, inplace=True)
df


Unnamed: 0,dbn,geo_district,geometry,school_name,year,total_enrollment,grade_3k,grade_pk,grade_k,grade_1,...,hispanic_n,white_n,swd_n,ell_n,povery_n,eni,ay,loc_id_x,loc_id_y,loc_id
1881,84X730,8,POINT (-8225004.996 4986094.837),Bronx Charter School for the Arts,2023-24,624,0,0,32,51,...,445,7,130,70,554,0.915459,2023,1349,1349,1349
1880,84X717,9,POINT (-8227033.905 4988572.215),Icahn Charter School 1,2023-24,337,0,0,35,41,...,138,1,26,13,309,0.900442,2023,1348,1348,1348
1879,84X706,9,POINT (-8227152.794 4987641.451),Harriet Tubman Charter School,2023-24,491,0,0,49,52,...,186,5,57,63,431,0.932961,2023,1347,1347,1347
1878,84X705,9,POINT (-8228588.148 4988808.820),Family Life Academy Charter School,2023-24,409,0,0,75,68,...,337,0,50,100,Above 95%,Above 95%,2023,1346,1346,1346
1877,84X703,9,POINT (-8226524.062 4988633.720),Bronx Preparatory Charter School,2023-24,895,0,0,0,0,...,423,6,135,46,761,0.88673,2023,1345,1345,1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,01M363,1,POINT (-8236107.668 4971781.199),Neighborhood School,2023-24,226,0,21,30,35,...,83,82,61,2,103,0.52967,2023,4,4,4
3,01M063,1,POINT (-8236107.668 4971781.199),The STAR Academy - P.S.63,2023-24,206,14,15,32,29,...,142,26,64,31,164,0.801063,2023,4,4,4
2,01M034,1,POINT (-8234865.788 4972011.521),P.S. 034 Franklin D. Roosevelt,2023-24,217,13,15,15,17,...,122,7,57,39,Above 95%,0.949327,2023,3,3,3
1,01M020,1,POINT (-8236118.578 4971320.718),P.S. 020 Anna Silver,2023-24,380,0,28,47,56,...,221,26,63,119,326,0.85575,2023,2,2,2


In [107]:

campuses = df[["loc_id", "dbn", "total_enrollment", "school_name", "geometry"]].groupby("loc_id").agg(agg).reset_index()
campuses.rename(columns={"dbn":"num_schools", "school_name":"campus"}, inplace=True)
campuses.sort_values(by="total_enrollment", ascending=False).head(50)

Unnamed: 0,loc_id,num_schools,total_enrollment,campus,geometry
446,447,1,5804,Brooklyn Technical High School,POINT (-8235019.075 4966561.627)
350,351,4,5175,The Bronx High School of Science,POINT (-8225287.080 4994651.588)
378,379,8,5168,Harry S Truman High School,POINT (-8218903.797 4993779.572)
885,886,1,4427,Francis Lewis High School,POINT (-8214827.278 4974195.591)
667,668,1,4001,Fort Hamilton High School,POINT (-8241883.925 4957453.579)
739,740,1,3890,Midwood High School,POINT (-8232338.613 4958333.947)
1132,1133,1,3802,Tottenville High School,POINT (-8259174.959 4942991.348)
740,741,1,3784,James Madison High School,POINT (-8231815.968 4955059.004)
704,705,1,3595,Edward R. Murrow High School,POINT (-8233093.916 4956404.047)
977,978,1,3432,Forest Hills High School,POINT (-8220273.361 4972600.424)


In [109]:
# "colocated" schools will be an school where their campus has > 1 school
df["colocated"] = df.loc_id.isin(campuses[campuses.num_schools > 1].loc_id)
df.colocated.value_counts()

False    1007
True      875
Name: colocated, dtype: int64