In [1]:

import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import folium
from folium.map import Element
import seaborn as sns
from IPython.display import Markdown as md
from IPython.display import display, HTML

import xyzservices.providers as xyz


In [2]:
# load the most reent schools from the DOE website
url = "https://infohub.nyced.org/docs/default-source/default-document-library/demographic-snapshot-2019-20-to-2023-24-public.xlsx"
xls = pd.read_excel(url, sheet_name=None)
all_schools = xls["School"]
all_schools.columns


Index(['DBN', 'School Name', 'Year', 'Total Enrollment', 'Grade 3K',
       'Grade PK (Half Day & Full Day)', 'Grade K', 'Grade 1', 'Grade 2',
       'Grade 3', 'Grade 4', 'Grade 5', 'Grade 6', 'Grade 7', 'Grade 8',
       'Grade 9', 'Grade 10', 'Grade 11', 'Grade 12', '# Female', '% Female',
       '# Male', '% Male', '# Neither Female nor Male',
       '% Neither Female nor Male', '# Asian', '% Asian', '# Black', '% Black',
       '# Hispanic', '% Hispanic', '# Multi-Racial', '% Multi-Racial',
       '# Native American', '% Native American', '# White', '% White',
       '# Missing Race/Ethnicity Data', '% Missing Race/Ethnicity Data',
       '# Students with Disabilities', '% Students with Disabilities',
       '# English Language Learners', '% English Language Learners',
       '# Poverty', '% Poverty', 'Economic Need Index'],
      dtype='object')

In [3]:
cols = {'DBN':'dbn',
    'School Name':'school_name',
    'Year':'year',
    'Total Enrollment':'total_enrollment',
    'Grade 3K':'grade_3k',
    'Grade PK (Half Day & Full Day)':'grade_pk',
    'Grade K':'grade_k',
    'Grade 1':'grade_1',
    'Grade 2':'grade_2',
    'Grade 3':'grade_3',
    'Grade 4':'grade_4',
    'Grade 5':'grade_5',
    'Grade 6':'grade_6',
    'Grade 7':'grade_7',
    'Grade 8':'grade_8',
    'Grade 9':'grade_9',
    'Grade 10':'grade_10',
    'Grade 11':'grade_11',
    'Grade 12':'grade_12',
    '# Female':'female_n',
    '# Male':'male_n',
    '# Neither Female nor Male':'nonbinary_n',
    '# Asian':'asian_n',
    '# Black':'black_n',
    '# Hispanic':'hispanic_n',
    '# White':'white_n',
    '# Students with Disabilities':'swd_n',
    '# English Language Learners':'ell_n',
    '# Poverty':'poverty_n',
    'Economic Need Index': "eni"}
df = all_schools[cols.keys()].copy()
df.rename(columns=cols, inplace=True)
df["ay"] = df.year.apply(lambda x: int(x.split("-")[0]))
df = df[df.ay == df.ay.max()]
df["district"] = df["dbn"].apply(lambda dbn: int(dbn[:2]))

boros = {"K":"Brooklyn", "X":"Bronx", "M": "Manhattan", "Q": "Queens", "R": "Staten Island"}
df["boro"] = df["dbn"].apply(lambda dbn: boros[dbn[2]])
df["school_num"] = df.dbn.apply(lambda dbn: int(dbn[3:]))
df["charter"] = df.district.apply(lambda x: 1 if x == 84 else 0)

# figure out what grades they teach
df["pk"] = df["grade_pk"] > 0
df["elementary"] = df["grade_2"] > 0
df["middle"] = df["grade_7"] > 0
df["hs"] = df["grade_10"] > 0
df

Unnamed: 0,dbn,school_name,year,total_enrollment,grade_3k,grade_pk,grade_k,grade_1,grade_2,grade_3,...,eni,ay,district,boro,school_num,charter,pk,elementary,middle,hs
4,01M015,P.S. 015 Roberto Clemente,2023-24,189,0,17,29,35,25,30,...,0.875365,2023,1,Manhattan,15,0,True,True,False,False
9,01M020,P.S. 020 Anna Silver,2023-24,380,0,28,47,56,55,73,...,0.85575,2023,1,Manhattan,20,0,True,True,False,False
14,01M034,P.S. 034 Franklin D. Roosevelt,2023-24,217,13,15,15,17,13,25,...,0.949327,2023,1,Manhattan,34,0,True,True,True,False
19,01M063,The STAR Academy - P.S.63,2023-24,206,14,15,32,29,27,31,...,0.801063,2023,1,Manhattan,63,0,True,True,False,False
24,01M064,P.S. 064 Robert Simon,2023-24,186,14,16,19,28,25,22,...,0.896446,2023,1,Manhattan,64,0,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9342,84X705,Family Life Academy Charter School,2023-24,409,0,0,75,68,53,60,...,Above 95%,2023,84,Bronx,705,1,False,True,False,False
9347,84X706,Harriet Tubman Charter School,2023-24,491,0,0,49,52,42,57,...,0.932961,2023,84,Bronx,706,1,False,True,True,False
9352,84X717,Icahn Charter School 1,2023-24,337,0,0,35,41,37,42,...,0.900442,2023,84,Bronx,717,1,False,True,True,False
9357,84X718,Bronx Charter School for Better Learning,2023-24,550,0,18,71,101,102,97,...,0.722634,2023,84,Bronx,718,1,True,True,False,False


In [None]:

# calc percents as real number for each of these columns
n_cols = ["male_n", "female_n", "nonbinary_n", "asian_n", "black_n", "hispanic_n", "white_n", "poverty_n", "swd_n", "ell_n"]
pct_cols = ["male_pct", "female_pct", "nonbinary_pct", "asian_pct", "black_pct", "hispanic_pct", "white_pct", "poverty_pct", "swd_pct", "ell_pct"]
cols = list(zip(n_cols, pct_cols))

In [8]:

def mk_pct(row):
    for col, pct_col in cols:
        n = row[col]
        try:
            n = int(n)
            row[pct_col] = row[col] / row.total_enrollment
        except:
            if "Above" in row[col]:
                row[col] = np.ceil(row.total_enrollment * .96)
                row[pct_col] = .96
            elif "Below" in row[col]:
                row[col] = np.ceil(row.total_enrollment * .04)
                row[pct_col] = .04
    return row

df = df.apply(mk_pct, axis=1)
df[["dbn"] + pct_cols]

Unnamed: 0,dbn,male_pct,female_pct,nonbinary_pct,asian_pct,black_pct,hispanic_pct,white_pct,poverty_pct,swd_pct,ell_pct
4,01M015,0.497354,0.502646,0.0,0.100529,0.232804,0.566138,0.047619,0.862434,0.232804,0.126984
9,01M020,0.489474,0.510526,0.0,0.157895,0.150000,0.581579,0.068421,0.857895,0.165789,0.313158
14,01M034,0.548387,0.451613,0.0,0.023041,0.373272,0.562212,0.032258,0.963134,0.262673,0.179724
19,01M063,0.504854,0.495146,0.0,0.009709,0.145631,0.689320,0.126214,0.796117,0.310680,0.150485
24,01M064,0.494624,0.505376,0.0,0.010753,0.193548,0.736559,0.048387,0.903226,0.301075,0.134409
...,...,...,...,...,...,...,...,...,...,...,...
9342,84X705,0.493888,0.506112,0.0,0.004890,0.158924,0.823961,0.000000,0.960880,0.122249,0.244499
9347,84X706,0.496945,0.503055,0.0,0.000000,0.590631,0.378819,0.010183,0.877800,0.116090,0.128310
9352,84X717,0.445104,0.554896,0.0,0.023739,0.545994,0.409496,0.002967,0.916914,0.077151,0.038576
9357,84X718,0.505455,0.494545,0.0,0.016364,0.865455,0.081818,0.005455,0.843636,0.098182,0.010909


In [54]:
url = "https://data.cityofnewyork.us/download/jfju-ynrr/application%2Fx-zip-compressed"
school_locations = gpd.read_file(url)

In [55]:
loc = school_locations.copy()
loc.rename(columns={"ATS": "dbn", "Geographic":"geo_district"}, inplace=True)
loc = loc[["dbn", "geo_district", "geometry"]]
loc

Unnamed: 0,dbn,geo_district,geometry
0,01M015,1,POINT (-8235276.446 4971433.816)
1,01M020,1,POINT (-8236118.578 4971320.718)
2,01M034,1,POINT (-8234865.788 4972011.521)
3,01M063,1,POINT (-8236107.668 4971781.199)
4,01M064,1,POINT (-8235593.706 4971588.778)
...,...,...,...
1945,88X966,8,POINT (-8225428.122 4985312.134)
1946,88X994,12,POINT (-8225646.864 4987196.846)
1947,88X995,10,POINT (-8226862.696 4991308.524)
1948,88X996,8,POINT (-8221604.520 4986007.017)


In [88]:
from nycschools import geo
from shapely import Point
old_loc = geo.load_school_geo_points()
loc.crs, old_loc.crs

(<Derived Projected CRS: EPSG:3857>
 Name: WGS 84 / Pseudo-Mercator
 Axis Info [cartesian]:
 - X[east]: Easting (metre)
 - Y[north]: Northing (metre)
 Area of Use:
 - name: World between 85.06°S and 85.06°N.
 - bounds: (-180.0, -85.06, 180.0, 85.06)
 Coordinate Operation:
 - name: Popular Visualisation Pseudo-Mercator
 - method: Popular Visualisation Pseudo Mercator
 Datum: World Geodetic System 1984 ensemble
 - Ellipsoid: WGS 84
 - Prime Meridian: Greenwich,
 <Geographic 2D CRS: EPSG:4326>
 Name: WGS 84
 Axis Info [ellipsoidal]:
 - Lat[north]: Geodetic latitude (degree)
 - Lon[east]: Geodetic longitude (degree)
 Area of Use:
 - name: World.
 - bounds: (-180.0, -90.0, 180.0, 90.0)
 Datum: World Geodetic System 1984 ensemble
 - Ellipsoid: WGS 84
 - Prime Meridian: Greenwich)

In [None]:
a = loc[["dbn", "geometry"]].copy()
b = old_loc[["dbn", "geometry"]].copy()
# reproject both to NYS plane
a = a.to_crs(epsg=2263)
b = b.to_crs(epsg=2263)

common = a[["dbn"]].merge(b[["dbn"]], how="inner")
a = a[a.dbn.isin(common.dbn)]
b = b[b.dbn.isin(common.dbn)]

a["xy"] = a.geometry.apply(lambda x: f"{round(x.x, 3),round(x.y, 3)}")
b["xy"] = b.geometry.apply(lambda x: f"{round(x.x, 3),round(x.y, 3)}")
union = a.merge(b[["dbn", "xy"]], on=["dbn", "xy"], how="inner")
diffa = a[~a.dbn.isin(union.dbn)]
diffb = b[~b.dbn.isin(union.dbn)]
sym = diffa[["dbn", "geometry"]].merge(diffb[["dbn", "geometry"]], on="dbn", how="inner")
sym["d"] = sym.apply(lambda x: x.geometry_x.distance(x.geometry_y), axis=1)
print(sym.sort_values(by="d", ascending=False).d.head(5))

# epsg = 2263
# 56888.083715
# 39903.024511
# 39448.698698
# 35206.342356
# 24493.494065

# epsg = 3857
# 22854.006727
# 16020.279858
# 15881.068213
# 14174.309343
# 9864.905424

1704    22854.006727
1743    16020.279858
1810    15881.068213
2027    14174.309343
1771     9864.905424
Name: d, dtype: float64


In [33]:
# schools not in loc
display("Schools not in location")
not_found = all_schools[ (~all_schools.DBN.isin(loc.dbn)) & (all_schools.Year == '2023-24')]
display(not_found)

# those 10 schools will not be in our data set, we will have to look up their point based on address
df = loc.merge(df, on="dbn", how="inner")

'Schools not in location'

Unnamed: 0,DBN,School Name,Year,Total Enrollment,Grade 3K,Grade PK (Half Day & Full Day),Grade K,Grade 1,Grade 2,Grade 3,...,% White,# Missing Race/Ethnicity Data,% Missing Race/Ethnicity Data,# Students with Disabilities,% Students with Disabilities,# English Language Learners,% English Language Learners,# Poverty,% Poverty,Economic Need Index
725,02M933,City Knoll Middle School,2023-24,171,0,0,0,0,0,0,...,0.040936,0,0.0,44,0.25731,76,0.444444,158,0.923977,0.936971
845,03M299,The Maxine Greene HS for Imaginative Inquiry,2023-24,93,0,0,0,0,0,0,...,0.075269,1,0.010753,31,0.333333,14,0.150538,85,0.913979,0.901
2172,09X404,School for Excellence,2023-24,103,0,0,0,0,0,0,...,0.009709,0,0.0,47,0.456311,23,0.223301,96,0.932039,Above 95%
2567,10X382,"Elementary School for Math, Science, and Techn...",2023-24,226,0,0,30,41,27,42,...,0.0,0,0.0,59,0.261062,72,0.318584,Above 95%,Above 95%,Above 95%
2832,11X253,Bronx High School for Writing and Communicatio...,2023-24,203,0,0,0,0,0,0,...,0.039409,1,0.004926,52,0.256158,28,0.137931,Above 95%,Above 95%,0.90503
3022,12X098,J.H.S. 098 Herman Ridder,2023-24,110,0,0,0,0,0,0,...,0.009091,0,0.0,22,0.2,48,0.436364,Above 95%,Above 95%,0.9075
4145,17K531,"School for Human Rights, The",2023-24,143,0,0,0,0,0,0,...,0.034965,2,0.013986,43,0.300699,13,0.090909,124,0.867133,0.859755
4511,19K364,I.S. 364 Gateway,2023-24,130,0,0,0,0,0,0,...,0.007692,0,0.0,46,0.353846,1,0.007692,116,0.892308,0.903669
4541,19K504,High School for Civil Rights,2023-24,204,0,0,0,0,0,0,...,0.009804,15,0.073529,49,0.240196,56,0.27451,178,0.872549,0.944
7873,75Q256,P.S. Q256,2023-24,553,3,5,38,50,34,34,...,0.099458,0,0.0,512,0.925859,83,0.15009,492,0.889693,0.936984


In [34]:
# let's find all the colocated schools by matching schools with the same point
locations = pd.DataFrame()
# give each location an id
locations["geometry"] = df.geometry.unique()
locations["loc_id"] = locations.index + 1
# join the loc_id back into df using geometry
df = df.merge(locations, on="geometry", how="inner")

In [35]:
# create a dataframe for location campuses that have an ID a name, a total number of students, and number of schools
agg = {
    "dbn":"count",
    "total_enrollment":"sum",
    "school_name":"first", # this will be the largest school, b/c of how we sort the df
    "geometry":"first"
}
df.sort_values(by=["loc_id", "total_enrollment"], ascending=False, inplace=True)
df


Unnamed: 0,dbn,geo_district,geometry,school_name,year,total_enrollment,grade_3k,grade_pk,grade_k,grade_1,...,female_pct,nonbinary_pct,asian_pct,black_pct,hispanic_pct,white_pct,poverty_pct,swd_pct,ell_pct,loc_id
1881,84X730,8,POINT (-8225004.996 4986094.837),Bronx Charter School for the Arts,2023-24,624,0,0,32,51,...,0.580128,0.000000,0.003205,0.261218,0.713141,0.011218,0.887821,0.208333,0.112179,1349
1880,84X717,9,POINT (-8227033.905 4988572.215),Icahn Charter School 1,2023-24,337,0,0,35,41,...,0.554896,0.000000,0.023739,0.545994,0.409496,0.002967,0.916914,0.077151,0.038576,1348
1879,84X706,9,POINT (-8227152.794 4987641.451),Harriet Tubman Charter School,2023-24,491,0,0,49,52,...,0.503055,0.000000,0.000000,0.590631,0.378819,0.010183,0.877800,0.116090,0.128310,1347
1878,84X705,9,POINT (-8228588.148 4988808.820),Family Life Academy Charter School,2023-24,409,0,0,75,68,...,0.506112,0.000000,0.004890,0.158924,0.823961,0.000000,0.960000,0.122249,0.244499,1346
1877,84X703,9,POINT (-8226524.062 4988633.720),Bronx Preparatory Charter School,2023-24,895,0,0,0,0,...,0.531844,0.000000,0.005587,0.505028,0.472626,0.006704,0.850279,0.150838,0.051397,1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,01M363,1,POINT (-8236107.668 4971781.199),Neighborhood School,2023-24,226,0,21,30,35,...,0.495575,0.004425,0.061947,0.110619,0.367257,0.362832,0.455752,0.269912,0.008850,4
3,01M063,1,POINT (-8236107.668 4971781.199),The STAR Academy - P.S.63,2023-24,206,14,15,32,29,...,0.495146,0.000000,0.009709,0.145631,0.689320,0.126214,0.796117,0.310680,0.150485,4
2,01M034,1,POINT (-8234865.788 4972011.521),P.S. 034 Franklin D. Roosevelt,2023-24,217,13,15,15,17,...,0.451613,0.000000,0.023041,0.373272,0.562212,0.032258,0.960000,0.262673,0.179724,3
1,01M020,1,POINT (-8236118.578 4971320.718),P.S. 020 Anna Silver,2023-24,380,0,28,47,56,...,0.510526,0.000000,0.157895,0.150000,0.581579,0.068421,0.857895,0.165789,0.313158,2


In [36]:
campuses = df[["loc_id", "dbn", "total_enrollment", "school_name", "geometry"]].groupby("loc_id").agg(agg).reset_index()
campuses.rename(columns={"dbn":"num_schools", "school_name":"campus"}, inplace=True)
campuses.sort_values(by="total_enrollment", ascending=False).head(50)

Unnamed: 0,loc_id,num_schools,total_enrollment,campus,geometry
446,447,1,5804,Brooklyn Technical High School,POINT (-8235019.075 4966561.627)
350,351,4,5175,The Bronx High School of Science,POINT (-8225287.080 4994651.588)
378,379,8,5168,Harry S Truman High School,POINT (-8218903.797 4993779.572)
885,886,1,4427,Francis Lewis High School,POINT (-8214827.278 4974195.591)
667,668,1,4001,Fort Hamilton High School,POINT (-8241883.925 4957453.579)
739,740,1,3890,Midwood High School,POINT (-8232338.613 4958333.947)
1132,1133,1,3802,Tottenville High School,POINT (-8259174.959 4942991.348)
740,741,1,3784,James Madison High School,POINT (-8231815.968 4955059.004)
704,705,1,3595,Edward R. Murrow High School,POINT (-8233093.916 4956404.047)
977,978,1,3432,Forest Hills High School,POINT (-8220273.361 4972600.424)


In [37]:
# "colocated" schools will be an school where their campus has > 1 school
df["colocated"] = df.loc_id.isin(campuses[campuses.num_schools > 1].loc_id)
df.colocated.value_counts()


False    1007
True      875
Name: colocated, dtype: int64

In [38]:
# just colocated schools
co = df[df.colocated == True]
co[pct_cols].describe()


Unnamed: 0,male_pct,female_pct,nonbinary_pct,asian_pct,black_pct,hispanic_pct,white_pct,poverty_pct,swd_pct,ell_pct
count,875.0,875.0,875.0,875.0,875.0,875.0,875.0,875.0,875.0,875.0
mean,0.515805,0.483938,0.000257,0.069629,0.343067,0.480392,0.075514,0.834267,0.254873,0.162818
std,0.107178,0.107084,0.001297,0.108863,0.234832,0.23505,0.115897,0.153289,0.159152,0.165291
min,0.0,0.0,0.0,0.0,0.0,0.01626,0.0,0.075,0.0,0.0
25%,0.479076,0.45056,0.0,0.010795,0.153765,0.276436,0.012983,0.801999,0.180912,0.056644
50%,0.513089,0.486747,0.0,0.028889,0.290598,0.48587,0.026718,0.880519,0.232068,0.121495
75%,0.549061,0.520728,0.0,0.079883,0.513339,0.664924,0.071351,0.936429,0.283514,0.206085
max,1.0,1.0,0.023256,0.929539,0.896725,1.0,0.674757,0.96,1.0,1.0


In [39]:
# just the solo schools
solo = df[df.colocated == False]
co[pct_cols].describe()

Unnamed: 0,male_pct,female_pct,nonbinary_pct,asian_pct,black_pct,hispanic_pct,white_pct,poverty_pct,swd_pct,ell_pct
count,875.0,875.0,875.0,875.0,875.0,875.0,875.0,875.0,875.0,875.0
mean,0.515805,0.483938,0.000257,0.069629,0.343067,0.480392,0.075514,0.834267,0.254873,0.162818
std,0.107178,0.107084,0.001297,0.108863,0.234832,0.23505,0.115897,0.153289,0.159152,0.165291
min,0.0,0.0,0.0,0.0,0.0,0.01626,0.0,0.075,0.0,0.0
25%,0.479076,0.45056,0.0,0.010795,0.153765,0.276436,0.012983,0.801999,0.180912,0.056644
50%,0.513089,0.486747,0.0,0.028889,0.290598,0.48587,0.026718,0.880519,0.232068,0.121495
75%,0.549061,0.520728,0.0,0.079883,0.513339,0.664924,0.071351,0.936429,0.283514,0.206085
max,1.0,1.0,0.023256,0.929539,0.896725,1.0,0.674757,0.96,1.0,1.0
