In [None]:
from nycschools import schools
import pandas as pd
import plotly.express as px
import numpy as np



In [15]:
def get_demo_2024(path=None):
    """Read the latest demographic data from an Excel download.

    Loads the "School" sheet of the demographic snapshot workbook,
    normalizes its column headers and passes the frame through
    ``schools.get_demographics``.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the workbook. When omitted, the original hard-coded
        download location is used, so existing no-argument calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        Whatever ``schools.get_demographics`` returns for the renamed
        "School" sheet.

    Raises
    ------
    ValueError
        Raised by pandas if the workbook has no sheet named "School".
    AssertionError
        If the row count changes while passing through
        ``schools.get_demographics``.
    """
    default_path = "/home/mxc/Downloads/demographic-snapshot-2020-21-to-2024-25-public.xlsx"

    # Normalized header -> final column name, for headers that need more
    # than the generic lower-case / underscore treatment. Built once here
    # rather than once per column.
    special_names = {
        "multi-racial": "multi_racial",
        "multi-racial_1": "multi_racial_1",
        "grade_pk_(half_day_&_full_day)": "grade_pk",
        "neither_female_nor_male": "non_binary_n",
        "neither_female_nor_male_1": "non_binary_pct",
        "missing_race/ethnicity_data": "missing_race_ethnicity_data",
        "missing_race/ethnicity_data_1": "missing_race_ethnicity_data_1"
    }

    def xls_cols(col):
        """Map one spreadsheet header to its normalized column name."""
        # str() guards against non-string headers (e.g. numeric labels).
        col_name = str(col).lower().replace(" ", "_")
        # "% Foo" -> "foo_1" and "# Foo" -> "foo". startswith() is safe on
        # an empty header, and stripping the separator instead of slicing
        # a fixed two characters also handles "%Foo" / "#Foo".
        if col_name.startswith("%"):
            col_name = col_name[1:].lstrip("_") + "_1"
        elif col_name.startswith("#"):
            col_name = col_name[1:].lstrip("_")
        return special_names.get(col_name, col_name)

    f = default_path if path is None else path
    # Only the "School" sheet is used, so avoid parsing the whole workbook.
    df = pd.read_excel(f, sheet_name="School")
    a = len(df)
    df = df.rename(columns=xls_cols)
    df = schools.get_demographics(df)
    b = len(df)
    # Guard against rows being dropped or duplicated by get_demographics.
    assert a == b, "lost some schools in the merge"
    return df


# Load the demographic snapshot via get_demo_2024() and show the resulting
# frame as this cell's output. Later cells reassign `df`, so re-run this
# cell to get the full frame back.
df = get_demo_2024()
df

[2020 2021 2022 2023 2024] {'asian_pct', 'asian_n'}


Unnamed: 0,dbn,beds,district,geo_district,boro,school_name,short_name,ay,year,school_type,...,missing_race_ethnicity_data_pct,swd_n,swd_pct,ell_n,ell_pct,poverty_n,poverty_pct,eni,clean_name,zip
0,01M015,310100010015,1,1,Manhattan,P.S. 015 Roberto Clemente,PS 15,2020,2020-21,community,...,0.000000,44,0.227979,21,0.108808,161,0.834197,0.864466,roberto clemente,10009.0
1,01M015,310100010015,1,1,Manhattan,P.S. 015 Roberto Clemente,PS 15,2021,2021-22,community,...,0.000000,45,0.251397,11,0.061453,150,0.837989,0.879354,roberto clemente,10009.0
2,01M015,310100010015,1,1,Manhattan,P.S. 015 Roberto Clemente,PS 15,2022,2022-23,community,...,0.000000,49,0.272222,12,0.066667,152,0.844444,0.862961,roberto clemente,10009.0
3,01M015,310100010015,1,1,Manhattan,P.S. 015 Roberto Clemente,PS 15,2023,2023-24,community,...,0.000000,45,0.238095,24,0.126984,164,0.867725,0.873524,roberto clemente,10009.0
4,01M015,310100010015,1,1,Manhattan,P.S. 015 Roberto Clemente,PS 15,2024,2024-25,community,...,0.000000,40,0.251572,11,0.069182,145,0.911950,0.917736,roberto clemente,10009.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9368,84X730,320800860846,84,8,Bronx,Bronx Charter School for the Arts,730,2021,2021-22,charter,...,0.003344,135,0.225753,79,0.132107,540,0.903010,0.903701,bronx charter school for the arts,10474.0
9369,84X730,320800860846,84,8,Bronx,Bronx Charter School for the Arts,730,2022,2022-23,charter,...,0.003384,135,0.228426,73,0.123519,516,0.873096,0.912959,bronx charter school for the arts,10474.0
9370,84X730,320800860846,84,8,Bronx,Bronx Charter School for the Arts,730,2023,2023-24,charter,...,0.000000,132,0.211538,70,0.112179,557,0.892628,0.921587,bronx charter school for the arts,10474.0
9371,84X730,320800860846,84,8,Bronx,Bronx Charter School for the Arts,730,2024,2024-25,charter,...,0.000000,124,0.214162,72,0.124352,517,0.892919,0.915102,bronx charter school for the arts,10474.0


In [16]:
# Restrict the data to the most recent school year and to the columns used
# by the enrollment-share analysis.
# (Alternative data source: df = schools.load_school_demographics())
#
# NOTE: this cell overwrites `df` and drops the `year` column, so it cannot
# be re-run on its own; reload `df` first.
latest_year = df['year'].max()
analysis_cols = ['dbn', 'district', 'total_enrollment', 'male_pct',
                 'asian_n', 'asian_pct', 'black_n', 'black_pct',
                 'hispanic_n', 'hispanic_pct', 'white_n', 'white_pct', 'ay']

# A single .loc selection followed by .copy() yields an independent frame,
# so adding columns to `df` later does not trigger SettingWithCopyWarning.
df = df.loc[df['year'] == latest_year, analysis_cols].copy()

# Sanity check: displays the `ay` value of the retained rows.
df.ay.max()

2024

In [31]:

# Lorenz curve: cumulative share of white students against cumulative share
# of all students, with schools ordered from lowest to highest white share.

# Totals used as denominators for the cumulative shares (sorting does not
# change them, so they are computed once up front).
white_total = df['white_n'].sum()
student_total = df.total_enrollment.sum()

# assign() builds a new frame instead of writing columns into `df` in place,
# which avoids SettingWithCopyWarning when `df` is a slice of another frame.
df = (
    df.assign(white_share=lambda d: d['white_n'] / d['total_enrollment'])
      .sort_values('white_share')
      .assign(
          cum_students=lambda d: d['total_enrollment'].cumsum() / student_total,
          cum_white=lambda d: d['white_n'].cumsum() / white_total,
      )
)

# Prepend (0, 0) so the curve starts at the origin; ignore_index avoids a
# duplicated index label in the combined frame.
lorenz_df = pd.concat(
    [
        pd.DataFrame({'cum_students': [0], 'cum_white': [0]}),
        df[['cum_students', 'cum_white']],
    ],
    ignore_index=True,
)

# Plot the curve together with the dashed diagonal y = x for reference.
fig = px.line(lorenz_df, x='cum_students', y='cum_white', title='Lorenz Curve: White Student Distribution in NYC Schools')
fig.add_shape(type='line', x0=0, y0=0, x1=1, y1=1, line=dict(dash='dash', color='gray'))
fig.update_layout(
    width=1200,
    height=1200,
    xaxis_scaleanchor="y",  # lock x to y so the square plot keeps a 1:1 scale
)

# Both axes share the same 0%..100% ticks in steps of 10%.
tick_positions = [i/10 for i in range(11)]
tick_labels = [f"{i*10}%" for i in range(11)]
fig.update_xaxes(
    tickvals=tick_positions,
    ticktext=tick_labels,
    title="Cumulative share of students"
)
fig.update_yaxes(
    tickvals=tick_positions,
    ticktext=tick_labels,
    title="Cumulative share of white students"
)

fig.show()

# Save the figure as an SVG.
svg_path = "/home/mxc/Downloads/lorenz-nyc-schools-2024_25.svg"
fig.write_image(svg_path)

In [30]:
# Split the schools at the 90% mark of cumulative enrollment. Both
# comparisons are spelled out (rather than negating one mask) so rows with a
# missing cum_students value fall into neither group.
cutoff = 0.90
above_cutoff = df['cum_students'] > cutoff
at_or_below_cutoff = df['cum_students'] <= cutoff

top_10 = df.loc[above_cutoff]
bottom_90 = df.loc[at_or_below_cutoff]

# Enrollment and white-student totals for each group.
top_10_enrolled = top_10['total_enrollment'].sum()
bottom_90_enrolled = bottom_90['total_enrollment'].sum()
top_10_white = top_10['white_n'].sum()
bottom_90_white = bottom_90['white_n'].sum()

# Pooled white share per group (total white students / total students),
# not an average of the per-school shares.
top_10_mean = top_10_white / top_10_enrolled
bottom_90_mean = bottom_90_white / bottom_90_enrolled

(top_10_enrolled, bottom_90_enrolled, top_10_mean, bottom_90_mean)

# Optional check — number of distinct schools in the top group:
# top_10.dbn.nunique()

(99985, 898562, 0.5267690153523028, 0.09206042543530664)