In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from nycschools import schools, geo, segregation as seg
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Get long data for all schools, all years: one row per demographics row and
# racial/ethnic group, with the group's column name in `g` and its count in `n`.
demos = schools.load_school_demographics()
groups = [
    'asian_n',
    'black_n',
    'hispanic_n',
    'multi_racial_n',
    'native_american_n',
    'white_n',
    'missing_race_ethnicity_data_n',
]
index = ['dbn', 'ay', 'district']
# melt picks out the id/value columns itself, so `demos` needs no pre-slicing
data = pd.melt(demos, id_vars=index, value_vars=groups, var_name='g', value_name='n')
data


Unnamed: 0,dbn,ay,district,g,n
0,01M015,2005,1,asian_n,10
1,01M015,2006,1,asian_n,18
2,01M015,2007,1,asian_n,16
3,01M015,2008,1,asian_n,16
4,01M015,2009,1,asian_n,16
...,...,...,...,...,...
198403,84X730,2018,84,missing_race_ethnicity_data_n,0
198404,84X730,2019,84,missing_race_ethnicity_data_n,0
198405,84X730,2020,84,missing_race_ethnicity_data_n,1
198406,84X730,2021,84,missing_race_ethnicity_data_n,2


In [7]:
# Lorenz curve of white enrollment for the most recent academic year.
# Schools are ordered from fewest to most white students; the curve plots the
# cumulative share of schools (x) against the cumulative share of white students (y).
lorenz = (
    demos[demos.ay == demos.ay.max()][["dbn", "white_n"]]
    .sort_values(by='white_n')
    .reset_index(drop=True)
)

lorenz['cumulative_white_students'] = lorenz['white_n'].cumsum()
lorenz['cumulative_share_white_students'] = lorenz['cumulative_white_students'] / lorenz['white_n'].sum()
# the index is 0..n-1 after reset_index, so index + 1 is the school's rank
lorenz['cumulative_share_schools'] = (lorenz.index + 1) / len(lorenz)

# hover text shown for each point on the curve
lorenz["label"] = lorenz.apply(
    lambda row: f"{row['cumulative_share_schools']*100:.2f}% of schools, {row['cumulative_share_white_students']*100:.2f}% white students",
    axis=1,
)

display(lorenz)

fig = go.Figure()
fig.add_trace(go.Scatter(x=lorenz['cumulative_share_schools'], y=lorenz['cumulative_share_white_students'],
                         mode='lines', name='Lorenz Curve', text=lorenz["label"],
                         fill='tozeroy'))

# line of equality
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Line of Equality', line=dict(dash='dash')))

fig.update_layout(title='Lorenz Curve of White Students Distribution Across NYC Schools',
                  xaxis_title='Cumulative Share of Schools',
                  yaxis_title='Cumulative Share of White Students',
                  xaxis=dict(tickformat=".0%"),
                  yaxis=dict(tickformat=".0%"),
                  showlegend=True)

fig.show()

Unnamed: 0,dbn,white_n,cumulative_white_students,cumulative_share_white_students,cumulative_share_schools,label
0,10X159,0,0,0.000000,0.000529,"0.05% of schools, 0.00% white students"
1,09X170,0,0,0.000000,0.001058,"0.11% of schools, 0.00% white students"
2,09X088,0,0,0.000000,0.001587,"0.16% of schools, 0.00% white students"
3,09X004,0,0,0.000000,0.002116,"0.21% of schools, 0.00% white students"
4,84K710,0,0,0.000000,0.002646,"0.26% of schools, 0.00% white students"
...,...,...,...,...,...,...
1885,20K490,1307,130670,0.949106,0.997884,"99.79% of schools, 94.91% white students"
1886,13K430,1344,132014,0.958867,0.998413,"99.84% of schools, 95.89% white students"
1887,31R440,1380,133394,0.968891,0.998942,"99.89% of schools, 96.89% white students"
1888,22K425,1760,135154,0.981674,0.999471,"99.95% of schools, 98.17% white students"


In [8]:
def calc_M(district):
    """Calculate a district's mutual information index for each academic year.

    Reads the notebook-level long-format ``data`` frame (columns ``dbn``,
    ``ay``, ``district``, ``g``, ``n``) and calls ``seg.mutual_information``
    once per academic year found for the district.

    Parameters
    ----------
    district
        Value matched against ``data.district``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ay``, ``M`` and ``district``, sorted by ``ay``. The second
        value returned by ``seg.mutual_information`` (``H``) is discarded.
    """
    # Non-mutating chain: the school id column is renamed to `u` before the
    # frame is handed to seg.mutual_information, and zero-count rows are dropped.
    rows = (
        data[data.district == district]
        .rename(columns={'dbn': 'u'})
        .loc[lambda frame: frame.n > 0]
    )

    per_year = [
        [year] + list(seg.mutual_information(rows[rows.ay == year]))
        for year in rows.ay.unique()
    ]
    result = pd.DataFrame(columns=['ay', 'M', 'H'], data=per_year)

    # assign() builds a new frame rather than writing into a column slice
    return (
        result[["ay", "M"]]
        .assign(district=district)
        .sort_values(by='ay')
    )


# Per-district M for every district and academic year, stacked into one frame.
m_data = pd.concat([calc_M(district) for district in data.district.unique()])

# Citywide M for the most recent academic year, treating all schools as one system.
# The year is derived from the data so this cell tracks the same "latest year"
# as the cells that filter on demos.ay.max().
latest_ay = data.ay.max()
all_schools = data.rename(columns={'dbn': 'u'})
all_schools = all_schools[(all_schools.n > 0) & (all_schools.ay == latest_ay)]
M, H = seg.mutual_information(all_schools)

# citywide index next to the unweighted mean of the per-district indices
M, m_data[m_data.ay == latest_ay].M.mean()

(0.4077531081698128, 0.17255912753825645)

In [9]:
# Brooklyn districts, restricted to district numbers below 33.
bk = demos[demos.boro == 'Brooklyn']["district"].unique()
bk = sorted([x for x in bk if x < 33])

# sort_values returns a new frame, so the filtered slice of m_data is never
# modified in place
graph_data = m_data[m_data.district.isin(bk)].sort_values(by=["district", "ay"])

# One line per district showing its M index across academic years.
fig = px.line(graph_data, x='ay', y='M', color='district', markers=True,
              labels={
                  "ay": "Academic Year",
                  "M": "M Index"
              },
              title="Theil's M Index, Brooklyn Schools")

fig.update_traces(mode="markers+lines", hoverinfo="name+y+x")

fig.update_layout(
    hovermode="closest",
    plot_bgcolor="white",
    legend_title_text='District',
    xaxis=dict(title='Academic Year'),
    yaxis=dict(title='M Index'),
    legend=dict(
        title_font_size=16,
        font_size=12,
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)

fig.show()

In [10]:
# Share of Black and Hispanic students per school (fraction of total enrollment).
demos["black_hispanic_n"] = demos["black_n"] + demos["hispanic_n"]
demos["black_hispanic_pct"] = demos["black_hispanic_n"] / demos["total_enrollment"]
hoods = geo.load_district_neighborhoods()

# Mean of the school-level shares for each Brooklyn district in 2005 and 2022.
bar_data = demos[(demos.district.isin(bk)) & (demos.ay.isin([2005, 2022]))]
bar_data = bar_data[["district", "ay", "black_hispanic_pct"]]
bar_data = bar_data.groupby(["district", "ay"]).mean().reset_index()

# fraction -> percent for display
bar_data['black_hispanic_pct'] *= 100
bar_data = bar_data.merge(m_data, on=["district", "ay"], how="inner")
bar_data = bar_data.merge(hoods, on=["district"], how="inner")
# turn the comma-separated neighborhood string into an HTML list for the hover box
bar_data["neighborhood"] = bar_data["neighborhood"].apply(
    lambda x: "<br><b>Neighborhoods:</b><br>" + "<br>".join(x.split(", ")))

# string-typed district and year so both are treated as categories on the chart
bar_data["district"] = bar_data["district"].astype(str)
bar_data["ay"] = bar_data["ay"].astype(str)
bar_data['M_label'] = bar_data.apply(
    lambda x: f'M {x.M:.2f}<br> Black/Latinx {x.black_hispanic_pct:.0f}%', axis=1)

fig = px.bar(bar_data, x='district', y='black_hispanic_pct', color='ay', barmode='group',
             labels={'district': 'District',
                     'black_hispanic_pct': '% Black/Hispanic', 'ay': 'Academic Year'},
             title='Percentage of Black and Hispanic Students in Brooklyn School Districts (2005, 2022)',
             text='M_label',
             hover_data=["M", "ay", "neighborhood"])

# customdata follows the hover_data order: [M, ay, neighborhood]
fig.update_traces(
    hovertemplate='District: %{x}<br>AY: %{customdata[1]}<br>Percentage: %{y:.2f}%<br>M Index: %{customdata[0]:.2f}%{customdata[2]}')

# barmode='group' is already set by px.bar above
fig.update_layout(xaxis_title="District",
                  yaxis_title="Percent Black/Hispanic")

fig.show()

In [11]:
import plotly.graph_objects as go

# Mean school-level Black/Hispanic share per district for the most recent
# academic year, joined to that year's district M index.
# Grouping on both keys keeps `ay` as-is for the merge instead of averaging it.
corr_data = (
    demos.loc[demos.ay == demos.ay.max(), ["district", "ay", "black_hispanic_pct"]]
    .groupby(["district", "ay"], as_index=False)
    .mean()
    .merge(m_data, on=["district", "ay"], how="inner")
    .drop(columns=["ay"])
)

# just the "regular" districts; string labels so the x axis is categorical
corr_data = (
    corr_data[corr_data.district < 33]
    .sort_values(by='district')
    .assign(district=lambda frame: frame.district.astype(str))
)

fig = go.Figure()

fig.add_trace(go.Scatter(x=corr_data['district'], y=corr_data['black_hispanic_pct'], mode='lines+markers', name='% Black/Hispanic'))
fig.add_trace(go.Scatter(x=corr_data['district'], y=corr_data['M'], mode='lines+markers', name='M Index'))

fig.update_layout(title='Correlation of % Black/Hispanic and M Index',
                  xaxis_title='District',
                  yaxis_title='',
                  yaxis=dict(range=[0, 1]))

fig.show()

# Pearson correlation between the district share and the district M index
corr_data[["black_hispanic_pct", "M"]].corr()

Unnamed: 0,black_hispanic_pct,M
black_hispanic_pct,1.0,-0.420742
M,-0.420742,1.0
