In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from functools import partial

import scipy

from IPython.display import Markdown as md, HTML
from nycschools import schools, geo, ui, class_size


In [14]:
# Sanity-check the dissimilarity formula on a small five-tract table; compare with
# https://coascenters.howard.edu/dissimilarity-index-tutorial
tract_counts = [[1, 50, 10], [2, 200, 40], [3, 10, 100], [4, 30, 200], [5, 10, 150]]
df = pd.DataFrame(tract_counts, columns=["tract", "b", "w"])

# group-wide totals used as the denominators below
B, W = df["b"].sum(), df["w"].sum()
assert (B, W) == (300, 500)

# each tract's share of its group's total
df = df.assign(d_b=df["b"] / B, d_w=df["w"] / W)

# D is half the summed absolute difference between the two share columns
D = df["d_b"].sub(df["d_w"]).abs().sum() / 2
assert abs(D - 0.73) < 0.01
D

0.7333333333333334

In [31]:
# Load the school demographics and keep only the rows carrying the largest `ay` value
demographics = schools.load_school_demographics()
is_latest_ay = demographics["ay"] == demographics["ay"].max()
df = demographics[is_latest_ay]
print(f"Calculating dissimilarity index for {len(df):,} schools, AY {df.ay.max()}")


Calculating dissimilarity index for 1,890 schools, AY 2022


In [33]:
def _dissimilarity(group, group_total, reference, reference_total):
    """Return half the summed absolute difference between two share distributions.

    `group` and `reference` are per-row counts; each is divided by its own total
    to get per-row shares before the differences are taken.
    """
    # With a zero total every share is 0/0 (NaN); summing would skip those NaNs
    # and report 0.0, which reads as "perfectly even". The index is undefined
    # in that case, so say so explicitly.
    if group_total == 0 or reference_total == 0:
        return float("nan")
    return (group / group_total - reference / reference_total).abs().sum() / 2


def calc_D(data):
    """Compute three dissimilarity indices over the rows of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'asian_n', 'black_n', 'hispanic_n', 'white_n'
        and 'total_enrollment'. The frame is only read; no columns are added.

    Returns
    -------
    tuple
        (D_asian, D_non_white, D_black_hispanic) where
        - D_asian compares 'asian_n' with everyone else
          ('total_enrollment' - 'asian_n'),
        - D_non_white compares 'white_n' with everyone else
          ('total_enrollment' - 'white_n'),
        - D_black_hispanic compares 'black_n' + 'hispanic_n' with
          'asian_n' + 'white_n'.
        An entry is NaN when either side of its comparison has a zero total.
    """
    # Get the totals
    A = data['asian_n'].sum()
    B = data['black_n'].sum()
    H = data['hispanic_n'].sum()
    W = data['white_n'].sum()
    N = data['total_enrollment'].sum()

    # Per-row counts for each side of the three comparisons, kept in local
    # Series so the caller's frame is left untouched
    asian = data['asian_n']
    white = data['white_n']
    enrollment = data['total_enrollment']
    black_hispanic = data['black_n'] + data['hispanic_n']
    asian_white = asian + white

    # Calculate Dissimilarity Index for each comparison
    D_asian = _dissimilarity(asian, A, enrollment - asian, N - A)
    D_non_white = _dissimilarity(white, W, enrollment - white, N - W)
    D_black_hispanic = _dissimilarity(black_hispanic, B + H, asian_white, A + W)
    return D_asian, D_non_white, D_black_hispanic



In [85]:
# calculate dissimilarity for NYCPS as a whole and for each district
rows = [["NYCPS", *calc_D(df.copy())]]
for d in df.district.unique():
    # district labels are stored as strings so the column has one dtype
    # alongside the "NYCPS" label
    rows.append([str(d), *calc_D(df[df.district == d].copy())])

# Build the frame from row records rather than np.array(...).reshape(...):
# a NumPy array mixing the "NYCPS" label with floats is coerced to strings,
# which would turn the index columns into text and make the sort lexicographic.
diss = pd.DataFrame(rows, columns=["district", "asian_d", "non_white_d", "black_hispanic_d"])
diss.sort_values(by="black_hispanic_d", ascending=False)

Unnamed: 0,district,asian_d,non_white_d,black_hispanic_d
13,13,0.6701569130356794,0.3926845104144236,0.6259420571746532
0,NYCPS,0.5476491261340923,0.5684714632157319,0.6031986212637963
10,10,0.5907860695944609,0.5411240613454735,0.6017576168782914
5,5,0.537816246940532,0.5479882146976124,0.5892834830055006
35,84,0.5359997980574033,0.6242665109017775,0.5862933622087521
2,2,0.4648554644733785,0.4797974110941249,0.5764986997345212
14,14,0.473308129529294,0.554797063631935,0.5725736362351402
3,3,0.3967366680036763,0.3913012597291139,0.5109917549213527
1,1,0.4767036483005069,0.4436379328692973,0.4972303481573455
29,29,0.5021218686473634,0.2797591840902961,0.4970530284654044


In [86]:
# join them with district aggregates
agg = {"total_enrollment": "sum",
       "poverty_pct": "mean",
       "ell_pct": "mean",
       "asian_pct": "mean",
       "black_pct": "mean",
       "hispanic_pct": "mean",
       "white_pct": "mean"}

# one-row frame holding the same aggregates over every school in df
x = pd.DataFrame([df.agg(agg)])
x["district"] = "NYCPS"

# per-district aggregates, with the all-schools row placed first
districts = df.groupby("district").agg(agg).reset_index()
districts = pd.concat([x, districts], ignore_index=True)
# the merge key must be a string on both sides to match diss["district"]
districts["district"] = districts["district"].astype(str)

# Drop aggregate columns left over from a previous run of this cell so that
# re-running it does not merge onto an already-merged frame (which would
# produce duplicated *_x / *_y columns).
diss = diss.drop(columns=list(agg), errors="ignore").merge(districts, on="district")
diss

Unnamed: 0,district,asian_d,non_white_d,black_hispanic_d,total_enrollment,poverty_pct,ell_pct,asian_pct,black_pct,hispanic_pct,white_pct
0,NYCPS,0.5476491261340923,0.5684714632157319,0.6031986212637963,987787.0,0.775939,0.151943,0.114237,0.291878,0.442471,0.112103
1,1,0.4767036483005069,0.4436379328692973,0.4972303481573455,9470.0,0.731559,0.081927,0.125438,0.191427,0.499924,0.136576
2,2,0.4648554644733785,0.4797974110941249,0.5764986997345212,55347.0,0.568989,0.106243,0.169502,0.154687,0.370497,0.231004
3,3,0.3967366680036763,0.3913012597291139,0.5109917549213527,18867.0,0.59907,0.072855,0.068038,0.239747,0.391452,0.236945
4,4,0.4963756691701859,0.3387932982712172,0.4550124671969981,10899.0,0.845965,0.104726,0.062872,0.259084,0.584736,0.054014
5,5,0.537816246940532,0.5479882146976124,0.5892834830055006,8101.0,0.860052,0.086892,0.028358,0.484418,0.389927,0.056342
6,6,0.3721762076367145,0.5600741551903741,0.4934431653790513,17152.0,0.845703,0.297445,0.011439,0.064335,0.834968,0.061525
7,7,0.3398774848305486,0.2681277772873076,0.2266394668316233,14695.0,0.925867,0.161753,0.009764,0.26946,0.677782,0.015568
8,8,0.4136142029486914,0.3791259909239475,0.4147675737588198,22443.0,0.860856,0.153159,0.066011,0.211702,0.645034,0.045114
9,9,0.4144842528913052,0.2969457601732469,0.3136330154447229,25367.0,0.933797,0.236756,0.010343,0.267843,0.687556,0.013379


In [88]:
# Pairwise correlations between the dissimilarity indices and the aggregates.
# Cast to float explicitly: if the index columns arrive as strings, relying on
# corr() to convert them is version-dependent (some pandas versions drop
# non-numeric columns instead of converting them).
corr = diss.drop(columns=["district"], errors="ignore").astype(float).corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,asian_d,non_white_d,black_hispanic_d,total_enrollment,poverty_pct,ell_pct,asian_pct,black_pct,hispanic_pct,white_pct
asian_d,1.0,0.408983,0.779081,0.25114,0.124418,-0.162717,-0.364475,0.09745,0.261782,-0.250515
non_white_d,0.408983,1.0,0.697244,0.298534,-0.148243,0.001588,-0.130865,-0.150213,0.181205,0.088056
black_hispanic_d,0.779081,0.697244,1.0,0.312012,-0.27306,-0.170867,-0.164435,-0.117724,0.124617,0.179631
total_enrollment,0.25114,0.298534,0.312012,1.0,-0.03893,0.00757,0.02195,-0.034773,0.002888,0.039033
poverty_pct,0.124418,-0.148243,-0.27306,-0.03893,1.0,0.292028,-0.596979,0.335699,0.452146,-0.719482
ell_pct,-0.162717,0.001588,-0.170867,0.00757,0.292028,1.0,0.11707,-0.609645,0.615337,-0.011362
asian_pct,-0.364475,-0.130865,-0.164435,0.02195,-0.596979,0.11707,1.0,-0.49251,-0.394139,0.453713
black_pct,0.09745,-0.150213,-0.117724,-0.034773,0.335699,-0.609645,-0.49251,1.0,-0.505793,-0.465529
hispanic_pct,0.261782,0.181205,0.124617,0.002888,0.452146,0.615337,-0.394139,-0.505793,1.0,-0.326394
white_pct,-0.250515,0.088056,0.179631,0.039033,-0.719482,-0.011362,0.453713,-0.465529,-0.326394,1.0
