References
============

Allen, R., & Vignoles, A. (2007). What should an index of school segregation measure? _Oxford Review of Education_, _33_(5), 643–668. https://doi.org/10.1080/03054980701366306

Cohen, D. (2021). NYC School Segregation Report Card: Still Last, Action Needed Now! _Civil Rights Project/Proyecto Derechos Civiles_. UCLA. https://escholarship.org/uc/item/5fx616qn

Frankel, D. M., & Volij, O. (2011). Measuring school segregation. _Journal of Economic Theory_, _146_(1), 1–38. https://doi.org/10.1016/j.jet.2010.10.008

Lefty, L. (2021, February 11). [The Long Fight for Educational Equity in NYC](https://www.mcny.org/story/long-fight-educational-equity-nyc). _Museum of the City of New York_.

Zhang, C. H., & Ruther, M. (2021). Contemporary patterns and issues of school segregation and white flight in U.S. metropolitan areas: Towards spatial inquiries. _GeoJournal_, _86_(3), 1511–1526. https://doi.org/10.1007/s10708-019-10122-1


In [3]:
from functools import partial
import pandas as pd
import numpy as np


# Read the Common Core of Data extract of per-school enrolment counts.
# `raw_data` keeps every column of the CSV (the file's unnamed leading index
# column shows up as "Unnamed: 0"); `data` is the same table without the
# school identifier, i.e. the frame handed to the index calculations.
raw_data = pd.read_csv("_data/ccod-2007-ny_metro.csv")
data = raw_data.drop(columns='school_id')

# Preview the first rows (shown as the cell output in a notebook).
raw_data.head()


Unnamed: 0,school_id,asian_n,black_n,hispanic_n,white_n
0,1275,2,0,4,169
1,2619,14,1,2,20
2,24,6,12,31,1247
3,26,5,11,16,661
4,27,6,2,3,365


In [4]:
# from Frankel & Volij
public_school_pop_ny = 2380186

# from Allen, R., & Vignoles, A. (2007)

def calculate_dissimilarity(data):
    """Return the share-weighted mean of the per-group dissimilarity indices.

    For each ethnic group ``g`` the two-group (group vs. everyone else)
    dissimilarity index is

        D_g = 1/2 * sum over schools s of | g_s / G - (T_s - g_s) / (T - G) |

    where ``g_s`` is the group's count in school ``s``, ``T_s`` the school's
    count over the four groups, and ``G`` / ``T`` the same quantities summed
    over all schools.  The returned value is the average of the ``D_g``
    weighted by each group's share ``G / T``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per school with the count columns ``asian_n``, ``black_n``,
        ``hispanic_n`` and ``white_n``.  Any other column is ignored.

    Returns
    -------
    float
        The weighted dissimilarity index.  Groups that have no students at
        all are left out (their weight is zero anyway) instead of turning the
        whole result into NaN.  NaN is returned only when no group can be
        compared with the rest (no students, or a single group present).

    Notes
    -----
    Prints the total taken from the article (``public_school_pop_ny``) next to
    the total found in ``data`` so the two can be compared by eye.
    """
    # Same order as the original weighted average: asian, black, hispanic, white.
    group_cols = ['asian_n', 'black_n', 'hispanic_n', 'white_n']
    counts = data[group_cols]

    group_totals = counts.sum()
    total = group_totals.sum()
    print(f"Total from article {public_school_pop_ny:,}")
    print(f"   Total from data {total:,}")

    # Per-school enrolment over the same four groups that make up `total`, so
    # the "everyone else" numerator and denominator always agree.
    school_totals = counts.sum(axis=1)

    d_values = []
    weights = []
    for col in group_cols:
        in_group = group_totals[col]
        out_group = total - in_group
        if in_group == 0 or out_group == 0:
            # 0/0 would poison the average with NaN; an empty group carries
            # zero weight, and a group that is the whole population has
            # nobody to be compared against.
            continue
        gap = counts[col] / in_group - (school_totals - counts[col]) / out_group
        d_values.append(gap.abs().sum() / 2)
        weights.append(in_group / total)

    if not d_values:
        return float('nan')

    # calculate a weighted average of the D indices
    return np.average(d_values, weights=weights)

# Weighted dissimilarity index over every school in `data`; the call also
# prints the article total next to the total found in the data.
nyc_D = calculate_dissimilarity(data)

print("City D", nyc_D)


Total from article 2,380,186
   Total from data 181,380
City D 0.52419646033252


In [15]:

# from Frankel, D. M., & Volij, O. (2011).
# Reshape the wide per-school table into long form: one row per
# (school_id, ethnicity) pair with the head count in `count`, ordered by
# school and then ethnicity.  The "_n" marker is removed from the column
# names so the `ethnicity` values read "asian", "black", "hispanic", "white".
ethnicity_cols = ['asian_n', 'black_n', 'hispanic_n', 'white_n']
school_ethnicity_counts_df = (
    raw_data.loc[:, ['school_id', *ethnicity_cols]]
    .rename(columns=dict(zip(ethnicity_cols, (name.replace('_n', '') for name in ethnicity_cols))))
    .melt(id_vars=['school_id'], var_name='ethnicity', value_name='count')
    .sort_values(by=['school_id', 'ethnicity'])
    .reset_index(drop=True)
)
# Total head count across all schools and groups (only displayed when this is
# the last expression of a notebook cell).
school_ethnicity_counts_df["count"].sum()


def dissimilarity_index(school_eth_counts: pd.DataFrame) -> dict:
    """Compute the multigroup dissimilarity index for a set of schools.

    With ``P_g`` the overall share of group ``g``, ``pi_n`` the share of all
    students who attend school ``n`` and ``p_g_n`` the share of school ``n``
    that belongs to group ``g``:

        I = sum_g P_g * (1 - P_g)                      (Simpson interaction)
        D = 1 / (2 * I) * sum_g P_g * sum_n pi_n * | p_g_n / P_g - 1 |

    Parameters
    ----------
    school_eth_counts : pandas.DataFrame
        Long-format table with the columns ``school_id``, ``ethnicity`` and
        ``count``.  A (school, ethnicity) pair that has no row is treated as
        a count of zero, and repeated pairs are added together.

    Returns
    -------
    dict
        ``dissimilarity_index`` (D), ``simpson_interaction_index`` (I) and
        ``dissimilarity_index_unnormalized`` (I * D).
    """
    # g --> group/ethnicity index
    # n --> school index

    # School x ethnicity matrix of counts.  Building it once replaces the
    # per-school apply / melt / join round trip, which only worked when every
    # school listed exactly the same set of ethnicities.
    counts = (
        school_eth_counts
        .groupby(['school_id', 'ethnicity'])['count']
        .sum()
        .unstack('ethnicity', fill_value=0)
    )

    N_g = counts.sum(axis=0)          # students per group
    N = N_g.sum()                     # all students
    P_g = N_g / N                     # overall group shares
    I = (P_g * (1 - P_g)).sum()  # Simpson Interaction index

    N_n = counts.sum(axis=1)          # students per school
    pi_n = N_n / N                    # school shares of all students
    p_g_n = counts.div(N_n, axis=0)   # group shares within each school
    r_g_n = p_g_n.div(P_g, axis=1)    # representation relative to overall share

    # Dissimilarity index
    # Cells that are 0/0 (an empty school or an empty group) become NaN and
    # are skipped by the sums below.
    d_g_n = (r_g_n - 1).abs().mul(pi_n, axis=0)
    d_g = d_g_n.sum(axis=0)
    D = (d_g * P_g).sum() / (2 * I)

    return {
        'dissimilarity_index': D,
        'simpson_interaction_index': I,
        'dissimilarity_index_unnormalized': I * D,
    }


# Evaluate the index on the long-format per-school counts; as the last
# expression of the cell, the returned dict is what the notebook displays.
dissimilarity_index(school_ethnicity_counts_df)


{'dissimilarity_index': 0.5328080086521683,
 'simpson_interaction_index': 0.7005849431221257,
 'dissimilarity_index_unnormalized': 0.37327726843659237}