In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from functools import partial

from IPython.display import Markdown as md, HTML
from nycschools import schools, geo, ui, class_size
from sklearn.metrics import mutual_info_score

In [41]:
# Load the test data from the R segregation package
#  https://elbersb.github.io/segregation/articles/segregation.html
schools00 = pd.read_csv("schools00.csv")

# Work on a copy so `schools00` stays untouched, and keep a feather
# snapshot of the same table on disk.
df = schools00.copy()
df.to_feather("segdata.feather")

# Reference values from the package docs; the implementations below are
# asserted against these.
test_M = 0.4255390
test_H = 0.4188083

# The preview has to be the cell's last expression, otherwise the notebook
# evaluates it and throws the result away.
schools00.head()

In [40]:
# Pandas / Numpy implementation
# =============================



def mutual_information(df, group="g", unit="u", n="n"):
    """Compute the mutual information index M, Theil's H and group entropy E.

    `df` is a long-format table with one row per (unit, group) cell and a
    count column. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the `unit`, `group` and `n` columns.
    group : str
        Name of the column that identifies the group (default "g").
    unit : str
        Name of the column that identifies the unit (default "u").
    n : str
        Name of the column holding the count of each cell (default "n").

    Returns
    -------
    tuple
        `(M, H, E)` where `M` is the mutual information between unit and
        group (natural log), `E` is the entropy of the group marginal and
        `H = M / E`. `H` is nan when `E` is zero (a single group).

    Notes
    -----
    Rows that share the same (unit, group) pair are summed into one cell
    before any probability is computed, and cells with a non-positive count
    are dropped: an empty cell contributes nothing to M (0 * log 0 is taken
    as 0), whereas leaving it in would turn the whole sum into nan. Rows
    whose unit or group label is missing are excluded by the groupby.
    """
    # One row per distinct (unit, group) cell, duplicates summed
    cells = df.groupby([unit, group], as_index=False)[n].sum()
    # Empty cells would yield 0 * log(0) = nan; they carry no information
    cells = cells[cells[n] > 0]

    N = cells[n].sum()
    p_ug = cells[n] / N  # joint probability of each cell
    p_u = cells.groupby(unit)[n].sum() / N  # marginal of each unit
    p_g = cells.groupby(group)[n].sum() / N  # marginal of each group

    # Broadcast the marginals back onto the cells they belong to
    p_u_cell = cells[unit].map(p_u)
    p_g_cell = cells[group].map(p_g)

    M = np.sum(p_ug * np.log(p_ug / (p_u_cell * p_g_cell)))
    E = -np.sum(p_g * np.log(p_g))
    H = M / E
    return M, H, E

# Score the reference data, using race as the group and school as the unit
M, H, E = mutual_information(schools00, unit="school", group="race")

# Both statistics must agree with the reference values to within 1e-6
assert abs(test_M - M) < 1e-6, f"Mutual information is {M}, not {test_M}"
assert abs(test_H - H) < 1e-6, f"Theil H-index is {H}, not {test_H}"

# Report the three numbers, one per line
print(
    f'Entropy: {E}',
    f'Mutual Information Score: {M}',
    f'Theil H index: {H}',
    sep="\n",
)

Entropy: 1.0160709828319177
Mutual Information Score: 0.42553897587641965
Theil H index: 0.41880831464194457


In [29]:
# USING sklearn
# ====================================================================================
# Cross-check the pandas implementation against sklearn's mutual_info_score.

# Marginal distribution of the groups: group totals over the grand total
N_g = df.groupby('race')['n'].sum()
N = N_g.sum()
p_g = N_g / N
# A group with no members would add 0 * log(0) = nan to the entropy
p_g = p_g[p_g > 0]

# Entropy for the groups (natural log, the same base mutual_info_score uses)
E = -np.sum(p_g * np.log(p_g))

# Hand sklearn the school x race table of counts directly through
# `contingency=` instead of materialising one row per student; the two label
# arguments are ignored when a contingency table is supplied.
contingency = df.groupby(['school', 'race'])['n'].sum().unstack(fill_value=0)
M = mutual_info_score(None, None, contingency=contingency.to_numpy())

# Compute the Theil index
H = M / E


assert abs(M - test_M) < 1e-6, f"Mutual information is {M}, not {test_M}"
assert abs(H - test_H) < 1e-6, f"Theil H-index is {H}, not {test_H}"
print(f'Entropy: {E}')
print(f'Mutual Information Score: {M}')
print(f'Theil H index: {H}')

Entropy: 1.0160709828319177
Mutual Information Score: 0.42553897587641937
Thiel H index: 0.4188083146419443


In [14]:
# Already imported in the first cell; repeated here so this cell also runs on its own.
from sklearn.metrics import mutual_info_score
df = schools.load_school_demographics()
# Keep only the rows of the latest `ay` value (presumably the academic year)
data = df[df.ay == df.ay.max()]

# The wide table is already a school x group table of counts, so it can be
# passed to sklearn as the contingency table. This avoids melting it and
# repeating one row per student. Rows sharing a `dbn` are summed, which is
# what treating them as the same label would amount to.
group_cols = ['asian_n', 'white_n', 'black_n', 'hispanic_n']
contingency = data.groupby('dbn')[group_cols].sum()

# Now, calculate mutual information (label arguments are ignored when a
# contingency table is given)
mi = mutual_info_score(None, None, contingency=contingency.to_numpy())

mi

In [4]:
# Count columns for the four groups used in the index
groups = ["asian_n", "black_n", "hispanic_n", "white_n"]

# Reload the demographics and keep, for the latest `ay` value only,
# the school id plus one count column per group.
df = schools.load_school_demographics()
data = df.loc[df.ay == df.ay.max(), ["dbn", *groups]]
data.columns

Index(['dbn', 'asian_n', 'black_n', 'hispanic_n', 'white_n'], dtype='object')

In [5]:
# M index based on
# https://elbersb.github.io/segregation/articles/segregation.html

U = data.dbn  # units, i.e. schools in our data
G = groups  # only the 4 ethnic groups are used; they are assumed to be distinct

# Relabel the columns by position on a fresh frame: the first column is the
# unit, the remaining four are the groups. `data` itself is left as it is.
T = data.set_axis(['u', 'asian', 'black', 'hispanic', 'white'], axis=1)
T.head()


Unnamed: 0,u,asian,black,hispanic,white
4,01M015,26,53,102,11
9,01M019,13,41,130,17
14,01M020,102,55,215,30
19,01M034,7,104,152,8
24,01M063,6,40,132,19


In [1]:
import pandas as pd
import numpy as np

# Create the DataFrame: five schools (rows) by four groups (columns) of counts
data = {
    'u': ['01M015', '01M019', '01M020', '01M034', '01M063'],
    'asian': [10, 13, 102, 7, 6],
    'black': [10, 41, 55, 104, 40],
    'hispanic': [10, 130, 215, 152, 132],
    'white': [10, 17, 30, 8, 19]
}

df = pd.DataFrame(data).set_index('u')

# Calculate the joint probabilities p_ug and both marginals
total = df.values.sum()
joint_probabilities = df / total # school-group pop / total pop
marginal_u = df.sum(axis=1) / total # school pop / total pop
marginal_g = df.sum(axis=0) / total # group pop / total pop
display(joint_probabilities)
display(marginal_u)
display(marginal_g)


# Joint probability each cell would have if school and group were
# independent: p_u * p_g, laid out like `joint_probabilities`
independent = np.outer(marginal_u, marginal_g)

# Ratio p_ug / (p_u * p_g). Empty cells are set to 1 so their log is 0 and
# they add nothing to the sum, which avoids log(0).
ratio = (joint_probabilities / independent).where(joint_probabilities > 0, 1.0)

# Calculate the Mutual Information in one vectorised pass. The result is
# named `mi_example` rather than `mutual_information` so that it does not
# replace the mutual_information() function defined earlier in the notebook.
mi_example = (joint_probabilities * np.log(ratio)).to_numpy().sum()

mi_example

Unnamed: 0_level_0,asian,black,hispanic,white
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01M015,0.009001,0.009001,0.009001,0.009001
01M019,0.011701,0.036904,0.117012,0.015302
01M020,0.091809,0.049505,0.193519,0.027003
01M034,0.006301,0.093609,0.136814,0.007201
01M063,0.005401,0.036004,0.118812,0.017102


u
01M015    0.036004
01M019    0.180918
01M020    0.361836
01M034    0.243924
01M063    0.177318
dtype: float64

asian       0.124212
black       0.225023
hispanic    0.575158
white       0.075608
dtype: float64

0.08210034929324432

In [None]:
import pandas as pd
import numpy as np

# Create the DataFrame: five schools (rows) by four groups (columns) of counts
data = {
    'u': ['01M015', '01M019', '01M020', '01M034', '01M063'],
    'asian': [10, 13, 102, 7, 6],
    'black': [10, 41, 55, 104, 40],
    'hispanic': [10, 130, 215, 152, 132],
    'white': [10, 17, 30, 8, 19]
}

df = pd.DataFrame(data).set_index('u')

# Reshape to one row per (school, group) cell so that p_ug, p_u and p_g can
# each live in an ordinary column. The wide frame cannot hold them: a whole
# table does not fit in one column, and the marginals are indexed differently.
total = df.values.sum()
cells = df.reset_index().melt(id_vars="u", var_name="g", value_name="n")
cells["p_ug"] = cells["n"] / total  # school-group pop / total pop
cells["p_u"] = cells["u"].map(df.sum(axis=1) / total)  # school pop / total pop
cells["p_g"] = cells["g"].map(df.sum(axis=0) / total)  # group pop / total pop


def calc_m(row):
    """Return one cell's contribution to the mutual information.

    `row` must provide `p_ug`, `p_u` and `p_g`. The contribution is
    p_ug * log(p_ug / (p_u * p_g)); an empty cell (p_ug == 0) contributes
    0.0 so that log(0) is never evaluated.
    """
    if row["p_ug"] > 0:  # To avoid log(0)
        return row["p_ug"] * np.log(row["p_ug"] / (row["p_u"] * row["p_g"]))
    return 0.0


# Sum the per-cell contributions. The name avoids `mutual_information`,
# which is the function defined earlier in the notebook.
mi_rowwise = cells.apply(calc_m, axis=1).sum()
mi_rowwise