In [None]:
import pandas as pd
import numpy as np
from segregation.multigroup import MultiDissim as D, MultiInfoTheory as M
from segregation.batch import batch_compute_multigroup


Even Distribution
=================
These examples show "perfect integration" on the dissimilarity and information theory indices
because the population (students) of every tract (school) has the same group composition as the district as a whole.

With no segregation, each index takes its minimum value of 0.

In [2]:
# Four schools, each enrolling 50 students from every group
groups = ["black", "white", "asian", "hispanic"]
schools = ["a", "b", "c", "d"]
n = [50] * len(groups)

# Start from an empty frame, label the schools, then write one count per group column
df = pd.DataFrame(columns=["school", *groups])
df["school"] = schools
df[groups] = n
display(df)

# Compute both indices on the same frame
diss, mi = D(df, groups=groups).statistic, M(df, groups=groups).statistic
f"With even distributions in the district and each tract, both D and M are 0: D: {diss}, M: {mi}"

Unnamed: 0,school,black,white,asian,hispanic
0,a,50,50,50,50
1,b,50,50,50,50
2,c,50,50,50,50
3,d,50,50,50,50


'With even distributions in the district and each tract, both D and M are 0: D: 0.0, M: 0.0'

In [3]:
# The district is mostly white, yet every school has the identical group mix
groups = ["black", "white", "asian", "hispanic"]
schools = ["a", "b", "c", "d"]
n = [10, 80, 5, 5]  # students per group, repeated for every school

# Start from an empty frame, label the schools, then write one count per group column
df = pd.DataFrame(columns=["school", *groups])
df["school"] = schools
df[groups] = n
display(df)

# Compute both indices on the same frame
diss, mi = D(df, groups=groups).statistic, M(df, groups=groups).statistic
f"With even distributions in each tract, both D and M are still 0: D: {diss}, M: {mi}"

Unnamed: 0,school,black,white,asian,hispanic
0,a,10,80,5,5
1,b,10,80,5,5
2,c,10,80,5,5
3,d,10,80,5,5


'With even distributions in each tract, both D and M are still 0: D: 0.0, M: 0.0'

Maximum Segregation
===================

Here, even though the district has even populations in each group,
the schools are perfectly segregated because each group is
isolated in its own school.

As tracts become more segregated, both indices approach 1. Here, both are exactly 1.

In [None]:
# create a set of 4 schools of 100 students where each group attends a school
# of its own, so no school contains more than one group
groups = ["black", "white", "asian", "hispanic"]
schools = ["a", "b", "c", "d"]

df = pd.DataFrame(columns=["school"] + groups)
df["school"] = schools

# one row of counts per school: all 100 students fall in a single group column
df.loc[df.school == "a", groups] = [100, 0, 0, 0]
df.loc[df.school == "b", groups] = [0, 100, 0, 0]
df.loc[df.school == "c", groups] = [0, 0, 100, 0]
df.loc[df.school == "d", groups] = [0, 0, 0, 100]

# make sure the groups are int or float, not object (to handle 0)
df[groups] = df[groups].astype(int)
display(df)
diss = D(df, groups=groups).statistic
mi = M(df, groups=groups).statistic

f"With each group alone in a school, we have maximum segregation (1): D: {diss}, M: {mi}"

Unnamed: 0,school,black,white,asian,hispanic
0,a,100,0,0,0
1,b,0,100,0,0
2,c,0,0,100,0
3,d,0,0,0,100


'With each group alone in a school, we have maximum segregation (1): D: 1.0, M: 1.0'

Random Distribution
===================
This code assigns each school a random split of its 100 students across the groups.
You can run it multiple times to see what "real" values of the indices might look like.

In [53]:
# create a set of 4 schools; each school's group counts are filled in randomly below
# use 2 groups to make it easier to see (add more names to `groups` to experiment)
groups = ["black", "white"]
schools = ["a", "b", "c", "d"]
df = pd.DataFrame(columns=["school"] + groups)
df["school"] = schools


def rand_n(total=100, n_groups=None):
    """Randomly split ``total`` into non-negative integer counts, one per group.

    The first count is drawn uniformly from ``0..total``; each later count is
    drawn uniformly from whatever is still unassigned, and the last group
    receives the remainder.

    Parameters
    ----------
    total : int
        Number to split; the returned counts always sum to this value.
    n_groups : int, optional
        How many counts to return. Defaults to ``len(groups)``, read from the
        notebook-level ``groups`` list at call time.

    Returns
    -------
    list of int
        ``n_groups`` counts that sum to ``total``.

    Raises
    ------
    ValueError
        If ``n_groups`` is less than 1.
    """
    if n_groups is None:
        n_groups = len(groups)
    if n_groups < 1:
        raise ValueError("n_groups must be at least 1")

    n = []
    remaining = total
    for _ in range(n_groups - 1):
        # randint's upper bound is exclusive, so +1 lets a group take everything left
        num = np.random.randint(0, remaining + 1)
        n.append(num)
        remaining -= num
    # the final group takes whatever is left so the counts sum to `total`
    n.append(remaining)
    return n

# give every school listed in `schools` its own random split across the groups
for school in schools:
    df.loc[df.school == school, groups] = rand_n()

# make sure the groups are int or float, not object (to handle 0);
# cast before summing so the totals are computed on integer columns
df[groups] = df[groups].astype(int)

# per-school enrollment across all groups
df["total"] = df[groups].sum(axis=1)
display(df)
diss = D(df, groups=groups).statistic
mi = M(df, groups=groups).statistic

f"D: {diss}, M: {mi}"

Unnamed: 0,school,black,white,total
0,a,88,12,100
1,b,84,16,100
2,c,28,72,100
3,d,96,4,100


'D: 0.5977130977130977, M: 0.3161707463099064'