In [3]:
import numpy as np
import pandas as pd

In [4]:
population = pd.DataFrame(
    ["white"] * 100000 +
    ["hispanic"] * 60000 +
    ["black"] * 50000 +
    ["asian"] * 15000 +
    ["other"] * 35000
)

sample = pd.DataFrame(
    ["white"] * 600 +
    ["hispanic"] * 300 +
    ["black"] * 250 +
    ["asian"] * 75 +
    ["other"] * 150
)

print("Population:")
display(population)

print("\nSample:")
display(sample)

Population:


Unnamed: 0,0
0,white
1,white
2,white
3,white
4,white
...,...
259995,other
259996,other
259997,other
259998,other



Sample:


Unnamed: 0,0
0,white
1,white
2,white
3,white
4,white
...,...
1370,other
1371,other
1372,other
1373,other


In [5]:
population_table = pd.crosstab(index=population[0], columns='count')
sample_table = pd.crosstab(index=sample[0], columns='count')

print("Population:")
display(population_table)

print("\nSample:")
display(sample_table)

Population:


col_0,count
0,Unnamed: 1_level_1
asian,15000
black,50000
hispanic,60000
other,35000
white,100000



Sample:


col_0,count
0,Unnamed: 1_level_1
asian,75
black,250
hispanic,300
other,150
white,600


In [6]:
observed = sample_table

population_ratios = population_table / len(population)

print("Population Ratios:")
display(population_ratios)

expected = population_ratios * len(sample)
print("\nExpected:")
display(expected)

chi_square_stat = (((observed - expected)**2/expected).sum())
print(chi_square_stat)

Population Ratios:


col_0,count
0,Unnamed: 1_level_1
asian,0.057692
black,0.192308
hispanic,0.230769
other,0.134615
white,0.384615



Expected:


col_0,count
0,Unnamed: 1_level_1
asian,79.326923
black,264.423077
hispanic,317.307692
other,185.096154
white,528.846154


col_0
count    18.194805
dtype: float64


In [7]:
from scipy.stats import chi2

critical = chi2.ppf(
    q=0.95,   # for 95% confidence interval    
    df=4      # degrees of freedom = number of categories - 1
)

print(f"Critical value: {critical}")

p_value = 1 - chi2.cdf(x=chi_square_stat, df=4)
print(p_value)

Critical value: 9.487729036781154
[0.00113047]


In [8]:
observed = sample_table

population_ratios = population_table / len(population)

expected = population_ratios * len(sample)


In [9]:
from scipy.stats import chisquare

chisquare(f_obs=observed, f_exp=expected)

Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

In [10]:
np.random.seed(10)

# Sample data generate at fixed probabilities
voter_race = np.random.choice(
    a=['asian', 'black', 'hispanics', 'other', 'white'],
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
    size=1000
)

voter_party = np.random.choice(
    a=['democrat', 'independent', 'republican'],
    p=[0.4, 0.2, 0.4],
    size=1000
)

voters = pd.DataFrame({
    'race': voter_race,
    'party': voter_party
})

print("Voters:")
display(voters)

voters_table = pd.crosstab(voters['race'], voters['party'], margins=True)
voters_table.columns = ['democrat', 'independent', 'republican', 'row_totals']
voters_table.index = ['asian', 'black', 'hispanic', 'other', 'white', 'col_totals']
print("\nVoters Table:")
display(voters_table)

Voters:


Unnamed: 0,race,party
0,white,democrat
1,asian,republican
2,white,independent
3,white,republican
4,other,democrat
...,...,...
995,white,republican
996,hispanics,independent
997,black,independent
998,white,republican



Voters Table:


Unnamed: 0,democrat,independent,republican,row_totals
asian,21,7,32,60
black,65,25,64,154
hispanic,107,50,94,251
other,15,8,15,38
white,189,96,212,497
col_totals,397,186,417,1000


In [12]:
observed = voters_table.iloc[0:5, 0:3]
display(observed)

Unnamed: 0,democrat,independent,republican
asian,21,7,32
black,65,25,64
hispanic,107,50,94
other,15,8,15
white,189,96,212


In [13]:
# display(voters_table["row_totals"][0:5])
# display(voters_table.loc["col_totals"][0:3])

expected = np.outer(
    voters_table["row_totals"][0:5],
    voters_table.loc["col_totals"][0:3]
) / 1000

# display(expected)

expected = pd.DataFrame(expected)
expected.columns = ['democrat', 'independent', 'republican']
expected.index = ['asian', 'black', 'hispanic', 'other', 'white']

display(expected)

Unnamed: 0,democrat,independent,republican
asian,23.82,11.16,25.02
black,61.138,28.644,64.218
hispanic,99.647,46.686,104.667
other,15.086,7.068,15.846
white,197.309,92.442,207.249


In [14]:
chi_square_stat = (((observed - expected)**2/expected).sum()).sum()
print(chi_square_stat)


7.169321280162059


In [15]:
from scipy.stats import chi2

critical = chi2.ppf(
    q=0.95,   # for 95% confidence interval    
    df=8      # degrees of freedom = (number of rows - 1) * (number of cols - 1)
)

print(f"Critical value: {critical}")

p_value = 1 - chi2.cdf(x=chi_square_stat, df=8)
print(p_value)

Critical value: 15.507313055865453
0.518479392948842


In [16]:
from scipy.stats import chi2_contingency

result = chi2_contingency(observed=observed)

print(result)

Chi2ContingencyResult(statistic=7.169321280162059, pvalue=0.518479392948842, dof=8, expected_freq=array([[ 23.82 ,  11.16 ,  25.02 ],
       [ 61.138,  28.644,  64.218],
       [ 99.647,  46.686, 104.667],
       [ 15.086,   7.068,  15.846],
       [197.309,  92.442, 207.249]]))


In [17]:
import numpy as np
import pandas as pd

In [18]:
from scipy.stats import poisson
np.random.seed(12)

races = ['asian', 'black', 'hispanic', 'other', 'white']

voter_race = np.random.choice(
    a=races,
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
    size=1000
)

voter_age = poisson.rvs(loc=18, mu=30, size=1000)

voters = pd.DataFrame({'race':voter_race, 'age':voter_age})

groups = voters.groupby('race').groups

display(groups)
print(type(groups))
print(groups.keys())

{'asian': [4, 7, 14, 21, 49, 53, 59, 78, 95, 98, 135, 136, 162, 203, 227, 264, 278, 289, 326, 335, 345, 373, 430, 480, 484, 491, 516, 587, 602, 684, 692, 708, 715, 761, 776, 826, 828, 832, 853, 897, 942, 951, 986, 996], 'black': [0, 9, 19, 22, 23, 42, 50, 56, 62, 76, 105, 108, 119, 120, 124, 126, 131, 134, 138, 143, 152, 167, 171, 178, 182, 184, 202, 206, 211, 213, 229, 233, 236, 238, 242, 243, 245, 249, 253, 259, 261, 265, 266, 281, 287, 290, 294, 299, 303, 312, 314, 320, 328, 332, 333, 340, 349, 379, 383, 389, 394, 409, 411, 412, 420, 427, 438, 456, 457, 464, 469, 472, 476, 478, 479, 486, 488, 492, 497, 498, 508, 518, 521, 534, 536, 552, 555, 557, 568, 574, 601, 607, 609, 615, 620, 624, 625, 628, 636, 646, ...], 'hispanic': [2, 10, 24, 28, 31, 32, 38, 40, 44, 45, 47, 54, 55, 58, 63, 71, 74, 83, 87, 88, 89, 91, 100, 104, 109, 110, 111, 113, 114, 117, 121, 123, 128, 132, 133, 139, 144, 145, 148, 155, 156, 158, 159, 168, 169, 172, 173, 188, 191, 195, 209, 210, 217, 218, 220, 223, 224, 2

<class 'pandas.io.formats.printing.PrettyDict'>
dict_keys(['asian', 'black', 'hispanic', 'other', 'white'])


In [19]:
asian = voter_age[groups['asian']]
black = voter_age[groups['black']]
hispanic = voter_age[groups['hispanic']]
other = voter_age[groups['other']]
white = voter_age[groups['white']]

print(asian)

print(voter_age[groups['asian']].mean())
print(voter_age[groups['black']].mean())
print(voter_age[groups['hispanic']].mean())
print(voter_age[groups['other']].mean())
print(voter_age[groups['white']].mean())

[56 52 37 50 53 47 56 43 46 54 45 54 42 44 55 50 45 49 51 57 56 46 43 53
 48 54 54 44 40 46 51 52 44 54 43 44 53 42 54 44 59 47 54 40]
48.88636363636363
47.55102040816327
48.127049180327866
48.86
47.46796116504854


In [20]:
from scipy.stats import f_oneway

f_oneway(asian, black, hispanic, other, white)

F_onewayResult(statistic=1.7744689357329695, pvalue=0.13173183201930463)

In [21]:
np.random.seed(12)

# Generate random data
voter_race = np.random.choice(
    a=races,
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
    size=1000
)

# Use a different distribution for ages of white people
white_ages = poisson.rvs(loc=18, mu=32, size=1000)

voter_age = poisson.rvs(loc=18, mu=30, size=1000)

voter_age = np.where(voter_race=='white', white_ages, voter_age)

# Group age data by race
voters = pd.DataFrame({'race': voter_race, 'age': voter_age})

groups = voters.groupby('race').groups

# Extract individual groups
asian = voter_age[groups["asian"]]
black = voter_age[groups["black"]]
hispanic = voter_age[groups["hispanic"]]
other = voter_age[groups["other"]]
white = voter_age[groups["white"]]

print(voter_age[groups['asian']].mean())
print(voter_age[groups['black']].mean())
print(voter_age[groups['hispanic']].mean())
print(voter_age[groups['other']].mean())
print(voter_age[groups['white']].mean())

# Perform the ANOVA
f_oneway(asian, black, hispanic, other, white)

48.20454545454545
47.40136054421769
48.618852459016395
47.14
50.15922330097087


F_onewayResult(statistic=10.164699828386366, pvalue=4.5613242113994585e-08)

In [22]:
race_pairs = list()
for race1 in range(4):
    for race2 in range(race1 + 1, 5):
        race_pairs.append((races[race1], races[race2]))
print(race_pairs)

[('asian', 'black'), ('asian', 'hispanic'), ('asian', 'other'), ('asian', 'white'), ('black', 'hispanic'), ('black', 'other'), ('black', 'white'), ('hispanic', 'other'), ('hispanic', 'white'), ('other', 'white')]


In [23]:
from scipy.stats import ttest_ind

for race1, race2 in race_pairs:
    print(f"\n {race1} - {race2} Test")
    print(ttest_ind(voter_age[groups[race1]], voter_age[groups[race2]]))


 asian - black Test
TtestResult(statistic=0.8386446909747979, pvalue=0.4027281369339345, df=189.0)

 asian - hispanic Test
TtestResult(statistic=-0.42594691924932293, pvalue=0.6704669004240726, df=286.0)

 asian - other Test
TtestResult(statistic=0.9795284739636, pvalue=0.3298877500095152, df=92.0)

 asian - white Test
TtestResult(statistic=-2.318108811252288, pvalue=0.020804701566400217, df=557.0)

 black - hispanic Test
TtestResult(statistic=-1.9527839210712925, pvalue=0.05156197171952593, df=389.0)

 black - other Test
TtestResult(statistic=0.28025754367057176, pvalue=0.7795770111117659, df=195.0)

 black - white Test
TtestResult(statistic=-5.379303881281834, pvalue=1.0394212166624012e-07, df=660.0)

 hispanic - other Test
TtestResult(statistic=1.5853626170340225, pvalue=0.11396630528484335, df=292.0)

 hispanic - white Test
TtestResult(statistic=-3.5160312714115376, pvalue=0.0004641298649066684, df=757.0)

 other - white Test
TtestResult(statistic=-3.763809322077872, pvalue=0.0001

In [26]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

tukey = pairwise_tukeyhsd(
    endog=voter_age, 
    groups = voter_race,
    alpha=0.05
)

tukey.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
asian,black,-0.8032,0.9208,-3.4423,1.836,False
asian,hispanic,0.4143,0.9915,-2.1011,2.9297,False
asian,other,-1.0645,0.8906,-4.2391,2.11,False
asian,white,1.9547,0.1751,-0.4575,4.3668,False
black,hispanic,1.2175,0.2318,-0.386,2.821,False
black,other,-0.2614,0.9986,-2.7757,2.253,False
black,white,2.7579,0.0,1.3217,4.194,True
hispanic,other,-1.4789,0.4374,-3.863,0.9053,False
hispanic,white,1.5404,0.004,0.3468,2.734,True
other,white,3.0192,0.0028,0.7443,5.2941,True
