# Chi Square Test

#### The chi-square goodness-of-fit test checks whether the distribution of a categorical variable matches an expected distribution

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
def _expand(counts):
    """Turn a {category: count} mapping into a one-column DataFrame with one row per individual."""
    return pd.DataFrame([label for label, n in counts.items() for _ in range(n)])


# Population make-up nationally and in the Minnesota sample
national = _expand({'white': 100000, 'hispanic': 60000, 'black': 50000,
                    'asian': 15000, 'other': 35000})

minnesota = _expand({'white': 600, 'hispanic': 300, 'black': 250,
                     'asian': 75, 'other': 150})

# Frequency table per category; the single column is labelled 'count'
national_table = pd.crosstab(index=national[0], columns='count')
minnesota_table = pd.crosstab(index=minnesota[0], columns='count')

# One print call; sep="\n" puts each item on its own line
print("National", national_table, " ", "Minnesota", minnesota_table, sep="\n")

National
col_0      count
0               
asian      15000
black      50000
hispanic   60000
other      35000
white     100000
 
Minnesota
col_0     count
0              
asian        75
black       250
hispanic    300
other       150
white       600


#### The chi-squared goodness-of-fit statistic is $\chi^2 = \sum \frac{(\text{Observed} - \text{Expected})^2}{\text{Expected}}$, summed over all categories

In [10]:
#Manual Chi-Square Test

# Observed counts come from the Minnesota sample
observed = minnesota_table

# Share of the national population that falls in each category
national_ratios = national_table.div(len(national))

print(national_ratios)

# Counts a sample of Minnesota's size would show if it followed the national shares
expected = national_ratios.mul(len(minnesota))

# (observed - expected)^2 / expected per category, then summed down the column;
# the result is a one-element Series labelled 'count'
squared_gap = (observed - expected).pow(2)
chi_squared_stat = squared_gap.div(expected).sum()

print(chi_squared_stat)

col_0        count
0                 
asian     0.057692
black     0.192308
hispanic  0.230769
other     0.134615
white     0.384615
col_0
count    18.194805
dtype: float64


In [16]:
# Critical value: the 95th percentile of the chi-squared distribution, i.e. the
# rejection threshold for a test at the 5% significance level.
crit = stats.chi2.ppf(q = 0.95,
                      df = 4) # Degrees of freedom = number of categories - 1 = 5 - 1

print("Critical Value", crit)

# p-value: upper-tail probability of the observed statistic. The survival
# function sf(x) is defined as 1 - cdf(x) but avoids the precision lost in that
# subtraction when the tail probability is very small.
p_value = stats.chi2.sf(chi_squared_stat,
                        df = 4)

print("P Value", p_value)

Critical Value 9.487729036781154
P Value [0.00113047]


In [18]:
# The p-value of about 0.0011 is far below 0.05, so the difference is significant
# at the 5% level (and would remain so at any level down to roughly 0.11%).

# chi_squared_stat is a one-element Series labelled 'count', so take its value by
# position with .iloc; plain [0] on a label-indexed Series is deprecated in pandas.
if crit < chi_squared_stat.iloc[0]:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")

We reject the null hypothesis


In [20]:
# Same goodness-of-fit test in a single SciPy call; the statistic and p-value
# it returns should match the manual calculation.
stats.chisquare(observed,   # f_obs: observed counts
                expected)   # f_exp: expected counts

Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

# Chi-Square test of independence

#### This tests whether two categorical variables are independent of one another

In [24]:
np.random.seed(10)

# Category labels, in the order used below to relabel the crosstab's rows and columns
races = ["asian",'black','hispanic','other','white']
parties = ['democrat','independent','republican']
n_voters = 1000

# Race and party are drawn by two separate calls, so neither draw looks at the other
voter_race = np.random.choice(a = races, p = [0.05,0.15,0.25,0.05,0.5], size = n_voters)
voter_party = np.random.choice(a = parties, p = [0.4,0.2,0.4], size = n_voters)

voters = pd.DataFrame({'race':voter_race, 'party':voter_party})

# margins = True adds a totals row and a totals column to the table
voter_tab = pd.crosstab(voters.race, voters.party, margins = True)

voter_tab.columns = parties + ['row_totals']
voter_tab.index = races + ['col_totals']

# Observed counts only: leave out the totals row and column
observed = voter_tab.iloc[:len(races), :len(parties)]
voter_tab

Unnamed: 0,democrat,independent,republican,row_totals
asian,21,7,32,60
black,65,25,64,154
hispanic,107,50,94,251
other,15,8,15,38
white,189,96,212,497
col_totals,397,186,417,1000


#### Since the race and party data were generated independently, we expect the chi-square test of independence to fail to reject the null hypothesis that the two variables are independent

In [30]:
# Expected counts if the two variables were independent:
# (row total x column total) / grand total for every cell
row_totals = voter_tab['row_totals'].iloc[:5]      # per-race totals, without the grand total
col_totals = voter_tab.loc['col_totals'].iloc[:3]  # per-party totals, without the grand total
grand_total = observed.sum().sum()

expected = pd.DataFrame(np.outer(row_totals, col_totals) / grand_total,
                        index = ["asian",'black','hispanic','other','white'],
                        columns = ['democrat', 'independent','republican'])

expected

Unnamed: 0,democrat,independent,republican
asian,23.82,11.16,25.02
black,61.138,28.644,64.218
hispanic,99.647,46.686,104.667
other,15.086,7.068,15.846
white,197.309,92.442,207.249


In [32]:
# Contribution of each cell, (observed - expected)^2 / expected,
# summed first down each column and then across the columns
cell_terms = observed.sub(expected).pow(2).div(expected)
chi_squared_stat = cell_terms.sum().sum()

print(chi_squared_stat)

7.169321280162059


In [33]:
# Critical value: the 95th percentile of the chi-squared distribution, i.e. the
# rejection threshold for a test at the 5% significance level.
crit = stats.chi2.ppf(q = 0.95,
                      df = 8) # Degrees of freedom = (rows - 1) x (columns - 1) = (5-1) x (3-1) = 8

print("Critical Value", crit)

# p-value: upper-tail probability of the observed statistic. The survival
# function sf(x) is defined as 1 - cdf(x) but avoids the precision lost in that
# subtraction when the tail probability is very small.
p_value = stats.chi2.sf(chi_squared_stat,
                        df = 8)

print("P Value", p_value)

Critical Value 15.50731305586545
P Value 0.518479392948842


In [36]:
#Null hypothesis is that the two variables are independent

# Reject only when the statistic is above the critical value
reject_null = chi_squared_stat > crit
verdict = "We reject the null hypothesis" if reject_null else "We fail to reject the null hypothesis"
print(verdict)

We fail to reject the null hypothesis


In [37]:
# Same independence test in a single SciPy call, run on the table of observed
# counts (totals excluded); the result also carries dof and the expected frequencies.
stats.chi2_contingency(observed)

Chi2ContingencyResult(statistic=7.169321280162059, pvalue=0.518479392948842, dof=8, expected_freq=array([[ 23.82 ,  11.16 ,  25.02 ],
       [ 61.138,  28.644,  64.218],
       [ 99.647,  46.686, 104.667],
       [ 15.086,   7.068,  15.846],
       [197.309,  92.442, 207.249]]))