# Chi-Square Goodness-Of-Fit Test

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
def _expand_counts(label_counts):
    """Return a flat list in which each label is repeated `count` times, in the given order."""
    return [label for label, count in label_counts for _ in range(count)]


# Reference population: one row per person, labelled by group (260,000 rows in total).
national = pd.DataFrame(_expand_counts(
    [('White', 100000), ('Hispanic', 60000), ('Black', 50000), ('Asian', 15000), ('other', 35000)]
))
# Sample to compare against the reference population (1,375 rows in total).
minnesota = pd.DataFrame(_expand_counts(
    [('White', 600), ('Hispanic', 300), ('Black', 250), ('Asian', 75), ('other', 150)]
))

In [3]:
# Collapse each one-row-per-person frame into a frequency table with a
# single 'count' column, indexed by group label.
national_table, minnesota_table = (
    pd.crosstab(index=people[0], columns='count')
    for people in (national, minnesota)
)
minnesota_table

col_0,count
0,Unnamed: 1_level_1
Asian,75
Black,250
Hispanic,300
White,600
other,150


##### After getting these two tables, we need to calculate the chi-square test statistic using the formula:
##### Sum((observed-expected)**2/expected)
##### Observed is the actual observed count for each category and expected is the expected count based on the distribution of the population for the corresponding category

In [55]:
# Observed counts come straight from the Minnesota frequency table.
observed = minnesota_table
# Share of each group in the national data.
national_ratios = national_table.div(len(national))
# Expected counts: the national shares scaled to the size of the Minnesota sample.
expected = national_ratios.mul(len(minnesota))

In [5]:
# Chi-square statistic: squared deviation from the expected count, scaled by
# the expected count, summed over all categories.
Chi_squared_stat = (observed - expected).pow(2).div(expected).sum()

In [6]:
# Show the statistic; it is a one-element Series keyed by the 'count' column.
Chi_squared_stat

col_0
count    18.194805
dtype: float64

##### Similar to the t-test where we compared the t-test statistic to a critical value based on the t-distribution to determine whether the result is significant, in the chi-square test we compare the chi-square test statistic to a critical value based on the chi-square distribution. The scipy library shorthand for the chi-square distribution is chi2. Let's use this knowledge to find the critical value for 95% confidence level and check the p-value of our result:

In [7]:
# Critical value = 95th percentile of the chi-square distribution.
# Degrees of freedom = number of categories - 1 (five groups -> 4).
crit = stats.chi2.ppf(0.95, df=4)

In [8]:
# Print the 95% critical value that the test statistic is compared against.
print('Critical Value:\n',crit)

Critical Value:
 9.487729036781154


In [9]:
# p-value: right-tail probability of seeing a statistic at least this large
# if the null hypothesis holds. chi2.sf is the survival function (1 - cdf)
# evaluated directly, which avoids the rounding loss of subtracting a cdf
# close to 1 from 1 when the p-value is small.
p_value = stats.chi2.sf(x=Chi_squared_stat,
                        df=4)  # df = number of categories - 1
print('p-value:\n', p_value)

p-value:
 [0.00113047]


### Easier Way

In [56]:
# One-call equivalent of the manual steps above: returns the chi-square
# statistic together with its p-value for the observed vs. expected counts.
stats.chisquare(f_obs=observed,
               f_exp=expected)

Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

#####  *Note: we are only interested in the right tail of the chi-square distribution. Read more on this here.
##### Since our chi-squared statistic exceeds the critical value (18.194805 > 9.487729036781154), we reject the null hypothesis that the two distributions are the same.

# Chi-Squared Test of Independence

In [23]:
# Simulate 1,000 voters; the fixed seed makes the draws repeatable.
np.random.seed(10)
voter_race=np.random.choice(a=['asian','hispanic','black','white','other'],
                            p=[0.05,0.15,0.25,0.05,0.5],
                            size=1000)

# Party is drawn separately from race, so the two variables are independent
# by construction.
voter_party=np.random.choice(a=['democrat','independent','republican'],
                             p=[0.4,0.2,0.4],
                             size=1000)

voter=pd.DataFrame({'race':voter_race,
                   'party':voter_party})

# Race x party contingency table; margins=True appends an 'All' row and
# column holding the totals.
voter_tab=pd.crosstab(voter['race'],voter['party'],margins=True)

# Rename only the margin labels instead of overwriting every label by
# position, so a category can never be silently mislabelled.
voter_tab=voter_tab.rename(index={'All':'col_Totals'},
                           columns={'All':'row_Totals'})

# Drop the axis names ('race' / 'party') so the table displays with plain labels.
voter_tab.index.name=None
voter_tab.columns.name=None

In [24]:
# Show the race x party contingency table, including the total margins.
voter_tab

Unnamed: 0,democrat,independent,republican,row_Totals
asian,21,7,32,60
black,107,50,94,251
hispanic,65,25,64,154
other,189,96,212,497
white,15,8,15,38
col_Totals,397,186,417,1000


In [57]:
# Observed counts: the contingency table without its last row and last
# column, which hold the margin totals.
observed = voter_tab.iloc[:-1, :-1]

In [58]:
# Show the observed counts (totals removed).
observed

Unnamed: 0,democrat,independent,republican
asian,21,7,32
black,107,50,94
hispanic,65,25,64
other,189,96,212
white,15,8,15


In [59]:
# Expected count of each cell under independence:
#   row total * column total / grand total.
# .loc / .iloc replace .ix, which was deprecated and later removed from pandas.
grand_total=voter_tab.loc['col_Totals','row_Totals']
expected=np.outer(voter_tab['row_Totals'].iloc[0:5],      # per-race totals
                  voter_tab.loc['col_Totals'].iloc[0:3]   # per-party totals
                  )/grand_total

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [60]:
# Wrap the expected counts in a DataFrame carrying the same row and column
# labels as the observed table, so the two align in arithmetic.
expected = pd.DataFrame(
    expected,
    index=['asian','black','hispanic','other','white'],
    columns=['democrat','independent','republican'],
)

In [61]:
# Show the expected counts under independence.
expected

Unnamed: 0,democrat,independent,republican
asian,23.82,11.16,25.02
black,99.647,46.686,104.667
hispanic,61.138,28.644,64.218
other,197.309,92.442,207.249
white,15.086,7.068,15.846


In [48]:
# Chi-square statistic over the whole table: the first .sum() collapses the
# rows into per-column totals, the second collapses those into a scalar.
chi_squared_stat = (observed - expected).pow(2).div(expected).sum().sum()

In [49]:
# Show the scalar chi-square statistic for the independence test.
chi_squared_stat

7.169321280162059

In [51]:
# Degrees of freedom for a test of independence:
#   (rows - 1) * (columns - 1) = (5 - 1) * (3 - 1) = 8.
criticalValue=stats.chi2.ppf(q=0.95,  # 95th percentile of the distribution
                             df=8)
# Right-tail probability of the statistic. chi2.sf evaluates 1 - cdf
# directly, which stays accurate when the tail area is small.
p_value=stats.chi2.sf(x=chi_squared_stat,
                      df=8)

In [54]:
# Print the critical value and the p-value for the independence test.
print('Critical Value is:\n',criticalValue)
print('p-value is:\n',p_value)

Critical Value is:
 15.50731305586545
p-value is:
 0.518479392948842


### Easier Way

In [62]:
# One-call equivalent: returns the statistic, the p-value, the degrees of
# freedom and the table of expected counts.
stats.chi2_contingency(observed=observed)

(7.169321280162059, 0.518479392948842, 8, array([[ 23.82 ,  11.16 ,  25.02 ],
        [ 99.647,  46.686, 104.667],
        [ 61.138,  28.644,  64.218],
        [197.309,  92.442, 207.249],
        [ 15.086,   7.068,  15.846]]))

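##### Result: the p-value (0.518) is well above 0.05 and the test statistic (7.17) is below the critical value (15.51), so we fail to reject the null hypothesis that race and party are independent. This does not prove independence; it only means the data show no significant evidence of an association.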