In [1]:
# Efficient numerical arrays.
import numpy as np
# Data frames.
import pandas as pd
# Alternative statistics package.
import statsmodels.stats.weightstats as stat
# Mains statistics package.
import scipy.stats as ss
# Plotting.
import matplotlib.pyplot as plt
# Fancier plotting.
import seaborn as sns

# Better sized plots.
plt.rcParams['figure.figsize'] = (12, 8)

# Nicer colours and styles for plots.
plt.style.use("fivethirtyeight")


In [2]:
# Python code demonstrate creating  
# pandas DataFrame with indexed by  
  
# DataFrame using arrays. 
import pandas as pd 
  
# initialise data of lists. 
data = {'A':[90, 30, 30, ], 'B':[60, 50, 40, ], 'C':[104, 51, 45, ],'D':[95, 20, 35, ], } 
  
# Creates pandas DataFrame. 
df = pd.DataFrame(data, index =['White Collar', 'Blue Collar', 'No Collar']) 
  
# print the data 
df 

Unnamed: 0,A,B,C,D
White Collar,90,60,104,95
Blue Collar,30,50,51,20
No Collar,30,40,45,35


In [3]:
df.head()

Unnamed: 0,A,B,C,D
White Collar,90,60,104,95
Blue Collar,30,50,51,20
No Collar,30,40,45,35


In [4]:
df.values

array([[ 90,  60, 104,  95],
       [ 30,  50,  51,  20],
       [ 30,  40,  45,  35]], dtype=int64)

In [5]:
#Observed Values
Observed_Values = df.values 
print("Observed Values :-\n",Observed_Values)

Observed Values :-
 [[ 90  60 104  95]
 [ 30  50  51  20]
 [ 30  40  45  35]]


In [6]:
val=ss.chi2_contingency(df)
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

In [7]:
val

(24.5712028585826,
 0.0004098425861096696,
 6,
 array([[ 80.53846154,  80.53846154, 107.38461538,  80.53846154],
        [ 34.84615385,  34.84615385,  46.46153846,  34.84615385],
        [ 34.61538462,  34.61538462,  46.15384615,  34.61538462]]))

Above formula calculates the chi square value, p value, degrees of freedom and array showing the expected values.  I have manually calculated the same below to find out how they are arrived at, if they are accurate and what they mean in relation to the information within the dataframe.

In [8]:
# Double check the calculation of Degrees of Freedom
# Calculate the number of rows in the table
no_of_rows=len(df.iloc[0:3,0])
no_of_columns=len(df.iloc[0,0:4])
# Calculate the degree of freedom, (number of rows -1) multiply by (number of columns -1)
ddof=(no_of_rows-1)*(no_of_columns-1)
print("Degrees of Freedom:-", ddof)
# Set the variance
alpha = 0.05

Degrees of Freedom:- 6


In [29]:
Expected_Values=val[3]

In [30]:
from scipy.stats import chi2
chi_square=sum([(o-e)**2./e for o,e in zip(Observed_Values,Expected_Values)])
chi_square_statistic=chi_square[0]+chi_square[1]

In [31]:
print("chi-square statistic:-",chi_square_statistic)

chi-square statistic:- 15.066172455306972


In [32]:
critical_value=chi2.ppf(q=1-alpha,df=ddof)
print('critical_value:',critical_value)

critical_value: 12.591587243743977


In [13]:
#p-value
p_value=1-chi2.cdf(x=chi_square_statistic,df=ddof)
print('p-value:',p_value)
print('Significance level: ',alpha)
print('Degree of Freedom: ',ddof)
print('p-value:',p_value)

p-value: 0.019748239201875184
Significance level:  0.05
Degree of Freedom:  6
p-value: 0.019748239201875184


In [14]:
if chi_square_statistic>=critical_value:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")
    
if p_value<=alpha:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")

Reject H0,There is a relationship between 2 categorical variables
Reject H0,There is a relationship between 2 categorical variables


## Run a Hypothetis Test
***

**1. Null & Alternative Hypothese** <br> <br> 
If we take A, B, C & D as locations and their populations;<br>
Null: Collar and location are not related.<br>
Alternative: Collar and location are related.<br>
<br>
**2. Alpha Level 0.05 (5%)** <br> <br>
Alpha is the probability of rejecting a true null hypothesis.  If the alpha level was set to 10% this would increase the chance that the null hypotheis would be rejected.  An alpha level of 1% would be tiny and increase the chance of **NOT**  rejecting the null.  So it is a balancing act to avoid blindly rejecting or accepting the null and possibly incorrectly influencing the result. Therefore a 5% level is widely accepted as a good comprimise.
https://www.statisticshowto.com/what-is-an-alpha-level/<br>

**3. Calculate Degrees of Freedom** <br> <br>
The degrees of freedom (often abbreviated as df or d) tell you how many numbers in your grid are actually independent. For a Chi-square grid, the degrees of freedom can be said to be the number of cells you need to fill in before, given the totals in the margins, you can fill in the rest of the grid using a formula. https://www.ling.upenn.edu/~clight/chisquared.htm <br> <br>

df = (rows - 1) (columns - 1)<br>
df = (3-1)(4-1)<br>
df = (2)(3) = 6.  This analysis will use 6 degrees of freedom.<br>
<br>
**4. State Decision Rule**<br> <br>
Go to chi square table and using alpha 0.05 and 6 degrees of freedom = critical value of 12.5916<br>
So our decision rule is if the calcuated chi square is greater than 12.59 we will end up rejecting the null hypothesis. <br>
<br>
**5. Calculate chi-square**<br> <br>

$$\chi^2 = \sum \frac {(f_o - f_e)^2}{f_e}$$


Multiply the frequencies for the columns times the frequency for the rows and then divide by the total number of subjects to get
the expected frequency for each cell. eg. How many white collar wearers are from Area A. We take the column total for A which is 150
and multiply it by the row total for white collar which is 349 and we divide by the total number of subjects which is 650. 
(150*349) / 650 = 80.53.  In this sample we would have expected 80.53 people white collars to be from location A.  Continue the calculation to get the expected values for all the cells.



In [15]:
# initialise data of lists. 
data = {'A (Expected)':[80.53, 34.85, 34.62 ], 'B(Expected)':[80.53, 34.84, 34.62 ], 'C(Expected)':[107.36, 46.46, 46.15],'D(Expected)':[80.54, 34.84, 34.61], } 
  
# Creates pandas DataFrame. 
df2 = pd.DataFrame(data, index =['Expected White Collar', 'Expected Blue Collar', 'Expected No Collar']) 
df2

Unnamed: 0,A (Expected),B(Expected),C(Expected),D(Expected)
Expected White Collar,80.53,80.53,107.36,80.54
Expected Blue Collar,34.85,34.84,46.46,34.84
Expected No Collar,34.62,34.62,46.15,34.61


In [16]:
# initialise data of lists. 
data = {'A (Expected)':[80.53, 34.85, 34.62 ], 'B(Expected)':[80.53, 34.84, 34.62 ], 'C(Expected)':[107.36, 46.46, 46.15],'D(Expected)':[80.54, 34.84, 34.61], } 
  
# Creates pandas DataFrame. 
df2 = pd.DataFrame(data, index =['Expected White Collar', 'Expected Blue Collar', 'Expected No Collar']) 
df2

Unnamed: 0,A (Expected),B(Expected),C(Expected),D(Expected)
Expected White Collar,80.53,80.53,107.36,80.54
Expected Blue Collar,34.85,34.84,46.46,34.84
Expected No Collar,34.62,34.62,46.15,34.61


In [17]:
# initialise data of lists. 
data = {'A':["90 (80.53)", "30 (34.85)", "30 (34.62)", ], 'B':["60 (80.53)", "50 (34.84)", "40 (34.62)", ], 'C':["104 (107.36)", "51 (46.46)", "45 (46.15)", ],'D':["95 (80.54)", "20 (34.85)", "35 (34.61)", ], } 
  
# Creates pandas DataFrame. 
df3 = pd.DataFrame(data, index =['White Collar', 'Blue Collar', 'No Collar']) 
  
# print the data 
df3 

# Below shows the observed values and next to them in brackets are the expected values.

Unnamed: 0,A,B,C,D
White Collar,90 (80.53),60 (80.53),104 (107.36),95 (80.54)
Blue Collar,30 (34.85),50 (34.84),51 (46.46),20 (34.85)
No Collar,30 (34.62),40 (34.62),45 (46.15),35 (34.61)


Again calculate chi squared this time taking all the observed values, subtracting the expected values from it, squaring the result and finally dividing by the expected values to get 12 different fractions.

5. $$\chi^2 = \sum \frac {(f_o - f_e)^2}{f_e}$$

$$\chi^2 = \frac {(90 - 80.53)^2}{80.53} + \frac {(60 - 80.53)^2}{80.53} + \frac {(104 - 107.36)^2}{107.36} + \frac {(95 - 80.54)^2}{80.54} + \frac {(30 - 34.85)^2}{34.85}+ \frac {(50 - 34.84)^2}{34.84} + \frac {(51 - 46.46)^2}{46.46} + \frac {(20 - 34.85)^2}{34.85} + \frac {(30 - 34.62)^2}{34.62} + \frac {(40 - 34.62)^2}{34.62} + \frac {(45 - 46.15)^2}{46.15} + \frac {(35 - 34.61)^2}{34.61} $$

Adding all the above together results in a chi squared of 24.66.  Allowing for rounding differences is quite close to 24.6.  The chi-square was greater than 12.5916 so we reject the null hypothesis that collar and location are not related and can say that there is a relationship between location and collar status.

The Chi-Square Test looks at the relationship between two variables and sees if there is any corellation between them.  

Rough Work

In [34]:
from scipy import stats
a=stats.chi2.pdf(24.66 , 6)
#print (a)
f'{a:.20f}'
stats.chi2.sf(24.571, 6)

0.00040987793499886133

In [36]:
crit = stats.chi2.ppf(q = 0.95, # Find the critical value for 95% confidence*
                      df = 6)   # Df = number of variable categories - 1
print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=24.571,  # Find the p-value
                             df=6)
print("P value")
print(p_value)

Critical value
12.591587243743977
P value
0.00040987793499891456


In [37]:
1 - stats.chi2.cdf(24.571, 6)

0.00040987793499891456

References:
    
    https://www.youtube.com/watch?v=w5iKu1IrTJQ&ab_channel=KrishNaik
    https://www.youtube.com/watch?v=LE3AIyY_cn8&ab_channel=statslectures
    https://www.statisticshowto.com/probability-and-statistics/chi-square/
    