In [3]:
import numpy as np
import pandas as pd

from scipy.stats import chi2_contingency, fisher_exact
from scipy.stats.contingency import expected_freq

In [4]:
data1 = pd.read_csv('edu_income.csv')
data1.head()

Unnamed: 0,Nation,Education,income_c
0,Austria,High,Low
1,Norway,High,Low
2,Denmark,High,Low
3,Sweden,High,Low
4,Norway,College,Low


In [6]:
observed_table = pd.crosstab(index = data1['Education'], columns = data1['income_c'], margins = False)
observed_table

income_c,High,Low,Medium
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
College,5,7,20
Graduate,13,1,18
High,2,19,11


In [7]:
expected_table = expected_freq(observed_table)
expected_table

array([[ 6.66666667,  9.        , 16.33333333],
       [ 6.66666667,  9.        , 16.33333333],
       [ 6.66666667,  9.        , 16.33333333]])

In [9]:
Tstat, pvalue, dof, expected = chi2_contingency(observed_table)

print("___________________________")
print("Test statistic:", np.round(Tstat, 4))
print("p-value :", pvalue)
print("degree of freedom:", dof)
print("___________________________")

print("Expected frequencies:", np.round(expected, 4))

___________________________
Test statistic: 31.1014
p-value : 2.91904786912222e-06
degree of freedom: 4
___________________________
Expected frequencies: [[ 6.6667  9.     16.3333]
 [ 6.6667  9.     16.3333]
 [ 6.6667  9.     16.3333]]


In [10]:
grade_data = pd.DataFrame({'grade':['G1', 'G1', 'G2', 'G2', 'G3','G3', 'G4','G4'],
                          'status':['Attend', 'Absent','Attend','Absent','Attend','Absent','Attend','Absent'],
                          'observed':[6,48,14,32,13,47,7,33]})
grade_data.head()

Unnamed: 0,grade,status,observed
0,G1,Attend,6
1,G1,Absent,48
2,G2,Attend,14
3,G2,Absent,32
4,G3,Attend,13


In [11]:
grade_table = pd.pivot_table(grade_data, values=['observed'], index=['status'],
                            columns=['grade'], aggfunc = np.sum, margins = False)
grade_table

Unnamed: 0_level_0,observed,observed,observed,observed
grade,G1,G2,G3,G4
status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Absent,48,32,47,33
Attend,6,14,13,7


In [12]:
Tstat, pvalue, dof, expected = chi2_contingency(grade_table)

print("______________________")
print("Test statistic:", np.round(Tstat, 4))
print("p-value:", np.round(pvalue, 4))
print("degree of freedom :", dof)
print("______________________")


______________________
Test statistic: 6.0575
p-value: 0.1088
degree of freedom : 3
______________________


In [14]:
fisher_data = pd.DataFrame({'ab':['A','A','B','B'],
                           'g12':['G1','G2','G1','G2'],
                           'observed':[1,4,8,5]})
fisher_table = pd.pivot_table(fisher_data, values=['observed'], index = ['g12'], columns=['ab'])
fisher_table

Unnamed: 0_level_0,observed,observed
ab,A,B
g12,Unnamed: 1_level_2,Unnamed: 2_level_2
G1,1,8
G2,4,5


In [15]:
import math

def observed_prob(table):
    n, p= table.shape
    out1 = 1
    out2 = 1
    tot_n = 0
    
    for i in range(n):
        tot_n +=np.sum(table.iloc[i,:])
        out1 +=math.factorial(np.sum(table.iloc[i,:]))
        for j in range(p):
            out2 += math.factorial(table.iloc[i,j])
            
    out2 += math.factorial(tot_n)
    for j in range(p):
        out1 += math.factorial(np.sum(table.iloc[:,j]))
        
    result = out1/out2
    return result

In [16]:
print("observed probability:", observed_prob(fisher_table))

observed probability: 9.727246435785014e-07


In [17]:
_, pvalue = fisher_exact(fisher_table, alternative = 'two-sided')
print("p-value:", np.round(pvalue, 4))

p-value: 0.2941
