Chi-squared: Goodness of fit 
###### *Tests whether the distribution of categorical data matches an expected distribution*

In [2]:
#!pip install numpy pandas scipy

import numpy as np 
import pandas as pd
import scipy.stats as stats

In [3]:
# Synthetic demographic samples, one row per person, stored in a single
# column labelled 0. Group order inside each frame follows the dict order.
national_counts = {'white': 100000, 'hispanic': 60000, 'black': 50000,
                   'asian': 15000, 'others': 35000}
minnesota_counts = {'white': 600, 'hispanic': 300, 'black': 250,
                    'asian': 75, 'others': 150}

# Expand every (group, count) pair into `count` repeated labels.
national = pd.DataFrame(
    [group for group, count in national_counts.items() for _ in range(count)]
)

minnesota = pd.DataFrame(
    [group for group, count in minnesota_counts.items() for _ in range(count)]
)

In [4]:
# Frequency table per sample: one row per category, a single "count" column.
national_table, minnesota_table = (
    pd.crosstab(index=frame[0], columns="count")
    for frame in (national, minnesota)
)

# One print call; sep='\n' plus the literal '\n' item leaves the same
# blank lines between the two tables as separate print statements would.
print('National', national_table, '\n', 'Minnesota', minnesota_table, sep='\n')

National
col_0      count
0               
asian      15000
black      50000
hispanic   60000
others     35000
white     100000


Minnesota
col_0     count
0              
asian        75
black       250
hispanic    300
others      150
white       600


In [22]:
# Sample sizes, one per line: Minnesota first, then national.
print(len(minnesota), len(national), sep='\n')

1375
260000


In [11]:
# Observed counts are the Minnesota category frequencies.
observed = minnesota_table

# Share of each category in the national sample.
national_ratios = national_table.div(len(national))
print(national_ratios)

# Expected counts: national shares scaled to the Minnesota sample size.
expected = national_ratios.mul(len(minnesota))
print(expected)

# Pearson statistic: sum over categories of (observed - expected)^2 / expected.
chi_squared_stat = observed.sub(expected).pow(2).div(expected).sum()

col_0        count
0                 
asian     0.057692
black     0.192308
hispanic  0.230769
others    0.134615
white     0.384615
col_0          count
0                   
asian      79.326923
black     264.423077
hispanic  317.307692
others    185.096154
white     528.846154


In [8]:
# Display the chi-squared statistic. It prints as a one-element Series
# (indexed by the "count" column) because it was summed over a one-column table.
print(chi_squared_stat)

col_0
count    18.194805
dtype: float64


In [14]:
# Calculating critical value and p-value

# Degrees of freedom for a goodness-of-fit test: number of categories - 1.
# Derived from the observed table so it cannot drift from the data.
dof = len(observed) - 1

crit = stats.chi2.ppf(q=0.95,  # 95% confidence level (alpha = 0.05)
                      df=dof)

print('Critical value')
print(crit)

# Upper-tail probability via the survival function (sf = 1 - cdf).
# Calling sf directly avoids the floating-point cancellation that
# `1 - cdf(...)` suffers when the p-value is very small.
p_value = stats.chi2.sf(x=chi_squared_stat,
                        df=dof)

print('P value')
print(p_value)

Critical value
9.487729036781154
P value
[0.00113047]


In [15]:
# Cross-check with scipy's built-in goodness-of-fit test on the same
# observed and expected counts; the cell displays the returned result.
stats.chisquare(f_obs=observed,
                f_exp=expected)

Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

In [20]:
# Load the cervical-cancer risk-factor data and preview the first five rows.
# NOTE(review): the preview shows '?' entries in several columns - presumably a
# missing-value placeholder; confirm, and consider na_values='?' when loading.
real = pd.read_csv(filepath_or_buffer='datasets/kag_risk_factors_cervical_cancer.csv')
real.head(n=5)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [21]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
real.dtypes

Age                                    int64
Number of sexual partners             object
First sexual intercourse              object
Num of pregnancies                    object
Smokes                                object
Smokes (years)                        object
Smokes (packs/year)                   object
Hormonal Contraceptives               object
Hormonal Contraceptives (years)       object
IUD                                   object
IUD (years)                           object
STDs                                  object
STDs (number)                         object
STDs:condylomatosis                   object
STDs:cervical condylomatosis          object
STDs:vaginal condylomatosis           object
STDs:vulvo-perineal condylomatosis    object
STDs:syphilis                         object
STDs:pelvic inflammatory disease      object
STDs:genital herpes                   object
STDs:molluscum contagiosum            object
STDs:AIDS                             object
STDs:HIV  