# Categorical Data

In [47]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import chi2_contingency

**Percent Agreement**

* If 100 ratings are made and the raters agree on 80 of them, the percent agreement is 80/100 = 0.80

**Cohen's Kappa**
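$$
K = \frac{P_o - P_e}{1 - P_e}
$$

where $P_o$ is the observed proportion of agreement between the raters and $P_e$ is the proportion of agreement expected by chance.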

## The Chi-Square Distribution
The chi-square distribution is a continuous theoretical probability distribution that is widely used in significance testing because many test statistics follow this distribution when the null hypothesis is true.

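If $X_1, \dots, X_k$ are independent standard normal random variables, then the sum of their squares

$$
Q = \sum_{i=1}^{k} X_{i}^2
$$

follows a chi-square distribution with $k$ degrees of freedom.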

### Chi-Square Test for Independence
For a study with two variables, the chi-square test for independence tests the null hypothesis that the variables are independent of each other, that is, that there is no relationship between them. The alternative hypothesis is that the variables are related, so they are dependent rather than independent.

**expected value**
$$
E_{ij} = \frac{i\text{th row total} \times j\text{th column total}}{\text{grand total}}
$$

In [11]:
# Observed 2x2 contingency table: smoking status (rows) vs. diagnosis (columns)
I = pd.Index(['Currently Smoke', 'Do Not Currently Smoke'])
C = pd.Index(['Lung Cancer Diag', 'No Lung Cancer Diag'])
observed_counts = [(60, 300), (10, 390)]
df = pd.DataFrame(observed_counts, index=I, columns=C)

# Append the marginals: a 'Total' column of row sums first, then a 'Total'
# row of column sums (which therefore also contains the grand total).
df['Total'] = df.sum(axis='columns')
df.loc['Total'] = df.sum(axis='index')

print(df)

                        Lung Cancer Diag  No Lung Cancer Diag  Total
Currently Smoke                       60                  300    360
Do Not Currently Smoke                10                  390    400
Total                                 70                  690    760


In [39]:
# Expected counts under independence, one per observed cell:
#   E_ij = (row i total * column j total) / grand total
# Each cell is read with a single label-based .loc lookup instead of the
# chained positional form df.iloc[r][c], which indexes a label-indexed Series
# by integer position and is deprecated in recent pandas releases.
grand_total = df.loc['Total', 'Total']
expected_data = [
    tuple(
        # rounded to 2 decimals, as in the displayed table
        round(df.loc[row, 'Total'] * df.loc['Total', col] / grand_total, 2)
        for col in C
    )
    for row in I
]

expected_df = pd.DataFrame(data=expected_data, index=I, columns=C)

# add totals (computed from the rounded expected counts)
expected_df['Total'] = expected_df.sum(axis=1)
expected_df.loc['Total'] = expected_df.sum()

expected_df


Unnamed: 0,Lung Cancer Diag,No Lung Cancer Diag,Total
Currently Smoke,33.16,326.84,360.0
Do Not Currently Smoke,36.84,363.16,400.0
Total,70.0,690.0,760.0


In [44]:
# Small survey sample: one entry per respondent, in the same order in both lists
genders = 'm f m f f m m f f'.split(' ')
answers = 'no yes yes yes yes yes no no no'.split(' ')

d = {'gender': genders, 'like_shopping': answers}

df = pd.DataFrame(d)
df


Unnamed: 0,gender,like_shopping
0,m,no
1,f,yes
2,m,yes
3,f,yes
4,f,yes
5,m,yes
6,m,no
7,f,no
8,f,no


In [45]:
# Cross-tabulate the two categorical columns: gender on the rows,
# like_shopping on the columns, cell values are occurrence counts.
ct = pd.crosstab(index=df['gender'], columns=df['like_shopping'])
ct

like_shopping,no,yes
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
f,2,3
m,2,2


In [46]:
# observed values: the crosstab counts as a plain 2-D NumPy array
ct.values

array([[2, 3],
       [2, 2]])

In [48]:
# Run the chi-square test of independence on the crosstab; the element at
# position 3 of the result holds the expected frequencies.
test = chi2_contingency(ct)
test[3]

array([[2.22222222, 2.77777778],
       [1.77777778, 2.22222222]])

In [49]:
# degrees of freedom for an r x c contingency table: (r - 1) * (c - 1)
# The dimensions are read from the table itself; hard-coded 0:2 slices would
# cap both counts at 2 and give the wrong answer for any larger table.
n_rows, n_cols = ct.shape
freedom = (n_rows - 1) * (n_cols - 1)

freedom

1

In [None]:
# significance level for the test: 5%
alpha = 0.05

In [50]:
# Group sizes for the two samples; dict insertion order fixes the row order.
national_counts = {'white': 100000, 'hispanic': 60000, 'black': 50000,
                   'asian': 15000, 'other': 35000}
minnesota_counts = {'white': 600, 'hispanic': 300, 'black': 250,
                    'asian': 75, 'other': 150}

# One row per individual, holding that individual's group label (column 0).
national = pd.DataFrame(
    [group for group, size in national_counts.items() for _ in range(size)]
)
minnesota = pd.DataFrame(
    [group for group, size in minnesota_counts.items() for _ in range(size)]
)

# Collapse each sample back into a one-column frequency table named 'count'.
national_table = pd.crosstab(index=national[0], columns='count')
minnesota_table = pd.crosstab(index=minnesota[0], columns='count')

print('National', national_table, ' ', 'Minnesota', minnesota_table, sep='\n')

National
col_0      count
0               
asian      15000
black      50000
hispanic   60000
other      35000
white     100000
 
Minnesota
col_0     count
0              
asian        75
black       250
hispanic    300
other       150
white       600


In [51]:
# Goodness-of-fit statistic: compare the Minnesota counts with the counts
# expected if Minnesota followed the national proportions.
observed = minnesota_table

national_size = len(national)
minnesota_size = len(minnesota)

national_ratios = national_table / national_size   # share of each group nationally
expected = national_ratios * minnesota_size        # national shares scaled to the Minnesota sample size

# sum over groups of (observed - expected)^2 / expected
squared_diff = (observed - expected) ** 2
chi_squared_stat = (squared_diff / expected).sum()

print(chi_squared_stat)

col_0
count    18.194805
dtype: float64


In [52]:
# preview the first rows of the national sample
national.head()

Unnamed: 0,0
0,white
1,white
2,white
3,white
4,white


## Chi-Squared Test
The chi-squared test is used to determine whether there is a significant association between two categorical variables from a single population.

In [79]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from IPython.display import display,Math

In [58]:
# Load seaborn's 'tips' example dataset and preview the first rows
df = sns.load_dataset(name='tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [59]:
# Contingency table of counts: sex on the rows, smoker status on the columns
ct = pd.crosstab(index=df['sex'], columns=df['smoker'])
ct

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [69]:
# Observed counts as a plain NumPy array (row/column labels dropped)
observed = ct.values

print('Observed Values', '------------------', observed, sep='\n')

Observed Values
------------------
[[60 97]
 [33 54]]


In [63]:
# Chi-square test of independence on the contingency table. The result holds,
# in order: test statistic, p-value, degrees of freedom, expected frequencies.
# NOTE: for a table with one degree of freedom (2x2) SciPy applies Yates'
# continuity correction by default (correction=True), so val[0] and val[1]
# are the corrected values and will not equal an uncorrected Pearson
# statistic computed by hand from the same table.
val = stats.chi2_contingency(observed=ct)
val

(0.008763290531773594,
 0.925417020494423,
 1,
 array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [71]:
# expected frequencies sit at position 3 of the chi2_contingency result
ev = val[3]

print('Expected Values', '------------------', ev, sep='\n')

Expected Values
------------------
[[59.84016393 97.15983607]
 [33.15983607 53.84016393]]


In [65]:
# compute df
# For an r x c contingency table the degrees of freedom are (r - 1) * (c - 1).
# The dimensions are taken from ct.shape; hard-coded 0:2 slices would cap
# both counts at 2 and give the wrong answer for any larger table.
n_rows, n_cols = ct.shape

ddof = (n_rows - 1) * (n_cols - 1)

print('Degrees of Freedom')
print('------------------')
print(ddof)

Degrees of Freedom
------------------
1


In [74]:
# significance level: alpha = 5% (equivalently, a 95% confidence level)
alpha = 0.05

In [75]:
# Pearson chi-squared statistic: sum over every cell of (O - E)^2 / E.
# chi_square holds the per-column contributions. All of them are summed
# (not just chi_square[0] + chi_square[1]) so the statistic stays correct
# for tables with more than two columns.
chi_square = ((observed - ev) ** 2. / ev).sum(axis=0)
chi_stat = chi_square.sum()

print('Chi-Squared Statistic')
print('------------------')
print(chi_stat)

Chi-Squared Statistic
------------------
0.001934818536627623


In [76]:
# Critical value: the chi-squared quantile at cumulative probability
# 1 - alpha for ddof degrees of freedom.
confidence = 1 - alpha
critical = stats.chi2.ppf(q=confidence, df=ddof)

print('Critical Value', '------------------', critical, sep='\n')

Critical Value
------------------
3.841458820694124


In [77]:
# p-value: upper-tail probability of observing a statistic at least as large
# as chi_stat under a chi-squared distribution with ddof degrees of freedom.
# The survival function sf(x) equals 1 - cdf(x) mathematically but is
# evaluated directly, so it does not lose precision when cdf(x) is close to 1
# (i.e. for very small p-values).
p_value = stats.chi2.sf(chi_stat, df=ddof)

print('p_value: ',p_value)
print('significance level: ',alpha)
print('degrees of freedom: ',ddof)

p_value:  0.964915107315732
significance level:  0.05
degrees of freedom:  1


In [81]:
# The two possible conclusions, shared by both decision rules below.
# NOTE(review): strictly speaking a non-significant result means we *fail to
# reject* H0 rather than accept it; the message text is left unchanged here.
reject_msg = 'Reject H0, there is a relationship between categorical vaiables'
accept_msg = 'Accept H0, there is no relationship between categorical vaiables'

# Rule 1: compare the test statistic with the critical value
print(reject_msg if chi_stat >= critical else accept_msg)

# Rule 2: compare the p-value with the significance level
print(reject_msg if alpha >= p_value else accept_msg)
    

Accept H0, there is no relationship between categorical vaiables
Accept H0, there is no relationship between categorical vaiables
