# Marascuillo.ipynb

https://www.itl.nist.gov/div898/handbook/prc/section4/prc46.htm


https://www.itl.nist.gov/div898/handbook/prc/section4/prc464.htm

In [1]:
import pandas as pd
import scipy.stats
import itertools
import math

In [2]:
# Test data: one (group label, bad count, good count) record per group.
# In every record the bad and good counts add up to 300.

records = [
    ('1', 36, 264),
    ('2', 46, 254),
    ('3', 42, 258),
    ('4', 63, 237),
    ('5', 38, 262),
]
# Transpose the records back into the three parallel column lists.
group, bad, good = (list(column) for column in zip(*records))
df = pd.DataFrame({'group': group, 'obs_bad': bad, 'obs_good': good})
df

Unnamed: 0,group,obs_bad,obs_good
0,1,36,264
1,2,46,254
2,3,42,258
3,4,63,237
4,5,38,262


In [3]:
# My data: per-group counts, with OrNV_detected stored in the obs_bad column
# and OrNV_not_detected stored in the obs_good column.

group = ['V23B', 'DUG42', 'PNG', 'X2B']
OrNV_detected = [10, 9, 4, 4]
OrNV_not_detected = [9, 15, 12, 12]

# Start from the group labels and attach the two count columns in order.
df = pd.DataFrame({'group': group})
df['obs_bad'] = OrNV_detected
df['obs_good'] = OrNV_not_detected
df

Unnamed: 0,group,obs_bad,obs_good
0,V23B,10,9
1,DUG42,9,15
2,PNG,4,12
3,X2B,4,12


In [4]:
# Pool all groups together: the overall proportion of bad units is the total
# bad count divided by the total number of units; the good proportion is its
# complement.

total_bad = df['obs_bad'].sum()
total_good = df['obs_good'].sum()
prop_bad = total_bad / (total_bad + total_good)
prop_good = 1 - prop_bad
print(f'{prop_bad=}   {prop_good=}')

prop_bad=0.36   prop_good=0.64


In [5]:
# Add, per group: the number of units (n), the observed proportion of bad
# units (p_bad), the bad/good counts expected under the overall proportions,
# and each cell's chi-square contribution (observed - expected)^2 / expected.

units = df['obs_bad'] + df['obs_good']
df['n'] = units
df['p_bad'] = df['obs_bad'] / units

# Expected counts if every group shared the overall proportions.
df['exp_bad'] = prop_bad * units
df['exp_good'] = prop_good * units

# Squared gap between observed and expected, scaled by the expected count.
bad_gap = df['obs_bad'] - df['exp_bad']
good_gap = df['obs_good'] - df['exp_good']
df['chisq_bad'] = bad_gap**2 / df['exp_bad']
df['chisq_good'] = good_gap**2 / df['exp_good']
df

Unnamed: 0,group,obs_bad,obs_good,n,p_bad,exp_bad,exp_good,chisq_bad,chisq_good
0,V23B,10,9,19,0.526316,6.84,12.16,1.459883,0.821184
1,DUG42,9,15,24,0.375,8.64,15.36,0.015,0.008437
2,PNG,4,12,16,0.25,5.76,10.24,0.537778,0.3025
3,X2B,4,12,16,0.25,5.76,10.24,0.537778,0.3025


In [6]:
# Overall chi-square test: the statistic is the sum of every cell's
# contribution, compared against the chi-square quantile at 1 - 0.05 with
# (number of rows in df - 1) degrees of freedom.
chisq_bad_total = df['chisq_bad'].sum()
chisq_good_total = df['chisq_good'].sum()
test_statistic = chisq_bad_total + chisq_good_total
degrees_of_freedom = df.shape[0] - 1
critical_value = scipy.stats.chi2.ppf(1 - 0.05, degrees_of_freedom)
print(f'{degrees_of_freedom=}   {critical_value=}   {test_statistic=}')

degrees_of_freedom=3   critical_value=7.814727903251179   test_statistic=3.9850603070175437


In [7]:
# Pairwise comparison of the groups' proportions of bad units: for every pair
# of groups, the absolute difference between their proportions is compared
# with a critical range; the pair is marked significant when the difference
# is larger than that range.
#
# NOTE(review): the multiplier uses the 0.975 chi-square quantile, while the
# overall test uses the quantile at 1 - 0.05 = 0.95. The Marascuilo procedure
# is normally stated with the 1 - alpha quantile, so 0.95 is probably what is
# wanted for alpha = 0.05 -- confirm before changing, because it alters every
# critical range reported below.
chi = math.sqrt(scipy.stats.chi2.ppf(0.975, degrees_of_freedom))
print(f'{chi=}')

# Read each group's label, proportion and size once, straight from df, so the
# pairs always match the rows of df and no per-pair re-filtering is needed.
rows = list(zip(df['group'], df['p_bad'].values, df['n'].values))

mylist = []
for (g1, p1, n1), (g2, p2, n2) in itertools.combinations(rows, 2):
    diff = abs(p1 - p2)
    # Square root of the summed per-group variance terms p * (1 - p) / n,
    # scaled by the chi-square multiplier.
    critical_range = chi * math.sqrt(((p1 * (1 - p1)) / n1) + ((p2 * (1 - p2)) / n2))
    significant = 'yes' if diff > critical_range else 'no'
    mylist.append({'g1': g1, 'g2': g2, 'diff': diff, 'critical_range': critical_range, 'significant': significant})
pd.DataFrame(mylist)

chi=3.0575159205629903


Unnamed: 0,g1,g2,diff,critical_range,significant
0,V23B,DUG42,0.151316,0.462555,no
1,V23B,PNG,0.276316,0.481888,no
2,V23B,X2B,0.276316,0.481888,no
3,DUG42,PNG,0.125,0.448157,no
4,DUG42,X2B,0.125,0.448157,no
5,PNG,X2B,0.0,0.468085,no
