-
Notifications
You must be signed in to change notification settings - Fork 1
/
ChiSquareTestOfHomogeneity.py
122 lines (99 loc) · 5.87 KB
/
ChiSquareTestOfHomogeneity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class ChiSquareTestOfHomogeneity(object):
    """Chi-square test of homogeneity for one binary variable in two populations.

    Overview
    =======================
    Run a chi-square test of homogeneity to determine whether frequency counts
    are distributed identically across different populations.
    The test is applied to a single categorical variable from two different
    populations, where the variable is represented in binary format.

    Example condition data format
    -----------------------
    condition="Survived"
    df["Survived"] should contain only 0s and 1s.

    Keyword arguments
    =======================
    population_labels -- two distinct labels used as row names in the
        comparison and expected frequency tables
    population_datasets -- two datasets (indexable by column name, e.g. pandas
        DataFrames) holding the observed data; each must contain the
        ``condition`` column. Only the first two entries are used.
    condition -- name of the column that is measured in the test
    alpha -- level of statistical significance (only used when reporting)
    critical_value -- chi-square critical value the statistic is compared to.
        It is NOT derived from ``alpha``; pass a matching value when ``alpha``
        is changed.

    Attributes set by the methods
    =======================
    comp_table -- observed counts with row/column totals (generate_comp_table)
    e_table -- expected frequencies (generate_e_table)
    chi_square -- the test statistic (run_test)

    Raises
    =======================
    ValueError -- if more than two labels are given, if the labels are not
        distinct, or if fewer than two labels/datasets are given.

    Other
    =======================
    Default critical value based on alpha level and 1 degree of freedom and the table:
    https://faculty.elgin.edu/dkernler/statistics/ch09/images/chi-square-table.gif
    More information about the test here:
    http://stattrek.com/chi-square-test/homogeneity.aspx?Tutorial=AP
    """

    def __init__(self, population_labels=None, population_datasets=None,
                 condition="", alpha=0.01, critical_value=6.635):
        # None sentinels instead of mutable ``[]`` defaults shared across calls.
        labels = [] if population_labels is None else population_labels
        datasets = [] if population_datasets is None else population_datasets

        if len(labels) > 2:
            raise ValueError("Cannot compare more than two populations.")
        if len(set(labels)) != len(labels):
            raise ValueError("Cannot compare the same population")
        # Previously this case surfaced as a bare IndexError below.
        if len(labels) < 2 or len(datasets) < 2:
            raise ValueError(
                "Two population labels and two population datasets are required.")

        self.pop1_data, self.pop2_data = datasets[0], datasets[1]
        self.pop1_label, self.pop2_label = labels[0], labels[1]
        self.condition = condition
        self.inverse_condition = "Not %s" % self.condition
        self.alpha = alpha
        self.crit = critical_value

    def calculate_condition_counts(self, pop_data):
        """Return ``(condition_count, inverse_count)`` for one population.

        The condition count is the sum of the condition column (the number of
        1s for 0/1 data); the inverse count is the number of non-missing
        entries minus that sum.
        """
        column = pop_data[self.condition]
        condition_count = column.sum()
        return condition_count, column.count() - condition_count

    def generate_comp_table(self):
        """Create and print the comparison (observed frequency) table.

        Stores the table in ``self.comp_table``: one row per population plus a
        "Total" row; one column per outcome plus a "Total" column.
        """
        # Imported locally so the class works even if the enclosing module
        # does not import pandas at the top level.
        import pandas as pd

        pop1_condition, pop1_inverse = self.calculate_condition_counts(self.pop1_data)
        pop2_condition, pop2_inverse = self.calculate_condition_counts(self.pop2_data)
        total_pop1 = self.pop1_data[self.condition].count()
        total_pop2 = self.pop2_data[self.condition].count()

        counts = {
            self.condition: {
                self.pop1_label: pop1_condition,
                self.pop2_label: pop2_condition,
                "Total": pop1_condition + pop2_condition,
            },
            self.inverse_condition: {
                self.pop1_label: pop1_inverse,
                self.pop2_label: pop2_inverse,
                "Total": pop1_inverse + pop2_inverse,
            },
            "Total": {
                self.pop1_label: total_pop1,
                self.pop2_label: total_pop2,
                "Total": total_pop1 + total_pop2,
            },
        }
        # Force a stable row order: population 1, population 2, totals.
        row_order = [self.pop1_label, self.pop2_label, "Total"]
        self.comp_table = pd.DataFrame(counts).loc[row_order]

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("\n\nComparison Table\n")
        print(self.comp_table)

    def calculate_E(self, row_total, column_total):
        """Return the expected frequency ``row_total * column_total / grand_total``.

        The grand total is read from ``self.comp_table``, so
        ``generate_comp_table`` must have been called first.
        """
        grand_total = self.comp_table.loc["Total", "Total"]
        # float() forces true division; integer counts would otherwise be
        # floor-divided under Python 2.
        return (row_total * column_total) / float(grand_total)

    def calculate_E_by_pop(self, pop):
        """Return ``(inverse_expected, condition_expected)`` for population ``pop``.

        Each expected frequency combines the population's row total with the
        column total of the outcome (taken from the "Total" row), not with the
        observed cell count.
        """
        row_total = self.comp_table.loc[pop, "Total"]
        inverse = self.calculate_E(
            row_total, self.comp_table.loc["Total", self.inverse_condition])
        condition = self.calculate_E(
            row_total, self.comp_table.loc["Total", self.condition])
        return inverse, condition

    def generate_e_table(self):
        """Create and print the expected frequency contingency table.

        Stores the table in ``self.e_table`` (rows: populations, columns:
        outcomes). Requires ``self.comp_table``.
        """
        # helpful resource: http://www.pindling.org/Math/Statistics/Textbook/Chapter11_Chi_Square/homogeneity.html
        # Imported locally for the same reason as in generate_comp_table.
        import pandas as pd

        pop1_inverse_e, pop1_condition_e = self.calculate_E_by_pop(self.pop1_label)
        pop2_inverse_e, pop2_condition_e = self.calculate_E_by_pop(self.pop2_label)
        self.e_table = pd.DataFrame({
            self.condition: {
                self.pop1_label: pop1_condition_e,
                self.pop2_label: pop2_condition_e,
            },
            self.inverse_condition: {
                self.pop1_label: pop1_inverse_e,
                self.pop2_label: pop2_inverse_e,
            },
        })
        print("\n\nExpected Frequency Table\n")
        print(self.e_table)

    def print_results(self):
        """Print the statistic, the critical value and the significance verdict.

        Requires ``self.chi_square`` (set by ``run_test``).
        """
        print("\n\nResults\n")
        print("Chi-square: %f" % self.chi_square)
        print("Chi-square critical value at p=%f: %f" % (self.alpha, self.crit))
        print("Statistically significant: %s" % (self.chi_square > self.crit))

    def run_test(self):
        """Run the chi-square test of homogeneity and print the results.

        Uses the population data and parameters with which the instance was
        created. Builds ``self.comp_table`` and ``self.e_table`` and stores the
        statistic in ``self.chi_square``.
        """
        print("Chi-Square Test of Homogeneity")
        # NOTE(review): "Survival" is hard-coded here regardless of
        # self.condition; kept as-is to preserve the existing output.
        print("Comparison of Rates of Survival between %s and %s"
              % (self.pop1_label, self.pop2_label))

        # Step 0: generate tables
        self.generate_comp_table()
        self.generate_e_table()

        # Step 1: subtract expected from observed (totals row/column excluded;
        # pandas aligns the two tables by row and column label)
        observed = self.comp_table.loc[
            [self.pop1_label, self.pop2_label],
            [self.inverse_condition, self.condition]]
        contributions = observed - self.e_table

        # Step 2: square each value
        contributions = contributions ** 2

        # Step 3: divide by expected frequency
        contributions = contributions / self.e_table

        # Step 4: calculate chi-square as the sum over all four cells
        self.chi_square = (contributions.loc[self.pop1_label].sum()
                           + contributions.loc[self.pop2_label].sum())

        # Step 5: print results
        self.print_results()