In [1]:
import pandas as pd

In [2]:
# Load the patient exome table from the working directory.
# low_memory=False is passed through to pandas so the file is not parsed in
# internal chunks (see the pandas read_csv docs); index_col=None keeps the
# default integer row index.
data = pd.read_csv(
    r'glpatientexomedatabase011.csv',
    index_col=None,
    low_memory=False,
)

In [3]:
# Show the column names of the loaded table as a plain Python list
data.columns.tolist()

['GLIDNUM',
 'GLDISCODE',
 'PATIENTID',
 'DIAGNOSIS',
 'CAUSALGENES',
 'GENOTYPE',
 'NUCLEOTIDE',
 'EXOMESITE',
 'YROB',
 'GENDER',
 'ETHNICITY',
 'FAMILYTYPE',
 'YRENROLLED']

In [4]:
# listing unique values in columns of CAUSALGENES and FAMILYTYPE
col_CAUSALGENES = set(data['CAUSALGENES'])
col_FAMILYTYPE = set(data['FAMILYTYPE'])
print 'Column CAUSALGENES contains:', '\n', col_CAUSALGENES
print 'Column FAMILYTYPE contains:', '\n', col_FAMILYTYPE

Column CAUSALGENES contains: 
set(['Detected', 'Not Detected'])
Column FAMILYTYPE contains: 
set([nan, 'Sporadic', 'Consanguineous', 'Same Village', 'Non-consanguineous'])


In [5]:
# generating contingency table of observed counts
ct = pd.crosstab(data['CAUSALGENES'], data['FAMILYTYPE'])
print ct

FAMILYTYPE    Consanguineous  Non-consanguineous  Same Village  Sporadic
CAUSALGENES                                                             
Detected                2218                 175            83       229
Not Detected            5279                1047           303      1505


In [6]:
# calculating column percentage
colsum = ct.sum(axis=0)
colpct = ct/colsum
print colpct

FAMILYTYPE    Consanguineous  Non-consanguineous  Same Village  Sporadic
CAUSALGENES                                                             
Detected            0.295852            0.143208      0.215026  0.132065
Not Detected        0.704148            0.856792      0.784974  0.867935


In [7]:
import scipy.stats as stat

# chi-square test of independence
cs = stat.chi2_contingency(ct)
print 'chi-square value, p value, expeted counts', '\n', cs

chi-square value, p value, expeted counts 
(289.86302379890162, 1.5544300966943123e-62, 3L, array([[ 1870.96457238,   304.96448012,    96.33084233,   432.74010518],
       [ 5626.03542762,   917.03551988,   289.66915767,  1301.25989482]]))


As we can see from the large chi-square value and the significant p value, causal gene and family type are significantly associated. Examining the column proportions of causal gene within each family type, we see higher rates of a detected causal gene in the Consanguineous group (29.6%) and the Same Village group (21.5%) than in the Non-consanguineous group (14.3%) and the Sporadic group (13.2%). Next, I'll run post hoc tests to compare these groups pairwise. I'll use the post hoc approach known as the Bonferroni adjustment. Since there are 6 pairwise comparisons, the adjusted significance threshold for each p value is 0.05 / 6 ≈ 0.008.

In [8]:
# post hoc tests for chi-sqare test of independency
recode1 = {'Consanguineous': 'Consanguineous', 'Non-consanguineous': 'Non-consanguineous'}
data['COMPCONvNON'] = data['FAMILYTYPE'].map(recode1)

# contingency table of observed counts
ct1 = pd.crosstab(data['CAUSALGENES'], data['COMPCONvNON'])

# chi-square test
cs1 = stat.chi2_contingency(ct1)

print 'Consanguineous vs. Non-consanguineous', '\n', 'chi-square value, p value, expected counts', '\n', cs1

Consanguineous vs. Non-consanguineous 
chi-square value, p value, expected counts 
(122.17994955807615, 2.1082230233406736e-28, 1L, array([[ 2057.61222617,   335.38777383],
       [ 5439.38777383,   886.61222617]]))


In [9]:
# post hoc tests for chi-sqare tests of independency
recode2 = {'Consanguineous': 'Consanguineous', 'Same Village': 'Same Village'}
data['COMPCONvSV'] = data['FAMILYTYPE'].map(recode2)

# contingency table of observed counts
ct2 = pd.crosstab(data['CAUSALGENES'], data['COMPCONvSV'])

# chi-square test
cs2 = stat.chi2_contingency(ct2)

print 'Consanguineous vs. Same Village', '\n', 'chi-square value, p value, expected counts', '\n', cs2

Consanguineous vs. Same Village 
chi-square value, p value, expected counts 
(11.214963459777952, 0.00081140407363826828, 1L, array([[ 2188.32893568,   112.67106432],
       [ 5308.67106432,   273.32893568]]))


In [10]:
# post hoc tests for chi-sqare tests of independency
recode3 = {'Consanguineous': 'Consanguineous', 'Sporadic': 'Sporadic'}
data['COMPCONvSP'] = data['FAMILYTYPE'].map(recode3)

# contingency table of observed counts
ct3 = pd.crosstab(data['CAUSALGENES'], data['COMPCONvSP'])

# chi-square test
cs3 = stat.chi2_contingency(ct3)

print 'Consanguineous vs. Sporadic', '\n', 'chi-square value, p value, expected counts', '\n', cs3

Consanguineous vs. Sporadic 
chi-square value, p value, expected counts 
(193.08118120916544, 6.7577703671272454e-44, 1L, array([[ 1987.34254144,   459.65745856],
       [ 5509.65745856,  1274.34254144]]))


In [11]:
# post hoc tests for chi-sqare tests of independency
recode4 = {'Non-consanguineous': 'Non-consanguineous', 'Sporadic': 'Sporadic'}
data['COMPNONvSP'] = data['FAMILYTYPE'].map(recode4)

# contingency table of observed counts
ct4 = pd.crosstab(data['CAUSALGENES'], data['COMPNONvSP'])

# chi-square test
cs4 = stat.chi2_contingency(ct4)

print 'Non-consanguineous vs. Sporadic', '\n', 'chi-square value, p value, expected counts', '\n', cs4

Non-consanguineous vs. Sporadic 
chi-square value, p value, expected counts 
(0.66289067038087912, 0.41554127908261917, 1L, array([[  167.01217862,   236.98782138],
       [ 1054.98782138,  1497.01217862]]))


In [12]:
# post hoc tests for chi-sqare tests of independency
recode5 = {'Non-consanguineous': 'Non-consanguineous', 'Same Village': 'Same Village'}
data['COMPNONvSV'] = data['FAMILYTYPE'].map(recode5)

# contingency table of observed counts
ct5 = pd.crosstab(data['CAUSALGENES'], data['COMPNONvSV'])

# chi-square test
cs5 = stat.chi2_contingency(ct5)

print 'Non-consanguineous vs. Same Village', '\n', 'chi-square value, p value, expected counts', '\n', cs5

Non-consanguineous vs. Same Village 
chi-square value, p value, expected counts 
(10.705204745679863, 0.0010683456374215845, 1L, array([[  196.06716418,    61.93283582],
       [ 1025.93283582,   324.06716418]]))


In [13]:
# post hoc tests for chi-sqare tests of independency
recode6 = {'Same Village': 'Same Village', 'Sporadic': 'Sporadic'}
data['COMPSVvSP'] = data['FAMILYTYPE'].map(recode6)

# contingency table of observed counts
ct6 = pd.crosstab(data['CAUSALGENES'], data['COMPSVvSP'])

# chi-square test
cs6 = stat.chi2_contingency(ct6)

print 'Same Village vs. Sporadic', '\n', 'chi-square value, p value, expected counts', '\n', cs6

Same Village vs. Sporadic 
chi-square value, p value, expected counts 
(16.65824712451105, 4.4755299363058195e-05, 1L, array([[   56.80754717,   255.19245283],
       [  329.19245283,  1478.80754717]]))
