In [61]:
import itertools
import os

import numpy as np
import pandas as pd
import scikit_posthocs
import scipy.stats as st

import iqplot

import bokeh.io
import bokeh.layouts
import bokeh.plotting
import bokeh.resources
# Route Bokeh output to the notebook so figures shown later render inline.
bokeh.io.output_notebook()

## Exploratory Data Analysis

1. Loading the whole Excel workbook so that its individual sheets can be read later.

In [62]:
# Relative location of the workbook that holds the behavioural-test sheets.
path = os.path.join('..', 'data', 'benCom-long.xlsx')
# Open the workbook once; later cells read its sheet names and parse sheets from this handle.
data = pd.ExcelFile(path)

2. Creating a list of all sheets (tests) that we want to analyse.

In [63]:
# Every sheet is treated as a test except those whose name contains 'Cohort'.
tests = [sheet for sheet in data.sheet_names if 'Cohort' not in sheet]
tests

['Beam',
 'Beam_steps',
 'Pole',
 'Wirehang',
 'Sticker',
 'Hindlimb',
 'Fecal_output',
 'Fecal_score',
 'Water_content',
 'Carmine_red',
 'Bead_exp']

3. Parsing the Excel file into separate datasets (one test = one dataset) and storing them in a dictionary keyed by the name of the test/sheet.

In [64]:
# Identifier columns shared by every sheet; used both as melt ids and as the
# per-mouse grouping key.
id_cols = ['Cohort', 'Genotype', 'Microbiome', 'Cage', 'ID']

data_dict = {}

for test in tests:
    temp_df = data.parse(test)

    # Every column whose name contains 'Trial' holds one repeated measurement.
    trial_cols = [col for col in temp_df.columns if 'Trial' in col]

    # Wide -> long: one row per (mouse, trial).
    temp_df = temp_df.melt(
        id_vars=id_cols,
        value_vars=trial_cols,
        var_name='Trial',
        value_name='Measurement',
    )

    if test == 'Beam_steps':
        # Trial labels are '<Type>_<Trial>'; split once and reuse both parts.
        split_parts = temp_df['Trial'].str.split('_', expand=True)
        temp_df['Type'] = split_parts[0]
        temp_df['Trial'] = split_parts[1]

        # One column per Type so 'Steps' and 'Slips' sit side by side for each trial.
        temp_df = temp_df.pivot(
            index=['ID', 'Trial', 'Cohort', 'Genotype', 'Microbiome', 'Cage'],
            columns='Type',
            values='Measurement',
        ).reset_index()

        temp_df['Total_steps'] = temp_df['Steps'] * 4
        temp_df['Slip_probability'] = temp_df['Slips'] / temp_df['Total_steps']

        # Average the slip probability over trials and expose it under the
        # common 'Measurement' column name.
        temp_df = (
            temp_df
            .groupby(by=id_cols)['Slip_probability']
            .mean()
            .reset_index()
            .rename(columns={'Slip_probability': 'Measurement'})
        )
    elif test == 'Fecal_output':
        # Only the 'Trial15' column is analysed for this test; no averaging.
        temp_df = temp_df.loc[temp_df['Trial'] == 'Trial15']
    else:
        # Default: per-mouse mean over all trial columns.
        temp_df = temp_df.groupby(by=id_cols)['Measurement'].mean().reset_index()

    # Drop rows that still contain missing values before storing the dataset.
    data_dict[test] = temp_df.dropna()

4. Plotting the raw data from all the tests for the EDA.

In [65]:
def _eda_stripbox(test_name):
    """Return a strip-box plot of one test's measurements per Genotype/Microbiome group.

    Points are coloured by mouse ID; the hover tooltip shows the mouse ID and cage.
    """
    return iqplot.stripbox(
        data_dict[test_name],
        q='Measurement',
        q_axis='y',
        cats=['Genotype', 'Microbiome'],
        tooltips=[
            ('Mouse', '@ID'),
            ('Cage', '@Cage')
        ],
        title=test_name,
        color_column='ID',
    )


# One figure per test, in the same order as `tests`.
plots = [_eda_stripbox(test_name) for test_name in tests]

# Show all figures together, two per row.
lt = bokeh.layouts.grid(plots, ncols=2)
bokeh.io.show(lt)

# Persist every EDA figure as a standalone HTML file, one file per test.
figures_dir = os.path.join('..', 'figures')
# A fresh checkout may not contain the folder yet; create it instead of failing on save.
os.makedirs(figures_dir, exist_ok=True)

# `plots` was built in the same order as `tests`, so pair them directly
# rather than indexing `tests` by position.
for test_name, plot in zip(tests, plots):
    bokeh.io.save(
        plot,
        filename=os.path.join(figures_dir, 'EDA_Bencom_' + test_name + '.html'),
        title='Bokeh plot',
        resources=bokeh.resources.CDN)


## Pole descent

In [66]:
work_df = data_dict['Pole']

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [67]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(8.91679397525553), pvalue=np.float64(0.011580912762417413))

In [68]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_pole_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.09059,0.006198,0.011581
ASO_bC,0.09059,1.0,0.175293,0.011581
WT_SPF,0.006198,0.175293,1.0,0.011581


## Beam cross

In [69]:
work_df = data_dict['Beam']

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [70]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(19.568520710059175), pvalue=np.float64(5.6331292789915556e-05))

In [71]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_beam_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.73193,1.3e-05,5.6e-05
ASO_bC,0.73193,1.0,1.3e-05,5.6e-05
WT_SPF,1.3e-05,1.3e-05,1.0,5.6e-05


## Sticker removal

In [72]:
work_df = data_dict['Sticker']

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [73]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(19.86871808086631), pvalue=np.float64(4.8480008521333334e-05))

In [74]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_sticker_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.003246,9.185881e-07,4.8e-05
ASO_bC,0.003245537,1.0,0.003245537,4.8e-05
WT_SPF,9.185881e-07,0.003246,1.0,4.8e-05


## Wirehang

In [75]:
work_df = data_dict['Wirehang'].copy()

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values


### NHST

In [76]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(20.615435162861054), pvalue=np.float64(3.3374528074606554e-05))

In [77]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_wirehang_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.763306,8e-06,3.3e-05
ASO_bC,0.763306,1.0,4e-06,3.3e-05
WT_SPF,8e-06,4e-06,1.0,3.3e-05


## Hindlimb

In [78]:
work_df = data_dict['Hindlimb'].copy()

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [79]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(12.939849801939362), pvalue=np.float64(0.0015493420826462657))

In [80]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_hindlimb_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.002659,0.000654,0.001549
ASO_bC,0.002659,1.0,0.516273,0.001549
WT_SPF,0.000654,0.516273,1.0,0.001549


## Fecal output

In [81]:
work_df = data_dict['Fecal_output'].copy()
# Restrict to the 'Trial15' rows once, up front. The per-group mask below no
# longer repeats this condition because it is identical for every group.
work_df = work_df.loc[work_df['Trial'] == 'Trial15']

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) != 0:
        # NOTE(review): presumably these are whole-number counts; the int32 cast
        # would truncate any fractional value - confirm against the sheet.
        group_vals[level_1 + '_' + level_2] = values.astype('int32')

### NHST

In [82]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(3.136874538356031), pvalue=np.float64(0.20837055514468025))

In [83]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_fecoutput_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_bC,Kruskal-Wallis
WT_SPF,1.0,0.230255,0.230255,0.208371
ASO_SPF,0.230255,1.0,0.847247,0.208371
ASO_bC,0.230255,0.847247,1.0,0.208371


## Fecal score

In [84]:
work_df = data_dict['Fecal_score'].copy()

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [85]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(19.117927136007715), pvalue=np.float64(7.056589752860742e-05))

In [86]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_fecal_score_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.004568,2e-06,7.1e-05
ASO_bC,0.004568,1.0,0.006583,7.1e-05
WT_SPF,2e-06,0.006583,1.0,7.1e-05


## Bead expulsion

In [87]:
work_df = data_dict['Bead_exp'].copy()

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) != 0:
        # 'Measurement' is a per-mouse mean over trials (computed when the sheet
        # was parsed), so it is kept as float: an int32 cast would truncate
        # fractional means and create artificial ties in the rank-based tests.
        # Whole-number values rank identically as floats.
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [88]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(14.764041721977692), pvalue=np.float64(0.0006223419477943169))

In [89]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_beadexp_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.031326,0.000101,0.000622
ASO_bC,0.031326,1.0,0.010535,0.000622
WT_SPF,0.000101,0.010535,1.0,0.000622


## Beam steps/slips

In [90]:
# data_dict['Beam_steps'] already stores the per-mouse mean slip probability
# (Slips / (Steps * 4), averaged over trials) in 'Measurement'; that was
# computed while the sheet was parsed, so no further reshaping is needed here.
work_df = data_dict['Beam_steps']

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [91]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(24.7615871961726), pvalue=np.float64(4.198456508956678e-06))

In [92]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_beamsteps_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.6038566,2.26757e-07,4e-06
ASO_bC,0.6038566,1.0,3.217036e-08,4e-06
WT_SPF,2.26757e-07,3.217036e-08,1.0,4e-06


## Carmine red

In [93]:
work_df = data_dict['Carmine_red']

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) != 0:
        # 'Measurement' is a per-mouse mean over trials (computed when the sheet
        # was parsed), so it is kept as float: an int32 cast would truncate
        # fractional means and create artificial ties in the rank-based tests.
        # Whole-number values rank identically as floats.
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [94]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(5.08941384338645), pvalue=np.float64(0.07849605410046921))

In [95]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_carminered_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.933,0.096629,0.078496
ASO_bC,0.933,1.0,0.096629,0.078496
WT_SPF,0.096629,0.096629,1.0,0.078496


## Water content

In [96]:
work_df = data_dict['Water_content'].copy()

effect_1 = 'Genotype'
effect_2 = 'Microbiome'

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

# Collect the measurements of every non-empty Genotype x Microbiome combination,
# keyed '<Genotype>_<Microbiome>'; insertion order follows the order of the unique levels.
group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [97]:
# Omnibus Kruskal-Wallis H-test across all collected groups.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(9.846132118046107), pvalue=np.float64(0.007276785554293323))

In [98]:
# Pairwise Conover post-hoc test with Benjamini-Hochberg FDR adjustment.
res = scikit_posthocs.posthoc_conover(list(group_vals.values()), p_adjust="fdr_bh")
names = list(group_vals.keys())
# The result is labelled 1..k in input order; map every position to its group
# name instead of hard-coding exactly three groups.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Repeat the omnibus p-value on every row so the CSV is self-contained.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Bencom_watercont_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_bC,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,0.386822,0.005609,0.007277
ASO_bC,0.386822,1.0,0.02096,0.007277
WT_SPF,0.005609,0.02096,1.0,0.007277
