In [76]:
import itertools
import os

import numpy as np
import pandas as pd
import scipy.stats as st
import scikit_posthocs

import iqplot

import bokeh.io
import bokeh.layouts
import bokeh.models
import bokeh.plotting
import bokeh.resources
bokeh.io.output_notebook()

## Exploratory Data Analysis

1. Loading the whole Excel file so that all of its sheets can be read later.

In [77]:
# Path to the workbook, built relative to the notebook's working directory.
path = os.path.join(
    '..',
    'data',
    'FPrausnitzii_motor_data.xlsx',
)
# Keep the workbook handle open so individual sheets can be parsed later.
data = pd.ExcelFile(path)

2. Creating a list of all sheets (tests) that we want to analyse.

In [78]:
# Every sheet whose name does not contain 'Cohort' is treated as one test.
tests = [sheet for sheet in data.sheet_names if 'Cohort' not in sheet]
tests

['Beam',
 'Beam_steps',
 'Pole',
 'Wirehang',
 'Sticker',
 'Hindlimb',
 'Fecal_output',
 'Fecal_score',
 'Water_content',
 'Carmine_red',
 'Bead_exp']

3. Parsing the Excel file into separate datasets (1 test = 1 dataset) and storing them in a dictionary keyed by the test/sheet name.

In [79]:
# Build one long-format DataFrame per test: one row per animal and trial.
data_dict = {}

for test in tests:
    sheet_df = data.parse(test)

    # Columns whose name contains 'Trial' hold the per-trial measurements.
    trial_cols = [col for col in sheet_df.columns if 'Trial' in col]

    temp_df = sheet_df.melt(
        id_vars=['Cohort', 'Genotype', 'Microbiome', 'Cage', 'ID'],
        value_vars=trial_cols,
        var_name='Trial',
        value_name='Measurement',
    )

    if test == 'Beam_steps':
        # Trial column names are presumably '<Type>_<Trial>' on this sheet;
        # split once and keep both halves as separate columns.
        parts = temp_df['Trial'].str.split('_', expand=True)
        temp_df['Type'] = parts[0]
        temp_df['Trial'] = parts[1]

    # Rows with any missing value are discarded before storing.
    temp_df = temp_df.dropna()
    data_dict[test] = temp_df

4. Plotting the raw data from all the tests for the EDA.

In [80]:
# One strip/box plot per test, grouped by Genotype and Microbiome.
plots = []

for test in tests:
    full_df = data_dict[test]
    # Fecal_output is plotted at 'Trial15'; every other test at 'Trial1'.
    shown_trial = 'Trial15' if test == 'Fecal_output' else 'Trial1'
    df = full_df.loc[full_df['Trial'] == shown_trial]

    p = iqplot.stripbox(
        df,
        q='Measurement',
        q_axis='y',
        cats=['Genotype', 'Microbiome'],
        tooltips=[
            ('Mouse', '@ID'),
            ('Cage', '@Cage'),
        ],
        title=test,
        color_column='ID',
    )

    # The y-limits come from all trials of the test, not only the plotted one.
    measurements = full_df.loc[:, 'Measurement']
    y_low = 0.8 * np.min(measurements) - 1
    y_high = 1.1 * np.max(measurements)
    p.y_range = bokeh.models.Range1d(y_low, y_high)
    plots.append(p)

lt = bokeh.layouts.grid(plots, ncols=2)
bokeh.io.show(lt)

# Also write every figure to its own HTML file; `plots` is in the same order as `tests`.
for test_name, fig in zip(tests, plots):
    bokeh.io.save(
        fig,
        filename=('../figures/' + 'EDA_Fprausnitzii_' + test_name + '.html'),
        title='Bokeh plot',
        resources=bokeh.resources.CDN)


## Pole descent

In [81]:
# Pole descent: gather the measurements of the selected trial for each
# Genotype x Microbiome combination that has data.
trial = 'Trial1'
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Pole'].loc[lambda d: d['Trial'] == trial]

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [82]:
# Omnibus Kruskal-Wallis test across the pole-descent groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(10.329344495555663), pvalue=np.float64(0.005714935614929976))

In [83]:
# Pairwise Conover post-hoc comparisons between the pole-descent groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_pole_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,0.003803,0.001988,0.005715
ASO_SPF,0.003803,1.0,0.834307,0.005715
ASO_Fpr,0.001988,0.834307,1.0,0.005715


## Beam cross

In [84]:
# Beam cross: gather the measurements of the selected trial for each
# Genotype x Microbiome combination that has data.
trial = 'Trial1'
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Beam'].loc[lambda d: d['Trial'] == trial]

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [85]:
# Omnibus Kruskal-Wallis test across the beam-cross groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(21.02895700020801), pvalue=np.float64(2.714063517716188e-05))

In [86]:
# Pairwise Conover post-hoc comparisons between the beam-cross groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_beam_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,5.673248e-07,1.1e-05,2.7e-05
ASO_SPF,5.673248e-07,1.0,0.293427,2.7e-05
ASO_Fpr,1.11713e-05,0.2934273,1.0,2.7e-05


## Sticker removal

In [87]:
# Sticker removal: gather the measurements of the selected trial for each
# Genotype x Microbiome combination that has data.
trial = 'Trial1'
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Sticker'].loc[lambda d: d['Trial'] == trial]

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [88]:
# Omnibus Kruskal-Wallis test across the sticker-removal groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(14.037830974397934), pvalue=np.float64(0.0008947953838723295))

In [89]:
# Pairwise Conover post-hoc comparisons between the sticker-removal groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_sticker_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,8.2e-05,0.070395,0.000895
ASO_SPF,8.2e-05,1.0,0.008303,0.000895
ASO_Fpr,0.070395,0.008303,1.0,0.000895


## Wirehang

In [90]:
# Wirehang: gather the 'Trial1' measurements for each Genotype x Microbiome
# combination that has data.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Wirehang'].copy()
work_df = work_df[work_df['Trial'] == 'Trial1']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values


### NHST

In [91]:
# Omnibus Kruskal-Wallis test across the wirehang groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(19.234772786514913), pvalue=np.float64(6.656135572079036e-05))

In [92]:
# Pairwise Conover post-hoc comparisons between the wirehang groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_wirehang_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,1e-06,0.00016,6.7e-05
ASO_SPF,1e-06,1.0,0.105464,6.7e-05
ASO_Fpr,0.00016,0.105464,1.0,6.7e-05


## Hindlimb

In [93]:
# Hindlimb: gather the 'Trial1' measurements for each Genotype x Microbiome
# combination that has data.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Hindlimb'].copy()
work_df = work_df[work_df['Trial'] == 'Trial1']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [94]:
# Omnibus Kruskal-Wallis test across the hindlimb groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(23.03800658268312), pvalue=np.float64(9.93940606253031e-06))

In [95]:
# Pairwise Conover post-hoc comparisons between the hindlimb groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_hindlimb_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,8.666641e-08,0.049669,1e-05
ASO_SPF,8.666641e-08,1.0,1.5e-05,1e-05
ASO_Fpr,0.04966885,1.500221e-05,1.0,1e-05


## Fecal output

In [96]:
# Fecal output: gather the 'Trial15' measurements for each Genotype x Microbiome
# combination that has data.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Fecal_output'].copy()
work_df = work_df[work_df['Trial'] == 'Trial15']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    # work_df already holds only 'Trial15' rows, so no further trial filter is needed.
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        # Measurements are cast to int32 before testing.
        group_vals[level_1 + '_' + level_2] = values.astype('int32')

### NHST

In [97]:
# Omnibus Kruskal-Wallis test across the fecal-output groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(1.408322805632333), pvalue=np.float64(0.4945231061033477))

In [98]:
# Pairwise Conover post-hoc comparisons between the fecal-output groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_fecoutput_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,0.245341,0.511089,0.494523
ASO_SPF,0.245341,1.0,0.568811,0.494523
ASO_Fpr,0.511089,0.568811,1.0,0.494523


## Fecal score

In [99]:
# Fecal score: gather the 'Trial1' measurements for each Genotype x Microbiome
# combination that has data.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Fecal_score'].copy()
work_df = work_df[work_df['Trial'] == 'Trial1']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [100]:
# Omnibus Kruskal-Wallis test across the fecal-score groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(29.2456438588259), pvalue=np.float64(4.460557336410156e-07))

In [101]:
# Pairwise Conover post-hoc comparisons between the fecal-score groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_fecal_score_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,2.045388e-10,0.1730298,4.460557e-07
ASO_SPF,2.045388e-10,1.0,1.84443e-09,4.460557e-07
ASO_Fpr,0.1730298,1.84443e-09,1.0,4.460557e-07


## Bead expulsion

In [102]:
# Bead expulsion: gather the 'Trial1' measurements for each Genotype x Microbiome
# combination that has data.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Bead_exp'].copy()
work_df = work_df[work_df['Trial'] == 'Trial1']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        # Measurements are cast to int32 before testing.
        group_vals[level_1 + '_' + level_2] = values.astype('int32')

### NHST

In [103]:
# Omnibus Kruskal-Wallis test across the bead-expulsion groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(19.399203693790138), pvalue=np.float64(6.130790012521373e-05))

In [104]:
# Pairwise Conover post-hoc comparisons between the bead-expulsion groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_beadexp_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,5e-06,0.236675,6.1e-05
ASO_SPF,5e-06,1.0,4.2e-05,6.1e-05
ASO_Fpr,0.236675,4.2e-05,1.0,6.1e-05


## Beam steps/slips

In [105]:
# Beam steps/slips: reshape so 'Steps' and 'Slips' become columns, derive a
# per-animal slip probability, then group the 'Trial1' values by
# Genotype x Microbiome.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = (
    data_dict['Beam_steps']
    .pivot(
        index=['ID', 'Trial', 'Cohort', 'Genotype', 'Microbiome', 'Cage'],
        columns='Type',
        values='Measurement',
    )
    .reset_index()
)
# NOTE(review): the factor 4 presumably scales the recorded step count to a
# total across all limbs - confirm against the protocol.
work_df['Total_steps'] = work_df['Steps'] * 4
work_df['Slip_probability'] = work_df['Slips'] / work_df['Total_steps']
work_df = work_df[work_df['Trial'] == 'Trial1']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Slip_probability'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [106]:
# Omnibus Kruskal-Wallis test across the slip-probability groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(28.297535771065167), pvalue=np.float64(7.165857731539214e-07))

In [107]:
# Pairwise Conover post-hoc comparisons between the slip-probability groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
# Write to a beam-steps specific file; the previous 'beadexp' file name
# overwrote the bead-expulsion results.
res.to_csv("../output/Fprausnitzii_beamsteps_pvals_conover_test.csv")
res

Unnamed: 0,ASO_SPF,ASO_Fpr,WT_SPF,Kruskal-Wallis
ASO_SPF,1.0,4.5e-05,2.849863e-11,7.165858e-07
ASO_Fpr,4.498464e-05,1.0,1.597817e-05,7.165858e-07
WT_SPF,2.849863e-11,1.6e-05,1.0,7.165858e-07


## Carmine red

In [108]:
# Carmine red: gather the measurements for each Genotype x Microbiome
# combination that has data.
# NOTE(review): no Trial filter is applied here, so all trials on the sheet
# are pooled - confirm the sheet holds a single trial.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Carmine_red']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        # Measurements are cast to int32 before testing.
        group_vals[level_1 + '_' + level_2] = values.astype('int32')

### NHST

In [109]:
# Omnibus Kruskal-Wallis test across the carmine-red groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(11.59727666312338), pvalue=np.float64(0.0030316800791488348))

In [110]:
# Pairwise Conover post-hoc comparisons between the carmine-red groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
# Write to a carmine-red specific file; the previous 'beadexp' file name
# overwrote the bead-expulsion results.
res.to_csv("../output/Fprausnitzii_carmine_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,0.032557,0.194659,0.003032
ASO_SPF,0.032557,1.0,0.000396,0.003032
ASO_Fpr,0.194659,0.000396,1.0,0.003032


## Water content

In [111]:
# Water content: gather the 'Trial1' measurements for each Genotype x Microbiome
# combination that has data.
effect_1, effect_2 = 'Genotype', 'Microbiome'

work_df = data_dict['Water_content'].copy()
work_df = work_df[work_df['Trial'] == 'Trial1']

effect1_lst = work_df[effect_1].unique()
effect2_lst = work_df[effect_2].unique()

group_vals = {}
for level_1, level_2 in itertools.product(effect1_lst, effect2_lst):
    in_group = (work_df[effect_1] == level_1) & (work_df[effect_2] == level_2)
    values = work_df.loc[in_group, 'Measurement'].values
    # Combinations without any rows are left out of the comparison.
    if len(values) > 0:
        group_vals[level_1 + '_' + level_2] = values

### NHST

In [112]:
# Omnibus Kruskal-Wallis test across the water-content groups in `group_vals`.
res_kw = st.kruskal(*group_vals.values())
res_kw

KruskalResult(statistic=np.float64(15.927219029572782), pvalue=np.float64(0.00034789511587313734))

In [113]:
# Pairwise Conover post-hoc comparisons between the water-content groups.
# NOTE(review): p_adjust is not passed, so the p-values are presumably
# unadjusted for multiple comparisons - confirm this is intended.
names = list(group_vals.keys())
res = scikit_posthocs.posthoc_conover(list(group_vals.values()))
# The result is labelled 1..k in input order; map each label back to its group
# name for however many groups exist instead of hard-coding exactly three.
labels = dict(enumerate(names, start=1))
res = res.rename(columns=labels, index=labels)
# Store the omnibus Kruskal-Wallis p-value next to the pairwise results.
res['Kruskal-Wallis'] = res_kw.pvalue
res.to_csv("../output/Fprausnitzii_watercont_pvals_conover_test.csv")
res

Unnamed: 0,WT_SPF,ASO_SPF,ASO_Fpr,Kruskal-Wallis
WT_SPF,1.0,2.1e-05,0.000953,0.000348
ASO_SPF,2.1e-05,1.0,0.149596,0.000348
ASO_Fpr,0.000953,0.149596,1.0,0.000348
