# Poop Power: Beta Diversity Analysis, Rarefaction and Significance Tests

In [2]:
# importing all required packages & notebook extensions at the start of the notebook
import os
import pandas as pd
import qiime2 as q2
import seaborn as sns
import matplotlib.pyplot as plt
from skbio import OrdinationResults
from qiime2 import Visualization as vis
from seaborn import scatterplot

%matplotlib inline

In [3]:
# Project directory layout. Every path is derived from `base_dir`, so moving the
# data folder only requires changing this one line.
base_dir = 'poop_data'
data_dir = f'{base_dir}/BetaDiversity'   # outputs written by this notebook
div_dir = f'{base_dir}/Diversity'        # holds core-metrics-results/
phy_dir = f'{base_dir}/Phylogeny'
tox_dir = f'{base_dir}/Taxonomy'
# Folders for the per-column PERMANOVA visualizations (unweighted / weighted UniFrac).
extract_dir = f'{data_dir}/uw_unifrac_significance_permanova'
w_extract_dir = f'{data_dir}/w_unifrac_significance_permanova'

## 1. Metadata



In [4]:
# Load the sample metadata, indexed by sample id.
# The ids look numeric (e.g. 10317.000053480), so they are read explicitly as
# strings: if pandas inferred a float, trailing zeros would be dropped and the
# id would no longer match the one used in the QIIME 2 artifacts.
df_metadata = (
    pd.read_csv('poop_data/metadata.tsv', sep='\t', dtype={'sampleid': str})
    .set_index('sampleid')
)
metadata_col = list(df_metadata.columns)

# Keep only samples that have a value in every column
# (per-column missing counts: df_metadata.isna().sum()).
df_metadata = df_metadata.dropna()
len(df_metadata)

459

In [4]:
# Number of samples removed by dropna(): rows in the metadata file minus rows
# still present in df_metadata. Computed instead of typed in by hand so the
# figure cannot go stale when the metadata changes.
len(pd.read_csv('poop_data/metadata.tsv', sep='\t')) - len(df_metadata)

64

In [5]:
df_metadata.nunique()

GEN_age_cat                       8
GEN_age_corrected                71
GEN_bmi_cat                       4
GEN_bmi_corrected               343
GEN_cat                           2
GEN_collection_timestamp        446
GEN_country                      10
GEN_dog                           2
GEN_elevation                   335
GEN_geo_loc_name                 52
GEN_height_cm                    53
GEN_host_common_name              1
GEN_last_move                     6
GEN_last_travel                   6
GEN_latitude                    147
GEN_level_of_education            8
GEN_longitude                   167
GEN_race                          6
GEN_sample_type                   1
GEN_sex                           4
GEN_weight_kg                    80
HEA_acid_reflux                   2
HEA_add_adhd                      2
HEA_allergic_to_peanuts           2
HEA_antibiotic_history            6
HEA_appendix_removed              2
HEA_autoimmune                    2
HEA_bowel_movement_frequency

In [15]:
# Columns used for the PERMANOVA / ADONIS tests. Selection criteria noted by the
# author: (a) more than one distinct value, (b) categorical (PERMANOVA only works
# on those), (c) columns that raised an error during PERMANOVA were taken into
# account (presumably left out). Clearly collinear variables would also have to
# be removed.
# The order matters: it determines the order of the terms in the ADONIS formula.
metadata_col_cat = [
    'GEN_age_cat',
    'GEN_bmi_cat',
    'GEN_cat',
    'GEN_dog',
    'GEN_last_move',
    'GEN_last_travel',
    'GEN_level_of_education',
    'GEN_race',
    'GEN_sex',
    'HEA_acid_reflux',
    'HEA_add_adhd',
    'HEA_allergic_to_peanuts',
    'HEA_antibiotic_history',
    'HEA_appendix_removed',
    'HEA_autoimmune',
    'HEA_bowel_movement_frequency',
    'HEA_bowel_movement_quality',
    'HEA_cancer',
    'HEA_cancer_treatment',
    'HEA_cardiovascular_disease',
    'HEA_cdiff',
    'HEA_chickenpox',
    'HEA_csection',
    'HEA_diabetes',
    'HEA_exercise_frequency',
    'HEA_ibd',
    'HEA_ibs',
    'HEA_liver_disease',
    'HEA_lung_disease',
    'HEA_mental_illness',
    'HEA_migraine',
    'HEA_seasonal_allergies',
    'HEA_sibo',
    'HEA_skin_condition',
    'HEA_sleep_duration',
    'HEA_smoking_frequency',
    'HEA_thyroid',
    'HEA_weight_change',
]

# Thematic groups of the columns above, one list per topic.
metadata_col_intestine_disease = [
    'HEA_cdiff',
    'HEA_ibd',
    'HEA_ibs',
    'HEA_sibo',
    'HEA_acid_reflux',
]
metadata_col_intestine_detail = [
    'HEA_appendix_removed',
    'HEA_bowel_movement_frequency',
    'HEA_bowel_movement_quality',
]
metadata_col_MO_interruption = [
    'HEA_csection',
    'HEA_antibiotic_history',
    'GEN_last_move',
    'GEN_last_travel',
    'HEA_weight_change',
]
metadata_col_lifestyle = [
    'GEN_cat',
    'GEN_dog',
    'HEA_exercise_frequency',
    'HEA_sleep_duration',
    'HEA_smoking_frequency',
]
metadata_col_disease_history = [
    'HEA_cancer_treatment',
    'HEA_chickenpox',
]
metadata_col_active_disease = [
    'HEA_cancer',
    'HEA_diabetes',
    'HEA_thyroid',
    'HEA_migraine',
    'HEA_lung_disease',
    'HEA_liver_disease',
    'HEA_cardiovascular_disease',
]
metadata_col_allergy = [
    'HEA_autoimmune',
    'HEA_seasonal_allergies',
    'HEA_allergic_to_peanuts',
]
metadata_col_mental = [
    'HEA_add_adhd',
    'HEA_mental_illness',
]
metadata_col_body = [
    'HEA_weight_change',
    'GEN_age_cat',
    'GEN_bmi_cat',
    'GEN_race',
    'GEN_sex',
]

## 2. Beta Diversity
### Principal Coordinates Plots PCoA
#### a) unweighted_unifrac_emperor

In [7]:
vis.load(f'{div_dir}/core-metrics-results/unweighted_unifrac_emperor.qzv')

#### b) weighted_unifrac_emperor

In [8]:
vis.load(f'{div_dir}/core-metrics-results/weighted_unifrac_emperor.qzv')

#### c) bray_curtis_emperor

In [9]:
vis.load(f'{div_dir}/core-metrics-results/bray_curtis_emperor.qzv')

#### d) jaccard emperor

In [10]:
vis.load(f'{div_dir}/core-metrics-results/jaccard_emperor.qzv')

## 3. PERMANOVA
#### Anova Test 1: Do the sample coordinates differ significantly between the values of a metadata column? (one column at a time)

Possible extension: also run the PERMANOVA for weighted UniFrac, Jaccard and Bray-Curtis to look at the differences.

In [6]:
! qiime diversity beta-group-significance \
    --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --m-metadata-column  GEN_bmi_cat \
    --p-pairwise \
    --o-visualization $data_dir/uw_unifrac-GEN_bmi_cat-significance.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac-GEN_bmi_cat-significance.qzv[0m
[0m

In [None]:
#doing the permanova test for all of the columns: this will be done further down again, but with additionally other steps
#for column in metadata_col_cat:
 #   ! qiime diversity beta-group-significance \
  #      --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
   #     --m-metadata-file $base_dir/metadata.tsv \
    #    --m-metadata-column  $column \
     #   --p-pairwise \
      #  --o-visualization $data_dir/uw_unifrac-$column-significance.qzv

In [8]:
vis.load(f'{data_dir}/uw_unifrac-GEN_bmi_cat-significance.qzv')

In [15]:
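### This does not work as written: Jupyter only renders the value of a cell's last
### expression, so a visualization loaded inside a loop is never shown. Passing each one
### to display() (from IPython.display) should render it. Note also that the class is
### imported as `vis` in this notebook, not as `Visualization`.
#for column in metadata_col_cat:
   # display(vis.load(f'{data_dir}/uw_unifrac-{column}-significance.qzv'))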
    

In [16]:
vis.load(f'{extract_dir}/uw_unifrac-HEA_acid_reflux-significance.qzv')

#### Anova Test 2: several columns at once (ADONIS)
Possible extension: also run the test for weighted UniFrac, Jaccard and Bray-Curtis to look at the differences.

In [16]:
# Build the R formula for the ADONIS test: every categorical metadata column,
# joined with "+" (additive terms only).
# str.join places the separator strictly between items, so no column has to be
# special-cased as "the last one" and the formula stays valid when
# metadata_col_cat is reordered or extended.
columns = "+".join(metadata_col_cat)
print(columns)

GEN_age_cat+GEN_bmi_cat+GEN_cat+GEN_dog+GEN_last_move+GEN_last_travel+GEN_level_of_education+GEN_race+GEN_sex+HEA_acid_reflux+HEA_add_adhd+HEA_allergic_to_peanuts+HEA_antibiotic_history+HEA_appendix_removed+HEA_autoimmune+HEA_bowel_movement_frequency+HEA_bowel_movement_quality+HEA_cancer+HEA_cancer_treatment+HEA_cardiovascular_disease+HEA_cdiff+HEA_chickenpox+HEA_csection+HEA_diabetes+HEA_exercise_frequency+HEA_ibd+HEA_ibs+HEA_liver_disease+HEA_lung_disease+HEA_mental_illness+HEA_migraine+HEA_seasonal_allergies+HEA_sibo+HEA_skin_condition+HEA_sleep_duration+HEA_smoking_frequency+HEA_thyroid+HEA_weight_change


In [18]:
# Write the NaN-free metadata as a tab-separated file (the path keeps its
# existing .csv suffix) and load it again.
dropna_path = f'{data_dir}/metadata_dropna.csv'
df_metadata.to_csv(dropna_path, sep='\t')
# Read the sample ids back as strings: they look numeric (e.g. 10317.000053480)
# and float inference would strip trailing zeros, changing the id.
df_metadata3 = pd.read_csv(dropna_path, sep='\t', dtype={'sampleid': str})

In [19]:
# Hand-collected list of sample ids; the cell displays how many there are.
# Presumably these are the ids QIIME 2 reported as missing when the NaN-free
# metadata was used -- TODO confirm where the list came from.
# NOTE(review): many of these ids end in '0'. If the ids were parsed as floats
# somewhere upstream, that trailing zero would be lost when the table is written
# out again, and the id would stop matching -- worth checking.
missing_samples = ['10317.000047381', '10317.000036431', '10317.000053480', '10317.000054310', 
                   '10317.000047230', '10317.000054330', '10317.000039980', '10317.000030366', 
                   '10317.000047370', '10317.000031598', '10317.000042660', '10317.000062070', 
                   '10317.000044340', '10317.000050288', '10317.000047228', '10317.000040490', 
                   '10317.000046290', '10317.000046305', '10317.000047404', '10317.000051558', 
                   '10317.000036170', '10317.000037960', '10317.000002930', '10317.000053353', 
                   '10317.000053458', '10317.000047151', '10317.000051160', '10317.000048326', 
                   '10317.000048283', '10317.000047140', '10317.000053435', '10317.000033294', 
                   '10317.000042590', '10317.000052055', '10317.000041592', '10317.000042635', 
                   '10317.000051560', '10317.000042655', '10317.000053430', '10317.000050273', 
                   '10317.000047606', '10317.000050240', '10317.000030383', '10317.000027920', 
                   '10317.000047141', '10317.000028654', '10317.000052260', '10317.000046121',
                   '10317.000050156', '10317.000052034', '10317.000054323', '10317.000051258', 
                   '10317.000044550', '10317.000042969', '10317.000062073', '10317.000038019',
                   '10317.000058480', '10317.000046336', '10317.000052280', '10317.000052450', 
                   '10317.000051561', '10317.000051130', '10317.000041730', '10317.000050294', 
                   '10317.000052030', '10317.000052380', '10317.000047777', '10317.000048284',
                   '10317.000047680', '10317.000030384', '10317.000047610', '10317.000047380', 
                   '10317.000051210', '10317.000047196', '10317.000050290', '10317.000051100', 
                   '10317.000047220', '10317.000058550', '10317.000044252', '10317.000053433', 
                   '10317.000053410', '10317.000054313', '10317.000042865', '10317.000062083', 
                   '10317.000062076', '10317.000038081', '10317.000051180', '10317.000052448', 
                   '10317.000047197', '10317.000042845', '10317.000052431', '10317.000030255',
                   '10317.000048277', '10317.000032650', '10317.000047620', '10317.000052370', 
                   '10317.000052432', '10317.000053310', '10317.000046270']
len(missing_samples)

99

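**This is odd, though: `.dropna()` removed only 64 rows (523 − 459), not 99. Where did the other 35 samples get lost? One thing to check: many of the listed ids end in `0`, so any id that was parsed as a float would lose that trailing zero when written back out and would no longer match.**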

In [13]:
# ADONIS on the unweighted UniFrac distance matrix, with the formula held in `columns`.
# History of this cell: adonis did not accept metadata with missing values, so the rows
# with missing data were dropped; it then apparently complained about samples that were
# missing from the metadata. Filling the gaps with placeholder values is not really an
# option for boolean columns.
# The call below passes the original metadata.tsv.
! qiime diversity adonis \
    --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --p-formula $columns \
    --o-visualization $data_dir/uw_unifrac-overall-significance-adonis.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac-overall-significance-adonis.qzv[0m
[0m

In [17]:
#same for weighted unifrac
! qiime diversity adonis \
    --i-distance-matrix $div_dir/core-metrics-results/weighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --p-formula $columns \
    --o-visualization $data_dir/w_unifrac-overall-significance-adonis.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac-overall-significance-adonis.qzv[0m
[0m

In [20]:
vis.load(f'{data_dir}/w_unifrac-overall-significance-adonis.qzv')

In [22]:
! qiime diversity adonis \
    --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --p-formula  'HEA_cdiff+HEA_ibd+HEA_ibs+HEA_sibo+HEA_acid_reflux'\
    --o-visualization $data_dir/uw_unifrac-cdiff_ibd_ibs_sibo_acidrefl-significance-adonis.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac-cdiff_ibd_ibs_sibo_acidrefl-significance-adonis.qzv[0m
[0m

In [23]:
vis.load(f'{data_dir}/uw_unifrac-cdiff_ibd_ibs_sibo_acidrefl-significance-adonis.qzv')

In [24]:
! qiime diversity adonis \
    --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --p-formula  'HEA_csection+HEA_antibiotic_history+GEN_last_move+GEN_last_travel+HEA_weight_change' \
    --o-visualization $data_dir/uw_unifrac-MO_interruption-significance-adonis.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac-MO_interruption-significance-adonis.qzv[0m
[0m

In [18]:
vis.load(f'{data_dir}/uw_unifrac-MO_interruption-significance-adonis.qzv')

In [26]:
! qiime diversity adonis \
    --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --p-formula  'HEA_cancer+HEA_diabetes+HEA_thyroid+HEA_migraine+HEA_lung_disease+HEA_liver_disease+HEA_cardiovascular_disease' \
    --o-visualization $data_dir/uw_unifrac-active_diseases-significance-adonis.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac-active_diseases-significance-adonis.qzv[0m
[0m

In [27]:
vis.load(f'{data_dir}/uw_unifrac-active_diseases-significance-adonis.qzv')

In [28]:
! qiime diversity adonis \
    --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --p-formula  'GEN_cat+GEN_dog+HEA_exercise_frequency+HEA_sleep_duration+HEA_smoking_frequency' \
    --o-visualization $data_dir/uw_unifrac-lifestyle-significance-adonis.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac-lifestyle-significance-adonis.qzv[0m
[0m

In [29]:
vis.load(f'{data_dir}/uw_unifrac-lifestyle-significance-adonis.qzv')

In [30]:
! qiime diversity adonis \
    --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
    --m-metadata-file $base_dir/metadata.tsv \
    --p-formula  'HEA_appendix_removed+HEA_bowel_movement_frequency+HEA_bowel_movement_quality' \
    --o-visualization $data_dir/uw_unifrac-intestine_detail-significance-adonis.qzv

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac-intestine_detail-significance-adonis.qzv[0m
[0m

In [31]:
vis.load(f'{data_dir}/uw_unifrac-intestine_detail-significance-adonis.qzv')

adonis: simply try again with fewer columns at once (maybe group them sensibly);
    maybe add the GEN factors with multiplication to account for interactions? Try it out...
    Otherwise, if no interactions are assumed, simply combine the columns with plus.

In [32]:
! qiime diversity adonis --help

Usage: [94mqiime diversity adonis[0m [OPTIONS]

  Determine whether groups of samples are significantly different from one
  another using the ADONIS permutation-based statistical test in vegan-R.
  The function partitions sums of squares of a multivariate data set, and is
  directly analogous to MANOVA (multivariate analysis of variance). This
  action differs from beta_group_significance in that it accepts R formulae
  to perform multi-way ADONIS tests; beta_group_signficance only performs
  one-way tests. For more details, consult the reference manual available on
  the CRAN vegan page: https://CRAN.R-project.org/package=vegan

[1mInputs[0m:
  [94m[4m--i-distance-matrix[0m ARTIFACT
    [32mDistanceMatrix[0m     Matrix of distances between pairs of samples.
                                                                    [35m[required][0m
[1mParameters[0m:
  [94m[4m--m-metadata-file[0m METADATA...
    (multiple          Sample metadata containing formula terms.
   

In [33]:
metadata_col_cat_trial = ['HEA_cardiovascular_disease', 'HEA_cdiff']

In [34]:
#empty dataframe for adding the other dataframes with q-value-informations to it
df_empty = pd.DataFrame(columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations', 'pseudo-F', 
                                   'p-value', 'q-value', 'column'])

for column in metadata_col_cat:
    #producing path name:
    path = f'{extract_dir}/uw_unifrac-{column}-significance.qzv'
    name = f'uw_unifrac-{column}-significance.qzv'
    
    #permanova visualisation production for every column
    ! qiime diversity beta-group-significance \
        --i-distance-matrix $div_dir/core-metrics-results/unweighted_unifrac_distance_matrix.qza \
        --m-metadata-file $base_dir/metadata.tsv \
        --m-metadata-column  $column \
        --p-pairwise \
        --o-visualization $data_dir/uw_unifrac_significance_permanova/uw_unifrac-$column-significance.qzv
    
    #extract the qzv that was just produced
    vis.extract(f'{extract_dir}/{name}', output_dir = f'{extract_dir}/permanova_extracted')
    
    #get the uuid of it
    uid = vis.peek(f'{extract_dir}/{name}').uuid
    
    #go to the folder of the uuid and turn the permanova_paired table as a dataframe
    df_permanova = pd.read_csv(f'{extract_dir}/permanova_extracted/{uid}/data/permanova-pairwise.csv')
    
    #add a new column for the assignment to the original column
    df_permanova['column'] = column
    
    #concat the new dataframe with the ones before
    df_empty = pd.concat([df_empty, df_permanova])
    
#close the loop
    
df_empty

[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac_significance_permanova/uw_unifrac-GEN_age_cat-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac_significance_permanova/uw_unifrac-GEN_bmi_cat-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac_significance_permanova/uw_unifrac-GEN_cat-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac_significance_permanova/uw_unifrac-GEN_dog-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac_significance_permanova/uw_unifrac-GEN_last_move-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac_significance_permanova/uw_unifrac-GEN_last_travel-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/uw_unifrac_significance_permanova/uw_unifrac-GEN_level_of_education-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiv

Unnamed: 0,Group 1,Group 2,Sample size,Permutations,pseudo-F,p-value,q-value,column
0,20s,30s,118,999,1.357405,0.064,0.177231,GEN_age_cat
1,20s,40s,149,999,2.110666,0.002,0.018000,GEN_age_cat
2,20s,50s,164,999,2.972795,0.001,0.018000,GEN_age_cat
3,20s,60s,124,999,3.032620,0.001,0.018000,GEN_age_cat
4,20s,70+,78,999,2.320031,0.002,0.018000,GEN_age_cat
...,...,...,...,...,...,...,...,...
1,Decreased more than 10 pounds,Not provided,63,999,1.077469,0.319,0.319000,HEA_weight_change
2,Decreased more than 10 pounds,Remained stable,460,999,1.247888,0.101,0.151500,HEA_weight_change
3,Increased more than 10 pounds,Not provided,48,999,1.149553,0.226,0.271200,HEA_weight_change
4,Increased more than 10 pounds,Remained stable,445,999,1.908779,0.005,0.030000,HEA_weight_change


In [35]:
df_empty.to_csv(f'{data_dir}/uw_unifrac_significance.csv')

In [36]:
# Reload the saved pairwise PERMANOVA table and keep the comparisons whose
# q-value lies below the 0.05 significance level, smallest q-value first.
df_unifrac_sig = pd.read_csv(f'{data_dir}/uw_unifrac_significance.csv')
uw_below_threshold = df_unifrac_sig['q-value'].lt(0.05)
df_significant = df_unifrac_sig.loc[uw_below_threshold]
df_significant.sort_values(by='q-value')

Unnamed: 0.1,Unnamed: 0,Group 1,Group 2,Sample size,Permutations,pseudo-F,p-value,q-value,column
209,0,False,True,508,999,2.741888,0.001,0.001,HEA_ibd
190,0,False,True,508,999,2.147153,0.001,0.001,HEA_cdiff
137,7,I have not taken antibiotics in the past year.,Week,353,999,3.67775,0.001,0.005,HEA_antibiotic_history
135,5,I have not taken antibiotics in the past year.,Month,357,999,2.437568,0.001,0.005,HEA_antibiotic_history
173,5,I tend to be constipated (have difficulty pass...,I tend to have normal formed stool - Type 3 and 4,403,999,2.404935,0.001,0.005,HEA_bowel_movement_quality
172,4,I tend to be constipated (have difficulty pass...,I tend to have diarrhea (watery stool) - Type ...,139,999,2.380821,0.001,0.005,HEA_bowel_movement_quality
130,0,6 months,I have not taken antibiotics in the past year.,409,999,3.454242,0.001,0.005,HEA_antibiotic_history
216,0,False,True,508,999,1.748658,0.007,0.007,HEA_sibo
148,1,Five or more,Less than one,64,999,3.122729,0.001,0.007,HEA_bowel_movement_frequency
161,14,Less than one,Two,180,999,2.084991,0.001,0.007,HEA_bowel_movement_frequency


### Now doing the same thing (PERMANOVA) but for weighted unifrac results!

In [38]:
#empty dataframe for adding the other dataframes with q-value-informations to it
df_empty2 = pd.DataFrame(columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations', 'pseudo-F', 
                                   'p-value', 'q-value', 'column'])

for column in metadata_col_cat:
    #producing path name:
    path = f'{w_extract_dir}/w_unifrac-{column}-significance.qzv'
    name = f'w_unifrac-{column}-significance.qzv'
    
    #permanova visualisation production for every column
    ! qiime diversity beta-group-significance \
        --i-distance-matrix $div_dir/core-metrics-results/weighted_unifrac_distance_matrix.qza \
        --m-metadata-file $base_dir/metadata.tsv \
        --m-metadata-column  $column \
        --p-pairwise \
        --o-visualization $data_dir/w_unifrac_significance_permanova/w_unifrac-$column-significance.qzv
    
    #extract the qzv that was just produced
    vis.extract(f'{w_extract_dir}/{name}', output_dir = f'{w_extract_dir}/permanova_extracted')
    
    #get the uuid of it
    uid = vis.peek(f'{w_extract_dir}/{name}').uuid
    
    #go to the folder of the uuid and turn the permanova_paired table as a dataframe
    df_permanova = pd.read_csv(f'{w_extract_dir}/permanova_extracted/{uid}/data/permanova-pairwise.csv')
    
    #add a new column for the assignment to the original column
    df_permanova['column'] = column
    
    #concat the new dataframe with the ones before
    df_empty2 = pd.concat([df_empty, df_permanova])
    
#close the loop
  
df_empty2.to_csv(f'{data_dir}/w_unifrac_significance.csv')


[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac_significance_permanova/w_unifrac-GEN_age_cat-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac_significance_permanova/w_unifrac-GEN_bmi_cat-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac_significance_permanova/w_unifrac-GEN_cat-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac_significance_permanova/w_unifrac-GEN_dog-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac_significance_permanova/w_unifrac-GEN_last_move-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac_significance_permanova/w_unifrac-GEN_last_travel-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/w_unifrac_significance_permanova/w_unifrac-GEN_level_of_education-significance.qzv[0m
[0m[32mSaved Visualization to: poop_data/BetaDiversity/w_unifr

In [39]:
# Reload the saved weighted-UniFrac PERMANOVA table and show the comparisons
# with a q-value below 0.05, ordered by ascending q-value.
w_significance_csv = f'{data_dir}/w_unifrac_significance.csv'
df_w_unifrac_sig = pd.read_csv(w_significance_csv)
df_w_significant = df_w_unifrac_sig.loc[df_w_unifrac_sig['q-value'].lt(0.05)]
df_w_significant.sort_values(by='q-value')

Unnamed: 0.1,Unnamed: 0,Group 1,Group 2,Sample size,Permutations,pseudo-F,p-value,q-value,column
209,0,False,True,508,999,2.741888,0.001,0.001,HEA_ibd
190,0,False,True,508,999,2.147153,0.001,0.001,HEA_cdiff
173,5,I tend to be constipated (have difficulty pass...,I tend to have normal formed stool - Type 3 and 4,403,999,2.404935,0.001,0.005,HEA_bowel_movement_quality
172,4,I tend to be constipated (have difficulty pass...,I tend to have diarrhea (watery stool) - Type ...,139,999,2.380821,0.001,0.005,HEA_bowel_movement_quality
137,7,I have not taken antibiotics in the past year.,Week,353,999,3.67775,0.001,0.005,HEA_antibiotic_history
135,5,I have not taken antibiotics in the past year.,Month,357,999,2.437568,0.001,0.005,HEA_antibiotic_history
130,0,6 months,I have not taken antibiotics in the past year.,409,999,3.454242,0.001,0.005,HEA_antibiotic_history
257,2,Decreased more than 10 pounds,Remained stable,460,999,6.10233,0.001,0.006,HEA_weight_change
216,0,False,True,508,999,1.748658,0.007,0.007,HEA_sibo
161,14,Less than one,Two,180,999,2.084991,0.001,0.007,HEA_bowel_movement_frequency


So the 12 columns which have the highest significance in Beta Diversity analysis are: 

In [15]:
# The twelve columns named in the text above as having the highest significance.
top12_permanova_weighted = [
    'HEA_ibd',
    'HEA_cdiff',
    'HEA_bowel_movement_quality',
    'HEA_antibiotic_history',
    'HEA_weight_change',
    'HEA_sibo',
    'HEA_bowel_movement_frequency',
    'GEN_race',
    'HEA_exercise_frequency',
    'HEA_chickenpox',
    'GEN_age_cat',
    'HEA_allergic_to_peanuts',
]

These are the columns which seem to significantly explain some variance in our data (from unweighted adonis test):

In [14]:
# Unweighted ADONIS table, ordered by ascending p-value; show the rows with
# Pr(>F) below 0.05.
adonis_uw_path = f'{data_dir}/adonis_unweighted.tsv'
df_adonis_uw = pd.read_csv(adonis_uw_path, sep='\t').sort_values(by='Pr(>F)')
df_adonis_uw.loc[df_adonis_uw['Pr(>F)'].lt(0.05)]

Unnamed: 0,Df,SumsOfSqs,MeanSqs,F.Model,R2,Pr(>F)
GEN_age_cat,8,2.242974,0.280372,1.490067,0.022618,0.001
HEA_antibiotic_history,5,1.753241,0.350648,1.86356,0.01768,0.001
HEA_bowel_movement_frequency,6,1.45471,0.242452,1.288537,0.014669,0.003
GEN_bmi_cat,4,1.059552,0.264888,1.407777,0.010684,0.004
HEA_cdiff,1,0.335836,0.335836,1.784837,0.003387,0.008
GEN_last_travel,5,1.198868,0.239774,1.274304,0.012089,0.011
HEA_acid_reflux,1,0.285962,0.285962,1.519779,0.002884,0.024
HEA_allergic_to_peanuts,1,0.295631,0.295631,1.571166,0.002981,0.025
GEN_race,5,1.151559,0.230312,1.224018,0.011612,0.029
HEA_appendix_removed,1,0.276879,0.276879,1.471505,0.002792,0.037


And these are the columns which seem to significantly explain some variance in our data (from weighted adonis test):

In [23]:
# Weighted ADONIS table, ordered by ascending p-value; show the rows with
# Pr(>F) below 0.05.
adonis_w_path = f'{data_dir}/adonis_weighted.tsv'
df_adonis_w = pd.read_csv(adonis_w_path, sep='\t').sort_values(by='Pr(>F)')
df_adonis_w.loc[df_adonis_w['Pr(>F)'].lt(0.05)]

Unnamed: 0,Df,SumsOfSqs,MeanSqs,F.Model,R2,Pr(>F)
GEN_age_cat,8,2.03254,0.254068,1.794976,0.027329,0.005
HEA_cdiff,1,0.601527,0.601527,4.249763,0.008088,0.005
HEA_csection,1,0.487865,0.487865,3.446744,0.00656,0.013
