This notebook prints out the exchanged OTUs in differentially prevalent order.

In [1]:
import pandas as pd
import numpy as np
from statsmodels.sandbox.stats.multicomp import multipletests


import os, sys
src_dir = os.path.normpath(os.path.join(os.getcwd(), '../util'))
sys.path.append(src_dir)
from util import convert_to_latex



In [2]:
fpreva = '../../data/analysis/prevalence.partial_corrs.nthresh10-qthresh01-rthresh0.txt'
preva = pd.read_csv(fpreva, sep='\t')
preva.head()

Unnamed: 0,otu,prevalence_exchange,meta_var,meta_val,site_comparison,n_patients
0,k__Bacteria;p__;c__;o__;f__;g__;s__;d__denovo364,0.215909,all_patients,all_patients,bal-gastric_fluid,88
1,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,0.284091,all_patients,all_patients,bal-gastric_fluid,88
2,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,0.431818,all_patients,all_patients,bal-gastric_fluid,88
3,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,0.227273,all_patients,all_patients,bal-gastric_fluid,88
4,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,0.227273,all_patients,all_patients,bal-gastric_fluid,88


In [3]:
# Number of exchange OTUs per site combination
preva.query('meta_var == "all_patients"').groupby('site_comparison').size()

site_comparison
bal-gastric_fluid             76
bal-throat_swab               13
gastric_fluid-throat_swab    117
dtype: int64

# BAL-oropharyngeal OTUs

In [4]:
# Let's look just at the aspirator vs. non-aspirator comparisons
preva = preva.query('meta_var == "mbs_consolidated"')

# And just the bal-throat_swab comparison
balthr_preva = preva.query('site_comparison == "bal-throat_swab"')

In [5]:
# Differential prevalence
diffpreva = balthr_preva.pivot(index='otu', columns='meta_val', values='prevalence_exchange')
diffpreva['asp_minus_nml'] = diffpreva['Aspiration/Penetration'] - diffpreva['Normal']
diffpreva['abs_diff'] = abs(diffpreva['asp_minus_nml'])
diffpreva.sort_values(by='abs_diff', ascending=False)

meta_val,Aspiration/Penetration,Normal,asp_minus_nml,abs_diff
otu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__;s__;d__denovo93,0.48,0.086957,0.393043,0.393043
k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Fusobacterium;s__;d__denovo8,0.68,0.304348,0.375652,0.375652
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Rothia;s__;d__denovo288,0.44,0.086957,0.353043,0.353043
k__Bacteria;p__Firmicutes;c__Negativicutes;o__Selenomonadales;f__Veillonellaceae;g__Veillonella;s__;d__denovo26,0.6,0.26087,0.33913,0.33913
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__;s__;d__denovo19,0.76,0.434783,0.325217,0.325217
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Porphyromonas;s__;d__denovo13,0.68,0.391304,0.288696,0.288696
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__;d__denovo323,0.4,0.130435,0.269565,0.269565
k__Bacteria;p__Firmicutes;c__Negativicutes;o__Selenomonadales;f__Veillonellaceae;g__Centipeda;s__;d__denovo68,0.32,0.086957,0.233043,0.233043
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__;d__denovo213,0.36,0.173913,0.186087,0.186087
k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Leptotrichiaceae;g__Streptobacillus;s__;d__denovo104,0.4,0.217391,0.182609,0.182609


In [6]:
## Do some text manipulations to print these values out in latex table format

# Split OTU names to individual phyla
table = pd.concat((diffpreva.reset_index(), diffpreva.reset_index()['otu'].str.split(';', expand=True)), axis=1)
table['family'] = table[4].str.split('f__').str[1]
table['genus'] = table[5].str.split('g__').str[1]
# Re-format percentages
table['Aspiration/Penetration'] = table['Aspiration/Penetration'].apply(lambda x: '{:.1f}'.format(x*100))
table['Normal'] = table['Normal'].apply(lambda x: '{:.1f}'.format(x*100))
table['Difference'] = table['asp_minus_nml'].apply(lambda x: '{:.1f}'.format(x*100))
table = table.sort_values(by='asp_minus_nml', ascending=False)

col_order = ['family', 'genus', 'Normal', 'Aspiration/Penetration', 'Difference']
table = table[col_order]
print('\n'.join(table.apply(convert_to_latex, axis=1)))

Flavobacteriaceae &  & 8.7 & 48.0 & 39.3 \\ 
Fusobacteriaceae & Fusobacterium & 30.4 & 68.0 & 37.6 \\ 
Micrococcaceae & Rothia & 8.7 & 44.0 & 35.3 \\ 
Veillonellaceae & Veillonella & 26.1 & 60.0 & 33.9 \\ 
Prevotellaceae &  & 43.5 & 76.0 & 32.5 \\ 
Porphyromonadaceae & Porphyromonas & 39.1 & 68.0 & 28.9 \\ 
Streptococcaceae & Streptococcus & 13.0 & 40.0 & 27.0 \\ 
Veillonellaceae & Centipeda & 8.7 & 32.0 & 23.3 \\ 
Prevotellaceae & Prevotella & 17.4 & 36.0 & 18.6 \\ 
Leptotrichiaceae & Streptobacillus & 21.7 & 40.0 & 18.3 \\ 
Fusobacteriaceae & Fusobacterium & 17.4 & 32.0 & 14.6 \\ 
Aerococcaceae & Abiotrophia & 21.7 & 28.0 & 6.3 \\ 
Neisseriaceae & Neisseria & 17.4 & 20.0 & 2.6 \\ 


In [7]:
# How many patients were used for these prevalence calcs?
balthr_preva[['meta_val', 'n_patients']].drop_duplicates()

Unnamed: 0,meta_val,n_patients
1305,Aspiration/Penetration,25
1318,Normal,23


# BAL-gastric OTUs

In [8]:
balgas_preva = preva.query('site_comparison == "bal-gastric_fluid"')

# Pivot to get differential prevalence
diffpreva = balgas_preva.pivot(index='otu', columns='meta_val', values='prevalence_exchange')

# Sort values by differential prevalence in aspirators
diffpreva['asp_minus_nml'] = diffpreva['Aspiration/Penetration'] - diffpreva['Normal']
#diffpreva['abs_diff'] = abs(diffpreva['asp_minus_nml'])
diffpreva.sort_values(by='asp_minus_nml', ascending=False).head(10)

meta_val,Aspiration/Penetration,Normal,asp_minus_nml
otu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Neisseria;s__;d__denovo124,0.413793,0.071429,0.342365
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Porphyromonas;s__;d__denovo53,0.62069,0.285714,0.334975
k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Haemophilus;s__;d__denovo3,0.827586,0.5,0.327586
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Coprococcus;s__;d__denovo216,0.37931,0.107143,0.272167
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Rothia;s__;d__denovo288,0.413793,0.142857,0.270936
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__;d__denovo23,0.517241,0.25,0.267241
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Carnobacteriaceae;g__Granulicatella;s__;d__denovo33,0.586207,0.321429,0.264778
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillales_Incertae_Sedis_XI;g__Gemella;s__;d__denovo20,0.689655,0.428571,0.261084
k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Haemophilus;s__;d__denovo7,0.827586,0.571429,0.256158
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces;s__;d__denovo60,0.413793,0.178571,0.235222


In [9]:
## Do some text manipulations to print these values out in latex table format
table = pd.concat((diffpreva.reset_index(), diffpreva.reset_index()['otu'].str.split(';', expand=True)), axis=1)

# Split OTU names to individual phyla
# Prettify results
table['family'] = table[4].str.split('f__').str[1]
table['genus'] = table[5].str.split('g__').str[1]

# Fill in empty families with order
table['family'] = table['family'].where(table['family'] != '', "Unknown " + table[3].str.split('o__').str[1])
# Fill in empty orders with "Unkown bacteria" (I checked manually that this one is unannotated all the way)
table['family'] = table['family'].where(table['family'] != 'Unknown ', "Unknown Bacteria")

# Re-format percentages
table['Aspiration/Penetration'] = table['Aspiration/Penetration'].apply(lambda x: '{:.1f}'.format(x*100))
table['Normal'] = table['Normal'].apply(lambda x: '{:.1f}'.format(x*100))
table['Difference'] = table['asp_minus_nml'].apply(lambda x: '{:.1f}'.format(x*100))

table = table.sort_values(by='asp_minus_nml', ascending=False)

col_order = ['family', 'genus', 'Normal', 'Aspiration/Penetration', 'Difference']

# Print Ruminococcus2 in response to reviewer comments
print(table.query('genus == "Ruminococcus2"')['otu'].values)

table = table[col_order]
print('\n'.join(table.apply(convert_to_latex, axis=1)))

['k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Ruminococcus2;s__;d__denovo57']
Neisseriaceae & Neisseria & 7.1 & 41.4 & 34.2 \\ 
Porphyromonadaceae & Porphyromonas & 28.6 & 62.1 & 33.5 \\ 
Pasteurellaceae & Haemophilus & 50.0 & 82.8 & 32.8 \\ 
Lachnospiraceae & Coprococcus & 10.7 & 37.9 & 27.2 \\ 
Micrococcaceae & Rothia & 14.3 & 41.4 & 27.1 \\ 
Prevotellaceae & Prevotella & 25.0 & 51.7 & 26.7 \\ 
Carnobacteriaceae & Granulicatella & 32.1 & 58.6 & 26.5 \\ 
Bacillales\_Incertae\_Sedis\_XI & Gemella & 42.9 & 69.0 & 26.1 \\ 
Pasteurellaceae & Haemophilus & 57.1 & 82.8 & 25.6 \\ 
Actinomycetaceae & Actinomyces & 17.9 & 41.4 & 23.5 \\ 
Streptococcaceae & Streptococcus & 39.3 & 62.1 & 22.8 \\ 
Lachnospiraceae & Oribacterium & 14.3 & 34.5 & 20.2 \\ 
Leptotrichiaceae & Streptobacillus & 17.9 & 37.9 & 20.1 \\ 
Lachnospiraceae & Lachnoanaerobaculum & 17.9 & 37.9 & 20.1 \\ 
Fusobacteriaceae & Fusobacterium & 42.9 & 62.1 & 19.2 \\ 
Prevotellaceae &  & 50.0 & 69.0 

In [10]:
# how many patients were used for these prevalence calcs?
balgas_preva[['meta_val', 'n_patients']].drop_duplicates()

Unnamed: 0,meta_val,n_patients
76,Aspiration/Penetration,29
152,Normal,28


# Significance

In [11]:
fname = '../../data/analysis/exchange.with_partial_corrs.null_20reps.txt'

nthresh = 10
rthresh = 0
qthresh = 0.1

df = pd.read_csv(fname, sep='\t')

df = df.query('n_partial >= @nthresh')
df.head()

Unnamed: 0,otu,site1,site2,site3,r_site12,p_site12,n_site12,r_partial,p_partial,n_partial,shuffle_iter
0,k__Bacteria;p__;c__;o__;f__;g__;s__;d__denovo364,bal,gastric_fluid,throat_swab,0.337461,0.170838,18,0.338192,0.093,13,0
2,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,gastric_fluid,throat_swab,bal,-0.207389,0.28037,29,-0.20765,0.8665,18,0
3,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,bal,gastric_fluid,throat_swab,0.230943,0.301112,22,0.227302,0.157,12,0
4,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,bal,throat_swab,gastric_fluid,0.36087,0.083195,24,0.332241,0.044,20,0
5,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,gastric_fluid,throat_swab,bal,-0.137014,0.222569,81,-0.138964,0.894,47,0


In [12]:
# Need to correct for multiple tests within each iteration only
for g, subdf in df.groupby('shuffle_iter'):
    _, df.loc[subdf.index, 'q_partial'], _, _ = multipletests(subdf['p_partial'], method='fdr_bh')

df.head()

Unnamed: 0,otu,site1,site2,site3,r_site12,p_site12,n_site12,r_partial,p_partial,n_partial,shuffle_iter,q_partial
0,k__Bacteria;p__;c__;o__;f__;g__;s__;d__denovo364,bal,gastric_fluid,throat_swab,0.337461,0.170838,18,0.338192,0.093,13,0,0.682297
2,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,gastric_fluid,throat_swab,bal,-0.207389,0.28037,29,-0.20765,0.8665,18,0,0.982952
3,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,bal,gastric_fluid,throat_swab,0.230943,0.301112,22,0.227302,0.157,12,0,0.805345
4,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,bal,throat_swab,gastric_fluid,0.36087,0.083195,24,0.332241,0.044,20,0,0.495
5,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,gastric_fluid,throat_swab,bal,-0.137014,0.222569,81,-0.138964,0.894,47,0,0.982952


In [13]:
exchanged = df.query('(r_partial > @rthresh) & (q_partial < @qthresh)')
exchanged['site_comparison'] = exchanged['site1'] + '-' + exchanged['site2']
exchanged.groupby(['shuffle_iter', 'site_comparison']).size()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


shuffle_iter  site_comparison          
5             gastric_fluid-throat_swab    1
6             bal-gastric_fluid            1
              gastric_fluid-throat_swab    1
10            bal-gastric_fluid            2
              bal-throat_swab              2
13            bal-gastric_fluid            1
dtype: int64

The dataframe above shows how many OTUs were found to be exchanged in each shuffle iteration