This notebook analyzes the results of two sets of aspiration classifiers: those built on the entire microbial communities, and those built only on the exchanged OTUs.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

from scipy.stats import ranksums

# to pretty print dataframes
from IPython.display import display

# Use seaborn's plain white theme for every figure drawn in this notebook.
sns.set_style(style='white')

# Classifiers based on entire communities

In [2]:
# Classifiers built on the entire communities: load their summary statistics.
#
# The data directory defaults to the original hard-coded location, but can be
# overridden with the ASPIRATION_DATA_DIR environment variable so the notebook
# also runs on machines where the repository lives somewhere else.
data_dir = os.environ.get(
    'ASPIRATION_DATA_DIR',
    '/Users/claire/github/aspiration-analysis/data/analysis')
fnclssum = os.path.join(data_dir, 'rf_results.summary_stats.txt')
#fnclsroc = os.path.join(data_dir, 'rf_results.rocs.txt')

# The summary file is tab-separated.
summary = pd.read_csv(fnclssum, sep='\t')
#rocs = pd.read_csv(fnclsroc, sep='\t')

# log10-transformed Fisher p-value, stored next to the raw p-value.
summary['log_fisher_p'] = np.log10(summary['fisher_p'])

In [3]:
# Preview the first five rows of the loaded summary table.
summary.head(n=5)

Unnamed: 0,site,n_asp,n_nml,n_feats,auc,fisher_p,log_fisher_p
0,bal,33,33,928,0.629477,0.321257,-0.493147
1,throat_swab,36,43,835,0.683463,0.017014,-1.769198
2,gastric_fluid,41,48,1034,0.660315,0.084866,-1.071265
3,bal-throat_swab,25,23,1609,0.77913,0.007197,-2.14287
4,bal-gastric_fluid,29,28,1829,0.66133,0.01671,-1.777027


## Main text info

In the main text, we'll just report the mean AUC and p-values for relevant comparisons.

In [4]:
## Per-site AUCs and Fisher p-values.
## There is only one value per site now, so .mean() does not actually average
## anything; the groupby/mean call is kept because it makes the table display
## nicely, indexed by site.
order = [
    'bal',
    'throat_swab',
    'gastric_fluid',
    'bal-throat_swab',
    'bal-gastric_fluid',
    'throat_swab-gastric_fluid',
    'bal-throat_swab-gastric_fluid',
]
(
    summary
    .groupby('site')
    .mean()
    .loc[order, ['auc', 'fisher_p', 'n_nml', 'n_asp']]
)

Unnamed: 0_level_0,auc,fisher_p,n_nml,n_asp
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bal,0.629477,0.321257,33,33
throat_swab,0.683463,0.017014,43,36
gastric_fluid,0.660315,0.084866,48,41
bal-throat_swab,0.77913,0.007197,23,25
bal-gastric_fluid,0.66133,0.01671,28,29
throat_swab-gastric_fluid,0.733929,0.00301,35,32
bal-throat_swab-gastric_fluid,0.783753,0.011912,19,23


In [5]:
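## No figure: the classifiers are no longer run for 100 repetitions, so there is
## no per-site distribution of AUCs / Fisher p-values left to box-plot.
## The plotting code is kept below, commented out, for reference.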

# fig, ax = plt.subplots(2, 1, figsize=(9, 8))

# ## Set up some stuff
# boxprops = {'edgecolor': 'k', 'facecolor': 'w'}
# lineprops = {'color': 'k'}

# xlabels = {'bal': 'BAL', 
#            'throat_swab': 'Oropharyngeal\nswab',
#            'gastric_fluid': 'Gastric\nfluid',
#            'bal-throat_swab': 'BAL and\noropharyngeal\nswab',
#            'bal-gastric_fluid': 'BAL and\ngastric\nfluid',
#            'throat_swab-gastric_fluid': 'Oropharyngeal\nswab and\ngastric fluid',
#            'bal-throat_swab-gastric_fluid': 'All three\nsites'}


# ## AUCs
# sns.boxplot(x='site', y='auc', data=summary, ax=ax[0], width=0.5,
#             **{'boxprops': boxprops, 'medianprops': lineprops,
#                'whiskerprops': lineprops, 'capprops': lineprops})
# ## Plot AUC = 0.7
# #ax[0].plot(ax[0].get_xlim(), 2*[0.7], '--', color='gray', alpha=0.5)

# # Axes
# ax[0].set_ylim([0.4, 0.95])

# # Labels
# ax[0].set_ylabel('AUC', fontsize='x-large')
# yticks = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# ax[0].set_yticks(yticks)
# ax[0].set_yticklabels(yticks, fontsize='large')

# ax[0].set_xticklabels('')
# ax[0].set_xlabel('')

# ## Fisher pvalues
# sns.boxplot(x='site', y='log_fisher_p', data=summary, ax=ax[1], width=0.5,
#             **{'boxprops': boxprops, 'medianprops': lineprops,
#                'whiskerprops': lineprops, 'capprops': lineprops})
# # Plot p = 0.05
# ax[1].plot(ax[1].get_xlim(), 2*[np.log10(0.05)], '--', color='gray', alpha=0.5)

# # Labels
# ax[1].set_ylabel('log$_{10}$(Fisher p-value)', fontsize='x-large')
# yticks = [-5, -4, -3, -2, -1, 0]
# ax[1].set_ylim([-5, 0.1])
# ax[1].set_yticks(yticks)
# ax[1].set_yticklabels(yticks, fontsize='large')

# ax[1].set_xticklabels([xlabels[i.get_text()] for i in ax[1].get_xticklabels()],
#                   fontsize='large')
# ax[1].set_xlabel('')

# #fig.savefig('../../final/figures/figure9.auc_fisherp_asp_classifiers.png', dpi=200)

# Classifiers built on exchanged OTUs

In [6]:
# Classifiers built on the exchanged OTUs only: load their summary statistics.
#
# The data directory defaults to the original hard-coded location, but can be
# overridden with the ASPIRATION_DATA_DIR environment variable so the notebook
# also runs on machines where the repository lives somewhere else.
data_dir = os.environ.get(
    'ASPIRATION_DATA_DIR',
    '/Users/claire/github/aspiration-analysis/data/analysis')
fnclssum = os.path.join(data_dir, 'rf_results.exchanged.summaries.txt')
#fnclsroc = os.path.join(data_dir, 'rf_results.exchanged.rocs.txt')

# NOTE: this rebinds `summary` (and `fnclssum`), replacing the whole-community
# table loaded earlier in the notebook. Cells that expect the whole-community
# results must be run before this one.
summary = pd.read_csv(fnclssum, sep='\t')
#rocs = pd.read_csv(fnclsroc, sep='\t')

# log10-transformed Fisher p-value, stored next to the raw p-value.
summary['log_fisher_p'] = np.log10(summary['fisher_p'])

In [7]:
# Preview the first five rows of the exchanged-OTU summary table.
summary.head(n=5)

Unnamed: 0,site,classifier_type,n_asp,n_nml,n_feats,auc,fisher_p,log_fisher_p
0,bal,abundance,33,33,13,0.578512,0.212743,-0.672144
1,throat_swab,abundance,36,43,13,0.65407,0.1083,-0.965371
2,bal-throat_swab,abundance,25,23,26,0.697391,0.080967,-1.09169
3,bal,presence,33,33,13,0.655647,0.080438,-1.09454
4,throat_swab,presence,36,43,13,0.567829,0.353404,-0.451729


In [8]:
## Mean AUCs
# `order` is not used by the table below; it is re-declared here unchanged so
# the name stays defined with the same value for any code that relies on it.
order = ['bal', 'throat_swab', 'gastric_fluid',
         'bal-throat_swab', 'bal-gastric_fluid', 'throat_swab-gastric_fluid',
         'bal-throat_swab-gastric_fluid']
# I forgot to include a column indicating which exchanged OTUs this was based off of,
# but you can infer it from the n_feats column. With the n_feats values that
# appear in the current results (see the table displayed by this cell):
# n_feats = 13 or 26 corresponds to bal-throat exchanged OTUs
# n_feats = 76 or 152 corresponds to bal-gastric exchanged OTUs
# (the larger value of each pair belongs to the combined two-site classifier).
#
# Select the reported columns *before* aggregating: only those columns are
# averaged, and the cell keeps working on pandas >= 2.0 even if a non-numeric
# column (e.g. an exchanged-OTU label) is later added to the results file.
summary.groupby(['classifier_type', 'n_feats', 'site'])[
    ['auc', 'fisher_p', 'n_nml', 'n_asp']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,auc,fisher_p,n_nml,n_asp
classifier_type,n_feats,site,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abundance,13,bal,0.578512,0.212743,33,33
abundance,13,throat_swab,0.65407,0.1083,43,36
abundance,26,bal-throat_swab,0.697391,0.080967,23,25
abundance,76,bal,0.560147,0.455303,33,33
abundance,76,gastric_fluid,0.659553,0.030947,48,41
abundance,152,bal-gastric_fluid,0.678571,0.008075,28,29
presence,13,bal,0.655647,0.080438,33,33
presence,13,bal-throat_swab,0.626087,0.048693,23,25
presence,13,throat_swab,0.567829,0.353404,43,36
presence,76,bal,0.5955,0.139108,33,33
