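This notebook checks that our aspiration classifiers are not confounded with proton pump inhibitor (PPI) use. For the samples (single-site classifiers) or subjects (multi-site classifiers) used by each classifier, we cross-tabulate aspiration status against PPI status and test for an association with Fisher's exact test.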

In [4]:
import pandas as pd

from IPython.display import display
from scipy.stats import fisher_exact

In [5]:
import os, sys
src_dir = os.path.normpath(os.path.join(os.getcwd(), '../util'))
sys.path.append(src_dir)
import util

## Single-site classifiers


In [6]:
def get_single_site_samples(s):
    """
    List the sample IDs from one site that are labelled as either
    normal or aspirating.

    Reads the notebook-level ``meta`` dataframe (indexed by sample ID)
    and the ``nml`` / ``asp`` label constants.

    Parameters
    ----------
    s : str
        Value to match against the 'site' column of ``meta``.

    Returns
    -------
    list of str
        Normal samples followed by aspiration samples, minus any
        sample whose ID ends with '2', 'F' or 'F2T' or starts with '05'.
    """
    site_meta = meta.query('site == @s')
    normal_ids = site_meta.query('mbs_consolidated == @nml').index.tolist()
    aspiration_ids = site_meta.query('mbs_consolidated == @asp').index.tolist()

    # NOTE(review): these ID patterns presumably mark repeat/follow-up
    # samples and an excluded batch of subjects -- confirm against the
    # sample naming scheme.
    excluded_suffixes = ('2', 'F', 'F2T')
    excluded_prefix = '05'

    return [
        sample_id
        for sample_id in normal_ids + aspiration_ids
        if not sample_id.endswith(excluded_suffixes)
        and not sample_id.startswith(excluded_prefix)
    ]

def investigate_single_site(s):
    """
    Display the aspiration-by-PPI contingency table for one site and
    print the result of Fisher's exact test on it.

    Uses the notebook-level ``meta`` dataframe, both directly and via
    get_single_site_samples.

    Parameters
    ----------
    s : str
        Site to investigate; matched against the 'site' metadata column
        and printed as the section label.
    """
    smpls = get_single_site_samples(s)
    print('\n' + s)
    confmat = pd.crosstab(
        meta.loc[smpls, 'mbs_consolidated'],
        meta.loc[smpls, 'ppi_consolidated'])
    display(confmat)

    # Samples with conflicting PPI records are shown in the table above but
    # left out of the test. errors='ignore' avoids a KeyError for sites that
    # happen to have no 'conflicting' samples at all.
    testmat = confmat.drop('conflicting', axis=1, errors='ignore')
    print(fisher_exact(testmat))

In [7]:
def make_combined_site_df(tidydf, sites, mbs_col):
    """
    Build a wide-form abundance table covering the requested sites.

    Rows are subjects and columns are the distinct 'otu_w_site' values.
    Subjects lacking a value in `mbs_col`, and subjects with a missing
    abundance in any column (e.g. because one of the sites was not
    sampled for them), are dropped.

    Parameters
    ----------
    tidydf : pandas DataFrame
        Long-form data with 'subject_id', 'site', 'otu_w_site' and
        'abun' columns, plus the `mbs_col` column.
    sites : list
        Sites to keep (matched against the 'site' column).
    mbs_col : str
        Name of the metadata column that must be non-null for a row
        to be kept.

    Returns
    -------
    pandas DataFrame
        Indexed by 'subject_id', one column per 'otu_w_site' value,
        containing 'abun' values with no NaNs.
    """
    rows_for_sites = tidydf.query('site == @sites')
    rows_with_label = rows_for_sites.dropna(subset=[mbs_col])

    wide = rows_with_label.pivot(
        index='subject_id',
        columns='otu_w_site',
        values='abun',
    )

    # Keep only subjects that have a value in every column.
    return wide.dropna(axis=0)

def get_multi_site_subjects(sites):
    """
    List the subjects that survive make_combined_site_df for `sites`.

    Works on the notebook-level ``tidydf`` and requires non-null
    'mbs_consolidated' metadata.

    Parameters
    ----------
    sites : list
        Sites that each returned subject must have data for.

    Returns
    -------
    list
        Subject IDs (the index of the combined wide-form dataframe).
    """
    return make_combined_site_df(
        tidydf, sites, 'mbs_consolidated').index.tolist()

def investigate_multi_site_subjects(sites):
    """
    Display the aspiration-by-PPI contingency table for the subjects
    used by a multi-site classifier and print the result of Fisher's
    exact test on it.

    Uses the notebook-level ``tidydf`` dataframe.

    Parameters
    ----------
    sites : list
        Sites that each included subject must have data for.
    """
    print(sites)

    subjects = get_multi_site_subjects(sites)

    # One row per distinct (subject, aspiration status, PPI status).
    # NOTE(review): assumes both labels are constant within a subject;
    # otherwise a subject would be counted more than once -- confirm.
    meta_info = tidydf\
        .query('subject_id == @subjects')\
        [['subject_id', 'mbs_consolidated', 'ppi_consolidated']]\
        .drop_duplicates()

    confmat = pd.crosstab(
        meta_info['mbs_consolidated'],
        meta_info['ppi_consolidated'])

    display(confmat)

    # Only the 'off'/'on' columns belong in the 2x2 test. 'conflicting'
    # subjects and subjects whose missing PPI status was filled with the
    # string 'nan' are shown above but excluded here. errors='ignore'
    # avoids a KeyError when either column is absent for this site set.
    testmat = confmat.drop(['conflicting', 'nan'], axis=1, errors='ignore')
    print(fisher_exact(testmat))
    
# Values of the 'mbs_consolidated' metadata column for the two groups being
# compared; referenced as @nml / @asp in get_single_site_samples' queries.
nml = 'Normal'
asp = 'Aspiration/Penetration'

In [8]:
# Paths to the cleaned OTU table and sample metadata, relative to the
# notebook's working directory.
fotu = '../../data/clean/rosen.otu_table.rel_abun.clean'
fmeta = '../../data/clean/rosen.metadata.clean'

# Both files are tab-separated; the first column becomes the index.
# `meta` is indexed by sample ID (see get_single_site_samples).
df = pd.read_csv(fotu, sep='\t', index_col=0)
meta = pd.read_csv(fmeta, sep='\t', index_col=0)

In [9]:
# Run the PPI confounding check for each single-site classifier.
for site_name in ('bal', 'gastric_fluid', 'throat_swab'):
    investigate_single_site(site_name)


bal


ppi_consolidated,conflicting,off,on
mbs_consolidated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aspiration/Penetration,1,16,16
Normal,2,11,20


(1.8181818181818181, 0.31139218212169373)

gastric_fluid


ppi_consolidated,conflicting,off,on
mbs_consolidated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aspiration/Penetration,1,19,21
Normal,1,22,25


(1.0281385281385282, 1.0)

throat_swab


ppi_consolidated,conflicting,off,on
mbs_consolidated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aspiration/Penetration,2,14,20
Normal,1,22,20


(0.63636363636363635, 0.36359075997842771)


## Multiple sites

In [10]:
# Long-form version of the OTU table with the aspiration and PPI metadata
# attached (see util.tidyfy_otu for the exact columns produced).
tidydf = util.tidyfy_otu(df, meta, 'mbs_consolidated', cols=['ppi_consolidated'])
# Replace missing PPI status with the literal string 'nan'.
# NOTE(review): presumably so that subjects without PPI information show up
# as their own column in pd.crosstab instead of being dropped -- confirm.
tidydf['ppi_consolidated'] = tidydf['ppi_consolidated'].fillna('nan')

Tidying data... 
Finished.


In [11]:
# Run the PPI confounding check for every pairwise site combination ...
twosites = ['bal', 'throat_swab']
investigate_multi_site_subjects(twosites)

twosites = ['bal', 'gastric_fluid']
investigate_multi_site_subjects(twosites)

twosites = ['throat_swab', 'gastric_fluid']
investigate_multi_site_subjects(twosites)

# ... and for subjects that have all three sites.
allsites = ['bal', 'throat_swab', 'gastric_fluid']
investigate_multi_site_subjects(allsites)

['bal', 'throat_swab']


ppi_consolidated,conflicting,off,on
mbs_consolidated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aspiration/Penetration,1,9,15
Normal,1,9,13


(0.8666666666666667, 1.0)
['bal', 'gastric_fluid']


ppi_consolidated,conflicting,off,on
mbs_consolidated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aspiration/Penetration,1,14,14
Normal,1,10,17


(1.7, 0.41824475302455044)
['throat_swab', 'gastric_fluid']


ppi_consolidated,conflicting,off,on
mbs_consolidated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aspiration/Penetration,2,12,19
Normal,0,19,16


(0.53185595567867039, 0.22723718261648584)
['bal', 'throat_swab', 'gastric_fluid']


ppi_consolidated,conflicting,off,on
mbs_consolidated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aspiration/Penetration,1,8,14
Normal,0,8,11


(0.7857142857142857, 0.75684303636771089)
