This notebook builds classifiers to distinguish aspirators from non-aspirators using different microbial communities (including combinations).

In [56]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from scipy.stats import fisher_exact

from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [57]:
import os, sys
# Make the project's helper module importable. '../util' is resolved against
# the current working directory, so the import below only succeeds when the
# kernel's working directory sits next to that folder.
src_dir = os.path.normpath(os.path.join(os.getcwd(), '../util'))
sys.path.append(src_dir)
import util

In [58]:
# Input files (tab-separated): OTU table and sample metadata.
# Per the file names these are the cleaned, relative-abundance versions.
fotu = '../../data/clean/rosen.otu_table.rel_abun.clean'
fmeta = '../../data/clean/rosen.metadata.clean'

# The first column of each file becomes the index; later cells look rows of
# df up by index values taken from meta, so both share the same IDs.
df = pd.read_csv(fotu, sep='\t', index_col=0)
meta = pd.read_csv(fmeta, sep='\t', index_col=0)

In [59]:
# Metadata column holding the aspiration label
mbs_col = 'mbs_consolidated'

# Check that each aspiration patient has only one sample per site:
# the largest (subject_id, site) group among labelled samples should be 1.
meta.dropna(subset=[mbs_col]).groupby(['subject_id', 'site']).size().max()

1

In [60]:
# Keep only samples that have an aspiration label
meta = meta.dropna(subset=[mbs_col])
# Long-form table built by the project helper; the preview below shows one
# row per (sample, otu) with abundance, label, site, subject and PPI columns.
tidydf = util.tidyfy_otu(df, meta, mbs_col, cols=['ppi_consolidated'])
# Missing PPI status is replaced by the literal string 'nan', so it appears
# as its own category wherever ppi_consolidated is tabulated from tidydf.
tidydf['ppi_consolidated'] = tidydf['ppi_consolidated'].fillna('nan')
tidydf.head()

Tidying data... 
Finished.


Unnamed: 0,sample,otu,abun,mbs_consolidated,site,subject_id,ppi_consolidated,otu_w_site
35,02-096-6G,k__Bacteria;p__Firmicutes;c__Negativicutes;o__...,0.0,Normal,gastric_fluid,02-096-6,off,k__Bacteria;p__Firmicutes;c__Negativicutes;o__...
621,02-096-6G,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...,0.0,Normal,gastric_fluid,02-096-6,off,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
1207,02-096-6G,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...,0.0,Normal,gastric_fluid,02-096-6,off,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
1793,02-096-6G,k__Bacteria;p__Actinobacteria;c__Actinobacteri...,0.0,Normal,gastric_fluid,02-096-6,off,k__Bacteria;p__Actinobacteria;c__Actinobacteri...
2379,02-096-6G,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...,0.0,Normal,gastric_fluid,02-096-6,off,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...


In [61]:
# Define some global variables
random_state = 12345  # seed given to the random forests and to util.cv_and_roc
nml = 'Normal'  # label value of non-aspirators
asp = 'Aspiration/Penetration'  # label value of aspirators
aspdict = {'Normal': 0, 'Aspiration/Penetration': 1}  # label -> class value used for Y
sites = util.get_sites()  # site names provided by the project helper
sites

['bal', 'gastric_fluid', 'throat_swab']

# Single-site classifiers

These classifiers are trained using only one community per patient.

In [62]:
def single_site_classifier(s):
    """
    Train and evaluate a random forest on the samples of a single site.

    Samples from site `s` labelled Normal or Aspiration/Penetration are
    looked up in the OTU table and handed to util.cv_and_roc. The sample
    counts, the mean AUC, the ROC plot, Fisher's exact test on the
    confusion matrix and the confusion matrix itself are then shown.

    Parameters
    ----------
    s : str
        Site to use (a value of meta['site']).

    Global variables used are: meta, df (OTU table), nml, asp, aspdict,
    and random_state.
    """
    normal_ids = meta\
        .query('mbs_consolidated == @nml and site == @s')\
        .index.tolist()
    aspirator_ids = meta\
        .query('mbs_consolidated == @asp and site == @s')\
        .index.tolist()

    # Non-aspirators first, aspirators second; labels follow the same order
    features = df.loc[normal_ids + aspirator_ids, :]
    Y = []
    for sample_id in features.index:
        Y.append(aspdict[meta.loc[sample_id, 'mbs_consolidated']])
    X = features.values

    forest = RandomForestClassifier(n_estimators=1000,
                                    random_state=random_state)
    cv = util.cv_and_roc(forest, X, Y, random_state=random_state)

    # Print results
    print('N non-asp = {}, N asp = {}'.format(len(normal_ids),
                                              len(aspirator_ids)))
    mean_auc = np.mean(cv['auc_list'])
    print('AUC = {}'.format(mean_auc))
    util.plot_auc_from_list(cv['tpr_list'], cv['mean_fpr'])
    fisher = fisher_exact(cv['conf_mat'])
    print('Fisher results : {}'.format(fisher))
    display(cv['conf_mat'])

In [63]:
# Classifier trained on BAL samples only
s = 'bal'
single_site_classifier(s)

N non-asp = 36, N asp = 33
AUC = 0.65612244898


NameError: global name 'plot_auc_from_list' is not defined

In [None]:
# Classifier trained on gastric fluid samples only
s = 'gastric_fluid'
single_site_classifier(s)

In [None]:
# Classifier trained on throat swab samples only
s = 'throat_swab'
single_site_classifier(s)

# Two-site classifiers

These classifiers are trained using two sites per patient.

In [None]:
def make_combined_site_df(tidydf, sites, mbs_col):
    """
    Build a wide-form table (subjects x site-specific OTUs) for `sites`.

    Rows of `tidydf` from other sites, or without a value in `mbs_col`,
    are discarded before pivoting. After pivoting, any subject with a
    missing value (i.e. a subject lacking one of the sites) is dropped.

    Parameters
    ----------
    tidydf : pandas DataFrame
        Long-form data with 'subject_id', 'site', 'otu_w_site' and 'abun'
        columns, plus the `mbs_col` column.
    sites : list
        list of sites to keep
    mbs_col : str
        Name of the metadata column that must be non-missing.

    Returns
    -------
    pandas DataFrame
        Index 'subject_id', columns 'otu_w_site', values 'abun'.
    """
    # Restrict to the requested sites and to labelled rows
    labelled = tidydf.query('site == @sites').dropna(subset=[mbs_col])

    # One row per subject, one column per site-tagged OTU
    wide = labelled.pivot(index='subject_id', columns='otu_w_site',
                          values='abun')

    # Subjects missing any requested site have NaNs in their row
    complete = wide.dropna(axis=0)
    return complete

def multi_site_classifier(tidydf, sites):
    """
    Train and evaluate a random forest on several sites at once.

    First makes a dataframe which has subject IDs in rows
    and OTUs in columns. OTUs are labeled by which site
    they were in (e.g. k__Bacteria;...;d__denovo123-bal and
    k__Bacteria;...;d__denovo123-gastric_fluid are separate columns).
    Only subjects kept by make_combined_site_df are used.

    The label counts, mean AUC, ROC plot, Fisher's exact test on the
    confusion matrix and the confusion matrix itself are then shown.

    Parameters
    ----------
    tidydf : pandas DataFrame
        Long-form table with columns 'subject_id', 'site', 'otu_w_site',
        'abun' and 'mbs_consolidated'.
    sites : list
        Sites to combine.

    Raises
    ------
    ValueError
        If a subject carries more than one distinct 'mbs_consolidated'
        label, since no single class could then be assigned to its row.

    Global variables used: aspdict, random_state, mbs_col
    """

    df = make_combined_site_df(tidydf, sites, mbs_col)

    subjects = df.index.tolist()

    # One (subject, label) row per subject. query() with a list acts as a
    # membership filter and keeps tidydf's own row order, which need not
    # be the (sorted) row order of the pivoted df.
    mbs_info = tidydf\
        .query('subject_id == @subjects')\
        [['subject_id', 'mbs_consolidated']]\
        .drop_duplicates()

    if mbs_info['subject_id'].duplicated().any():
        raise ValueError(
            'Some subjects have more than one mbs_consolidated label')

    # Re-order the labels explicitly so that Y[i] belongs to row i of X
    mbs_info = mbs_info\
        .set_index('subject_id')\
        .loc[subjects]\
        .reset_index()

    Y = [aspdict[i] for i in mbs_info['mbs_consolidated']]
    X = df.values

    rf = RandomForestClassifier(n_estimators=1000, random_state=random_state)

    results = util.cv_and_roc(rf, X, Y, random_state=random_state)

    # Print results
    print(mbs_info.groupby('mbs_consolidated').size())
    print('AUC = {}'.format(np.mean(results['auc_list'])))
    util.plot_auc_from_list(results['tpr_list'], results['mean_fpr'])
    print('Fisher results : {}'.format(fisher_exact(results['conf_mat'])))
    display(results['conf_mat'])

In [None]:
# Classifier on the combined BAL + throat swab communities
twosites = ['bal', 'throat_swab']
print(', '.join(twosites))
multi_site_classifier(tidydf, twosites)

In [None]:
# Classifier on the combined BAL + gastric fluid communities
twosites = ['bal', 'gastric_fluid']
print(', '.join(twosites))
multi_site_classifier(tidydf, twosites)

In [None]:
# Classifier on the combined throat swab + gastric fluid communities
twosites = ['throat_swab', 'gastric_fluid']
print(', '.join(twosites))
multi_site_classifier(tidydf, twosites)

# All three sites

In [None]:
# Classifier on every site in `sites` (the list returned by util.get_sites())
print(', '.join(sites))
multi_site_classifier(tidydf, sites)

# Check PPI confounding

For the patients used in each classifier, show the contingency table of PPI status (on/off) against aspiration status (yes/no), to check whether PPI use is confounded with aspiration.

## Single-site classifiers

In [None]:
from IPython.display import display
from scipy.stats import fisher_exact

def get_single_site_samples(s):
    """
    Return the IDs of the labelled samples taken at site `s`.

    Samples labelled `nml` come first, followed by samples labelled `asp`;
    within each group the order of `meta` is kept.

    Parameters
    ----------
    s : str
        Site to select (a value of meta['site']).

    Returns
    -------
    list
        Index values of `meta` for the selected samples.

    Global variables used: meta, nml, asp.
    """
    selected = []
    for label in (nml, asp):
        matches = meta.query('mbs_consolidated == @label and site == @s')
        selected.extend(matches.index.tolist())
    return selected

# Label values read by get_single_site_samples. These repeat the values
# assigned in the global-variables cell near the top of the notebook.
nml = 'Normal'
asp = 'Aspiration/Penetration'

In [None]:
def investigate_single_site(s):
    """
    Show how PPI status relates to aspiration status for one site.

    Displays the cross-tabulation of 'mbs_consolidated' (rows) against
    'ppi_consolidated' (columns) for the samples returned by
    get_single_site_samples(s), then prints Fisher's exact test on that
    table with the 'conflicting' PPI column left out.

    Parameters
    ----------
    s : str
        Site to investigate.

    Global variables used: meta.
    """
    smpls = get_single_site_samples(s)
    print(s)
    confmat = pd.crosstab(
        meta.loc[smpls, 'mbs_consolidated'],
        meta.loc[smpls, 'ppi_consolidated'])
    display(confmat)

    # errors='ignore': a site may have no 'conflicting' samples at all, in
    # which case there is no such column to remove.
    print(fisher_exact(confmat.drop('conflicting', axis=1, errors='ignore')))

In [None]:
# PPI vs aspiration table for the BAL samples
investigate_single_site('bal')

In [None]:
# PPI vs aspiration table for the gastric fluid samples
investigate_single_site('gastric_fluid')

In [None]:
# PPI vs aspiration table for the throat swab samples
investigate_single_site('throat_swab')

## Two sites

In [None]:
# Label column handed to make_combined_site_df below. This repeats the value
# assigned earlier in the notebook.
mbs_col = 'mbs_consolidated'

def get_multi_site_subjects(sites):
    """
    Return the subject IDs used by the classifier built on `sites`.

    These are the row labels of the wide table produced by
    make_combined_site_df for the same sites.

    Parameters
    ----------
    sites : list
        Sites to combine.

    Returns
    -------
    list
        Subject IDs, in the row order of the combined table.

    Global variables used: tidydf, mbs_col.
    """
    return make_combined_site_df(tidydf, sites, mbs_col).index.tolist()

def investigate_multi_site_subjects(sites):
    """
    Show how PPI status relates to aspiration status for a multi-site cohort.

    Uses the subjects returned by get_multi_site_subjects(sites). Displays
    the cross-tabulation of 'mbs_consolidated' (rows) against
    'ppi_consolidated' (columns), then prints Fisher's exact test on that
    table with the 'conflicting' and 'nan' PPI columns left out.

    Parameters
    ----------
    sites : list
        Sites whose shared subjects are investigated.

    Global variables used: tidydf.
    """
    print(sites)

    subjects = get_multi_site_subjects(sites)

    # One row per distinct (subject, aspiration label, PPI status)
    meta_info = tidydf\
        .query('subject_id == @subjects')\
        [['subject_id', 'mbs_consolidated', 'ppi_consolidated']]\
        .drop_duplicates()

    confmat = pd.crosstab(
        meta_info['mbs_consolidated'],
        meta_info['ppi_consolidated'])

    display(confmat)

    # Missing PPI status is stored in tidydf as the string 'nan', so it shows
    # up as a column of its own here. Leave it out together with
    # 'conflicting' so the test runs on the remaining PPI categories only.
    # errors='ignore' covers cohorts where either column is absent.
    testable = confmat.drop(['conflicting', 'nan'], axis=1, errors='ignore')
    print(fisher_exact(testable))

In [None]:
# PPI vs aspiration table for subjects in the BAL + throat swab classifier
twosites = ['bal', 'throat_swab']
investigate_multi_site_subjects(twosites)

In [None]:
# PPI vs aspiration table for subjects in the BAL + gastric fluid classifier
twosites = ['bal', 'gastric_fluid']
investigate_multi_site_subjects(twosites)

In [None]:
# PPI vs aspiration table for subjects in the throat swab + gastric fluid classifier
twosites = ['throat_swab', 'gastric_fluid']
investigate_multi_site_subjects(twosites)

In [None]:
# PPI vs aspiration table for subjects that enter the three-site classifier
allsites = ['bal', 'throat_swab', 'gastric_fluid']
investigate_multi_site_subjects(allsites)