The purpose of this notebook is to document the manipulations I do to get the number of patients/samples bc I'm sick of re-doing it every time.

In [1]:
import pandas as pd
import numpy as np

In [2]:
fnotu = '/Users/claire/github/aspiration-analysis/data/clean/rosen.otu_table.rel_abun.clean'
fnmeta = '/Users/claire/github/aspiration-analysis/data/clean/rosen.metadata.clean'

meta = pd.read_csv(fnmeta, sep='\t', index_col=0)
meta.columns

Index([u' If Yes, specify the symptom score',
       u' If yes, please indicate level', u'% time pH<4', u'% time pH<4:',
       u'A1. Subject ID number:', u'A2. Subject initials:',
       u'A3. What Cohort is the subject enrolled into?',
       u'A4. Aim(s) enrolled in?',
       u'A5.  Date of initial/baseline visit/procedure (MM/DD/YYYY):',
       u'A5a. Date filled out(MM/DD/YYYY):',
       ...
       u'STUDYID', u'STUDY', u'AIM', u'SOURCE', u'PHMII', u'ACIDSUP', u'DATE',
       u'ppi_consolidated', u'mbs_consolidated', u'total_reads'],
      dtype='object', length=958)

In [3]:
meta['mbs_consolidated'] = meta['mbs_consolidated'].fillna('nan')
meta['ppi_consolidated'] = meta['ppi_consolidated'].fillna('nan')

patientsamples = meta\
    .groupby(['mbs_consolidated', 'ppi_consolidated', 'site', 'subject_id'])\
    .size()\
    .to_frame('n_samples').reset_index()
patientsamples.iloc[::65, :]

Unnamed: 0,mbs_consolidated,ppi_consolidated,site,subject_id,n_samples
0,Aspiration/Penetration,conflicting,bal,13-117-4,1
65,Aspiration/Penetration,on,bal,04-247-3,1
130,Normal,off,gastric_fluid,02-111-4,1
195,Normal,on,gastric_fluid,04-011-3,1
260,,,gastric_fluid,01-173-4,1
325,,,throat_swab,03-199-7,1
390,,off,stool,13-058-2,1
455,,on,gastric_fluid,04-047-2,1
520,,on,throat_swab,04-136-6,1


# First, number of patients for each site alone

This is useful for Figure 1, where I compare samples across patients.

I should probably re-make Figure 1 using data only from patients who are not known to be aspirators?

In [4]:
sites = ['stool', 'bal', 'gastric_fluid', 'throat_swab']

In [5]:
# With PPI info
patientsamples.query('site == @sites')\
    .groupby(['site', 'mbs_consolidated', 'ppi_consolidated'])\
    .size()\
    .to_frame('n_samples')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_samples
site,mbs_consolidated,ppi_consolidated,Unnamed: 3_level_1
bal,Aspiration/Penetration,conflicting,1
bal,Aspiration/Penetration,off,16
bal,Aspiration/Penetration,on,16
bal,Normal,conflicting,2
bal,Normal,off,11
bal,Normal,on,23
bal,,conflicting,1
bal,,,3
bal,,off,21
bal,,on,21


In [6]:
# Without PPI info
patientsamples.query('site == @sites')\
    .groupby(['site', 'mbs_consolidated'])\
    .size()\
    .to_frame('n_samples')

Unnamed: 0_level_0,Unnamed: 1_level_0,n_samples
site,mbs_consolidated,Unnamed: 2_level_1
bal,Aspiration/Penetration,33
bal,Normal,36
bal,,46
gastric_fluid,Aspiration/Penetration,42
gastric_fluid,Normal,52
gastric_fluid,,75
stool,,41
throat_swab,Aspiration/Penetration,38
throat_swab,Normal,46
throat_swab,,125


### Number of samples from each site

In [7]:
# Without PPI info or MBS info
patientsamples.query('site == @sites')\
    .groupby(['site'])\
    .size()

site
bal              115
gastric_fluid    169
stool             41
throat_swab      209
dtype: int64

### Number of unique patients with each sample type

In [8]:
# Need to re-do the reset_index part bc currently patients with
# pre- and post-PPI samples show up twice in the subject_id column,
# since grouping by ppi_consolidated split those samples up
patientsamples.query('site == @sites')\
    .groupby(['site', 'subject_id'])\
    .size()\
    .reset_index()\
    .groupby('site')\
    .size()

site
bal              115
gastric_fluid    169
stool             30
throat_swab      197
dtype: int64

In [9]:
patientsamples.query('site == @sites')\
    .groupby(['mbs_consolidated', 'subject_id'])\
    .size()\
    .reset_index()\
    .groupby(['mbs_consolidated'])\
    .size()

mbs_consolidated
Aspiration/Penetration     49
Normal                     63
nan                       142
dtype: int64

In [10]:
49+63+146

258

# Number of patients with intra site combinations

This is for figure 2: the within-patient comparisons.

The comparisons I've made are: bal_throat, bal_gastric, gastric_throat, stool_throat, stool_stool.

Note: intra-patient beta diversity should take care to compare stools and throats taken at the same time point. See the `2017-08-16.baseline_relationships_different_betas` notebook for these checks.

You can also see in the code below that there are 38 stool_throat comparisons but only 27 unique patients: this is because there are 11
patients for whom we can do two throat_stool comparisons (one before 
PPI and one after PPI). To see this, uncomment the lines in the cell below

In [11]:
for site1 in sites:
    for site2 in sites[sites.index(site1)+1:]:
        subjects = patientsamples\
                    .query('(site == @site1) | (site == @site2)')\
                    .groupby(['ppi_consolidated', 'mbs_consolidated', 'subject_id'])\
                    .size()
        print('{} + {}'.format(site1, site2))
                
        # This line shows the number of within-patient comparisons,
        # just grouped by MBS status 
        print(subjects[subjects == 2].reset_index()
              .groupby(['mbs_consolidated']).size())

        ## Uncomment this line to see disaggregation by PPI status too
        ## (this only affects stool_throat comparisons)
        #print(subjects[subjects == 2].reset_index()
        #      .groupby(['ppi_consolidated', 'mbs_consolidated']).size())        
        
        # And this line is just straight-up the number of unique patients
        print(subjects[subjects == 2].reset_index()['subject_id'].unique().shape)
        print('')


stool + bal
Series([], dtype: int64)
(0,)

stool + gastric_fluid
Series([], dtype: int64)
(0,)

stool + throat_swab
mbs_consolidated
nan    38
dtype: int64
(27,)

bal + gastric_fluid
mbs_consolidated
Aspiration/Penetration    29
Normal                    30
nan                       41
dtype: int64
(100,)

bal + throat_swab
mbs_consolidated
Aspiration/Penetration    25
Normal                    24
nan                       30
dtype: int64
(79,)

gastric_fluid + throat_swab
mbs_consolidated
Aspiration/Penetration    32
Normal                    36
nan                       53
dtype: int64
(121,)



In [13]:
site1 = 'stool'
site2 = 'throat_swab'

subjects = patientsamples\
            .query('(site == @site1) | (site == @site2)')\
            .groupby(['ppi_consolidated', 'mbs_consolidated', 'subject_id'])\
            .size()

subjects[subjects == 2].reset_index().shape

(38, 4)

## All three sites

In [14]:
aero_sites = ['bal', 'gastric_fluid', 'throat_swab']
subjects = patientsamples\
            .query('(site == @aero_sites)')\
            .groupby(['ppi_consolidated', 'mbs_consolidated', 'subject_id'])\
            .size()

# This line shows the number of within-patient comparisons,
# just grouped by MBS status 
print(subjects[subjects == 3].reset_index()
      .groupby(['mbs_consolidated']).size())

## Uncomment this line to see disaggregation by PPI status too
## (this only affects stool_throat comparisons)
#print(subjects[subjects == 2].reset_index()
#      .groupby(['ppi_consolidated', 'mbs_consolidated']).size())        

# And this line is just straight-up the number of unique patients
print(subjects[subjects == 3].reset_index()['subject_id'].unique().shape)
print('')

mbs_consolidated
Aspiration/Penetration    23
Normal                    19
nan                       29
dtype: int64
(71,)



In [15]:
## NOTE: this subject has all three sites sequenced, but conflicting
## PPI metadata for the throat sample and so is excluded from these counts

# To get this patient: re-calculate the patientsamples and cell above this one
# without grouping by ppi_consolidated
showcols = ['On PPI currently?', 'PPI Status', 'Patient taking PPI',
           'Patient taking PPI?', 'ACIDSUP', 'mbs_consolidated', 'site', 'subject_id']
meta.query('subject_id == "03-102-4"')[showcols]

Unnamed: 0,On PPI currently?,PPI Status,Patient taking PPI,Patient taking PPI?,ACIDSUP,mbs_consolidated,site,subject_id
03-102-4G,,,no,,Off,Aspiration/Penetration,gastric_fluid,03-102-4
03-102-4T,,,no,,On,Aspiration/Penetration,throat_swab,03-102-4


## Number of patients with at least two sites

In [36]:
(patientsamples.query('mbs_consolidated != "nan"').groupby('subject_id').size() > 1).sum()

# bc currently also grouped-by ppi status, patients with pre- and post-PPI show up
# twice. need to re-groupby site and subject, reset index to get back to subject, site, number_of_samples
# format
sites = ['bal', 'gastric_fluid', 'throat_swab']
(patientsamples\
    .query('site == @sites')\
    .groupby(['site', 'subject_id'])\
    .size().reset_index()\
    .groupby('subject_id')\
    .size() > 1)\
    .sum()


159

# Neutrophil info?

Which are the metadata columns that might have this information?

From Rachel:

> It would be great to show that the shift of the microbiome towards oropharyngeal flora results in a greater percentage of neutrophils in the lung—we have this data so if you do not, it would be worth including—it would highlight that the shift towards the oropharyngeal bacteria actually increases risk of inflammation

In [8]:
neutro_cols = [i for i in meta.columns if 'neutro' in i.lower()]
neutro_cols

['If Yes, what was the percentage of neutrophils',
 'If Yes, what was the percentage of neutrophils?']

In [28]:
meta[neutro_cols[0]].unique()

array([ nan,   4.,   0.,   2.,   3.,   5.,  30.,   1.,  10.,  35.])

In [29]:
meta[neutro_cols[1]].unique()

array([nan, '5', '3', '0', '10', '1', '39', '50', '34', '2', '20',
       'Not Done', '60', '15', '25', '30', '4', '40', '7', '9', '8', '2.5',
       '70'], dtype=object)

In [21]:
meta[neutro_cols].dropna()

Unnamed: 0,"If Yes, what was the percentage of neutrophils","If Yes, what was the percentage of neutrophils?"


Good, no patients have data in both columns!

Next step: consolidate these two columns (TODO: add these to metadata wrangling script!!)

In [36]:
newcol = 'neutrophil_consolidated'

def consolidate_neutrophils(row, neutro_cols):
    val = np.nan
    
    # First neutro_col should only have numbers or nans (no strings)
    n0 = row[neutro_cols[0]]
    if not np.isnan(n0):
        val = float(n0)
    
    # Second neutro_col has a few 'Not Done', so the column is strings
    n1 = row[neutro_cols[1]]
    if n1 != "Not Done" and not np.isnan(float(n1)):
        val = float(n1)
        
    return val

meta[newcol] = meta.apply(lambda row: consolidate_neutrophils(row, neutro_cols), axis=1)

In [43]:
# Re-build the "patientsamples" dataframe with counts
meta['mbs_consolidated'] = meta['mbs_consolidated'].fillna('nan')
meta['ppi_consolidated'] = meta['ppi_consolidated'].fillna('nan')
meta['neutrophil_consolidated'] = meta['neutrophil_consolidated'].fillna('nan')

patientsamples = meta\
    .groupby(['mbs_consolidated', 'ppi_consolidated', 'neutrophil_consolidated', 'site', 'subject_id'])\
    .size()\
    .to_frame('n_samples').reset_index()
patientsamples.iloc[::65, :]

Unnamed: 0,mbs_consolidated,ppi_consolidated,neutrophil_consolidated,site,subject_id,n_samples
0,Aspiration/Penetration,conflicting,0.0,throat_swab,03-102-4,1
65,Aspiration/Penetration,on,0.0,throat_swab,04-166-7,1
130,Normal,off,0.0,throat_swab,03-114-6,1
195,Normal,on,0.0,throat_swab,04-147-3,1
260,Normal,on,,throat_swab,04-275-6,1
325,,,,throat_swab,01-112-7,1
390,,off,5.0,gastric_fluid,03-112-5,1
455,,off,,throat_swab,13-089-1,1
520,,on,35.0,gastric_fluid,05-128-2,1


In [44]:
meta.groupby(newcol).size()

neutrophil_consolidated
0.0      85
1.0      11
2.0      30
2.5       4
3.0      13
4.0      10
5.0      71
7.0       3
8.0       3
9.0       2
10.0     35
15.0     16
20.0      9
25.0     10
30.0     10
34.0      3
35.0      3
39.0      2
40.0     11
50.0      6
60.0      2
70.0      2
nan     287
dtype: int64

In [46]:
for site1 in sites:
    for site2 in sites[sites.index(site1)+1:]:
        subjects = patientsamples\
                    .query('(site == @site1) | (site == @site2)')\
                    .groupby(['ppi_consolidated', 'mbs_consolidated', 'neutrophil_consolidated', 'subject_id'])\
                    .size()
        print('{} + {}'.format(site1, site2))
                
        # This line shows the number of within-patient comparisons,
        # just grouped by MBS status 
        print(subjects[subjects == 2].reset_index()
              .groupby(['mbs_consolidated', 'neutrophil_consolidated']).size())

        ## Uncomment this line to see disaggregation by PPI status too
        ## (this only affects stool_throat comparisons)
        #print(subjects[subjects == 2].reset_index()
        #      .groupby(['ppi_consolidated', 'mbs_consolidated']).size())        
        
        # And this line is just straight-up the number of unique patients
        print(subjects[subjects == 2].reset_index()['subject_id'].unique().shape)
        print('')


stool + bal
Series([], dtype: int64)
(0,)

stool + gastric_fluid
Series([], dtype: int64)
(0,)

stool + throat_swab
mbs_consolidated  neutrophil_consolidated
nan               nan                        38
dtype: int64
(27,)

bal + gastric_fluid
mbs_consolidated        neutrophil_consolidated
Aspiration/Penetration  0.0                         4
                        1.0                         1
                        2.0                         1
                        2.5                         1
                        4.0                         2
                        5.0                         5
                        8.0                         1
                        10.0                        1
                        15.0                        2
                        20.0                        1
                        25.0                        1
                        30.0                        2
                        39.0                        1
    