This notebook makes Table 2 and Supplementary Table 2, which show the number of patients with each combination of samples sequenced.

In [8]:
import pandas as pd

In [9]:
# One row per patient; the remaining columns are boolean flags saying
# whether that body site was sampled for the patient.
fname = '../../final/supp_files/patients_with_sites_sampled.csv'

# The first CSV column holds the patient IDs: read it as the index, label it
# 'subject', then move it back into a regular column.
df = (
    pd.read_csv(fname, index_col=0)
      .rename_axis('subject')
      .reset_index()
)
df.head()

Unnamed: 0,subject,bal,gastric_fluid,throat_swab,stool
0,04-080-7,True,True,True,False
1,01-299-7,False,False,True,False
2,04-167-8,False,True,False,False
3,04-087-1,True,True,True,False
4,042-6-F1,False,False,True,False


In [10]:
# Reshape to long format (one row per subject/site pair) and keep only the
# pairs whose flag is True, i.e. the sites that were actually sampled.
tidydf = (
    df
    .melt(id_vars='subject', var_name='site')
    .loc[lambda long_df: long_df['value'] == True]
)

In [11]:
# Map each subject to a '-'-joined string of its sampled sites.
# Sorting by site first makes the label independent of column order, so every
# patient with the same set of sites gets the same combination string.
subj2sites = (
    tidydf
    .sort_values(by='site')
    .groupby('subject')['site']
    .agg('-'.join)
    .to_dict()
)

In [12]:
# Single-column frame indexed by subject, holding the site-combination label.
sitecombodf = pd.Series(subj2sites, name='sites').to_frame()
sitecombodf.head()

Unnamed: 0,sites
01-112-7,gastric_fluid-throat_swab
01-164-7,gastric_fluid-throat_swab
01-165-8,stool
01-173-4,gastric_fluid-throat_swab
01-200-1,throat_swab


# Patients with different site combinations

In [13]:
# Number of patients for each combination of sampled sites.
site_combo_counts = sitecombodf.groupby('sites').size()
site_combo_counts

sites
bal                               6
bal-gastric_fluid                22
bal-gastric_fluid-throat_swab    66
bal-throat_swab                   7
gastric_fluid                    12
gastric_fluid-throat_swab        45
stool                             5
stool-throat_swab                20
throat_swab                      37
dtype: int64

In [14]:
# Sanity check: the per-combination counts should add up to the total number
# of patients.
n_patients_counted = sitecombodf.groupby('sites').size().sum()
n_patients_counted

220

# Split by aspiration status

For the supplementary table.

In [15]:
# Sample-level metadata, tab-separated, with the first column as the index.
fmeta = '../../data/clean/rosen.metadata.clean'
meta_all = pd.read_csv(fmeta, sep='\t', index_col=0)
print(meta_all.shape)

# Restrict the metadata to the patients present in the site table; the two
# printed shapes show how many rows the filter removed.
keep_subjs = df['subject'].tolist()
meta = meta_all[meta_all['subject_id'].isin(keep_subjs)]
print(meta.shape)

(455, 958)
(455, 958)


In [16]:
print(sitecombodf.shape)

# Collapse the metadata to the distinct (subject, aspiration status) pairs.
subject_asp = meta[['subject_id', 'mbs_consolidated']].drop_duplicates()

# Attach each subject's site-combination label; `sitecombodf` is indexed by
# subject, so match its index against the `subject_id` column.
sitecombo_withasp = subject_asp.join(sitecombodf, on='subject_id', how='left')

# The row count printed here should equal the one printed above if every
# subject contributes exactly one row.
print(sitecombo_withasp.shape)
sitecombo_withasp.head()

(220, 1)
(220, 3)


Unnamed: 0,subject_id,mbs_consolidated,sites
01-112-7GI,01-112-7,,gastric_fluid-throat_swab
01-164-7GI,01-164-7,,gastric_fluid-throat_swab
01-165-8SI,01-165-8,,stool
01-173-4G,01-173-4,,gastric_fluid-throat_swab
01-200-1TI,01-200-1,,throat_swab


In [17]:

# Supplementary table: number of patients per site combination, split by
# aspiration status.
#
# Only a missing aspiration result means "not tested", so fill that column
# alone. A frame-wide fillna would also relabel any missing `sites` value
# (possible after the left join above) as a bogus 'not_tested' site
# combination. Rows whose `sites` is still missing are excluded from the
# table, because groupby drops NaN keys by default.
(
    sitecombo_withasp
    .fillna({'mbs_consolidated': 'not_tested'})
    .groupby(['sites', 'mbs_consolidated'])
    .size()
)

sites                          mbs_consolidated      
bal                            Aspiration/Penetration     2
                               Normal                     1
                               not_tested                 3
bal-gastric_fluid              Aspiration/Penetration     6
                               Normal                     9
                               not_tested                 7
bal-gastric_fluid-throat_swab  Aspiration/Penetration    23
                               Normal                    19
                               not_tested                24
bal-throat_swab                Aspiration/Penetration     2
                               Normal                     4
                               not_tested                 1
gastric_fluid                  Aspiration/Penetration     3
                               Normal                     4
                               not_tested                 5
gastric_fluid-throat_swab      Aspiration/Pene

In [18]:
# Overall number of patients per aspiration status; a missing status is
# reported as 'not_tested'.
asp_filled = sitecombo_withasp.fillna('not_tested')
asp_status_counts = asp_filled.groupby(['mbs_consolidated']).size()
asp_status_counts

mbs_consolidated
Aspiration/Penetration     47
Normal                     57
not_tested                116
dtype: int64