In [62]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter

os.getcwd()

'/Volumes/AHN/captive_ape_microbiome/results/16s'

In [63]:
os.chdir('/Volumes/AHN/captive_ape_microbiome/results/16s/')
metadata_file = 'inputs_old/16S_metadata.txt'
tax_table_file = 'inputs_old/ASVs_taxonomy.txt'
asv_table_file = 'inputs_old/ASV_tab.txt'

#### designate 16-ASVs as host-restricted, mixed host, or unique to captive

In [73]:
#read in asv table, get sample names and number for each asv
def is_HR(sampleNames,sample_type_dict):
    """given a list of sample names uses sample type dictionary to determine how many sample types are present
    designates ASVs as host restricted = 1 sample type or mixed = multiple sample types.
    Captive sample types are not considered so some clades will have a 0 sample type length and they can fall within
    host-restricted clades or mixed clades or neither"""
    sampleTypes = [sample_type_dict[name] for name in sampleNames]
    sampleTypes = [x.replace('non_western_','').replace('western_','') for x in sampleTypes]
    neutral_sampleTypes = ['captive_gorilla','captive_bonobo','captive_chimp','captive_orangutan']
    captiveNames = [name for name in sampleNames if 'captive' in sample_type_dict[name]]
    HR_sampleTypes = list(set(sampleTypes) - set(neutral_sampleTypes))
    HR_sampleNum = len([x for x in sampleTypes if x not in neutral_sampleTypes])
    CP_sampleTypes = list(set(sampleTypes) & set(neutral_sampleTypes))
    CP_sampleNum = len([x for x in sampleTypes if x in neutral_sampleTypes])
    CP_pres = True if len(CP_sampleTypes) > 0 else False
    if len(HR_sampleTypes) == 0:
        HR_cat,HR_type='Unique_CP','Unique_CP'
    if len(HR_sampleTypes) == 1: #identifies host-restricted clades
        HR_cat,HR_type='HR','HR_'+HR_sampleTypes[0]  
    if len(HR_sampleTypes) > 1: 
        HR_cat = 'MX'
        if len(HR_sampleTypes) == 2:
            if 'human' in HR_sampleTypes:
                HR_type = 'MX_human_single_wild_ape'
            else:
                HR_type = 'MX_2_wild_apes'
        if len(HR_sampleTypes) == 3:
            if 'human' in HR_sampleTypes:
                HR_type = 'MX_human_2_wild_apes'
            else:
                HR_type = 'MX_3_wild_apes'
        if len(HR_sampleTypes) == 4:
            HR_type = 'MX_4_hominids'
        
    return(HR_sampleTypes,HR_sampleNum,HR_cat,HR_type,CP_pres,CP_sampleNum,CP_sampleTypes,captiveNames)


def asv_hr_table(asv_table_file,metadata_file,tax_table_file):
    asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
    sampleNames = asv_table.apply(lambda row: list(row.index[row>0]),axis=1)
    asv_df = sampleNames.reset_index()
    asv_df.columns = ['ASV','sampleNames']
    asv_df['sampleNum'] = asv_df['sampleNames'].apply(lambda names: len(names))
    
    #add host restriction info
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description'])) 
    hr = asv_df['sampleNames'].apply(lambda x: pd.Series(is_HR(x,sample_type_dict),
                                                         index=['HR_sampleTypes','HR_sampleNum','HR_cat','HR_type',
                                                                'CP_pres','CP_sampleNum','CP_sampleTypes','captiveNames']))
    asv_hr_df = asv_df.merge(hr,left_index=True, right_index=True)
    
    #add taxonomic info
    tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
    tax_table = tax_table[['ASV','Phylum','Order','Family','Genus']]
    asv_full = asv_hr_df.merge(tax_table,on='ASV',how='left')
    
    return(asv_full)

In [75]:
asv_16s = asv_hr_table(asv_table_file,metadata_file,tax_table_file)
print(len(asv_16s),'total 16S ASVs')
asv_16s_captive = asv_16s[asv_16s['CP_pres']==True]
print(len(asv_16s_captive),'16S ASVs found in captive apes')
#print(asv_16s_captive.head())
print(asv_16s_captive['HR_type'].value_counts())
asv_16s.to_csv('analyses/tables/16S_ASVs_summary_old.txt',sep='\t',index=None)

1977 total 16S ASVs
871 16S ASVs found in captive apes
HR_human                    227
MX_human_single_wild_ape    187
MX_4_hominids               166
MX_human_2_wild_apes        128
Unique_CP                   106
MX_2_wild_apes               20
MX_3_wild_apes               19
HR_wild_chimp                10
HR_wild_gorilla               8
Name: HR_type, dtype: int64


In [66]:
#asv_16s_old = pd.read_csv('analyses/tables/16S_ASVs_summary_old.txt',sep='\t').sort_values('ASV').reset_index()
#asv_16s = pd.read_csv('analyses/tables/16S_ASVs_summary.txt',sep='\t').sort_values('ASV').reset_index()
#asv_16s[asv_16s['ASV']!=asv_16s_old['ASV']]
#asv_16s[asv_16s['sampleNames']!=asv_16s_old['sampleNames']]
#asv_16s[asv_16s['CP_pres']!=asv_16s_old['CP_pres']]
#asv_16s_old[asv_16s['CP_pres']!=asv_16s_old['CP_pres']]

Unnamed: 0,index,ASV,sampleNames,sampleNum,HR_sampleTypes,HR_sampleNum,HR_cat,HR_type,CP_pres,CP_sampleNum,CP_sampleTypes,captiveNames,Phylum,Order,Family,Genus


#### Determine distribution of captive associated 16-ASVs across host species and locations

In [67]:
def multi_site_sp(cp_desc):
    """Designate ASVs based on whether they are found across 
    multiple locations or a single location, observed in a single
    host species or multiple host species"""
    sites = list(set([x.split('_')[0] for x in cp_desc]))
    sitesNum = len(sites)
    sitesMulti = 'multi_site' if sitesNum > 1 else 'single_site'      
    species = list(set([x.split('_')[2] for x in cp_desc]))
    speciesNum = len(species)
    speciessMulti = 'multi_sp' if speciesNum > 1 else 'single_sp' 
    return(sitesMulti + '_' + speciessMulti)

#multi_site_sp(['HOUZ_captive_chimp','HOUZ_captive_gorilla'])

def captive_apes_asv_summary(asv_hr_table_output,metadata_file):
    asv_cp = asv_hr_table_output[asv_hr_table_output['CP_pres']==True]
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    metadata['Description_site'] = metadata['site_code']+'_'+metadata['Description']
    sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

    description_df = asv_cp['sampleNames'].apply(lambda l: pd.Series(
        [sample_type_site_dict[name] for name in l]).value_counts())
    description_df = description_df.fillna(0)  
    capt_desc = list(set(metadata['Description_site'][metadata['captivity_status']=='captive']))
    description_df = description_df[capt_desc]
    description_df['CP_sp_loc'] = description_df.apply(lambda row: list(row.index[row>0]),axis=1)
    description_df['numEnclosure'] = description_df['CP_sp_loc'].apply(lambda x: len(x))
    description_df['multi_site_sp'] = description_df['CP_sp_loc'].apply(lambda x:  
                                                                        multi_site_sp(x))
    asv_cp_table = asv_cp.merge(description_df,left_index=True,right_index=True)
    return(asv_cp_table)


In [68]:
asv_cp_table = captive_apes_asv_summary(asv_16s,metadata_file)
asv_cp_table_summary = asv_cp_table.groupby(['multi_site_sp']).size().reset_index(name="count")
print(asv_cp_table_summary)
#asv_cp_table
asv_cp_table.to_csv('analyses/figures/16S_captive_Figure1A_data.txt',sep='\t',index=None)

KeyError: '['

#### Determine proportion of shared ASVs (Figure 1B)

In [69]:
from itertools import combinations

def add_tax_to_asv_table(asv_table_file,tax_table_file):
    """adds taxonomic information to the last few columns of the ASV table"""
    asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
    asv_table = asv_table.reset_index().rename(columns={"index": "ASV"})
    tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
    asv_table_tax = asv_table.merge(tax_table[['ASV','Phylum','Order','Family','Genus']],on='ASV',how='left')
    asv_table_tax = asv_table_tax[asv_table_tax['Genus']!='Unassigned']
    asv_table_tax = asv_table_tax[~asv_table_tax['Genus'].isna()]
    return(asv_table_tax)

def get_sp_site_comp(ind1,ind2):
    """determines if any two samples belongs to the same host species or location"""
    sp = ('same_spec' if ind1.split('_')[2] == ind2.split('_')[2] else 'diff_spec')
    st = ('same_site' if ind1.split('_')[0] == ind2.split('_')[0] else 'diff_site')
    return(sp+'_'+st)
get_sp_site_comp('COLZ_captive_bonobo','COLZ_captive_chimp')


def pw_metadata_capt_samples(metadata_file):
    """generate all pairwise comparisons between two individuals with host species and site metadata"""
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    metadata['Description_site'] = metadata['site_code']+'_'+metadata['Description']
    sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

    cp_samples = metadata['X.SampleID'][metadata['captivity_status']=='captive']
    pw_df = pd.DataFrame(combinations(cp_samples, 2),columns=['ind1','ind2'])
    pw_df['desc_site_ind1'] = pw_df['ind1'].apply(lambda x:sample_type_site_dict[x])
    pw_df['desc_site_ind2'] = pw_df['ind2'].apply(lambda x:sample_type_site_dict[x])
    pw_df['full_desc_comp'] = pw_df['desc_site_ind1'] + '_vs_' + pw_df['desc_site_ind2']
    pw_df['sp_site_comp'] = pw_df.apply(lambda row: 
                                        get_sp_site_comp(row['desc_site_ind1'],row['desc_site_ind2']),
                                        axis=1)
    return(pw_df)


def shared_ASVs_common_gen(ind1,ind2,asv_table_gen):
    pw = asv_table_gen[['Phylum','Order','Family','Genus','ASV',ind1,ind2]][
        (asv_table_gen[ind1]>0)|(asv_table_gen[ind2]>0)
        ]
    #print(pw)
    common_gen = list(
                 set(pw[pw[ind1]>0]['Genus']) & 
                 set(pw[pw[ind2]>0]['Genus']))
    len_common_gen = len(common_gen)
    
    pw= pw[pw['Genus'].isin(common_gen)]
    ind1_ASVs = len(pw[(pw[ind1]>0)])
    ind2_ASVs = len(pw[(pw[ind2]>0)])
    common_ASVs = len(pw[(pw[ind1]>0)&(pw[ind2]>0)]) 
    if len_common_gen > 0:
        prop_shared = common_ASVs/min(ind1_ASVs,ind2_ASVs)
    else:
        prop_shared = 'nan'
    res = pd.Series([ind1,ind2,common_gen,len_common_gen,ind1_ASVs,ind2_ASVs,common_ASVs,prop_shared],
              index=['ind1','ind2','common_gen','len_common_gen',
                     'ind1_ASVs','ind2_ASVs','common_ASVs','prop_shared'])
    return(res)



In [70]:
pw_df = pw_metadata_capt_samples(metadata_file)
asv_table_tax = add_tax_to_asv_table(asv_table_file,tax_table_file)
shared_ASV_all_genera = pw_df.apply(lambda row: 
                    shared_ASVs_common_gen(row['ind1'],row['ind2'],asv_table_tax),axis=1)
pw_shared_ASV_all_genera = pw_df.merge(shared_ASV_all_genera,on=['ind1','ind2'])                       
pw_shared_ASV_all_genera.to_csv(
    'analyses/figures/16S_captive_Figure1B_data.txt',
    sep='\t',index=False)



In [71]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [72]:
%%R
library(ggplot2)
table_file <- paste0('analyses/figures/16S_captive_Figure1B_data.txt')
Figure1B_data = read.table(table_file,header=TRUE,sep='\t')

Figure1B <- ggplot(Figure1B_data, aes(x=sp_site_comp, y=prop_shared,fill=sp_site_comp)) + 
    geom_violin()+
    theme_bw()+
    scale_fill_manual(values=c('#7fc97f','#beaed4','#fdc086','#ffff99'))+
    ylab('Proportion of shared ASVs')#
#ggsave(Figure1B, file = file.path(paste0(OUTDIR,'/figures/Fig_propsharedASVs.pdf')))
#
print(c('diff_spec_diff_site mean', mean(Figure1B_data$prop_shared[Figure1B_data$sp_site_comp=='diff_spec_diff_site'])))
print(c('diff_spec_same_site mean', mean(Figure1B_data$prop_shared[Figure1B_data$sp_site_comp=='diff_spec_same_site'])))
print(c('same_spec_diff_site', mean(Figure1B_data$prop_shared[Figure1B_data$sp_site_comp=='same_spec_diff_site'])))
print(c('same_spec_same_site',mean(Figure1B_data$prop_shared[Figure1B_data$sp_site_comp=='same_spec_same_site'])))



[1] "diff_spec_diff_site mean" "0.615436040485297"       
[1] "diff_spec_same_site mean" "0.645869535219191"       
[1] "same_spec_diff_site" "0.626528995471842"  
[1] "same_spec_same_site" "0.753325130301663"  


 #### Determine proportion of shared ASVs by bacterial order (Figure 1C)

In [69]:
print(asv_16s[asv_16s['CP_pres']==True]['Order'].value_counts()[:10].index)

Index(['Clostridiales', 'Bacteroidales', 'Mollicutes_RF39',
       'Erysipelotrichales', 'Spirochaetales', 'Gastranaerophilales',
       'Selenomonadales', 'Betaproteobacteriales', 'Coriobacteriales',
       'Lactobacillales'],
      dtype='object')


In [70]:
def prop_shared_by_order(order):
    pw_df = pw_metadata_capt_samples(metadata_file)
    asv_table_tax = add_tax_to_asv_table(asv_table_file,tax_table_file)
    asv_table_order = asv_table_tax[asv_table_tax['Order']==order]
    shared_ASV_all_genera = pw_df.apply(lambda row: 
                    shared_ASVs_common_gen(row['ind1'],row['ind2'],asv_table_order),axis=1)
    pw_shared_ASV_all_genera = pw_df.merge(shared_ASV_all_genera,on=['ind1','ind2']) 
    pw_shared_ASV_all_genera['order']=order
    return(pw_shared_ASV_all_genera)

In [71]:
Clostridales = prop_shared_by_order('Clostridiales')
Bacteroidales = prop_shared_by_order('Bacteroidales')
Mollicutes_RF39 = prop_shared_by_order('Mollicutes_RF39')
Erysipelotrichales = prop_shared_by_order('Erysipelotrichales')
Spirochaetales = prop_shared_by_order('Spirochaetales')
Gastranaerophilales = prop_shared_by_order('Gastranaerophilales')
Selenomonadales = prop_shared_by_order('Selenomonadales')
Betaproteobacteriales = prop_shared_by_order('Selenomonadales')
Coriobacteriales = prop_shared_by_order('Coriobacteriales')
Lactobacillales = prop_shared_by_order('Lactobacillales')

In [72]:
pw_shared_ASV_all_genera['order']='all'
pw_shared_ASV_top10_gen = pd.concat([pw_shared_ASV_all_genera,Clostridales,Bacteroidales,
                                     Mollicutes_RF39,Erysipelotrichales,Spirochaetales,
                                     Gastranaerophilales,Selenomonadales,Betaproteobacteriales,
                                     Coriobacteriales,Lactobacillales])
pw_shared_ASV_top10_gen =  pw_shared_ASV_top10_gen[pw_shared_ASV_top10_gen['prop_shared']!='nan']
pw_shared_ASV_top10_gen.to_csv(
    'analyses/figures/16S_captive_Figure1C_data.txt',
    sep='\t',index=False)

In [120]:
pw_df = pw_metadata_capt_samples(metadata_file)
asv_table_tax = add_tax_to_asv_table(asv_table_file,tax_table_file)
Prevotella_genera = ['Prevotella','Prevotella_2','Prevotella_7','Prevotella_1', 'Prevotella_9']
asv_table_Prevotella = asv_table_tax[asv_table_tax['Genus'].isin(Prevotella_genera)]
shared_ASV_Prevotella = pw_df.apply(lambda row: 
    shared_ASVs_common_gen(row['ind1'],row['ind2'],asv_table_Prevotella),axis=1)
pw_shared_ASV_Prevotella = pw_df.merge(shared_ASV_Prevotella,on=['ind1','ind2']) 
pw_shared_ASV_Prevotella.to_csv(
    'analyses/figures/16S_captive_FigureS3_Prevotella_data.txt',
    sep='\t',index=False)

### Table 1

In [139]:
def output_summary_table(asv_hr_table_output):
     #create summary table
    all_asvs = pd.Series(asv_hr_table_output['HR_type'].value_counts(),name='ALL')
    cp_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['CP_pres']==True]['HR_type'].value_counts(),name='CP')
    Bacteroidales_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Order']=='Bacteroidales']['HR_type'].value_counts(),name='Bacteroidales')
    Bacteroides_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Genus']=='Bacteroides']['HR_type'].value_counts(),name='Bacteroides')
    Prevotella_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Genus'].isin(Prevotella_genera)]['HR_type'].value_counts(),name='Prevotella')
    Parabacteroides_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Genus']=='Parabacteroides']['HR_type'].value_counts(),name='Parabacteroides')
    Bifidobacteriales_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Order']=='Bifidobacteriales']['HR_type'].value_counts(),name='Bifidobacteriales')
    Bifidobacterium_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Genus']=='Bifidobacterium']['HR_type'].value_counts(),name='Bifidobacterium')
    res = pd.concat([all_asvs,cp_asvs,Bacteroidales_asvs,
                     Bacteroides_asvs,Prevotella_asvs,Parabacteroides_asvs,
                     Bifidobacteriales_asvs,Bifidobacterium_asvs],axis=1).fillna(0).T
    res = res[['HR_human','HR_wild_bonobo','HR_wild_chimp','HR_wild_gorilla',
         'MX_2_wild_apes','MX_3_wild_apes',
         'MX_human_single_wild_ape','MX_human_2_wild_apes',
         'MX_4_hominids','Unique_CP']]
    return(res)

In [144]:
asv_16s_summary = output_summary_table(asv_16s)
asv_16s_summary['MX_total'] = asv_16s_summary.loc[:,'MX_2_wild_apes':'MX_4_hominids'].sum(axis=1)
Table1 = asv_16s_summary[['HR_human','HR_wild_bonobo','HR_wild_chimp','HR_wild_gorilla',
                         'MX_total','Unique_CP']]Table1.to_csv(
    'analyses/tables/Table1_raw.txt',
    sep='\t',index=False)
Table1

Unnamed: 0,HR_human,HR_wild_bonobo,HR_wild_chimp,HR_wild_gorilla,MX_total,Unique_CP
ALL,553.0,57.0,105.0,210.0,946.0,106.0
CP,227.0,0.0,10.0,8.0,520.0,106.0
Bacteroidales,211.0,22.0,50.0,34.0,88.0,29.0
Bacteroides,33.0,0.0,0.0,0.0,16.0,0.0
Prevotella,63.0,5.0,15.0,8.0,16.0,4.0
Parabacteroides,6.0,1.0,0.0,0.0,9.0,0.0
Bifidobacteriales,3.0,0.0,0.0,0.0,4.0,0.0
Bifidobacterium,3.0,0.0,0.0,0.0,4.0,0.0


In [143]:
print(asv_16s_summary.loc[:,'HR_human':'HR_wild_gorilla'].sum(axis=1))
print(asv_16s_summary.loc[:,'MX_2_wild_apes':'MX_4_hominids'].sum(axis=1))
print(asv_16s_summary.loc[:,'MX_2_wild_apes':'MX_3_wild_apes'].sum(axis=1))
print(asv_16s_summary.loc[:,'MX_human_single_wild_ape':'MX_4_hominids'].sum(axis=1))
print(asv_16s_summary.loc['CP','MX_human_single_wild_ape']/asv_16s_summary.loc['ALL','MX_human_single_wild_ape'])
print(asv_16s_summary.loc['CP','MX_human_2_wild_apes']/asv_16s_summary.loc['ALL','MX_human_2_wild_apes'])
print(asv_16s_summary.loc['CP','MX_4_hominids']/asv_16s_summary.loc['ALL','MX_4_hominids'])
asv_16s_summary

ALL                  925.0
CP                   245.0
Bacteroidales        317.0
Bacteroides           33.0
Prevotella            91.0
Parabacteroides        7.0
Bifidobacteriales      3.0
Bifidobacterium        3.0
dtype: float64
ALL                  946.0
CP                   520.0
Bacteroidales         88.0
Bacteroides           16.0
Prevotella            16.0
Parabacteroides        9.0
Bifidobacteriales      4.0
Bifidobacterium        4.0
dtype: float64
ALL                  331.0
CP                    39.0
Bacteroidales         32.0
Bacteroides            0.0
Prevotella             5.0
Parabacteroides        1.0
Bifidobacteriales      0.0
Bifidobacterium        0.0
dtype: float64
ALL                  615.0
CP                   481.0
Bacteroidales         56.0
Bacteroides           16.0
Prevotella            11.0
Parabacteroides        8.0
Bifidobacteriales      4.0
Bifidobacterium        4.0
dtype: float64
0.6951672862453532
0.8152866242038217
0.8783068783068783


Unnamed: 0,HR_human,HR_wild_bonobo,HR_wild_chimp,HR_wild_gorilla,MX_2_wild_apes,MX_3_wild_apes,MX_human_single_wild_ape,MX_human_2_wild_apes,MX_4_hominids,Unique_CP,MX_total
ALL,553.0,57.0,105.0,210.0,227.0,104.0,269.0,157.0,189.0,106.0,946.0
CP,227.0,0.0,10.0,8.0,20.0,19.0,187.0,128.0,166.0,106.0,520.0
Bacteroidales,211.0,22.0,50.0,34.0,24.0,8.0,50.0,4.0,2.0,29.0,88.0
Bacteroides,33.0,0.0,0.0,0.0,0.0,0.0,13.0,2.0,1.0,0.0,16.0
Prevotella,63.0,5.0,15.0,8.0,3.0,2.0,11.0,0.0,0.0,4.0,16.0
Parabacteroides,6.0,1.0,0.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0,9.0
Bifidobacteriales,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,4.0
Bifidobacterium,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,4.0
