In [278]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter

os.getcwd()



'/Volumes/AHN/captive_ape_microbiome/results/16s'

In [279]:
#gyrb inputs
# Switch the working directory to the gyrB results folder; every relative path
# used from here until the next os.chdir call resolves against it.
# NOTE(review): absolute, machine-specific path -- this only runs where that volume is mounted.
os.chdir('/Volumes/AHN/captive_ape_microbiome/results/gyrb/')
# Sample metadata (read as tab-separated; 'X.SampleID' and 'Description' columns are used below).
metadata_file = 'inputs/metadata_gyrb_amp_meta_passing_samples.txt'
# Per-ASV taxonomy (read as tab-separated; merged on the 'ASV' column below).
tax_table_file = 'inputs/ASVs_taxonomy.txt'
# ASV table: first column becomes the row index, remaining columns are samples.
asv_table_file = 'inputs/ASVs_filtered_counts.tsv'


In [280]:
#read in asv table, get sample names and number for each asv
def is_HR(sampleNames,sample_type_dict):
    """Classify a group of samples as host-restricted, mixed, or captive-only.

    Each sample name is mapped to its sample type through ``sample_type_dict``.
    The 'non_western_' / 'western_' prefixes are stripped, so both human groups
    collapse into a single 'human' type. Captive sample types are neutral: they
    never count towards host restriction and are reported separately.

    Parameters
    ----------
    sampleNames : iterable of str
        Sample identifiers; every one must be a key of ``sample_type_dict``.
    sample_type_dict : dict
        Maps sample identifier -> sample type string.

    Returns
    -------
    tuple
        ``(HR_sampleTypes, HR_sampleNum, HR_cat, HR_type,
        CP_pres, CP_sampleTypes, CP_sampleNum)`` where

        * HR_sampleTypes -- sorted list of distinct non-captive sample types
        * HR_sampleNum   -- number of samples with a non-captive type
        * HR_cat/HR_type -- 'CP'/'CP' when no non-captive type is present,
          'HR'/<the single type> when exactly one is present,
          'MX'/'MX' when several are present
        * CP_pres        -- True when at least one captive sample is present
        * CP_sampleTypes -- sorted list of distinct captive sample types
        * CP_sampleNum   -- number of samples with a captive type

    Raises
    ------
    KeyError
        If a sample name is missing from ``sample_type_dict``.
    """
    neutral_sampleTypes = {'captive_gorilla','captive_bonobo','captive_chimp','captive_orangutan'}
    sampleTypes = [sample_type_dict[name].replace('non_western_','').replace('western_','')
                   for name in sampleNames]

    # Sorted so the list order does not depend on set iteration order, which
    # varies between interpreter sessions for strings.
    HR_sampleTypes = sorted(set(sampleTypes) - neutral_sampleTypes)
    CP_sampleTypes = sorted(set(sampleTypes) & neutral_sampleTypes)
    CP_sampleNum = sum(1 for x in sampleTypes if x in neutral_sampleTypes)
    HR_sampleNum = len(sampleTypes) - CP_sampleNum
    CP_pres = len(CP_sampleTypes) > 0

    if len(HR_sampleTypes) == 0:
        HR_cat,HR_type = 'CP','CP'
    elif len(HR_sampleTypes) == 1: #identifies host-restricted clades
        HR_cat,HR_type = 'HR',HR_sampleTypes[0]
    else:
        HR_cat,HR_type = 'MX','MX'
    return(HR_sampleTypes,HR_sampleNum,HR_cat,HR_type,CP_pres,CP_sampleTypes,CP_sampleNum)


def asv_hr_table(asv_table_file,metadata_file,tax_table_file):
    """Build one row per ASV with its samples, host-restriction calls and taxonomy.

    asv_table_file -- tab-separated table; first column is used as the row index,
                      the remaining columns are samples.
    metadata_file  -- tab-separated table with 'X.SampleID' and 'Description' columns.
    tax_table_file -- tab-separated table with 'ASV', 'Phylum', 'Order', 'Family'
                      and 'Genus' columns.

    Returns a DataFrame with columns 'ASV', 'sampleNames', 'sampleNum', the seven
    values returned by is_HR, and the taxonomy columns (left-joined on 'ASV').
    """
    hr_columns = ['HR_sampleTypes','HR_sampleNum','HR_cat','HR_type',
                  'CP_pres','CP_sampleTypes','CP_sampleNum']
    tax_columns = ['ASV','Phylum','Order','Family','Genus']

    # For each ASV (row), list the samples (columns) holding a value above zero.
    counts = pd.read_csv(asv_table_file,sep='\t',index_col=0)
    present_in = counts.apply(lambda row: list(row.index[row>0]),axis=1)
    per_asv = present_in.reset_index()
    per_asv.columns = ['ASV','sampleNames']
    per_asv['sampleNum'] = per_asv['sampleNames'].apply(len)

    # Host-restriction call for every ASV, from the sample -> Description mapping.
    meta = pd.read_csv(metadata_file,sep='\t',index_col=None)
    type_by_sample = dict(zip(meta['X.SampleID'], meta['Description']))

    def classify(names):
        return pd.Series(is_HR(names,type_by_sample),index=hr_columns)

    hr_calls = per_asv['sampleNames'].apply(classify)
    with_hr = per_asv.merge(hr_calls,left_index=True, right_index=True)

    # Taxonomy; ASVs missing from the taxonomy table are kept (left join).
    taxonomy = pd.read_csv(tax_table_file,sep='\t',index_col=None)[tax_columns]
    return with_hr.merge(taxonomy,on='ASV',how='left')
    
# Per-ASV table (samples, host-restriction calls, taxonomy) for the currently configured input files.
asv_full = asv_hr_table(asv_table_file,metadata_file,tax_table_file)

def output_summary_table(asv_full,outfile):
    """Count rows per HR_type overall, for captive-containing rows, and per top genus.

    Parameters
    ----------
    asv_full : pandas.DataFrame
        Needs the columns 'HR_type', 'CP_pres' and 'Genus'.
    outfile : str
        Path the summary is written to, tab-separated.

    Returns
    -------
    pandas.DataFrame
        One row 'ALL' (all rows of ``asv_full``), one row 'CP' (rows where
        CP_pres is True) and one row for each of the (up to) ten most frequent
        'Genus' values; one column per HR_type value. Missing combinations are 0.
    """
    hr_type = asv_full['HR_type']
    all_asvs = pd.Series(hr_type.value_counts(),name='ALL')
    cp_asv = pd.Series(hr_type[asv_full['CP_pres']==True].value_counts(),name='CP')

    # value_counts sorts by frequency, so the first ten index entries are the top genera.
    top_genera = list(asv_full['Genus'].value_counts().index[:10])
    # One column per genus, indexed by HR_type; also well-defined when there are no genera.
    gen_asvs = pd.DataFrame({genus: hr_type[asv_full['Genus']==genus].value_counts()
                             for genus in top_genera})

    res = pd.concat([all_asvs,cp_asv,gen_asvs],axis=1).fillna(0).T
    res.to_csv(outfile,sep='\t')
    return(res)
    
# Write the ASV-level summary for asv_full; as the cell's last expression the returned table is also displayed.
output_summary_table(asv_full,'analyses/codiv_moeller_ASVs/gyrb_hr_asv_table.txt')

Unnamed: 0,human,wild_bonobo,wild_gorilla,wild_chimp,CP,MX
ALL,6183.0,353.0,258.0,207.0,94.0,53.0
CP,27.0,0.0,0.0,4.0,94.0,14.0
g__Prevotella,2450.0,203.0,46.0,114.0,55.0,3.0
g__Bacteroides,940.0,1.0,0.0,14.0,12.0,25.0
Unassigned,199.0,94.0,181.0,41.0,11.0,13.0
g__Alistipes,515.0,0.0,0.0,1.0,0.0,3.0
g__RC9,363.0,0.0,0.0,0.0,0.0,0.0
g__Parabacteroides,247.0,38.0,1.0,3.0,3.0,4.0
g__Bacteroides_A,264.0,0.0,2.0,17.0,3.0,0.0
g__Porphyromonas,243.0,1.0,0.0,2.0,4.0,0.0


In [281]:
# Same summary as for ASVs, but on the collapsed clade table.
clades_df=pd.read_csv('analyses/codiv_moeller_ASVs/full_tree_clades_collapsed_table.txt',sep='\t')
# output_summary_table groups on a 'Genus' column, so expose the clade taxonomy under that name.
clades_df=clades_df.rename(columns={"cladeTax": "Genus"})
output_summary_table(clades_df,'analyses/codiv_moeller_ASVs/gyrb_hr_clade_table.txt')
# Keep clades whose value exceeds 0.15 in at least one of the five host-group columns.
# NOTE(review): presumably these columns hold per-group proportions (hence "15perc") -- TODO confirm.
clades_df_15perc =  clades_df.loc[(clades_df.wild_bonobo > 0.15) | 
              (clades_df.wild_chimp > 0.15)  |
              (clades_df.wild_gorilla > 0.15)|
              (clades_df.western_human > 0.15)|
              (clades_df.non_western_human > 0.15)] 
# Only this last call is displayed as the cell output; the earlier call just writes its file.
output_summary_table(clades_df_15perc,'analyses/codiv_moeller_ASVs/gyrb_hr_clade_15perc_table.txt')

Unnamed: 0,human,wild_gorilla,wild_chimp,wild_bonobo,MX
ALL,22.0,19.0,15.0,14.0,13.0
CP,7.0,0.0,1.0,0.0,8.0
Bacteroidaceae_Prevotella,11.0,4.0,8.0,8.0,2.0
WCHB1-69_Unassigned,0.0,7.0,0.0,0.0,3.0
Bacteroidaceae_Unassigned,2.0,3.0,0.0,3.0,0.0
Bacteroidaceae_Bacteroides,1.0,0.0,1.0,0.0,4.0
Tannerellaceae_Parabacteroides,1.0,0.0,1.0,1.0,1.0
Unassigned_Unassigned,1.0,3.0,0.0,0.0,0.0
Rikenellaceae_Alistipes,2.0,0.0,0.0,0.0,1.0
Tannerellaceae_Unassigned,0.0,1.0,2.0,0.0,0.0


In [288]:
def multi_site_sp(cp_desc):
    """Label a set of captive groups as single/multi site and single/multi species.

    cp_desc -- list of strings shaped like '<status>_<species>_<site>'
               (e.g. 'captive_chimp_HOUZ'); the second '_'-separated field is
               taken as the species and the third as the site.

    Returns '<single_site|multi_site>_<single_sp|multi_sp>'. 'multi' means more
    than one distinct value; an empty list yields 'single_site_single_sp'.
    """
    fields = [desc.split('_') for desc in cp_desc]
    distinct_sites = {parts[2] for parts in fields}
    distinct_species = {parts[1] for parts in fields}

    if len(distinct_sites) > 1:
        site_label = 'multi_site'
    else:
        site_label = 'single_site'

    if len(distinct_species) > 1:
        species_label = 'multi_sp'
    else:
        species_label = 'single_sp'

    return '_'.join([site_label, species_label])
# Smoke call: two species at one site. The return value is discarded because more statements follow in this cell.
multi_site_sp(['captive_chimp_HOUZ','captive_gorilla_HOUZ'])

def captive_apes_asv_summary(asv_full,metadata_file):
    """Annotate captive-containing ASVs with the captive species/site groups they occur in.

    asv_full      -- table with at least 'CP_pres' and 'sampleNames' columns.
    metadata_file -- tab-separated file with 'X.SampleID', 'Description',
                     'site_code' and 'captivity_status' columns.

    Returns the rows of asv_full where CP_pres is True, merged on the index with
    one count column per captive '<Description>_<site_code>' group plus
    'CP_sp_loc' (groups with a count above zero), 'numEnclosure' (how many such
    groups) and 'multi_site_sp' (label from multi_site_sp).
    """
    asv_cp = asv_full[asv_full['CP_pres']==True]

    meta = pd.read_csv(metadata_file,sep='\t',index_col=None)
    meta['Description_site'] = meta['Description']+'_' +meta['site_code']
    group_by_sample = dict(zip(meta['X.SampleID'], meta['Description_site']))

    def count_groups(names):
        # number of samples of this ASV falling in each Description_site group
        return pd.Series([group_by_sample[name] for name in names]).value_counts()

    group_counts = asv_cp['sampleNames'].apply(count_groups).fillna(0)

    # restrict to groups whose metadata rows are flagged as captive
    is_captive = meta['captivity_status']=='captive'
    captive_groups = list(set(meta['Description_site'][is_captive]))
    group_counts = group_counts[captive_groups]

    group_counts['CP_sp_loc'] = group_counts.apply(lambda row: list(row.index[row>0]),axis=1)
    group_counts['numEnclosure'] = group_counts['CP_sp_loc'].apply(len)
    group_counts['multi_site_sp'] = group_counts['CP_sp_loc'].apply(multi_site_sp)

    return asv_cp.merge(group_counts,left_index=True,right_index=True)

# Captive-containing ASVs annotated with the captive groups they occur in.
asv_cp_table = captive_apes_asv_summary(asv_full,metadata_file)
# Number of ASVs for each (numEnclosure, multi_site_sp) combination.
asv_cp_table_summary = asv_cp_table.groupby(['numEnclosure','multi_site_sp']).size().reset_index(name="count")
print(asv_cp_table_summary)
asv_cp_table_summary.to_csv('analyses/codiv_moeller_ASVs/numEnclosures_table.txt',sep='\t',index=False)


   numEnclosure          multi_site_sp  count
0             1  single_site_single_sp     97
1             2    multi_site_multi_sp      8
2             2   multi_site_single_sp      2
3             2   single_site_multi_sp      9
4             3    multi_site_multi_sp     10
5             4    multi_site_multi_sp      5
6             5    multi_site_multi_sp      3
7             6    multi_site_multi_sp      4
8             7    multi_site_multi_sp      1


In [None]:
# ASV table with the ASV identifier as a regular column, joined to its taxonomy.
asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
# NOTE(review): the rename only has an effect when reset_index yields a column literally
# named "index" (i.e. the file's first column has no header) -- TODO confirm; the merge
# below needs an 'ASV' column.
asv_table = asv_table.reset_index().rename(columns={"index": "ASV"})
tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
asv_table_gen = asv_table.merge(tax_table[['ASV','Phylum','Family','Genus']],on='ASV',how='left')
# Drop ASVs whose Genus is the literal string 'Unassigned'.
asv_table_gen = asv_table_gen[asv_table_gen['Genus']!='Unassigned']

def shared_ASVs_common_gen(ind1,ind2,asv_table_gen):
    """Count ASVs of two samples within the genera that both samples contain.

    Parameters
    ----------
    ind1, ind2 : str
        Sample column names of ``asv_table_gen``.
    asv_table_gen : pandas.DataFrame
        One row per ASV with a 'Genus' column and one abundance column per
        sample; an ASV is present in a sample when its value is above zero.

    Returns
    -------
    pandas.Series
        Indexed by 'ind1', 'ind2', 'ind1_ASVs', 'ind2_ASVs', 'common_ASVs':
        the two sample names, the number of ASVs present in each sample and
        the number present in both, all restricted to genera found in both
        samples.

    Notes
    -----
    'ind1_ASVs' / 'ind2_ASVs' count ASVs (rows with a positive value). They
    used to be the sum of the abundance column, which equals the ASV count
    only for a 0/1 presence table and is a total abundance otherwise -- not
    comparable with 'common_ASVs', which is a row count.
    """
    in_ind1 = asv_table_gen[ind1]>0
    in_ind2 = asv_table_gen[ind2]>0
    genus = asv_table_gen['Genus']

    # genera represented by at least one ASV in each of the two samples
    common_gen = list(set(genus[in_ind1]) & set(genus[in_ind2]))
    in_common_gen = genus.isin(common_gen)

    ind1_ASVs = int((in_ind1 & in_common_gen).sum())
    ind2_ASVs = int((in_ind2 & in_common_gen).sum())
    common_ASVs = int((in_ind1 & in_ind2 & in_common_gen).sum())
    res = pd.Series([ind1,ind2,ind1_ASVs,ind2_ASVs,common_ASVs],
              index=['ind1','ind2','ind1_ASVs','ind2_ASVs','common_ASVs'])
    return(res)

# Example on one pair of samples; displayed as the cell output.
shared_ASVs_common_gen('cp.bon.COLZ.01.Bt','cp.bon.COLZ.02.Bt',asv_table_gen)



In [201]:
# Attach the shared-ASV counts to each pair of individuals (joined on 'ind1' and 'ind2').
# NOTE(review): pw_df and shared_ASV_all_genera are not defined in any cell visible in this
# part of the notebook -- this cell relies on state from elsewhere; verify it survives Restart & Run All.
pw_shared_ASV_all_genera = pw_df.merge(shared_ASV_all_genera,on=['ind1','ind2'])

def prop_shared(row):
    """Shared ASVs as a fraction of the smaller of the two per-individual ASV totals.

    row -- mapping with 'common_ASVs', 'ind1_ASVs' and 'ind2_ASVs' entries.
    Returns 0 when there are no shared ASVs, otherwise
    common_ASVs / min(ind1_ASVs, ind2_ASVs).
    """
    shared = row['common_ASVs']
    if not shared > 0:
        return(0)
    smaller_total = min(row['ind1_ASVs'],row['ind2_ASVs'])
    return(shared/smaller_total)
                                   
# Proportion of shared ASVs for every pair of individuals, across all genera.
pw_shared_ASV_all_genera['prop_shared_ASVs'] = pw_shared_ASV_all_genera.apply(
    prop_shared,axis=1)
# Fixes: `index=Faluse` raised NameError, and the previous target path was the clade
# 15% summary ('gyrb_hr_clade_15perc_table.txt') written by an earlier cell, which this
# call would have overwritten. The new name mirrors the Prevotella table's naming.
pw_shared_ASV_all_genera.to_csv('analyses/codiv_moeller_ASVs/gyrb_prop_shared_ASVs_all_genera_table.txt',sep='\t',index=False)

In [202]:
# Same pairwise shared-ASV analysis, restricted to ASVs assigned to g__Prevotella.
asv_table_Prevotella = asv_table_gen[asv_table_gen['Genus']=='g__Prevotella']
shared_ASV_Prevotella = pw_df.apply(lambda row: 
                    shared_ASVs_common_gen(row['ind1'],row['ind2'],asv_table_Prevotella),axis=1)
pw_shared_ASV_Prevotella = pw_df.merge(shared_ASV_Prevotella,on=['ind1','ind2'])
pw_shared_ASV_Prevotella['prop_shared_ASVs'] = pw_shared_ASV_Prevotella.apply(
    prop_shared,axis=1)
# Fix: the Prevotella file was previously written from pw_shared_ASV_all_genera,
# so it contained the all-genera results instead of the Prevotella ones.
pw_shared_ASV_Prevotella.to_csv('analyses/codiv_moeller_ASVs/gyrb_prop_shared_ASVs_Prevotella_table.txt',sep='\t',index=False)

# 16S

In [271]:
# Switch to the 16S results folder and rebind the three input-path globals, so the
# functions defined for gyrB now operate on the 16S files.
# NOTE(review): absolute, machine-specific path -- this only runs where that volume is mounted.
os.chdir('/Volumes/AHN/captive_ape_microbiome/results/16s/')
metadata_file = 'inputs/16S_metadata.txt'
tax_table_file = 'inputs/ASVs_taxonomy.txt'
asv_table_file = 'inputs/ASV_tab.txt'

In [272]:
#sanity check make sure all sample names overlap
asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
set(asv_table.columns) - set(metadata['X.SampleID'])


set()

In [273]:
# Per-ASV table and HR_type summary for the 16S data, then the same summary for two orders.
# Each call writes its own file; only the last call's table is displayed as the cell output.
asv_16s = asv_hr_table(asv_table_file,metadata_file,tax_table_file)
output_summary_table(asv_16s,'analyses/tables/16s_hr_all_asvs_table.txt')

# ASVs whose 'Order' is Bifidobacteriales.
asv_Bifidobacteriales = asv_16s[asv_16s['Order']=='Bifidobacteriales']
output_summary_table(asv_Bifidobacteriales,'analyses/tables/16s_hr_Bifidobacteriales_asvs_table.txt')

# ASVs whose 'Order' is Bacteroidales.
asv_Bacteroidales = asv_16s[asv_16s['Order']=='Bacteroidales']
output_summary_table(asv_Bacteroidales,'analyses/tables/16s_hr_Bacteroidales_asvs_table.txt')

Unnamed: 0,human,MX,wild_chimp,wild_gorilla,CP,wild_bonobo
ALL,211.0,88.0,50.0,34.0,29.0,22.0
CP,83.0,35.0,0.0,2.0,29.0,0.0
Prevotella_9,38.0,11.0,7.0,2.0,0.0,2.0
Bacteroides,33.0,16.0,0.0,0.0,0.0,0.0
Rikenellaceae_RC9_gut_group,16.0,13.0,7.0,8.0,0.0,0.0
Prevotella_7,4.0,4.0,7.0,6.0,0.0,2.0
Alloprevotella,9.0,6.0,4.0,1.0,0.0,2.0
Prevotella_2,20.0,1.0,0.0,0.0,0.0,0.0
Parabacteroides,6.0,9.0,0.0,0.0,0.0,1.0
Alistipes,12.0,3.0,0.0,0.0,0.0,0.0


NameError: name 'asv_16s' is not defined

In [276]:
def captive_apes_asv_summary(asv_full,metadata_file):
    """Annotate captive-containing ASVs with the captive species/site groups they occur in.

    NOTE(review): a function with this same name is defined in an earlier cell of this
    notebook; running this cell rebinds the name. The two differ in how captive groups
    are selected (here: any Description_site containing the substring 'captive'; earlier:
    metadata rows whose captivity_status equals 'captive') and in the print call below.

    asv_full      -- table with at least 'CP_pres' and 'sampleNames' columns.
    metadata_file -- tab-separated file with 'X.SampleID', 'Description' and
                     'site_code' columns.

    Returns the rows of asv_full where CP_pres is True, merged on the index with one
    count column per captive '<Description>_<site_code>' group plus 'CP_sp_loc'
    (groups with a count above zero), 'numEnclosure' (how many such groups) and
    'multi_site_sp' (label from multi_site_sp).
    """
    asv_cp = asv_full[asv_full['CP_pres']==True]
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    metadata['Description_site'] = metadata['Description']+'_' +metadata['site_code']
    sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

    # Per ASV: how many of its samples fall in each Description_site group (NaN -> 0).
    description_df = asv_cp['sampleNames'].apply(lambda l: pd.Series(
        [sample_type_site_dict[name] for name in l]).value_counts())
    description_df = description_df.fillna(0)
    # Distinct groups whose label contains 'captive'; list order comes from a set and is not fixed.
    captive_desc = list(set([x for x in metadata['Description_site'] if 'captive' in x]))
    print(captive_desc)
    # Raises KeyError if one of these groups never occurs among the captive-containing ASVs.
    description_df = description_df[captive_desc]
    description_df['CP_sp_loc'] = description_df.apply(lambda row: list(row.index[row>0]),axis=1)
    description_df['numEnclosure'] = description_df['CP_sp_loc'].apply(lambda x: len(x))
    description_df['multi_site_sp'] = description_df['CP_sp_loc'].apply(lambda x:  
                                                                        multi_site_sp(x))
    asv_cp_table = asv_cp.merge(description_df,left_index=True,right_index=True)
    return(asv_cp_table)


In [277]:
# Captive-containing 16S ASVs annotated with the captive groups they occur in.
# Rebinds asv_cp_table / asv_cp_table_summary, which earlier held the gyrB results.
asv_cp_table = captive_apes_asv_summary(asv_16s,metadata_file)
# Number of ASVs for each (numEnclosure, multi_site_sp) combination.
asv_cp_table_summary = asv_cp_table.groupby(['numEnclosure','multi_site_sp']).size().reset_index(name="count")
print(asv_cp_table_summary)
asv_cp_table_summary.to_csv('analyses/tables/numEnclosures_table.txt',sep='\t',index=False)

# Same summary restricted to ASVs whose 'Order' is Bifidobacteriales.
cp_Bifidobacteriales = asv_cp_table[asv_cp_table['Order']=='Bifidobacteriales']
cp_Bifidobacteriales_summary = cp_Bifidobacteriales.groupby(['numEnclosure','multi_site_sp']).size().reset_index(name="count")
print(cp_Bifidobacteriales_summary)
cp_Bifidobacteriales_summary.to_csv('analyses/tables/numEnclosures_Bifidobacteriales_table.txt',sep='\t',index=False)

# Same summary restricted to ASVs whose 'Order' is Bacteroidales.
cp_Bacteroidales = asv_cp_table[asv_cp_table['Order']=='Bacteroidales']
cp_Bacteroidales_summary = cp_Bacteroidales.groupby(['numEnclosure','multi_site_sp']).size().reset_index(name="count")
print(cp_Bacteroidales_summary)
cp_Bacteroidales_summary.to_csv('analyses/tables/numEnclosures_Bacteroidales_table.txt',sep='\t',index=False)

['captive_gorilla_COLZ', 'captive_orangutan_COMZ', 'captive_orangutan_HOUZ', 'captive_chimp_HOUZ', 'captive_orangutan_ATLZ', 'captive_gorilla_COMZ', 'captive_orangutan_COLZ', 'captive_gorilla_HOUZ', 'captive_chimp_PC', 'captive_bonobo_COLZ']
    numEnclosure          multi_site_sp  count
0              1  single_site_single_sp    168
1              2    multi_site_multi_sp     71
2              2   multi_site_single_sp     30
3              2   single_site_multi_sp     19
4              3    multi_site_multi_sp     92
5              3   multi_site_single_sp     10
6              3   single_site_multi_sp     11
7              4    multi_site_multi_sp     72
8              5    multi_site_multi_sp     72
9              6    multi_site_multi_sp     63
10             7    multi_site_multi_sp     56
11             8    multi_site_multi_sp     66
12             9    multi_site_multi_sp     66
13            10    multi_site_multi_sp     75
   numEnclosure        multi_site_sp  count
0        