In [106]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter

# Show the kernel's starting working directory; later cells call os.chdir() to
# move into the dataset-specific results folder before using relative paths.
os.getcwd()

'/Volumes/AHN/captive_ape_microbiome/results/16s'

In [218]:
#gyrb inputs
# NOTE(review): hard-coded absolute path to a mounted volume. Every relative path
# used by the gyrB cells (inputs/..., analyses/...) resolves against this directory.
os.chdir('/Volumes/AHN/captive_ape_microbiome/results/gyrb/')
# Tab-separated sample metadata; the code below reads the columns
# X.SampleID, Description, site_code and captivity_status.
metadata_file = 'inputs/metadata_gyrb_amp_meta_passing_samples.txt'
# Tab-separated taxonomy; the code below reads ASV, Phylum, Order, Family, Genus.
tax_table_file = 'inputs/ASVs_taxonomy.txt'
# Tab-separated count table: first column is the ASV id, remaining columns are samples.
asv_table_file = 'inputs/ASVs_filtered_counts.tsv'


In [219]:
#helpers: classify an ASV's host restriction (is_HR), build the per-ASV sample/taxonomy table (asv_hr_table), and write summary tables (output_summary_table)
def is_HR(sampleNames,sample_type_dict):
    """Classify a set of samples by the host types they were collected from.

    Each sample name is looked up in ``sample_type_dict``; the prefixes
    'non_western_' and 'western_' are stripped so both human groups count as
    'human'. The four captive ape labels are treated as neutral: they never
    count towards host restriction and are reported separately.

    Returns a 7-tuple:
        HR_sampleTypes -- distinct non-captive sample types (list, unordered)
        HR_sampleNum   -- number of samples with a non-captive type
        HR_cat         -- 'CP' (no non-captive type), 'HR' (exactly one) or 'MX' (several)
        HR_type        -- the single non-captive type when HR_cat is 'HR', else 'CP'/'MX'
        CP_pres        -- True when at least one captive sample is present
        CP_sampleTypes -- distinct captive sample types (list, unordered)
        CP_sampleNum   -- number of captive samples

    Raises KeyError if a sample name is missing from ``sample_type_dict``.
    """
    captive_labels = ['captive_gorilla','captive_bonobo','captive_chimp','captive_orangutan']

    # Look up every sample, then merge western / non-western humans into 'human'.
    raw_labels = [sample_type_dict[name] for name in sampleNames]
    labels = [label.replace('non_western_','').replace('western_','') for label in raw_labels]

    HR_sampleTypes = list(set(labels) - set(captive_labels))
    CP_sampleTypes = list(set(labels) & set(captive_labels))

    CP_sampleNum = sum(1 for label in labels if label in captive_labels)
    HR_sampleNum = len(labels) - CP_sampleNum
    CP_pres = bool(CP_sampleTypes)

    n_host_types = len(HR_sampleTypes)
    if n_host_types == 0:
        # only captive samples (or no samples at all)
        HR_cat,HR_type = 'CP','CP'
    elif n_host_types == 1:
        # host-restricted: a single non-captive sample type
        HR_cat,HR_type = 'HR',HR_sampleTypes[0]
    else:
        HR_cat,HR_type = 'MX','MX'
    return (HR_sampleTypes,HR_sampleNum,HR_cat,HR_type,CP_pres,CP_sampleTypes,CP_sampleNum)


def asv_hr_table(asv_table_file,metadata_file,tax_table_file):
    """Build one row per ASV with its samples, host-restriction call and taxonomy.

    asv_table_file -- tab-separated counts; first column is the ASV id, the
                      remaining columns are samples.
    metadata_file  -- tab-separated metadata with 'X.SampleID' and 'Description'.
    tax_table_file -- tab-separated taxonomy with 'ASV', 'Phylum', 'Order',
                      'Family' and 'Genus'.

    Returns a DataFrame with columns ASV, sampleNames, sampleNum, the seven
    fields produced by is_HR, then Phylum, Order, Family and Genus. ASVs with
    no taxonomy row keep NaN in the taxonomy columns (left join).
    """
    hr_fields = ['HR_sampleTypes','HR_sampleNum','HR_cat','HR_type',
                 'CP_pres','CP_sampleTypes','CP_sampleNum']
    tax_fields = ['ASV','Phylum','Order','Family','Genus']

    # For every ASV (row) collect the samples (columns) where its count is positive.
    counts = pd.read_csv(asv_table_file,sep='\t',index_col=0)
    present_in = counts.apply(lambda row: list(row.index[row>0]),axis=1)
    asv_df = present_in.reset_index()
    asv_df.columns = ['ASV','sampleNames']
    asv_df['sampleNum'] = asv_df['sampleNames'].map(len)

    #add host restriction info
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description']))

    def _restriction_fields(names):
        # One labelled Series per ASV, so apply() expands it into columns.
        return pd.Series(is_HR(names,sample_type_dict),index=hr_fields)

    hr = asv_df['sampleNames'].apply(_restriction_fields)
    asv_hr_df = asv_df.merge(hr,left_index=True,right_index=True)

    #add taxonomic info
    taxonomy = pd.read_csv(tax_table_file,sep='\t',index_col=None)[tax_fields]
    return asv_hr_df.merge(taxonomy,on='ASV',how='left')
    
def output_summary_table(asv_full,outfile):
    """Tabulate HR_type counts overall, for captive-present rows, and per top genus.

    asv_full -- DataFrame with at least 'HR_type', 'CP_pres' and 'Genus' columns
                (the notebook also passes clade tables whose 'cladeTax' column
                has been renamed to 'Genus').
    outfile  -- path the tab-separated summary is written to.

    Returns the summary DataFrame: one row 'ALL', one row 'CP', then one row per
    genus (the up-to-10 most frequent 'Genus' values); columns are the HR_type
    values, with missing combinations filled with 0.
    """
     #create summary table
    # HR_type counts over every row.
    all_asvs = pd.Series(asv_full['HR_type'].value_counts(),name='ALL')
    # HR_type counts restricted to rows flagged as present in captive samples.
    cp_asv = pd.Series(asv_full[asv_full['CP_pres']==True]['HR_type'].value_counts(),name='CP')
    # NOTE(review): num_Gen is computed but never used below.
    num_Gen = len(asv_full['Genus'].value_counts())
    # value_counts() sorts by descending frequency, so this keeps the 10 most common genera.
    top_Gen = pd.DataFrame(
            asv_full['Genus'].value_counts().index[:10],
            columns=['Genus'])
    # apply() returning a Series per genus expands to one row per genus;
    # the transpose puts genera in columns so they line up with all_asvs/cp_asv.
    gen_asvs = pd.DataFrame(top_Gen['Genus'].apply(
            lambda Genus: 
            pd.Series(asv_full[asv_full['Genus']==Genus]['HR_type'].value_counts(),name=Genus)
            )).T    
    gen_asvs.columns = list(top_Gen['Genus'])  
    # Combine column-wise on HR_type, zero-fill gaps, then flip so each summary is a row.
    res = pd.concat([all_asvs,cp_asv,gen_asvs],axis=1).fillna(0).T
    res.to_csv(outfile,sep='\t')
    return(res)


In [109]:
# Build the per-ASV gyrB table (samples, host restriction, taxonomy) ...
asv_gyrb = asv_hr_table(asv_table_file,metadata_file,tax_table_file)
# ... then write the HR_type summary to disk; the returned table is displayed as the cell output.
output_summary_table(asv_gyrb,'analyses/codiv_moeller_ASVs/gyrb_hr_asv_table.txt')

Unnamed: 0,human,wild_bonobo,wild_gorilla,wild_chimp,CP,MX
ALL,6183.0,353.0,258.0,207.0,94.0,53.0
CP,27.0,0.0,0.0,4.0,94.0,14.0
g__Prevotella,2450.0,203.0,46.0,114.0,55.0,3.0
g__Bacteroides,940.0,1.0,0.0,14.0,12.0,25.0
Unassigned,199.0,94.0,181.0,41.0,11.0,13.0
g__Alistipes,515.0,0.0,0.0,1.0,0.0,3.0
g__RC9,363.0,0.0,0.0,0.0,0.0,0.0
g__Parabacteroides,247.0,38.0,1.0,3.0,3.0,4.0
g__Bacteroides_A,264.0,0.0,2.0,17.0,3.0,0.0
g__Porphyromonas,243.0,1.0,0.0,2.0,4.0,0.0


In [110]:
# Load the collapsed-clade table and summarise it with the same helper used for ASVs.
clades_df=pd.read_csv('analyses/codiv_moeller_ASVs/full_tree_clades_collapsed_table.txt',sep='\t')
# output_summary_table groups on a column called 'Genus', so the clade taxonomy column is renamed.
clades_df=clades_df.rename(columns={"cladeTax": "Genus"})
# NOTE: only written to file here; the return value is not the last expression, so it is not displayed.
output_summary_table(clades_df,'analyses/codiv_moeller_ASVs/gyrb_hr_clade_table.txt')
# Keep clades where at least one wild/human host column exceeds 0.15.
# NOTE(review): 0.15 is a magic number; presumably these columns hold a per-host-group
# proportion of samples -- confirm against the script that writes the clade table.
clades_df_15perc =  clades_df.loc[(clades_df.wild_bonobo > 0.15) | 
              (clades_df.wild_chimp > 0.15)  |
              (clades_df.wild_gorilla > 0.15)|
              (clades_df.western_human > 0.15)|
              (clades_df.non_western_human > 0.15)] 
output_summary_table(clades_df_15perc,'analyses/codiv_moeller_ASVs/gyrb_hr_clade_15perc_table.txt')

Unnamed: 0,human,wild_gorilla,wild_chimp,wild_bonobo,MX
ALL,22.0,19.0,15.0,14.0,13.0
CP,7.0,0.0,1.0,0.0,8.0
Bacteroidaceae_Prevotella,11.0,4.0,8.0,8.0,2.0
WCHB1-69_Unassigned,0.0,7.0,0.0,0.0,3.0
Bacteroidaceae_Unassigned,2.0,3.0,0.0,3.0,0.0
Bacteroidaceae_Bacteroides,1.0,0.0,1.0,0.0,4.0
Unassigned_Unassigned,1.0,3.0,0.0,0.0,0.0
Tannerellaceae_Parabacteroides,1.0,0.0,1.0,1.0,1.0
Tannerellaceae_Unassigned,0.0,1.0,2.0,0.0,0.0
Rikenellaceae_Alistipes,2.0,0.0,0.0,0.0,1.0


In [111]:
def multi_site_sp(cp_desc):
    """Label a list of '<status>_<species>_<site>' strings by site and species spread.

    The second '_'-separated field is read as the species and the third as the
    site code. Returns '<single|multi>_site_<single|multi>_sp'; an empty list
    yields 'single_site_single_sp'.
    """
    site_codes = {desc.split('_')[2] for desc in cp_desc}
    host_species = {desc.split('_')[1] for desc in cp_desc}
    site_label = 'single_site' if len(site_codes) <= 1 else 'multi_site'
    species_label = 'single_sp' if len(host_species) <= 1 else 'multi_sp'
    return '{}_{}'.format(site_label, species_label)
# Inline check of multi_site_sp: two species at one site. A bare call in the middle
# of a cell has its result discarded, so assert the expected label instead.
assert multi_site_sp(['captive_chimp_HOUZ','captive_gorilla_HOUZ']) == 'single_site_multi_sp'

def captive_apes_asv_summary(asv_full,metadata_file):
    """Summarise, for ASVs found in captive apes, which captive enclosures carry them.

    An "enclosure" here is a Description_site label: the metadata 'Description'
    joined to 'site_code' with '_' (e.g. 'captive_chimp_HOUZ').

    asv_full      -- per-ASV table with 'CP_pres' and 'sampleNames' columns
                     (as returned by asv_hr_table).
    metadata_file -- tab-separated metadata with 'X.SampleID', 'Description',
                     'site_code' and 'captivity_status'.

    Returns the captive-present rows of asv_full joined (on the row index) to:
      * one column per captive Description_site holding the number of samples
        of that enclosure in which the ASV occurs,
      * 'CP_sp_loc'     -- list of enclosures with a positive count,
      * 'numEnclosure'  -- length of that list,
      * 'multi_site_sp' -- label from multi_site_sp().

    Raises KeyError if a sample in 'sampleNames' is missing from the metadata.
    """
    asv_cp = asv_full[asv_full['CP_pres']==True]
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    metadata['Description_site'] = metadata['Description']+'_' +metadata['site_code']
    sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

    # Per ASV: how many of its samples fall in each Description_site label.
    description_df = asv_cp['sampleNames'].apply(lambda l: pd.Series(
        [sample_type_site_dict[name] for name in l]).value_counts())
    description_df = description_df.fillna(0)

    # Captive enclosures in metadata order (deterministic, unlike list(set(...))).
    is_captive = metadata['captivity_status']=='captive'
    capt_desc = list(metadata.loc[is_captive,'Description_site'].unique())
    # reindex keeps only captive columns and adds an all-zero column for any
    # enclosure in which none of these ASVs occurs; plain [] selection would
    # raise KeyError for such a missing label.
    description_df = description_df.reindex(columns=capt_desc,fill_value=0)

    description_df['CP_sp_loc'] = description_df.apply(lambda row: list(row.index[row>0]),axis=1)
    description_df['numEnclosure'] = description_df['CP_sp_loc'].apply(lambda x: len(x))
    description_df['multi_site_sp'] = description_df['CP_sp_loc'].apply(lambda x:
                                                                        multi_site_sp(x))
    asv_cp_table = asv_cp.merge(description_df,left_index=True,right_index=True)
    return(asv_cp_table)

# gyrB ASVs present in captive apes, annotated with the enclosures they occur in.
asv_cp_table = captive_apes_asv_summary(asv_gyrb,metadata_file)
# Count ASVs for each (number of enclosures, site/species spread) combination.
asv_cp_table_summary = asv_cp_table.groupby(['numEnclosure','multi_site_sp']).size().reset_index(name="count")
print(asv_cp_table_summary)
asv_cp_table_summary.to_csv('analyses/codiv_moeller_ASVs/numEnclosures_table.txt',sep='\t',index=False)


   numEnclosure          multi_site_sp  count
0             1  single_site_single_sp     97
1             2    multi_site_multi_sp      8
2             2   multi_site_single_sp      2
3             2   single_site_multi_sp      9
4             3    multi_site_multi_sp     10
5             4    multi_site_multi_sp      5
6             5    multi_site_multi_sp      3
7             6    multi_site_multi_sp      4
8             7    multi_site_multi_sp      1


In [112]:
# Count table with taxonomy attached, restricted to ASVs with an assigned genus.
# NOTE(review): this repeats what add_tax_to_asv_table (defined in a later cell) does,
# except that 'Order' is not carried over here.
asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
# reset_index() names the former index column 'index' only when the index has no name;
# the rename to 'ASV' relies on that.
asv_table = asv_table.reset_index().rename(columns={"index": "ASV"})
tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
asv_table_gen = asv_table.merge(tax_table[['ASV','Phylum','Family','Genus']],on='ASV',how='left')
# Drop ASVs whose genus is the literal string 'Unassigned' (NaN genera are kept by this filter).
asv_table_gen = asv_table_gen[asv_table_gen['Genus']!='Unassigned']

def shared_ASVs_common_gen(ind1,ind2,asv_table_gen):
    """Count ASVs two samples have in total and in common, within genera both carry.

    ind1, ind2    -- sample column names in asv_table_gen.
    asv_table_gen -- count table with 'Phylum', 'Family', 'Genus', 'ASV' and one
                     count column per sample.

    Only genera with at least one ASV in each of the two samples are considered.
    Returns a Series with: ind1, ind2, common_gen (list of shared genera,
    unordered), ind1_ASVs, ind2_ASVs (ASVs of those genera present in each
    sample) and common_ASVs (present in both).
    """
    result_fields = ['ind1','ind2','common_gen','ind1_ASVs','ind2_ASVs','common_ASVs']
    keep_cols = ['Phylum','Family','Genus','ASV',ind1,ind2]

    # Rows (ASVs) seen in at least one of the two samples.
    in_either = (asv_table_gen[ind1]>0)|(asv_table_gen[ind2]>0)
    pw = asv_table_gen[keep_cols][in_either]

    # Genera represented in both samples.
    genera_ind1 = set(pw[pw[ind1]>0]['Genus'])
    genera_ind2 = set(pw[pw[ind2]>0]['Genus'])
    common_gen = list(genera_ind1 & genera_ind2)

    pw = pw[pw['Genus'].isin(common_gen)]
    has_ind1 = pw[ind1]>0
    has_ind2 = pw[ind2]>0
    result_values = [
        ind1,
        ind2,
        common_gen,
        int(has_ind1.sum()),
        int(has_ind2.sum()),
        int((has_ind1&has_ind2).sum()),
    ]
    return pd.Series(result_values,index=result_fields)

# Exploratory call on one pair of captive bonobo samples from site COLZ.
# NOTE(review): this is not the last expression of the cell, so its result is discarded.
shared_ASVs_common_gen('cp.bon.COLZ.01.Bt','cp.bon.COLZ.02.Bt',asv_table_gen)

def prop_shared(row):
    """Return shared ASVs as a fraction of the smaller of the two samples' ASV counts.

    row -- mapping/Series with 'common_ASVs', 'ind1_ASVs' and 'ind2_ASVs'.
    Returns 0 when the samples share no ASVs.
    """
    n_common = row['common_ASVs']
    if not n_common > 0:
        return 0
    smaller_total = min(row['ind1_ASVs'],row['ind2_ASVs'])
    return n_common/smaller_total

In [113]:
from itertools import combinations

def get_sp_site_comp(ind1,ind2):
    """Compare two '<status>_<species>_<site>' labels by species and by site.

    Returns '<same|diff>_spec_<same|diff>_site', based on the second and third
    '_'-separated fields of each label.
    """
    fields1 = ind1.split('_')
    fields2 = ind2.split('_')
    species_tag = 'same_spec' if fields1[1] == fields2[1] else 'diff_spec'
    site_tag = 'same_site' if fields1[2] == fields2[2] else 'diff_site'
    return '_'.join([species_tag, site_tag])
# Inline check of get_sp_site_comp: different species at the same site. A bare call in
# the middle of a cell has its result discarded, so assert the expected label instead.
assert get_sp_site_comp('captive_bonobo_COLZ','captive_chimp_COLZ') == 'diff_spec_same_site'


def pw_metadata_capt_samples(metadata_file):
    """List every unordered pair of captive samples with their species/site labels.

    metadata_file -- tab-separated metadata with 'X.SampleID', 'Description',
                     'site_code' and 'captivity_status'.

    Returns a DataFrame with one row per pair and the columns ind1, ind2,
    desc_site_ind1, desc_site_ind2, full_desc_comp ('<desc1>_vs_<desc2>') and
    sp_site_comp (label from get_sp_site_comp).
    """
    meta = pd.read_csv(metadata_file,sep='\t',index_col=None)
    meta['Description_site'] = meta['Description']+'_' +meta['site_code']
    site_lookup = dict(zip(meta['X.SampleID'], meta['Description_site']))

    # All 2-combinations of samples whose captivity_status is 'captive'.
    is_captive = meta['captivity_status']=='captive'
    captive_ids = meta['X.SampleID'][is_captive]
    pairs = pd.DataFrame(combinations(captive_ids, 2),columns=['ind1','ind2'])

    for member in ('ind1','ind2'):
        pairs['desc_site_'+member] = pairs[member].apply(lambda sample_id: site_lookup[sample_id])
    pairs['full_desc_comp'] = pairs['desc_site_ind1'] + '_vs_' + pairs['desc_site_ind2']

    def _compare(row):
        return get_sp_site_comp(row['desc_site_ind1'],row['desc_site_ind2'])

    pairs['sp_site_comp'] = pairs.apply(_compare,axis=1)
    return pairs
# Pairwise table of captive samples for the currently configured metadata file; preview it.
pw_df = pw_metadata_capt_samples(metadata_file)
print(pw_df.head())

def add_tax_to_asv_table(asv_table_file,tax_table_file):
    """Read the ASV count table and attach Phylum/Order/Family/Genus to each ASV.

    asv_table_file -- tab-separated counts; the first column (unnamed) holds ASV ids.
    tax_table_file -- tab-separated taxonomy with an 'ASV' column.

    Returns the count table with an 'ASV' column followed by the sample columns
    and the four taxonomy columns; ASVs without taxonomy get NaN (left join).
    """
    tax_fields = ['ASV','Phylum','Order','Family','Genus']
    counts = pd.read_csv(asv_table_file,sep='\t',index_col=0).reset_index()
    # The unnamed index comes back from reset_index() as a column called 'index'.
    counts = counts.rename(columns={"index": "ASV"})
    taxonomy = pd.read_csv(tax_table_file,sep='\t',index_col=None)
    return counts.merge(taxonomy[tax_fields],on='ASV',how='left')
    
# Count table with taxonomy for the currently configured input files; print the number of ASVs (rows).
asv_table_tax = add_tax_to_asv_table(asv_table_file,tax_table_file)
print(len(asv_table_tax))

                ind1               ind2       desc_site_ind1  \
0  cp.bon.COLZ.01.Bt  cp.bon.COLZ.02.Bt  captive_bonobo_COLZ   
1  cp.bon.COLZ.01.Bt  cp.bon.COLZ.03.Bt  captive_bonobo_COLZ   
2  cp.bon.COLZ.01.Bt  cp.bon.COLZ.04.Bt  captive_bonobo_COLZ   
3  cp.bon.COLZ.01.Bt  cp.bon.COLZ.05.Bt  captive_bonobo_COLZ   
4  cp.bon.COLZ.01.Bt  cp.bon.COLZ.06.Bt  captive_bonobo_COLZ   

        desc_site_ind2                              full_desc_comp  \
0  captive_bonobo_COLZ  captive_bonobo_COLZ_vs_captive_bonobo_COLZ   
1  captive_bonobo_COLZ  captive_bonobo_COLZ_vs_captive_bonobo_COLZ   
2  captive_bonobo_COLZ  captive_bonobo_COLZ_vs_captive_bonobo_COLZ   
3  captive_bonobo_COLZ  captive_bonobo_COLZ_vs_captive_bonobo_COLZ   
4  captive_bonobo_COLZ  captive_bonobo_COLZ_vs_captive_bonobo_COLZ   

          sp_site_comp  
0  same_spec_same_site  
1  same_spec_same_site  
2  same_spec_same_site  
3  same_spec_same_site  
4  same_spec_same_site  
7148


In [114]:
# gyrB: proportion of shared ASVs for every pair of captive samples, across all assigned genera.
pw_df = pw_metadata_capt_samples(metadata_file)
asv_table_tax = add_tax_to_asv_table(asv_table_file,tax_table_file)

# Drop ASVs whose genus is the literal string 'Unassigned'.
asv_table_tax = asv_table_tax[asv_table_tax['Genus']!='Unassigned']
# One Series of counts per sample pair; apply(axis=1) stacks them into a DataFrame.
shared_ASV_all_genera = pw_df.apply(lambda row: 
                    shared_ASVs_common_gen(row['ind1'],row['ind2'],asv_table_tax),axis=1)
# Join the counts back onto the pair metadata by the (ind1, ind2) key.
pw_shared_ASV_all_genera = pw_df.merge(shared_ASV_all_genera,on=['ind1','ind2'])                       
pw_shared_ASV_all_genera['prop_shared_ASVs'] = pw_shared_ASV_all_genera.apply(
    lambda row: prop_shared(row),axis=1)
pw_shared_ASV_all_genera.to_csv(
    'analyses/codiv_moeller_ASVs/gyrb_prop_shared_ASVs_all_gen_table.txt',
    sep='\t',index=False)


In [115]:
# gyrB: proportion of shared ASVs per pair of captive samples, restricted to g__Prevotella.
asv_table_Prevotella = asv_table_gen[asv_table_gen['Genus']=='g__Prevotella']
shared_ASV_Prevotella = pw_df.apply(lambda row:
                    shared_ASVs_common_gen(row['ind1'],row['ind2'],asv_table_Prevotella),axis=1)
pw_shared_ASV_Prevotella = pw_df.merge(shared_ASV_Prevotella,on=['ind1','ind2'])
pw_shared_ASV_Prevotella['prop_shared_ASVs'] = pw_shared_ASV_Prevotella.apply(
    lambda row: prop_shared(row),axis=1)
# Write the Prevotella result. The all-genera table was previously written to this
# path by mistake, which made the Prevotella file a copy of the all-genera file.
pw_shared_ASV_Prevotella.to_csv('analyses/codiv_moeller_ASVs/gyrb_prop_shared_ASVs_Prevotella_table.txt',sep='\t',index=False)

# 16S

In [216]:
# 16S inputs.
# NOTE(review): hard-coded absolute path. This cell also rebinds metadata_file,
# tax_table_file and asv_table_file, so every cell after this point reads the 16S
# files, while objects already built from the gyrB files keep their gyrB contents
# until recomputed.
os.chdir('/Volumes/AHN/captive_ape_microbiome/results/16s/')
metadata_file = 'inputs/16S_metadata.txt'
tax_table_file = 'inputs/ASVs_taxonomy.txt'
asv_table_file = 'inputs/ASV_tab.txt'

In [217]:
#sanity check make sure all sample names overlap
asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# mismatches are printed explicitly; both lists should be empty.
samples_missing_metadata = set(asv_table.columns) - set(metadata['X.SampleID'])
asvs_missing_taxonomy = set(asv_table.index) - set(tax_table['ASV'])
print('samples without metadata:', sorted(samples_missing_metadata, key=str))
print('ASVs without taxonomy:', sorted(asvs_missing_taxonomy, key=str))
# (ASVs, samples). NOTE(review): the shape was printed twice before; the second
# print may have been meant for another table -- confirm.
print(asv_table.shape)

(1977, 728)
(1977, 728)


In [129]:
# Per-ASV 16S table, then HR_type summaries for all ASVs and for two orders.
# Only the last call's return value is displayed; the other two are only written to file.
asv_16s = asv_hr_table(asv_table_file,metadata_file,tax_table_file)
output_summary_table(asv_16s,'analyses/tables/16s_hr_all_asvs_table.txt')

asv_Bifidobacteriales = asv_16s[asv_16s['Order']=='Bifidobacteriales']
output_summary_table(asv_Bifidobacteriales,'analyses/tables/16s_hr_Bifidobacteriales_asvs_table.txt')

asv_Bacteroidales = asv_16s[asv_16s['Order']=='Bacteroidales']
output_summary_table(asv_Bacteroidales,'analyses/tables/16s_hr_Bacteroidales_asvs_table.txt')

Unnamed: 0,human,MX,wild_chimp,wild_gorilla,CP,wild_bonobo
ALL,211.0,88.0,50.0,34.0,29.0,22.0
CP,83.0,35.0,0.0,2.0,29.0,0.0
Prevotella_9,38.0,11.0,7.0,2.0,0.0,2.0
Bacteroides,33.0,16.0,0.0,0.0,0.0,0.0
Rikenellaceae_RC9_gut_group,16.0,13.0,7.0,8.0,0.0,0.0
Prevotella_7,4.0,4.0,7.0,6.0,0.0,2.0
Alloprevotella,9.0,6.0,4.0,1.0,0.0,2.0
Prevotella_2,20.0,1.0,0.0,0.0,0.0,0.0
Parabacteroides,6.0,9.0,0.0,0.0,0.0,1.0
Alistipes,12.0,3.0,0.0,0.0,0.0,0.0


In [130]:
# Preview the first rows of the per-ASV 16S table built by asv_hr_table.
asv_16s.head()


Unnamed: 0,ASV,sampleNames,sampleNum,HR_sampleTypes,HR_sampleNum,HR_cat,HR_type,CP_pres,CP_sampleTypes,CP_sampleNum,Phylum,Order,Family,Genus
0,ASV_617,"[wd.gor.BI.a172.16s, wd.gor.BI.a173.16s, wd.go...",108,[wild_gorilla],108,HR,wild_gorilla,False,[],0,Firmicutes,Clostridiales,Lachnospiraceae,Lachnospiraceae_UCG-009
1,ASV_982,"[wd.gor.BI.a175.16s, wd.gor.BI.a177.16s, wd.go...",64,[wild_gorilla],64,HR,wild_gorilla,False,[],0,Firmicutes,Clostridiales,Lachnospiraceae,
2,ASV_453,"[wd.chi.GM.10.16s, wd.chi.GM.100.16s, wd.chi.G...",86,"[wild_chimp, wild_gorilla]",86,MX,MX,False,[],0,Firmicutes,Clostridiales,Lachnospiraceae,
3,ASV_932,"[wd.bon.ML.a207.16s, wd.bon.ML.a208.16s, wd.bo...",48,[wild_bonobo],48,HR,wild_bonobo,False,[],0,Firmicutes,Clostridiales,Lachnospiraceae,
4,ASV_1398,"[wd.bon.LA.a187.16s, wd.bon.LA.a189.16s, wd.bo...",18,"[wild_bonobo, wild_gorilla]",18,MX,MX,False,[],0,Firmicutes,Clostridiales,Lachnospiraceae,Lachnospiraceae_FCS020_group


In [132]:
def _enclosure_counts(cp_table):
    """Count ASVs for every (numEnclosure, multi_site_sp) combination in cp_table."""
    grouped = cp_table.groupby(['numEnclosure','multi_site_sp'])
    return grouped.size().reset_index(name="count")

# All 16S ASVs present in captive apes.
asv_cp_table = captive_apes_asv_summary(asv_16s,metadata_file)
asv_cp_table_summary = _enclosure_counts(asv_cp_table)
print(asv_cp_table_summary)
asv_cp_table_summary.to_csv('analyses/tables/numEnclosures_table.txt',sep='\t',index=False)

# Order Bifidobacteriales only.
is_bifidobacteriales = asv_cp_table['Order']=='Bifidobacteriales'
cp_Bifidobacteriales = asv_cp_table[is_bifidobacteriales]
cp_Bifidobacteriales_summary = _enclosure_counts(cp_Bifidobacteriales)
print(cp_Bifidobacteriales_summary)
cp_Bifidobacteriales_summary.to_csv('analyses/tables/numEnclosures_Bifidobacteriales_table.txt',sep='\t',index=False)

# Order Bacteroidales only.
is_bacteroidales = asv_cp_table['Order']=='Bacteroidales'
cp_Bacteroidales = asv_cp_table[is_bacteroidales]
cp_Bacteroidales_summary = _enclosure_counts(cp_Bacteroidales)
print(cp_Bacteroidales_summary)
cp_Bacteroidales_summary.to_csv('analyses/tables/numEnclosures_Bacteroidales_table.txt',sep='\t',index=False)

# Prevotella genera (summary is written to file but, as before, not printed).
Prevotella_genera = ['Prevotella','Prevotella_2','Prevotella_7','Prevotella_1', 'Prevotella_9']
is_prevotella = asv_cp_table['Genus'].isin(Prevotella_genera)
cp_Prevotella = asv_cp_table[is_prevotella]
cp_Prevotella_summary = _enclosure_counts(cp_Prevotella)
cp_Prevotella_summary.to_csv('analyses/tables/numEnclosures_Prevotella_table.txt',sep='\t',index=False)


    numEnclosure          multi_site_sp  count
0              1  single_site_single_sp    168
1              2    multi_site_multi_sp     71
2              2   multi_site_single_sp     30
3              2   single_site_multi_sp     19
4              3    multi_site_multi_sp     92
5              3   multi_site_single_sp     10
6              3   single_site_multi_sp     11
7              4    multi_site_multi_sp     72
8              5    multi_site_multi_sp     72
9              6    multi_site_multi_sp     63
10             7    multi_site_multi_sp     56
11             8    multi_site_multi_sp     66
12             9    multi_site_multi_sp     66
13            10    multi_site_multi_sp     75
   numEnclosure        multi_site_sp  count
0             2  multi_site_multi_sp      2
    numEnclosure          multi_site_sp  count
0              1  single_site_single_sp     30
1              2    multi_site_multi_sp     11
2              2   multi_site_single_sp      5
3              2   

In [215]:
# Preview the captive-ASV table: asv_hr_table columns plus one count column per
# captive enclosure, CP_sp_loc, numEnclosure and multi_site_sp.
asv_cp_table.head()

Unnamed: 0,ASV,sampleNames,sampleNum,HR_sampleTypes,HR_sampleNum,HR_cat,HR_type,CP_pres,CP_sampleTypes,CP_sampleNum,...,captive_orangutan_ATLZ,captive_chimp_PC,captive_chimp_HOUZ,captive_orangutan_COMZ,captive_gorilla_HOUZ,captive_gorilla_COLZ,captive_orangutan_HOUZ,CP_sp_loc,numEnclosure,multi_site_sp
5,ASV_224,"[cp.ora.ATLZ.114.16s, cp.ora.ATLZ.115.16s, cp....",293,"[wild_bonobo, wild_gorilla, human, wild_chimp]",238,MX,MX,True,"[captive_chimp, captive_bonobo, captive_orangu...",55,...,6.0,16.0,8.0,3.0,7.0,8.0,2.0,"[captive_bonobo_COLZ, captive_orangutan_ATLZ, ...",8,multi_site_multi_sp
6,ASV_840,"[cp.ora.COMZ.MB31.16s, cp.ora.COMZ.MB7.16s, cp...",37,[human],34,HR,human,True,[captive_orangutan],3,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,[captive_orangutan_COMZ],1,single_site_single_sp
7,ASV_1451,"[cp.ora.ATLZ.117.16s, cp.ora.ATLZ.118.16s, cp....",34,"[wild_gorilla, human, wild_chimp]",25,MX,MX,True,"[captive_chimp, captive_orangutan, captive_gor...",9,...,4.0,2.0,1.0,0.0,1.0,1.0,0.0,"[captive_orangutan_ATLZ, captive_chimp_PC, cap...",5,multi_site_multi_sp
8,ASV_546,"[cp.ora.ATLZ.114.16s, cp.ora.ATLZ.115.16s, cp....",102,"[human, wild_chimp]",48,MX,MX,True,"[captive_chimp, captive_bonobo, captive_orangu...",54,...,8.0,14.0,6.0,3.0,5.0,4.0,6.0,"[captive_gorilla_COMZ, captive_bonobo_COLZ, ca...",9,multi_site_multi_sp
9,ASV_1846,"[cp.ora.ATLZ.116.16s, cp.ora.ATLZ.123.16s, wd....",23,"[human, wild_chimp]",20,MX,MX,True,[captive_orangutan],3,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,"[captive_orangutan_COLZ, captive_orangutan_ATLZ]",2,multi_site_single_sp


In [121]:
def output_proportion_shared_ASVs(pw_df,asv_table_tax):
    """Add shared-ASV counts and the shared proportion to a table of sample pairs.

    pw_df         -- one row per sample pair with 'ind1' and 'ind2' columns
                     (as returned by pw_metadata_capt_samples).
    asv_table_tax -- count table with taxonomy, passed to shared_ASVs_common_gen.

    Returns pw_df joined on (ind1, ind2) with common_gen, ind1_ASVs, ind2_ASVs,
    common_ASVs and a 'prop_shared_ASVs' column computed by prop_shared.
    """
    def _pair_counts(row):
        return shared_ASVs_common_gen(row['ind1'],row['ind2'],asv_table_tax)

    pair_counts = pw_df.apply(_pair_counts,axis=1)
    pairs_with_counts = pw_df.merge(pair_counts,on=['ind1','ind2'])
    pairs_with_counts['prop_shared_ASVs'] = pairs_with_counts.apply(prop_shared,axis=1)
    return pairs_with_counts

In [122]:
# 16S: rebuild the pair table and the taxonomy-annotated counts from the 16S inputs.
pw_df = pw_metadata_capt_samples(metadata_file)
asv_table_tax = add_tax_to_asv_table(asv_table_file,tax_table_file)
# Keep only ASVs with a genus; here unassigned genera are NaN
# (the gyrB cells filter on the string 'Unassigned' instead).
asv_table_tax = asv_table_tax[~asv_table_tax['Genus'].isna()]


pw_shared_ASV_all_genera = output_proportion_shared_ASVs(pw_df,asv_table_tax)
pw_shared_ASV_all_genera.to_csv(
    'analyses/tables/16s_prop_shared_ASVs_all_gen_table.txt',
    sep='\t',index=False)

In [124]:
# 16S: same pairwise comparison restricted to ASVs of the order Bacteroidales.
asv_table_Bacteroidales = asv_table_tax[asv_table_tax['Order']=='Bacteroidales']
pw_shared_ASV_Bacteroidales = output_proportion_shared_ASVs(pw_df,asv_table_Bacteroidales)
pw_shared_ASV_Bacteroidales.to_csv(
    'analyses/tables/16s_prop_shared_ASVs_Bacteroidales_table.txt',
    sep='\t',index=False)

In [126]:
# List the genera found within Bacteroidales, to pick out the Prevotella labels.
print(set(asv_table_Bacteroidales['Genus']))
# NOTE(review): the same list is also defined in the captive-enclosure summary cell;
# keep the two in sync.
Prevotella_genera = ['Prevotella','Prevotella_2','Prevotella_7','Prevotella_1', 'Prevotella_9']
asv_table_Prevotella = asv_table_tax[asv_table_tax['Genus'].isin(Prevotella_genera)]
pw_shared_ASV_Prevotella = output_proportion_shared_ASVs(pw_df,asv_table_Prevotella)
pw_shared_ASV_Prevotella.to_csv(
    'analyses/tables/16s_prop_shared_ASVs_Prevotella_table.txt',
    sep='\t',index=False)



{'Prevotella', 'Alloprevotella', 'Prevotellaceae_UCG-003', 'Prevotella_2', 'dgA-11_gut_group', 'Bacteroides', 'Barnesiella', 'Butyricimonas', 'Prevotellaceae_UCG-001', 'Prevotella_7', 'Rikenellaceae_RC9_gut_group', 'Prevotellaceae_NK3B31_group', 'Prevotellaceae_UCG-004', 'Alistipes', 'Prevotellaceae_Ga6A1_group', 'Coprobacter', 'CAG-873', 'Parabacteroides', 'Prevotella_1', 'Prevotella_9', 'Paraprevotella', 'Odoribacter'}


In [212]:
def prop_shared_by_order(order):
    """Pairwise proportion of shared ASVs among captive samples for one taxonomic order.

    order -- value of the 'Order' column to keep.

    Reads the notebook-level variables ``asv_table_tax`` (counts with taxonomy)
    and ``pw_df`` (sample pairs) at call time. Returns the table from
    output_proportion_shared_ASVs for that order, restricted to pairs that share
    at least one genus, with an extra 'order' column set to ``order``.
    """
    order_counts = asv_table_tax[asv_table_tax['Order']==order]
    pw_order = output_proportion_shared_ASVs(pw_df,order_counts)
    #remove comparisons where two ind don't share gen
    shares_genus = pw_order['common_gen'].map(len) > 0
    # .copy() so the new column is set on an independent frame rather than on a
    # slice of pw_order (avoids SettingWithCopyWarning).
    pw_order = pw_order[shares_genus].copy()
    pw_order['order'] = order
    return pw_order

# NOTE(review): the variable is spelled 'Clostridales' (the order is 'Clostridiales');
# the concat cell that builds the combined table uses this same spelling.
Clostridales = prop_shared_by_order('Clostridiales')

In [200]:
# Ten most frequent 'Order' values among ASVs present in captive samples; these are
# the orders passed one by one to prop_shared_by_order.
print(asv_16s[asv_16s['CP_pres']==True]['Order'].value_counts()[:10].index)


Index(['Clostridiales', 'Bacteroidales', 'Mollicutes_RF39',
       'Erysipelotrichales', 'Spirochaetales', 'Gastranaerophilales',
       'Selenomonadales', 'Betaproteobacteriales', 'Coriobacteriales',
       'Lactobacillales'],
      dtype='object')


2926

In [205]:
# Pairwise shared-ASV tables for the remaining top orders among captive-present ASVs
# ('Clostridiales' is computed in the cell that defines prop_shared_by_order).
Bacteroidales = prop_shared_by_order('Bacteroidales')
Mollicutes_RF39 = prop_shared_by_order('Mollicutes_RF39')
Erysipelotrichales = prop_shared_by_order('Erysipelotrichales')
Spirochaetales = prop_shared_by_order('Spirochaetales')
Gastranaerophilales = prop_shared_by_order('Gastranaerophilales')
Selenomonadales = prop_shared_by_order('Selenomonadales')
# Previously called with 'Selenomonadales', which duplicated that order's rows
# and left Betaproteobacteriales out of the combined table.
Betaproteobacteriales = prop_shared_by_order('Betaproteobacteriales')
Coriobacteriales = prop_shared_by_order('Coriobacteriales')
Lactobacillales = prop_shared_by_order('Lactobacillales')


In [214]:
# Tag the all-genera table so it can be stacked with the per-order tables.
# NOTE(review): this adds the 'order' column to pw_shared_ASV_all_genera in place.
pw_shared_ASV_all_genera['order']='all'
# Stack 'all' plus the ten per-order tables (the name says 'gen' but the groups are orders).
pw_shared_ASV_top10_gen = pd.concat([pw_shared_ASV_all_genera,Clostridales,Bacteroidales,Mollicutes_RF39,Erysipelotrichales,Spirochaetales,
 Gastranaerophilales,Selenomonadales,Betaproteobacteriales,Coriobacteriales,Lactobacillales])
pw_shared_ASV_top10_gen.to_csv(
    'analyses/tables/16s_prop_shared_ASVs_top10_table.txt',
    sep='\t',index=False)

In [210]:
# Preview the combined table; the first rows come from the 'all' block because it is concatenated first.
pw_shared_ASV_top10_gen.head()

Unnamed: 0,ind1,ind2,desc_site_ind1,desc_site_ind2,full_desc_comp,sp_site_comp,common_gen,ind1_ASVs,ind2_ASVs,common_ASVs,prop_shared_ASVs,order
0,cp.ora.ATLZ.114.16s,cp.ora.ATLZ.115.16s,captive_orangutan_ATLZ,captive_orangutan_ATLZ,captive_orangutan_ATLZ_vs_captive_orangutan_ATLZ,same_spec_same_site,"[Intestinimonas, Ruminococcaceae_UCG-005, Rumi...",136,118,91,0.771186,all
1,cp.ora.ATLZ.114.16s,cp.ora.ATLZ.116.16s,captive_orangutan_ATLZ,captive_orangutan_ATLZ,captive_orangutan_ATLZ_vs_captive_orangutan_ATLZ,same_spec_same_site,"[Intestinimonas, Ruminococcaceae_UCG-005, Rumi...",141,131,96,0.732824,all
2,cp.ora.ATLZ.114.16s,cp.ora.ATLZ.117.16s,captive_orangutan_ATLZ,captive_orangutan_ATLZ,captive_orangutan_ATLZ_vs_captive_orangutan_ATLZ,same_spec_same_site,"[Intestinimonas, Ruminococcaceae_UCG-005, Rumi...",138,114,96,0.842105,all
3,cp.ora.ATLZ.114.16s,cp.ora.ATLZ.118.16s,captive_orangutan_ATLZ,captive_orangutan_ATLZ,captive_orangutan_ATLZ_vs_captive_orangutan_ATLZ,same_spec_same_site,"[Intestinimonas, Catenibacterium, Ruminococcac...",124,100,85,0.85,all
4,cp.ora.ATLZ.114.16s,cp.ora.ATLZ.119.16s,captive_orangutan_ATLZ,captive_orangutan_ATLZ,captive_orangutan_ATLZ_vs_captive_orangutan_ATLZ,same_spec_same_site,"[Intestinimonas, Ruminococcaceae_UCG-005, Rumi...",104,72,54,0.75,all


In [191]:
# NOTE(review): `df2` is not defined in any cell visible in this notebook, so this
# cell presumably depends on a deleted cell and would fail on Restart & Run All.
df2

Unnamed: 0,gen,prop_shared_ASVs,num_comp
0,0 0 Candidat...,,0
1,0 0 Candidat...,0.927483,1772
2,0 0 Candidat...,0.575687,1297
3,0 0 Candidat...,1.0,401
4,0 0 Candidat...,0.491477,528
5,0 0 Candidat...,0.868993,1712
