In [1]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter

In [2]:
os.chdir('/Volumes/AHN/captive_ape_microbiome/scripts/analyses')
%run functions.ipynb
os.chdir('/Volumes/AHN/captive_ape_microbiome/results/gyrb')

#results/gyrb/

In [3]:
#inputs
metadata_file = 'inputs/metadata_gyrb_amp_meta_passing_samples.txt'
tax_table_file = 'inputs/ASVs_taxonomy.txt'
asv_table_file = 'inputs/ASVs_filtered_counts.tsv'
asv_fasta_file = 'inputs/ASVs_filtered.fasta'
full_tree_file = 'inputs/ASVs_filtered_ref_full.tree'
#read tree with ete3 
full_tree = Tree(full_tree_file, format=0)
moeller_codiv_fasta = 'inputs/codiv_Bacteroidaceae.fna'
#set blast percent identity cutoff 
pident_cutoff = 95

In [4]:
gyrb_asvs = asv_hr_table(asv_table_file,metadata_file,tax_table_file)
gyrb_asvs['Genus'].value_counts()[:10]

g__Prevotella            2871
g__Bacteroides            992
Unassigned                539
g__Alistipes              519
g__RC9                    363
g__Parabacteroides        296
g__Bacteroides_A          286
g__Porphyromonas          250
g__Prevotellamassilia     121
g__Bacteroides_B          113
Name: Genus, dtype: int64

In [5]:
def output_summary_table(asv_hr_table_output):
     #create summary table
    Bacteroides_genera=['g__Bacteroides','g__Bacteroides_A','g__Bacteroides_B']
    all_asvs = pd.Series(asv_hr_table_output['HR_type'].value_counts(),name='ALL')
    cp_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['CP_pres']==True]['HR_type'].value_counts(),name='CP')
    Bacteroidales_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Order']=='o__Bacteroidales']['HR_type'].value_counts(),name='Bacteroidales')
    Prevotella_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Genus']=='g__Prevotella']['HR_type'].value_counts(),name='Prevotella')
    Bacteroides_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Genus'].isin(Bacteroides_genera)]['HR_type'].value_counts(),name='Bacteroides')
    Parabacteroides_asvs = pd.Series(asv_hr_table_output[asv_hr_table_output['Genus']=='g__Parabacteroides']['HR_type'].value_counts(),name='Parabacteroides')

    res = pd.concat([all_asvs,cp_asvs,Bacteroidales_asvs,
                     Prevotella_asvs,Bacteroides_asvs,Parabacteroides_asvs],
                     axis=1).fillna(0).T
    res['Total'] = res.sum(axis=1)
    res['MX_Total'] = res.loc[:,'MX_human_single_wild_ape':'MX_human_2_wild_apes'].sum(axis=1)
    return(res)  

In [6]:
gyrb_asv_summary = output_summary_table(gyrb_asvs)
print('Host restricted ASVs totals')
print(gyrb_asv_summary.loc[:,'HR_human':'HR_wild_chimp'].sum(axis=1))
gyrb_asv_summary

Host restricted ASVs totals
ALL                7001.0
CP                   31.0
Bacteroidales      7001.0
Prevotella         2813.0
Bacteroides        1348.0
Parabacteroides     289.0
dtype: float64


Unnamed: 0,HR_human,HR_wild_bonobo,HR_wild_gorilla,HR_wild_chimp,Unique_CP,MX_human_single_wild_ape,MX_2_wild_apes,MX_human_2_wild_apes,Total,MX_Total
ALL,6183.0,353.0,258.0,207.0,94.0,34.0,14.0,5.0,7148.0,53.0
CP,27.0,0.0,0.0,4.0,94.0,10.0,2.0,2.0,139.0,14.0
Bacteroidales,6183.0,353.0,258.0,207.0,94.0,34.0,14.0,5.0,7148.0,53.0
Prevotella,2450.0,203.0,46.0,114.0,55.0,2.0,1.0,0.0,2871.0,3.0
Bacteroides,1314.0,1.0,2.0,31.0,15.0,23.0,0.0,5.0,1391.0,28.0
Parabacteroides,247.0,38.0,1.0,3.0,3.0,3.0,1.0,0.0,296.0,4.0


In [7]:
clades_df,clades_ASVs_df = host_restricted_clades(asv_table_file,metadata_file,tax_table_file) 


western_human        6805
non_western_human     511
wild_chimp             69
wild_gorilla           37
captive_chimp          26
wild_bonobo            24
captive_gorilla        22
captive_bonobo         13
captive_orangutan      11
Name: Description, dtype: int64


In [8]:
#host-restricted, mixed-host, and unique-to-captive clades
print(clades_df.shape)
print('numbers of clades in various host types')
print(clades_df['HR_type'].value_counts())

#Used for Figure3, where tips on tree are ASVs not clades
print(clades_ASVs_df.shape)
print('number of ASVs falling into various clades')
print(clades_ASVs_df['HR_type'].value_counts())

#Whats the breakdown of clades present in captive ape samples?
print('How many of the clades are present in captive apes?')
captive_clades = clades_df[clades_df['CP_pres']==True]
print(len(captive_clades))
print('Are these clades host-restricted, mixed host, or unique to captive?')
print(captive_clades['HR_cat'].value_counts())

#Whats the breakdown of clades present in 15% of captive ape samples of any species?
threshold=0.15
prominent_clades = clades_df.loc[(clades_df.captive_bonobo > threshold) | 
              (clades_df.captive_chimp > threshold)  |
              (clades_df.captive_gorilla > threshold)|
              (clades_df.captive_orangutan > threshold)]
print(len(prominent_clades),'captive clades in',threshold,'of captive samples of any species')
clades_df['CP_prominent'] = clades_df['cladeName'].apply(lambda x: x in list(prominent_clades['cladeName']))
clades_df_CP_prominent = clades_df[clades_df['CP_prominent']==True]
print(clades_df_CP_prominent['HR_type'].value_counts())

(359, 23)
numbers of clades in various host types
HR_human                    249
MX_human_single_wild_ape     35
HR_wild_chimp                20
HR_wild_gorilla              19
HR_wild_bonobo               14
MX_2_wild_apes                8
MX_human_2_wild_apes          7
Unique_CP                     6
MX_3_wild_apes                1
Name: HR_type, dtype: int64
(6438, 13)
number of ASVs falling into various clades
HR_human                    5625
HR_wild_bonobo               342
HR_wild_gorilla              168
HR_wild_chimp                111
MX_human_single_wild_ape      92
MX_2_wild_apes                43
MX_human_2_wild_apes          41
Unique_CP                      9
MX_3_wild_apes                 7
Name: HR_type, dtype: int64
How many of the clades are present in captive apes?
52
Are these clades host-restricted, mixed host, or unique to captive?
HR           27
MX           19
Unique_CP     6
Name: HR_cat, dtype: int64
31 captive clades in 0.15 of captive samples of any speci

In [37]:
threshold = .15
clades_df['heatmap_col1'] = clades_df.apply(
    lambda row: 'MX' 
        if 'MX' in row['HR_type'] else row['HR_type'],axis=1)
clades_df['heatmap_col2'] = clades_df.apply(
    lambda row: row['heatmap_col1'] 
        if max(row['captive_bonobo':'wild_gorilla'])>.15 else 'Blank',axis=1)
clades_df['heatmap_col3'] = clades_df.apply(
    lambda row: row['heatmap_col1'] 
        if max(row['captive_bonobo':'captive_orangutan'])>.15 else 'Blank',axis=1)




### Determine ASVs that hit to Moeller co-div clades

In [38]:
%%bash

mkdir analyses/codiv_moeller_ASVs
cp inputs/moeller_codiv_Bacteroidaceae.fna analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna

#make blastdb
makeblastdb -in analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna -dbtype nucl

#blast moeller co-div seqs
blastn -query inputs/ASVs_filtered.fasta -db analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna \
-outfmt "7 qseqid salltitles sseqid pident length qlen evalue" \
-out analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt \
-max_target_seqs 5



Building a new DB, current time: 09/15/2020 06:46:46
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna
New DB title:  analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 208 sequences in 0.00554895 seconds.


mkdir: analyses/codiv_moeller_ASVs: File exists


In [39]:
#filter blast output to top hit per ASV, remove hits not pass thresholds
blast_res = pd.read_csv('analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt',sep='\t',comment='#',header=None)
blast_res.columns = ['ASV','codiv_clade_seq','sseqid','pident','length','qlen','evalue']
blast_res = blast_res.groupby('ASV').head(1).reset_index(drop=True) #get top hit
blast_res_pass = blast_res[blast_res['pident']>pident_cutoff].reset_index(drop=True) #QC filter
blast_res_pass = blast_res_pass[blast_res_pass['length']>200].reset_index(drop=True) #QC filter
print(len(blast_res_pass),'ASVs matching co-div clades with greater than',pident_cutoff,'percent identity')
blast_res_pass['codiv_clade'] = blast_res_pass['codiv_clade_seq'].apply(lambda x: x.split(' ')[1])
blast_res_pass['lineage'] = blast_res_pass['codiv_clade'].apply(lambda x: x.split('_')[0]) 
blast_res_pass = blast_res_pass[['ASV','codiv_clade_seq', 'codiv_clade','lineage']]
#print(blast_res_pass.head())
print(blast_res_pass['codiv_clade'].value_counts())
#output hits passing that will be used to determine the mcra of the lineage
blast_res_pass.to_csv('analyses/codiv_moeller_ASVs/codiv_clades_ASVs.txt',sep='\t',index=False)


314 ASVs matching co-div clades with greater than 95 percent identity
Bt2_clade1_bonobo     90
Bt3_clade1_human      85
Bt2_clade2_bonobo     33
Bt1_clade1_bonobo     23
Bt2_clade1_chimp      20
Bt3_clade1_chimp      17
Bt3_clade1_bonobo     15
Bt1_clade1_gorilla    10
Bt1_clade1_chimp       8
Bt2_clade1_gorilla     7
Bt2_clade2_chimp       6
Name: codiv_clade, dtype: int64


### Determine HR clades that hit to Moeller co-div clades

In [40]:
print(clades_df.shape)
clades_ASVs_codiv = clades_ASVs_df.merge(blast_res_pass, how='left', left_on='ASVs',right_on='ASV')
clades_ASVs_codiv = clades_ASVs_codiv.drop(['ASV'],axis=1)
clades_ASVs_codiv = clades_ASVs_codiv[~clades_ASVs_codiv['lineage'].isna()]
clades_ASVs_codiv = clades_ASVs_codiv[['cladeName','codiv_clade','lineage']].drop_duplicates()
clades_df_codiv = clades_df.merge(clades_ASVs_codiv, how='left',on='cladeName')
print(clades_df_codiv.groupby(['codiv_clade','lineage']).size())

(359, 27)
codiv_clade         lineage
Bt1_clade1_bonobo   Bt1         1
Bt1_clade1_chimp    Bt1         1
Bt1_clade1_gorilla  Bt1         1
Bt2_clade1_bonobo   Bt2         1
Bt2_clade1_chimp    Bt2         2
Bt2_clade1_gorilla  Bt2         1
Bt2_clade2_bonobo   Bt2         1
Bt2_clade2_chimp    Bt2         1
Bt3_clade1_bonobo   Bt3         1
Bt3_clade1_chimp    Bt3         2
Bt3_clade1_human    Bt3        15
dtype: int64


In [41]:
#Output table for Figure 2 
clades_df_sh = clades_df_codiv[['cladeName', 'cladeTax', 'sampleNum', 'ASVsNum', 
      'HR_sampleTypes', 'HR_sampleNum','HR_cat', 'HR_type', 
      'CP_pres','CP_prominent', 'CP_sampleTypes', 'CP_sampleNum','captiveNames',
       'captive_bonobo', 'captive_chimp', 'captive_gorilla',
       'captive_orangutan', 'non_western_human', 'western_human',
       'wild_bonobo', 'wild_chimp', 'wild_gorilla',
       'heatmap_col1','heatmap_col2','heatmap_col3',
        'codiv_clade','lineage']]
clades_df_sh.to_csv('analyses/figures/HRclades_Figure2_table.txt',sep='\t',index=False)

### search full tree for host-restricted clades

In [10]:
collasped_tree = Tree(full_tree_file, format=0)

def collapse_node(tree,clade_ASVs,cladeName):
    if len(clade_ASVs) > 1:
        node = tree.get_common_ancestor(clade_ASVs)
        node.name = cladeName
        children = node.get_children()
        node.remove_child(children[1])
        node.remove_child(children[0])
    else:
        ASV = clade_ASVs[0]
        leaf = tree.get_leaves_by_name(name=ASV)[0]
        leaf.name = cladeName

for clade,row in clades_df.iterrows():
    collapse_node(collasped_tree,row['ASVs'],row['cladeName'])

clade_leaves = [leaf for leaf in collasped_tree.get_leaves() if 'clade' in leaf.name] 
collasped_tree.prune(clade_leaves) #eliminate leftover ASVs and ref taxa
collasped_tree.write(format=2, outfile='analyses/figures/HRclades_Figure2.tre')


### Table 2

In [44]:
Table2 = clades_df.groupby(['cladeTax','HR_type']).size().reset_index(name="count")
Table2 = Table2.pivot(index='cladeTax', columns='HR_type')['count'].fillna(0)

Table2['MX'] = Table2[['MX_2_wild_apes', 'MX_3_wild_apes',
                 'MX_human_2_wild_apes', 'MX_human_single_wild_ape']].sum(axis=1)
Table2 = Table2[['HR_human', 'HR_wild_bonobo', 'HR_wild_chimp', 
        'HR_wild_gorilla','Unique_CP', 'MX']]

Bacteroides_genera = ['Bacteroidaceae_Bacteroides','Bacteroidaceae_Bacteroides_A','Bacteroidaceae_Bacteroides_B']
Bacteroides_row = pd.Series(Table2.loc[Bacteroides_genera,:].sum(),name='Bacteroidaceae_Bacteroides_combined')
Table2 = Table2.append(Bacteroides_row).drop(Bacteroides_genera)

print(Table2.sum(axis=1).sum())

taxa_over_5 = Table2[Table2.sum(axis=1)>5].sum(axis=1)
print(taxa_over_5)
sel_genera = ['Bacteroidaceae_Bacteroides_combined',
              'Bacteroidaceae_Prevotella',
              'Rikenellaceae_Alistipes',
              'Tannerellaceae_Parabacteroides',
              'Bacteroidaceae_Unassigned',
              'WCHB1-69_Unassigned']
Table2_sel_genera = Table2.loc[sel_genera,:]
Table2_other_genera = pd.Series(Table2.drop(sel_genera).sum(),name='Other_genera')
Table2_sel_genera = Table2_sel_genera.append(Table2_other_genera)
Table2_sel_genera.sum(axis=1).sum()
#Table2_sel_genera.sum(axis=1).sum()

359.0
cladeTax
Bacteroidaceae_Prevotella              101.0
Bacteroidaceae_Unassigned               21.0
Porphyromonadaceae_Porphyromonas         9.0
Rikenellaceae_Alistipes                 22.0
Tannerellaceae_Parabacteroides          24.0
Tannerellaceae_Unassigned               10.0
Unassigned_Unassigned                    7.0
WCHB1-69_Unassigned                     10.0
Bacteroidaceae_Bacteroides_combined    131.0
dtype: float64


359.0

In [48]:
#taxonomic info, family and genus
tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
tax_fam_dict = dict(zip(tax_table['ASV'], tax_table['Family']))
tax_gen_dict = dict(zip(tax_table['ASV'], tax_table['Genus']))

#sample to sample type category
metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description']))

asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)

sampleNames = asv_table.apply(lambda row: list(row.index[row>0]),axis=1)
ASV_sampleName_dict = dict(zip(sampleNames.index,sampleNames))

allASVs = [leaf.name for leaf in full_tree.get_leaves() if 'ASV' in leaf.name]
allASVs = pd.DataFrame(allASVs,columns=['ASV'])
print(len(allASVs))

#Host restricted clades 
print(clades_ASVs_df.head())
print(len(clades_ASVs_df))
#Moeller codiv clades
print(blast_res_pass.head())
print(len(blast_res_pass))

#combine
allASVs_HRclade = allASVs.merge(clades_ASVs_df, how='left', left_on='ASV',right_on='ASVs')
allASVs_HRclade_codiv = allASVs_HRclade.merge(blast_res_pass, how='left', on='ASV')
allASVs_HRclade_codiv.head()

#add taxonomy
allASVs_HRclade_codiv['Family'] = allASVs_HRclade_codiv['ASV'].apply(lambda ASV: tax_fam_dict[ASV])
allASVs_HRclade_codiv['Genus'] = allASVs_HRclade_codiv['ASV'].apply(lambda ASV: tax_gen_dict[ASV])

#add captive sp and site info
metadata['Description_site'] = metadata['Description']+'_' +metadata['site_code']
sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

allASVs_HRclade_codiv['sampleNames'] = allASVs_HRclade_codiv['ASV'].apply(lambda x: ASV_sampleName_dict[x])

description_df = allASVs_HRclade_codiv['sampleNames'].apply(lambda l: pd.Series(
    [sample_type_site_dict[name] for name in l]).value_counts())
description_df = description_df.fillna(0) 
description_cp_df= description_df[['captive_chimp_HOUZ','captive_gorilla_HOUZ','captive_orangutan_HOUZ',
'captive_bonobo_COLZ','captive_gorilla_COLZ','captive_orangutan_COLZ','captive_chimp_PC']]
allASVs_HRclade_codiv_cp = allASVs_HRclade_codiv.merge(description_cp_df, left_index=True,right_index=True)

allASVs_HRclade_codiv_cp.to_csv('analyses/codiv_moeller_ASVs/full_tree_ASV_table.txt',sep='\t',index=False)
print(allASVs_HRclade_codiv_cp.head())

7148
  cladeName                                        sampleNames  sampleNum  \
0  clade_41  [NielsenHB_2014__O2_UC60_2, ZellerG_2014__CCIS...        788   
1  clade_41  [NielsenHB_2014__O2_UC60_2, ZellerG_2014__CCIS...        788   
2  clade_41  [NielsenHB_2014__O2_UC60_2, ZellerG_2014__CCIS...        788   
3  clade_41  [NielsenHB_2014__O2_UC60_2, ZellerG_2014__CCIS...        788   
4  clade_41  [NielsenHB_2014__O2_UC60_2, ZellerG_2014__CCIS...        788   

   ASVsNum HR_sampleTypes  HR_sampleNum HR_cat   HR_type  CP_pres  \
0      366        [human]           788     HR  HR_human    False   
1      366        [human]           788     HR  HR_human    False   
2      366        [human]           788     HR  HR_human    False   
3      366        [human]           788     HR  HR_human    False   
4      366        [human]           788     HR  HR_human    False   

   CP_sampleNum CP_sampleTypes captiveNames      ASVs  
0             0             []           []  ASV_2623  
1    

In [35]:
colsum = Table2.sum(axis=1)
Bacteroidaceae_Bacteroides
Table2[Table2.sum(axis=1) >= 10]

HR_type,HR_human,HR_wild_bonobo,HR_wild_chimp,HR_wild_gorilla,MX_2_wild_apes,MX_3_wild_apes,MX_human_2_wild_apes,MX_human_single_wild_ape,Unique_CP
cladeTax,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bacteroidaceae_Bacteroides,75.0,0.0,1.0,0.0,0.0,0.0,5.0,20.0,4.0
Bacteroidaceae_Bacteroides_B,15.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
Bacteroidaceae_Prevotella,75.0,8.0,9.0,4.0,3.0,0.0,1.0,1.0,0.0
Bacteroidaceae_Unassigned,13.0,3.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0
Rikenellaceae_Alistipes,17.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
Tannerellaceae_Parabacteroides,18.0,1.0,2.0,0.0,0.0,0.0,0.0,3.0,0.0
Tannerellaceae_Unassigned,3.0,0.0,4.0,1.0,0.0,0.0,0.0,2.0,0.0
WCHB1-69_Unassigned,0.0,0.0,0.0,7.0,3.0,0.0,0.0,0.0,0.0
