In [3]:
import os
import sys
import subprocess
import pandas as pd
from ete3 import Tree
from collections import Counter

In [4]:
# Project root directory. Defaults to the original mount point, but can be
# overridden with the CAPTIVE_APE_ROOT environment variable so the notebook
# runs on machines where the data live elsewhere.
# os.path.join(..., '') guarantees a trailing separator, which the later
# string concatenations (ROOT_DIR+'results/...') depend on.
ROOT_DIR = os.path.join(
    os.environ.get('CAPTIVE_APE_ROOT', '/Volumes/AHN/captive_ape_microbiome/'), '')

In [5]:
# Work from the project root so the relative path to the helper notebook resolves.
os.chdir(ROOT_DIR)
# Execute the shared helper notebook in this namespace. Later cells call
# asv_hr_table and host_restricted_clades, neither of which is defined in this
# notebook (host_restricted_clades is noted below as coming from functions.ipynb;
# asv_hr_table presumably does too -- TODO confirm).
%run scripts/analyses/functions.ipynb

In [6]:
# Input and output directories of the gyrB analyses, both located under ROOT_DIR
# (ROOT_DIR is expected to end with a path separator).
INDIR = f'{ROOT_DIR}results/gyrb/inputs/'
OUTDIR = f'{ROOT_DIR}results/gyrb/analyses/'

In [7]:
# Create the output sub-directories used by the rest of the notebook.
# os.makedirs(..., exist_ok=True) is the pure-Python equivalent of `mkdir -p`:
# it is idempotent, raises on real failures instead of returning an ignored
# exit status, and does not break on paths containing spaces or shell
# metacharacters the way an interpolated os.system() command does.
for subdir in ('intermediate_outputs', 'tables', 'figures'):
    os.makedirs(os.path.join(OUTDIR, subdir), exist_ok=True)

0

In [8]:
# Newick tree containing the Bacteroidales ASVs plus reference taxa.
tree_file = INDIR + '/physeq_Bacteroidales_ASVs_ref.tree'
# Unmodified copy of the tree; it is read again further down to enumerate ASV tips.
full_tree = Tree(
    tree_file,
    format=0,
)

In [9]:
#inputs
def _input_path(filename):
    """Return the path of `filename` inside INDIR (same '{INDIR}/name' form used elsewhere)."""
    return f'{INDIR}/{filename}'

metadata_file = _input_path('physeq_metadata_passing_samples.txt')
tax_table_file = _input_path('physeq_Bacteroidales_taxonomy.txt')
asv_table_file = _input_path('physeq_Bacteroidales_asv_tab.txt')
asv_fasta_file = _input_path('physeq_Bacteroidales_asv.fasta')
moeller_codiv_fasta = _input_path('moeller_codiv_Bacteroidaceae.fna')

#set blast cutoffs
# hits must exceed this percent identity
pident_cutoff = 95
# fraction that is multiplied by 250 downstream to get the minimum alignment length
len_cutoff = 0.80

In [10]:
# Per-ASV table with host-restriction columns (HR_cat / HR_type), built by the
# asv_hr_table helper, then saved as a tab-separated file.
gyrb_asvs = asv_hr_table(asv_table_file, metadata_file, tax_table_file)
gyrb_asvs.to_csv(f'{OUTDIR}/tables/gyrb_asv_hr_table.txt', sep='\t', index=False)

# Number of ASVs per broad category, then per finer-grained type.
for hr_column in ('HR_cat', 'HR_type'):
    print(gyrb_asvs[hr_column].value_counts())

HR           7456
Unique_CP      96
MX             44
Name: HR_cat, dtype: int64
HR_human                    6683
HR_wild_bonobo               378
HR_wild_chimp                257
HR_wild_gorilla              138
Unique_CP                     96
MX_human_single_wild_ape      33
MX_2_wild_apes                 6
MX_human_2_wild_apes           5
Name: HR_type, dtype: int64


In [11]:
# Collapse ASVs into host-restricted (HR) clades.
# host_restricted_clades is loaded from functions.ipynb; it returns two tables:
# clades_df (one row per clade) and clades_ASVs_df (used later with ASVs as tree tips).
clades_df, clades_ASVs_df = host_restricted_clades(
    asv_table_file,
    metadata_file,
    tax_table_file,
    tree_file,
)


In [12]:
#host-restricted, mixed-host, and unique-to-captive clades
print(clades_df.shape)
print('numbers of clades in various host types')
print(clades_df['HR_type'].value_counts())

#Used for Figure3, where tips on tree are ASVs not clades
print(clades_ASVs_df.shape)
print('number of ASVs falling into various clades')
print(clades_ASVs_df['HR_type'].value_counts())

#Whats the breakdown of clades present in captive ape samples?
print('How many of the clades are present in captive apes?')
captive_clades = clades_df[clades_df['CP_pres']==True]
print(len(captive_clades))
print('Are these clades host-restricted, mixed host, or unique to captive?')
print(captive_clades['HR_cat'].value_counts())

#Whats the breakdown of clades present in 25% of captive ape samples of any species?
# `threshold` is also read by the heatmap-column cell that follows.
threshold=0.25
captive_cols = ['captive_bonobo', 'captive_chimp', 'captive_gorilla', 'captive_orangutan']
# A clade is "prominent" when its value in at least one captive column exceeds the threshold.
prominent_clades = clades_df.loc[(clades_df[captive_cols] > threshold).any(axis=1)]
print(len(prominent_clades),'captive clades in',threshold,'of captive samples of any species')
# Vectorised membership test; the previous per-row lambda rebuilt a Python list
# of prominent clade names for every row (quadratic in the number of clades).
clades_df['CP_prominent'] = clades_df['cladeName'].isin(prominent_clades['cladeName'])
clades_df_CP_prominent = clades_df[clades_df['CP_prominent']==True]
print(clades_df_CP_prominent['HR_type'].value_counts())

(356, 23)
numbers of clades in various host types
HR_human                    261
MX_human_single_wild_ape     33
HR_wild_chimp                21
HR_wild_bonobo               13
HR_wild_gorilla               9
MX_human_2_wild_apes          7
Unique_CP                     6
MX_2_wild_apes                5
MX_3_wild_apes                1
Name: HR_type, dtype: int64
(6957, 13)
number of ASVs falling into various clades
HR_human                    6173
HR_wild_bonobo               354
HR_wild_chimp                157
HR_wild_gorilla              100
MX_human_single_wild_ape      90
MX_human_2_wild_apes          33
MX_2_wild_apes                30
MX_3_wild_apes                11
Unique_CP                      9
Name: HR_type, dtype: int64
How many of the clades are present in captive apes?
52
Are these clades host-restricted, mixed host, or unique to captive?
HR           30
MX           16
Unique_CP     6
Name: HR_cat, dtype: int64
18 captive clades in 0.25 of captive samples of any speci

In [13]:
def _merge_mixed_types(row):
    """Return 'MX' for any HR_type containing 'MX'; otherwise return HR_type unchanged."""
    hr_type = row['HR_type']
    return 'MX' if 'MX' in hr_type else hr_type


def _label_if_above_threshold(row, first_col, last_col):
    """Return the row's heatmap_col1 label when the largest value in the
    label-sliced columns first_col..last_col (inclusive, in the frame's column
    order) exceeds the notebook-level `threshold`; otherwise return 'Blank'."""
    if max(row[first_col:last_col]) > threshold:
        return row['heatmap_col1']
    return 'Blank'


# col1: HR type with all mixed-host (MX_*) types collapsed to 'MX'
clades_df['heatmap_col1'] = clades_df.apply(_merge_mixed_types, axis=1)
# col2: label kept only if some column from captive_bonobo through wild_gorilla passes the threshold
clades_df['heatmap_col2'] = clades_df.apply(
    _label_if_above_threshold, axis=1, args=('captive_bonobo', 'wild_gorilla'))
# col3: same, restricted to the columns captive_bonobo through captive_orangutan
clades_df['heatmap_col3'] = clades_df.apply(
    _label_if_above_threshold, axis=1, args=('captive_bonobo', 'captive_orangutan'))

### Determine ASVs that hit to Moeller co-div clades

In [14]:
# Copy the Moeller co-diversifying Bacteroidaceae reference FASTA into the
# intermediate output directory; the BLAST database is then built from that copy.
!cp {INDIR}/moeller_codiv_Bacteroidaceae.fna {OUTDIR}/intermediate_outputs/moeller_codiv_Bacteroidaceae.fna
# Build a nucleotide BLAST database from the copied FASTA.
!makeblastdb -in {OUTDIR}/intermediate_outputs/moeller_codiv_Bacteroidaceae.fna -dbtype nucl
# BLAST every ASV sequence against the reference database, requesting at most
# 5 targets per query. The seven fields listed after "7" in -outfmt correspond,
# in order, to the column names assigned when this table is parsed in the next
# cell, which also skips the '#' comment lines of this output format.
!blastn -query {asv_fasta_file} -db {OUTDIR}/intermediate_outputs/moeller_codiv_Bacteroidaceae.fna -outfmt "7 qseqid salltitles sseqid pident length qlen evalue" -out {OUTDIR}/intermediate_outputs/codiv_blastout_ASVs.txt -max_target_seqs 5



Building a new DB, current time: 12/01/2020 16:35:28
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses//intermediate_outputs/moeller_codiv_Bacteroidaceae.fna
New DB title:  /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses//intermediate_outputs/moeller_codiv_Bacteroidaceae.fna
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 208 sequences in 0.00991416 seconds.


In [15]:
#filter blast output to top hit per ASV, remove hits that do not pass thresholds
blast_res = pd.read_csv(
    f'{OUTDIR}/intermediate_outputs/codiv_blastout_ASVs.txt',
    sep='\t',
    comment='#',
    header=None,
)
blast_res.columns = ['ASV','codiv_clade_seq','sseqid','pident','length','qlen','evalue']

# The first row of each ASV, in file order, is taken as its top hit.
blast_res = blast_res.groupby('ASV').head(1).reset_index(drop=True)

# QC filters: percent identity above pident_cutoff and alignment length above
# 250*len_cutoff (250 is presumably the expected amplicon length -- TODO confirm).
identity_ok = blast_res['pident'] > pident_cutoff
length_ok = blast_res['length'] > (250*len_cutoff)
blast_res_pass = blast_res[identity_ok & length_ok].reset_index(drop=True)
print(len(blast_res_pass),'ASVs matching co-div clades with greater than',pident_cutoff,'percent identity')

# Clade label = second space-separated token of the subject title;
# lineage = part of the clade label before the first underscore.
blast_res_pass['codiv_clade'] = blast_res_pass['codiv_clade_seq'].map(
    lambda title: title.split(' ')[1])
blast_res_pass['lineage'] = blast_res_pass['codiv_clade'].map(
    lambda clade: clade.split('_')[0])
blast_res_pass = blast_res_pass[['ASV','codiv_clade_seq', 'codiv_clade','lineage']]
print(blast_res_pass['codiv_clade'].value_counts())

#output hits passing that will be used to determine the mrca of the lineage
blast_res_pass.to_csv(
    f'{OUTDIR}/intermediate_outputs/codiv_clades_ASVs.txt', sep='\t', index=False)

344 ASVs matching co-div clades with greater than 95 percent identity
Bt2_clade1_bonobo     93
Bt3_clade1_human      87
Bt2_clade1_chimp      39
Bt2_clade2_bonobo     34
Bt1_clade1_bonobo     24
Bt3_clade1_chimp      18
Bt3_clade1_bonobo     15
Bt1_clade1_gorilla    11
Bt1_clade1_chimp       8
Bt2_clade1_gorilla     8
Bt2_clade2_chimp       7
Name: codiv_clade, dtype: int64


### Determine HR clades that hit to Moeller co-div clades

In [16]:
print(clades_df.shape)

# Attach BLAST co-div hits to the per-ASV clade table, keep only ASVs that have
# a hit, and reduce to the distinct (clade, co-div clade, lineage) combinations.
clades_ASVs_codiv = (
    clades_ASVs_df
    .merge(blast_res_pass, how='left', left_on='ASVs', right_on='ASV')
    .drop(['ASV'], axis=1)
    .loc[lambda merged: merged['lineage'].notna(), ['cladeName','codiv_clade','lineage']]
    .drop_duplicates()
)

# Left-join onto the clade table so clades without a co-div hit are retained.
# NOTE(review): a clade whose ASVs hit more than one co-div clade would appear
# on several rows after this merge -- confirm this is intended.
clades_df_codiv = clades_df.merge(clades_ASVs_codiv, how='left', on='cladeName')
print(clades_df_codiv.groupby(['codiv_clade','lineage']).size())

(356, 27)
codiv_clade         lineage
Bt1_clade1_bonobo   Bt1         1
Bt1_clade1_chimp    Bt1         1
Bt1_clade1_gorilla  Bt1         1
Bt2_clade1_bonobo   Bt2         1
Bt2_clade1_chimp    Bt2         2
Bt2_clade1_gorilla  Bt2         1
Bt2_clade2_bonobo   Bt2         1
Bt2_clade2_chimp    Bt2         1
Bt3_clade1_bonobo   Bt3         1
Bt3_clade1_chimp    Bt3         2
Bt3_clade1_human    Bt3        19
dtype: int64


In [17]:
#Output table for Figure 2
# Make sure the figures directory exists. A plain `mkdir` fails (and the failure
# was silently ignored) whenever the directory is already there, which is the
# case on every run after the setup cell; os.makedirs(exist_ok=True) is idempotent.
os.makedirs(os.path.join(OUTDIR, 'figures'), exist_ok=True)

# Columns exported for the figure, in output order.
figure2_columns = [
    'cladeName', 'cladeTax', 'sampleNum', 'ASVsNum',
    'HR_sampleTypes', 'HR_sampleNum', 'HR_cat', 'HR_type',
    'CP_pres', 'CP_prominent', 'CP_sampleTypes', 'CP_sampleNum', 'captiveNames',
    'captive_bonobo', 'captive_chimp', 'captive_gorilla',
    'captive_orangutan', 'non_industrialized_human', 'industrialized_human',
    'wild_bonobo', 'wild_chimp', 'wild_gorilla',
    'heatmap_col1', 'heatmap_col2', 'heatmap_col3',
    'codiv_clade', 'lineage',
]
clades_df_sh = clades_df_codiv[figure2_columns]
clades_df_sh.to_csv(f'{OUTDIR}/intermediate_outputs/HRclades_wholetree_table.txt',sep='\t',index=False)

### Collapse host-restricted clades on the full tree

In [18]:
# Parse a second, independent copy of the tree: collapse_node below edits the
# tree it is given (renames nodes, detaches children), while full_tree is read
# again later to list the ASV tips.
collasped_tree = Tree(tree_file, format=0)

def collapse_node(tree,clade_ASVs,cladeName):
    """Collapse one clade of `tree` into a single tip named `cladeName` (in place).

    tree       -- tree object providing get_common_ancestor / get_leaves_by_name
    clade_ASVs -- sequence of leaf names belonging to the clade
    cladeName  -- name given to the collapsed node

    With more than one ASV, the common ancestor of the ASVs is renamed and all of
    its children are detached, which turns it into a tip. Otherwise the first
    leaf matching the single ASV name is simply renamed. An empty `clade_ASVs`,
    or an ASV name with no matching leaf, raises IndexError. Returns None.
    """
    if len(clade_ASVs) <= 1:
        # single-ASV clade: the leaf itself becomes the clade tip
        tip = tree.get_leaves_by_name(name=clade_ASVs[0])[0]
        tip.name = cladeName
        return

    ancestor = tree.get_common_ancestor(clade_ASVs)
    ancestor.name = cladeName
    # detach every direct child so the ancestor is left without descendants
    for descendant in ancestor.get_children():
        ancestor.remove_child(descendant)

# Collapse every clade listed in clades_df on the working copy of the tree.
for clade_asvs, clade_name in zip(clades_df['ASVs'], clades_df['cladeName']):
    collapse_node(collasped_tree, clade_asvs, clade_name)

# Keep only tips whose name contains 'clade'; this eliminates leftover ASVs and ref taxa.
# NOTE(review): assumes no ASV or reference tip name contains the substring 'clade'.
clade_leaves = [tip for tip in collasped_tree.get_leaves() if 'clade' in tip.name]
collasped_tree.prune(clade_leaves)
collasped_tree.write(
    format=2,
    outfile=f'{OUTDIR}/intermediate_outputs/HRclades_wholetree.tre',
)

### Figure 3 output table

In [19]:
#taxonomic info, family and genus
tax_table = pd.read_csv(tax_table_file, sep='\t', index_col=None)
# Any label containing 'unclassified' is normalised to the bare string 'unclassified'.
for rank in ('Family', 'Genus'):
    tax_table[rank] = tax_table[rank].apply(
        lambda label: 'unclassified' if 'unclassified' in label else label)
# ASV -> family / ASV -> genus lookups
tax_fam_dict = dict(zip(tax_table['ASV'], tax_table['Family']))
tax_gen_dict = dict(zip(tax_table['ASV'], tax_table['Genus']))

#sample to sample type category
metadata = pd.read_csv(metadata_file, sep='\t', index_col=None)
sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description']))

#ASV to sample names dict
# The table is read with its first column as index; for each row, collect the
# column labels whose value is greater than zero.
asv_table = pd.read_csv(asv_table_file, sep='\t', index_col=0)
sampleNames = asv_table.apply(
    lambda counts: list(counts.index[counts>0]), axis=1)
ASV_sampleName_dict = dict(zip(sampleNames.index, sampleNames))

#generate ASV dataframe
# Tips of the unmodified tree whose name contains 'ASV' (reference taxa are skipped).
allASVs = [leaf.name for leaf in full_tree.get_leaves() if 'ASV' in leaf.name]
allASVs = pd.DataFrame(allASVs,columns=['ASV'])
print(len(allASVs),'total ASVs')

#Host restricted clades
print(len(clades_ASVs_df),'ASVs that fall into HR clades')
#Moeller codiv clades
# The message reports the configured cutoff instead of a hard-coded "95", so it
# stays correct if pident_cutoff is changed.
print(len(blast_res_pass),f'ASVs that hit >{pident_cutoff}% identity to codiv clade')

#combine all ASVs, HR clade info, and codiv clade info
# Both merges are left joins, so ASVs without a clade or a co-div hit are kept
# with missing values in the added columns.
allASVs_HRclade = allASVs.merge(clades_ASVs_df, how='left', left_on='ASV',right_on='ASVs')
allASVs_HRclade_codiv = allASVs_HRclade.merge(blast_res_pass, how='left', on='ASV')

#add taxonomy
# Look up each ASV in the family / genus dictionaries; an ASV missing from a
# dictionary raises KeyError.
for rank_col, rank_lookup in (('Family', tax_fam_dict), ('Genus', tax_gen_dict)):
    allASVs_HRclade_codiv[rank_col] = allASVs_HRclade_codiv['ASV'].apply(
        lambda asv, lookup=rank_lookup: lookup[asv])
print('taxonomic breakdown of ASVs')
family_counts = allASVs_HRclade_codiv['Family'].value_counts()
print(family_counts)

#add captive sp and site description info
# Label every sample as '<Description>_<site_code>'.
metadata['Description_site'] = metadata['Description']+'_' +metadata['site_code']
sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

# List of sample names in which each ASV occurs.
allASVs_HRclade_codiv['sampleNames'] = allASVs_HRclade_codiv['ASV'].apply(
    lambda asv: ASV_sampleName_dict[asv])


def _count_sample_labels(sample_names):
    """Count how many of the given samples carry each Description_site label."""
    labels = [sample_type_site_dict[name] for name in sample_names]
    return pd.Series(labels).value_counts()


# One row per ASV, one column per Description_site label; absent labels count as 0.
description_df = allASVs_HRclade_codiv['sampleNames'].apply(_count_sample_labels)
description_df = description_df.fillna(0)

# Captive species/site columns carried into the output table, in this order.
captive_site_cols = [
    'captive_chimp_HOUZ', 'captive_gorilla_HOUZ', 'captive_orangutan_HOUZ',
    'captive_bonobo_COLZ', 'captive_gorilla_COLZ', 'captive_orangutan_COLZ',
    'captive_chimp_PC',
]
description_cp_df = description_df[captive_site_cols]
allASVs_HRclade_codiv_cp = allASVs_HRclade_codiv.merge(
    description_cp_df, left_index=True, right_index=True)
# Total number of captive samples containing each ASV.
allASVs_HRclade_codiv_cp['captive_all'] = allASVs_HRclade_codiv_cp[captive_site_cols].sum(axis=1)

print('taxonomic breakdown of ASVs found in captive apes')
found_in_captive = allASVs_HRclade_codiv_cp['captive_all']>0
print(allASVs_HRclade_codiv_cp[found_in_captive]['Family'].value_counts())

#add ASV HR type,
#ind ASVs in mixed clades may be HR, also ASV not in HR clades may be HR
# The per-ASV HR_type is renamed so it does not clash with the clade-level HR_type.
ASV_HR_type = gyrb_asvs[['ASV','HR_type']].rename(columns={'HR_type': 'ASV_HR_type'})
allASVs_HRclade_codiv_cp_asv = allASVs_HRclade_codiv_cp.merge(
    ASV_HR_type, how='left', on='ASV')

clade_hr_cat = allASVs_HRclade_codiv_cp_asv['HR_cat']
per_asv_type = allASVs_HRclade_codiv_cp_asv['ASV_HR_type']
# Rows with a missing HR_cat (ASV not assigned to any clade) fall in the first group.
print('breakdown of HRtype for individual ASVs that dont fall into HR clades')
print(per_asv_type[clade_hr_cat!='HR'].value_counts())
print('breakdown of HRtype for individual ASVs that fall into HR clades')
print(per_asv_type[clade_hr_cat=='HR'].value_counts())

# Drop the helper columns before writing the table.
allASVs_HRclade_codiv_cp_asv = allASVs_HRclade_codiv_cp_asv.drop(
    columns=['sampleNames','ASVs'])
allASVs_HRclade_codiv_cp_asv.to_csv(
    f'{OUTDIR}/intermediate_outputs/HRclades_subtrees_table.txt', sep='\t', index=False)


7596 total ASVs
6957 ASVs that fall into HR clades
344 ASVs that hit greater >95% identity to codiv clade
taxonomic breakdown of ASVs
f__Bacteroidaceae        5019
f__Rikenellaceae          586
f__UBA932                 427
f__Tannerellaceae         411
f__Porphyromonadaceae     344
f__Muribaculaceae         333
f__Marinifilaceae         180
unclassified               76
f__Barnesiellaceae         75
f__Coprobacteraceae        40
f__Paludibacteraceae       34
f__Dysgonomonadaceae       31
f__UBA11471                24
f__Bacteroidaceae_A        14
f__F082                     2
Name: Family, dtype: int64
taxonomic breakdown of ASVs found in captive apes
f__Bacteroidaceae        103
f__Tannerellaceae         27
f__Porphyromonadaceae      5
unclassified               2
f__Paludibacteraceae       1
f__Barnesiellaceae         1
Name: Family, dtype: int64
breakdown of HRtype for individual ASVs that dont fall into HR clades
HR_human                    561
HR_wild_chimp               104
HR_w