In [1]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter

# Every relative path used in this notebook ('inputs/...', 'analyses/...') is resolved
# against the gyrB results directory. The location can be overridden through the
# GYRB_RESULTS_DIR environment variable; the default is the original hard-coded path.
results_dir = os.environ.get('GYRB_RESULTS_DIR',
                             '/Volumes/AHN/captive_ape_microbiome/results/gyrb/')
os.chdir(results_dir)

# Last expression of the cell: display the available input files.
os.listdir('inputs/')

['.DS_Store',
 'ASVs_filtered.fasta',
 'ASVs_filtered_counts.tsv',
 'ASVs_filtered_ref_amp.tree',
 'ASVs_filtered_ref_full.tree',
 'ASVs_taxonomy.txt',
 'codiv_Bacteroidaceae.fna',
 'metadata_gyrb_amp_meta_passing_samples.txt',
 'ref_gyrb_gtdbtk',
 'test.tre']

In [2]:
# Inputs: every file lives under 'inputs/' relative to the working directory.
input_dir = 'inputs/'
metadata_file = input_dir + 'metadata_gyrb_amp_meta_passing_samples.txt'  # read below for sample IDs and descriptions
tax_table_file = input_dir + 'ASVs_taxonomy.txt'                          # read below for Family / Genus per ASV
asv_table_file = input_dir + 'ASVs_filtered_counts.tsv'                   # read below as an ASV x sample table
asv_fasta_file = input_dir + 'ASVs_filtered.fasta'
tree_file = input_dir + 'ASVs_filtered_ref_full.tree'
moeller_codiv_fasta = input_dir + 'codiv_Bacteroidaceae.fna'

# BLAST hits are kept only when their percent identity is strictly greater than this value.
pident_cutoff = 95

### Blastn Moeller co-div seqs against ASVs and filter ASVs matching X% identity

In [3]:
%%bash
# Stop at the first failing command so a failed copy or database build is never
# followed by a BLAST run against stale files.
set -e

# -p: do not fail when the output directory already exists (e.g. when the cell is re-run)
mkdir -p analyses/codiv_moeller_ASVs
cp inputs/codiv_Bacteroidaceae.fna analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna

#make blastdb from the Moeller co-diversification sequences
makeblastdb -in analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna -dbtype nucl

#blast the ASVs (query) against the Moeller co-div seqs (database);
#the columns listed in -outfmt are the ones named when the report is parsed in the next cell
blastn -query inputs/ASVs_filtered.fasta -db analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna \
-outfmt "7 qseqid salltitles sseqid pident length qlen evalue" \
-out analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt \
-max_target_seqs 5



Building a new DB, current time: 07/21/2020 12:33:43
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna
New DB title:  analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 208 sequences in 0.0233161 seconds.


mkdir: analyses/codiv_moeller_ASVs: File exists


### Subset the matching ASVs 

In [4]:
# Parse the tabular BLAST report ('#' comment lines are skipped) and name the columns
# in the order they were requested with -outfmt.
blast_column_names = ['ASV','codiv_clade_seq','sseqid','pident','length','qlen','evalue']
blast_res = pd.read_csv(
    'analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt',
    sep='\t',
    comment='#',
    header=None,
)
blast_res.columns = blast_column_names

# Keep only the first-listed (top) hit of every ASV.
blast_res = blast_res.groupby('ASV').head(1).reset_index(drop=True)

# Threshold 1: percent identity strictly above the configured cutoff.
blast_res_pass = blast_res[blast_res['pident'] > pident_cutoff].reset_index(drop=True)
print(len(blast_res_pass))

# Threshold 2: alignment length strictly above 200.
blast_res_pass = blast_res_pass[blast_res_pass['length'] > 200].reset_index(drop=True)
print(len(blast_res_pass),'ASVs matching co-div clades with greater than',pident_cutoff,'percent identity')

# The clade label is the second space-separated token of the subject title;
# the lineage is the part of that label before the first underscore.
blast_res_pass['codiv_clade'] = blast_res_pass['codiv_clade_seq'].map(lambda title: title.split(' ')[1])
blast_res_pass['lineage'] = blast_res_pass['codiv_clade'].map(lambda clade: clade.split('_')[0])

output_columns = ['ASV', 'lineage', 'codiv_clade', 'codiv_clade_seq',
                  'sseqid', 'pident', 'length', 'qlen', 'evalue']
blast_res_pass = blast_res_pass[output_columns]

# Save the passing hits; they are used below to determine the MRCA of each lineage.
blast_res_pass.to_csv('analyses/codiv_moeller_ASVs/codiv_blastout_ASVs_passing.txt',sep='\t',index=False)

def get_lineage_ASVs(tree_file,listASVs):
    """Return the names of all leaves below the common ancestor of `listASVs`.

    The tree is loaded from `tree_file` with `Tree(..., format=0)`. Taking every leaf
    under the common ancestor makes sure ASVs that did not match in the BLAST search,
    but descend from the MRCA of those that did, are not missed. Leaf names are
    returned as-is (no filtering on the name).

    Prints how many names were passed in and how many leaves the clade contains.
    """
    phylogeny = Tree(tree_file, format=0)
    mrca_node = phylogeny.get_common_ancestor(listASVs)
    clade_leaf_names = [leaf.name for leaf in mrca_node.iter_leaves()]
    print(len(listASVs), 'ASVs identified in blast search but', len(clade_leaf_names), 'ASVs are within the monophyletic clade')
    return clade_leaf_names

def _lineage_asv_tables(lineage, blast_hits, tree_path):
    """Build the per-lineage tables for one co-diversifying lineage (e.g. 'Bt1').

    Parameters
    ----------
    lineage : str
        Label searched for (as a substring) in the 'codiv_clade_seq' column.
    blast_hits : pandas.DataFrame
        Passing BLAST hits with at least the columns 'ASV' and 'codiv_clade_seq'.
    tree_path : str
        Tree file handed to `get_lineage_ASVs`.

    Returns
    -------
    tuple
        (hits_table, hits_asvs, lineage_asvs, lineage_asvs_merged):
        the BLAST hits of the lineage, their ASV names, a one-column ('ASV') frame of
        all leaves under the MRCA of those hits, and that frame left-merged with
        `blast_hits` with the 'lineage' column set to `lineage`.
    """
    print(lineage + ' lineage')
    hits_table = blast_hits[blast_hits['codiv_clade_seq'].apply(lambda title: lineage in title)]
    hits_asvs = list(hits_table['ASV'])
    lineage_asvs = pd.DataFrame(get_lineage_ASVs(tree_path, hits_asvs), columns=['ASV'])
    # Left merge: leaves without a passing BLAST hit keep NaN in the BLAST columns.
    lineage_asvs_merged = lineage_asvs.merge(blast_hits, on='ASV', how='left')
    lineage_asvs_merged['lineage'] = lineage
    return hits_table, hits_asvs, lineage_asvs, lineage_asvs_merged


# The three lineages are processed identically; `tree_file` is the configured
# 'inputs/ASVs_filtered_ref_full.tree' that was previously repeated as a literal.
(Bt1_blasthits_table, Bt1_blasthits_ASVs,
 Bt1_lineage_ASVs, Bt1_lineage_ASVs_merged) = _lineage_asv_tables('Bt1', blast_res_pass, tree_file)
(Bt2_blasthits_table, Bt2_blasthits_ASVs,
 Bt2_lineage_ASVs, Bt2_lineage_ASVs_merged) = _lineage_asv_tables('Bt2', blast_res_pass, tree_file)
(Bt3_blasthits_table, Bt3_blasthits_ASVs,
 Bt3_lineage_ASVs, Bt3_lineage_ASVs_merged) = _lineage_asv_tables('Bt3', blast_res_pass, tree_file)

#append tables to output a single file
codiv_clades = pd.concat([Bt1_lineage_ASVs_merged,Bt2_lineage_ASVs_merged,Bt3_lineage_ASVs_merged])
codiv_clades.to_csv('analyses/codiv_moeller_ASVs/codiv_clades_ASVs.txt',sep='\t',index=False)

316
314 ASVs matching co-div clades with greater than 95 percent identity
Bt1 lineage
41 ASVs identified in blast search but 41 ASVs are within the monophyletic clade
Bt2 lineage
156 ASVs identified in blast search but 360 ASVs are within the monophyletic clade
Bt3 lineage
117 ASVs identified in blast search but 586 ASVs are within the monophyletic clade


In [5]:
# Lookup tables used by the cells below:
#   sample_type_dict    : sample ID ('X.SampleID') -> sample description ('Description')
#   ASV_taxonomy_dict   : ASV -> concatenated Family + Genus label
#   ASV_sampleName_dict : ASV -> list of the samples in which its count is > 0

metadata = pd.read_csv(metadata_file, sep='\t', index_col=None)
sample_type_dict = {
    sample_id: description
    for sample_id, description in zip(metadata['X.SampleID'], metadata['Description'])
}

tax_table = pd.read_csv(tax_table_file, sep='\t', index_col=0)
tax_table['FamilyGenus'] = tax_table['Family'] + tax_table['Genus']
ASV_taxonomy_dict = {
    asv: family_genus
    for asv, family_genus in zip(tax_table.index, tax_table['FamilyGenus'])
}
print(ASV_taxonomy_dict["ASV_1"])

asv_table = pd.read_csv(asv_table_file, sep='\t', index_col=0)
ASV_sampleName_dict = {
    asv: list(asv_table.columns[counts > 0])
    for asv, counts in asv_table.iterrows()
}

f__Bacteroidaceaeg__Bacteroides_B


In [46]:
from pandas.core.common import flatten

# Tree used for the clade searches in this and the following cell. Note that this is the
# ref_amp tree, whereas the lineage membership above was derived from the ref_full tree.
full_tree = Tree("inputs/ASVs_filtered_ref_amp.tree", format=0)
# Subtree rooted at the common ancestor of every leaf assigned to the Bt1 lineage.
Bt1_tree = full_tree.get_common_ancestor(Bt1_lineage_ASVs['ASV'].tolist())

def is_HR(sampleNames):
    """Classify a set of samples as host-restricted, mixed or captive-only.

    Each sample name is translated to its description through `sample_type_dict`; the
    'non_western_' / 'western_' prefixes are stripped so both collapse to one label.
    Captive ape descriptions are treated as neutral and ignored for the classification.

    Returns a tuple (unique_non_captive_types, label, sample_count):
      * no non-captive type     -> ([], 'captive', number of samples)
      * exactly one type        -> ([type], type, number of samples of that type;
                                    captive ape samples are not counted)
      * more than one type      -> (types, 'mixed', number of samples)
    """
    captive_types = ('captive_gorilla', 'captive_bonobo', 'captive_chimp', 'captive_orangutan')
    all_types = [
        sample_type_dict[name].replace('non_western_', '').replace('western_', '')
        for name in sampleNames
    ]
    non_captive_types = [stype for stype in all_types if stype not in captive_types]
    distinct_types = list(set(non_captive_types))

    if not distinct_types:
        return (distinct_types, 'captive', len(all_types))
    if len(distinct_types) > 1:
        return (distinct_types, 'mixed', len(all_types))

    # Exactly one non-captive type: the clade is restricted to that host.
    host = distinct_types[0]
    return (distinct_types, host, all_types.count(host))

def search_clades(tree, samples_cutoff, BS_support):
    """Collect the nodes of `tree` that pass the support and sample-count thresholds.

    Every node yielded by `tree.traverse()` whose `support` is strictly greater than
    `BS_support` is evaluated: the leaves below it whose name contains 'ASV' are looked
    up in `ASV_taxonomy_dict` and `ASV_sampleName_dict`, and the pooled samples are
    classified with `is_HR`. The node is reported when the sample count returned by
    `is_HR` is strictly greater than `samples_cutoff`.

    Clades of every category are returned (a single host such as 'human' or
    'wild_chimp', but also 'mixed' and 'captive'); callers filter on 'HR_clade'.

    Parameters
    ----------
    tree : tree/node object providing `traverse()`, `support` and `iter_leaves()`
    samples_cutoff : number
        Exclusive lower bound on the sample count returned by `is_HR`.
    BS_support : number or str convertible with float()
        Exclusive lower bound on the node support.

    Returns
    -------
    pandas.DataFrame
        One row per passing node with the columns 'cladeName', 'HR_clade',
        'HR_sampleNum', 'ASV' (list of ASV names), 'ASVsNum', 'ASVsTax',
        'sampleNames', 'sampleNum' and 'sampleTypes'. Clades are named
        'clade_1', 'clade_2', ... in traversal order.
    """
    min_support = float(BS_support)
    passing_clades = []
    for node in tree.traverse():
        if node.support > min_support:
            asvs = [leaf.name for leaf in node.iter_leaves() if 'ASV' in leaf.name]
            # Plain set comprehensions replace the private pandas `flatten` helper;
            # sorting makes the stored lists reproducible from run to run.
            asv_taxa = sorted({ASV_taxonomy_dict[asv] for asv in asvs}, key=str)
            sample_names = sorted(
                {name for asv in asvs for name in ASV_sampleName_dict[asv]}, key=str
            )
            sample_types, hr_clade, hr_sample_num = is_HR(sample_names)
            if hr_sample_num > samples_cutoff:
                clade_name = 'clade_' + str(len(passing_clades) + 1)
                passing_clades.append([clade_name, hr_clade, hr_sample_num,
                                       asvs, len(asvs), asv_taxa,
                                       sample_names, len(sample_names), sample_types])
    clades = pd.DataFrame(passing_clades, columns=
                          ['cladeName','HR_clade','HR_sampleNum',
                           'ASV','ASVsNum','ASVsTax',
                           'sampleNames','sampleNum','sampleTypes'])
    return clades

# Initial search: reports every passing node of the Bt1 subtree, so nested sub-clades of
# the largest host-restricted clades are all present at this point.
clades_Bt1 = search_clades(Bt1_tree,samples_cutoff=5,BS_support=.5)

# Split by category. One combined mask is used for the host-restricted clades instead of
# chaining df[mask1][mask2], which applied a mask built on the full frame to an already
# filtered frame.
HRclades_Bt1 = clades_Bt1[~clades_Bt1['HR_clade'].isin(['mixed','captive'])]
MXclades_Bt1 = clades_Bt1[clades_Bt1['HR_clade']=='mixed']
CPclades_Bt1 = clades_Bt1[clades_Bt1['HR_clade']=='captive']

def eliminate_redundant_clades(search_clades_res):
    """Keep only the largest clades, dropping clades nested inside an already kept one.

    Clades are visited from most to fewest ASVs ('ASVsNum'). A clade is kept when its
    first ASV has not been claimed by a previously kept clade; all of its ASVs are then
    marked as claimed. Only the first ASV is checked, which presumably relies on the
    clades coming from one tree (so two clades are either nested or disjoint).
    Clades with an empty ASV list are skipped.

    Parameters
    ----------
    search_clades_res : pandas.DataFrame
        Needs the columns 'cladeName', 'ASV' (list of ASV names) and 'ASVsNum'.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered by 'ASVsNum' descending. The input is not modified.
    """
    df = search_clades_res.sort_values('ASVsNum',ascending=False) #start with the largest clades first
    kept_clade_names = []
    claimed_asvs = set()  # set membership keeps the scan linear in the number of ASVs
    for clade_name, clade_asvs in zip(df['cladeName'], df['ASV']):
        if len(clade_asvs) == 0:
            continue
        if clade_asvs[0] not in claimed_asvs:
            claimed_asvs.update(clade_asvs)
            kept_clade_names.append(clade_name)
    return df[df['cladeName'].isin(kept_clade_names)]
# Collapse nested host-restricted clades to the largest ones, save them, report the count.
HRclades_Bt1 = eliminate_redundant_clades(HRclades_Bt1)
HRclades_Bt1.to_csv(
    'analyses/codiv_moeller_ASVs/HRclades_Bt1.txt',
    sep='\t',
    index=False,
)
print(len(HRclades_Bt1), 'HR clades')

def expand_clades(clades_df):
    """Expand a clade table to one row per (clade, ASV) pair.

    Parameters
    ----------
    clades_df : pandas.DataFrame
        Table with an 'ASV' column holding a list of ASV names per clade.

    Returns
    -------
    pandas.DataFrame
        The other columns of `clades_df` in their original order followed by 'ASV',
        now holding a single ASV name per row. Each row keeps the index label of the
        clade it came from, so labels repeat. The input is not modified.
    """
    other_columns = [column for column in clades_df.columns if column != 'ASV']
    # explode() does the list -> rows expansion in one call instead of building a
    # pandas Series for every row and re-joining the stacked result.
    clades_ASVs_df = clades_df.explode('ASV')
    return clades_ASVs_df[other_columns + ['ASV']]

# One row per ASV of the non-redundant host-restricted Bt1 clades; report and save.
HRclades_Bt1_ASVs = expand_clades(HRclades_Bt1)
print(len(HRclades_Bt1_ASVs), 'HR ASVs')
HRclades_Bt1_ASVs.to_csv(
    'analyses/codiv_moeller_ASVs/HRclades_Bt1_ASVs.txt',
    sep='\t',
    index=False,
)


def is_unique_MX(HRclades_ASVs,cladeASVs):
    """Return True when `cladeASVs` shares no ASV with `HRclades_ASVs`, False otherwise."""
    return set(cladeASVs).isdisjoint(HRclades_ASVs)

# All ASVs that belong to a retained host-restricted Bt1 clade.
HRclades_ASVs = list(HRclades_Bt1_ASVs['ASV'])        
# Keep only the mixed clades that share no ASV with any host-restricted clade.
MXclades_Bt1 = MXclades_Bt1[MXclades_Bt1['ASV'].apply(lambda cladeASVs: is_unique_MX(HRclades_ASVs,cladeASVs))]
# Collapse nested mixed clades to the largest ones, then expand to one row per ASV.
MXclades_Bt1 = eliminate_redundant_clades(MXclades_Bt1)
MXclades_Bt1_ASVs = expand_clades(MXclades_Bt1)
# Number of ASVs in the remaining mixed clades.
print(len(MXclades_Bt1_ASVs))



3 HR clades
41 HR ASVs
0


In [56]:
# Same workflow as for Bt1, applied to the Bt2 lineage.
Bt2_ASVs = list(Bt2_lineage_ASVs['ASV'])
print(len(Bt2_ASVs),'total ASVs')
Bt2_tree = full_tree.get_common_ancestor(Bt2_ASVs)
clades_Bt2 = search_clades(Bt2_tree,samples_cutoff=5,BS_support=.5)

# Split by category using masks built on (and applied to) clades_Bt2 only.
HRclades_Bt2 = clades_Bt2[~clades_Bt2['HR_clade'].isin(['mixed','captive'])]
MXclades_Bt2 = clades_Bt2[clades_Bt2['HR_clade']=='mixed']
# The captive clades must come from clades_Bt2; indexing clades_Bt1 with a Bt2 mask
# selected unrelated Bt1 rows.
CPclades_Bt2 = clades_Bt2[clades_Bt2['HR_clade']=='captive']

# Collapse nested host-restricted clades, then expand to one row per ASV and save.
HRclades_Bt2 = eliminate_redundant_clades(HRclades_Bt2)
print(len(HRclades_Bt2),'HR clades')
HRclades_Bt2_ASVs = expand_clades(HRclades_Bt2)
HRclades_Bt2_ASVs.to_csv('analyses/codiv_moeller_ASVs/HRclades_Bt2_ASVs.txt',sep='\t',index=False)

print(HRclades_Bt2_ASVs)
HRclades_ASVs = list(HRclades_Bt2_ASVs['ASV'])
print(len(HRclades_ASVs),'HR ASVs')

# Keep only the mixed clades that share no ASV with a host-restricted clade.
MXclades_Bt2 = MXclades_Bt2[MXclades_Bt2['ASV'].apply(lambda cladeASVs: is_unique_MX(HRclades_ASVs,cladeASVs))]
MXclades_Bt2 = eliminate_redundant_clades(MXclades_Bt2)
print(MXclades_Bt2['ASV'])
MXclades_Bt2_ASVs = expand_clades(MXclades_Bt2)



360 total ASVs


  
  


36 HR clades
     cladeName    HR_clade  HR_sampleNum  ASVsNum  \
4      clade_5       human           301       89   
4      clade_5       human           301       89   
4      clade_5       human           301       89   
4      clade_5       human           301       89   
4      clade_5       human           301       89   
..         ...         ...           ...      ...   
331  clade_332  wild_chimp            38        7   
331  clade_332  wild_chimp            38        7   
331  clade_332  wild_chimp            38        7   
331  clade_332  wild_chimp            38        7   
331  clade_332  wild_chimp            38        7   

                                               ASVsTax  \
4    [f__Bacteroidaceaeg__Prevotella, f__Bacteroida...   
4    [f__Bacteroidaceaeg__Prevotella, f__Bacteroida...   
4    [f__Bacteroidaceaeg__Prevotella, f__Bacteroida...   
4    [f__Bacteroidaceaeg__Prevotella, f__Bacteroida...   
4    [f__Bacteroidaceaeg__Prevotella, f__Bacteroida...   
..

In [9]:
# Host-restricted clades across the complete reference tree (ref_full). Note that
# `fulltree` is a different tree from `full_tree` (ref_amp) used for the Bt1/Bt2 searches.
fulltree = Tree("inputs/ASVs_filtered_ref_full.tree", format=0)
clades_fulltree = search_clades(fulltree,samples_cutoff=5,BS_support=.5)

# search_clades reports clades of every category, so restrict to host-restricted ones
# (as done for Bt1 and Bt2) before removing redundancy. Without this filter the largest
# passing clade, whatever its category, would claim its ASVs first and mask every
# host-restricted clade nested inside it.
HRclades_fulltree = clades_fulltree[~clades_fulltree['HR_clade'].isin(['mixed','captive'])]
HRclades_fulltree = eliminate_redundant_clades(HRclades_fulltree)
HRclades_fulltree.to_csv('analyses/codiv_moeller_ASVs/HRclades_fulltree.txt',sep='\t',index=False)

# One row per ASV; the last expression shows how many ASVs fall in each host category.
HRclades_fulltree_ASVs = expand_clades(HRclades_fulltree)
HRclades_fulltree_ASVs.to_csv('analyses/codiv_moeller_ASVs/HRclades_fulltree_ASVs.txt',sep='\t',index=False)
HRclades_fulltree_ASVs['HR_clade'].value_counts()

human           5625
wild_bonobo      342
wild_gorilla     168
wild_chimp       111
Name: HR_clade, dtype: int64

### Tables summarizing metadata of samples harboring ASVs

In [10]:
#creates a table where ASVs are rows and columns are sample descriptions (i.e. wild_chimp, captive_orangutan)
#values are the number of samples of each type harboring the ASV, or that number divided by
#the total number of samples belonging to that type

# One value_counts Series per ASV; sample_type_dict relates sample names to their descriptions.
# The Series are combined with a single concat instead of growing a DataFrame inside the loop.
counts_per_ASV = [
    pd.Series([sample_type_dict[name] for name in sample_names]).value_counts()
    for sample_names in ASV_sampleName_dict.values()
]
sample_type_num = pd.concat(counts_per_ASV, axis=1, sort=False, keys=list(ASV_sampleName_dict))
sample_type_num = sample_type_num.T   # rows = ASVs, columns = sample descriptions
sample_type_num = sample_type_num.fillna(0)
print(sample_type_num.head())
# NOTE(review): index=False drops the ASV names (the row index) from the written file -
# confirm this is intended; the captive-only tables written later keep an 'ASV' column.
sample_type_num.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_ASV_count.txt',sep='\t',index=False)

# Total number of samples of each description, used as the denominator.
sample_type_counts =  metadata['Description'].value_counts()
print(sample_type_counts)
sample_type_perc = sample_type_num/sample_type_counts
print(sample_type_perc.head())
# NOTE(review): same remark as above regarding index=False.
sample_type_perc.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_ASV_perc.txt',sep='\t',index=False)


       western_human  non_western_human  captive_chimp  captive_gorilla  \
ASV_1         2917.0               64.0            3.0              2.0   
ASV_2         2074.0               14.0            9.0              5.0   
ASV_3         1907.0               17.0            1.0              2.0   
ASV_4         1798.0               18.0            0.0              0.0   
ASV_5         1208.0               18.0            0.0              2.0   

       wild_chimp  captive_bonobo  captive_orangutan  wild_bonobo  \
ASV_1         1.0             1.0                0.0          0.0   
ASV_2         9.0             0.0                4.0          0.0   
ASV_3         1.0             2.0                0.0          0.0   
ASV_4         1.0             0.0                0.0          0.0   
ASV_5        18.0             2.0                4.0          0.0   

       wild_gorilla  
ASV_1           0.0  
ASV_2           0.0  
ASV_3           0.0  
ASV_4           0.0  
ASV_5           0.0  
we

In [88]:
#creates a table where ASVs are rows and columns are sample descriptions plus site
#values are the number of samples of each type/site harboring the ASV, or that number divided by
#the total number of samples belonging to that type/site

metadata['Description_site'] = metadata['Description']+'_' +metadata['site_code']
sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

# Description/site labels of captive samples; str() guards against non-string (missing)
# labels, and sorting keeps the column order of the captive-only tables reproducible.
captive_only_desc = sorted(
    desc for desc in set(metadata['Description_site']) if 'captive' in str(desc)
)

# One value_counts Series per ASV, combined with a single concat instead of growing a
# DataFrame inside the loop.
site_counts_per_ASV = [
    pd.Series([sample_type_site_dict[name] for name in sample_names]).value_counts()
    for sample_names in ASV_sampleName_dict.values()
]
sample_type_site_num = pd.concat(site_counts_per_ASV, axis=1, sort=False, keys=list(ASV_sampleName_dict))
sample_type_site_num = sample_type_site_num.T   # rows = ASVs, columns = description_site labels
sample_type_site_num = sample_type_site_num.fillna(0)
print(sample_type_site_num.head())
# NOTE(review): index=False drops the ASV names (the row index) from the written file -
# confirm this is intended.
sample_type_site_num.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_ASV_count.txt',sep='\t',index=False)

# Total number of samples per description/site, used as the denominator.
sample_type_site_counts =  metadata['Description_site'].value_counts()
print(sample_type_site_counts)   # print the per-site totals computed on the line above
sample_type_site_perc = sample_type_site_num/sample_type_site_counts
print(sample_type_site_perc.head())
# NOTE(review): same remark as above regarding index=False.
sample_type_site_perc.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_ASV_perc.txt',sep='\t',index=False)



       western_human_CHN  western_human_DNK  western_human_USA  \
ASV_1              571.0              278.0              269.0   
ASV_2              382.0              255.0              134.0   
ASV_3              447.0              229.0              174.0   
ASV_4              366.0              179.0              160.0   
ASV_5              265.0               94.0              128.0   

       western_human_SWE  western_human_ISR  western_human_ESP  \
ASV_1              233.0              220.0              204.0   
ASV_2              203.0              160.0              141.0   
ASV_3              177.0              121.0              129.0   
ASV_4              130.0              212.0              128.0   
ASV_5              172.0               26.0               44.0   

       western_human_NLD  western_human_DEU  western_human_GBR  \
ASV_1              194.0              165.0              137.0   
ASV_2              111.0              124.0              126.0   
ASV_3   

In [11]:
#subset to just captive ape descriptions
# rename_axis/reset_index return new frames, so the ASV names become a regular 'ASV'
# column without in-place edits on a frame derived from sample_type_site_num /
# sample_type_site_perc (setting .index.name on such a subset may also rename the
# parent's index when pandas shares the Index object).
sample_type_site_num_just_captive = (
    sample_type_site_num[captive_only_desc]
    .rename_axis('ASV')
    .reset_index()
)
sample_type_site_num_just_captive.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_captive_ASV_count.txt',sep='\t',index=False)

sample_type_site_perc_just_captive = (
    sample_type_site_perc[captive_only_desc]
    .rename_axis('ASV')
    .reset_index()
)
sample_type_site_perc_just_captive.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_captive_ASV_perc.txt',sep='\t',index=False)



NameError: name 'sample_type_site_num' is not defined