In [11]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter

# Work from the gyrb results directory; every path used below is relative to it.
# Set the GYRB_RESULTS_DIR environment variable to run the notebook from another
# location; without it the original hard-coded volume path is used.
os.chdir(os.environ.get('GYRB_RESULTS_DIR',
                        '/Volumes/AHN/captive_ape_microbiome/results/gyrb/'))


In [12]:
#inputs
# Relative paths: they resolve against the directory chosen by os.chdir() in the first cell.
metadata_file = 'inputs/metadata_gyrb_amp_meta_passing_samples.txt'
tax_table_file = 'inputs/ASVs_taxonomy.txt'
asv_table_file = 'inputs/ASVs_filtered_counts.tsv'
asv_fasta_file = 'inputs/ASVs_filtered.fasta'
full_tree_file = 'inputs/ASVs_filtered_ref_full.tree'
# Parsed once here; read as a global by host_restricted_clades() and by the ASV info cell.
full_tree = Tree(full_tree_file, format=0)
# NOTE(review): the %%bash cell copies inputs/moeller_codiv_Bacteroidaceae.fna, not this
# path, and this variable is not referenced in the cells shown here -- confirm which
# file name is current.
moeller_codiv_fasta = 'inputs/codiv_Bacteroidaceae.fna'
# BLAST hits are kept only when pident is strictly greater than this value.
pident_cutoff = 95

In [13]:
# ASV count table: for every row label keep the column labels whose count is > 0
# (presumably rows are ASVs and columns are samples, as the variable names suggest)
asv_table = pd.read_csv(asv_table_file, sep='\t', index_col=0)
sampleNames = asv_table.apply(lambda counts: list(counts.index[counts > 0]), axis=1)
ASV_sampleName_dict = {asv: samples for asv, samples in zip(sampleNames.index, sampleNames)}

# sample ID -> sample type category (the 'Description' column)
metadata = pd.read_csv(metadata_file, sep='\t', index_col=None)
sample_type_dict = {
    sample_id: description
    for sample_id, description in zip(metadata['X.SampleID'], metadata['Description'])
}

# ASV -> family and ASV -> genus lookups
tax_table = pd.read_csv(tax_table_file, sep='\t', index_col=None)
tax_fam_dict = {asv: family for asv, family in zip(tax_table['ASV'], tax_table['Family'])}
tax_gen_dict = {asv: genus for asv, genus in zip(tax_table['ASV'], tax_table['Genus'])}
    

### Determine ASVs that hit to Moeller co-div clades

In [14]:
%%bash
# Abort on the first failing command so blastn never runs against a missing or stale database
set -e

# -p: succeed when the directory already exists from a previous run
mkdir -p analyses/codiv_moeller_ASVs
cp inputs/moeller_codiv_Bacteroidaceae.fna analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna

#make blastdb
makeblastdb -in analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna -dbtype nucl

#blast the filtered ASVs (query) against the moeller co-div seqs (database)
blastn -query inputs/ASVs_filtered.fasta -db analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna \
-outfmt "7 qseqid salltitles sseqid pident length qlen evalue" \
-out analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt \
-max_target_seqs 5



Building a new DB, current time: 08/08/2020 23:12:20
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna
New DB title:  analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/moeller_codiv_Bacteroidaceae.fna
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 208 sequences in 0.0111239 seconds.


mkdir: analyses/codiv_moeller_ASVs: File exists


In [15]:
#filter blast output to top hit per ASV, remove hits not pass thresholds
blast_res = pd.read_csv(
    'analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt',
    sep='\t',
    comment='#',
    header=None,
)
blast_res.columns = ['ASV','codiv_clade_seq','sseqid','pident','length','qlen','evalue']

# first row of every ASV group is taken as its top hit (relies on the row order of the file)
blast_res = blast_res.groupby('ASV').head(1).reset_index(drop=True)

# QC filter: percent identity above the cutoff and alignment length above 200
blast_res_pass = blast_res[
    (blast_res['pident'] > pident_cutoff) & (blast_res['length'] > 200)
].reset_index(drop=True)
print(len(blast_res_pass),'ASVs matching co-div clades with greater than',pident_cutoff,'percent identity')

# second space-separated token of the subject title is the clade label;
# the part of the clade label before the first underscore is the lineage
blast_res_pass['codiv_clade'] = [title.split(' ')[1] for title in blast_res_pass['codiv_clade_seq']]
blast_res_pass['lineage'] = [clade.split('_')[0] for clade in blast_res_pass['codiv_clade']]
blast_res_pass = blast_res_pass.loc[:, ['ASV','codiv_clade_seq', 'codiv_clade','lineage']]

print(blast_res_pass.head())
print(blast_res_pass['codiv_clade'].value_counts())

#output hits passing that will be used to determine the mcra of the lineage
blast_res_pass.to_csv(
    'analyses/codiv_moeller_ASVs/codiv_clades_ASVs.txt',
    sep='\t',
    index=False,
)


314 ASVs matching co-div clades with greater than 95 percent identity
      ASV                codiv_clade_seq       codiv_clade lineage
0   ASV_1  Human6700665 Bt3_clade1_human  Bt3_clade1_human     Bt3
1  ASV_13  Human7859459 Bt3_clade1_human  Bt3_clade1_human     Bt3
2  ASV_34  Human9825266 Bt3_clade1_human  Bt3_clade1_human     Bt3
3  ASV_37  Human8829943 Bt3_clade1_human  Bt3_clade1_human     Bt3
4  ASV_73  Human9825266 Bt3_clade1_human  Bt3_clade1_human     Bt3
Bt2_clade1_bonobo     90
Bt3_clade1_human      85
Bt2_clade2_bonobo     33
Bt1_clade1_bonobo     23
Bt2_clade1_chimp      20
Bt3_clade1_chimp      17
Bt3_clade1_bonobo     15
Bt1_clade1_gorilla    10
Bt1_clade1_chimp       8
Bt2_clade1_gorilla     7
Bt2_clade2_chimp       6
Name: codiv_clade, dtype: int64


### Defining host-restricted clades

In [16]:
def is_HR(sampleNames,sample_type_dict):
    """Classify a set of samples by the host sample types they come from.

    Each sample name is looked up in ``sample_type_dict``; the prefixes
    'non_western_' and 'western_' are stripped from the resulting type so both
    collapse onto the same label. The four captive ape types are treated as
    neutral: they are reported separately and do not count towards host
    restriction.

    Returns a 7-tuple:
        HR_sampleTypes  distinct non-captive sample types
        HR_sampleNum    number of samples with a non-captive type
        HR_cat          'CP' (no non-captive type), 'HR' (exactly one) or 'MX' (several)
        HR_type         the single non-captive type when HR_cat is 'HR', else 'CP'/'MX'
        CP_pres         True when at least one captive type is present
        CP_sampleTypes  distinct captive sample types
        CP_sampleNum    number of samples with a captive type
    """
    neutral_sampleTypes = ['captive_gorilla','captive_bonobo','captive_chimp','captive_orangutan']

    raw_types = [sample_type_dict[name] for name in sampleNames]
    sampleTypes = [t.replace('non_western_','').replace('western_','') for t in raw_types]

    observed = set(sampleTypes)
    captive = set(neutral_sampleTypes)
    HR_sampleTypes = list(observed - captive)
    CP_sampleTypes = list(observed & captive)

    CP_sampleNum = sum(1 for t in sampleTypes if t in neutral_sampleTypes)
    HR_sampleNum = len(sampleTypes) - CP_sampleNum
    CP_pres = bool(CP_sampleTypes)

    n_host_types = len(HR_sampleTypes)
    if n_host_types == 0:
        HR_cat, HR_type = 'CP', 'CP'
    elif n_host_types == 1: #identifies host-restricted clades
        HR_cat, HR_type = 'HR', HR_sampleTypes[0]
    else:
        HR_cat, HR_type = 'MX', 'MX'
    return (HR_sampleTypes, HR_sampleNum, HR_cat, HR_type, CP_pres, CP_sampleTypes, CP_sampleNum)
           
           
# Quick check of is_HR on four sample IDs taken from the metadata table
print(
    is_HR(
        [
            'CosteaPI_2017__SID713A076-11-0-0',
            'LiSS_2016__FAT_024-22-42-0',
            'LiSS_2016__FAT_024-22-84-0',
            'LiSS_2016__FAT_DON_11-22-0-4',
        ],
        sample_type_dict,
    )
)

def get_consensus_taxonomy(listASVs,tax_fam_dict,tax_gen_dict):
    """Return a '<family>_<genus>' label shared by all ASVs in ``listASVs``.

    Family and genus are looked up per ASV; the literal value 'Unassigned' is
    ignored. A rank is reported only when exactly one distinct value remains,
    in which case the text after the '__' separator is used; otherwise the rank
    is 'Unassigned'. The genus is forced to 'Unassigned' whenever the family is.
    """
    families = set([tax_fam_dict[asv] for asv in listASVs]) - {'Unassigned'}
    genera = set([tax_gen_dict[asv] for asv in listASVs]) - {'Unassigned'}

    family_label = 'Unassigned'
    if len(families) == 1:
        family_label = next(iter(families)).split('__')[1]

    genus_label = 'Unassigned'
    if family_label != 'Unassigned' and len(genera) == 1:
        genus_label = next(iter(genera)).split('__')[1]

    return family_label + '_' + genus_label
# Quick check of the consensus label for three ASVs from the taxonomy table
print(
    get_consensus_taxonomy(
        listASVs=['ASV_1','ASV_2','ASV_1078'],
        tax_fam_dict=tax_fam_dict,
        tax_gen_dict=tax_gen_dict,
    )
)

def search_clades(tree, samples_cutoff, BS_support,ASV_sampleName_dict):
    """Collect well-supported nodes of ``tree`` together with their ASVs and samples.

    Every node whose ``support`` is strictly greater than ``BS_support`` gets a
    sequential name ('clade_1', 'clade_2', ...). For such a node the leaves whose
    name contains 'ASV' are gathered, and the samples of those ASVs are pooled
    (without duplicates) through ``ASV_sampleName_dict``. Only nodes with more
    than ``samples_cutoff`` distinct samples are returned; nodes dropped by that
    filter still consume a clade number. No host-restriction test happens here.

    Parameters
    ----------
    tree : object with ``traverse()`` yielding nodes that expose ``support``
        and ``iter_leaves()`` (an ete3 Tree in this notebook)
    samples_cutoff : number of distinct samples a clade must exceed
    BS_support : support threshold a node must exceed (converted with float())
    ASV_sampleName_dict : mapping ASV name -> list of sample names

    Returns
    -------
    pandas.DataFrame with columns cladeName, ASVs, sampleNames, sampleNum, ASVsNum
    """
    # Local import: `flatten`, used here originally, is not defined or imported
    # anywhere in the notebook, so the function failed on a fresh kernel.
    from itertools import chain

    min_support = float(BS_support)
    clades_prelim = []
    counter = 1
    for n in tree.traverse():
        if n.support > min_support: #makes sure Bootstrap support is over threshold
            ASVs = [leaf.name for leaf in n.iter_leaves() if 'ASV' in leaf.name]
            sampleNames = list(set(chain.from_iterable(ASV_sampleName_dict[ASV] for ASV in ASVs)))
            cladeName = 'clade_'+str(counter)
            counter += 1
            if len(sampleNames) > samples_cutoff:
                clades_prelim.append([cladeName, ASVs, sampleNames])
    clades_prelim = pd.DataFrame(clades_prelim, columns=['cladeName','ASVs','sampleNames'])
    clades_prelim['sampleNum'] = clades_prelim['sampleNames'].apply(len)
    clades_prelim['ASVsNum'] = clades_prelim['ASVs'].apply(len)
    return clades_prelim

def eliminate_redundant_clades(clades_df,offlimits_ASVs):
    """Greedily keep the largest clades that share no ASV with each other.

    Clades are visited from most to fewest ASVs (column 'ASVsNum'). A clade is
    kept when none of its ASVs is in ``offlimits_ASVs`` or in a clade kept
    earlier; its ASVs then become off limits for the remaining clades.

    Parameters
    ----------
    clades_df : DataFrame with columns 'cladeName', 'ASVs' (list per row) and 'ASVsNum'
    offlimits_ASVs : ASVs that no returned clade may contain; not modified

    Returns
    -------
    The rows of ``clades_df`` for the kept clades, in descending 'ASVsNum' order.
    """
    df = clades_df.sort_values('ASVsNum',ascending=False) #start with the largest clades first
    # One running set instead of rebuilding a set from a growing list on every row
    claimed = set(offlimits_ASVs)
    NRclades = []
    for cladeName, ASVs in zip(df['cladeName'], df['ASVs']):
        if claimed.isdisjoint(ASVs):
            claimed.update(ASVs)
            NRclades.append(cladeName)
    return df[df['cladeName'].isin(NRclades)]

def expand_clades(clades_df):
    """Turn a one-row-per-clade table into a one-row-per-ASV table.

    The list stored in column 'ASVs' is spread over rows: every clade row is
    repeated once per ASV, keeping its original index label and all other
    columns, and the single ASV is placed in a new last column 'ASVs'.
    """
    # one column per list position, then stacked into a long Series keyed by clade row
    per_position = clades_df.apply(lambda row: pd.Series(row['ASVs']), axis=1)
    long_ASVs = per_position.stack().reset_index(level=1, drop=True).rename('ASVs')
    return clades_df.drop(columns='ASVs').join(long_ASVs)
                                 
def host_restricted_clades(asv_table_file,metadata_file,tax_table_file,
                           tree=None,min_samples=5,min_support=.5):
    """Find non-overlapping host-restricted (HR), mixed (MX) and captive-only (CP) clades.

    Steps:
      1. Read the ASV table, sample metadata and taxonomy table and build the
         lookups ASV -> samples, sample -> sample type, ASV -> family/genus.
      2. search_clades() lists nodes with support above ``min_support`` and
         more than ``min_samples`` samples; is_HR() labels each one.
      3. HR clades are picked first (largest first, no shared ASVs), then MX
         clades that share no ASV with an HR clade, then CP clades that share
         no ASV with an HR or MX clade.
      4. Each kept clade gets a consensus taxonomy label and, per sample type,
         the number of its samples of that type divided by the number of
         samples of that type in the metadata.

    Parameters
    ----------
    asv_table_file, metadata_file, tax_table_file : paths to tab-separated tables
    tree : tree to search; defaults to the notebook-level ``full_tree``
    min_samples : a clade must have more than this many samples
        (non-captive samples for HR clades, all samples for MX and CP clades)
    min_support : a node's support must exceed this value

    Returns
    -------
    (clades, clades_ASVs) : one row per clade, and one row per clade/ASV pair.
    """
    if tree is None:
        tree = full_tree  # notebook-level tree parsed in the inputs cell

    asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
    sampleNames = asv_table.apply(lambda row: list(row.index[row>0]),axis=1)
    ASV_sampleName_dict = dict(zip(sampleNames.index,sampleNames))

    #sample to sample type category
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description']))

    #taxonomic info, family and genus
    tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
    tax_fam_dict = dict(zip(tax_table['ASV'], tax_table['Family']))
    tax_gen_dict = dict(zip(tax_table['ASV'], tax_table['Genus']))

    #search tree for clades
    clades_prelim = search_clades(tree, min_samples, min_support, ASV_sampleName_dict)
    hr = clades_prelim['sampleNames'].apply(lambda x: pd.Series(is_HR(x,sample_type_dict),
                index=['HR_sampleTypes','HR_sampleNum','HR_cat','HR_type','CP_pres','CP_sampleTypes','CP_sampleNum']))
    clades_prelim_hr = clades_prelim.merge(hr,right_index=True,left_index=True)

    #identify HR clades
    HR_clades = clades_prelim_hr[clades_prelim_hr['HR_cat']=='HR']
    HR_clades = HR_clades[HR_clades['HR_sampleNum']>min_samples]
    HR_clades = eliminate_redundant_clades(HR_clades,[])
    HR_clades_ASVs = expand_clades(HR_clades)

    #identify MX clades that don't contain any HR clades
    MX_clades = clades_prelim_hr[clades_prelim_hr['HR_cat']=='MX']
    MX_clades = MX_clades[MX_clades['sampleNum']>min_samples]
    MX_clades = eliminate_redundant_clades(MX_clades,offlimits_ASVs=list(HR_clades_ASVs['ASVs']))
    MX_clades_ASVs = expand_clades(MX_clades)

    #identify CP clades that don't contain any HR or MX clades
    CP_clades = clades_prelim_hr[clades_prelim_hr['HR_cat']=='CP']
    CP_clades = CP_clades[CP_clades['sampleNum']>min_samples]
    HR_MX_ASVs = list(HR_clades_ASVs['ASVs']) + list(MX_clades_ASVs['ASVs'])
    CP_clades = eliminate_redundant_clades(CP_clades,offlimits_ASVs=HR_MX_ASVs)
    CP_clades_ASVs = expand_clades(CP_clades)

    #merge dataframes
    clades = pd.concat([HR_clades,MX_clades,CP_clades]).reset_index(drop=True)
    clades_ASVs = pd.concat([HR_clades_ASVs,MX_clades_ASVs,CP_clades_ASVs]).reset_index(drop=True)

    clades['cladeTax'] = clades['ASVs'].apply(lambda x:
        get_consensus_taxonomy(x,tax_fam_dict,tax_gen_dict))

    #per clade: samples of each type in the clade / samples of that type in the metadata
    sample_type_counts = metadata['Description'].value_counts()
    description_df = clades['sampleNames'].apply(lambda l: pd.Series(
        [sample_type_dict[name] for name in l]).value_counts())
    description_df = description_df.fillna(0)
    sample_type_percent = description_df/sample_type_counts
    clades = clades.merge(sample_type_percent,left_index=True,right_index=True)

    return(clades,clades_ASVs)
    
# clades_df: one row per clade; clades_ASVs_df: one row per clade/ASV pair
clades_df, clades_ASVs_df = host_restricted_clades(
    asv_table_file=asv_table_file,
    metadata_file=metadata_file,
    tax_table_file=tax_table_file,
)

(['human'], 4, 'HR', 'human', False, [], 0)
Bacteroidaceae_Unassigned


In [19]:
# Per-clade table without the two list-valued columns
clades_df_sh = clades_df.drop(['ASVs','sampleNames'], axis=1)
clades_df_sh.to_csv('analyses/codiv_moeller_ASVs/table_full_tree_clades_collapsed.txt',sep='\t',index=False)

# Clades present in more than 15% of the samples of at least one captive sample type
clades_df_cp = clades_df.loc[
    (clades_df[['captive_bonobo','captive_chimp','captive_gorilla','captive_orangutan']] > 0.15).any(axis=1)
]
# Write the filtered table; previously clades_df_sh (the unfiltered table) was written
# here, so this file was a copy of table_full_tree_clades_collapsed.txt.
clades_df_cp.drop(['ASVs','sampleNames'], axis=1).to_csv(
    'analyses/codiv_moeller_ASVs/table_prominent_captive_clades.txt',sep='\t',index=False)

# last expression of the cell so the preview is displayed
clades_df.head()


Unnamed: 0,cladeName,ASVs,sampleNames,sampleNum,ASVsNum,HR_sampleTypes,HR_sampleNum,HR_cat,HR_type,CP_pres,...,cladeTax,captive_bonobo,captive_chimp,captive_gorilla,captive_orangutan,non_western_human,western_human,wild_bonobo,wild_chimp,wild_gorilla
0,clade_41,"[ASV_2623, ASV_1788, ASV_7289, ASV_7288, ASV_7...","[HMP_2012__SRS017701, LeChatelierE_2013__MH043...",788,366,[human],788,HR,human,False,...,UBA932_Unassigned,0.0,0.0,0.0,0.0,0.493151,0.078766,0.0,0.0,0.0
1,clade_8751,"[ASV_10387, ASV_408, ASV_3299, ASV_10369, ASV_...","[HeQ_2017__SZAXPI029466-158, SchirmerM_2016__G...",1751,191,[human],1732,HR,human,True,...,Bacteroidaceae_Prevotella,0.153846,0.5,0.181818,0.0,0.616438,0.208229,0.0,0.0,0.0
2,clade_2710,"[ASV_1720, ASV_2738, ASV_6484, ASV_8324, ASV_8...","[LiuW_2016__SRR3993059, QinN_2014__LV-11, LiSS...",487,190,[human],485,HR,human,True,...,Bacteroidaceae_Unassigned,0.153846,0.0,0.0,0.0,0.465753,0.036297,0.0,0.0,0.0
3,clade_1342,"[ASV_8718, ASV_838, ASV_1875, ASV_8091, ASV_28...","[LeChatelierE_2013__MH0430, ZeeviD_2015__PNP_M...",744,182,[human],744,HR,human,False,...,Bacteroidaceae_Prevotella,0.0,0.0,0.0,0.0,0.37182,0.081411,0.0,0.0,0.0
4,clade_4172,"[ASV_1694, ASV_6113, ASV_6131, ASV_1340, ASV_6...","[CosteaPI_2017__SID713A055-11-0-0, QinN_2014__...",700,147,[human],639,HR,human,True,...,Bacteroidaceae_Unassigned,0.923077,0.961538,0.863636,0.454545,0.526419,0.054372,0.0,0.0,0.0


### Collapse the identified clades into single tips on the full tree

In [18]:
# Parse a second, independent copy of the tree: collapse_node() and prune() below
# modify the tree they are given, and full_tree is still read by a later cell.
collasped_tree = Tree(full_tree_file, format=0)

def collapse_node(tree,clade_ASVs,cladeName):
    """Replace a clade of ``tree`` by a single tip called ``cladeName`` (in place).

    With more than one ASV, the common ancestor of the ASVs is renamed and all
    of its children are detached, which turns it into a leaf. With a single
    ASV, the first leaf carrying that name is simply renamed.

    Parameters
    ----------
    tree : tree object offering get_common_ancestor() and get_leaves_by_name()
    clade_ASVs : list of leaf names belonging to the clade (must not be empty)
    cladeName : name given to the collapsed tip
    """
    if len(clade_ASVs) > 1:
        node = tree.get_common_ancestor(clade_ASVs)
        node.name = cladeName
        # Detach every child, not just the first two, so that a node with more
        # than two children also ends up as a leaf. Iterate over a copy because
        # the child list shrinks as children are removed.
        for child in list(node.get_children()):
            node.remove_child(child)
    else:
        ASV = clade_ASVs[0]
        leaf = tree.get_leaves_by_name(name=ASV)[0]
        leaf.name = cladeName

# Collapse every clade of clades_df into one tip named after the clade
for ASVs, cladeName in zip(clades_df['ASVs'], clades_df['cladeName']):
    collapse_node(collasped_tree, ASVs, cladeName)

# Keep only tips whose name contains 'clade'
clade_leaves = [tip for tip in collasped_tree.get_leaves() if 'clade' in tip.name]
collasped_tree.prune(clade_leaves) #eliminate leftover ASVs and ref taxa
collasped_tree.write(
    format=2,
    outfile='analyses/codiv_moeller_ASVs/full_tree_clades_collapsed.tre',
)


## ASV info 

In [None]:
#AllASVs: every tip of the full tree whose name contains 'ASV'
allASVs = pd.DataFrame({'ASV': [leaf.name for leaf in full_tree.get_leaves() if 'ASV' in leaf.name]})

#Host restricted clades
# clades_ASVs_df (returned by host_restricted_clades) already holds the HR, MX and CP
# clades, one row per clade/ASV pair; the HRclades_ASVs / MXclades_ASVs / CPclades_ASVs
# names used here before are not defined anywhere in the notebook.
clades_ASV = clades_ASVs_df.rename(columns={'ASVs':'ASV'})
#double check each ASV is assigned to at most one clade (the left merge below relies on it)
assert clades_ASV['ASV'].is_unique, 'an ASV is assigned to more than one clade'
HRclades_ASV = clades_ASV[clades_ASV['HR_cat']=='HR']
# NOTE(review): the clade table has no 'HR_clade' column; 'HR_type' (the host type of
# the clade) is exported under that name -- confirm this is the intended field.
HRclades_ASV = HRclades_ASV[['ASV','cladeName','HR_type']].rename(columns={'HR_type':'HR_clade'})

#Moeller codiv clades
print(blast_res_pass.head())

#combine
allASVs_HRclade = allASVs.merge(HRclades_ASV, how='left', on='ASV')
allASVs_HRclade_codiv = allASVs_HRclade.merge(blast_res_pass, how='left', on='ASV')

#add taxonomy
allASVs_HRclade_codiv['Family'] = allASVs_HRclade_codiv['ASV'].apply(lambda ASV: tax_fam_dict[ASV])
allASVs_HRclade_codiv['Genus'] = allASVs_HRclade_codiv['ASV'].apply(lambda ASV: tax_gen_dict[ASV])

#add captive ape data: sample ID -> '<sample type>_<site code>'
metadata['Description_site'] = metadata['Description']+'_' +metadata['site_code']
sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

def get_captive_sample(ASV):
    """Summarise where one ASV occurs: host classification plus captive counts per site.

    The ASV's samples are read from the notebook-level ``ASV_sampleName_dict``.
    The result is a Series made of
      * the seven is_HR() fields for those samples, labelled as in
        host_restricted_clades(), followed by
      * the number of the ASV's samples in each captive sample-type/site
        category listed in ``captive_desc`` (0 when there is none), always in
        the order of that list.
    """
    captive_desc = ['captive_chimp_HOUZ','captive_gorilla_HOUZ','captive_orangutan_HOUZ',
                    'captive_bonobo_COLZ','captive_gorilla_COLZ','captive_orangutan_COLZ','captive_chimp_PC']
    sampleNames = ASV_sampleName_dict[ASV]

    desc = [sample_type_site_dict[sample] for sample in sampleNames]
    desc = pd.Series([d for d in desc if d in captive_desc]).value_counts()
    # Adding aligns the two indexes and may reorder the labels; reindex restores
    # the captive_desc order for every ASV.
    desc_res = pd.Series(0, index=captive_desc).add(desc, fill_value=0).reindex(captive_desc)

    # is_HR needs the sample -> sample type lookup and returns seven values; the call
    # used here before passed one argument and supplied only four labels.
    # NOTE(review): earlier labels were sampleTypes/ASV_cat/HR_ASV/ASV_sampleNum --
    # rename below if downstream code expects those columns.
    HR_res = pd.Series(is_HR(sampleNames, sample_type_dict),
                       index=['HR_sampleTypes','HR_sampleNum','HR_cat','HR_type',
                              'CP_pres','CP_sampleTypes','CP_sampleNum'])
    return pd.concat([HR_res, desc_res])

capt_desc = allASVs_HRclade_codiv['ASV'].apply(get_captive_sample)
# Sum the per-site captive columns by name rather than with a label slice
# ('captive_chimp_HOUZ':'captive_chimp_PC'), whose result depends on the column order.
capt_desc['captive_all'] = capt_desc[
    [col for col in capt_desc.columns if str(col).startswith('captive_')]
].sum(axis=1)

# both frames share the row index of allASVs_HRclade_codiv
allASVs_HRclade_codiv_capt = allASVs_HRclade_codiv.merge(capt_desc,left_index=True, right_index=True)

#output to tsv
allASVs_HRclade_codiv_capt.to_csv('analyses/codiv_moeller_ASVs/full_tree_ASV_table.txt',sep='\t',index=False)
allASVs_HRclade_codiv_capt.head()