In [25]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter
from pandas.core.common import flatten
# Report the current working directory (it is shown as this cell's output).
os.getcwd()

'/Volumes/AHN/captive_ape_microbiome/results/gyrb'

In [26]:
#gyrb inputs
# Directory that every relative input/output path in this notebook is resolved against.
# It defaults to the original analysis location; set the GYRB_RESULTS_DIR environment
# variable to run the notebook from a different machine or checkout.
results_dir = os.environ.get('GYRB_RESULTS_DIR',
                             '/Volumes/AHN/captive_ape_microbiome/results/gyrb/')
os.chdir(results_dir)
metadata_file = 'inputs/metadata_gyrb_amp_meta_passing_samples.txt'
tax_table_file = 'inputs/ASVs_taxonomy.txt'
asv_table_file = 'inputs/ASVs_filtered_counts.tsv'
full_tree_file = 'inputs/ASVs_filtered_ref_full.tree'
# tree whose node support values are later used to filter candidate clades
full_tree = Tree(full_tree_file, format=0)


In [4]:
# ASV count table: ASVs are the rows (index), samples are the columns
asv_table = pd.read_csv(asv_table_file, sep='\t', index_col=0)

# ASV -> list of the samples in which its count is greater than zero
sampleNames = asv_table.apply(lambda counts: list(counts.index[counts > 0]), axis=1)
ASV_sampleName_dict = dict(zip(sampleNames.index, sampleNames))

# sample ID -> sample type category (the metadata 'Description' column)
metadata = pd.read_csv(metadata_file, sep='\t')
sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description']))

# ASV -> taxonomic family and ASV -> taxonomic genus
tax_table = pd.read_csv(tax_table_file, sep='\t')
tax_fam_dict = dict(zip(tax_table['ASV'], tax_table['Family']))
tax_gen_dict = dict(zip(tax_table['ASV'], tax_table['Genus']))
    

In [76]:
def is_HR(sampleNames,sample_type_dict):
    """Summarise which sample types are represented by a list of sample names.

    Every name is looked up in sample_type_dict and the 'non_western_' / 'western_'
    prefixes are stripped from its type. The four captive ape types are neutral:
    they are tallied separately and never count towards host restriction, so a
    list made only of captive samples has zero host types.

    Returns a 7-tuple:
        HR_sampleTypes  distinct non-captive sample types (list, arbitrary order)
        HR_sampleNum    number of samples with a non-captive type
        HR_cat          'CP' (no non-captive type), 'HR' (exactly one) or 'MX' (several)
        HR_type         the single non-captive type when HR_cat is 'HR', else HR_cat
        CP_pres         True when at least one captive sample type is present
        CP_sampleTypes  distinct captive sample types (list, arbitrary order)
        CP_sampleNum    number of samples with a captive type
    """
    captive_types = ['captive_gorilla','captive_bonobo','captive_chimp','captive_orangutan']

    raw_types = [sample_type_dict[sample] for sample in sampleNames]
    types = [t.replace('non_western_','').replace('western_','') for t in raw_types]

    host_types = list(set(types) - set(captive_types))
    host_count = sum(1 for t in types if t not in captive_types)
    captive_found = list(set(types) & set(captive_types))
    captive_count = sum(1 for t in types if t in captive_types)
    captive_present = bool(captive_found)

    n_host_types = len(host_types)
    if n_host_types == 0:
        category, host = 'CP', 'CP'
    elif n_host_types == 1:
        # host-restricted: every non-captive sample comes from the same type
        category, host = 'HR', host_types[0]
    else:
        category, host = 'MX', 'MX'

    return (host_types, host_count, category, host,
            captive_present, captive_found, captive_count)
           
           
# Smoke test of is_HR on four sample names. The recorded cell output is
# (['human'], 4, 'HR', 'human', False, [], 0): one host type and no captive samples.
print(is_HR(['CosteaPI_2017__SID713A076-11-0-0','LiSS_2016__FAT_024-22-42-0', 
       'LiSS_2016__FAT_024-22-84-0', 'LiSS_2016__FAT_DON_11-22-0-4'],sample_type_dict)) 

def get_consensus_taxonomy(listASVs,tax_fam_dict,tax_gen_dict):
    """Return a 'Family_Genus' consensus label for a list of ASVs.

    'Unassigned' entries are ignored. A rank gets a name only when exactly one
    distinct assigned value remains for it; the name is the part after '__' in
    that value. Otherwise the rank is 'Unassigned'. The genus is always
    'Unassigned' when the family is.
    """
    families = {tax_fam_dict[asv] for asv in listASVs} - {'Unassigned'}
    genera = {tax_gen_dict[asv] for asv in listASVs} - {'Unassigned'}

    family_name = 'Unassigned'
    if len(families) == 1:
        (only_family,) = families
        family_name = only_family.split('__')[1]

    genus_name = 'Unassigned'
    if family_name != 'Unassigned' and len(genera) == 1:
        (only_genus,) = genera
        genus_name = only_genus.split('__')[1]

    return family_name + '_' + genus_name
# Smoke test of get_consensus_taxonomy on three ASVs; the recorded cell output
# is Bacteroidaceae_Unassigned.
print(get_consensus_taxonomy(['ASV_1','ASV_2','ASV_1078'],tax_fam_dict,tax_gen_dict))

def search_clades(tree, samples_cutoff, BS_support,ASV_sampleName_dict):
    """Collect candidate clades (well-supported nodes) from a tree.

    Every node returned by tree.traverse() whose support is strictly greater
    than BS_support is considered. For such a node the leaf names containing
    'ASV' are gathered, together with the de-duplicated samples those ASVs occur
    in (via ASV_sampleName_dict). Each considered node consumes a clade number
    (clade_1, clade_2, ...), but it is only kept when its ASVs are found in more
    than samples_cutoff samples. No sample-type filtering happens here.

    Returns a DataFrame with columns cladeName, ASVs, sampleNames, sampleNum
    and ASVsNum.
    """
    kept_rows = []
    clade_number = 0
    for node in tree.traverse():
        # skip nodes whose bootstrap support does not exceed the threshold
        if not node.support > float(BS_support):
            continue
        asv_leaves = [leaf.name for leaf in node.iter_leaves() if 'ASV' in leaf.name]
        samples_per_asv = [ASV_sampleName_dict[asv] for asv in asv_leaves]
        unique_samples = list(set(list(flatten(samples_per_asv))))
        clade_number += 1
        if len(unique_samples) > samples_cutoff:
            kept_rows.append(['clade_' + str(clade_number), asv_leaves, unique_samples])

    clades_prelim = pd.DataFrame(kept_rows, columns=['cladeName', 'ASVs', 'sampleNames'])
    clades_prelim['sampleNum'] = clades_prelim['sampleNames'].apply(len)
    clades_prelim['ASVsNum'] = clades_prelim['ASVs'].apply(len)
    return clades_prelim

def eliminate_redundant_clades(clades_df,offlimits_ASVs):
    """Greedily keep the largest clades whose ASVs do not overlap.

    Clades are visited from the most to the fewest ASVs. A clade is kept only if
    none of its ASVs is already off limits; the ASVs of every kept clade then
    become off limits for all later (smaller) clades.

    Parameters
    ----------
    clades_df : DataFrame
        Needs the columns 'cladeName', 'ASVs' (one list per row) and 'ASVsNum'.
    offlimits_ASVs : iterable
        ASVs that no returned clade may contain. The argument is not modified.

    Returns
    -------
    DataFrame
        The rows of clades_df for the kept clades, ordered by ASVsNum descending.
    """
    # mergesort is stable: clades tied on ASVsNum keep their input order, so the
    # clade picked among ties is the same on every run
    df = clades_df.sort_values('ASVsNum',ascending=False,kind='mergesort') #start with the largest clades first
    # running set of claimed ASVs; avoids rebuilding a set from a growing list per clade
    claimed_ASVs = set(offlimits_ASVs)
    NRclades = []
    for cladeName, ASVs in zip(df['cladeName'], df['ASVs']):
        if claimed_ASVs.isdisjoint(ASVs):
            claimed_ASVs.update(ASVs)
            NRclades.append(cladeName)
    res = df[df['cladeName'].isin(NRclades)]
    return(res)

def expand_clades(clades_df):
    """Return a long-format table with one row per (clade, ASV) pair.

    Every column of clades_df except 'ASVs' is repeated for each ASV in that
    clade's list, and the individual ASV is placed in a trailing 'ASVs' column.
    Rows keep the index label of the clade they came from. clades_df itself is
    left untouched.
    """
    # explode() yields one row per list element and repeats the clade's index label.
    # The earlier apply(pd.Series).stack() built one Series per clade and depended on
    # stack() dropping the NaN padding of shorter clades, which the newer pandas
    # stack implementation does not do.
    ASV = clades_df['ASVs'].explode()
    clades_ASVs_df = clades_df.drop('ASVs', axis=1).join(ASV)
    return(clades_ASVs_df)
                                 
def host_restricted_clades(asv_table_file,metadata_file,tax_table_file,
                           tree=None,samples_cutoff=5,BS_support=.5):
    """Identify non-overlapping host-restricted (HR), mixed (MX) and captive-only (CP) clades.

    Steps:
      1. read the ASV count table, the sample metadata and the taxonomy table;
      2. collect candidate clades from the tree with search_clades;
      3. classify each clade with is_HR;
      4. keep the largest non-overlapping HR clades, then the largest MX clades
         sharing no ASV with them, then the largest CP clades sharing no ASV
         with either;
      5. add a consensus taxonomy label and, per metadata 'Description' value,
         the fraction of all samples of that type in which the clade occurs.

    Parameters
    ----------
    asv_table_file, metadata_file, tax_table_file : str
        Paths to the tab-separated input tables.
    tree : optional
        Tree to search. When None (the default) the notebook-level full_tree is used.
    samples_cutoff : int, default 5
        A clade must be found in more than this many samples; for HR clades the
        non-captive samples alone must exceed it.
    BS_support : float, default .5
        Node support must be strictly greater than this value.

    Returns
    -------
    (clades, clades_ASVs) : one row per retained clade, and one row per
    (clade, ASV) pair of the retained clades.
    """
    if tree is None:
        tree = full_tree  # fall back to the tree loaded in the inputs cell

    asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
    sampleNames = asv_table.apply(lambda row: list(row.index[row>0]),axis=1)
    ASV_sampleName_dict = dict(zip(sampleNames.index,sampleNames))

    #sample to sample type category
    metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
    sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description']))

    #taxonomic info, family and genus
    tax_table = pd.read_csv(tax_table_file,sep='\t',index_col=None)
    tax_fam_dict = dict(zip(tax_table['ASV'], tax_table['Family']))
    tax_gen_dict = dict(zip(tax_table['ASV'], tax_table['Genus']))
    
    #search tree for clades
    clades_prelim = search_clades(tree, samples_cutoff, BS_support, ASV_sampleName_dict)
    hr =clades_prelim['sampleNames'].apply(lambda x:  pd.Series(is_HR(x,sample_type_dict),
                index=['HR_sampleTypes','HR_sampleNum','HR_cat','HR_type','CP_pres','CP_sampleTypes','CP_sampleNum']))
    clades_prelim_hr = clades_prelim.merge(hr,right_index=True,left_index=True)

    #identify HR clades (enough non-captive samples, all from one sample type)
    HR_clades = clades_prelim_hr[clades_prelim_hr['HR_cat']=='HR']
    HR_clades = HR_clades[HR_clades['HR_sampleNum']>samples_cutoff]
    HR_clades = eliminate_redundant_clades(HR_clades,[])
    HR_clades_ASVs = expand_clades(HR_clades)

    #identify MX clades that don't contain any HR clades
    MX_clades = clades_prelim_hr[clades_prelim_hr['HR_cat']=='MX']
    MX_clades = MX_clades[MX_clades['sampleNum']>samples_cutoff]
    MX_clades = eliminate_redundant_clades(MX_clades,offlimits_ASVs=list(HR_clades_ASVs['ASVs']))
    MX_clades_ASVs = expand_clades(MX_clades)
    
    #identify CP (captive-only) clades that don't contain any HR or MX clades
    CP_clades = clades_prelim_hr[clades_prelim_hr['HR_cat']=='CP']
    CP_clades = CP_clades[CP_clades['sampleNum']>samples_cutoff]
    HR_MX_ASVs = list(HR_clades_ASVs['ASVs']) + list(MX_clades_ASVs['ASVs'])
    CP_clades = eliminate_redundant_clades(CP_clades,offlimits_ASVs=HR_MX_ASVs)
    CP_clades_ASVs = expand_clades(CP_clades)
    
    #merge dataframes
    clades = pd.concat([HR_clades,MX_clades,CP_clades]).reset_index(drop=True)
    clades_ASVs = pd.concat([HR_clades_ASVs,MX_clades_ASVs,CP_clades_ASVs]).reset_index(drop=True)
    
    clades['cladeTax']=clades['ASVs'].apply(lambda x: 
        get_consensus_taxonomy(x,tax_fam_dict,tax_gen_dict))
    
    #per clade: samples of each type carrying the clade / all samples of that type
    sample_type_counts =  metadata['Description'].value_counts()
    description_df = clades['sampleNames'].apply(lambda l: pd.Series(
    [sample_type_dict[name] for name in l]).value_counts())
    description_df = description_df.fillna(0)  
    sample_type_percent = description_df/sample_type_counts
    clades=clades.merge(sample_type_percent,left_index=True,right_index=True)  
    
    return(clades,clades_ASVs)
    
# Run the full pipeline on the gyrb inputs: clades_df holds one row per retained
# clade, clades_ASVs_df one row per (clade, ASV) pair.
clades_df,clades_ASVs_df = host_restricted_clades(asv_table_file,metadata_file,tax_table_file)   

(['human'], 4, 'HR', 'human', False, [], 0)
Bacteroidaceae_Unassigned


In [88]:
# The 'ASVs' and 'sampleNames' columns hold Python lists; drop them before writing flat tables.
clades_df_sh = clades_df.drop(['ASVs','sampleNames'], axis=1)
clades_df_sh.to_csv('analyses/codiv_moeller_ASVs/table_full_tree_clades_collapsed.txt',sep='\t',index=False)

# Prominent captive clades: found in more than 15% of the samples of at least one
# captive ape group. Groups without a column in clades_df are skipped rather than
# raising; missing (NaN) fractions never pass the threshold.
captive_cols = [col for col in ['captive_bonobo','captive_chimp','captive_gorilla','captive_orangutan']
                if col in clades_df.columns]
clades_df_cp = clades_df.loc[(clades_df[captive_cols] > 0.15).any(axis=1)]
# Write the filtered captive table. Previously the unfiltered clades_df_sh was written
# here, so this file was a copy of the full table.
clades_df_cp.drop(['ASVs','sampleNames'], axis=1).to_csv('analyses/codiv_moeller_ASVs/table_prominent_captive_clades.txt',sep='\t',index=False)

# last expression of the cell, so the preview is actually displayed
clades_df.head()
