In [1]:
import os
import pandas as pd

# Run the notebook from the project root so that every relative path used in
# later cells resolves. The root can be overridden with the
# CAPTIVE_APE_PROJECT_DIR environment variable; the default is the original
# hard-coded location.
PROJECT_DIR = os.environ.get('CAPTIVE_APE_PROJECT_DIR',
                             '/Volumes/AHN/captive_ape_microbiome')
os.chdir(PROJECT_DIR)

In [4]:
# Per-sample metadata. get_sample_description() matches sample names against
# this table's index and reads its 'Description' column.
# NOTE(review): with index_col=None the index only holds sample names if pandas
# infers an index column from the file layout -- confirm against the file.
metadata = pd.read_csv(
    'metadata/metadata_Bt_samples_gyrb.txt',
    sep='\t',
    index_col=None,
)

# Taxonomy assignments, indexed by the first column of the file
# (get_ASV_taxonomy() looks rows up by ASV id and reads 'Family' and 'Genus').
tax_table = pd.read_csv(
    'results/gyrb_all/Bt/assigned_taxonomy/ASVs_filtered_taxonomy.txt',
    sep='\t',
    index_col=0,
)

# Count table, indexed by the first column of the file; later cells iterate its
# rows as ASVs and treat its column labels as sample names.
asv_table = pd.read_csv(
    'results/gyrb_all/Bt/ASVs_all_counts.tsv',
    sep='\t',
    index_col=0,
)

# Path to the alignment file (only the path is stored here; nothing is read).
fasta = 'results/gyrb_all/Bt/phylogeny/ASVs_filtered_ref_fulllength.fasta.aln'

def get_sample_description(sample_name):
    """Return the 'Description' text for a sample, with all spaces removed.

    Rows of the module-level ``metadata`` table whose index equals
    ``sample_name`` are selected, their 'Description' values are rendered with
    ``Series.to_string(index=False)``, and every space character is stripped
    from the rendered text.
    """
    matching_descriptions = metadata.loc[metadata.index == sample_name, 'Description']
    rendered = matching_descriptions.to_string(index=False)
    return rendered.replace(' ', '')

def get_ASV_taxonomy(ASV):
    """Return the Family text followed by the Genus text for an ASV.

    Rows of the module-level ``tax_table`` whose index equals ``ASV`` are
    selected, and the 'Family' and 'Genus' columns are each rendered with
    ``DataFrame.to_string(index=False, header=False)``. The two rendered
    strings are concatenated as-is, so any padding produced by ``to_string``
    is kept in the result.
    """
    asv_rows = tax_table[tax_table.index == ASV]
    family_text = asv_rows[['Family']].to_string(index=False, header=False)
    genus_text = asv_rows[['Genus']].to_string(index=False, header=False)
    return family_text + genus_text
print(get_ASV_taxonomy("ASV_1"))

# Lookup tables keyed by ASV id (the row labels of asv_table):
ASV_sampleName_dict = {}   # ASV -> sample names (columns) with a count > 0
ASV_sampleType_dict = {}   # ASV -> unique sample descriptions of those samples
ASV_taxonomy_dict = {}     # ASV -> taxonomy string from get_ASV_taxonomy()

# Resolve each sample's description once instead of once per (ASV, sample)
# pair; the lookup scans the metadata table on every call.
sampleDescription_by_name = {x: get_sample_description(x) for x in asv_table.columns}

for index,row in asv_table.iterrows():
    sampleNames = list(asv_table.columns[row>0])
    ASV_sampleName_dict[index] = sampleNames
    # sorted() rather than list(set(...)): set iteration order for strings can
    # change between interpreter runs, which made downstream output unstable.
    sampleDescriptions = sorted({sampleDescription_by_name[x] for x in sampleNames})
    ASV_sampleType_dict[index] = sampleDescriptions
    ASV_taxonomy_dict[index] = get_ASV_taxonomy(index)

#for ASV in ASV_sampleType_dict:
 #   if len(ASV_sampleType_dict[ASV]) != 1:
 #       print(ASV,ASV_sampleType_dict[ASV])

 f__Bacteroidaceae g__Prevotella


In [5]:
from ete3 import Tree
# Load the tree with ete3. format=0 is ete3's default newick format and is
# spelled out here because search_clades() filters on each node's `support`.
# NOTE(review): confirm the support values in this file are on the same scale
# as the BS_support threshold passed to search_clades().
tree_file = 'results/gyrb_all/Bt/phylogeny/ASVs_filtered_ref.fasta.aln.tree'
tree = Tree(tree_file, format=0)

In [6]:
# Sample types that are ignored when deciding whether a clade is restricted to
# a single host: a clade may contain ASVs from these groups and still qualify.
neutral_sampleTypes = ['captive_gorilla','captive_bonobo','captive_chimp','captive_orangutan']

def search_clades(node, samples_cutoff, BS_support,
                  host_sampleTypes=('wild_chimp', 'wild_gorilla', 'wild_bonobo')):
    """Find supported clades whose ASVs come from a single wild host type.

    Every node yielded by ``node.traverse()`` is examined. A node is reported
    when all of the following hold:

    * its ``support`` is strictly greater than ``BS_support``;
    * after dropping the types listed in ``neutral_sampleTypes``, the sample
      types of its ASV leaves (leaves whose name contains 'ASV') reduce to
      exactly one type, and that type is in ``host_sampleTypes``;
    * the number of distinct samples carrying one of its ASVs whose name
      contains 'wd.' is strictly greater than ``samples_cutoff``.

    Sample names, sample types and taxonomy are read from the module-level
    ``ASV_sampleName_dict``, ``ASV_sampleType_dict`` and ``ASV_taxonomy_dict``.

    Parameters
    ----------
    node : tree node exposing ``traverse()``, ``iter_leaves()``, ``support``
        and ``name`` (e.g. an ete3 tree).
    samples_cutoff : number
        A clade must be found in more than this many 'wd.' samples.
    BS_support : number or str
        Support threshold; converted with ``float()``.
    host_sampleTypes : collection of str, optional
        Sample types that count as a host restriction. Defaults to the three
        wild ape groups that were previously hard-coded.

    Returns
    -------
    list of list
        One entry per reported node, in traversal order:
        ``[cladeName, sampleNum, sampleNames, sampleTypes, ASVsNum, ASVs,
        cladeTax]``. ``cladeName`` is 'clade_<k>' where k counts reported
        nodes from 1. ``sampleNames``, ``sampleTypes`` and ``cladeTax`` are
        sorted lists of unique values; ``ASVs`` keeps leaf order.

    Raises
    ------
    KeyError
        If an ASV leaf of the tree is missing from one of the lookup dicts.
    """
    threshold = float(BS_support)
    matches = []
    counter = 1
    for n in node.traverse():
        # Skip nodes whose support does not exceed the threshold.
        if not n.support > threshold:
            continue

        ASVs = [leaf.name for leaf in n.iter_leaves() if 'ASV' in leaf.name]

        # Sorted unique values keep the output reproducible between runs.
        sampleTypes = sorted({t for ASV in ASVs for t in ASV_sampleType_dict[ASV]})
        select_sampleTypes = [x for x in sampleTypes if x not in neutral_sampleTypes]
        # Zero remaining types: only neutral samples. More than one: the clade
        # is shared between groups. Neither is host-restricted.
        if len(select_sampleTypes) != 1:
            continue
        if select_sampleTypes[0] not in host_sampleTypes:
            continue

        # Unique samples carrying any ASV of this clade.
        sampleNames = sorted({s for ASV in ASVs for s in ASV_sampleName_dict[ASV]})
        # Only samples whose name contains 'wd.' count towards the cutoff, so
        # other samples in the clade do not inflate the total.
        sampleNum = sum(1 for x in sampleNames if 'wd.' in x)
        if not sampleNum > samples_cutoff:
            continue

        cladeTax = sorted({ASV_taxonomy_dict[ASV] for ASV in ASVs})
        cladeName = 'clade_' + str(counter)
        counter += 1
        matches.append([cladeName,
                        sampleNum,
                        sampleNames,
                        sampleTypes,
                        len(ASVs),
                        ASVs,
                        cladeTax])

    return matches


# Collect clades with support above 0.5 that occur in more than 5 samples whose
# name contains 'wd.' (i.e. at least 6 such samples).
res = search_clades(tree, samples_cutoff=5, BS_support=0.5)


In [7]:
# One row per clade reported by search_clades(), largest clades first.
df = pd.DataFrame(res, columns = ['cladeName',
                                 'sampleNum',
                                 'sampleNames',
                                 'sampleTypes',
                                 'ASVsNum',
                                 'ASVs',
                                 'cladeTax'
                                ])
df = df.sort_values('ASVsNum',ascending=False)

# Drop redundant clades. Rows are visited from most to fewest ASVs, and a clade
# is kept only if its first ASV is not already in a clade that was kept.
# (All clades are leaf sets under nodes of the same tree, so testing one member
# is presumably enough to detect containment -- the same check as before.)
nonRedundantclades = []
names = []
covered_ASVs = set()  # every ASV belonging to a clade kept so far
for index, row in df.iterrows():
    if row['ASVs'][0] in covered_ASVs:
        continue
    nonRedundantclades.append(row['ASVs'])
    names.append(row['cladeName'])
    covered_ASVs.update(row['ASVs'])
df = df[df['cladeName'].isin(names)]

# exist_ok=True keeps this cell re-runnable; shelling out to `mkdir` reported an
# error on every run after the first and depended on the platform's shell.
os.makedirs('results/gyrb_all/Bt/HR_clades', exist_ok=True)
df.to_csv('results/gyrb_all/Bt/HR_clades/wild_ape_HR_clades.txt',sep='\t')
df.head()

Unnamed: 0,cladeName,sampleNum,sampleNames,sampleTypes,ASVsNum,ASVs,cladeTax
132,clade_133,21,"[wd.bon.KR.MMBB046.Bt, wd.bon.LK.KSG0668.Bt, w...",[wild_bonobo],90,"[ASV_305, ASV_123, ASV_1381, ASV_567, ASV_521,...","[ f__Bacteroidaceae g__Prevotella, f__Bactero..."
213,clade_214,22,"[wd.bon.LK.KSG0668.Bt, wd.bon.KR.MMBB046.Bt, w...",[wild_bonobo],71,"[ASV_124, ASV_1277, ASV_1380, ASV_859, ASV_240...","[ f__Tannerellaceae g__Parabacteroides, f__Ta..."
178,clade_179,16,"[wd.gor.CP.CR3443.Bt, wd.gor.CP.CR3455.Bt, wd....",[wild_gorilla],57,"[ASV_289, ASV_735, ASV_63, ASV_1772, ASV_2503,...","[ f__F082 Unassigned, f__Paludibacteraceae Un..."
67,clade_68,18,"[wd.bon.LK.KSG0668.Bt, wd.bon.KR.MMBB046.Bt, w...",[wild_bonobo],33,"[ASV_695, ASV_1834, ASV_1346, ASV_1648, ASV_16...",[ f__Bacteroidaceae g__Prevotella]
52,clade_53,19,"[wd.gor.CP.CR3443.Bt, wd.gor.CP.CR5781.Bt, wd....",[wild_gorilla],25,"[ASV_1489, ASV_1620, ASV_1688, ASV_161, ASV_10...","[ f__Bacteroidaceae g__UBA4334, f__Bacteroida..."


In [9]:
# Total number of ASVs summed over the clades kept in df.
HR_ASVs = sum(len(clade) for clade in df['ASVs'])
print(HR_ASVs)

619


In [11]:
# 619 is the HR_ASVs total printed by the previous cell.
# NOTE(review): the origin of 232 is not shown in this notebook -- confirm what
# it counts and consider deriving both numbers from variables.
619-232


387