In [9]:
import os
import pandas as pd
from ete3 import Tree
from collections import Counter

os.getcwd()
os.chdir('/Volumes/AHN/captive_ape_microbiome/results/gyrb/')
os.listdir('inputs/')

['.DS_Store',
 'ASVs_filtered.fasta',
 'ASVs_filtered_counts.tsv',
 'ASVs_filtered_ref_full.tree',
 'ASVs_taxonomy.txt',
 'metadata_gyrb_amp_meta_passing_samples.txt',
 'moeller_codiv_Bacteroidaceae.fna',
 'moeller_codiv_HRclades.txt',
 'moeller_codiv_lin_Bt1.tree',
 'moeller_codiv_lin_Bt2.tree',
 'moeller_codiv_lin_Bt3.tree',
 'ref_gyrb_gtdbtk']

In [10]:
#inputs
metadata_file = 'inputs/metadata_gyrb_amp_meta_passing_samples.txt'
tax_table_file = 'inputs/ASVs_taxonomy.txt'
asv_table_file = 'inputs/ASVs_filtered_counts.tsv'
asv_fasta_file = 'inputs/ASVs_filtered.fasta'
tree_file = 'inputs/ASVs_filtered_ref_full.tree'
full_tree = Tree(tree_file, format=0)
moeller_codiv_fasta = 'inputs/codiv_Bacteroidaceae.fna'
pident_cutoff = 95

### Blastn moeller co-div seqs against ASVs and filter ASVs matchin X% identity

In [12]:
%%bash

mkdir analyses/codiv_moeller_ASVs
cp inputs/codiv_Bacteroidaceae.fna analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna

#make blastdb
makeblastdb -in analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna -dbtype nucl

#blast moeller co-div seqs
blastn -query inputs/ASVs_filtered.fasta -db analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna \
-outfmt "7 qseqid salltitles sseqid pident length qlen evalue" \
-out analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt \
-max_target_seqs 5



Building a new DB, current time: 07/28/2020 11:23:45
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna
New DB title:  analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb/analyses/codiv_moeller_ASVs/codiv_Bacteroidaceae.fna
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 208 sequences in 0.00704718 seconds.


mkdir: analyses/codiv_moeller_ASVs: File exists
cp: inputs/codiv_Bacteroidaceae.fna: No such file or directory


### Subset the matching ASVs 

In [15]:
#filter blast output to top hit per ASV, remove hits not pass thresholds
blast_res = pd.read_csv('analyses/codiv_moeller_ASVs/codiv_blastout_ASVs.txt',sep='\t',comment='#',header=None)
blast_res.columns = ['ASV','codiv_clade_seq','sseqid','pident','length','qlen','evalue']
blast_res = blast_res.groupby('ASV').head(1).reset_index(drop=True) #get top hit
blast_res_pass = blast_res[blast_res['pident']>pident_cutoff].reset_index(drop=True) #QC filter
blast_res_pass = blast_res_pass[blast_res_pass['length']>200].reset_index(drop=True) #QC filter
print(len(blast_res_pass),'ASVs matching co-div clades with greater than',pident_cutoff,'percent identity')
blast_res_pass['codiv_clade'] = blast_res_pass['codiv_clade_seq'].apply(lambda x: x.split(' ')[1])
blast_res_pass['lineage'] = blast_res_pass['codiv_clade'].apply(lambda x: x.split('_')[0]) 
blast_res_pass = blast_res_pass[['ASV','codiv_clade_seq', 'codiv_clade','lineage']]
print(blast_res_pass.head())
print(blast_res_pass['codiv_clade'].value_counts())
#output hits passing that will be used to determine the mcra of the lineage
blast_res_pass.to_csv('analyses/codiv_moeller_ASVs/codiv_clades_ASVs.txt',sep='\t',index=False)



314 ASVs matching co-div clades with greater than 95 percent identity
      ASV                codiv_clade_seq       codiv_clade lineage
0   ASV_1  Human6700665 Bt3_clade1_human  Bt3_clade1_human     Bt3
1  ASV_13  Human7859459 Bt3_clade1_human  Bt3_clade1_human     Bt3
2  ASV_34  Human9825266 Bt3_clade1_human  Bt3_clade1_human     Bt3
3  ASV_37  Human8829943 Bt3_clade1_human  Bt3_clade1_human     Bt3
4  ASV_73  Human9825266 Bt3_clade1_human  Bt3_clade1_human     Bt3
Bt2_clade1_bonobo     90
Bt3_clade1_human      85
Bt2_clade2_bonobo     33
Bt1_clade1_bonobo     23
Bt2_clade1_chimp      20
Bt3_clade1_chimp      17
Bt3_clade1_bonobo     15
Bt1_clade1_gorilla    10
Bt1_clade1_chimp       8
Bt2_clade2_gorilla     7
Bt2_clade2_chimp       6
Name: codiv_clade, dtype: int64


In [17]:
#generates three dictionaries where keys are ASVs, and values are metadata
#all the samples with a given ASV
#all the sample types with a given ASV per metadata info
#taxonomic info, family and genus
metadata = pd.read_csv(metadata_file,sep='\t',index_col=None)
sample_type_dict = dict(zip(metadata['X.SampleID'], metadata['Description']))

ASV_sampleName_dict = {}  
asv_table = pd.read_csv(asv_table_file,sep='\t',index_col=0)
for ASV,row in asv_table.iterrows():
    sampleNames = list(asv_table.columns[row>0])
    ASV_sampleName_dict[ASV] = sampleNames

f__Bacteroidaceae.g__Bacteroides_B


In [29]:
from pandas.core.common import flatten
Bt1_tree = full_tree.get_common_ancestor(list(Bt1_lineage_ASVs['ASV']))

def is_HR(sampleNames):
    """given a list of sample names uses sample type dictionary to determine how many sample types are present
    designates clades/nodes as host restricted = 1 sample type or mixed = multiple sample types.
    Captive sample types are not considered so some clades will have a 0 sample type length and they can fall within
    host-restricted clades or mixed clades or neither"""
    sampleTypes = [sample_type_dict[name] for name in sampleNames]
    sampleTypes = [x.replace('non_western_','').replace('western_','') for x in sampleTypes]
    neutral_sampleTypes = ['captive_gorilla','captive_bonobo','captive_chimp','captive_orangutan']
    select_sampleTypes = [x for x in sampleTypes if x not in neutral_sampleTypes]
    unique_sampleTypes = list(set(select_sampleTypes))
    
    if len(unique_sampleTypes) == 0:
        return(unique_sampleTypes,'captive',len(sampleTypes))
    if len(unique_sampleTypes)  == 1: #identifies host-restricted clades
        HR_clade = select_sampleTypes[0]
        HR_sample_num = len([t for t in sampleTypes if t == HR_clade])
        return(unique_sampleTypes,'HR',HR_sample_num) #doesn't count captive ape samples
    if len(unique_sampleTypes)  > 1:    
        return(unique_sampleTypes,'mixed',len(sampleTypes))

def search_clades(tree, samples_cutoff, BS_support):
    """Finds nodes with at least 50% BS support containing ASVs only found in a single wild ape species
    ie wild_gorilla, wild_chimp or wild_bonobo, cpat"""
    HR_clades_passing_thresholds = []
    counter = 1
    for n in tree.traverse():
        if n.support > float(BS_support): #makes sure Bootstrap support is over threshold
            ASVs = [leaf.name for leaf in n.iter_leaves() if 'ASV' in leaf.name]
            ASVsNum = len(ASVs)
            sampleNames = list(set(list(flatten([ASV_sampleName_dict[ASV] for ASV in ASVs]))))
            
            sampleNum = len(sampleNames)
            sampleTypes,HR_clade,HR_sampleNum = is_HR(sampleNames)
            if HR_sampleNum > samples_cutoff:
                cladeName = 'clade_'+str(counter)
                counter+=1
                clade = [cladeName,HR_clade,HR_sampleNum,ASVs,ASVsNum,sampleNum,sampleTypes]
                HR_clades_passing_thresholds.append(clade)  
    clades = pd.DataFrame(HR_clades_passing_thresholds, columns = 
                          ['cladeName','HR_clade','HR_sampleNum',
                           'ASV','ASVsNum','sampleNum','sampleTypes'])
    return(clades)

#initial search contains all sub clades within the largest host restricted clade
clades_Bt1 = search_clades(Bt1_tree,samples_cutoff=5,BS_support=.5) 
print(clades_Bt1.head())

CPclades_Bt1 = clades_Bt1[clades_Bt1['HR_clade']=='captive']
#print(len(HRclades_Bt1),len(MXclades_Bt1),len(CPclades_Bt1))

def eliminate_redundant_clades(search_clades_res):
    """sorts clades and returns the largest non overlapping clade """
    df = search_clades_res.sort_values('ASVsNum',ascending=False) #start with the largest clades first
    NRclades = []
    NR_ASVs = []
    for index, row in df.iterrows():
        firstASV = row['ASV'][0]    
        if firstASV not in NR_ASVs:
            NR_ASVs = NR_ASVs + row['ASV']
            NRclades.append(row['cladeName']) 
            #print(row['cladeName'])
    res = df[df['cladeName'].isin(NRclades)]   
    return(res)

HRclades_Bt1 = clades_Bt1[clades_Bt1['HR_clade']=='HR']    
HRclades_Bt1 = eliminate_redundant_clades(HRclades_Bt1)
HRclades_Bt1.to_csv('analyses/codiv_moeller_ASVs/HRclades_Bt1.txt',sep='\t',index=False)
print(len(HRclades_Bt1),'HR clades')

def expand_clades(clades_df):
    ASV = clades_df.apply(lambda x: pd.Series(x['ASV']),axis=1).stack().reset_index(level=1, drop=True)
    ASV.name = 'ASV'
    clades_ASVs_df = clades_df.drop('ASV', axis=1).join(ASV)
    return(clades_ASVs_df)

HRclades_Bt1_ASVs = expand_clades(HRclades_Bt1)
print(len(HRclades_Bt1_ASVs),'HR ASVs')
HRclades_Bt1_ASVs.to_csv('analyses/codiv_moeller_ASVs/HRclades_Bt1_ASVs.txt',sep='\t',index=False)



MXclades_Bt1 = clades_Bt1[clades_Bt1['HR_clade']=='mixed']
MXclades_Bt1 = MXclades_Bt1[MXclades_Bt1['ASV'].apply(lambda cladeASVs: len(
                                                                        list(set(cladeASVs) & set(list(HRclades_Bt1_ASVs['ASV'])))
                                                                            ) == 0 )]
                                                                                
MXclades_Bt1_ASVs = expand_clades(MXclades_Bt1)
print(len(MXclades_Bt1_ASVs),'mixed clade ASVs')


leftover_ASVs = len(Bt1_lineage_ASVs) - len(HRclades_Bt1_ASVs) - len(MXclades_Bt1_ASVs)
print(leftover_ASVs,'leftover ASVs')

  cladeName HR_clade  HR_sampleNum  \
0   clade_1    mixed            75   
1   clade_2       HR            18   
2   clade_3       HR            35   
3   clade_4       HR            22   
4   clade_5       HR             7   

                                                 ASV  ASVsNum  sampleNum  \
0  [ASV_4444, ASV_1277, ASV_6056, ASV_6020, ASV_4...       41         75   
1  [ASV_4444, ASV_1277, ASV_6056, ASV_6020, ASV_4...       10         18   
2  [ASV_1090, ASV_2134, ASV_482, ASV_5974, ASV_64...        8         35   
3  [ASV_4623, ASV_1643, ASV_1616, ASV_4160, ASV_4...       23         22   
4                               [ASV_1090, ASV_2134]        2          7   

                               sampleTypes  
0  [wild_gorilla, wild_bonobo, wild_chimp]  
1                           [wild_gorilla]  
2                             [wild_chimp]  
3                            [wild_bonobo]  
4                             [wild_chimp]  
3 HR clades
41 HR ASVs
  cladeName HR_clade 

In [206]:
full_tree.get_common_ancestor(Bt2_ASVs)

Bt2_ASVs = list(Bt2_lineage_ASVs['ASV'])
print(len(Bt2_ASVs),'total ASVs')
Bt2_tree = full_tree.get_common_ancestor(Bt2_ASVs)

clades_Bt2 = search_clades(Bt2_tree,samples_cutoff=5,BS_support=.5) 

#Define host-restricted 
HRclades_Bt2 = clades_Bt2[clades_Bt2['HR_clade']!='mixed'][clades_Bt2['HR_clade']!='captive']     
HRclades_Bt2 = eliminate_redundant_clades(HRclades_Bt2)
HRclades_Bt2.to_csv('analyses/codiv_moeller_ASVs/Bt2_HRclades.txt',sep='\t',index=False)
HRclades_Bt2_ASVs = expand_clades(HRclades_Bt2)
HRclades_Bt2_ASVs.to_csv('analyses/codiv_moeller_ASVs/Bt2_HRclades_ASVs.txt',sep='\t',index=False)
print(len(HRclades_Bt2_ASVs),'HR clade ASVs')

MXclades_Bt2 = clades_Bt2[clades_Bt2['HR_clade']=='mixed']
MXclades_Bt2 = MXclades_Bt2[MXclades_Bt2['ASV'].apply(lambda cladeASVs: is_unique_MX(list(HRclades_Bt2_ASVs['ASV']),cladeASVs))]
MXclades_Bt2 = eliminate_redundant_clades(MXclades_Bt2)
MXclades_Bt2_ASVs = expand_clades(MXclades_Bt2)
print(len(MXclades_Bt2_ASVs),'mixed clade ASVs')

leftover_ASVs = len(Bt2_ASVs) - len(HRclades_Bt2_ASVs) - len(MXclades_Bt2_ASVs)
print(leftover_ASVs,'leftover ASVs')

print(len(HRclades_Bt2),len(MXclades_Bt2))

360 total ASVs


  import sys


337 HR clade ASVs
4 mixed clade ASVs
19 leftover ASVs
17 1


In [40]:
clades_fulltree = search_clades(full_tree,samples_cutoff=5,BS_support=.5) 
fulltree_ASVs = [leaf.name for leaf in full_tree.get_leaves() if 'ASV' in leaf.name]
print(len(fulltree_ASVs))
HRclades_fulltree = clades_fulltree[clades_fulltree['HR_clade']=='HR']     
HRclades_fulltree = eliminate_redundant_clades(HRclades_fulltree)
HRclades_fulltree_ASVs = expand_clades(HRclades_fulltree)
HR_ASVs = [asv for asv in HRclades_fulltree_ASVs['ASV'] if 'ASV' in asv]
print(len(HR_ASVs),'ASVs in',len(HRclades_fulltree),'HR clades')

MXclades_fulltree = clades_fulltree[clades_fulltree['HR_clade']=='mixed']
#make sure mixed clades do not include HR clades
MXclades_fulltree = MXclades_fulltree[MXclades_fulltree['ASV'].apply(lambda cladeASVs: len(
                                                                        list(set(cladeASVs) & set(list(HRclades_fulltree_ASVs['ASV'])))
                                                                            ) == 0 )]
MXclades_fulltree = eliminate_redundant_clades(MXclades_fulltree)
MXclades_fulltree_ASVs = expand_clades(MXclades_fulltree)
MX_ASVs = [asv for asv in MXclades_fulltree_ASVs['ASV'] if 'ASV' in asv]
print(len(MX_ASVs),'ASVs in',len(MXclades_fulltree),'mixed clades')

CPclades_fulltree = clades_fulltree[clades_fulltree['HR_clade']=='captive']
CPclades_fulltree = CPclades_fulltree[CPclades_fulltree['ASV'].apply(lambda cladeASVs: len(
                                                                        list(set(cladeASVs) & set(list(HRclades_fulltree_ASVs['ASV'])))
                                                                            ) == 0 )]
CPclades_fulltree = CPclades_fulltree[CPclades_fulltree['ASV'].apply(lambda cladeASVs: len(
                                                                        list(set(cladeASVs) & set(list(MXclades_fulltree_ASVs['ASV'])))
                                                                            ) == 0 )]
CPclades_fulltree = eliminate_redundant_clades(CPclades_fulltree)
CPclades_fulltree_ASVs = expand_clades(CPclades_fulltree)
CP_ASVs = [asv for asv in CPclades_fulltree_ASVs['ASV'] if 'ASV' in asv]
print(len(CP_ASVs),'ASVs in',len(CPclades_fulltree),'captive clades')

leftover_ASVs = set(fulltree_ASVs) - set(HR_ASVs) - set(MX_ASVs) - set(CP_ASVs)
print(len(leftover_ASVs),'leftover ASVs')
print(leftover_ASVs)

print(CPclades_fulltree.head())

7148
6246 ASVs in 302 HR clades
183 ASVs in 51 mixed clades
9 ASVs in 6 captive clades
710 leftover ASVs
{'ASV_10018', 'ASV_10157', 'ASV_7435', 'ASV_6318', 'ASV_6277', 'ASV_9173', 'ASV_7692', 'ASV_6312', 'ASV_9297', 'ASV_1404', 'ASV_10553', 'ASV_3172', 'ASV_4568', 'ASV_4505', 'ASV_4001', 'ASV_8866', 'ASV_9377', 'ASV_10335', 'ASV_7717', 'ASV_7691', 'ASV_2876', 'ASV_6186', 'ASV_10177', 'ASV_2214', 'ASV_8710', 'ASV_6550', 'ASV_6305', 'ASV_6043', 'ASV_10003', 'ASV_7664', 'ASV_7270', 'ASV_10325', 'ASV_9830', 'ASV_9859', 'ASV_9571', 'ASV_9570', 'ASV_5715', 'ASV_9177', 'ASV_4316', 'ASV_10182', 'ASV_7118', 'ASV_9335', 'ASV_7974', 'ASV_10127', 'ASV_9314', 'ASV_1696', 'ASV_9858', 'ASV_8214', 'ASV_2465', 'ASV_2298', 'ASV_10316', 'ASV_8780', 'ASV_10027', 'ASV_9572', 'ASV_9851', 'ASV_6570', 'ASV_9313', 'ASV_9772', 'ASV_9295', 'ASV_6266', 'ASV_1919', 'ASV_2880', 'ASV_10502', 'ASV_1617', 'ASV_6307', 'ASV_10200', 'ASV_8747', 'ASV_9630', 'ASV_6547', 'ASV_7674', 'ASV_9968', 'ASV_5297', 'ASV_4030', 'ASV_

In [289]:
clades = pd.concat([HRclades_fulltree,MXclades_fulltree,CPclades_fulltree])
clades.reset_index(drop=True, inplace=True) 
print(len(clades))
sample_type_num = pd.DataFrame()
for index,row in clades.iterrows():
    #print(row)
    sample_names= row['sampleNames']
    sample_types = [sample_type_dict[name] for name in sample_names]
    sample_types = pd.Series(sample_types).value_counts()
    sample_type_num = pd.concat([sample_type_num, 
                                pd.DataFrame(sample_types)], 
                                axis=1, sort=False)
sample_type_num = sample_type_num.T.fillna(0)
sample_type_num.index = clades.index
sample_type_counts =  metadata['Description'].value_counts()
print(sample_type_counts)
sample_type_perc = sample_type_num/sample_type_counts 
clades_merged = pd.concat([clades,sample_type_perc],axis=1)
print(clades_merged.columns)
clades_merged = clades_merged[['cladeName', 'HR_clade', 'HR_sampleNum', 'ASVsNum', 'ASVsTax',
        'sampleNum', 'sampleTypes', 'captive_bonobo',
       'captive_chimp', 'captive_gorilla', 'captive_orangutan',
       'non_western_human', 'western_human', 'wild_bonobo', 'wild_chimp',
       'wild_gorilla']]

359
western_human        6805
non_western_human     511
wild_chimp             69
wild_gorilla           37
captive_chimp          26
wild_bonobo            24
captive_gorilla        22
captive_bonobo         13
captive_orangutan      11
Name: Description, dtype: int64
Index(['cladeName', 'HR_clade', 'HR_sampleNum', 'ASV', 'ASVsNum', 'ASVsTax',
       'sampleNames', 'sampleNum', 'sampleTypes', 'captive_bonobo',
       'captive_chimp', 'captive_gorilla', 'captive_orangutan',
       'non_western_human', 'western_human', 'wild_bonobo', 'wild_chimp',
       'wild_gorilla'],
      dtype='object')


In [325]:
def con_Tax(x):
    fam = list(set([fam.split('.')[0] for fam in x]))
    gen = list(set([gen.split('.')[1] for gen in x])) 
    if len(fam) == 1:
        fam = fam[0].replace('f__','')
        if len(gen) == 1:
            gen = gen[0].replace('g__','')
        else:
            gen = 'Unassigned'
    else:
        fam = 'Unassigned'
        gen = 'Unassigned' 
    return(fam+'_'+gen)
    
clades_merged['Fam_Gen']=clades_merged['ASVsTax'].apply(lambda x: con_Tax(x))
clades_merged['Fam']=clades_merged['Fam_Gen'].apply(lambda x: str(x).split('_')[0])
clades_merged['Gen']=clades_merged['Fam_Gen'].apply(lambda x: str(x).split('_')[1])
clades_merged['Fam'].value_counts()

clades_merged.to_csv('analyses/codiv_moeller_ASVs/ALLclades_fulltree.txt',sep='\t',index=False)

In [278]:
collasped_tree = Tree(tree_file, format=0)

def collapse_node(tree,clade_ASVs,cladeName):
    if len(clade_ASVs) > 1:
        node = tree.get_common_ancestor(clade_ASVs)
        node.name=cladeName
        children = node.get_children()
        node.remove_child(children[1])
        node.remove_child(children[0])
        return(tree)
    else:
        ASV = clade_ASVs[0]
        leaf = tree.get_leaves_by_name(name=ASV)[0]
        leaf.name = cladeName
         
clades_merged.apply(lambda row: collapse_node(collasped_tree,row['ASV'],row['cladeName']),axis=1)
clade_leaves = [leaf for leaf in collasped_tree.get_leaves() if 'clade' in leaf.name]
collasped_tree.prune(clade_leaves)
collasped_tree.write(format=2, outfile='analyses/codiv_moeller_ASVs/full_tree_collapsed.tre')


In [209]:
full_tree = Tree(tree_file, format=0)
print(Bt1_tree)

def collapse_node(tree,clade_ASVs,cladeName):
    node = tree.get_common_ancestor(clade_ASVs)
    node.name=cladeName
    children = node.get_children()
    node.remove_child(children[1])
    node.remove_child(children[0])
    return(tree)

Bt1_tree = full_tree.get_common_ancestor(list(Bt1_lineage_ASVs['ASV']))
HRclades_Bt1.apply(lambda row: collapse_node(Bt1_tree,row['ASV'],row['cladeName']),axis=1)
HRclades_Bt1.to_csv('analyses/codiv_moeller_ASVs/HR_collapsed.txt',sep='\t')
Bt1_tree.write(format=2, outfile='analyses/codiv_moeller_ASVs/Bt1_collapsed.tre')

Bt2_tree = full_tree.get_common_ancestor(list(Bt2_lineage_ASVs['ASV']))
HRclades_Bt2.apply(lambda row: collapse_node(Bt2_tree,row['ASV'],row['cladeName']),axis=1)
MXclades_Bt2.apply(lambda row: collapse_node(Bt2_tree,row['ASV'],row['cladeName']),axis=1)
Bt2_tree.write(format=2, outfile='analyses/codiv_moeller_ASVs/Bt2_collapsed.tre')




   /-clade_2
--|
  |   /-clade_3
   \-|
      \-clade_4


In [135]:
print(set(CPclades_fulltree_ASVs['ASV']) & set(leftover_ASVs))
sample_type_num_leftovrs = sample_type_num[sample_type_num.index.isin(leftover_ASVs)]
sample_type_num_leftovrs.index.name = 'ASV'
sample_type_num_leftovrs.reset_index(inplace=True)
print(sample_type_num_leftovrs.head())
sample_type_num_leftovrs.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_leftovers_ASV.txt',sep='\t',index=False)


{'ASV_411', 'ASV_300', 'ASV_725', 'ASV_726', 'ASV_930', 'ASV_935', 'ASV_365', 'ASV_931', 'ASV_727'}
       ASV  western_human  non_western_human  captive_chimp  captive_gorilla  \
0  ASV_300            0.0                0.0           13.0              8.0   
1  ASV_365            0.0                0.0           15.0              3.0   
2  ASV_410            0.0                1.0            5.0              0.0   
3  ASV_411            0.0                0.0           14.0              0.0   
4  ASV_725            0.0                0.0            5.0              3.0   

   captive_bonobo  wild_chimp  captive_orangutan  wild_bonobo  wild_gorilla  
0             1.0         0.0                1.0          0.0           0.0  
1             0.0         0.0                0.0          0.0           0.0  
2             5.0         0.0                5.0          0.0           0.0  
3             2.0         0.0                0.0          0.0           0.0  
4             0.0         0.0

### Tables summarizing metadata of samples harboring ASVs

In [124]:
#creates table where ASVs are rows, columns are sample descriptions (i.e wild_chimp, captive_orangutan)
#counts are the number of samples or the percent of samples of a given type over the total number of samples belong to that type

sample_type_num = pd.DataFrame()
for ASV in ASV_sampleName_dict:
        sample_names = ASV_sampleName_dict[ASV]
        #sample_type_dict relates sample names to their descriptions 
        sample_types = [sample_type_dict[name] for name in sample_names]
        sample_types = pd.Series(sample_types).value_counts()
        row = pd.DataFrame(sample_types)
        sample_type_num = pd.concat([sample_type_num, row], axis=1, sort=False)
sample_type_num.columns = ASV_sampleName_dict.keys()
sample_type_num = sample_type_num.T 
sample_type_num = sample_type_num.fillna(0)
print(sample_type_num.head())
sample_type_num.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_ASV_count.txt',sep='\t',index=False)
sample_type_counts =  metadata['Description'].value_counts()
print(sample_type_counts)
sample_type_perc = sample_type_num/sample_type_counts 
print(sample_type_perc.head())
sample_type_perc.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_ASV_perc.txt',sep='\t',index=False)




       western_human  non_western_human  captive_chimp  captive_gorilla  \
ASV_1         2917.0               64.0            3.0              2.0   
ASV_2         2074.0               14.0            9.0              5.0   
ASV_3         1907.0               17.0            1.0              2.0   
ASV_4         1798.0               18.0            0.0              0.0   
ASV_5         1208.0               18.0            0.0              2.0   

       captive_bonobo  wild_chimp  captive_orangutan  wild_bonobo  \
ASV_1             1.0         1.0                0.0          0.0   
ASV_2             0.0         9.0                4.0          0.0   
ASV_3             2.0         1.0                0.0          0.0   
ASV_4             0.0         1.0                0.0          0.0   
ASV_5             2.0        18.0                4.0          0.0   

       wild_gorilla  
ASV_1           0.0  
ASV_2           0.0  
ASV_3           0.0  
ASV_4           0.0  
ASV_5           0.0  
we

Index(['ASV_1', 'ASV_2', 'ASV_3', 'ASV_4', 'ASV_5', 'ASV_6', 'ASV_7', 'ASV_8',
       'ASV_9', 'ASV_10',
       ...
       'ASV_10701', 'ASV_10702', 'ASV_10703', 'ASV_10704', 'ASV_10706',
       'ASV_10707', 'ASV_10708', 'ASV_10709', 'ASV_10710', 'ASV_10711'],
      dtype='object', length=7148)

In [88]:
#creates table where ASVs are rows, columns are sample descriptions plus site 
#counts are the number of samples or the percent of samples of a given type over the total number of samples belong to that type

metadata['Description_site'] = metadata['Description']+'_' +metadata['site_code']
sample_type_site_dict = dict(zip(metadata['X.SampleID'], metadata['Description_site']))

captive_only_desc = list(set(metadata['Description_site']))
captive_only_desc = [desc for desc in captive_only_desc if 'captive' in str(desc)]
captive_only_desc

sample_type_site_num = pd.DataFrame()
for ASV in ASV_sampleName_dict:
        sample_names = ASV_sampleName_dict[ASV]
        sample_types = [sample_type_site_dict[name] for name in sample_names]
        sample_types = pd.Series(sample_types).value_counts()
        row = pd.DataFrame(sample_types)
        sample_type_site_num = pd.concat([sample_type_site_num, row], axis=1, sort=False)
sample_type_site_num.columns = ASV_sampleName_dict.keys()
sample_type_site_num = sample_type_site_num.T 
sample_type_site_num = sample_type_site_num.fillna(0)
print(sample_type_site_num.head())
sample_type_site_num.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_ASV_count.txt',sep='\t',index=False)
sample_type_site_counts =  metadata['Description_site'].value_counts()
print(sample_type_counts)
sample_type_site_perc = sample_type_site_num/sample_type_site_counts 
print(sample_type_site_perc.head())
sample_type_site_perc.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_ASV_perc.txt',sep='\t',index=False)



       western_human_CHN  western_human_DNK  western_human_USA  \
ASV_1              571.0              278.0              269.0   
ASV_2              382.0              255.0              134.0   
ASV_3              447.0              229.0              174.0   
ASV_4              366.0              179.0              160.0   
ASV_5              265.0               94.0              128.0   

       western_human_SWE  western_human_ISR  western_human_ESP  \
ASV_1              233.0              220.0              204.0   
ASV_2              203.0              160.0              141.0   
ASV_3              177.0              121.0              129.0   
ASV_4              130.0              212.0              128.0   
ASV_5              172.0               26.0               44.0   

       western_human_NLD  western_human_DEU  western_human_GBR  \
ASV_1              194.0              165.0              137.0   
ASV_2              111.0              124.0              126.0   
ASV_3   

In [128]:
#subset to just captive ape descriptions
sample_type_site_num.columns
sample_type_site_num_just_captive = sample_type_site_num[captive_only_desc]
sample_type_site_num_just_captive.index.name = 'ASV'
sample_type_site_num_just_captive.reset_index(inplace=True)
sample_type_site_num_just_captive.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_captive_ASV_count.txt',sep='\t',index=False)
sample_type_site_perc_just_captive = sample_type_site_perc[captive_only_desc]
sample_type_site_perc_just_captive.index.name = 'ASV'
sample_type_site_perc_just_captive.reset_index(inplace=True)
sample_type_site_perc_just_captive.to_csv('analyses/codiv_moeller_ASVs/Table_sampletypes_site_captive_ASV_perc.txt',sep='\t',index=False)



NameError: name 'sample_type_site_num' is not defined