In [3]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from augur.utils import json_to_tree
import json

Code to split the input FASTA data file into separate data files, one per lineage. A build must already have been run with all sequences (including both lineages); the separate data files are then constructed by finding all tips that descend from the common ancestor of each lineage.

In [2]:
# The common-ancestor branch name of each lineage has to be looked up by hand
# in the tree JSON file.
# Layout: {subtype: {lineage: common_ancestor_branch_name}}
# (the outer key is interpolated into the tree JSON and FASTA file names below)
lineage_ancestors = {
    '1': {
        'A': 'NODE_0000049',
        'B': 'NODE_0000040',
    },
}

In [6]:
# For every subtype: load its tree, collect the tips under each lineage's
# common-ancestor node, then split hpiv_{subtype}.fasta into one FASTA per lineage.
for subtype, lineages in lineage_ancestors.items():
    # Read the tree JSON for this subtype.
    with open(f'../auspice/parainfluenza_{subtype}.json', 'r') as f:
        tree_json = json.load(f)

    # Put tree in Bio.Phylo format
    tree = json_to_tree(tree_json)

    # Tip names per lineage, plus the reverse lookup used when filtering the FASTA.
    tips_by_lineage = {}
    lineage_by_tip = {}

    for lineage, ancestor_name in lineages.items():
        # Locate the common-ancestor node once (exact name comparison) instead of
        # recomputing the root-to-tip path of every tip for every lineage. This
        # also handles an ancestor that is the root of the tree, which
        # get_path() leaves out of the paths it returns.
        ancestor = next(
            (clade for clade in tree.find_clades() if clade.name == ancestor_name),
            None,
        )
        if ancestor is None:
            # Fail loudly: silently continuing would write an empty FASTA file
            # for this lineage.
            raise ValueError(
                f"Common ancestor {ancestor_name!r} of lineage {lineage!r} was not "
                f"found in ../auspice/parainfluenza_{subtype}.json; "
                "check lineage_ancestors against the tree."
            )

        # Every tip descending from the common ancestor belongs to this lineage.
        tips_in_this_lineage = [tip.name for tip in ancestor.get_terminals()]
        # NOTE: if one ancestor is nested inside another, the lineage listed
        # later in `lineages` overwrites the earlier assignment for shared tips.
        for tip_name in tips_in_this_lineage:
            lineage_by_tip[tip_name] = lineage
        tips_by_lineage[lineage] = tips_in_this_lineage

    seq_records = {lineage: [] for lineage in tips_by_lineage}

    # Open the FASTA in a context manager so the handle is closed after parsing.
    with open(f'hpiv_{subtype}.fasta', "r") as fasta_handle:
        for record in SeqIO.parse(fasta_handle, "fasta"):
            # The part of the record id before the first '|' is what is matched
            # against the tip names of the tree.
            genbank_id = record.id.split('|')[0]
            if genbank_id in lineage_by_tip:
                lineage_this_tip = lineage_by_tip[genbank_id]

                seq_records[lineage_this_tip].append(
                    SeqRecord(record.seq, id=record.id, description=record.description)
                )

    # One output file per lineage, e.g. subtype '1' + lineage 'A' -> hpiv_1A.fasta
    for lineage, seqs in seq_records.items():
        SeqIO.write(seqs, f'hpiv_{subtype}{lineage}.fasta', "fasta")