In [None]:
# Write a separate FASTA file for each parvovirus B19 lineage

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from augur.utils import json_to_tree
import json

In [3]:
# Maps each lineage to the name of the internal tree node that is the common
# ancestor of that lineage, i.e. {lineage: common_ancestor}.
# The node names have to be looked up by hand in the tree JSON file.
#
# Lineage definitions follow Corcoran et al
# (https://journals.asm.org/doi/10.1128/JCM.00610-09).
lineage_ancestors = {
    '1a': 'NODE_0000088',
    '1b': 'NODE_0000092',
    '2': 'NODE_0000115',
    '3a': 'NODE_0000096',
    '3b': 'NODE_0000100',
}

In [5]:
# Split the full alignment into one FASTA file per lineage.
#
# The tree and the full FASTA do not depend on the lineage, so both are loaded
# once up front instead of being re-read on every loop iteration.
with open('../auspice/parvovirusB19_all.json', 'r') as f:
    tree_json = json.load(f)

# Put tree in Bio.Phylo format
tree = json_to_tree(tree_json)

# Read every record once; the `with` block guarantees the handle is closed
# (the previous `SeqIO.parse(open(...))` left it open).
with open('parvovirusB19_all.fasta', 'r') as fasta_handle:
    all_records = list(SeqIO.parse(fasta_handle, "fasta"))

for lineage, ca in lineage_ancestors.items():
    # Collect the names of all tips descending from the common-ancestor node.
    # Walking down from the ancestor with get_terminals() replaces calling
    # tree.get_path() for every tip, and (unlike get_path, which omits the
    # root clade) also works when the ancestor is the root of the tree.
    # A set gives constant-time membership tests in the FASTA filter below.
    tips_in_this_lineage = {
        tip.name
        for clade in tree.find_clades()
        if clade.name == ca
        for tip in clade.get_terminals()
    }

    # Every clade has at least one terminal, so an empty set means the node
    # name was not found in the tree. Fail loudly rather than silently
    # writing an empty FASTA file for this lineage.
    if not tips_in_this_lineage:
        raise ValueError(
            f"Common ancestor {ca!r} for lineage {lineage!r} was not found in the tree"
        )

    # FASTA ids look like '<genbank_id>|...'; tree tips are matched on the
    # part before the first '|'.
    seq_records = [
        SeqRecord(record.seq, id=record.id, description=record.description)
        for record in all_records
        if record.id.split('|')[0] in tips_in_this_lineage
    ]

    SeqIO.write(seq_records, f'parvovirusB19_{lineage}.fasta', "fasta")