In [None]:
#Make separate data files for each lineage

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from augur.utils import json_to_tree
import json

In [6]:
#need to look at tree json file to find the branch name for the common ancestor of the different lineages
#format of dict {segment:{lineage:common_ancestor}}

#used Matsuzaki et al 2016 (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5008092/) to find clades
#many internal segments were not clearly delineated on the auspice trees, so there's only one
#clade that can be found by a mutually exclusive common ancestor.
#For these, I will find all descendants of that branch and assign them to the appropriate clade,
#and call everything else the other clade (there are only Yamagata and Mississippi for the internals)
# Maps each segment to {lineage name: name of the tree node used as that
# lineage's common ancestor}. Entry order is kept as originally written.
lineage_ancestors = {
    'HE': {
        'Mississippi': 'NODE_0000008',
        'Aichi': 'NODE_0000032',
        'Taylor': 'NODE_0000004',
        'Yamagata': 'NODE_0000199',
        'Kanagawa': 'NODE_0000260',
        'SaoPaolo': 'NODE_0000046',
    },
    'PB2': {'Yamagata': 'NODE_0000041'},
    'PB1': {
        'Mississippi': 'NODE_0000006',
        'Yamagata': 'NODE_0000031',
    },
    'P3': {'Mississippi': 'NODE_0000037'},
    'NP': {'Mississippi': 'NODE_0000062'},
    'M': {'Yamagata': 'NODE_0000027'},
    'NS': {'Yamagata': 'NODE_0000032'},
}

In [12]:
# Split each segment's FASTA file into one FASTA file per lineage.
#
# For every segment in `lineage_ancestors`:
#   1. load the auspice tree JSON and convert it with json_to_tree,
#   2. assign every tip that descends from a lineage's common-ancestor node
#      to that lineage,
#   3. if the segment lists only ONE lineage, assign all remaining tips to the
#      complementary lineage (Yamagata <-> Mississippi),
#   4. write the matching sequences from fluC_{segment}.fasta to
#      ../separate_lineages/data/fluC_{segment}_{lineage}.fasta.
#
# Raises ValueError if a configured common-ancestor node is not in the tree, or
# if a single-lineage segment names a lineage with no known complement.

# Complement used for segments where only one lineage has a common ancestor.
OTHER_INTERNAL_LINEAGE = {'Yamagata': 'Mississippi', 'Mississippi': 'Yamagata'}

for segment, lineages in lineage_ancestors.items():
    # read in tree
    with open(f'../auspice/fluC_{segment}.json', 'r') as f:
        tree_json = json.load(f)

    # Put tree in Bio.Phylo format
    tree = json_to_tree(tree_json)

    # Index clades by name in a single traversal so each common ancestor is a
    # dict lookup instead of a per-tip path search. The first clade seen with
    # a given name wins.
    clades_by_name = {}
    for clade in tree.find_clades():
        clades_by_name.setdefault(clade.name, clade)

    all_tip_names = [tip.name for tip in tree.get_terminals()]

    # store all tip names for each lineage, and the lineage of each tip
    tips_by_lineage = {}
    lineage_by_tip = {}

    for lineage, ancestor_name in lineages.items():
        ancestor = clades_by_name.get(ancestor_name)
        if ancestor is None:
            # A wrong node name would otherwise silently produce an empty
            # lineage (and, for single-lineage segments, put every tip in the
            # complementary lineage).
            raise ValueError(
                f"Common ancestor {ancestor_name!r} for lineage {lineage!r} "
                f"not found in tree for segment {segment!r}"
            )

        # every tip below (or equal to) the common ancestor is in this lineage
        tips_in_this_lineage = [tip.name for tip in ancestor.get_terminals()]
        tips_by_lineage[lineage] = tips_in_this_lineage
        # If ancestors are nested, a lineage listed later overrides an earlier
        # one for the shared tips.
        for tip_name in tips_in_this_lineage:
            lineage_by_tip[tip_name] = lineage

    # will treat internal genes differently, call all tips that aren't
    # descending from the common ancestor as the other clade
    if len(lineages) == 1:
        (lineage_with_common_ancestor,) = lineages
        if lineage_with_common_ancestor not in OTHER_INTERNAL_LINEAGE:
            # Without this check the name of the other lineage would be
            # undefined, or left over from a previous segment.
            raise ValueError(
                f"Segment {segment!r} has a single lineage "
                f"{lineage_with_common_ancestor!r} with no known complementary lineage"
            )
        lineage_without_common_ancestor = OTHER_INTERNAL_LINEAGE[lineage_with_common_ancestor]

        # set membership keeps this pass linear in the number of tips
        assigned_tips = set(tips_by_lineage[lineage_with_common_ancestor])
        tips_other_lineage = [name for name in all_tip_names if name not in assigned_tips]

        tips_by_lineage[lineage_without_common_ancestor] = tips_other_lineage
        for tip_name in tips_other_lineage:
            lineage_by_tip[tip_name] = lineage_without_common_ancestor

    seq_records = {lineage: [] for lineage in tips_by_lineage}

    # the with-block closes the FASTA handle once parsing is done
    with open(f'fluC_{segment}.fasta', "r") as fasta_handle:
        for record in SeqIO.parse(fasta_handle, "fasta"):
            # the tip name is the part of the FASTA id before the first '|'
            genbank_id = record.id.split('|')[0]
            lineage_this_tip = lineage_by_tip.get(genbank_id)
            if lineage_this_tip is None:
                # sequence is not a tip assigned to any lineage; skip it
                continue

            seq_records[lineage_this_tip].append(
                SeqRecord(record.seq, id=record.id, description=record.description)
            )

    for lineage, seqs in seq_records.items():
        SeqIO.write(seqs, f'../separate_lineages/data/fluC_{segment}_{lineage}.fasta', "fasta")