In [20]:
import json
from augur.utils import json_to_tree
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [12]:
# Location of the tree JSON (presumably an Auspice/Nextstrain export -- confirm).
tree_json_file = 'old_jsons/rsv_A-refisrootwithinsertion.json'

# Read the whole file as text, then parse it into plain Python objects.
with open(tree_json_file, 'r') as f:
    tree_json = json.loads(f.read())

# Convert the parsed JSON into a tree object with augur's json_to_tree;
# later cells traverse it with find_clades() / get_path().
tree = json_to_tree(tree_json)

In [24]:
# Partition every tip of `tree` into two lists of tip names:
#   on1_strains   -- tips that carry the G duplication
#   other_strains -- tips that do not
on1_strains = []
other_strains = []

# In this tree, these tips are nested within the ON1 clade but have 72nt
# deletions (i.e. they do NOT carry the duplication).
manual_exceptions = ['MK167035', 'MF001057','KU950594','MH279547',
                     'MZ151853','MH383066','MN306050','MN310477',
                     'MW160746','KY967362','MN306054']

# Tips that descend from NODE_0000644 have the G duplication, with a few exceptions.
on1_root_name = 'NODE_0000644'

# Locate the clade once instead of recomputing the root-to-tip path for every tip.
on1_root = next(
    (clade for clade in tree.find_clades() if clade.name == on1_root_name),
    None,
)
if on1_root is None:
    # Fail loudly: without this node every tip would silently be classed as "other".
    raise ValueError(f"node {on1_root_name!r} not found in tree")

# Names of all tips below the ON1 node (set for constant-time membership tests).
on1_clade_tips = {tip.name for tip in on1_root.get_terminals()}

for node in tree.find_clades(terminal=True):
    if node.name in on1_clade_tips and node.name not in manual_exceptions:
        on1_strains.append(node.name)
    else:
        # Includes the manual exceptions: they lack the duplication, so they
        # belong with the non-duplication strains rather than being dropped.
        other_strains.append(node.name)
        


In [28]:
# Split the subtype A genome FASTA into two new files: one with the viruses
# listed in `on1_strains` (have the duplication) and one with the viruses
# listed in `other_strains` (do not). Records whose accession appears in
# neither list are skipped.

data_file = '../rsv_testing-ref/data/rsv_A_genome.fasta'

# Sets give constant-time lookups; the strain lists themselves are left untouched.
on1_accessions = set(on1_strains)
other_accessions = set(other_strains)

A_ON1_records = []
A_other_records = []

with open(data_file, 'r') as handle:
    for virus in SeqIO.parse(handle, 'fasta'):
        # The accession is taken as the text before the first '|' in the record id
        # (assumes ids look like "<accession>|..." -- TODO confirm against the data).
        accession = virus.id.split('|')[0]

        if accession in on1_accessions:
            A_ON1_records.append(SeqRecord(virus.seq, id=virus.id, description=virus.description))
        elif accession in other_accessions:
            A_other_records.append(SeqRecord(virus.seq, id=virus.id, description=virus.description))

# The input handle is closed at this point; write both outputs.
SeqIO.write(A_ON1_records, '../rsv_testing-ref/data/rsv_A_on1.fasta', "fasta")
SeqIO.write(A_other_records, '../rsv_testing-ref/data/rsv_A_other.fasta', "fasta")