In [5]:
import requests
import json
import pandas as pd
import numpy as np
from augur.utils import json_to_tree
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Not all RSV sequences have their subtype (A or B) labeled. Use the phylogeny of all RSV sequences (both A and B) to label each sequence by subtype, so that separate A and B tree builds can be run.

In [20]:
# Read in the combined tree (RSV-A and RSV-B together) from the Auspice JSON
tree_json_file = '../rsv_step0/auspice/rsv.json'
with open(tree_json_file, 'r') as f:
    tree_json = json.load(f)
tree = json_to_tree(tree_json)

# Arbitrarily chosen tips that are known to be RSV-A or RSV-B
known_A = 'KY883566'
known_B = 'KU950682'

# Reset before searching so that re-running this cell can never silently
# reuse a value left in the kernel by an earlier execution
A_ancestral_node = None
B_ancestral_node = None

# Find the name of the node taken to be parent to all RSV-A (or all RSV-B) isolates.
# get_path() lists the clades from just below the root down to the tip, so index 1
# is the second clade below the root.
# NOTE(review): this assumes the A/B split sits at that depth in this tree -- confirm
# if the tree topology or rooting changes.
for node in tree.find_clades():
    if node.name == known_B:
        node_path_B = tree.get_path(node)
        if len(node_path_B) < 2:
            raise ValueError(f'Reference tip {known_B} is too close to the root to identify an RSV-B ancestral node')
        B_ancestral_node = node_path_B[1].name
    elif node.name == known_A:
        node_path_A = tree.get_path(node)
        if len(node_path_A) < 2:
            raise ValueError(f'Reference tip {known_A} is too close to the root to identify an RSV-A ancestral node')
        A_ancestral_node = node_path_A[1].name

# Fail here, with a clear message, rather than mislabeling sequences downstream
if A_ancestral_node is None:
    raise ValueError(f'Reference RSV-A tip {known_A} was not found in {tree_json_file}')
if B_ancestral_node is None:
    raise ValueError(f'Reference RSV-B tip {known_B} was not found in {tree_json_file}')
if A_ancestral_node == B_ancestral_node:
    # If both reference tips fall under the same node, that node cannot separate the subtypes
    raise ValueError(
        f'Reference tips {known_A} and {known_B} share the ancestral node {A_ancestral_node}; '
        'cannot distinguish RSV-A from RSV-B at this depth of the tree'
    )


In [21]:
# For each tip on the tree, find which subtype it belongs to.
# Store this info in a dictionary with isolate accession ID (the tip name) as key
# and subtype ('A' or 'B') as value. Tips that fall under neither subtype's
# ancestral node are left out of the dictionary.
subtype_dict = {}

for node in tree.find_clades(terminal=True):
    node_path = tree.get_path(node)
    # A tip whose path from the root has fewer than two clades has no clade at
    # index 1, so it cannot sit under either subtype's ancestral node: leave it
    # unlabeled instead of raising IndexError
    if len(node_path) < 2:
        continue
    ancestral_node = node_path[1].name
    if ancestral_node == A_ancestral_node:
        subtype_dict[node.name] = 'A'
    elif ancestral_node == B_ancestral_node:
        subtype_dict[node.name] = 'B'

In [23]:
# Split the combined FASTA into two files: one with every subtype A virus and
# one with every subtype B virus. Records whose accession was not assigned a
# subtype from the tree are dropped.

data_file = '../rsv_step0/data/rsv.fasta'

with open(data_file, 'r') as handle:

    edited_records_A = []
    edited_records_B = []
    # Route each relabeled record to the list for its subtype
    records_by_subtype = {'A': edited_records_A, 'B': edited_records_B}

    for virus in SeqIO.parse(handle, 'fasta'):
        # NOTE(review): virus.id is the header as exposed by the FASTA parser's `id`
        # attribute; presumably the '|'-separated headers contain no whitespace -- confirm
        header_fields = virus.id.split('|')
        accession = header_fields[0]

        if accession not in subtype_dict:
            continue

        subtype = subtype_dict[accession]

        # New header: the first six fields, then the inferred subtype, then the final field
        virus.id = '|'.join(header_fields[0:6]) + f'|{subtype}|' + header_fields[-1]
        # Overwrite the description with the new id so the old header text is not kept
        virus.description = virus.id

        target_records = records_by_subtype.get(subtype)
        if target_records is not None:
            target_records.append(SeqRecord(virus.seq, id=virus.id, description=virus.description))

    SeqIO.write(edited_records_A, '../rsv_step0/data/rsv_A_genome.fasta', "fasta")
    SeqIO.write(edited_records_B, '../rsv_step0/data/rsv_B_genome.fasta', "fasta")