In [4]:
import json
from augur.utils import json_to_tree
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Differences from Step3:
- Exclude the RSV-B outlier that is much older than the other sequences and appears to be skewing the divergence analysis
- Split RSV-A into two lineages. Call them RSV-A1 (the upper clade of the RSV-A Step3 tree, which contains the duplication) and RSV-A2 (the lower clade, which dies out)

In [7]:
#read in the Step3 RSV-A tree (Auspice JSON) and convert it into a tree object
tree_json_file = '../rsv_step3/auspice/rsv_A.json'
with open(tree_json_file, 'r') as f:
    tree_json = json.load(f)
tree = json_to_tree(tree_json)

#split RSV-A into two lineages, each anchored by one isolate known to belong to it
known_rsv_A1 = 'KU950615'
known_rsv_A2 = 'KP258727'

# find the name of the node that is parent to all RSV-A1 (or all RSV-A2) isolates:
# this is the first clade on the path from the top of the tree down to the known isolate
# (per Bio.Phylo's get_path, the path excludes the root and ends at the target)
node_path_A1 = None
node_path_A2 = None
for node in tree.find_clades():
    if node.name == known_rsv_A1:
        node_path_A1 = tree.get_path(node)
        A1_ancestral_node = node_path_A1[0].name
    elif node.name == known_rsv_A2:
        node_path_A2 = tree.get_path(node)
        A2_ancestral_node = node_path_A2[0].name

# fail here, with a clear message, rather than with a NameError in a later cell
missing_isolates = [isolate
                    for isolate, path in ((known_rsv_A1, node_path_A1), (known_rsv_A2, node_path_A2))
                    if path is None]
if missing_isolates:
    raise ValueError(f'isolate(s) {missing_isolates} not found in {tree_json_file}')

# if both isolates sit under the same node, the split below would put every tip in RSV-A1
if A1_ancestral_node == A2_ancestral_node:
    raise ValueError(f'{known_rsv_A1} and {known_rsv_A2} share the ancestral node '
                     f'{A1_ancestral_node}; cannot split RSV-A into two lineages')

In [24]:
# Assign every tip of the RSV-A tree to a lineage by comparing the name of the first
# node on its path from the top of the tree against the two lineage-defining nodes.
# The resulting lists hold tip names (isolate accession IDs); tips that match neither
# node end up in neither list.
tip_ancestors = [(tip.name, tree.get_path(tip)[0].name)
                 for tip in tree.find_clades(terminal=True)]

lineage_A1_tips = [tip_name for tip_name, ancestor_name in tip_ancestors
                   if ancestor_name == A1_ancestral_node]
# A1 takes precedence, so a tip is only considered for A2 when it did not match A1
lineage_A2_tips = [tip_name for tip_name, ancestor_name in tip_ancestors
                   if ancestor_name != A1_ancestral_node
                   and ancestor_name == A2_ancestral_node]

In [18]:
#duplicated sequence for each subtype, keyed by subtype letter
#('a' is 72 nt long, 'b' is 60 nt long)
dup_seq_dict = dict(
    a='GTCAAGAGGAAACCCTCCACTCAACCACCTCCGAAGGCTATCTAAGCCCATCACAAGTCTATACAACATCCG',
    b='ACAGAAAGAGACACCAGCACCTCACAATCCACTGTGCTCGACACAACCACATCAAAACAC',
)

In [16]:
#in this tree, these tips are nested within the ON1 clade but have 72nt deletions (aka do not have duplication)
rsv_a_CA_butnodup = ['MK167035', 'MF001057', 'KU950629', 'KU950594', 'MH279547',
                     'MZ151853', 'MH383066', 'MN306050', 'MN310477', 'KU950626',
                     'MW160746', 'KY967362', 'KY967363', 'MN306054']
#in this tree, the following have the duplication but are not nested within the same clade
#NOTE(review): 'KJ672442' used to be listed twice here; the repeat has been dropped.
#If the second entry was meant to be a different accession, add it back.
rsv_a_dup_butnoCA = ['KJ672446', 'KM042384', 'KJ672442', 'KM042385',
                     'KX765894', 'KX765911', 'KX765931', 'KX765960', 'KX765967',
                     'KX765936', 'KX655675', 'KX655694', 'KX765938']


#in this tree, these tips are nested within the BA clade but have 60nt deletions (aka do not have duplication)
rsv_b_CA_butnodup = ['MG431253', 'KX655690', 'MG813994', 'MT040081',
                     'KU950605', 'MT040085', 'MN163124', 'MT040087',
                     'MT040084', 'MT040089']
#in this tree, the following have the duplication but are not nested within the same clade
rsv_b_dup_butnoCA = ['KP258739', 'KU316158', 'KU316105', 'KU316172']

#remove outlier
rsv_b_outlier = ['MG813995']

#accession IDs to leave out of the merged alignments, keyed by subtype letter
to_exclude = {'a': rsv_a_CA_butnodup + rsv_a_dup_butnoCA,
              'b': rsv_b_CA_butnodup + rsv_b_dup_butnoCA + rsv_b_outlier}

In [22]:
#find where duplication placeholder should go
def find_duplication_location(subtype):
    """Return the position just past the first copy of the duplicated sequence.

    The Step3 reference for `subtype` is searched for the subtype's entry in
    `dup_seq_dict`; the returned index is the position of the first match plus
    the length of the duplicated sequence. Presumably this is where the second
    (tandem) copy begins in the reference -- TODO confirm against the reference.

    Parameters
    ----------
    subtype : str
        Subtype letter used both as the key into `dup_seq_dict` and in the
        reference file name (e.g. 'a' or 'b').

    Returns
    -------
    int
        0-based index into the reference sequence.

    Raises
    ------
    KeyError
        If `subtype` is not a key of `dup_seq_dict`.
    ValueError
        If the reference file holds no sequences, or a sequence in it does not
        contain the duplicated sequence.
    """
    dup_ref_fasta = f'../rsv_step3/config/rsv_{subtype}_step3_reference.fasta'

    dup_seq = dup_seq_dict[subtype]
    #compare in lowercase on both sides so the case of the reference does not matter
    dup_seq_lower = dup_seq.lower()

    start_dup = None
    with open(dup_ref_fasta, 'r') as handle:
        #if the file holds more than one sequence, the last one determines the result
        for ref in SeqIO.parse(handle, 'fasta'):
            loc_template = str(ref.seq).lower().find(dup_seq_lower)
            #str.find signals "not found" with -1, which would otherwise turn
            #into a plausible-looking but wrong position
            if loc_template == -1:
                raise ValueError(f'duplicated sequence for subtype {subtype!r} '
                                 f'not found in {ref.id} ({dup_ref_fasta})')
            start_dup = loc_template + len(dup_seq)

    if start_dup is None:
        raise ValueError(f'no sequences found in {dup_ref_fasta}')

    return start_dup

In [19]:
def add_placeholder_for_duplication(subtype):
    """Pad the sequences aligned without the duplication with a gap placeholder.

    Reads '../rsv_step2/results/aligned_{SUBTYPE}_other.fasta', skips every
    record whose id is listed in `to_exclude[subtype]`, and inserts a run of
    '-' as long as the subtype's duplicated sequence into each remaining one.

    Returns a list of SeqRecord objects carrying the padded sequence and the
    original record id.
    """
    #duplicated sequence and its position in the genome
    dup_seq = dup_seq_dict[subtype]
    start_dup = find_duplication_location(subtype)

    #alignment of all strains without the duplication,
    #made against a reference that does not have the duplication
    other_aligned_fasta = f'../rsv_step2/results/aligned_{subtype.upper()}_other.fasta'

    #for RSV-A the duplication starts at second position of reading frame,
    #so the placeholder goes in one position earlier so translation will work
    insert_at = start_dup - 1 if subtype == 'a' else start_dup
    dup_placeholder = '-' * len(dup_seq)

    other_aligned_records = []
    with open(other_aligned_fasta, 'r') as handle:
        for virus in SeqIO.parse(handle, 'fasta'):
            #leave out the strains in the 'to_exclude' list
            if virus.id in to_exclude[subtype]:
                continue
            original_seq = str(virus.seq)
            virus_seq_w_placeholder = Seq(
                original_seq[:insert_at] + dup_placeholder + original_seq[insert_at:]
            )
            other_aligned_records.append(
                SeqRecord(seq=virus_seq_w_placeholder, id=virus.id)
            )

    return other_aligned_records

In [25]:
def merge_aligned_fastas(subtype):
    """Merge the two Step2 alignments of a subtype and write the result to disk.

    The records returned by `add_placeholder_for_duplication(subtype)` are
    combined with the records of
    '../rsv_step2/results/aligned_{SUBTYPE}_dup.fasta', leaving out every id in
    `to_exclude[subtype]`.

    For subtype 'a' the merged records are split by membership in
    `lineage_A1_tips` / `lineage_A2_tips` and written to 'aligned_A1_all.fasta'
    and 'aligned_A2_all.fasta'; records in neither list are not written.
    For subtype 'b' all merged records are written to 'aligned_B_all.fasta'.
    All output goes to '../rsv_builds_for_divergence/data/'. Returns None.
    """
    #get the alignment of sequences without duplications (but with placeholder for them)
    all_aligned_records = add_placeholder_for_duplication(subtype)

    dup_aligned_fasta = f'../rsv_step2/results/aligned_{subtype.upper()}_dup.fasta'

    #sets make each membership test a hash lookup instead of a scan of the list
    excluded_ids = set(to_exclude[subtype])

    with open(dup_aligned_fasta, 'r') as handle:
        #do not include the strains in the 'to_exclude' list
        all_aligned_records.extend(
            SeqRecord(seq=virus.seq, id=virus.id)
            for virus in SeqIO.parse(handle, 'fasta')
            if virus.id not in excluded_ids
        )

    #split RSV-A into lineage A1 and A2
    if subtype == 'a':
        A1_ids = set(lineage_A1_tips)
        A2_ids = set(lineage_A2_tips)

        A1_aligned_records = [x for x in all_aligned_records if x.id in A1_ids]
        #A1 takes precedence: a record is only considered for A2 if it is not in A1
        A2_aligned_records = [x for x in all_aligned_records
                              if x.id not in A1_ids and x.id in A2_ids]

        with open('../rsv_builds_for_divergence/data/aligned_A1_all.fasta', 'w') as handle:
            SeqIO.write(A1_aligned_records, handle, 'fasta')
        with open('../rsv_builds_for_divergence/data/aligned_A2_all.fasta', 'w') as handle:
            SeqIO.write(A2_aligned_records, handle, 'fasta')

    elif subtype == 'b':
        with open(f'../rsv_builds_for_divergence/data/aligned_{subtype.upper()}_all.fasta', 'w') as handle:
            SeqIO.write(all_aligned_records, handle, 'fasta')

In [26]:
#build the RSV-A alignments: writes aligned_A1_all.fasta and aligned_A2_all.fasta
#to ../rsv_builds_for_divergence/data/
merge_aligned_fastas('a')

In [30]:
#build the RSV-B alignment: writes aligned_B_all.fasta
#to ../rsv_builds_for_divergence/data/
merge_aligned_fastas('b')

In [29]:
#check the RSV-A2 alignment: print the id of every sequence whose length is not 15277,
#then print how many sequences the file contains
with open('../rsv_builds_for_divergence/data/aligned_A2_all.fasta', 'r') as handle:
    A2_ids_and_lengths = [(virus.id, len(virus)) for virus in SeqIO.parse(handle, 'fasta')]

for virus_id, seq_length in A2_ids_and_lengths:
    if seq_length != 15277:
        print(virus_id)

count = len(A2_ids_and_lengths)
print(count)

194


In [31]:
#check the RSV-B alignment: print the id of every sequence whose length is not 15285
with open('../rsv_builds_for_divergence/data/aligned_B_all.fasta', 'r') as handle:
    B_ids_wrong_length = [virus.id
                          for virus in SeqIO.parse(handle, 'fasta')
                          if len(virus) != 15285]

for virus_id in B_ids_wrong_length:
    print(virus_id)