In [25]:
import json
import pandas as pd
from augur.utils import json_to_tree
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [1]:
# The tree in rsv_step1 was built with a reference that contains the G gene duplication.
# Strains without the duplication can be seen in that tree by highlighting the auspice tree for
# 5520- (RSV-A) and 5420-/5480- (RSV-B).
# The clade without this deletion is the clade with the duplication (ON1 for RSV-A and BA for RSV-B).

In [2]:
# Look at the tree from Step 1 to find the branch that is the common ancestor of the strains with the duplication,
# and to identify outliers:
# genomes with the duplication that do not descend from this common ancestor, and genomes without the duplication that do.

In [37]:
#RSV-A

#common ancestor of the ON1 clade
rsv_a_on1_common_ancestor = 'NODE_0000652'
#in this tree, these tips are nested within the ON1 clade but have 72nt deletions (aka do not have duplication)
rsv_a_CA_butnodup = ['MK167035', 'MF001057','KU950629','KU950594','MH279547',
                     'MZ151853','MH383066','MN306050','MN310477', 'KU950626',
                     'MW160746','KY967362','KY967363','MN306054']
#in this tree, the following have the duplication but are not nested within the same clade. Add manually
#NOTE(review): 'KJ672442' used to be listed twice here; the repeated entry has been dropped.
#Please confirm that the second occurrence was not meant to be a different accession.
rsv_a_dup_butnoCA = ['KJ672446','KM042384', 'KJ672442', 'KM042385',
                     'KX765894', 'KX765911', 'KX765931', 'KX765960', 'KX765967',
                     'KX765936', 'KX655675', 'KX655694', 'KX765938']
#exceptions to the clade-based rule, keyed the way parse_strains_by_duplication reads them
rsv_a_exceptions = {'manual_exclusions': rsv_a_CA_butnodup, 'manual_inclusions': rsv_a_dup_butnoCA}

In [38]:
#RSV-B

#name of the node that is the common ancestor of the BA clade
rsv_b_ba_common_ancestor = 'NODE_0000001'

#tips nested inside the BA clade in this tree that carry 60nt deletions (i.e. lack the duplication)
rsv_b_CA_butnodup = [
    'MG431253',
    'KX655690',
    'MG813994',
    'MT040081',
    'KU950605',
    'MT040085',
    'MN163124',
    'MT040087',
    'MT040084',
    'MT040089',
]

#tips that carry the duplication but sit outside that clade in this tree; added by hand
rsv_b_dup_butnoCA = [
    'KP258739',
    'KU316158',
    'KU316105',
    'KU316172',
]

#both exception lists bundled under the keys read by parse_strains_by_duplication
rsv_b_exceptions = dict(
    manual_exclusions=rsv_b_CA_butnodup,
    manual_inclusions=rsv_b_dup_butnoCA,
)

In [39]:
#Read in the tree from Step1 and make a list of strains with and without the duplication

def parse_strains_by_duplication(subtype, common_ancestor, exceptions):
    """Split the tips of the Step1 tree into strains with and without the G duplication.

    A tip is counted as having the duplication when it descends from the node
    named `common_ancestor` and is not in `exceptions['manual_exclusions']`,
    or when it lies outside that clade but is in
    `exceptions['manual_inclusions']`. Every other tip is counted as lacking it.

    Parameters
    ----------
    subtype : str
        Interpolated into the tree path `../rsv_step1/auspice/rsv_{subtype}.json`.
    common_ancestor : str
        Name of the node whose descendants are taken to carry the duplication.
    exceptions : dict
        Must provide the keys 'manual_exclusions' and 'manual_inclusions',
        each a collection of tip names.

    Returns
    -------
    tuple of (list, list)
        `(dup_strains, other_strains)`: tip names in tree traversal order.

    Raises
    ------
    ValueError
        If the tree has no node named `common_ancestor`. Without this check a
        misspelt node name would silently classify every tip as lacking the
        duplication.
    """
    #read in tree
    tree_json_file = f'../rsv_step1/auspice/rsv_{subtype}.json'
    with open(tree_json_file, 'r') as f:
        tree_json = json.load(f)
    # assumes json_to_tree returns a Bio.Phylo-style clade (find_clades / get_terminals)
    tree = json_to_tree(tree_json)

    #sets give constant-time membership checks inside the tip loop
    manual_exclusions = set(exceptions['manual_exclusions'])
    manual_inclusions = set(exceptions['manual_inclusions'])

    #collect the tips below the common ancestor once, rather than recomputing the
    #root-to-tip path for every tip; this also works when the ancestor is the root
    clade_tip_names = set()
    ancestor_found = False
    for clade in tree.find_clades():
        if clade.name == common_ancestor:
            ancestor_found = True
            clade_tip_names.update(tip.name for tip in clade.get_terminals())
    if not ancestor_found:
        raise ValueError(
            f"common ancestor {common_ancestor!r} was not found in {tree_json_file}"
        )

    dup_strains = []
    other_strains = []

    #tips that descend from the common ancestor have the G duplication, with a few exceptions
    for tip in tree.find_clades(terminal=True):
        if tip.name in clade_tip_names:
            has_duplication = tip.name not in manual_exclusions
        else:
            has_duplication = tip.name in manual_inclusions

        if has_duplication:
            dup_strains.append(tip.name)
        else:
            other_strains.append(tip.name)

    return dup_strains, other_strains

In [40]:
# RSV-A: tips classified as carrying the duplication (ON1) vs. all remaining tips
a_on1_strains, a_other_strains = parse_strains_by_duplication('A', rsv_a_on1_common_ancestor, rsv_a_exceptions)

In [41]:
# RSV-B: tips classified as carrying the duplication (BA) vs. all remaining tips
b_ba_strains, b_other_strains = parse_strains_by_duplication('B', rsv_b_ba_common_ancestor, rsv_b_exceptions)

In [42]:
# write a new data file in rsv_step2/data/ with all subtype A viruses that have duplication, and another that do not

def write_step2_data(subtype, dup_strains, other_strains):
    """Split the Step1 genome FASTA into two Step2 FASTA files.

    Reads `../rsv_step1/data/rsv_{subtype}_genome.fasta` and writes the
    records whose accession is in `dup_strains` to
    `../rsv_step2/data/rsv_{subtype}_dup.fasta` and those whose accession is
    in `other_strains` to `../rsv_step2/data/rsv_{subtype}_other.fasta`.
    Records in neither collection are skipped; a record in both goes to the
    duplication file only.

    Parameters
    ----------
    subtype : str
        Interpolated into the input and output file names.
    dup_strains : iterable of str
        Accessions of the strains with the duplication.
    other_strains : iterable of str
        Accessions of the strains without the duplication.
    """
    data_file = f'../rsv_step1/data/rsv_{subtype}_genome.fasta'

    #sets make the per-record membership checks constant-time
    dup_lookup = set(dup_strains)
    other_lookup = set(other_strains)

    dup_records = []
    other_records = []

    with open(data_file, 'r') as handle:
        for virus in SeqIO.parse(handle, 'fasta'):
            #the accession is the part of the record id before the first '|'
            accession = virus.id.split('|')[0]

            #the parsed records are written as they are; id, description and sequence are unchanged
            if accession in dup_lookup:
                dup_records.append(virus)
            elif accession in other_lookup:
                other_records.append(virus)

    SeqIO.write(dup_records, f'../rsv_step2/data/rsv_{subtype}_dup.fasta', "fasta")
    SeqIO.write(other_records, f'../rsv_step2/data/rsv_{subtype}_other.fasta', "fasta")

In [43]:
# writes ../rsv_step2/data/rsv_A_dup.fasta and ../rsv_step2/data/rsv_A_other.fasta
write_step2_data('A', a_on1_strains, a_other_strains)

In [44]:
# writes ../rsv_step2/data/rsv_B_dup.fasta and ../rsv_step2/data/rsv_B_other.fasta
write_step2_data('B', b_ba_strains, b_other_strains)

In [45]:
# add a column to metadata (that can be used to color_by in auspice) based on whether sequence has duplication
#this will be used in step3

def write_step3_metadata(subtype, dup_strains, other_strains):
    """Write the Step3 metadata table with an added `G_duplication` column.

    Reads `../rsv_step1/results/metadata_{SUBTYPE}.tsv`, attaches a boolean
    `G_duplication` column (True for `dup_strains`, False for
    `other_strains`) by merging on the `strain` column, and writes the result
    tab-separated to `../rsv_step3/data/metadata_{SUBTYPE}.tsv`. The merge is
    an inner join, so metadata rows whose strain is in neither list are not
    written.

    Parameters
    ----------
    subtype : str
        Subtype label; upper-cased before being put into the file names.
    dup_strains : iterable of str
        Strain names with the duplication.
    other_strains : iterable of str
        Strain names without the duplication.
    """
    #read in metadata tsv from Step1
    metadata = pd.read_csv(f'../rsv_step1/results/metadata_{subtype.upper()}.tsv', sep='\t')

    #make dataframe with strain name and whether it has duplication
    #(built column-wise so that both columns exist even when both inputs are empty)
    dup_strains = list(dup_strains)
    other_strains = list(other_strains)
    duplication_df = pd.DataFrame({
        'strain': dup_strains + other_strains,
        'G_duplication': [True] * len(dup_strains) + [False] * len(other_strains),
    })

    new_metadata = metadata.merge(duplication_df, on='strain')

    #the file is named .tsv and the input is tab-separated, so write tabs rather than the comma default
    new_metadata.to_csv(f'../rsv_step3/data/metadata_{subtype.upper()}.tsv', sep='\t', index=False)

In [46]:
# the subtype is upper-cased inside the function, so this reads and writes metadata_A.tsv
write_step3_metadata('a', a_on1_strains, a_other_strains)

In [47]:
# the RSV-B duplication strains are held in b_ba_strains (BA clade); the subtype is upper-cased inside the function
write_step3_metadata('b', b_ba_strains, b_other_strains)