In [8]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature
import json
import re

In [9]:
#for all insertions inferred on the tree, find whether they occur within genes (and would affect coding)
#also check for strains with duplication that were missed and manually add them to the duplication subset in step2

In [11]:
def get_gene_by_pos(subtype):
    """Map every CDS position of the subtype's step3 reference to its gene name.

    Parses ``config/rsv_{subtype}_step3_reference.gb`` and, for each feature of
    type ``CDS``, assigns the feature's first ``gene`` qualifier to every
    position in ``range(location.start, location.end)``.

    Parameters
    ----------
    subtype : str
        Subtype label interpolated verbatim into the reference file name.

    Returns
    -------
    dict
        ``{position: gene_name}``. Positions outside any CDS are absent from
        the mapping. Where CDS features overlap, the feature listed later in
        the GenBank record wins.

    Raises
    ------
    KeyError
        If a CDS feature has no ``gene`` qualifier.
    """
    #map each position to gene
    #noncoding positions will not map to anything
    map_pos_to_gene = {}

    #context manager so the reference file is closed once parsing is finished
    with open(f'config/rsv_{subtype}_step3_reference.gb', 'r') as handle:
        for record in SeqIO.parse(handle, "gb"):
            for feature in record.features:
                if feature.type == 'CDS':
                    gene = feature.qualifiers['gene'][0]
                    #NOTE(review): positions are taken exactly as Biopython reports them
                    #(start inclusive, end exclusive); confirm the insertion positions
                    #looked up against this map use the same coordinate convention
                    for x in range(feature.location.start, feature.location.end):
                        map_pos_to_gene[x] = gene

    #the map was previously built and then discarded; hand it back to the caller
    return map_pos_to_gene

In [14]:
def find_coding_insertions(subtype, gene_by_pos=None):
    """Find tree-inferred insertions that fall inside coding sequences.

    Reads ``results/insertions_{SUBTYPE}.json`` (subtype upper-cased) and walks
    the ``ins`` entries under each node's ``muts``. Every entry must start with
    the insertion position (digits) and contain the inserted sequence as a run
    of uppercase letters, e.g. ``'9097AAACAA (L:180/2)'``.

    Parameters
    ----------
    subtype : str
        ``'a'`` or ``'b'``; selects the input file and the expected length of
        the G duplication (72 or 60).
    gene_by_pos : dict, optional
        ``{position: gene_name}`` for coding positions. When omitted, it is
        obtained from ``get_gene_by_pos(subtype)``.

    Returns
    -------
    tuple of (list, list)
        ``insertions_in_coding_regions``: ``[node_name, insertion, gene]`` for
        each insertion inside a gene that is not a duplication-length insertion
        in G. ``missed_dup_strains``: node names carrying an insertion in G
        whose length equals the subtype's duplication length.

    Raises
    ------
    KeyError
        If ``subtype`` has no known duplication length.
    ValueError
        If an insertion entry has no leading position or no uppercase sequence.
    """
    if gene_by_pos is None:
        gene_by_pos = get_gene_by_pos(subtype)
    if gene_by_pos is None:
        #get_gene_by_pos gave nothing back: read the notebook-level
        #map_pos_to_gene instead, which is the name this function used to rely on
        gene_by_pos = map_pos_to_gene

    #read in file with all insertions and the node they were inferred on
    with open(f'results/insertions_{subtype.upper()}.json') as f:
        insertions_json = json.load(f)

    #hardcode length of duplications
    dup_len = {'a':72,'b':60}
    expected_dup_len = dup_len[subtype]

    #find what (if any) insertions occur in coding regions
    #also find which strains have G duplication (but were missed as ON1 in step2)
    insertions_in_coding_regions = []
    missed_dup_strains = []

    for node_name, node_data in insertions_json['nodes'].items():
        muts = node_data['muts']
        if not muts or 'ins' not in muts:
            continue

        for ins in muts['ins']:
            pos_match = re.match('[0-9]+', ins)
            seq_match = re.search('[A-Z]+', ins)
            if pos_match is None or seq_match is None:
                raise ValueError(f'cannot parse insertion {ins!r} on node {node_name!r}')

            ins_pos = int(pos_match.group())
            if ins_pos not in gene_by_pos:
                #noncoding insertion
                continue

            gene = gene_by_pos[ins_pos]
            if gene == 'G' and len(seq_match.group()) == expected_dup_len:
                #find missed duplications
                missed_dup_strains.append(node_name)
            else:
                #every other insertion inside a gene, including insertions in G
                #that are not duplication-length (these used to be dropped silently)
                insertions_in_coding_regions.append([node_name, ins, gene])

    return insertions_in_coding_regions, missed_dup_strains
                        


In [21]:
#run the analysis for subtype 'a'; unpacks (insertions in coding regions, strains with a missed G duplication)
insertions_in_coding_regions_a, missed_dup_strains_a = find_coding_insertions('a')

In [22]:
#show the subtype 'a' insertions found in coding regions as [node, insertion, gene]
insertions_in_coding_regions_a

[['KU316143', '9097AAACAA (L:180/2)', 'L'],
 ['MG642048', '1709A', 'N'],
 ['NODE_0000281', '8959AAAGAA (L:134/2)', 'L'],
 ['NODE_0000562', '8320AA', 'M2-2'],
 ['KY460517', '2303AT', 'N'],
 ['MZ515999', '11452T (L:965/2)', 'L'],
 ['MZ516005', '11500TTNN (L:981/2)', 'L'],
 ['MZ516096', '14749AAAAAAGGAATTNN (L:2064/2)', 'L'],
 ['MZ515760', '11443T (L:962/2)', 'L'],
 ['MZ515888', '10108T (L:517/2)', 'L'],
 ['MN310477', '10249GTC (L:564/2)', 'L'],
 ['MZ515861', '12406N (L:1283/2)', 'L'],
 ['MZ515734', '10369CAAAGTTATCTTAA (L:604/2)', 'L'],
 ['MZ515734', '9829T (L:424/2)', 'L'],
 ['MZ516007', '14749A (L:2064/2)', 'L'],
 ['MZ515987', '14749A (L:2064/2)', 'L'],
 ['MZ516004', '15004A (L:2149/2)', 'L'],
 ['MZ516004', '10144A (L:529/2)', 'L']]

In [23]:
#show subtype 'a' nodes with a duplication-length insertion in G
missed_dup_strains_a

[]

In [24]:
#run the analysis for subtype 'b'; unpacks (insertions in coding regions, strains with a missed G duplication)
insertions_in_coding_regions_b, missed_dup_strains_b = find_coding_insertions('b')

In [25]:
#show the subtype 'b' insertions found in coding regions as [node, insertion, gene]
insertions_in_coding_regions_b

[['KU316179', '13545AAT (L:1660/2)', 'L'],
 ['KY249672', '13542AAAAAT (L:1659/2)', 'L'],
 ['MZ515832', '9342N (L:259/2)', 'L'],
 ['MZ515670', '8721CAAAGCCC (L:52/2)', 'L'],
 ['MZ515950', '11458C (L:964)', 'L'],
 ['MZ515727', '11458C (L:964)', 'L'],
 ['LC495297', '13566T (L:1667/2)', 'L'],
 ['LC495297', '9144G (L:193/2)', 'L']]

In [26]:
#show subtype 'b' nodes with a duplication-length insertion in G
missed_dup_strains_b

[]