In [12]:
import re
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Seq import MutableSeq
from Bio import SeqFeature

In [3]:
# The reference genomes for ON1 and the "other" RSV-A reference should be identical except for the duplication event.
# The same applies to BA and the "other" reference for RSV-B.
# This way, coordinates are the same in both references except for the duplication.

In [2]:
# Sequences that make_nondup_ref removes from the duplication-containing references:
# dup_seq_a (72 nt) is passed for subtype 'a', dup_seq_b (60 nt) for subtype 'b'.
dup_seq_a = 'GTCAAGAGGAAACCCTCCACTCAACCACCTCCGAAGGCTATCTAAGCCCATCACAAGTCTATACAACATCCG'
dup_seq_b = 'ACAGAAAGAGACACCAGCACCTCACAATCCACTGTGCTCGACACAACCACATCAAAACAC'

In [18]:
def _shift_coordinate(position, cut_start, cut_end):
    """Map a coordinate of the original genome onto the genome lacking [cut_start, cut_end)."""
    if position >= cut_end:
        return position - (cut_end - cut_start)
    if position > cut_start:
        # the coordinate pointed into the removed stretch: collapse it onto the cut site
        return cut_start
    return position


def _shift_location(location, cut_start, cut_end):
    """Rebuild a feature location (simple or multi-part) with every boundary shifted past the cut."""
    shifted_parts = [
        SeqFeature.FeatureLocation(
            # int() replaces the legacy `.position` attribute, which Biopython marks as obsolete
            SeqFeature.ExactPosition(_shift_coordinate(int(part.start), cut_start, cut_end)),
            SeqFeature.ExactPosition(_shift_coordinate(int(part.end), cut_start, cut_end)),
            part.strand,
        )
        for part in location.parts
    ]
    if len(shifted_parts) == 1:
        return shifted_parts[0]
    # keep join/order semantics of multi-part locations instead of flattening them
    return SeqFeature.CompoundLocation(shifted_parts, location.operator)


def make_nondup_ref(subtype, dup_seq):
    """Write a copy of the duplication-containing reference with one copy of `dup_seq` removed.

    Reads `rsv_{subtype}_step2_dup_reference.gb`, deletes the first occurrence of
    `dup_seq` from the genome, moves every feature boundary that lies at or beyond
    the removed stretch, re-translates the CDS features whose location changed,
    and writes the result to `rsv_{subtype}_step2_other_reference.gb`.

    Parameters
    ----------
    subtype : str
        Inserted into the input and output file names (the notebook uses 'a' and 'b').
    dup_seq : str
        Nucleotide sequence to remove once from the genome.

    Raises
    ------
    FileNotFoundError
        If the input reference file does not exist.
    ValueError
        If the input file does not hold exactly one record (raised by `SeqIO.read`),
        or if `dup_seq` does not occur in the genome.
    """
    dup_reference_file = f'rsv_{subtype}_step2_dup_reference.gb'

    # Parse once and close the handle; SeqIO.read insists on exactly one record.
    with open(dup_reference_file) as handle:
        dup_record = SeqIO.read(handle, "genbank")

    genome_seq = str(dup_record.seq)

    # Locate the stretch that will be removed. Failing loudly here avoids writing a
    # reference whose coordinates were shifted although nothing was deleted.
    cut_start = genome_seq.find(dup_seq)
    if cut_start == -1:
        raise ValueError(f'duplication sequence not found in {dup_reference_file}')
    cut_end = cut_start + len(dup_seq)

    new_genome_seq = Seq(genome_seq[:cut_start] + genome_seq[cut_end:])

    # Create a seq record; id and name are derived from the original accession
    new_record = SeqRecord(new_genome_seq,
                           id=f'{dup_record.id}-nodup',
                           name=f'{dup_record.id}-nodup',
                           description=f'{dup_record.description} but duplication removed',
                           annotations={"molecule_type": "RNA"})

    for feature in dup_record.features:
        # Features ending at or before the cut are untouched. Everything else
        # (features spanning the cut, features downstream of it, and the
        # genome-wide source feature) gets its boundaries shifted.
        if int(feature.location.end) > cut_start:
            feature.location = _shift_location(feature.location, cut_start, cut_end)
            if feature.type == 'CDS':
                protein = str(feature.location.extract(new_genome_seq).translate())
                # translation qualifiers are stored without the stop symbol
                if protein.endswith('*'):
                    protein = protein[:-1]
                feature.qualifiers['translation'] = [protein]

        new_record.features.append(feature)

    with open(f'rsv_{subtype}_step2_other_reference.gb', 'w') as output_file:
        SeqIO.write(new_record, output_file, 'genbank')

    

In [19]:
# Build rsv_a_step2_other_reference.gb from rsv_a_step2_dup_reference.gb by removing dup_seq_a
make_nondup_ref('a', dup_seq_a)

In [20]:
# Build rsv_b_step2_other_reference.gb from rsv_b_step2_dup_reference.gb by removing dup_seq_b
make_nondup_ref('b', dup_seq_b)