In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature
import json

In [16]:
# Read the G-gene GenBank record JN257693 and keep two things from it:
#   on1_g_seq         - the record's full nucleotide sequence
#   on1_g_translation - the /translation qualifier of its CDS feature, exactly
#                       as parsed (whatever container the parser returns)
# If the file held several records or CDS features, the last one seen wins.
# The file is opened in a `with` block so the handle is closed once parsing
# is finished instead of being left to the garbage collector.
with open('rsv_a_JN257693.gb', 'r') as handle:
    for record in SeqIO.parse(handle, "gb"):
        on1_g_seq = record.seq
        for feature in record.features:
            if feature.type == 'CDS':
                on1_g_translation = feature.qualifiers['translation']

In [14]:
# 72-nt segment of G that is searched for in the JN257693 sequence below
# (it is expected to occur twice there, i.e. it is the duplicated stretch).
duplicated_seq = 'GTCAAGAGGAAACCCTCCACTCAACCACCTCCGAAGGCTATCTAAGCCCATCACAAGTCTATACAACATCCG'
# The 23 amino acids encoded by that segment. They correspond to reading
# duplicated_seq from its third nucleotide (CAA GAG GAA ...), so the segment
# does not start on a codon boundary and its final nucleotide is left over.
dup_seq_aa = 'QEETLHSTTSEGYLSPSQVYTTS'

In [8]:
#confirm duplication: the 72-nt segment must be present exactly twice in the
#JN257693 G sequence. The check is enforced with an assert (so a full re-run
#stops here if the input file changes) and the count is still displayed.
dup_count = on1_g_seq.count(duplicated_seq)
assert dup_count == 2, 'expected the duplicated segment twice, found %d' % dup_count
dup_count

2

In [40]:
# Read the full-genome GenBank record JX627336 and locate its G CDS:
#   on1_genome_seq - the record's nucleotide sequence
#   g_start, g_end - start/end positions of the G CDS as reported by Biopython
#   old_g_seq      - the G CDS nucleotides extracted from the genome
# The handle is closed by the `with` block. qualifiers.get() is used so that a
# CDS feature without a /gene qualifier is skipped instead of raising KeyError.
with open('rsv_a_JX627336.gb', 'r') as handle:
    for record in SeqIO.parse(handle, "gb"):
        on1_genome_seq = record.seq
        for feature in record.features:
            if feature.type == 'CDS' and feature.qualifiers.get('gene') == ['G']:
                g_start = feature.location.start
                g_end = feature.location.end
                old_g_seq = feature.location.extract(on1_genome_seq)


In [36]:
# Compare the two G sequences: the JN257693 record is one nt longer than the
# G CDS extracted from JX627336 (it seems to carry an extra T after the stop
# codon).
for g_seq in (on1_g_seq, old_g_seq):
    print(len(g_seq))

# Drop that final nucleotide. With both sequences the same length, the
# JN257693 G can be swapped into JX627336 without changing the coordinates of
# anything downstream.
on1_g_seq_edited = on1_g_seq[:-1]

967
966


In [49]:
# Build the duplication-containing reference: take the JX627336 genome,
# overwrite its G CDS with the (truncated) JN257693 G sequence, replace the
# G /translation qualifier to match, and write the result to
# 'rsv_a_dup_reference.gb'.
# Relies on g_start / g_end, on1_g_seq_edited and on1_g_translation from the
# cells above.
with open('rsv_a_JX627336.gb', 'r') as handle:
    for record in SeqIO.parse(handle, "gb"):
        # The swap only leaves downstream coordinates untouched when the new G
        # has exactly the length of the slice it replaces; a mismatch would
        # silently shift every later feature, so stop instead.
        if len(on1_g_seq_edited) != g_end - g_start:
            raise ValueError(
                'replacement G is %d nt but the G CDS spans %d nt'
                % (len(on1_g_seq_edited), g_end - g_start)
            )

        #make sequence mutable (slice assignment is not possible on an immutable Seq)
        # NOTE(review): tomutable() is deprecated in recent Biopython releases -
        # confirm against the installed version before upgrading.
        record.seq = record.seq.tomutable()
        record.seq[g_start:g_end] = on1_g_seq_edited

        for feature in record.features:
            # .get() skips CDS features that have no /gene qualifier
            if feature.type == 'CDS' and feature.qualifiers.get('gene') == ['G']:
                feature.qualifiers['translation'] = on1_g_translation

        SeqIO.write(record, 'rsv_a_dup_reference.gb', 'genbank')

In [50]:
#make sure it worked: re-read the file that was just written and print, for
#the G CDS, whether (1) its nucleotides equal the swapped-in sequence and
#(2) its /translation qualifier equals the one taken from JN257693.
with open('rsv_a_dup_reference.gb', 'r') as handle:
    for record in SeqIO.parse(handle, "gb"):
        for feature in record.features:
            # .get() skips CDS features that have no /gene qualifier
            if feature.type == 'CDS' and feature.qualifiers.get('gene') == ['G']:
                print(on1_g_seq_edited == feature.location.extract(record.seq))
                print(feature.qualifiers['translation'] == on1_g_translation)


True
True


In [4]:
# Load the root-sequence JSON into `root_seq`. Later cells read its 'nuc'
# entry as the root nucleotide sequence.
with open('old_jsons/rsv_A_chimeraroot_root-sequence.json') as root_json:
    root_seq = json.load(root_json)

In [37]:
#make reference from the root of the tree made by aligning to reference
#to align the non-on1 strains to
#G will be shorter than the ON1 reference
#
# Steps:
#   1. locate the run of 72 '-' that stands in for the duplication in the root
#      sequence and remove it
#   2. move every feature coordinate that lies beyond the removed run 72 nt to
#      the left (this covers G itself, everything downstream, and features
#      that merely span the run, such as one covering the whole genome)
#   3. recompute the /translation of every CDS from the root sequence
#   4. write the record to 'rsv_a_fromroot_dupremoved.gb'

# placeholder for the duplication in the root sequence
DUP_PLACEHOLDER = '-' * 72


def shift_past_deletion(position, deletion_start, deletion_len):
    """Map a coordinate of the gapped sequence onto the un-gapped sequence.

    position       -- 0-based boundary coordinate in the original sequence
    deletion_start -- 0-based index of the first removed character
    deletion_len   -- number of removed characters

    Coordinates at or before the deletion are returned unchanged, coordinates
    after it are moved left by ``deletion_len``, and coordinates that fall
    inside the removed stretch collapse onto ``deletion_start``.
    """
    position = int(position)
    if position <= deletion_start:
        return position
    return max(deletion_start, position - deletion_len)


root_nuc = root_seq['nuc']
# The coordinate shift below is only valid for a single removed stretch, so
# refuse to continue if the placeholder is missing or present more than once.
placeholder_count = root_nuc.count(DUP_PLACEHOLDER)
if placeholder_count != 1:
    raise ValueError(
        'expected exactly one run of %d gap characters in the root sequence, found %d'
        % (len(DUP_PLACEHOLDER), placeholder_count)
    )
deletion_start = root_nuc.find(DUP_PLACEHOLDER)

with open('rsv_a_dup_reference.gb', 'r') as handle:
    for record in SeqIO.parse(handle, "gb"):
        record.id = 'RSVA-root'
        record.name = 'RSVA-root'
        record.description = 'RSV-A root'
        #remove the deletion that is a place holder for duplication
        # (done on the plain string, so no mutable sequence is needed here)
        record.seq = Seq(root_nuc.replace(DUP_PLACEHOLDER, ''))

        #edit the new locations for each feature (after having deleted 72 nt from G)
        new_features = []
        for feature in record.features:
            old_start = int(feature.location.start)
            old_end = int(feature.location.end)
            new_start = shift_past_deletion(old_start, deletion_start, len(DUP_PLACEHOLDER))
            new_end = shift_past_deletion(old_end, deletion_start, len(DUP_PLACEHOLDER))

            # Rebuild the location only when a coordinate actually moved, so
            # untouched features keep the location object they were parsed with.
            if (new_start, new_end) != (old_start, old_end):
                feature.location = SeqFeature.FeatureLocation(
                    SeqFeature.ExactPosition(new_start),
                    SeqFeature.ExactPosition(new_end),
                    feature.location.strand,
                )

            # The sequence is now the root, so every CDS translation is
            # recomputed from it; other feature types get no translation.
            if feature.type == 'CDS':
                feature.qualifiers['translation'] = feature.location.extract(record.seq).translate()

            new_features.append(feature)

        record.features = new_features

        SeqIO.write(record, 'rsv_a_fromroot_dupremoved.gb', 'genbank')


MSKTKDQRTAKTLEKTWDTLNHLLFISSCLYKLNLKSIAQITLSILAMIISTSLIIAAIIFIASANHKVTLTTAIIQDATSQIKNTTPTYLTQNPQLGISFSNLSETTSQTTTILASTTPSVESTLQSTTVKTKNTTTTQIQPSKPTTKQRQNKPPNKPNNDFHFEVFNFVPCSICSNNPTCWAICKRIPNKKPGKKTTTKPTKKPTIKTTKKDLKPQTTKPKEVPTTKPTEKPTINTTKTNIRTTLLTNNTTGNPEHTSQKETLHSTSSEGNPSPSQVYTTSEYLSQPPSPSNTTNQ
MSKTKDQRTAKTLEKTWDTLNHLLFISSCLYKLNLKSIAQITLSILAMIISTSLIIAAIIFIASANHKVTLTTAIIQDATSQIKNTTPTYLTQNPQLGISFSNLSETTSQTTTILASTTPSVESTLQSTTVKTKNTTTTQIQPSKPTTKQRQNKPPNKPNNDFHFEVFNFVPCSICSNNPTCWAICKRIPNKKPGKKTTTKPTKKPTIKTTKKDLKPQTTKPKEVPTTKPTEKPTINTTKTNIRTTLLTNNTTGNPEHTSQKETLHSTSSEGNPSPSQVYTTSEYLSQPPSPSNTTNQ


In [12]:
#make reference from the root of the tree made by aligning to reference (so there will be a gap in G that can be filled in by the duplication)
#
# The record layout comes from 'rsv_a_dup_reference.gb'; its sequence is
# replaced by the root sequence, the duplication is written into G, every CDS
# translation is recomputed, and the result is written to
# 'rsv_a_fromroot_dupfilled.gb'.

# sequence written into the G CDS, and the 0-based offsets (relative to the
# start of the G CDS) of the slice that receives it
G_DUPLICATION = 'GTCAAGAGGAAACCCTCCACTCAACCACCTCCGAAGGCTATCTAAGCCCATCACAAGTCTATACAACATCCG'
G_DUP_START = 778
G_DUP_END = 850

with open('rsv_a_dup_reference.gb', 'r') as handle:
    for record in SeqIO.parse(handle, "gb"):
        record.id = 'RSVA-root'
        record.name = 'RSVA-root'
        record.description = 'RSV-A root with space for G duplication'
        record.seq = Seq(root_seq['nuc'])
        #make sequence mutable (needed for the slice assignment below)
        # NOTE(review): tomutable() is deprecated in recent Biopython releases -
        # confirm against the installed version before upgrading.
        record.seq = record.seq.tomutable()

        for feature in record.features:
            if feature.type != 'CDS':
                continue

            # .get() skips the G-specific edit for CDS features without /gene
            if feature.qualifiers.get('gene') == ['G']:
                #in the root sequence, the ---s for the duplication are shifted frame by one nt
                #the "template" for the duplication occurs after the duplication in this root sequence
                #g in "template" needs to be moved to other side
                mutable_g = feature.location.extract(record.seq).tomutable()

                mutable_g[G_DUP_START:G_DUP_END] = G_DUPLICATION
                # the single nucleotide right after the filled-in slice becomes G
                mutable_g[G_DUP_END] = 'G'

                # write the edited G back into the genome at the same coordinates
                record.seq[feature.location.start:feature.location.end] = mutable_g

            # every CDS (G after its edit, all others as-is) is re-translated
            # from the root sequence
            feature.qualifiers['translation'] = feature.location.extract(record.seq).translate()

            #check
#             if str(feature.qualifiers['translation']) != root_seq[feature.qualifiers['gene'][0]]:
#                 print(root_seq[feature.qualifiers['gene'][0]])

        SeqIO.write(record, 'rsv_a_fromroot_dupfilled.gb', 'genbank')