In [32]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# For RSV-A:
# ON1 sequences were aligned to an ON1 genome,
# and non-ON1 sequences were aligned to the same genome with the duplication removed.
# Now merge all of these sequences together into one aligned FASTA file.
# To do that, '-' characters must be added to the non-ON1 ("other") sequences
# as a placeholder for the 72-nt duplication in ON1.

# Do the same for RSV-B (where the duplication, in BA, is 60 nt).

In [8]:
# Duplicated segment for each RSV subtype, keyed by lowercase subtype letter:
#   'a' -> the 72-nt duplication (ON1)
#   'b' -> the 60-nt duplication (BA)
dup_seq_dict = dict(
    a='GTCAAGAGGAAACCCTCCACTCAACCACCTCCGAAGGCTATCTAAGCCCATCACAAGTCTATACAACATCCG',
    b='ACAGAAAGAGACACCAGCACCTCACAATCCACTGTGCTCGACACAACCACATCAAAACAC',
)

In [9]:
#find where the duplication placeholder should go
def find_duplication_location(subtype):
    """
    Find where the duplication sits in the step-3 reference genome.

    Reads '../config/rsv_{subtype}_step3_reference.fasta' and searches each
    record (case-insensitively) for the duplicated segment stored in
    `dup_seq_dict[subtype]`.

    Parameters
    ----------
    subtype : str
        Key into `dup_seq_dict` ('a' or 'b'); also used to build the
        reference file name.

    Returns
    -------
    int
        0-based index of the base immediately after the first occurrence of
        the duplicated segment (match position + segment length). If the
        file holds several records, the value from the last one is returned.

    Raises
    ------
    KeyError
        If `subtype` is not a key of `dup_seq_dict`.
    ValueError
        If the reference file contains no records, or a record does not
        contain the duplicated segment.
    """
    dup_ref_fasta = f'../config/rsv_{subtype}_step3_reference.fasta'

    #compare in lowercase on both sides so the case of the reference does not matter
    dup_seq = dup_seq_dict[subtype].lower()

    start_dup = None
    with open(dup_ref_fasta, 'r') as handle:
        for ref in SeqIO.parse(handle, 'fasta'):
            loc_template = str(ref.seq).lower().find(dup_seq)
            #str.find returns -1 when there is no match; without this guard
            #the function would quietly return len(dup_seq)-1
            if loc_template == -1:
                raise ValueError(
                    f'duplicated segment for subtype {subtype!r} not found in '
                    f'record {ref.id!r} of {dup_ref_fasta}'
                )
            start_dup = loc_template + len(dup_seq)

    if start_dup is None:
        raise ValueError(f'no FASTA records found in {dup_ref_fasta}')

    return start_dup

In [24]:
def add_placeholder_for_duplication(subtype):
    """
    Insert a gap placeholder for the duplication into every 'other' sequence.

    Reads '../../rsv_step2/results/aligned_{SUBTYPE}_other.fasta' (sequences
    aligned to a reference that lacks the duplication) and, in each sequence,
    inserts a run of '-' as long as the duplicated segment at the position
    reported by `find_duplication_location`.

    Parameters
    ----------
    subtype : str
        'a' or 'b'; key into `dup_seq_dict` and part of the input file name.

    Returns
    -------
    list of SeqRecord
        One record per input sequence, carrying the original id, the padded
        sequence and an empty description.
    """
    #get duplicated sequence and its position in the genome
    dup_seq = dup_seq_dict[subtype]
    start_dup = find_duplication_location(subtype)

    #for RSV-A the duplication starts at second position of reading frame
    #so the placeholder is shifted one base upstream so translation will work
    insert_at = start_dup - 1 if subtype == 'a' else start_dup

    #identical for every record, so build it once outside the loop
    dup_placeholder = '-' * len(dup_seq)

    #alignment done on all strains without the duplication,
    #using a reference that does not have the duplication
    other_aligned_fasta = f'../../rsv_step2/results/aligned_{subtype.upper()}_other.fasta'

    other_aligned_records = []

    with open(other_aligned_fasta, 'r') as handle:
        for virus in SeqIO.parse(handle, 'fasta'):
            aligned_seq = str(virus.seq)
            virus_seq_w_placeholder = Seq(
                aligned_seq[:insert_at] + dup_placeholder + aligned_seq[insert_at:]
            )
            #description='' keeps the FASTA header as just the id; the SeqRecord
            #default ('<unknown description>') would be written into the header
            other_aligned_records.append(
                SeqRecord(seq=virus_seq_w_placeholder, id=virus.id, description='')
            )
    return other_aligned_records

In [25]:
def merge_aligned_fastas(subtype):
    """
    Merge the 'other' and 'dup' alignments of one subtype into a single FASTA.

    The 'other' sequences come from `add_placeholder_for_duplication` (already
    padded with the duplication placeholder); the sequences that carry the
    duplication are read from
    '../../rsv_step2/results/aligned_{SUBTYPE}_dup.fasta' and appended
    unchanged. The combined records are written to
    'aligned_{SUBTYPE}_all.fasta' in the current working directory.

    Parameters
    ----------
    subtype : str
        'a' or 'b'.

    Returns
    -------
    None
    """
    #get the alignment of sequences without duplications (but with placeholder for them)
    all_aligned_records = add_placeholder_for_duplication(subtype)

    dup_aligned_fasta = f'../../rsv_step2/results/aligned_{subtype.upper()}_dup.fasta'

    with open(dup_aligned_fasta, 'r') as handle:
        all_aligned_records.extend(
            SeqRecord(seq=virus.seq, id=virus.id)
            for virus in SeqIO.parse(handle, 'fasta')
        )

    #a SeqRecord built without a description gets '<unknown description>',
    #which the FASTA writer appends to the header line; blank it on every
    #record so headers are written as '>id' only
    for record in all_aligned_records:
        record.description = ''

    with open(f'aligned_{subtype.upper()}_all.fasta', 'w') as handle:
        SeqIO.write(all_aligned_records, handle, 'fasta')

In [33]:
# RSV-A: merge the padded 'other' alignment with the 'dup' alignment;
# writes aligned_A_all.fasta to the current working directory
merge_aligned_fastas('a')

In [34]:
# RSV-B: merge the padded 'other' alignment with the 'dup' alignment;
# writes aligned_B_all.fasta to the current working directory
merge_aligned_fastas('b')

In [35]:
# Sanity check on the merged RSV-A file: print the id of every record whose
# length differs from 15277 (no output means all records have that length).
# NOTE(review): 15277 is presumably the full alignment width including the
# 72-nt placeholder -- confirm against the reference.
with open('aligned_A_all.fasta', 'r') as handle:
    for virus in SeqIO.parse(handle, 'fasta'):
        if len(virus) == 15277:
            continue
        print(virus.id)

In [36]:
# Sanity check on the merged RSV-B file: print the id of every record whose
# length differs from 15285 (no output means all records have that length).
# NOTE(review): 15285 is presumably the full alignment width including the
# 60-nt placeholder -- confirm against the reference.
with open('aligned_B_all.fasta', 'r') as handle:
    for virus in SeqIO.parse(handle, 'fasta'):
        if len(virus) == 15285:
            continue
        print(virus.id)