In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [None]:
#use the insertions from Step2 that are found as insertions of ON1 seqs relative to a reference with a duplication
#and for non-ON1 sequences compared to a reference without a duplication
#use these in step3 to infer ancestral insertions

#need to merge the insertions tsvs for ON1 and non-ON1 sequences
#but need to adjust the positions of the non-ON1 sequences to account for the duplication
#this means adjusting all positions after the duplication by 72nt

In [14]:
dup_seq_dict = {'a':'GTCAAGAGGAAACCCTCCACTCAACCACCTCCGAAGGCTATCTAAGCCCATCACAAGTCTATACAACATCCG', 
                'b': 'ACAGAAAGAGACACCAGCACCTCACAATCCACTGTGCTCGACACAACCACATCAAAACAC'}

In [6]:
#find where duplication placeholdher should go
def find_duplication_location(subtype):
    dup_ref_fasta = f'../config/rsv_{subtype}_step3_reference.fasta'
    
    dup_seq = dup_seq_dict[subtype]

    with open(dup_ref_fasta, 'r') as handle:
        for ref in SeqIO.parse(handle, 'fasta'):
            loc_template = str(ref.seq).find(dup_seq.lower())
            start_dup = loc_template+len(dup_seq)
            #check that this is the same seq as above
            dup_seq_in_ref = str(ref.seq)[start_dup:start_dup+len(dup_seq)]
#             print(dup_seq_in_ref==dup_seq.lower())
            
    return start_dup

In [8]:
#the postitions of insertions inferred in step2 for the non-duplication strains need to be adjusted
#if the occurred after the duplication
def adjust_insertion_positions(subtype):
    start_dup = find_duplication_location(subtype)
    
    dup_seq = dup_seq_dict[subtype]
    
    #read in insertions tsv from Step2
    other_insertions = pd.read_csv(f'../../rsv_step2/results/insertions_{subtype}_other.tsv')

    #remove lines with null entries
    other_insertions = other_insertions[other_insertions['insertions'].notnull()]

    for k,v in other_insertions.iterrows():
        list_of_insertions = v['insertions'].split(';')

        new_insertions = []
        for i in list_of_insertions:
            insertion_pos = int(i.split(':')[0])
            if insertion_pos>=start_dup:
                pos_adjusted = insertion_pos+len(dup_seq)
                ins_nts = i.split(':')[1]
                ins_adjusted = f'{pos_adjusted}:{ins_nts}'
                new_insertions.append(ins_adjusted)
            else:
                new_insertions.append(i)
        v['insertions'] = ';'.join(new_insertions)
        
    return other_insertions


In [11]:
def merge_insertion_tsvs(subtype):
    other_insertions = adjust_insertion_positions(subtype)
    
    #read in ba/on1 insertions tsv from Step2
    dup_insertions = pd.read_csv(f'../../rsv_step2/results/insertions_{subtype}_dup.tsv')
    
    all_insertions_step2 = other_insertions.append(dup_insertions, ignore_index=True)

    all_insertions_step2.to_csv(f'insertions_{subtype}_step2.tsv', index=False)
    
    

In [15]:
merge_insertion_tsvs('a')

  all_insertions_step2 = other_insertions.append(dup_insertions, ignore_index=True)


In [16]:
merge_insertion_tsvs('b')

  all_insertions_step2 = other_insertions.append(dup_insertions, ignore_index=True)
