In [52]:
from io import StringIO
from Bio import SeqIO, pairwise2
import pandas as pd
import numpy as np
import matplotlib.lines as mlines
import matplotlib.pyplot as plt

In [53]:
def local_align(consensus, RBS, verbose = False):
    '''Locally align a sample (RBS) sequence against a consensus sequence.

    Calls ``pairwise2.align.localxd`` (identical characters score 1, all
    others 0; separate gap penalties per sequence) and keeps the first
    alignment of the returned list.

    Note: the gap-open penalty for the consensus is currently the same as the
    one for the sample (-3), so gaps in the consensus are NOT penalised more
    heavily than gaps in the sample. Raise ``gap_open_consensus`` (make it
    more negative) if gaps in the consensus must be suppressed.

    Parameters
    ----------
    consensus : sequence passed to pairwise2 as ``sequenceA``.
    RBS : sequence passed to pairwise2 as ``sequenceB``.
    verbose : bool, when True print both gapped sequences.

    Returns
    -------
    (aln_consensus, aln_RBS) : the two gapped sequences of the top alignment.

    Raises
    ------
    ValueError
        If pairwise2 returns an empty list of alignments.
    '''
    # scoring parameters (open/extend penalties for each sequence)
    gap_open_default = -3
    gap_open_consensus = -3
    gap_extend_default = -0.1
    alns = pairwise2.align.localxd(sequenceA=consensus, sequenceB=RBS,
                                   openA=gap_open_consensus,
                                   extendA=gap_extend_default,
                                   openB=gap_open_default,
                                   extendB=gap_extend_default)
    if not alns:
        # Previously this surfaced as a bare IndexError on alns[0].
        raise ValueError("no local alignment found between consensus and RBS sequences")

    # First alignment in the list is taken as the best one; only the two
    # gapped sequences are needed (score/begin/end are ignored).
    aln_consensus, aln_RBS = alns[0][:2]

    if verbose:
        print("consensus aln ", aln_consensus)
        print("RBS aln ", aln_RBS)

    return aln_consensus, aln_RBS

In [54]:
def get_start_and_end(consensus, RBS, verbose = False):
    '''Find the alignment columns spanned by the RBS sequence.

    Aligns ``RBS`` to ``consensus`` with ``local_align`` and returns the
    indices (both inclusive) of the first and last columns of the gapped RBS
    string that are not ``"-"``.

    ``verbose`` only controls printing of the matching consensus slice here;
    it is not forwarded to ``local_align``.

    Returns
    -------
    (start, end) : column indices into the gapped alignment strings.
    '''
    aln_consensus, aln_RBS = local_align(consensus, RBS)
    n_cols = len(aln_RBS)

    # Scan from the left for the first residue; if every column is a gap the
    # result falls back to the last column (or -1 for an empty alignment).
    start = next(
        (col for col in range(n_cols) if aln_RBS[col] != "-"),
        n_cols - 1,
    )

    # Scan from the right for the last residue; falls back to 0 when there
    # is no residue at all.
    end = next(
        (col for col in range(n_cols - 1, -1, -1) if aln_RBS[col] != "-"),
        0,
    )

    if verbose:
        print(aln_consensus[start:end+1])

    return start, end

In [55]:
def aln_idx_to_consensus_idx(start, end, consensus_aln):
    '''Convert an alignment-column window into indices of the ungapped consensus.

    Parameters
    ----------
    start, end : inclusive column indices into ``consensus_aln``.
    consensus_aln : gapped consensus string, with ``"-"`` marking gaps.

    Returns
    -------
    (start_idx, end_idx) : inclusive indices into the consensus with gaps
        removed. ``start_idx`` is the first consensus residue at or after
        column ``start``; ``end_idx`` is the last consensus residue at or
        before column ``end``. If the window holds no consensus residue,
        ``end_idx == start_idx - 1`` so that ``seq[start_idx:end_idx+1]``
        is empty.
    '''
    # Residues strictly left of the window: this count is exactly the
    # ungapped index of the first residue inside the window. (Counting
    # column `start` itself and subtracting one, as before, pointed at the
    # residue *before* the window -- or -1 -- whenever column `start` was
    # a gap.)
    residues_before = sum(1 for char in consensus_aln[:start] if char != "-")
    residues_in_window = sum(1 for char in consensus_aln[start:end+1] if char != "-")

    start_idx = residues_before
    end_idx = residues_before + residues_in_window - 1

    return start_idx, end_idx
        

In [56]:
# Paths of the two FASTA inputs.
RBS_seq_file = "data/consensus_RBS.fasta"
compare_seq_file = "data/H3_consensus.fasta"

# Only the first record of each file is used.
RBS_record = SeqIO.parse(RBS_seq_file, "fasta")
first_RBS_record = next(RBS_record)
RBS_seq = first_RBS_record.seq.strip()

compare_seq_record = SeqIO.parse(compare_seq_file, "fasta")
first_compare_record = next(compare_seq_record)
compare_seq = first_compare_record.seq.strip()

In [57]:
# Align the RBS sequence to the comparison sequence and print both gapped strings.
aln_compare, aln_RBS = local_align(consensus=compare_seq, RBS=RBS_seq, verbose=True)

consensus aln  MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTLVKTITNDQIEVTNATELVQSSSTGRICDSPHQILDGENCTLIDALLGDPHCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLR---SLVASSGTLEFNNESFNWTGVTQNGTSSACKRRSNNSFFSRLNWLTHLKYKYPALNVTMPNNEKFDKLYIWGVHHPSTDSDQISLYAQASGRVTVSTKRSQQTVIPNIGSRPWVRGISSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGTGQAADLKSTQAAIDQINGKLNRLIEKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFERTRKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVVLLGFIMWACQKGNIRCNICI
RBS aln  ----------------------------------------------------------------------------------------------------------------CYPGDFPDYEELREQLSSVSSFERFEIFPKTSSWPNHTQNGVSAACPHAGAKSFYKNLNWLTKKGNSYPALNVTYPNNKGKEVLVLWGVHHPSTDADQQSLYQNADAYVTV-TSRYSQKFIPEIASRPKVRDQEGRI-------------------------------------------------------------------------------------------------------------------------------------------------------------

In [58]:
# Alignment columns (inclusive) covered by the RBS sequence.
start, end = get_start_and_end(compare_seq, RBS_seq)
print(start, end)

# Show the same window of both gapped sequences side by side.
aligned_window = slice(start, end + 1)
str(aln_compare.strip())[aligned_window], str(aln_RBS)[aligned_window]

112 248


('CYPYDVPDYASLR---SLVASSGTLEFNNESFNWTGVTQNGTSSACKRRSNNSFFSRLNWLTHLKYKYPALNVTMPNNEKFDKLYIWGVHHPSTDSDQISLYAQASGRVTVSTKRSQQTVIPNIGSRPWVRGISSRI',
 'CYPGDFPDYEELREQLSSVSSFERFEIFPKTSSWPNHTQNGVSAACPHAGAKSFYKNLNWLTKKGNSYPALNVTYPNNKGKEVLVLWGVHHPSTDADQQSLYQNADAYVTV-TSRYSQKFIPEIASRPKVRDQEGRI')

In [60]:
# `start`/`end` are columns of the gapped alignment; the ungapped sequence
# must be sliced with the converted indices, otherwise every gap in the
# aligned consensus pushes the window too far to the right.
new_start, new_end = aln_idx_to_consensus_idx(start, end, aln_compare)
consensus_window = str(compare_seq)[new_start:new_end+1]

# Sanity check: the slice equals the aligned consensus window with gaps removed.
assert consensus_window == str(aln_compare)[start:end+1].replace("-", "")
consensus_window

'CYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQNGTSSACKRRSNNSFFSRLNWLTHLKYKYPALNVTMPNNEKFDKLYIWGVHHPSTDSDQISLYAQASGRVTVSTKRSQQTVIPNIGSRPWVRGISSRI'