In [9]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

In [2]:
def receive_sequences(path):
    """
    Read a FASTA file and collect its sequences as plain strings.

    path: a pathway to the FASTA file to parse

    return: a list holding the string form of each record's sequence,
            in the order the parser yields the records
    """
    return [str(record.seq) for record in SeqIO.parse(path, "fasta")]

In [3]:
def score_allignment(seq, sequence):
    """
    Locally align ``seq`` against every entry of ``sequence`` except index 0.

    seq: the first sequence in input fasta file (the reference for scoring)
    sequence: the list of input sequences; the entry at index 0 is skipped

    return: a dictionary mapping each index ``1 .. len(sequence) - 1`` to the
            score of the first alignment pairwise2 reports for that entry,
            or 0 when pairwise2 reports no alignment at all
    """
    scores_by_index = {}

    for index, candidate in enumerate(sequence):
        if index == 0:
            # Index 0 is never scored.
            continue

        # localms arguments: match, mismatch, gap-open, gap-extend scores.
        hits = pairwise2.align.localms(seq, candidate, 1, -1, -1, -1)

        # Each alignment is a tuple whose element 2 is the score.
        scores_by_index[index] = hits[0][2] if hits else 0

    return scores_by_index

In [10]:
def match(sequence, seq, score, threshold):
    '''
    Greedily extend the seed sequence by merging overlapping sequences into it.

    On every round the candidates are tried from the highest to the lowest
    alignment score.  The first candidate whose locally aligned fragment is
    a suffix of one sequence and a prefix of the other is merged with the
    seed; afterwards the scores are recomputed against the merged seed.  The
    loop stops when only one sequence is left, when no score reaches the
    threshold, or when no candidate overlaps the seed end-to-end.

    sequence: the list of input sequences (plain strings); ``seq`` is
              expected at index 0.  The list is modified in place.
    seq: the first sequence in input fasta file (the seed to extend)
    score : the dictionary with alignment scores of ``seq`` against
            ``sequence[i]``, keyed by the index ``i >= 1`` (the shape
            produced by score_allignment); the passed dictionary itself
            is not modified
    threshold: the minimal alignment score a sequence needs to be tried
               for merging

    return : the same list object; the seed at index 0 is replaced by the
             merged string and every merged-in sequence is removed
    '''

    # ``score`` is checked for emptiness first so max() never sees an
    # empty collection (e.g. an empty or single-element input list).
    while len(sequence) > 1 and score and max(score.values()) >= threshold:
        merged = None
        merged_position = None

        for position in sorted(score, key=score.get, reverse=True):
            if score[position] < threshold:
                # Sorted in descending order: nothing further can qualify.
                break

            seq1 = sequence[position]
            alignments = pairwise2.align.localms(seq, seq1, 1, -1, -1, -1)
            if not alignments:
                continue

            # Alignment tuple layout: (seqA, seqB, score, begin, end).
            alignment = alignments[0]
            fragment = alignment[0][alignment[3]:alignment[4]]
            fragment_len = alignment[4] - alignment[3]

            if seq.endswith(fragment) and seq1.startswith(fragment):
                # Seed on the left, candidate continues it on the right.
                merged = seq + seq1[fragment_len:]
            elif seq1.endswith(fragment) and seq.startswith(fragment):
                # Candidate on the left, seed continues it on the right.
                merged = seq1 + seq[fragment_len:]

            if merged is not None:
                merged_position = position
                break

        if merged is None:
            # No candidate overlaps the seed end-to-end; repeating the
            # round would give the same answer, so stop instead of
            # looping forever.
            break

        # Drop the merged-in sequence and keep the grown seed at index 0 as
        # a plain string, so the list stays homogeneous and the refreshed
        # score keys keep pointing at the right entries.
        sequence.pop(merged_position)
        sequence[0] = merged
        seq = merged
        score = score_allignment(seq, sequence)

    return sequence

In [11]:
# FASTA file to read the input sequences from.
path = 'cds.fasta'

In [12]:
# Load the sequences of the FASTA file as a list of plain strings.
sequences = receive_sequences(path)

In [13]:
# Score every other sequence against the first one, then merge with a
# threshold of 6 and print the resulting list.
score = score_allignment(sequences[0], sequences)
assembled = match(sequences, sequences[0], score, threshold=6)
print(assembled)

['GTAGC', 'AGCTC', 'GCTCC', 'TGTAG', 'TAGCT', 'ATGTA']
