In [1]:
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import math

In [2]:
def receive_sequences(path):
    """
    Read a FASTA file and collect its sequences as plain strings.

    path: path to the FASTA file
    return: a list with one string per record, in the order SeqIO yields them
    """
    return [str(record.seq) for record in SeqIO.parse(path, "fasta")]

In [3]:
def score_allignment(seq1, seq2):
    """
    Score the local alignment of two sequences.

    The alignment is computed with pairwise2.align.localms using
    match = 1, mismatch = -1, gap open = -1 and gap extend = -1.

    seq1: the first sequence
    seq2: the second sequence
    return: the score of the last reported alignment converted to int,
            or 0 when pairwise2 reports no alignment at all
    """
    alignments = pairwise2.align.localms(seq1, seq2, 1, -1, -1, -1)
    if not alignments:
        # Without this guard there is no alignment to read a score from
        # and the function would fail instead of reporting "no overlap".
        return 0
    # Index 2 of an alignment record is its score.
    return int(alignments[-1][2])

In [4]:
def match(seq1, seq2):
    """
    Check whether the first sequence ends with the nucleotides
    that the second sequence starts with.

    The length of the compared suffix/prefix is the local alignment
    score of the two sequences.

    seq1: the first sequence
    seq2: the second sequence
    return: True if the last `score` characters of seq1 equal the first
            `score` characters of seq2, otherwise False
    """
    score = score_allignment(seq1, seq2)
    if score <= 0:
        # A zero-length suffix and prefix are both '' and would compare
        # equal, reporting an overlap where there is none.
        return False
    return seq1[len(seq1) - score:] == seq2[:score]

In [5]:
# Location of the input FASTA file.
path = 'cds.fasta'

In [6]:
# All sequences from the input file, as a list of strings.
sequences = receive_sequences(path)

In [164]:
# Seed the assembly: pick every sequence whose start is not matched by the
# end of any other sequence, i.e. a sequence that nothing precedes.
alignment = ''   # assembled string built so far
stop_index = []  # indices of the sequences used as seeds
add_seq = []     # sequences already placed into the assembly
for i, candidate in enumerate(sequences):
    # Number of other sequences that do NOT end where this candidate starts.
    score = sum(
        1
        for j, other in enumerate(sequences)
        if i != j and not match(other, candidate)
    )
    if score == len(sequences) - 1:
        alignment += candidate
        add_seq.append(candidate)
        stop_index.append(i)

In [165]:
alignment

'ACTCGTC'

In [158]:
def matching_score(alignment, seq):
    """
    Score every sequence against the current assembly.

    alignment: the string assembled so far
    seq: an iterable of sequences to score against it
    return: the scores in ascending order with the largest one removed
    """
    ordered = sorted(score_allignment(alignment, candidate) for candidate in seq)
    return ordered[:-1]

In [159]:
# Ascending scores of all sequences against the seed, largest score dropped.
scores = matching_score(alignment, sequences)

In [160]:
scores

[3, 4]

In [166]:
# Grow the assembly by appending the non-overlapping tail of sequences whose
# start equals the current end of the assembly.
# NOTE(review): the loop ends only when `scores` is empty or has exactly one
# element after the inner loop; since `scores` is recomputed on every inner
# iteration, only the last iteration's trimming survives -- confirm this
# always terminates for inputs other than the one shown.
while len(scores) != 0:
    for i in range(len(sequences)):
        # Re-score everything against the current assembly; this overwrites
        # any trimming done by a previous pass of this inner loop.
        scores = matching_score(alignment, sequences)
        # Extend only with a sequence that overlaps the end of the assembly
        # and whose score equals the largest score kept by matching_score.
        if match(alignment, sequences[i]) and score_allignment(alignment, sequences[i]) == scores[-1]:
            # Skip the first scores[-1] characters and append the rest.
            alignment += sequences[i][scores[-1]:]
            scores = scores[:-1]
            add_seq.append(sequences[i])
    # One score left: append the first sequence not yet recorded in add_seq,
    # minus its first scores[0] characters, then leave the while loop.
    if len(scores) == 1:
        for elem in sequences:
            if elem not in add_seq:
                alignment += elem[scores[0]:]
                break
        break
        

In [167]:
alignment

'ACTCGTCAAAAGTGTCATC'