In [159]:
import numpy as np
import itertools


class Action:
    MATCH = 1
    GAP_I = 2
    GAP_J = 3
    START = 4


class MultiSequenceAligner:

    def __init__(self,
                 match: int = 5,
                 mismatch: int = -2,
                 indel: int = -4,
                 two_gaps: int = 0):
        """

        :param match: Score if AA match
        :param mismatch: Score if AA mismatch
        :param indel: Linear gap penalty (only applied if there is only one gap)
        :param two_gaps: Gap penalty applied if two or more sequences have a gap at given position
        """
        self.match = match
        self.mismatch = mismatch
        self.indel = indel
        self.two_gaps = two_gaps

    def get_alignment_matrix(self, sequences: list[str], method: str = "global"):
        """
        Returns a matrix of scores for all possible alignments of the given sequences and a backtrack matrix.
        :param sequences: List of sequences to align
        :param method: Method to use for alignment. Either "global" or "local"
        :return: Matrix of scores for all possible alignments of the given sequences and a backtrack matrix
        """
        assert method in ["global", "local"], "Method must be either 'global' or 'local'"

        # Initialize matrix
        # n-dimensional matrix where n is the number of sequences, each dimension is the length of the sequence + 1 (for the inital gap)
        dimensions = [len(sequence) + 1 for sequence in sequences]
        matrix = np.zeros(dimensions)

        # Backtrack matrix contains directions, which is a dict for each pair of sequences indicating which action the score came from
        # The keys are tuples of the indices of the sequences in the list of sequences
        # The values are tuples of the indices of the sequences in the list of sequences and the action that led to the score
        backtrack_dimensions = dimensions + [len(sequences)]
        backtrack_matrix = np.zeros(backtrack_dimensions)

        # Fill matrix
        for idx, value in np.ndenumerate(matrix):
            matrix[idx], backtrack_matrix[idx] = self._get_score(idx, sequences, matrix, method)

        return matrix, backtrack_matrix

    def _get_score(self, idx: tuple[int], sequences: list[str], matrix: np.ndarray, method: str):
        """
        Returns the score for the given position in the matrix. The score is s completely analogous to the pairwise
        case, only now the scores for each position are equal to the sum of the individual
        pairwise comparisons (i.e. a position that is identical for three sequences has a
        score of 5s1,s2 + 5s1,s3 + 5s2,s3 = 15).

        :param idx: Index of the position in the matrix
        :param sequences: List of sequences to align
        :param matrix: Partially completed matrix of scores for all possible alignments of the given sequences
        """
        # if idx all zeros, return 0
        if all([i == 0 for i in idx]):
            return 0, None

        num_sequences = len(sequences)
        all_neighbours = list(itertools.product([0, -1], repeat=num_sequences))
        all_neighbours.remove(tuple([0 for _ in range(num_sequences)]))
        neighbours = [tuple(np.array(idx) + np.array(neighbour)) for neighbour in all_neighbours]

        # get scores for all neighbours
        scores = []
        for neighbour in neighbours:

            # if any of the indices is negative, return -inf
            if any([i < 0 for i in neighbour]):
                scores.append(-np.inf)
                continue

            pairs = self._get_pairs(num_sequences)
            total = 0
            for i, j in pairs:
                score, action = self._get_pairwise_score(neighbour, i, j, matrix, sequences, idx)
                total += score
            scores.append(score)

        max_score = max(scores)
        max_idx = scores.index(max_score)
        max_score_neighbour = neighbours[max_idx]

        if method == "global":
            return total, max_score_neighbour
        elif method == "local":
            return max(max_score, 0), max_score_neighbour if max_score > 0 else None
        else:
            raise ValueError("Method must be either 'global' or 'local'")

    def _get_pairwise_score(self, neighbour, i, j, matrix, sequences, idx):
        neighbour_i = neighbour[i]
        neighbour_j = neighbour[j]

        idx_i = idx[i]
        idx_j = idx[j]

        diff_i = idx_i - neighbour_i
        diff_j = idx_j - neighbour_j

        neighbour_score = matrix[neighbour] if all([neighbour_i >= 0, neighbour_j >= 0]) else -np.inf

        # determin the action based on diff_i and diff_j
        if diff_i == 1 and diff_j == 1:
            action = Action.MATCH
        elif diff_i == 1 and diff_j == 0:
            action = Action.GAP_J
        elif diff_i == 0 and diff_j == 1:
            action = Action.GAP_I
        elif diff_i == 0 and diff_j == 0:
            # insertion somewhere else
            return 0, None
        else:
            raise ValueError(f"Invalid offset: {diff_i}, {diff_j}")

        if action == Action.MATCH:
            # match
            aa_i = sequences[i][idx_i-1]
            aa_j = sequences[j][idx_j-1]
            score = neighbour_score + self.match if aa_i == aa_j else neighbour_score + self.mismatch

        elif action == Action.GAP_I:
            # gap in j
            score = neighbour_score + self.indel
        elif action == Action.GAP_J:
            # gap in i
            score = neighbour_score + self.indel
        else:
            raise ValueError("Invalid offset")

        return score, action


    def _get_pairs(self, n):
        """
        Returns all possible pairs of n numbers
        :param n: Number of numbers
        :return: List of all possible pairs of n numbers
        """
        pairs = []
        for i in range(n):
            for j in range(i + 1, n):
                pairs.append((i, j))
        return pairs

    def _get_action_combinations(self, n):
        actions = [Action.MATCH, Action.GAP_I, Action.GAP_J]
        action_combinations = itertools.product(actions, repeat=n)
        return action_combinations

    def global_alignment(self, sequences: list[str]):
        """
        Returns the optimal global alignment of the given sequences
        :param sequences: List of sequences to align
        :return: List of aligned sequences
        """
        alignment_matrix, backtrack_matrix = self.get_alignment_matrix(sequences, method="global")
        return self._get_alignment(sequences, alignment_matrix, backtrack_matrix, method="global")

    def local_alignment(self, sequences: list[str]):
        """
        Returns the optimal local alignment of the given sequences
        :param sequences: List of sequences to align
        :return: List of aligned sequences
        """
        alignment_matrix, backtrack_matrix = self.get_alignment_matrix(sequences, method="local")
        return self._get_alignment(sequences, alignment_matrix, backtrack_matrix, method="local")

    def _get_alignment(self, sequences: list[str], alignment_matrix: np.ndarray, backtrack_matrix: np.ndarray,
                       method: str):
        """
        Backtraces through the alignment matrix to get the optimal alignment of the given sequences
        :param sequences: List of sequences to align
        :param alignment_matrix: Matrix of scores for all possible alignments of the given sequences
        :param backtrach_matrix: Matrix of backtrach directions for all possible alignments of the given sequences
        :param method: Method to use for alignment. Either "global" or "local"
        :return: List of aligned sequences
        """
        aligned_sequences_reversed = [""] * len(sequences)

        current_position = [len(sequence) for sequence in sequences]
        if method == "local":
            current_position = np.unravel_index(np.argmax(alignment_matrix), alignment_matrix.shape)

        while True:

            # change the floats (e.g. 2.000000) in current_position to ints
            current_position = [int(i) for i in current_position]
            previous_neighbour = backtrack_matrix[tuple(current_position)]

            if np.all(np.isnan(previous_neighbour)):
                break

            for sequence_idx in range(len(sequences)):
                if current_position[sequence_idx] == previous_neighbour[sequence_idx]:
                    aligned_sequences_reversed[sequence_idx] += "."
                else:
                    aligned_sequences_reversed[sequence_idx] += sequences[sequence_idx][current_position[sequence_idx] - 1]

            current_position = previous_neighbour

        aligned_sequences = [sequence[::-1] for sequence in aligned_sequences_reversed]
        return aligned_sequences

    def _get_match_score(self, aa_i: str, aa_j: str):
        if aa_i is None or aa_j is None:
            return -np.inf
        return self.match if aa_i == aa_j else self.mismatch



In [160]:
aligner = MultiSequenceAligner()
alignments = aligner.global_alignment(["ACTGGTCA", "CAGGGTCA", "CCAGGACCA"])
print('\n'.join(alignments))

..ACTGGTCA.
CA..GGGTCA.
CC..AGGACCA


In [121]:
aligner = MultiSequenceAligner()
am, bm = aligner.get_alignment_matrix(["A", "A"], method="global")

array([[ 0., -4.],
       [-4.,  5.]])

array([[[nan, nan],
        [ 0.,  0.]],

       [[ 0.,  0.],
        [ 0.,  0.]]])

In [123]:
aligner = MultiSequenceAligner()
am, bm = aligner.get_alignment_matrix(["A", "A"], method="global")
am

array([[ 0., -4.],
       [-4.,  5.]])

In [124]:
bm

array([[[nan, nan],
        [ 0.,  0.]],

       [[ 0.,  0.],
        [ 0.,  0.]]])

TypeError: can only concatenate str (not "NoneType") to str

In [35]:

scores, backtrack = aligner.get_alignment_matrix(["ABC", "ABD"], method="global")
# aligner.get_alignment_matrix(["ABC", "ABD", "CAB"])
scores

array([[ 0., -4., -4., -4.],
       [-4.,  5.,  1., -3.],
       [-4.,  1., 10.,  6.],
       [-4., -3.,  6.,  8.]])

In [36]:
backtrack

array([[[[4., 4.],
         [4., 4.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]]],


       [[[0., 2.],
         [0., 0.]],

        [[0., 1.],
         [0., 0.]],

        [[0., 3.],
         [0., 0.]],

        [[0., 3.],
         [0., 0.]]],


       [[[0., 3.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 1.],
         [0., 0.]],

        [[0., 3.],
         [0., 0.]]],


       [[[0., 3.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 1.],
         [0., 0.]]]])

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [54]:
a = (1, 2)
a[0] += 1
a

TypeError: 'tuple' object does not support item assignment