In [82]:
import numpy as np


class Action:
    MATCH = 1
    GAP_I = 2
    GAP_J = 3
    START = 4

    @staticmethod
    def fix_upper_action_matrix(action_matrix):
        """
        Make the lower triangular matrix equal to the upper triangular matrix, but with GAP_I and GAP_J swapped.
        """
        lower_matrix = np.triu(action_matrix).T

        # swap GAP_I and GAP_J
        pos_i = np.where(lower_matrix == Action.GAP_I)
        pos_j = np.where(lower_matrix == Action.GAP_J)
        lower_matrix[pos_i] = Action.GAP_J
        lower_matrix[pos_j] = Action.GAP_I

        return action_matrix + lower_matrix


class MultiSequenceAligner:

    def __init__(self,
                 match: int = 5,
                 mismatch: int = -2,
                 indel: int = -4,
                 two_gaps: int = 0):
        """

        :param match: Score if AA match
        :param mismatch: Score if AA mismatch
        :param indel: Linear gap penalty (only applied if there is only one gap)
        :param two_gaps: Gap penalty applied if two or more sequences have a gap at given position
        """
        self.match = match
        self.mismatch = mismatch
        self.indel = indel
        self.two_gaps = two_gaps

    def get_alignment_matrix(self, sequences: list[str], method: str = "global"):
        """
        Returns a matrix of scores for all possible alignments of the given sequences and a backtrack matrix.
        :param sequences: List of sequences to align
        :param method: Method to use for alignment. Either "global" or "local"
        :return: Matrix of scores for all possible alignments of the given sequences and a backtrack matrix
        """
        assert method in ["global", "local"], "Method must be either 'global' or 'local'"

        # Initialize matrix
        # n-dimensional matrix where n is the number of sequences, each dimension is the length of the sequence + 1 (for the inital gap)
        dimensions = [len(sequence) + 1 for sequence in sequences]
        matrix = np.zeros(dimensions)

        # Backtrack matrix contains directions, which is a dict for each pair of sequences indicating which action the score came from
        # The keys are tuples of the indices of the sequences in the list of sequences
        # The values are tuples of the indices of the sequences in the list of sequences and the action that led to the score
        backtrack_dimensions = dimensions + [len(sequences), len(sequences)]
        backtrack_matrix = np.zeros(backtrack_dimensions)

        # Fill matrix
        for idx, value in np.ndenumerate(matrix):
            matrix[idx], backtrack_matrix[idx] = self._get_score(idx, sequences, matrix, method)

        return matrix, backtrack_matrix

    def _get_score(self, idx: tuple[int], sequences: list[str], matrix: np.ndarray, method: str):
        """
        Returns the score for the given position in the matrix. The score is s completely analogous to the pairwise
        case, only now the scores for each position are equal to the sum of the individual
        pairwise comparisons (i.e. a position that is identical for three sequences has a
        score of 5s1,s2 + 5s1,s3 + 5s2,s3 = 15).

        :param idx: Index of the position in the matrix
        :param sequences: List of sequences to align
        :param matrix: Partially completed matrix of scores for all possible alignments of the given sequences
        """
        # if idx all zeros, return 0
        if all([i == 0 for i in idx]):
            # backtrack direction is start for all
            backtrack_directions = np.full((len(sequences), len(sequences)), Action.START)
            return 0, backtrack_directions

        # Get all possible pairwise combinations
        total_sequences = len(sequences)
        pairwise_combinations = [(i, j) for i in range(total_sequences) for j in range(i + 1, total_sequences)]

        backtrack_directions = np.zeros((total_sequences, total_sequences))

        # Get scores for all pairwise combinations
        total = 0
        for i, j in pairwise_combinations:
            pairwise_matrix = self._get_pairwise_matrix(i, j, idx, matrix, sequences)
            score, action = self._get_pairwise_score(idx, sequences, pairwise_matrix, i, j)
            total += score
            backtrack_directions[i, j] = action

        # create a full matrix from the upper triangular matrix
        backtrack_directions = Action.fix_upper_action_matrix(backtrack_directions)

        if method == "global":
            return total, backtrack_directions
        elif method == "local":
            # backtrack direction is start for all
            backtrack_directions = np.full((len(sequences), len(sequences)), Action.START)
            return max(total, 0), backtrack_directions
        else:
            raise ValueError("Method must be either 'global' or 'local'")

    def _get_pairwise_matrix(self, i, j, idx, matrix, sequences):
        # create a pairwise_matrix, which is the 2D matrix for i and j, for all other indices, use the idx value
        pairwise_matrix = np.zeros((len(sequences[i]) + 1, len(sequences[j]) + 1))
        for idx_i, idx_j in np.ndindex(pairwise_matrix.shape):
            new_idx = list(idx)
            new_idx[i] = idx_i
            new_idx[j] = idx_j
            pairwise_matrix[idx_i, idx_j] = matrix[tuple(new_idx)]
        return pairwise_matrix

    def _get_pairwise_score(self, idx: tuple[int], sequences: list[str], pairwise_matrix: np.ndarray, i: int, j: int):
        """
        Returns the score for the given pairs at given position in the matrix. This is similar to the Needleman-Wunsch and Smith-Waterman algorithms. This assigns the max of following  scores to S(i, j) (for global alignment, same for local alignment, but all scores would be set to max(0, score)):
        - S(i-1, j-1) + s(ai, bj)
        - S(i-1, j) - w  (gap in sequence 1)
        - S(i, j-1) - w  (gap in sequence 2)
        Note that i and j are the indices of the sequences in the list of sequences, not the indices of the position in the matrix.

        :param idx: Index of the position in the matrix
        :param sequences: List of sequences to align
        :param matrix: Partially completeed matrix of scores for all possible alignments of the given pair
        :param i: Index of the first sequence in the pair
        :param j: Index of the second sequence in the pair
        """
        # get the amino acids at the given position
        seq_i = sequences[i]
        seq_j = sequences[j]
        aa_i = seq_i[idx[i] - 1] if idx[
                                        i] > 0 else None  # -1 because the matrix has an extra row and column for the initial gap
        aa_j = seq_j[idx[j] - 1] if idx[j] > 0 else None

        idx_i = idx[i]
        idx_j = idx[j]

        # get the scores for the three possible cases
        previous_score = pairwise_matrix[(idx_i - 1, idx_j - 1)] if idx_i > 0 and idx_j > 0 else -np.inf

        match_score = previous_score + self._get_match_score(aa_i, aa_j)
        gap_i_score = pairwise_matrix[(idx_i - 1, idx_j)] + self.indel if idx_i > 0 else self.indel
        gap_j_score = pairwise_matrix[(idx_i, idx_j - 1)] + self.indel if idx_j > 0 else self.indel
        # TODO: two gaps score

        # get the score for the current position
        score = max(match_score, gap_i_score, gap_j_score)

        if score == match_score:
            action = Action.MATCH
        elif score == gap_i_score:
            action = Action.GAP_I
        elif score == gap_j_score:
            action = Action.GAP_J

        return score, action

    def global_alignment(self, sequences: list[str]):
        """
        Returns the optimal global alignment of the given sequences
        :param sequences: List of sequences to align
        :return: List of aligned sequences
        """
        alignment_matrix, backtrack_matrix = self.get_alignment_matrix(sequences, method="global")
        return self._get_alignment(sequences, alignment_matrix, backtrack_matrix, method="global")

    def local_alignment(self, sequences: list[str]):
        """
        Returns the optimal local alignment of the given sequences
        :param sequences: List of sequences to align
        :return: List of aligned sequences
        """
        alignment_matrix, backtrack_matrix = self.get_alignment_matrix(sequences, method="local")
        return self._get_alignment(sequences, alignment_matrix, backtrack_matrix, method="local")

    def _get_alignment(self, sequences: list[str], alignment_matrix: np.ndarray, backtrack_matrix: np.ndarray,
                       method: str):
        """
        Backtraces through the alignment matrix to get the optimal alignment of the given sequences
        :param sequences: List of sequences to align
        :param alignment_matrix: Matrix of scores for all possible alignments of the given sequences
        :param backtrach_matrix: Matrix of backtrach directions for all possible alignments of the given sequences
        :param method: Method to use for alignment. Either "global" or "local"
        :return: List of aligned sequences
        """
        aligned_sequences_reversed = [""] * len(sequences)

        current_position = [len(sequence) for sequence in sequences]
        if method == "local":
            current_position = np.unravel_index(np.argmax(alignment_matrix), alignment_matrix.shape)

        while True:
            print(current_position)
            score_actions = backtrack_matrix[tuple(current_position)]

            # if all actions are start, we are done
            if np.all(score_actions == Action.START):
                break

            # if there is a match, add the amino acid to the aligned sequence
            for i, sequence in enumerate(sequences):
                actions = score_actions[i]

                match_count = np.sum(actions == Action.MATCH)
                gapi_count = np.sum(actions == Action.GAP_I)
                gapj_count = np.sum(actions == Action.GAP_J)

                aa_idx =  current_position[i] - 1

                if match_count >= gapi_count and match_count >= gapj_count and aa_idx >= 0:
                    aligned_sequences_reversed[i] += sequence[aa_idx]
                    current_position[i] -= 1
                elif gapi_count >= gapj_count and current_position[i] > 0:
                    aligned_sequences_reversed[i] += "."
                    # current_position[i] -= 1
                else:
                    aligned_sequences_reversed[i] += "."
                    current_position[i] -= 1


        aligned_sequences = [sequence[::-1] for sequence in aligned_sequences_reversed]
        return aligned_sequences

    def _get_match_score(self, aa_i: str, aa_j: str):
        if aa_i is None or aa_j is None:
            return -np.inf
        return self.match if aa_i == aa_j else self.mismatch


In [83]:
aligner = MultiSequenceAligner()
alignmets = aligner.global_alignment(["ABC", "A", "A"])
print('\n'.join(alignmets))

[3, 1, 1]
[2, 0, 0]
[1, -1, -1]
[0, -2, -2]
A.C
..A
..A


In [32]:
aligner = MultiSequenceAligner()
am, bm = aligner.get_alignment_matrix(["A", "A"], method="global")
am

array([[ 0., -4.],
       [-4.,  5.]])

In [33]:
bm

array([[[[4., 4.],
         [4., 4.]],

        [[0., 2.],
         [0., 0.]]],


       [[[0., 2.],
         [0., 0.]],

        [[0., 1.],
         [0., 0.]]]])

TypeError: can only concatenate str (not "NoneType") to str

In [35]:

scores, backtrack = aligner.get_alignment_matrix(["ABC", "ABD"], method="global")
# aligner.get_alignment_matrix(["ABC", "ABD", "CAB"])
scores

array([[ 0., -4., -4., -4.],
       [-4.,  5.,  1., -3.],
       [-4.,  1., 10.,  6.],
       [-4., -3.,  6.,  8.]])

In [36]:
backtrack

array([[[[4., 4.],
         [4., 4.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]]],


       [[[0., 2.],
         [0., 0.]],

        [[0., 1.],
         [0., 0.]],

        [[0., 3.],
         [0., 0.]],

        [[0., 3.],
         [0., 0.]]],


       [[[0., 3.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 1.],
         [0., 0.]],

        [[0., 3.],
         [0., 0.]]],


       [[[0., 3.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 2.],
         [0., 0.]],

        [[0., 1.],
         [0., 0.]]]])

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [54]:
a = (1, 2)
a[0] += 1
a

TypeError: 'tuple' object does not support item assignment