In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Append the absolute path of the parent directory ('..', resolved against the
# current working directory) to sys.path so that packages located there can be
# imported by later cells (presumably where the `modules` package lives).
sys.path.append(os.path.abspath(os.path.join('..')))

In [None]:
from Bio import pairwise2
from Bio.Align import substitution_matrices

In [None]:
from modules.needleman_wuhsch import *

# Needleman-Wunsch Algorithm Implementation

This project implements multiple versions of the Needleman-Wunsch algorithm:

- **Pairwise sequence alignment** (classic Needleman-Wunsch).  
- **N-sequence progressive alignment**, where a list of \( N \) sequences is aligned sequentially:  
  - First, sequences \( S_0 \) and \( S_1 \) are aligned.  
  - Then, sequence \( S_2 \) is aligned to the existing alignment of \( S_0 \) and \( S_1 \).  
  - This process continues iteratively until all \( N \) sequences are aligned.    
- **N+M sequence alignment**, where two pre-aligned groups of \( N \) and \( M \) sequences are merged.  

In all implementations, **gap opening cost > gap extension cost** to encourage the formation of fewer, larger gaps rather than multiple small gaps. This approach better reflects biological reality, as insertions and deletions in DNA, RNA, and protein evolution tend to occur in longer segments rather than isolated positions.

The functions support alignment of both **nucleotide** (DNA/RNA) and **protein** sequences:

- **For DNA/RNA alignment**, set `blosum_m=False`. The scoring scheme is:  
  - **Match** = \( +1 \)  
  - **Mismatch** = \( -1 \)  
  - **Gap opening** = \( -10 \)  
  - **Gap extension** = \( -2 \)  

- **For protein alignment**, set `blosum_m=True` to use the **BLOSUM62** substitution matrix.

Additionally, visualization functions are provided for **alignment matrices, final sequence alignment**, and other outputs to enhance clarity and debugging.

#### DNA/RNA examples

The simplest short example.

In [None]:
# Pairwise alignment of two short nucleotide sequences.
# False is passed as the second positional argument; per the notebook text this
# selects the match/mismatch nucleotide scoring instead of BLOSUM62
# (presumably the `blosum_m` flag described above -- confirm against the module).
blossum = False
sequences = ["ATCG", "AG"]

# The call returns a (score, alignment) pair; print_result=True asks the
# function itself to print its result.
score, alignment = needleman_wunsch(sequences, blossum, print_result=True)

Let's try longer sequences.

In [None]:
# Pairwise alignment of two longer nucleotide sequences.
blossum = False
sequences = ["ATCGTACGTCCTAGGCTAAGCTTAGCGTACGATCGTTAGCTA", "ATGCCGTTAGCCTAGGCTAAGCGTACGATCGTAGCTATTTA"]

# print_result=False here: the alignment and score are printed explicitly below.
score, alignment = needleman_wunsch(sequences, blossum, print_result=False)
print_alignments(alignment)
print("Score: ", score)

Now we will test the Needleman-Wunsch algorithm for N-sequence progressive alignment.

In [None]:
# Six nucleotide sequences passed as one list to the same entry point used for
# the pairwise examples (the N-sequence progressive case described in the text).
blossum = False
sequences = ["TAGCCT", "CCATGCT", "TAGCCCTA", "CATGCT", "AGCT", "TAGTA"]

# NOTE(review): print_result=True is combined with an explicit
# print_alignments call below, unlike the previous cell -- the alignment may be
# shown twice; confirm this is intended.
score, alignment = needleman_wunsch(sequences, blossum, print_result=True)
print_alignments(alignment)
print("Score: ", score)

Finally, let's test the Needleman-Wunsch algorithm for **N+M sequence alignment**, where two pre-aligned groups of \( N \) and \( M \) sequences are merged.

In [None]:
# Build two separately aligned groups, then merge them with
# needleman_wunsch_multiple.
blossum = False

# First group: align two sequences; the score is discarded, only the aligned
# sequences are kept.
sequences = ["TAGCCT", "CCATGCT"]
_, sequences1 = needleman_wunsch(sequences, blossum, print_result=False)
print("Aligned sequences 1: ", sequences1)

# Second group: same procedure (note that `sequences` is rebound here).
sequences = ["CATGCT", "CAGCT"]
_, sequences2 = needleman_wunsch(sequences, blossum, print_result=False)
print("Aligned sequences 2: ", sequences2)

# Merge the two pre-aligned groups into a single alignment.
score, alignment = needleman_wunsch_multiple(sequences1, sequences2, blossum, print_result=True)

#### Protein examples

To test protein alignment, we just need to set `blossum = True`.

In [None]:
# Pairwise alignment of two short protein sequences.
sequences = ["CHAT", "CAT"]

# True is passed directly as the second positional argument (the flag that the
# other cells store in `blossum`); per the notebook text this selects BLOSUM62.
score, alignment = needleman_wunsch(sequences, True, print_result=True)

In [None]:

# Five protein sequences passed as one list (N-sequence case) with the
# substitution-matrix flag enabled.
blossum = True
sequences = ["CHAT", "CAT", "HER", "HAT", "HARAT"]

score, alignment = needleman_wunsch(sequences, blossum, print_result=True)

In [None]:
# Merge two hand-written, already gapped groups of protein sequences.
# '-' marks a gap; every string within a group has the same length
# (4 in the first group, 5 in the second).
blossum = True
sequences1 = ["CHAT", "C-AT"]
sequences2 = ["HER--", "HA--T", "HARAT"]

score, alignment = needleman_wunsch_multiple(sequences1, sequences2, blossum, print_result=True)

## Evaluation of the Needleman-Wunsch Algorithm

To verify the correctness of the classic Needleman-Wunsch algorithm, I compared my implementation with the reference implementation from the **Biopython** library. 

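For evaluating the alignment quality, the **sum-of-pairs (SP) score** was used. This metric is widely used in multiple sequence alignment because it provides a straightforward way to assess alignment quality by summing the pairwise similarity scores across all aligned positions. It effectively captures the consistency of alignments and allows direct comparison between different methods. In this notebook the score is computed in a simplified, column-based form: it is the fraction of columns of the first alignment that also occur as columns of the second alignment, so a value of 1 means that every column of the first alignment is present in the second.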

Additionally, the implemented **alignment comparison function** accounts for cases where sequences in the two alignments may be presented in a different order. It calculates the **SP-score for both possible orderings** and selects the maximum value, ensuring that identical alignments are correctly recognized as fully matching, even if the sequence order differs.

In [None]:
from Bio.Align import PairwiseAligner

# Configure a Biopython PairwiseAligner with the gap penalties used in this
# notebook (opening -10, extension -2) and echo them for reference.
# NOTE(review): the comparison cells visible below call
# pairwise2.align.globalds rather than this `aligner` object -- confirm whether
# it is still needed.
aligner = PairwiseAligner()
aligner.open_gap_score = -10
aligner.extend_gap_score = -2
print(f"Gap opening penalty: {aligner.open_gap_score}")
print(f"Gap extension penalty: {aligner.extend_gap_score}")

In [None]:
def sp_score(alignment1, alignment2):
    """
    Compute a column-based similarity score between two alignments.

    Each alignment is read column by column (the i-th character of every
    sequence forms the i-th column). The score is the fraction of columns of
    ``alignment1`` that also occur, at any position, as a column of
    ``alignment2``.

    Parameters:
    ----------
    alignment1 : list of str
        The alignment being evaluated (e.g., from the user's algorithm).
    alignment2 : list of str
        The reference alignment (e.g., from Biopython or BAliBASE).

    Returns:
    -------
    float
        Value in [0, 1]: 1 when every column of ``alignment1`` is present in
        ``alignment2``, 0 when none is. Returns 0.0 when ``alignment1`` has no
        columns (empty list or an empty sequence).

    Notes:
    -----
    - Columns are built with ``zip``, so each alignment is truncated to the
      length of its shortest sequence.
    - Column order matters inside a column (sequence order), which is why
      callers that do not know the sequence order try both orderings.
    """
    # Build the set of reference columns once instead of re-creating and
    # linearly scanning zip(*alignment2) for every column of alignment1.
    reference_columns = set(zip(*alignment2))

    total_columns = 0
    matching_columns = 0
    for column in zip(*alignment1):
        total_columns += 1
        if column in reference_columns:
            matching_columns += 1

    return matching_columns / total_columns if total_columns > 0 else 0.0

In [None]:
def alignment_result_comparison(result1, result2):
    """
    Compare two pairwise alignment results, considering sequence order variations.

    Parameters:
    ----------
    result1 : tuple (float, str, str)
        Custom Needleman-Wunsch result in the form
        (score, first aligned sequence, second aligned sequence).
    result2 : tuple (float, str, str)
        Biopython Needleman-Wunsch result in the same format.

    Prints:
    ------
    - Both scores and whether they match (compared with a small floating-point
      tolerance rather than exact equality).
    - The highest similarity between the alignments as computed by
      ``sp_score``, tried with the second result's sequences in both orders.
    - A verdict: identical (similarity of 1.0), highly similar (> 0.8), or
      significantly different.

    Returns:
    -------
    None
    """
    # Local import keeps this cell runnable without relying on another cell's imports.
    import math

    score1, alignment1_1, alignment1_2 = result1
    score2, alignment2_1, alignment2_2 = result2

    print(f"Custom Needleman-Wunsch Score: {score1}")
    print(f"Biopython Needleman-Wunsch Score: {score2}")

    # Scores may be floats (Biopython returns floats), so compare with a
    # tolerance instead of exact equality.
    if math.isclose(score1, score2, rel_tol=1e-9, abs_tol=1e-9):
        print("✅ The alignment scores match!")
    else:
        print("❌ The alignment scores differ.")

    # The two tools may report the sequences in opposite order, so score both
    # orderings of the second result and keep the better one.
    custom_alignment = [alignment1_1, alignment1_2]
    sp_similarity = max(
        sp_score(custom_alignment, [alignment2_1, alignment2_2]),
        sp_score(custom_alignment, [alignment2_2, alignment2_1])
    )

    print(f"SP Similarity Score: {sp_similarity:.4f}")

    if sp_similarity == 1.0:
        print("✅ Alignments are identical.")
    elif sp_similarity > 0.8:
        print("⚠️ Alignments are highly similar but not identical.")
    else:
        print("❌ Alignments differ significantly.")

In [None]:
# Compare the custom implementation with Biopython on a short protein pair,
# using BLOSUM62 and gap penalties of -10 (open) / -2 (extend).
substitution_matrix = substitution_matrices.load("BLOSUM62")
gap_open_penalty = -10
gap_extension_penalty = -2
seq1 = "CHAT"
seq2 = "CAT"

# Reference result: global alignment with a substitution matrix and separate
# gap-open / gap-extend penalties.
alignments_biopython = pairwise2.align.globalds(seq1, seq2, substitution_matrix, gap_open_penalty, gap_extension_penalty)
print("Biopython results: \n",alignments_biopython)
print(" \nCustom results:")
# No gap penalties are passed to the custom function here.
# NOTE(review): presumably its defaults equal -10 / -2 as stated in the
# notebook text -- confirm against the module.
score, alignment = needleman_wunsch([seq1, seq2], True, print_result=False)
print_alignments(alignment)
print()

# Bring both results into the (score, aligned_seq_1, aligned_seq_2) form.
# Only the first Biopython alignment is used; its items 0 and 1 are taken as
# the aligned sequences and item 2 as the score.
custom_result = (score, alignment[0], alignment[1])
result_biopython = (alignments_biopython[0][2], alignments_biopython[0][0], alignments_biopython[0][1])
alignment_result_comparison(custom_result, result_biopython)

In [None]:
# Same comparison as the previous cell, on two longer protein sequences,
# with BLOSUM62 and gap penalties of -10 (open) / -2 (extend).
substitution_matrix = substitution_matrices.load("BLOSUM62")
gap_open_penalty = -10
gap_extension_penalty = -2
seq1 = "MCGNIQLEYAHHGPATQFLWTYIMIGCLKFKGFREQHFYIPGICKDWHFKFLCFYRMIHIPIGPGYITQNTSPAGHYRHSEKAICVMQMFKYICRFRA"
seq2 = "MHGQLEYIAHSPATRFLYTIGCLKFKWFREHHFNIPGECKDWHFKFDCFYRMIHIPIGPAIMYITSPAGHYRHSEMAITVMQMNKVGCRFRDICLYFVES"

# Reference result from Biopython.
alignments_biopython = pairwise2.align.globalds(seq1, seq2, substitution_matrix, gap_open_penalty, gap_extension_penalty)
print("Biopython results: \n",alignments_biopython)
print(" \nCustom results:")
# No gap penalties are passed to the custom function here.
# NOTE(review): presumably its defaults equal -10 / -2 -- confirm against the module.
score, alignment = needleman_wunsch([seq1, seq2], True, print_result=False)
print_alignments(alignment)
print()

# Bring both results into the (score, aligned_seq_1, aligned_seq_2) form;
# only the first Biopython alignment is compared.
custom_result = (score, alignment[0], alignment[1])
result_biopython = (alignments_biopython[0][2], alignments_biopython[0][0], alignments_biopython[0][1])
alignment_result_comparison(custom_result, result_biopython)

In [None]:
# Comparison with different gap penalties: -15 (open) / -5 (extend).
substitution_matrix = substitution_matrices.load("BLOSUM62")
gap_open_penalty = -15
gap_extension_penalty = -5
seq1 = "MCGNIQLEYAHHGPATQFLWTYIMIGCLKFKGFRQHFYIPGICKDWHFKFLCFYRMIHIPIYITQNTSPAGHYRHSEKAICVMQMFKYICRFRA"
seq2 = "MHGQLEYIAHSPATRFLYTIGCLKFKWFRIPGECKDWHFKFDCFYRMIHIPIGPAIMYITSPAGHYRHSEMAITVMQMNKVGCRFRDICLYFVES"

# Reference result from Biopython; the first alignment is also printed with
# the notebook's own print_alignments helper.
alignments_biopython = pairwise2.align.globalds(seq1, seq2, substitution_matrix, gap_open_penalty, gap_extension_penalty)
print("Biopython results: \n",alignments_biopython)
print_alignments([alignments_biopython[0][0], alignments_biopython[0][1]])
print(" \nCustom results:")
# Unlike the previous cells, the gap penalties are passed explicitly so that
# both implementations use the same -15 / -5 values.
score, alignment = needleman_wunsch([seq1, seq2], True, print_result=False, gap_opening_score=gap_open_penalty, gap_extension_score=gap_extension_penalty)
print_alignments(alignment)
print()

# Bring both results into the (score, aligned_seq_1, aligned_seq_2) form;
# only the first Biopython alignment is compared.
custom_result = (score, alignment[0], alignment[1])
result_biopython = (alignments_biopython[0][2], alignments_biopython[0][0], alignments_biopython[0][1])
alignment_result_comparison(custom_result, result_biopython)