In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Loading Dataset
Scrubbed dataset collected by filtering through NCBI datasets (see `scrubbed-viral-data/README.md` for more info).

In [2]:
# Relative path to the scrubbed sequences CSV.
data_path = '../scrubbed-viral-data/scrubbed_sequences.csv'
df = pd.read_csv(data_path)
# Keep only the 'Sequence' column, converted to a NumPy array.
seqs = df['Sequence'].to_numpy()

## Find Local Alignment Windows
Let $n = |v|$ and $m = |w|$, and assume $n \ll m$.

First we will find a list of all quartets $(i_{begin}, j_{begin}, i_{end}, j_{end})$ of all optimal
local alignments of $v$ with $w$. Since this quartet captures the 4 points of a "window" of what would
be the 2D DP table for local alignment, I'll call it a window.

This is extremely similar to the midterm! Professor El-Kebir pointed out a clever solution to this
problem by keeping track of an `origin` "table" where each cell $(i, j)$ has an entry $origin[i][j]$
corresponding to the start of the optimal local alignment ending at $(i, j)$. We run space
efficient local alignment with 2 columns (and 2 columns for our `origin` tracking), and
save the highest score together with its corresponding windows. Several windows may tie for the highest score.

These windows will
then be passed to Hirschberg's algorithm to recover the backtraces of the local
alignments in $O(n)$ space.


In [3]:
def delta(v_i, w_j):
    """Score a single aligned pair of symbols: +1 if they are equal, -1 otherwise."""
    return 1 if v_i == w_j else -1


def find_windows(v: str, w: str):
    """Locate every optimal local alignment of ``v`` against ``w``.

    The local-alignment DP table is swept one column at a time (one column per
    prefix of ``w``), keeping only the previous and the current column. Each
    cell stores ``(score, i_origin, j_origin)``, where the origin is the cell
    in which the best local alignment ending at this cell started.

    Scoring: ``delta`` for an aligned pair, -1 for a gap, and a local
    alignment may restart at any cell with score 0.

    Returns:
        ``(max_score, windows)``. Each window is a tuple
        ``(i_beg, j_beg, i_end, j_end)`` of inclusive, 0-indexed bounds such
        that ``v[i_beg:i_end + 1]`` and ``w[j_beg:j_end + 1]`` are the aligned
        substrings. ``windows`` holds every cell that ties for the best
        score, in column-major discovery order, and is empty when the best
        score is 0.
    """

    def _score(candidate):
        return candidate[0]

    rows = len(v) + 1
    best = 0
    best_windows = []
    left_col = None  # column j - 1; never read while j == 0

    for j in range(len(w) + 1):
        col = [None] * rows
        for i in range(rows):
            # Candidates are listed in priority order. max() returns the first
            # maximal element, so ties resolve as:
            # restart > deletion > insertion > match/mismatch.
            candidates = [(0, i, j)]  # restart: the cell is its own origin
            if i > 0:
                up_score, up_i, up_j = col[i - 1]
                candidates.append((up_score - 1, up_i, up_j))
            if j > 0:
                side_score, side_i, side_j = left_col[i]
                candidates.append((side_score - 1, side_i, side_j))
                if i > 0:
                    diag_score, diag_i, diag_j = left_col[i - 1]
                    pair_score = diag_score + delta(v[i - 1], w[j - 1])
                    candidates.append((pair_score, diag_i, diag_j))

            cell = max(candidates, key=_score)
            col[i] = cell
            score, i_beg, j_beg = cell

            # DP index i (or j) means "i characters consumed", so the last
            # aligned character sits at string index i - 1 (or j - 1).
            if score > best:
                best = score
                best_windows = [(i_beg, j_beg, i - 1, j - 1)]
            elif score == best and best > 0:
                best_windows.append((i_beg, j_beg, i - 1, j - 1))

        left_col = col

    return best, best_windows

In [5]:
# Test finding windows
# Short query (test_in) and longer target (test_seq) used to exercise find_windows.
test_in = 'GTAAATCCTTTGAGAAGAAGAGTCTCT'
test_seq = 'GTTTAATGATTCACGATGTTGAGCACAGTTTTCCAACATTATGACCGAAATGATGAGGAACGCGCGTTGGTACCCTATAATCCGAGGCCGCCGAGTTACG'
# Show each sequence prefixed by its length.
print(f'{len(test_in)}: {test_in}')
print(f'{len(test_seq)}: {test_seq}')

27: GTAAATCCTTTGAGAAGAAGAGTCTCT
100: GTTTAATGATTCACGATGTTGAGCACAGTTTTCCAACATTATGACCGAAATGATGAGGAACGCGCGTTGGTACCCTATAATCCGAGGCCGCCGAGTTACG


In [6]:
# Displays (max_score, windows); each window is (i_beg, j_beg, i_end, j_end).
find_windows(test_in, test_seq)

(8, [(3, 34, 21, 56)])

In [7]:
# Print the substring of each test input covered by every optimal window.
# The bounds come from find_windows rather than being copied by hand from the
# previous cell's output. They are inclusive, hence the +1 on each slice end.
_, test_windows = find_windows(test_in, test_seq)
for win_i_beg, win_j_beg, win_i_end, win_j_end in test_windows:
    print(test_in[win_i_beg:win_i_end + 1])
    print(test_seq[win_j_beg:win_j_end + 1])

AATCCTTTGAGAAGAAGAG
AACATTATGACCGAAATGATGAG


## Hirschberg Implementation

Now that we have our window-finding done, here we'll implement Hirschberg's algorithm to be able
to recover the backtraces in $O(n)$ space, where $n$ is the length of the input sequence and we assume
$n \ll m$.

Note that our implementation is more closely aligned with [Wikipedia's suggested implementation](https://en.wikipedia.org/wiki/Hirschberg%27s_algorithm)
than the one discussed in class.

In [None]:
# Returns score only, not alignment string
def needlemann_wunsch(v: str, w: str):
    """Return the optimal global alignment score of ``v`` against ``w``.

    Scoring: ``delta`` for an aligned pair, -1 for a gap. Only the score is
    computed, not the alignment itself. The DP table is swept one column at a
    time (one column per prefix of ``w``), so only two columns of length
    ``len(v) + 1`` exist at any moment.
    """
    n, m = len(v), len(w)

    # Column for the empty prefix of w: aligning v[:i] to nothing costs i gaps.
    column = [-i for i in range(n + 1)]

    for j in range(1, m + 1):
        # Row 0: aligning the empty prefix of v to w[:j] costs one more gap.
        nxt = [column[0] - 1]
        for i in range(1, n + 1):
            nxt.append(max(
                nxt[i - 1] - 1,                                # gap in w
                column[i] - 1,                                 # gap in v
                column[i - 1] + delta(v[i - 1], w[j - 1]),     # aligned pair
            ))
        column = nxt

    return column[n]


def hirschberg(v: str, w: str, i_beg: int, j_beg: int, i_end: int, j_end: int):
    """Placeholder for Hirschberg's linear-space alignment of one window.

    Args:
        v, w: The two full sequences.
        i_beg, j_beg, i_end, j_end: Window bounds into ``v`` and ``w``.

    Intended contract: ``space_efficient_local_align`` unpacks the result as
    ``(score, alignment_str)``.

    Raises:
        NotImplementedError: Always. The algorithm has not been written yet.
            Raising here makes that explicit, instead of returning ``None``
            and letting the caller fail while unpacking the result.
    """
    raise NotImplementedError("hirschberg is not implemented yet")


def space_efficient_local_align(v: str, w: str):
    """Align ``v`` locally against ``w`` and return one result per optimal window.

    ``find_windows`` supplies the windows of every optimal local alignment;
    each window is then handed to ``hirschberg``, which is expected to return
    a ``(score, alignment_str)`` pair.

    Returns:
        A list of ``(score, alignment_str)`` tuples, one per window, in the
        order ``find_windows`` reported them. The list is empty when there
        are no windows.
    """
    results = []
    _, windows = find_windows(v, w)
    for (i_beg, j_beg, i_end, j_end) in windows:
        score, alignment_str = hirschberg(v, w, i_beg, j_beg, i_end, j_end)
        results.append((score, alignment_str))
    # The collected results were previously built and then discarded.
    return results

In [10]:
# Test NW and Hirschberg
nw_in = 'ACAGTTTTCCAACATTA'
# nw_seq = 'ATGTCTTACCATATTAGACAGAACTAGTGAGAG'
nw_seq = 'ACAGTTTTCACCATATTAGACAGAACTAGTGAGAG'
needlemann_wunsch(nw_in, nw_seq)

-1

## Input Sequences
Here you should put in the gene or sequence you wish to find local alignments of.

In [None]:
# Query sequence to find local alignments of; replace with the gene/sequence of interest.
v = 'GTAAATCCTTTGAGAAGAAGAGTCTCT'