# Sequence Alignment

## 1st approach: alignment based on sequence distance

- Hamming
- Levenshtein

## 2nd approach: scoring scheme

### Hamming distance

In [1]:
"""
    hamming(x, y)

Return the Hamming distance between `x` and `y`, i.e. the number of positions
at which the two sequences hold different elements.

`x` and `y` may be strings or any other iterable collections with a `length`.
If their lengths differ, an error message is printed and `nothing` is returned.
"""
function hamming(x, y)
    if length(x) != length(y)
        println("ERROR: sequences should have equal lenghts!")
        return
    end
    # Iterate element-wise instead of indexing with 1:length(x): String indices
    # are byte offsets, so x[i] throws on multi-byte (non-ASCII) characters.
    return count(((a, b),) -> a != b, zip(x, y))
end

hamming (generic function with 1 method)

In [2]:
# Toy example: both strings have 7 characters, so the Hamming distance is defined.
x = "wheeaaa"; y = "ghearpa"
d = hamming(x,y)

4

### How to read real sequences from an online database

In [2]:
using FastaIO
#using Images
using OffsetArrays
using PyPlot
using DelimitedFiles
using BenchmarkTools
using StatsBase
using LinearAlgebra
using Printf
using HTTP

In [3]:
"""
    sequenceDownload(sequence)

Download the FASTA record `sequence` (an identifier string) from
`https://www.uniprot.org/uniprotkb/`, save it as `"<sequence>.fasta"` in the
current working directory, and return the sequence data of the first record
found in that file.
"""
function sequenceDownload(sequence)
    fastaPath = sequence * ".fasta"

    # Fetch the raw FASTA text over HTTP.
    response = HTTP.get("https://www.uniprot.org/uniprotkb/" * fastaPath)
    fastaText = String(response.body)

    # Keep a local copy; FastaIO then parses it back from disk.
    write(fastaPath, fastaText)

    # Each record is a (description, sequence) pair; keep the first sequence.
    records = FastaIO.readfasta(fastaPath)
    return records[1][2]
end

sequenceDownload (generic function with 1 method)

In [4]:
# Fetch five protein sequences from UniProt by identifier. Each call also
# writes "<identifier>.fasta" to the current working directory.
HBB_Human = sequenceDownload("P68871")
HBA_Bonobo = sequenceDownload("P69906")
HBA_Chimp = sequenceDownload("P69907")
HBA_Donkey = sequenceDownload("P01959")
LegHem = sequenceDownload("P02240")

"MGALTESQAALVKSSWEEFNANIPKHTHRFFILVLEIAPAAKDLFSFLKGTSEVPQNNPELQAHAGKVFKLVYEAAIQLQVTGVVVTDATLKNLGSVHVSKGVADAHFPVVKEAILKTIKEVVGAKWSEELNSAWTIAYDELAIVIKKEMNDAA"

In [8]:
# These two sequences have different lengths, so `hamming` only prints its
# error message and `d` is bound to `nothing`.
d = hamming(HBB_Human,HBA_Bonobo)

ERROR: sequences should have equal lenghts!


In [9]:
# Print the length of every downloaded sequence to see which pairs can be
# compared with the Hamming distance (it requires equal lengths).
@show length(HBB_Human)
@show length(HBA_Bonobo)
@show length(HBA_Chimp)
@show length(HBA_Donkey)
@show length(LegHem)

length(HBB_Human) = 147
length(HBA_Bonobo) = 142
length(HBA_Chimp) = 142
length(HBA_Donkey) = 142
length(LegHem) = 154


154

In [69]:
# Pairwise Hamming distances between the three sequences that share the same
# length, so a position-by-position comparison is possible.
@show hamming(HBA_Bonobo,HBA_Chimp)
@show hamming(HBA_Bonobo,HBA_Donkey)
@show hamming(HBA_Chimp,HBA_Donkey)

hamming(HBA_Bonobo, HBA_Chimp) = 0
hamming(HBA_Bonobo, HBA_Donkey) = 20
hamming(HBA_Chimp, HBA_Donkey) = 20


20

### Levenshtein distance: recursive

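$$L(i,j) = \min{\begin{cases}1-\delta_{x_i,y_j}+L(i-1,j-1)\\1+L(i-1,j)\\1+L(i,j-1)\end{cases}}$$

with base cases $L(i,0)=i$ and $L(0,j)=j$, where $\delta_{x_i,y_j}=1$ if $x_i=y_j$ and $0$ otherwise.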

In [11]:
"""
    leven(x, y)

Return the Levenshtein (edit) distance between `x` and `y`: the minimum number
of single-element substitutions, insertions and deletions that turn `x` into
`y`.

`x` and `y` may be strings or any iterable collections; elements are compared
with `==`. The two inputs may have different lengths.
"""
function leven(x, y)
    # Materialise the elements so positions are 1-based and contiguous. Slicing
    # a String with x[1:end-1] fails when `end-1` falls inside a multi-byte
    # character.
    a = collect(x)
    b = collect(y)

    # prev[j + 1] holds L(i - 1, j); curr[j + 1] holds L(i, j).
    # Row 0 is L(0, j) = j.
    prev = collect(0:length(b))
    curr = similar(prev)

    for i in eachindex(a)
        curr[1] = i                      # L(i, 0) = i
        for j in eachindex(b)
            cost = a[i] == b[j] ? 0 : 1
            curr[j + 1] = min(prev[j] + cost, prev[j + 1] + 1, curr[j] + 1)
        end
        prev, curr = curr, prev          # reuse the two buffers
    end

    return prev[end]
end

leven (generic function with 1 method)

In [72]:
# Levenshtein distances for the same three pairs that were compared with
# `hamming`, to check the two measures against each other.
@show leven(HBA_Bonobo,HBA_Chimp);
@show leven(HBA_Bonobo,HBA_Donkey);
@show leven(HBA_Chimp,HBA_Donkey);

leven(HBA_Bonobo, HBA_Chimp) = 0
leven(HBA_Bonobo, HBA_Donkey) = 20
leven(HBA_Chimp, HBA_Donkey) = 20


In [73]:
# Unlike `hamming`, `leven` has no equal-length requirement, so HBB_Human can
# be compared with the other sequences.
@show leven(HBB_Human,HBA_Bonobo);
@show leven(HBB_Human,HBA_Chimp);
@show leven(HBB_Human,HBA_Donkey);
@show leven(HBB_Human,LegHem);

leven(HBB_Human, HBA_Bonobo) = 84
leven(HBB_Human, HBA_Chimp) = 

84


leven(HBB_Human, HBA_Donkey) = 84
leven(HBB_Human, LegHem) = 119


### Scoring scheme

#### Substitution matrix
$$S(X,Y)=\sum_{i=1}^N\log{\frac{p_{x_iy_i}}{q_{x_i}q_{y_i}}}=\sum_{i=1}^Ns(x_i,y_i)$$

#### Gap score
$$\gamma(g)=\begin{cases}-dg & \text{linear gap penalty}\\-d-e(g-1),\quad e < d & \text{affine gap penalty}\end{cases}$$


### Sequence Alignment: global
Global alignment between $X=(x_1,\dots,x_n)$ and $Y=(y_1,\dots,y_m)$.
- Initialization: $F(0,0)=0,\ F(i,0)=-id,\ F(0,j)=-jd\ \forall\ i,j$

- Recursion: $F(i,j)=\max{\begin{cases}F(i-1,j-1)+s(x_i,y_j) & \text{Substitution}\\F(i-1,j)-d & \text{Deletion (in X)}\\F(i,j-1)-d & \text{Insertion (in Y)}\end{cases}}$ for $\begin{cases}1\leq i \leq n \\ 1\leq j \leq m\end{cases}$

- Termination: $F(n,m)$ optimal score

In [49]:
#=
x = "wheea" -> |x| = 5
y = "hepga" -> |y| = 5
whe-ea
-hepga
DMMISM
D: DELETION
M: MATCH
I: INSERTION
S: SUBSTITUTION

    0   1   2   3   4   5
0   0  -1  -2  -3  -4  -5     
1  -1 
2  -2 
3  -3 
4  -4 
5  -5 
=#

"""
    simpleAlignment(x, y; d = 1)

Fill and return the global-alignment dynamic-programming matrix `F` for `x`
and `y` under a unit scoring scheme: a match scores `0`, a mismatch `-1`, and
every gap position `-d`.

`F` has size `(length(x) + 1, length(y) + 1)`. Entry `F[i + 1, j + 1]` is the
best score for aligning the first `i` elements of `x` with the first `j`
elements of `y`; `F[end, end]` is the optimal global score.

# Keywords
- `d::Integer = 1`: linear gap penalty.
"""
function simpleAlignment(x, y; d::Integer = 1)
    # Work on element vectors: String indices are byte offsets and would not
    # line up with the rows/columns of F for non-ASCII input.
    xs = collect(x)
    ys = collect(y)
    n, m = length(xs), length(ys)

    F = zeros(Int64, n + 1, m + 1)

    # Borders are filled independently of each other, so the first row is
    # correct even when `x` is empty (and the first column when `y` is empty).
    for i in 1:n
        F[i + 1, 1] = -d * i
    end
    for j in 1:m
        F[1, j + 1] = -d * j
    end

    for i in 1:n, j in 1:m
        F[i + 1, j + 1] = max(
            F[i, j] - (xs[i] != ys[j]),   # match / substitution
            F[i, j + 1] - d,              # gap aligned against xs[i]
            F[i + 1, j] - d,              # gap aligned against ys[j]
        )
    end
    return F
end

"""
    globalAlignment(x, y; d = 8)

Fill and return the global-alignment dynamic-programming matrix `F` for `x`
and `y`, scoring aligned pairs with `blosum` and every gap position with `-d`.

`F` has size `(length(x) + 1, length(y) + 1)`, with `F[i + 1, 1] = -d * i` and
`F[1, j + 1] = -d * j` on the borders. `F[end, end]` is the optimal global
score.

# Keywords
- `d::Integer = 8`: linear gap penalty.
"""
function globalAlignment(x, y; d::Integer = 8)
    xs = collect(x)
    ys = collect(y)
    n, m = length(xs), length(ys)

    F = zeros(Int64, n + 1, m + 1)

    # Borders are filled independently of each other, so the first row is
    # correct even when `x` is empty (and the first column when `y` is empty).
    for i in 1:n
        F[i + 1, 1] = -d * i
    end
    for j in 1:m
        F[1, j + 1] = -d * j
    end

    for i in 1:n, j in 1:m
        F[i + 1, j + 1] = max(
            F[i, j] + blosum(xs[i], ys[j]),   # match / substitution
            F[i, j + 1] - d,                  # gap aligned against xs[i]
            F[i + 1, j] - d,                  # gap aligned against ys[j]
        )
    end
    return F
end

"""
    localAlignment(x, y; d = 8)

Fill and return the local-alignment dynamic-programming matrix `F` for `x` and
`y`, scoring aligned pairs with `blosum` and every gap position with `-d`.

`F` has size `(length(x) + 1, length(y) + 1)`. The first row and column are
zero and every entry is clamped at `0`, so an alignment may start anywhere.
The largest entry of `F` is the best local score.

# Keywords
- `d::Integer = 8`: linear gap penalty.
"""
function localAlignment(x, y; d::Integer = 8)
    xs = collect(x)
    ys = collect(y)

    # `zeros` already provides the all-zero borders required here.
    F = zeros(Int64, length(xs) + 1, length(ys) + 1)

    for i in eachindex(xs), j in eachindex(ys)
        F[i + 1, j + 1] = max(
            0,                                # start a fresh local alignment
            F[i, j] + blosum(xs[i], ys[j]),   # match / substitution
            F[i, j + 1] - d,                  # gap aligned against xs[i]
            F[i + 1, j] - d,                  # gap aligned against ys[j]
        )
    end
    return F
end

"""
    overlapAlignment(x, y; d = 8)

Fill and return the overlap-alignment dynamic-programming matrix `F` for `x`
and `y`, scoring aligned pairs with `blosum` and every gap position with `-d`.

`F` has size `(length(x) + 1, length(y) + 1)`. The first row and column are
zero, so leading overhangs are not penalised; interior entries are not clamped
at zero.

# Keywords
- `d::Integer = 8`: linear gap penalty.
"""
function overlapAlignment(x, y; d::Integer = 8)
    xs = collect(x)
    ys = collect(y)

    # `zeros` already provides the all-zero borders required here.
    F = zeros(Int64, length(xs) + 1, length(ys) + 1)

    for i in eachindex(xs), j in eachindex(ys)
        F[i + 1, j + 1] = max(
            F[i, j] + blosum(xs[i], ys[j]),   # match / substitution
            F[i, j + 1] - d,                  # gap aligned against xs[i]
            F[i + 1, j] - d,                  # gap aligned against ys[j]
        )
    end
    return F
end

overlapAlignment (generic function with 1 method)

In [8]:
# Entries for the BLOSUM50 matrix at a scale of ln(2)/3.0.
# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/BLOSUM50
#
# Substitution scores used by `blosum`. Each bracketed line below is one
# 1×25 row; the rows are concatenated vertically into a 25×25 integer matrix.
# Rows and columns follow the same symbol order, given in the header line:
#  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  J  Z  X  *

blosum50 = [
[  5 -2 -1 -2 -1 -1 -1  0 -2 -1 -2 -1 -1 -3 -1  1  0 -3 -2  0 -2 -2 -1 -1 -5 ]
[ -2  7 -1 -2 -4  1  0 -3  0 -4 -3  3 -2 -3 -3 -1 -1 -3 -1 -3 -1 -3  0 -1 -5 ]
[ -1 -1  7  2 -2  0  0  0  1 -3 -4  0 -2 -4 -2  1  0 -4 -2 -3  5 -4  0 -1 -5 ]
[ -2 -2  2  8 -4  0  2 -1 -1 -4 -4 -1 -4 -5 -1  0 -1 -5 -3 -4  6 -4  1 -1 -5 ]
[ -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -2 -3 -1 -5 ]
[ -1  1  0  0 -3  7  2 -2  1 -3 -2  2  0 -4 -1  0 -1 -1 -1 -3  0 -3  4 -1 -5 ]
[ -1  0  0  2 -3  2  6 -3  0 -4 -3  1 -2 -3 -1 -1 -1 -3 -2 -3  1 -3  5 -1 -5 ]
[  0 -3  0 -1 -3 -2 -3  8 -2 -4 -4 -2 -3 -4 -2  0 -2 -3 -3 -4 -1 -4 -2 -1 -5 ]
[ -2  0  1 -1 -3  1  0 -2 10 -4 -3  0 -1 -1 -2 -1 -2 -3  2 -4  0 -3  0 -1 -5 ]
[ -1 -4 -3 -4 -2 -3 -4 -4 -4  5  2 -3  2  0 -3 -3 -1 -3 -1  4 -4  4 -3 -1 -5 ]
[ -2 -3 -4 -4 -2 -2 -3 -4 -3  2  5 -3  3  1 -4 -3 -1 -2 -1  1 -4  4 -3 -1 -5 ]
[ -1  3  0 -1 -3  2  1 -2  0 -3 -3  6 -2 -4 -1  0 -1 -3 -2 -3  0 -3  1 -1 -5 ]
[ -1 -2 -2 -4 -2  0 -2 -3 -1  2  3 -2  7  0 -3 -2 -1 -1  0  1 -3  2 -1 -1 -5 ]
[ -3 -3 -4 -5 -2 -4 -3 -4 -1  0  1 -4  0  8 -4 -3 -2  1  4 -1 -4  1 -4 -1 -5 ]
[ -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -3 -1 -1 -5 ]
[  1 -1  1  0 -1  0 -1  0 -1 -3 -3  0 -2 -3 -1  5  2 -4 -2 -2  0 -3  0 -1 -5 ]
[  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  2  5 -3 -2  0  0 -1 -1 -1 -5 ]
[ -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1  1 -4 -4 -3 15  2 -3 -5 -2 -2 -1 -5 ]
[ -2 -1 -2 -3 -3 -1 -2 -3  2 -1 -1 -2  0  4 -3 -2 -2  2  8 -1 -3 -1 -2 -1 -5 ]
[  0 -3 -3 -4 -1 -3 -3 -4 -4  4  1 -3  1 -1 -3 -2  0 -3 -1  5 -3  2 -3 -1 -5 ]
[ -2 -1  5  6 -3  0  1 -1  0 -4 -4  0 -3 -4 -2  0  0 -5 -3 -3  6 -4  1 -1 -5 ]
[ -2 -3 -4 -4 -2 -3 -3 -4 -3  4  4 -3  2  1 -3 -3 -1 -2 -1  2 -4  4 -3 -1 -5 ]
[ -1  0  0  1 -3  4  5 -2  0 -3 -3  1 -1 -4 -1  0 -1 -2 -2 -3  1 -3  5 -1 -5 ]
[ -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -5 ]
[ -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5  1 ]
]

# One-letter symbols in the row/column order of `blosum50` (a 1×25 row of strings).
aal = ["A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y" "V" "B" "J" "Z" "X" "*"]
# Reverse lookup: symbol => its position in `aal`, i.e. its row/column in `blosum50`.
aa = Dict(symbol => position for (position, symbol) in enumerate(aal))

"""
    blosum(x, y)

Return the `blosum50` entry for the pair of symbols `x` and `y`.

Both arguments are converted with `string` before the lookup in `aa`, so they
may be given as characters or as one-letter strings. A symbol that is not a
key of `aa` raises a `KeyError`.
"""
function blosum(x, y)
    row = aa[string(x)]
    col = aa[string(y)]
    return blosum50[row, col]
end

blosum (generic function with 1 method)

In [50]:
#=
---PAW-HEAE
HEAGAWGHE-E
=#
# Display the dynamic-programming matrix for a short pair of sequences.
# Only the unit-cost scheme is active; uncomment one of the other calls to run
# a BLOSUM-based variant instead (those call `blosum`).
#globalAlignment(HBB_Human,HBA_Bonobo)
simpleAlignment("PAWHEAE","HEAGAWGHEE")
#globalAlignment("PAWHEAE","HEAGAWGHEE")
#overlapAlignment("PAWHEAE","HEAGAWGHEE")

8×11 Matrix{Int64}:
  0  -1  -2  -3  -4  -5  -6  -7  -8  -9  -10
 -1  -1  -2  -3  -4  -5  -6  -7  -8  -9  -10
 -2  -2  -2  -2  -3  -4  -5  -6  -7  -8   -9
 -3  -3  -3  -3  -3  -4  -4  -5  -6  -7   -8
 -4  -3  -4  -4  -4  -4  -5  -5  -5  -6   -7
 -5  -4  -3  -4  -5  -5  -5  -6  -6  -5   -6
 -6  -5  -4  -3  -4  -5  -6  -6  -7  -6   -6
 -7  -6  -5  -4  -4  -5  -6  -7  -7  -7   -6