# How to read real sequences from an online database

In [1]:
using FastaIO
using OffsetArrays
using PyPlot
using DelimitedFiles
using BenchmarkTools
using StatsBase
using LinearAlgebra
using Printf
using HTTP

In [None]:
"""
    sequenceDownload(sequence)

Fetch `<sequence>.fasta` from `https://www.uniprot.org/uniprotkb/`, save the response
body under that same file name in the current directory, and return the second
component of the first entry that `FastaIO.readfasta` reads back from the file
(presumably the sequence data of the first record; confirm against FastaIO's
documentation).
"""
function sequenceDownload(sequence)
    fastaName = sequence * ".fasta"
    response = HTTP.get("https://www.uniprot.org/uniprotkb/" * fastaName)
    fastaText = String(response.body)

    # Keep a local copy so the record can be parsed from disk by FastaIO.
    open(fastaName, "w") do io
        write(io, fastaText)
    end

    records = FastaIO.readfasta(fastaName)
    return records[1][2]
end

# Hamming distance

In [None]:
"""
    hamming(x, y)

Return the Hamming distance between `x` and `y`: the number of positions at which
their elements differ.

`x` and `y` may be strings or any other iterable collections with a `length`.
If the lengths differ, an error message is printed and `nothing` is returned.
"""
function hamming(x, y)
    if length(x) != length(y)
        println("ERROR: sequences should have equal lengths!")
        return nothing
    end
    # Walk both sequences in lockstep instead of indexing with 1:length(x):
    # strings are indexed by byte, so x[i] throws on multi-byte characters.
    return count(a != b for (a, b) in zip(x, y))
end

# Levenshtein distance 

In [None]:
"""
    leven(x, y)

Return the Levenshtein (edit) distance between `x` and `y`: the minimum number of
single-element substitutions, deletions and insertions that turn `x` into `y`.

`x` and `y` may be strings or any iterable collections. The distance is computed
iteratively with two rows of the dynamic-programming table, so it needs
O(length(x) * length(y)) time and O(length(y)) memory and does not recurse.
"""
function leven(x, y)
    # Materialise the elements so that indexing is by element, not by byte
    # (string slicing such as x[1:end-1] fails on multi-byte characters).
    a, b = collect(x), collect(y)
    n, m = length(a), length(b)

    prev = collect(0:m)   # distances from the empty prefix of x to every prefix of y
    curr = similar(prev)

    for i in 1:n
        curr[1] = i       # distance from x[1:i] to the empty prefix of y
        for j in 1:m
            substitution = prev[j] + (a[i] == b[j] ? 0 : 1)
            deletion = prev[j + 1] + 1
            insertion = curr[j] + 1
            curr[j + 1] = min(substitution, deletion, insertion)
        end
        prev, curr = curr, prev
    end

    return prev[m + 1]
end

# Scoring scheme

In [22]:
# BLOSUM50 substitution matrix (entries at a scale of ln(2)/3.0).
# Source: https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/BLOSUM50
# Column order is given by the header line below; rows follow the same order.
# Each bracketed line is a 1×25 row literal, and the outer brackets stack the
# 25 rows vertically into a single 25×25 matrix.
#  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  J  Z  X  *

blosum50 = [
[  5 -2 -1 -2 -1 -1 -1  0 -2 -1 -2 -1 -1 -3 -1  1  0 -3 -2  0 -2 -2 -1 -1 -5 ]
[ -2  7 -1 -2 -4  1  0 -3  0 -4 -3  3 -2 -3 -3 -1 -1 -3 -1 -3 -1 -3  0 -1 -5 ]
[ -1 -1  7  2 -2  0  0  0  1 -3 -4  0 -2 -4 -2  1  0 -4 -2 -3  5 -4  0 -1 -5 ]
[ -2 -2  2  8 -4  0  2 -1 -1 -4 -4 -1 -4 -5 -1  0 -1 -5 -3 -4  6 -4  1 -1 -5 ]
[ -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -2 -3 -1 -5 ]
[ -1  1  0  0 -3  7  2 -2  1 -3 -2  2  0 -4 -1  0 -1 -1 -1 -3  0 -3  4 -1 -5 ]
[ -1  0  0  2 -3  2  6 -3  0 -4 -3  1 -2 -3 -1 -1 -1 -3 -2 -3  1 -3  5 -1 -5 ]
[  0 -3  0 -1 -3 -2 -3  8 -2 -4 -4 -2 -3 -4 -2  0 -2 -3 -3 -4 -1 -4 -2 -1 -5 ]
[ -2  0  1 -1 -3  1  0 -2 10 -4 -3  0 -1 -1 -2 -1 -2 -3  2 -4  0 -3  0 -1 -5 ]
[ -1 -4 -3 -4 -2 -3 -4 -4 -4  5  2 -3  2  0 -3 -3 -1 -3 -1  4 -4  4 -3 -1 -5 ]
[ -2 -3 -4 -4 -2 -2 -3 -4 -3  2  5 -3  3  1 -4 -3 -1 -2 -1  1 -4  4 -3 -1 -5 ]
[ -1  3  0 -1 -3  2  1 -2  0 -3 -3  6 -2 -4 -1  0 -1 -3 -2 -3  0 -3  1 -1 -5 ]
[ -1 -2 -2 -4 -2  0 -2 -3 -1  2  3 -2  7  0 -3 -2 -1 -1  0  1 -3  2 -1 -1 -5 ]
[ -3 -3 -4 -5 -2 -4 -3 -4 -1  0  1 -4  0  8 -4 -3 -2  1  4 -1 -4  1 -4 -1 -5 ]
[ -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -3 -1 -1 -5 ]
[  1 -1  1  0 -1  0 -1  0 -1 -3 -3  0 -2 -3 -1  5  2 -4 -2 -2  0 -3  0 -1 -5 ]
[  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  2  5 -3 -2  0  0 -1 -1 -1 -5 ]
[ -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1  1 -4 -4 -3 15  2 -3 -5 -2 -2 -1 -5 ]
[ -2 -1 -2 -3 -3 -1 -2 -3  2 -1 -1 -2  0  4 -3 -2 -2  2  8 -1 -3 -1 -2 -1 -5 ]
[  0 -3 -3 -4 -1 -3 -3 -4 -4  4  1 -3  1 -1 -3 -2  0 -3 -1  5 -3  2 -3 -1 -5 ]
[ -2 -1  5  6 -3  0  1 -1  0 -4 -4  0 -3 -4 -2  0  0 -5 -3 -3  6 -4  1 -1 -5 ]
[ -2 -3 -4 -4 -2 -3 -3 -4 -3  4  4 -3  2  1 -3 -3 -1 -2 -1  2 -4  4 -3 -1 -5 ]
[ -1  0  0  1 -3  4  5 -2  0 -3 -3  1 -1 -4 -1  0 -1 -2 -2 -3  1 -3  5 -1 -5 ]
[ -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -5 ]
[ -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5  1 ]
]

# Residue symbols in BLOSUM50 row/column order, kept as a 1×25 row of one-letter strings.
aal = permutedims(string.(collect("ARNDCQEGHILKMFPSTWYVBJZX*")))
# Lookup table: residue symbol => its position in `aal`.
aa = Dict(symbol => position for (position, symbol) in enumerate(aal))

"""
    blosum(x, y)

Return the `blosum50` entry for the residue pair (`x`, `y`). Both arguments are
converted with `string` and looked up in `aa` to obtain the row and column
positions; a symbol that is not a key of `aa` raises a `KeyError`.
"""
function blosum(x, y)
    row = aa[string(x)]
    col = aa[string(y)]
    return blosum50[row, col]
end

blosum (generic function with 1 method)

# Alignment
## 1. Global, simple (Levenshtein)
Global, simple alignment between $X=(x_1,\dots,x_n)$ and $Y=(y_1,\dots,y_m)$.
- Initialization: $L(0,0)=0,\ L(i,0)=i,\ L(0,j)=j\ \forall\ i,j$

- Recursion: $L(i,j)=\min{\begin{cases}1 - \delta(x_i,y_j) + L(i-1,j-1) & \text{Substitution}\\1+L(i-1,j) & \text{Deletion (in X)}\\1+L(i,j-1) & \text{Insertion (in Y)}\end{cases}}$ for $\begin{cases}1\leq i \leq n \\ 1\leq j \leq m\end{cases}$

- Termination: $L(n,m)$ optimal score

In [16]:
"""
    simpleGlobalAlignment(x, y)

Fill the Levenshtein (unit-cost) global alignment tables for `x` and `y`.

Returns the tuple `(L, B, T)`, all indexed from `0:length(x)` by `0:length(y)`:
- `L[i, j]`: edit distance between the first `i` elements of `x` and the first `j`
  elements of `y`.
- `B[i, j]`: traceback pointer, `1` = substitution/match, `2` = deletion (gap on the
  top sequence), `3` = insertion (gap on the left sequence). `B[0, 0]` stays `0`.
- `T`: `traceback.(B)`, the pointers rendered as arrow symbols.
"""
function simpleGlobalAlignment(x, y)
    # Index by element rather than by byte so non-ASCII strings work too.
    xs, ys = collect(x), collect(y)
    n, m = length(xs), length(ys)

    L = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Score" matrix
    B = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Traceback" matrix

    # Boundary conditions are set up front so they are also correct when one of
    # the sequences is empty.
    for i in 1:n
        L[i, 0] = i
        B[i, 0] = 2 # gap on top sequence (deletion)
    end
    for j in 1:m
        L[0, j] = j
        B[0, j] = 3 # gap on left sequence (insertion)
    end

    for i in 1:n, j in 1:m
        # findmin returns (value, position); the position doubles as the pointer.
        # On ties the first candidate wins: substitution, then deletion, then insertion.
        L[i, j], B[i, j] = findmin([
            1 - (xs[i] == ys[j]) + L[i - 1, j - 1],
            1 + L[i - 1, j],
            1 + L[i, j - 1],
        ])
    end

    return L, B, traceback.(B)
end

"""
    traceback(x)

Map a traceback pointer code to its display symbol: `1` => "↖" (substitution),
`2` => "↑" (deletion), `3` => "←" (insertion), `4` => " ". Any other value yields
`nothing`.
"""
function traceback(x)
    x == 1 && return "↖" # substitution
    x == 2 && return "↑" # deletion
    x == 3 && return "←" # insertion
    x == 4 && return " "
    return nothing
end

traceback (generic function with 1 method)

## 2. Global, scoring scheme & linear gap penalties
---
### Scoring scheme
#### Substitution matrix
$$S(X,Y)=\sum_{i=1}^N\log{\frac{p_{x_iy_i}}{q_{x_i}q_{y_i}}}=\sum_{i=1}^Ns(x_i,y_i)$$
#### Gap score
$$\gamma(g)=\begin{cases}-dg & \text{linear gap penalty}\\-d-e(g-1),\quad e < d & \text{affine gap penalty}\end{cases}$$
---
Global, scoring scheme & linear gap penalties alignment between $X=(x_1,\dots,x_n)$ and $Y=(y_1,\dots,y_m)$.
- Initialization: $F(0,0)=0,\ F(i,0)=-id,\ F(0,j)=-jd\ \forall\ i,j$

- Recursion: $F(i,j)=\max{\begin{cases}F(i-1,j-1) + s(x_i,y_j)& \text{Substitution}\\F(i-1,j) - d & \text{Deletion (in X)}\\F(i,j-1) - d & \text{Insertion (in Y)}\end{cases}}$ for $\begin{cases}1\leq i \leq n \\ 1\leq j \leq m\end{cases}$

- Termination: $F(n,m)$ optimal score

Score $s(x_i,y_j)$ is taken from [BLOSUM50](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/BLOSUM50)

In [24]:
"""
    GlobalAlignment1(x, y; d = 8)

Fill the global alignment tables for `x` and `y`, scoring substitutions with
`blosum` and charging a linear gap penalty `d` per gap position.

Returns the tuple `(F, B, T)`, all indexed from `0:length(x)` by `0:length(y)`:
- `F[i, j]`: best score for aligning the first `i` elements of `x` with the first
  `j` elements of `y`; boundaries are `F[i, 0] = -d*i` and `F[0, j] = -d*j`.
- `B[i, j]`: traceback pointer, `1` = substitution/match, `2` = deletion (gap on the
  top sequence), `3` = insertion (gap on the left sequence). `B[0, 0]` stays `0`.
- `T`: `traceback.(B)`, the pointers rendered as arrow symbols.
"""
function GlobalAlignment1(x, y; d = 8)
    # Index by element rather than by byte so non-ASCII strings work too.
    xs, ys = collect(x), collect(y)
    n, m = length(xs), length(ys)

    F = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Score" matrix
    B = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Traceback" matrix

    # Boundary conditions are set up front so they are also correct when one of
    # the sequences is empty.
    for i in 1:n
        F[i, 0] = -d * i
        B[i, 0] = 2 # gap on top sequence (deletion)
    end
    for j in 1:m
        F[0, j] = -d * j
        B[0, j] = 3 # gap on left sequence (insertion)
    end

    for i in 1:n, j in 1:m
        # The substitution case extends the diagonal neighbour F[i-1, j-1].
        # findmax returns (value, position); the position doubles as the pointer.
        # On ties the first candidate wins: substitution, then deletion, then insertion.
        F[i, j], B[i, j] = findmax([
            F[i - 1, j - 1] + blosum(xs[i], ys[j]),
            F[i - 1, j] - d,
            F[i, j - 1] - d,
        ])
    end

    return F, B, traceback.(B)
end

GlobalAlignment1 (generic function with 1 method)

## 3. Local, scoring scheme & linear gap penalties
Local, scoring scheme & linear gap penalties alignment between $X=(x_1,\dots,x_n)$ and $Y=(y_1,\dots,y_m)$.
- Initialization: $F(0,0)=0,\ F(i,0)=0,\ F(0,j)=0\ \forall\ i,j$

- Recursion: $F(i,j)=\max{\begin{cases}F(i-1,j-1) + s(x_i,y_j)& \text{Substitution}\\F(i-1,j) - d & \text{Deletion (in X)}\\F(i,j-1) - d & \text{Insertion (in Y)}\\ 0 & \text{start new local alignment}\end{cases}}$ for $\begin{cases}1\leq i \leq n \\ 1\leq j \leq m\end{cases}$

- Termination: $\max_{i,j} F(i,j)$ optimal score, i.e. the largest entry anywhere in the score matrix $F$

Score $s(x_i,y_j)$ is taken from [BLOSUM50](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/BLOSUM50)

In [36]:
"""
    LocalAlignment1(x, y; d = 8)

Fill the local alignment tables for `x` and `y`, scoring substitutions with
`blosum` and charging a linear gap penalty `d` per gap position. Scores are
floored at zero, which corresponds to starting a new local alignment.

Returns the tuple `(F, B, T)`, all indexed from `0:length(x)` by `0:length(y)`:
- `F[i, j]`: best score of a local alignment ending at `x[i]`, `y[j]`; the
  boundary row and column are zero.
- `B[i, j]`: traceback pointer, `1` = substitution/match, `2` = deletion (gap on the
  top sequence), `3` = insertion (gap on the left sequence), `4` = start of a new
  local alignment. `B[0, 0]` stays `0`.
- `T`: `traceback.(B)`, the pointers rendered as symbols.
"""
function LocalAlignment1(x, y; d = 8)
    # Index by element rather than by byte so non-ASCII strings work too.
    xs, ys = collect(x), collect(y)
    n, m = length(xs), length(ys)

    F = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Score" matrix (boundaries stay 0)
    B = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Traceback" matrix

    # Boundary pointers are set up front so they are also correct when one of
    # the sequences is empty.
    for i in 1:n
        B[i, 0] = 2 # gap on top sequence (deletion)
    end
    for j in 1:m
        B[0, j] = 3 # gap on left sequence (insertion)
    end

    for i in 1:n, j in 1:m
        # The substitution case extends the diagonal neighbour F[i-1, j-1].
        # findmax returns (value, position); the position doubles as the pointer,
        # with position 4 (the literal 0) meaning "start a new local alignment".
        F[i, j], B[i, j] = findmax([
            F[i - 1, j - 1] + blosum(xs[i], ys[j]),
            F[i - 1, j] - d,
            F[i, j - 1] - d,
            0,
        ])
    end

    return F, B, traceback.(B)
end

LocalAlignment1 (generic function with 1 method)

## 4. Overlap, scoring scheme & linear gap penalties
Overlap, scoring scheme & linear gap penalties alignment between $X=(x_1,\dots,x_n)$ and $Y=(y_1,\dots,y_m)$.
- Initialization: $F(0,0)=0,\ F(i,0)=0,\ F(0,j)=0\ \forall\ i,j$

- Recursion: $F(i,j)=\max{\begin{cases}F(i-1,j-1) + s(x_i,y_j)& \text{Substitution}\\F(i-1,j) - d & \text{Deletion (in X)}\\F(i,j-1) - d & \text{Insertion (in Y)}\end{cases}}$ for $\begin{cases}1\leq i \leq n \\ 1\leq j \leq m\end{cases}$

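- Termination: $\max\{\max_i F(i,m),\ \max_j F(n,j)\}$ optimal score, i.e. the best value on the last row or last column of the score matrix $F$ (trailing overhangs are not penalised)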

Score $s(x_i,y_j)$ is taken from [BLOSUM50](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/BLOSUM50)

In [37]:
"""
    OverlapAlignment1(x, y; d = 8)

Fill the overlap alignment tables for `x` and `y`, scoring substitutions with
`blosum` and charging a linear gap penalty `d` per gap position. The boundary row
and column of the score matrix are zero, so leading overhangs cost nothing.

This function only fills the tables; it does not select a termination cell.

Returns the tuple `(F, B, T)`, all indexed from `0:length(x)` by `0:length(y)`:
- `F[i, j]`: score matrix.
- `B[i, j]`: traceback pointer, `1` = substitution/match, `2` = deletion (gap on the
  top sequence), `3` = insertion (gap on the left sequence). `B[0, 0]` stays `0`.
- `T`: `traceback.(B)`, the pointers rendered as arrow symbols.
"""
function OverlapAlignment1(x, y; d = 8)
    # Index by element rather than by byte so non-ASCII strings work too.
    xs, ys = collect(x), collect(y)
    n, m = length(xs), length(ys)

    F = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Score" matrix (boundaries stay 0)
    B = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Traceback" matrix

    # Boundary pointers are set up front so they are also correct when one of
    # the sequences is empty.
    for i in 1:n
        B[i, 0] = 2 # gap on top sequence (deletion)
    end
    for j in 1:m
        B[0, j] = 3 # gap on left sequence (insertion)
    end

    for i in 1:n, j in 1:m
        # The substitution case extends the diagonal neighbour F[i-1, j-1].
        # findmax returns (value, position); the position doubles as the pointer.
        # On ties the first candidate wins: substitution, then deletion, then insertion.
        F[i, j], B[i, j] = findmax([
            F[i - 1, j - 1] + blosum(xs[i], ys[j]),
            F[i - 1, j] - d,
            F[i, j - 1] - d,
        ])
    end

    return F, B, traceback.(B)
end

OverlapAlignment1 (generic function with 1 method)

# Alignment (full)

In [51]:
"""
    score(x, y; method)

Fill the alignment tables for `x` and `y` using the given `method` and a linear gap
score.

Returns the tuple `(F, B, T)`, all indexed from `0:length(x)` by `0:length(y)`:
- `F`: matrix of scores between the first `i` elements of `x` and the first `j`
  elements of `y`.
- `B`: matrix of traceback pointers: `1` for substitution/match, `2` for deletion
  (gap on the top sequence), `3` for insertion (gap on the left sequence) and, for
  `method = "local"` only, `4` for the start of a new local alignment.
- `T`: `traceback.(B)`, the pointers rendered as symbols.

# Keywords
- `method = "simple"`:  score is minus the number of edit moves (gap penalty 1,
  substitution cost from `simplecost`)
- `method = "global"`:  global alignment (gap penalty 8, scores from `blosum`)
- `method = "local"`:   local alignment (gap penalty 8, scores from `blosum`)
- `method = "overlap"`: overlap alignment (gap penalty 8, scores from `blosum`)

Any other `method` throws an `ArgumentError`.

The Levenshtein distance between `x` and `y` is
`-score(x, y; method = "simple")[1][end, end]`.
"""
function score(x, y; method)

    if method == "simple"
        d = 1
        cost = simplecost
    elseif method in ("global", "local", "overlap")
        d = 8 # linear gap penalty
        cost = blosum
    else
        throw(ArgumentError(
            "unknown method \"$method\"; expected \"simple\", \"global\", \"local\" or \"overlap\"",
        ))
    end

    # Index by element rather than by byte so non-ASCII strings work too.
    xs, ys = collect(x), collect(y)
    n, m = length(xs), length(ys)

    F = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Score" matrix
    B = OffsetArray(zeros(n + 1, m + 1), 0:n, 0:m) # "Traceback" matrix

    if method == "local" || method == "overlap"
        F[0:n,0] .= 0
        F[0,0:m] .= 0
    else
        F[0:n,0] .= (0:n) * (-d)
        F[0,0:m] .= (0:m) * (-d)
    end

    B[0:n,0] .= 2 # gaps on top sequence (deletions)
    B[0,0:m] .= 3 # gaps on left sequence (insertions)

    islocal = method == "local" # decided once, outside the hot loop

    for i = 1:n
        for j = 1:m
            # findmax returns (value, position); the position doubles as the pointer.
            # On ties the first candidate wins: substitution, deletion, insertion.
            candidates = [F[i-1,j-1] + cost(xs[i], ys[j]), F[i-1,j] - d, F[i,j-1] - d]
            # Local alignment may also restart from score 0 (pointer 4).
            islocal && push!(candidates, 0)
            F[i,j], B[i,j] = findmax(candidates)
        end
    end

    return F, B, traceback.(B)

end

# Unit edit cost used by the "simple" method: 0 when `a` and `b` are equal, -1 otherwise.
simplecost(a, b) = a != b ? -1 : 0

simplecost (generic function with 1 method)