# Sequence Alignment

## 1st approach: alignment based on sequence distance

- Hamming
- Levenshtein

## 2nd approach: scoring scheme

### Hamming distance

In [1]:
"""
    hamming(x, y)

Return the Hamming distance between `x` and `y`: the number of positions at
which the two sequences hold different elements.

Both sequences must have the same `length`. When they do not, an error message
is printed and `nothing` is returned, so callers should check the result before
treating it as a number.

Elements are compared by iterating both sequences in lockstep rather than by
integer index, so strings containing multi-byte (non-ASCII) characters work.
"""
function hamming(x, y)
    if length(x) != length(y)
        println("ERROR: sequences should have equal lenghts!")
        return
    end
    # `length` of a String counts characters while `x[i]` indexes bytes, so an
    # index loop breaks on multi-byte characters; `zip` pairs up the elements
    # themselves and avoids the mismatch.
    return count(((a, b),) -> a != b, zip(x, y))
end

hamming (generic function with 1 method)

In [2]:
# Two equal-length toy strings to try hamming on.
x, y = "wheeaaa", "ghearpa"
d = hamming(x, y)

4

### How to read real sequences from an online database

In [3]:
using FastaIO
#using Images
using OffsetArrays
using PyPlot
using DelimitedFiles
using BenchmarkTools
using StatsBase
using LinearAlgebra
using Printf
using HTTP

In [4]:
"""
    sequenceDownload(sequence)

Download the FASTA file named `sequence * ".fasta"` from
`https://www.uniprot.org/uniprotkb/`, save it under that same file name in the
current working directory, and return the second field of the first record
that `FastaIO.readfasta` reads back from the saved file.
"""
function sequenceDownload(sequence)
    fastaPath = sequence * ".fasta"
    response = HTTP.get("https://www.uniprot.org/uniprotkb/" * fastaPath)

    # Save the response body to disk (overwriting any existing file of the
    # same name); FastaIO then parses it from that file.
    write(fastaPath, String(response.body))

    records = FastaIO.readfasta(fastaPath)
    firstRecord = records[1]
    return firstRecord[2]
end

sequenceDownload (generic function with 1 method)

In [7]:
# Fetch the five sequences compared in the rest of the notebook. Each call
# downloads "<id>.fasta" and leaves a copy of that file in the working directory.
HBB_Human = sequenceDownload("P68871")
HBA_Bonobo = sequenceDownload("P69906")
HBA_Chimp = sequenceDownload("P69907")
HBA_Donkey = sequenceDownload("P01959")
LegHem = sequenceDownload("P02240")

"MGALTESQAALVKSSWEEFNANIPKHTHRFFILVLEIAPAAKDLFSFLKGTSEVPQNNPELQAHAGKVFKLVYEAAIQLQVTGVVVTDATLKNLGSVHVSKGVADAHFPVVKEAILKTIKEVVGAKWSEELNSAWTIAYDELAIVIKKEMNDAA"

In [8]:
# These two sequences have different lengths, so hamming prints its error
# message and returns nothing; d is therefore `nothing`, not a number.
d = hamming(HBB_Human,HBA_Bonobo)

ERROR: sequences should have equal lenghts!


In [9]:
# Print every sequence length: hamming only yields a distance for pairs whose
# lengths are equal.
@show length(HBB_Human)
@show length(HBA_Bonobo)
@show length(HBA_Chimp)
@show length(HBA_Donkey)
@show length(LegHem)

length(HBB_Human) = 147
length(HBA_Bonobo) = 142
length(HBA_Chimp) = 142
length(HBA_Donkey) = 142
length(LegHem) = 154


154

In [69]:
# Pairwise Hamming distances among the three sequences that share the same
# length.
@show hamming(HBA_Bonobo,HBA_Chimp)
@show hamming(HBA_Bonobo,HBA_Donkey)
@show hamming(HBA_Chimp,HBA_Donkey)

hamming(HBA_Bonobo, HBA_Chimp) = 0
hamming(HBA_Bonobo, HBA_Donkey) = 20
hamming(HBA_Chimp, HBA_Donkey) = 20


20

### Levenshtein distance: recursive

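$$L(i,j) = \min{\begin{cases}1-\delta_{x_i,y_j}+L(i-1,j-1)\\1+L(i-1,j)\\1+L(i,j-1)\end{cases}}, \qquad L(i,0)=i,\quad L(0,j)=j$$

Here $L(i,j)$ is the distance between the first $i$ characters of $x$ and the first $j$ characters of $y$, and $\delta_{x_i,y_j}$ equals 1 when $x_i = y_j$ and 0 otherwise.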

In [11]:
"""
    leven(x, y)

Return the Levenshtein (edit) distance between the sequences `x` and `y`: the
minimum number of single-element insertions, deletions and substitutions that
turn one into the other.

The distance is computed with the memoised recursion

    L(i, j) = min(1 - δ(x[i], y[j]) + L(i-1, j-1), 1 + L(i-1, j), 1 + L(i, j-1))

with `L(i, 0) = i` and `L(0, j) = j`, where `i` and `j` are prefix lengths.
`x` and `y` may have different lengths and may be strings (including
non-ASCII ones) or any other collections of comparable elements.
"""
function leven(x, y)
    # Materialise the elements once so that positions are plain 1-based
    # integers. Slicing a String with `x[1:end-1]` does byte arithmetic and
    # throws on multi-byte characters, and it also copies the prefix on every
    # call.
    xs = collect(x)
    ys = collect(y)

    # Sub-results keyed by the pair of prefix lengths (i, j).
    memo = Dict{Tuple{Int,Int},Int}()

    function levenshtein(i, j)
        i == 0 && return j
        j == 0 && return i
        haskey(memo, (i, j)) && return memo[(i, j)]

        substitution = (xs[i] == ys[j] ? 0 : 1) + levenshtein(i - 1, j - 1)
        deletion = 1 + levenshtein(i - 1, j)
        insertion = 1 + levenshtein(i, j - 1)

        best = min(substitution, deletion, insertion)
        memo[(i, j)] = best
        return best
    end

    return levenshtein(length(xs), length(ys))
end

leven (generic function with 1 method)

In [72]:
# Levenshtein distance for the same three pairs that were compared with
# hamming, so the two measures can be read side by side.
@show leven(HBA_Bonobo,HBA_Chimp);
@show leven(HBA_Bonobo,HBA_Donkey);
@show leven(HBA_Chimp,HBA_Donkey);

leven(HBA_Bonobo, HBA_Chimp) = 0
leven(HBA_Bonobo, HBA_Donkey) = 20
leven(HBA_Chimp, HBA_Donkey) = 20


In [73]:
# leven has no equal-length requirement, so HBB_Human can be compared with
# the other sequences even though their lengths differ.
@show leven(HBB_Human,HBA_Bonobo);
@show leven(HBB_Human,HBA_Chimp);
@show leven(HBB_Human,HBA_Donkey);
@show leven(HBB_Human,LegHem);

leven(HBB_Human, HBA_Bonobo) = 84
leven(HBB_Human, HBA_Chimp) = 84
leven(HBB_Human, HBA_Donkey) = 84
leven(HBB_Human, LegHem) = 119
