Agnieszka Dutka
# Laboratory 4 - Edit distance and longest common subsequence

Contents:  
[Edit distance](#ed)   
[Editing visualization](#vis)  
[LCS](#lsi)  
[Diff](#diff)

&nbsp;&nbsp;&nbsp;

In [14]:
import numpy as np
from bisect import bisect
from unidecode import unidecode

<a id='ed'></a>
### Edit distance

**_space complexity O(m x n)_**

In [15]:
def edit_distance(x, y, delta, whole_array=False):
    """Compute the edit distance between the sequences ``x`` and ``y``.

    A full ``(len(x) + 1) x (len(y) + 1)`` NumPy table is filled, where cell
    ``[row, col]`` holds the cost of turning ``x[:row]`` into ``y[:col]``.
    Insertions and deletions cost 1; replacing ``x[i]`` with ``y[j]`` costs
    ``delta(x[i], y[j])``.

    Returns the whole table when ``whole_array`` is true, otherwise only its
    bottom-right cell (a NumPy float).
    """
    rows, cols = len(x) + 1, len(y) + 1
    table = np.zeros((rows, cols))
    # Border cells: cost of reaching / leaving the empty prefix.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)
    for row, x_item in enumerate(x, start=1):
        for col, y_item in enumerate(y, start=1):
            from_above = table[row - 1, col] + 1
            from_left = table[row, col - 1] + 1
            diagonal = table[row - 1, col - 1] + delta(x_item, y_item)
            table[row, col] = min(from_above, from_left, diagonal)
    return table if whole_array else table[rows - 1, cols - 1]

**_space complexity O(min{m, n})_**

In [17]:
def edit_distance2(x, y, delta):
    """Compute the edit distance while keeping only one table row in memory.

    The shorter of the two sequences defines the row width, so the extra
    memory is proportional to ``min(len(x), len(y))``. Insertions and
    deletions cost 1; a substitution costs ``delta(short_item, long_item)``,
    i.e. the element of the shorter sequence is always the first argument.
    """
    short, long_ = (y, x) if len(x) > len(y) else (x, y)
    previous = list(range(len(short) + 1))
    for i, long_item in enumerate(long_, start=1):
        current = [i]
        for j, short_item in enumerate(short, start=1):
            current.append(min(
                current[j - 1] + 1,
                previous[j] + 1,
                previous[j - 1] + delta(short_item, long_item),
            ))
        previous = current
    return previous[-1]

#### Delta functions

In [18]:
def delta1(a, b):
    """Classic substitution cost: 0 for equal items, 1 otherwise."""
    return 0 if a == b else 1

def delta2(a, b):
    """Substitution cost for the "no swap" variant.

    A mismatch costs 2, the same as one deletion plus one insertion.
    """
    return 0 if a == b else 2

def delta3(a, b):
    """Substitution cost that is lenient towards transliteration differences.

    Returns 0 for equal items, 0.5 when the items differ but their
    ``unidecode`` transliterations are equal, and 1 otherwise.
    """
    if a == b:
        return 0
    return 0.5 if unidecode(a) == unidecode(b) else 1


<a id='path'></a>
### Path finding and visualization

An edit distance algorithm that returns both the minimum distance and the path, i.e. the sequence of edit operations that achieves it.

In [19]:
def get_path(x, y, delta):
    """Return ``(cost, ops)`` for a cheapest edit script turning ``x`` into ``y``.

    ``ops`` is a string with one letter per step: ``'i'`` inserts the next
    item of ``y``, ``'d'`` deletes the next item of ``x`` and ``'s'`` is a
    diagonal step (substitution, or a match when the items are equal).
    Insertions and deletions cost 1; a diagonal step costs
    ``delta(y_item, x_item)``.

    Candidates are compared as ``(cost, ops)`` tuples, so ties on cost are
    broken by the lexicographic order of the operation strings.
    """
    source, target = x, y
    # Row for the empty source prefix: reaching target[:n] takes n insertions.
    previous = [(n, 'i' * n) for n in range(len(target) + 1)]
    for i, source_item in enumerate(source, start=1):
        # First column: reaching the empty target takes i deletions.
        current = [(i, 'd' * i)]
        for j, target_item in enumerate(target, start=1):
            left_cost, left_ops = current[j - 1]
            up_cost, up_ops = previous[j]
            diag_cost, diag_ops = previous[j - 1]
            current.append(min(
                (left_cost + 1, left_ops + 'i'),
                (up_cost + 1, up_ops + 'd'),
                (diag_cost + delta(target_item, source_item), diag_ops + 's'),
            ))
        previous = current
    return previous[-1]

def edit_distance_vis(a: str, b: str):
    """Print, step by step, how ``a`` is edited into ``b``.

    The edit script comes from ``get_path`` with the ``delta3`` cost. The
    starting text is printed first; afterwards every substitution, insertion
    and deletion prints the current text with the affected position wrapped
    in ``*`` markers. Diagonal steps over equal characters print nothing.
    """
    _, operations = get_path(a, b, delta3)
    chars = list(a)
    pos_a = pos_b = 0
    print(''.join(chars), "<- start")
    for op in operations:
        if op == 'd':
            # Show an empty marker where the character is about to be removed.
            before, after = chars[:pos_a], chars[pos_a + 1:]
            print(''.join(before + ['*', '*'] + after), "\t[del]")
            del chars[pos_a]
            continue
        if op == 'i':
            chars.insert(pos_a, b[pos_b])
            before, after = chars[:pos_a], chars[pos_a + 1:]
            print(''.join(before + ['*', chars[pos_a], '*'] + after), "\t[ins]")
        elif op == 's':
            if chars[pos_a] != b[pos_b]:
                chars[pos_a] = b[pos_b]
                before, after = chars[:pos_a], chars[pos_a + 1:]
                print(''.join(before + ['*', chars[pos_a], '*'] + after), "\t[swap]")
        else:
            continue
        # Insertions and diagonal steps both consume one character of b.
        pos_a += 1
        pos_b += 1


In [20]:
# Show the full DP table for "kast" vs "plotek" using the unit-cost delta1.
print(edit_distance("kast", "plotek", delta1, True))

[[0. 1. 2. 3. 4. 5. 6.]
 [1. 1. 2. 3. 4. 5. 5.]
 [2. 2. 2. 3. 4. 5. 6.]
 [3. 3. 3. 3. 4. 5. 6.]
 [4. 4. 4. 4. 3. 4. 5.]]


<a id='vis'></a>
### Visualization for given strings

In [21]:
# Trace the edit steps that turn "los" into "kloc".
edit_distance_vis("los", "kloc")

los <- start
*k*los 	[ins]
klo*c* 	[swap]


In [22]:
# The same word spelled with and without diacritics.
edit_distance_vis("Łódź", "Lodz")

Łódź <- start
*L*ódź 	[swap]
L*o*dź 	[swap]
Lod*z* 	[swap]


In [9]:
# A longer pair whose edit script mixes substitutions, an insertion and a deletion.
edit_distance_vis("kwintesencja", "quintessence")

kwintesencja <- start
*q*wintesencja 	[swap]
q*u*intesencja 	[swap]
quinte*s*sencja 	[ins]
quintessenc**a 	[del]
quintessenc*e* 	[swap]


In [10]:
# Two equal-length strings over the alphabet A/C/G/T.
edit_distance_vis("ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG")

ATGAATCTTACCGCCTCG <- start
ATGA*G*ATCTTACCGCCTCG 	[ins]
ATGAG*G*ATCTTACCGCCTCG 	[ins]
ATGAGG*C*TCTTACCGCCTCG 	[swap]
ATGAGGCTCT*G*ACCGCCTCG 	[swap]
ATGAGGCTCTG*G*CCGCCTCG 	[swap]
ATGAGGCTCTGGCC**CCTCG 	[del]
ATGAGGCTCTGGCCCCT**G 	[del]


<a id='lsi'></a>
### Longest common subsequence

In [110]:

def lcs1(x, y):
    """Length of the longest common subsequence, derived from edit distance.

    Uses ``edit_distance`` with the ``delta2`` cost (a mismatch costs 2, the
    same as a deletion plus an insertion) and returns
    ``(len(x) + len(y) - distance) / 2``. The result is a float because of
    the true division.
    """
    combined_length = len(x) + len(y)
    distance = edit_distance(x, y, delta2)
    return (combined_length - distance) / 2

def lcs2(x: list, y: list):
    """Length of the longest common subsequence of two lists.

    Elements only need to support ``==``. A sorted list of positions in
    ``y`` is maintained, terminated by the sentinel ``len(y)``; the number
    of entries in front of the sentinel is the answer.
    """
    thresholds = [len(y)]  # sentinel: larger than every valid index of y
    for item in x:
        matches = [idx for idx, candidate in enumerate(y) if candidate == item]
        # Highest positions first, so one item of x cannot extend a
        # subsequence that it has just created itself.
        for pos in reversed(matches):
            slot = bisect(thresholds, pos)
            if slot != bisect(thresholds, pos - 1):
                continue  # pos is already stored
            if slot < len(thresholds) - 1:
                thresholds[slot] = pos  # tighten an existing entry
            else:
                thresholds.insert(slot, pos)  # new entry just before the sentinel
    return len(thresholds) - 1


In [111]:
# Sanity check: both LCS implementations should agree on this pair.
lcs1('cbabac','abcabba') == lcs2(list('cbabac'),list('abcabba'))

True

<a id='romeo'></a>
### Romeo & Juliet


In [52]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.pl import Polish
from typing import List

#### Tokenize Romeo & Juliet

In [9]:
nlp = spacy.blank("pl")
# NOTE(review): Defaults.create_tokenizer looks like the spaCy 2.x API -
# confirm the installed spaCy version still provides it.
tokenizer = nlp.Defaults.create_tokenizer(nlp)

# Read the whole text in one go; the context manager closes the file handle,
# which the previous bare open() call left open.
with open("romeo-i-julia.txt", encoding='utf-8') as f:
    rnj_text = f.read()
rnj_tok = tokenizer(rnj_text)

Delete random tokens

In [99]:
from random import random
def delete_random(tokens, part: float):
    """Return a list of the tokens that survive a random purge.

    A token is kept when its ``text`` is a single newline character, or when
    a fresh ``random()`` draw is at least ``part``; no draw is made for
    newline tokens.
    """
    return [tok for tok in tokens if tok.text == "\n" or random() >= part]

def save(tokens, to_file: str):
    """Write the tokens to ``to_file`` as UTF-8 text.

    Each token contributes its ``text_with_ws`` attribute, written in order
    with nothing added in between. An existing file is overwritten.
    """
    # The with-block closes the file; the former bare `f.close` expression
    # never called the method and has been dropped.
    with open(to_file, 'w', encoding='utf-8') as f:
        f.writelines(token.text_with_ws for token in tokens)

#### Create 2 files with random tokens removed

In [105]:
# Two independently damaged copies; each call uses part=0.03.
rnj_tok1, rnj_tok2 = (delete_random(rnj_tok, 0.03) for _ in range(2))

# Report how many tokens each version contains.
print("original tokens:", len(rnj_tok))
print("tokens in rnj_tok1:", len(rnj_tok1))
print("tokens in rnj_tok2:", len(rnj_tok2))

# Write both damaged versions to disk.
save(rnj_tok1, "romeo-i-julia1.txt")
save(rnj_tok2, "romeo-i-julia2.txt")


original tokens: 32009
tokens in rnj_tok1: 31133
tokens in rnj_tok2: 31148


In [115]:
def lcs_matrix(x, y):
    """Build the LCS-length table for the sequences ``x`` and ``y``.

    Works on any elements that support ``==``. In the returned list of
    lists, ``c[i][j]`` is the LCS length of ``x[:i + 1]`` and ``y[:j + 1]``.
    The table has one extra row and one extra column that are never written
    and stay 0: looking up index -1 wraps around to them, which supplies the
    "empty prefix" value without a separate boundary case.
    """
    width = len(y) + 1
    c = [[0] * width for _ in range(len(x) + 1)]
    for i in range(len(x)):
        row = c[i]
        above = c[i - 1]  # for i == 0 this is the all-zero padding row
        for j in range(len(y)):
            if x[i] == y[j]:
                row[j] = above[j - 1] + 1
            else:
                row[j] = max(row[j - 1], above[j])
    return c

def print_diff(c, x, y, i, j):
    """Print the differences between ``x[:i + 1]`` and ``y[:j + 1]``.

    ``c`` is an LCS-length table in the layout produced by ``lcs_matrix``:
    ``c[i][j]`` is the LCS length of ``x[:i + 1]`` and ``y[:j + 1]``, with a
    zero padding row and column reachable through index -1.

    Elements present only in ``y`` are printed as ``>>> [j] item`` and
    elements present only in ``x`` as ``<< [i] item``, in forward order.

    The walk is iterative, so long inputs do not hit Python's recursion
    limit. Returns ``""`` when both indices are negative (kept for backward
    compatibility) and ``None`` otherwise.
    """
    if i < 0 and j < 0:
        return ""

    pending = []  # lines are discovered back-to-front
    while i >= 0 or j >= 0:
        if i < 0:
            only_in_y = True
        elif j < 0:
            only_in_y = False
        elif x[i] == y[j]:
            # Common element: nothing to report, step diagonally.
            i, j = i - 1, j - 1
            continue
        else:
            # Index -1 deliberately wraps to the zero padding of c.
            only_in_y = c[i][j - 1] >= c[i - 1][j]

        if only_in_y:
            pending.append(f">>> [{j}] {y[j]}")
            j -= 1
        else:
            pending.append(f"<< [{i}] {x[i]}")
            i -= 1

    for line in reversed(pending):
        print(line)
        
def diff(x, y):
    """Print the differences between the sequences ``x`` and ``y``.

    Builds the LCS table with ``lcs_matrix`` and hands it to ``print_diff``,
    starting from the last element of each sequence. Returns whatever
    ``print_diff`` returns.
    """
    table = lcs_matrix(x, y)
    last_x, last_y = len(x) - 1, len(y) - 1
    return print_diff(table, x, y, last_x, last_y)


In [114]:
# Line-level diff of the two damaged copies, limited to the first 100 lines of each file.
with open("romeo-i-julia1.txt", encoding='utf-8') as f1, open("romeo-i-julia2.txt", encoding='utf-8') as f2:
        diff(f1.readlines()[:100], f2.readlines()[:100])

<< [2] i Julia

>>> [2] Romeo i Julia

<< [11]  * — młody Weroneńczyk szlachetnego rodu, krewny księcia

<< [12]  * MONTEKI, KAPULET — naczelnicy dwóch domów nieprzyjaznych 

<< [13]  * STARZEC stryjeczny brat Kapuleta

<< [14]  * — syn Montekiego

<< [15]  * MERKUCJO krewny księcia

>>> [11]  * PARYS — młody Weroneńczyk szlachetnego rodu, krewny księcia

>>> [12]  * MONTEKI, KAPULET — naczelnicy dwóch domów nieprzyjaznych sobie

>>> [13]  * STARZEC — stryjeczny brat Kapuleta

>>> [14]  * ROMEO — syn Montekiego

>>> [15]  * MERKUCJO — krewny księcia

<< [21]  * SAMSON, GRZEGORZ słudzy Kapuleta

>>> [21]  * SAMSON, GRZEGORZ — słudzy Kapuleta

<< [32]  * Obywatele weroneńscy, różne osoby płci obojej, liczący się do przyjaciół obu domów, maski, straż wojskowa i inne osoby.

<< [33] 

<< [34] 

>>> [32]  * Obywatele weroneńscy, różne osoby płci obojej, liczący się do przyjaciół obu domów, maski, straż wojskowa i inne .

<< [37] Rzecz odbywa się przez większą część sztuki w Weronie, przez c