# Odległość edycyjna

In [1]:
import numpy as np
from unidecode import unidecode
from queue import PriorityQueue
from enum import Enum
from bisect import bisect
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from random import sample

In [2]:
class Operation(Enum):
    ADD = 1
    DELETE = 2
    SWAP = 3
    NO_CHANGE = 4
    
    
def delta(a , b):
    if a == b: return 0
    elif unidecode(a) == unidecode(b): return .5
    else: return 1


def edit_distance(x, y, delta):
    n, m = len(x)+1, len(y)+1

    queue = PriorityQueue()
    operation = [[None for _ in range(m)] for _ in range(n)]
    parent = [[None for _ in range(m)] for _ in range(n)]
    distance = np.empty((n, m))
    distance.fill(np.inf)

    def relax(u, v, opr):
        nonlocal queue, distance
        w = 1
        if opr == Operation.SWAP:
            w = delta(x[v[0]-1], y[v[1]-1])
            if w == 0:
                opr = Operation.NO_CHANGE

        if distance[v[0]][v[1]] > distance[u[0]][u[1]] + w:
            distance[v[0]][v[1]] = distance[u[0]][u[1]] + w
            parent[v[0]][v[1]] = u
            operation[v[0]][v[1]] = opr
            queue.put((distance[v[0]][v[1]], v))
    
    s, t = (0, 0), (n-1, m-1)

    distance[s[0]][s[1]] = 0
    queue.put((0, s))
    while not queue.empty():
        _, u = queue.get()
        
        if u == t:
            break
        i, j = u
        
        if i+1 < n:
            relax(u, (i+1, j), Operation.DELETE)
            
        if j+1 < m:
            relax(u, (i, j+1), Operation.ADD)
            
        if i+1 < n and j+1 < m:
            relax(u, (i+1, j+1), Operation.SWAP)

    u = t
    seq = []
    while u != s:
        seq.insert(0, operation[u[0]][u[1]])
        u = parent[u[0]][u[1]]
    
    return distance[-1, -1], seq
  

In [3]:
def mark_letter(word, index):
    return word[:index] + '*' + word[index] + '*' + word[index+1:]

def swap_letter(word1, word2, i, j):
    return word1[:i] + word2[j] + word1[i+1:]

def add_letter(word1, word2, i, j):
    return word1[:i] + word2[j] + word1[i:]

def remove_letter(word1, i):
    return word1[:i] + word1[i+1:]

def visualize(sequance, x, y):
    print(f'x = {x}')
    print(f'y = {y}\n')
    cur_x = x
    i = 0
    j = 0
    k = 1
    for operation in sequance:
        if operation == Operation.SWAP:
            last_letter = cur_x[i]
            cur_x = swap_letter(cur_x, y, i, j)
            print(f'{k}. {mark_letter(cur_x, i)} (swap: {last_letter} -> {y[j]})')
            i+=1
            j+=1
        elif operation == Operation.ADD:
            cur_x = add_letter(cur_x, y, i, j)
            print(f'{k}. {mark_letter(cur_x, i)} (add: {y[j]})')
            i+=1
            j+=1
        elif operation == Operation.DELETE:
            last_x = cur_x
            cur_x = remove_letter(cur_x, i)
            print(f'{k}. {mark_letter(last_x, i)} (remove: {last_x[i]})')
        elif operation == Operation.NO_CHANGE:
            i+=1
            j+=1
            k-=1
        k+=1
            
    print(f'\nResult: {cur_x}')
            

In [4]:
x, y = 'los', 'kloc'
d, s = edit_distance(x, y, delta)
visualize(s, x, y)
print(f'Edit distance: {d}\n')

x = los
y = kloc

1. *k*los (add: k)
2. klo*c* (swap: s -> c)

Result: kloc
Edit distance: 2.0



In [5]:
x, y = 'Łódź', 'Lodz'
d, s = edit_distance(x, y, delta)
visualize(s, x, y)
print(f'Edit distance: {d}\n')

x = Łódź
y = Lodz

1. *L*ódź (swap: Ł -> L)
2. L*o*dź (swap: ó -> o)
3. Lod*z* (swap: ź -> z)

Result: Lodz
Edit distance: 1.5



In [6]:
x, y = 'kwintesencja', 'quintessence'
d, s = edit_distance(x, y, delta)
visualize(s, x, y)
print(f'Edit distance: {d}\n')

x = kwintesencja
y = quintessence

1. *q*wintesencja (swap: k -> q)
2. q*u*intesencja (swap: w -> u)
3. quintes*s*encja (add: s)
4. quintessenc*j*a (remove: j)
5. quintessenc*e* (swap: a -> e)

Result: quintessence
Edit distance: 5.0



In [7]:
x, y = 'ATGAATCTTACCGCCTCG', 'ATGAGGCTCTGGCCCCTG'
d, s = edit_distance(x, y, delta)
visualize(s, x, y)
print(f'Edit distance: {d}\n')

x = ATGAATCTTACCGCCTCG
y = ATGAGGCTCTGGCCCCTG

1. ATGA*G*TCTTACCGCCTCG (swap: A -> G)
2. ATGAG*G*CTTACCGCCTCG (swap: T -> G)
3. ATGAGGCT*C*TACCGCCTCG (add: C)
4. ATGAGGCTCT*G*CCGCCTCG (swap: A -> G)
5. ATGAGGCTCTG*G*CGCCTCG (swap: C -> G)
6. ATGAGGCTCTGGC*C*CCTCG (swap: G -> C)
7. ATGAGGCTCTGGCCCCT*C*G (remove: C)

Result: ATGAGGCTCTGGCCCCTG
Edit distance: 7.0



# Najdłuższy wspólny podciąg

In [8]:
def delta2(x, y):
    return 0 if x == y else 2

def lcs1(x, y):
    return (len(x) + len(y) - edit_distance(x, y, delta2)[0])/2

def lcs2(x,y):
    ranges = []
    ranges.append(len(y)) 
    y_letters = list(y)
    for i in range(len(x)):
        positions = [j for j, l in enumerate(y_letters) if l == x[i]]
        positions.reverse()
        for p in positions:
            k = bisect(ranges, p)
            if(k == bisect(ranges, p-1)):
                if(k < len(ranges) - 1):
                    ranges[k] = p
                else:
                    ranges[k:k] = [p]
    return len(ranges) - 1

In [9]:
def remove_random_tokens(tokens, percent):
    n = len(tokens)
    i_to_remove = sample(range(n), int(n*percent))
    result = []
    for i, token in enumerate(tokens):
        if i in i_to_remove:
            continue
        result.append(token)
    return result

In [27]:
with open('romeo-i-julia-700.txt', 'r') as file:
    text = file.read()
    vocab = Language(Vocab()).vocab
    tokenizer = Tokenizer(vocab)
    tokens = tokenizer(text)
    tokens_cut_1 = remove_random_tokens(tokens, 0.03)
    tokens_cut_2 = remove_random_tokens(tokens, 0.03)
    
with open('text1.txt', 'w') as file:
    for token in tokens_cut_1:
        file.write(token.text_with_ws)
        
with open('text2.txt', 'w') as file:
    for token in tokens_cut_2:
        file.write(token.text_with_ws)
    
print(f'Liczba tokenów oryginalnego tekstu: {len(tokens)}')
print(f'Liczba tokenów skróconego tekstu: {len(tokens_cut_1)}\n')
print(f'Długość najdłuższego wspólnego podciągu tokenów skróconych tekstów: {lcs2(tokens_cut_1, tokens_cut_2)}')
  

Liczba tokenów oryginalnego tekstu: 2272
Liczba tokenów skróconego tekstu: 2204

Długość najdłuższego wspólnego podciągu tokenów skróconych tekstów: 2138


In [28]:
with open("text1.txt", "r", encoding="utf-8") as f:
    text1 = f.read()

with open("text2.txt", "r", encoding="utf-8") as f:
    text2 = f.read()

In [29]:
def diff(x, y):
    x, y = x.split('\n'), y.split('\n')
     
    d, sequance = edit_distance(x, y, delta)
    i = 0
    j = 0
    for operation in sequance:
        if operation == Operation.ADD:
            print(f"> ({j+1}) {y[j]}")
            j+=1
        elif operation == Operation.DELETE:
            print(f"< ({i + 1}) {x[i]}")
            i+=1
        elif operation == Operation.NO_CHANGE:
            i+=1
            j+=1

In [30]:
diff(text1, text2)

< (10) OSOBY:
< (11)  * ESKALUS — książę panujący w Weronie
> (44) Dwa rody, zacne jednako i sławne —
> (45) Tam, gdzie się rzecz ta rozgrywa, w 
> (153) / Wchodzą Abraham i Baltazar. /
> (154) 
> (155) 
> (156) SAMSON
> (157) 
< (212) 
< (214) 
< (221) 
< (222) Zaczepki waść szukasz?
< (223) 
< (274) 
< (276) 
< (277) 
< (278) TYBALT
> (282) 
> (283) 
> (298) 
< (310) Co za hałas? Podajcie mi długi
> (325) Ha! nędzny Kapulecie!
> (390) 
> (392) PANI MONTEKI
> (448) MONTEKI
> (450) Wybadywałem i sam, i przez drugich,
> (452) im jest wierny, tak zamknięty w sobie,
> (465) Oto nadchodzi. Odstąpcie na stronę;
< (482) 
< (483)                         Jeszcze–ż nie południe?
< (486) BENWOLIO
< (487) 
< (491) ROMEO
< (493)                         Jak nudnie
> (501) Tak jest. Lecz cóż tak chwile twoje dłuży?
> (505) 
> (507) 
< (534) Miłość na oślep zawsze cel swój goni!
< (535) Gdzież dziś jeść będziem? Ach! Był tu podobno
< (536) Jakiś spór? Nie mów mi o nim, wiem wszystko.
< (574) Morze łe