Anna Mrukwa  
Makrokierunek sem.5

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from time import time
# Render floats in scientific notation with four decimals.
pd.set_option('display.float_format', '{:.4E}'.format)
# Allow up to 100 characters per cell before pandas truncates the text.
pd.options.display.max_colwidth = 100

# LCS algorithm

### Path table:  
0 - do nothing (border row/column, traceback stops here)  
1 - go up  
2 - go left  
3 - go up-left (characters match, the character belongs to the LCS)

In [2]:
def LCS_rec(word1, word2):
  """Length of the longest common subsequence, computed by plain recursion.

  No memoization is used on purpose: this is the naive variant that the
  notebook compares against the dynamic-programming version.

  Parameters:
    word1, word2: sequences (strings) to compare.

  Returns:
    (length, elapsed): the LCS length and the wall-clock time in seconds
    spent on the whole computation.
  """
  def lcs_len(n1, n2):
    # LCS length of the prefixes word1[:n1] and word2[:n2]. Working on
    # prefix lengths avoids copying the strings in every frame.
    if n1 == 0 or n2 == 0:
      return 0
    if word1[n1-1] == word2[n2-1]:
      return lcs_len(n1-1, n2-1) + 1
    return max(lcs_len(n1, n2-1), lcs_len(n1-1, n2))

  # The clock is read once around the whole recursion instead of in every
  # frame, so the timer calls themselves do not inflate the measurement.
  t0 = time()
  res = lcs_len(len(word1), len(word2))
  return res, time()-t0

In [3]:
def LCS_DP(word1, word2):
  """Length of the longest common subsequence via dynamic programming.

  Parameters:
    word1: sequence indexing the rows of the tables.
    word2: sequence indexing the columns of the tables.

  Returns:
    (length, path_table, elapsed) where path_table has shape
    (len(word1)+1, len(word2)+1) and stores the traceback direction of
    every cell (0 border, 1 up, 2 left, 3 up-left), and elapsed is the
    wall-clock time in seconds.
  """
  height, width = len(word1)+1, len(word2)+1
  started = time()
  lengths = np.zeros((height, width), dtype=int)
  moves = np.zeros((height, width), dtype=int)
  for r, ch1 in enumerate(word1, start=1):
    for c, ch2 in enumerate(word2, start=1):
      if ch1 == ch2:
        # Matching characters extend the LCS of both shorter prefixes.
        lengths[r, c] = lengths[r-1, c-1] + 1
        moves[r, c] = 3
        continue
      above = lengths[r-1, c]
      left = lengths[r, c-1]
      # On a tie the cell above wins, so "left" needs a strict inequality.
      if left > above:
        lengths[r, c] = left
        moves[r, c] = 2
      else:
        lengths[r, c] = above
        moves[r, c] = 1
  return lengths[-1, -1], moves, time()-started

In [4]:
def print_LCS(path_table, word1):
  """Rebuild one longest common subsequence from a traceback table.

  Parameters:
    path_table: 2-D array of direction markers (1 up, 2 left, 3 up-left)
      as returned by LCS_DP; row 0 and column 0 are the border.
    word1: the sequence that indexes the rows of path_table, i.e. the
      first word that was passed to LCS_DP.

  Returns:
    The reconstructed subsequence as a string.
  """
  picked = []  # matched characters, collected from last to first
  nrows = path_table.shape[0] - 1
  ncols = path_table.shape[1] - 1
  # Row 0 and column 0 stand for an empty prefix, so nothing can be matched
  # once either index reaches the border.
  while nrows > 0 and ncols > 0:
    step = path_table[nrows, ncols]
    if step == 3: # go diagonally
      picked.append(word1[nrows-1]) # table row r corresponds to word1[r-1]
      nrows -= 1
      ncols -= 1
    elif step == 2: # go left
      ncols -= 1
    else: # go up
      nrows -= 1
  return "".join(reversed(picked))

# Edit distance

### Change table: 
0 - do nothing (start cell)  
1 - go up (deletion)  
2 - go left (insertion)  
3 - go up-left (match or replacement)

In [5]:
def insert_cost(dist_type):
  """Cost of inserting a single character.

  Parameters:
    dist_type: 'Levenshtein' or 'LCS'.

  Returns:
    1 for both supported distance types.

  Raises:
    ValueError: if dist_type is not supported (previously None was
      returned silently, which only failed later in arithmetic).
  """
  if dist_type in ('Levenshtein', 'LCS'):
    return 1
  raise ValueError("Unsupported dist_type: {!r}".format(dist_type))

def del_cost(dist_type):
  """Cost of deleting a single character.

  Parameters:
    dist_type: 'Levenshtein' or 'LCS'.

  Returns:
    1 for both supported distance types.

  Raises:
    ValueError: if dist_type is not supported (previously None was
      returned silently, which only failed later in arithmetic).
  """
  if dist_type in ('Levenshtein', 'LCS'):
    return 1
  raise ValueError("Unsupported dist_type: {!r}".format(dist_type))

def repl_cost(letter1, letter2, dist_type):
  """Cost of turning letter1 into letter2.

  Parameters:
    letter1, letter2: the characters being aligned.
    dist_type: 'Levenshtein' or 'LCS'.

  Returns:
    0 when the letters are equal; otherwise 1 for 'Levenshtein' and
    infinity for 'LCS' (the LCS distance forbids replacements).

  Raises:
    ValueError: if the letters differ and dist_type is not supported.
  """
  if letter1==letter2:
    return 0
  if dist_type=='Levenshtein':
    return 1
  if dist_type=='LCS':
    # np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
    return np.inf
  raise ValueError("Unsupported dist_type: {!r}".format(dist_type))

In [6]:
def ED_rec(word1, word2, dist_type='Levenshtein'):
  """Edit distance between two words, computed by plain recursion.

  No memoization is used on purpose: this is the naive variant that the
  notebook compares against the dynamic-programming version.

  Parameters:
    word1: source sequence (characters are deleted/replaced from it).
    word2: target sequence (characters are inserted from it).
    dist_type: forwarded to insert_cost, del_cost and repl_cost.

  Returns:
    (distance, elapsed): the edit distance and the wall-clock time in
    seconds spent on the whole computation.
  """
  def distance(n1, n2):
    # Edit distance between the prefixes word1[:n1] and word2[:n2]. Working
    # on prefix lengths avoids copying the strings in every frame.
    if n1 == 0: # inserting all the characters
      return n2*insert_cost(dist_type=dist_type)
    if n2 == 0: # deleting all the characters
      return n1*del_cost(dist_type=dist_type)
    if word1[n1-1] == word2[n2-1]: # last characters are the same
      return distance(n1-1, n2-1)
    # if not the same, do the least costly operation from:
    # insertion
    res_ins = distance(n1, n2-1) + insert_cost(dist_type=dist_type)
    # deletion
    res_del = distance(n1-1, n2) + del_cost(dist_type=dist_type)
    # replacement
    res_rep = distance(n1-1, n2-1) + repl_cost(word1[n1-1], word2[n2-1], dist_type=dist_type)
    return min(res_ins, res_rep, res_del)

  # The clock is read once around the whole recursion instead of in every
  # frame, so the timer calls themselves do not inflate the measurement.
  t0 = time()
  ED = distance(len(word1), len(word2))
  return ED, time()-t0

In [7]:
def ED_DP(word1, word2, dist_type='Levenshtein'):
  """Edit distance between two words via dynamic programming.

  Parameters:
    word1: source sequence, indexes the rows of the tables.
    word2: target sequence, indexes the columns of the tables.
    dist_type: forwarded to insert_cost, del_cost and repl_cost.

  Returns:
    (distance, change_table, elapsed) where change_table has shape
    (len(word1)+1, len(word2)+1) and stores the chosen step of every cell
    (0 start, 1 up/deletion, 2 left/insertion, 3 up-left/replacement or
    match), and elapsed is the wall-clock time in seconds.
  """
  height = len(word1)+1
  width = len(word2)+1
  started = time()
  costs = np.zeros((height, width), dtype=int)
  moves = np.zeros((height, width), dtype=int)

  # First column: reaching an empty target means deleting every character.
  for r in range(1, height):
    costs[r, 0] = r*del_cost(dist_type=dist_type)
    moves[r, 0] = 1
  # First row: starting from an empty source means inserting every character.
  for c in range(1, width):
    costs[0, c] = c*insert_cost(dist_type=dist_type)
    moves[0, c] = 2

  for r, ch1 in enumerate(word1, start=1):
    for c, ch2 in enumerate(word2, start=1):
      # Candidate order is deletion, insertion, replacement; np.argmin
      # returns the first minimum, so ties are resolved in that order.
      candidates = np.array([
        costs[r-1, c] + del_cost(dist_type=dist_type),
        costs[r, c-1] + insert_cost(dist_type=dist_type),
        costs[r-1, c-1] + repl_cost(ch1, ch2, dist_type=dist_type),
      ])
      best = np.argmin(candidates)
      costs[r, c] = candidates[best]
      moves[r, c] = best+1
  return costs[-1, -1], moves, time()-started

In [8]:
def print_operations(path_table, word1, word2):
  """Describe the edit script encoded in a change table.

  Parameters:
    path_table: 2-D array of step markers as returned by ED_DP
      (1 up/deletion, 2 left/insertion, 3 up-left, anything else stops).
    word1: source sequence, indexes the rows of path_table.
    word2: target sequence, indexes the columns of path_table.

  Returns:
    A space-separated string, in left-to-right order, where an unchanged
    character appears as itself and edits appear as "del(x)", "ins(x)" or
    "rep(x,y)". An empty string is returned when there are no steps.
  """
  steps = []  # collected from the last operation to the first
  nrows = path_table.shape[0] - 1
  ncols = path_table.shape[1] - 1
  while nrows >= 0 and ncols >= 0:
    move = path_table[nrows, ncols]
    if move == 3: # go diagonally
      nrows -= 1
      ncols -= 1
      # After the decrement the indices point at the aligned characters.
      if word1[nrows] == word2[ncols]:
        steps.append(word2[ncols])
      else:
        steps.append("rep("+word1[nrows]+ ","+ word2[ncols] + ")")
    elif move == 2: # go left - insertion
      ncols -= 1
      steps.append("ins("+ word2[ncols] + ")")
    elif move == 1: # go up - deletion
      nrows -= 1
      steps.append("del("+ word1[nrows] + ")")
    else: # marker 0: start cell reached
      break
  return " ".join(reversed(steps))

# Testing

## LCS

In [19]:
# Test inputs for LCS: LCS_strings1[i] is compared with LCS_strings2[i].
# The pairs include empty strings, identical strings and reversed strings.
LCS_strings1 = ["ABCDGH", "AGGTAB", "ABCBDAB", "XMJYAUZ", "ABDTTTTTTT", "WORD1", '', 'CONSEQUENCE', '', 'ABCDEFG', 'QWERTY', "WORD1"]
LCS_strings2 = ["AEDFHR", "GXTXAYB", "BDCABA", "MZJAWXU", "ZNCXMVZNXM", '', 'WORD2', 'CONSISTENCE', '', 'ABCDEFG', 'YTREWQ', 'WORD2']

# Column headers of the LCS result table.
colnames = ['STRING A', 'STRING B', 'LENGTH OF LCS', 'LCS', 
            'Times for DP ver.', 'Times for recursive ver.']
# Per-pair results, filled by the benchmark loop in the next cell.
LCS_lens = []
found_LCS = []
DP_times = []
rec_times = []

In [20]:
# Run both LCS implementations on every pair and record results and timings.
# NOTE: the result lists are only appended to here, so re-running this cell
# without re-running the setup cell duplicates the entries.
for i in tqdm(range(len(LCS_strings1))):
  str1 = LCS_strings1[i]
  str2 = LCS_strings2[i]
  # DP
  LCS_len1, pt, t = LCS_DP(str1, str2)
  LCS_lens.append(LCS_len1)
  DP_times.append(t)
  # str1 indexes the rows of pt, so it is the word used for the traceback.
  LCS1 = print_LCS(pt, str1)
  found_LCS.append(LCS1)
  # REC
  LCS_len3, t = LCS_rec(str1, str2)
  # Both implementations must agree on the LCS length.
  assert LCS_len3 == LCS_len1
  rec_times.append(t)


100%|██████████| 12/12 [00:00<00:00, 35.06it/s]


In [21]:
# One row per input pair: inputs, LCS length, recovered LCS and both timings.
df = pd.DataFrame(list(zip(LCS_strings1, LCS_strings2, LCS_lens, found_LCS, 
                           DP_times, rec_times)),
               columns=colnames)
# Last expression of the cell, so the frame is displayed as its output.
df

Unnamed: 0,STRING A,STRING B,LENGTH OF LCS,LCS,Times for DP ver.,Times for recursive ver.
0,ABCDGH,AEDFHR,3,ADH,0.0001514,0.0010519
1,AGGTAB,GXTXAYB,4,GTAB,8.9169e-05,0.0004971
2,ABCBDAB,BDCABA,4,BCBA,8.1301e-05,0.000211
3,XMJYAUZ,MZJAWXU,4,MJAU,8.8453e-05,0.0023642
4,ABDTTTTTTT,ZNCXMVZNXM,0,,0.00010967,0.33233
5,WORD1,,0,,1.7166e-05,4.7684e-07
6,,WORD2,0,,1.9073e-06,2.3842e-07
7,CONSEQUENCE,CONSISTENCE,8,CONSENCE,0.00016332,0.0018802
8,,,0,,1.0014e-05,4.7684e-07
9,ABCDEFG,ABCDEFG,7,ABCDEFG,5.3644e-05,7.6294e-06


## ED

In [22]:
# Test inputs for edit distance: ED_strings1[i] is transformed into ED_strings2[i].
ED_strings1 = ["INTENTION", "SUNDAY", "CART", "QUARANTINE", "INDUSTRY", 'WORD1', '', 'QWERTY', '', 'ABCDEFG', 'VISUALIZATION', 'WORD1']
ED_strings2 = ["EXECUTION", "SATURDAY", "MARCH", "RUNTIME", 'INTEREST', '', 'WORD2', 'YTREWQ','', 'ABCDEFG', 'QUALIFICATION', 'WORD2']
# Column headers of the edit-distance result tables (rebinds the LCS colnames).
colnames = ['STRING A', 'STRING B', 'ED', 'Operations', 
            'Times for DP ver.', 'Times for recursive ver.']

### Levenshtein distance

In [23]:
# Cost model used by the loop below: unit insert/delete/replace costs.
dist_type='Levenshtein'
# Per-pair results, reset here so that they only hold the Levenshtein run.
found_ED = []
operations = []
DP_times = []
rec_times = []

In [24]:
# Run both edit-distance implementations on every pair with the current
# dist_type and record distance, edit script and timings.
# NOTE: the result lists are only appended to here, so re-running this cell
# without re-running the setup cell duplicates the entries.
for i in tqdm(range(len(ED_strings1))):
  str1 = ED_strings1[i]
  str2 = ED_strings2[i]
  # DP
  ED1, pt1, t = ED_DP(str1, str2, dist_type)
  found_ED.append(ED1)
  DP_times.append(t)
  ops = print_operations(pt1, str1, str2)
  operations.append(ops)
  # REC
  ED3, t = ED_rec(str1, str2, dist_type)
  # Both implementations must agree on the distance.
  assert ED3 == ED1
  rec_times.append(t)


100%|██████████| 12/12 [00:00<00:00, 36.35it/s]


In [25]:
# One row per input pair: inputs, distance, edit script and both timings.
df = pd.DataFrame(list(zip(ED_strings1, ED_strings2, found_ED, operations, 
                           DP_times, rec_times)),
               columns=colnames)
# Last expression of the cell, so the frame is displayed as its output.
df

Unnamed: 0,STRING A,STRING B,ED,Operations,Times for DP ver.,Times for recursive ver.
0,INTENTION,EXECUTION,5,"rep(I,E) rep(N,X) del(T) E rep(N,C) ins(U) T I O N",0.0012784,0.0018647
1,SUNDAY,SATURDAY,3,"S ins(A) ins(T) U rep(N,R) D A Y",0.00076461,0.00020862
2,CART,MARCH,3,"rep(C,M) A R rep(T,C) ins(H)",0.00040245,0.00051475
3,QUARANTINE,RUNTIME,5,"rep(Q,R) U del(A) del(R) del(A) N T I rep(N,M) E",0.0010915,0.023792
4,INDUSTRY,INTEREST,6,"I N rep(D,T) rep(U,E) ins(R) ins(E) S T del(R) del(Y)",0.0010018,0.15407
5,WORD1,,5,del(W) del(O) del(R) del(D) del(1),2.4557e-05,7.1526e-07
6,,WORD2,5,ins(W) ins(O) ins(R) ins(D) ins(2),5.0068e-06,4.7684e-07
7,QWERTY,YTREWQ,6,"rep(Q,Y) rep(W,T) ins(R) E rep(R,W) rep(T,Q) del(Y)",0.00063014,0.0067577
8,,,0,,1.7405e-05,1.4305e-06
9,ABCDEFG,ABCDEFG,0,A B C D E F G,0.0014493,9.2983e-06


### LCS distance

In [26]:
# Cost model used by the loop below: unit insert/delete costs, and an
# infinite cost for replacing one letter with a different one.
dist_type='LCS'
# Per-pair results, reset here so that they only hold the LCS-distance run.
found_ED = []
operations = []
DP_times = []
rec_times = []

In [27]:
# Run both edit-distance implementations on every pair with the current
# dist_type and record distance, edit script and timings.
# NOTE: the result lists are only appended to here, so re-running this cell
# without re-running the setup cell duplicates the entries.
for i in tqdm(range(len(ED_strings1))):
  str1 = ED_strings1[i]
  str2 = ED_strings2[i]
  # DP
  ED1, pt1, t = ED_DP(str1, str2, dist_type)
  found_ED.append(ED1)
  DP_times.append(t)
  ops = print_operations(pt1, str1, str2)
  operations.append(ops)
  # REC
  ED3, t = ED_rec(str1, str2, dist_type)
  # Both implementations must agree on the distance.
  assert ED3 == ED1
  rec_times.append(t)


100%|██████████| 12/12 [00:00<00:00, 35.16it/s]


In [28]:
# One row per input pair: inputs, distance, edit script and both timings.
df = pd.DataFrame(list(zip(ED_strings1, ED_strings2, found_ED, operations, 
                           DP_times, rec_times)),
               columns=colnames)
# Last expression of the cell, so the frame is displayed as its output.
df

Unnamed: 0,STRING A,STRING B,ED,Operations,Times for DP ver.,Times for recursive ver.
0,INTENTION,EXECUTION,8,del(I) del(N) del(T) E ins(X) ins(E) ins(C) ins(U) del(N) T I O N,0.0013468,0.0049264
1,SUNDAY,SATURDAY,4,S ins(A) ins(T) U ins(R) del(N) D A Y,0.00082135,0.0004518
2,CART,MARCH,5,ins(M) del(C) A R ins(C) ins(H) del(T),0.00022984,0.00046062
3,QUARANTINE,RUNTIME,7,ins(R) del(Q) U del(A) del(R) del(A) N T I ins(M) del(N) E,0.00059676,0.029205
4,INDUSTRY,INTEREST,8,I N ins(T) ins(E) ins(R) ins(E) del(D) del(U) S T del(R) del(Y),0.0014324,0.14863
5,WORD1,,5,del(W) del(O) del(R) del(D) del(1),2.6941e-05,9.5367e-07
6,,WORD2,5,ins(W) ins(O) ins(R) ins(D) ins(2),8.8215e-06,7.1526e-07
7,QWERTY,YTREWQ,10,ins(Y) ins(T) ins(R) ins(E) ins(W) Q del(W) del(E) del(R) del(T) del(Y),0.00059438,0.0066938
8,,,0,,1.3828e-05,4.7684e-07
9,ABCDEFG,ABCDEFG,0,A B C D E F G,0.00046539,2.0266e-05
