# **Benchmark Text Similarity**

Author : Bayu Aditya

**Source :**

1. Levensthein Distance : https://www.cuelogic.com/blog/the-levenshtein-algorithm
2. Sequence Matcher : https://stackoverflow.com/questions/17388213/find-the-similarity-metric-between-two-strings

In [21]:
import os
import numpy as np
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime

from levensthein import levensthein
from random_generator import random_word_generator, random_number_generator
from difflib import SequenceMatcher

print('Matplotlib : {}'.format(mpl.__version__))
print('Numpy      : {}'.format(np.__version__))

Matplotlib : 3.0.1
Numpy      : 1.15.4


## 1. Import Data Filename

In [2]:
DIR_IMAGE = 'image_plate_benchmark/image_bbox/'

data_filename = os.listdir(DIR_IMAGE)

## 2. Levensthein Distance

In [19]:
def summary_levensthein(key, data_filename, original_key):
    start_time = datetime.now()
    lev_dist = []
    for i, target_name in enumerate(data_filename):
        lev_dist.append([i, levensthein(key, target_name[:-4])])
    end_time = datetime.now()
    
    print('='*50)
    print('key          : {}'.format(key))
    print('Original key : {}'.format(original_key))
    print('Duration     : {}'.format(end_time - start_time))
    
    lev_dist.sort(key=(lambda x : x[1]))
    
    lev_dist = np.array(lev_dist)
    lev_dist[:,1] = 100 - ((lev_dist[:,1] - min(lev_dist[:,1]))*100/( max(lev_dist[:,1]) - min(lev_dist[:,1]) ))

    print('='*20 + ' Result ' + '='*20)
    for i in range(4):
        loc = int(lev_dist[i,0])
        score = lev_dist[i,1]
        name = data_filename[loc]
        print('score : {:7.4f} %  ID : {:4d}     plat : {}'.format(score, loc, name[:-4]))
    print('='*20 + ' Finish ' + '='*20 + 3*'\n')

### 2.1. Using Word Reduction (0 until 3 word)

#### 2.1.a. From Right Side

In [22]:
num_key = 1000

original_name_key = data_filename[num_key][:-4]
for i in range(0, 4):
    print(' '*10 + 'REDUCTION {} WORD (from right side)'.format(i), ' '*10)
    key = data_filename[num_key][:-4-i]
    summary_levensthein(key, data_filename, original_name_key)

          REDUCTION 0 WORD (from right side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.800154
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 50.0000 %  ID :   26     plat : B1483SLP
score : 50.0000 %  ID :   40     plat : B2937SON
score : 50.0000 %  ID :  106     plat : B4936SEM



          REDUCTION 1 WORD (from right side)           
key          : B4838SC
Original key : B4838SCN
Duration     : 0:00:00.629836
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 57.1429 %  ID :    2     plat : B4913BPC
score : 57.1429 %  ID :   26     plat : B1483SLP
score : 57.1429 %  ID :   89     plat : B389BSP



          REDUCTION 2 WORD (from right side)           
key          : B4838S
Original key : B4838SCN
Duration     : 0:00:00.551917
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 83.3333 %  ID :  430     plat : B6231S
score : 83.3333 %  ID :  875     plat : B8381VS
score : 66.6667 %  ID :   26     plat : B1483SLP



     

#### 2.1.b. From Left Side

In [23]:
num_key = 1000

original_name_key = data_filename[num_key][:-4]
for i in range(0, 4):
    print(' '*10 + 'REDUCTION {} WORD (from left side)'.format(i), ' '*10)
    key = data_filename[num_key][i:-4]
    summary_levensthein(key, data_filename, original_name_key)

          REDUCTION 0 WORD (from left side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.751301
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 50.0000 %  ID :   26     plat : B1483SLP
score : 50.0000 %  ID :   40     plat : B2937SON
score : 50.0000 %  ID :  106     plat : B4936SEM



          REDUCTION 1 WORD (from left side)           
key          : 4838SCN
Original key : B4838SCN
Duration     : 0:00:00.712576
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 57.1429 %  ID :  477     plat : B1386CB
score : 57.1429 %  ID :  611     plat : B8180CY
score : 57.1429 %  ID :  875     plat : B8381VS



          REDUCTION 2 WORD (from left side)           
key          : 838SCN
Original key : B4838SCN
Duration     : 0:00:00.605252
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 66.6667 %  ID :  477     plat : B1386CB
score : 66.6667 %  ID :  611     plat : B8180CY
score : 66.6667 %  ID :  875     plat : B8381VS



          

### 2.2. Using Replacement Random Word (0 until 3 word)

#### 2.2.a. From Right Side

In [24]:
num_key = 1000

# KEY
plate = data_filename[num_key][:-4]
plate_key_right = []

plate_key_right.append(plate)
plate_key_right.append(plate[:-1] + random_word_generator())
plate_key_right.append(plate[:-2] + random_word_generator() + random_word_generator())
plate_key_right.append(plate[:-3] + random_word_generator() + random_word_generator() + random_word_generator())

original_name_key = data_filename[num_key][:-4]
for i, key in enumerate(plate_key_right):
    print(' '*10 + 'RANDOM {} WORD (from right side)'.format(i), ' '*10)
    summary_levensthein(key, data_filename, original_name_key)

          RANDOM 0 WORD (from right side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.694725
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 50.0000 %  ID :   26     plat : B1483SLP
score : 50.0000 %  ID :   40     plat : B2937SON
score : 50.0000 %  ID :  106     plat : B4936SEM



          RANDOM 1 WORD (from right side)           
key          : B4838SCT
Original key : B4838SCN
Duration     : 0:00:00.809448
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 57.1429 %  ID :   26     plat : B1483SLP
score : 57.1429 %  ID :   51     plat : B6825SHT
score : 57.1429 %  ID :  106     plat : B4936SEM



          RANDOM 2 WORD (from right side)           
key          : B4838SSS
Original key : B4838SCN
Duration     : 0:00:00.754603
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 83.3333 %  ID :  875     plat : B8381VS
score : 66.6667 %  ID :   26     plat : B1483SLP
score : 66.6667 %  ID :  106     plat : B4936SEM



        

#### 2.2.b. From Left Side

In [25]:
num_key = 1000

# KEY
plate = data_filename[num_key][:-4]
plate_key_left = []

plate_key_left.append(plate)
plate_key_left.append(random_word_generator() + plate[1:])
plate_key_left.append(random_word_generator() + random_word_generator() + plate[2:])
plate_key_left.append(random_word_generator() + random_word_generator() + str(np.random.randint(0, 9)) + plate[3:])

original_name_key = data_filename[num_key][:-4]
for i, key in enumerate(plate_key_left):
    print(' '*10 + 'RANDOM {} WORD (from left side)'.format(i), ' '*10)
    summary_levensthein(key, data_filename, original_name_key)

          RANDOM 0 WORD (from left side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.703663
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 50.0000 %  ID :   26     plat : B1483SLP
score : 50.0000 %  ID :   40     plat : B2937SON
score : 50.0000 %  ID :  106     plat : B4936SEM



          RANDOM 1 WORD (from left side)           
key          : P4838SCN
Original key : B4838SCN
Duration     : 0:00:00.746149
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 42.8571 %  ID :   26     plat : B1483SLP
score : 42.8571 %  ID :   40     plat : B2937SON
score : 42.8571 %  ID :  106     plat : B4936SEM



          RANDOM 2 WORD (from left side)           
key          : EZ838SCN
Original key : B4838SCN
Duration     : 0:00:00.735755
score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 50.0000 %  ID :   40     plat : B2937SON
score : 50.0000 %  ID :  143     plat : B688RCH
score : 50.0000 %  ID :  412     plat : B1838WUA



          R

## 3. Sequence Matcher

In [26]:
def sequencematcher(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

In [27]:
def summary_sequencematcher(key, data_filename, original_name_key):
    start_time = datetime.now()
    seq_dist = []
    for i, target_name in enumerate(data_filename):
        seq_dist.append([i, sequencematcher(key, target_name[:-4])])
    end_time = datetime.now()
    print('='*50)
    print('key          : {}'.format(key))
    print('Original key : {}'.format(original_name_key))
    print('Duration     : {}'.format(end_time - start_time))
    
    seq_dist.sort(key=(lambda x : x[1]), reverse = True)
    
    seq_dist = np.array(seq_dist)
    seq_dist[:,1] = ((seq_dist[:,1] - min(seq_dist[:,1]))*100/( max(seq_dist[:,1]) - min(seq_dist[:,1]) ))

    print('='*22 + ' Result ' + '='*22)
    for i in range(4):
        loc = int(seq_dist[i][0])
        score = seq_dist[i][1]
        name = data_filename[loc][:-4]
        print('score : {:7.4f} %    ID : {:4d}    plat : {}'.format(score, loc, name))
    print('='*22 + ' Finish ' + '='*22 + 3*'\n')

### 3.1. Using Word Reduction (0 until 4 word)

#### 3.1.a. From Right Side

In [28]:
num_key = 1000

original_name_key = data_filename[num_key][:-4]
for i in range(0, 4):
    print(' '*10 + 'REDUCTION {} WORD (from right side)'.format(i), ' '*10)
    key = data_filename[num_key][:-4-i]
    summary_sequencematcher(key, data_filename, original_name_key)

          REDUCTION 0 WORD (from right side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.063074
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 66.6667 %    ID :  875    plat : B8381VS
score : 62.5000 %    ID :   26    plat : B1483SLP
score : 62.5000 %    ID :  135    plat : B3598SYC



          REDUCTION 1 WORD (from right side)           
key          : B4838SC
Original key : B4838SCN
Duration     : 0:00:00.037645
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 76.5306 %    ID :  875    plat : B8381VS
score : 71.4286 %    ID :   26    plat : B1483SLP
score : 71.4286 %    ID :  135    plat : B3598SYC



          REDUCTION 2 WORD (from right side)           
key          : B4838S
Original key : B4838SCN
Duration     : 0:00:00.031159
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 89.7436 %    ID :  875    plat : B8381VS
score : 83.3333 %    ID :   26    plat : B1483SLP
score : 83.3333 %    ID :  798    plat : B436

#### 3.1.b. From Left Side

In [29]:
num_key = 1000

original_name_key = data_filename[num_key][:-4]
for i in range(0, 4):
    print(' '*10 + 'REDUCTION {} WORD (from left side)'.format(i), ' '*10)
    key = data_filename[num_key][i:-4]
    summary_sequencematcher(key, data_filename, original_name_key)

          REDUCTION 0 WORD (from left side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.071425
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 66.6667 %    ID :  875    plat : B8381VS
score : 62.5000 %    ID :   26    plat : B1483SLP
score : 62.5000 %    ID :  135    plat : B3598SYC



          REDUCTION 1 WORD (from left side)           
key          : 4838SCN
Original key : B4838SCN
Duration     : 0:00:00.032850
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 61.2245 %    ID :  875    plat : B8381VS
score : 57.1429 %    ID :   26    plat : B1483SLP
score : 57.1429 %    ID :  135    plat : B3598SYC



          REDUCTION 2 WORD (from left side)           
key          : 838SCN
Original key : B4838SCN
Duration     : 0:00:00.029397
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 71.7949 %    ID :  875    plat : B8381VS
score : 66.6667 %    ID :  135    plat : B3598SYC
score : 53.8462 %    ID :   89    plat : B389BSP

### 3.2. Using Replacement Random Word (0 until 4 word)

#### 3.2.a. From Right Side

In [30]:
num_key = 1000

# KEY
#plate = data_filename[num_key][:-4]
#plate_key = []

#plate_key.append(plate)
#plate_key.append(plate[:-1] + random_word_generator())
#plate_key.append(plate[:-2] + random_word_generator() + random_word_generator())
#plate_key.append(plate[:-3] + random_word_generator() + random_word_generator() + random_word_generator())
#plate_key.append(plate[:-4] + random_number_generator() + random_word_generator() + random_word_generator() + random_word_generator())

original_name_key = data_filename[num_key][:-4]
for i, key in enumerate(plate_key_right):
    print(' '*10 + 'RANDOM {} WORD (from right side)'.format(i), ' '*10)
    summary_sequencematcher(key, data_filename, original_name_key)

          RANDOM 0 WORD (from right side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.042819
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 66.6667 %    ID :  875    plat : B8381VS
score : 62.5000 %    ID :   26    plat : B1483SLP
score : 62.5000 %    ID :  135    plat : B3598SYC



          RANDOM 1 WORD (from right side)           
key          : B4838SCT
Original key : B4838SCN
Duration     : 0:00:00.040803
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 76.1905 %    ID :  875    plat : B8381VS
score : 71.4286 %    ID :   26    plat : B1483SLP
score : 71.4286 %    ID :   99    plat : B4383TME



          RANDOM 2 WORD (from right side)           
key          : B4838SSS
Original key : B4838SCN
Duration     : 0:00:00.041161
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 88.8889 %    ID :  875    plat : B8381VS
score : 83.3333 %    ID :   26    plat : B1483SLP
score : 83.3333 %    ID :  798    plat : B4368SGJ



#### 3.2.b. From Left Side

In [31]:
num_key = 1000

# KEY
#plate = data_filename[num_key][:-4]
#plate_key_left = []

#plate_key_left.append(plate)
#plate_key_left.append(random_word_generator() + plate[1:])
#plate_key_left.append(random_word_generator() + random_word_generator() + plate[2:])
#plate_key_left.append(random_word_generator() + random_word_generator() + str(np.random.randint(0, 9)) + plate[3:])

original_name_key = data_filename[num_key][:-4]
for i, key in enumerate(plate_key_left):
    print(' '*10 + 'RANDOM {} WORD (from left side)'.format(i), ' '*10)
    summary_sequencematcher(key, data_filename, original_name_key)

          RANDOM 0 WORD (from left side)           
key          : B4838SCN
Original key : B4838SCN
Duration     : 0:00:00.069168
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 66.6667 %    ID :  875    plat : B8381VS
score : 62.5000 %    ID :   26    plat : B1483SLP
score : 62.5000 %    ID :  135    plat : B3598SYC



          RANDOM 1 WORD (from left side)           
key          : P4838SCN
Original key : B4838SCN
Duration     : 0:00:00.037677
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 60.9524 %    ID :  875    plat : B8381VS
score : 57.1429 %    ID :   26    plat : B1483SLP
score : 57.1429 %    ID :  135    plat : B3598SYC



          RANDOM 2 WORD (from left side)           
key          : EZ838SCN
Original key : B4838SCN
Duration     : 0:00:00.030872
score : 100.0000 %    ID : 1000    plat : B4838SCN
score : 71.1111 %    ID :  875    plat : B8381VS
score : 66.6667 %    ID :  135    plat : B3598SYC
score : 53.3333 %    ID :   89    plat : B389BSP



  