# **Benchmark Text Similarity**

Author : Bayu Aditya

**Source :**

1. Levensthein Distance : https://www.cuelogic.com/blog/the-levenshtein-algorithm
2. Sequence Matcher : https://stackoverflow.com/questions/17388213/find-the-similarity-metric-between-two-strings

In [1]:
import os
import numpy as np
import json
import matplotlib as mpl
import matplotlib.pyplot as plt

from levensthein import levensthein
from random_generator import random_word_generator, random_number_generator
from difflib import SequenceMatcher

print('Matplotlib : {}'.format(mpl.__version__))
print('Numpy      : {}'.format(np.__version__))

Matplotlib : 3.0.1
Numpy      : 1.15.4


## 1. Import Data Filename

In [2]:
DIR_IMAGE = 'image_plate_benchmark/image_bbox/'

data_filename = os.listdir(DIR_IMAGE)

## 2. Levensthein Distance

In [57]:
def summary_levensthein(key, data_filename, original_key):
    lev_dist = []
    for i, target_name in enumerate(data_filename):
        lev_dist.append([i, levensthein(key, target_name[:-4])])
    print('='*50)
    print('key          : {}'.format(key))
    print('Original key : {} \n'.format(original_key))
    
    lev_dist.sort(key=(lambda x : x[1]))
    
    lev_dist = np.array(lev_dist)
    lev_dist[:,1] = 100 - ((lev_dist[:,1] - min(lev_dist[:,1]))*100/( max(lev_dist[:,1]) - min(lev_dist[:,1]) ))

    print('='*20 + ' Result ' + '='*20)
    for i in range(4):
        loc = int(lev_dist[i,0])
        score = lev_dist[i,1]
        name = data_filename[loc]
        print('score : {:7.4f} %  ID : {:4d}     plat : {}'.format(score, loc, name[:-4]))
    print('='*20 + ' Finish ' + '='*20 + 3*'\n')

### 2.1. Using Word Reduction (0 until 3 word)

In [58]:
num_key = 1000

original_name_key = data_filename[num_key][:-4]
for i in range(0, 4):
    print(' '*20 + 'REDUCTION {} WORD'.format(i), ' '*20)
    key = data_filename[num_key][:-4-i]
    summary_levensthein(key, data_filename, original_name_key)

                    REDUCTION 0 WORD                     
key          : B4838SCN
Original key : B4838SCN 

score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 50.0000 %  ID :   26     plat : B1483SLP
score : 50.0000 %  ID :   40     plat : B2937SON
score : 50.0000 %  ID :  106     plat : B4936SEM



                    REDUCTION 1 WORD                     
key          : B4838SC
Original key : B4838SCN 

score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 57.1429 %  ID :    2     plat : B4913BPC
score : 57.1429 %  ID :   26     plat : B1483SLP
score : 57.1429 %  ID :   89     plat : B389BSP



                    REDUCTION 2 WORD                     
key          : B4838S
Original key : B4838SCN 

score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 83.3333 %  ID :  430     plat : B6231S
score : 83.3333 %  ID :  875     plat : B8381VS
score : 66.6667 %  ID :   26     plat : B1483SLP



                    REDUCTION 3 WORD                     
key          : B4838
Orig

### 2.2. Using Replacement Random Word (0 until 3 word)

In [59]:
num_key = 1000

# KEY
plate = data_filename[num_key][:-4]
plate_key = []

plate_key.append(plate)
plate_key.append(plate[:-1] + random_word_generator())
plate_key.append(plate[:-2] + random_word_generator() + random_word_generator())
plate_key.append(plate[:-3] + random_word_generator() + random_word_generator() + random_word_generator())

original_name_key = data_filename[num_key][:-4]
for i, key in enumerate(plate_key):
    print(' '*20 + 'RANDOM {} WORD'.format(i), ' '*20)
    summary_levensthein(key, data_filename, original_name_key)

                    RANDOM 0 WORD                     
key          : B4838SCN
Original key : B4838SCN 

score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 50.0000 %  ID :   26     plat : B1483SLP
score : 50.0000 %  ID :   40     plat : B2937SON
score : 50.0000 %  ID :  106     plat : B4936SEM



                    RANDOM 1 WORD                     
key          : B4838SCG
Original key : B4838SCN 

score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 57.1429 %  ID :   20     plat : B1835UAG
score : 57.1429 %  ID :   26     plat : B1483SLP
score : 57.1429 %  ID :  106     plat : B4936SEM



                    RANDOM 2 WORD                     
key          : B4838SFM
Original key : B4838SCN 

score : 100.0000 %  ID : 1000     plat : B4838SCN
score : 83.3333 %  ID :  106     plat : B4936SEM
score : 66.6667 %  ID :   26     plat : B1483SLP
score : 66.6667 %  ID :   99     plat : B4383TME



                    RANDOM 3 WORD                     
key          : B4838UZP
Origin

## 3. Sequence Matcher

In [6]:
def sequencematcher(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

In [7]:
def summary_sequencematcher(key, data_filename, original_name_key):
    seq_dist = []
    for i, target_name in enumerate(data_filename):
        seq_dist.append([i, sequencematcher(key, target_name[:-4])])
    print('='*50)
    print('key          : {}'.format(key))
    print('Original key : {} \n'.format(original_name_key))
    
    seq_dist.sort(key=(lambda x : x[1]), reverse = True)

    print('='*20 + ' Result ' + '='*20)
    for i in range(4):
        loc = seq_dist[i][0]
        score = seq_dist[i][1]
        name = data_filename[loc][:-4]
        print('score : {:7.4f}    ID : {:4d}    plat : {}'.format(score, loc, name))
    print('='*20 + ' Finish ' + '='*20 + 3*'\n')

### 3.1. Using Word Reduction (0 until 4 word)

In [8]:
num_key = 1000

original_name_key = data_filename[num_key][:-4]
for i in range(0, 5):
    print(' '*20 + 'REDUCTION {} WORD'.format(i), ' '*20)
    key = data_filename[num_key][:-4-i]
    summary_sequencematcher(key, data_filename, original_name_key)

                    REDUCTION 0 WORD                     
key          : B4838SCN
Original key : B4838SCN 

score :  1.0000    ID : 1000    plat : B4838SCN
score :  0.6667    ID :  875    plat : B8381VS
score :  0.6250    ID :   26    plat : B1483SLP
score :  0.6250    ID :  135    plat : B3598SYC



                    REDUCTION 1 WORD                     
key          : B4838SC
Original key : B4838SCN 

score :  0.9333    ID : 1000    plat : B4838SCN
score :  0.7143    ID :  875    plat : B8381VS
score :  0.6667    ID :   26    plat : B1483SLP
score :  0.6667    ID :  135    plat : B3598SYC



                    REDUCTION 2 WORD                     
key          : B4838S
Original key : B4838SCN 

score :  0.8571    ID : 1000    plat : B4838SCN
score :  0.7692    ID :  875    plat : B8381VS
score :  0.7143    ID :   26    plat : B1483SLP
score :  0.7143    ID :  798    plat : B4368SGJ



                    REDUCTION 3 WORD                     
key          : B4838
Original key : B48

### 3.2. Using Replacement Random Word (0 until 4 word)

In [9]:
num_key = 1000

# KEY
plate = data_filename[num_key][:-4]
plate_key = []

plate_key.append(plate)
plate_key.append(plate[:-1] + random_word_generator())
plate_key.append(plate[:-2] + random_word_generator() + random_word_generator())
plate_key.append(plate[:-3] + random_word_generator() + random_word_generator() + random_word_generator())
plate_key.append(plate[:-4] + random_number_generator() + random_word_generator() + random_word_generator() + random_word_generator())

original_name_key = data_filename[num_key][:-4]
for i, key in enumerate(plate_key):
    print(' '*20 + 'RANDOM {} WORD'.format(i), ' '*20)
    summary_sequencematcher(key, data_filename, original_name_key)

                    RANDOM 0 WORD                     
key          : B4838SCN
Original key : B4838SCN 

score :  1.0000    ID : 1000    plat : B4838SCN
score :  0.6667    ID :  875    plat : B8381VS
score :  0.6250    ID :   26    plat : B1483SLP
score :  0.6250    ID :  135    plat : B3598SYC



                    RANDOM 1 WORD                     
key          : B4838SCO
Original key : B4838SCN 

score :  0.8750    ID : 1000    plat : B4838SCN
score :  0.7500    ID :  245    plat : B4693SCO
score :  0.6667    ID :  875    plat : B8381VS
score :  0.6250    ID :   26    plat : B1483SLP



                    RANDOM 2 WORD                     
key          : B4838SYO
Original key : B4838SCN 

score :  0.7500    ID : 1000    plat : B4838SCN
score :  0.6667    ID :  875    plat : B8381VS
score :  0.6250    ID :   26    plat : B1483SLP
score :  0.6250    ID :  135    plat : B3598SYC



                    RANDOM 3 WORD                     
key          : B4838ZXW
Original key : B4838SCN 