In [None]:
# https://dzone.com/articles/the-levenshtein-algorithm-1

# The Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (i.e. insertions, deletions, or substitutions) required to change one word into the other.

In [1]:
import pandas as pd
import numpy as np    
from Levenshtein import distance
from scipy.spatial.distance import pdist, squareform

# Load Data

In [2]:
# Source: https://alphabetizer.flap.tv/lists/list-of-fruits-and-vegetables.php
# 266 vegetable records
data = 'vegtable_data.csv'
df = pd.read_csv(data)
# reset_index() copies the row position into an explicit 'index' column,
# which the later merge cells use as their join key.
df = df.reset_index()

In [3]:
# The first 133 rows hold the correctly spelled names; preview the first five.
df.head()

Unnamed: 0,index,vegtable_names
0,0,Alfalfa Sprouts
1,1,Apple
2,2,Apricot
3,3,Artichoke
4,4,Asian Pear


In [4]:
# The remaining 133 rows hold names with their vowels removed; preview five of them.
df.iloc[133:138]

Unnamed: 0,index,vegtable_names
133,133,lflf Sprts
134,134,ppl
135,135,prct
136,136,rtchk
137,137,sn Pr


# Distance Metrics

In [5]:
# Extract the name column as a plain Python list.
veg_list = df['vegtable_names'].tolist()

In [6]:
# pdist works on a 2-D array of observations, so place each name in its own
# single-column row.
transformed_strings = np.expand_dims(np.array(veg_list), axis=1)

# Condensed pairwise distances: the callable receives two rows and compares
# the single string stored in each with the Levenshtein distance.
distance_matrix = pdist(
    transformed_strings,
    lambda row_a, row_b: distance(row_a[0], row_b[0]),
)


In [7]:
# Condensed form: one entry per unordered pair of names (266 * 265 / 2 = 35245).
distance_matrix.shape

(35245,)

In [8]:
# Create square matrix: expand the condensed distance vector into a full
# matrix with one row and one column per name.
sq_distance_matrix = squareform(distance_matrix)

In [9]:
# One row and one column per name in the data set.
sq_distance_matrix.shape

(266, 266)

# Fill diagonal

In [10]:
# 1000 is an arbitrary number. It must be higher than any real distance so that
# a name's comparison with itself (the diagonal) is never picked as the row minimum.
# Note: fill_diagonal modifies sq_distance_matrix in place.
np.fill_diagonal(sq_distance_matrix, 1000)

In [11]:
# Wrap the square matrix in a DataFrame for a readable preview of the first ten rows.
distance_matrix_df = pd.DataFrame(data=sq_distance_matrix)
distance_matrix_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,256,257,258,259,260,261,262,263,264,265
0,1000.0,13.0,12.0,13.0,12.0,10.0,13.0,12.0,10.0,13.0,...,14.0,11.0,12.0,13.0,14.0,13.0,14.0,11.0,15.0,13.0
1,13.0,1000.0,5.0,7.0,8.0,7.0,6.0,6.0,13.0,6.0,...,5.0,6.0,11.0,7.0,5.0,7.0,5.0,8.0,6.0,10.0
2,12.0,5.0,1000.0,5.0,9.0,6.0,6.0,6.0,11.0,7.0,...,6.0,6.0,9.0,6.0,6.0,7.0,7.0,8.0,7.0,10.0
3,13.0,7.0,5.0,1000.0,8.0,8.0,6.0,7.0,11.0,9.0,...,8.0,9.0,10.0,7.0,8.0,9.0,9.0,9.0,9.0,10.0
4,12.0,8.0,9.0,8.0,1000.0,7.0,8.0,9.0,12.0,7.0,...,9.0,9.0,11.0,10.0,10.0,9.0,9.0,9.0,10.0,8.0
5,10.0,7.0,6.0,8.0,7.0,1000.0,8.0,7.0,12.0,7.0,...,8.0,8.0,10.0,7.0,8.0,8.0,8.0,9.0,9.0,9.0
6,13.0,6.0,6.0,6.0,8.0,8.0,1000.0,6.0,12.0,6.0,...,7.0,7.0,10.0,6.0,5.0,7.0,6.0,8.0,7.0,10.0
7,12.0,6.0,6.0,7.0,9.0,7.0,6.0,1000.0,11.0,6.0,...,7.0,7.0,11.0,6.0,7.0,7.0,7.0,8.0,7.0,10.0
8,10.0,13.0,11.0,11.0,12.0,12.0,12.0,11.0,1000.0,11.0,...,13.0,11.0,10.0,12.0,13.0,11.0,11.0,11.0,13.0,11.0
9,13.0,6.0,7.0,9.0,7.0,7.0,6.0,6.0,11.0,1000.0,...,5.0,6.0,10.0,7.0,6.0,6.0,6.0,8.0,6.0,9.0


# Find closest match

In [12]:
# For every row, the column position of its smallest distance,
# i.e. the position of the closest other name.
min_idx = np.asarray(sq_distance_matrix).argmin(axis=1)

In [13]:
# For every row, the smallest distance itself: the number of edits
# needed to reach the closest other name.
min_moves = np.asarray(sq_distance_matrix).min(axis=1)

In [14]:
# One row per name: position of its closest match and the edit count to reach it.
arg_mx_list_df = pd.DataFrame(dict(index=min_idx, moves=min_moves))

In [15]:
# Promote the row position to a leading column so each row records which name
# it belongs to. Because a column called 'index' already exists, pandas should
# name the new column 'level_0' (see reset_index docs); the next cell renames
# all three columns anyway.
# NOTE(review): re-running this cell on its own adds another column each time.
arg_mx_list_df = arg_mx_list_df.reset_index()

In [16]:
# Positional rename: row position -> original_index,
# closest-match position -> lookup_index, edit count -> moves.
cols = ['original_index','lookup_index','moves']
arg_mx_list_df.columns = cols

In [17]:
# Attach the text of each closest match by joining lookup_index
# against the 'index' column of the source frame.
top_results = arg_mx_list_df.merge(
    df,
    left_on='lookup_index',
    right_on='index',
)

In [19]:
# Preview the first five matches.
top_results.head()

Unnamed: 0,original_index,lookup_index,moves,index,vegtable_names
0,0,133,5.0,133,lflf Sprts
1,1,134,2.0,134,ppl
2,226,134,2.0,134,ppl
3,2,135,3.0,135,prct
4,3,136,4.0,136,rtchk


In [20]:
# Slim lookup table (join key plus name) for the second merge.
df_sm = df.loc[:, ['index', 'vegtable_names']]

In [21]:
# Attach the text of the original name by joining original_index against 'index'.
# Both frames carry 'index' and 'vegtable_names', so the default merge suffixes
# apply: *_x is the closest match (left), *_y is the original name (right).
final_df = top_results.merge(
    df_sm,
    left_on='original_index',
    right_on='index',
)

In [23]:
# Display the merged result: each original name alongside its closest match.
final_df

Unnamed: 0,original_index,lookup_index,moves,index_x,vegtable_names_x,index_y,vegtable_names_y
0,0,133,5.0,133,lflf Sprts,0,Alfalfa Sprouts
1,1,134,2.0,134,ppl,1,Apple
2,226,134,2.0,134,ppl,226,Pnppl
3,2,135,3.0,135,prct,2,Apricot
4,3,136,4.0,136,rtchk,3,Artichoke
5,193,136,6.0,136,rtchk,193,Jrslm rtchk
6,4,137,5.0,137,sn Pr,4,Asian Pear
7,220,137,4.0,137,sn Pr,220,Pssn Frt
8,5,138,4.0,138,sprgs,5,Asparagus
9,6,33,4.0,33,Cherimoya,6,Atemoya


In [35]:
# Keep only the reporting columns, order by the original row position, and
# retain the first 133 rows (133 = number of correctly spelled names).
final_df = (
    final_df
    .loc[:, ['original_index', 'vegtable_names_y', 'moves', 'lookup_index', 'vegtable_names_x']]
    .sort_values(by=['original_index'])
    .iloc[:133]
)

In [37]:
# Replace the merge-suffixed column names with descriptive ones (positional rename).
cols = [
    'original_index', 'original_vegtable_name',
    'moves',
    'lookup_index', 'lookup_vegtable_name',
]
final_df.columns = cols

final_df

Unnamed: 0,original_index,original_vegtable_name,moves,lookup_index,lookup_vegtable_name
0,0,Alfalfa Sprouts,5.0,133,lflf Sprts
1,1,Apple,2.0,134,ppl
3,2,Apricot,3.0,135,prct
4,3,Artichoke,4.0,136,rtchk
6,4,Asian Pear,5.0,137,sn Pr
...,...,...,...,...,...
80,128,Waxed Beans,4.0,52,Green Beans
180,129,Yams,2.0,203,Lms
184,130,Yellow Squash,5.0,263,llw Sqsh
185,131,Yuca/Cassava,6.0,264,c/Cssv


In [66]:
# Persist the closest-match table. The cell previously referenced `final_df_new`,
# which no cell in this notebook defines, so it raised NameError on a fresh
# kernel; `final_df` is the frame built above.
final_df.to_csv('closest_match.csv')