In [1]:
import pandas as pd
import numpy as np
import spacy
from spellchecker import SpellChecker

In [2]:
nlp = spacy.load("en_core_web_sm")

def keep(token):
    if token.is_stop: return False
    if not token.is_alpha: return False
    return True

def spell_check(text):
    spell = SpellChecker()
    
    # split each clause by whitespaces
    for word in text.split():
        mispelled = spell.unknown(word)
        if mispelled: text = text.replace(word, spell.correction(word))
                
    return text
    

def normalize(item):
    doc = nlp(item)
    print(item)
    return spell_check(" ".join([str(token.lemma_) for token in doc if keep(token)]))


In [9]:
df = pd.read_csv("./cleaned_more.csv")
foods = np.array(df["Shrt_Desc"])


In [10]:
def edit_distance(s, t):
    if s == "": return len(t)
    if t == "": return len(s)
    rs = s[:-1]
    rt = t[:-1]
    cost = s[-1] != t[-1]
    
    return min([
        edit_distance(rs, t) + 1, 
        edit_distance(s, rt) + 1, 
        edit_distance(rs, rt) + cost
    ])


In [37]:
hashmap = dict()

def dynamic_distance(s, t):
    key = str(sorted([s, t]))
    stored = hashmap.get(key, None)
    
    if not stored:
        hashmap[key] = edit_distance(s, t)
        
    return hashmap[key]

In [42]:
MIN_EDIT_DISTANCE_THRESHOLD = 0.8

def is_match(t1, t2):
    max_changes = round((1 - MIN_EDIT_DISTANCE_THRESHOLD) * len(t2))
    return dynamic_distance(t1, t2) <= max_changes


In [48]:
def match_item(foods, item):
    item_tokens = normalize(item).split()

    match_counts = list()
    
    for food in foods:
        food_tokens = food.lower().split()
        match_count = 0
        
        if food_tokens[0].lower() == item_tokens[-1]:
        
            for i1, t1 in enumerate(item_tokens):


                index = next((i2 for i2, t2 in enumerate(food_tokens) if is_match(t1, t2)), None)

                if index != None:
                    food_tokens.pop(index)
                    match_count += 1

                match_counts.append((match_count, -len(food), food))
        
    return max(match_counts)[2]


In [49]:
import time
start = time.time()
closest = match_item(foods, "mozzerella cheese")
end = time.time()
print(closest)
print(end - start)

mozzerella cheese
CHEESE MOZZARELLA MILK
34.36539697647095


In [24]:
edit_distance("mozzerella", "mozerela")

2