In [1]:
# Basic usage of the textdistance library
import textdistance
# Hamming similarity scaled to the 0..1 range: 'test' and 'text' differ in
# one of their four positions, which gives 0.75
textdistance.hamming.normalized_similarity('test', 'text')

0.75

In [2]:
# Sample inputs standing in for the globals that the Excel PY function
# conversion provides (presumably one per referenced cell/range -- confirm).
data1 = "example"  # needle: the value to look up
data2 = ["samples", "exemplar", "sample", "examples"]  # haystack: candidate strings
data3 = 'jaccard'  # name of the textdistance algorithm to use

In [3]:
# function to run in Python for Excel
import textdistance

def fuzzy_distance(needle, haystack, algorithm='jaccard'):
    """Find the entry of ``haystack`` that is most similar to ``needle``.

    Args:
        needle: The string to look up.
        haystack: Iterable of candidate strings to compare against.
        algorithm: Name of the ``textdistance`` attribute to use as the
            algorithm (for example ``'jaccard'`` or ``'levenshtein'``).

    Returns:
        An ``(index, score)`` tuple: the zero-based position of the best
        candidate in ``haystack`` and its normalized similarity. If several
        candidates share the top score, the earliest one is returned.

    Raises:
        AttributeError: If ``textdistance`` has no attribute ``algorithm``.
        IndexError: If ``haystack`` is empty.
    """
    # Get the algorithm object from textdistance by name
    algo_func = getattr(textdistance, algorithm)
    # Score every candidate, keeping track of its original position
    scores = [
        (index, algo_func.normalized_similarity(needle, item))
        for index, item in enumerate(haystack)
    ]
    if not scores:
        # Same exception type an empty list produced before, but with a
        # message that names the actual problem
        raise IndexError("haystack is empty: there is nothing to match against")
    # max() returns the first of any tied scores -- the same winner a stable
    # descending sort would place first -- and needs only a single pass
    return max(scores, key=lambda pair: pair[1])

# Example usage: find which entry of data2 is closest to data1, scored with
# the algorithm named by data3
index, score = fuzzy_distance(data1, data2, data3)
# Convert the (index, score) tuple to a list, e.g. [0, 0.75]
pyout = [index, score]
# Bare last expression so the notebook displays the result
pyout

[1, 0.875]

In [4]:
# Names of the textdistance algorithms to compare, in evaluation order
algorithms = [
    'jaccard',
    'levenshtein',
    'hamming',
    'cosine',
    'jaro',
    'jaro_winkler',
    'sorensen',
    'ratcliff_obershelp',
    'damerau_levenshtein',
    'strcmp95',
    'needleman_wunsch',
    'smith_waterman',
    'tversky',
    'overlap',
    'monge_elkan',
    'lcsseq',
    'lcsstr',
    'gotoh',
    'sorensen_dice',
    'dice',
    'bag',
    'editex',
    'mlipns',
    'mra',
]

# Build the comparison table: header row first, then one row per algorithm
# holding the algorithm name, the index of its best match and that score
results = [['Algorithm', 'Closest Match', 'Score']]
for algo_name in algorithms:
    best_index, best_score = fuzzy_distance(data1, data2, algo_name)
    results += [[algo_name, best_index, float(best_score)]]

# Bare last expression: display the nested list, header included
results

[['Algorithm', 'Closest Match', 'Score'],
 ['jaccard', 1, 0.875],
 ['levenshtein', 3, 0.875],
 ['hamming', 3, 0.875],
 ['cosine', 1, 0.9354143466934853],
 ['jaro', 3, 0.9583333333333334],
 ['jaro_winkler', 3, 0.975],
 ['sorensen', 1, 0.9333333333333333],
 ['ratcliff_obershelp', 3, 0.9333333333333333],
 ['damerau_levenshtein', 3, 0.875],
 ['strcmp95', 3, 0.975],
 ['needleman_wunsch', 3, 0.875],
 ['smith_waterman', 3, 0.8571428571428572],
 ['tversky', 1, 0.875],
 ['overlap', 1, 1.0],
 ['monge_elkan', 1, 0.0714285714285714],
 ['lcsseq', 3, 0.875],
 ['lcsstr', 3, 0.875],
 ['gotoh', 3, 0.9285714285714286],
 ['sorensen_dice', 1, 0.9333333333333333],
 ['dice', 1, 0.9333333333333333],
 ['bag', 1, 0.875],
 ['editex', 3, 0.875],
 ['mlipns', 1, 1.0],
 ['mra', 1, 0.8333333333333334]]

In [5]:
# Gradio demo
import gradio as gr
import pandas as pd
import textdistance

# Copy of fuzzy function
def fuzzy_distance(needle, haystack, algorithm='jaccard'):
    """Find the entry of ``haystack`` that is most similar to ``needle``.

    Args:
        needle: The string to look up.
        haystack: Iterable of candidate strings to compare against.
        algorithm: Name of the ``textdistance`` attribute to use as the
            algorithm (for example ``'jaccard'`` or ``'levenshtein'``).

    Returns:
        An ``(index, score)`` tuple: the zero-based position of the best
        candidate in ``haystack`` and its normalized similarity. If several
        candidates share the top score, the earliest one is returned.

    Raises:
        AttributeError: If ``textdistance`` has no attribute ``algorithm``.
        IndexError: If ``haystack`` is empty.
    """
    # Get the algorithm object from textdistance by name
    algo_func = getattr(textdistance, algorithm)
    # Score every candidate, keeping track of its original position
    scores = [
        (index, algo_func.normalized_similarity(needle, item))
        for index, item in enumerate(haystack)
    ]
    if not scores:
        # Same exception type an empty list produced before, but with a
        # message that names the actual problem
        raise IndexError("haystack is empty: there is nothing to match against")
    # max() returns the first of any tied scores -- the same winner a stable
    # descending sort would place first -- and needs only a single pass
    return max(scores, key=lambda pair: pair[1])

# Define the Gradio interface
def gradio_fuzzy(needle, haystack_df, algorithm):
    """Gradio callback: run ``fuzzy_distance`` on a DataFrame's ``items`` column.

    Args:
        needle: The lookup string.
        haystack_df: DataFrame whose ``'items'`` column holds the candidates.
        algorithm: Algorithm name forwarded unchanged to ``fuzzy_distance``.

    Returns:
        A two-element list ``[index, score]`` for the best candidate.
    """
    # Flatten the 'items' column into a plain Python list of candidates
    candidates = haystack_df['items'].tolist()
    # Delegate the matching itself to fuzzy_distance
    best_index, best_score = fuzzy_distance(needle, candidates, algorithm)
    # Hand the pair back as a list rather than a tuple
    return [best_index, best_score]

# Names of the textdistance algorithms offered as choices, in display order
algorithms = [
    'jaccard',
    'levenshtein',
    'hamming',
    'cosine',
    'jaro',
    'jaro_winkler',
    'sorensen',
    'ratcliff_obershelp',
    'damerau_levenshtein',
    'strcmp95',
    'needleman_wunsch',
    'smith_waterman',
    'tversky',
    'overlap',
    'monge_elkan',
    'lcsseq',
    'lcsstr',
    'gotoh',
    'sorensen_dice',
    'dice',
    'bag',
    'editex',
    'mlipns',
    'mra',
]

# Create the Gradio interface: three inputs mapped positionally onto
# gradio_fuzzy(needle, haystack_df, algorithm)
demo = gr.Interface(
    fn=gradio_fuzzy,
    inputs=[
        gr.Textbox(label="lookup_value"),
        # Single "items" column, passed to the callback as a pandas DataFrame
        gr.Dataframe(headers=["items"], label="lookup_array", type="pandas"),
        # Preselect 'jaccard' (the default of fuzzy_distance) so the callback
        # receives a valid algorithm name even when the dropdown is left
        # untouched, however the installed Gradio initialises an unset value
        gr.Dropdown(choices=algorithms, value="jaccard", label="algorithm")
    ],
    # The returned [index, score] list is shown in a text box
    outputs=gr.Textbox(label="Result"),
    # Each example row supplies lookup_value, lookup_array and algorithm
    examples = [
    ["apple", pd.DataFrame({"items": ["apples", "apply", "ape", "grape"]}), "jaccard"],
    ["cat", pd.DataFrame({"items": ["cats", "cut", "bat", "hat"]}), "levenshtein"],
    ["hello", pd.DataFrame({"items": ["helloo", "helo", "hallo", "hey"]}), "cosine"]
    ]
)

# Launch the interface; show_error=True asks Gradio to surface callback
# errors in the UI
demo.launch(show_error=True)

  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7900

To create a public link, set `share=True` in `launch()`.




In [6]:
import unittest

class TestFuzzy(unittest.TestCase):
    def test_jaccard(self):
        needle = 'example'
        haystack = ['samples', 'exemplar', 'sample', 'examples']
        algorithm = 'jaccard'
        expected = (1, 0.7)
        self.assertEqual(fuzzy(needle, haystack, algorithm), expected, 'Test case 1 failed')

    def test_levenshtein(self):
        needle = 'example'
        haystack = ['samples', 'exemplar', 'sample', 'examples']
        algorithm = 'levenshtein'
        expected = (3, 0.875)
        self.assertEqual(fuzzy(needle, haystack, algorithm), expected, 'Test case 2 failed')

# Run the tests in place. argv=[''] supplies a dummy program name so that
# unittest does not parse the process's own command-line arguments
# (presumably the kernel's launch arguments when run in a notebook), and
# exit=False keeps unittest from calling sys.exit() once the run finishes.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

EE
ERROR: test_jaccard (__main__.TestFuzzy.test_jaccard)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\brent\AppData\Local\Temp\ipykernel_22372\1883970406.py", line 9, in test_jaccard
    self.assertEqual(fuzzy(needle, haystack, algorithm), expected, 'Test case 1 failed')
                     ^^^^^
NameError: name 'fuzzy' is not defined

ERROR: test_levenshtein (__main__.TestFuzzy.test_levenshtein)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\brent\AppData\Local\Temp\ipykernel_22372\1883970406.py", line 16, in test_levenshtein
    self.assertEqual(fuzzy(needle, haystack, algorithm), expected, 'Test case 2 failed')
                     ^^^^^
NameError: name 'fuzzy' is not defined

----------------------------------------------------------------------
Ran 2 tests in 0.006s

FAILED (errors=2)
