This function only works in Excel `PY`, because it requires the `thefuzz` library, which is not available in Pyodide. The main reason to use it is that its Levenshtein implementation is roughly 15x faster than the one used by the `nltk_distance` function.

In [1]:
import pandas as pd

# Stand-ins for the globals that the RUNPY function would supply.
# Array arguments have to be wrapped in a pandas DataFrame.
arg1 = pd.DataFrame({'needles': ["sample", "exemplary", "sampler", "example"]})
arg2 = pd.DataFrame({'haystack': ["samples", "exemplar", "sample", "examples"]})
arg3 = 'ratio'

In [2]:
from thefuzz import fuzz
import pandas as pd

def thefuzz_distance(needle, haystack_df, algorithm='ratio'):
    """Find the best-matching haystack entry for each needle using ``thefuzz``.

    Args:
        needle: Either a single value, or a pandas DataFrame whose cells are
            flattened (row by row) and each treated as a separate needle.
        haystack_df: pandas DataFrame of candidate values; its cells are
            flattened (row by row) into the list that is searched.
        algorithm: Name of the ``thefuzz.fuzz`` scorer to apply. One of
            'ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'.

    Returns:
        A 2D list with one ``[index, score]`` row per needle, e.g.
        ``[[1, 0.75], [2, 0.85]]``. ``index`` is the 1-based position of the
        best match in the flattened haystack and ``score`` is the scorer's
        result divided by 100. When several candidates share the top score,
        the earliest one is reported.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names, or if
            the haystack contains no values to match against.
    """
    supported = ('ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio')
    # Validate once, up front, so a bad name is reported even when there are
    # no needles to process.
    if algorithm not in supported:
        raise ValueError(f"Unknown algorithm: {algorithm}")

    # Flatten the DataFrame to a list
    haystack = haystack_df.values.flatten().tolist()
    if not haystack:
        # Without this guard the best-match lookup below would fail with an
        # uninformative error on an empty sequence.
        raise ValueError("haystack is empty; there is nothing to match against")

    # A DataFrame needle means "many needles"; anything else is a single one.
    if isinstance(needle, pd.DataFrame):
        needle_list = needle.values.flatten().tolist()
    else:
        needle_list = [needle]

    # Every supported name is also the attribute name of the scorer on `fuzz`.
    scorer = getattr(fuzz, algorithm)

    results = []
    for needle_item in needle_list:
        # max() keeps the first of equally-scored candidates, which is the
        # same winner a stable descending sort would put in front.
        best_index, best_score = max(
            (
                (position, scorer(needle_item, candidate) / 100)
                for position, candidate in enumerate(haystack, start=1)
            ),
            key=lambda pair: pair[1],
        )
        results.append([best_index, best_score])

    # results is 2D list, e.g. [[1, 0.75], [2, 0.85]]
    return results

# Demo call: look up every needle in the haystack with the chosen scorer.
thefuzz_distance(needle=arg1, haystack_df=arg2, algorithm=arg3)

[[3, 1.0], [2, 0.94], [3, 0.92], [4, 0.93]]