In [7]:
import pandas as pd
import json

# Recreate the module-level arguments that a RUNPY call would provide.
# Array-valued arguments are wrapped in pandas DataFrames.
arg1 = pd.DataFrame({"needles": ["sample", "exemplary", "sampler", "example"]})
arg2 = pd.DataFrame({"haystack": ["samples", "exemplar", "sample", "examples"]})
arg3 = "jaccard"

# Build a JSON-friendly copy of the arguments for loading into the demo
# workbook: DataFrames become nested lists, scalars pass through unchanged.
args = {
    name: (value.values.tolist() if isinstance(value, pd.DataFrame) else value)
    for name, value in (("arg1", arg1), ("arg2", arg2), ("arg3", arg3))
}
json.dumps(args)

'{"arg1": [["sample"], ["exemplary"], ["sampler"], ["example"]], "arg2": [["samples"], ["exemplar"], ["sample"], ["examples"]], "arg3": "jaccard"}'

In [7]:
def fuzzy_top_n(needle, haystack_df, algorithm, top_n):
    """
    Rank haystack entries by text similarity to each needle and keep the best.

    Candidates are ordered by the algorithm's unrounded
    ``normalized_similarity`` score, highest first. Candidates with exactly
    equal scores keep their original haystack order.

    Args:
        needle (str|DataFrame): A single search term, or a DataFrame whose
            cells are all used as search terms (flattened row by row).
        haystack_df (DataFrame): Candidates to search; all cells are
            flattened into one list.
        algorithm (str): Name of a ``textdistance`` attribute that exposes
            ``normalized_similarity`` (e.g. 'jaccard').
        top_n (int|None): Maximum number of matches to return per needle.
            ``None`` returns every candidate.

    Returns:
        list: When there is exactly one needle, a flat list of up to
        ``top_n`` haystack entries, best match first. Otherwise a list
        containing one such list per needle (empty when there are no
        needles).

    Raises:
        AttributeError: If ``textdistance`` has no attribute ``algorithm``.
        ValueError: If ``top_n`` is negative.
    """
    # A negative slice bound would silently drop the *worst* matches instead
    # of limiting the result, so reject it explicitly.
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    # NOTE(review): `textdistance` is not imported in this cell; it must
    # already be present in the kernel namespace.
    algo_func = getattr(textdistance, algorithm)
    haystack = haystack_df.values.flatten().tolist()

    if isinstance(needle, pd.DataFrame):
        needle_list = needle.values.flatten().tolist()
    else:
        needle_list = [needle]

    results = []
    for needle_item in needle_list:
        # Sort on the raw score: rounding before ranking would collapse
        # near-equal scores into ties and misorder the matches. sorted() is
        # stable, so exact ties stay in haystack order.
        ranked = sorted(
            haystack,
            key=lambda candidate: algo_func.normalized_similarity(needle_item, candidate),
            reverse=True,
        )
        results.append(ranked[:top_n])

    # A lone needle yields a flat list rather than a one-element list of lists.
    return results[0] if len(results) == 1 else results

# Example usage: pass the algorithm chosen in the setup cell (arg3) instead of
# repeating the literal, so the demo call stays in sync with the serialized
# arguments. Keeps the three best matches per needle.
fuzzy_top_n(arg1, arg2, arg3, top_n=3)

[[3, 1.0], [2, 0.89], [3, 0.86], [2, 0.88]]