In [1]:
# Function to run test cases
def run_tests(func, test_cases):
    if not test_cases:
        raise ValueError("No test cases provided.")
    
    if not isinstance(test_cases, list):
        raise TypeError("Test cases should be provided as a list.")
    
    for i, test_case in enumerate(test_cases):
        if not isinstance(test_case, list):
            raise TypeError(f"Test case {i+1} is not a list.")
        
        # Handle both single arguments and lists of arguments
        if not test_case:
            raise ValueError(f"Test case {i+1} is empty.")
        
        if isinstance(test_case[0], list):
            result = func(test_case)
            test_case_str = str(test_case)
        else:
            result = func(*test_case)
            test_case_str = str(test_case)
        
        print(f"Case {i+1}: {test_case_str} -> {result}")

## Similarity Algorithms

The similarity algorithms available in the `textdistance` package are listed in the tables below.

### Edit Distance

| Algorithm            | Description                                                                 |
|----------------------|-----------------------------------------------------------------------------|
| [`damerau_levenshtein`](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) | Similar to Levenshtein but considers transpositions as a single edit. |
| [`hamming`](https://en.wikipedia.org/wiki/Hamming_distance)            | Measures the number of positions at which the corresponding symbols are different. |
| [`levenshtein`](https://en.wikipedia.org/wiki/Levenshtein_distance)        | Calculates the minimum number of single-character edits required to change one word into the other. |
| [`jaro`](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)               | Measures similarity between two strings from the number of matching characters and the number of transpositions among them. |
| [`jaro_winkler`](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)       | An extension of Jaro, giving more weight to strings that match from the beginning. |
| [`lcsseq`](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem)             | Measures the longest common subsequence. |
| [`lcsstr`](https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)             | Measures the longest common substring. |
| [`ratcliff_obershelp`](https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching) | Measures similarity by recursively finding the longest common substrings (gestalt pattern matching). |
| [`strcmp95`](http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c)           | A string comparison algorithm developed by the U.S. Census Bureau. |
| [`needleman_wunsch`](https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm)   | A dynamic programming algorithm for sequence alignment. |
| [`smith_waterman`](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm)     | A dynamic programming algorithm for local sequence alignment. |
| [`gotoh`](http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/LOA/Lec6-Sequence-Alignment-Affine-Gaps-Gotoh1982.pdf)              | An extension of Needleman-Wunsch with affine gap penalties. |

### Token

| Algorithm            | Description                                                                 |
|----------------------|-----------------------------------------------------------------------------|
| [`cosine`](https://en.wikipedia.org/wiki/Cosine_similarity)             | Measures the cosine of the angle between two non-zero vectors. |
| [`jaccard`](https://en.wikipedia.org/wiki/Jaccard_index)            | Measures similarity between finite sample sets. |
| [`overlap`](https://en.wikipedia.org/wiki/Overlap_coefficient)            | Measures the overlap coefficient between two sets. |
| [`sorensen`](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)           | Measures similarity between two sets as twice the size of the intersection divided by the sum of the sizes of the two sets. |
| [`sorensen_dice`](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)      | Alias of `sorensen`; computes the same Sørensen–Dice coefficient. |
| [`dice`](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)               | Another name for Sorensen-Dice coefficient. |
| [`tversky`](https://en.wikipedia.org/wiki/Tversky_index)            | A generalization of the Jaccard index. |

### Sequence

| Algorithm            | Description                                                                 |
|----------------------|-----------------------------------------------------------------------------|
| [`bag`](https://github.com/Yomguithereal/talisman/blob/master/src/metrics/bag.js)                | Measures bag similarity between two sequences                               |
| [`mlipns`](http://www.sial.iias.spb.su/files/386-386-1-PB.pdf)             | Measures similarity using the MLIPNS algorithm                              |
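| [`monge_elkan`](https://www.academia.edu/200314/Generalized_Monge-Elkan_Method_for_Approximate_Text_String_Comparison)        | A hybrid token-level measure: each token of one sequence is matched to its most similar token in the other using an inner similarity function, and the best scores are averaged. $ME(a,b) = \frac{1}{\lvert a \rvert} \sum_{i=1}^{\lvert a \rvert} \max_{j=1}^{\lvert b \rvert} sim(a_i, b_j)$ |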

### Phonetic

| Algorithm                                                                    | Description                                                                 |
|------------------------------------------------------------------------------|-----------------------------------------------------------------------------|
| [`mra`](https://en.wikipedia.org/wiki/Match_rating_approach)                 | Measures similarity using the MRA algorithm                                 |
| [`editex`](https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html) | Measures similarity using the Editex algorithm                              |

In [2]:
import textdistance
import pandas as pd

def text_distance(needle, haystack_df, algorithm):
    """
    Find the best-matching haystack entry for each search term.

    Args:
        needle (str|DataFrame): Single string or DataFrame containing search term(s)
        haystack_df (DataFrame): DataFrame containing strings to search within
        algorithm (str): Text distance algorithm name from textdistance library
            (e.g., 'jaccard', 'levenshtein', 'jaro_winkler')

    Returns:
        list[list]: 2D list of [index, score] pairs, one per needle term, e.g.
            [[1, 0.75], [2, 0.85]]. ``index`` is the 1-based position of the
            best match in the flattened haystack and ``score`` is its
            normalized similarity rounded to 2 decimal places. When several
            entries share the best score, the earliest one is reported.

    Raises:
        AttributeError: If ``algorithm`` is not an attribute of ``textdistance``.
        ValueError: If there is at least one needle term but the haystack is
            empty.
    """
    # Get the algorithm object from textdistance
    algo_func = getattr(textdistance, algorithm)
    # Flatten the DataFrame to a list
    haystack = haystack_df.values.flatten().tolist()

    # A DataFrame needle is flattened the same way; a bare value is one term.
    if isinstance(needle, pd.DataFrame):
        needle_list = needle.values.flatten().tolist()
    else:
        needle_list = [needle]

    # Fail with a clear message instead of an IndexError from an empty ranking.
    if needle_list and not haystack:
        raise ValueError("haystack_df contains no strings to compare against.")

    results = []
    for needle_item in needle_list:
        raw_scores = [
            algo_func.normalized_similarity(needle_item, item)
            for item in haystack
        ]
        # Rank on the unrounded scores: rounding first can turn a near miss
        # into a tie and let an earlier, worse candidate win. max() keeps the
        # first of equal scores, as the previous stable descending sort did.
        best_position = max(range(len(raw_scores)), key=raw_scores.__getitem__)
        # Round only for presentation; report a 1-based index.
        results.append([best_position + 1, round(raw_scores[best_position], 2)])

    return results

# Fixtures: a four-word haystack and a two-term needle DataFrame
test_haystack = pd.DataFrame(['apple', 'banana', 'orange', 'pear'])
test_needle_df = pd.DataFrame(['aple', 'banan'])

# Every case searches the same haystack; only the needle and algorithm vary.
test_cases = [
    [case_needle, test_haystack, case_algorithm]
    for case_needle, case_algorithm in (
        ('aple', 'jaccard'),            # Single string, should match 'apple'
        ('banan', 'levenshtein'),       # Single string with different algorithm
        (test_needle_df, 'jaccard'),    # DataFrame input
        ('peer', 'jaro_winkler'),       # Different spelling with jaro_winkler
    )
]

# Excel usage: =TEXT_DISTANCE("aple", {"apple", "banana", "orange", "pear", "grape", "kiwi", "mango", "plum", "peach", "cherry"}, "jaccard")

run_tests(text_distance, test_cases)

Case 1: ['aple',         0
0   apple
1  banana
2  orange
3    pear, 'jaccard'] -> [[1, 0.8]]
Case 2: ['banan',         0
0   apple
1  banana
2  orange
3    pear, 'levenshtein'] -> [[2, 0.83]]
Case 3: [       0
0   aple
1  banan,         0
0   apple
1  banana
2  orange
3    pear, 'jaccard'] -> [[1, 0.8], [2, 0.83]]
Case 4: ['peer',         0
0   apple
1  banana
2  orange
3    pear, 'jaro_winkler'] -> [[4, 0.87]]


In [3]:
import textdistance
import pandas as pd

def fuzzy_top_n(needle, haystack_df, algorithm, top_n):
    """
    Find top N most similar strings for given search term(s).

    Args:
        needle (str|DataFrame): Single string or DataFrame containing search term(s)
        haystack_df (DataFrame): DataFrame containing strings to search within
        algorithm (str): Text distance algorithm name from textdistance library
            (e.g., 'jaccard', 'levenshtein', 'jaro_winkler')
        top_n (int): Number of closest matches to return for each needle;
            0 yields empty lists.

    Returns:
        list[list]: 2D list with one inner list per needle term, holding up to
            ``top_n`` haystack strings ordered from most to least similar.
            Entries with equal similarity keep their haystack order.

    Raises:
        AttributeError: If ``algorithm`` is not an attribute of ``textdistance``.
        ValueError: If ``top_n`` is negative.
    """
    algo_func = getattr(textdistance, algorithm)
    haystack = haystack_df.values.flatten().tolist()

    if isinstance(needle, pd.DataFrame):
        needle_list = needle.values.flatten().tolist()
    else:
        needle_list = [needle]

    # A negative slice bound would silently drop the *worst* matches instead
    # of limiting the result, so reject it outright.
    if top_n < 0:
        raise ValueError("top_n must be a non-negative integer.")

    results = []
    for needle_item in needle_list:
        # Rank on the raw similarity. The score is never returned, so rounding
        # it first would only create artificial ties and misorder close
        # candidates. sorted() is stable, so genuine ties keep haystack order.
        ranked = sorted(
            haystack,
            key=lambda item: algo_func.normalized_similarity(needle_item, item),
            reverse=True,
        )
        results.append(ranked[:top_n])

    # Always a 2D list, including when there is a single needle.
    return results

# Fixtures: a six-word haystack and a three-term needle DataFrame
test_haystack = pd.DataFrame(['apple', 'banana', 'orange', 'pear', 'apricot', 'grape'])
test_needle_df = pd.DataFrame(['aple', 'banan', 'oragne'])

# Every case searches the same haystack; needle, algorithm and top_n vary.
test_cases = [
    [case_needle, test_haystack, case_algorithm, case_top_n]
    for case_needle, case_algorithm, case_top_n in (
        ('aple', 'jaccard', 2),              # Returns ['apple', 'pear']
        ('oragne', 'levenshtein', 3),        # Returns ['orange', 'grape', 'apple']
        (test_needle_df, 'jaro_winkler', 2), # Returns one list of 2 matches per needle term
        ('peer', 'hamming', 3),              # Returns ['pear', 'apple', 'banana']
    )
]

# Excel usage: =FUZZY_TOP_N("aple", {"apple", "banana", "orange", "pear", "grape", "kiwi"}, "jaccard", 3)

run_tests(fuzzy_top_n, test_cases)

Case 1: ['aple',          0
0    apple
1   banana
2   orange
3     pear
4  apricot
5    grape, 'jaccard', 2] -> [['apple', 'pear']]
Case 2: ['oragne',          0
0    apple
1   banana
2   orange
3     pear
4  apricot
5    grape, 'levenshtein', 3] -> [['orange', 'grape', 'apple']]
Case 3: [        0
0    aple
1   banan
2  oragne,          0
0    apple
1   banana
2   orange
3     pear
4  apricot
5    grape, 'jaro_winkler', 2] -> [['apple', 'orange'], ['banana', 'orange'], ['orange', 'grape']]
Case 4: ['peer',          0
0    apple
1   banana
2   orange
3     pear
4  apricot
5    grape, 'hamming', 3] -> [['pear', 'apple', 'banana']]
