In [5]:
import textdistance

# Basic usage of the textdistance library: an algorithm object (here
# `hamming`) is called through its `normalized_similarity(a, b)` method.
# Compute the normalized Hamming similarity of two strings; the recorded
# output of this cell is 0.75.
textdistance.hamming.normalized_similarity('test', 'text')

0.75

In [6]:
import pandas as pd

# Mimic the globals a RUNPY call would provide: array arguments arrive as
# pandas DataFrames, while the algorithm name is a plain string.
arg1 = pd.DataFrame({'needles': ["sample", "exemplary", "sampler", "example"]})
arg2 = pd.DataFrame({'haystack': ["samples", "exemplar", "sample", "examples"]})
arg3 = 'jaccard'

# Show the needles frame as the cell output.
arg1

Unnamed: 0,needles
0,sample
1,exemplary
2,sampler
3,example


In [7]:
import textdistance
import pandas as pd

def text_distance(needle, haystack_df, algorithm='jaccard'):
    """Find the closest haystack entry for each needle.

    Args:
        needle: Either a single value, which is compared as-is, or a pandas
            DataFrame whose cells are flattened into a list of needles.
        haystack_df: Object whose ``.values`` can be flattened into the list
            of candidates (a pandas DataFrame in this notebook).
        algorithm: Name of an attribute of the ``textdistance`` module that
            provides ``normalized_similarity(a, b)``.

    Returns:
        A 2D list with one ``[position, score]`` entry per needle, e.g.
        ``[[1, 0.75], [2, 0.85]]``. ``position`` is the 1-based index of the
        best candidate in the flattened haystack and ``score`` is its
        similarity rounded to 2 decimal places. When several candidates share
        the best score, the earliest one is reported.

    Raises:
        AttributeError: If ``textdistance`` has no attribute named
            ``algorithm``.
        IndexError: If there is at least one needle but the haystack is empty.
    """
    # Resolve the similarity callable once instead of on every comparison.
    similarity = getattr(textdistance, algorithm).normalized_similarity
    # Flatten the DataFrame to a list
    haystack = haystack_df.values.flatten().tolist()

    # A DataFrame of needles is flattened; anything else is a single needle.
    if isinstance(needle, pd.DataFrame):
        needle_list = needle.values.flatten().tolist()
    else:
        needle_list = [needle]

    # Same exception type as indexing an empty score list, but with a
    # message that names the actual problem.
    if needle_list and not haystack:
        raise IndexError("haystack is empty: there is no candidate to match against")

    results = []
    for needle_item in needle_list:
        raw_scores = [similarity(needle_item, item) for item in haystack]
        # Rank on the unrounded scores so that two candidates which only
        # differ beyond the second decimal are still ordered correctly.
        # max() returns the first maximal element, so exact ties keep going
        # to the earliest candidate. Positions are 1-based.
        best_position, best_score = max(
            enumerate(raw_scores, start=1), key=lambda pair: pair[1]
        )
        # Round only the reported value.
        results.append([best_position, round(best_score, 2)])

    # results is 2D list, e.g. [[1, 0.75], [2, 0.85]]
    return results

# Match every needle in arg1 against the haystack in arg2 using the algorithm
# named by arg3. Each result row is [1-based haystack position, score rounded
# to 2 decimal places].
text_distance(arg1, arg2, arg3)

[[3, 1.0], [2, 0.89], [3, 0.86], [2, 0.88]]

In [8]:
# Names of the textdistance algorithms to compare, in reporting order
algorithms = [
    'jaccard',
    'levenshtein',
    'hamming',
    'cosine',
    'jaro',
    'jaro_winkler',
    'sorensen',
    'ratcliff_obershelp',
    'damerau_levenshtein',
    'strcmp95',
    'needleman_wunsch',
    'smith_waterman',
    'tversky',
    'overlap',
    'monge_elkan',
    'lcsseq',
    'lcsstr',
    'gotoh',
    'sorensen_dice',
    'dice',
    'bag',
    'editex',
    'mlipns',
    'mra',
]

# A single needle and a small haystack to run every algorithm against
needle = "sampling"
haystack_df = pd.DataFrame(["sample", "example", "sampling", "test"])

# Header row first, then one [algorithm, position, score] row per algorithm
results = [['Algorithm', 'Closest Match', 'Score']]
for algo in algorithms:
    # A scalar needle yields exactly one [position, score] pair
    match, score = text_distance(needle, haystack_df, algorithm=algo)[0]
    results += [[algo, match, float(score)]]

# The nested list (headers included) is the cell output
results

[['Algorithm', 'Closest Match', 'Score'],
 ['jaccard', 3, 1.0],
 ['levenshtein', 3, 1.0],
 ['hamming', 3, 1.0],
 ['cosine', 3, 1.0],
 ['jaro', 3, 1.0],
 ['jaro_winkler', 3, 1.0],
 ['sorensen', 3, 1.0],
 ['ratcliff_obershelp', 3, 1.0],
 ['damerau_levenshtein', 3, 1.0],
 ['strcmp95', 3, 1.0],
 ['needleman_wunsch', 3, 1.0],
 ['smith_waterman', 3, 1.0],
 ['tversky', 3, 1.0],
 ['overlap', 3, 1.0],
 ['monge_elkan', 3, 1.0],
 ['lcsseq', 3, 1.0],
 ['lcsstr', 3, 1.0],
 ['gotoh', 3, 1.0],
 ['sorensen_dice', 3, 1.0],
 ['dice', 3, 1.0],
 ['bag', 3, 1.0],
 ['editex', 3, 1.0],
 ['mlipns', 1, 1.0],
 ['mra', 3, 1.0]]