# Benchmarks for String Similarity Scoring Functions

Install the most commonly used Python packages for string similarity scoring. This includes JellyFish for Levenshtein and Damerau-Levenshtein distance, RapidFuzz for Levenshtein distance, and BioPython for Needleman-Wunsch scores, among others.

In [None]:
# Library under test:
!pip install stringzilla # https://github.com/ashvardanian/stringzilla

# For Levenshtein distance:
!pip install rapidfuzz  # https://github.com/rapidfuzz/RapidFuzz
# NOTE(review): the next two packages link to repositories of the same maintainer;
# presumably one is a compatibility alias of the other - confirm both are needed.
!pip install python-Levenshtein  # https://github.com/maxbachmann/python-Levenshtein
!pip install levenshtein # https://github.com/maxbachmann/Levenshtein
!pip install jellyfish # https://github.com/jamesturk/jellyfish/
!pip install editdistance # https://github.com/roy-ht/editdistance
!pip install distance # https://github.com/doukremt/distance
!pip install polyleven # https://github.com/fujimotos/polyleven
!pip install edlib # https://github.com/Martinsos/edlib
!pip install nltk # https://github.com/nltk/nltk

# For Needleman-Wunsch and Smith-Waterman algorithms with custom scoring matrices:
!pip install biopython # https://github.com/biopython/biopython

# For data manipulation:
!pip install pandas
!pip install tabulate

Let's import all those libraries and check their basic functionality on a couple of simple English examples, as well as Unicode ones, outputting the results in a table.

In [None]:
import jellyfish as jf
import Levenshtein as le
import editdistance as ed
from rapidfuzz.distance import Levenshtein as rf
from nltk.metrics.distance import edit_distance as nltk_ed
import edlib
import stringzilla as sz

import pandas as pd
from tabulate import tabulate

# Pairs of strings to compare. Rows that rely on a specific Unicode
# normalization form are spelled with explicit escapes, so the intended code
# points survive editors and tools that silently normalize text.
examples = [
    ('apple', 'aple'),
    ('αβγδ', 'αγδ'),
    # ('مرحبا بالعالم', 'مرحبا يا عالم'), # "Hello World" vs "Welcome to the World" ?
    ('\u00e9cole', 'e\u0301cole'),  # Letter "é" as a single character vs "e" + combining "´"
    ('Sch\u00f6n', 'Scho\u0308n'),  # "ö" as a single character vs "o" + combining "¨"
    ('💖', '💗'),  # 4-byte emojis: Different hearts
    ('𠜎 𠜱 𠝹 𠱓', '𠜎𠜱𠝹𠱓'),  # Ancient Chinese characters, spaces vs no spaces
    ('München', 'Muenchen'),  # German name with umlaut vs. its transcription
    ('façade', 'facade'),  # "ç" represented as "c" with cedilla vs. plain "c"
    ('こんにちは世界', 'こんばんは世界'),  # Japanese: "Hello world" vs "Good evening world"
    ('👩‍👩‍👧‍👦', '👨‍👩‍👧‍👦'),  # Family emojis with different compositions
    ('Data科学123', 'Data科學321'),
    ('🙂🌍🚀', '🙂🌎✨'),
]

# Score every pair with each library, collecting one table row per pair.
results = []
for first, second in examples:
    row = {
        'Example': first + ' vs ' + second,
        'Jellyfish': jf.levenshtein_distance(first, second),
        'Levenshtein': le.distance(first, second),
        'RapidFuzz': rf.distance(first, second),
        'EditDistance': ed.eval(first, second),
        'NLTK': nltk_ed(first, second),
    }
    # StringZilla is queried through both entry points; the Unicode-aware
    # result is listed first in the table.
    plain_distance = sz.edit_distance(first, second)
    row['StringZilla (Unicode)'] = sz.edit_distance_unicode(first, second)
    row['StringZilla'] = plain_distance
    results.append(row)

# A DataFrame gives tabulate the column headers and a row index
df = pd.DataFrame(results)

# Render as a bordered ("grid") text table
print(tabulate(df, headers='keys', tablefmt="grid"))


## Levenshtein Distance Between Short English Words

We will be conducting benchmarks on a real-world dataset of English words. Let's download the dataset and load it into memory.

In [None]:
# Fetch the word corpus into the parent directory; --no-clobber avoids
# overwriting a copy that is already there.
!wget --no-clobber -O ../leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt

In [None]:
# Load the corpus saved by the download cell (`../leipzig1M.txt`) and split it
# on whitespace. The read is capped at 1024**3 characters to bound memory use,
# and the `with` block closes the file handle as soon as the text is in memory.
with open("../leipzig1M.txt", "r") as corpus_file:
    words = tuple(corpus_file.read(1024 * 1024 * 1024).split())
print(f"{len(words):,} words")

In [None]:
import random

def checksum_distances(tokens, distance_function, n: int = 1_000_000):
    """Sum `distance_function` over `n` randomly drawn pairs of `tokens`.

    Each iteration draws two tokens independently (with replacement) using
    `random.choice`, first `a`, then `b`, and adds `distance_function(a, b)`
    to a running total. Returning the total keeps the calls observable, so
    the function doubles as a benchmark body and a cheap cross-library
    sanity check when the random generator is seeded identically.

    Args:
        tokens: Non-empty sequence to sample from; `random.choice` raises
            `IndexError` on an empty sequence.
        distance_function: Callable taking two tokens and returning a number.
        n: Number of pairs to evaluate. Zero or a negative value evaluates
            no pairs and returns 0.

    Returns:
        The sum of all computed distances.
    """
    distances_sum = 0
    # `range(n)` terminates for any integer `n`; the previous `while n:`
    # countdown looped forever when `n` was negative.
    for _ in range(n):
        a = random.choice(tokens)
        b = random.choice(tokens)
        distances_sum += distance_function(a, b)
    return distances_sum

In [None]:
%%timeit
# StringZilla `edit_distance` over the default 1,000,000 random word pairs
checksum_distances(words, sz.edit_distance)

In [None]:
%%timeit
# jellyfish `levenshtein_distance` over the default 1,000,000 random word pairs
checksum_distances(words, jf.levenshtein_distance)

In [None]:
%%timeit
# NLTK `edit_distance` over the default 1,000,000 random word pairs
checksum_distances(words, nltk_ed)

In [None]:
%%timeit
# editdistance `eval` over the default 1,000,000 random word pairs
checksum_distances(words, ed.eval)

In [None]:
%%timeit
# RapidFuzz `Levenshtein.distance` over the default 1,000,000 random word pairs
checksum_distances(words, rf.distance)

In [None]:
%%timeit
# Levenshtein `distance` over the default 1,000,000 random word pairs
checksum_distances(words, le.distance)

In [None]:
%%timeit
# edlib.align returns a result mapping rather than a number, so the lambda
# pulls out its "editDistance" entry to fit the two-argument callable that
# checksum_distances expects. The lambda call is part of the timed work.
# NOTE(review): mode="NW" / task="distance" presumably select global alignment
# without path reconstruction - confirm against the edlib documentation.
checksum_distances(words, lambda a, b: edlib.align(a, b, mode="NW", task="distance")["editDistance"])

## Levenshtein Distances for Longer Proteins

In [None]:
import random

# Random test sequences: every letter is drawn independently, with
# replacement, from the four-letter alphabet below.
sequence_count = 1_000
sequence_length = 10_000
letters = 'ACGT'
proteins = [
    ''.join(random.choices(letters, k=sequence_length))
    for _ in range(sequence_count)
]
print(f"{len(proteins):,} proteins")

In [None]:
%%timeit
# StringZilla `edit_distance` over 100 random pairs of the generated sequences
checksum_distances(proteins, sz.edit_distance, 100)

In [None]:
%%timeit
# jellyfish `levenshtein_distance` over 100 random pairs of the generated sequences
checksum_distances(proteins, jf.levenshtein_distance, 100)

In [None]:
%%timeit
# editdistance `eval` over 100 random pairs of the generated sequences
checksum_distances(proteins, ed.eval, 100)

In [None]:
%%timeit
# RapidFuzz `Levenshtein.distance` over 100 random pairs of the generated sequences
checksum_distances(proteins, rf.distance, 100)

In [None]:
%%timeit
# Levenshtein `distance` over 100 random pairs of the generated sequences
checksum_distances(proteins, le.distance, 100)

## Needleman-Wunsch Alignment Scores Between Random Protein Sequences

For Needleman-Wunsch, let's set up BioPython's pairwise aligner with the BLOSUM62 substitution matrix; the random sequences generated above are reused as inputs:

In [None]:
from Bio import Align
from Bio.Align import substitution_matrices
# BioPython pairwise aligner used as the baseline; substitutions are scored
# with the BLOSUM62 matrix loaded from BioPython's bundled matrices.
aligner = Align.PairwiseAligner()
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
# Opening and extending a gap get the same score (1), the same value passed as
# `gap_score=1` to StringZilla in the cells below.
# NOTE(review): a positive gap score adds to the alignment score instead of
# penalizing gaps - confirm this is intended and not meant to be -1.
aligner.open_gap_score = 1
aligner.extend_gap_score = 1

In [None]:
# Display the substitution matrix assigned to the aligner
aligner.substitution_matrix

Let's convert the BLOSUM matrix into a dense form with 256x256 elements. This will allow us to use the matrix with the Needleman-Wunsch algorithm implemented in StringZilla.

In [None]:
import numpy as np

# Compact matrix: one row and one column per letter of the matrix alphabet
matrix_alphabet = aligner.substitution_matrix.alphabet
subs_packed = np.array(aligner.substitution_matrix).astype(np.int8)

# Dense 256x256 table indexed by character code. Every cell starts at 127,
# the largest int8 value, which marks pairs involving characters that are
# outside the alphabet.
subs_reconstructed = np.full((256, 256), 127, dtype=np.int8)

# Scatter each packed row into the row and columns addressed by the
# character codes of the alphabet letters.
letter_codes = [ord(letter) for letter in matrix_alphabet]
for packed_row, row_code in enumerate(letter_codes):
    subs_reconstructed[row_code, letter_codes] = subs_packed[packed_row]

# Number of cells holding a value below the 127 marker
(subs_reconstructed < 127).sum()

In [None]:
# Baseline: BioPython's score for the first two generated sequences
aligner.score(proteins[0], proteins[1])

In [None]:
# Same pair scored by StringZilla with the dense matrix and gap score 1, for
# comparison with the BioPython result
sz.alignment_score(proteins[0], proteins[1], substitution_matrix=subs_reconstructed, gap_score=1)

In [None]:
%%timeit
# Bind the dense substitution matrix and gap score so the scorer has the
# two-argument shape that checksum_distances calls. The definition lives inside
# the timed cell, so it is re-executed on every timing run.
def sz_score(a, b): return sz.alignment_score(a, b, substitution_matrix=subs_reconstructed, gap_score=1)
checksum_distances(proteins, sz_score, 100)

In [None]:
%%timeit
# BioPython `aligner.score` over 100 random pairs of the generated sequences
checksum_distances(proteins, aligner.score, 100)