In [2]:
import random
import string
from timeit import timeit

from dameraulevenshtein import damerau_levenshtein_diversity, damerau_levenshtein_distance

def damerau_levenshtein_diversity_py_cy(array):
    """Pairwise diversity using a Python loop around the imported distance.

    Sums ``damerau_levenshtein_distance(array[i], array[j])`` over every
    index pair with ``i < j`` and divides the total by ``n * (n + 1) / 2``,
    where ``n = len(array)``.

    Args:
        array: indexable, sized collection of sequences.

    Returns:
        The summed pairwise distance divided by ``n * (n + 1) / 2``.

    Raises:
        ZeroDivisionError: if ``array`` is empty (the divisor is zero).

    NOTE(review): the loops visit ``n * (n - 1) / 2`` pairs, but the divisor
    is ``n * (n + 1) / 2``. Confirm this matches the imported
    ``damerau_levenshtein_diversity`` before changing it.
    """
    n = len(array)
    total = sum(
        damerau_levenshtein_distance(array[i], array[j])
        for i in range(n)
        for j in range(i + 1, n)
    )
    return total / (n * (n + 1) / 2)

def damerau_levenshtein_diversity_py_py(array):
    """Pairwise diversity computed entirely in Python.

    Same computation as ``damerau_levenshtein_diversity_py_cy`` except that
    each pair is scored with ``damerau_levenshtein_distance_py``.

    Args:
        array: indexable, sized collection of sequences.

    Returns:
        The summed distance over all index pairs ``first < second`` divided
        by ``n * (n + 1) / 2`` with ``n = len(array)``.

    Raises:
        ZeroDivisionError: if ``array`` is empty (the divisor is zero).

    NOTE(review): only ``n * (n - 1) / 2`` pairs are summed while the divisor
    is ``n * (n + 1) / 2``; verify this is the intended normalisation.
    """
    n = len(array)
    divisor = n * (n + 1) / 2
    total = 0
    for first in range(n):
        for second in range(first + 1, n):
            total += damerau_levenshtein_distance_py(array[first], array[second])
    return total / divisor
    
def damerau_levenshtein_distance_py(seq1, seq2):
    """Edit distance between two sequences, computed in pure Python.

    Counts insertions, deletions, substitutions and swaps of two adjacent
    items, each at cost 1. Only three rows of the dynamic-programming table
    are kept at any time.

    Every row has ``len(seq2) + 1`` slots. The extra last slot holds the
    value for the "empty prefix of seq2" column, so that index ``-1`` reads
    it without a special case.

    Args:
        seq1: first indexable, sized sequence.
        seq2: second indexable, sized sequence.

    Returns:
        The integer edit distance between ``seq1`` and ``seq2``.
    """
    width = len(seq2)
    previous = None
    # Row for the empty prefix of seq1: distances 1..width, then the 0 sentinel.
    current = list(range(1, width + 1)) + [0]
    for row in range(len(seq1)):
        # Shift the window down one row; the new row's sentinel is row + 1.
        before_previous = previous
        previous = current
        current = [0] * width + [row + 1]
        for col in range(width):
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            substitution = previous[col - 1] + (seq1[row] != seq2[col])
            best = min(deletion, insertion, substitution)
            # Adjacent items swapped between the two sequences.
            swapped = (
                row > 0
                and col > 0
                and seq1[row] == seq2[col - 1]
                and seq1[row - 1] == seq2[col]
                and seq1[row] != seq2[col]
            )
            if swapped:
                best = min(best, before_previous[col - 2] + 1)
            current[col] = best
    return current[width - 1]


In [3]:
# Sanity check: both implementations should agree on a sample pair.
seq1 = '1lpuy42nvy1inwz431inwz4360f40'
seq2 = '1jtwy41isxz41jqvy4360'
# First printed line: pure-Python result minus the imported one (0 means they agree).
print(
    damerau_levenshtein_distance_py(seq1, seq2)
    - damerau_levenshtein_distance(seq1, seq2)
)
# Second printed line: the distance reported by the imported implementation.
print(damerau_levenshtein_distance(seq1, seq2))

0
17


In [4]:
# Time one distance call on (seq1, seq2): imported implementation first, then pure Python.
%timeit damerau_levenshtein_distance(seq1, seq2)
%timeit damerau_levenshtein_distance_py(seq1, seq2)

73.3 µs ± 2.75 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
554 µs ± 15.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [5]:
# Seeding is left disabled, so every run draws a different sample.
# random.seed(0)
lettersdigits = string.ascii_lowercase + string.digits


def random_seq():
    """Return a random string of 5 to 10 characters drawn from ``lettersdigits``."""
    # Draw the length first, then one character per position.
    length = random.randint(5, 10)
    chars = [random.choice(lettersdigits) for _ in range(length)]
    return ''.join(chars)


# 64 random sequences used as input for the diversity benchmarks.
array_seq = [random_seq() for _ in range(64)]
print(array_seq)

['glawv8u', 'ika91ihgjz', 'znsx819r3', 'i1fcmbne', 'zwbyy75h', 'ui1cw1w', 'o2sbr', 'v9eafjh1w', 'i311r', '3bcnhic', '12ypd8kz', 'lfa5zg89', 'jq98he', 'jmbha', 'kerjx', '6fiarlev', '8tg76oe', 'b7go0w', 'k9z1m', 'rqxo2ac', '26h2qbup', '7jvhs341', 'bj08y8m', 'zdrlh1cj5i', 'vw6c8v6i', 'uxc289', 'a2nuxbjwr', '62hmxr', '1tko9', 'kpcn1mybn', 'dz08pmx5ff', 'zfzsjzeew', 'ty6zw', '0uc6u7byd', 'o0n37', '7l5es9uv', 'glimpj', 'cl6z3p', 'bex5w', '49aik3u5q', 'q16jsn0w', 'mmo4rnvwc', '747x4x', '07kq70e9rh', '2k8yn1a7', 'hrz5cc1yt', '8yas36q6', 'kl940zkz3', '4c3ave8pc', 'mhj7s64p0', 'zou39nr4q', 'ovpgie', 't0tr6kbqhr', '0teozcdlj', 'oexnj', 'zz3ka', 'efn7tm', 'zhit9', 'k7qjdur', 'ms9hnuvn', 'j8ngb', '0uwo9', 'dje4rxn4', '05sqld0v3']


In [6]:
# Time the diversity of array_seq three ways: the imported diversity function,
# a Python loop around the imported distance, and the all-Python version.
%timeit damerau_levenshtein_diversity(array_seq)
%timeit damerau_levenshtein_diversity_py_cy(array_seq)
%timeit damerau_levenshtein_diversity_py_py(array_seq)

14.3 ms ± 380 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
14.5 ms ± 57.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
107 ms ± 612 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
