In [3]:
# Fetch the benchmark corpus next to the repository. With --no-clobber the
# download is skipped when ../leipzig1M.txt already exists, so re-running is cheap.
!wget --no-clobber -O ../leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt

File ‘../leipzig1M.txt’ already there; not retrieving.


In [1]:
import stringzilla as sz

In [2]:
# Read the whole corpus into memory once. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open("../leipzig1M.txt", "r") as corpus_file:
    pythonic_str: str = corpus_file.read()

# The same text as a StringZilla `Str`, so both implementations are benchmarked
# on identical input.
sz_str = sz.Str(pythonic_str)

# Needle shared by the substring-count benchmarks.
pattern = "the"

In [3]:
# Sanity check: both string objects should report the same length
# (formatted with thousands separators).
_length_report = f"{len(pythonic_str):,}, {len(sz_str):,}"
print(_length_report)

129,644,797, 129,644,797


In [7]:
# Sanity check: newline counts from the built-in str and the StringZilla Str,
# shown side by side as a 2-tuple.
tuple(text.count("\n") for text in (pythonic_str, sz_str))

(1000000, 1000000)

## Throughput

In [8]:
%%timeit -n 1 -r 10
# Baseline: split the corpus into lines with the built-in str and build a new
# sorted list. One loop per run, ten runs.
sorted(pythonic_str.splitlines())

455 ms ± 23.6 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [10]:
%%timeit -n 1 -r 10
# StringZilla counterpart: split into lines, then call .sort() on the returned
# sequence. Note this uses .sort() on the result, whereas the baseline uses sorted().
sz_str.splitlines().sort()

455 ms ± 17.1 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [11]:
%%timeit -n 1 -r 100
# Baseline: count occurrences of `pattern` across the whole corpus using the
# built-in str. One loop per run, one hundred runs.
pythonic_str.count(pattern)

132 ms ± 13 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


In [12]:
%%timeit -n 1 -r 100
# StringZilla counterpart of the substring count, same `pattern` and run counts.
sz_str.count(pattern)

33.1 ms ± 7.74 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


## Latency

In [13]:
%%timeit -n 1 -r 1
# Single-shot measurement (one loop, one run) of hashing the whole corpus.
# NOTE(review): CPython caches a str's hash after the first call, so this is
# presumably only meaningful on a freshly loaded string — confirm before re-running.
hash(pythonic_str)

30.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [14]:
%%timeit -n 1 -r 1
# Single-shot measurement (one loop, one run) of hashing the StringZilla Str.
hash(sz_str)

21.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [15]:
%%timeit -n 1 -r 1
# Single-shot latency of locating the first space with the built-in str.
pythonic_str.find(" ")

1.23 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [16]:
%%timeit -n 1 -r 1
# Single-shot latency of locating the first space with the StringZilla Str.
sz_str.find(" ")

3.4 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [17]:
%%timeit -n 1 -r 1
# Single-shot latency of splitting the corpus around its first space into a
# (head, separator, tail) tuple with the built-in str.
# NOTE(review): the tail is presumably materialised as a new string covering
# almost the whole corpus, which would dominate the measured time — confirm.
pythonic_str.partition(" ")

87.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [18]:
%%timeit -n 1 -r 1
# Single-shot latency of the same partition-on-first-space call on the
# StringZilla Str.
sz_str.partition(" ")

18.3 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Sequences

In [19]:
%%timeit -n 1 -r 1
# Baseline: split the corpus on single spaces into a list and sort that list
# in place. The sorted list is discarded; only the elapsed time matters.
pythonic_str.split(" ").sort()

10.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [20]:
%%timeit -n 1 -r 1
# StringZilla counterpart: split on single spaces, then call .sort() on the
# returned sequence. The result is discarded.
sz_str.split(" ").sort()

9.19 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Edit Distance

In [21]:
!pip install python-Levenshtein  # 4.8 M/mo: https://github.com/maxbachmann/python-Levenshtein
!pip install levenshtein # 4.2 M/mo: https://github.com/maxbachmann/Levenshtein
!pip install jellyfish # 2.3 M/mo: https://github.com/jamesturk/jellyfish/
!pip install editdistance # 700 k/mo: https://github.com/roy-ht/editdistance
!pip install distance # 160 k/mo: https://github.com/doukremt/distance
!pip install polyleven # 34 k/mo: https://github.com/fujimotos/polyleven

Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 180.3/180.3 kB 5.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: distance
  Building wheel for distance (setup.py) ... done
[?25h  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=b688ad5c13aada5f4d13ee0844df820e9f6260d94ac0456de71a70d11872ebf4
  Stored in directory: /home/av/.cache/pip/wheels/fb/cd/9c/3ab5d666e3bcacc58900b10959edd3816cc9557c7337986322
Successfully built distance
Installing collected packages: distance
Successfully installed distance-0.1.3


In [3]:
# Token list shared by every edit-distance benchmark. Splitting happens on
# single spaces only, so tokens may be empty or contain newline characters.
words = pythonic_str.split(sep=" ")

In [4]:
%%timeit
# StringZilla: Levenshtein distance from every token in `words` to three fixed
# reference words. Return values are discarded; only the call cost is measured.
for word in words:
    sz.levenshtein(word, "rebel")
    sz.levenshtein(word, "statement")
    sz.levenshtein(word, "sent")

4.5 s ± 55.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


: 

In [8]:
import polyleven as pl

In [14]:
%%timeit
# polyleven: same three reference words per token; results are discarded.
# NOTE(review): the third positional argument (100) looks like polyleven's
# distance threshold `k`; the other libraries are called without a bound — confirm
# this keeps the comparison fair.
for word in words:
    pl.levenshtein(word, "rebel", 100)
    pl.levenshtein(word, "statement", 100)
    pl.levenshtein(word, "sent", 100)

4.49 s ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
import editdistance as ed

In [16]:
%%timeit
# editdistance: same three reference words per token; results are discarded.
for word in words:
    ed.eval(word, "rebel")
    ed.eval(word, "statement")
    ed.eval(word, "sent")

24.9 s ± 300 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
import jellyfish as jf

In [20]:
%%timeit
# jellyfish: same three reference words per token; results are discarded.
for word in words:
    jf.levenshtein_distance(word, "rebel")
    jf.levenshtein_distance(word, "statement")
    jf.levenshtein_distance(word, "sent")

21.8 s ± 390 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
