In [None]:
# Download the Leipzig 1M-sentence corpus into the parent directory as leipzig1M.txt.
# --no-clobber is meant to skip the download when that file is already present.
!wget --no-clobber -O ../leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt

In [None]:
!pip install stringzilla memory_profiler

In [1]:
# Load the memory_profiler IPython extension; it provides the %memit magic used later on.
%load_ext memory_profiler

In [2]:
import stringzilla as sz

In [3]:
# Read the whole corpus into memory once. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the garbage collector.
with open("../leipzig1M.txt", "r") as corpus_file:
    pythonic_str: str = corpus_file.read()
# Build a StringZilla string from the same text so both implementations see identical data.
sz_str = sz.Str(pythonic_str)
# Substring searched for in the counting benchmarks.
pattern = "the"

In [4]:
import itertools

# Take only the first five space-separated pieces from the split iterator and
# materialise them as a list in a single step.
top5 = list(itertools.islice(sz_str.split_iter(" "), 5))
top5

[sz.Str('A'),
 sz.Str('rebel'),
 sz.Str('statement'),
 sz.Str('sent'),
 sz.Str('to')]

In [5]:
# Report the Python string's length (characters) next to the StringZilla
# string's length (reported here as bytes).
print(
    f"{len(pythonic_str):,} characters taking {len(sz_str):,} bytes"
)

129,644,797 characters taking 129,644,797 bytes


In [6]:
# Count newline characters with both implementations and make sure they agree
# before relying on either number.
python_lines_count, sz_lines_count = pythonic_str.count("\n"), sz_str.count("\n")
assert sz_lines_count == python_lines_count
print(
    f"Both libraries report the same number of lines: {python_lines_count:,}"
)

Both libraries report the same number of lines: 1,000,000


In [7]:
# Approximate the number of words by the number of single-space separators.
count_words = pythonic_str.count(" ")
# Average word length in characters (Python str) and in bytes (StringZilla Str).
# The message previously read "characters bytes"; the stray word is removed.
print(f"Total of {count_words:,} words of average length ~{len(pythonic_str) / count_words:.2f} characters or ~{len(sz_str) / count_words:.2f} bytes")

Total of 20,191,473 words of average length ~6.42 characters bytes or ~6.42 bytes


In [8]:
# Time splitting with each API; sum(1 for _ in ...) walks every produced piece
# so the whole result is consumed.
%timeit sum(1 for _ in pythonic_str.split())
%timeit sum(1 for _ in sz_str.split())
# NOTE(review): split_iter is presumably the lazy counterpart of split — confirm in the StringZilla docs.
%timeit sum(1 for _ in sz_str.split_iter())

1.33 s ± 4.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
651 ms ± 2.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
465 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Same three expressions as the timing cell, this time measured for peak memory
# with %memit (provided by the memory_profiler extension).
%memit sum(1 for _ in pythonic_str.split())
%memit sum(1 for _ in sz_str.split())
%memit sum(1 for _ in sz_str.split_iter())

peak memory: 2235.44 MiB, increment: 1422.00 MiB
peak memory: 890.62 MiB, increment: 77.00 MiB
peak memory: 890.62 MiB, increment: 0.00 MiB


In [11]:
import random

In [12]:
%%timeit -n 1 -r 10
# Baseline: split the corpus into lines, then draw 1000 of them with replacement.
# The split happens inside the timed body, so it is part of the measurement.
# No random seed is set, so the drawn lines differ between runs.
random.choices(pythonic_str.splitlines(), k=1000)

143 ms ± 7.21 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [13]:
%%timeit -n 1 -r 10
# StringZilla counterpart: split into lines and sample 1000 of them.
# NOTE(review): the baseline's random.choices draws with replacement — confirm
# that `sample` has comparable semantics before comparing the numbers.
sz_str.splitlines().sample(1000)

25.1 ms ± 481 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)


## Throughput

In [14]:
%%timeit -n 1 -r 10
# Baseline: split the corpus into lines and build a new sorted list.
# Both the split and the sort are inside the timed body.
sorted(pythonic_str.splitlines())

506 ms ± 12.5 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [15]:
%%timeit -n 1 -r 10
# StringZilla counterpart: split into lines and call sort() on the resulting sequence.
# NOTE(review): presumably sorts in place, unlike sorted() in the baseline — confirm.
sz_str.splitlines().sort()

382 ms ± 11.1 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [16]:
%%timeit -n 1 -r 100
# Baseline: count non-overlapping occurrences of `pattern` across the whole corpus with str.count.
pythonic_str.count(pattern)

144 ms ± 4 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


In [17]:
%%timeit -n 1 -r 100
# StringZilla counterpart: count occurrences of `pattern` across the whole corpus.
sz_str.count(pattern)

31.2 ms ± 1.06 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


## Latency

In [18]:
%%timeit -n 1 -r 1
# Single run (-n 1 -r 1). CPython caches a str's hash after the first call, so only
# the first execution on a fresh kernel measures hashing the full text.
hash(pythonic_str)

28.9 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [19]:
%%timeit -n 1 -r 1
# StringZilla counterpart, also a single run.
# NOTE(review): whether sz.Str caches its hash is not visible here — confirm before re-running.
hash(sz_str)

365 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [20]:
%%timeit -n 1 -r 1
# Baseline: find the first space. The corpus starts with "A rebel ...", so the match
# is near the start and this mostly measures per-call latency.
pythonic_str.find(" ")

2.34 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [21]:
%%timeit -n 1 -r 1
# StringZilla counterpart: find the first space. The match is near the start of the
# corpus, so this mostly measures per-call latency.
sz_str.find(" ")

4.37 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [22]:
%%timeit -n 1 -r 1
# Baseline: split around the first space. str.partition returns new str objects, and
# the part after the separator is almost the entire corpus.
pythonic_str.partition(" ")

57.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [23]:
%%timeit -n 1 -r 1
# StringZilla counterpart: split around the first space.
# NOTE(review): presumably returns views rather than copies, which would explain
# the much lower latency — confirm in the StringZilla docs.
sz_str.partition(" ")

4.65 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Sequences

In [24]:
%%timeit -n 1 -r 1
# Baseline: split on single spaces into a list of str and sort that list in place.
# The sorted list is discarded; only the elapsed time matters.
pythonic_str.split(" ").sort()

6.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [25]:
%%timeit -n 1 -r 1
# StringZilla counterpart: split on single spaces and call sort() on the resulting sequence.
# The result is discarded; only the elapsed time matters.
sz_str.split(" ").sort()

8.86 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
