In [None]:
"""
Test how dask handles multiple operations that need to read the same data.

Results show that, at least for (mean, min, max, var), the overhead decreases
substantially when the reductions are combined with da.stack and compute() is
called once on the stacked result instead of once per reduction.
"""

In [3]:
import os
import time

import create
import dask.array as da

fn = "t4e6_raw.hdf5"

# Only call create.t4e6_raw when the file is not on disk yet (presumably it
# writes the benchmark data to `fn` -- see the local `create` module).
if not os.path.exists(fn):
    create.t4e6_raw(fn)

ar = create.read_raw(fn)
print(ar)

# Row-wise reductions to benchmark, in the order they are timed and printed.
# The names double as the labels shown next to each timing.
REDUCTIONS = ("mean", "min", "max", "var")


def reduce_rows(name):
    """Return the (uncomputed) reduction ``name`` of ``ar`` along axis 1.

    ``name`` must be the name of a reduction method of ``ar`` that accepts
    an ``axis`` keyword, e.g. ``"mean"`` or ``"var"``.
    """
    return getattr(ar, name)(axis=1)


print()
print("COMPUTE SEPARETELY")

# Baseline: every reduction gets its own compute() call. The whole pass is
# run twice so the first and second repetition can be compared.
for rep in range(2):
    print()
    print("REPETITION", rep)

    for name in REDUCTIONS:
        # perf_counter() is a monotonic, high-resolution clock; unlike
        # time.time() it cannot jump if the system clock is adjusted.
        t = time.perf_counter()
        values = reduce_rows(name).compute()
        print(name, time.perf_counter() - t)
        print(values)

print()
print("COMPUTE JOINTLY")

# Stack the reductions into one array and call compute() once, first without
# and then with the variance, to see what each combination costs.
# The timed span deliberately includes building and printing the stacked
# array, exactly as the separate timings include building each reduction.
for label, names in (("without var", REDUCTIONS[:3]), ("with var", REDUCTIONS)):
    t = time.perf_counter()
    con = da.stack([reduce_rows(name) for name in names])
    print(con)
    ccon = con.compute()
    print(label, time.perf_counter() - t)
    print(ccon)

dask.array<array, shape=(20000, 20000), dtype=float64, chunksize=(4000, 4000), chunktype=numpy.ndarray>

COMPUTE SEPARETELY

REPETITION 0
mean 1.6078178882598877
[0.99114669 1.00532354 1.0106938  ... 1.00553299 0.99937319 0.99282876]
min 1.674778699874878
[0. 0. 0. ... 0. 0. 0.]
max 1.5857124328613281
[3. 3. 3. ... 3. 3. 3.]
var 3.0815768241882324
[0.9040912  0.91690765 0.93659773 ... 0.92337104 0.91253819 0.905121  ]

REPETITION 1
mean 2.189591407775879
[0.99114669 1.00532354 1.0106938  ... 1.00553299 0.99937319 0.99282876]
min 1.616518259048462
[0. 0. 0. ... 0. 0. 0.]
max 1.5758085250854492
[3. 3. 3. ... 3. 3. 3.]
var 3.30293607711792
[0.9040912  0.91690765 0.93659773 ... 0.92337104 0.91253819 0.905121  ]

COMPUTE JOINTLY
dask.array<stack, shape=(3, 20000), dtype=float64, chunksize=(1, 4000), chunktype=numpy.ndarray>
without var 2.1371119022369385
[[0.99114669 1.00532354 1.0106938  ... 1.00553299 0.99937319 0.99282876]
 [0.         0.         0.         ... 0.         0.         0.  