In [1]:
%matplotlib inline
import glob
import importlib
import uproot
import awkward
import numpy as np
import uproot_methods
import matplotlib.pyplot as plt
import concurrent.futures
import functools
import time
import pandas as pd
from tqdm.auto import tqdm

import utils
import manager
import plotting
import random
import blosc
import lz4.frame
import cloudpickle

importlib.reload(plotting);
importlib.reload(manager);
importlib.reload(utils);

In [2]:
%%time
# little endian magic number header for lz4: https://github.com/lz4/lz4/blob/master/doc/lz4_Frame_format.md
x = lz4.frame.compress(b"blah")
import struct
print(struct.unpack("<4b",x[:4]))
print(x[:4].hex())
print(x[:4] == b'\x04\x22\x4d\x18')

(4, 34, 77, 24)
04224d18
True
CPU times: user 0 ns, sys: 1e+03 µs, total: 1e+03 µs
Wall time: 262 µs


In [3]:
# BLOSC
# pip install blosc
# http://python-blosc.blosc.org/tutorial.html#compressing-and-decompressing-with-blosc

In [4]:
# some useful constants
print(blosc.cnames)
print(lz4.frame.COMPRESSIONLEVEL_MAX)
print(lz4.frame.COMPRESSIONLEVEL_MIN)
print(lz4.frame.COMPRESSIONLEVEL_MINHC)

['blosclz', 'lz4', 'lz4hc', 'zlib', 'zstd']
16
0
3


In [5]:
def get_mll_hist(args):
    fname,entrystart,entrystop = args
    try:
        worker = get_worker()
        cache = worker.cache
    except:
        cache = None
    f = uproot.open(fname)
    t = f["Events"]
    extra = dict(outputtype=tuple,namedecode="ascii",entrystart=entrystart,entrystop=entrystop,cache=cache)
    p4 = uproot_methods.TLorentzVectorArray.from_ptetaphim(
        *t.arrays(["Muon_pt","Muon_eta","Muon_phi","Muon_mass"],**extra)
    )
    charge,mediumid,reliso = t.arrays(["Muon_charge","Muon_mediumId","Muon_pfRelIso04_all"],**extra)
    mus = awkward.JaggedArray.zip(p4=p4,charge=charge,mediumid=mediumid,reliso=reliso)
    nevents = len(mus)
    mus = mus[mus.mediumid & (mus.reliso < 0.35) & (mus.p4.pt>=7)]
    mus = mus[mus.counts==2]
    mus = mus[(mus[:,0].p4.pt>15.) & (mus[:,1].p4.pt>15.) & (mus[:,:2].charge.sum()==0)]
    mll = (mus[:,0]+mus[:,1]).p4.mass
    bins = np.linspace(50,400,100)
    counts,_ = np.histogram(np.clip(mll,bins[0],bins[-1]),bins=bins)
    return counts

In [6]:
def make_payload(which="hist2d"):
    if which == "hist2d":
        counts,_,_  = np.histogram2d(
            np.random.normal(0,0.2,int(2e6)),
            np.random.normal(0,0.2,int(2e6)),
            bins=[np.linspace(-1,1.,500),np.linspace(-1,1.,500)],
        )
        return counts
    if which == "ranges1d":
        return [
            np.linspace(-100,100,450),
            np.arange(800),
        ]
    if which == "random1d":
        return np.random.random(15000)
    if which == "metadata":
        return {'args': 1,
                'job_num': 1,
                'read_bytes': 0,
                'result': {'name': None,
                 'node_read_bytes': 90667065299968,
                 'node_recv_bytes': 73927501073019,
                 'node_sent_bytes': 48290282498748,
                 'node_t': 1567295497.336089,
                 'node_write_bytes': 174765807226880,
                 'worker_mem_used': 32907264,
                 'worker_read_bytes': 0,
                 'worker_tasks': 81,
                 'worker_time_elapsed': 4.711894273757935,
                 'worker_write_bytes': 0},
                'task_id': '6d8342bfc2664a06',
                'tstart': 1567295497.335948,
                'tstop': 1567295497.3416238,
                'worker_name': 'namin__sdsc-8.t2.ucsd.edu__10192055.1',
                'write_bytes': 0}
    if which == "simplefunction":
        def f(x):
            return x
        return f
    if which == "complexfunction":
        return get_mll_hist

payload = make_payload()
payload

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
data = []
for which in tqdm([
    "simplefunction",
    "complexfunction",
    "ranges1d",
    "metadata",
    "hist2d",
    "random1d",
],position=0):
    for fcomp,fdecomp,label in tqdm([
        [lambda x: blosc.compress(x), blosc.decompress, "blosc"], # default should be blosc.SHUFFLE
        [lambda x: blosc.compress(x,shuffle=blosc.NOSHUFFLE), blosc.decompress, "blosc_noshuffle"],
        [lambda x: blosc.compress(x,shuffle=blosc.SHUFFLE), blosc.decompress,"blosc_shuffle"],
        [lambda x: blosc.compress(x,shuffle=blosc.BITSHUFFLE), blosc.decompress,"blosc_bitshuffle"],
        [lambda x: blosc.compress(x,cname="zlib"), blosc.decompress,"blosc_zlib"],
        [lambda x: blosc.compress(x,cname="zlib",shuffle=blosc.NOSHUFFLE), blosc.decompress,"blosc_zlib_noshuffle"],
        [lambda x: blosc.compress(x,cname="lz4"), blosc.decompress,"blosc_lz4"],
        [lambda x: blosc.compress(x,cname="lz4",shuffle=blosc.NOSHUFFLE), blosc.decompress,"blosc_lz4_noshuffle"],
        [lambda x: blosc.compress(x,cname="lz4hc"), blosc.decompress,"blosc_lz4hc"],
        [lambda x: lz4.frame.compress(x,compression_level=lz4.frame.COMPRESSIONLEVEL_MAX),lz4.frame.decompress,"lz4_max"],
        [lambda x: lz4.frame.compress(x,compression_level=lz4.frame.COMPRESSIONLEVEL_MIN),lz4.frame.decompress,"lz4_min"],
        [lambda x: lz4.frame.compress(x,compression_level=lz4.frame.COMPRESSIONLEVEL_MINHC),lz4.frame.decompress,"lz4_minhc"],
    ],position=1):
        payload = make_payload(which)

        info = {}
        info["label"] = label
        info["which"] = which
#         print(label)

        ts = []
        for _ in range(30):
            t0 = time.time()
            cpr = cloudpickle.dumps(payload)
            t1 = time.time()
            ts.append(t1-t0)
        info["t_cloudpickle_ms"] = round(1e3*np.median(ts),2)

        ts_comp = []
        ts_decomp = []
        for _ in range(20):
            t0 = time.time()
            comp = fcomp(cpr)
            t1 = time.time()
            decomp = fdecomp(comp)
            t2 = time.time()
            ts_comp.append(t1-t0)
            ts_decomp.append(t2-t1)
        info["t_compress_ms"] = round(1e3*np.median(ts_comp),3)
        info["t_decompress_ms"] = round(1e3*np.median(ts_decomp),3)
        info["uncompressed_bytes"] = len(cpr)
        info["compressed_bytes"] = len(comp)
        info["compression_ratio"] = round(1.0*len(cpr)/len(comp),2)


        data.append(info)

dfc = pd.DataFrame(data)


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [8]:
dfc

Unnamed: 0,compressed_bytes,compression_ratio,label,t_cloudpickle_ms,t_compress_ms,t_decompress_ms,uncompressed_bytes,which
0,438,0.96,blosc,0.31,0.011,0.001,422,simplefunction
1,438,0.96,blosc_noshuffle,0.30,0.011,0.001,422,simplefunction
2,438,0.96,blosc_shuffle,0.30,0.011,0.001,422,simplefunction
3,438,0.96,blosc_bitshuffle,0.30,0.042,0.002,422,simplefunction
4,386,1.09,blosc_zlib,0.30,0.032,0.010,422,simplefunction
5,357,1.18,blosc_zlib_noshuffle,0.30,0.034,0.009,422,simplefunction
6,438,0.96,blosc_lz4,0.30,0.006,0.001,422,simplefunction
7,419,1.01,blosc_lz4_noshuffle,0.30,0.007,0.001,422,simplefunction
8,438,0.96,blosc_lz4hc,0.30,0.024,0.001,422,simplefunction
9,406,1.04,lz4_max,0.30,0.085,0.005,422,simplefunction


In [9]:
dfc.sort_values("t_compress_ms",ascending=True).groupby("which").head(1)

Unnamed: 0,compressed_bytes,compression_ratio,label,t_cloudpickle_ms,t_compress_ms,t_decompress_ms,uncompressed_bytes,which
42,463,0.97,blosc_lz4,0.13,0.005,0.001,447,metadata
22,1439,1.14,lz4_min,1.82,0.006,0.002,1644,complexfunction
6,438,0.96,blosc_lz4,0.3,0.006,0.001,422,simplefunction
30,3720,2.75,blosc_lz4,0.25,0.014,0.005,10233,ranges1d
70,120186,1.0,lz4_min,0.18,0.04,0.014,120159,random1d
54,156183,12.76,blosc_lz4,1.71,0.971,0.678,1992170,hist2d


In [10]:
dfc.sort_values("compression_ratio",ascending=False).groupby("which").head(1)

Unnamed: 0,compressed_bytes,compression_ratio,label,t_cloudpickle_ms,t_compress_ms,t_decompress_ms,uncompressed_bytes,which
52,84546,23.56,blosc_zlib,1.71,92.119,3.021,1992170,hist2d
32,3678,2.78,blosc_lz4hc,0.23,0.127,0.005,10233,ranges1d
41,355,1.26,blosc_zlib_noshuffle,0.14,0.028,0.004,447,metadata
17,1376,1.19,blosc_zlib_noshuffle,1.83,0.151,0.045,1644,complexfunction
64,101360,1.19,blosc_zlib,0.19,1.906,0.221,120159,random1d
5,357,1.18,blosc_zlib_noshuffle,0.3,0.034,0.009,422,simplefunction


In [12]:
# yeah, yeah, zlib is the best but it's hella slow. ignore it.
dfc[~dfc.label.str.contains("zlib")].sort_values("compression_ratio",ascending=False).groupby("which").head(1)

Unnamed: 0,compressed_bytes,compression_ratio,label,t_cloudpickle_ms,t_compress_ms,t_decompress_ms,uncompressed_bytes,which
56,113667,17.53,blosc_lz4hc,1.74,25.253,0.472,1992170,hist2d
32,3678,2.78,blosc_lz4hc,0.23,0.127,0.005,10233,ranges1d
47,364,1.23,lz4_minhc,0.12,0.016,0.001,447,metadata
21,1413,1.16,lz4_max,1.85,0.076,0.005,1644,complexfunction
66,105290,1.14,blosc_lz4,0.19,0.059,0.026,120159,random1d
11,406,1.04,lz4_minhc,0.3,0.023,0.002,422,simplefunction


In [11]:
print(len(cloudpickle.dumps(lambda x:x)))
def f(x): return x
print(len(cloudpickle.dumps(f)))
# hmm 7 less bytes to compress an explicitly named function than a lambda

# pad the name, and now it's the same...
def flambdax(x): return x
print(len(cloudpickle.dumps(flambdax)))

404
397
404
