# Using compression

> Objectives:
> * How to compress chunked datasets
> * Learn how to fine-tune the HDF5 compression pipeline to suit your needs
> * How to use pandas for reading CSV files

## Load movielens datasets

In [5]:
import os
import numpy as np
import pandas as pd
import tables

In [6]:
# Import CSV files via pandas
dset = 'movielens-1m'
fdata = os.path.join(dset, 'ratings.dat.gz')
fitem = os.path.join(dset, 'movies.dat.gz')

# pass in column names for each CSV
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(fdata, sep=';', names=r_cols)

m_cols = ['movie_id', 'title', 'genres']
movies = pd.read_csv(fitem, sep=';', names=m_cols,
                     dtype={'title': object, 'genres': object})

In [7]:
movies.ftypes

movie_id     int64:dense
title       object:dense
genres      object:dense
dtype: object

In [8]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
ratings.ftypes

user_id           int64:dense
movie_id          int64:dense
rating            int64:dense
unix_timestamp    int64:dense
dtype: object

In [10]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Storing in HDF5/PyTables in compressed form

In [11]:
import os
import shutil
data_dir = "compression"
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.mkdir(data_dir)

In [12]:
def to_hdf5(ratings, movies, filters):
    
    class Ratings(tables.IsDescription):
        user_id = tables.Int32Col(pos=0)
        movie_id = tables.Int32Col(pos=1)
        rating = tables.Int8Col(pos=2)
        unix_timestamp = tables.Int64Col(pos=3)
    
    class Movies(tables.IsDescription):
        movie_id = tables.Int32Col(pos=0)
        title = tables.StringCol(100, pos=1)
        genres = tables.StringCol(50, pos=2)
    
    def get_filename(filters):
        if filters.complevel != 0:
            complib = filters.complib if ":" not in filters.complib else filters.complib.replace(":", "-")
            shuffle = "shuffle" if filters.shuffle else "noshuffle"
            filename = "%s/%s-%d-%s.h5" % (data_dir, complib, filters.complevel, shuffle)
        else:
            filename = "%s/no-compressed.h5" % (data_dir,)
        return filename

    filename = get_filename(filters)
    print("Creating file:", filename)
    with tables.open_file(filename, "w") as f:
        table_ratings = f.create_table(f.root, "ratings", Ratings, filters=filters, expectedrows=len(ratings))
        table_ratings.append([ratings[col].values for col in ratings.ftypes.keys()])
        table_movies = f.create_table(f.root, "movies", Movies, filters=filters, expectedrows=len(movies))
        table_movies.append([movies[col].values for col in movies.ftypes.keys()])
    return filename

In [13]:
%%time
filters = tables.Filters(complevel=5, shuffle=True)
h5file = to_hdf5(ratings, movies, filters)

('Creating file:', 'compression/zlib-5-shuffle.h5')
CPU times: user 360 ms, sys: 28.3 ms, total: 388 ms
Wall time: 387 ms


In [14]:
!ls -lah {h5file}

-rw-r--r--  1 albertofernandezmartinez  staff   4.2M May 19 16:00 compression/zlib-5-shuffle.h5


In [15]:
!ptdump -v {h5file}

/ (RootGroup) ''
/movies (Table(3883,), shuffle, zlib(5)) ''
  description := {
  "movie_id": Int32Col(shape=(), dflt=0, pos=0),
  "title": StringCol(itemsize=100, shape=(), dflt='', pos=1),
  "genres": StringCol(itemsize=50, shape=(), dflt='', pos=2)}
  byteorder := 'little'
  chunkshape := (425,)
/ratings (Table(1000209,), shuffle, zlib(5)) ''
  description := {
  "user_id": Int32Col(shape=(), dflt=0, pos=0),
  "movie_id": Int32Col(shape=(), dflt=0, pos=1),
  "rating": Int8Col(shape=(), dflt=0, pos=2),
  "unix_timestamp": Int64Col(shape=(), dflt=0, pos=3)}
  byteorder := 'little'
  chunkshape := (7710,)


### Exercise 1

PyTables comes with out-of-box support for a series of codecs.  Do a quick comparison between "zlib", "bzip2", and "blosc" for compression levels of 1 (fastest), 5 and 9 (slowest).  Which one compresses best?  Which one compresses faster?

Also, Blosc being a meta-compressor, it has support for different codecs internally that can be selected from PyTables in the "blosc:`codec`" form.  Do another comparison between internal Blosc codecs, namely, "blosc:blosclz" (the default), "blosc:lz4", "blosc:lz4hc", "blosc:snappy", "blosc:zlib" and "blosc:zstd".

Finally, avoid any compression totally (`complevel=0`).  How fast it is compared with existing codecs?

In [16]:
compression_libs = ["zlib", "bzip2", "blosc", "blosc:lz4", "blosc:lz4hc", "blosc:snappy", "blosc:zlib", "blosc:zstd"]

In [24]:
def generate_compressed(complevel):
    generated_files = []
    
    for complib in compression_libs:
        %%time
        filters = tables.Filters(complevel=complevel, shuffle=True, complib=complib)
        generated_files.append(to_hdf5(ratings, movies, filters))
        
    for file in generated_files:
        !ls -lah {file}

In [25]:
generate_compressed(1)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs
('Creating file:', 'compression/zlib-1-shuffle.h5')
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/bzip2-1-shuffle.h5')
CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-1-shuffle.h5')
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-lz4-1-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-lz4hc-1-shuffle.h5')
CPU times: user 6 µs, sys: 3 µs, total: 9 µs
Wall time: 16.9 µs
('Creating file:', 'compression/blosc-snappy-1-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs
('Creating file:', 'compression/blosc-zlib-1-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-zstd-1-shuffle.h5')
-rw-r--r--  1 albertofernandez

In [26]:
generate_compressed(5)

CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 10 µs
('Creating file:', 'compression/zlib-5-shuffle.h5')
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/bzip2-5-shuffle.h5')
CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.81 µs
('Creating file:', 'compression/blosc-5-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-lz4-5-shuffle.h5')
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-lz4hc-5-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-snappy-5-shuffle.h5')
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-zlib-5-shuffle.h5')
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
('Creating file:', 'compression/blosc-zstd-5-shuffle.h5')
-rw-r--r--  1 albertofernandezma

In [27]:
generate_compressed(9)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/zlib-9-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 11 µs
('Creating file:', 'compression/bzip2-9-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-9-shuffle.h5')
CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-lz4-9-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-lz4hc-9-shuffle.h5')
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs
('Creating file:', 'compression/blosc-snappy-9-shuffle.h5')
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs
('Creating file:', 'compression/blosc-zlib-9-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-zstd-9-shuffle.h5')
-rw-r--r--  1 albertofernandezmartin

## Reading compressed datasets

In [29]:
files = list(os.walk(data_dir))[0][2]

In [30]:
files

['blosc-1-shuffle.h5',
 'blosc-5-shuffle.h5',
 'blosc-9-shuffle.h5',
 'blosc-lz4-1-shuffle.h5',
 'blosc-lz4-5-shuffle.h5',
 'blosc-lz4-9-shuffle.h5',
 'blosc-lz4hc-1-shuffle.h5',
 'blosc-lz4hc-5-shuffle.h5',
 'blosc-lz4hc-9-shuffle.h5',
 'blosc-snappy-1-shuffle.h5',
 'blosc-snappy-5-shuffle.h5',
 'blosc-snappy-9-shuffle.h5',
 'blosc-zlib-1-shuffle.h5',
 'blosc-zlib-5-shuffle.h5',
 'blosc-zlib-9-shuffle.h5',
 'blosc-zstd-1-shuffle.h5',
 'blosc-zstd-5-shuffle.h5',
 'blosc-zstd-9-shuffle.h5',
 'bzip2-1-shuffle.h5',
 'bzip2-5-shuffle.h5',
 'bzip2-9-shuffle.h5',
 'zlib-1-shuffle.h5',
 'zlib-5-shuffle.h5',
 'zlib-9-shuffle.h5']

In [31]:
for f in files:
    print("Reading file:", f)
    with tables.open_file(os.path.join(data_dir, f)) as h5f:
        %time h5f.root.ratings[:]

('Reading file:', 'blosc-1-shuffle.h5')
CPU times: user 8.25 ms, sys: 15.1 ms, total: 23.3 ms
Wall time: 22.2 ms
('Reading file:', 'blosc-5-shuffle.h5')
CPU times: user 25.9 ms, sys: 14.2 ms, total: 40 ms
Wall time: 40.6 ms
('Reading file:', 'blosc-9-shuffle.h5')
CPU times: user 16 ms, sys: 6.74 ms, total: 22.8 ms
Wall time: 22.6 ms
('Reading file:', 'blosc-lz4-1-shuffle.h5')
CPU times: user 15 ms, sys: 7.59 ms, total: 22.6 ms
Wall time: 23.7 ms
('Reading file:', 'blosc-lz4-5-shuffle.h5')
CPU times: user 14.7 ms, sys: 7.12 ms, total: 21.8 ms
Wall time: 21.8 ms
('Reading file:', 'blosc-lz4-9-shuffle.h5')
CPU times: user 14.4 ms, sys: 7.67 ms, total: 22.1 ms
Wall time: 22 ms
('Reading file:', 'blosc-lz4hc-1-shuffle.h5')
CPU times: user 16 ms, sys: 8.33 ms, total: 24.4 ms
Wall time: 23.5 ms
('Reading file:', 'blosc-lz4hc-5-shuffle.h5')
CPU times: user 16.2 ms, sys: 9.06 ms, total: 25.2 ms
Wall time: 25.5 ms
('Reading file:', 'blosc-lz4hc-9-shuffle.h5')
CPU times: user 13.4 ms, sys: 6.64 m

### Exercise 2

Which codec and compression level can read the fastest?  How does it compare with reading an uncompressed dataset?

### Exercise 3

Blosc can use multithreading for compressing/decompressing, although it is disabled by default.  You can enable a multithreaded Blosc in a series of ways, but perhaps the easiest is to set the "BLOSC_NTHREADS" environment variable to the desired number of threads (typically the available number of cores in your computer).

Execute the cell below and re-do the reading benchmarks and look at how the reading speed vary.  Pay special attention to the difference between the CPU times and wall times.

In [37]:
os.environ["BLOSC_NTHREADS"] = "8"  # set to any other number you prefer

In [40]:
for f in files:
    print("Reading file:", f)
    with tables.open_file(os.path.join(data_dir, f)) as h5f:
        %time h5f.root.ratings[:]

('Reading file:', 'blosc-1-shuffle.h5')
CPU times: user 9.23 ms, sys: 13.1 ms, total: 22.4 ms
Wall time: 22.2 ms
('Reading file:', 'blosc-5-shuffle.h5')
CPU times: user 27.9 ms, sys: 22.9 ms, total: 50.8 ms
Wall time: 33.3 ms
('Reading file:', 'blosc-9-shuffle.h5')
CPU times: user 27.5 ms, sys: 16.6 ms, total: 44.1 ms
Wall time: 46.2 ms
('Reading file:', 'blosc-lz4-1-shuffle.h5')
CPU times: user 19.7 ms, sys: 25.6 ms, total: 45.4 ms
Wall time: 26.6 ms
('Reading file:', 'blosc-lz4-5-shuffle.h5')
CPU times: user 18.6 ms, sys: 31 ms, total: 49.6 ms
Wall time: 31.2 ms
('Reading file:', 'blosc-lz4-9-shuffle.h5')
CPU times: user 15.2 ms, sys: 9.16 ms, total: 24.3 ms
Wall time: 24.2 ms
('Reading file:', 'blosc-lz4hc-1-shuffle.h5')
CPU times: user 20.7 ms, sys: 33.1 ms, total: 53.8 ms
Wall time: 34.5 ms
('Reading file:', 'blosc-lz4hc-5-shuffle.h5')
CPU times: user 18.6 ms, sys: 29.8 ms, total: 48.5 ms
Wall time: 33.6 ms
('Reading file:', 'blosc-lz4hc-9-shuffle.h5')
CPU times: user 14.4 ms, sys

In [38]:
generate_compressed(5)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs
('Creating file:', 'compression/zlib-5-shuffle.h5')
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 7.15 µs
('Creating file:', 'compression/bzip2-5-shuffle.h5')
CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.81 µs
('Creating file:', 'compression/blosc-5-shuffle.h5')
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-lz4-5-shuffle.h5')
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-lz4hc-5-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
('Creating file:', 'compression/blosc-snappy-5-shuffle.h5')
CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-zlib-5-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.15 µs
('Creating file:', 'compression/blosc-zstd-5-shuffle.h5')
-rw-r--r--  1 albertof

In [39]:
generate_compressed(1)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
('Creating file:', 'compression/zlib-1-shuffle.h5')
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs
('Creating file:', 'compression/bzip2-1-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-1-shuffle.h5')
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs
('Creating file:', 'compression/blosc-lz4-1-shuffle.h5')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
('Creating file:', 'compression/blosc-lz4hc-1-shuffle.h5')
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs
('Creating file:', 'compression/blosc-snappy-1-shuffle.h5')
CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.81 µs
('Creating file:', 'compression/blosc-zlib-1-shuffle.h5')
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.2 µs
('Creating file:', 'compression/blosc-zstd-1-shuffle.h5')
-rw-r--r--  1 albertofernandezm