# play with parquet, engines, compression

Try to identify performance issues at read time

https://github.com/dask/dask/issues/4701

https://github.com/dask/dask/issues/6376



In [2]:
import os, shutil
from glob import glob

import dask
import dask.dataframe as dd

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
%matplotlib inline

---

## local dask cluster

In [2]:
from dask.distributed import Client, LocalCluster
#
cluster = LocalCluster()
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:47190  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 56  Memory: 100.00 GiB


In [3]:
parquet = "/home1/scratch/aponte/dummy_parquet"

In [4]:
def store_parquet(df, engine="fastparquet", compression=None):
    _parquet = parquet+"_"+engine+"_"+str(compression)
    if os.path.isdir(_parquet):
        shutil.rmtree(_parquet)
    df.to_parquet(_parquet, engine=engine, compression=compression)
    
def load_mean_parquet(engine="fastparquet", compression=None, **kwargs):
    _parquet = parquet+"_"+engine+"_"+str(compression)
    df = dd.read_parquet(_parquet, engine=engine, **kwargs)
    return df.mean().compute()

In [5]:
df = dask.datasets.timeseries(end="2003-01-01", 
                              dtypes=dict(lon=float, lat=float, v=float),
                              seed=0,
                             ).persist()
df

Unnamed: 0_level_0,lat,lon,v
npartitions=1096,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01,float64,float64,float64
2000-01-02,...,...,...
...,...,...,...
2002-12-31,...,...,...
2003-01-01,...,...,...


In [6]:
partition_size = df.memory_usage_per_partition().compute().loc[0]
print("Partition size = {:.0f} MB".format(partition_size/1e6))

df_size = df.memory_usage().compute().sum()
print("Dataset global size = {:.0f} GB".format(df_size/1e9))

Partition size = 3 MB
Dataset global size = 3 GB


In [19]:
%time store_parquet(df)

CPU times: user 1.87 s, sys: 112 ms, total: 1.98 s
Wall time: 3.19 s


In [9]:
#%time store_parquet(df, engine="fastparquet", compression="ZSTD") # hangs?

In [20]:
%time store_parquet(df, engine="pyarrow")

CPU times: user 3.12 s, sys: 196 ms, total: 3.32 s
Wall time: 4.78 s


In [21]:
%time load_mean_parquet(engine="fastparquet")

CPU times: user 2.32 s, sys: 160 ms, total: 2.48 s
Wall time: 4.87 s


lat    0.000023
lon    0.000109
v     -0.000103
dtype: float64

In [22]:
%time load_mean_parquet(engine="pyarrow")

CPU times: user 2.49 s, sys: 244 ms, total: 2.74 s
Wall time: 4.91 s


lat    0.000023
lon    0.000109
v     -0.000103
dtype: float64

In [23]:
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


---

## distributed dask cluster

In [24]:
from dask.distributed import Client
from dask_jobqueue import PBSCluster
cluster = PBSCluster(walltime='01:00:00')
w = cluster.scale(jobs=4)
client = Client(cluster)
cluster

VBox(children=(HTML(value='<h2>PBSCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .d…

In [25]:
%time load_mean_parquet(engine="fastparquet")

CPU times: user 4.71 s, sys: 308 ms, total: 5.02 s
Wall time: 35 s


lat    0.000023
lon    0.000109
v     -0.000103
dtype: float64

dominated by `deserialize-read-parquet`: 9s-25s

In [26]:
%time load_mean_parquet(engine="fastparquet")

CPU times: user 1.78 s, sys: 156 ms, total: 1.93 s
Wall time: 3.28 s


lat    0.000023
lon    0.000109
v     -0.000103
dtype: float64

In [27]:
%time load_mean_parquet(engine="pyarrow")

CPU times: user 5.03 s, sys: 328 ms, total: 5.36 s
Wall time: 44.1 s


lat    0.000023
lon    0.000109
v     -0.000103
dtype: float64

In [28]:
%time load_mean_parquet(engine="pyarrow")

CPU times: user 1.95 s, sys: 264 ms, total: 2.21 s
Wall time: 3.12 s


lat    0.000023
lon    0.000109
v     -0.000103
dtype: float64

Recreate datasets

In [30]:
df = dask.datasets.timeseries(end="2003-01-01", 
                              dtypes=dict(lon=float, lat=float, v=float),
                              seed=0,
                             ).persist()
df

Unnamed: 0_level_0,lat,lon,v
npartitions=1096,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01,float64,float64,float64
2000-01-02,...,...,...
...,...,...,...
2002-12-31,...,...,...
2003-01-01,...,...,...


In [31]:
%time store_parquet(df)

CPU times: user 964 ms, sys: 80 ms, total: 1.04 s
Wall time: 1.26 s


In [32]:
%time store_parquet(df, engine="pyarrow")

CPU times: user 1.88 s, sys: 116 ms, total: 1.99 s
Wall time: 3.46 s


---

## original dataset

In [3]:
root_dir =  "/home1/datawork/aponte/parcels/global_extra_T365j_dt1j_dij50/parquets/"
parquet = root_dir+"trajectory"
#parquet = root_dir+"trajectory_mean"

def load_mean_parquet(parquet, engine="pyarrow", **kwargs):
    df = dd.read_parquet(parquet, engine=engine, **kwargs)
    return df.mean().compute()

In [4]:
from dask.distributed import Client
from dask_jobqueue import PBSCluster
cluster = PBSCluster(walltime='01:00:00')
w = cluster.scale(jobs=2)
client = Client(cluster)
cluster

VBox(children=(HTML(value='<h2>PBSCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .d…

In [5]:
%time load_mean_parquet(parquet) # hangs?

  meta = self._meta_nonempty.mean(axis=axis, skipna=skipna)


CPU times: user 6.42 s, sys: 284 ms, total: 6.71 s
Wall time: 1min 13s


lat                   -1.612673e+01
lon                   -4.706660e+00
z                      0.000000e+00
zonal_velocity         3.601927e-07
meridional_velocity    6.823303e-08
sea_level             -3.931222e-02
temperature            1.372074e+01
salinity               3.435661e+01
dtype: float64

Make a copy of the original dataset

In [6]:
%time load_mean_parquet(parquet, engined="fastparquet") # hangs?

CPU times: user 2.82 s, sys: 156 ms, total: 2.98 s
Wall time: 16.7 s


lat                   -1.612673e+01
lon                   -4.706660e+00
z                      0.000000e+00
zonal_velocity         3.601927e-07
meridional_velocity    6.823303e-08
sea_level             -3.931222e-02
temperature            1.372074e+01
salinity               3.435661e+01
dtype: float64

In [20]:
# hack
parquet_file_list = glob(parquet+"/*")

def read_parquet_file(filename, engine="pyarrow"):
    with open(filename, mode='rb') as f:
        return pd.read_parquet(f, engine=engine) # also tried fastparquet, pyarrow was faster

df = dd.from_delayed(map(dask.delayed(read_parquet_file), parquet_file_list)).persist()

In [None]:
for f in parquet_file_list:
    _df = read_parquet_file(f)
    print(_df["lon"].mean())
# crashes with: OSError: Tried reading 5708205 bytes starting at position 33972 from file but only got 1037084

In [7]:
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
