# Compare Timeseries storage size in Kudu vs Parquet
## Create a timeseries table in Kudu and fill it with some data

In [1]:
import kudu
import random

In [2]:
# Connect to the Kudu master on the local machine (default port assumed -- confirm).
master_host = 'localhost'
client = kudu.connect(master_host)

In [3]:
# Drop any table left over from a previous run so the notebook can be re-executed.
table_name = 't'
if client.table_exists(table_name):
    client.delete_table(table_name)

In [4]:
# Table layout: one row per (metric, ts) sample. Every column is LZ4-compressed;
# `metric` is run-length encoded, `ts` and `val` use bit-shuffle encoding.
builder = kudu.schema_builder()
builder.add_column(
    'metric', type_=kudu.int32, nullable=False,
    compression='lz4', encoding=kudu.ENCODING_RLE,
)
builder.add_column(
    'ts', type_=kudu.int64, nullable=False,
    compression='lz4', encoding=kudu.ENCODING_BIT_SHUFFLE,
)
builder.add_column(
    'val', type_=kudu.double, nullable=True,
    compression='lz4', encoding=kudu.ENCODING_BIT_SHUFFLE,
)
builder.set_primary_keys(['metric', 'ts'])
schema = builder.build()

# Range-partition on the timestamp column only. A hash partition on 'metric'
# (num_buckets=1) is left disabled.
partitioning = kudu.client.Partitioning().set_range_partition_columns(['ts'])

# Single replica: this is a local, single-node size experiment.
client.create_table('t', schema, partitioning, n_replicas=1)

In [5]:
# Open a handle to the table created above; `t` is reused by the later
# insert, count and export cells.
t = client.table('t')
# Bare expression as the last line so the notebook displays the schema.
t.schema

kudu.Schema {  
  metric  int32 NOT NULL
  ts      int64 NOT NULL
  val     double
  PRIMARY KEY (metric, ts)
}

In [6]:
# Fill the table with N_METRICS series, each sampled every TS_STEP from 0 up to
# (but excluding) TS_END: 10 * 1,000,000 = 10,000,000 rows in total.
N_METRICS = 10
TS_END = 1000000000
TS_STEP = 1000

# Dedicated, seeded generator so every run writes exactly the same values and
# the measured sizes are reproducible (the global `random` state is untouched).
rng = random.Random(42)

session = client.new_session()
# Let the client batch and send operations on its own instead of flushing
# after every apply().
session.set_flush_mode(kudu.FLUSH_AUTO_BACKGROUND)

for metric in range(N_METRICS):
    for ts in range(0, TS_END, TS_STEP):
        # Upsert (not insert) so re-running the cell overwrites existing rows.
        op = t.new_upsert((metric, ts, rng.uniform(0., 1.)))
        session.apply(op)

# Push out whatever is still buffered; on failure show the per-row errors
# instead of aborting the notebook.
try:
    session.flush()
except kudu.KuduBadStatus:
    print(session.get_pending_errors())

## Check Kudu data size

In [7]:
# Shortcut for the `kudu` CLI pointed at the local tablet server's WAL and data
# directories. Explicit %alias so the cell does not depend on automagic.
%alias kd ~/tmp/kudu/build/release/bin/kudu -fs_wal_dir ~/tmp/kudu/build/release/bin/data/ts/wal -fs_data_dirs ~/tmp/kudu/build/release/bin/data/ts/data/

In [8]:
# List the tablet replicas stored in the local data directories.
%kd local_replica list

W1017 23:34:11.938608 2285372224 data_dirs.cc:459] IO error: Could not lock /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data/block_manager_instance: Could not lock /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data/block_manager_instance: lock /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data/block_manager_instance: Resource temporarily unavailable (error 35)
W1017 23:34:11.939244 2285372224 data_dirs.cc:460] Proceeding without lock
I1017 23:34:11.939870 2285372224 fs_manager.cc:329] Time spent opening directory manager: real 0.001s	user 18446710026.726s	sys 814881245.000s
I1017 23:34:11.939945 2285372224 env_posix.cc:1585] Raising process file limit from 256 to 24576
I1017 23:34:11.939997 2285372224 file_cache.cc:470] Constructed file cache fbm with capacity 9830
I1017 23:34:11.940085 2285372224 fs_report.cc:347] Block manager report
--------------------
1 data directories: /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data
Total live blo

In [9]:
# Report the on-disk size of one tablet. The id is hard-coded -- presumably copied
# from the `local_replica list` output of a previous run; update it after re-creating the table.
%kd local_replica data_size 9882d6f6cc444426af3a1105228a1738

W1017 23:34:21.249979 2285372224 data_dirs.cc:459] IO error: Could not lock /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data/block_manager_instance: Could not lock /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data/block_manager_instance: lock /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data/block_manager_instance: Resource temporarily unavailable (error 35)
W1017 23:34:21.250677 2285372224 data_dirs.cc:460] Proceeding without lock
I1017 23:34:21.251600 2285372224 fs_manager.cc:329] Time spent opening directory manager: real 0.002s	user 18446709878.578s	sys 802793616.000s
I1017 23:34:21.251644 2285372224 env_posix.cc:1585] Raising process file limit from 256 to 24576
I1017 23:34:21.251682 2285372224 file_cache.cc:470] Constructed file cache fbm with capacity 9830
I1017 23:34:21.251780 2285372224 fs_report.cc:347] Block manager report
--------------------
1 data directories: /Users/andreas/tmp/kudu/build/release/bin/data/ts/data/data
Total live blo

In [10]:
# Sanity check: scan the whole table batch by batch and count the rows.
scanner = t.scanner().open()
row_count = 0
while scanner.has_more_rows():
    batch = scanner.next_batch()
    row_count += len(batch)
print(row_count)

10000000


## Write the same data as Parquet

In [11]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [12]:
# Arrow schema mirroring the Kudu columns (int32 / int64 / double).
# NOTE: this rebinds `schema`, which previously held the Kudu table schema.
column_types = [
    ('metric', pa.int32()),
    ('ts', pa.int64()),
    ('val', pa.float64()),
]
schema = pa.schema([pa.field(col_name, col_type) for col_name, col_type in column_types])

In [13]:
# Stream the Kudu table into a single Snappy-compressed Parquet file.
# Each scanner batch is converted and written with its own write_table() call.
writer = pq.ParquetWriter('ts.pq', schema, compression='snappy', version='2.0')
try:
    s = t.scanner().open()
    while s.has_more_rows():
        b = s.next_batch()
        df = pd.DataFrame.from_records(b.as_tuples(), columns=['metric', 'ts', 'val'])
        # Force 32-bit integers so the column matches the int32 field in the Arrow schema.
        df['metric'] = df['metric'].astype('int32')
        pt = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
        writer.write_table(pt)
finally:
    # Always close the writer, even if a batch fails, so the file handle is
    # released and the file is finalized.
    writer.close()

In [14]:
# Size on disk of the Parquet file written above (explicit shell escape, no automagic needed).
!ls -lah ts.pq

-rw-r--r-- 1 andreas 127M Oct 17 23:35 ts.pq


In [15]:
# Shortcut for running parquet-tools through the local Hadoop launcher.
# Explicit %alias so the cell does not depend on automagic.
%alias pqt ~/Downloads/hadoop-2.8.1/bin/hadoop jar parquet-tools-1.9.0.jar

In [16]:
# Dump per-row-group column metadata (sizes, encodings, statistics) for the Parquet file.
%pqt dump -d -n ts.pq

17/10/17 23:35:12 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/10/17 23:35:12 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
row group 0 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:4 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:9507 SZ:250925/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:     DOUBLE SNAPPY DO:0 FPO:260494 SZ:400079/400055/1.00 VC:50000 ENC:PLAIN,RLE

    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 9, max: 9, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 733526000, max: 783525000, num_nulls: 0] SZ:400008 VC:50000

    val TV=5000

17/10/17 23:35:12 INFO compress.CodecPool: Got brand-new decompressor [.snappy]

row group 16 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:10569680 SZ:9435/200047/21.20 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:10579175 SZ:250928/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:     DOUBLE SNAPPY DO:0 FPO:10830167 SZ:400079/400055/1.00 VC:50000 ENC:PLAIN,RLE

    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0, max: 0, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 533500000, max: 583499000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE R


row group 27 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:17836657 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:17846164 SZ:250937/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:     DOUBLE SNAPPY DO:0 FPO:18097165 SZ:400079/400055/1.00 VC:50000 ENC:PLAIN,RLE

    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 1, max: 1, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 83500000, max: 133499000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0.00002, max: 0.99996, num_nulls: 0] SZ:400008 VC:50000


    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 1, max: 1, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 833500000, max: 883499000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0.00001, max: 0.99996, num_nulls: 0] SZ:400008 VC:50000
17/10/17 23:35:12 INFO compress.CodecPool: Got brand-new decompressor [.snappy]

row group 43 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:28407018 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:28416525 SZ:250916/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:


row group 64 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:42279610 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:42289117 SZ:250908/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:     DOUBLE SNAPPY DO:0 FPO:42540089 SZ:400079/400055/1.00 VC:50000 ENC:PLAIN,RLE

    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 2, max: 2, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 933433000, max: 983432000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0.00003, max: 1.00000, num_nulls: 0] SZ:400008 VC:50000


row group 85 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:56153084 SZ:9454/200047/21.16 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:56162598 SZ:250860/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:     DOUBLE SNAPPY DO:0 FPO:56413522 SZ:400079/400055/1.00 VC:50000 ENC:PLAIN,RLE

    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 3, max: 4, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0, max: 999999000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0.00000, max: 1.00000, num_nulls: 0] SZ:400008 VC:50000
17/10/


    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 5, max: 5, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 133366000, max: 183365000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0.00004, max: 0.99999, num_nulls: 0] SZ:400008 VC:50000
17/10/17 23:35:13 INFO compress.CodecPool: Got brand-new decompressor [.snappy]

row group 109 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:72007543 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:72017050 SZ:250930/400055/1.59 VC:50000 ENC:PLAIN,RLE
val


    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 6, max: 6, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 433366000, max: 483365000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0.00002, max: 0.99993, num_nulls: 0] SZ:400008 VC:50000
17/10/17 23:35:13 INFO compress.CodecPool: Got brand-new decompressor [.snappy]

row group 135 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:89184248 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:89193755 SZ:250923/400055/1.59 VC:50000 ENC:PLAIN,RLE
val


row group 157 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:103717501 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:103727008 SZ:251054/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:     DOUBLE SNAPPY DO:0 FPO:103978126 SZ:400079/400055/1.00 VC:50000 ENC:PLAIN,RLE

    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 7, max: 7, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 583299000, max: 633298000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 0.00001, max: 0.99999, num_nulls: 0] SZ:400008 VC:5

17/10/17 23:35:13 INFO compress.CodecPool: Got brand-new decompressor [.snappy]

row group 178 
--------------------------------------------------------------------------------
metric:  INT32 SNAPPY DO:0 FPO:117591112 SZ:9447/200047/21.18 VC:50000 ENC:PLAIN,RLE
ts:      INT64 SNAPPY DO:0 FPO:117600619 SZ:250922/400055/1.59 VC:50000 ENC:PLAIN,RLE
val:     DOUBLE SNAPPY DO:0 FPO:117851605 SZ:400079/400055/1.00 VC:50000 ENC:PLAIN,RLE

    metric TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 8, max: 8, num_nulls: 0] SZ:200008 VC:50000

    ts TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[min: 633299000, max: 683298000, num_nulls: 0] SZ:400008 VC:50000

    val TV=50000 RL=0 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:R