In [1]:
import numpy as np
import tables as tb
# HDF5 file shared by every experiment in this notebook: each run re-creates it
# (it is opened with mode 'w' by the writer) and then re-opens it read-only to check it.
file_path = 'test_tables.h5'

# When to flush

Our data consist of many events. For each event, we have 5 different signals we want to append to 5 different pytables. The number of rows we append to each pytable varies from signal to signal and event to event.     


This is a simplified representation of how we have implemented our table writers, together with an experimental investigation into when to flush the tables.

#### Helper functions

In [2]:
class Signal(tb.IsDescription):
    """Row layout of a toy signal table: one row per (time, energy) sample."""
    event  = tb.Int32Col()    # id of the event the sample belongs to
    time   = tb.Float32Col()  # sample time
    energy = tb.Float32Col()  # sample energy

# Filter settings handed to every create_table call in this notebook.
# NOTE(review): Filters() is built with no arguments, so the library defaults apply
# (presumably no compression -- confirm against the PyTables Filters documentation).
filt = tb.Filters()
    
def toy_signal():
    """
    Build one random toy signal.

    Returns
    -------
    (time, energy) : tuple of two 1d np.ndarrays of equal length
        The length is drawn at random from 1..29.
        `time` is 0, 1, 2, ... stored as float32;
        `energy` holds random values drawn with np.random.random.
    """
    n_samples = np.random.randint(1, high=30)   # `high` is exclusive
    energy    = np.random.random(n_samples)
    time      = np.arange(n_samples, dtype=np.float32)
    return time, energy

def write_signal_for_one_event(table, event, signal, flush_0=False):
    """
    Append the signal of one event to `table`, one row per sample.

    Parameters
    ----------
    table   : object exposing a `row` buffer (item assignment plus `append()`)
              and a `flush()` method; in this notebook, a PyTables table.
    event   : value written to the "event" column of every appended row.
    signal  : pair (times, energies); the two sequences are walked together
              with zip, so each (time, energy) pair becomes one row.
    flush_0 : if True, call `table.flush()` once, after all the rows of this
              event have been appended.

    Notes
    -----
    Should we flush here? Sometimes? Always?
    The PyTables documentation seems to recommend always flushing after
    appending. But we have not run into problems without this flush, and
    flushing here, at least with our implementation, slows things down a lot.
    """
    # Bind the row buffer once instead of looking up `table.row` four times per
    # sample inside the loop (this is also the idiom used in the PyTables tutorial).
    row = table.row
    for t, e in zip(*signal):
        row["event"]  = event
        row["time"]   = t
        row["energy"] = e
        row.append()

    if flush_0:
        table.flush()

#### Main

In [4]:
def write_some_pytables(file_path,
                        num_tables=5,
                        num_events=100,
                        flush_0=False,
                        flush_1=False):
    """
    Write `num_tables` toy tables, each extended once per event.

    The file at `file_path` is opened in write mode. Tables t0, t1, ... are
    created inside group g1, and the `event` column of each table is indexed
    right after the table is created. Then, for every event, a fresh
    `toy_signal()` is appended to every table.

    Parameters
    ----------
    file_path  : path of the HDF5 file to write (opened with mode 'w').
    num_tables : number of tables to write.
    num_events : number of events appended to each table.
    flush_0    : flush each table after each event.
    flush_1    : flush the whole file once, just before the file is closed.
    """
    with tb.open_file(file_path, 'w') as h5file:
        group = h5file.create_group(h5file.root, 'g1')

        # Create all the tables (indexing each one as soon as it exists)
        # before any row is written.
        tables = []
        for i in range(num_tables):
            node_name = 't{}'.format(i)
            title     = 'Table {}'.format(i)
            new_table = h5file.create_table(group, node_name, Signal, title, filt)
            tables.append(new_table)
            new_table.cols.event.create_index()

        # For each event, every table is extended by a different (random)
        # number of rows.
        for event in range(num_events):
            for table in tables:
                signal = toy_signal()
                write_signal_for_one_event(table, event, signal, flush_0=flush_0)

        # Should we flush the entire file here? When we don't, we frequently end
        # up with blank tables. It is strange that flushing here changes anything:
        # the file is closed immediately after this line, and the PyTables
        # documentation says a file is flushed automatically as it closes.
        if flush_1:
            h5file.flush()

## Experimental Flushing
For up to 4 tables we have no problems, even with 100k events in each table

In [5]:
num_events=1000
%time write_some_pytables(file_path, num_tables=4, num_events=num_events, flush_0=False, flush_1=False)

with tb.open_file(file_path, 'r') as f: 
    for table in f.root.g1:
        print('fraction of events succesfully written:', 
              len(set(table[:]['event'])) / num_events)

CPU times: user 178 ms, sys: 16 ms, total: 194 ms
Wall time: 195 ms
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0


And if we flush after every event, it is about 40 times slower (7.9 s vs. 0.2 s wall time in the runs shown here), so we don't want to do that.

In [6]:
num_events=1000
%time write_some_pytables(file_path, num_tables=4, num_events=num_events, flush_0=True, flush_1=False)

with tb.open_file(file_path, 'r') as f: 
    for table in f.root.g1:
        print('fraction of events succesfully written:', 
              len(set(table[:]['event'])) / num_events)

CPU times: user 7.41 s, sys: 398 ms, total: 7.81 s
Wall time: 7.91 s
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0


But if we increase the number of tables to 5, even with only 1 event, one of the tables is not written.  
(After some experimenting, I think it is always the table on which this line was run first: `tables[-1].cols.event.create_index()`.)

In [9]:
num_events=1
%time write_some_pytables(file_path, num_tables=5, num_events=num_events, flush_0=False, flush_1=False)

with tb.open_file(file_path, 'r') as f: 
    for table in f.root.g1:
        print('fraction of events succesfully written:', 
              len(set(table[:]['event'])) / num_events)

CPU times: user 49.3 ms, sys: 13.4 ms, total: 62.7 ms
Wall time: 77.8 ms
fraction of events succesfully written: 0.0
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0
fraction of events succesfully written: 1.0


Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/Users/alej/miniconda/envs/IC3.6/lib/python3.6/site-packages/tables/node.py", line 321, in __del__
    self._f_close()
  File "/Users/alej/miniconda/envs/IC3.6/lib/python3.6/site-packages/tables/table.py", line 2957, in _f_close
    self.flush()
  File "/Users/alej/miniconda/envs/IC3.6/lib/python3.6/site-packages/tables/table.py", line 2891, in flush
    self.row._flush_buffered_rows()
  File "tables/tableextension.pyx", line 1333, in tables.tableextension.Row._flush_buffered_rows (tables/tableextension.c:16357)
  File "tables/tableextension.pyx", line 749, in tables.tableextension.Row.table.__get__ (tables/tableextension.c:9587)
  File "/Users/alej/miniconda/envs/IC3.6/lib/python3.6/site-packages/tables/file.py", line 2101, in _check_open
    raise ClosedFileError("the file object is closed")
tables.exceptions.ClosedFileError: the file object is closed


We can solve this problem by flushing the file once, just before it is closed. This works even for many tables and many events.

In [7]:
num_events=100000
%time write_some_pytables(file_path, num_tables=10, num_events=num_events, flush_0=False, flush_1=True)
with tb.open_file(file_path, 'r') as f: 
    for table in f.root.g1:
        assert len(set(table[:]['event'])) ==  num_events


CPU times: user 34.5 s, sys: 529 ms, total: 35 s
Wall time: 35.6 s


And flushing once at the end, just before the file is closed, does not slow things down significantly (14.1 s vs. 13.8 s wall time in the comparison below).

In [8]:
num_events=100000
num_tables=4
print('Flush at end')
%time write_some_pytables(file_path, num_tables=num_tables, num_events=num_events, flush_0=False, flush_1=True)
with tb.open_file(file_path, 'r') as f: 
    for table in f.root.g1:
        assert len(set(table[:]['event'])) ==  num_events
print('--------')     
print('No flush')
%time write_some_pytables(file_path, num_tables=num_tables, num_events=num_events, flush_0=False, flush_1=False)
with tb.open_file(file_path, 'r') as f: 
    for table in f.root.g1:
        assert len(set(table[:]['event'])) ==  num_events

Flush at end
CPU times: user 13.7 s, sys: 281 ms, total: 13.9 s
Wall time: 14.1 s
--------
No flush
CPU times: user 13.5 s, sys: 200 ms, total: 13.7 s
Wall time: 13.8 s
