In [1]:
import numpy as np
import tables as tb
# Output file shared by every benchmark cell below; write_some_pytables opens
# it with mode 'w', and each run is read back from this same path afterwards.
write_path = 'test_tables.h5'

# When to create index

#### Helper functions

In [2]:
class Signal(tb.IsDescription):
    """Row description used for every table created by create_n_pytables."""
    event = tb.Int32Col()      # event number the sample belongs to
    time = tb.Float32Col()     # time value of the sample
    energy = tb.Float32Col()   # energy value of the sample
    
# Filters object built with no arguments (library defaults); it is passed to
# every create_table call made in create_n_pytables.
filt = tb.Filters()

def create_n_pytables(num_tables, h5out, group, CI=False):
    """
    Create num_tables tables inside group and return them as a list.

    Table i is created under the node name 't<i>' (so it is reachable as
    group.t<i>) and is given 'Table  <i>' as its fourth create_table
    argument. Every table uses the Signal row description and the
    module-level filt filters.

    If CI is true, create_index() is called on the 'event' column of each
    table right after the table is created, i.e. before any row is written.
    """
    created = []
    for index in range(num_tables):
        node_name = 't{}'.format(index)
        title = 'Table  {}'.format(index)
        table = h5out.create_table(group, node_name, Signal, title, filt)
        if CI:
            table.cols.event.create_index()
        created.append(table)
    return created
    
def toy_signal(minl=10, maxl=100):
    """
    Make a toy signal (time, energy).

    Parameters
    ----------
    minl : int
        Smallest possible signal length (inclusive).
    maxl : int
        Upper bound on the signal length (exclusive); the length is drawn
        from the half-open range [minl, maxl).

    Returns
    -------
    t : 1d np.ndarray of float32
        The sample times 0, 1, ..., length - 1.
    e : 1d np.ndarray
        Random energies, one per time sample, as returned by
        np.random.random.

    Raises
    ------
    ValueError
        Raised by np.random.randint when minl >= maxl.

    Notes
    -----
    Uses the global numpy random state, so call np.random.seed beforehand
    if reproducible signals are needed.
    """
    # Draw the length first and the energies second: keeping this order
    # means seeded runs produce the same signals as before.
    signal_length = np.random.randint(minl, high=maxl)
    t = np.arange(signal_length, dtype=np.float32)
    e = np.random.random(signal_length)
    return t, e

def write_signal_for_one_event(table, event, toy_signal, flush_0=False):
    """
    Append one row per (time, energy) sample of a signal to table.

    toy_signal is unpacked with zip(*toy_signal), so it is expected to hold
    the time samples and the energy samples (e.g. the tuple returned by the
    toy_signal() function); each resulting pair becomes one row tagged with
    event. When flush_0 is true, table.flush() is called once after all
    rows of this event have been appended.
    """
    row = table.row  # look the row object up once instead of per field
    for time_value, energy_value in zip(*toy_signal):
        row["event"] = event
        row["time"] = time_value
        row["energy"] = energy_value
        row.append()

    # Open question: should we flush here? Sometimes? Always?
    # The PyTables documentation seems to recommend flushing after a batch
    # of appends. However, we have never run into problems without this
    # flush, and flushing here (at least with our implementation) slows
    # things down a lot.
    if flush_0:
        table.flush()

#### Main

In [3]:
def write_some_pytables(write_path,
                        num_tables=5,
                        num_events=100,
                        flush_0=False,
                        flush_1=False,
                        create_index=False):
    """
    Write num_tables tables of toy signals into a new file at write_path.

    The file is opened in 'w' mode, a group 'g1' is created under the root,
    and for every event in range(num_events) a fresh toy signal is written
    to each table in turn.

    num_tables   : number of tables to write
    num_events   : number of events written to every table
    flush_0      : flush each table after each event
    flush_1      : flush the whole file once, just before it is closed
    create_index : create an index on the 'event' column of each table
                   at table-creation time
    """
    with tb.open_file(write_path, 'w') as h5out:
        group = h5out.create_group(h5out.root, 'g1')
        tables = create_n_pytables(num_tables, h5out, group, CI=create_index)

        # Events in the outer loop, tables in the inner loop: each event
        # gets an independent toy signal in every table.
        for event in range(num_events):
            for table in tables:
                signal = toy_signal()
                write_signal_for_one_event(table, event, signal, flush_0=flush_0)

        # Open question: should we flush the entire file here? When we
        # don't, we frequently end up with blank tables.
        #
        # It is strange that flushing here changes anything, since the file
        # is closed immediately afterwards and the PyTables documentation
        # says a file is flushed automatically as it closes.
        if flush_1:
            h5out.flush()

In [4]:
num_events = 100000; num_tables = 5
print('CREATE INDEX')
%time write_some_pytables(write_path, num_tables=num_tables, num_events=num_events, flush_0=False, flush_1=True, create_index=True)
with tb.open_file(write_path, 'r') as f: 
    for table in f.root.g1: # Ensure each pytable has num_events events
        print('fraction of events succesfully written:', 
              len(set(table[:]['event'])) / num_events, 'in', table.name) # python 3 division
print('-----------')       
print('DONT CREATE INDEX')
%time write_some_pytables(write_path, num_tables=num_tables, num_events=num_events, flush_0=False, flush_1=True, create_index=False)
with tb.open_file(write_path, 'r') as f: 
    for table in f.root.g1: # Ensure each pytable has num_events events
        print('fraction of events succesfully written:', 
              len(set(table[:]['event'])) / num_events, 'in', table.name) # python 3 division

CREATE INDEX
CPU times: user 55.6 s, sys: 833 ms, total: 56.5 s
Wall time: 58.2 s
fraction of events succesfully written: 1.0 in t0
fraction of events succesfully written: 1.0 in t1
fraction of events succesfully written: 1.0 in t2
fraction of events succesfully written: 1.0 in t3
fraction of events succesfully written: 1.0 in t4
-----------
DONT CREATE INDEX
CPU times: user 49.3 s, sys: 388 ms, total: 49.7 s
Wall time: 50.1 s
fraction of events succesfully written: 1.0 in t0
fraction of events succesfully written: 1.0 in t1
fraction of events succesfully written: 1.0 in t2
fraction of events succesfully written: 1.0 in t3
fraction of events succesfully written: 1.0 in t4


In [5]:
print('DONT CREATE INDEX, DONT NEED TO FLUSH')
%time write_some_pytables(write_path, num_tables=num_tables, num_events=num_events, flush_0=False, flush_1=False, create_index=False)
with tb.open_file(write_path, 'r') as f: 
    for table in f.root.g1: # Ensure each pytable has num_events events
        print('fraction of events succesfully written:', 
              len(set(table[:]['event'])) / num_events, 'in', table.name) # python 3 division

DONT CREATE INDEX, DONT NEED TO FLUSH
CPU times: user 54 s, sys: 640 ms, total: 54.6 s
Wall time: 56.8 s
fraction of events succesfully written: 1.0 in t0
fraction of events succesfully written: 1.0 in t1
fraction of events succesfully written: 1.0 in t2
fraction of events succesfully written: 1.0 in t3
fraction of events succesfully written: 1.0 in t4
