### Experiment with PyArrow For Memmapped Storage

In [1]:
import pyarrow as pa 
import sys

In [7]:
# define the type for a single game
fields = [
    ("moves", pa.list_(pa.string())),
    ("white_elo", pa.uint16()),
    ("black_elo", pa.uint16()),
    ("result", pa.uint8())
]
schema = pa.schema(fields)
game_type = pa.struct(fields)

In [8]:
# make some dummy data
dummy_data = [
    {
        'moves': ['e2e4', 'e7e5', "e1e2"],
        'white_elo': 1000,
        'black_elo': 1600,
        'result': 0
    },
    {
        'moves': ['e2e4', 'e7e5', "e1e2"],
        'white_elo': 1200,
        'black_elo': 1100,
        'result': 2
    },
]

In [9]:
# make 10mil copies
for i in range(10000000):
    dummy_data += dummy_data[0:2]

In [5]:
def get_size(x):
    return sys.getsizeof(x) / (1024 * 1024 * 1024)
get_size(dummy_data)

0.1561788022518158

In [12]:
# use the streaming record batch interface to write to disk efficiently
write_freq = 1000
with pa.OSFile('games.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, schema=schema) as writer:
        for i in range(0, len(dummy_data), write_freq):
            rows = pa.array(dummy_data[i:i+write_freq], type=game_type)
            batch = pa.RecordBatch.from_struct_array(rows)
            writer.write(batch)

In [13]:
%%time
# read with mmap
with pa.memory_map('games.arrow', 'r') as source:
    loaded_arrays = pa.ipc.open_file(source).read_all()

CPU times: user 143 ms, sys: 82 ms, total: 225 ms
Wall time: 224 ms
