### Experiment with PyArrow For Memmapped Storage

In [69]:
import pyarrow as pa 
import numpy as np
import sys

from tqdm.notebook import tqdm

In [7]:
# define the type for a single game
fields = [
    ("moves", pa.list_(pa.string())),
    ("white_elo", pa.uint16()),
    ("black_elo", pa.uint16()),
    ("result", pa.uint8())
]
schema = pa.schema(fields)
game_type = pa.struct(fields)

In [8]:
# make some dummy data
dummy_data = [
    {
        'moves': ['e2e4', 'e7e5', "e1e2"],
        'white_elo': 1000,
        'black_elo': 1600,
        'result': 0
    },
    {
        'moves': ['e2e4', 'e7e5', "e1e2"],
        'white_elo': 1200,
        'black_elo': 1100,
        'result': 2
    },
]

In [9]:
# make 10mil copies
for i in range(10000000):
    dummy_data += dummy_data[0:2]

In [5]:
def get_size(x):
    return sys.getsizeof(x) / (1024 * 1024 * 1024)
get_size(dummy_data)

0.1561788022518158

In [12]:
# use the streaming record batch interface to write to disk efficiently
write_freq = 1000
with pa.OSFile('games.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, schema=schema) as writer:
        for i in range(0, len(dummy_data), write_freq):
            rows = pa.array(dummy_data[i:i+write_freq], type=game_type)
            batch = pa.RecordBatch.from_struct_array(rows)
            writer.write(batch)

In [79]:
%%time
# read with mmap
with pa.memory_map('games.arrow', 'r') as source:
    loaded_arrays = pa.ipc.open_file(source).read_all()

CPU times: user 210 ms, sys: 60.6 ms, total: 271 ms
Wall time: 273 ms


In [98]:
games = []
for _ in tqdm(np.arange(10000)):
    idx = np.random.randint(0, len(dummy_data))
    
    keys = ('moves', 'white_elo', 'black_elo', 'result')
    game = {}
    for key, type_cast in zip(keys, types):
        game[key] = loaded_arrays[key][idx].as_py()
    games.append(game)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [97]:
games[0]['moves'].as_py()
games[0]['white_elo'].as_py()
games[0]['result'].as_py()

2