In [68]:
import numpy

import pickle
from pickletools import optimize
import gzip

from timeit import timeit

from tf.core.helpers import deepSize
from tf.core.files import expanduser as ex

# The data

We take a sizable piece of data from the BHSA: the oslots data.

In [6]:
oslotsFile = ex("~/github/ETCBC/bhsa/tf/2021/.tf/3/oslots.tfx")

In [40]:
with gzip.open(oslotsFile, "rb") as f:
    data = pickle.load(f)
(eoslots, maxSlot, maxNode) = data
print(f"{maxSlot:>7} slots\n{maxNode:>7} nodes")

 426590 slots
1446831 nodes


eoslots is a tuple of array of int


In [58]:
def getSize(material):
    return int(round(deepSize(material) / 1024 / 1024))

def getInfo(name, material):
    typ = f"{type(material).__name__} of {type(material[0]).__name__} of {type(material[0][0]).__name__}"
    print(f"{name} is a {typ}")
    n = sum(len(x) for x in material)
    s = getSize(material)
    print(f"{len(material):>7} node linkages\n{n:>7} numbers\n{s:>7} MB")

In [59]:
getInfo("original", eoslots)

original is a tuple of array of int
1020241 node linkages
4854312 numbers
    116 MB


# Experiments

We'll pack the same data in numpy arrays, in several different ways.

# As array of arrays

We make a numpy array of numpy arrays.
Note that the sub-arrays have different length.
Numpy can handle this.

In [60]:
eoslots1 = numpy.array([numpy.array(x, dtype="uint32") for x in eoslots], dtype=object) 

In [61]:
getInfo("rough numpy", eoslots1)

rough numpy is a ndarray of ndarray of uint32
1020241 node linkages
4854312 numbers
      8 MB


# As one big array with an index array

In [99]:
eoslots2main = []
eoslots2start = []
eoslots2end = []

b = 0

for ns in eoslots:
    eoslots2start.append(b)
    eoslots2main.extend(ns)
    b += len(ns)
    eoslots2end.append(b)
    
eoslots2main = numpy.array(eoslots2main, dtype="uint32")
eoslots2start = numpy.array(eoslots2start, dtype="uint32")
eoslots2end = numpy.array(eoslots2end, dtype="uint32")

In [100]:
getSize(eoslots2main)

19

In [101]:
getSize(eoslots2start)

4

In [102]:
getSize(eoslots2end)

4

# Speed

What about speed?

Let's extract all data and take the average of all numbers found in all representations.

In [107]:
def getSpeed(material, indexed=False):
    if indexed:

        def action():
            (main, start, end) = material
            total = sum(sum(main[start[n] : end[n]]) for n in range(len(start)))
            n = sum(end[n] - start[n] for n in range(len(start)))
            return (n, total)

    else:

        def action():
            total = sum(sum(ns) for ns in material)
            n = sum(len(ns) for ns in material)
            return (n, total)

    (n, total) = action()
    xTime = timeit("action()", globals=locals(), number=1)
    print(f"{n:>8} {total:>8} {xTime}")

In [108]:
getSpeed(eoslots)

 4854312 1034928337345 0.18651545800094027


In [109]:
getSpeed(eoslots1)

 4854312 1034928337345 1.7245157919969643


In [110]:
getSpeed((eoslots2main, eoslots2start, eoslots2end), indexed=True)

 4854312 1034928337345 2.1193653749942314


# Observation

It is no use to cram all numbers in a single numpy array rather than in a numpy array of numpy arrays.
Both the memory footprint and the performance suffer from it.

The performance suffers 10 fold with numpy as compared to plain tuples.
But the memory footprint decreases almost 15 fold!