Exploring why multiprocess.heap is so slow...

In [2]:
import tempfile, os
from multiprocessing import util
import weakref
import numpy as np
import mmap

In [34]:
fd, name = tempfile.mkstemp(prefix='pym-%d-'%os.getpid(), dir=util.get_temp_dir())

In [35]:
os.unlink(name)
name

'/local/u46/brl654/tmp/pymp-rnpisdeb/pym-5183-ojt0z9vz'

In [3]:
size = 6400000000

In [7]:
def fill(fd, size):
    with open(fd, 'wb', closefd=False) as f:
        bs = 1024 * 1024
        if size >= bs:
            zeros = b'\0' * bs
            for _ in range(size // bs):
                f.write(zeros)
            del zeros
        f.write(b'\0' * (size % bs))
        assert f.tell() == size
        
%time fill(fd, size)

CPU times: user 15 ms, sys: 4.32 s, total: 4.33 s
Wall time: 4.39 s


In [37]:
def fill2():
    with open(fd, 'wb', closefd=False) as f:
        f.seek(size - 1)
        f.write(b'\0')
        assert f.tell() == size
%time fill2()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 99.2 µs


In [14]:
%time buffer = mmap.mmap(fd, size)

NameError: name 'fd' is not defined

Fundamentally... multiprocessing library is writing the full array size to disk, probably. If i just want anonymous..

In [66]:
%time buffer = mmap.mmap(-1, size)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 43.4 µs


In [67]:
z = np.frombuffer(buffer)

In [17]:
z

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [18]:
z.shape

(800000000,)

In [19]:
z.dtype

dtype('float64')

In [20]:
z.size * 64/8 == size

True

In [61]:
#weakref.finalize(z, buffer.close)
def f(closable):
    print("release!")
    closable.close()
    print(type(closable))
weakref.finalize(z, f, buffer)

<finalize object at 0x7f736eb50070; for 'ndarray' at 0x7f73608c3120>

In [62]:
r = weakref.ref(z)

In [29]:
r()

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [27]:
del z

NameError: name 'z' is not defined

In [55]:
y = np.ones(10000000)
rr = weakref.ref(y)

In [56]:
False if rr() is None else rr().shape

(10000000,)

In [57]:
del y

In [58]:
False if rr() is None else rr().shape

False

In [63]:
False if r() is None else r().shape

(800000000,)

In [64]:
del z

release!
<class 'mmap.mmap'>


In [70]:
type(z.data)

memoryview

In [71]:
type(buffer)

mmap.mmap

In [74]:
z.data.release

<function memoryview.release>

In [79]:
d = z.data

In [80]:
d

<memory at 0x7f6d0d4aa1c8>

In [81]:
type(d.obj)

numpy.ndarray

In [82]:
z.base

<mmap.mmap at 0x7f700d09f9c0>

I think what will happen is: 
 - numpy objects will refer to the mmap object as their .base attribute.
 - if those numpy objects all pass out of scope (or are copied to new memory areas) then the references to the mmap decrease
 - once the mmap has no remaining references, I want it to:
  - free the memory, if it is the original process
  - do nothing if it is a fork?
    - if a fork does unmap the memory, it only unmaps it for that process (which is fine if that process is dying)

What might be bad is if it were unmapped while still being referenced by a numpy array. The OS/Python might reallocate it, and then numpy might misbehave. (Confirmed: kills kernel to read numpy after buffer closed.)

In [84]:
mmap.__file__

'/g/data/v10/public/modules/agdc-py3-env/20171016/envs/agdc/lib/python3.6/lib-dynload/mmap.cpython-36m-x86_64-linux-gnu.so'

In [86]:
dir(buffer)

['__add__',
 '__class__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'close',
 'closed',
 'find',
 'flush',
 'move',
 'read',
 'read_byte',
 'readline',
 'resize',
 'rfind',
 'seek',
 'size',
 'tell',
 'write',
 'write_byte']

In [87]:
buffer.__add__

<method-wrapper '__add__' of mmap.mmap object at 0x7f700d09f9c0>

In [88]:
z.base

<mmap.mmap at 0x7f700d09f9c0>

In [89]:
z.base.close()

In [None]:
z

In [46]:
class thing:
    def __init__(self, id):
        self.id = id
    def close(self):
        print("closing", self.id)
class wrapper(thing):
    def __del__(self):
        self.close()
import weakref
t = wrapper(7)
#c = weakref.proxy(t.close)
#weakref.finalize(t, c)
None

In [47]:
t.close()

closing 7


In [48]:
del t

closing 7


In [42]:
thing.close(t)

closing 7


In [45]:
weakref.finalize(t, thing.close, t)

NameError: name 't' is not defined

In [44]:
del t

Exception ignored in: <finalize object at 0x7fb483872180; dead>
Traceback (most recent call last):
  File "/g/data/v10/public/modules/agdc-py3-env/20171016/envs/agdc/lib/python3.6/weakref.py", line 548, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "<ipython-input-39-0d7b4ac559f7>", line 5, in close
ReferenceError: weakly-referenced object no longer exists


In [54]:
import ctypes
import numpy as np

In [62]:
np.dtype(np.uint8).itemsize

1

In [63]:
import multiprocessing.sharedctypes

In [64]:
%time multiprocessing.sharedctypes.RawArray('b', 10^9)

CPU times: user 2 ms, sys: 3 ms, total: 5 ms
Wall time: 8.59 ms


<multiprocessing.sharedctypes.c_byte_Array_3 at 0x7fb475ac1400>

In [67]:
%time mmap.mmap(-1, 10^9)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 40.3 µs


<mmap.mmap at 0x7fb4758703d8>

In [66]:
import mmap

In [68]:
type(mmap.mmap)

type

In [69]:
mmap.mmap.__class__

type

In [70]:
mmap.__class__

module

In [None]:
 == 0) | (future == 0)

In [81]:
import multiprocessing.sharedctypes
import numpy as np



sharedresource = multiprocessing.sharedctypes.RawArray('b', 10)
x = np.frombuffer(sharedresource, dtype=np.uint8)

class Task:
    def __init__(self, x):
        self.__call__.x = x
    def __call__(self, i):
        self.__call__.x[i] = i

def taskmaker(x):
    def task(i, x=x):
        #global x
        x[i] = i
    return task

task = Task(x)

z = x
x = None

with multiprocessing.Pool(4) as pool:
    pool.map(task, range(10)) # implicitly modify x

print(z) # [0 1 2 3 4 5 6 7 8 9]

AttributeError: 'method' object has no attribute 'x'