Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: on_full policy #17

Closed
wants to merge 9 commits into from
2 changes: 1 addition & 1 deletion chest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .core import Chest
from .core import Chest, full_policy
from ._version import __version__
75 changes: 65 additions & 10 deletions chest/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import MutableMapping
import errno
from functools import partial
from threading import Lock
from contextlib import contextmanager
Expand All @@ -14,6 +15,9 @@
from heapdict import heapdict
import hashlib

from .utils import BytesIO


DEFAULT_AVAILABLE_MEMORY = 1e9
MAX_OPEN_FILES = 512

Expand All @@ -38,14 +42,33 @@ def _do_nothing(*args, **kwargs):
pass


class full_policy(object):
""" Behaviours to use when a a chest is full.
"""
@staticmethod
def raise_(chest):
"""A policy that raises an ENOSPC error if the disk is full.
"""
raise OSError(errno.ENOSPC, os.strerror(errno.ENOSPC), chest.path)

@staticmethod
def pop_lru(chest):
"""A policy that removes the most recently used element from
the cache when the chest is full.
"""
del chest[chest.heap.popitem()[0]]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can these just be functions rather than staticmethods on a class?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am using this only as a namespace. I normally agree that free functions are better than functions but it is nice that I can import full_policy and then discover all of the policies we have provided.

How would you feel about making this class namespace a submodule?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would prefer a submodule rather than a class. I would normally wait to produce a submodule until I had a larger critical mass of functions. Not a big deal either way though.



@contextmanager
def _open_many(fnames, mode='rb'):
fs = []
for fn in fnames:
fs.append(open(fn, mode=mode))
yield fs
for f in fs:
f.close()
try:
yield fs
finally:
for f in fs:
f.close()


class Chest(MutableMapping):
Expand All @@ -64,12 +87,21 @@ class Chest(MutableMapping):
A directory path to store contents of the chest. Defaults to a tmp dir
available_memory : int (optional)
Number of bytes that a chest should use for in-memory storage
available_disk : int (optional)
Number of bytes that a chest should use for on-disk storage.
If this is not provided then there will be no cap.
on_full : function (optional)
A function to be called if available_disk has been used; however,
more disk space is requested. This function will be passed the chest
dump : function (optional)
A function like pickle.dump or json.dump that dumps contents to file
load : function(optional)
A function like pickle.load or json.load that loads contents from file
key_to_filename : function (optional)
A function to determine filenames from key values
lock : Lock (optional)
The lock object to use to control access to the disk. If no lock is
given, a new threading lock will be used.
mode : str (t or b)
Binary or text mode for file storage

Expand All @@ -88,10 +120,14 @@ class Chest(MutableMapping):

>>> c.drop()
"""
def __init__(self, data=None, path=None, available_memory=None,
def __init__(self, data=None, path=None,
available_memory=None,
available_disk=None,
dump=partial(pickle.dump, protocol=1),
load=pickle.load,
key_to_filename=key_to_filename,
lock=None,
on_full=full_policy.raise_,
on_miss=_do_nothing, on_overflow=_do_nothing,
open=open,
open_many=_open_many,
Expand All @@ -108,9 +144,13 @@ def __init__(self, data=None, path=None, available_memory=None,
self.available_memory = (available_memory if available_memory
is not None else DEFAULT_AVAILABLE_MEMORY)
self.memory_usage = sum(map(nbytes, self.inmem.values()))
self.available_disk = available_disk
self.on_full = on_full
# Functions to control disk I/O
self.load = load
self.dump = dump
self._dump = dump
if available_disk is None:
self.dump = dump
self.mode = mode
self.open = open
self.open_many = open_many
Expand All @@ -124,7 +164,7 @@ def __init__(self, data=None, path=None, available_memory=None,
if not os.path.exists(self.path):
os.mkdir(self.path)

self.lock = Lock()
self.lock = lock if lock is not None else Lock()

# LRU state
self.counter = 0
Expand All @@ -134,8 +174,25 @@ def __init__(self, data=None, path=None, available_memory=None,
self._on_miss = on_miss
self._on_overflow = on_overflow

def __str__(self):
return '<chest at %s>' % self.path
def __repr__(self):
return '<chest at %s: keys=%s>' % (self.path, list(self._keys))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is potentially very very large, no?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be; however, it is closer to the repr for dict. I found it very hard to debug when I couldn't see what my keys were. I am happy to revert this.

__str__ = __repr__

@property
def disk_usage(self):
path = self.path
return sum(
os.path.getsize(os.path.join(r, f))
for r, _, fs in os.walk(path) for f in fs
)

def dump(self, data, file_):
tmp = BytesIO()
self._dump(data, tmp)
bs = tmp.getvalue()
while self.disk_usage + len(bs) > self.available_disk:
self.on_full(self)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the overhead of this like when there are many keys on disk? This seems like potentially a lot of file system access.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we maintain a total instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue is that I couldn't generalize the way to account for the filesystem overhead. I wasn't positive how big all of the files would actually be on disk so I figured it was safest to ask the os.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we could maintain a total, like how we do for memory_usage and add or subtract from it as we add or remove files from disk.

Alternatively can you measure the overhead of how this works when we have one million files? I expect this to be non-negligible at that scale.

file_.write(bs)

def key_to_filename(self, key):
""" Filename where key will be held """
Expand Down Expand Up @@ -251,7 +308,6 @@ def shrink(self):

while self.memory_usage > self.available_memory:
key, _ = self.heap.popitem()
data = self.inmem[key]
try:
self.move_to_disk(key)
except TypeError:
Expand All @@ -278,7 +334,6 @@ def __enter__(self):

def __exit__(self, eType, eValue, eTrace):
with self.lock:
L = os.listdir(self.path)
if not self._explicitly_given_path and os.path.exists(self.path):
self.drop() # pragma: no cover

Expand Down
24 changes: 22 additions & 2 deletions chest/tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from chest.core import Chest, nbytes, key_to_filename
import os
import re
import json
Expand All @@ -7,9 +6,11 @@
from contextlib import contextmanager
import numpy as np
from chest.utils import raises, raise_KeyError
import time
import hashlib

from chest.core import Chest, nbytes, key_to_filename, full_policy
from chest.utils import PY3


@contextmanager
def tmp_chest(*args, **kwargs):
Expand Down Expand Up @@ -458,3 +459,22 @@ def test_prefetch():
assert not raises(KeyError, lambda: c[1])
c.prefetch([1, 2])
assert not raises(KeyError, lambda: c[2])


def test_available_disk_raise():
with tmp_chest(available_disk=0) as c:
c[1] = 1
assert not raises(KeyError, lambda: c[1])
assert raises(OSError, lambda: c.flush())


def test_available_disk_pop_lru():
with tmp_chest(available_disk=100 if PY3 else 94,
on_full=full_policy.pop_lru) as c:
c[1] = 1
assert not raises(KeyError, lambda: c[1])
c[2] = 2
assert not raises(KeyError, lambda: c[2])
c.flush()
assert raises(KeyError, lambda: c[1])
assert not raises(KeyError, lambda: c[2])
15 changes: 13 additions & 2 deletions chest/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
def raises(err, lamda):
import sys

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

if PY2:
from StringIO import StringIO as BytesIO # pragma: no cover
else:
from io import BytesIO # pragma: no cover
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently this works in Python 2 as wll

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, didn't know that. I will update it.



def raises(err, lambda_):
try:
lamda()
lambda_()
return False
except err:
return True
Expand Down