-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: on_full policy #17
Changes from all commits
cb4cb44
e886603
b9a850a
7c23f43
40ec49d
706102c
815e615
c2a5a9a
34cb7cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
from .core import Chest | ||
from .core import Chest, full_policy | ||
from ._version import __version__ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from collections import MutableMapping | ||
import errno | ||
from functools import partial | ||
from threading import Lock | ||
from contextlib import contextmanager | ||
|
@@ -14,6 +15,9 @@ | |
from heapdict import heapdict | ||
import hashlib | ||
|
||
from .utils import BytesIO | ||
|
||
|
||
DEFAULT_AVAILABLE_MEMORY = 1e9 | ||
MAX_OPEN_FILES = 512 | ||
|
||
|
@@ -38,14 +42,33 @@ def _do_nothing(*args, **kwargs): | |
pass | ||
|
||
|
||
class full_policy(object): | ||
""" Behaviours to use when a a chest is full. | ||
""" | ||
@staticmethod | ||
def raise_(chest): | ||
"""A policy that raises an ENOSPC error if the disk is full. | ||
""" | ||
raise OSError(errno.ENOSPC, os.strerror(errno.ENOSPC), chest.path) | ||
|
||
@staticmethod | ||
def pop_lru(chest): | ||
"""A policy that removes the most recently used element from | ||
the cache when the chest is full. | ||
""" | ||
del chest[chest.heap.popitem()[0]] | ||
|
||
|
||
@contextmanager | ||
def _open_many(fnames, mode='rb'): | ||
fs = [] | ||
for fn in fnames: | ||
fs.append(open(fn, mode=mode)) | ||
yield fs | ||
for f in fs: | ||
f.close() | ||
try: | ||
yield fs | ||
finally: | ||
for f in fs: | ||
f.close() | ||
|
||
|
||
class Chest(MutableMapping): | ||
|
@@ -64,12 +87,21 @@ class Chest(MutableMapping): | |
A directory path to store contents of the chest. Defaults to a tmp dir | ||
available_memory : int (optional) | ||
Number of bytes that a chest should use for in-memory storage | ||
available_disk : int (optional) | ||
Number of bytes that a chest should use for on-disk storage. | ||
If this is not provided then there will be no cap. | ||
on_full : function (optional) | ||
A function to be called if available_disk has been used; however, | ||
more disk space is requested. This function will be passed the chest | ||
dump : function (optional) | ||
A function like pickle.dump or json.dump that dumps contents to file | ||
load : function(optional) | ||
A function like pickle.load or json.load that loads contents from file | ||
key_to_filename : function (optional) | ||
A function to determine filenames from key values | ||
lock : Lock (optional) | ||
The lock object to use to control access to the disk. If no lock is | ||
given, a new threading lock will be used. | ||
mode : str (t or b) | ||
Binary or text mode for file storage | ||
|
||
|
@@ -88,10 +120,14 @@ class Chest(MutableMapping): | |
|
||
>>> c.drop() | ||
""" | ||
def __init__(self, data=None, path=None, available_memory=None, | ||
def __init__(self, data=None, path=None, | ||
available_memory=None, | ||
available_disk=None, | ||
dump=partial(pickle.dump, protocol=1), | ||
load=pickle.load, | ||
key_to_filename=key_to_filename, | ||
lock=None, | ||
on_full=full_policy.raise_, | ||
on_miss=_do_nothing, on_overflow=_do_nothing, | ||
open=open, | ||
open_many=_open_many, | ||
|
@@ -108,9 +144,13 @@ def __init__(self, data=None, path=None, available_memory=None, | |
self.available_memory = (available_memory if available_memory | ||
is not None else DEFAULT_AVAILABLE_MEMORY) | ||
self.memory_usage = sum(map(nbytes, self.inmem.values())) | ||
self.available_disk = available_disk | ||
self.on_full = on_full | ||
# Functions to control disk I/O | ||
self.load = load | ||
self.dump = dump | ||
self._dump = dump | ||
if available_disk is None: | ||
self.dump = dump | ||
self.mode = mode | ||
self.open = open | ||
self.open_many = open_many | ||
|
@@ -124,7 +164,7 @@ def __init__(self, data=None, path=None, available_memory=None, | |
if not os.path.exists(self.path): | ||
os.mkdir(self.path) | ||
|
||
self.lock = Lock() | ||
self.lock = lock if lock is not None else Lock() | ||
|
||
# LRU state | ||
self.counter = 0 | ||
|
@@ -134,8 +174,25 @@ def __init__(self, data=None, path=None, available_memory=None, | |
self._on_miss = on_miss | ||
self._on_overflow = on_overflow | ||
|
||
def __str__(self): | ||
return '<chest at %s>' % self.path | ||
def __repr__(self): | ||
return '<chest at %s: keys=%s>' % (self.path, list(self._keys)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is potentially very very large, no? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be; however, it is closer to the repr for dict. I found it very hard to debug when I couldn't see what my keys were. I am happy to revert this. |
||
__str__ = __repr__ | ||
|
||
@property | ||
def disk_usage(self): | ||
path = self.path | ||
return sum( | ||
os.path.getsize(os.path.join(r, f)) | ||
for r, _, fs in os.walk(path) for f in fs | ||
) | ||
|
||
def dump(self, data, file_): | ||
tmp = BytesIO() | ||
self._dump(data, tmp) | ||
bs = tmp.getvalue() | ||
while self.disk_usage + len(bs) > self.available_disk: | ||
self.on_full(self) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the overhead of this like when there are many keys on disk? This seems like potentially a lot of file system access. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we maintain a total instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The issue is that I couldn't generalize the way to account for the filesystem overhead. I wasn't positive how big all of the files would actually be on disk so I figured it was safest to ask the os. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps we could maintain a total, like how we do for Alternatively can you measure the overhead of how this works when we have one million files? I expect this to be non-negligible at that scale. |
||
file_.write(bs) | ||
|
||
def key_to_filename(self, key): | ||
""" Filename where key will be held """ | ||
|
@@ -251,7 +308,6 @@ def shrink(self): | |
|
||
while self.memory_usage > self.available_memory: | ||
key, _ = self.heap.popitem() | ||
data = self.inmem[key] | ||
try: | ||
self.move_to_disk(key) | ||
except TypeError: | ||
|
@@ -278,7 +334,6 @@ def __enter__(self): | |
|
||
def __exit__(self, eType, eValue, eTrace): | ||
with self.lock: | ||
L = os.listdir(self.path) | ||
if not self._explicitly_given_path and os.path.exists(self.path): | ||
self.drop() # pragma: no cover | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,17 @@ | ||
def raises(err, lamda): | ||
import sys | ||
|
||
PY2 = sys.version_info[0] == 2 | ||
PY3 = sys.version_info[0] == 3 | ||
|
||
if PY2: | ||
from StringIO import StringIO as BytesIO # pragma: no cover | ||
else: | ||
from io import BytesIO # pragma: no cover | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Apparently this works in Python 2 as wll There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, didn't know that. I will update it. |
||
|
||
|
||
def raises(err, lambda_): | ||
try: | ||
lamda() | ||
lambda_() | ||
return False | ||
except err: | ||
return True | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can these just be functions rather than
staticmethod
s on a class?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am using this only as a namespace. I normally agree that free functions are better than functions but it is nice that I can import
full_policy
and then discover all of the policies we have provided.How would you feel about making this class namespace a submodule?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I would prefer a submodule rather than a class. I would normally wait to produce a submodule until I had a larger critical mass of functions. Not a big deal either way though.