Commit
As described in the comment above arc_adapt_thread() it is critical that the arc_adapt_thread() function never sleep while holding a hash lock. This was possible in the Linux implementation because the arc_prune() logic was implemented to be synchronous. Under illumos the analogous dnlc_reduce_cache() function is also asynchronous. This patch additionally adds the zfs_arc_meta_strategy module option which allows the meta reclaim strategy to be configured. It defaults to a balanced strategy which has been proven to work well under Linux but the illumos meta-only strategy can be enabled. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -167,6 +167,9 @@ static boolean_t arc_user_evicts_thread_exit; | |
/* number of objects to prune from caches when arc_meta_limit is reached */ | ||
int zfs_arc_meta_prune = 10000; | ||
|
||
/* The preferred strategy to employ when arc_meta_limit is reached */ | ||
int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; | ||
|
||
typedef enum arc_reclaim_strategy { | ||
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ | ||
ARC_RECLAIM_CONS /* Conservative reclaim strategy */ | ||
|
@@ -2430,41 +2433,52 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, | |
} | ||
|
||
/* | ||
* Request that arc user drop references so that N bytes can be released | ||
* from the cache. This provides a mechanism to ensure the arc can honor | ||
* the arc_meta_limit and reclaim buffers which are pinned in the cache | ||
* by higher layers. (i.e. the zpl) | ||
* Helper function for arc_prune() it is responsible for safely handling | ||
* the execution of a registered arc_prune_func_t. | ||
*/ | ||
static void | ||
arc_do_user_prune(int64_t adjustment) | ||
arc_prune_task(void *ptr) | ||
{ | ||
arc_prune_func_t *func; | ||
void *private; | ||
arc_prune_t *cp, *np; | ||
arc_prune_t *ap = (arc_prune_t *)ptr; | ||
arc_prune_func_t *func = ap->p_pfunc; | ||
|
||
mutex_enter(&arc_prune_mtx); | ||
if (func != NULL) | ||
func(ap->p_adjust, ap->p_private); | ||
|
||
cp = list_head(&arc_prune_list); | ||
while (cp != NULL) { | ||
func = cp->p_pfunc; | ||
private = cp->p_private; | ||
np = list_next(&arc_prune_list, cp); | ||
refcount_add(&cp->p_refcnt, func); | ||
mutex_exit(&arc_prune_mtx); | ||
/* Callback unregistered concurrently with execution */ | ||
if (refcount_remove(&ap->p_refcnt, func) == 0) { | ||
ASSERT(!list_link_active(&ap->p_node)); | ||
refcount_destroy(&ap->p_refcnt); | ||
kmem_free(ap, sizeof (*ap)); | ||
} | ||
} | ||
|
||
if (func != NULL) | ||
func(adjustment, private); | ||
/* | ||
* Notify registered consumers they must drop holds on a portion of the ARC | ||
* buffers they reference. This provides a mechanism to ensure the ARC can | ||
* honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This | ||
* is analogous to dnlc_reduce_cache() but more generic. | ||
* | ||
* This operation is performed asynchronously so it may be safely called | ||
* in the context of the arc_adapt_thread(). A reference is taken here | ||
* for each registered arc_prune_t and the arc_prune_task() is responsible | ||
* for releasing it once the registered arc_prune_func_t has completed. | ||
*/ | ||
static void | ||
arc_prune(int64_t adjust) | ||
{ | ||
arc_prune_t *ap; | ||
|
||
mutex_enter(&arc_prune_mtx); | ||
mutex_enter(&arc_prune_mtx); | ||
for (ap = list_head(&arc_prune_list); ap != NULL; | ||
ap = list_next(&arc_prune_list, ap)) { | ||
|
||
/* User removed prune callback concurrently with execution */ | ||
if (refcount_remove(&cp->p_refcnt, func) == 0) { | ||
ASSERT(!list_link_active(&cp->p_node)); | ||
refcount_destroy(&cp->p_refcnt); | ||
kmem_free(cp, sizeof (*cp)); | ||
} | ||
if (refcount_count(&ap->p_refcnt) >= 2) | ||
continue; | ||
|
||
cp = np; | ||
refcount_add(&ap->p_refcnt, cp->p_pfunc); | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
behlendorf
Author
Owner
|
||
ap->p_adjust = adjust; | ||
taskq_dispatch(system_taskq, arc_prune_task, ap, KM_SLEEP); | ||
} | ||
|
||
ARCSTAT_BUMP(arcstat_prune); | ||
|
@@ -2511,7 +2525,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, | |
* available for reclaim. | ||
*/ | ||
static uint64_t | ||
arc_adjust_meta(void) | ||
arc_adjust_meta_balanced(void) | ||
{ | ||
int64_t adjustmnt, delta, prune = 0; | ||
uint64_t total_evicted = 0; | ||
|
@@ -2580,7 +2594,7 @@ arc_adjust_meta(void) | |
|
||
if (zfs_arc_meta_prune) { | ||
prune += zfs_arc_meta_prune; | ||
arc_do_user_prune(prune); | ||
arc_prune(prune); | ||
} | ||
} | ||
|
||
|
@@ -2592,6 +2606,50 @@ arc_adjust_meta(void) | |
return (total_evicted); | ||
} | ||
|
||
/* | ||
* Evict metadata buffers from the cache, such that arc_meta_used is | ||
* capped by the arc_meta_limit tunable. | ||
*/ | ||
static uint64_t | ||
arc_adjust_meta_only(void) | ||
{ | ||
uint64_t total_evicted = 0; | ||
int64_t target; | ||
|
||
/* | ||
* If we're over the meta limit, we want to evict enough | ||
* metadata to get back under the meta limit. We don't want to | ||
* evict so much that we drop the MRU below arc_p, though. If | ||
* we're over the meta limit more than we're over arc_p, we | ||
* evict some from the MRU here, and some from the MFU below. | ||
*/ | ||
target = MIN((int64_t)(arc_meta_used - arc_meta_limit), | ||
(int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); | ||
|
||
total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); | ||
|
||
/* | ||
* Similar to the above, we want to evict enough bytes to get us | ||
* below the meta limit, but not so much as to drop us below the | ||
* space allotted to the MFU (which is defined as arc_c - arc_p). | ||
*/ | ||
target = MIN((int64_t)(arc_meta_used - arc_meta_limit), | ||
(int64_t)(arc_mfu->arcs_size - (arc_c - arc_p))); | ||
|
||
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); | ||
|
||
return (total_evicted); | ||
} | ||
|
||
static uint64_t | ||
arc_adjust_meta(void) | ||
{ | ||
if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) | ||
return (arc_adjust_meta_only()); | ||
else | ||
return (arc_adjust_meta_balanced()); | ||
} | ||
|
||
/* | ||
* Return the type of the oldest buffer in the given arc state | ||
* | ||
|
@@ -2905,6 +2963,14 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) | |
extern kmem_cache_t *zio_buf_cache[]; | ||
extern kmem_cache_t *zio_data_buf_cache[]; | ||
|
||
if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) { | ||
/* | ||
* We are exceeding our meta-data cache limit. | ||
* Prune some entries to release holds on meta-data. | ||
*/ | ||
arc_prune(zfs_arc_meta_prune); | ||
} | ||
|
||
/* | ||
* An aggressive reclamation will shrink the cache size as well as | ||
* reap free buffers from the arc kmem caches. | ||
|
@@ -2929,15 +2995,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) | |
} | ||
|
||
/* | ||
* Unlike other ZFS implementations this thread is only responsible for | ||
* adapting the target ARC size on Linux. The responsibility for memory | ||
* reclamation has been entirely delegated to the arc_shrinker_func() | ||
* which is registered with the VM. To reflect this change in behavior | ||
* the arc_reclaim thread has been renamed to arc_adapt. | ||
* | ||
* The following comment from arc_reclaim_thread() in illumos is still | ||
* applicable: | ||
* | ||
* Threads can block in arc_get_data_buf() waiting for this thread to evict | ||
* enough data and signal them to proceed. When this happens, the threads in | ||
* arc_get_data_buf() are sleeping while holding the hash lock for their | ||
|
@@ -6374,6 +6431,9 @@ module_param(zfs_arc_meta_adjust_restarts, ulong, 0644); | |
MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts, | ||
"Limit number of restarts in arc_adjust_meta"); | ||
|
||
module_param(zfs_arc_meta_strategy, int, 0644); | ||
MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy"); | ||
|
||
module_param(zfs_arc_grow_retry, int, 0644); | ||
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); | ||
|
||
|
I'm just getting my self up to speed on this patch after having applied it to my latest pull request.
If this is supposed to act like the existing
arc_do_user_prune()
insofar as bumping refcounts is concerned, you'll need to restore the `cp` and `np` pointers (presumably current and next pointers) since this won't compile as-is.