Merge pull request facebookarchive#34 from willsmithorg/master
Skip sequential IO
mohans committed Oct 17, 2011
2 parents 5d28979 + ece3c8e commit 42321f0
Showing 9 changed files with 281 additions and 21 deletions.
20 changes: 18 additions & 2 deletions doc/flashcache-doc.txt
@@ -40,8 +40,9 @@ block. Note that a sequential range of disk blocks will all map onto a
given set.

The DM layer breaks up all IOs into blocksize chunks before passing
the IOs down to the cache layer. Flashcache caches all full blocksize
IOs.
the IOs down to the cache layer. By default, flashcache caches all
full blocksize IOs, but can be configured to only cache random IO
whilst ignoring sequential IO.

Replacement policy is either FIFO or LRU within a cache set. The
default is FIFO but policy can be switched at any point at run time
@@ -164,6 +165,19 @@ In spite of the limitations, we think the ability to mark Direct IOs
issued by a pid will be valuable to prevent backups from wiping out
the cache.

Alternatively, rather than specifically marking pids as non-cacheable,
users may wish to experiment with the sysctl 'skip_seq_thresh_kb', which
disables caching of IO determined to be sequential once it exceeds a
configurable threshold of consecutive reads or writes. The algorithm
that spots sequential IO has some ability to handle multiple 'flows' of
IO, so it should, for example, be able to skip caching of IOs from two
flows of sequential reads or writes while still caching IOs from a
third, random flow. Note that multiple small files may be written to
consecutive blocks. If these are written out in a batch (e.g. by an
untar), they may appear as a single sequential write, and hence these
small files will not be cached. The categorization of IO as sequential
or random occurs purely at the block level, not the file level.
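
To make the idea concrete, here is a minimal sketch in C of how such
flow tracking might look. It is purely illustrative: FLOW_DEPTH,
recent_flows and io_is_sequential() are invented names for this
example and are not part of the flashcache sources.

#include <linux/types.h>        /* sector_t */

#define FLOW_DEPTH 8            /* how many concurrent IO flows to remember */

struct flow {
        sector_t next_expected;         /* sector just past the last IO seen */
        unsigned long run_kb;           /* consecutive kb seen in this flow  */
};

static struct flow recent_flows[FLOW_DEPTH];

/* Return 1 if this IO continues a flow that has already exceeded thresh_kb. */
static int
io_is_sequential(sector_t start, unsigned int size_kb, unsigned int thresh_kb)
{
        int i;

        for (i = 0; i < FLOW_DEPTH; i++) {
                if (recent_flows[i].next_expected == start) {
                        /* Continuation of a known flow. */
                        recent_flows[i].run_kb += size_kb;
                        /* 512-byte sectors: size_kb kb == size_kb * 2 sectors */
                        recent_flows[i].next_expected = start + size_kb * 2;
                        return (thresh_kb > 0 &&
                                recent_flows[i].run_kb > thresh_kb);
                }
        }
        /* New flow: reuse slot 0 here for brevity; a real implementation
         * would evict the least recently used entry instead. */
        recent_flows[0].next_expected = start + size_kb * 2;
        recent_flows[0].run_kb = size_kb;
        return 0;
}

Each IO either extends a remembered flow or claims a slot for a new
one; only flows whose run length has passed the threshold are skipped,
which is why the first part of every sequential stream still ends up
in the cache.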

(For a more detailed discussion about caching controls, see the SA Guide).

Futures and Features :
@@ -298,3 +312,5 @@ Acknowledgements :
I would like to thank Bob English for doing a critical review of the
design and the code of flashcache, for discussing this in detail with
me and providing valuable suggestions.

The option to detect and skip sequential IO was added by Will Smith.
44 changes: 42 additions & 2 deletions doc/flashcache-sa-guide.txt
@@ -199,6 +199,7 @@ dev.flashcache.ram3+ram4.pid_expiry_secs = 60
dev.flashcache.ram3+ram4.max_pids = 100
dev.flashcache.ram3+ram4.do_pid_expiry = 0
dev.flashcache.ram3+ram4.io_latency_hist = 0
dev.flashcache.ram3+ram4.skip_seq_thresh_kb = 0

Sysctls for a writeback mode cache :
cache device /dev/sdb, disk device /dev/cciss/c0d2
@@ -218,6 +219,7 @@ dev.flashcache.sdb+c0d2.dirty_thresh_pct = 20
dev.flashcache.sdb+c0d2.stop_sync = 0
dev.flashcache.sdb+c0d2.do_sync = 0
dev.flashcache.sdb+c0d2.io_latency_hist = 0
dev.flashcache.sdb+c0d2.skip_seq_thresh_kb = 0

Sysctls common to all cache modes :

@@ -243,13 +245,19 @@ dev.flashcache.<cachedev>.do_pid_expiry:
Enable expiry on the list of pids in the white/black lists.
dev.flashcache.<cachedev>.pid_expiry_secs:
Set the expiry on the pid white/black lists.
dev.flashcache.<cachedev>.skip_seq_thresh_kb:
Skip (don't cache) sequential IO larger than this number (in kb).
0 (default) means cache all IO, both sequential and random.
Sequential IO can only be determined 'after the fact', so
this much of each sequential I/O will be cached before we skip
the rest. Does not affect searching for IO in an existing cache.
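
For example, using the writeback cache named sdb+c0d2 from the
listing above (substitute your own cache device name), setting

dev.flashcache.sdb+c0d2.skip_seq_thresh_kb = 1024

caches roughly the first 1024kb of each sequential stream (the
portion seen before the stream is recognized as sequential) and
skips the rest, while random IO continues to be cached as usual.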

Sysctls for writeback mode only :

dev.flashcache.<cachedev>.fallow_delay = 900
In seconds. Clean dirty blocks that have been "idle" (not
read or written) for fallow_delay seconds. Default is 60
seconds.
read or written) for fallow_delay seconds. Default is 15
minutes.
Setting this to 0 disables idle cleaning completely.
dev.flashcache.<cachedev>.fallow_clean_speed = 2
The maximum number of "fallow clean" disk writes per set
@@ -350,13 +358,17 @@ not cache the IO. ELSE,
2) If the tgid is in the blacklist, don't cache this IO. UNLESS
3) The particular pid is marked as an exception (and entered in the
whitelist, which makes the IO cacheable).
4) Finally, even if IO is cacheable up to this point, skip sequential IO
if configured by the sysctl.

Conversely, in "cache nothing" mode,
1) If the pid of the process issuing the IO is in the whitelist,
cache the IO. ELSE,
2) If the tgid is in the whitelist, cache this IO. UNLESS
3) The particular pid is marked as an exception (and entered in the
blacklist, which makes the IO non-cacheable).
4) Anything whitelisted is cached, regardless of whether the IO is
sequential or random.
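
The combined checks can be pictured with the following C-style sketch.
It is illustrative only: should_cache(), on_whitelist() and
on_blacklist() are invented names for this example, not actual
flashcache functions, and io_is_seq stands for "classified as
sequential past a non-zero skip_seq_thresh_kb".

#include <sys/types.h>          /* pid_t */

/* Hypothetical stand-ins for the pid white/black list lookups. */
extern int on_whitelist(pid_t id);
extern int on_blacklist(pid_t id);

/* Return 1 if this IO should be cached, 0 if it should bypass the cache. */
static int
should_cache(int cache_all, pid_t pid, pid_t tgid, int io_is_seq)
{
        if (cache_all) {
                /* "cache everything" mode */
                if (on_blacklist(pid))
                        return 0;
                if (on_blacklist(tgid) && !on_whitelist(pid))
                        return 0;
                /* Cacheable so far; finally apply the sequential skip. */
                return !io_is_seq;
        }
        /* "cache nothing" mode: whitelisted IO is always cached,
         * sequential or not. */
        if (on_whitelist(pid))
                return 1;
        if (on_whitelist(tgid) && !on_blacklist(pid))
                return 1;
        return 0;
}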

Examples :
--------
@@ -480,6 +492,34 @@ agsize * agcount ~= V

Works just as well as the formula above.

Tuning Sequential IO Skipping for better flashcache performance
===============================================================
Skipping sequential IO makes sense in two cases:
1) The sequential write speed of your SSD is slower than
the sequential write speed or read speed of your disk. In
particular, for implementations with RAID disks (especially
modes 0, 10 or 5) sequential reads may be very fast. If
'cache_all' mode is used, every disk read miss must also be
written to SSD. If you notice slower sequential reads and writes
after enabling flashcache, this is likely your problem.
2) Your 'resident set' of disk blocks that you want cached, i.e.
those that you would hope to keep in cache, is smaller
than the size of your SSD. You can check this by monitoring
how quickly your cache fills up ('dmsetup table'). If this
is the case, it makes sense to prioritize caching of random IO,
since SSD performance vastly exceeds disk performance for
random IO, but is typically not much better for sequential IO.

In the above cases, start with a high value (say 1024k) for
sysctl dev.flashcache.<device>.skip_seq_thresh_kb, so only the
largest sequential IOs are skipped, and gradually reduce it if
benchmarks show it is helping. If it turns out not to help, don't
leave it set to a very high value; return it to 0 (the default),
since there is some overhead in categorizing IO as random or
sequential.
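
For example, for a cache named sdb+c0d2 (substitute your own cache
device name), the threshold can be adjusted at run time:

sysctl -w dev.flashcache.sdb+c0d2.skip_seq_thresh_kb=1024

Re-run your benchmark, halve the value, and repeat for as long as it
keeps helping; if it never helps, set it back to 0:

sysctl -w dev.flashcache.sdb+c0d2.skip_seq_thresh_kb=0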

If neither of the above holds, continue to cache all IO (the
default); you will likely benefit from it.


Further Information
===================
4 changes: 3 additions & 1 deletion flashcache-wt/README
@@ -1,7 +1,9 @@
flashcache-wt is a simple, non-persistent write-through and write-around
flashcache.

It is a separate code base from flashcache (which is write back only).
It is a separate code base from flashcache. Note that flashcache itself, which
is more configurable, now has options for writeback, writethrough and writearound
caching.

Notes :
-----
27 changes: 26 additions & 1 deletion src/flashcache.h
@@ -165,6 +166,7 @@ struct flashcache_stats {
        unsigned long expiry;
        unsigned long front_merge, back_merge; /* Write Merging */
        unsigned long uncached_reads, uncached_writes;
        unsigned long uncached_sequential_reads, uncached_sequential_writes;
        unsigned long disk_reads, disk_writes;
        unsigned long ssd_reads, ssd_writes;
        unsigned long uncached_io_requeue;
@@ -173,6 +174,24 @@ struct flashcache_stats {
        unsigned long clean_set_ios;
};

/*
 * Sequential block history structure - each one
 * records a 'flow' of i/o.
 */
struct sequential_io {
        sector_t most_recent_sector;
        unsigned long sequential_count;
        /* We use LRU replacement when we need to record a new i/o 'flow' */
        struct sequential_io *prev, *next;
};
#define SKIP_SEQUENTIAL_THRESHOLD 0     /* 0 = cache all, >0 = don't cache sequential i/o more than this (kb) */
#define SEQUENTIAL_TRACKER_QUEUE_DEPTH 32       /* How many io 'flows' to track (random i/o will hog many).
                                                 * This should be large enough so that we don't quickly
                                                 * evict sequential i/o when we see some random,
                                                 * but small enough that searching through it isn't slow
                                                 * (currently we do linear search; we could consider hashing). */


/*
* Cache context
*/
@@ -275,6 +294,12 @@ struct cache_c {
        int sysctl_cache_all;
        int sysctl_fallow_clean_speed;
        int sysctl_fallow_delay;
        int sysctl_skip_seq_thresh_kb;

        /* Sequential I/O spotter */
        struct sequential_io seq_recent_ios[SEQUENTIAL_TRACKER_QUEUE_DEPTH];
        struct sequential_io *seq_io_head;
        struct sequential_io *seq_io_tail;
};

/* kcached/pending job states */
@@ -333,7 +358,7 @@ enum {
#define DIRTY 0x0040 /* Dirty, needs writeback to disk */
/*
* Old and Dirty blocks are cleaned with a Clock like algorithm. The leading hand
* marks DIRTY_FALLOW_1. 60 seconds (default) later, the trailing hand comes along and
* marks DIRTY_FALLOW_1. 900 seconds (default) later, the trailing hand comes along and
* marks DIRTY_FALLOW_2 if DIRTY_FALLOW_1 is already set. If the block was used in the
* interim, (DIRTY_FALLOW_1|DIRTY_FALLOW_2) is cleared. Any block that has both
* DIRTY_FALLOW_1 and DIRTY_FALLOW_2 marked is considered old and is eligible
21 changes: 20 additions & 1 deletion src/flashcache_conf.c
@@ -1147,7 +1147,18 @@ flashcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        dmc->sysctl_cache_all = 1;
        dmc->sysctl_fallow_clean_speed = FALLOW_CLEAN_SPEED;
        dmc->sysctl_fallow_delay = FALLOW_DELAY;

        dmc->sysctl_skip_seq_thresh_kb = SKIP_SEQUENTIAL_THRESHOLD;

        /* Sequential i/o spotting */
        for (i = 0; i < SEQUENTIAL_TRACKER_QUEUE_DEPTH; i++) {
                dmc->seq_recent_ios[i].most_recent_sector = 0;
                dmc->seq_recent_ios[i].sequential_count = 0;
                dmc->seq_recent_ios[i].prev = (struct sequential_io *)NULL;
                dmc->seq_recent_ios[i].next = (struct sequential_io *)NULL;
                seq_io_move_to_lruhead(dmc, &dmc->seq_recent_ios[i]);
        }
        dmc->seq_io_tail = &dmc->seq_recent_ios[0];

        (void)wait_on_bit_lock(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST,
                               flashcache_wait_schedule, TASK_UNINTERRUPTIBLE);
        dmc->next_cache = cache_list_head;
@@ -1275,9 +1286,11 @@ flashcache_dtr_stats_print(struct cache_c *dmc)
/* All modes */
DMINFO("\tdisk reads(%lu), disk writes(%lu) ssd reads(%lu) ssd writes(%lu)\n" \
"\tuncached reads(%lu), uncached writes(%lu), uncached IO requeue(%lu)\n" \
"\tuncached sequential reads (%lu), uncached sequential writes (%lu)\n" \
"\tpid_adds(%lu), pid_dels(%lu), pid_drops(%lu) pid_expiry(%lu)",
stats->disk_reads, stats->disk_writes, stats->ssd_reads, stats->ssd_writes,
stats->uncached_reads, stats->uncached_writes, stats->uncached_io_requeue,
stats->uncached_sequential_reads, stats->uncached_sequential_writes,
stats->pid_adds, stats->pid_dels, stats->pid_drops, stats->expiry);
if (dmc->size > 0) {
dirty_pct = ((u_int64_t)dmc->nr_dirty * 100) / dmc->size;
@@ -1295,13 +1308,15 @@ flashcache_dtr_stats_print(struct cache_c *dmc)
DMINFO("conf:\n" \
"\tvirt dev (%s), ssd dev (%s), disk dev (%s) cache mode(%s)\n" \
"\tcapacity(%luM), associativity(%u), data block size(%uK) metadata block size(%ub)\n" \
"\tskip sequential thresh(%uK)\n" \
"\ttotal blocks(%lu), cached blocks(%lu), cache percent(%d)\n" \
"\tdirty blocks(%d), dirty percent(%d)\n",
dmc->dm_vdevname, dmc->cache_devname, dmc->disk_devname,
cache_mode,
dmc->size*dmc->block_size>>11, dmc->assoc,
dmc->block_size>>(10-SECTOR_SHIFT),
dmc->md_block_size * 512,
dmc->sysctl_skip_seq_thresh_kb,
dmc->size, dmc->cached_blocks,
(int)cache_pct, dmc->nr_dirty, (int)dirty_pct);
DMINFO("\tnr_queued(%lu)\n", dmc->pending_jobs_count);
@@ -1451,9 +1466,11 @@ flashcache_status_info(struct cache_c *dmc, status_type_t type,
/* All modes */
DMEMIT("\tdisk reads(%lu), disk writes(%lu) ssd reads(%lu) ssd writes(%lu)\n" \
"\tuncached reads(%lu), uncached writes(%lu), uncached IO requeue(%lu)\n" \
"\tuncached sequential reads (%lu), uncached sequential writes (%lu)\n" \
"\tpid_adds(%lu), pid_dels(%lu), pid_drops(%lu) pid_expiry(%lu)",
stats->disk_reads, stats->disk_writes, stats->ssd_reads, stats->ssd_writes,
stats->uncached_reads, stats->uncached_writes, stats->uncached_io_requeue,
stats->uncached_sequential_reads, stats->uncached_sequential_writes,
stats->pid_adds, stats->pid_dels, stats->pid_drops, stats->expiry);
if (dmc->sysctl_io_latency_hist) {
int i;
@@ -1503,6 +1520,8 @@ flashcache_status_table(struct cache_c *dmc, status_type_t type,
dmc->size*dmc->block_size>>11, dmc->assoc,
dmc->block_size>>(10-SECTOR_SHIFT));
}
DMEMIT("\tskip sequential thresh(%uK)\n",
dmc->sysctl_skip_seq_thresh_kb);
DMEMIT("\ttotal blocks(%lu), cached blocks(%lu), cache percent(%d)\n",
dmc->size, dmc->cached_blocks,
(int)cache_pct);
