Skip to content
This repository
Browse code

Incrementally flush RDB on disk while loading it from a master.

This fixes issue #539.

Basically, if there is enough free memory, the OS may buffer the RDB file
that the slave receives from the master and writes to disk. The file may
then actually be flushed to disk all at once by the operating system when it
gets closed by Redis, causing the close system call to block for a long time.

This patch is a modified version of one provided by yoav-steinberg of
@garantiadata (the original version was posted in the issue #539
comments), and tries to flush the OS buffers incrementally (every 8 MB
of loaded data).
  • Loading branch information...
commit 13732168a50d86a31a7bf01dad8038e316120afb 1 parent 06bd3b9
Salvatore Sanfilippo authored
8 src/config.h
@@ -52,6 +52,14 @@
52 52 #define aof_fsync fsync
53 53 #endif
54 54
  55 +/* Define rdb_fsync_range to sync_file_range() on Linux, otherwise we use
  56 + * the plain fsync() call. */
  57 +#ifdef __linux__
  58 +#define rdb_fsync_range(fd,off,size) sync_file_range(fd,off,size,SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE)
  59 +#else
  60 +#define rdb_fsync_range(fd,off,size) fsync(fd)
  61 +#endif
  62 +
55 63 /* Byte ordering detection */
56 64 #include <sys/types.h> /* This will likely define BYTE_ORDER */
57 65
4 src/fmacros.h
@@ -3,6 +3,10 @@
3 3
4 4 #define _BSD_SOURCE
5 5
  6 +#if defined(__linux__)
  7 +#define _GNU_SOURCE
  8 +#endif
  9 +
6 10 #if defined(__linux__) || defined(__OpenBSD__)
7 11 #define _XOPEN_SOURCE 700
8 12 #else
5 src/redis.c
@@ -2032,9 +2032,10 @@ sds genRedisInfoString(char *section) {
2032 2032
2033 2033 if (server.repl_state == REDIS_REPL_TRANSFER) {
2034 2034 info = sdscatprintf(info,
2035   - "master_sync_left_bytes:%ld\r\n"
  2035 + "master_sync_left_bytes:%lld\r\n"
2036 2036 "master_sync_last_io_seconds_ago:%d\r\n"
2037   - ,(long)server.repl_transfer_left,
  2037 + , (long long)
  2038 + (server.repl_transfer_size - server.repl_transfer_read),
2038 2039 (int)(server.unixtime-server.repl_transfer_lastio)
2039 2040 );
2040 2041 }
4 src/redis.h
@@ -549,7 +549,9 @@ struct redisServer {
549 549 redisClient *master; /* Client that is master for this slave */
550 550 int repl_syncio_timeout; /* Timeout for synchronous I/O calls */
551 551 int repl_state; /* Replication status if the instance is a slave */
552   - off_t repl_transfer_left; /* Bytes left reading .rdb */
  552 + off_t repl_transfer_size; /* Size of RDB to read from master during sync. */
  553 + off_t repl_transfer_read; /* Amount of RDB read from master during sync. */
  554 + off_t repl_transfer_last_fsync_off; /* Offset when we fsync-ed last time. */
553 555 int repl_transfer_s; /* Slave -> Master SYNC socket */
554 556 int repl_transfer_fd; /* Slave -> Master SYNC temp file descriptor */
555 557 char *repl_transfer_tmpfile; /* Slave-> master SYNC temp file name */
36 src/replication.c
@@ -311,16 +311,18 @@ void replicationAbortSyncTransfer(void) {
311 311 }
312 312
313 313 /* Asynchronously read the SYNC payload we receive from a master */
  314 +#define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024*1024*8) /* 8 MB */
314 315 void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
315 316 char buf[4096];
316 317 ssize_t nread, readlen;
  318 + off_t left;
317 319 REDIS_NOTUSED(el);
318 320 REDIS_NOTUSED(privdata);
319 321 REDIS_NOTUSED(mask);
320 322
321   - /* If repl_transfer_left == -1 we still have to read the bulk length
  323 + /* If repl_transfer_size == -1 we still have to read the bulk length
322 324 * from the master reply. */
323   - if (server.repl_transfer_left == -1) {
  325 + if (server.repl_transfer_size == -1) {
324 326 if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout*1000) == -1) {
325 327 redisLog(REDIS_WARNING,
326 328 "I/O error reading bulk count from MASTER: %s",
@@ -343,16 +345,16 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
343 345 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
344 346 goto error;
345 347 }
346   - server.repl_transfer_left = strtol(buf+1,NULL,10);
  348 + server.repl_transfer_size = strtol(buf+1,NULL,10);
347 349 redisLog(REDIS_NOTICE,
348 350 "MASTER <-> SLAVE sync: receiving %ld bytes from master",
349   - server.repl_transfer_left);
  351 + server.repl_transfer_size);
350 352 return;
351 353 }
352 354
353 355 /* Read bulk data */
354   - readlen = (server.repl_transfer_left < (signed)sizeof(buf)) ?
355   - server.repl_transfer_left : (signed)sizeof(buf);
  356 + left = server.repl_transfer_size - server.repl_transfer_read;
  357 + readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf);
356 358 nread = read(fd,buf,readlen);
357 359 if (nread <= 0) {
358 360 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
@@ -365,9 +367,23 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
365 367 redisLog(REDIS_WARNING,"Write error or short write writing to the DB dump file needed for MASTER <-> SLAVE synchronization: %s", strerror(errno));
366 368 goto error;
367 369 }
368   - server.repl_transfer_left -= nread;
  370 + server.repl_transfer_read += nread;
  371 +
  372 + /* Sync data on disk from time to time, otherwise at the end of the transfer
  373 + * we may suffer a big delay as the memory buffers are copied into the
  374 + * actual disk. */
  375 + if (server.repl_transfer_read >=
  376 + server.repl_transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC)
  377 + {
  378 + off_t sync_size = server.repl_transfer_read -
  379 + server.repl_transfer_last_fsync_off;
  380 + rdb_fsync_range(server.repl_transfer_fd,
  381 + server.repl_transfer_last_fsync_off, sync_size);
  382 + server.repl_transfer_last_fsync_off += sync_size;
  383 + }
  384 +
369 385 /* Check if the transfer is now complete */
370   - if (server.repl_transfer_left == 0) {
  386 + if (server.repl_transfer_read == server.repl_transfer_size) {
371 387 if (rename(server.repl_transfer_tmpfile,server.rdb_filename) == -1) {
372 388 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
373 389 replicationAbortSyncTransfer();
@@ -538,7 +554,9 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
538 554 }
539 555
540 556 server.repl_state = REDIS_REPL_TRANSFER;
541   - server.repl_transfer_left = -1;
  557 + server.repl_transfer_size = -1;
  558 + server.repl_transfer_read = 0;
  559 + server.repl_transfer_last_fsync_off = 0;
542 560 server.repl_transfer_fd = dfd;
543 561 server.repl_transfer_lastio = server.unixtime;
544 562 server.repl_transfer_tmpfile = zstrdup(tmpfile);

0 comments on commit 1373216

Please sign in to comment.
Something went wrong with that request. Please try again.