diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d2b2e65a..9235643640981f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -244,8 +244,10 @@ struct btrfs_super_block {
 	__le64 cache_generation;
 	__le64 uuid_tree_generation;
 
+	/* r5log journal tail (where recovery starts) */
+	__le64 journal_tail;
 	/* future expansion */
-	__le64 reserved[30];
+	__le64 reserved[29];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
@@ -697,6 +699,7 @@ struct btrfs_stripe_hash_table {
 void btrfs_init_async_reclaim_work(struct work_struct *work);
 
 /* fs_info */
+struct btrfs_r5l_log;
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
@@ -1114,6 +1117,9 @@ struct btrfs_fs_info {
 	u32 nodesize;
 	u32 sectorsize;
 	u32 stripesize;
+
+	/* raid56 log */
+	struct btrfs_r5l_log *r5log;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -2287,6 +2293,8 @@ BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
 			 log_root_transid, 64);
 BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
 			 log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_journal_tail, struct btrfs_super_block,
+			 journal_tail, 64);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
 			 total_bytes, 64);
 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
@@ -2932,6 +2940,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
 
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+	/* kfree(NULL) is a no-op, no NULL check needed */
+	kfree(fs_info->r5log);
 	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
 	kfree(fs_info->extent_root);
@@ -3278,6 +3288,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			unsigned long new_flags);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
+/* raid56.c */
+void btrfs_r5l_write_journal_tail(struct btrfs_fs_info *fs_info);
+
 static inline __printf(2, 3)
 void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8685d67185d01b..3fbd34799d0f37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2987,6 +2987,22 @@ int open_ctree(struct super_block *sb,
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 
+	if (fs_info->r5log) {
+		u64 cp = btrfs_super_journal_tail(fs_info->super_copy);
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("%s: get journal_tail %llu\n", __func__, cp);
+#endif
+		/* if the log is not replayed, data and parity on
+		 * disk are still consistent, so we can safely move on.
+		 *
+		 * As for fsync: fsync guarantees that the data itself
+		 * is flushed onto disk and only metadata goes through
+		 * the write-ahead log, so fsync'd data will never end
+		 * up being replayed by the raid56 log.
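+		 *
+		 * The tail itself just round-trips through the super
+		 * block via the setget helpers added in ctree.h, along
+		 * the lines of:
+		 *
+		 *   commit: btrfs_set_super_journal_tail(sb, next_checkpoint);
+		 *   mount:  cp = btrfs_super_journal_tail(sb);
+		 *           btrfs_r5l_load_log(fs_info, NULL, cp);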
+ */ + btrfs_r5l_load_log(fs_info, NULL, cp); + } + ret = btrfs_recover_balance(fs_info); if (ret) { btrfs_err(fs_info, "failed to recover balance: %d", ret); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e176375f374f91..3d1ef4df4a4fde 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2653,6 +2653,50 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) return ret; } +/* identical to btrfs_ioctl_add_dev, but this is with flags */ +static long btrfs_ioctl_add_dev_v2(struct btrfs_fs_info *fs_info, void __user *arg) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + mutex_lock(&fs_info->volume_mutex); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + if (vol_args->flags & BTRFS_DEVICE_RAID56_LOG && + fs_info->r5log) { + ret = -EEXIST; + btrfs_info(fs_info, "r5log: attempting to add another log device!"); + goto out_free; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(fs_info, vol_args->name, vol_args->flags); + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_RAID56_LOG) { + ASSERT(fs_info->r5log); + btrfs_info(fs_info, "disk added %s as raid56 log", vol_args->name); + } else { + btrfs_info(fs_info, "disk added %s", vol_args->name); + } + } +out_free: + kfree(vol_args); +out: + mutex_unlock(&fs_info->volume_mutex); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + return ret; +} + static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; @@ -2672,7 +2716,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_init_new_device(fs_info, vol_args->name); + ret = btrfs_init_new_device(fs_info, vol_args->name, 0); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); @@ -5539,6 +5583,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(fs_info, argp); + case BTRFS_IOC_ADD_DEV_V2: + return btrfs_ioctl_add_dev_v2(fs_info, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_RM_DEV_V2: diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8ea0eb76325e9..ceca41537dddc3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -43,6 +43,7 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" +#include "hash.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -177,6 +178,33 @@ struct btrfs_raid_bio { unsigned long *dbitmap; }; +/* raid56 log */ +struct btrfs_r5l_log { + /* protect this struct and log io */ + struct mutex io_mutex; + + spinlock_t io_list_lock; + struct list_head io_list; + + /* r5log device */ + struct btrfs_device *dev; + + struct btrfs_fs_info *fs_info; + + /* allocation range for log entries */ + u64 data_offset; + u64 device_size; + + u64 next_checkpoint; + + u64 last_checkpoint; + u64 last_cp_seq; + u64 seq; + u64 log_start; + u32 uuid_csum; + struct btrfs_r5l_io_unit *current_io; +}; + static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct btrfs_work *work); @@ -1034,130 +1062,1193 @@ static int alloc_rbio_pages(struct 
btrfs_raid_bio *rbio) return -ENOMEM; rbio->stripe_pages[i] = page; } - return 0; -} + return 0; +} + +/* only allocate pages for p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + + for (; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + } + return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +static int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) +{ + struct bio *last = bio_list->tail; + u64 last_end = 0; + int ret; + struct bio *bio; + struct btrfs_bio_stripe *stripe; + u64 disk_start; + + stripe = &rbio->bbio->stripes[stripe_nr]; + disk_start = stripe->physical + (page_index << PAGE_SHIFT); + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) + return fail_rbio_index(rbio, stripe_nr); + + /* see if we can add this page onto our existing bio */ + if (last) { + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && stripe->dev->bdev && + !last->bi_error && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, page, PAGE_SIZE, 0); + if (ret == PAGE_SIZE) + return 0; + } + } + + /* put a new bio on the list */ + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + if (!bio) + return -ENOMEM; + + bio->bi_iter.bi_size = 0; + bio->bi_bdev = stripe->dev->bdev; + bio->bi_iter.bi_sector = disk_start >> 9; + + bio_add_page(bio, page, PAGE_SIZE, 0); + bio_list_add(bio_list, bio); + return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk. This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction. The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ + if (rbio->faila >= 0 || rbio->failb >= 0) { + BUG_ON(rbio->faila == rbio->real_stripes - 1); + __raid56_parity_recover(rbio); + } else { + finish_rmw(rbio); + } +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + struct bio_vec *bvec; + u64 start; + unsigned long stripe_offset; + unsigned long page_index; + int i; + + spin_lock_irq(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) { + start = (u64)bio->bi_iter.bi_sector << 9; + stripe_offset = start - rbio->bbio->raid_map[0]; + page_index = stripe_offset >> PAGE_SHIFT; + + bio_for_each_segment_all(bvec, bio, i) + rbio->bio_pages[page_index + i] = bvec->bv_page; + } + spin_unlock_irq(&rbio->bio_list_lock); +} + +/* r5log */ +/* XXX: this allocation may be done earlier, eg. 
when allocating rbio */ +static struct btrfs_r5l_io_unit *btrfs_r5l_alloc_io_unit(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io; + gfp_t gfp = GFP_NOFS; + + io = kzalloc(sizeof(*io), gfp); + ASSERT(io); + io->log = log; + /* need to use kmap. */ + io->meta_page = alloc_page(gfp | __GFP_HIGHMEM | __GFP_ZERO); + ASSERT(io->meta_page); + + return io; +} + +static void btrfs_r5l_free_io_unit(struct btrfs_r5l_log *log, struct btrfs_r5l_io_unit *io) +{ + __free_page(io->meta_page); + ASSERT(list_empty(&io->list)); + kfree(io); +} + +static u64 btrfs_r5l_ring_add(struct btrfs_r5l_log *log, u64 start, u64 inc) +{ + start += inc; + if (start >= log->device_size) + start = start - log->device_size; + return start; +} + +static void btrfs_r5l_reserve_log_entry(struct btrfs_r5l_log *log, struct btrfs_r5l_io_unit *io) +{ + log->log_start = btrfs_r5l_ring_add(log, log->log_start, PAGE_SIZE); + io->log_end = log->log_start; + + if (log->log_start == 0) + io->need_split_bio = true; +} + +/* the IO order is maintained in log->io_list. */ +static void btrfs_r5l_finish_io(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io, *next; + + spin_lock(&log->io_list_lock); + list_for_each_entry_safe(io, next, &log->io_list, list) { + if (io->status != BTRFS_R5L_STRIPE_END) + break; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("current log->next_checkpoint %llu (will be %llu after writing to RAID\n", log->next_checkpoint, io->log_start); +#endif + + list_del_init(&io->list); + log->next_checkpoint = io->log_start; + btrfs_r5l_free_io_unit(log, io); + } + spin_unlock(&log->io_list_lock); +} + +static void btrfs_write_rbio(struct btrfs_raid_bio *rbio); + +static void btrfs_r5l_log_endio(struct bio *bio) +{ + struct btrfs_r5l_io_unit *io = bio->bi_private; + struct btrfs_r5l_log *log = io->log; + + bio_put(bio); + + /* move data to RAID. */ + btrfs_write_rbio(io->rbio); + + io->status = BTRFS_R5L_STRIPE_END; + /* After stripe data has been flushed into raid, set ->next_checkpoint. */ + btrfs_r5l_finish_io(log); +} + +static struct bio *btrfs_r5l_bio_alloc(struct btrfs_r5l_log *log) +{ + /* this allocation will not fail. */ + struct bio *bio = btrfs_io_bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + + /* We need to make sure data/parity are settled down on the log disk. 
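+	 *
+	 * REQ_PREFLUSH drains the device cache of everything that
+	 * completed before this bio, and REQ_FUA forces this bio's own
+	 * payload to stable media, so
+	 *
+	 *   bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA;
+	 *
+	 * guarantees that by the time btrfs_r5l_log_endio() runs the
+	 * log entry is durable and the rbio can safely be written to
+	 * the RAID disks.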
*/ + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; + bio->bi_bdev = log->dev->bdev; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("log->data_offset 0x%llx log->log_start 0x%llx\n", log->data_offset, log->log_start); +#endif + bio->bi_iter.bi_sector = (log->data_offset + log->log_start) >> 9; + + return bio; +} + +static struct btrfs_r5l_io_unit *btrfs_r5l_new_meta(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io; + struct btrfs_r5l_meta_block *block; + + io = btrfs_r5l_alloc_io_unit(log); + ASSERT(io); + + block = kmap(io->meta_page); + clear_page(block); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s pos %llu seq %llu\n", __func__, log->log_start, log->seq); +#endif + + block->magic = cpu_to_le32(BTRFS_R5LOG_MAGIC); + block->seq = cpu_to_le64(log->seq); + block->position = cpu_to_le64(log->log_start); + + kunmap(io->meta_page); + + io->log_start = log->log_start; + io->meta_offset = sizeof(struct btrfs_r5l_meta_block); + io->seq = log->seq++; + + io->need_split_bio = false; + io->split_bio = NULL; + io->current_bio = btrfs_r5l_bio_alloc(log); + io->current_bio->bi_end_io = btrfs_r5l_log_endio; + io->current_bio->bi_private = io; + + bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + + btrfs_r5l_reserve_log_entry(log, io); + + INIT_LIST_HEAD(&io->list); + spin_lock(&log->io_list_lock); + list_add_tail(&io->list, &log->io_list); + spin_unlock(&log->io_list_lock); + return io; +} + +static int btrfs_r5l_get_meta(struct btrfs_r5l_log *log, struct btrfs_raid_bio *rbio, int payload_size) +{ + /* always allocate new meta block. */ + log->current_io = btrfs_r5l_new_meta(log); + ASSERT(log->current_io); + log->current_io->rbio = rbio; + return 0; +} + +static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u64 location, u64 devid, u32 csum) +{ + struct btrfs_r5l_io_unit *io = log->current_io; + struct btrfs_r5l_payload *payload; + void *ptr; + + ptr = kmap(io->meta_page); + payload = ptr + io->meta_offset; + payload->type = cpu_to_le16(type); + payload->flags = cpu_to_le16(0); + + if (type == R5LOG_PAYLOAD_DATA) + payload->size = cpu_to_le32(1); + else if (type == R5LOG_PAYLOAD_PARITY) + payload->size = cpu_to_le32(16); /* stripe_len / PAGE_SIZE */ + payload->devid = cpu_to_le64(devid); + payload->location = cpu_to_le64(location); + payload->csum = cpu_to_le32(csum); + kunmap(io->meta_page); + + io->meta_offset += sizeof(*payload); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("io->meta_offset %d\n", io->meta_offset); +#endif +} + +static void btrfs_r5l_append_payload_page(struct btrfs_r5l_log *log, struct page *page) +{ + struct btrfs_r5l_io_unit *io = log->current_io; + + if (io->need_split_bio) { + /* We're submitting too much data at a time!! 
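+		 *
+		 * btrfs_r5l_reserve_log_entry() set need_split_bio when
+		 * log_start wrapped back to 0, and a single bio must not
+		 * span the end of the ring:
+		 *
+		 *   btrfs_r5l_ring_add(log, log->device_size - PAGE_SIZE,
+		 *			PAGE_SIZE) == 0
+		 *
+		 * so park the current bio in split_bio, start a fresh one
+		 * at offset 0 and tie the two together with bio_chain();
+		 * they then complete as one unit in btrfs_r5l_log_endio().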
+		 */
+		BUG_ON(io->split_bio);
+		io->split_bio = io->current_bio;
+		io->current_bio = btrfs_r5l_bio_alloc(log);
+		bio_chain(io->current_bio, io->split_bio);
+		io->need_split_bio = false;
+	}
+
+	ASSERT(bio_add_page(io->current_bio, page, PAGE_SIZE, 0));
+
+	btrfs_r5l_reserve_log_entry(log, io);
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("log->log_start %llu io->current_bio bi_iter (bi_sector 0x%llx bi_size %d)\n", log->log_start, io->current_bio->bi_iter.bi_sector << 9, io->current_bio->bi_iter.bi_size);
+#endif
+}
+
+static u64 btrfs_compute_location(struct btrfs_raid_bio *rbio, int stripe_nr, unsigned long page_index)
+{
+	struct btrfs_bio_stripe *stripe;
+
+	stripe = &rbio->bbio->stripes[stripe_nr];
+	return stripe->physical + (page_index << PAGE_SHIFT);
+}
+
+static u64 btrfs_compute_devid(struct btrfs_raid_bio *rbio, int stripe_nr)
+{
+	struct btrfs_bio_stripe *stripe;
+
+	stripe = &rbio->bbio->stripes[stripe_nr];
+	ASSERT(stripe->dev);
+	return stripe->dev->devid;
+}
+
+static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int parity_pages, struct btrfs_raid_bio *rbio)
+{
+	int meta_size;
+	int stripe, pagenr;
+	struct page *page;
+	char *kaddr;
+	u32 csum;
+	u64 location;
+	u64 devid;
+
+	/*
+	 * parity pages are contiguous on disk, thus only one
+	 * payload is required.
+	 */
+	meta_size = sizeof(struct btrfs_r5l_payload) * data_pages +
+		    sizeof(struct btrfs_r5l_payload) * (rbio->real_stripes - rbio->nr_data);
+
+	/* add meta block */
+	btrfs_r5l_get_meta(log, rbio, meta_size);
+
+	/* add data blocks which need to be written */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (!page)
+				continue;
+			/* the page is from bio, queued for log bio */
+			location = btrfs_compute_location(rbio, stripe, pagenr);
+			devid = btrfs_compute_devid(rbio, stripe);
+#ifdef BTRFS_DEBUG_R5LOG
+			trace_printk("data: stripe %d pagenr %d location 0x%llx devid %llu\n", stripe, pagenr, location, devid);
+#endif
+			kaddr = kmap(page);
+			csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE);
+			kunmap(page);
+
+			btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, location, devid, csum);
+			btrfs_r5l_append_payload_page(log, page);
+		}
+	}
+
+	/* add the whole parity blocks */
+	for (; stripe < rbio->real_stripes; stripe++) {
+		location = btrfs_compute_location(rbio, stripe, 0);
+		devid = btrfs_compute_devid(rbio, stripe);
+
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("parity: stripe %d location 0x%llx devid %llu\n", stripe, location, devid);
+#endif
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+
+			kaddr = kmap(page);
+			if (pagenr == 0)
+				csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE);
+			else
+				csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE);
+			kunmap(page);
+
+			btrfs_r5l_append_payload_page(log, page);
+		}
+
+		btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, location, devid, csum);
+	}
+}
+
+static void btrfs_r5l_submit_current_io(struct btrfs_r5l_log *log)
+{
+	struct btrfs_r5l_io_unit *io = log->current_io;
+	struct btrfs_r5l_meta_block *mb;
+	u32 csum;
+
+	if (!io)
+		return;
+
+	mb = kmap(io->meta_page);
+	mb->meta_size = cpu_to_le32(io->meta_offset);
+	ASSERT(mb->csum == 0);
+	csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE);
+	mb->csum = cpu_to_le32(csum);
+	kunmap(io->meta_page);
+
+	log->current_io = NULL;
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("io->current
bio bi_sector 0x%llx devid %llu\n", io->current_bio->bi_iter.bi_sector << 9, log->dev->devid); +#endif + /* + * make sure that r5l_log_endio does not run in interrupt + * context. + * + * if io->split_bio is available, then current_bio is just a + * chained bio. + */ + if (io->split_bio) + btrfs_bio_wq_end_io(log->fs_info, io->split_bio, BTRFS_WQ_ENDIO_RAID56); + else + btrfs_bio_wq_end_io(log->fs_info, io->current_bio, BTRFS_WQ_ENDIO_RAID56); + + submit_bio(io->current_bio); + if (io->split_bio) + submit_bio(io->split_bio); +} + +static u64 btrfs_r5l_ring_distance(struct btrfs_r5l_log *log, u64 start, u64 end) +{ + if (end >= start) + return end - start; + else + return end + (log->device_size) - start; +} + +static bool btrfs_r5l_has_free_space(struct btrfs_r5l_log *log, u64 size) +{ + u64 used_size; + used_size = btrfs_r5l_ring_distance(log, log->last_checkpoint, + log->log_start); + return log->device_size > (used_size + size); +} + +static int btrfs_r5l_sync_page_io(struct btrfs_r5l_log *log, + struct btrfs_device *dev, sector_t sector, + int size, struct page *page, int op) +{ + struct bio *bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + int ret; + + bio->bi_bdev = dev->bdev; + bio->bi_opf = op; + if (dev == log->dev) + bio->bi_iter.bi_sector = (log->data_offset >> 9) + sector; + else + bio->bi_iter.bi_sector = sector; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: op %d bi_sector 0x%llx\n", __func__, op, (bio->bi_iter.bi_sector << 9)); +#endif + + bio_add_page(bio, page, size, 0); + submit_bio_wait(bio); + ret = !bio->bi_error; + bio_put(bio); + return ret; +} + +static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, u64 seq) +{ + struct page *page; + struct btrfs_r5l_meta_block *mb; + u32 csum; + int ret = 0; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: pos %llu seq %llu\n", __func__, pos, seq); +#endif + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO); + ASSERT(page); + + mb = kmap(page); + mb->magic = cpu_to_le32(BTRFS_R5LOG_MAGIC); + mb->meta_size = cpu_to_le32(sizeof(struct btrfs_r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); + + csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE); + mb->csum = cpu_to_le32(csum); + kunmap(page); + + if (!btrfs_r5l_sync_page_io(log, log->dev, (pos >> 9), PAGE_SIZE, page, REQ_OP_WRITE | REQ_FUA)) { + ret = -EIO; + } + + __free_page(page); + return ret; +} + +#define BTRFS_R5L_RECOVER_IO_POOL_SIZE BIO_MAX_PAGES +struct btrfs_r5l_recover_ctx { + u64 pos; + u64 seq; + u64 total_size; + struct page *meta_page; + struct page *io_page; + + struct page *ra_pages[BTRFS_R5L_RECOVER_IO_POOL_SIZE]; + struct bio *ra_bio; + int total; + int valid; + u64 start_offset; + + struct btrfs_r5l_log *log; +}; + +static int btrfs_r5l_recover_read_ra(struct btrfs_r5l_recover_ctx *ctx, u64 offset) +{ + bio_reset(ctx->ra_bio); + ctx->ra_bio->bi_bdev = ctx->log->dev->bdev; + ctx->ra_bio->bi_opf = REQ_OP_READ; + ctx->ra_bio->bi_iter.bi_sector = (ctx->log->data_offset + offset) >> 9; + + ctx->valid = 0; + ctx->start_offset = offset; + + while (ctx->valid < ctx->total) { + bio_add_page(ctx->ra_bio, ctx->ra_pages[ctx->valid++], PAGE_SIZE, 0); + + offset = btrfs_r5l_ring_add(ctx->log, offset, PAGE_SIZE); + if (offset == 0) + break; + } + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("to read %d pages starting from 0x%llx\n", ctx->valid, ctx->log->data_offset + ctx->start_offset); +#endif + return submit_bio_wait(ctx->ra_bio); +} + +static int btrfs_r5l_recover_read_page(struct btrfs_r5l_recover_ctx *ctx, struct 
page *page, u64 offset) +{ + struct page *tmp; + int index; + char *src; + char *dst; + int ret; + + if (offset < ctx->start_offset || offset >= (ctx->start_offset + ctx->valid * PAGE_SIZE)) { + ret = btrfs_r5l_recover_read_ra(ctx, offset); + if (ret) + return ret; + } + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("offset 0x%llx start->offset 0x%llx ctx->valid %d\n", offset, ctx->start_offset, ctx->valid); +#endif + + ASSERT(IS_ALIGNED(ctx->start_offset, PAGE_SIZE)); + ASSERT(IS_ALIGNED(offset, PAGE_SIZE)); + + index = (offset - ctx->start_offset) >> PAGE_SHIFT; + ASSERT(index < ctx->valid); + + tmp = ctx->ra_pages[index]; + src = kmap(tmp); + dst = kmap(page); + memcpy(dst, src, PAGE_SIZE); + kunmap(page); + kunmap(tmp); + return 0; +} + +static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) +{ + struct btrfs_r5l_meta_block *mb; + u32 csum; + u32 expected; + int ret = 0; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->meta_page, ctx->pos); + if (ret) + return ret; + + mb = kmap(ctx->meta_page); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("ctx->pos %llu ctx->seq %llu pos %llu seq %llu\n", ctx->pos, ctx->seq, le64_to_cpu(mb->position), le64_to_cpu(mb->seq)); +#endif + + if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC || + le64_to_cpu(mb->position) != ctx->pos || + le64_to_cpu(mb->seq) != ctx->seq) { +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: mismatch magic %llu default %llu\n", __func__, le32_to_cpu(mb->magic), BTRFS_R5LOG_MAGIC); +#endif + ret = -EINVAL; + goto out; + } + + expected = le32_to_cpu(mb->csum); + /* + * when we calculate mb->csum, it's zero, so we need to zero + * it back. + */ + mb->csum = 0; + csum = btrfs_crc32c(ctx->log->uuid_csum, mb, PAGE_SIZE); + if (csum != expected) { +#ifdef BTRFS_DEBUG_R5LOG + pr_info("%s: mismatch checksum for r5l meta block\n", __func__); +#endif + ret = -EINVAL; + goto out; + } + + ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE); + /* meta_block */ + ctx->total_size = PAGE_SIZE; + +out: + kunmap(ctx->meta_page); + + return ret; +} + +static int btrfs_r5l_recover_verify_checksum(struct btrfs_r5l_recover_ctx *ctx) +{ + u64 offset; + u32 meta_size; + u64 csum_io_offset; + u64 read_pos; + char *kaddr; + u32 csum; + int type; + struct btrfs_r5l_meta_block *mb; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + struct btrfs_device *dev; + int ret = 0; + + mb = kmap(ctx->meta_page); + meta_size = le32_to_cpu(mb->meta_size); + csum_io_offset = PAGE_SIZE; + + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; + + /* check if there is any invalid device, if so, skip writing this mb. 
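+		 *
+		 * The payload walk below mirrors the layout that
+		 * btrfs_r5l_log_stripe() wrote: the meta page first, then
+		 * the described pages in payload order, e.g. for a RAID5
+		 * stripe with two dirty data pages:
+		 *
+		 *   ctx->pos + 0 pages  meta block
+		 *   ctx->pos + 1 page   data  (payload 0, size 1)
+		 *   ctx->pos + 2 pages  data  (payload 1, size 1)
+		 *   ctx->pos + 3 pages  parity run (payload 2, size 16)
+		 *
+		 * which is why csum_io_offset advances one page per data
+		 * payload and payload->size pages for parity.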
*/ + dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL); + if (!dev || dev->missing) { + ret = -EINVAL; + goto out; + } + + type = le16_to_cpu(payload->type); + if (type == R5LOG_PAYLOAD_DATA) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ASSERT(le32_to_cpu(payload->size) == 1); + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } else if (type == R5LOG_PAYLOAD_PARITY) { + int i; + for (i = 0; i < le32_to_cpu(payload->size); i++) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + if (i == 0) + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + else + csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } + } else { + ASSERT(0); + } + + if (csum != le32_to_cpu(payload->csum)) { + trace_printk("r5l data csum fails location 0x%llx devid %llu\n", le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); + ret = -EAGAIN; + goto out; + } + } +out: + kunmap(ctx->meta_page); + return ret; +} + +static int btrfs_r5l_recover_load_data(struct btrfs_r5l_recover_ctx *ctx) +{ + u64 offset; + struct btrfs_r5l_meta_block *mb; + u32 meta_size; + u64 io_offset; + u64 read_pos; + struct btrfs_device *dev; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + int ret = 0; + + /* if any checksum fails, skip writing this mb. */ + ret = btrfs_r5l_recover_verify_checksum(ctx); + if (ret) + return ret; + + mb = kmap(ctx->meta_page); + + io_offset = PAGE_SIZE; + offset = sizeof(struct btrfs_r5l_meta_block); + meta_size = le32_to_cpu(mb->meta_size); + + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; + + /* read data from log disk and write to payload->location */ +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("payload type %d flags %d size %d location 0x%llx devid %llu\n", le16_to_cpu(payload->type), le16_to_cpu(payload->flags), le32_to_cpu(payload->size), le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); +#endif + + /* liubo: how to handle the case where dev is suddenly off? 
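+		 *
+		 * Each payload is self-describing: (devid, location) name
+		 * the target disk and the physical byte offset this page
+		 * was meant to reach, so replay is a plain synchronous
+		 * rewrite:
+		 *
+		 *   dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+		 *   btrfs_r5l_sync_page_io(log, dev, location >> 9,
+		 *			    PAGE_SIZE, page, REQ_OP_WRITE);
+		 *
+		 * A device that vanished between the checksum pass and
+		 * here still trips the ASSERT below; the checksum pass
+		 * already skips the whole meta block when a referenced
+		 * device cannot be found.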
+		 */
+		dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL);
+		ASSERT(dev && !dev->missing);
+
+		if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) {
+			read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset);
+			io_offset += PAGE_SIZE;
+
+			ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos);
+			if (ret)
+				goto out;
+
+			if (!btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) {
+				ret = -EIO;
+				goto out;
+			}
+		} else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) {
+			int i;
+
+			ASSERT(offset + sizeof(struct btrfs_r5l_payload) == meta_size);
+
+			for (i = 0; i < le32_to_cpu(payload->size); i++) {
+				u64 parity_off = le64_to_cpu(payload->location) + i * PAGE_SIZE;
+				read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset);
+				io_offset += PAGE_SIZE;
+
+				ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos);
+				if (ret)
+					goto out;
+
+				if (!btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) {
+					ret = -EIO;
+					goto out;
+				}
+			}
+		} else {
+			ASSERT(0);
+		}
+	}
+
+	ctx->total_size += (io_offset - PAGE_SIZE);
+out:
+	kunmap(ctx->meta_page);
+	return ret;
+}
+
+static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_recover_ctx *ctx)
+{
+	int ret;
+
+	while (1) {
+		ret = btrfs_r5l_recover_load_meta(ctx);
+		if (ret)
+			break;
+
+		ret = btrfs_r5l_recover_load_data(ctx);
+		if (ret && ret != -EAGAIN)
+			break;
+
+		ctx->seq++;
+		ctx->pos = btrfs_r5l_ring_add(ctx->log, ctx->pos, ctx->total_size);
+	}
+
+	return 0;
+}
+
+static int btrfs_r5l_recover_allocate_ra(struct btrfs_r5l_recover_ctx *ctx)
+{
+	struct page *page;
+
+	ctx->ra_bio = btrfs_io_bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+
+	ctx->total = 0;
+	ctx->valid = 0;
+	while (ctx->total < BTRFS_R5L_RECOVER_IO_POOL_SIZE) {
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			break;
+
+		ctx->ra_pages[ctx->total++] = page;
+	}
+
+	if (ctx->total == 0) {
+		bio_put(ctx->ra_bio);
+		return -ENOMEM;
+	}
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("readahead: %d allocated pages\n", ctx->total);
+#endif
+	return 0;
+}
+
+static void btrfs_r5l_recover_free_ra(struct btrfs_r5l_recover_ctx *ctx)
+{
+	int i;
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("readahead: %d to free pages\n", ctx->total);
+#endif
+	for (i = 0; i < ctx->total; i++)
+		__free_page(ctx->ra_pages[i]);
+	bio_put(ctx->ra_bio);
+}
+
+static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp);
+
+static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log)
+{
+	struct btrfs_r5l_recover_ctx *ctx;
+	u64 pos;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+	ASSERT(ctx);
+
+	ctx->log = log;
+	ctx->pos = log->last_checkpoint;
+	ctx->seq = log->last_cp_seq;
+	ctx->meta_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	ASSERT(ctx->meta_page);
+	ctx->io_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	ASSERT(ctx->io_page);
+
+	ret = btrfs_r5l_recover_allocate_ra(ctx);
+	ASSERT(ret == 0);
+
+	btrfs_r5l_recover_flush_log(ctx);
+
+	pos = ctx->pos;
+	log->next_checkpoint = ctx->pos;
+	/*
+	 * bump the sequence well past anything the scan may have seen
+	 * so that the terminating empty block cannot be mistaken for a
+	 * stale log entry.
+	 */
+	ctx->seq += 10000;
+	btrfs_r5l_write_empty_meta_block(log, ctx->pos, ctx->seq++);
+	ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, PAGE_SIZE);
+
+	log->log_start = ctx->pos;
+	log->seq = ctx->seq;
+	/* last_checkpoint points to the empty block.
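+	 *
+	 * After recovery the ring therefore looks like:
+	 *
+	 *   last_checkpoint -> empty meta block (freshly written)
+	 *   log_start       -> first free page right behind it
+	 *
+	 * and btrfs_r5l_write_super() below persists that tail, so a
+	 * second mount will not replay the entries we just flushed to
+	 * the RAID disks.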
+	 */
+	log->last_checkpoint = pos;
+	btrfs_r5l_write_super(log->fs_info, pos);
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("%s: log_start %llu seq %llu\n", __func__, log->log_start, log->seq);
+#endif
+	__free_page(ctx->meta_page);
+	__free_page(ctx->io_page);
+	btrfs_r5l_recover_free_ra(ctx);
+	kfree(ctx);
+	return 0;
+}
+
+/* returns 0 on success, otherwise a negative error */
+int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, struct btrfs_r5l_log *r5log, u64 cp)
+{
+	struct btrfs_r5l_log *log;
+	struct page *page;
+	struct btrfs_r5l_meta_block *mb;
+	bool create_new = false;
+	int ret = 0;
+
+	/* exactly one of @r5log and fs_info->r5log may be set */
+	if (r5log)
+		ASSERT(fs_info->r5log == NULL);
+	if (fs_info->r5log)
+		ASSERT(r5log == NULL);
+
+	if (fs_info->r5log)
+		log = fs_info->r5log;
+	else
+		/*
+		 * this only happens when adding the raid56 log for
+		 * the first time.
+		 */
+		log = r5log;
+
+	ASSERT(log);
+
+	page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	ASSERT(page);
+
+	if (!btrfs_r5l_sync_page_io(log, log->dev, (cp >> 9), PAGE_SIZE, page,
+				    REQ_OP_READ)) {
+		__free_page(page);
+		return -EIO;
+	}
+
+	mb = kmap(page);
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("r5l: mb->pos %llu cp %llu mb->seq %llu\n", le64_to_cpu(mb->position), cp, le64_to_cpu(mb->seq));
+#endif
+
+	if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC) {
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("magic does not match: create new r5l\n");
+#endif
+		create_new = true;
+		goto create;
+	}
+
+	if (le64_to_cpu(mb->position) != cp) {
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("mb->position does not match: create new r5l\n");
+#endif
+		create_new = true;
+	}
+create:
+	if (create_new) {
+		/* initialize a new r5log */
+		log->last_cp_seq = prandom_u32();
+		cp = 0;
+
+		btrfs_r5l_write_empty_meta_block(log, cp, log->last_cp_seq);
+		btrfs_r5l_write_super(fs_info, cp);
+	} else {
+		log->last_cp_seq = le64_to_cpu(mb->seq);
+	}
+
+	log->last_checkpoint = cp;
+
+	kunmap(page);
+	__free_page(page);
+
+	if (create_new) {
+		log->log_start = btrfs_r5l_ring_add(log, cp, PAGE_SIZE);
+		log->seq = log->last_cp_seq + 1;
+		log->next_checkpoint = cp;
+	} else {
+		ret = btrfs_r5l_recover_log(log);
+	}
+
+	return ret;
+}
+
+/*
+ * writing super with log->next_checkpoint
+ *
+ * This is protected by log->io_mutex.
+ */
+static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp)
+{
+	int ret;
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("r5l writing super to reclaim space, cp %llu\n", cp);
+#endif
+
+	btrfs_set_super_journal_tail(fs_info->super_for_commit, cp);
+
+	/*
+	 * flush all disk caches so that all data prior to
+	 * %next_checkpoint lands on the raid disks (recovery will
+	 * start from %next_checkpoint).
+	 */
+	ret = write_all_supers(fs_info, 1);
+	ASSERT(ret == 0);
+}
+
+/* this is called by commit transaction and it's followed by writing super. */
+void btrfs_r5l_write_journal_tail(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->r5log) {
+		u64 cp = READ_ONCE(fs_info->r5log->next_checkpoint);
+
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("journal_tail %llu\n", cp);
+#endif
+		btrfs_set_super_journal_tail(fs_info->super_copy, cp);
+		WRITE_ONCE(fs_info->r5log->last_checkpoint, cp);
+	}
+}
+
+/*
+ * return 0 if data/parity have been written into the log; the rbio
+ * is then written to the RAID disks from the log endio.
+ *
+ * return 1 if the log is not available or has no space left.
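+ *
+ * The intended calling pattern, as wired up in finish_rmw(), is:
+ *
+ *	ret = btrfs_r5l_write_stripe(rbio);
+ *	if (ret == 0)
+ *		return;
+ *	btrfs_write_rbio(rbio);
+ *
+ * i.e. on 0 the RAID write is deferred to the log endio, on 1 the
+ * caller falls back to writing the RAID disks directly.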
+ */ +static int btrfs_r5l_write_stripe(struct btrfs_raid_bio *rbio) +{ + int stripe, pagenr; + int data_pages = 0, parity_pages = 0; + u64 reserve; + int meta_size; + bool do_submit = false; + struct btrfs_r5l_log *log = rbio->fs_info->r5log; + + if (!log) { +#ifdef BTRFS_DEBUG_R5LOG + btrfs_info(rbio->fs_info, "r5log is not available\n"); +#endif + return 1; + } + + /* get data_pages and parity_pages */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + data_pages++; + } else { + parity_pages++; + } + } + } +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("data_pages %d parity_pages %d\n", data_pages, parity_pages); + ASSERT(parity_pages == 16 * (rbio->real_stripes - rbio->nr_data)); +#endif -/* only allocate pages for p/q stripes */ -static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) -{ - int i; - struct page *page; + /* + * parity pages are contiguous on disk, thus only one + * payload is required. + */ + meta_size = sizeof(struct btrfs_r5l_payload) * data_pages + + sizeof(struct btrfs_r5l_payload) * (rbio->real_stripes - rbio->nr_data); + + /* doesn't support large raid array */ + if (meta_size + sizeof(struct btrfs_r5l_meta_block) > PAGE_SIZE) { +#ifdef BTRFS_DEBUG_R5LOG + btrfs_info(rbio->fs_info, "meta_size (%d) is too big\n", meta_size); +#endif + return 1; + } - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + mutex_lock(&log->io_mutex); + /* meta + data/parity */ + reserve = (1 + data_pages + parity_pages) << PAGE_SHIFT; + if (btrfs_r5l_has_free_space(log, reserve)) { + btrfs_r5l_log_stripe(log, data_pages, parity_pages, rbio); + do_submit = true; + } else { +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5log: no space log->last_checkpoint %llu log->log_start %llu log->next_checkpoint %llu\n", log->last_checkpoint, log->log_start, log->next_checkpoint); +#endif - for (; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; + /* + * reclaim works via writing to log device with the + * new next_checkpoint. + */ + btrfs_r5l_write_super(rbio->fs_info, log->next_checkpoint); + + log->last_checkpoint = log->next_checkpoint; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5log: after reclaim(write super) log->last_checkpoint %llu log->log_start %llu log->next_checkpoint %llu\n", log->last_checkpoint, log->log_start, log->next_checkpoint); +#endif + /* now we should have enough space. */ + ASSERT(btrfs_r5l_has_free_space(log, reserve)); + btrfs_r5l_log_stripe(log, data_pages, parity_pages, rbio); + do_submit = true; } - return 0; + + if (do_submit) { + btrfs_r5l_submit_current_io(log); + } + mutex_unlock(&log->io_mutex); + + return (do_submit ? 0 : 1); } -/* - * add a single page from a specific stripe into our list of bios for IO - * this will try to merge into existing bios if possible, and returns - * zero if all went well. 
- */ -static int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) +static void btrfs_write_rbio(struct btrfs_raid_bio *rbio) { - struct bio *last = bio_list->tail; - u64 last_end = 0; - int ret; + struct btrfs_bio *bbio = rbio->bbio; + int stripe, pagenr; + struct bio_list bio_list; struct bio *bio; - struct btrfs_bio_stripe *stripe; - u64 disk_start; - - stripe = &rbio->bbio->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_SHIFT); + int ret = 0; - /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) - return fail_rbio_index(rbio, stripe_nr); + bio_list_init(&bio_list); - /* see if we can add this page onto our existing bio */ - if (last) { - last_end = (u64)last->bi_iter.bi_sector << 9; - last_end += last->bi_iter.bi_size; + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } - /* - * we can't merge these if they are from different - * devices or if they are not contiguous - */ - if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && - last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_SIZE, 0); - if (ret == PAGE_SIZE) - return 0; + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto out; } } - /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); - if (!bio) - return -ENOMEM; + if (likely(!bbio->num_tgtdevs)) + goto write_data; - bio->bi_iter.bi_size = 0; - bio->bi_bdev = stripe->dev->bdev; - bio->bi_iter.bi_sector = disk_start >> 9; + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (!bbio->tgtdev_map[stripe]) + continue; - bio_add_page(bio, page, PAGE_SIZE, 0); - bio_list_add(bio_list, bio); - return 0; -} + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } -/* - * while we're doing the read/modify/write cycle, we could - * have errors in reading pages off the disk. This checks - * for errors and if we're not able to read the page it'll - * trigger parity reconstruction. The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) -{ - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); + ret = rbio_add_io_page(rbio, &bio_list, page, + rbio->bbio->tgtdev_map[stripe], + pagenr, rbio->stripe_len); + if (ret) + goto out; + } } -} -/* - * helper function to walk our bio list and populate the bio_pages array with - * the result. This seems expensive, but it is faster than constantly - * searching through the bio list as we setup the IO in finish_rmw or stripe - * reconstruction. 
- *
- * This must be called before you trust the answers from page_in_rbio
- */
-static void index_rbio_pages(struct btrfs_raid_bio *rbio)
-{
-	struct bio *bio;
-	struct bio_vec *bvec;
-	u64 start;
-	unsigned long stripe_offset;
-	unsigned long page_index;
-	int i;
+write_data:
+	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
 
-	spin_lock_irq(&rbio->bio_list_lock);
-	bio_list_for_each(bio, &rbio->bio_list) {
-		start = (u64)bio->bi_iter.bi_sector << 9;
-		stripe_offset = start - rbio->bbio->raid_map[0];
-		page_index = stripe_offset >> PAGE_SHIFT;
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
 
-		bio_for_each_segment_all(bvec, bio, i)
-			rbio->bio_pages[page_index + i] = bvec->bv_page;
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_end_io;
+		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+		submit_bio(bio);
 	}
-	spin_unlock_irq(&rbio->bio_list_lock);
+out:
+	ASSERT(ret == 0 || ret == -EIO);
+	if (ret == -EIO)
+		rbio_orig_end_io(rbio, -EIO);
 }
 
 /*
@@ -1170,19 +2261,14 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
-	struct btrfs_bio *bbio = rbio->bbio;
 	void *pointers[rbio->real_stripes];
 	int nr_data = rbio->nr_data;
 	int stripe;
 	int pagenr;
 	int p_stripe = -1;
 	int q_stripe = -1;
-	struct bio_list bio_list;
-	struct bio *bio;
 	int ret;
 
-	bio_list_init(&bio_list);
-
 	if (rbio->real_stripes - rbio->nr_data == 1) {
 		p_stripe = rbio->real_stripes - 1;
 	} else if (rbio->real_stripes - rbio->nr_data == 2) {
@@ -1262,68 +2348,15 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
 	 * everything else.
 	 */
-	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
-		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
-			struct page *page;
-			if (stripe < rbio->nr_data) {
-				page = page_in_rbio(rbio, stripe, pagenr, 1);
-				if (!page)
-					continue;
-			} else {
-				page = rbio_stripe_page(rbio, stripe, pagenr);
-			}
-
-			ret = rbio_add_io_page(rbio, &bio_list,
-				       page, stripe, pagenr, rbio->stripe_len);
-			if (ret)
-				goto cleanup;
-		}
-	}
-
-	if (likely(!bbio->num_tgtdevs))
-		goto write_data;
-
-	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
-		if (!bbio->tgtdev_map[stripe])
-			continue;
-
-		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
-			struct page *page;
-			if (stripe < rbio->nr_data) {
-				page = page_in_rbio(rbio, stripe, pagenr, 1);
-				if (!page)
-					continue;
-			} else {
-				page = rbio_stripe_page(rbio, stripe, pagenr);
-			}
-
-			ret = rbio_add_io_page(rbio, &bio_list, page,
-					       rbio->bbio->tgtdev_map[stripe],
-					       pagenr, rbio->stripe_len);
-			if (ret)
-				goto cleanup;
-		}
-	}
-
-write_data:
-	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
-	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
-
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
-			break;
-
-		bio->bi_private = rbio;
-		bio->bi_end_io = raid_write_end_io;
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+	/* write to the log device first */
+	ret = btrfs_r5l_write_stripe(rbio);
+	if (ret == 0)
+		return;
 
-		submit_bio(bio);
-	}
+	/* if no log, let's write data to RAID.
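+	 *
+	 * The write path is therefore:
+	 *
+	 *   finish_rmw()
+	 *     btrfs_r5l_write_stripe()  == 0: data+parity hit the log,
+	 *                                     RAID write happens in the
+	 *                                     log endio
+	 *     btrfs_write_rbio()        fallback: direct RAID write (no
+	 *                                     log, or the stripe did not
+	 *                                     fit into one meta block)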
+	 */
+	btrfs_write_rbio(rbio);
 
 	return;
-
-cleanup:
-	rbio_orig_end_io(rbio, -EIO);
 }
 
 /*
@@ -2715,3 +3748,67 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
 	if (!lock_stripe_add(rbio))
 		async_missing_raid56(rbio);
 }
+
+struct btrfs_r5l_log *btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct block_device *bdev)
+{
+	int num_devices = fs_info->fs_devices->num_devices;
+	u64 dev_total_bytes;
+	struct btrfs_r5l_log *log = kzalloc(sizeof(struct btrfs_r5l_log), GFP_NOFS);
+
+	if (!log)
+		return ERR_PTR(-ENOMEM);
+
+	ASSERT(device);
+	ASSERT(bdev);
+	dev_total_bytes = i_size_read(bdev->bd_inode);
+
+	/* see find_free_dev_extent for the 1M start offset */
+	log->data_offset = 1024ull * 1024;
+	log->device_size = dev_total_bytes - log->data_offset;
+	log->device_size = round_down(log->device_size, PAGE_SIZE);
+
+	/*
+	 * if the device is already included in fs_devices, do not
+	 * count it a second time when checking the log size.
+	 */
+	if (device->in_fs_metadata)
+		num_devices--;
+
+	if (log->device_size < (u64)BTRFS_STRIPE_LEN * num_devices * 2) {
+		btrfs_info(fs_info, "r5log: log device size (%llu < %llu) is too small", log->device_size, (u64)BTRFS_STRIPE_LEN * num_devices * 2);
+		kfree(log);
+		return ERR_PTR(-EINVAL);
+	}
+
+	log->dev = device;
+	log->fs_info = fs_info;
+	ASSERT(sizeof(device->uuid) == BTRFS_UUID_SIZE);
+	log->uuid_csum = btrfs_crc32c(~0, device->uuid, sizeof(device->uuid));
+	mutex_init(&log->io_mutex);
+	spin_lock_init(&log->io_list_lock);
+	INIT_LIST_HEAD(&log->io_list);
+
+	return log;
+}
+
+void btrfs_r5l_init_log_post(struct btrfs_fs_info *fs_info, struct btrfs_r5l_log *log)
+{
+	cmpxchg(&fs_info->r5log, NULL, log);
+	ASSERT(fs_info->r5log == log);
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("r5log: set a r5log in fs_info, alloc_range 0x%llx 0x%llx\n",
+		     log->data_offset, log->data_offset + log->device_size);
+#endif
+}
+
+int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
+{
+	struct btrfs_r5l_log *log;
+
+	log = btrfs_r5l_init_log_prepare(fs_info, device, device->bdev);
+	if (IS_ERR(log))
+		return PTR_ERR(log);
+
+	btrfs_r5l_init_log_post(fs_info, log);
+	return 0;
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 4ee4fe346838ce..fc4ff20346778f 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,6 +39,80 @@ static inline int nr_data_stripes(struct map_lookup *map)
 #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||	\
 			     ((x) == RAID6_Q_STRIPE))
 
+/* r5log */
+struct btrfs_r5l_log;
+#define BTRFS_R5LOG_MAGIC 0x6433c509
+
+#define BTRFS_R5L_STRIPE_END 1
+
+/* one meta block + several data + parity blocks */
+struct btrfs_r5l_io_unit {
+	struct btrfs_r5l_log *log;
+	struct btrfs_raid_bio *rbio;
+
+	struct list_head list;
+	int status;
+
+	/* store meta block */
+	struct page *meta_page;
+
+	/* current offset in meta page */
+	int meta_offset;
+
+	/* current bio for accepting new data/parity block */
+	struct bio *current_bio;
+
+	/* sequence number in meta block */
+	u64 seq;
+
+	/* where io_unit starts and ends */
+	u64 log_start;
+	u64 log_end;
+
+	/* split bio to hold more data */
+	bool need_split_bio;
+	struct bio *split_bio;
+};
+
+enum r5l_payload_type {
+	R5LOG_PAYLOAD_DATA = 0,
+	R5LOG_PAYLOAD_PARITY = 1,
+};
+
+/*
+ * a payload is appended to the meta block and describes the
+ * location and size of the data or parity it covers.
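+ *
+ * A meta page filled by btrfs_r5l_log_stripe() for one RAID5 stripe
+ * with two dirty data pages would carry, for example:
+ *
+ *	payload[0] = { DATA,   size 1,  location, devid, csum }
+ *	payload[1] = { DATA,   size 1,  location, devid, csum }
+ *	payload[2] = { PARITY, size 16, location, devid, csum }
+ *
+ * size is in pages; a parity payload covers the whole contiguous
+ * parity stripe (BTRFS_STRIPE_LEN / PAGE_SIZE pages).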
+ */ +struct btrfs_r5l_payload { + __le16 type; + __le16 flags; + + __le32 size; + + /* data or parity */ + __le64 location; + __le64 devid; + + __le32 csum; +}; + +/* io unit starts from a meta block. */ +struct btrfs_r5l_meta_block { + __le32 magic; + + /* the whole size of the block */ + __le32 meta_size; + + __le32 csum; + + __le64 seq; + __le64 position; + + struct btrfs_r5l_payload payload[]; +}; + +/* r5log end */ + struct btrfs_raid_bio; struct btrfs_device; @@ -65,4 +139,12 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +struct btrfs_r5l_log * btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, + struct block_device *bdev); +void btrfs_r5l_init_log_post(struct btrfs_fs_info *fs_info, + struct btrfs_r5l_log *log); +int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device); +int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, + struct btrfs_r5l_log *r5log, u64 cp); #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2168654c90a1e6..e312e5ada7cc60 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2238,6 +2238,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_set_super_log_root(fs_info->super_copy, 0); btrfs_set_super_log_root_level(fs_info->super_copy, 0); + btrfs_r5l_write_journal_tail(fs_info); + memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_copy)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67daa3bbf3..7f848d79ef513b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2313,7 +2313,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, return ret; } -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path, const u64 flags) { struct btrfs_root *root = fs_info->dev_root; struct request_queue *q; @@ -2326,6 +2326,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool is_r5log = (flags & BTRFS_DEVICE_RAID56_LOG); + struct btrfs_r5l_log *r5log = NULL; + + if (is_r5log) + ASSERT(!fs_info->fs_devices->seeding); if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2363,6 +2368,15 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (is_r5log) { + r5log = btrfs_r5l_init_log_prepare(fs_info, device, bdev); + if (IS_ERR(r5log)) { + kfree(device); + ret = PTR_ERR(r5log); + goto error; + } + } + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); @@ -2382,6 +2396,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; + if (is_r5log) + device->type |= BTRFS_DEV_RAID56_LOG; device->writeable = 1; device->generation = trans->transid; device->io_width = fs_info->sectorsize; @@ -2434,11 +2450,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* add sysfs device entry */ btrfs_sysfs_add_device_link(fs_info->fs_devices, device); - /* - * we've got more storage, clear any full flags on the space - * infos - */ - btrfs_clear_space_info_full(fs_info); + if (!is_r5log) { + /* + * we've got more storage, clear any full flags on the 
space + * infos + */ + btrfs_clear_space_info_full(fs_info); + } mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -2502,6 +2520,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_commit_transaction(trans); } + if (is_r5log) { + /* initialize r5log with cp == 0. */ + btrfs_r5l_load_log(fs_info, r5log, 0); + btrfs_r5l_init_log_post(fs_info, r5log); + } + /* Update ctime/mtime for libblkid */ update_dev_time(device_path); return ret; @@ -4716,8 +4740,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } if (!device->in_fs_metadata || - device->is_tgtdev_for_dev_replace) + device->is_tgtdev_for_dev_replace || + (device->type & BTRFS_DEV_RAID56_LOG)) { +#ifdef BTRFS_DEBUG_R5LOG + if (device->type & BTRFS_DEV_RAID56_LOG) + btrfs_info(info, "skip a r5log when alloc chunk\n"); +#endif continue; + } if (device->total_bytes > device->bytes_used) total_avail = device->total_bytes - device->bytes_used; @@ -6689,6 +6719,18 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->bytes_used; spin_unlock(&fs_info->free_chunk_lock); } + + if (device->type & BTRFS_DEV_RAID56_LOG) { + ret = btrfs_set_r5log(fs_info, device); + if (ret) { + btrfs_err(fs_info, "error %d on loading r5log", ret); + return ret; + } + + btrfs_info(fs_info, "devid %llu uuid %pU is raid56 log", + device->devid, device->uuid); + } + ret = 0; return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c7d0fbc915cabd..44cc3fa0a8da32 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -26,6 +26,10 @@ extern struct mutex uuid_mutex; +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_DEBUG_R5LOG +#endif + #define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; @@ -437,7 +441,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path, + const u64 flags); int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, const char *device_path, struct btrfs_device *srcdev, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index a456e5309238bb..be5991f267ee43 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -35,6 +35,7 @@ struct btrfs_ioctl_vol_args { #define BTRFS_DEVICE_PATH_NAME_MAX 1024 #define BTRFS_DEVICE_SPEC_BY_ID (1ULL << 3) +#define BTRFS_DEVICE_RAID56_LOG (1ULL << 4) #define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \ (BTRFS_SUBVOL_CREATE_ASYNC | \ @@ -818,5 +819,7 @@ enum btrfs_err_code { struct btrfs_ioctl_feature_flags[3]) #define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_ADD_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 59, \ + struct btrfs_ioctl_vol_args_v2) #endif /* _UAPI_LINUX_BTRFS_H */ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 10689e1fdf11d1..52fed59e85e768 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -347,6 +347,10 @@ struct btrfs_key { __u64 offset; } __attribute__ ((__packed__)); +/* dev_item.type */ +/* #define BTRFS_DEV_REGULAR 0 */ +#define BTRFS_DEV_RAID56_LOG (1ULL << 0) + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid;