From 04d259dccba9f1acfba1b0910acd6d72a773f953 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Jul 2017 16:47:38 -0700 Subject: [PATCH 01/14] Btrfs: raid56: add raid56 log via add_dev v2 ioctl This introduces add_dev_v2 ioctl to add a device as a raid56 journal device. With the help of a journal device, raid56 is able to get rid of potential write holes. Signed-off-by: Liu Bo --- fs/btrfs/ctree.h | 6 +++++ fs/btrfs/ioctl.c | 48 ++++++++++++++++++++++++++++++++- fs/btrfs/raid56.c | 42 +++++++++++++++++++++++++++++ fs/btrfs/raid56.h | 1 + fs/btrfs/volumes.c | 26 +++++++++++++----- fs/btrfs/volumes.h | 3 ++- include/uapi/linux/btrfs.h | 3 +++ include/uapi/linux/btrfs_tree.h | 4 +++ 8 files changed, 125 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 643c70d2b2e65a..d9676275b4cb96 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -697,6 +697,7 @@ struct btrfs_stripe_hash_table { void btrfs_init_async_reclaim_work(struct work_struct *work); /* fs_info */ +struct btrfs_r5l_log; struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; @@ -1114,6 +1115,9 @@ struct btrfs_fs_info { u32 nodesize; u32 sectorsize; u32 stripesize; + + /* raid56 log */ + struct btrfs_r5l_log *r5log; }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) @@ -2932,6 +2936,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) static inline void free_fs_info(struct btrfs_fs_info *fs_info) { + if (fs_info->r5log) + kfree(fs_info->r5log); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e176375f374f91..3d1ef4df4a4fde 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2653,6 +2653,50 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) return ret; } +/* identical to btrfs_ioctl_add_dev, but this is with flags */ +static long btrfs_ioctl_add_dev_v2(struct btrfs_fs_info *fs_info, void __user 
*arg) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + mutex_lock(&fs_info->volume_mutex); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + if (vol_args->flags & BTRFS_DEVICE_RAID56_LOG && + fs_info->r5log) { + ret = -EEXIST; + btrfs_info(fs_info, "r5log: attempting to add another log device!"); + goto out_free; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(fs_info, vol_args->name, vol_args->flags); + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_RAID56_LOG) { + ASSERT(fs_info->r5log); + btrfs_info(fs_info, "disk added %s as raid56 log", vol_args->name); + } else { + btrfs_info(fs_info, "disk added %s", vol_args->name); + } + } +out_free: + kfree(vol_args); +out: + mutex_unlock(&fs_info->volume_mutex); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + return ret; +} + static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; @@ -2672,7 +2716,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_init_new_device(fs_info, vol_args->name); + ret = btrfs_init_new_device(fs_info, vol_args->name, 0); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); @@ -5539,6 +5583,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(fs_info, argp); + case BTRFS_IOC_ADD_DEV_V2: + return btrfs_ioctl_add_dev_v2(fs_info, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_RM_DEV_V2: diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8ea0eb76325e9..2b91b955b81427 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c 
@@ -177,6 +177,25 @@ struct btrfs_raid_bio { unsigned long *dbitmap; }; +/* raid56 log */ +struct btrfs_r5l_log { + /* protect this struct and log io */ + struct mutex io_mutex; + + /* r5log device */ + struct btrfs_device *dev; + + /* allocation range for log entries */ + u64 data_offset; + u64 device_size; + + u64 last_checkpoint; + u64 last_cp_seq; + u64 seq; + u64 log_start; + struct btrfs_r5l_io_unit *current_io; +}; + static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct btrfs_work *work); @@ -2715,3 +2734,26 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) async_missing_raid56(rbio); } + +int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device) +{ + struct btrfs_r5l_log *log; + + log = kzalloc(sizeof(*log), GFP_NOFS); + if (!log) + return -ENOMEM; + + /* see find_free_dev_extent for 1M start offset */ + log->data_offset = 1024ull * 1024; + log->device_size = btrfs_device_get_total_bytes(device) - log->data_offset; + log->device_size = round_down(log->device_size, PAGE_SIZE); + log->dev = device; + mutex_init(&log->io_mutex); + + cmpxchg(&fs_info->r5log, NULL, log); + ASSERT(fs_info->r5log == log); + + trace_printk("r5log: set a r5log in fs_info, alloc_range 0x%llx 0x%llx", + log->data_offset, log->data_offset + log->device_size); + return 0; +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 4ee4fe346838ce..0c8bf6ad923090 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -65,4 +65,5 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67daa3bbf3..dafc541bc72791 100644 --- a/fs/btrfs/volumes.c 
+++ b/fs/btrfs/volumes.c @@ -2313,7 +2313,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, return ret; } -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path, const u64 flags) { struct btrfs_root *root = fs_info->dev_root; struct request_queue *q; @@ -2326,6 +2326,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool is_r5log = (flags & BTRFS_DEVICE_RAID56_LOG); + + if (is_r5log) + ASSERT(!fs_info->fs_devices->seeding); if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2382,6 +2386,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; + if (is_r5log) + device->type |= BTRFS_DEV_RAID56_LOG; device->writeable = 1; device->generation = trans->transid; device->io_width = fs_info->sectorsize; @@ -2434,11 +2440,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* add sysfs device entry */ btrfs_sysfs_add_device_link(fs_info->fs_devices, device); - /* - * we've got more storage, clear any full flags on the space - * infos - */ - btrfs_clear_space_info_full(fs_info); + if (!is_r5log) { + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(fs_info); + } mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -2459,6 +2467,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_trans; } + if (is_r5log) { + ret = btrfs_set_r5log(fs_info, device); + if (ret) + goto error_trans; + } + if (seeding_dev) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c7d0fbc915cabd..60e347aa003fc4 100644 --- 
a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -437,7 +437,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path, + const u64 flags); int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, const char *device_path, struct btrfs_device *srcdev, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index a456e5309238bb..be5991f267ee43 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -35,6 +35,7 @@ struct btrfs_ioctl_vol_args { #define BTRFS_DEVICE_PATH_NAME_MAX 1024 #define BTRFS_DEVICE_SPEC_BY_ID (1ULL << 3) +#define BTRFS_DEVICE_RAID56_LOG (1ULL << 4) #define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \ (BTRFS_SUBVOL_CREATE_ASYNC | \ @@ -818,5 +819,7 @@ enum btrfs_err_code { struct btrfs_ioctl_feature_flags[3]) #define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_ADD_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 59, \ + struct btrfs_ioctl_vol_args_v2) #endif /* _UAPI_LINUX_BTRFS_H */ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 10689e1fdf11d1..52fed59e85e768 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -347,6 +347,10 @@ struct btrfs_key { __u64 offset; } __attribute__ ((__packed__)); +/* dev_item.type */ +/* #define BTRFS_DEV_REGULAR 0 */ +#define BTRFS_DEV_RAID56_LOG (1ULL << 0) + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid; From 6725b39ecdc2c59f0c1e5bcbf43e1d0b2f1af6c3 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Jul 2017 16:52:46 -0700 Subject: [PATCH 02/14] Btrfs: raid56: do not allocate chunk on raid56 log The journal device (aka raid56 log) 
is not for chunk allocation, let's skip it. Signed-off-by: Liu Bo --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dafc541bc72791..5c50df7e753ecc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4730,7 +4730,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } if (!device->in_fs_metadata || - device->is_tgtdev_for_dev_replace) + device->is_tgtdev_for_dev_replace || + (device->type & BTRFS_DEV_RAID56_LOG)) continue; if (device->total_bytes > device->bytes_used) From 2c6f8c6ba4164aec767c91ca1c403c3d401572a9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Jul 2017 16:53:48 -0700 Subject: [PATCH 03/14] Btrfs: raid56: detect raid56 log on mount We've put the flag BTRFS_DEV_RAID56_LOG in device->type, so we can recognize the journal device of raid56 while reading the chunk tree. Signed-off-by: Liu Bo --- fs/btrfs/volumes.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5c50df7e753ecc..a17a4885e7f290 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6696,6 +6696,18 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, } fill_device_from_item(leaf, dev_item, device); + + if (device->type & BTRFS_DEV_RAID56_LOG) { + ret = btrfs_set_r5log(fs_info, device); + if (ret) { + btrfs_err(fs_info, "error %d on loading r5log", ret); + return ret; + } + + btrfs_info(fs_info, "devid %llu uuid %pU is raid56 log", + device->devid, device->uuid); + } + device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; From c96a72d6413aec6f8d6757e6eefbd582a9b46966 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Jul 2017 16:54:43 -0700 Subject: [PATCH 04/14] Btrfs: raid56: add verbose debug Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 2 ++ fs/btrfs/volumes.c | 7 ++++++- fs/btrfs/volumes.h | 4 ++++ 3 files 
changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2b91b955b81427..c75766f435cab8 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2753,7 +2753,9 @@ int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device) cmpxchg(&fs_info->r5log, NULL, log); ASSERT(fs_info->r5log == log); +#ifdef BTRFS_DEBUG_R5LOG trace_printk("r5log: set a r5log in fs_info, alloc_range 0x%llx 0x%llx", log->data_offset, log->data_offset + log->device_size); +#endif return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a17a4885e7f290..ac64d93774b219 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4731,8 +4731,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (!device->in_fs_metadata || device->is_tgtdev_for_dev_replace || - (device->type & BTRFS_DEV_RAID56_LOG)) + (device->type & BTRFS_DEV_RAID56_LOG)) { +#ifdef BTRFS_DEBUG_R5LOG + if (device->type & BTRFS_DEV_RAID56_LOG) + btrfs_info(info, "skip a r5log when alloc chunk\n"); +#endif continue; + } if (device->total_bytes > device->bytes_used) total_avail = device->total_bytes - device->bytes_used; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 60e347aa003fc4..44cc3fa0a8da32 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -26,6 +26,10 @@ extern struct mutex uuid_mutex; +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_DEBUG_R5LOG +#endif + #define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; From 73943ed901a58f0411c5063896bcf3b07d89df0e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Jun 2017 18:07:18 -0700 Subject: [PATCH 05/14] Btrfs: raid56: add stripe log for raid5/6 This is adding the ability to use a disk as raid5/6's stripe log (aka journal), the primary goal is to fix write hole issue that is inherent in raid56 setup. 
In a typical raid5/6 setup, both full stripe write and a partial stripe write will generate parity at the very end of writing, so after parity is generated, it's the right time to issue writes. Now with raid5/6's stripe log, every write will be put into the stripe log prior to being written to raid5/6 array, so that we have everything to rewrite all 'not-yet-on-disk' data/parity if a power loss happens while writing data/parity to different disks in raid5/6 array. A metadata block is used here to manage the information of data and parity and it's placed ahead of data and parity on stripe log. Right now such metadata block is limited to one page size and the structure is defined as {metadata block} + {a few payloads} - 'metadata block' contains a magic code, a sequence number and the start position on the stripe log. - 'payload' contains the information about data and parity, e.g. the physical offset and device id where data/parity is supposed to be. Each data block has a payload while each set of parity has a payload (e.g. for raid6, parity p and q has their own payload respectively). And we treat data and parity differently because btrfs always prepares the whole stripe length(64k) of parity, but data may only come from a partial stripe write. This metadata block is written to the raid5/6 stripe log with data/parity in a single bio(could be two bios, doesn't support more than two bios). 
Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 512 ++++++++++++++++++++++++++++++++++++++++------ fs/btrfs/raid56.h | 65 ++++++ 2 files changed, 513 insertions(+), 64 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c75766f435cab8..007ba633caa6db 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -185,6 +185,8 @@ struct btrfs_r5l_log { /* r5log device */ struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; + /* allocation range for log entries */ u64 data_offset; u64 device_size; @@ -1179,6 +1181,445 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_unlock_irq(&rbio->bio_list_lock); } +/* r5log */ +/* XXX: this allocation may be done earlier, eg. when allocating rbio */ +static struct btrfs_r5l_io_unit *btrfs_r5l_alloc_io_unit(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io; + gfp_t gfp = GFP_NOFS; + + io = kzalloc(sizeof(*io), gfp); + ASSERT(io); + io->log = log; + /* need to use kmap. */ + io->meta_page = alloc_page(gfp | __GFP_HIGHMEM | __GFP_ZERO); + ASSERT(io->meta_page); + + return io; +} + +static void btrfs_r5l_free_io_unit(struct btrfs_r5l_log *log, struct btrfs_r5l_io_unit *io) +{ + __free_page(io->meta_page); + kfree(io); +} + +static u64 btrfs_r5l_ring_add(struct btrfs_r5l_log *log, u64 start, u64 inc) +{ + start += inc; + if (start >= log->device_size) + start = start - log->device_size; + return start; +} + +static void btrfs_r5l_reserve_log_entry(struct btrfs_r5l_log *log, struct btrfs_r5l_io_unit *io) +{ + log->log_start = btrfs_r5l_ring_add(log, log->log_start, PAGE_SIZE); + io->log_end = log->log_start; + + if (log->log_start == 0) + io->need_split_bio = true; +} + +static void btrfs_write_rbio(struct btrfs_raid_bio *rbio); + +static void btrfs_r5l_log_endio(struct bio *bio) +{ + struct btrfs_r5l_io_unit *io = bio->bi_private; + struct btrfs_r5l_log *log = io->log; + + bio_put(bio); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("move data to disk\n"); +#endif + /* move data to RAID. 
*/ + btrfs_write_rbio(io->rbio); + + if (log->current_io == io) + log->current_io = NULL; + btrfs_r5l_free_io_unit(log, io); +} + +static struct bio *btrfs_r5l_bio_alloc(struct btrfs_r5l_log *log) +{ + /* this allocation will not fail. */ + struct bio *bio = btrfs_io_bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + + /* We need to make sure data/parity are settled down on the log disk. */ + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; + bio->bi_bdev = log->dev->bdev; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("log->data_offset 0x%llx log->log_start 0x%llx\n", log->data_offset, log->log_start); +#endif + bio->bi_iter.bi_sector = (log->data_offset + log->log_start) >> 9; + + return bio; +} + +static struct btrfs_r5l_io_unit *btrfs_r5l_new_meta(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io; + struct btrfs_r5l_meta_block *block; + + io = btrfs_r5l_alloc_io_unit(log); + ASSERT(io); + + block = kmap(io->meta_page); + clear_page(block); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s pos %llu seq %llu\n", __func__, log->log_start, log->seq); +#endif + + block->magic = cpu_to_le32(BTRFS_R5LOG_MAGIC); + block->seq = cpu_to_le64(log->seq); + block->position = cpu_to_le64(log->log_start); + + kunmap(io->meta_page); + + io->log_start = log->log_start; + io->meta_offset = sizeof(struct btrfs_r5l_meta_block); + io->seq = log->seq++; + + io->need_split_bio = false; + io->split_bio = NULL; + io->current_bio = btrfs_r5l_bio_alloc(log); + io->current_bio->bi_end_io = btrfs_r5l_log_endio; + io->current_bio->bi_private = io; + + bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + + btrfs_r5l_reserve_log_entry(log, io); + return io; +} + +static int btrfs_r5l_get_meta(struct btrfs_r5l_log *log, struct btrfs_raid_bio *rbio, int payload_size) +{ + /* always allocate new meta block. 
*/ + log->current_io = btrfs_r5l_new_meta(log); + ASSERT(log->current_io); + log->current_io->rbio = rbio; + return 0; +} + +static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u64 location, u64 devid) +{ + struct btrfs_r5l_io_unit *io = log->current_io; + struct btrfs_r5l_payload *payload; + void *ptr; + + ptr = kmap(io->meta_page); + payload = ptr + io->meta_offset; + payload->type = cpu_to_le16(type); + payload->flags = cpu_to_le16(0); + + if (type == R5LOG_PAYLOAD_DATA) + payload->size = cpu_to_le32(1); + else if (type == R5LOG_PAYLOAD_PARITY) + payload->size = cpu_to_le32(16); /* stripe_len / PAGE_SIZE */ + payload->devid = cpu_to_le64(devid); + payload->location = cpu_to_le64(location); + kunmap(io->meta_page); + + /* XXX: add checksum later */ + io->meta_offset += sizeof(*payload); + //io->meta_offset += sizeof(__le32); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("io->meta_offset %d\n", io->meta_offset); +#endif +} + +static void btrfs_r5l_append_payload_page(struct btrfs_r5l_log *log, struct page *page) +{ + struct btrfs_r5l_io_unit *io = log->current_io; + + if (io->need_split_bio) { + /* We're submitting too much data at a time!! 
*/ + BUG_ON(io->split_bio); + io->split_bio = io->current_bio; + io->current_bio = btrfs_r5l_bio_alloc(log); + bio_chain(io->current_bio, io->split_bio); + io->need_split_bio = false; + } + + ASSERT(bio_add_page(io->current_bio, page, PAGE_SIZE, 0)); + + btrfs_r5l_reserve_log_entry(log, io); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("log->log_start %llu io->current_bio bi_iter (bi_sector 0x%llx bi_size %d\n", log->log_start, io->current_bio->bi_iter.bi_sector << 9, io->current_bio->bi_iter.bi_size); +#endif + +} + +static u64 btrfs_compute_location(struct btrfs_raid_bio *rbio, int stripe_nr, unsigned long page_index) +{ + struct btrfs_bio_stripe *stripe; + + stripe = &rbio->bbio->stripes[stripe_nr]; + return stripe->physical + (page_index << PAGE_SHIFT); +} + +static u64 btrfs_compute_devid(struct btrfs_raid_bio *rbio, int stripe_nr) +{ + struct btrfs_bio_stripe *stripe; + + stripe = &rbio->bbio->stripes[stripe_nr]; + ASSERT(stripe->dev); + return stripe->dev->devid; +} + +static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int parity_pages, struct btrfs_raid_bio *rbio) +{ + int meta_size; + int stripe, pagenr; + struct page *page; + + /* + * parity pages are contiguous on disk, thus only one + * payload is required. 
+ */ + meta_size = sizeof(struct btrfs_r5l_payload) * data_pages + + sizeof(struct btrfs_r5l_payload) * (rbio->real_stripes - rbio->nr_data); + + /* add meta block */ + btrfs_r5l_get_meta(log, rbio, meta_size); + + /* add data blocks which need to be written */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + u64 location; + u64 devid; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + /* the page is from bio, queued for log bio */ + location = btrfs_compute_location(rbio, stripe, pagenr); + devid = btrfs_compute_devid(rbio, stripe); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("data: stripe %d pagenr %d location 0x%llx devid %llu\n", stripe, pagenr, location, devid); +#endif + btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, location, devid); + btrfs_r5l_append_payload_page(log, page); + } + } + } + + /* add the whole parity blocks */ + for (; stripe < rbio->real_stripes; stripe++) { + u64 location = btrfs_compute_location(rbio, stripe, 0); + u64 devid = btrfs_compute_devid(rbio, stripe); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("parity: stripe %d location 0x%llx devid %llu\n", stripe, location, devid); +#endif + btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, location, devid); + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + page = rbio_stripe_page(rbio, stripe, pagenr); + btrfs_r5l_append_payload_page(log, page); + } + } +} + +static void btrfs_r5l_submit_current_io(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io = log->current_io; + struct btrfs_r5l_meta_block *mb; + + if (!io) + return; + + mb = kmap(io->meta_page); + mb->meta_size = cpu_to_le32(io->meta_offset); + kunmap(io->meta_page); + + log->current_io = NULL; +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("io->current bio bi_sector 0x%llx devid %llu\n", io->current_bio->bi_iter.bi_sector << 9, log->dev->devid); +#endif + /* + * make sure that 
r5l_log_endio does not run in interrupt + * context. + * + * if io->split_bio is available, then current_bio is just a + * chained bio. + */ + if (io->split_bio) + btrfs_bio_wq_end_io(log->fs_info, io->split_bio, BTRFS_WQ_ENDIO_RAID56); + else + btrfs_bio_wq_end_io(log->fs_info, io->current_bio, BTRFS_WQ_ENDIO_RAID56); + + submit_bio(io->current_bio); + if (io->split_bio) + submit_bio(io->split_bio); +} + +static u64 btrfs_r5l_ring_distance(struct btrfs_r5l_log *log, u64 start, u64 end) +{ + if (end >= start) + return end - start; + else + return end + (log->device_size) - start; +} + +static bool btrfs_r5l_has_free_space(struct btrfs_r5l_log *log, u64 size) +{ + u64 used_size; + used_size = btrfs_r5l_ring_distance(log, log->last_checkpoint, + log->log_start); + return log->device_size > (used_size + size); +} + +/* + * return 0 if data/parity are written into log and it will move data + * to RAID in endio. + * + * return 1 if log is not available or there is no space in log. + */ +static int btrfs_r5l_write_stripe(struct btrfs_raid_bio *rbio) +{ + int stripe, pagenr; + int data_pages = 0, parity_pages = 0; + u64 reserve; + int meta_size; + bool do_submit = false; + struct btrfs_r5l_log *log = rbio->fs_info->r5log; + + if (!log) { +#ifdef BTRFS_DEBUG_R5LOG + btrfs_info(rbio->fs_info, "r5log is not available\n"); +#endif + return 1; + } + + /* get data_pages and parity_pages */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + data_pages++; + } else { + parity_pages++; + } + } + } +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("data_pages %d parity_pages %d\n", data_pages, parity_pages); + ASSERT(parity_pages == 16 * (rbio->real_stripes - rbio->nr_data)); +#endif + + /* + * parity pages are contiguous on disk, thus only one + * payload is required. 
+ */ + meta_size = sizeof(struct btrfs_r5l_payload) * data_pages + + sizeof(struct btrfs_r5l_payload) * (rbio->real_stripes - rbio->nr_data); + + /* doesn't support large raid array */ + if (meta_size + sizeof(struct btrfs_r5l_meta_block) > PAGE_SIZE) { +#ifdef BTRFS_DEBUG_R5LOG + btrfs_info(rbio->fs_info, "meta_size (%d) is too big\n", meta_size); +#endif + return 1; + } + + mutex_lock(&log->io_mutex); + /* meta + data/parity */ + reserve = (1 + data_pages + parity_pages) << PAGE_SHIFT; + if (btrfs_r5l_has_free_space(log, reserve)) { + btrfs_r5l_log_stripe(log, data_pages, parity_pages, rbio); + do_submit = true; + } else { + ; /* XXX: reclaim */ + } + + if (do_submit) { + btrfs_r5l_submit_current_io(log); + } + mutex_unlock(&log->io_mutex); + + return (do_submit ? 0 : 1); +} + +static void btrfs_write_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_bio *bbio = rbio->bbio; + int stripe, pagenr; + struct bio_list bio_list; + struct bio *bio; + int ret = 0; + + bio_list_init(&bio_list); + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto out; + } + } + + if (likely(!bbio->num_tgtdevs)) + goto write_data; + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (!bbio->tgtdev_map[stripe]) + continue; + + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, page, + rbio->bbio->tgtdev_map[stripe], + pagenr, rbio->stripe_len); + if (ret) + goto out; + } + } + 
+write_data: + atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&rbio->stripes_pending) == 0); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); + } +out: + ASSERT(ret == 0 || ret == -EIO); + if (ret == -EIO) + rbio_orig_end_io(rbio, -EIO); +} + /* * this is called from one of two situations. We either * have a full stripe from the higher layers, or we've read all @@ -1189,19 +1630,14 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) */ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { - struct btrfs_bio *bbio = rbio->bbio; void *pointers[rbio->real_stripes]; int nr_data = rbio->nr_data; int stripe; int pagenr; int p_stripe = -1; int q_stripe = -1; - struct bio_list bio_list; - struct bio *bio; int ret; - bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) { p_stripe = rbio->real_stripes - 1; } else if (rbio->real_stripes - rbio->nr_data == 2) { @@ -1281,68 +1717,15 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. 
*/ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - - ret = rbio_add_io_page(rbio, &bio_list, - page, stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } - } - - if (likely(!bbio->num_tgtdevs)) - goto write_data; - - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bbio->tgtdev_map[stripe]) - continue; - - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - - ret = rbio_add_io_page(rbio, &bio_list, page, - rbio->bbio->tgtdev_map[stripe], - pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } - } -write_data: - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - - bio->bi_private = rbio; - bio->bi_end_io = raid_write_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + /* write to log device firstly */ + ret = btrfs_r5l_write_stripe(rbio); + if (ret == 0) + return; - submit_bio(bio); - } + /* if no log, lets write data to RAID. 
*/ + btrfs_write_rbio(rbio); return; - -cleanup: - rbio_orig_end_io(rbio, -EIO); } /* @@ -2748,6 +3131,7 @@ int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device) log->device_size = btrfs_device_get_total_bytes(device) - log->data_offset; log->device_size = round_down(log->device_size, PAGE_SIZE); log->dev = device; + log->fs_info = fs_info; mutex_init(&log->io_mutex); cmpxchg(&fs_info->r5log, NULL, log); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0c8bf6ad923090..76a20faf189221 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -39,6 +39,71 @@ static inline int nr_data_stripes(struct map_lookup *map) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) +/* r5log */ +struct btrfs_r5l_log; +#define BTRFS_R5LOG_MAGIC 0x6433c509 + +/* one meta block + several data + parity blocks */ +struct btrfs_r5l_io_unit { + struct btrfs_r5l_log *log; + struct btrfs_raid_bio *rbio; + + /* store meta block */ + struct page *meta_page; + + /* current offset in meta page */ + int meta_offset; + + /* current bio for accepting new data/parity block */ + struct bio *current_bio; + + /* sequence number in meta block */ + u64 seq; + + /* where io_unit starts and ends */ + u64 log_start; + u64 log_end; + + /* split bio to hold more data */ + bool need_split_bio; + struct bio *split_bio; +}; + +enum r5l_payload_type { + R5LOG_PAYLOAD_DATA = 0, + R5LOG_PAYLOAD_PARITY = 1, +}; + +/* + * payload is appending to the meta block and it describes the + * location and the size of data or parity. + */ +struct btrfs_r5l_payload { + __le16 type; + __le16 flags; + + __le32 size; + + /* data or parity */ + __le64 location; + __le64 devid; +}; + +/* io unit starts from a meta block. 
*/ +struct btrfs_r5l_meta_block { + __le32 magic; + + /* the whole size of the block */ + __le32 meta_size; + + __le64 seq; + __le64 position; + + struct btrfs_r5l_payload payload[]; +}; + +/* r5log end */ + struct btrfs_raid_bio; struct btrfs_device; From d3adc6b43cf8656ce0a867027d83eae2a80e5fd3 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 19 Jun 2017 13:03:52 -0700 Subject: [PATCH 06/14] Btrfs: raid56: add reclaim support The log space is limited, so reclaim is necessary when there is not enough space to use. By recording the largest position we've written to the log disk and flushing all disks' cache and the superblock, we can be sure that data and parity before this position have the identical copy in the log and raid5/6 array. Also we need to take care of the case when IOs get reordered. A list is used to keep the order right. Signed-off-by: Liu Bo --- fs/btrfs/ctree.h | 10 ++++++- fs/btrfs/raid56.c | 63 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/transaction.c | 2 ++ 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d9676275b4cb96..9235643640981f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -244,8 +244,10 @@ struct btrfs_super_block { __le64 cache_generation; __le64 uuid_tree_generation; + /* r5log journal tail (where recovery starts) */ + __le64 journal_tail; /* future expansion */ - __le64 reserved[30]; + __le64 reserved[29]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); @@ -2291,6 +2293,8 @@ BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_journal_tail, struct btrfs_super_block, + journal_tail, 64); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, total_bytes, 64); 
BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, @@ -3284,6 +3288,10 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); +/* raid56.c */ +void btrfs_r5l_write_journal_tail(struct btrfs_fs_info *fs_info); + + static inline __printf(2, 3) void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 007ba633caa6db..60010a6ec4fdbe 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -191,6 +191,8 @@ struct btrfs_r5l_log { u64 data_offset; u64 device_size; + u64 next_checkpoint; + u64 last_checkpoint; u64 last_cp_seq; u64 seq; @@ -1231,11 +1233,14 @@ static void btrfs_r5l_log_endio(struct bio *bio) bio_put(bio); #ifdef BTRFS_DEBUG_R5LOG - trace_printk("move data to disk\n"); + trace_printk("move data to disk(current log->next_checkpoint %llu (will be %llu after writing to RAID\n", log->next_checkpoint, io->log_start); #endif /* move data to RAID. */ btrfs_write_rbio(io->rbio); + /* After stripe data has been flushed into raid, set ->next_checkpoint. */ + log->next_checkpoint = io->log_start; + if (log->current_io == io) log->current_io = NULL; btrfs_r5l_free_io_unit(log, io); @@ -1472,6 +1477,42 @@ static bool btrfs_r5l_has_free_space(struct btrfs_r5l_log *log, u64 size) return log->device_size > (used_size + size); } +/* + * writing super with log->next_checkpoint + * + * This is protected by log->io_mutex. + */ +static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp) +{ + int ret; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5l writing super to reclaim space, cp %llu\n", cp); +#endif + + btrfs_set_super_journal_tail(fs_info->super_for_commit, cp); + + /* + * flush all disk cache so that all data prior to + * %next_checkpoint lands on raid disks(recovery will start + * from %next_checkpoint). 
+ */ + ret = write_all_supers(fs_info, 1); + ASSERT(ret == 0); +} + +/* this is called by commit transaction and it's followed by writing super. */ +void btrfs_r5l_write_journal_tail(struct btrfs_fs_info *fs_info) +{ + if (fs_info->r5log) { + u64 cp = READ_ONCE(fs_info->r5log->next_checkpoint); + + trace_printk("journal_tail %llu\n", cp); + btrfs_set_super_journal_tail(fs_info->super_copy, cp); + WRITE_ONCE(fs_info->r5log->last_checkpoint, cp); + } +} + /* * return 0 if data/parity are written into log and it will move data * to RAID in endio. @@ -1535,7 +1576,25 @@ static int btrfs_r5l_write_stripe(struct btrfs_raid_bio *rbio) btrfs_r5l_log_stripe(log, data_pages, parity_pages, rbio); do_submit = true; } else { - ; /* XXX: reclaim */ +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5log: no space log->last_checkpoint %llu log->log_start %llu log->next_checkpoint %llu\n", log->last_checkpoint, log->log_start, log->next_checkpoint); +#endif + + /* + * reclaim works via writing to log device with the + * new next_checkpoint. + */ + btrfs_r5l_write_super(rbio->fs_info, log->next_checkpoint); + + log->last_checkpoint = log->next_checkpoint; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5log: after reclaim(write super) log->last_checkpoint %llu log->log_start %llu log->next_checkpoint %llu\n", log->last_checkpoint, log->log_start, log->next_checkpoint); +#endif + /* now we should have enough space. 
*/ + ASSERT(btrfs_r5l_has_free_space(log, reserve)); + btrfs_r5l_log_stripe(log, data_pages, parity_pages, rbio); + do_submit = true; } if (do_submit) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2168654c90a1e6..e312e5ada7cc60 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2238,6 +2238,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_set_super_log_root(fs_info->super_copy, 0); btrfs_set_super_log_root_level(fs_info->super_copy, 0); + btrfs_r5l_write_journal_tail(fs_info); + memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_copy)); From 7a640f3afe0c42982b9e4b259946f12403b0cd28 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 18 Jul 2017 13:49:03 -0700 Subject: [PATCH 07/14] Btrfs: raid56: load r5log A raid5/6 log can be loaded while mounting a btrfs (which already has a disk set up as raid5/6 log) or setting up a disk as raid5/6 log for the first time. It gets %journal_tail from super_block where it can read the first 4K block and goes through the sanity checks, if it's valid, then go check if anything needs to be replayed, otherwise it creates a new empty block at the beginning of the disk and new writes will append to it. Signed-off-by: Liu Bo --- fs/btrfs/disk-io.c | 16 ++++++ fs/btrfs/raid56.c | 128 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/raid56.h | 1 + 3 files changed, 145 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8685d67185d01b..c2d8697f13c801 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2987,6 +2987,22 @@ int open_ctree(struct super_block *sb, fs_info->generation = generation; fs_info->last_trans_committed = generation; + if (fs_info->r5log) { + u64 cp = btrfs_super_journal_tail(fs_info->super_copy); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: get journal_tail %llu\n", __func__, cp); +#endif + /* if the data is not replayed, data and parity on + * disk are still consistent. So we can move on. 
+ * + * About fsync, since fsync can make sure data is + * flushed onto disk and only metadata is kept into + * write-ahead log, the fsync'd data will never ends + * up with being replayed by raid56 log. + */ + btrfs_r5l_load_log(fs_info, cp); + } + ret = btrfs_recover_balance(fs_info); if (ret) { btrfs_err(fs_info, "failed to recover balance: %d", ret); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 60010a6ec4fdbe..5d7ea235ae9a70 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1477,6 +1477,134 @@ static bool btrfs_r5l_has_free_space(struct btrfs_r5l_log *log, u64 size) return log->device_size > (used_size + size); } +static int btrfs_r5l_sync_page_io(struct btrfs_r5l_log *log, + struct btrfs_device *dev, sector_t sector, + int size, struct page *page, int op) +{ + struct bio *bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + int ret; + + bio->bi_bdev = dev->bdev; + bio->bi_opf = op; + if (dev == log->dev) + bio->bi_iter.bi_sector = (log->data_offset >> 9) + sector; + else + bio->bi_iter.bi_sector = sector; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: op %d bi_sector 0x%llx\n", __func__, op, (bio->bi_iter.bi_sector << 9)); +#endif + + bio_add_page(bio, page, size, 0); + submit_bio_wait(bio); + ret = !bio->bi_error; + bio_put(bio); + return ret; +} + +static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, u64 seq) +{ + struct page *page; + struct btrfs_r5l_meta_block *mb; + int ret = 0; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: pos %llu seq %llu\n", __func__, pos, seq); +#endif + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO); + ASSERT(page); + + mb = kmap(page); + mb->magic = cpu_to_le32(BTRFS_R5LOG_MAGIC); + mb->meta_size = cpu_to_le32(sizeof(struct btrfs_r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); + kunmap(page); + + if (!btrfs_r5l_sync_page_io(log, log->dev, (pos >> 9), PAGE_SIZE, page, REQ_OP_WRITE | REQ_FUA)) { + ret = -EIO; + } + + __free_page(page); + return ret; 
+} + +static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp); + +static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log) +{ + return 0; +} + +/* return 0 if success, otherwise return errors */ +int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp) +{ + struct btrfs_r5l_log *log = fs_info->r5log; + struct page *page; + struct btrfs_r5l_meta_block *mb; + bool create_new = false; + + ASSERT(log); + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + ASSERT(page); + + if (!btrfs_r5l_sync_page_io(log, log->dev, (cp >> 9), PAGE_SIZE, page, + REQ_OP_READ)) { + __free_page(page); + return -EIO; + } + + mb = kmap(page); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5l: mb->pos %llu cp %llu mb->seq %llu\n", le64_to_cpu(mb->position), cp, le64_to_cpu(mb->seq)); +#endif + + if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC) { +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("magic not match: create new r5l\n"); +#endif + create_new = true; + goto create; + } + + ASSERT(le64_to_cpu(mb->position) == cp); + if (le64_to_cpu(mb->position) != cp) { +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("mb->position not match: create new r5l\n"); +#endif + create_new = true; + goto create; + } +create: + if (create_new) { + /* initial new r5log */ + log->last_cp_seq = prandom_u32(); + cp = 0; + + btrfs_r5l_write_empty_meta_block(log, cp, log->last_cp_seq); + btrfs_r5l_write_super(fs_info, cp); + } else { + log->last_cp_seq = le64_to_cpu(mb->seq); + } + + log->last_checkpoint = cp; + + kunmap(page); + __free_page(page); + + if (create_new) { + log->log_start = btrfs_r5l_ring_add(log, cp, PAGE_SIZE); + log->seq = log->last_cp_seq + 1; + log->next_checkpoint = cp; + } else { + btrfs_r5l_recover_log(log); + } + + return 0; +} + /* * writing super with log->next_checkpoint * diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 76a20faf189221..314d299c78c9a8 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -131,4 +131,5 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio 
*rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device); +int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp); #endif From b04e2b06351fa7778965ddcf1000e5800f46338d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 24 Jul 2017 15:05:11 -0700 Subject: [PATCH 08/14] Btrfs: raid56: log recovery This is adding recovery on raid5/6 log. We've set a %journal_tail in super_block, which indicates the position from where we need to replay data. So we scan the log and replay valid meta/data/parity pairs until finding an invalid one. By replaying, it simply reads data/parity from the raid5/6 log and issues writes to the raid disks where it should be. Please note that the whole meta/data/parity pair can be discarded if it fails the sanity check in the meta block. After recovery, we also append an empty meta block and update the %journal_tail in super_block in order to avoid a situation, where the layout on the raid5/6 log is [valid A][invalid B][valid C], so block A is the only one we should replay. Then the recovery ends up pointing to block A as block B is invalid, and some new writes come in and append to block A so that block B is now overwritten to be a valid meta/data/parity. If a power loss happens, the new recovery starts again from block A, and since block B is now valid, it may replay block C as well which has become stale. 
Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5d7ea235ae9a70..dea33c40a11a4c 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1530,10 +1530,161 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, return ret; } +struct btrfs_r5l_recover_ctx { + u64 pos; + u64 seq; + u64 total_size; + struct page *meta_page; + struct page *io_page; +}; + +static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +{ + struct btrfs_r5l_meta_block *mb; + + btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos >> 9), PAGE_SIZE, ctx->meta_page, REQ_OP_READ); + + mb = kmap(ctx->meta_page); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("ctx->pos %llu ctx->seq %llu pos %llu seq %llu\n", ctx->pos, ctx->seq, le64_to_cpu(mb->position), le64_to_cpu(mb->seq)); +#endif + + if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC || + le64_to_cpu(mb->position) != ctx->pos || + le64_to_cpu(mb->seq) != ctx->seq) { +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: mismatch magic %llu default %llu\n", __func__, le32_to_cpu(mb->magic), BTRFS_R5LOG_MAGIC); +#endif + return -EINVAL; + } + + ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE); + kunmap(ctx->meta_page); + + /* meta_block */ + ctx->total_size = PAGE_SIZE; + + return 0; +} + +static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +{ + u64 offset; + struct btrfs_r5l_meta_block *mb; + u64 meta_size; + u64 io_offset; + struct btrfs_device *dev; + + mb = kmap(ctx->meta_page); + + io_offset = PAGE_SIZE; + offset = sizeof(struct btrfs_r5l_meta_block); + meta_size = le32_to_cpu(mb->meta_size); + + while (offset < meta_size) { + struct btrfs_r5l_payload *payload = (void *)mb + offset; + + /* read data from log disk and write to payload->location */ +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("payload type %d flags 
%d size %d location 0x%llx devid %llu\n", le16_to_cpu(payload->type), le16_to_cpu(payload->flags), le32_to_cpu(payload->size), le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); +#endif + + dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL); + if (!dev || dev->missing) { + ASSERT(0); + } + + if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) { + ASSERT(le32_to_cpu(payload->size) == 1); + btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ); + btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE); + io_offset += PAGE_SIZE; + } else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) { + int i; + ASSERT(le32_to_cpu(payload->size) == 16); + for (i = 0; i < le32_to_cpu(payload->size); i++) { + /* liubo: parity are guaranteed to be + * contiguous, use just one bio to + * hold all pages and flush them. */ + u64 parity_off = le64_to_cpu(payload->location) + i * PAGE_SIZE; + btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ); + btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE); + io_offset += PAGE_SIZE; + } + } else { + ASSERT(0); + } + + offset += sizeof(struct btrfs_r5l_payload); + } + kunmap(ctx->meta_page); + + ctx->total_size += (io_offset - PAGE_SIZE); + return 0; +} + +static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +{ + int ret; + + while (1) { + ret = btrfs_r5l_recover_load_meta(log, ctx); + if (ret) + break; + + ret = btrfs_r5l_recover_load_data(log, ctx); + ASSERT(!ret || ret > 0); + if (ret) + break; + + ctx->seq++; + ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, ctx->total_size); + } + + return ret; +} + static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp); static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log) { + struct btrfs_r5l_recover_ctx 
*ctx; + u64 pos; + int ret; + + ctx = kzalloc(sizeof(*ctx), GFP_NOFS); + ASSERT(ctx); + + ctx->pos = log->last_checkpoint; + ctx->seq = log->last_cp_seq; + ctx->meta_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + ASSERT(ctx->meta_page); + ctx->io_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + ASSERT(ctx->io_page); + + ret = btrfs_r5l_recover_flush_log(log, ctx); + if (ret) { + ; + } + + pos = ctx->pos; + log->next_checkpoint = ctx->pos; + ctx->seq += 10000; + btrfs_r5l_write_empty_meta_block(log, ctx->pos, ctx->seq++); + ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, PAGE_SIZE); + + log->log_start = ctx->pos; + log->seq = ctx->seq; + /* last_checkpoint point to the empty block. */ + log->last_checkpoint = pos; + btrfs_r5l_write_super(log->fs_info, pos); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: log_start %llu seq %llu\n", __func__, log->log_start, log->seq); +#endif + __free_page(ctx->meta_page); + __free_page(ctx->io_page); + kfree(ctx); return 0; } From a5fbff01d7782b6275347a9d7b0b8f7ab5438332 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 24 Jul 2017 14:40:21 -0700 Subject: [PATCH 09/14] Btrfs: raid56: add readahead for recovery While doing recovery, blocks are read from the raid5/6 disk one by one, so this is adding readahead so that we can read at most 256 contiguous blocks in one read IO. 
Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 114 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index dea33c40a11a4c..24f7cbb5e87e16 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1530,15 +1530,81 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, return ret; } +#define BTRFS_R5L_RECOVER_IO_POOL_SIZE BIO_MAX_PAGES struct btrfs_r5l_recover_ctx { u64 pos; u64 seq; u64 total_size; struct page *meta_page; struct page *io_page; + + struct page *ra_pages[BTRFS_R5L_RECOVER_IO_POOL_SIZE]; + struct bio *ra_bio; + int total; + int valid; + u64 start_offset; + + struct btrfs_r5l_log *log; }; -static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +static int btrfs_r5l_recover_read_ra(struct btrfs_r5l_recover_ctx *ctx, u64 offset) +{ + bio_reset(ctx->ra_bio); + ctx->ra_bio->bi_bdev = ctx->log->dev->bdev; + ctx->ra_bio->bi_opf = REQ_OP_READ; + ctx->ra_bio->bi_iter.bi_sector = (ctx->log->data_offset + offset) >> 9; + + ctx->valid = 0; + ctx->start_offset = offset; + + while (ctx->valid < ctx->total) { + bio_add_page(ctx->ra_bio, ctx->ra_pages[ctx->valid++], PAGE_SIZE, 0); + + offset = btrfs_r5l_ring_add(ctx->log, offset, PAGE_SIZE); + if (offset == 0) + break; + } + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("to read %d pages starting from 0x%llx\n", ctx->valid, ctx->log->data_offset + ctx->start_offset); +#endif + return submit_bio_wait(ctx->ra_bio); +} + +static int btrfs_r5l_recover_read_page(struct btrfs_r5l_recover_ctx *ctx, struct page *page, u64 offset) +{ + struct page *tmp; + int index; + char *src; + char *dst; + int ret; + + if (offset < ctx->start_offset || offset >= (ctx->start_offset + ctx->valid * PAGE_SIZE)) { + ret = btrfs_r5l_recover_read_ra(ctx, offset); + if (ret) + return ret; + } + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("offset 0x%llx start->offset 0x%llx ctx->valid 
%d\n", offset, ctx->start_offset, ctx->valid); +#endif + + ASSERT(IS_ALIGNED(ctx->start_offset, PAGE_SIZE)); + ASSERT(IS_ALIGNED(offset, PAGE_SIZE)); + + index = (offset - ctx->start_offset) >> PAGE_SHIFT; + ASSERT(index < ctx->valid); + + tmp = ctx->ra_pages[index]; + src = kmap(tmp); + dst = kmap(page); + memcpy(dst, src, PAGE_SIZE); + kunmap(page); + kunmap(tmp); + return 0; +} + +static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) { struct btrfs_r5l_meta_block *mb; @@ -1642,6 +1708,42 @@ static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_log *log, struct btrfs_r } return ret; + +static int btrfs_r5l_recover_allocate_ra(struct btrfs_r5l_recover_ctx *ctx) +{ + struct page *page; + ctx->ra_bio = btrfs_io_bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + + ctx->total = 0; + ctx->valid = 0; + while (ctx->total < BTRFS_R5L_RECOVER_IO_POOL_SIZE) { + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + break; + + ctx->ra_pages[ctx->total++] = page; + } + + if (ctx->total == 0) { + bio_put(ctx->ra_bio); + return -ENOMEM; + } + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("readahead: %d allocated pages\n", ctx->total); +#endif + return 0; +} + +static void btrfs_r5l_recover_free_ra(struct btrfs_r5l_recover_ctx *ctx) +{ + int i; +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("readahead: %d to free pages\n", ctx->total); +#endif + for (i = 0; i < ctx->total; i++) + __free_page(ctx->ra_pages[i]); + bio_put(ctx->ra_bio); } static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp); @@ -1655,6 +1757,7 @@ static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log) ctx = kzalloc(sizeof(*ctx), GFP_NOFS); ASSERT(ctx); + ctx->log = log; ctx->pos = log->last_checkpoint; ctx->seq = log->last_cp_seq; ctx->meta_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); @@ -1662,10 +1765,10 @@ static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log) ctx->io_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); ASSERT(ctx->io_page); - ret = btrfs_r5l_recover_flush_log(log, ctx); - 
if (ret) { - ; - } + ret = btrfs_r5l_recover_allocate_ra(ctx); + ASSERT(ret == 0); + + btrfs_r5l_recover_flush_log(ctx); pos = ctx->pos; log->next_checkpoint = ctx->pos; @@ -1684,6 +1787,7 @@ static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log) #endif __free_page(ctx->meta_page); __free_page(ctx->io_page); + btrfs_r5l_recover_free_ra(ctx); kfree(ctx); return 0; } From 0ef4bbe4a33c9cfafae031cc70d0217f4203c744 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 24 Jul 2017 14:44:31 -0700 Subject: [PATCH 10/14] Btrfs: raid56: use the readahead helper to get page This updates recovery code to use the readahead helper. Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24f7cbb5e87e16..8f47e568743901 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1608,7 +1608,9 @@ static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) { struct btrfs_r5l_meta_block *mb; - btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos >> 9), PAGE_SIZE, ctx->meta_page, REQ_OP_READ); + ret = btrfs_r5l_recover_read_page(ctx, ctx->meta_page, ctx->pos); + if (ret) + return ret; mb = kmap(ctx->meta_page); #ifdef BTRFS_DEBUG_R5LOG From 3b81a09fb709c5385ec443d9f893d47dcaf7c4ad Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 24 Jul 2017 14:53:58 -0700 Subject: [PATCH 11/14] Btrfs: raid56: add csum support This is adding checksum to meta/data/parity resident on the raid5/6 log. So recovery now can verify checksum to see if anything inside meta/data/parity has been changed. If anything is wrong in meta block, we stop replaying data/parity at that position, while if anything is wrong in data/parity block, we just skip this meta/data/parity pair and move onto the next one. 
Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 235 +++++++++++++++++++++++++++++++++++++--------- fs/btrfs/raid56.h | 4 + 2 files changed, 197 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8f47e568743901..8bc7ba4fa6b089 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -43,6 +43,7 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" +#include "hash.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -197,6 +198,7 @@ struct btrfs_r5l_log { u64 last_cp_seq; u64 seq; u64 log_start; + u32 uuid_csum; struct btrfs_r5l_io_unit *current_io; }; @@ -1309,7 +1311,7 @@ static int btrfs_r5l_get_meta(struct btrfs_r5l_log *log, struct btrfs_raid_bio * return 0; } -static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u64 location, u64 devid) +static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u64 location, u64 devid, u32 csum) { struct btrfs_r5l_io_unit *io = log->current_io; struct btrfs_r5l_payload *payload; @@ -1326,11 +1328,11 @@ static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u payload->size = cpu_to_le32(16); /* stripe_len / PAGE_SIZE */ payload->devid = cpu_to_le64(devid); payload->location = cpu_to_le64(location); + payload->csum = cpu_to_le32(csum); kunmap(io->meta_page); - /* XXX: add checksum later */ io->meta_offset += sizeof(*payload); - //io->meta_offset += sizeof(__le32); + #ifdef BTRFS_DEBUG_R5LOG trace_printk("io->meta_offset %d\n", io->meta_offset); #endif @@ -1380,6 +1382,10 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int int meta_size; int stripe, pagenr; struct page *page; + char *kaddr; + u32 csum; + u64 location; + u64 devid; /* * parity pages are contiguous on disk, thus only one @@ -1394,8 +1400,6 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int /* add data blocks which need to be 
written */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - u64 location; - u64 devid; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); if (!page) @@ -1406,7 +1410,11 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int #ifdef BTRFS_DEBUG_R5LOG trace_printk("data: stripe %d pagenr %d location 0x%llx devid %llu\n", stripe, pagenr, location, devid); #endif - btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, location, devid); + kaddr = kmap(page); + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + kunmap(page); + + btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, location, devid, csum); btrfs_r5l_append_payload_page(log, page); } } @@ -1414,17 +1422,26 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int /* add the whole parity blocks */ for (; stripe < rbio->real_stripes; stripe++) { - u64 location = btrfs_compute_location(rbio, stripe, 0); - u64 devid = btrfs_compute_devid(rbio, stripe); + location = btrfs_compute_location(rbio, stripe, 0); + devid = btrfs_compute_devid(rbio, stripe); #ifdef BTRFS_DEBUG_R5LOG trace_printk("parity: stripe %d location 0x%llx devid %llu\n", stripe, location, devid); #endif - btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, location, devid); for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { page = rbio_stripe_page(rbio, stripe, pagenr); + + kaddr = kmap(page); + if (pagenr == 0) + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + else + csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE); + kunmap(page); + btrfs_r5l_append_payload_page(log, page); } + + btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, location, devid, csum); } } @@ -1432,12 +1449,16 @@ static void btrfs_r5l_submit_current_io(struct btrfs_r5l_log *log) { struct btrfs_r5l_io_unit *io = log->current_io; struct btrfs_r5l_meta_block *mb; + u32 csum; if (!io) return; mb = 
kmap(io->meta_page); mb->meta_size = cpu_to_le32(io->meta_offset); + ASSERT(mb->csum == 0); + csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE); + mb->csum = cpu_to_le32(csum); kunmap(io->meta_page); log->current_io = NULL; @@ -1506,6 +1527,7 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, { struct page *page; struct btrfs_r5l_meta_block *mb; + u32 csum; int ret = 0; #ifdef BTRFS_DEBUG_R5LOG @@ -1520,6 +1542,9 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, mb->meta_size = cpu_to_le32(sizeof(struct btrfs_r5l_meta_block)); mb->seq = cpu_to_le64(seq); mb->position = cpu_to_le64(pos); + + csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE); + mb->csum = cpu_to_le32(csum); kunmap(page); if (!btrfs_r5l_sync_page_io(log, log->dev, (pos >> 9), PAGE_SIZE, page, REQ_OP_WRITE | REQ_FUA)) { @@ -1607,6 +1632,9 @@ static int btrfs_r5l_recover_read_page(struct btrfs_r5l_recover_ctx *ctx, struct static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) { struct btrfs_r5l_meta_block *mb; + u32 csum; + u32 expected; + int ret = 0; ret = btrfs_r5l_recover_read_page(ctx, ctx->meta_page, ctx->pos); if (ret) @@ -1623,25 +1651,131 @@ static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) #ifdef BTRFS_DEBUG_R5LOG trace_printk("%s: mismatch magic %llu default %llu\n", __func__, le32_to_cpu(mb->magic), BTRFS_R5LOG_MAGIC); #endif - return -EINVAL; + ret = -EINVAL; + goto out; } - ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE); - kunmap(ctx->meta_page); + expected = le32_to_cpu(mb->csum); + /* + * when we calculate mb->csum, it's zero, so we need to zero + * it back. 
+ */ + mb->csum = 0; + csum = btrfs_crc32c(ctx->log->uuid_csum, mb, PAGE_SIZE); + if (csum != expected) { +#ifdef BTRFS_DEBUG_R5LOG + pr_info("%s: mismatch checksum for r5l meta block\n", __func__); +#endif + ret = -EINVAL; + goto out; + } + ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE); /* meta_block */ ctx->total_size = PAGE_SIZE; - return 0; +out: + kunmap(ctx->meta_page); + + return ret; +} + +static int btrfs_r5l_recover_verify_checksum(struct btrfs_r5l_recover_ctx *ctx) +{ + u64 offset; + u32 meta_size; + u64 csum_io_offset; + u64 read_pos; + char *kaddr; + u32 csum; + int type; + struct btrfs_r5l_meta_block *mb; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + struct btrfs_device *dev; + int ret = 0; + + mb = kmap(ctx->meta_page); + meta_size = le32_to_cpu(mb->meta_size); + csum_io_offset = PAGE_SIZE; + + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; + + /* check if there is any invalid device, if so, skip writing this mb. 
*/ + dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL); + if (!dev || dev->missing) { + ret = -EINVAL; + goto out; + } + + type = le16_to_cpu(payload->type); + if (type == R5LOG_PAYLOAD_DATA) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ASSERT(le32_to_cpu(payload->size) == 1); + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } else if (type == R5LOG_PAYLOAD_PARITY) { + int i; + for (i = 0; i < le32_to_cpu(payload->size); i++) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + if (i == 0) + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + else + csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } + } else { + ASSERT(0); + } + + if (csum != le32_to_cpu(payload->csum)) { + trace_printk("r5l data csum fails location 0x%llx devid %llu\n", le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); + ret = -EAGAIN; + goto out; + } + } +out: + kunmap(ctx->meta_page); + return ret; } -static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +static int btrfs_r5l_recover_load_data(struct btrfs_r5l_recover_ctx *ctx) { u64 offset; struct btrfs_r5l_meta_block *mb; - u64 meta_size; + u32 meta_size; u64 io_offset; + u64 read_pos; struct btrfs_device *dev; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + int ret = 0; + + /* if any checksum fails, skip writing this mb. 
*/ + ret = btrfs_r5l_recover_verify_checksum(ctx); + if (ret) + return ret; mb = kmap(ctx->meta_page); @@ -1649,67 +1783,81 @@ static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log, struct btrfs_r offset = sizeof(struct btrfs_r5l_meta_block); meta_size = le32_to_cpu(mb->meta_size); - while (offset < meta_size) { - struct btrfs_r5l_payload *payload = (void *)mb + offset; + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; /* read data from log disk and write to payload->location */ #ifdef BTRFS_DEBUG_R5LOG trace_printk("payload type %d flags %d size %d location 0x%llx devid %llu\n", le16_to_cpu(payload->type), le16_to_cpu(payload->flags), le32_to_cpu(payload->size), le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); #endif + /* liubo: how to handle the case where dev is suddenly off? */ dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL); - if (!dev || dev->missing) { - ASSERT(0); - } + ASSERT(dev && !dev->missing); if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) { - ASSERT(le32_to_cpu(payload->size) == 1); - btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ); - btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE); + read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset); io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) + goto out; + + if (!btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) { + ret = -EIO; + goto out; + } } else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) { int i; - ASSERT(le32_to_cpu(payload->size) == 16); + + ASSERT(offset + sizeof(struct btrfs_r5l_payload) == meta_size); + for (i = 0; i < le32_to_cpu(payload->size); i++) { - /* liubo: parity are guaranteed to 
be - * contiguous, use just one bio to - * hold all pages and flush them. */ u64 parity_off = le64_to_cpu(payload->location) + i * PAGE_SIZE; - btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ); - btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE); + read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset); io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) + goto out; + + if (!btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) { + ret = -EIO; + goto out; + } } } else { ASSERT(0); } - - offset += sizeof(struct btrfs_r5l_payload); } - kunmap(ctx->meta_page); ctx->total_size += (io_offset - PAGE_SIZE); - return 0; +out: + kunmap(ctx->meta_page); + return ret; } -static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_recover_ctx *ctx) { int ret; while (1) { - ret = btrfs_r5l_recover_load_meta(log, ctx); + ret = btrfs_r5l_recover_load_meta(ctx); if (ret) break; - ret = btrfs_r5l_recover_load_data(log, ctx); - ASSERT(!ret || ret > 0); - if (ret) + ret = btrfs_r5l_recover_load_data(ctx); + if (ret && ret != -EAGAIN) break; ctx->seq++; - ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, ctx->total_size); + ctx->pos = btrfs_r5l_ring_add(ctx->log, ctx->pos, ctx->total_size); } - return ret; + return 0; +} static int btrfs_r5l_recover_allocate_ra(struct btrfs_r5l_recover_ctx *ctx) { @@ -1801,6 +1949,7 @@ int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp) struct page *page; struct btrfs_r5l_meta_block *mb; bool create_new = false; + int ret; ASSERT(log); @@ -1856,10 +2005,10 @@ int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp) log->seq = log->last_cp_seq + 1; log->next_checkpoint = cp; } else { - btrfs_r5l_recover_log(log); + ret = btrfs_r5l_recover_log(log); } - return 0; + 
return ret; } /* @@ -3576,6 +3725,8 @@ int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device) log->device_size = round_down(log->device_size, PAGE_SIZE); log->dev = device; log->fs_info = fs_info; + ASSERT(sizeof(device->uuid) == BTRFS_UUID_SIZE); + log->uuid_csum = btrfs_crc32c(~0, device->uuid, sizeof(device->uuid)); mutex_init(&log->io_mutex); cmpxchg(&fs_info->r5log, NULL, log); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 314d299c78c9a8..569cec8d81e205 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -87,6 +87,8 @@ struct btrfs_r5l_payload { /* data or parity */ __le64 location; __le64 devid; + + __le32 csum; }; /* io unit starts from a meta block. */ @@ -96,6 +98,8 @@ struct btrfs_r5l_meta_block { /* the whole size of the block */ __le32 meta_size; + __le32 csum; + __le64 seq; __le64 position; From 0b57507251732b833ffd84445f174ed8d7afed76 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 25 Jul 2017 15:16:39 -0700 Subject: [PATCH 12/14] Btrfs: raid56: fix error handling while adding a log device Currently there is a memory leak if we hit an error while adding a raid5/6 log, and unlike other error paths, the failure does not abort the transaction. Fix the broken error handling by splitting log initialization into two steps: step #1 allocates memory and checks that the log device has a proper size, and step #2 assigns the log pointer in %fs_info. By running step #1 ahead of starting the transaction, we can now gracefully bail out on errors.
Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 48 +++++++++++++++++++++++++++++++++++++++------- fs/btrfs/raid56.h | 5 +++++ fs/btrfs/volumes.c | 36 ++++++++++++++++++++-------------- 3 files changed, 68 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8bc7ba4fa6b089..0bfc97a1bb72ff 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -3711,30 +3711,64 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) async_missing_raid56(rbio); } -int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device) +struct btrfs_r5l_log * btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct block_device *bdev) { - struct btrfs_r5l_log *log; - - log = kzalloc(sizeof(*log), GFP_NOFS); + int num_devices = fs_info->fs_devices->num_devices; + u64 dev_total_bytes; + struct btrfs_r5l_log *log = kzalloc(sizeof(struct btrfs_r5l_log), GFP_NOFS); if (!log) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + + ASSERT(device); + ASSERT(bdev); + dev_total_bytes = i_size_read(bdev->bd_inode); /* see find_free_dev_extent for 1M start offset */ log->data_offset = 1024ull * 1024; - log->device_size = btrfs_device_get_total_bytes(device) - log->data_offset; + log->device_size = dev_total_bytes - log->data_offset; log->device_size = round_down(log->device_size, PAGE_SIZE); + + /* + * when device has been included in fs_devices, do not take + * into account this device when checking log size. 
+ */ + if (device->in_fs_metadata) + num_devices--; + + if (log->device_size < BTRFS_STRIPE_LEN * num_devices * 2) { + btrfs_info(fs_info, "r5log log device size (%llu < %llu) is too small", log->device_size, BTRFS_STRIPE_LEN * num_devices * 2); + kfree(log); + return ERR_PTR(-EINVAL); + } + log->dev = device; log->fs_info = fs_info; ASSERT(sizeof(device->uuid) == BTRFS_UUID_SIZE); log->uuid_csum = btrfs_crc32c(~0, device->uuid, sizeof(device->uuid)); mutex_init(&log->io_mutex); + return log; +} + +void btrfs_r5l_init_log_post(struct btrfs_fs_info *fs_info, struct btrfs_r5l_log *log) +{ cmpxchg(&fs_info->r5log, NULL, log); ASSERT(fs_info->r5log == log); #ifdef BTRFS_DEBUG_R5LOG - trace_printk("r5log: set a r5log in fs_info, alloc_range 0x%llx 0x%llx", + trace_printk("r5log: set a r5log in fs_info, alloc_range 0x%llx 0x%llx\n", log->data_offset, log->data_offset + log->device_size); #endif +} + +int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device) +{ + struct btrfs_r5l_log *log; + + log = btrfs_r5l_init_log_prepare(fs_info, device, device->bdev); + if (IS_ERR(log)) + return PTR_ERR(log); + + btrfs_r5l_init_log_post(fs_info, log); return 0; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 569cec8d81e205..f6d6f368f0338e 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -134,6 +134,11 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +struct btrfs_r5l_log * btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, + struct block_device *bdev); +void btrfs_r5l_init_log_post(struct btrfs_fs_info *fs_info, + struct btrfs_r5l_log *log); int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device); int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 
ac64d93774b219..851c00158ec039 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2327,6 +2327,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path int seeding_dev = 0; int ret = 0; bool is_r5log = (flags & BTRFS_DEVICE_RAID56_LOG); + struct btrfs_r5l_log *r5log = NULL; if (is_r5log) ASSERT(!fs_info->fs_devices->seeding); @@ -2367,6 +2368,15 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (is_r5log) { + r5log = btrfs_r5l_init_log_prepare(fs_info, device, bdev); + if (IS_ERR(r5log)) { + kfree(device); + ret = PTR_ERR(r5log); + goto error; + } + } + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); @@ -2467,12 +2477,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_trans; } - if (is_r5log) { - ret = btrfs_set_r5log(fs_info, device); - if (ret) - goto error_trans; - } - if (seeding_dev) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; @@ -2516,6 +2520,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_commit_transaction(trans); } + if (is_r5log) { + btrfs_r5l_init_log_post(fs_info, r5log); + } + /* Update ctime/mtime for libblkid */ update_dev_time(device_path); return ret; @@ -6701,6 +6709,14 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, } fill_device_from_item(leaf, dev_item, device); + device->in_fs_metadata = 1; + if (device->writeable && !device->is_tgtdev_for_dev_replace) { + device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&fs_info->free_chunk_lock); + fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&fs_info->free_chunk_lock); + } if (device->type & BTRFS_DEV_RAID56_LOG) { ret = btrfs_set_r5log(fs_info, device); @@ -6713,14 +6729,6 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->devid, device->uuid); } - device->in_fs_metadata = 1; - if (device->writeable && 
!device->is_tgtdev_for_dev_replace) { - device->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes - - device->bytes_used; - spin_unlock(&fs_info->free_chunk_lock); - } ret = 0; return ret; } From 9b7a565114495c3181002ad294a94fed851e3146 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 26 Jul 2017 11:11:24 -0700 Subject: [PATCH 13/14] Btrfs: raid56: initialize raid5/6 log after adding it We need to initialize the raid5/6 log after adding it, but we don't want to race with concurrent writes. So we initialize it before assigning the log pointer in %fs_info. Signed-off-by: Liu Bo --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/raid56.c | 18 ++++++++++++++++-- fs/btrfs/raid56.h | 3 ++- fs/btrfs/volumes.c | 2 ++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c2d8697f13c801..3fbd34799d0f37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3000,7 +3000,7 @@ int open_ctree(struct super_block *sb, * write-ahead log, the fsync'd data will never ends * up with being replayed by raid56 log. 
*/ - btrfs_r5l_load_log(fs_info, cp); + btrfs_r5l_load_log(fs_info, NULL, cp); } ret = btrfs_recover_balance(fs_info); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0bfc97a1bb72ff..b771d7d98f3d19 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1943,14 +1943,28 @@ static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log) } /* return 0 if success, otherwise return errors */ -int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp) +int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, struct btrfs_r5l_log *r5log, u64 cp) { - struct btrfs_r5l_log *log = fs_info->r5log; + struct btrfs_r5l_log *log; struct page *page; struct btrfs_r5l_meta_block *mb; bool create_new = false; int ret; + if (r5log) + ASSERT(fs_info->r5log == NULL); + if (fs_info->r5log) + ASSERT(r5log == NULL); + + if (fs_info->r5log) + log = fs_info->r5log; + else + /* + * this only happens when adding the raid56 log for + * the first time. + */ + log = r5log; + ASSERT(log); page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index f6d6f368f0338e..2cc64a3627ceac 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -140,5 +140,6 @@ struct btrfs_r5l_log * btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, void btrfs_r5l_init_log_post(struct btrfs_fs_info *fs_info, struct btrfs_r5l_log *log); int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device); -int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp); +int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, + struct btrfs_r5l_log *r5log, u64 cp); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 851c00158ec039..7f848d79ef513b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2521,6 +2521,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } if (is_r5log) { + /* initialize r5log with cp == 0. 
*/ + btrfs_r5l_load_log(fs_info, r5log, 0); btrfs_r5l_init_log_post(fs_info, r5log); } From 397a805d9baadf1916142a890a63e11a0da0a1e2 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Jul 2017 11:03:19 -0700 Subject: [PATCH 14/14] Btrfs: raid56: maintain IO order on raid5/6 log A typical write to the raid5/6 log needs three steps: 1) collect data/parity pages into the bio in io_unit; 2) submit the bio in io_unit; 3) writeback data/parity to raid array in end_io. 1) and 2) are protected within log->io_mutex, while 3) is not. Since recovery needs to know the checkpoint offset where the highest successful writeback is, we cannot allow IO to be reordered. This is adding a list in which IO order is maintained properly. Signed-off-by: Liu Bo --- fs/btrfs/raid56.c | 42 ++++++++++++++++++++++++++++++++++-------- fs/btrfs/raid56.h | 5 +++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index b771d7d98f3d19..ceca41537dddc3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -183,6 +183,9 @@ struct btrfs_r5l_log { /* protect this struct and log io */ struct mutex io_mutex; + spinlock_t io_list_lock; + struct list_head io_list; + /* r5log device */ struct btrfs_device *dev; @@ -1205,6 +1208,7 @@ static struct btrfs_r5l_io_unit *btrfs_r5l_alloc_io_unit(struct btrfs_r5l_log *l static void btrfs_r5l_free_io_unit(struct btrfs_r5l_log *log, struct btrfs_r5l_io_unit *io) { __free_page(io->meta_page); + ASSERT(list_empty(&io->list)); kfree(io); } @@ -1225,6 +1229,27 @@ static void btrfs_r5l_reserve_log_entry(struct btrfs_r5l_log *log, struct btrfs_ io->need_split_bio = true; } +/* the IO order is maintained in log->io_list. 
*/ +static void btrfs_r5l_finish_io(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io, *next; + + spin_lock(&log->io_list_lock); + list_for_each_entry_safe(io, next, &log->io_list, list) { + if (io->status != BTRFS_R5L_STRIPE_END) + break; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("current log->next_checkpoint %llu (will be %llu after writing to RAID\n", log->next_checkpoint, io->log_start); +#endif + + list_del_init(&io->list); + log->next_checkpoint = io->log_start; + btrfs_r5l_free_io_unit(log, io); + } + spin_unlock(&log->io_list_lock); +} + static void btrfs_write_rbio(struct btrfs_raid_bio *rbio); static void btrfs_r5l_log_endio(struct bio *bio) @@ -1234,18 +1259,12 @@ static void btrfs_r5l_log_endio(struct bio *bio) bio_put(bio); -#ifdef BTRFS_DEBUG_R5LOG - trace_printk("move data to disk(current log->next_checkpoint %llu (will be %llu after writing to RAID\n", log->next_checkpoint, io->log_start); -#endif /* move data to RAID. */ btrfs_write_rbio(io->rbio); + io->status = BTRFS_R5L_STRIPE_END; /* After stripe data has been flushed into raid, set ->next_checkpoint. 
*/ - log->next_checkpoint = io->log_start; - - if (log->current_io == io) - log->current_io = NULL; - btrfs_r5l_free_io_unit(log, io); + btrfs_r5l_finish_io(log); } static struct bio *btrfs_r5l_bio_alloc(struct btrfs_r5l_log *log) @@ -1299,6 +1318,11 @@ static struct btrfs_r5l_io_unit *btrfs_r5l_new_meta(struct btrfs_r5l_log *log) bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); btrfs_r5l_reserve_log_entry(log, io); + + INIT_LIST_HEAD(&io->list); + spin_lock(&log->io_list_lock); + list_add_tail(&io->list, &log->io_list); + spin_unlock(&log->io_list_lock); return io; } @@ -3760,6 +3784,8 @@ struct btrfs_r5l_log * btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, ASSERT(sizeof(device->uuid) == BTRFS_UUID_SIZE); log->uuid_csum = btrfs_crc32c(~0, device->uuid, sizeof(device->uuid)); mutex_init(&log->io_mutex); + spin_lock_init(&log->io_list_lock); + INIT_LIST_HEAD(&log->io_list); return log; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 2cc64a3627ceac..fc4ff20346778f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -43,11 +43,16 @@ static inline int nr_data_stripes(struct map_lookup *map) struct btrfs_r5l_log; #define BTRFS_R5LOG_MAGIC 0x6433c509 +#define BTRFS_R5L_STRIPE_END 1 + /* one meta block + several data + parity blocks */ struct btrfs_r5l_io_unit { struct btrfs_r5l_log *log; struct btrfs_raid_bio *rbio; + struct list_head list; + int status; + /* store meta block */ struct page *meta_page;