From 219d33b26ac498ca472a02b9217b4e450474fc90 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 1 Sep 2017 16:14:26 -0600 Subject: [PATCH 001/109] Btrfs: remove batch plug in run_scheduled_IO Block layer has a limit on plug, ie. BLK_MAX_REQUEST_COUNT == 16, so we don't gain benefits by batching 64 bios here. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b39737568c223c..b92e82d4413f2a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -517,12 +517,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) &device->work); goto done; } - /* unplug every 64 requests just for good measure */ - if (batch_run % 64 == 0) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } } cond_resched(); From 69cc7151ee8055e6bf7241a21211231a1a208abd Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 1 Sep 2017 16:14:28 -0600 Subject: [PATCH 002/109] Btrfs: move finish_wait out of the loop If we're still going to wait after schedule(), we don't have to do finish_wait() to remove our %wait_queue_entry since prepare_to_wait() won't add the same %wait_queue_entry twice. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c7a49faf4e063..1d4f8a7a128c91 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -622,8 +622,8 @@ static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root) if (writers) schedule(); - finish_wait(&root->subv_writers->wait, &wait); } while (writers); + finish_wait(&root->subv_writers->wait, &wait); } static int create_snapshot(struct btrfs_root *root, struct inode *dir, From 45bac0f3d24a76f127a118e7b95a54e616449d16 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 1 Sep 2017 16:14:29 -0600 Subject: [PATCH 003/109] Btrfs: use wait_event instead of a single function Since TASK_UNINTERRUPTIBLE has been used here, wait_event() can do the same job. 
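For reference, a minimal sketch of the two forms (illustrative only, not part of the patch; "wq" stands in for root->subv_writers->wait and writers_pending() for the percpu counter check):

	/* open-coded wait loop being replaced (simplified) */
	DEFINE_WAIT(wait);
	do {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (writers_pending())
			schedule();
	} while (writers_pending());
	finish_wait(&wq, &wait);

	/* equivalent helper; it also sleeps in TASK_UNINTERRUPTIBLE */
	wait_event(wq, !writers_pending());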
Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1d4f8a7a128c91..95bd5336e85ba0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -609,23 +609,6 @@ static noinline int create_subvol(struct inode *dir, return ret; } -static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root) -{ - s64 writers; - DEFINE_WAIT(wait); - - do { - prepare_to_wait(&root->subv_writers->wait, &wait, - TASK_UNINTERRUPTIBLE); - - writers = percpu_counter_sum(&root->subv_writers->counter); - if (writers) - schedule(); - - } while (writers); - finish_wait(&root->subv_writers->wait, &wait); -} - static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, u64 *async_transid, bool readonly, @@ -654,7 +637,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshotted); smp_mb__after_atomic(); - btrfs_wait_for_no_snapshotting_writes(root); + /* wait for no snapshot writes */ + wait_event(root->subv_writers->wait, + percpu_counter_sum(&root->subv_writers->counter) == 0); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) From 49e83f5735cf1aca03ca8b4eadca475c1f3b0b14 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 1 Sep 2017 16:14:30 -0600 Subject: [PATCH 004/109] Btrfs: protect conditions within root->log_mutex while waiting Both wait_for_commit() and wait_for_writer() are checking the condition out of the mutex lock. This refactors code a bit to be lock safe. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c800d067fcbf0c..963f22b1750868 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2699,34 +2699,36 @@ static void wait_log_commit(struct btrfs_root *root, int transid) * so we know that if ours is more than 2 older than the * current transaction, we're done */ - do { + for (;;) { prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&root->log_mutex); - if (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])) - schedule(); + if (!(root->log_transid_committed < transid && + atomic_read(&root->log_commit[index]))) + break; - finish_wait(&root->log_commit_wait[index], &wait); + mutex_unlock(&root->log_mutex); + schedule(); mutex_lock(&root->log_mutex); - } while (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])); + } + finish_wait(&root->log_commit_wait[index], &wait); } static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); - while (atomic_read(&root->log_writers)) { - prepare_to_wait(&root->log_writer_wait, - &wait, TASK_UNINTERRUPTIBLE); + for (;;) { + prepare_to_wait(&root->log_writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (!atomic_read(&root->log_writers)) + break; + mutex_unlock(&root->log_mutex); - if (atomic_read(&root->log_writers)) - schedule(); - finish_wait(&root->log_writer_wait, &wait); + schedule(); mutex_lock(&root->log_mutex); } + finish_wait(&root->log_writer_wait, &wait); } static inline void btrfs_remove_log_ctx(struct btrfs_root *root, From f78541ddb1da92050a8315cbd09fb5fe7735d150 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 31 Aug 2017 11:48:51 +0300 Subject: [PATCH 005/109] btrfs: Remove redundant forward 
declarations Some static functions are needlessly forward declared. Let's remove those declarations since they add no value. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e2d7e86b51d1b2..7697c03bdae64b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -61,9 +61,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -91,17 +88,8 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); From 012e513e1bfeb39e1ce393f5a444f772ea27e954 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 30 Aug 2017 22:36:48 +0800 Subject: [PATCH 006/109] btrfs: declare TRACE_DEFINE_ENUM for each of show_flush_state enum So that perf can show the state symbol. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index dc1d0df91e0b0e..77437f545c639c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -28,6 +28,13 @@ struct btrfs_qgroup_extent_record; struct btrfs_qgroup; struct prelim_ref; +TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR); +TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS); +TRACE_DEFINE_ENUM(FLUSH_DELALLOC); +TRACE_DEFINE_ENUM(FLUSH_DELALLOC_WAIT); +TRACE_DEFINE_ENUM(ALLOC_CHUNK); +TRACE_DEFINE_ENUM(COMMIT_TRANS); + #define show_ref_type(type) \ __print_symbolic(type, \ { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ From c9162bdfd6174edeaa276a4c235851113625c555 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 22 Aug 2017 23:46:04 -0700 Subject: [PATCH 007/109] Btrfs: make some volumes.c functions static These aren't used outside of volumes.c. 
Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b92e82d4413f2a..533b417d9342fd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -541,7 +541,7 @@ static void pending_bios_fn(struct btrfs_work *work) } -void btrfs_free_stale_device(struct btrfs_device *cur_dev) +static void btrfs_free_stale_device(struct btrfs_device *cur_dev) { struct btrfs_fs_devices *fs_devs; struct btrfs_device *dev; @@ -1062,14 +1062,15 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -void btrfs_release_disk_super(struct page *page) +static void btrfs_release_disk_super(struct page *page) { kunmap(page); put_page(page); } -int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, - struct page **page, struct btrfs_super_block **disk_super) +static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, + struct btrfs_super_block **disk_super) { void *p; pgoff_t index; @@ -1811,8 +1812,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, return 0; } -struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, - struct btrfs_device *device) +static struct btrfs_device * btrfs_find_next_active_device( + struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) { struct btrfs_device *next_device; From 718dc5fade1a97cff58aa6581d05c0ab8a9fcf1d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 22 Aug 2017 23:46:05 -0700 Subject: [PATCH 008/109] Btrfs: fix __user casting in ioctl.c Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 95bd5336e85ba0..a74ed6c12d6a34 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2164,7 +2164,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, - (char *)(&uarg->buf[0])); + (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; else if (ret == -EOVERFLOW && @@ -4489,8 +4489,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)(unsigned long)ipa->fspath, - (void *)(unsigned long)ipath->fspath, size); + ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, + ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -4561,8 +4561,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, if (ret < 0) goto out; - ret = copy_to_user((void *)(unsigned long)loi->inodes, - (void *)(unsigned long)inodes, size); + ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, + size); if (ret) ret = -EFAULT; From ee87cf5ed9fe8e41d057f5f5c7f49c18b6d3f94c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 1 Aug 2017 18:35:08 +0800 Subject: [PATCH 009/109] btrfs: copy fsid to super_block s_uuid We didn't copy fsid to struct super_block.s_uuid so Overlay disables index feature with btrfs as the lower FS. kernel: overlayfs: fs on '/lower' does not support file handles, falling back to index=off. Fix this by publishing the fsid through struct super_block.s_uuid. [ dsterba: I think that setting s_uuid is the last missing bit. 
Overlay needs the file handle encoding support from the lower filesystem, which is supported. Filling the whole filesystem id is correct, the subvolume id is encoded in the file handle buffer from inside btrfs_encode_fh. ] Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dfdab849037b70..0a98567f8cfc74 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2901,6 +2901,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); + memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); From 9cd3a7eb8507deb3c75077739a07ccccad0dc991 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 3 Aug 2017 13:53:31 -0600 Subject: [PATCH 010/109] Btrfs: search parity device wisely After mapping block with BTRFS_MAP_WRITE, parities have been sorted to the end position, so this search can start from the first parity stripe. Signed-off-by: Liu Bo Reviewed-by: David Sterba [ copied changelog as a comment ] Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24a62224b24b1b..cba5e7339462bc 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2231,12 +2231,18 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(!bio->bi_iter.bi_size); rbio->operation = BTRFS_RBIO_PARITY_SCRUB; - for (i = 0; i < rbio->real_stripes; i++) { + /* + * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * to the end position, so this search can start from the first parity + * stripe. + */ + for (i = rbio->nr_data; i < rbio->real_stripes; i++) { if (bbio->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } } + ASSERT(i < rbio->real_stripes); /* Now we just support the sectorsize equals to page size */ ASSERT(fs_info->sectorsize == PAGE_SIZE); From 9b4a9b283d5efbe9a221e2282e41e272e9db5ae9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 18 Aug 2017 11:54:02 -0600 Subject: [PATCH 011/109] Btrfs: do not async submit for nodatasum inodes While we submit direct writes, if the inode is flagged with nodatasum, there's no benefit to submit asynchronously, because a) we don't have to calculate checksum across processors, b) and direct IO has started a plug, but async submit makes us queue IO on each device's scheduled IO list instead of DIO's plug list, so that IOs get much less merges in general. Lets use sync submit for nodatasum inodes. 
Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d94e3f68b9b134..0460c5431a72ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8577,7 +8577,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, goto err; } map: - ret = btrfs_map_bio(fs_info, bio, 0, async_submit); + ret = btrfs_map_bio(fs_info, bio, 0, 0); err: bio_put(bio); return ret; From 8ca199501ec1695273f158cf92def37480e23f32 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 21 Aug 2017 12:43:43 +0300 Subject: [PATCH 012/109] btrfs: Remove unused variable Src was initially part of 31ff1cd25d37 ("Btrfs: Copy into the log tree in big batches"), however 16e7549f045d ("Btrfs: incompatible format change to remove hole extents") changed parameters passed to copy_items which made the src variable redundant. Signed-off-by: Nikolay Borisov Reviewed-by: Timofey Titovets Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 963f22b1750868..1036ac7313a7d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4647,7 +4647,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - struct extent_buffer *src = NULL; LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; @@ -4890,7 +4889,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, goto next_slot; } - src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; From a0357511f29f9b83e2c0d12621c339a5900eaa5a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 21 Aug 2017 12:43:44 +0300 Subject: [PATCH 013/109] btrfs: Remove unused parameters from various functions iterate_dir_item:found_key - introduced in 31db9f7c23fb ("Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive"), yet never used. record_ref:num - ditto This is a first pass with the low-hanging fruit. There are still quite a few unsued parameters in some function which have to abide by a callback interface. Signed-off-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8fd195cfe81b97..e4ad2fa82acf56 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -992,7 +992,6 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, * path must point to the dir item when called. 
*/ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { int ret = 0; @@ -4106,8 +4105,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) return ret; } -static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, - struct fs_path *name, void *ctx, struct list_head *refs) +static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, + void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4143,8 +4142,7 @@ static int __record_new_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, num, dir, index, name, - ctx, &sctx->new_refs); + return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); } @@ -4153,8 +4151,8 @@ static int __record_deleted_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, num, dir, index, name, - ctx, &sctx->deleted_refs); + return record_ref(sctx->parent_root, dir, name, ctx, + &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) @@ -4498,7 +4496,7 @@ static int process_new_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + __process_new_xattr, sctx); return ret; } @@ -4506,7 +4504,7 @@ static int process_new_xattr(struct send_ctx *sctx) static int process_deleted_xattr(struct send_ctx *sctx) { return iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + __process_deleted_xattr, sctx); } struct find_xattr_ctx { @@ -4551,7 +4549,7 @@ static int find_xattr(struct btrfs_root *root, ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) return ret; @@ -4621,11 +4619,11 @@ static int process_changed_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_changed_new_xattr, sctx); + __process_changed_new_xattr, sctx); if (ret < 0) goto out; ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_changed_deleted_xattr, sctx); + __process_changed_deleted_xattr, sctx); out: return ret; @@ -4675,8 +4673,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) goto out; } - ret = iterate_dir_item(root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) goto out; From ee8c494f88736c1a1873fdd65559828f9a734bcf Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 21 Aug 2017 12:43:45 +0300 Subject: [PATCH 014/109] btrfs: Remove unused arguments from btrfs_changed_cb_t btrfs_changed_cb_t represents the signature of the callback being passed to btrfs_compare_trees. Currently there is only one such callback, namely changed_cb in send.c. This function doesn't really uses the first 2 parameters, i.e. the roots. Since there are not other functions implementing the btrfs_changed_cb_t let's remove the unused parameters from the prototype and implementation. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 15 +++++---------- fs/btrfs/ctree.h | 4 +--- fs/btrfs/send.c | 8 +++----- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d49db7d86be29..19b9c5131745a1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5496,8 +5496,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } else if (left_end_reached) { if (right_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5508,8 +5507,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, continue; } else if (right_end_reached) { if (left_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5523,8 +5521,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, if (left_level == 0 && right_level == 0) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5532,8 +5529,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_left = ADVANCE; } else if (cmp > 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5550,8 +5546,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_CHANGED; else result = BTRFS_COMPARE_TREE_SAME; - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, result, ctx); if (ret < 0) goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fc690384c5857..16b3537f31d484 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2821,9 +2821,7 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e4ad2fa82acf56..07445be8c1cc70 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6159,9 +6159,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. 
*/ -static int changed_cb(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, @@ -6243,8 +6241,8 @@ static int full_send_tree(struct send_ctx *sctx) slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); - ret = changed_cb(send_root, NULL, path, NULL, - &found_key, BTRFS_COMPARE_TREE_NEW, sctx); + ret = changed_cb(path, NULL, &found_key, + BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; From 8c70c9f81eac82f497a37e901fb09c1cc2f0733f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 21 Aug 2017 12:43:46 +0300 Subject: [PATCH 015/109] btrfs: Remove unused parameter from check_direct_IO Introduced by 5a5f79b57069 ("Btrfs: allow unaligned DIO") and never used. The buffered fallback from unaligned DIO works as expected. Signed-off-by: Nikolay Borisov Reviewed-by: Timofey Titovets Reviewed-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0460c5431a72ef..ef3915666ebfd2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8786,7 +8786,6 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - struct kiocb *iocb, const struct iov_iter *iter, loff_t offset) { int seg; @@ -8833,7 +8832,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) bool relock = false; ssize_t ret; - if (check_direct_IO(fs_info, iocb, iter, offset)) + if (check_direct_IO(fs_info, iter, offset)) return 0; inode_dio_begin(inode); From 1efb72a3c3ea8e430d1984774692ecbe3235600a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 21 Aug 2017 12:43:49 +0300 Subject: [PATCH 016/109] btrfs: Rework error handling of add_extent_mapping in __btrfs_alloc_chunk Currently the code executes add_extent_mapping and if it is successful it links the new mapping, it then proceeds to unlock the extent mapping tree and check for failure and handle them. Instead, rework the code to only perform a single check if add_extent_mapping has failed and handle it, otherwise the code continues in a linear fashion. 
No functional changes Signed-off-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 533b417d9342fd..ac1e8686ba7815 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4808,16 +4808,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em_tree = &info->mapping_tree.map_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - if (!ret) { - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); - } - write_unlock(&em_tree->lock); if (ret) { + write_unlock(&em_tree->lock); free_extent_map(em); goto error; } + list_add_tail(&em->list, &trans->transaction->pending_chunks); + refcount_inc(&em->refs); + write_unlock(&em_tree->lock); + ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); if (ret) goto error_del_extent; From c434d21c6442b107af7030be2d716cf6c89c5606 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 21 Aug 2017 12:43:50 +0300 Subject: [PATCH 017/109] btrfs: Remove redundant argument of __link_block_group __link_block_group is called from only 2 places and at each call site the space_info being passed is the same as the space info assigned to the passed cache struct. Let's remove the redundant argument and make the function reference the space_info from the passed block_group_cache. No functional changes Signed-off-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: David Sterba [ renamed to link_block_group ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7697c03bdae64b..18f8274b8e3b39 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9956,9 +9956,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } -static void __link_block_group(struct btrfs_space_info *space_info, - struct btrfs_block_group_cache *cache) +static void link_block_group(struct btrfs_block_group_cache *cache) { + struct btrfs_space_info *space_info = cache->space_info; int index = get_block_group_index(cache); bool first = false; @@ -10166,7 +10166,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) cache->space_info = space_info; - __link_block_group(space_info, cache); + link_block_group(cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_readonly(info, cache->key.objectid)) { @@ -10325,7 +10325,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->bytes_super, &cache->space_info); update_global_block_rsv(fs_info); - __link_block_group(cache->space_info, cache); + link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); From 9ca2e97fa3c3216200afe35a3b111ec51cc796d2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 10 Sep 2017 13:19:38 +0200 Subject: [PATCH 018/109] btrfs: tests: Fix a memory leak in error handling path in 'run_test()' If 'btrfs_alloc_path()' fails, we must free the resources already allocated, as done in the other error handling paths in this function. 
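A hedged sketch of the general shape of the fix (alloc_a/alloc_b/free_a are hypothetical stand-ins for the test's real allocations):

	a = alloc_a();
	if (!a)
		return -ENOMEM;		/* nothing allocated yet, a plain return is fine */

	b = alloc_b();
	if (!b) {
		ret = -ENOMEM;
		goto out;		/* a plain return here would leak 'a' */
	}
	/* rest of the test */
out:
	free_a(a);
	return ret;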
Signed-off-by: Christophe JAILLET Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/free-space-tree-tests.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1458bb0ea124a3..8444a018cca292 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -500,7 +500,8 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, path = btrfs_alloc_path(); if (!path) { test_msg("Couldn't allocate path\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } ret = add_block_group_free_space(&trans, root->fs_info, cache); From fa0d0888bda13bc96d99024cba40ae3a135de385 Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Sat, 9 Sep 2017 15:29:34 +0100 Subject: [PATCH 019/109] btrfs: Clean up dead code in root-tree The value of variable 'can_recover' is never used after being set, thus it should be removed, as it was never used since the first commit 68a7342c51c95042 ("Btrfs: cleanup orphaned root orphan item"). Signed-off-by: Christos Gkekas Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 95bcc3cce78f80..3338407ef0f06c 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -226,10 +226,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; int err = 0; int ret; - bool can_recover = true; - - if (sb_rdonly(fs_info->sb)) - can_recover = false; path = btrfs_alloc_path(); if (!path) From 3993b112dac968612b0b213ed59cb30f50b0015b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 11 Sep 2017 16:15:28 +0100 Subject: [PATCH 020/109] btrfs: avoid null pointer dereference on fs_info when calling btrfs_crit There are checks on fs_info in __btrfs_panic to avoid dereferencing a null fs_info, however, there is a call to btrfs_crit that may also dereference a null fs_info. Fix this by adding a check to see if fs_info is null and only print the s_id if fs_info is non-null. Detected by CoverityScan CID#401973 ("Dereference after null check") Fixes: efe120a067c8 ("Btrfs: convert printk to btrfs_ and fix BTRFS prefix") Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 161694b660385a..e8f5e24325f338 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,7 +202,6 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { - struct super_block *sb = fs_info->sb; char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; @@ -228,7 +227,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) vaf.va = &args; if (__ratelimit(ratelimit)) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, + fs_info ? 
fs_info->sb->s_id : "", &vaf); va_end(args); } From d8953d69bca83fab616f37693152384cea49f964 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Tue, 12 Sep 2017 20:08:08 +0900 Subject: [PATCH 021/109] btrfs: convert all mount option checking code to use btrfs_test_opt Signed-off-by: Satoru Takeuchi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 18f8274b8e3b39..eccaac17258dd0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e8f5e24325f338..f092fe057de78d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -292,7 +292,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, vaf.va = &args; errstr = btrfs_decode_error(errno); - if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); From 6300463b14c1c2665674eb8f15843e5bb7a7ff84 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 21 Aug 2017 15:49:59 -0600 Subject: [PATCH 022/109] Btrfs: make plug in writing meta blocks really work We have started plug in btrfs_write_and_wait_marked_extents() but the generated IOs actually go to device's schedule IO list where the work is doing in another task, thus the started plug doesn't make any sense. And since we wait for IOs immediately after writing meta blocks, it's the same case as writing log tree, doing sync submit can merge more IOs. 
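For context, a minimal sketch of the plugging idiom the patch wants to benefit from (illustrative only; the actual change just bumps ->sync_writers so btree bios are submitted from the plugged task instead of being handed to the per-device worker):

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* bios submitted from this task are held in a per-task list ... */
	submit_bio(bio);
	blk_finish_plug(&plug);	/* ... and issued together, so they can be merged */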
Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++++-- fs/btrfs/transaction.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0a98567f8cfc74..23442747363ed9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1005,8 +1005,10 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, return ret; } -static int check_async_write(unsigned long bio_flags) +static int check_async_write(struct btrfs_inode *bi, unsigned long bio_flags) { + if (atomic_read(&bi->sync_writers)) + return 0; if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 @@ -1022,7 +1024,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(bio_flags); + int async = check_async_write(BTRFS_I(inode), bio_flags); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f615d59b0489e2..9c5f126064bdee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -950,6 +950,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; + atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; @@ -985,6 +986,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, cond_resched(); start = end + 1; } + atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); return werr; } From 18fdc67900c5bfd1eeb41cfa50ea6f2eb7266f73 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 13 Sep 2017 12:18:22 -0600 Subject: [PATCH 023/109] Btrfs: remove bio_flags which indicates a meta block of log-tree Since both committing transaction and writing log-tree are doing plugging on metadata IO, we can unify to use %sync_writers to benefit both cases, instead of checking bio_flags while writing meta blocks of log-tree. We can remove this bio_flags because in order to write dirty blocks, log tree also uses btrfs_write_marked_extents(), inside which we have enabled %sync_writers, therefore, every write goes in a synchronous way, so does checksuming. Please also note that, bio_flags is applied per-context while %sync_writers is applied per-inode, so this might incur some overhead, ie. 1) while log tree is flushing its dirty blocks via btrfs_write_marked_extents(), in which %sync_writers is increased by one. 2) in the meantime, some writeback operations may happen upon btrfs's metadata inode, so these writes go synchronously, too. However, AFAICS, the overhead is not a big one while the win is that we unify the two places that needs synchronous way and remove a special hack/flag. This removes the bio_flags related stuff for writing log-tree. 
Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++---- fs/btrfs/extent_io.c | 13 ++----------- fs/btrfs/extent_io.h | 1 - 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23442747363ed9..90310d9e647a80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1005,12 +1005,10 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, return ret; } -static int check_async_write(struct btrfs_inode *bi, unsigned long bio_flags) +static int check_async_write(struct btrfs_inode *bi) { if (atomic_read(&bi->sync_writers)) return 0; - if (bio_flags & EXTENT_BIO_TREE_LOG) - return 0; #ifdef CONFIG_X86 if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; @@ -1024,7 +1022,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(BTRFS_I(inode), bio_flags); + int async = check_async_write(BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 970190cd347edd..0dfcef37352b01 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -109,7 +109,6 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; - unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -3715,7 +3714,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; u32 nritems; unsigned long i, num_pages; - unsigned long bio_flags = 0; unsigned long start, end; unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; @@ -3723,8 +3721,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); - if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) - bio_flags = EXTENT_BIO_TREE_LOG; /* set btree blocks beyond nritems with 0 to avoid stale content. 
*/ nritems = btrfs_header_nritems(eb); @@ -3751,8 +3747,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, p, offset >> 9, PAGE_SIZE, 0, bdev, &epd->bio, end_bio_extent_buffer_writepage, - 0, epd->bio_flags, bio_flags, false); - epd->bio_flags = bio_flags; + 0, 0, 0, false); if (ret) { set_btree_ioerr(p); if (PageWriteback(p)) @@ -3789,7 +3784,6 @@ int btree_write_cache_pages(struct address_space *mapping, .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; int ret = 0; int done = 0; @@ -4062,7 +4056,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->bio) { int ret; - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, 0); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4085,7 +4079,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -4110,7 +4103,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, - .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -4150,7 +4142,6 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index faffa28ba707f3..861dacb371c7f9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -33,7 +33,6 @@ * type for this bio */ #define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_TREE_LOG 2 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ From 6939f667247ed792535c0697a99f600d6770f127 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 13 Sep 2017 12:09:28 -0600 Subject: [PATCH 024/109] Btrfs: fix confusing worker helper info in stacktrace We've seen the following backtrace stack in ftrace or dmesg log, kworker/u16:10-4244 [000] 241942.480955: function: btrfs_put_ordered_extent kworker/u16:10-4244 [000] 241942.480956: kernel_stack: => finish_ordered_fn (ffffffffa0384475) => btrfs_scrubparity_helper (ffffffffa03ca577) <-----"incorrect" => btrfs_freespace_write_helper (ffffffffa03ca98e) <-----"correct" => process_one_work (ffffffff81117b2f) => worker_thread (ffffffff81118c2a) => kthread (ffffffff81121de0) => ret_from_fork (ffffffff81d7087a) btrfs_freespace_write_helper is actually calling normal_worker_helper instead of btrfs_scrubparity_helper, so somehow kernel has parsed the incorrect function address while unwinding the stack, btrfs_scrubparity_helper really shouldn't be shown up. It's caused by compiler doing inline for our helper function, adding a noinline tag can fix that. 
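For reference, noinline_for_stack is the plain noinline attribute under a self-documenting name; a sketch with a hypothetical helper:

	/* Kept out of line on purpose, so the unwinder attributes the frame
	 * to this helper's own symbol rather than to whatever it would have
	 * been inlined into. */
	static noinline_for_stack void my_helper(struct work_struct *arg)
	{
		/* work goes here */
	}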
Signed-off-by: Liu Bo Reviewed-by: David Sterba [ use noinline_for_stack ] Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index e00c8a9fd5bbc0..d5540749f0e543 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -67,7 +67,7 @@ struct btrfs_workqueue { static void normal_work_helper(struct btrfs_work *work); #define BTRFS_WORK_HELPER(name) \ -void btrfs_##name(struct work_struct *arg) \ +noinline_for_stack void btrfs_##name(struct work_struct *arg) \ { \ struct btrfs_work *work = container_of(arg, struct btrfs_work, \ normal_work); \ From 3afb0c501414aa3c4a7cfea9ae1a0acfbf7be14f Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Wed, 20 Sep 2017 11:47:46 +0530 Subject: [PATCH 025/109] btrfs: return -ENOMEM on allocation failure in btrfsic Forward the correct return value -ENOMEM from btrfsic_dev_state_alloc() too. Signed-off-by: Allen Pais Reviewed-by: Anand Jain Reviewed-by: David Sterba [ adjust changelog ] Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7d5a9b51f0d7a8..9db1e7602dfcb9 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2913,7 +2913,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { pr_info("btrfs check-integrity: allocation failed!\n"); - return -1; + return -ENOMEM; } if (!btrfsic_is_initialized) { @@ -2945,7 +2945,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, if (NULL == ds) { pr_info("btrfs check-integrity: kmalloc() failed!\n"); mutex_unlock(&btrfsic_mutex); - return -1; + return -ENOMEM; } ds->bdev = device->bdev; ds->state = state; From 315d8e98aa4fb67e2765b4c8df10fdf0c0bf0bfb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 Sep 2017 16:01:23 +0100 Subject: [PATCH 026/109] btrfs: make array types static const, reduces object code size Don't populate the read-only array types on the stack, instead make it static const. 
Makes the object code smaller by nearly 60 bytes: Before: text data bss dec hex filename 90536 6552 64 97152 17b80 fs/btrfs/ioctl.o After: text data bss dec hex filename 90414 6616 64 97094 17b46 fs/btrfs/ioctl.o Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a74ed6c12d6a34..feab6f61cb9775 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4114,10 +4114,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, - BTRFS_BLOCK_GROUP_SYSTEM, - BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + static const u64 types[] = { + BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA + }; int num_types = 4; int alloc_size; int ret = 0; From 785884fc31883a11e88710d89c525bb7f76e33df Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 22 Sep 2017 12:11:18 -0600 Subject: [PATCH 027/109] Btrfs: fix memory leak in raid56 The local bio_list may have pending bios when doing cleanup, it can end up with memory leak if they don't get freed. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index cba5e7339462bc..a7f79254eccaff 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1326,6 +1326,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } /* @@ -1582,6 +1585,10 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; finish: @@ -2107,6 +2114,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; } @@ -2460,6 +2471,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2569,12 +2583,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int stripe; struct bio *bio; + bio_list_init(&bio_list); + ret = alloc_rbio_essential_pages(rbio); if (ret) goto cleanup; - bio_list_init(&bio_list); - atomic_set(&rbio->error, 0); /* * build a list of bios to read all the missing parts of this @@ -2642,6 +2656,10 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return; finish: From eef16ba269ea1d55ca1b4eab8d91f02e185835c9 Mon Sep 17 00:00:00 2001 From: Kuanling Huang Date: Fri, 15 Sep 2017 16:47:45 +0800 Subject: [PATCH 028/109] Btrfs: send, apply asynchronous page cache readahead to enhance page read By analyzing the perf on btrfs send, we found it take large 
amount of cpu time on page_cache_sync_readahead. This effort can be reduced after switching to asynchronous one. Overall performance gain on HDD and SSD were 9 and 15 percent if simply send a large file. Signed-off-by: Kuanling Huang Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 07445be8c1cc70..0746eda7231d2d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4720,16 +4720,27 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) /* initial readahead */ memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, inode->i_mapping); - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index, - last_index - index + 1); while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); + + page = find_lock_page(inode->i_mapping, index); if (!page) { - ret = -ENOMEM; - break; + page_cache_sync_readahead(inode->i_mapping, &sctx->ra, + NULL, index, last_index + 1 - index); + + page = find_or_create_page(inode->i_mapping, index, + GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, &sctx->ra, + NULL, page, index, last_index + 1 - index); } if (!PageUptodate(page)) { From 996478ca9c460886ac147eb0d00e99841b71d31b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Aug 2017 16:00:39 -0400 Subject: [PATCH 029/109] btrfs: change how we decide to commit transactions during flushing Nikolay reported that generic/273 was failing currently with ENOSPC. Turns out this is because we get to the point where the outstanding reservations are greater than the pinned space on the fs. This is a mistake, previously we used the current reservation amount in may_commit_transaction, not the entire outstanding reservation amount. Fix this to find the minimum byte size needed to make progress in flushing, and pass that into may_commit_transaction. From there we can make a smarter decision on whether to commit the transaction or not. This fixes the failure in generic/273. From Nikolai, IOW: when we go to the final stage of deciding whether to do trans commit, instead of passing all the reservations from all tickets we just pass the reservation for the current ticket. Otherwise, in case all reservations exceed pinned space, then we don't commit transaction and fail prematurely. Before we passed num_bytes from flush_space, where num_bytes was the sum of all pending reserverations, but now all we do is take the first ticket and commit the trans if we can satisfy that. 
Fixes: 957780eb2788 ("Btrfs: introduce ticketed enospc infrastructure") Cc: stable@vger.kernel.org # 4.8 Reported-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov [ added Nikolai's comment ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eccaac17258dd0..1a6aced00a19bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4907,6 +4907,13 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, } } +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + /** * maybe_commit_transaction - possibly commit the transaction if its ok to * @root - the root we're allocating for @@ -4918,18 +4925,29 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 bytes, int force) + struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_trans_handle *trans; + u64 bytes; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; - if (force) - goto commit; + spin_lock(&space_info->lock); + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes = (ticket) ? ticket->bytes : 0; + spin_unlock(&space_info->lock); + + if (!bytes) + return 0; /* See if there is enough pinned space to make this reservation */ if (percpu_counter_compare(&space_info->total_bytes_pinned, @@ -4944,8 +4962,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size > bytes) + bytes = 0; + else + bytes -= delayed_rsv->size; if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) < 0) { + bytes) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } @@ -4959,13 +4981,6 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return btrfs_commit_transaction(trans); } -struct reserve_ticket { - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. The caller is supposed to examine the @@ -5015,8 +5030,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info, - num_bytes, 0); + ret = may_commit_transaction(fs_info, space_info); break; default: ret = -ENOSPC; From 6018ba0a0e1bc2378bdab353958e2db1a3d253fc Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Fri, 15 Sep 2017 01:57:26 +0300 Subject: [PATCH 030/109] Btrfs: cleanup 'start' subtraction from try uncompressed inline extent Was added in: c8b978188c9a0fd3d535c13debd19d522b726f1f "Btrfs: Add zlib compression support" Survive to near time (from 08.10.2008). Because 'start' checked for zero before branch, so it's safe to remove that subtraction. 
Signed-off-by: Timofey Titovets Reviewed-by: Satoru Takeuchi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef3915666ebfd2..6274be64f4c555 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -570,7 +570,7 @@ static noinline void compress_file_range(struct inode *inode, cont: if (start == 0) { /* lets try to make an inline extent */ - if (ret || total_in < (actual_end - start)) { + if (ret || total_in < actual_end) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ From c3267bbaa9cae09b62960eafe33ad19196803285 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Aug 2017 16:57:56 +0900 Subject: [PATCH 031/109] btrfs: Refactor check_leaf function for later expansion Current check_leaf() function does a good job checking key order and item offset/size. However it only checks from slot 0 to the last but one slot, this is good but makes later expansion hard. So this refactoring iterates from slot 0 to the last slot. For key comparison, it uses a key with all 0 as initial key, so all valid keys should be larger than that. And for item size/offset checks, it compares current item end with previous item offset. For slot 0, use leaf end as a special case. This makes later item/key offset checks and item size checks easier to be implemented. Also, makes check_leaf() to return -EUCLEAN other than -EIO to indicate error. Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 50 +++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 90310d9e647a80..6c97ae1e7288e7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -553,8 +553,9 @@ static noinline int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = root->fs_info; + /* No valid key type is 0, so all key should be larger than this key */ + struct btrfs_key prev_key = {0, 0, 0}; struct btrfs_key key; - struct btrfs_key leaf_key; u32 nritems = btrfs_header_nritems(leaf); int slot; @@ -587,7 +588,7 @@ static noinline int check_leaf(struct btrfs_root *root, CORRUPT("non-root leaf's nritems is 0", leaf, check_root, 0); free_extent_buffer(eb); - return -EIO; + return -EUCLEAN; } free_extent_buffer(eb); } @@ -597,28 +598,23 @@ static noinline int check_leaf(struct btrfs_root *root, if (nritems == 0) return 0; - /* Check the 0 item */ - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("invalid item offset size pair", leaf, root, 0); - return -EIO; - } - /* - * Check to make sure each items keys are in the correct order and their - * offsets make sense. We only have to loop through nritems-1 because - * we check the current slot against the next slot, which verifies the - * next slot's offset+size makes sense and that the current's slot - * offset is correct. + * Check the following things to make sure this is a good leaf, and + * leaf users won't need to bother with similar sanity checks: + * + * 1) key order + * 2) item offset and size + * No overlap, no hole, all inside the leaf. 
*/ - for (slot = 0; slot < nritems - 1; slot++) { - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &key, slot + 1); + for (slot = 0; slot < nritems; slot++) { + u32 item_end_expected; + + btrfs_item_key_to_cpu(leaf, &key, slot); /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { CORRUPT("bad key order", leaf, root, slot); - return -EIO; + return -EUCLEAN; } /* @@ -626,10 +622,14 @@ static noinline int check_leaf(struct btrfs_root *root, * item data starts at the end of the leaf and grows towards the * front. */ - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else + item_end_expected = btrfs_item_offset_nr(leaf, + slot - 1); + if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { CORRUPT("slot offset bad", leaf, root, slot); - return -EIO; + return -EUCLEAN; } /* @@ -640,8 +640,12 @@ static noinline int check_leaf(struct btrfs_root *root, if (btrfs_item_end_nr(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info)) { CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EIO; + return -EUCLEAN; } + + prev_key.objectid = key.objectid; + prev_key.type = key.type; + prev_key.offset = key.offset; } return 0; From 7f43d4affb2a254d421ab20b0cf65ac2569909fb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Aug 2017 16:57:57 +0900 Subject: [PATCH 032/109] btrfs: Check if item pointer overlaps with the item itself Function check_leaf() checks if any item pointer points outside of the leaf, but it doesn't check if the pointer overlaps with the item itself. Normally only the last item may be the victim, but adding such check is never a bad idea anyway. Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6c97ae1e7288e7..d1770b3e0385e6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -643,6 +643,13 @@ static noinline int check_leaf(struct btrfs_root *root, return -EUCLEAN; } + /* Also check if the item pointer overlaps with btrfs item. */ + if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > + btrfs_item_ptr_offset(leaf, slot)) { + CORRUPT("slot overlap with its data", leaf, root, slot); + return -EUCLEAN; + } + prev_key.objectid = key.objectid; prev_key.type = key.type; prev_key.offset = key.offset; From 40c3c40947324d9f40bf47830c92c59a9bbadf4a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Aug 2017 16:57:58 +0900 Subject: [PATCH 033/109] btrfs: Add sanity check for EXTENT_DATA when reading out leaf Add extra checks for item with EXTENT_DATA type. This checks the following thing: 0) Key offset All key offsets must be aligned to sectorsize. Inline extent must have 0 for key offset. 1) Item size Uncompressed inline file extent size must match item size. (Compressed inline file extent has no information about its on-disk size.) Regular/preallocated file extent size must be a fixed value. 2) Every member of regular file extent item Including alignment for bytenr and offset, possible value for compression/encryption/type. 3) Type/compression/encode must be one of the valid values. This should be the most comprehensive and strict check in the context of btrfs_item for EXTENT_DATA. 
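As a rough illustration of the alignment rule used in 0) and 2), here is a standalone userspace sketch (the helper name and the sectorsize value are made up, this is not the kernel code):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* power-of-two alignment test, same idea as the kernel's IS_ALIGNED() */
	static bool aligned(uint64_t value, uint32_t alignment)
	{
		return (value & (alignment - 1)) == 0;
	}

	int main(void)
	{
		uint32_t sectorsize = 4096;

		printf("%d\n", aligned(8192, sectorsize));	/* 1: accepted */
		printf("%d\n", aligned(8195, sectorsize));	/* 0: rejected */
		return 0;
	}

Any unaligned key offset, bytenr or length in a regular/preallocated file extent item fails this kind of test and the leaf is rejected with -EUCLEAN.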
Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba [ switch to BTRFS_FILE_EXTENT_TYPES, similar to what BTRFS_COMPRESS_TYPES does ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 103 ++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs_tree.h | 1 + 2 files changed, 104 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d1770b3e0385e6..b863d41f7d0a17 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -549,6 +549,100 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, btrfs_header_level(eb) == 0 ? "leaf" : "node", \ reason, btrfs_header_bytenr(eb), root->objectid, slot) +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for file extent", + leaf, root, slot); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + CORRUPT("invalid file extent type", leaf, root, slot); + return -EUCLEAN; + } + + /* + * Support for new compression/encrption must introduce incompat flag, + * and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + CORRUPT("invalid file extent compression", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + CORRUPT("invalid file extent encryption", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + CORRUPT("inline extent has non-zero key offset", + leaf, root, slot); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + CORRUPT("plaintext inline extent has invalid size", + leaf, root, slot); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (item_size != sizeof(*fi)) { + CORRUPT( + "regluar or preallocated extent data item size is invalid", + leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) { + CORRUPT( + "regular or preallocated extent data item has unaligned value", + leaf, root, slot); + return -EUCLEAN; + } + + return 0; +} + +/* + * Common point to switch the item-specific validation. 
+ */ +static int check_leaf_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + int ret = 0; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(root, leaf, key, slot); + break; + } + return ret; +} + static noinline int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) { @@ -605,9 +699,13 @@ static noinline int check_leaf(struct btrfs_root *root, * 1) key order * 2) item offset and size * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity check. + * NOTE: All checks must only rely on the item data itself. */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; + int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -650,6 +748,11 @@ static noinline int check_leaf(struct btrfs_root *root, return -EUCLEAN; } + /* Check if the item size and content meet other criteria */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + prev_key.objectid = key.objectid; prev_key.type = key.type; prev_key.offset = key.offset; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 10689e1fdf11d1..3142645a27f5db 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -732,6 +732,7 @@ struct btrfs_balance_item { #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 +#define BTRFS_FILE_EXTENT_TYPES 2 struct btrfs_file_extent_item { /* From 4b865cab96fe2a30ed512cf667b354bd291b3b0a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Aug 2017 16:57:59 +0900 Subject: [PATCH 034/109] btrfs: Add checker for EXTENT_CSUM EXTENT_CSUM checker is a relatively easy one, only needs to check: 1) Objectid Fixed to BTRFS_EXTENT_CSUM_OBJECTID 2) Key offset alignment Must be aligned to sectorsize 3) Item size alignedment Must be aligned to csum size Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b863d41f7d0a17..c8633f2abdf1bf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -626,6 +626,27 @@ static int check_extent_data_item(struct btrfs_root *root, return 0; } +static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 sectorsize = root->fs_info->sectorsize; + u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + + if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { + CORRUPT("invalid objectid for csum item", leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for csum item", leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + CORRUPT("unaligned csum item size", leaf, root, slot); + return -EUCLEAN; + } + return 0; +} + /* * Common point to switch the item-specific validation. 
*/ @@ -639,6 +660,9 @@ static int check_leaf_item(struct btrfs_root *root, case BTRFS_EXTENT_DATA_KEY: ret = check_extent_data_item(root, leaf, key, slot); break; + case BTRFS_EXTENT_CSUM_KEY: + ret = check_csum_item(root, leaf, key, slot); + break; } return ret; } From 0af2c4bf5a012a40a2f9230458087d7f068339d0 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 28 Sep 2017 14:51:09 +0800 Subject: [PATCH 035/109] btrfs: undo writable superblocke when sprouting fails When new device is being added to seed FS, seed FS is marked writable, but when we fail to bring in the new device, we missed to undo the writable part. This patch fixes it. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ac1e8686ba7815..0be58b5a320cfc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2496,6 +2496,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_trans: + if (seeding_dev) + sb->s_flags |= MS_RDONLY; btrfs_end_transaction(trans); rcu_string_free(device->name); btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); From d31c32f6745a646b34ed3e29456cc95b12a70e11 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 28 Sep 2017 14:51:10 +0800 Subject: [PATCH 036/109] btrfs: fix BUG_ON in btrfs_init_new_device() Instead of BUG_ON return error to the caller. And handle the fail condition by calling the abort transaction and going through the error path. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0be58b5a320cfc..911bee9e6ae261 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2394,7 +2394,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; ret = btrfs_prepare_sprout(fs_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_trans; + } } device->fs_devices = fs_info->fs_devices; @@ -2440,14 +2443,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } } ret = btrfs_add_device(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } if (seeding_dev) { @@ -2456,7 +2459,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_finish_sprout(trans, fs_info); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } /* Sprouting would change fsid of the mounted root, @@ -2495,12 +2498,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path update_dev_time(device_path); return ret; +error_sysfs: + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); error_trans: if (seeding_dev) sb->s_flags |= MS_RDONLY; btrfs_end_transaction(trans); rcu_string_free(device->name); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); From 7132a262595a43abd483f1c5139afc038c67fbbb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 28 Sep 2017 14:51:11 +0800 Subject: [PATCH 037/109] btrfs: error out if btrfs_attach_transaction() 
fails btrfs_init_new_device() calls btrfs_attach_transaction() to commit sys chunks, and it should error out if it fails. Signed-off-by: Anand Jain Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 911bee9e6ae261..9f26c576911fec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2318,6 +2318,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool unlocked = false; if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2477,6 +2478,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + unlocked = true; if (ret) /* transaction commit */ return ret; @@ -2489,7 +2491,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(trans)) { if (PTR_ERR(trans) == -ENOENT) return 0; - return PTR_ERR(trans); + ret = PTR_ERR(trans); + trans = NULL; + goto error_sysfs; } ret = btrfs_commit_transaction(trans); } @@ -2503,12 +2507,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path error_trans: if (seeding_dev) sb->s_flags |= MS_RDONLY; - btrfs_end_transaction(trans); + if (trans) + btrfs_end_transaction(trans); rcu_string_free(device->name); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev) { + if (seeding_dev && !unlocked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } From 9417ebc8a676487c6ec8825f92fb28f7dbeb5f4b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 28 Sep 2017 10:53:17 +0300 Subject: [PATCH 038/109] btrfs: Explicitly handle btrfs_update_root failure btrfs_udpate_root can fail and it aborts the transaction, the correct way to handle an aborted transaction is to explicitly end with btrfs_end_transaction. Even now the code is correct since btrfs_commit_transaction would handle an aborted transaction but this is more of an implementation detail. So let's be explicit in handling failure in btrfs_update_root. Furthermore btrfs_commit_transaction can also fail and by ignoring it's return value we could have left the in-memory copy of the root item in an inconsistent state. So capture the error value which allows us to correctly revert the RO/RW flags in case of commit failure. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index feab6f61cb9775..86728e06e2633d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1827,8 +1827,13 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans); + goto out_reset; + } + + ret = btrfs_commit_transaction(trans); - btrfs_commit_transaction(trans); out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); From efd38150af45375b46576d0110a323d7fab7e142 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 28 Sep 2017 11:45:26 +0300 Subject: [PATCH 039/109] btrfs: Refactor transaction handling in received subvolume ioctl If btrfs_transaction_commit fails it will proceed to call cleanup_transaction, which in turn already does btrfs_abort_transaction. So let's remove the unnecessary code duplication. 
Also let's be explicit about handling failure of btrfs_uuid_tree_add by calling btrfs_end_transaction. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 86728e06e2633d..dfc5b486228902 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5152,15 +5152,11 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - out: up_write(&fs_info->subvol_sem); mnt_drop_write_file(file); From 897ca8194cd1b287bc5e7d8a5edc2b9a041e15ba Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 7 Oct 2017 16:02:21 +0200 Subject: [PATCH 040/109] btrfs: Fix bool initialization/comparison Bool initializations should use true and false. Bool tests don't need comparisons. Signed-off-by: Thomas Meyer Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1a6aced00a19bb..624aa6841bf2cb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6960,7 +6960,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (is_data) - skinny_metadata = 0; + skinny_metadata = false; ret = lookup_extent_backref(trans, info, path, &iref, bytenr, num_bytes, parent, @@ -9313,7 +9313,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. 
*/ - if (!for_reloc && root_dropped == false) + if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); if (err && err != -EAGAIN) btrfs_handle_fs_error(fs_info, err, NULL); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index aafcc785f8408c..79945053c7e756 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2046,7 +2046,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - bool full_sync = 0; + bool full_sync = false; u64 len; /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6274be64f4c555..2c08e0966082f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4451,9 +4451,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int err = 0; u64 ino = btrfs_ino(BTRFS_I(inode)); u64 bytes_deleted = 0; - bool be_nice = 0; - bool should_throttle = 0; - bool should_end = 0; + bool be_nice = false; + bool should_throttle = false; + bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4463,7 +4463,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, */ if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - be_nice = 1; + be_nice = true; path = btrfs_alloc_path(); if (!path) @@ -4669,7 +4669,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, } else { break; } - should_throttle = 0; + should_throttle = false; if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || @@ -4688,11 +4688,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (be_nice) { if (truncate_space_check(trans, root, extent_num_bytes)) { - should_end = 1; + should_end = true; } if (btrfs_should_throttle_delayed_refs(trans, fs_info)) - should_throttle = 1; + should_throttle = true; } } From a969f4cc1349fc123eb412332d264de51b05d2ed Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Sun, 8 Oct 2017 22:30:58 +0200 Subject: [PATCH 041/109] btrfs: prefix sysfs attribute struct names Currently struct names for sysfs are generated only based on the attribute names. This means that attribute names cannot be reused in multiple places throughout the complete btrfs sysfs hierarchy. E.g. allocation/data/total_bytes and allocation/data/single/total_bytes result in the same struct name btrfs_attr_total_bytes. A workaround for this case was made in the past by ad hoc creating an extra macro wrapper, BTRFS_RAID_ATTR, that inserts some extra text in the struct name. Instead of polluting sysfs.h with such kind of extra macro definitions, and only doing so when there are collisions, use a prefix which gets inserted in the struct name, so we keep everything nicely grouped together by default. 
Current collections of attributes are: * (the toplevel, empty prefix) * allocation * space_info * raid * features Signed-off-by: Hans van Kranenburg Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 63 ++++++++++++++++++++++++------------------------ fs/btrfs/sysfs.h | 26 +++++++++----------- 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 883881b16c86f7..a28bba801264d7 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -247,7 +247,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_size, global_rsv_size_show); +BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -256,15 +256,15 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); +BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); -BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); -BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); +BTRFS_ATTR(raid, total_bytes, raid_bytes_show); +BTRFS_ATTR(raid, used_bytes, raid_bytes_show); static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -277,7 +277,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { - if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) val += block_group->key.offset; else val += btrfs_block_group_used(&block_group->item); @@ -287,8 +287,8 @@ static ssize_t raid_bytes_show(struct kobject *kobj, } static struct attribute *raid_attributes[] = { - BTRFS_RAID_ATTR_PTR(total_bytes), - BTRFS_RAID_ATTR_PTR(used_bytes), + BTRFS_ATTR_PTR(raid, total_bytes), + BTRFS_ATTR_PTR(raid, used_bytes), NULL }; @@ -311,7 +311,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ -BTRFS_ATTR(field, btrfs_space_info_show_##field) +BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, struct kobj_attribute *a, @@ -331,19 +331,20 @@ SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); +BTRFS_ATTR(space_info, total_bytes_pinned, + btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { - BTRFS_ATTR_PTR(flags), - BTRFS_ATTR_PTR(total_bytes), - BTRFS_ATTR_PTR(bytes_used), - BTRFS_ATTR_PTR(bytes_pinned), - BTRFS_ATTR_PTR(bytes_reserved), - BTRFS_ATTR_PTR(bytes_may_use), - BTRFS_ATTR_PTR(bytes_readonly), - BTRFS_ATTR_PTR(disk_used), - BTRFS_ATTR_PTR(disk_total), - 
BTRFS_ATTR_PTR(total_bytes_pinned), + BTRFS_ATTR_PTR(space_info, flags), + BTRFS_ATTR_PTR(space_info, total_bytes), + BTRFS_ATTR_PTR(space_info, bytes_used), + BTRFS_ATTR_PTR(space_info, bytes_pinned), + BTRFS_ATTR_PTR(space_info, bytes_reserved), + BTRFS_ATTR_PTR(space_info, bytes_may_use), + BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, disk_used), + BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; @@ -361,8 +362,8 @@ struct kobj_type space_info_ktype = { }; static const struct attribute *allocation_attrs[] = { - BTRFS_ATTR_PTR(global_rsv_reserved), - BTRFS_ATTR_PTR(global_rsv_size), + BTRFS_ATTR_PTR(allocation, global_rsv_reserved), + BTRFS_ATTR_PTR(allocation, global_rsv_size), NULL, }; @@ -415,7 +416,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); +BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -425,7 +426,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } -BTRFS_ATTR(nodesize, btrfs_nodesize_show); +BTRFS_ATTR(, nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -436,7 +437,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); +BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -447,7 +448,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); static ssize_t quota_override_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -487,14 +488,14 @@ static ssize_t quota_override_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); +BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); static const struct attribute *btrfs_attrs[] = { - BTRFS_ATTR_PTR(label), - BTRFS_ATTR_PTR(nodesize), - BTRFS_ATTR_PTR(sectorsize), - BTRFS_ATTR_PTR(clone_alignment), - BTRFS_ATTR_PTR(quota_override), + BTRFS_ATTR_PTR(, label), + BTRFS_ATTR_PTR(, nodesize), + BTRFS_ATTR_PTR(, sectorsize), + BTRFS_ATTR_PTR(, clone_alignment), + BTRFS_ATTR_PTR(, quota_override), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index d7da1a4c2f6c12..928cc5c2334769 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -20,21 +20,16 @@ enum btrfs_feature_set { .store = _store, \ } -#define BTRFS_ATTR_RW(_name, _show, _store) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) -#define BTRFS_ATTR(_name, _show) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) -#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) - -#define BTRFS_RAID_ATTR(_name, _show) \ - static struct kobj_attribute 
btrfs_raid_attr_##_name = \ - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) - -#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) +#define BTRFS_ATTR_PTR(_prefix, _name) \ + (&btrfs_attr_##_prefix##_##_name.attr) struct btrfs_feature_attr { @@ -43,15 +38,16 @@ struct btrfs_feature_attr { u64 feature_bit; }; -#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ -static struct btrfs_feature_attr btrfs_attr_##_name = { \ +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ btrfs_feature_attr_show, \ btrfs_feature_attr_store), \ .feature_set = _feature_set, \ - .feature_bit = _prefix ##_## _feature_bit, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ } -#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) From a7e3c5f2f7ba3f1f413df5548ed51eb25cd1c74d Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Tue, 10 Oct 2017 13:48:05 +0300 Subject: [PATCH 042/109] btrfs: use appropriate replacements for __sb_{start,end}_write calls Commit a53f4f8e9c8eb ("btrfs: Don't call btrfs_start_transaction() on frozen fs to avoid deadlock.") started using internal calls and we replace them with more suitable ones. Signed-off-by: Rakesh Pandit Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f092fe057de78d..770ac088dd9ccd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1205,8 +1205,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * happens. The pending operations are delayed to the * next commit after thawing. */ - if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) - __sb_end_write(sb, SB_FREEZE_WRITE); + if (sb_start_write_trylock(sb)) + sb_end_write(sb); else return 0; trans = btrfs_start_transaction(root, 0); From 1170862d783a3b47e49a1506fe15fa074e35bbb5 Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Tue, 3 Oct 2017 18:06:01 +0300 Subject: [PATCH 043/109] Btrfs: compress_file_range remove dead variable num_bytes Remove dead assigment of num_bytes. Also as num_bytes only used in the will_compress block as copy of total_in just replace that with total_in and drop num_bytes entirely. 
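Reduced to a compilable userspace sketch (made-up sizes and a local ALIGN_UP macro, not the btrfs code path), the removed dead-store pattern looks like this:

	#include <stdint.h>
	#include <stdio.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t blocksize = 4096, start = 0, end = 12287;
		uint64_t total_in = 8192;	/* bytes consumed by compression */
		uint64_t num_bytes;

		num_bytes = ALIGN_UP(end - start + 1, blocksize);
		num_bytes = num_bytes > blocksize ? num_bytes : blocksize;
		/* both stores above are overwritten before any other use */
		num_bytes = total_in;
		printf("%llu\n", (unsigned long long)num_bytes);	/* 8192 */
		return 0;
	}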
Signed-off-by: Timofey Titovets Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2c08e0966082f1..aaf36f78fa078a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -458,7 +458,6 @@ static noinline void compress_file_range(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes; u64 blocksize = fs_info->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -508,8 +507,6 @@ static noinline void compress_file_range(struct inode *inode, total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); - num_bytes = ALIGN(end - start + 1, blocksize); - num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -625,7 +622,6 @@ static noinline void compress_file_range(struct inode *inode, */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - num_bytes = total_in; *num_added += 1; /* @@ -633,12 +629,12 @@ static noinline void compress_file_range(struct inode *inode, * allocation on disk for these compressed pages, and * will submit them to the elevator. */ - add_async_extent(async_cow, start, num_bytes, + add_async_extent(async_cow, start, total_in, total_compressed, pages, nr_pages, compress_type); - if (start + num_bytes < end) { - start += num_bytes; + if (start + total_in < end) { + start += total_in; pages = NULL; cond_resched(); goto again; From 557ea5dd003d371536f6b4e8f7c8209a2b6fd4e3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Oct 2017 01:51:02 +0000 Subject: [PATCH 044/109] btrfs: Move leaf and node validation checker to tree-checker.c It's no doubt the comprehensive tree block checker will become larger, so moving them into their own files is quite reasonable. Signed-off-by: Qu Wenruo [ wording adjustments ] Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/disk-io.c | 285 +----------------------------------- fs/btrfs/tree-checker.c | 309 ++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.h | 26 ++++ 4 files changed, 340 insertions(+), 282 deletions(-) create mode 100644 fs/btrfs/tree-checker.c create mode 100644 fs/btrfs/tree-checker.h diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 962a95aefb811e..88255e133ade85 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o free-space-tree.o + uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c8633f2abdf1bf..09407384e996e8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "sysfs.h" #include "qgroup.h" #include "compression.h" +#include "tree-checker.h" #ifdef CONFIG_X86 #include @@ -543,284 +544,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, return ret; } -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, \ - "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ - btrfs_header_level(eb) == 0 ? 
"leaf" : "node", \ - reason, btrfs_header_bytenr(eb), root->objectid, slot) - -static int check_extent_data_item(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_key *key, int slot) -{ - struct btrfs_file_extent_item *fi; - u32 sectorsize = root->fs_info->sectorsize; - u32 item_size = btrfs_item_size_nr(leaf, slot); - - if (!IS_ALIGNED(key->offset, sectorsize)) { - CORRUPT("unaligned key offset for file extent", - leaf, root, slot); - return -EUCLEAN; - } - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { - CORRUPT("invalid file extent type", leaf, root, slot); - return -EUCLEAN; - } - - /* - * Support for new compression/encrption must introduce incompat flag, - * and must be caught in open_ctree(). - */ - if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { - CORRUPT("invalid file extent compression", leaf, root, slot); - return -EUCLEAN; - } - if (btrfs_file_extent_encryption(leaf, fi)) { - CORRUPT("invalid file extent encryption", leaf, root, slot); - return -EUCLEAN; - } - if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { - /* Inline extent must have 0 as key offset */ - if (key->offset) { - CORRUPT("inline extent has non-zero key offset", - leaf, root, slot); - return -EUCLEAN; - } - - /* Compressed inline extent has no on-disk size, skip it */ - if (btrfs_file_extent_compression(leaf, fi) != - BTRFS_COMPRESS_NONE) - return 0; - - /* Uncompressed inline extent size must match item size */ - if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + - btrfs_file_extent_ram_bytes(leaf, fi)) { - CORRUPT("plaintext inline extent has invalid size", - leaf, root, slot); - return -EUCLEAN; - } - return 0; - } - - /* Regular or preallocated extent has fixed item size */ - if (item_size != sizeof(*fi)) { - CORRUPT( - "regluar or preallocated extent data item size is invalid", - leaf, root, slot); - return -EUCLEAN; - } - if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) { - CORRUPT( - "regular or preallocated extent data item has unaligned value", - leaf, root, slot); - return -EUCLEAN; - } - - return 0; -} - -static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, - struct btrfs_key *key, int slot) -{ - u32 sectorsize = root->fs_info->sectorsize; - u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); - - if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { - CORRUPT("invalid objectid for csum item", leaf, root, slot); - return -EUCLEAN; - } - if (!IS_ALIGNED(key->offset, sectorsize)) { - CORRUPT("unaligned key offset for csum item", leaf, root, slot); - return -EUCLEAN; - } - if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { - CORRUPT("unaligned csum item size", leaf, root, slot); - return -EUCLEAN; - } - return 0; -} - -/* - * Common point to switch the item-specific validation. 
- */ -static int check_leaf_item(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_key *key, int slot) -{ - int ret = 0; - - switch (key->type) { - case BTRFS_EXTENT_DATA_KEY: - ret = check_extent_data_item(root, leaf, key, slot); - break; - case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(root, leaf, key, slot); - break; - } - return ret; -} - -static noinline int check_leaf(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - /* No valid key type is 0, so all key should be larger than this key */ - struct btrfs_key prev_key = {0, 0, 0}; - struct btrfs_key key; - u32 nritems = btrfs_header_nritems(leaf); - int slot; - - /* - * Extent buffers from a relocation tree have a owner field that - * corresponds to the subvolume tree they are based on. So just from an - * extent buffer alone we can not find out what is the id of the - * corresponding subvolume tree, so we can not figure out if the extent - * buffer corresponds to the root of the relocation tree or not. So skip - * this check for relocation trees. - */ - if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { - struct btrfs_root *check_root; - - key.objectid = btrfs_header_owner(leaf); - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - check_root = btrfs_get_fs_root(fs_info, &key, false); - /* - * The only reason we also check NULL here is that during - * open_ctree() some roots has not yet been set up. - */ - if (!IS_ERR_OR_NULL(check_root)) { - struct extent_buffer *eb; - - eb = btrfs_root_node(check_root); - /* if leaf is the root, then it's fine */ - if (leaf != eb) { - CORRUPT("non-root leaf's nritems is 0", - leaf, check_root, 0); - free_extent_buffer(eb); - return -EUCLEAN; - } - free_extent_buffer(eb); - } - return 0; - } - - if (nritems == 0) - return 0; - - /* - * Check the following things to make sure this is a good leaf, and - * leaf users won't need to bother with similar sanity checks: - * - * 1) key order - * 2) item offset and size - * No overlap, no hole, all inside the leaf. - * 3) item content - * If possible, do comprehensive sanity check. - * NOTE: All checks must only rely on the item data itself. - */ - for (slot = 0; slot < nritems; slot++) { - u32 item_end_expected; - int ret; - - btrfs_item_key_to_cpu(leaf, &key, slot); - - /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); - return -EUCLEAN; - } - - /* - * Make sure the offset and ends are right, remember that the - * item data starts at the end of the leaf and grows towards the - * front. - */ - if (slot == 0) - item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); - else - item_end_expected = btrfs_item_offset_nr(leaf, - slot - 1); - if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { - CORRUPT("slot offset bad", leaf, root, slot); - return -EUCLEAN; - } - - /* - * Check to make sure that we don't point outside of the leaf, - * just in case all the items are consistent to each other, but - * all point outside of the leaf. - */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EUCLEAN; - } - - /* Also check if the item pointer overlaps with btrfs item. 
*/ - if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > - btrfs_item_ptr_offset(leaf, slot)) { - CORRUPT("slot overlap with its data", leaf, root, slot); - return -EUCLEAN; - } - - /* Check if the item size and content meet other criteria */ - ret = check_leaf_item(root, leaf, &key, slot); - if (ret < 0) - return ret; - - prev_key.objectid = key.objectid; - prev_key.type = key.type; - prev_key.offset = key.offset; - } - - return 0; -} - -static int check_node(struct btrfs_root *root, struct extent_buffer *node) -{ - unsigned long nr = btrfs_header_nritems(node); - struct btrfs_key key, next_key; - int slot; - u64 bytenr; - int ret = 0; - - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { - btrfs_crit(root->fs_info, - "corrupt node: block %llu root %llu nritems %lu", - node->start, root->objectid, nr); - return -EIO; - } - - for (slot = 0; slot < nr - 1; slot++) { - bytenr = btrfs_node_blockptr(node, slot); - btrfs_node_key_to_cpu(node, &key, slot); - btrfs_node_key_to_cpu(node, &next_key, slot + 1); - - if (!bytenr) { - CORRUPT("invalid item slot", node, root, slot); - ret = -EIO; - goto out; - } - - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - CORRUPT("bad key order", node, root, slot); - ret = -EIO; - goto out; - } - } -out: - return ret; -} - static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -886,12 +609,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(root, eb)) ret = -EIO; if (!ret) @@ -4146,7 +3869,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) { btrfs_print_leaf(buf); ASSERT(0); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c new file mode 100644 index 00000000000000..56e25a63010395 --- /dev/null +++ b/fs/btrfs/tree-checker.c @@ -0,0 +1,309 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +/* + * The module is used to catch unexpected/corrupted tree block data. + * Such behavior can be caused either by a fuzzed image or bugs. + * + * The objective is to do leaf/node validation checks when tree block is read + * from disk, and check *every* possible member, so other code won't + * need to checking them again. + * + * Due to the potential and unwanted damage, every checker needs to be + * carefully reviewed otherwise so it does not prevent mount of valid images. 
+ */ + +#include "ctree.h" +#include "tree-checker.h" +#include "disk-io.h" +#include "compression.h" + +#define CORRUPT(reason, eb, root, slot) \ + btrfs_crit(root->fs_info, \ + "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ + btrfs_header_level(eb) == 0 ? "leaf" : "node", \ + reason, btrfs_header_bytenr(eb), root->objectid, slot) + +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for file extent", + leaf, root, slot); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + CORRUPT("invalid file extent type", leaf, root, slot); + return -EUCLEAN; + } + + /* + * Support for new compression/encrption must introduce incompat flag, + * and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + CORRUPT("invalid file extent compression", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + CORRUPT("invalid file extent encryption", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + CORRUPT("inline extent has non-zero key offset", + leaf, root, slot); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + CORRUPT("plaintext inline extent has invalid size", + leaf, root, slot); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (item_size != sizeof(*fi)) { + CORRUPT( + "regluar or preallocated extent data item size is invalid", + leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) { + CORRUPT( + "regular or preallocated extent data item has unaligned value", + leaf, root, slot); + return -EUCLEAN; + } + + return 0; +} + +static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 sectorsize = root->fs_info->sectorsize; + u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + + if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { + CORRUPT("invalid objectid for csum item", leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for csum item", leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + CORRUPT("unaligned csum item size", leaf, root, slot); + return -EUCLEAN; + } + return 0; +} + +/* + * Common point to switch the item-specific validation. 
+ */ +static int check_leaf_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + int ret = 0; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(root, leaf, key, slot); + break; + case BTRFS_EXTENT_CSUM_KEY: + ret = check_csum_item(root, leaf, key, slot); + break; + } + return ret; +} + +int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + /* No valid key type is 0, so all key should be larger than this key */ + struct btrfs_key prev_key = {0, 0, 0}; + struct btrfs_key key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + /* + * Extent buffers from a relocation tree have a owner field that + * corresponds to the subvolume tree they are based on. So just from an + * extent buffer alone we can not find out what is the id of the + * corresponding subvolume tree, so we can not figure out if the extent + * buffer corresponds to the root of the relocation tree or not. So + * skip this check for relocation trees. + */ + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + struct extent_buffer *eb; + + eb = btrfs_root_node(check_root); + /* if leaf is the root, then it's fine */ + if (leaf != eb) { + CORRUPT("non-root leaf's nritems is 0", + leaf, check_root, 0); + free_extent_buffer(eb); + return -EUCLEAN; + } + free_extent_buffer(eb); + } + return 0; + } + + if (nritems == 0) + return 0; + + /* + * Check the following things to make sure this is a good leaf, and + * leaf users won't need to bother with similar sanity checks: + * + * 1) key ordering + * 2) item offset and size + * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity check. + * NOTE: All checks must only rely on the item data itself. + */ + for (slot = 0; slot < nritems; slot++) { + u32 item_end_expected; + int ret; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EUCLEAN; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else + item_end_expected = btrfs_item_offset_nr(leaf, + slot - 1); + if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EUCLEAN; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EUCLEAN; + } + + /* Also check if the item pointer overlaps with btrfs item. 
*/ + if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > + btrfs_item_ptr_offset(leaf, slot)) { + CORRUPT("slot overlap with its data", leaf, root, slot); + return -EUCLEAN; + } + + /* Check if the item size and content meet other criteria */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + + prev_key.objectid = key.objectid; + prev_key.type = key.type; + prev_key.offset = key.offset; + } + + return 0; +} + +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + struct btrfs_key key, next_key; + int slot; + u64 bytenr; + int ret = 0; + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { + btrfs_crit(root->fs_info, + "corrupt node: block %llu root %llu nritems %lu", + node->start, root->objectid, nr); + return -EIO; + } + + for (slot = 0; slot < nr - 1; slot++) { + bytenr = btrfs_node_blockptr(node, slot); + btrfs_node_key_to_cpu(node, &key, slot); + btrfs_node_key_to_cpu(node, &next_key, slot + 1); + + if (!bytenr) { + CORRUPT("invalid item slot", node, root, slot); + ret = -EIO; + goto out; + } + + if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { + CORRUPT("bad key order", node, root, slot); + ret = -EIO; + goto out; + } + } +out: + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h new file mode 100644 index 00000000000000..96c486e95d7042 --- /dev/null +++ b/fs/btrfs/tree-checker.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +#ifndef __BTRFS_TREE_CHECKER__ +#define __BTRFS_TREE_CHECKER__ + +#include "ctree.h" +#include "extent_io.h" + +int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); + +#endif From bba4f29896c986c4cec17bc0f19f2ce644fceae1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Oct 2017 01:51:03 +0000 Subject: [PATCH 045/109] btrfs: tree-checker: Enhance btrfs_check_node output Use inline function to replace macro since we don't need stringification. (Macro still exists until all callers get updated) And add more info about the error, and replace EIO with EUCLEAN. For nr_items error, report if it's too large or too small, and output the valid value range. For node block pointer, added a new alignment checker. For key order, also output the next key to make the problem more obvious. Signed-off-by: Qu Wenruo [ wording adjustments, unindented long strings ] Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 68 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 56e25a63010395..5acdf3355a3ffe 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -37,6 +37,46 @@ btrfs_header_level(eb) == 0 ? 
"leaf" : "node", \ reason, btrfs_header_bytenr(eb), root->objectid, slot) +/* + * Error message should follow the following format: + * corrupt : , [, ] + * + * @type: leaf or node + * @identifier: the necessary info to locate the leaf/node. + * It's recommened to decode key.objecitd/offset if it's + * meaningful. + * @reason: describe the error + * @bad_value: optional, it's recommened to output bad value and its + * expected value (range). + * + * Since comma is used to separate the components, only space is allowed + * inside each component. + */ + +/* + * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. + * Allows callers to customize the output. + */ +__printf(4, 5) +static void generic_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root->objectid, btrfs_header_bytenr(eb), slot, &vaf); + va_end(args); +} + static int check_extent_data_item(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_key *key, int slot) @@ -282,9 +322,11 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { btrfs_crit(root->fs_info, - "corrupt node: block %llu root %llu nritems %lu", - node->start, root->objectid, nr); - return -EIO; +"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", + root->objectid, node->start, + nr == 0 ? "small" : "large", nr, + BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); + return -EUCLEAN; } for (slot = 0; slot < nr - 1; slot++) { @@ -293,14 +335,26 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) btrfs_node_key_to_cpu(node, &next_key, slot + 1); if (!bytenr) { - CORRUPT("invalid item slot", node, root, slot); - ret = -EIO; + generic_err(root, node, slot, + "invalid NULL node pointer"); + ret = -EUCLEAN; + goto out; + } + if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { + generic_err(root, node, slot, + "unaligned pointer, have %llu should be aligned to %u", + bytenr, root->fs_info->sectorsize); + ret = -EUCLEAN; goto out; } if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - CORRUPT("bad key order", node, root, slot); - ret = -EIO; + generic_err(root, node, slot, + "bad key order, current (%llu %u %llu) next (%llu %u %llu)", + key.objectid, key.type, key.offset, + next_key.objectid, next_key.type, + next_key.offset); + ret = -EUCLEAN; goto out; } } From 478d01b3fcf04f35aa33226637fe0becf6a35183 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Oct 2017 01:51:04 +0000 Subject: [PATCH 046/109] btrfs: tree-checker: Enhance output for btrfs_check_leaf Enhance the output to print: 1) the eason 2) the ad value, if reason is not sufficient 3) good value (range) Signed-off-by: Qu Wenruo [ wording, unidented long strings ] Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 5acdf3355a3ffe..b507c320eff510 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -230,8 +230,9 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) eb = btrfs_root_node(check_root); /* if leaf is the root, then it's fine */ if (leaf != eb) { - 
CORRUPT("non-root leaf's nritems is 0", - leaf, check_root, 0); + generic_err(check_root, leaf, 0, + "invalid nritems, have %u should not be 0 for non-root leaf", + nritems); free_extent_buffer(eb); return -EUCLEAN; } @@ -262,7 +263,11 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) /* Make sure the keys are in the right order */ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); + generic_err(root, leaf, slot, + "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", + prev_key.objectid, prev_key.type, + prev_key.offset, key.objectid, key.type, + key.offset); return -EUCLEAN; } @@ -277,7 +282,10 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { - CORRUPT("slot offset bad", leaf, root, slot); + generic_err(root, leaf, slot, + "unexpected item end, have %u expect %u", + btrfs_item_end_nr(leaf, slot), + item_end_expected); return -EUCLEAN; } @@ -288,14 +296,21 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) */ if (btrfs_item_end_nr(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); + generic_err(root, leaf, slot, + "slot end outside of leaf, have %u expect range [0, %u]", + btrfs_item_end_nr(leaf, slot), + BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } /* Also check if the item pointer overlaps with btrfs item. */ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > btrfs_item_ptr_offset(leaf, slot)) { - CORRUPT("slot overlap with its data", leaf, root, slot); + generic_err(root, leaf, slot, + "slot overlaps with its data, item end %lu data start %lu", + btrfs_item_nr_offset(slot) + + sizeof(struct btrfs_item), + btrfs_item_ptr_offset(leaf, slot)); return -EUCLEAN; } From d508c5f07cd6a6570947e980a598674fa34406dc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Oct 2017 01:51:05 +0000 Subject: [PATCH 047/109] btrfs: tree-checker: Enhance output for check_csum_item Output the bad value and expected good value (or its alignment). 
Signed-off-by: Qu Wenruo [ unindent long strings ] Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index b507c320eff510..3d1ae839a367fa 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -161,15 +161,21 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { - CORRUPT("invalid objectid for csum item", leaf, root, slot); + generic_err(root, leaf, slot, + "invalid key objectid for csum item, have %llu expect %llu", + key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } if (!IS_ALIGNED(key->offset, sectorsize)) { - CORRUPT("unaligned key offset for csum item", leaf, root, slot); + generic_err(root, leaf, slot, + "unaligned key offset for csum item, have %llu should be aligned to %u", + key->offset, sectorsize); return -EUCLEAN; } if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { - CORRUPT("unaligned csum item size", leaf, root, slot); + generic_err(root, leaf, slot, + "unaligned item size for csum item, have %u should be aligned to %u", + btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; } return 0; From 8806d7185bf7a81c28ea46ce67f78669f91f0591 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Oct 2017 01:51:06 +0000 Subject: [PATCH 048/109] btrfs: tree-checker: Enhance output for check_extent_data_item Output the invalid member name and its bad value, along with its expected value range or alignment. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 97 +++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 3d1ae839a367fa..0de11417967669 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -31,12 +31,6 @@ #include "disk-io.h" #include "compression.h" -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, \ - "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ - btrfs_header_level(eb) == 0 ? "leaf" : "node", \ - reason, btrfs_header_bytenr(eb), root->objectid, slot) - /* * Error message should follow the following format: * corrupt : , [, ] @@ -77,6 +71,46 @@ static void generic_err(const struct btrfs_root *root, va_end(args); } +/* + * Customized reporter for extent data item, since its key objectid and + * offset has its own meaning. + */ +__printf(4, 5) +static void file_extent_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", + btrfs_header_level(eb) == 0 ? 
"leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); + va_end(args); +} + +/* + * Return 0 if the btrfs_file_extent_##name is aligned to @alignment + * Else return 1 + */ +#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \ +({ \ + if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ + file_extent_err((root), (leaf), (slot), \ + "invalid %s for file extent, have %llu, should be aligned to %u", \ + (#name), btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)); \ + (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ +}) + static int check_extent_data_item(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_key *key, int slot) @@ -86,15 +120,19 @@ static int check_extent_data_item(struct btrfs_root *root, u32 item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(key->offset, sectorsize)) { - CORRUPT("unaligned key offset for file extent", - leaf, root, slot); + file_extent_err(root, leaf, slot, +"unaligned file_offset for file extent, have %llu should be aligned to %u", + key->offset, sectorsize); return -EUCLEAN; } fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { - CORRUPT("invalid file extent type", leaf, root, slot); + file_extent_err(root, leaf, slot, + "invalid type for file extent, have %u expect range [0, %u]", + btrfs_file_extent_type(leaf, fi), + BTRFS_FILE_EXTENT_TYPES); return -EUCLEAN; } @@ -103,18 +141,24 @@ static int check_extent_data_item(struct btrfs_root *root, * and must be caught in open_ctree(). */ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { - CORRUPT("invalid file extent compression", leaf, root, slot); + file_extent_err(root, leaf, slot, + "invalid compression for file extent, have %u expect range [0, %u]", + btrfs_file_extent_compression(leaf, fi), + BTRFS_COMPRESS_TYPES); return -EUCLEAN; } if (btrfs_file_extent_encryption(leaf, fi)) { - CORRUPT("invalid file extent encryption", leaf, root, slot); + file_extent_err(root, leaf, slot, + "invalid encryption for file extent, have %u expect 0", + btrfs_file_extent_encryption(leaf, fi)); return -EUCLEAN; } if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ if (key->offset) { - CORRUPT("inline extent has non-zero key offset", - leaf, root, slot); + file_extent_err(root, leaf, slot, + "invalid file_offset for inline file extent, have %llu expect 0", + key->offset); return -EUCLEAN; } @@ -126,8 +170,10 @@ static int check_extent_data_item(struct btrfs_root *root, /* Uncompressed inline extent size must match item size */ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)) { - CORRUPT("plaintext inline extent has invalid size", - leaf, root, slot); + file_extent_err(root, leaf, slot, + "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)); return -EUCLEAN; } return 0; @@ -135,22 +181,17 @@ static int check_extent_data_item(struct btrfs_root *root, /* Regular or preallocated extent has fixed item size */ if (item_size != sizeof(*fi)) { - CORRUPT( - "regluar or preallocated extent data item size is invalid", - leaf, root, slot); + file_extent_err(root, leaf, slot, + "invalid item size for reg/prealloc file extent, have %u expect %lu", + item_size, sizeof(*fi)); return -EUCLEAN; } - if 
(!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) || - !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) { - CORRUPT( - "regular or preallocated extent data item has unaligned value", - leaf, root, slot); + if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; - } - return 0; } From f851689b5ae3eb8e1b4d397eaf2805ccfd88bf77 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 7 Sep 2017 11:22:20 -0600 Subject: [PATCH 049/109] Btrfs: remove nr_async_bios This was intended to congest higher layers to not send bios, but as 1) the congested bit has been taken by writeback Async bios come from buffered writes and DIO writes. For DIO writes, we want to submit them ASAP, while for buffered writes, writeback uses balance_dirty_pages() to throttle how much dirty pages we can have. 2) and no one is waiting for %nr_async_bios down to zero, Historically, it was introduced along with changes which let checksumming workload spread accross different cpus. And at that time, pdflush was used instead of per-bdi flushing, perhaps pdflush did not have the necessary information for writeback to do throttling. We can safely remove them now. Signed-off-by: Liu Bo [ additional explanation from mails, removed unused variable 'limit' ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - fs/btrfs/volumes.c | 17 ----------------- 3 files changed, 19 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 16b3537f31d484..bc1b6a0337003a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -880,7 +880,6 @@ struct btrfs_fs_info { atomic_t nr_async_submits; atomic_t async_submit_draining; - atomic_t nr_async_bios; atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 09407384e996e8..2b18b0c379c1bd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2518,7 +2518,6 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); - atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9f26c576911fec..763d04b4e5b74f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -360,7 +360,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long batch_run = 0; - unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; int sync_pending = 0; @@ -375,8 +374,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) blk_start_plug(&plug); bdi = device->bdev->bd_bdi; - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; loop: spin_lock(&device->io_lock); @@ -443,13 +440,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) pending = pending->bi_next; cur->bi_next = NULL; - /* - * 
atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_bios) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* @@ -6075,13 +6065,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, return; } - /* - * nr_async_bios allows us to reliably return congestion to the - * higher layers. Otherwise, the async bio makes it appear we have - * made progress against dirty pages when we've really just put it - * on a queue for later - */ - atomic_inc(&fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; From 80e03a2c512bd78858c130d6fe3b11982a61c0c5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 7 Sep 2017 11:22:21 -0600 Subject: [PATCH 050/109] Btrfs: do not make defrag wait on async_delalloc_pages By setting compression for a defrag task, the task will start IO at the end of defrag. After the combo of filemap_flush(), we've already made sure that dirty pages have made progress via async compress thread because the second filemap_flush() will wait for page lock, which won't be unlocked until those pages have been marked as writeback and ordered extents have been queued. And this is for per-inode defrag, it's not helpful to wait on a global %async_delalloc_pages and %nr_async_submits from fs_info. Although waiting on %nr_async_submits means that all bios are submitted down to per-device schedule IO lists, it doesn't wait for their completions, thus users still need to do fsync/sync to make sure the data is on disk. While with this change, it makes sure that pages are marked with writeback bits and will be submitted asynchronously shortly, therefore, the behavior of defrag option '-c' remains unchanged. Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dfc5b486228902..83ccadaf67050f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1405,21 +1405,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, filemap_flush(inode->i_mapping); } - if (do_compress) { - /* the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { From 736cd52e0c720103f52ab9da47b6cc3af6b083f6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 7 Sep 2017 11:22:22 -0600 Subject: [PATCH 051/109] Btrfs: remove nr_async_submits and async_submit_draining Now that we have the combo of flushing twice, which can make sure IO have started since the second flush will wait for page lock which won't be unlocked unless setting page writeback and queuing ordered extents, we don't need %async_submit_draining, %async_delalloc_pages and %nr_async_submits to tell whether the IO has actually started. 
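[ illustration, not part of the patch: a rough sketch of the "flush twice" pattern the two changelogs above rely on; filemap_flush() is the real kernel API, the wrapper name is made up: ]

/*
 * The first flush hands the dirty delalloc pages to the async
 * compression/submit workers; the second flush has to wait on the page
 * locks those workers still hold, so once it returns the pages are
 * marked writeback and the ordered extents have been queued.
 */
static void flush_delalloc_twice(struct inode *inode)
{
        filemap_flush(inode->i_mapping);
        filemap_flush(inode->i_mapping);
}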
Moreover, all the flushers in use are followed by functions that wait for ordered extents to complete, so %nr_async_submits, which tracks whether bio's async submit has made progress, doesn't really make sense. However, %async_delalloc_pages is still required by shrink_delalloc() as that function doesn't flush twice in the normal case (just issues a writeback with WB_REASON_FS_FREE_SPACE). Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/disk-io.c | 24 ------------------------ fs/btrfs/inode.c | 28 ---------------------------- 3 files changed, 54 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bc1b6a0337003a..7995666af95983 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -878,8 +878,6 @@ struct btrfs_fs_info { rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; - atomic_t nr_async_submits; - atomic_t async_submit_draining; atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b18b0c379c1bd..f3e6e8fa19b066 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -740,22 +740,9 @@ static void run_one_async_start(struct btrfs_work *work) static void run_one_async_done(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; - int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = async->fs_info; - - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; - - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_submits) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -803,19 +790,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, async->status = 0; - atomic_inc(&fs_info->nr_async_submits); - if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); - - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->nr_async_submits)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0)); - } - return 0; } @@ -2515,9 +2493,7 @@ int open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); - atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); - atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aaf36f78fa078a..4ddb299af4728c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1222,13 +1222,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->async_delalloc_pages) == - 0)); - } - *nr_written += nr_pages; start = cur_end + 1; } @@ -10332,19 +10325,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) ret = __start_delalloc_inodes(root, delay_iput, -1); if (ret > 0) ret = 0; - /* - * the filemap_flush will queue IO 
into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); return ret; } @@ -10386,14 +10366,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, spin_unlock(&fs_info->delalloc_root_lock); ret = 0; - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); out: if (!list_empty_careful(&splice)) { spin_lock(&fs_info->delalloc_root_lock); From 709a95c3eb21e3b53fa602d75031a89c7118f50b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 11:27:35 +0200 Subject: [PATCH 052/109] btrfs: tree-checker: use %zu format string for size_t We now get a harmless compile-time warning on 32-bit architectures: fs/btrfs/tree-checker.c: In function 'check_extent_data_item': fs/btrfs/tree-checker.c:189:70: error: format '%lu' expects argument of type 'long unsigned int', but argument 6 has type 'unsigned int' [-Werror=format=] This changes the format string to use %zu instead of %lu for size_t. Fixes: c1f6520bf360 ("btrfs: tree-checker: Enhance output for check_extent_data_item") Signed-off-by: Arnd Bergmann Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 0de11417967669..114fc5f0ecc5ef 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -182,7 +182,7 @@ static int check_extent_data_item(struct btrfs_root *root, /* Regular or preallocated extent has fixed item size */ if (item_size != sizeof(*fi)) { file_extent_err(root, leaf, slot, - "invalid item size for reg/prealloc file extent, have %u expect %lu", + "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } From 9e882d6d059ae79afb7b8748cb5ffd83acd058e2 Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Thu, 12 Oct 2017 22:40:42 +0100 Subject: [PATCH 053/109] btrfs: Clean up unused variables in free-space-tree.c Remove variables 'start' and 'end', which are set but never used.
Signed-off-by: Christos Gkekas Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 684f12247db7e3..fe5e0324dca923 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1286,12 +1286,8 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { - u64 start, end; int ret; - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; - block_group->needs_free_space = 0; ret = add_new_free_space_info(trans, fs_info, block_group, path); From adfb69af7d8cb6a3958f75aad1ef4bc96891d116 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 11 Oct 2017 12:46:18 +0800 Subject: [PATCH 054/109] btrfs: add_missing_dev() should return the actual error add_missing_dev() can return device pointer so that IS_ERR/PTR_ERR can be used to check for the actual error that occurred in the function. Signed-off-by: Anand Jain Reviewed-by: Liu Bo [ minor error message adjustment ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 763d04b4e5b74f..23b628cfad198e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6238,7 +6238,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, device = btrfs_alloc_device(NULL, &devid, dev_uuid); if (IS_ERR(device)) - return NULL; + return device; list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -6443,9 +6443,12 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, map->stripes[i].dev = add_missing_dev(fs_info->fs_devices, devid, uuid); - if (!map->stripes[i].dev) { + if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - return -EIO; + btrfs_err(fs_info, + "failed to init missing dev %llu: %ld", + devid, PTR_ERR(map->stripes[i].dev)); + return PTR_ERR(map->stripes[i].dev); } btrfs_report_missing_device(fs_info, devid, uuid); } @@ -6571,8 +6574,12 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, } device = add_missing_dev(fs_devices, devid, dev_uuid); - if (!device) - return -ENOMEM; + if (IS_ERR(device)) { + btrfs_err(fs_info, + "failed to add missing dev %llu: %ld", + devid, PTR_ERR(device)); + return PTR_ERR(device); + } btrfs_report_missing_device(fs_info, devid, dev_uuid); } else { if (!device->bdev) { From 45dbdbc9f66c749fa329130dcd641eb0606129d0 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 9 Oct 2017 11:07:44 +0800 Subject: [PATCH 055/109] btrfs: fix EIO misuse to report missing degraded option EIO is only for the IO failure to the device, avoid it. Use ENOENT as that's the closest error code describing what happened. 
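[ illustration, not part of the patch: add_missing_dev() above now follows the kernel's error-pointer convention from include/linux/err.h. A small user-space re-implementation of the idea behind ERR_PTR()/IS_ERR()/PTR_ERR(); the demo function and values are made up: ]

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *add_missing_dev_demo(int fail)
{
        static int device;                      /* stand-in for a real object */

        if (fail)
                return ERR_PTR(-ENOMEM);        /* caller gets the real errno */
        return &device;
}

int main(void)
{
        void *dev = add_missing_dev_demo(1);

        if (IS_ERR(dev))
                printf("failed to init missing dev: %ld\n", PTR_ERR(dev));
        return 0;
}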
Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 23b628cfad198e..3896dfd6c8e047 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6437,7 +6437,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); btrfs_report_missing_device(fs_info, devid, uuid); - return -EIO; + return -ENOENT; } if (!map->stripes[i].dev) { map->stripes[i].dev = @@ -6570,7 +6570,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, dev_uuid); - return -EIO; + return -ENOENT; } device = add_missing_dev(fs_devices, devid, dev_uuid); @@ -6585,7 +6585,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, if (!device->bdev) { btrfs_report_missing_device(fs_info, devid, dev_uuid); if (!btrfs_test_opt(fs_info, DEGRADED)) - return -EIO; + return -ENOENT; } if(!device->bdev && !device->missing) { From 5a2b8e601c572527adfc4986941311c21effae6c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 9 Oct 2017 11:07:45 +0800 Subject: [PATCH 056/109] btrfs: declare btrfs_report_missing_device() static Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 ++++++------ fs/btrfs/volumes.h | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3896dfd6c8e047..ce5e9ec4cdde0c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6366,6 +6366,12 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid) +{ + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); +} + static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -6752,12 +6758,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) return -EIO; } -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid) -{ - btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); -} - /* * Check if all chunks in the fs are OK for read-write degraded mount * diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6108fdfec67fb4..ff15208344a7b0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -542,7 +542,5 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid); #endif From 2b902dfc896ffab875842ed62820dca0b99ff461 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 9 Oct 2017 11:07:46 +0800 Subject: [PATCH 057/109] btrfs: fix use of error or warning for missing device When device is missing without the -o degraded option then its an error so report it as an error instead of a warning. And when -o degraded option is provided, log the missing device as warning. 
Signed-off-by: Anand Jain Reviewed-by: David Sterba [ switch error to bool ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ce5e9ec4cdde0c..4931b2df4043c9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6367,9 +6367,14 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, } static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, - u64 devid, u8 *uuid) + u64 devid, u8 *uuid, bool error) { - btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); + if (error) + btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); + else + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); } static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, @@ -6442,7 +6447,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid); + btrfs_report_missing_device(fs_info, devid, uuid, true); return -ENOENT; } if (!map->stripes[i].dev) { @@ -6456,7 +6461,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, devid, PTR_ERR(map->stripes[i].dev)); return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid); + btrfs_report_missing_device(fs_info, devid, uuid, false); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6575,7 +6580,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); + btrfs_report_missing_device(fs_info, devid, + dev_uuid, true); return -ENOENT; } @@ -6586,12 +6592,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, devid, PTR_ERR(device)); return PTR_ERR(device); } - btrfs_report_missing_device(fs_info, devid, dev_uuid); + btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } else { if (!device->bdev) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, + devid, dev_uuid, true); return -ENOENT; + } + btrfs_report_missing_device(fs_info, devid, + dev_uuid, false); } if(!device->bdev && !device->missing) { From 2351f431f727223b4d82851ee8fc045bf51edd34 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 27 Sep 2017 10:43:13 -0400 Subject: [PATCH 058/109] btrfs: fix send ioctl on 32bit with 64bit kernel We pass in a pointer in our send arg struct, this means the struct size doesn't match with 32bit user space and 64bit kernel space. Fix this by adding a compat mode and doing the appropriate conversion. 
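[ illustration, not part of the patch: the root cause is plain C struct layout. A user pointer member is 8 bytes for 64-bit user space but 4 bytes for 32-bit user space, so the ioctl argument struct has two sizes and therefore needs two ioctl numbers. Standalone sketch; the struct names are made up, the member layout mirrors the send args structures in the patch: ]

#include <stdint.h>
#include <stdio.h>

/* what a 64-bit process passes to the ioctl */
struct send_args_native {
        int64_t  send_fd;
        uint64_t clone_sources_count;
        uint64_t *clone_sources;        /* 8-byte user pointer */
        uint64_t parent_root;
        uint64_t flags;
        uint64_t reserved[4];
};

/* what a 32-bit process passes: the pointer shrinks to 4 bytes */
struct send_args_compat {
        int64_t  send_fd;
        uint64_t clone_sources_count;
        uint32_t clone_sources;         /* compat_uptr_t equivalent */
        uint64_t parent_root;
        uint64_t flags;
        uint64_t reserved[4];
} __attribute__((__packed__));

int main(void)
{
        /* on a 64-bit build this prints 72 vs 68 bytes */
        printf("native: %zu bytes, compat: %zu bytes\n",
               sizeof(struct send_args_native),
               sizeof(struct send_args_compat));
        return 0;
}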
Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ move structure to the beginning, next to receive 32bit compat ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/send.c | 12 ++--------- fs/btrfs/send.h | 2 +- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 83ccadaf67050f..f969c034d7b383 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -86,6 +86,19 @@ struct btrfs_ioctl_received_subvol_args_32 { struct btrfs_ioctl_received_subvol_args_32) #endif +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) +struct btrfs_ioctl_send_args_32 { + __s64 send_fd; /* in */ + __u64 clone_sources_count; /* in */ + compat_uptr_t clone_sources; /* in */ + __u64 parent_root; /* in */ + __u64 flags; /* in */ + __u64 reserved[4]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ + struct btrfs_ioctl_send_args_32) +#endif static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff, @@ -5463,6 +5476,41 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) return ret; } +static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_send_args *arg; + int ret; + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_send_args_32 args32; + + ret = copy_from_user(&args32, argp, sizeof(args32)); + if (ret) + return -EFAULT; + arg = kzalloc(sizeof(*arg), GFP_KERNEL); + if (!arg) + return -ENOMEM; + arg->send_fd = args32.send_fd; + arg->clone_sources_count = args32.clone_sources_count; + arg->clone_sources = compat_ptr(args32.clone_sources); + arg->parent_root = args32.parent_root; + arg->flags = args32.flags; + memcpy(arg->reserved, args32.reserved, + sizeof(args32.reserved)); +#else + return -ENOTTY; +#endif + } else { + arg = memdup_user(argp, sizeof(*arg)); + if (IS_ERR(arg)) + return PTR_ERR(arg); + } + ret = btrfs_ioctl_send(file, arg); + kfree(arg); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5568,7 +5616,11 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return btrfs_ioctl_send(file, argp); + return _btrfs_ioctl_send(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_SEND_32: + return _btrfs_ioctl_send(file, argp, true); +#endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); case BTRFS_IOC_QUOTA_CTL: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0746eda7231d2d..d9ddcdbdd2e7d3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "send.h" #include "backref.h" @@ -6371,13 +6372,12 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) spin_unlock(&root->root_item_lock); } -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) { int ret = 0; struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; @@ -6413,13 +6413,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - arg = 
memdup_user(arg_, sizeof(*arg)); - if (IS_ERR(arg)) { - ret = PTR_ERR(arg); - arg = NULL; - goto out; - } - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside @@ -6660,7 +6653,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) btrfs_root_dec_send_in_progress(sctx->parent_root); - kfree(arg); kvfree(clone_sources_tmp); if (sctx) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 02e00166c4dab1..3aa4bc55754fdb 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -130,5 +130,5 @@ enum { #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); #endif From 6aa21263e3a0594c2bd37eac784a0259e47dabbd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Oct 2017 17:07:07 +0200 Subject: [PATCH 059/109] btrfs: scrub: get rid of sector_t The use of sector_t is not necessry, it's just for a warning. Switch to u64 and rename the variable and use byte units instead of 512b, ie. dropping the >> 9 shifts. The messages are adjusted as well. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e3f6c49e5c4d8a..cd1b791d9706cc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -231,7 +231,7 @@ struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; const char *errstr; - sector_t sector; + u64 physical; u64 logical; struct btrfs_device *dev; }; @@ -797,10 +797,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); @@ -810,10 +810,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, ret); free_ipath(ipath); @@ -845,7 +845,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (!path) return; - swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.physical = sblock->pagev[0]->physical; swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -868,10 +868,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) item_size, &ref_root, &ref_level); btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu", +"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, 
rcu_str_deref(dev->name), - (unsigned long long)swarn.sector, + swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); From 6c5a4e2c12597ba7cc0c6c644e7c9d04a627001c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Oct 2017 17:10:34 +0200 Subject: [PATCH 060/109] btrfs: rename page offset parameter in submit_extent_page We're going to remove sector_t and will use 'offset', so this patch frees the name. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0dfcef37352b01..4376095d2db900 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2761,7 +2761,7 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, sector_t sector, - size_t size, unsigned long offset, + size_t size, unsigned long pg_offset, struct block_device *bdev, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -2785,8 +2785,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || - bio_add_page(bio, page, page_size, offset) < page_size) { + merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) || + bio_add_page(bio, page, page_size, pg_offset) < page_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -2801,7 +2801,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } bio = btrfs_bio_alloc(bdev, (u64)sector << 9); - bio_add_page(bio, page, page_size, offset); + bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; From 6273b7f8ed78d1acc3db6774b9ab07f04115f520 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Oct 2017 17:30:11 +0200 Subject: [PATCH 061/109] btrfs: get rid of sector_t and use u64 offset in submit_extent_page The use of sector_t in the callchain of submit_extent_page is not necessary. Switch to u64 and rename the variable and use byte units instead of 512b, ie. dropping the >> 9 shifts and avoiding the con(tro)versions of sector_t. 
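[ illustration, not part of the patch: the conversion being dropped is just the 512-byte sector shift; a trivial standalone demo of the byte/sector equivalence with an arbitrary offset: ]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t offset = 1073741824ULL;        /* byte offset, here 1 GiB */
        uint64_t sector = offset >> 9;          /* what sector_t used to hold */

        /*
         * The patched code keeps byte offsets everywhere and shifts only
         * where a 512-byte sector number is genuinely needed.
         */
        printf("bytes=%llu sectors=%llu bytes-again=%llu\n",
               (unsigned long long)offset,
               (unsigned long long)sector,
               (unsigned long long)(sector << 9));
        return 0;
}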
Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4376095d2db900..dd941885b9c30b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2760,7 +2760,7 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, */ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, - struct page *page, sector_t sector, + struct page *page, u64 offset, size_t size, unsigned long pg_offset, struct block_device *bdev, struct bio **bio_ret, @@ -2775,6 +2775,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, int contig = 0; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_SIZE); + sector_t sector = offset >> 9; if (bio_ret && *bio_ret) { bio = *bio_ret; @@ -2800,7 +2801,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, (u64)sector << 9); + bio = btrfs_bio_alloc(bdev, offset); bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; @@ -2891,7 +2892,6 @@ static int __do_readpage(struct extent_io_tree *tree, u64 last_byte = i_size_read(inode); u64 block_start; u64 cur_end; - sector_t sector; struct extent_map *em; struct block_device *bdev; int ret = 0; @@ -2927,6 +2927,7 @@ static int __do_readpage(struct extent_io_tree *tree, } while (cur <= end) { bool force_bio_submit = false; + u64 offset; if (cur >= last_byte) { char *userpage; @@ -2966,9 +2967,9 @@ static int __do_readpage(struct extent_io_tree *tree, iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; - sector = em->block_start >> 9; + offset = em->block_start; } else { - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; disk_io_size = iosize; } bdev = em->bdev; @@ -3061,8 +3062,8 @@ static int __do_readpage(struct extent_io_tree *tree, } ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, - page, sector, disk_io_size, pg_offset, - bdev, bio, + page, offset, disk_io_size, + pg_offset, bdev, bio, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag, @@ -3323,7 +3324,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 extent_offset; u64 block_start; u64 iosize; - sector_t sector; struct extent_map *em; struct block_device *bdev; size_t pg_offset = 0; @@ -3366,6 +3366,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + u64 offset; if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) @@ -3387,7 +3388,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, BUG_ON(end < cur); iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; bdev = em->bdev; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -3430,7 +3431,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - page, sector, iosize, pg_offset, + page, offset, iosize, pg_offset, bdev, &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); @@ -3744,7 +3745,7 @@ static 
noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - p, offset >> 9, PAGE_SIZE, 0, bdev, + p, offset, PAGE_SIZE, 0, bdev, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); From fb592373cddeb4ed6c21eef4d6063f15176ab463 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:48 -0400 Subject: [PATCH 062/109] btrfs: add ref-verify mount option This adds the infrastructure for turning ref verify on and off for a mount, to be used by a later patch. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ enhnance btrfs_print_mod_info to print if ref-verify is compiled in ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/super.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7995666af95983..75947c6c04c3c2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1335,6 +1335,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) +#define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 770ac088dd9ccd..880ab4949f698b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -325,6 +325,9 @@ enum { Opt_nologreplay, Opt_norecovery, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + Opt_ref_verify, #endif Opt_err, }; @@ -386,6 +389,9 @@ static const match_table_t tokens = { {Opt_fragment_data, "fragment=data"}, {Opt_fragment_metadata, "fragment=metadata"}, {Opt_fragment_all, "fragment=all"}, +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + {Opt_ref_verify, "ref_verify"}, #endif {Opt_err, NULL}, }; @@ -824,6 +830,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_info(info, "fragmenting data"); btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); break; +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + case Opt_ref_verify: + btrfs_info(info, "doing ref verification"); + btrfs_set_opt(info->mount_opt, REF_VERIFY); + break; #endif case Opt_err: btrfs_info(info, "unrecognized mount option '%s'", p); @@ -1305,6 +1317,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif + if (btrfs_test_opt(info, REF_VERIFY)) + seq_puts(seq, ",ref_verify"); seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); @@ -2318,6 +2332,9 @@ static void btrfs_print_mod_info(void) #endif #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + ", ref-verify=on" #endif "\n", btrfs_crc32c_impl()); From 84f7d8e6242ceb377c7af10a7133c653cc7fea5f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:49 -0400 Subject: [PATCH 063/109] btrfs: pass root to various extent ref mod functions We need the actual root for the ref verifier tool to work, so change these functions to pass the root around instead. This will be used in a subsequent patch. 
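[ illustration, not part of the patch: passing the root carries strictly more information than passing fs_info, because both the fs_info and the owning tree's objectid hang off struct btrfs_root. The helper names below are made up; the member accesses match the diffs that follow: ]

static inline struct btrfs_fs_info *ref_fs_info(const struct btrfs_root *root)
{
        return root->fs_info;                   /* the old fs_info argument */
}

static inline u64 ref_root_objectid(const struct btrfs_root *root)
{
        return root->root_key.objectid;         /* what was passed as root_objectid */
}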
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- fs/btrfs/ctree.h | 7 ++++--- fs/btrfs/extent-tree.c | 24 +++++++++++++----------- fs/btrfs/file.c | 10 +++++----- fs/btrfs/inode.c | 9 +++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 14 +++++++------- fs/btrfs/tree-log.c | 2 +- 8 files changed, 37 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 19b9c5131745a1..531e0a8645b08a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -192,7 +192,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * tree until you end up with a lock on the root. A locked buffer * is returned, with a reference held. */ -static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 75947c6c04c3c2..2fcc3c30d471fd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2637,7 +2637,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, @@ -2656,7 +2656,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2668,7 +2668,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2807,6 +2807,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 624aa6841bf2cb..a9787c3c0f7e7a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2177,10 +2177,11 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; @@ -3339,7 +3340,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, - struct btrfs_fs_info *, + struct btrfs_root *, u64, u64, u64, u64, u64, u64); @@ -3379,7 +3380,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, 
fi); key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, key.offset); if (ret) @@ -3387,7 +3388,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0); if (ret) goto fail; @@ -7331,17 +7332,17 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. @@ -8308,17 +8309,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, root_objectid, owner, + ins->offset, 0, + root->root_key.objectid, owner, offset, ram_bytes, BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; @@ -8896,7 +8898,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret); } } - ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0); if (ret) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 79945053c7e756..d3c1725f03e41b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -856,7 +856,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, @@ -940,7 +940,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - ret = btrfs_free_extent(trans, fs_info, + ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - @@ -1234,7 +1234,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, extent_end - split); btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes, + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1268,7 +1268,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1288,7 +1288,7 
@@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, key.offset = other_start; del_slot = path->slots[0]; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4ddb299af4728c..a11f87fa79d089 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2218,8 +2218,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret < 0) goto out; qg_released = ret; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2671,7 +2672,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, inode_add_bytes(inode, len); btrfs_release_path(path); - ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr, + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ @@ -4665,7 +4666,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, root == fs_info->tree_root)) { btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; - ret = btrfs_free_extent(trans, fs_info, extent_start, + ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f969c034d7b383..847d318756d474 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3694,7 +3694,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, - fs_info, + root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(BTRFS_I(inode)), diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9841faef08ea72..7d506a3e46dd82 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1742,7 +1742,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); @@ -1751,7 +1751,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { @@ -1952,21 +1952,21 @@ int replace_path(struct btrfs_trans_handle *trans, path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(path->nodes[level]); - ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr, + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize, + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize, + ret = 
btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); @@ -2808,7 +2808,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(upper->eb); - ret = btrfs_inc_extent_ref(trans, root->fs_info, + ret = btrfs_inc_extent_ref(trans, root, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1036ac7313a7d6..aa7c71cff575a5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -717,7 +717,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, key->objectid, offset); From fd708b81d972a0714b02a60eb4792fdbf15868c4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:50 -0400 Subject: [PATCH 064/109] Btrfs: add an extent ref verify tool We were having corruption issues that were tied back to problems with the extent tree. In order to track them down I built this tool to try to find the culprit, which was pretty successful. If you compile with this tool enabled, it will verify every ref update the fs makes at run time and make sure it is consistent and valid. I've run this through xfstests and haven't gotten any false positives. Thanks, Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ update error messages, add fixup from Dan Carpenter to handle errors of read_tree_block ] Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 11 + fs/btrfs/Makefile | 1 + fs/btrfs/ctree.h | 5 + fs/btrfs/disk-io.c | 6 + fs/btrfs/extent-tree.c | 22 + fs/btrfs/ref-verify.c | 1031 ++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ref-verify.h | 62 +++ 7 files changed, 1138 insertions(+) create mode 100644 fs/btrfs/ref-verify.c create mode 100644 fs/btrfs/ref-verify.h diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a26c63b4ad6875..2e558227931aa0 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -91,3 +91,14 @@ config BTRFS_ASSERT any of the assertions trip. This is meant for btrfs developers only. If unsure, say N. + +config BTRFS_FS_REF_VERIFY + bool "Btrfs with the ref verify tool compiled in" + depends on BTRFS_FS + default n + help + Enable run-time extent reference verification instrumentation. This + is meant to be used by btrfs developers for tracking down extent + reference problems or verifying they didn't break something. + + If unsure, say N.
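For orientation before the new ref-verify.c file below, here is a deliberately tiny userspace sketch of the bookkeeping idea the tool implements; it is illustrative only and not part of the patch, all names in it are made up, and the real code keys rb-trees by bytenr and additionally tracks per-root counts, per-ref counts and the full action history with stack traces.

#include <stdio.h>

/* Toy model: one tracked extent with a plain reference count. */
struct toy_block_entry {
	unsigned long long bytenr;
	long long num_refs;
};

static struct toy_block_entry be = { .bytenr = 4096, .num_refs = 0 };

/*
 * Mirror a ref update and complain when it disagrees with the mirror;
 * this is the same kind of cross-check btrfs_ref_tree_mod() below does
 * for every delayed ref the filesystem queues.
 */
static void toy_ref_mod(unsigned long long bytenr, int add)
{
	if (bytenr != be.bytenr)
		return;
	if (!add && be.num_refs == 0) {
		fprintf(stderr, "dropping a ref on %llu that has no refs\n",
			bytenr);
		return;
	}
	be.num_refs += add ? 1 : -1;
}

int main(void)
{
	toy_ref_mod(4096, 1);	/* allocation adds the first ref */
	toy_ref_mod(4096, 0);	/* a matching drop is consistent */
	toy_ref_mod(4096, 0);	/* a second drop trips the check */
	return 0;
}

When the kernel version detects such an inconsistency it dumps the recorded block entry and ref action history (see dump_block_entry() and dump_ref_action() below) and clears the REF_VERIFY option so verification stops for that mount.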
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 88255e133ade85..9216ff22d741ae 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -13,6 +13,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2fcc3c30d471fd..e2afe524e25ea4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1097,6 +1097,11 @@ struct btrfs_fs_info { u32 nodesize; u32 sectorsize; u32 stripesize; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f3e6e8fa19b066..f971d5680e6f26 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -51,6 +51,7 @@ #include "qgroup.h" #include "compression.h" #include "tree-checker.h" +#include "ref-verify.h" #ifdef CONFIG_X86 #include @@ -2509,6 +2510,7 @@ int open_ctree(struct super_block *sb, /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); + btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -2920,6 +2922,9 @@ int open_ctree(struct super_block *sb, if (ret) goto fail_trans_kthread; + if (btrfs_build_ref_tree(fs_info)) + btrfs_err(fs_info, "couldn't build ref tree"); + /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { @@ -3785,6 +3790,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a9787c3c0f7e7a..52d24699434199 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -38,6 +38,7 @@ #include "math.h" #include "sysfs.h" #include "qgroup.h" +#include "ref-verify.h" #undef SCRAMBLE_DELAYED_REFS @@ -2188,6 +2189,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, + owner, offset, BTRFS_ADD_DELAYED_REF); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, @@ -7280,6 +7284,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { int old_ref_mod, new_ref_mod; + btrfs_ref_tree_mod(root, buf->start, buf->len, parent, + root->root_key.objectid, + btrfs_header_level(buf), 0, + BTRFS_DROP_DELAYED_REF); ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, buf->len, parent, root->root_key.objectid, @@ -7343,6 +7351,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, if (btrfs_is_testing(fs_info)) return 0; + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, + root_objectid, owner, offset, + BTRFS_DROP_DELAYED_REF); + /* * tree log blocks never actually go into 
the extent allocation * tree, just update pinning info and exit early. @@ -8318,6 +8331,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, + root->root_key.objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT); + ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, ins->offset, 0, root->root_key.objectid, owner, @@ -8542,6 +8559,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; + btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, + root_objectid, level, 0, + BTRFS_ADD_DELAYED_EXTENT); ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, @@ -10391,6 +10411,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * remove it. */ free_excluded_extents(fs_info, block_group); + btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, + block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); index = get_block_group_index(block_group); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c new file mode 100644 index 00000000000000..34878699d363e6 --- /dev/null +++ b/fs/btrfs/ref-verify.c @@ -0,0 +1,1031 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "delayed-ref.h" +#include "ref-verify.h" + +/* + * Used to keep track the roots and number of refs each root has for a given + * bytenr. This just tracks the number of direct references, no shared + * references. + */ +struct root_entry { + u64 root_objectid; + u64 num_refs; + struct rb_node node; +}; + +/* + * These are meant to represent what should exist in the extent tree, these can + * be used to verify the extent tree is consistent as these should all match + * what the extent tree says. + */ +struct ref_entry { + u64 root_objectid; + u64 parent; + u64 owner; + u64 offset; + u64 num_refs; + struct rb_node node; +}; + +#define MAX_TRACE 16 + +/* + * Whenever we add/remove a reference we record the action. The action maps + * back to the delayed ref action. We hold the ref we are changing in the + * action so we can account for the history properly, and we record the root we + * were called with since it could be different from ref_root. We also store + * stack traces because thats how I roll. + */ +struct ref_action { + int action; + u64 root; + struct ref_entry ref; + struct list_head list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; +}; + +/* + * One of these for every block we reference, it holds the roots and references + * to it as well as all of the ref actions that have occured to it. 
We never + * free it until we unmount the file system in order to make sure re-allocations + * are happening properly. + */ +struct block_entry { + u64 bytenr; + u64 len; + u64 num_refs; + int metadata; + int from_disk; + struct rb_root roots; + struct rb_root refs; + struct rb_node node; + struct list_head actions; +}; + +static struct block_entry *insert_block_entry(struct rb_root *root, + struct block_entry *be) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct block_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct block_entry, node); + if (entry->bytenr > be->bytenr) + p = &(*p)->rb_left; + else if (entry->bytenr < be->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&be->node, parent_node, p); + rb_insert_color(&be->node, root); + return NULL; +} + +static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n; + struct block_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < bytenr) + n = n->rb_right; + else if (entry->bytenr > bytenr) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +static struct root_entry *insert_root_entry(struct rb_root *root, + struct root_entry *re) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct root_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct root_entry, node); + if (entry->root_objectid > re->root_objectid) + p = &(*p)->rb_left; + else if (entry->root_objectid < re->root_objectid) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&re->node, parent_node, p); + rb_insert_color(&re->node, root); + return NULL; + +} + +static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) +{ + if (ref1->root_objectid < ref2->root_objectid) + return -1; + if (ref1->root_objectid > ref2->root_objectid) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + if (ref1->owner < ref2->owner) + return -1; + if (ref1->owner > ref2->owner) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + return 0; +} + +static struct ref_entry *insert_ref_entry(struct rb_root *root, + struct ref_entry *ref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct ref_entry *entry; + int cmp; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct ref_entry, node); + cmp = comp_refs(entry, ref); + if (cmp > 0) + p = &(*p)->rb_left; + else if (cmp < 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&ref->node, parent_node, p); + rb_insert_color(&ref->node, root); + return NULL; + +} + +static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) +{ + struct rb_node *n; + struct root_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct root_entry, node); + if (entry->root_objectid < objectid) + n = n->rb_right; + else if (entry->root_objectid > objectid) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +#ifdef CONFIG_STACKTRACE +static void __save_stack_trace(struct ref_action *ra) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = ra->trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + ra->trace_len = 
stack_trace.nr_entries; +} + +static void __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + struct stack_trace trace; + + if (ra->trace_len == 0) { + btrfs_err(fs_info, " ref-verify: no stacktrace"); + return; + } + trace.nr_entries = ra->trace_len; + trace.entries = ra->trace; + print_stack_trace(&trace, 2); +} +#else +static void inline __save_stack_trace(struct ref_action *ra) +{ +} + +static void inline __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, " ref-verify: no stacktrace support"); +} +#endif + +static void free_block_entry(struct block_entry *be) +{ + struct root_entry *re; + struct ref_entry *ref; + struct ref_action *ra; + struct rb_node *n; + + while ((n = rb_first(&be->roots))) { + re = rb_entry(n, struct root_entry, node); + rb_erase(&re->node, &be->roots); + kfree(re); + } + + while((n = rb_first(&be->refs))) { + ref = rb_entry(n, struct ref_entry, node); + rb_erase(&ref->node, &be->refs); + kfree(ref); + } + + while (!list_empty(&be->actions)) { + ra = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&ra->list); + kfree(ra); + } + kfree(be); +} + +static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 len, + u64 root_objectid) +{ + struct block_entry *be = NULL, *exist; + struct root_entry *re = NULL; + + re = kzalloc(sizeof(struct root_entry), GFP_KERNEL); + be = kzalloc(sizeof(struct block_entry), GFP_KERNEL); + if (!be || !re) { + kfree(re); + kfree(be); + return ERR_PTR(-ENOMEM); + } + be->bytenr = bytenr; + be->len = len; + + re->root_objectid = root_objectid; + re->num_refs = 0; + + spin_lock(&fs_info->ref_verify_lock); + exist = insert_block_entry(&fs_info->block_tree, be); + if (exist) { + if (root_objectid) { + struct root_entry *exist_re; + + exist_re = insert_root_entry(&exist->roots, re); + if (exist_re) + kfree(re); + } + kfree(be); + return exist; + } + + be->num_refs = 0; + be->metadata = 0; + be->from_disk = 0; + be->roots = RB_ROOT; + be->refs = RB_ROOT; + INIT_LIST_HEAD(&be->actions); + if (root_objectid) + insert_root_entry(&be->roots, re); + else + kfree(re); + return be; +} + +static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, + u64 parent, u64 bytenr, int level) +{ + struct block_entry *be; + struct root_entry *re; + struct ref_entry *ref = NULL, *exist; + + ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + + if (parent) + ref->root_objectid = 0; + else + ref->root_objectid = ref_root; + ref->parent = parent; + ref->owner = level; + ref->offset = 0; + ref->num_refs = 1; + + be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs++; + be->from_disk = 1; + be->metadata = 1; + + if (!parent) { + ASSERT(ref_root); + re = lookup_root_entry(&be->roots, ref_root); + ASSERT(re); + re->num_refs++; + } + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + exist->num_refs++; + kfree(ref); + } + spin_unlock(&fs_info->ref_verify_lock); + + return 0; +} + +static int add_shared_data_ref(struct btrfs_fs_info *fs_info, + u64 parent, u32 num_refs, u64 bytenr, + u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, 0); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = parent; + 
ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing shared ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int add_extent_data_ref(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *dref, + u64 bytenr, u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + struct root_entry *re; + u64 ref_root = btrfs_extent_data_ref_root(leaf, dref); + u64 owner = btrfs_extent_data_ref_objectid(leaf, dref); + u64 offset = btrfs_extent_data_ref_offset(leaf, dref); + u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = 0; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "missing root in new block entry?"); + return -EINVAL; + } + re->num_refs += num_refs; + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int process_extent_item(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *key, + int slot, int *tree_block_level) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct extent_buffer *leaf = path->nodes[0]; + u32 item_size = btrfs_item_size_nr(leaf, slot); + unsigned long end, ptr; + u64 offset, flags, count; + int type, ret; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + if ((key->type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *tree_block_level = btrfs_tree_block_level(leaf, info); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + if (key->type == BTRFS_METADATA_ITEM_KEY) + *tree_block_level = key->offset; + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, offset, 0, key->objectid, + *tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, offset, key->objectid, + *tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = add_extent_data_ref(fs_info, leaf, dref, + key->objectid, key->offset); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, offset, count, + key->objectid, key->offset); + break; + default: + btrfs_err(fs_info, "invalid key type in iref"); 
+ ret = -EINVAL; + break; + } + if (ret) + break; + ptr += btrfs_extent_inline_ref_size(type); + } + return ret; +} + +static int process_leaf(struct btrfs_root *root, + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u32 count; + int i = 0, tree_block_level = 0, ret; + struct btrfs_key key; + int nritems = btrfs_header_nritems(leaf); + + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + switch (key.type) { + case BTRFS_EXTENT_ITEM_KEY: + *num_bytes = key.offset; + case BTRFS_METADATA_ITEM_KEY: + *bytenr = key.objectid; + ret = process_extent_item(fs_info, path, &key, i, + &tree_block_level); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, key.offset, 0, + key.objectid, tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, key.offset, + key.objectid, tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(leaf, i, + struct btrfs_extent_data_ref); + ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr, + *num_bytes); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(leaf, i, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, key.offset, count, + *bytenr, *num_bytes); + break; + default: + break; + } + if (ret) + break; + } + return ret; +} + +/* Walk down to the leaf from the given level */ +static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, + int level, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *eb; + u64 block_bytenr, gen; + int ret = 0; + + while (level >= 0) { + if (level) { + block_bytenr = btrfs_node_blockptr(path->nodes[level], + path->slots[level]); + gen = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + eb = read_tree_block(fs_info, block_bytenr, gen); + if (IS_ERR(eb)) + return PTR_ERR(eb); + if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[level-1] = eb; + path->slots[level-1] = 0; + path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; + } else { + ret = process_leaf(root, path, bytenr, num_bytes); + if (ret) + break; + } + level--; + } + return ret; +} + +/* Walk up to the next node that needs to be processed */ +static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + int l; + + for (l = 0; l < BTRFS_MAX_LEVEL; l++) { + if (!path->nodes[l]) + continue; + if (l) { + path->slots[l]++; + if (path->slots[l] < + btrfs_header_nritems(path->nodes[l])) { + *level = l; + return 0; + } + } + btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]); + free_extent_buffer(path->nodes[l]); + path->nodes[l] = NULL; + path->slots[l] = 0; + path->locks[l] = 0; + } + + return 1; +} + +static void dump_ref_action(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, +" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent, + ra->ref.owner, ra->ref.offset, ra->ref.num_refs); + __print_stack_trace(fs_info, ra); +} + +/* + * Dumps all the information from the block entry to printk, it's going to be + * awesome. 
+ */ +static void dump_block_entry(struct btrfs_fs_info *fs_info, + struct block_entry *be) +{ + struct ref_entry *ref; + struct root_entry *re; + struct ref_action *ra; + struct rb_node *n; + + btrfs_err(fs_info, +"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d", + be->bytenr, be->len, be->num_refs, be->metadata, + be->from_disk); + + for (n = rb_first(&be->refs); n; n = rb_next(n)) { + ref = rb_entry(n, struct ref_entry, node); + btrfs_err(fs_info, +" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ref->root_objectid, ref->parent, ref->owner, + ref->offset, ref->num_refs); + } + + for (n = rb_first(&be->roots); n; n = rb_next(n)) { + re = rb_entry(n, struct root_entry, node); + btrfs_err(fs_info, " root entry %llu, num_refs %llu", + re->root_objectid, re->num_refs); + } + + list_for_each_entry(ra, &be->actions, list) + dump_ref_action(fs_info, ra); +} + +/* + * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * @root: the root we are making this modification from. + * @bytenr: the bytenr we are modifying. + * @num_bytes: number of bytes. + * @parent: the parent bytenr. + * @ref_root: the original root owner of the bytenr. + * @owner: level in the case of metadata, inode in the case of data. + * @offset: 0 for metadata, file offset for data. + * @action: the action that we are doing, this is the same as the delayed ref + * action. + * + * This will add an action item to the given bytenr and do sanity checks to make + * sure we haven't messed something up. If we are making a new allocation and + * this block entry has history we will delete all previous actions as long as + * our sanity checks pass as they are no longer needed. + */ +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct ref_entry *ref = NULL, *exist; + struct ref_action *ra = NULL; + struct block_entry *be = NULL; + struct root_entry *re = NULL; + int ret = 0; + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + if (!btrfs_test_opt(root->fs_info, REF_VERIFY)) + return 0; + + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); + ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); + if (!ra || !ref) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + + if (parent) { + ref->parent = parent; + } else { + ref->root_objectid = ref_root; + ref->owner = owner; + ref->offset = offset; + } + ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; + + memcpy(&ra->ref, ref, sizeof(struct ref_entry)); + /* + * Save the extra info from the delayed ref in the ref action to make it + * easier to figure out what is happening. The real ref's we add to the + * ref tree need to reflect what we save on disk so it matches any + * on-disk refs we pre-loaded. + */ + ra->ref.owner = owner; + ra->ref.offset = offset; + ra->ref.root_objectid = ref_root; + __save_stack_trace(ra); + + INIT_LIST_HEAD(&ra->list); + ra->action = action; + ra->root = root->objectid; + + /* + * This is an allocation, preallocate the block_entry in case we haven't + * used it before. + */ + ret = -EINVAL; + if (action == BTRFS_ADD_DELAYED_EXTENT) { + /* + * For subvol_create we'll just pass in whatever the parent root + * is and the new root objectid, so let's not treat the passed + * in root as if it really has a ref for this bytenr. 
+ */ + be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ra); + ret = PTR_ERR(be); + goto out; + } + be->num_refs++; + if (metadata) + be->metadata = 1; + + if (be->num_refs != 1) { + btrfs_err(fs_info, + "re-allocated a block that still has references to it!"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + goto out_unlock; + } + + while (!list_empty(&be->actions)) { + struct ref_action *tmp; + + tmp = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&tmp->list); + kfree(tmp); + } + } else { + struct root_entry *tmp; + + if (!parent) { + re = kmalloc(sizeof(struct root_entry), GFP_NOFS); + if (!re) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + /* + * This is the root that is modifying us, so it's the + * one we want to lookup below when we modify the + * re->num_refs. + */ + ref_root = root->objectid; + re->root_objectid = root->objectid; + re->num_refs = 0; + } + + spin_lock(&root->fs_info->ref_verify_lock); + be = lookup_block_entry(&root->fs_info->block_tree, bytenr); + if (!be) { + btrfs_err(fs_info, +"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", + action, (unsigned long long)bytenr, + (unsigned long long)num_bytes); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + + if (!parent) { + tmp = insert_root_entry(&be->roots, re); + if (tmp) { + kfree(re); + re = tmp; + } + } + } + + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + if (action == BTRFS_DROP_DELAYED_REF) { + if (exist->num_refs == 0) { + btrfs_err(fs_info, +"dropping a ref for a existing root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + exist->num_refs--; + if (exist->num_refs == 0) { + rb_erase(&exist->node, &be->refs); + kfree(exist); + } + } else if (!be->metadata) { + exist->num_refs++; + } else { + btrfs_err(fs_info, +"attempting to add another ref for an existing ref on a tree block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + kfree(ref); + } else { + if (action == BTRFS_DROP_DELAYED_REF) { + btrfs_err(fs_info, +"dropping a ref for a root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + + if (!parent && !re) { + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + /* + * This shouldn't happen because we will add our re + * above when we lookup the be with !parent, but just in + * case catch this case so we don't panic because I + * didn't thik of some other corner case. 
+ */ + btrfs_err(fs_info, "failed to find root %llu for %llu", + root->objectid, be->bytenr); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + if (action == BTRFS_DROP_DELAYED_REF) { + if (re) + re->num_refs--; + be->num_refs--; + } else if (action == BTRFS_ADD_DELAYED_REF) { + be->num_refs++; + if (re) + re->num_refs++; + } + list_add_tail(&ra->list, &be->actions); + ret = 0; +out_unlock: + spin_unlock(&root->fs_info->ref_verify_lock); +out: + if (ret) + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return ret; +} + +/* Free up the ref cache */ +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ + struct block_entry *be; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + while ((n = rb_first(&fs_info->block_tree))) { + be = rb_entry(n, struct block_entry, node); + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + cond_resched_lock(&fs_info->ref_verify_lock); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len) +{ + struct block_entry *be = NULL, *entry; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + n = fs_info->block_tree.rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < start) { + n = n->rb_right; + } else if (entry->bytenr > start) { + n = n->rb_left; + } else { + be = entry; + break; + } + /* We want to get as close to start as possible */ + if (be == NULL || + (entry->bytenr < start && be->bytenr > start) || + (entry->bytenr < start && entry->bytenr > be->bytenr)) + be = entry; + } + + /* + * Could have an empty block group, maybe have something to check for + * this case to verify we were actually empty? + */ + if (!be) { + spin_unlock(&fs_info->ref_verify_lock); + return; + } + + n = &be->node; + while (n) { + be = rb_entry(n, struct block_entry, node); + n = rb_next(n); + if (be->bytenr < start && be->bytenr + be->len > start) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + continue; + } + if (be->bytenr < start) + continue; + if (be->bytenr >= start + len) + break; + if (be->bytenr + be->len > start + len) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + } + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +/* Walk down all roots and build the ref tree, meant to be called at mount */ +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + u64 bytenr = 0, num_bytes = 0; + int ret, level; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + eb = btrfs_read_lock_root_node(fs_info->extent_root); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + level = btrfs_header_level(eb); + path->nodes[level] = eb; + path->slots[level] = 0; + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + while (1) { + /* + * We have to keep track of the bytenr/num_bytes we last hit + * because we could have run out of space for an inline ref, and + * would have had to added a ref key item which may appear on a + * different leaf from the original extent item. 
+ */ + ret = walk_down_tree(fs_info->extent_root, path, level, + &bytenr, &num_bytes); + if (ret) + break; + ret = walk_up_tree(root, path, &level); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + if (ret) { + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + btrfs_free_ref_cache(fs_info); + } + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h new file mode 100644 index 00000000000000..3bf02ce0e1e27f --- /dev/null +++ b/fs/btrfs/ref-verify.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __REF_VERIFY__ +#define __REF_VERIFY__ + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action); +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len); + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->ref_verify_lock); + fs_info->block_tree = RB_ROOT; +} +#else +static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + return 0; +} + +static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ +} + +static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 owner, u64 offset, int action) +{ + return 0; +} + +static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, + u64 start, u64 len) +{ +} + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ +} + +#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* _REF_VERIFY__ */ From 7c777430e855898461d54efd14b3942c763ee8de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:51 -0400 Subject: [PATCH 065/109] Btrfs: only check delayed ref usage in should_end_transaction We were only doing btrfs_check_space_for_delayed_refs() if the metadata space was full, ie we couldn't allocate chunks. This assumes we'll be able to allocate chunks during transaction commit, but since nothing does a LIMIT flush during the transaction commit this won't actually happen unless we happen to run shy of actual space. We already take into account a full fs in btrfs_check_space_for_delayed_refs() so just kill this extra check to make sure we're ending the transaction when we need to. 
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9c5f126064bdee..68c3e1c04bca7e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -797,8 +797,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (fs_info->global_block_rsv.space_info->full && - btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans, fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); From 2eadaa22c17ac3c546850f70d9d20fa19459dbce Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:52 -0400 Subject: [PATCH 066/109] btrfs: add a helper to return a head ref Simplify the error handling in __btrfs_run_delayed_refs by breaking out the code used to return a head back to the delayed_refs tree for processing into a helper function. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 52d24699434199..3e4a6a4b050595 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2575,6 +2575,16 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return ref; } +static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
@@ -2648,11 +2658,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); locked_ref = NULL; cond_resched(); count++; @@ -2698,14 +2704,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); + unselect_delayed_ref_head(delayed_refs, + locked_ref); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); - btrfs_delayed_ref_unlock(locked_ref); return ret; } continue; @@ -2763,11 +2766,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); From b00e62507ec6a9b63d9b3019795584e694085b65 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:53 -0400 Subject: [PATCH 067/109] btrfs: move extent_op cleanup to a helper Move the extent_op cleanup for an empty head ref to a helper function to help simplify __btrfs_run_delayed_refs. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 77 ++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3e4a6a4b050595..e5ed90f2080582 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2585,6 +2585,26 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref btrfs_delayed_ref_unlock(head); } +static int cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + int ret; + + if (!extent_op) + return 0; + head->extent_op = NULL; + if (head->must_insert_reserved) { + btrfs_free_delayed_extent_op(extent_op); + return 0; + } + spin_unlock(&head->lock); + ret = run_delayed_extent_op(trans, fs_info, &head->node, extent_op); + btrfs_free_delayed_extent_op(extent_op); + return ret ? ret : 1; +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. @@ -2665,16 +2685,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, continue; } - /* - * record the must insert reserved flag before we - * drop the spin lock. 
- */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - if (!ref) { @@ -2684,33 +2694,17 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ ref = &locked_ref->node; - if (extent_op && must_insert_reserved) { - btrfs_free_delayed_extent_op(extent_op); - extent_op = NULL; - } - - if (extent_op) { - spin_unlock(&locked_ref->lock); - ret = run_delayed_extent_op(trans, fs_info, - ref, extent_op); - btrfs_free_delayed_extent_op(extent_op); - - if (ret) { - /* - * Need to reset must_insert_reserved if - * there was an error so the abort stuff - * can cleanup the reserved space - * properly. - */ - if (must_insert_reserved) - locked_ref->must_insert_reserved = 1; - unselect_delayed_ref_head(delayed_refs, - locked_ref); - btrfs_debug(fs_info, - "run_delayed_extent_op returned %d", - ret); - return ret; - } + ret = cleanup_extent_op(trans, fs_info, locked_ref); + if (ret < 0) { + unselect_delayed_ref_head(delayed_refs, + locked_ref); + btrfs_debug(fs_info, + "run_delayed_extent_op returned %d", + ret); + return ret; + } else if (ret > 0) { + /* We dropped our lock, we need to loop. */ + ret = 0; continue; } @@ -2759,6 +2753,15 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, WARN_ON(1); } } + /* + * Record the must-insert_reserved flag before we drop the spin + * lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, From 194ab0bc21cc99c5b804040aa785e0bdcc214656 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:54 -0400 Subject: [PATCH 068/109] btrfs: break out empty head cleanup to a helper Move this code out to a helper function to further simplify __btrfs_run_delayed_refs. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 79 ++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e5ed90f2080582..5e61e9287b0ea2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2605,6 +2605,43 @@ static int cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } +static int cleanup_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + + ret = cleanup_extent_op(trans, fs_info, head); + if (ret < 0) { + unselect_delayed_ref_head(delayed_refs, head); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + return ret; + } else if (ret) { + return ret; + } + + /* + * Need to drop our head ref lock and re-acquire the delayed ref lock + * and then re-check to make sure nobody got added.
+ */ + spin_unlock(&head->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + if (!list_empty(&head->ref_list) || head->extent_op) { + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + return 1; + } + head->node.in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&head->href_node, &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + return 0; +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. @@ -2686,47 +2723,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } if (!ref) { - - - /* All delayed refs have been processed, Go ahead - * and send the head node to run_one_delayed_ref, - * so that any accounting fixes can happen - */ - ref = &locked_ref->node; - - ret = cleanup_extent_op(trans, fs_info, locked_ref); - if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, - locked_ref); - btrfs_debug(fs_info, - "run_delayed_extent_op returned %d", - ret); - return ret; - } else if (ret > 0) { + ret = cleanup_ref_head(trans, fs_info, locked_ref); + if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; continue; + } else if (ret) { + return ret; } /* - * Need to drop our head ref lock and re-acquire the - * delayed ref lock and then re-check to make sure - * nobody got added. + * All delayed refs have been processed, Go ahead and + * send the head node to run_one_delayed_ref, so that + * any accounting fixes can happen */ - spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - spin_lock(&locked_ref->lock); - if (!list_empty(&locked_ref->ref_list) || - locked_ref->extent_op) { - spin_unlock(&locked_ref->lock); - spin_unlock(&delayed_refs->lock); - continue; - } - ref->in_tree = 0; - delayed_refs->num_heads--; - rb_erase(&locked_ref->href_node, - &delayed_refs->href_root); - spin_unlock(&delayed_refs->lock); + ref = &locked_ref->node; } else { actual_count++; ref->in_tree = 0; From 1ce7a5ec44c8b148c4600e25c696a135319734db Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:55 -0400 Subject: [PATCH 069/109] btrfs: move ref_mod modification into the if (ref) logic We only use this logic if our ref isn't a ref_head, so move it up into the if (ref) case since we know that this is a normal ref and not a delayed ref head. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e61e9287b0ea2..0dcbbeacaadc7d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2744,10 +2744,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, list_del(&ref->list); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); - } - atomic_dec(&delayed_refs->num_entries); - - if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the * ref_mod on head @@ -2764,6 +2760,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, WARN_ON(1); } } + atomic_dec(&delayed_refs->num_entries); + /* * Record the must-insert_reserved flag before we drop the spin * lock. From c1103f7a5d7a544dfdaca6102de68792909dc834 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:56 -0400 Subject: [PATCH 070/109] btrfs: move all ref head cleanup to the helper function We do a couple different cleanup operations on the ref head. 
We adjust counters, we'll free any reserved space if we didn't end up using the ref, and we clear the pending csum bytes. Move all these disparate things into cleanup_ref_head and clean up the logic in __btrfs_run_delayed_refs so that it handles the !ref case a lot cleaner, as well as making run_one_delayed_ref() only deal with real refs and not the ref head. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 145 ++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 81 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0dcbbeacaadc7d..628ae71094f268 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2500,44 +2500,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return 0; } - if (btrfs_delayed_ref_is_head(node)) { - struct btrfs_delayed_ref_head *head; - /* - * we've hit the end of the chain and we were supposed - * to insert this extent into the tree. But, it got - * deleted before we ever needed to insert it, so all - * we have to do is clean up the accounting - */ - BUG_ON(extent_op); - head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(fs_info, node, head, node->action); - - if (head->total_ref_mod < 0) { - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, node->bytenr); - ASSERT(cache); - percpu_counter_add(&cache->space_info->total_bytes_pinned, - -node->num_bytes); - btrfs_put_block_group(cache); - } - - if (insert_reserved) { - btrfs_pin_extent(fs_info, node->bytenr, - node->num_bytes, 1); - if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, - node->bytenr, - node->num_bytes); - } - } - - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); - return ret; - } - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, @@ -2639,6 +2601,43 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, delayed_refs->num_heads--; rb_erase(&head->href_node, &delayed_refs->href_root); spin_unlock(&delayed_refs->lock); + spin_unlock(&head->lock); + atomic_dec(&delayed_refs->num_entries); + + trace_run_delayed_ref_head(fs_info, &head->node, head, + head->node.action); + + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, head->node.bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -head->node.num_bytes); + btrfs_put_block_group(cache); + + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->node.num_bytes; + spin_unlock(&delayed_refs->lock); + } + } + + if (head->must_insert_reserved) { + btrfs_pin_extent(fs_info, head->node.bytenr, + head->node.num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, fs_info, + head->node.bytenr, + head->node.num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref(&head->node); return 0; } @@ -2722,6 +2721,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, continue; } + /* + * We're done processing refs in this ref_head, clean everything + * up and move on to the next ref_head. 
+ */ if (!ref) { ret = cleanup_ref_head(trans, fs_info, locked_ref); if (ret > 0 ) { @@ -2731,34 +2734,30 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else if (ret) { return ret; } + locked_ref = NULL; + count++; + continue; + } - /* - * All delayed refs have been processed, Go ahead and - * send the head node to run_one_delayed_ref, so that - * any accounting fixes can happen - */ - ref = &locked_ref->node; - } else { - actual_count++; - ref->in_tree = 0; - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - /* - * when we play the delayed ref, also correct the - * ref_mod on head - */ - switch (ref->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - locked_ref->node.ref_mod -= ref->ref_mod; - break; - case BTRFS_DROP_DELAYED_REF: - locked_ref->node.ref_mod += ref->ref_mod; - break; - default: - WARN_ON(1); - } + actual_count++; + ref->in_tree = 0; + list_del(&ref->list); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + /* + * When we play the delayed ref, also correct the ref_mod on + * head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->node.ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->node.ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); } atomic_dec(&delayed_refs->num_entries); @@ -2785,22 +2784,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return ret; } - /* - * If this node is a head, that means all the refs in this head - * have been dealt with, and we will pick the next head to deal - * with, so we must unlock the head and drop it from the cluster - * list before we release it. - */ - if (btrfs_delayed_ref_is_head(ref)) { - if (locked_ref->is_data && - locked_ref->total_ref_mod < 0) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= ref->num_bytes; - spin_unlock(&delayed_refs->lock); - } - btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; - } btrfs_put_delayed_ref(ref); count++; cond_resched(); From d278850eff3053ef166cf64c16f798dfe36278a2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:57 -0400 Subject: [PATCH 071/109] btrfs: remove delayed_ref_node from ref_head This is just excessive information in the ref_head, and makes the code complicated. It is a relic from when we had the heads and the refs in the same tree, which is no longer the case. With this removal I've cleaned up a bunch of the cruft around this old assumption as well. 
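As a reading aid for the hunks that follow, this is a rough sketch of the shape the ref head takes once the embedded delayed_ref_node is gone; it is reconstructed from the delayed-ref.c changes below, field types are approximations, and the authoritative definition in delayed-ref.h carries further members.

/* Sketch only, reconstructed from the hunks below; not the real header. */
struct btrfs_delayed_ref_head {
	u64 bytenr;			/* was head->node.bytenr */
	u64 num_bytes;			/* was head->node.num_bytes */
	refcount_t refs;		/* was head->node.refs */
	struct mutex mutex;
	spinlock_t lock;
	struct rb_node href_node;	/* RB_EMPTY_NODE() now means "not queued" */
	struct list_head ref_list;	/* the individual delayed ref nodes */
	struct list_head ref_add_list;
	struct btrfs_delayed_extent_op *extent_op;
	int ref_mod;			/* was head->node.ref_mod */
	int total_ref_mod;
	u64 qgroup_ref_root;
	u64 qgroup_reserved;
	bool must_insert_reserved;
	bool is_data;
	bool processing;
};

With the head no longer pretending to be a ref node, callers take and drop references with btrfs_put_delayed_ref_head() and test membership in the href_root tree with RB_EMPTY_NODE(&head->href_node) instead of head->node.in_tree, as the backref.c and delayed-ref.c hunks below show.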
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 +- fs/btrfs/delayed-ref.c | 126 +++++++++++++++-------------------- fs/btrfs/delayed-ref.h | 49 +++++--------- fs/btrfs/disk-io.c | 12 ++-- fs/btrfs/extent-tree.c | 90 ++++++++++--------------- include/trace/events/btrfs.h | 13 ++-- 6 files changed, 119 insertions(+), 175 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b517ef1477ea56..33cba1abf8b602 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1178,7 +1178,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1189,7 +1189,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto again; } spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 93ffa898df6d88..b9b41c838da48f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -96,15 +96,15 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, u64 bytenr; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->node.bytenr; + bytenr = ins->bytenr; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) p = &(*p)->rb_left; - else if (bytenr > entry->node.bytenr) + else if (bytenr > entry->bytenr) p = &(*p)->rb_right; else return entry; @@ -133,15 +133,15 @@ find_ref_head(struct rb_root *root, u64 bytenr, while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) n = n->rb_left; - else if (bytenr > entry->node.bytenr) + else if (bytenr > entry->bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (bytenr > entry->node.bytenr) { + if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); @@ -164,17 +164,17 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { + if (RB_EMPTY_NODE(&head->href_node)) { mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return 0; } @@ -183,15 +183,10 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - rb_erase(&head->href_node, &delayed_refs->href_root); - } else { - assert_spin_locked(&head->lock); - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - } + assert_spin_locked(&head->lock); + list_del(&ref->list); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); @@ -380,8 +375,8 @@ 
btrfs_select_ref_head(struct btrfs_trans_handle *trans) head->processing = 1; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; - delayed_refs->run_delayed_start = head->node.bytenr + - head->node.num_bytes; + delayed_refs->run_delayed_start = head->bytenr + + head->num_bytes; return head; } @@ -469,20 +464,16 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, */ static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update, + struct btrfs_delayed_ref_head *existing, + struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; int old_ref_mod; - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - BUG_ON(existing_ref->is_data != ref->is_data); + BUG_ON(existing->is_data != update->is_data); - spin_lock(&existing_ref->lock); - if (ref->must_insert_reserved) { + spin_lock(&existing->lock); + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up @@ -490,7 +481,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * the must_insert_reserved flag set. * Set it again here */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; + existing->must_insert_reserved = update->must_insert_reserved; /* * update the num_bytes so we make sure the accounting @@ -500,22 +491,22 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, } - if (ref->extent_op) { - if (!existing_ref->extent_op) { - existing_ref->extent_op = ref->extent_op; + if (update->extent_op) { + if (!existing->extent_op) { + existing->extent_op = update->extent_op; } else { - if (ref->extent_op->update_key) { - memcpy(&existing_ref->extent_op->key, - &ref->extent_op->key, - sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = true; + if (update->extent_op->update_key) { + memcpy(&existing->extent_op->key, + &update->extent_op->key, + sizeof(update->extent_op->key)); + existing->extent_op->update_key = true; } - if (ref->extent_op->update_flags) { - existing_ref->extent_op->flags_to_set |= - ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = true; + if (update->extent_op->update_flags) { + existing->extent_op->flags_to_set |= + update->extent_op->flags_to_set; + existing->extent_op->update_flags = true; } - btrfs_free_delayed_extent_op(ref->extent_op); + btrfs_free_delayed_extent_op(update->extent_op); } } /* @@ -523,23 +514,23 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - old_ref_mod = existing_ref->total_ref_mod; + old_ref_mod = existing->total_ref_mod; if (old_ref_mod_ret) *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; - existing_ref->total_ref_mod += update->ref_mod; + existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. 
*/ - if (existing_ref->is_data) { - if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + if (existing->is_data) { + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) delayed_refs->pending_csums -= existing->num_bytes; - if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) delayed_refs->pending_csums += existing->num_bytes; } - spin_unlock(&existing_ref->lock); + spin_unlock(&existing->lock); } /* @@ -550,14 +541,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, int is_data, int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; - struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; @@ -593,26 +583,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; - /* first set the basic ref node struct up */ - refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = count_mod; - ref->type = 0; - ref->action = 0; - ref->is_head = 1; - ref->in_tree = 1; - ref->seq = 0; - - head_ref = btrfs_delayed_node_to_head(ref); + refcount_set(&head_ref->refs, 1); + head_ref->bytenr = bytenr; + head_ref->num_bytes = num_bytes; + head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; INIT_LIST_HEAD(&head_ref->ref_list); INIT_LIST_HEAD(&head_ref->ref_add_list); + RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; head_ref->qgroup_reserved = 0; head_ref->qgroup_ref_root = 0; + spin_lock_init(&head_ref->lock); + mutex_init(&head_ref->mutex); /* Record qgroup extent info if provided */ if (qrecord) { @@ -632,17 +617,14 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord_inserted = 1; } - spin_lock_init(&head_ref->lock); - mutex_init(&head_ref->mutex); - - trace_add_delayed_ref_head(fs_info, ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref, + update_existing_head_ref(delayed_refs, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -821,7 +803,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -888,7 +870,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, action, 1, 
&qrecord_inserted, old_ref_mod, new_ref_mod); @@ -920,7 +902,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, NULL, NULL, NULL); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ce88e4ac5276f4..1ce11858d72731 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -26,15 +26,6 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ -/* - * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the - * same ref_node structure. - * Ref_head is in a higher logic level than tree/data ref, and duplicated - * bytenr/num_bytes in ref_node is really a waste or memory, they should be - * referred from ref_head. - * This gets more disgusting after we use list to store tree/data ref in - * ref_head. Must clean this mess up later. - */ struct btrfs_delayed_ref_node { /*data/tree ref use list, stored in ref_head->ref_list. */ struct list_head list; @@ -91,8 +82,9 @@ struct btrfs_delayed_extent_op { * reference count modifications we've queued up. */ struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - + u64 bytenr; + u64 num_bytes; + refcount_t refs; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. @@ -115,6 +107,14 @@ struct btrfs_delayed_ref_head { */ int total_ref_mod; + /* + * This is the current outstanding mod references for this bytenr. This + * is used with lookup_extent_info to get an accurate reference count + * for a bytenr, so it is adjusted as delayed refs are run so that any + * on disk reference count + ref_mod is accurate. + */ + int ref_mod; + /* * For qgroup reserved space freeing. * @@ -234,15 +234,18 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) case BTRFS_SHARED_DATA_REF_KEY: kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); break; - case 0: - kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); - break; default: BUG(); } } } +static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) +{ + if (refcount_dec_and_test(&head->refs)) + kmem_cache_free(btrfs_delayed_ref_head_cachep, head); +} + int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, @@ -282,36 +285,18 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq); -/* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. 
- */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->is_head; -} - /* * helper functions to cast a node into its container */ static inline struct btrfs_delayed_tree_ref * btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_tree_ref, node); } static inline struct btrfs_delayed_data_ref * btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_data_ref, node); } - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); -} #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f971d5680e6f26..69ce738c00d005 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4121,12 +4121,12 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); spin_lock(&delayed_refs->lock); continue; } @@ -4147,16 +4147,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (pin_bytes) - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); - btrfs_put_delayed_ref(&head->node); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 628ae71094f268..423d89145bac1a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -912,7 +912,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -923,7 +923,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto search_again; } spin_lock(&head->lock); @@ -932,7 +932,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, else BUG_ON(num_refs == 0); - num_refs += head->node.ref_mod; + num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } @@ -2337,7 +2337,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; @@ -2359,14 +2359,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if 
(!path) return -ENOMEM; - key.objectid = node->bytenr; + key.objectid = head->bytenr; if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + key.offset = head->num_bytes; } again: @@ -2383,17 +2383,17 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == node->bytenr && + if (key.objectid == head->bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == node->num_bytes) + key.offset == head->num_bytes) ret = 0; } if (ret > 0) { btrfs_release_path(path); metadata = 0; - key.objectid = node->bytenr; - key.offset = node->num_bytes; + key.objectid = head->bytenr; + key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; goto again; } @@ -2562,7 +2562,7 @@ static int cleanup_extent_op(struct btrfs_trans_handle *trans, return 0; } spin_unlock(&head->lock); - ret = run_delayed_extent_op(trans, fs_info, &head->node, extent_op); + ret = run_delayed_extent_op(trans, fs_info, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } @@ -2597,39 +2597,37 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - head->node.in_tree = 0; delayed_refs->num_heads--; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&delayed_refs->lock); spin_unlock(&head->lock); atomic_dec(&delayed_refs->num_entries); - trace_run_delayed_ref_head(fs_info, &head->node, head, - head->node.action); + trace_run_delayed_ref_head(fs_info, head, 0); if (head->total_ref_mod < 0) { struct btrfs_block_group_cache *cache; - cache = btrfs_lookup_block_group(fs_info, head->node.bytenr); + cache = btrfs_lookup_block_group(fs_info, head->bytenr); ASSERT(cache); percpu_counter_add(&cache->space_info->total_bytes_pinned, - -head->node.num_bytes); + -head->num_bytes); btrfs_put_block_group(cache); if (head->is_data) { spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->node.num_bytes; + delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); } } if (head->must_insert_reserved) { - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, - head->node.bytenr, - head->node.num_bytes); + ret = btrfs_del_csums(trans, fs_info, head->bytenr, + head->num_bytes); } } @@ -2637,7 +2635,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, head->qgroup_reserved); btrfs_delayed_ref_unlock(head); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return 0; } @@ -2751,10 +2749,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, switch (ref->action) { case BTRFS_ADD_DELAYED_REF: case BTRFS_ADD_DELAYED_EXTENT: - locked_ref->node.ref_mod -= ref->ref_mod; + locked_ref->ref_mod -= ref->ref_mod; break; case BTRFS_DROP_DELAYED_REF: - locked_ref->node.ref_mod += ref->ref_mod; + locked_ref->ref_mod += ref->ref_mod; break; default: WARN_ON(1); @@ -3087,33 +3085,16 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); goto out; } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + refcount_inc(&head->refs); + 
spin_unlock(&delayed_refs->lock); - while (node) { - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_is_head(&head->node)) { - struct btrfs_delayed_ref_node *ref; - - ref = &head->node; - refcount_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); + /* Mutex was contended, block until it's released and retry. */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } else { - WARN_ON(1); - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } @@ -3171,7 +3152,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3182,7 +3163,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } spin_unlock(&delayed_refs->lock); @@ -7235,9 +7216,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); - + RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); /* @@ -7256,7 +7236,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, ret = 1; mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return ret; out: spin_unlock(&head->lock); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 77437f545c639c..bfe2f23b578c39 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -798,11 +798,10 @@ DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref, DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, const struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(fs_info, ref, head_ref, action), + TP_ARGS(fs_info, head_ref, action), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) @@ -812,8 +811,8 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, ), TP_fast_assign_btrfs(fs_info, - __entry->bytenr = ref->bytenr; - __entry->num_bytes = ref->num_bytes; + __entry->bytenr = head_ref->bytenr; + __entry->num_bytes = head_ref->num_bytes; __entry->action = action; __entry->is_data = head_ref->is_data; ), @@ -828,21 +827,19 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, const struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(fs_info, ref, head_ref, action) + TP_ARGS(fs_info, head_ref, action) ); DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, const struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(fs_info, ref, head_ref, action) + TP_ARGS(fs_info, head_ref, action) ); #define show_chunk_type(type) \ From 3b60d436a165f890597d0226def17858973fa985 Mon Sep 17 00:00:00 2001 From: Josef Bacik 
Date: Fri, 29 Sep 2017 15:43:58 -0400 Subject: [PATCH 072/109] btrfs: remove type argument from comp_tree_refs We can get this from the ref we've passed in. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index b9b41c838da48f..a2973340a94f87 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -41,9 +41,9 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; * compare two delayed tree backrefs with same bytenr and type */ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1, int type) + struct btrfs_delayed_tree_ref *ref1) { - if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { if (ref1->root < ref2->root) return -1; if (ref1->root > ref2->root) @@ -223,8 +223,7 @@ static bool merge_ref(struct btrfs_trans_handle *trans, if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), - btrfs_delayed_node_to_tree_ref(next), - ref->type)) + btrfs_delayed_node_to_tree_ref(next))) goto next; if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || ref->type == BTRFS_SHARED_DATA_REF_KEY) && @@ -409,8 +408,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), - btrfs_delayed_node_to_tree_ref(ref), - ref->type)) + btrfs_delayed_node_to_tree_ref(ref))) goto add_tail; if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || exist->type == BTRFS_SHARED_DATA_REF_KEY) && From bf2681cb94d13d91326fc098ca98620b595a9d6b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:44:05 -0400 Subject: [PATCH 073/109] btrfs: add assertions for releasing trans handle reservations These are useful for debugging problems where we mess with trans->block_rsv to make sure we're not screwing something up. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 423d89145bac1a..4f874d02f31049 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5823,12 +5823,15 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - if (!trans->block_rsv) + if (!trans->block_rsv) { + ASSERT(!trans->bytes_reserved); return; + } if (!trans->bytes_reserved) return; + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, From 2dbe0c77186c691386b74391039899808a4d3f59 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 14 Oct 2017 08:35:56 +0800 Subject: [PATCH 074/109] btrfs: use BLK_STS defines where needed At few places we could use BLK_STS_OK and BLK_STS_NOSUPP. 
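A small userspace illustration of the style change, using local stand-ins (MY_BLK_STS_*) rather than the kernel's blk_types.h symbols: assigning a named status instead of a bare 0 makes the intent of the completion path explicit.

#include <stdio.h>

typedef unsigned char blk_status_t;         /* simplified stand-in */

#define MY_BLK_STS_OK    ((blk_status_t)0)  /* stand-in for BLK_STS_OK */
#define MY_BLK_STS_IOERR ((blk_status_t)10) /* stand-in for BLK_STS_IOERR */

struct bio { blk_status_t bi_status; };

static void end_bio(struct bio *bio, int failed)
{
	/* before: bio->bi_status = 0;  after: use the named constant */
	bio->bi_status = failed ? MY_BLK_STS_IOERR : MY_BLK_STS_OK;
}

int main(void)
{
	struct bio bio;

	end_bio(&bio, 0);
	printf("clean completion -> %u\n", (unsigned)bio.bi_status);
	end_bio(&bio, 1);
	printf("failed completion -> %u\n", (unsigned)bio.bi_status);
	return 0;
}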
Signed-off-by: Anand Jain Reviewed-by: Satoru Taekeuchi Reviewed-by: David Sterba [ dropped first hunk btrfs_endio_direct_read ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 ++- fs/btrfs/inode.c | 2 +- fs/btrfs/volumes.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 280384bf34f13b..8bdd3dc6c4dc4f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -255,7 +255,8 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_status ? 0 : 1); + bio->bi_status ? + BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a11f87fa79d089..f2787cab6f3b90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8485,7 +8485,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_status = 0; + dip->dio_bio->bi_status = BLK_STS_OK; bio_endio(dip->orig_bio); } out: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4931b2df4043c9..56ed01f8d4df9c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6029,7 +6029,7 @@ static void btrfs_end_bio(struct bio *bio) * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_status = 0; + bio->bi_status = BLK_STS_OK; } btrfs_end_bbio(bbio, bio); From 79f015f216539dfdf0d3ef0b3bbb9dc7754364cd Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 16 Oct 2017 05:43:21 -0500 Subject: [PATCH 075/109] btrfs: cleanup extent locking sequence Code cleanup for better understanding: Variable needs_unlock to be called extent_locked to show state as opposed to action. Changed the type to int, to reduce code in the critical path. 
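The resulting pattern, sketched as standalone C with made-up helper names: the helper's (possibly negative) return value is kept in an int named for the state, and that same variable later decides whether to unlock, so no separate bool action flag is needed.

#include <stdio.h>

#define MY_EAGAIN 11   /* local stand-in for the -EAGAIN retry case */

/* stand-in for lock_and_cleanup_extent_if_need():
 * <0 error, 0 nothing locked, >0 extent range locked */
static int lock_extent_if_needed(int attempt)
{
	if (attempt == 0)
		return -MY_EAGAIN;   /* caller should retry */
	return 1;                    /* range locked */
}

int main(void)
{
	int attempt = 0;
	int extents_locked;

again:
	extents_locked = lock_extent_if_needed(attempt++);
	if (extents_locked < 0) {
		if (extents_locked == -MY_EAGAIN)
			goto again;
		return 1;
	}

	/* ... the write path would copy user data here ... */

	if (extents_locked)   /* state, not a separate need_unlock flag */
		printf("unlocking extent range\n");
	return 0;
}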
Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d3c1725f03e41b..4de174b664ffa3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1590,7 +1590,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; - bool need_unlock; nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), PAGE_SIZE / (sizeof(struct page *))); @@ -1613,6 +1612,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t copied; size_t dirty_sectors; size_t num_sectors; + int extents_locked; WARN_ON(num_pages > nrptrs); @@ -1669,7 +1669,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = reserve_bytes; - need_unlock = false; again: /* * This is going to setup the pages array with the number of @@ -1682,16 +1681,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (ret) break; - ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, + extents_locked = lock_and_cleanup_extent_if_need( + BTRFS_I(inode), pages, num_pages, pos, write_bytes, &lockstart, &lockend, &cached_state); - if (ret < 0) { - if (ret == -EAGAIN) + if (extents_locked < 0) { + if (extents_locked == -EAGAIN) goto again; + ret = extents_locked; break; - } else if (ret > 0) { - need_unlock = true; - ret = 0; } copied = btrfs_copy_from_user(pos, write_bytes, pages, i); @@ -1756,7 +1754,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, NULL); - if (need_unlock) + if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); From de48373454aceaf65877f9953eaf942505e83cc5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 12 Oct 2017 16:43:00 +0800 Subject: [PATCH 076/109] btrfs: use need_full_stripe() in __btrfs_map_block() A cleanup patch, use need_full_stripe() to replace the open code. 
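For illustration, the shape of the helper and one converted call site, written as standalone C with placeholder enum values; the real need_full_stripe() lives in volumes.c and tests the BTRFS_MAP_* operations shown in the diff below.

#include <stdbool.h>
#include <stdio.h>

enum map_op {           /* stand-ins for the BTRFS_MAP_* values */
	MAP_READ,
	MAP_WRITE,
	MAP_GET_READ_MIRRORS,
};

/* one predicate instead of the same two-term test repeated per RAID profile */
static bool need_full_stripe(enum map_op op)
{
	return op == MAP_WRITE || op == MAP_GET_READ_MIRRORS;
}

int main(void)
{
	enum map_op op = MAP_READ;
	int mirror_num = 0;

	/* before: if (op != MAP_WRITE && op != MAP_GET_READ_MIRRORS) ... */
	if (!need_full_stripe(op))
		mirror_num = 1;

	printf("mirror_num = %d\n", mirror_num);
	return 0;
}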
Signed-off-by: Anand Jain Reviewed-by: Qu Wenruo Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 56ed01f8d4df9c..6df39b5fda8961 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5691,10 +5691,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) + if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5707,7 +5707,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { + if (need_full_stripe(op)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5721,7 +5721,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -5736,9 +5736,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || - mirror_num > 1)) { + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); @@ -5765,9 +5763,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && - op != BTRFS_MAP_GET_READ_MIRRORS) && - mirror_num <= 1) + if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { From 102ed2c5ff932439bbbe74c7bd63e6d5baa9f732 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 14 Oct 2017 08:34:02 +0800 Subject: [PATCH 077/109] btrfs: fix false EIO for missing device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When one of the device is missing, bbio_error() takes care of setting the error status. And if its only IO that is pending in that stripe, it fails to check the status of the other IO at %bbio_error before setting the error %bi_status for the %orig_bio. Fix this by checking if %bbio->error has exceeded the %bbio->max_errors. Reproducer as below fdatasync error is seen intermittently. mount -o degraded /dev/sdc /btrfs dd status=none if=/dev/zero of=$(mktemp /btrfs/XXX) bs=4096 count=1 conv=fdatasync dd: fdatasync failed for ‘/btrfs/LSe’: Input/output error The reason for the intermittences of the problem is because the following conditions have to be met, which depends on timing: In btrfs_map_bio() - the RAID1 the missing device has to be at %dev_nr = 1 In bbio_error() . 
before bbio_error() is called the bio of the not-missing device at %dev_nr = 0 must be completed so that the below condition is true if (atomic_dec_and_test(&bbio->stripes_pending)) { Signed-off-by: Anand Jain Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6df39b5fda8961..11d7707a3fb3ce 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6129,7 +6129,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_status = BLK_STS_IOERR; + if (atomic_read(&bbio->error) > bbio->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; btrfs_end_bbio(bbio, bio); } } From 859a58a2074858dc03943c1fa1a2975576f9b0b8 Mon Sep 17 00:00:00 2001 From: Gu JinXiang Date: Wed, 11 Oct 2017 16:44:27 +0800 Subject: [PATCH 078/109] btrfs: Use bd_dev to generate index when dev_state_hashtable add items. Fix missing change from commit f8f84b2dfda5 ("btrfs: index check-integrity state hash by a dev_t"). Function btrfsic_dev_state_hashtable_lookup uses dev_t to generate hashval when look in up a btrfsic_dev_state in hash table. So when we add a btrfsic_dev_state into the hash table, it should also use dev_t. Reproducer of this bug: Use MOUNT_OPTIONS="-o check_int" when running xfstest, device can not be mounted successfully. So xfstest can not run. Signed-off-by: Gu JinXiang Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9db1e7602dfcb9..1fee5fe93484e6 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -613,7 +613,7 @@ static void btrfsic_dev_state_hashtable_add( struct btrfsic_dev_state_hashtable *h) { const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev)) & + (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); list_add(&ds->collision_resolving_node, h->table + hashval); From d4417e22551377c6e589c15ff2b931610e5230bc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 16 Oct 2017 16:48:40 +0300 Subject: [PATCH 079/109] btrfs: Replace opencoded sizes with their symbolic constants Currently btrfs' code uses a mix of opencoded sizes and defines from sizes.h. Let's unifiy the code base to always use the symbolic constants. No functional changes Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/tests/inode-tests.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e2afe524e25ea4..7bda8429e93fa8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -523,7 +523,7 @@ struct btrfs_caching_control { }; /* Once caching_thread() finds this much free space, it will wake up waiters. 
*/ -#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) +#define CACHING_CTL_WAKE_UP SZ_2M struct btrfs_io_ctl { void *cur, *orig; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 69ce738c00d005..484cf8fc952c43 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2733,7 +2733,7 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 880ab4949f698b..4fb7eefb80aeca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2126,7 +2126,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * succeed even if the Avail is zero. But this is better than the other * way around. */ - thresh = 4 * 1024 * 1024; + thresh = SZ_4M; if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 8c91d03cc82d8a..330815eb07b40f 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; From f51d2b59120ff364a5e612a594ed358767e1cd09 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 15 Sep 2017 17:36:57 +0200 Subject: [PATCH 080/109] btrfs: allow to set compression level for zlib Preliminary support for setting compression level for zlib, the following works: $ mount -o compess=zlib # default $ mount -o compess=zlib0 # same $ mount -o compess=zlib9 # level 9, slower sync, less data $ mount -o compess=zlib1 # level 1, faster sync, more data $ mount -o remount,compress=zlib3 # level set by remount The compress-force works the same as compress'. The level is visible in the same format in /proc/mounts. Level set via file property does not work yet. Required patch: "btrfs: prepare for extensions in compression options" Signed-off-by: David Sterba --- fs/btrfs/compression.c | 20 +++++++++++++++++++- fs/btrfs/compression.h | 6 +++++- fs/btrfs/ctree.h | 1 + fs/btrfs/inode.c | 5 ++++- fs/btrfs/lzo.c | 5 +++++ fs/btrfs/super.c | 8 ++++++-- fs/btrfs/zlib.c | 15 ++++++++++++++- fs/btrfs/zstd.c | 5 +++++ 8 files changed, 59 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8bdd3dc6c4dc4f..3e452525f8ad61 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -884,6 +884,11 @@ static void free_workspaces(void) * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. 
* + * @type_level is encoded algorithm and level, where level 0 means whatever + * default the algorithm chooses and is opaque here; + * - compression algo are 0-3 + * - the level are bits 4-7 + * * @out_pages is an in/out parameter, holds maximum number of pages to allocate * and returns number of actually allocated pages * @@ -898,7 +903,7 @@ static void free_workspaces(void) * @max_out tells us the max number of bytes that we're allowed to * stuff into pages */ -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -906,9 +911,11 @@ int btrfs_compress_pages(int type, struct address_space *mapping, { struct list_head *workspace; int ret; + int type = type_level & 0xF; workspace = find_workspace(type); + btrfs_compress_op[type - 1]->set_level(workspace, type_level); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, pages, out_pages, @@ -1098,3 +1105,14 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) return ret; } + +unsigned int btrfs_compress_str2level(const char *str) +{ + if (strncmp(str, "zlib", 4) != 0) + return 0; + + if ('1' <= str[4] && str[4] <= '9' ) + return str[4] - '0'; + + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d2781ff8f994bd..da20755ebf2183 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -76,7 +76,7 @@ struct compressed_bio { void btrfs_init_compress(void); void btrfs_exit_compress(void); -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -95,6 +95,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); +unsigned btrfs_compress_str2level(const char *str); + enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, @@ -124,6 +126,8 @@ struct btrfs_compress_op { struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); + + void (*set_level)(struct list_head *ws, unsigned int type); }; extern const struct btrfs_compress_op btrfs_zlib_compress; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7bda8429e93fa8..2c02d9524055b3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -790,6 +790,7 @@ struct btrfs_fs_info { */ unsigned long pending_changes; unsigned long compress_type:4; + unsigned int compress_level; int commit_interval; /* * It is a suggestive number, the read side is safe even it gets a diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f2787cab6f3b90..3f1b53f8573581 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -539,7 +539,10 @@ static noinline void compress_file_range(struct inode *inode, */ extent_range_clear_dirty_for_io(inode, start, end); redirty = 1; - ret = btrfs_compress_pages(compress_type, + + /* Compression level is applied here and only here */ + ret = btrfs_compress_pages( + compress_type | (fs_info->compress_level << 4), inode->i_mapping, start, pages, &nr_pages, diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d433e75d489a79..6c7f18cd3b6122 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -430,10 +430,15 @@ static int lzo_decompress(struct list_head *ws, unsigned char 
*data_in, return ret; } +static void lzo_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, + .set_level = lzo_set_level, }; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4fb7eefb80aeca..57f3f9600e18bc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -508,6 +508,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; + info->compress_level = + btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -555,9 +557,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force != saved_compress_force)) || (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { - btrfs_info(info, "%s %s compression", + btrfs_info(info, "%s %s compression, level %d", (compress_force) ? "force" : "use", - compress_type); + compress_type, info->compress_level); } compress_force = false; break; @@ -1258,6 +1260,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); + if (info->compress_level) + seq_printf(seq, "%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c248f9286366fd..2b52950dc2c68a 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -37,6 +37,7 @@ struct workspace { z_stream strm; char *buf; struct list_head list; + int level; }; static void zlib_free_workspace(struct list_head *ws) @@ -96,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { + if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { pr_warn("BTRFS: deflateInit failed\n"); ret = -EIO; goto out; @@ -402,10 +403,22 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, return ret; } +static void zlib_set_level(struct list_head *ws, unsigned int type) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + unsigned level = (type & 0xF0) >> 4; + + if (level > 9) + level = 9; + + workspace->level = level > 0 ? 
level : 3; +} + const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, + .set_level = zlib_set_level, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 607ce47b483aad..17f2dd8fddb814 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -423,10 +423,15 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, return ret; } +static void zstd_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_zstd_compress = { .alloc_workspace = zstd_alloc_workspace, .free_workspace = zstd_free_workspace, .compress_pages = zstd_compress_pages, .decompress_bio = zstd_decompress_bio, .decompress = zstd_decompress, + .set_level = zstd_set_level, }; From fa4d885a482ef52ad3efa12a5799a3f6408b0718 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Fri, 15 Sep 2017 17:36:58 +0200 Subject: [PATCH 081/109] btrfs: allow setting zlib compression level via :9 This is bikeshedding, but it seems people are drastically more likely to understand "zlib:9" as compression level rather than an algorithm version compared to "zlib9". Based on feedback on the mailinglist, the ":9" will be the only accepted syntax. The level must be a single digit. Unrecognized format will result to the default, for forward compatibility in a similar way the compression algorithm specifier was relaxed in commit a7164fa4e055daf6368c ("btrfs: prepare for extensions in compression options"). Signed-off-by: Adam Borowski Reviewed-by: David Sterba [ tighten the accepted format ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 5 +++-- fs/btrfs/super.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 3e452525f8ad61..083f9c48587523 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1111,8 +1111,9 @@ unsigned int btrfs_compress_str2level(const char *str) if (strncmp(str, "zlib", 4) != 0) return 0; - if ('1' <= str[4] && str[4] <= '9' ) - return str[4] - '0'; + /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ + if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) + return str[5] - '0'; return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 57f3f9600e18bc..65af029559b58a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1261,7 +1261,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) else seq_printf(seq, ",compress=%s", compress_type); if (info->compress_level) - seq_printf(seq, "%d", info->compress_level); + seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); From 6dd38f81f9c90fb5eec358d46f393f4366c9de50 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 17 Oct 2017 06:53:50 +0800 Subject: [PATCH 082/109] btrfs: remove BUG_ON in btrfs_rm_dev_replace_free_srcdev() That was only an extra check to tackle a few bugs around this area, now its safe to remove it. Replace it by an ASSERT. 
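Roughly, the difference in failure behavior, sketched with userspace macros; MY_BUG_ON and MY_ASSERT are approximations, not the kernel's BUG_ON or btrfs' CONFIG_BTRFS_ASSERT-gated ASSERT. The old check always brings the machine down when violated, the new one documents an invariant and only trips in debug builds.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

/* always compiled in and fatal, like BUG_ON() */
#define MY_BUG_ON(cond) do { if (cond) abort(); } while (0)

#ifdef DEBUG_ASSERTS
#define MY_ASSERT(expr) assert(expr)   /* active only in debug builds */
#else
#define MY_ASSERT(expr) ((void)0)
#endif

int main(void)
{
	int num_devices = 0;   /* example: last device just went away */
	int seeding = 1;       /* ...because this is a seed fs_devices */

	/* old style: unconditional crash if the invariant is violated */
	MY_BUG_ON(!num_devices && !seeding);

	/* new style: state the invariant, check it only when debugging */
	MY_ASSERT(seeding);

	printf("invariant held, continuing device teardown\n");
	return 0;
}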
Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 11d7707a3fb3ce..f1ecb938ba4d71 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2016,19 +2016,20 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device); - /* - * unless fs_devices is seed fs, num_devices shouldn't go - * zero - */ - BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); - /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { struct btrfs_fs_devices *tmp_fs_devices; + /* + * On a mounted FS, num_devices can't be zero unless it's a + * seed. In case of a seed device being replaced, the replace + * target added to the sprout FS, so there will be no more + * device left under the seed FS. + */ + ASSERT(fs_devices->seeding); + tmp_fs_devices = fs_info->fs_devices; while (tmp_fs_devices) { if (tmp_fs_devices->seed == fs_devices) { From eb7b9d6a467409cc0b6c369b0a72489bf3be6801 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 16 Oct 2017 16:29:23 +0300 Subject: [PATCH 083/109] btrfs: send: remove unused code This code was first introduced in 31db9f7c23fb ("Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive") and it was not functional, then it got slightly refactored in e938c8ad543c ("Btrfs: code cleanups for send/receive"), alas it was still dead. So let's remove it for good! Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9ddcdbdd2e7d3..13b98a554aab36 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1271,12 +1271,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (ino >= bctx->cur_objectid) return 0; -#if 0 - if (ino > bctx->cur_objectid) - return 0; - if (offset + bctx->extent_len > bctx->cur_offset) - return 0; -#endif } bctx->found++; From c995ab3cda3f4178c1f1a47926bea5f8372880cb Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Fri, 22 Sep 2017 13:58:45 -0400 Subject: [PATCH 084/109] btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and offset (encoded as a single logical address) to a list of extent refs. LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping (extent ref -> extent bytenr and offset, or logical address). These are useful capabilities for programs that manipulate extents and extent references from userspace (e.g. dedup and defrag utilities). When the extents are uncompressed (and not encrypted and not other), check_extent_in_eb performs filtering of the extent refs to remove any extent refs which do not contain the same extent offset as the 'logical' parameter's extent offset. This prevents LOGICAL_INO from returning references to more than a single block. To find the set of extent references to an uncompressed extent from [a, b), userspace has to run a loop like this pseudocode: for (i = a; i < b; ++i) extent_ref_set += LOGICAL_INO(i); At each iteration of the loop (up to 32768 iterations for a 128M extent), data we are interested in is collected in the kernel, then deleted by the filter in check_extent_in_eb. 
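For concreteness, a userspace sketch of that per-block loop; logical_ino() below is only a placeholder for issuing the LOGICAL_INO ioctl at one logical address (the real call goes through the btrfs ioctl interface and returns a list of inode/offset/root refs, elided here).

#include <stdio.h>

#define BLOCK_SIZE 4096ULL

/* placeholder: pretend each lookup returns one surviving ref */
static unsigned long logical_ino(unsigned long long logical)
{
	(void)logical;
	return 1;
}

int main(void)
{
	unsigned long long a = 0;
	unsigned long long b = 128ULL * 1024 * 1024;   /* a 128M extent */
	unsigned long refs = 0;
	unsigned long calls = 0;

	/* one call per block: up to 32768 iterations for a 128M extent */
	for (unsigned long long i = a; i < b; i += BLOCK_SIZE) {
		refs += logical_ino(i);
		calls++;
	}
	printf("%lu calls, %lu refs collected\n", calls, refs);
	return 0;
}

With the ignore_offset behavior described next, a single call at the extent bytenr can return the full set of refs, so the loop is no longer needed.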
When the extents are compressed (or encrypted or other), the 'logical' parameter must be an extent bytenr (the 'a' parameter in the loop). No filtering by extent offset is done (or possible?) so the result is the complete set of extent refs for the entire extent. This removes the need for the loop, since we get all the extent refs in one call. Add an 'ignore_offset' argument to iterate_inodes_from_logical, [...several levels of function call graph...], and check_extent_in_eb, so that we can disable the extent offset filtering for uncompressed extents. This flag can be set by an improved version of the LOGICAL_INO ioctl to get either behavior as desired. There is no functional change in this patch. The new flag is always false. Signed-off-by: Zygo Blaxell Reviewed-by: David Sterba [ minor coding style fixes ] Signed-off-by: David Sterba --- fs/btrfs/backref.c | 63 +++++++++++++++++++++-------------- fs/btrfs/backref.h | 8 +++-- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/qgroup.c | 8 ++--- fs/btrfs/scrub.c | 6 ++-- fs/btrfs/send.c | 2 +- fs/btrfs/tests/qgroup-tests.c | 30 +++++++++++------ 8 files changed, 73 insertions(+), 48 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 33cba1abf8b602..523d2dba7745e9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -40,12 +40,14 @@ static int check_extent_in_eb(const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 offset = 0; struct extent_inode_elem *e; - if (!btrfs_file_extent_compression(eb, fi) && + if (!ignore_offset && + !btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -84,7 +86,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) static int find_extent_in_eb(const struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 disk_byte; struct btrfs_key key; @@ -113,7 +116,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb, if (disk_byte != wanted_disk_byte) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); if (ret < 0) return ret; } @@ -419,7 +422,7 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs) + u64 total_refs, bool ignore_offset) { int ret = 0; int slot; @@ -472,7 +475,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, - &eie); + &eie, ignore_offset); if (ret < 0) break; } @@ -510,7 +513,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs) + const u64 *extent_item_pos, u64 total_refs, + bool ignore_offset) { struct btrfs_root *root; struct btrfs_key root_key; @@ -581,7 +585,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = 
add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs); + extent_item_pos, total_refs, ignore_offset); out: path->lowest_level = 0; btrfs_release_path(path); @@ -616,7 +620,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, const u64 *extent_item_pos, u64 total_refs, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { int err; int ret = 0; @@ -661,7 +665,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } err = resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos, - total_refs); + total_refs, ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -1107,13 +1111,17 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * * Otherwise this returns 0 for success and <0 for an error. * + * If ignore_offset is set to false, only extent refs whose offsets match + * extent_item_pos are returned. If true, every extent ref is returned + * and extent_item_pos is ignored. + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { struct btrfs_key key; struct btrfs_path *path; @@ -1235,7 +1243,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc); + extent_item_pos, total_refs, sc, ignore_offset); if (ret) goto out; @@ -1282,7 +1290,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie); + *extent_item_pos, &eie, ignore_offset); btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) @@ -1350,7 +1358,7 @@ static void free_leaf_list(struct ulist *blocks) static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos) + const u64 *extent_item_pos, bool ignore_offset) { int ret; @@ -1359,7 +1367,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL); + *leafs, NULL, extent_item_pos, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1383,7 +1391,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1402,7 +1411,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL); + tmp, *roots, NULL, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1421,14 +1430,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, int 
btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { int ret; if (!trans) down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots); + time_seq, roots, ignore_offset); if (!trans) up_read(&fs_info->commit_root_sem); return ret; @@ -1483,7 +1493,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared); + roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1877,7 +1887,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_item_pos, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; struct btrfs_trans_handle *trans = NULL; @@ -1903,14 +1914,15 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, - &extent_item_pos); + &extent_item_pos, ignore_offset); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + tree_mod_seq_elem.seq, &roots, + ignore_offset); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1943,7 +1955,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; u64 extent_item_pos; @@ -1961,7 +1974,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx); + iterate, ctx, ignore_offset); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e410335841aacf..0c2fab8514ffa1 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -43,17 +43,19 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + u64 time_seq, struct ulist **roots, bool ignore_offset); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f1b53f8573581..eb20239284a26b 100644 --- 
a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2457,7 +2457,7 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, ret = iterate_inodes_from_logical(old->bytenr + old->extent_offset, fs_info, path, record_one_backref, - old); + old, false); if (ret < 0 && ret != -ENOENT) return false; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 847d318756d474..2497a5d45d9c59 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4560,7 +4560,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes); + build_ino_list, inodes, false); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e172d4843eae2d..168fd03ca3ac94 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1441,7 +1441,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, u64 bytenr = qrecord->bytenr; int ret; - ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root); + ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); if (ret < 0) return ret; @@ -2031,7 +2031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, - &record->old_roots); + &record->old_roots, false); if (ret < 0) goto cleanup; } @@ -2042,7 +2042,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, SEQ_LAST, &new_roots); + record->bytenr, SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -2570,7 +2570,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, num_bytes = found.offset; ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots); + &roots, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cd1b791d9706cc..b2f871d80982f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -883,7 +883,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, - scrub_print_warning_inode, &swarn); + scrub_print_warning_inode, &swarn, false); } out: @@ -1047,7 +1047,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) * can be found. 
*/ ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, - scrub_fixup_readpage, fixup); + scrub_fixup_readpage, fixup, false); if (ret < 0) { uncorrectable = 1; goto out; @@ -4390,7 +4390,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) } ret = iterate_inodes_from_logical(logical, fs_info, path, - record_inode_for_nocow, nocow_ctx); + record_inode_for_nocow, nocow_ctx, false); if (ret != 0 && ret != -ENOENT) { btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d", diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 13b98a554aab36..c10e4c70f02d15 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1423,7 +1423,7 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - backref_ctx); + backref_ctx, false); if (ret < 0) goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 0f4ce970d19517..90204b16664352 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -240,7 +240,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -252,7 +253,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -275,7 +277,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -286,7 +289,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -337,7 +341,8 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -349,7 +354,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -370,7 +376,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -382,7 +389,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = 
btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -409,7 +417,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -421,7 +430,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); From d24a67b2d997c860a42516076f3315c2ad2d2884 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Fri, 22 Sep 2017 13:58:46 -0400 Subject: [PATCH 085/109] btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] 
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. 
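Since the flags field, the IGNORE_OFFSET bit and the LOGICAL_INO_V2 ioctl number are all added by this patch, a caller only needs the updated uapi header. The program below is an editor-added illustration, not part of the patch: it assumes a <linux/btrfs.h> that carries these additions, the conventional btrfs_data_container output layout of (inode, offset, root) u64 triplets filled in by build_ino_list(), and CAP_SYS_ADMIN (run it as root). The output buffer is kept at 64K here; the next patch in the series raises the V2 cap to 16M.

/*
 * Hedged userspace sketch: resolve every ref to an extent in one call by
 * passing BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to LOGICAL_INO_V2.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_logical_ino_args args;
	struct btrfs_data_container *out;
	const size_t out_size = 64 * 1024;	/* 64K; 16M allowed by the next patch */
	unsigned int i;
	int fd, ret;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <mountpoint> <extent bytenr>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	out = calloc(1, out_size);
	if (!out) {
		close(fd);
		return 1;
	}

	memset(&args, 0, sizeof(args));		/* reserved[] must stay zero for V2 */
	args.logical = strtoull(argv[2], NULL, 0);	/* must be an extent bytenr */
	args.size = out_size;
	args.flags = BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
	args.inodes = (uintptr_t)out;

	ret = ioctl(fd, BTRFS_IOC_LOGICAL_INO_V2, &args);
	if (ret < 0) {
		perror("BTRFS_IOC_LOGICAL_INO_V2");
	} else {
		/* build_ino_list() packs (inode, offset, root) triplets into val[] */
		for (i = 0; i < out->elem_cnt; i += 3)
			printf("inode %llu offset %llu root %llu\n",
			       (unsigned long long)out->val[i],
			       (unsigned long long)out->val[i + 1],
			       (unsigned long long)out->val[i + 2]);
	}

	free(out);
	close(fd);
	return ret < 0 ? 1 : 0;
}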
Signed-off-by: Zygo Blaxell Reviewed-by: Hans van Kranenburg Tested-by: Hans van Kranenburg [ copy background and motivation from cover letter ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 26 +++++++++++++++++++++++--- include/uapi/linux/btrfs.h | 8 +++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2497a5d45d9c59..fa9996ab3da616 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4530,13 +4530,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) } static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, - void __user *arg) + void __user *arg, int version) { int ret = 0; int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; + bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4545,6 +4546,22 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, if (IS_ERR(loi)) return PTR_ERR(loi); + if (version == 1) { + ignore_offset = false; + } else { + /* All reserved bits must be 0 for now */ + if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { + ret = -EINVAL; + goto out_loi; + } + /* Only accept flags we have defined so far */ + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { + ret = -EINVAL; + goto out_loi; + } + ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -4560,7 +4577,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, false); + build_ino_list, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4574,6 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, out: btrfs_free_path(path); kvfree(inodes); +out_loi: kfree(loi); return ret; @@ -5575,7 +5593,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: - return btrfs_ioctl_logical_to_ino(fs_info, argp); + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); + case BTRFS_IOC_LOGICAL_INO_V2: + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 378230c163d5ec..99bb7988e6fe67 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -608,10 +608,14 @@ struct btrfs_ioctl_ino_path_args { struct btrfs_ioctl_logical_ino_args { __u64 logical; /* in */ __u64 size; /* in */ - __u64 reserved[4]; + __u64 reserved[3]; /* must be 0 for now */ + __u64 flags; /* in, v2 only */ /* struct btrfs_data_container *inodes; out */ __u64 inodes; }; +/* Return every ref to the extent, not just those containing logical block. + * Requires logical == extent bytenr. 
*/ +#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0) enum btrfs_dev_stat_values { /* disk I/O failure stats */ @@ -835,5 +839,7 @@ enum btrfs_err_code { struct btrfs_ioctl_feature_flags[3]) #define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \ + struct btrfs_ioctl_logical_ino_args) #endif /* _UAPI_LINUX_BTRFS_H */ From b115e3bc81aeb624fe7c4eccecbd094601ebde84 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Fri, 22 Sep 2017 13:58:47 -0400 Subject: [PATCH 086/109] btrfs: increase output size for LOGICAL_INO_V2 ioctl Build-server workloads have hundreds of references per file after dedup. Multiply by a few snapshots and we quickly exhaust the limit of 2730 references per extent that can fit into a 64K buffer. Raise the limit to 16M to be consistent with other btrfs ioctls (e.g. TREE_SEARCH_V2, FILE_EXTENT_SAME). To minimize surprising userspace behavior, apply this change only to the LOGICAL_INO_V2 ioctl. Signed-off-by: Zygo Blaxell Reviewed-by: Hans van Kranenburg Tested-by: Hans van Kranenburg Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fa9996ab3da616..0afa79c65b9ff3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4548,6 +4548,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, if (version == 1) { ignore_offset = false; + size = min_t(u32, loi->size, SZ_64K); } else { /* All reserved bits must be 0 for now */ if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { @@ -4560,6 +4561,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, goto out_loi; } ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; + size = min_t(u32, loi->size, SZ_16M); } path = btrfs_alloc_path(); @@ -4568,7 +4570,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, goto out; } - size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); From 8b62f87bad9cf06e536799bf8cb942ab95f6bfa4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:15:55 -0400 Subject: [PATCH 087/109] Btrfs: rework outstanding_extents Right now we do a lot of weird hoops around outstanding_extents in order to keep the extent count consistent. This is because we logically transfer the outstanding_extent count from the initial reservation through the set_delalloc_bits. This makes it pretty difficult to get a handle on how and when we need to mess with outstanding_extents. Fix this by revamping the rules of how we deal with outstanding_extents. Now instead everybody that is holding on to a delalloc extent is required to increase the outstanding extents count for itself. This means we'll have something like this btrfs_delalloc_reserve_metadata - outstanding_extents = 1 btrfs_set_extent_delalloc - outstanding_extents = 2 btrfs_release_delalloc_extents - outstanding_extents = 1 for an initial file write. 
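Before moving on to the more involved cases, the initial-write pairing above can be sketched as kernel-style code. This is an illustration only: example_dirty_range() is a made-up wrapper and error handling and locking are simplified, but the three btrfs_* calls and their signatures are the ones this patch works with.

/*
 * Illustrative sketch only: the pairing the new rules impose on a write
 * that covers at most one maximum-sized extent.  Counter values in the
 * comments mirror the walkthrough above.
 */
static int example_dirty_range(struct btrfs_inode *inode, u64 start, u64 end)
{
	u64 len = end - start + 1;
	int ret;

	/* take a temporary outstanding extent for what we may add: 0 -> 1 */
	ret = btrfs_delalloc_reserve_metadata(inode, len);
	if (ret)
		return ret;

	/* the set_bit_hook counts the real delalloc extent: 1 -> 2 */
	ret = btrfs_set_extent_delalloc(&inode->vfs_inode, start, end, NULL, 0);

	/* always drop the temporary count, success or error: back to 1 (or 0) */
	btrfs_delalloc_release_extents(inode, len);
	return ret;
}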
Now take the append write where we extend an existing delalloc range but still under the maximum extent size btrfs_delalloc_reserve_metadata - outstanding_extents = 2 btrfs_set_extent_delalloc btrfs_set_bit_hook - outstanding_extents = 3 btrfs_merge_extent_hook - outstanding_extents = 2 btrfs_delalloc_release_extents - outstanding_extnets = 1 In order to make the ordered extent transition we of course must now make ordered extents carry their own outstanding_extent reservation, so for cow_file_range we end up with btrfs_add_ordered_extent - outstanding_extents = 2 clear_extent_bit - outstanding_extents = 1 btrfs_remove_ordered_extent - outstanding_extents = 0 This makes all manipulations of outstanding_extents much more explicit. Every successful call to btrfs_delalloc_reserve_metadata _must_ now be combined with btrfs_release_delalloc_extents, even in the error case, as that is the only function that actually modifies the outstanding_extents counter. The drawback to this is now we are much more likely to have transient cases where outstanding_extents is much larger than it actually should be. This could happen before as we manipulated the delalloc bits, but now it happens basically at every write. This may put more pressure on the ENOSPC flushing code, but I think making this code simpler is worth the cost. I have another change coming to mitigate this side-effect somewhat. I also added trace points for the counter manipulation. These were used by a bpf script I wrote to help track down leak issues. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 17 +++++ fs/btrfs/ctree.h | 2 + fs/btrfs/extent-tree.c | 139 +++++++++++++++++++++++------------ fs/btrfs/file.c | 22 ++---- fs/btrfs/inode-map.c | 3 +- fs/btrfs/inode.c | 114 +++++++++------------------- fs/btrfs/ioctl.c | 2 + fs/btrfs/ordered-data.c | 21 +++++- fs/btrfs/relocation.c | 3 + fs/btrfs/tests/inode-tests.c | 18 +---- 10 files changed, 185 insertions(+), 156 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index eccadb5f62a5ed..bed88811f686ac 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -267,6 +267,23 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, + int mod) +{ + lockdep_assert_held(&inode->lock); + inode->outstanding_extents += mod; + if (btrfs_is_free_space_inode(inode)) + return; +} + +static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode, int mod) +{ + lockdep_assert_held(&inode->lock); + inode->reserved_extents += mod; + if (btrfs_is_free_space_inode(inode)) + return; +} + static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { int ret = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c02d9524055b3..2ede3b6ceb681a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2748,6 +2748,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4f874d02f31049..aaa346562df62d 
100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5954,42 +5954,31 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, } /** - * drop_outstanding_extent - drop an outstanding extent + * drop_over_reserved_extents - drop our extra extent reservations * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're releasing. * - * This is called when we are freeing up an outstanding extent, either called - * after an error or after an extent is written. This will return the number of - * reserved extents that need to be freed. This must be called with - * BTRFS_I(inode)->lock held. + * We reserve extents we may use, but they may have been merged with other + * extents and we may not need the extra reservation. + * + * We also call this when we've completed io to an extent or had an error and + * cleared the outstanding extent, in either case we no longer need our + * reservation and can drop the excess. */ -static unsigned drop_outstanding_extent(struct btrfs_inode *inode, - u64 num_bytes) +static unsigned drop_over_reserved_extents(struct btrfs_inode *inode) { - unsigned drop_inode_space = 0; - unsigned dropped_extents = 0; - unsigned num_extents; + unsigned num_extents = 0; - num_extents = count_max_extents(num_bytes); - ASSERT(num_extents); - ASSERT(inode->outstanding_extents >= num_extents); - inode->outstanding_extents -= num_extents; + if (inode->reserved_extents > inode->outstanding_extents) { + num_extents = inode->reserved_extents - + inode->outstanding_extents; + btrfs_mod_reserved_extents(inode, -num_extents); + } if (inode->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, &inode->runtime_flags)) - drop_inode_space = 1; - - /* - * If we have more or the same amount of outstanding extents than we have - * reserved then we need to leave the reserved extents count alone. - */ - if (inode->outstanding_extents >= inode->reserved_extents) - return drop_inode_space; - - dropped_extents = inode->reserved_extents - inode->outstanding_extents; - inode->reserved_extents -= dropped_extents; - return dropped_extents + drop_inode_space; + num_extents++; + return num_extents; } /** @@ -6044,13 +6033,15 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; u64 to_reserve = 0; u64 csum_bytes; - unsigned nr_extents; + unsigned nr_extents, reserve_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; u64 to_free = 0; unsigned dropped; bool release_extra = false; + bool underflow = false; + bool did_retry = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -6075,18 +6066,31 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) mutex_lock(&inode->delalloc_mutex); num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - +retry: spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - inode->outstanding_extents += nr_extents; + reserve_extents = nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, nr_extents); - nr_extents = 0; - if (inode->outstanding_extents > inode->reserved_extents) - nr_extents += inode->outstanding_extents - + /* + * Because we add an outstanding extent for ordered before we clear + * delalloc we will double count our outstanding extents slightly. 
This + * could mean that we transiently over-reserve, which could result in an + * early ENOSPC if our timing is unlucky. Keep track of the case that + * we had a reservation underflow so we can retry if we fail. + * + * Keep in mind we can legitimately have more outstanding extents than + * reserved because of fragmentation, so only allow a retry once. + */ + if (inode->outstanding_extents > + inode->reserved_extents + nr_extents) { + reserve_extents = inode->outstanding_extents - inode->reserved_extents; + underflow = true; + } /* We always want to reserve a slot for updating the inode. */ - to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); + to_reserve = btrfs_calc_trans_metadata_size(fs_info, + reserve_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); csum_bytes = inode->csum_bytes; spin_unlock(&inode->lock); @@ -6111,7 +6115,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); release_extra = true; } - inode->reserved_extents += nr_extents; + btrfs_mod_reserved_extents(inode, reserve_extents); spin_unlock(&inode->lock); if (delalloc_lock) @@ -6127,7 +6131,10 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) out_fail: spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -nr_extents); + + dropped = drop_over_reserved_extents(inode); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers @@ -6184,6 +6191,11 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), to_free, 0); } + if (underflow && !did_retry) { + did_retry = true; + underflow = false; + goto retry; + } if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6191,12 +6203,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) /** * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for - * @num_bytes: the number of bytes we're releasing + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata - * reservations. + * reservations, or on error for the same reason. */ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) { @@ -6206,8 +6218,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); - + dropped = drop_over_reserved_extents(inode); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&inode->lock); @@ -6223,6 +6234,42 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); } +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. 
Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. + */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + unsigned num_extents; + u64 to_free; + unsigned dropped; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + dropped = drop_over_reserved_extents(inode); + spin_unlock(&inode->lock); + + if (!dropped) + return; + + if (btrfs_is_testing(fs_info)) + return; + + to_free = btrfs_calc_trans_metadata_size(fs_info, dropped); + trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), + to_free, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); +} + /** * btrfs_delalloc_reserve_space - reserve data and metadata space for * delalloc @@ -6267,10 +6314,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * @inode: inode we're releasing space for * @start: start position of the space already reserved * @len: the len of the space already reserved - * - * This must be matched with a call to btrfs_delalloc_reserve_space. This is - * called in the case that we don't need the metadata AND data reservations - * anymore. So if there is an error or we insert an inline extent. + * @release_bytes: the len of the space we consumed or didn't use * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes @@ -6278,7 +6322,8 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * Also it will handle the qgroup reserved space. */ void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); btrfs_free_reserved_data_space(inode, reserved, start, len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4de174b664ffa3..f80254d82f409b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1656,6 +1656,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } } + WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes); if (ret) { @@ -1678,8 +1679,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ret = prepare_pages(inode, pages, num_pages, pos, write_bytes, force_page_uptodate); - if (ret) + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); break; + } extents_locked = lock_and_cleanup_extent_if_need( BTRFS_I(inode), pages, @@ -1688,6 +1692,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (extents_locked < 0) { if (extents_locked == -EAGAIN) goto again; + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); ret = extents_locked; break; } @@ -1716,23 +1722,10 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, PAGE_SIZE); } - /* - * If we had a short copy we need to release the excess delaloc - * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space and - * btrfs_delalloc_release_metadata will decrement it, but - * we still have an outstanding extent for the chunk we actually - * managed to copy. 
- */ if (num_sectors > dirty_sectors) { /* release everything except the sectors we dirtied */ release_bytes -= dirty_sectors << fs_info->sb->s_blocksize_bits; - if (copied > 0) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); @@ -1758,6 +1751,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d02019747d001d..022b19336feead 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -500,11 +500,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eb20239284a26b..5b0de1e120d1ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -67,7 +67,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 outstanding_extents; u64 reserve; u64 unsubmitted_oe_range_start; u64 unsubmitted_oe_range_end; @@ -348,7 +347,6 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start); btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); out: /* @@ -584,16 +582,21 @@ static noinline void compress_file_range(struct inode *inode, } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; unsigned long page_error_op; - clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. + * + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be done _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. */ extent_clear_unlock_delalloc(inode, start, end, end, NULL, clear_flags, @@ -602,10 +605,6 @@ static noinline void compress_file_range(struct inode *inode, PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - if (ret == 0) - btrfs_free_reserved_data_space_noquota(inode, - start, - end - start + 1); goto free_pages_out; } } @@ -981,15 +980,19 @@ static noinline int cow_file_range(struct inode *inode, ret = cow_file_range_inline(root, inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { + /* + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be run _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. 
+ */ extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG, PAGE_UNLOCK | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1627,7 +1630,7 @@ static void btrfs_split_extent_hook(void *private_data, } spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1657,7 +1660,7 @@ static void btrfs_merge_extent_hook(void *private_data, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); return; } @@ -1688,7 +1691,7 @@ static void btrfs_merge_extent_hook(void *private_data, return; spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1758,15 +1761,12 @@ static void btrfs_set_bit_hook(void *private_data, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + u32 num_extents = count_max_extents(len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); + spin_unlock(&BTRFS_I(inode)->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -1820,13 +1820,9 @@ static void btrfs_clear_bit_hook(void *private_data, struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { - spin_lock(&inode->lock); - inode->outstanding_extents -= num_extents; - spin_unlock(&inode->lock); - } + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -num_extents); + spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we @@ -2097,6 +2093,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 0); ClearPageChecked(page); set_page_dirty(page); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -3051,9 +3048,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 0, &cached_state, GFP_NOFS); } - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - ordered_extent->len); if (trans) btrfs_end_transaction(trans); @@ -4794,8 +4788,11 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; + block_start = round_down(from, blocksize); + block_end = block_start + blocksize - 1; + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - round_down(from, blocksize), blocksize); + block_start, 
blocksize); if (ret) goto out; @@ -4803,15 +4800,12 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, - round_down(from, blocksize), - blocksize); + block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } - block_start = round_down(from, blocksize); - block_end = block_start + blocksize - 1; - if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); lock_page(page); @@ -4876,6 +4870,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -7790,33 +7785,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, return em; } -static void adjust_dio_outstanding_extents(struct inode *inode, - struct btrfs_dio_data *dio_data, - const u64 len) -{ - unsigned num_extents = count_max_extents(len); - - /* - * If we have an outstanding_extents count still set then we're - * within our reservation, otherwise we need to adjust our inode - * counter appropriately. - */ - if (dio_data->outstanding_extents >= num_extents) { - dio_data->outstanding_extents -= num_extents; - } else { - /* - * If dio write length has been split due to no large enough - * contiguous space, we need to compensate our inode counter - * appropriately. - */ - u64 num_needed = num_extents - dio_data->outstanding_extents; - - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_needed; - spin_unlock(&BTRFS_I(inode)->lock); - } -} - static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -7978,7 +7946,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (!dio_data->overwrite && start + len > i_size_read(inode)) i_size_write(inode, start + len); - adjust_dio_outstanding_extents(inode, dio_data, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -8008,14 +7975,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, err: if (dio_data) current->journal_info = dio_data; - /* - * Compensate the delalloc release we do in btrfs_direct_IO() when we - * write less data then expected, so that we don't underflow our inode's - * outstanding extents counter. 
- */ - if (create && dio_data) - adjust_dio_outstanding_extents(inode, dio_data, len); - return ret; } @@ -8860,7 +8819,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) offset, count); if (ret) goto out; - dio_data.outstanding_extents = count_max_extents(count); /* * We need to know how many extents we reserved so that we can @@ -8887,6 +8845,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { up_read(&BTRFS_I(inode)->dio_sem); current->journal_info = NULL; + btrfs_delalloc_release_extents(BTRFS_I(inode), count); if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) btrfs_delalloc_release_space(inode, data_reserved, @@ -9224,9 +9183,6 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE - reserved_space); } @@ -9278,12 +9234,14 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) out_unlock: if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space); out_noreserve: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0afa79c65b9ff3..fd172a93d11a9b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1217,6 +1217,7 @@ static int cluster_pages_for_defrag(struct inode *inode, unlock_page(pages[i]); put_page(pages[i]); } + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1227,6 +1228,7 @@ static int cluster_pages_for_defrag(struct inode *inode, btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a3aca495e33e2c..5b311aeddcc80d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -242,6 +242,15 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, } spin_unlock(&root->ordered_extent_lock); + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); + spin_unlock(&BTRFS_I(inode)->lock); + return 0; } @@ -591,11 +600,19 @@ void btrfs_remove_ordered_extent(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; bool dec_pending_ordered = false; - tree = &BTRFS_I(inode)->ordered_tree; + /* This is paired with btrfs_add_ordered_extent. 
*/ + spin_lock(&btrfs_inode->lock); + btrfs_mod_outstanding_extents(btrfs_inode, -1); + spin_unlock(&btrfs_inode->lock); + if (root != fs_info->tree_root) + btrfs_delalloc_release_metadata(btrfs_inode, entry->len); + + tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7d506a3e46dd82..4cf2eb67eba6ce 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3246,6 +3246,8 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -EIO; goto out; } @@ -3275,6 +3277,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 330815eb07b40f..f797642c013dad 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -968,7 +968,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, NULL, 0); if (ret) { @@ -983,7 +982,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, NULL, 0); @@ -1003,7 +1001,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1017,7 +1015,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, @@ -1035,12 +1032,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] - * - * I'm artificially adding 2 to outstanding_extents because in the - * buffered IO case we'd add things up as we go, but I don't feel like - * doing that here, this isn't the interesting case we want to test. 
*/ - BTRFS_I(inode)->outstanding_extents += 2; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, @@ -1059,7 +1051,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); @@ -1079,7 +1070,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1096,7 +1087,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) * Refill the hole again just for good measure, because I thought it * might fail and I'd rather satisfy my paranoia at this point. */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); @@ -1114,7 +1104,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1131,7 +1121,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); From dd48d4072e0cdac51edcbff66342fe2f21b5b588 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:15:56 -0400 Subject: [PATCH 088/109] btrfs: add tracepoints for outstanding extents mods This is handy for tracing problems with modifying the outstanding extents counters. 
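As a hedged illustration (not part of the patch), one way to consume the new event from userspace is to enable it through tracefs and stream trace_pipe. This assumes tracefs is mounted at /sys/kernel/tracing (older setups expose the same files under /sys/kernel/debug/tracing) and root privileges; it runs until interrupted.

#include <stdio.h>

int main(void)
{
	const char *base = "/sys/kernel/tracing";
	char path[256], line[1024];
	FILE *f;

	/* turn on the event added by this patch */
	snprintf(path, sizeof(path),
		 "%s/events/btrfs/btrfs_inode_mod_outstanding_extents/enable",
		 base);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	/* stream events; each line ends with e.g. "root=5(FS_TREE) ino=261 mod=1" */
	snprintf(path, sizeof(path), "%s/trace_pipe", base);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}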
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 ++ include/trace/events/btrfs.h | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bed88811f686ac..427c8738a3bdea 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -274,6 +274,8 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, inode->outstanding_extents += mod; if (btrfs_is_free_space_inode(inode)) return; + trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), + mod); } static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode, int mod) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index bfe2f23b578c39..5848ae7845dacc 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1695,6 +1695,27 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, TP_ARGS(fs_info, oldref, newref, tree_size) ); +TRACE_EVENT(btrfs_inode_mod_outstanding_extents, + TP_PROTO(struct btrfs_root *root, u64 ino, int mod), + + TP_ARGS(root, ino, mod), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, ino ) + __field( int, mod ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->objectid; + __entry->ino = ino; + __entry->mod = mod; + ), + + TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->ino, __entry->mod) +); #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ From 69fe2d75dd91d0124ad2ab6e9fef07633bd730e0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:15:57 -0400 Subject: [PATCH 089/109] btrfs: make the delalloc block rsv per inode The way we handle delalloc metadata reservations has gotten progressively more complicated over the years. There is so much cruft and weirdness around keeping the reserved count and outstanding counters consistent and handling the error cases that it's impossible to understand. Fix this by making the delalloc block rsv per-inode. This way we can calculate the actual size of the outstanding metadata reservations every time we make a change, and then reserve the delta based on that amount. This greatly simplifies the code everywhere, and makes the error handling in btrfs_delalloc_reserve_metadata far less terrifying. 
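To make the "reserve the delta" idea concrete, here is a deliberately simplified conceptual sketch. Every example_* name is made up for illustration (the patch's real helpers are not visible in the hunks below); only the shape of the calculation is the point: recompute what the per-inode rsv should hold from the live counters, then adjust by the difference.

/* Conceptual sketch only; example_* names are hypothetical. */
static void example_refresh_inode_rsv(struct example_inode *inode)
{
	/*
	 * What the per-inode rsv should hold right now: a metadata item per
	 * outstanding extent plus a csum component, recomputed on every change.
	 */
	u64 needed = example_meta_bytes(inode->outstanding_extents) +
		     example_csum_bytes(inode->csum_bytes);
	u64 have = inode->block_rsv.size;

	/* reserve or release only the delta, never the whole amount */
	if (needed > have)
		example_reserve(&inode->block_rsv, needed - have);
	else if (needed < have)
		example_release(&inode->block_rsv, have - needed);
}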
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 26 ++-- fs/btrfs/ctree.h | 5 +- fs/btrfs/delayed-inode.c | 46 +----- fs/btrfs/disk-io.c | 18 +-- fs/btrfs/extent-tree.c | 320 +++++++++++++-------------------------- fs/btrfs/inode.c | 18 ++- 6 files changed, 141 insertions(+), 292 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 427c8738a3bdea..63f0ccc92a7122 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -36,14 +36,13 @@ #define BTRFS_INODE_ORPHAN_META_RESERVED 1 #define BTRFS_INODE_DUMMY 2 #define BTRFS_INODE_IN_DEFRAG 3 -#define BTRFS_INODE_DELALLOC_META_RESERVED 4 -#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 -#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 -#define BTRFS_INODE_NEEDS_FULL_SYNC 7 -#define BTRFS_INODE_COPY_EVERYTHING 8 -#define BTRFS_INODE_IN_DELALLOC_LIST 9 -#define BTRFS_INODE_READDIO_NEED_LOCK 10 -#define BTRFS_INODE_HAS_PROPS 11 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 4 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 5 +#define BTRFS_INODE_NEEDS_FULL_SYNC 6 +#define BTRFS_INODE_COPY_EVERYTHING 7 +#define BTRFS_INODE_IN_DELALLOC_LIST 8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 +#define BTRFS_INODE_HAS_PROPS 10 /* in memory btrfs inode */ struct btrfs_inode { @@ -176,7 +175,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. */ unsigned outstanding_extents; - unsigned reserved_extents; + + struct btrfs_block_rsv block_rsv; /* * Cached values of inode properties @@ -278,14 +278,6 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, mod); } -static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode, int mod) -{ - lockdep_assert_held(&inode->lock); - inode->reserved_extents += mod; - if (btrfs_is_free_space_inode(inode)) - return; -} - static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { int ret = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2ede3b6ceb681a..f7df5536ab61e1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -763,8 +763,6 @@ struct btrfs_fs_info { * delayed dir index item */ struct btrfs_block_rsv global_block_rsv; - /* block reservation for delay allocation */ - struct btrfs_block_rsv delalloc_block_rsv; /* block reservation for metadata operations */ struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ @@ -2757,6 +2755,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode, void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 19e4ad2f3f2e4a..5d73f79ded8bcb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -581,36 +581,12 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - bool release = false; src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - /* - * If our block_rsv is the delalloc block reserve then check and see if - * we have our extra reservation for updating the inode. If not fall - * through and try to reserve space quickly. 
- * - * We used to try and steal from the delalloc block rsv or the global - * reserve, but we'd steal a full reservation, which isn't kind. We are - * here through delalloc which means we've likely just cowed down close - * to the leaf that contains the inode, so we would steal less just - * doing the fallback inode update, so if we do end up having to steal - * from the global block rsv we hopefully only steal one or two blocks - * worth which is less likely to hurt us. - */ - if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&inode->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - release = true; - else - src_rsv = NULL; - spin_unlock(&inode->lock); - } - /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we @@ -618,7 +594,7 @@ static int btrfs_delayed_inode_reserve_metadata( * space. * * Now if src_rsv == delalloc_block_rsv we'll let it just steal since - * we're accounted for. + * we always reserve enough to update the inode item. */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { @@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata( } ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); - - /* - * Migrate only takes a reservation, it doesn't touch the size of the - * block_rsv. This is to simplify people who don't normally have things - * migrated from their block rsv. If they go to release their - * reservation, that will decrease the size as well, so if migrate - * reduced size we'd end up with a negative size. But for the - * delalloc_meta_reserved stuff we will only know to drop 1 reservation, - * but we could in fact do this reserve/migrate dance several times - * between the time we did the original reservation and we'd clean it - * up. So to take care of this, release the space for the meta - * reservation here. I think it may be time for a documentation page on - * how block rsvs. work. 
- */ if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; } - if (release) { - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(fs_info, src_rsv, num_bytes); - } - return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 484cf8fc952c43..d1f396f7297979 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2447,14 +2447,6 @@ int open_ctree(struct super_block *sb, goto fail_delalloc_bytes; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bio_counter; - } - - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2487,8 +2479,6 @@ int open_ctree(struct super_block *sb, btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, - BTRFS_BLOCK_RSV_DELALLOC); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); @@ -2517,6 +2507,14 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bio_counter; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); if (!fs_info->delayed_root) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aaa346562df62d..fc9720e280055a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "hash.h" #include "tree-log.h" #include "disk-io.h" @@ -4811,7 +4812,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, u64 orig, bool wait_ordered) { - struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; @@ -4827,8 +4827,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; - block_rsv = &fs_info->delalloc_block_rsv; - space_info = block_rsv->space_info; + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -5564,11 +5563,12 @@ static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, } } -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; + u64 ret; spin_lock(&block_rsv->lock); if (num_bytes == (u64)-1) @@ -5583,6 +5583,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&block_rsv->lock); + ret = num_bytes; if (num_bytes > 0) { if (dest) { spin_lock(&dest->lock); @@ -5602,6 +5603,7 @@ static void block_rsv_release_bytes(struct 
btrfs_fs_info *fs_info, space_info_add_old_bytes(fs_info, space_info, num_bytes); } + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, @@ -5625,6 +5627,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) rsv->type = type; } +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type) { @@ -5634,9 +5645,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv, type); - block_rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); return block_rsv; } @@ -5719,6 +5728,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +/** + * btrfs_inode_rsv_refill - refill the inode block rsv. + * @inode - the inode we are refilling. + * @flush - the flusing restriction. + * + * Essentially the same as btrfs_block_rsv_refill, except it uses the + * block_rsv->size as the minimum size. We'll either refill the missing amount + * or return if we already have enough space. This will also handle the resreve + * tracepoint for the reserved amount. + */ +int btrfs_inode_rsv_refill(struct btrfs_inode *inode, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_root *root = inode->root; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) + num_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 1); + } + return ret; +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. + */ +void btrfs_inode_rsv_release(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. 
+ */ + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); +} + void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5790,7 +5859,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; @@ -5810,8 +5878,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, (u64)-1); - WARN_ON(fs_info->delalloc_block_rsv.size > 0); - WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -5953,95 +6019,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, rsv, (u64)-1); } -/** - * drop_over_reserved_extents - drop our extra extent reservations - * @inode: the inode we're dropping the extent for - * - * We reserve extents we may use, but they may have been merged with other - * extents and we may not need the extra reservation. - * - * We also call this when we've completed io to an extent or had an error and - * cleared the outstanding extent, in either case we no longer need our - * reservation and can drop the excess. - */ -static unsigned drop_over_reserved_extents(struct btrfs_inode *inode) -{ - unsigned num_extents = 0; - - if (inode->reserved_extents > inode->outstanding_extents) { - num_extents = inode->reserved_extents - - inode->outstanding_extents; - btrfs_mod_reserved_extents(inode, -num_extents); - } - - if (inode->outstanding_extents == 0 && - test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - num_extents++; - return num_extents; -} - -/** - * calc_csum_metadata_size - return the amount of metadata space that must be - * reserved/freed for the given bytes. - * @inode: the inode we're manipulating - * @num_bytes: the number of bytes in question - * @reserve: 1 if we are reserving space, 0 if we are freeing space - * - * This adjusts the number of csum_bytes in the inode and then returns the - * correct amount of metadata that must either be reserved or freed. We - * calculate how many checksums we can fit into one leaf and then divide the - * number of bytes that will need to be checksumed by this value to figure out - * how many checksums will be required. If we are adding bytes then the number - * may go up and we will return the number of additional bytes that must be - * reserved. If it is going down we will return the number of bytes that must - * be freed. - * - * This must be called with BTRFS_I(inode)->lock held. 
- */ -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, - int reserve) +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 old_csums, num_csums; - - if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) - return 0; - - old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - if (reserve) - inode->csum_bytes += num_bytes; - else - inode->csum_bytes -= num_bytes; - num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - - /* No change, no need to reserve more */ - if (old_csums == num_csums) - return 0; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; - if (reserve) - return btrfs_calc_trans_metadata_size(fs_info, - num_csums - old_csums); + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + if (outstanding_extents) + reserve_size = btrfs_calc_trans_metadata_size(fs_info, + outstanding_extents + 1); + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_trans_metadata_size(fs_info, + csum_leaves); - return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + spin_unlock(&block_rsv->lock); } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; - u64 to_reserve = 0; - u64 csum_bytes; - unsigned nr_extents, reserve_extents; + unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; - u64 to_free = 0; - unsigned dropped; - bool release_extra = false; - bool underflow = false; - bool did_retry = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -6066,33 +6074,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) mutex_lock(&inode->delalloc_mutex); num_bytes = ALIGN(num_bytes, fs_info->sectorsize); -retry: + + /* Add our new extents and calculate the new rsv size. */ spin_lock(&inode->lock); - reserve_extents = nr_extents = count_max_extents(num_bytes); + nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); - - /* - * Because we add an outstanding extent for ordered before we clear - * delalloc we will double count our outstanding extents slightly. This - * could mean that we transiently over-reserve, which could result in an - * early ENOSPC if our timing is unlucky. Keep track of the case that - * we had a reservation underflow so we can retry if we fail. - * - * Keep in mind we can legitimately have more outstanding extents than - * reserved because of fragmentation, so only allow a retry once. - */ - if (inode->outstanding_extents > - inode->reserved_extents + nr_extents) { - reserve_extents = inode->outstanding_extents - - inode->reserved_extents; - underflow = true; - } - - /* We always want to reserve a slot for updating the inode. 
*/ - to_reserve = btrfs_calc_trans_metadata_size(fs_info, - reserve_extents + 1); - to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); - csum_bytes = inode->csum_bytes; + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { @@ -6102,100 +6090,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) goto out_fail; } - ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); + ret = btrfs_inode_rsv_refill(inode, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * fs_info->nodesize); goto out_fail; } - spin_lock(&inode->lock); - if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) { - to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); - release_extra = true; - } - btrfs_mod_reserved_extents(inode, reserve_extents); - spin_unlock(&inode->lock); - if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); - - if (to_reserve) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_reserve, 1); - if (release_extra) - btrfs_block_rsv_release(fs_info, block_rsv, - btrfs_calc_trans_metadata_size(fs_info, 1)); return 0; out_fail: spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, -nr_extents); - - dropped = drop_over_reserved_extents(inode); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - */ - if (inode->csum_bytes == csum_bytes) { - calc_csum_metadata_size(inode, num_bytes, 0); - } else { - u64 orig_csum_bytes = inode->csum_bytes; - u64 bytes; - - /* - * This is tricky, but first we need to figure out how much we - * freed from any free-ers that occurred during this - * reservation, so we reset ->csum_bytes to the csum_bytes - * before we dropped our lock, and then call the free for the - * number of bytes that were freed while we were trying our - * reservation. - */ - bytes = csum_bytes - inode->csum_bytes; - inode->csum_bytes = csum_bytes; - to_free = calc_csum_metadata_size(inode, bytes, 0); - - - /* - * Now we need to see how much we would have freed had we not - * been making this reservation and our ->csum_bytes were not - * artificially inflated. - */ - inode->csum_bytes = csum_bytes - num_bytes; - bytes = csum_bytes - orig_csum_bytes; - bytes = calc_csum_metadata_size(inode, bytes, 0); - - /* - * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have freed more space had we - * not had an artificially high ->csum_bytes, so we need to free - * the remainder. If bytes is the same or less then we don't - * need to do anything, the other free-ers did the correct - * thing. 
- */ - inode->csum_bytes = orig_csum_bytes - num_bytes; - if (bytes > to_free) - to_free = bytes - to_free; - else - to_free = 0; - } + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); - if (to_free) { - btrfs_block_rsv_release(fs_info, block_rsv, to_free); - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); - } - if (underflow && !did_retry) { - did_retry = true; - underflow = false; - goto retry; - } + btrfs_inode_rsv_release(inode); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6213,25 +6127,17 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 to_free = 0; - unsigned dropped; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - dropped = drop_over_reserved_extents(inode); - if (num_bytes) - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped > 0) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (btrfs_is_testing(fs_info)) return; - trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), - to_free, 0); - - btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); + btrfs_inode_rsv_release(inode); } /** @@ -6249,25 +6155,17 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); unsigned num_extents; - u64 to_free; - unsigned dropped; spin_lock(&inode->lock); num_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, -num_extents); - dropped = drop_over_reserved_extents(inode); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (!dropped) - return; - if (btrfs_is_testing(fs_info)) return; - to_free = btrfs_calc_trans_metadata_size(fs_info, dropped); - trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), - to_free, 0); - btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); + btrfs_inode_rsv_release(inode); } /** diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b0de1e120d1ef..b71731ef28c474 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -315,7 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -2954,7 +2955,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -2990,7 +2991,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = 
&BTRFS_I(inode)->block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -8845,7 +8846,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { up_read(&BTRFS_I(inode)->dio_sem); current->journal_info = NULL; - btrfs_delalloc_release_extents(BTRFS_I(inode), count); if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) btrfs_delalloc_release_space(inode, data_reserved, @@ -8866,6 +8866,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -9430,6 +9431,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct inode *btrfs_alloc_inode(struct super_block *sb) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; @@ -9456,8 +9458,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) spin_lock_init(&ei->lock); ei->outstanding_extents = 0; - ei->reserved_extents = 0; - + if (sb->s_magic != BTRFS_TEST_MAGIC) + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); ei->runtime_flags = 0; ei->prop_compress = BTRFS_COMPRESS_NONE; ei->defrag_compress = BTRFS_COMPRESS_NONE; @@ -9507,8 +9510,9 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!hlist_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(BTRFS_I(inode)->block_rsv.reserved); + WARN_ON(BTRFS_I(inode)->block_rsv.size); WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); From c7ad7c843965d8691269f581e132633a4ca9ef91 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:15:58 -0400 Subject: [PATCH 090/109] btrfs: switch args for comp_*_refs Make it more consistent, we want the inserted ref to be compared against what's already in there. This will make the order go from lowest seq -> highest seq, which will make us more likely to make forward progress if there's a seqlock currently held. 
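[ Editorial note: the argument order matters because the comparator's sign
  decides where a new entry sorts relative to entries already present. The
  comp_refs() helper and the rb-tree insertion that rely on this convention
  are only introduced by the following two patches in this series; the snippet
  below only illustrates the convention the swap establishes, it is not code
  from this patch: ]

	/* new ref compared against a ref already in the structure */
	comp = comp_refs(ins, entry, true);
	if (comp < 0)
		p = &(*p)->rb_left;	/* lower seq sorts earlier */
	else if (comp > 0)
		p = &(*p)->rb_right;	/* higher seq sorts later */
	else
		return entry;		/* same ref: merge instead of insert */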
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a2973340a94f87..bc940bb374cfb8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -40,8 +40,8 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * compare two delayed tree backrefs with same bytenr and type */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1) +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, + struct btrfs_delayed_tree_ref *ref2) { if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { if (ref1->root < ref2->root) @@ -60,8 +60,8 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, - struct btrfs_delayed_data_ref *ref1) +static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, + struct btrfs_delayed_data_ref *ref2) { if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { if (ref1->root < ref2->root) From 1d148e5939f55c76d06108548c7c0226e55dde8e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:15:59 -0400 Subject: [PATCH 091/109] btrfs: add a comp_refs() helper Instead of open-coding the delayed ref comparisons, add a helper to do the comparisons generically and use that everywhere. We compare sequence numbers last for following patches. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 54 +++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index bc940bb374cfb8..8c7d7db01f7aa7 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -85,6 +85,34 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, return 0; } +static int comp_refs(struct btrfs_delayed_ref_node *ref1, + struct btrfs_delayed_ref_node *ref2, + bool check_seq) +{ + int ret = 0; + + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), + btrfs_delayed_node_to_tree_ref(ref2)); + else + ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), + btrfs_delayed_node_to_data_ref(ref2)); + if (ret) + return ret; + if (check_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } + return 0; +} + /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -217,18 +245,7 @@ static bool merge_ref(struct btrfs_trans_handle *trans, if (seq && next->seq >= seq) goto next; - if (next->type != ref->type) - goto next; - - if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), - btrfs_delayed_node_to_tree_ref(next))) - goto next; - if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || - ref->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(ref), - btrfs_delayed_node_to_data_ref(next))) + if (comp_refs(ref, next, false)) goto next; if (ref->action == next->action) { @@ -402,18 +419,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, exist = list_entry(href->ref_list.prev, struct 
btrfs_delayed_ref_node, list); /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->seq != ref->seq) - goto add_tail; - - if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || - exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), - btrfs_delayed_node_to_tree_ref(ref))) - goto add_tail; - if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || - exist->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(exist), - btrfs_delayed_node_to_data_ref(ref))) + if (comp_refs(exist, ref, true)) goto add_tail; /* Now we are sure we can merge */ From 0e0adbcfdc908684317c99a9bf5e13383f03b7ec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:16:00 -0400 Subject: [PATCH 092/109] btrfs: track refs in a rb_tree instead of a list If we get a significant amount of delayed refs for a single block (think modifying multiple snapshots) we can end up spending an ungodly amount of time looping through all of the entries trying to see if they can be merged. This is because we only add them to a list, so we have O(2n) for every ref head. This doesn't make any sense as we likely have refs for different roots, and so they cannot be merged. Tracking in a tree will allow us to break as soon as we hit an entry that doesn't match, making our worst case O(n). With this we can also merge entries more easily. Before we had to hope that matching refs were on the ends of our list, but with the tree we can search down to exact matches and merge them at insert time. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/backref.c | 5 +- fs/btrfs/delayed-ref.c | 108 +++++++++++++++++++++-------------------- fs/btrfs/delayed-ref.h | 5 +- fs/btrfs/disk-io.c | 10 ++-- fs/btrfs/extent-tree.c | 21 +++++--- 5 files changed, 82 insertions(+), 67 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 523d2dba7745e9..7d0dc100a09a81 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -773,6 +773,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_key tmp_op_key; struct btrfs_key *op_key = NULL; + struct rb_node *n; int count; int ret = 0; @@ -782,7 +783,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } spin_lock(&head->lock); - list_for_each_entry(node, &head->ref_list, list) { + for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { + node = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); if (node->seq > seq) continue; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8c7d7db01f7aa7..83be8f9fd9068e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -143,6 +143,34 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, return NULL; } +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, + struct btrfs_delayed_ref_node *ins) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *node = &ins->ref_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + + while (*p) { + int comp; + + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + ref_node); + comp = comp_refs(ins, entry, true); + if (comp < 0) + p = &(*p)->rb_left; + else if (comp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + /* * find an head entry based on bytenr. 
This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. @@ -212,7 +240,8 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref) { assert_spin_locked(&head->lock); - list_del(&ref->list); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); ref->in_tree = 0; @@ -229,24 +258,18 @@ static bool merge_ref(struct btrfs_trans_handle *trans, u64 seq) { struct btrfs_delayed_ref_node *next; + struct rb_node *node = rb_next(&ref->ref_node); bool done = false; - next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (!done && &next->list != &head->ref_list) { + while (!done && node) { int mod; - struct btrfs_delayed_ref_node *next2; - - next2 = list_next_entry(next, list); - - if (next == ref) - goto next; + next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + node = rb_next(node); if (seq && next->seq >= seq) - goto next; - + break; if (comp_refs(ref, next, false)) - goto next; + break; if (ref->action == next->action) { mod = next->ref_mod; @@ -270,8 +293,6 @@ static bool merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } -next: - next = next2; } return done; @@ -283,11 +304,12 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; + struct rb_node *node; u64 seq = 0; assert_spin_locked(&head->lock); - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return; /* We don't have too many refs to merge for data. */ @@ -304,22 +326,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (&ref->list != &head->ref_list) { +again: + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) - goto next; - - if (merge_ref(trans, delayed_refs, head, ref, seq)) { - if (list_empty(&head->ref_list)) - break; - ref = list_first_entry(&head->ref_list, - struct btrfs_delayed_ref_node, - list); continue; - } -next: - ref = list_next_entry(ref, list); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + goto again; } } @@ -402,25 +415,19 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans) * Return 0 for insert. * Return >0 for merge. 
*/ -static int -add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) +static int insert_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_node *exist; int mod; int ret = 0; spin_lock(&href->lock); - /* Check whether we can merge the tail node with ref */ - if (list_empty(&href->ref_list)) - goto add_tail; - exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, - list); - /* No need to compare bytenr nor is_head */ - if (comp_refs(exist, ref, true)) - goto add_tail; + exist = tree_insert(&href->ref_tree, ref); + if (!exist) + goto inserted; /* Now we are sure we can merge */ ret = 1; @@ -451,9 +458,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, drop_delayed_ref(trans, root, href, exist); spin_unlock(&href->lock); return ret; - -add_tail: - list_add_tail(&ref->list, &href->ref_list); +inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); @@ -593,7 +598,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - INIT_LIST_HEAD(&head_ref->ref_list); + head_ref->ref_tree = RB_ROOT; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; @@ -685,7 +690,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -699,7 +704,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); /* * XXX: memory should be freed at the same level allocated. @@ -742,7 +747,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -758,8 +763,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); - + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 1ce11858d72731..a43af432f8596e 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -27,8 +27,7 @@ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ struct btrfs_delayed_ref_node { - /*data/tree ref use list, stored in ref_head->ref_list. */ - struct list_head list; + struct rb_node ref_node; /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the @@ -92,7 +91,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct list_head ref_list; + struct rb_root ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. 
*/ struct list_head ref_add_list; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d1f396f7297979..efce9a2fa9be09 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4113,7 +4113,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *tmp; + struct rb_node *n; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, @@ -4129,10 +4129,12 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, - list) { + while ((n = rb_first(&head->ref_tree)) != NULL) { + ref = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); ref->in_tree = 0; - list_del(&ref->list); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc9720e280055a..673ac4e01dd07d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2519,7 +2519,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return NULL; /* @@ -2532,8 +2532,8 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); + ref = rb_entry(rb_first(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; } @@ -2593,7 +2593,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&head->lock); spin_lock(&delayed_refs->lock); spin_lock(&head->lock); - if (!list_empty(&head->ref_list) || head->extent_op) { + if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 1; @@ -2740,7 +2740,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, actual_count++; ref->in_tree = 0; - list_del(&ref->list); + rb_erase(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); /* @@ -3138,6 +3139,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; + struct rb_node *node; int ret = 0; cur_trans = root->fs_info->running_transaction; @@ -3170,7 +3172,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root, spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - list_for_each_entry(ref, &head->ref_list, list) { + /* + * XXX: We should replace this with a proper search function in the + * future. 
+ */ + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -7141,7 +7148,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (!list_empty(&head->ref_list)) + if (!RB_EMPTY_ROOT(&head->ref_tree)) goto out; if (head->extent_op) { From ce8ea7cc6eb3139f4c730d647325e69354159b0f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:16:01 -0400 Subject: [PATCH 093/109] btrfs: don't call btrfs_start_delalloc_roots in flushoncommit We're holding the sb_start_intwrite lock at this point, and doing async filemap_flush of the inodes will result in a deadlock if we freeze the fs during this operation. This is because we could do a btrfs_join_transaction() in the thread we are waiting on which would block at sb_start_intwrite, and thus deadlock. Using writeback_inodes_sb() side steps the problem by not introducing all of these extra locking dependencies. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 68c3e1c04bca7e..5a8c2649af2fa8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1916,8 +1916,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { + /* + * We use writeback_inodes_sb here because if we used + * btrfs_start_delalloc_roots we would deadlock with fs freeze. + * Currently are holding the fs freeze lock, if we do an async flush + * we'll do btrfs_join_transaction() and deadlock because we need to + * wait for the fs freeze lock. Using the direct flushing we benefit + * from already being in a transaction and our join_transaction doesn't + * have to re-take the fs freeze lock. + */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - return btrfs_start_delalloc_roots(fs_info, 1, -1); + writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } From ddfae63cc8e03ef48622cb780e6c66367b2d4f1a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:16:02 -0400 Subject: [PATCH 094/109] btrfs: move btrfs_truncate_block out of trans handle Since we do a delalloc reserve in btrfs_truncate_block we can deadlock with freeze. If somebody else is trying to allocate metadata for this inode and it gets stuck in start_delalloc_inodes because of freeze we will deadlock. Be safe and move this outside of a trans handle. This also has a side-effect of making sure that we're not leaving stale data behind in the other_encoding or encryption case. Not an issue now since nobody uses it, but it would be a problem in the future. 
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 120 ++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 75 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b71731ef28c474..b93fe05a39c764 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4360,47 +4360,11 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } -static int truncate_inline_extent(struct inode *inode, - struct btrfs_path *path, - struct btrfs_key *found_key, - const u64 item_end, - const u64 new_size) -{ - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - struct btrfs_file_extent_item *fi; - u32 size = (u32)(new_size - found_key->offset); - struct btrfs_root *root = BTRFS_I(inode)->root; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { - loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_SIZE); - - /* - * Zero out the remaining of the last page of our inline extent, - * instead of directly truncating our inline extent here - that - * would be much more complex (decompressing all the data, then - * compressing the truncated data, which might be bigger than - * the size of the inline extent, resize the extent, etc). - * We release the path because to get the page we might need to - * read the extent item from disk (data not in the page cache). - */ - btrfs_release_path(path); - return btrfs_truncate_block(inode, offset, page_end - offset, - 0); - } - - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root->fs_info, path, size, 1); - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - new_size); - - return 0; -} +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define NEED_TRUNCATE_BLOCK 1 /* * this can truncate away extent items, csum items and directory items. @@ -4561,11 +4525,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; - if (del_item) - last_size = found_key.offset; - else - last_size = new_size; - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -4607,40 +4566,30 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, */ if (!del_item && btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0) { - + btrfs_file_extent_other_encoding(leaf, fi) == 0 && + btrfs_file_extent_compression(leaf, fi) == 0) { + u32 size = (u32)(new_size - found_key.offset); + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root->fs_info, path, size, 1); + } else if (!del_item) { /* - * Need to release path in order to truncate a - * compressed extent. So delete any accumulated - * extent items so far. + * We have to bail so the last_size is set to + * just before this extent. 
*/ - if (btrfs_file_extent_compression(leaf, fi) != - BTRFS_COMPRESS_NONE && pending_del_nr) { - err = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, - err); - goto error; - } - pending_del_nr = 0; - } + err = NEED_TRUNCATE_BLOCK; + break; + } - err = truncate_inline_extent(inode, path, - &found_key, - item_end, - new_size); - if (err) { - btrfs_abort_transaction(trans, err); - goto error; - } - } else if (test_bit(BTRFS_ROOT_REF_COWS, - &root->state)) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } } delete: + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; if (del_item) { if (!pending_del_nr) { /* no pending yet, add ourselves */ @@ -9338,12 +9287,12 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); + trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } - trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; @@ -9367,6 +9316,27 @@ static int btrfs_truncate(struct inode *inode) trans->block_rsv = rsv; } + /* + * We can't call btrfs_truncate_block inside a trans handle as we could + * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know + * we've truncated everything except the last little bit, and can do + * btrfs_truncate_block and then update the disk_i_size. + */ + if (ret == NEED_TRUNCATE_BLOCK) { + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + if (ret) + goto out; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + } + if (ret == 0 && inode->i_nlink > 0) { trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, BTRFS_I(inode)); From 4e439a0b184f014a5833fa468af36cc9f59b8fb1 Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Thu, 28 Sep 2017 17:33:36 +0300 Subject: [PATCH 095/109] Btrfs: compression: separate heuristic/compression workspaces Compression heuristic itself is not a compression type, as current infrastructure provides workspaces for several compression types, it's difficult to just add heuristic workspace. Just refactor the code to support compression/heuristic workspaces with maximum code sharing and minimum changes in it. 
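[ Editorial note: after this refactor the heuristic borrows the same get/put
  pattern as the per-type compression workspaces, just backed by a separate
  global list. A condensed usage sketch, roughly as it ends up looking in
  btrfs_compress_heuristic() further below (the type argument is ignored on
  the heuristic path): ]

	struct list_head *ws_list = __find_workspace(0, true);
	struct heuristic_ws *ws = list_entry(ws_list, struct heuristic_ws, list);

	/* ... examine the data range using the workspace ... */

	__free_workspace(0, ws_list, true);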
Signed-off-by: Timofey Titovets Reviewed-by: David Sterba [ coding style fixes ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 139 +++++++++++++++++++++++++++++++++++------ 1 file changed, 121 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 083f9c48587523..b155542c5e6b4e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -707,7 +707,34 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, return ret; } -static struct { + +struct heuristic_ws { + struct list_head list; +}; + +static void free_heuristic_ws(struct list_head *ws) +{ + struct heuristic_ws *workspace; + + workspace = list_entry(ws, struct heuristic_ws, list); + + kfree(workspace); +} + +static struct list_head *alloc_heuristic_ws(void) +{ + struct heuristic_ws *ws; + + ws = kzalloc(sizeof(*ws), GFP_KERNEL); + if (!ws) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ws->list); + + return &ws->list; +} + +struct workspaces_list { struct list_head idle_ws; spinlock_t ws_lock; /* Number of free workspaces */ @@ -716,7 +743,11 @@ static struct { atomic_t total_ws; /* Waiters for a free workspace */ wait_queue_head_t ws_wait; -} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; +}; + +static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; + +static struct workspaces_list btrfs_heuristic_ws; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -726,11 +757,25 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { void __init btrfs_init_compress(void) { + struct list_head *workspace; int i; - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); + spin_lock_init(&btrfs_heuristic_ws.ws_lock); + atomic_set(&btrfs_heuristic_ws.total_ws, 0); + init_waitqueue_head(&btrfs_heuristic_ws.ws_wait); + workspace = alloc_heuristic_ws(); + if (IS_ERR(workspace)) { + pr_warn( + "BTRFS: cannot preallocate heuristic workspace, will try later\n"); + } else { + atomic_set(&btrfs_heuristic_ws.total_ws, 1); + btrfs_heuristic_ws.free_ws = 1; + list_add(workspace, &btrfs_heuristic_ws.idle_ws); + } + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); atomic_set(&btrfs_comp_ws[i].total_ws, 0); @@ -757,18 +802,32 @@ void __init btrfs_init_compress(void) * Preallocation makes a forward progress guarantees and we do not return * errors. 
*/ -static struct list_head *find_workspace(int type) +static struct list_head *__find_workspace(int type, bool heuristic) { struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; unsigned nofs_flag; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { @@ -798,7 +857,10 @@ static struct list_head *find_workspace(int type) * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (heuristic) + workspace = alloc_heuristic_ws(); + else + workspace = btrfs_compress_op[idx]->alloc_workspace(); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -829,18 +891,38 @@ static struct list_head *find_workspace(int type) return workspace; } +static struct list_head *find_workspace(int type) +{ + return __find_workspace(type, false); +} + /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -static void free_workspace(int type, struct list_head *workspace) +static void __free_workspace(int type, struct list_head *workspace, + bool heuristic) { int idx = type - 1; - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { @@ -851,7 +933,10 @@ static void free_workspace(int type, struct list_head *workspace) } spin_unlock(ws_lock); - btrfs_compress_op[idx]->free_workspace(workspace); + if (heuristic) + free_heuristic_ws(workspace); + else + btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(total_ws); wake: /* @@ -862,6 +947,11 @@ static void free_workspace(int type, struct list_head *workspace) wake_up(ws_wait); } +static void free_workspace(int type, struct list_head *ws) +{ + return __free_workspace(type, ws, false); +} + /* * cleanup function for module exit */ @@ -870,6 +960,13 @@ static void free_workspaces(void) struct list_head *workspace; int i; + while 
(!list_empty(&btrfs_heuristic_ws.idle_ws)) { + workspace = btrfs_heuristic_ws.idle_ws.next; + list_del(workspace); + free_heuristic_ws(workspace); + atomic_dec(&btrfs_heuristic_ws.total_ws); + } + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { workspace = btrfs_comp_ws[i].idle_ws.next; @@ -1090,11 +1187,15 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, */ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) { + struct list_head *ws_list = __find_workspace(0, true); + struct heuristic_ws *ws; u64 index = start >> PAGE_SHIFT; u64 end_index = end >> PAGE_SHIFT; struct page *page; int ret = 1; + ws = list_entry(ws_list, struct heuristic_ws, list); + while (index <= end_index) { page = find_get_page(inode->i_mapping, index); kmap(page); @@ -1103,6 +1204,8 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) index++; } + __free_workspace(0, ws_list, true); + return ret; } From 17b5a6c17e265ca84fac9c3256ff86c691f04aab Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Thu, 28 Sep 2017 17:33:37 +0300 Subject: [PATCH 096/109] Btrfs: heuristic: add bucket and sample counters and other defines Add basic defines and structures for data sampling. Added macros: - For future sampling algo - For bucket size Heuristic workspace: - Add bucket for storing byte type counters - Add sample array for storing partial copy of input data range - Add counter for store current sample size to workspace Signed-off-by: Timofey Titovets Reviewed-by: David Sterba [ minor coding style fixes, comments updated ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 53 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b155542c5e6b4e..4f5d958c71ad8f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -707,8 +707,47 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, return ret; } +/* + * Heuristic uses systematic sampling to collect data from the input data + * range, the logic can be tuned by the following constants: + * + * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample + * @SAMPLING_INTERVAL - range from which the sampled data can be collected + */ +#define SAMPLING_READ_SIZE (16) +#define SAMPLING_INTERVAL (256) + +/* + * For statistical analysis of the input data we consider bytes that form a + * Galois Field of 256 objects. Each object has an attribute count, ie. how + * many times the object appeared in the sample. + */ +#define BUCKET_SIZE (256) + +/* + * The size of the sample is based on a statistical sampling rule of thumb. + * The common way is to perform sampling tests as long as the number of + * elements in each cell is at least 5. + * + * Instead of 5, we choose 32 to obtain more accurate results. + * If the data contain the maximum number of symbols, which is 256, we obtain a + * sample size bound by 8192. + * + * For a sample of at most 8KB of data per data range: 16 consecutive bytes + * from up to 512 locations. 
+ */ +#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ + SAMPLING_READ_SIZE / SAMPLING_INTERVAL) + +struct bucket_item { + u32 count; +}; struct heuristic_ws { + /* Partial copy of input data */ + u8 *sample; + /* Buckets store counters for each byte value */ + struct bucket_item *bucket; struct list_head list; }; @@ -718,6 +757,8 @@ static void free_heuristic_ws(struct list_head *ws) workspace = list_entry(ws, struct heuristic_ws, list); + kvfree(workspace->sample); + kfree(workspace->bucket); kfree(workspace); } @@ -729,9 +770,19 @@ static struct list_head *alloc_heuristic_ws(void) if (!ws) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&ws->list); + ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); + if (!ws->sample) + goto fail; + + ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); + if (!ws->bucket) + goto fail; + INIT_LIST_HEAD(&ws->list); return &ws->list; +fail: + free_heuristic_ws(&ws->list); + return ERR_PTR(-ENOMEM); } struct workspaces_list { From a440d48c7f93af5bae86af676cc6cd4e9fd6015f Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Thu, 28 Sep 2017 17:33:38 +0300 Subject: [PATCH 097/109] Btrfs: heuristic: implement sampling logic Copy sample data from the input data range to sample buffer then calculate byte value count for that sample into bucket. Signed-off-by: Timofey Titovets [ minor comment updates ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 71 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4f5d958c71ad8f..0e1561cc9578b9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -746,6 +746,7 @@ struct bucket_item { struct heuristic_ws { /* Partial copy of input data */ u8 *sample; + u32 sample_size; /* Buckets store counters for each byte value */ struct bucket_item *bucket; struct list_head list; @@ -1221,6 +1222,58 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, return 1; } +static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + struct heuristic_ws *ws) +{ + struct page *page; + u64 index, index_end; + u32 i, curr_sample_pos; + u8 *in_data; + + /* + * Compression handles the input data by chunks of 128KiB + * (defined by BTRFS_MAX_UNCOMPRESSED) + * + * We do the same for the heuristic and loop over the whole range. + * + * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will + * process no more than BTRFS_MAX_UNCOMPRESSED at a time. + */ + if (end - start > BTRFS_MAX_UNCOMPRESSED) + end = start + BTRFS_MAX_UNCOMPRESSED; + + index = start >> PAGE_SHIFT; + index_end = end >> PAGE_SHIFT; + + /* Don't miss unaligned end */ + if (!IS_ALIGNED(end, PAGE_SIZE)) + index_end++; + + curr_sample_pos = 0; + while (index < index_end) { + page = find_get_page(inode->i_mapping, index); + in_data = kmap(page); + /* Handle case where the start is not aligned to PAGE_SIZE */ + i = start % PAGE_SIZE; + while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { + /* Don't sample any garbage from the last page */ + if (start > end - SAMPLING_READ_SIZE) + break; + memcpy(&ws->sample[curr_sample_pos], &in_data[i], + SAMPLING_READ_SIZE); + i += SAMPLING_INTERVAL; + start += SAMPLING_INTERVAL; + curr_sample_pos += SAMPLING_READ_SIZE; + } + kunmap(page); + put_page(page); + + index++; + } + + ws->sample_size = curr_sample_pos; +} + /* * Compression heuristic. 
* @@ -1240,19 +1293,19 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) { struct list_head *ws_list = __find_workspace(0, true); struct heuristic_ws *ws; - u64 index = start >> PAGE_SHIFT; - u64 end_index = end >> PAGE_SHIFT; - struct page *page; + u32 i; + u8 byte; int ret = 1; ws = list_entry(ws_list, struct heuristic_ws, list); - while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - kmap(page); - kunmap(page); - put_page(page); - index++; + heuristic_collect_sample(inode, start, end, ws); + + memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); + + for (i = 0; i < ws->sample_size; i++) { + byte = ws->sample[i]; + ws->bucket[byte].count++; } __free_workspace(0, ws_list, true); From 1fe4f6fa5ae7dd1e63145e1ced7b9b38854da9f4 Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Thu, 28 Sep 2017 17:33:39 +0300 Subject: [PATCH 098/109] Btrfs: heuristic: add detection of repeated data patterns Walk over data sample and use memcmp to detect repeated patterns, like zeros, but a bit more general. Signed-off-by: Timofey Titovets Reviewed-by: David Sterba [ minor coding style fixes ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0e1561cc9578b9..0d445c815ca270 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1222,6 +1222,14 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, return 1; } +static bool sample_repeated_patterns(struct heuristic_ws *ws) +{ + const u32 half_of_sample = ws->sample_size / 2; + const u8 *data = ws->sample; + + return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; +} + static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, struct heuristic_ws *ws) { @@ -1301,6 +1309,11 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) heuristic_collect_sample(inode, start, end, ws); + if (sample_repeated_patterns(ws)) { + ret = 1; + goto out; + } + memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); for (i = 0; i < ws->sample_size; i++) { @@ -1308,8 +1321,8 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) ws->bucket[byte].count++; } +out: __free_workspace(0, ws_list, true); - return ret; } From a288e92cacdc4729ad8f83d018fb0f3f5ded6c37 Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Thu, 28 Sep 2017 17:33:40 +0300 Subject: [PATCH 099/109] Btrfs: heuristic: add byte set calculation Calculate byte set size for data sample: - calculate how many unique bytes occur in the sample - for all bytes with count > 0, check if we're still in the low count range (~25%); such data are easily compressible, otherwise further analysis is needed Signed-off-by: Timofey Titovets Reviewed-by: David Sterba [ update comments ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0d445c815ca270..e949f078a81bc8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1222,6 +1222,45 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, return 1; } +/* + * Count byte values in buckets. + * This heuristic can detect textual data (configs, xml, json, html, etc). 
+ * Because in most text-like data byte set is restricted to limited number of + * possible characters, and that restriction in most cases makes data easy to + * compress. + * + * @BYTE_SET_THRESHOLD - consider all data within this byte set size: + * less - compressible + * more - need additional analysis + */ +#define BYTE_SET_THRESHOLD (64) + +static u32 byte_set_size(const struct heuristic_ws *ws) +{ + u32 i; + u32 byte_set_size = 0; + + for (i = 0; i < BYTE_SET_THRESHOLD; i++) { + if (ws->bucket[i].count > 0) + byte_set_size++; + } + + /* + * Continue collecting count of byte values in buckets. If the byte + * set size is bigger then the threshold, it's pointless to continue, + * the detection technique would fail for this type of data. + */ + for (; i < BUCKET_SIZE; i++) { + if (ws->bucket[i].count > 0) { + byte_set_size++; + if (byte_set_size > BYTE_SET_THRESHOLD) + return byte_set_size; + } + } + + return byte_set_size; +} + static bool sample_repeated_patterns(struct heuristic_ws *ws) { const u32 half_of_sample = ws->sample_size / 2; @@ -1321,6 +1360,12 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) ws->bucket[byte].count++; } + i = byte_set_size(ws); + if (i < BYTE_SET_THRESHOLD) { + ret = 2; + goto out; + } + out: __free_workspace(0, ws_list, true); return ret; From 858177d38d4681dad6efc015b99e4c786a34aca5 Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Thu, 28 Sep 2017 17:33:41 +0300 Subject: [PATCH 100/109] Btrfs: heuristic: add byte core set calculation Calculate byte core set for data sample: - sort buckets' numbers in decreasing order - count how many values cover 90% of the sample If the core set size is low (<=25%), data are easily compressible. If the core set size is high (>=80%), data are not compressible. Signed-off-by: Timofey Titovets Reviewed-by: David Sterba [ update comments ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 65 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e949f078a81bc8..c551d8a979f4ba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1222,6 +1223,59 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, return 1; } +/* Compare buckets by size, ascending */ +static int bucket_comp_rev(const void *lv, const void *rv) +{ + const struct bucket_item *l = (const struct bucket_item *)lv; + const struct bucket_item *r = (const struct bucket_item *)rv; + + return r->count - l->count; +} + +/* + * Size of the core byte set - how many bytes cover 90% of the sample + * + * There are several types of structured binary data that use nearly all byte + * values. The distribution can be uniform and counts in all buckets will be + * nearly the same (eg. encrypted data). Unlikely to be compressible. + * + * Other possibility is normal (Gaussian) distribution, where the data could + * be potentially compressible, but we have to take a few more steps to decide + * how much. 
+ * + * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, + * compression algo can easy fix that + * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high + * probability is not compressible + */ +#define BYTE_CORE_SET_LOW (64) +#define BYTE_CORE_SET_HIGH (200) + +static int byte_core_set_size(struct heuristic_ws *ws) +{ + u32 i; + u32 coreset_sum = 0; + const u32 core_set_threshold = ws->sample_size * 90 / 100; + struct bucket_item *bucket = ws->bucket; + + /* Sort in reverse order */ + sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL); + + for (i = 0; i < BYTE_CORE_SET_LOW; i++) + coreset_sum += bucket[i].count; + + if (coreset_sum > core_set_threshold) + return i; + + for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { + coreset_sum += bucket[i].count; + if (coreset_sum > core_set_threshold) + break; + } + + return i; +} + /* * Count byte values in buckets. * This heuristic can detect textual data (configs, xml, json, html, etc). @@ -1366,6 +1420,17 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) goto out; } + i = byte_core_set_size(ws); + if (i <= BYTE_CORE_SET_LOW) { + ret = 3; + goto out; + } + + if (i >= BYTE_CORE_SET_HIGH) { + ret = 0; + goto out; + } + out: __free_workspace(0, ws_list, true); return ret; From 19562430c6213925faba3baa9e8cb224ddd47ee6 Mon Sep 17 00:00:00 2001 From: Timofey Titovets Date: Sun, 8 Oct 2017 16:11:59 +0300 Subject: [PATCH 101/109] Btrfs: heuristic: add Shannon entropy calculation Byte distribution check in heuristic will filter edge data cases and some time fail to classify input data. Let's fix that by adding Shannon entropy calculation, that will cover classification of most other data types. As Shannon entropy needs log2 with some precision to work, let's use ilog2(N) and for increased precision, by do ilog2(pow(N, 4)). Shannon entropy has been slightly changed to avoid signed numbers and division. The calculation is direct by the formula, successor of precalculated table or chains of if-else. The accuracy errors of ilog2 are compensated by @ENTROPY_LVL_ACEPTABLE 70 -> 65 @ENTROPY_LVL_HIGH 85 -> 80 Signed-off-by: Timofey Titovets Reviewed-by: David Sterba [ update comments ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 85 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c551d8a979f4ba..b35ce16b3df3c6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1223,6 +1224,59 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, return 1; } +/* + * Shannon Entropy calculation + * + * Pure byte distribution analysis fails to determine compressiability of data. + * Try calculating entropy to estimate the average minimum number of bits + * needed to encode the sampled data. + * + * For convenience, return the percentage of needed bits, instead of amount of + * bits directly. + * + * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy + * and can be compressible with high probability + * + * @ENTROPY_LVL_HIGH - data are not compressible with high probability + * + * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. 
+ */ +#define ENTROPY_LVL_ACEPTABLE (65) +#define ENTROPY_LVL_HIGH (80) + +/* + * For increasead precision in shannon_entropy calculation, + * let's do pow(n, M) to save more digits after comma: + * + * - maximum int bit length is 64 + * - ilog2(MAX_SAMPLE_SIZE) -> 13 + * - 13 * 4 = 52 < 64 -> M = 4 + * + * So use pow(n, 4). + */ +static inline u32 ilog2_w(u64 n) +{ + return ilog2(n * n * n * n); +} + +static u32 shannon_entropy(struct heuristic_ws *ws) +{ + const u32 entropy_max = 8 * ilog2_w(2); + u32 entropy_sum = 0; + u32 p, p_base, sz_base; + u32 i; + + sz_base = ilog2_w(ws->sample_size); + for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { + p = ws->bucket[i].count; + p_base = ilog2_w(p); + entropy_sum += p * (sz_base - p_base); + } + + entropy_sum /= ws->sample_size; + return entropy_sum * 100 / entropy_max; +} + /* Compare buckets by size, ascending */ static int bucket_comp_rev(const void *lv, const void *rv) { @@ -1396,7 +1450,7 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) struct heuristic_ws *ws; u32 i; u8 byte; - int ret = 1; + int ret = 0; ws = list_entry(ws_list, struct heuristic_ws, list); @@ -1431,6 +1485,35 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) goto out; } + i = shannon_entropy(ws); + if (i <= ENTROPY_LVL_ACEPTABLE) { + ret = 4; + goto out; + } + + /* + * For the levels below ENTROPY_LVL_HIGH, additional analysis would be + * needed to give green light to compression. + * + * For now just assume that compression at that level is not worth the + * resources because: + * + * 1. it is possible to defrag the data later + * + * 2. the data would turn out to be hardly compressible, eg. 150 byte + * values, every bucket has counter at level ~54. The heuristic would + * be confused. This can happen when data have some internal repeated + * patterns like "abbacbbc...". This can be detected by analyzing + * pairs of bytes, which is too costly. + */ + if (i < ENTROPY_LVL_HIGH) { + ret = 5; + goto out; + } else { + ret = 0; + goto out; + } + out: __free_workspace(0, ws_list, true); return ret; From d28e649a5c58b779b303c252c66ee84a0f2c3b32 Mon Sep 17 00:00:00 2001 From: Gu JinXiang Date: Thu, 19 Oct 2017 09:49:27 +0800 Subject: [PATCH 102/109] btrfs: Fix bug for misused dev_t when lookup in dev state hash table. Fix bug of commit 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index"). bio_dev(bio) is used to find the dev state in function __btrfsic_submit_bio. But when dev_state is added to the hashtable, it is using dev_t of block_device. bio_dev(bio) returns a dev_t of part0 which is different from dev_t in block_device(bd_dev). bd_dev in block_device represents the exact partition. block_device.bd_dev = bio->bi_partno (same as block_device.bd_partno) + bio_dev(bio). When adding a dev_state into hashtable, we use the exact partition dev_t. So when looking it up, it should also use the exact partition dev_t. Reproducer of this bug: Use MOUNT_OPTIONS="-o check_int" and run btrfs/001 in fstests. Then there will be WARNING like below. WARNING: btrfs: attempt to write superblock which references block M @29523968 (sda7 /1111654400/2) which is never written! 
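For illustration only (not part of the patch): the dev_t arithmetic described above, written out as a small sketch. The helper name is made up; only dev_t, u8 and the MAJOR/MINOR/MKDEV macros from linux/kdev_t.h are real, and the sketch assumes consecutive partition minors, which is exactly what the fix relies on.

#include <linux/kdev_t.h>
#include <linux/types.h>

/*
 * bio_dev(bio) maps to the whole disk (part0), while the dev_state hash
 * was keyed by the partition's dev_t (block_device.bd_dev).  Partitions
 * use consecutive minors after the whole-disk minor, so adding bi_partno
 * to the minor - which is what bio_dev(bio) + bio->bi_partno effectively
 * does - yields the partition dev_t used as the hash key.
 */
static dev_t btrfsic_partition_devt(dev_t whole_disk_devt, u8 partno)
{
	return MKDEV(MAJOR(whole_disk_devt), MINOR(whole_disk_devt) + partno);
}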
Signed-off-by: Gu JinXiang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 1fee5fe93484e6..7d51b5a5b5053e 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; From f82b735936ffd58b6711cf1f1054616517d8ffcd Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 23 Oct 2017 23:18:16 -0600 Subject: [PATCH 103/109] Btrfs: add write_flags for compression bio The compression code path has only flagged bios with REQ_OP_WRITE no matter where the bios come from, but it could be a sync write if fsync starts this writeback or a normal writeback write if the wb kthread starts a periodic writeback. It breaks the rule that sync writes and writeback writes need to be differentiated from each other, because from the POV of the block layer, all bios need to be recognized by these flags in order to do some management, e.g. throttling. This passes writeback_control to the compression write path so that it can send bios with proper flags to the block layer. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 7 ++++--- fs/btrfs/compression.h | 3 ++- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 3 ++- fs/btrfs/inode.c | 15 +++++++++++---- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b35ce16b3df3c6..4a78e572633738 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -295,7 +295,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages) + unsigned long nr_pages, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; @@ -327,7 +328,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; refcount_set(&cb->pending_bios, 1); @@ -374,7 +375,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_put(bio); bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index da20755ebf2183..93c5b82ae97ec1 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -91,7 +91,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages); + unsigned long nr_pages, + unsigned int write_flags); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio 
*bio, int mirror_num, unsigned long bio_flags); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dd941885b9c30b..1dfd14678db86d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3252,7 +3252,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, delalloc_start, delalloc_end, &page_started, - nr_written); + nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 861dacb371c7f9..d8b27af7101e6f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -115,7 +115,8 @@ struct extent_io_ops { */ int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written); + unsigned long *nr_written, + struct writeback_control *wbc); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b93fe05a39c764..8525a44a607ea9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -378,6 +378,7 @@ struct async_cow { struct page *locked_page; u64 start; u64 end; + unsigned int write_flags; struct list_head extents; struct btrfs_work work; }; @@ -857,7 +858,8 @@ static noinline void submit_compressed_extents(struct inode *inode, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages)) { + async_extent->nr_pages, + async_cow->write_flags)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1191,7 +1193,8 @@ static noinline void async_cow_free(struct btrfs_work *work) static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; @@ -1208,6 +1211,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; + async_cow->write_flags = write_flags; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -1577,11 +1581,13 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) */ static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); + unsigned int write_flags = wbc_to_write_flags(wbc); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1596,7 +1602,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written); + page_started, nr_written, + write_flags); } if (ret) btrfs_cleanup_ordered_extents(inode, start, end - start + 1); From 5e9f2ad5b2904a7e81df6d9a3dbef29478952eac Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 23 Oct 2017 09:58:46 +0300 Subject: [PATCH 104/109] btrfs: Fix transaction abort during failure in btrfs_rm_dev_item 
btrfs_rm_dev_item calls several functions under an active transaction, however it fails to abort it if an error happens. Fix this by adding explicit btrfs_abort_transaction/btrfs_end_transaction calls. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1ecb938ba4d71..84d37b4ab230f7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1750,20 +1750,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; + if (ret) { + if (ret > 0) + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + out: btrfs_free_path(path); - btrfs_commit_transaction(trans); + if (!ret) + ret = btrfs_commit_transaction(trans); return ret; } From 3065ae5b85654e9141b134bc7b07eb0f2ca7cfcf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 30 Oct 2017 18:36:08 +0100 Subject: [PATCH 105/109] btrfs: add missing device::flush_bio puts This fixes potential bio leaks in several error paths. Unfortunately the device structure freeing is opencoded in many places and I missed them when introducing the flush_bio. Most of the time, devices get freed through call_rcu(..., free_device), so at least it's not that easy to hit the leak, but it's still possible through the path that frees stale devices. Fixes: e0ae99941423 ("btrfs: preallocate device flush bio") Reviewed-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 84d37b4ab230f7..b5c67dd5e52234 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -189,6 +189,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } kfree(fs_devices); @@ -578,6 +579,7 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev) fs_devs->num_devices--; list_del(&dev->dev_list); rcu_string_free(dev->name); + bio_put(dev->flush_bio); kfree(dev); } break; @@ -630,6 +632,7 @@ static noinline int device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { + bio_put(device->flush_bio); kfree(device); return -ENOMEM; } @@ -742,6 +745,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); goto error; } @@ -807,6 +811,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) list_del_init(&device->dev_list); fs_devices->num_devices--; rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -2353,6 +2358,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -2362,6 +2368,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path 
trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); ret = PTR_ERR(trans); goto error; @@ -2505,6 +2512,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (trans) btrfs_end_transaction(trans); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2571,6 +2579,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -6288,6 +6297,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { + bio_put(dev->flush_bio); kfree(dev); return ERR_PTR(ret); } From 619c47f3d4cd7a60576fd15e133a2eee4fcc0c4e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 19 Jun 2017 14:14:22 +0200 Subject: [PATCH 106/109] btrfs: dev_alloc_list is not protected by RCU, use normal list_del The dev_alloc_list list could be protected by various mutexes, depending on the context. The list tracks devices that can take part of allocating new chunks, so the closest mutex is chunk_mutex. Adding a new device from inside the ADD_DEV ioctl will need device_list_mutex and registering a new device from the ioctl needs uuid_mutex. All mutexes naturally guarantee exclusivity against the same context. The device ownership can move between the contexts and the exclusivity is guaranteed by other means, eg. during the mount with the uuid_mutex. There's no RCU involved for dev_alloc_list. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b5c67dd5e52234..d48b24e54366f3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2002,7 +2002,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); - list_del_rcu(&srcdev->dev_alloc_list); + list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; if (srcdev->missing) fs_devices->missing_devices--; From 56a0e706fcf870270878d6d72b71092ae42d229c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 30 Oct 2017 11:14:38 -0600 Subject: [PATCH 107/109] Btrfs: bail out gracefully rather than BUG_ON If a file's DIR_ITEM key is invalid (due to memory errors) and gets written to disk, a future lookup_path can end up with kernel panic due to BUG_ON(). This gets rid of the BUG_ON(), meanwhile output the corrupted key and return ENOENT if it's invalid. 
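As a standalone sketch of the check this introduces (simplified, not the patch itself; the helper is hypothetical, assumes the btrfs key definitions from ctree.h, and omits the warning message):

/*
 * A DIR_ITEM's location key must name either an inode item or a root
 * item (a subvolume); any other type means the on-disk item is corrupted
 * and the lookup should fail with -ENOENT instead of BUG_ON().
 */
static int check_dir_item_location(const struct btrfs_key *location)
{
	if (location->type == BTRFS_INODE_ITEM_KEY ||
	    location->type == BTRFS_ROOT_ITEM_KEY)
		return 0;
	return -ENOENT;
}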
Signed-off-by: Liu Bo Reported-by: Guillaume Bouchard Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8525a44a607ea9..1131db7a0b2871 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5445,6 +5445,14 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, goto out_err; btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY) { + btrfs_warn(root->fs_info, +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", + __func__, name, btrfs_ino(BTRFS_I(dir)), + location->objectid, location->type, location->offset); + goto out_err; + } out: btrfs_free_path(path); return ret; @@ -5761,8 +5769,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); From f48bf66b662e7acd6a32dbc28c4fa38931f8f0a6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 3 Nov 2017 22:26:44 +0000 Subject: [PATCH 108/109] Btrfs: move definition of the function btrfs_find_new_delalloc_bytes Move the definition of the function btrfs_find_new_delalloc_bytes() closer to the function btrfs_dirty_pages(), because in a future commit it will be used exclusively by btrfs_dirty_pages(). This just moves the function's definition, with no functional changes at all. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 82 ++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f80254d82f409b..47e6ebad78c36c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,6 +477,47 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -1404,47 +1445,6 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, } -static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, - const u64 start, - const u64 len, - struct extent_state **cached_state) -{ - u64 search_start = start; - const u64 end = start + len - 1; - - while (search_start < end) { - const u64 search_len = end - search_start + 1; - struct extent_map *em; - u64 em_len; - int ret = 0; - - em = 
btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); - if (IS_ERR(em)) - return PTR_ERR(em); - - if (em->block_start != EXTENT_MAP_HOLE) - goto next; - - em_len = em->len; - if (em->start < search_start) - em_len -= search_start - em->start; - if (em_len > search_len) - em_len = search_len; - - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); -next: - search_start = extent_map_end(em); - free_extent_map(em); - if (ret) - return ret; - } - return 0; -} - /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. From e3b8a4858566a6cc25422fbfdfdd760b13b79280 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 4 Nov 2017 00:16:59 +0000 Subject: [PATCH 109/109] Btrfs: fix reported number of inode blocks after buffered append writes The patch from commit a7e3b975a0f9 ("Btrfs: fix reported number of inode blocks") introduced a regression where if we do a buffered write starting at position equal to or greater than the file's size and then stat(2) the file before writeback is triggered, the number of used blocks does not change (unless there's a prealloc/unwritten extent). Example: $ xfs_io -f -c "pwrite -S 0xab 0 64K" foobar $ du -h foobar 0 foobar $ sync $ du -h foobar 64K foobar The first version of that patch didn't have this regression and the second version, which was the one committed, was made only to address some performance regression detected by the Intel test robots using fs_mark. This fixes the regression by setting the new delalloc bit in the range, and doing it at btrfs_dirty_pages() while setting the regular delalloc bit as well, so that we set both bits at once and avoid walking the inode's io tree twice. btrfs_dirty_pages() is also the most meaningful place to do it, as we should set the new delalloc bit whenever we set the regular delalloc bit, which happens only if we copied bytes into the pages at __btrfs_buffered_write(). 
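A userspace check of the same behaviour, as a sketch (the path and write size are arbitrary and assume a file on a btrfs mount; with the fix, st_blocks should already report the dirtied data before any sync):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	char buf[65536];
	struct stat st;
	int fd = open("/mnt/foobar", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 0xab, sizeof(buf));
	if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
		return 1;
	if (fstat(fd, &st))
		return 1;
	/* with the fix this should be 128 (64KiB / 512) before writeback,
	 * instead of 0 as in the regression */
	printf("st_blocks=%lld\n", (long long)st.st_blocks);
	close(fd);
	return 0;
}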
This was making some of LTP's du tests fail, which can be quickly run using a command line like the following: $ ./runltp -q -p -l /ltp.log -f commands -s du -d /mnt Fixes: a7e3b975a0f9 ("Btrfs: fix reported number of inode blocks") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent_io.h | 5 ++-- fs/btrfs/file.c | 43 +++++++++++++++++++------------- fs/btrfs/inode.c | 9 ++++--- fs/btrfs/relocation.c | 3 ++- fs/btrfs/tests/extent-io-tests.c | 6 ++--- fs/btrfs/tests/inode-tests.c | 12 ++++----- 7 files changed, 46 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f7df5536ab61e1..72dcbf19f6cef1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3180,6 +3180,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index d8b27af7101e6f..2ef824b58e3c73 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -365,10 +365,11 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) + u64 end, unsigned int extra_bits, + struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, NULL, cached_state, GFP_NOFS); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 47e6ebad78c36c..2de7b4f4ca3c7c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -538,14 +538,34 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, u64 end_of_last_block; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(inode); + unsigned int extra_bits = 0; start_pos = pos & ~((u64) fs_info->sectorsize - 1); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); end_of_last_block = start_pos + num_bytes - 1; + + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (start_pos >= isize && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case + * so just set the delalloc new bit for the range + * directly. 
+ */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode), + start_pos, + num_bytes, cached); + if (err) + return err; + } + } + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - cached, 0); + extra_bits, cached, 0); if (err) return err; @@ -1473,10 +1493,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size || - (inode->flags & BTRFS_INODE_PREALLOC)) { + if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - unsigned int clear_bits; lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); @@ -1498,19 +1516,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - ret = btrfs_find_new_delalloc_bytes(inode, start_pos, - last_pos - start_pos + 1, - cached_state); - clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; - if (ret) - clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; - clear_extent_bit(&inode->io_tree, start_pos, - last_pos, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, cached_state, GFP_NOFS); - if (ret) - return ret; + clear_extent_bit(&inode->io_tree, start_pos, last_pos, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached_state, GFP_NOFS); *lockstart = start_pos; *lockend = last_pos; ret = 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1131db7a0b2871..993061f83067a9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2032,11 +2032,12 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, } int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state); + extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2097,7 +2098,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) goto out; } - btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state, + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state, 0); ClearPageChecked(page); set_page_dirty(page); @@ -4797,7 +4798,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -9163,7 +9164,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, end, + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4cf2eb67eba6ce..f0c3f00e97cbe7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3268,7 +3268,8 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0); + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, + 0); 
set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d06b1c931d05b8..2e7f64a3b22b7d 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -114,7 +114,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -145,7 +145,7 @@ static int test_find_delalloc(u32 sectorsize) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -200,7 +200,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index f797642c013dad..30affb60da5148 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -984,7 +984,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1018,7 +1018,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1036,7 +1036,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1053,7 +1053,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1089,7 +1089,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out;