From 4fb5d637e3d570b89ee8cbbebc3cc5a8210e4e32 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Wed, 20 Jun 2018 13:39:23 -0700
Subject: [PATCH] Add support for autoexpand property

While the autoexpand property may seem like a small feature, it depends
on a significant amount of system infrastructure. Enough of that
infrastructure is now in place that, with a few customizations for
Linux, the autoexpand property for whole disk configurations can be
supported.

Autoexpand works as follows: when a block device is resized, a change
event is generated by udev with the DISK_MEDIA_CHANGE key. The ZED,
which is monitoring udev events, detects the event for disks (but not
partitions) and hands it off to zfs_deliver_dle(). The
zfs_deliver_dle() function appends the expected whole disk partition
suffix, and if the partition can be matched against a known pool vdev
it re-opens it.

Re-opening the vdev will trigger a re-reading of the partition table so
the maximum possible expansion size can be reported. Next, if the
autoexpand property is set to "on", a vdev expansion will be attempted.
After performing some sanity checks on the disk to verify it's safe to
expand the ZFS partition (-part1), it will be expanded and the
partition table updated. The partition is then re-opened to detect the
updated size, which allows the new capacity to be used.

Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for ZFS
volumes. This was required for the test cases which test expansion by
layering a new pool on top of ZFS volumes.

Enabled the zpool_expand_001_pos and zpool_expand_003_neg test cases
which exercise the autoexpand property.

Fixed zfs_zevent_wait() signal handling which could result in the ZED
spinning when a signal was not handled.

Removed the vdev_disk_rrpart() functionality, which can be abandoned in
favour of re-opening the device. This triggers a re-read of the
partition table as long as no other partitions are in use, which will
always be the case when working with whole disks. As a bonus, this
allows us to remove two Linux kernel API checks.

Signed-off-by: Brian Behlendorf
Issue #120
Issue #2437
Issue #5771
Issue #7582
---
 cmd/zed/agents/zfs_mod.c                      |  15 +-
 cmd/zed/zed_disk_event.c                      |  32 ++-
 config/kernel-blkdev-get.m4                   |  19 --
 config/kernel-get-gendisk.m4                  |  17 --
 config/kernel.m4                              |   2 -
 lib/libzfs/libzfs_import.c                    |  61 ++++--
 module/zfs/fm.c                               |  30 ++-
 module/zfs/vdev.c                             |   3 +-
 module/zfs/vdev_disk.c                        | 182 +++++++-----------
 module/zfs/zvol.c                             |  19 +-
 tests/test-runner/bin/zts-report.py           |  12 +-
 .../zpool_expand/zpool_expand_001_pos.ksh     |  23 ++-
 .../zpool_expand/zpool_expand_003_neg.ksh     |   9 +-
 13 files changed, 217 insertions(+), 207 deletions(-)
 delete mode 100644 config/kernel-blkdev-get.m4
 delete mode 100644 config/kernel-get-gendisk.m4

diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
index 600d6527c0db..b757e2360adf 100644
--- a/cmd/zed/agents/zfs_mod.c
+++ b/cmd/zed/agents/zfs_mod.c
@@ -751,23 +751,30 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * This function handles the ESC_DEV_DLE event.
+ * This function handles the ESC_DEV_DLE (DISK_MEDIA_CHANGE) event which
+ * is only delivered for the disk itself, not for each partition. Presume
+ * that a 'wholedisk' partition exists and append the expected partition
+ * suffix in order to attempt a match.
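+ * For example, a whole disk known by a by-id style path would be
+ * matched against its expected data partition, <path>-part1 (an
+ * illustrative name; the appended suffix depends on the path style).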
  */
 static int
 zfs_deliver_dle(nvlist_t *nvl)
 {
-	char *devname;
+	char *devname, pname[MAXPATHLEN];
 
 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
 		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no physpath");
 		return (-1);
 	}
 
-	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
+	strlcpy(pname, devname, MAXPATHLEN);
+	zfs_append_partition(pname, MAXPATHLEN);
+
+	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, pname) != 1) {
 		zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
-		    "found", devname);
+		    "found", pname);
 		return (1);
 	}
+
 	return (0);
 }
diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c
index 996b911c537c..758955bdc460 100644
--- a/cmd/zed/zed_disk_event.c
+++ b/cmd/zed/zed_disk_event.c
@@ -165,11 +165,12 @@ zed_udev_monitor(void *arg)
 	while (1) {
 		struct udev_device *dev;
-		const char *action, *type, *part, *sectors;
+		const char *action, *type, *part, *sectors, *change;
 		const char *bus, *uuid;
 		const char *class, *subclass;
 		nvlist_t *nvl;
 		boolean_t is_zfs = B_FALSE;
+		boolean_t is_disk_media_change = B_FALSE;
 
 		/* allow a cancellation while blocked (recvmsg) */
 		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
@@ -202,14 +203,26 @@ zed_udev_monitor(void *arg)
 		}
 
 		/*
-		 * if this is a disk and it is partitioned, then the
+		 * Disk media change events are allowed for auto-expand.
+		 * Whether the device contains a zfs_member is determined
+		 * at the time of the attempted expansion.
+		 */
+		change = udev_device_get_property_value(dev,
+		    "DISK_MEDIA_CHANGE");
+		if (change != NULL && change[0] == '1')
+			is_disk_media_change = B_TRUE;
+
+		/*
+		 * If this is a disk and it is partitioned, then the
 		 * zfs label will reside in a DEVTYPE=partition and
-		 * we can skip passing this event
+		 * we can skip passing this event, unless it's a disk
+		 * media change event, which is expected for auto-expand.
 		 */
 		type = udev_device_get_property_value(dev, "DEVTYPE");
 		part = udev_device_get_property_value(dev,
 		    "ID_PART_TABLE_TYPE");
-		if (type != NULL && type[0] != '\0' &&
+		if (!is_disk_media_change &&
+		    type != NULL && type[0] != '\0' &&
 		    strcmp(type, "disk") == 0 &&
 		    part != NULL && part[0] != '\0') {
 			/* skip and wait for partition event */
@@ -231,14 +244,15 @@ zed_udev_monitor(void *arg)
 		}
 
 		/*
-		 * If the blkid probe didn't find ZFS, then a persistent
-		 * device id string is required in the message schema
-		 * for matching with vdevs. Preflight here for expected
-		 * udev information.
+		 * If the blkid probe didn't find ZFS and this is not a
+		 * disk media change event, then a persistent device id
+		 * string is required in the message schema for matching
+		 * with vdevs. Preflight here for expected udev information.
 		 */
 		bus = udev_device_get_property_value(dev, "ID_BUS");
 		uuid = udev_device_get_property_value(dev, "DM_UUID");
-		if (!is_zfs && (bus == NULL && uuid == NULL)) {
+		if (!is_zfs && !is_disk_media_change &&
+		    bus == NULL && uuid == NULL) {
 			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
 			    "source", udev_device_get_devnode(dev));
 			udev_device_unref(dev);
diff --git a/config/kernel-blkdev-get.m4 b/config/kernel-blkdev-get.m4
deleted file mode 100644
index e31d71770511..000000000000
--- a/config/kernel-blkdev-get.m4
+++ /dev/null
@@ -1,19 +0,0 @@
-dnl #
-dnl # 2.6.37 API change
-dnl # Added 3rd argument for the active holder, previously this was
-dnl # hardcoded to NULL.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_3ARG_BLKDEV_GET], [
-	AC_MSG_CHECKING([whether blkdev_get() wants 3 args])
-	ZFS_LINUX_TRY_COMPILE([
-		#include <linux/fs.h>
-	],[
-		struct block_device *bdev = NULL;
-		(void) blkdev_get(bdev, 0, NULL);
-	],[
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_3ARG_BLKDEV_GET, 1, [blkdev_get() wants 3 args])
-	],[
-		AC_MSG_RESULT(no)
-	])
-])
diff --git a/config/kernel-get-gendisk.m4 b/config/kernel-get-gendisk.m4
deleted file mode 100644
index b0913770e43d..000000000000
--- a/config/kernel-get-gendisk.m4
+++ /dev/null
@@ -1,17 +0,0 @@
-dnl #
-dnl # 2.6.34 API change
-dnl # Verify the get_gendisk() symbol is available.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_GET_GENDISK],
-	[AC_MSG_CHECKING([whether get_gendisk() is available])
-	ZFS_LINUX_TRY_COMPILE_SYMBOL([
-		#include <linux/genhd.h>
-	], [
-		get_gendisk(0, NULL);
-	], [get_gendisk], [block/genhd.c], [
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_GET_GENDISK, 1, [get_gendisk() is available])
-	], [
-		AC_MSG_RESULT(no)
-	])
-])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 8c2998204cde..32abb81da39b 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -44,7 +44,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 	ZFS_AC_KERNEL_TYPE_FMODE_T
-	ZFS_AC_KERNEL_3ARG_BLKDEV_GET
 	ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH
 	ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE
 	ZFS_AC_KERNEL_LOOKUP_BDEV
@@ -73,7 +72,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG
 	ZFS_AC_KERNEL_GET_DISK_AND_MODULE
 	ZFS_AC_KERNEL_GET_DISK_RO
-	ZFS_AC_KERNEL_GET_GENDISK
 	ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS
 	ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL
 	ZFS_AC_KERNEL_DISCARD_GRANULARITY
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 7d2f0e903cce..e83d8ed888e0 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -145,6 +145,21 @@ zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
 		return (0);
 	}
 
+	/*
+	 * For volumes use the persistent /dev/zvol/dataset identifier
+	 */
+	entry = udev_device_get_devlinks_list_entry(dev);
+	while (entry != NULL) {
+		const char *name;
+
+		name = udev_list_entry_get_name(entry);
+		if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+			(void) strlcpy(bufptr, name, buflen);
+			return (0);
+		}
+		entry = udev_list_entry_get_next(entry);
+	}
+
 	/*
 	 * NVME 'by-id' symlinks are similar to bus case
 	 */
@@ -187,26 +202,44 @@ int
 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 {
 	const char *physpath = NULL;
+	struct udev_list_entry *entry;
 
 	/*
-	 * Normal disks use ID_PATH for their physical path. Device mapper
-	 * devices are virtual and don't have a physical path. For them we
-	 * use ID_VDEV instead, which is setup via the /etc/vdev_id.conf file.
-	 * ID_VDEV provides a persistent path to a virtual device. If you
-	 * don't have vdev_id.conf setup, you cannot use multipath autoreplace.
+	 * Normal disks use ID_PATH for their physical path.
 	 */
-	if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
-	    physpath[0])) {
-		if (!((physpath =
-		    udev_device_get_property_value(dev, "ID_VDEV")) &&
-		    physpath[0])) {
-			return (ENODATA);
-		}
+	physpath = udev_device_get_property_value(dev, "ID_PATH");
+	if (physpath != NULL && strlen(physpath) > 0) {
+		(void) strlcpy(bufptr, physpath, buflen);
+		return (0);
+	}
+
+	/*
+	 * Device mapper devices are virtual and don't have a physical
+	 * path. For them we use ID_VDEV instead, which is set up via the
+	 * /etc/vdev_id.conf file. ID_VDEV provides a persistent path
+	 * to a virtual device. If you don't have vdev_id.conf set up,
+	 * you cannot use multipath autoreplace with device mapper.
+	 */
+	physpath = udev_device_get_property_value(dev, "ID_VDEV");
+	if (physpath != NULL && strlen(physpath) > 0) {
+		(void) strlcpy(bufptr, physpath, buflen);
+		return (0);
 	}
 
-	(void) strlcpy(bufptr, physpath, buflen);
+	/*
+	 * For volumes use the persistent /dev/zvol/dataset identifier
+	 */
+	entry = udev_device_get_devlinks_list_entry(dev);
+	while (entry != NULL) {
+		physpath = udev_list_entry_get_name(entry);
+		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+			(void) strlcpy(bufptr, physpath, buflen);
+			return (0);
+		}
+		entry = udev_list_entry_get_next(entry);
+	}
 
-	return (0);
+	return (ENODATA);
 }
 
 boolean_t
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index 4986a3fa2350..df8309d8de39 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -671,19 +671,31 @@ zfs_zevent_wait(zfs_zevent_t *ze)
 	int error = 0;
 
 	mutex_enter(&zevent_lock);
+	zevent_waiters++;
 
-	if (zevent_flags & ZEVENT_SHUTDOWN) {
-		error = ESHUTDOWN;
-		goto out;
-	}
+	while (error == 0) {
+		if (zevent_flags & ZEVENT_SHUTDOWN) {
+			error = SET_ERROR(ESHUTDOWN);
+			break;
+		}
 
-	zevent_waiters++;
-	cv_wait_sig(&zevent_cv, &zevent_lock);
-	if (issig(JUSTLOOKING))
-		error = EINTR;
+		error = cv_timedwait_sig(&zevent_cv, &zevent_lock,
+		    ddi_get_lbolt() + hz);
+		if (signal_pending(current) || fatal_signal_pending(current)) {
+			error = SET_ERROR(EINTR);
+			break;
+		} else {
+			if (error == -1) {
+				error = 0;
+				continue;
+			} else {
+				error = 0;
+				break;
+			}
+		}
+	}
 
 	zevent_waiters--;
-out:
 	mutex_exit(&zevent_lock);
 
 	return (error);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5b67e5f5fe3f..49a1fd9849f3 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3097,7 +3097,8 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
-			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
+			    spa->spa_autoexpand);
 	}
 
 	vdev_reopen(tvd);
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 996bab43c6ce..f736908751ba 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -85,50 +85,54 @@ vdev_bdev_mode(int smode)
 }
 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
 
-/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
 static uint64_t
-bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
+bdev_capacity(struct block_device *bdev)
 {
-	struct hd_struct *part = bdev->bd_part;
-	uint64_t sectors = get_capacity(bdev->bd_disk);
-	/* If there are no paritions, return the entire device capacity */
-	if (part == NULL)
-		return (sectors << SECTOR_BITS);
+	if (bdev->bd_part != NULL)
+		return (bdev->bd_part->nr_sects << SECTOR_BITS);
+	else
+		return (i_size_read(bdev->bd_inode));
+}
 
-	/*
-	 * If there are partitions, decide if we are using a `wholedisk`
-	 * layout (composed of part1 and part9) or just a single partition.
-	 */
-	if (wholedisk) {
-		/* Verify the expected device layout */
-		ASSERT3P(bdev, !=, bdev->bd_contains);
-		/*
-		 * Sectors used by the EFI partition (part9) as well as
-		 * partion alignment.
-		 */
-		uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
-		    PARTITION_END_ALIGNMENT;
-
-		/* Space available to the vdev, i.e. the size of part1 */
-		if (sectors <= used)
-			return (0);
-		uint64_t available = sectors - used;
-		return (available << SECTOR_BITS);
+/*
+ * Returns the maximum expansion capacity of the block device. When the
+ * vdev has been created as a 'wholedisk', then expansion may be possible.
+ * Before any expansion is performed the partition layout is verified to
+ * confirm the original layout (-part1 and -part9). If everything checks
+ * out, the primary partition will be resized and the reserved partition
+ * relocated to the new end of the device as part of 'zpool online -e'.
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+	uint64_t psize;
+	int64_t available;
+
+	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
+		available = i_size_read(bdev->bd_contains->bd_inode) -
+		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+		if (available > 0)
+			psize = available;
+		else
+			psize = bdev_capacity(bdev);
 	} else {
-		/* The partition capacity referenced by the block device */
-		return (part->nr_sects << SECTOR_BITS);
+		psize = bdev_capacity(bdev);
 	}
+
+	return (psize);
 }
 
 static void
 vdev_disk_error(zio_t *zio)
 {
-#ifdef ZFS_DEBUG
-	printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu "
+	zfs_dbgmsg(KERN_WARNING "zio error=%d type=%d offset=%llu size=%llu "
 	    "flags=%x\n", zio->io_error, zio->io_type,
 	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
 	    zio->io_flags);
-#endif
 }
 
 /*
@@ -200,71 +204,6 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
 	}
 }
 
-/*
- * Expanding a whole disk vdev involves invoking BLKRRPART on the
- * whole disk device. This poses a problem, because BLKRRPART will
- * return EBUSY if one of the disk's partitions is open. That's why
- * we have to do it here, just before opening the data partition.
- * Unfortunately, BLKRRPART works by dropping all partitions and
- * recreating them, which means that for a short time window, all
- * /dev/sdxN device files disappear (until udev recreates them).
- * This means two things:
- *  - When we open the data partition just after a BLKRRPART, we
- *    can't do it using the normal device file path because of the
- *    obvious race condition with udev. Instead, we use reliable
- *    kernel APIs to get a handle to the new partition device from
- *    the whole disk device.
- *  - Because vdev_disk_open() initially needs to find the device
- *    using its path, multiple vdev_disk_open() invocations in
- *    short succession on the same disk with BLKRRPARTs in the
- *    middle have a high probability of failure (because of the
- *    race condition with udev). A typical situation where this
- *    might happen is when the zpool userspace tool does a
- *    TRYIMPORT immediately followed by an IMPORT. For this
- *    reason, we only invoke BLKRRPART in the module when strictly
- *    necessary (zpool online -e case), and rely on userspace to
- *    do it when possible.
- */
-static struct block_device *
-vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
-{
-#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
-	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
-	struct gendisk *disk;
-	int error, partno;
-
-	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
-	if (IS_ERR(bdev))
-		return (bdev);
-
-	disk = get_gendisk(bdev->bd_dev, &partno);
-	vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-
-	if (disk) {
-		bdev = bdget(disk_devt(disk));
-		if (bdev) {
-			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
-			if (error == 0)
-				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
-			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-		}
-
-		bdev = bdget_disk(disk, partno);
-		if (bdev) {
-			error = blkdev_get(bdev,
-			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
-			if (error == 0)
-				result = bdev;
-		}
-		put_disk(disk);
-	}
-
-	return (result);
-#else
-	return (ERR_PTR(-EOPNOTSUPP));
-#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
-}
-
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift)
@@ -281,28 +220,37 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 		return (SET_ERROR(EINVAL));
 	}
 
+	mode = spa_mode(v->vdev_spa);
+
 	/*
-	 * Reopen the device if it's not currently open. Otherwise,
-	 * just update the physical size of the device.
+	 * Reopen the device if it is not currently open. Otherwise, when
+	 * expanding a device, close and open it to trigger a re-scan of
+	 * the partition table in order to get an accurate size.
 	 */
 	if (v->vdev_tsd != NULL) {
 		ASSERT(v->vdev_reopening);
 		vd = v->vdev_tsd;
-		goto skip_open;
-	}
 
-	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
-	if (vd == NULL)
-		return (SET_ERROR(ENOMEM));
+		if (vd->vd_bdev && v->vdev_wholedisk && v->vdev_expanding) {
+			vdev_bdev_close(vd->vd_bdev, vdev_bdev_mode(mode));
+			vd->vd_bdev = NULL;
+		} else {
+			goto skip_open;
+		}
+	} else {
+		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+		if (vd == NULL)
+			return (SET_ERROR(ENOMEM));
+	}
 
 	/*
 	 * Devices are always opened by the path provided at configuration
 	 * time. This means that if the provided path is a udev by-id path
-	 * then drives may be recabled without an issue. If the provided
+	 * then drives may be re-cabled without an issue. If the provided
 	 * path is a udev by-path path, then the physical location information
 	 * will be preserved. This can be critical for more complicated
 	 * configurations where drives are located in specific physical
-	 * locations to maximize the systems tolerence to component failure.
+	 * locations to maximize the system's tolerance to component failure.
 	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit. It is not advised that you use the
 	 * /dev/[hd]d devices which may be reordered due to probing order.
@@ -317,10 +265,6 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	 * and it is reasonable to sleep and retry before giving up. In
 	 * practice delays have been observed to be on the order of 100ms.
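+	 * To allow for this, the open below is retried rather than
+	 * failing on the first attempt (up to 50 tries, per the loop
+	 * bound below).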
 	 */
-	mode = spa_mode(v->vdev_spa);
-	if (v->vdev_wholedisk && v->vdev_expanding)
-		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
-
 	while (IS_ERR(bdev) && count < 50) {
 		bdev = vdev_bdev_open(v->vdev_path,
 		    vdev_bdev_mode(mode), zfs_vdev_holder);
@@ -333,12 +277,13 @@
 	}
 
 	if (IS_ERR(bdev)) {
-		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
-		    v->vdev_path, -PTR_ERR(bdev), count);
+		vdev_dbgmsg(v, "failed open error=%d count=%d\n",
+		    -PTR_ERR(bdev), count);
 		kmem_free(vd, sizeof (vdev_disk_t));
 		return (SET_ERROR(-PTR_ERR(bdev)));
 	}
 
+	ASSERT3P(vd->vd_bdev, ==, NULL);
 	v->vdev_tsd = vd;
 	vd->vd_bdev = bdev;
 
@@ -352,9 +297,11 @@
 	/* Inform the ZIO pipeline that we are non-rotational */
 	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
 
-	/* Physical volume size in bytes */
-	*psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
-	*max_psize = *psize;
+	/* Physical volume size in bytes for the partition */
+	*psize = bdev_capacity(vd->vd_bdev);
+
+	/* Physical volume size in bytes including possible expansion space */
+	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
 
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
@@ -373,9 +320,10 @@ vdev_disk_close(vdev_t *v)
 	if (v->vdev_reopening || vd == NULL)
 		return;
 
-	if (vd->vd_bdev != NULL)
+	if (vd->vd_bdev != NULL) {
 		vdev_bdev_close(vd->vd_bdev,
 		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
+	}
 
 	kmem_free(vd, sizeof (vdev_disk_t));
 	v->vdev_tsd = NULL;
@@ -563,8 +511,8 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 	struct blk_plug plug;
 #endif
 
-	ASSERT(zio != NULL);
-	ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);
+	if (io_offset + io_size > bdev->bd_inode->i_size)
+		return (SET_ERROR(ENXIO));
 
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index ecba516fcc0d..1eeaded75d79 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -118,6 +118,7 @@ struct zvol_state {
 	objset_t *zv_objset;	/* objset handle */
 	uint32_t zv_flags;	/* ZVOL_* flags */
 	uint32_t zv_open_count;	/* open counts */
+	uint32_t zv_open_retry;	/* -ERESTARTSYS retry */
 	uint32_t zv_changed;	/* disk changed */
 	zilog_t *zv_zilog;	/* ZIL handle */
 	zfs_rlock_t zv_range_lock;	/* range lock */
@@ -1288,11 +1289,23 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly)
 	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
 	 * called again. This process can be repeated safely until both
 	 * locks are acquired.
+	 *
+	 * There is one exception, and that is for 2.6.35 and older kernels
+	 * which include the BKL. In order to ensure a deadlock can never
+	 * happen, even for these ancient kernels, a retry limit is included.
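+	 * The zv_open_retry counter is reset after each successful open.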
 	 */
 	if (!mutex_owned(&spa_namespace_lock)) {
 		locked = mutex_tryenter(&spa_namespace_lock);
-		if (!locked)
+		if (!locked) {
+#if defined(CONFIG_LOCK_KERNEL)
+			zv->zv_open_retry++;
+			if (zv->zv_open_retry > 100) {
+				zv->zv_open_retry = 0;
+				return (-SET_ERROR(ENXIO));
+			}
+#endif
 			return (-SET_ERROR(ERESTARTSYS));
+		}
 	}
 
 	ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
@@ -1384,6 +1397,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 	}
 
 	zv->zv_open_count++;
+	zv->zv_open_retry = 0;
 
 	mutex_exit(&zv->zv_state_lock);
 	if (drop_suspend)
@@ -1399,8 +1413,10 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 out_mutex:
 	mutex_exit(&zv->zv_state_lock);
+
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
+
 	if (error == -ERESTARTSYS)
 		schedule();
 
@@ -1672,6 +1688,7 @@ zvol_alloc(dev_t dev, const char *name)
 	zv->zv_queue->queuedata = zv;
 	zv->zv_dev = dev;
 	zv->zv_open_count = 0;
+	zv->zv_open_retry = 0;
 	strlcpy(zv->zv_name, name, MAXNAMELEN);
 
 	zfs_rlock_init(&zv->zv_range_lock);
diff --git a/tests/test-runner/bin/zts-report.py b/tests/test-runner/bin/zts-report.py
index 976ca1f8adb9..43695b73bb02 100755
--- a/tests/test-runner/bin/zts-report.py
+++ b/tests/test-runner/bin/zts-report.py
@@ -105,6 +105,14 @@
 #
 rewind_reason = 'Arbitrary pool rewind is not guaranteed'
 
+#
+# Some tests which depend on the pool autoexpand property will not work
+# with 2.6.37 and older kernels. These kernels do not set the
+# DISK_MEDIA_CHANGE udev property which is required by the ZED to detect
+# and perform the needed device expansion.
+#
+udev_reason = 'Udev DISK_MEDIA_CHANGE property required'
+
 #
 # Some tests are not applicable to Linux or need to be updated to operate
 # in the manor required by Linux. Any tests which are skipped for this
@@ -151,8 +159,6 @@
     'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason],
     'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason],
     'cli_root/zpool_create/zpool_create_016_pos': ['SKIP', na_reason],
-    'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', '5771'],
-    'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', '5771'],
     'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason],
     'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason],
     'inuse/inuse_001_pos': ['SKIP', na_reason],
@@ -212,6 +218,8 @@
     'cli_root/zpool_create/setup': ['SKIP', disk_reason],
     'cli_root/zpool_create/zpool_create_008_pos': ['FAIL', known_reason],
    'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', '6145'],
+    'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', udev_reason],
+    'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', udev_reason],
     'cli_root/zpool_export/setup': ['SKIP', disk_reason],
     'cli_root/zpool_import/setup': ['SKIP', disk_reason],
     'cli_root/zpool_import/import_rewind_device_replaced':
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
index 06ab1b84fd1c..59e4686406d7 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
@@ -48,9 +48,12 @@
 
 verify_runnable "global"
 
-# See issue: https://github.com/zfsonlinux/zfs/issues/5771
-if is_linux; then
-	log_unsupported "Requires autoexpand property support"
+#
+# The zpool autoexpand functionality depends on the DISK_MEDIA_CHANGE udev
+# property which wasn't added until after the 2.6.37 kernel.
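+# Without that event the ZED is never notified, so no expansion occurs.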
+#
+if [[ $(linux_version) -le $(linux_version "2.6.37") ]]; then
+	log_unsupported "Requires udev DISK_MEDIA_CHANGE property"
 fi
 
 function cleanup
@@ -73,6 +76,7 @@ log_assert "zpool can be autoexpanded after set autoexpand=on on LUN expansion"
 for i in 1 2 3; do
 	log_must zfs create -V $org_size $VFS/vol$i
 done
+block_device_wait
 
 for type in " " mirror raidz raidz2; do
@@ -105,8 +109,8 @@ for type in " " mirror raidz raidz2; do
 	log_note "$TESTPOOL1 $type has previous size: $prev_size and " \
 	    "expanded size: $expand_size"
 	# compare available pool size from zfs
-	if [[ $zfs_expand_size > $zfs_prev_size ]]; then
-	# check for zpool history for the pool size expansion
+	if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then
+		# check for zpool history for the pool size expansion
 		if [[ $type == " " ]]; then
 			typeset expansion_size=$(($exp_size-$org_size))
 			typeset size_addition=$(zpool history -il $TESTPOOL1 |\
 
 			if [[ $size_addition -ne $i ]]; then
 				log_fail "pool $TESTPOOL1 is not autoexpand " \
-				    "after LUN expansion"
+				    "after LUN expansion (stripe)"
 			fi
 		elif [[ $type == "mirror" ]]; then
 			typeset expansion_size=$(($exp_size-$org_size))
 
 			if [[ $? -ne 0 ]] ; then
 				log_fail "pool $TESTPOOL1 is not autoexpand " \
-				    "after LUN expansion"
+				    "after LUN expansion (mirror)"
 			fi
 		else
 			typeset expansion_size=$((3*($exp_size-$org_size)))
 
 			if [[ $? -ne 0 ]]; then
 				log_fail "pool $TESTPOOL is not autoexpand " \
-				    "after LUN expansion"
+				    "after LUN expansion (raidz)"
 			fi
 		fi
 	else
 		log_fail "pool $TESTPOOL1 is not autoexpanded after LUN " \
-		    "expansion"
+		    "expansion. Previous size: $zfs_prev_size and expanded " \
+		    "size: $zfs_expand_size"
 	fi
 
 	log_must zpool destroy $TESTPOOL1
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh
index 585dd050fd63..b89497ed9468 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh
@@ -48,9 +48,12 @@
 
 verify_runnable "global"
 
-# See issue: https://github.com/zfsonlinux/zfs/issues/5771
-if is_linux; then
-	log_unsupported "Requires autoexpand property support"
+#
+# The zpool autoexpand functionality depends on the DISK_MEDIA_CHANGE udev
+# property which wasn't added until after the 2.6.37 kernel.
+#
+if [[ $(linux_version) -le $(linux_version "2.6.37") ]]; then
+	log_unsupported "Requires udev DISK_MEDIA_CHANGE property"
 fi
 
 function cleanup