From 4fb5d637e3d570b89ee8cbbebc3cc5a8210e4e32 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Wed, 20 Jun 2018 13:39:23 -0700
Subject: [PATCH] Add support for autoexpand property

While the autoexpand property may seem like a small feature, it depends
on a significant amount of system infrastructure. Enough of that
infrastructure is now in place that, with a few customizations for
Linux, the autoexpand property for whole disk configurations can be
supported.

Autoexpand works as follows: when a block device is resized, a change
event is generated by udev with the DISK_MEDIA_CHANGE key. The ZED,
which is monitoring udev events, detects the event for disks (but not
partitions) and hands it off to zfs_deliver_dle(). The
zfs_deliver_dle() function appends the expected whole disk partition
suffix, and if the partition can be matched against a known pool vdev
it re-opens it.

Re-opening the vdev will trigger a re-reading of the partition table so
the maximum possible expansion size can be reported. Next, if the
autoexpand property is set to "on", a vdev expansion will be attempted.
After performing some sanity checks on the disk to verify it's safe to
expand the ZFS partition (-part1), it will be expanded and the
partition table updated. The partition is then re-opened to detect the
updated size, which allows the new capacity to be used.

Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for ZFS
volumes. This was required for the test cases which test expansion by
layering a new pool on top of ZFS volumes.

Enabled the zpool_expand_001_pos and zpool_expand_003_neg test cases
which exercise the autoexpand property.

Fixed zfs_zevent_wait() signal handling which could result in the ZED
spinning when a signal was not handled.

Removed the vdev_disk_rrpart() functionality, which can be abandoned in
favour of re-opening the device. This triggers a re-read of the
partition table as long as no other partitions are in use, which will
always be the case when working with whole disks. As a bonus, this
allows us to remove two Linux kernel API checks.

Signed-off-by: Brian Behlendorf
Issue #120
Issue #2437
Issue #5771
Issue #7582
---
 cmd/zed/agents/zfs_mod.c                      |  15 +-
 cmd/zed/zed_disk_event.c                      |  32 ++-
 config/kernel-blkdev-get.m4                   |  19 --
 config/kernel-get-gendisk.m4                  |  17 --
 config/kernel.m4                              |   2 -
 lib/libzfs/libzfs_import.c                    |  61 ++++--
 module/zfs/fm.c                               |  30 ++-
 module/zfs/vdev.c                             |   3 +-
 module/zfs/vdev_disk.c                        | 182 +++++++-----------
 module/zfs/zvol.c                             |  19 +-
 tests/test-runner/bin/zts-report.py           |  12 +-
 .../zpool_expand/zpool_expand_001_pos.ksh     |  23 ++-
 .../zpool_expand/zpool_expand_003_neg.ksh     |   9 +-
 13 files changed, 217 insertions(+), 207 deletions(-)
 delete mode 100644 config/kernel-blkdev-get.m4
 delete mode 100644 config/kernel-get-gendisk.m4

diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
index 600d6527c0db..b757e2360adf 100644
--- a/cmd/zed/agents/zfs_mod.c
+++ b/cmd/zed/agents/zfs_mod.c
@@ -751,23 +751,30 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * This function handles the ESC_DEV_DLE event.
+ * This function handles the ESC_DEV_DLE (DISK_MEDIA_CHANGE) event which
+ * is only delivered for the disk itself, not for each partition. Presume
+ * that a 'wholedisk' partition exists and append the expected partition
+ * suffix in order to attempt a match.
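+ * For example, a whole disk known by a by-id style path would be
+ * matched against its expected data partition, <path>-part1 (an
+ * illustrative name; the appended suffix depends on the path style).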
  */
 static int
 zfs_deliver_dle(nvlist_t *nvl)
 {
-	char *devname;
+	char *devname, pname[MAXPATHLEN];
 
 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
 		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no physpath");
 		return (-1);
 	}
 
-	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
+	strlcpy(pname, devname, MAXPATHLEN);
+	zfs_append_partition(pname, MAXPATHLEN);
+
+	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, pname) != 1) {
 		zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
-		    "found", devname);
+		    "found", pname);
 		return (1);
 	}
+
 	return (0);
 }
diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c
index 996b911c537c..758955bdc460 100644
--- a/cmd/zed/zed_disk_event.c
+++ b/cmd/zed/zed_disk_event.c
@@ -165,11 +165,12 @@ zed_udev_monitor(void *arg)
 	while (1) {
 		struct udev_device *dev;
-		const char *action, *type, *part, *sectors;
+		const char *action, *type, *part, *sectors, *change;
 		const char *bus, *uuid;
 		const char *class, *subclass;
 		nvlist_t *nvl;
 		boolean_t is_zfs = B_FALSE;
+		boolean_t is_disk_media_change = B_FALSE;
 
 		/* allow a cancellation while blocked (recvmsg) */
 		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
@@ -202,14 +203,26 @@ zed_udev_monitor(void *arg)
 		}
 
 		/*
-		 * if this is a disk and it is partitioned, then the
+		 * Disk media change events are allowed for auto-expand.
+		 * Whether the device contains a zfs_member is determined
+		 * at the time of the attempted expansion.
+		 */
+		change = udev_device_get_property_value(dev,
+		    "DISK_MEDIA_CHANGE");
+		if (change != NULL && change[0] == '1')
+			is_disk_media_change = B_TRUE;
+
+		/*
+		 * If this is a disk and it is partitioned, then the
 		 * zfs label will reside in a DEVTYPE=partition and
-		 * we can skip passing this event
+		 * we can skip passing this event, unless it's a disk
+		 * media change event, which is expected for auto-expand.
 		 */
 		type = udev_device_get_property_value(dev, "DEVTYPE");
 		part = udev_device_get_property_value(dev,
 		    "ID_PART_TABLE_TYPE");
-		if (type != NULL && type[0] != '\0' &&
+		if (!is_disk_media_change &&
+		    type != NULL && type[0] != '\0' &&
 		    strcmp(type, "disk") == 0 &&
 		    part != NULL && part[0] != '\0') {
 			/* skip and wait for partition event */
@@ -231,14 +244,15 @@ zed_udev_monitor(void *arg)
 		}
 
 		/*
-		 * If the blkid probe didn't find ZFS, then a persistent
-		 * device id string is required in the message schema
-		 * for matching with vdevs. Preflight here for expected
-		 * udev information.
+		 * If the blkid probe didn't find ZFS and this is not a
+		 * disk media change event, then a persistent device id
+		 * string is required in the message schema for matching
+		 * with vdevs. Preflight here for expected udev information.
 		 */
 		bus = udev_device_get_property_value(dev, "ID_BUS");
 		uuid = udev_device_get_property_value(dev, "DM_UUID");
-		if (!is_zfs && (bus == NULL && uuid == NULL)) {
+		if (!is_zfs && !is_disk_media_change &&
+		    bus == NULL && uuid == NULL) {
 			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
 			    "source", udev_device_get_devnode(dev));
 			udev_device_unref(dev);
diff --git a/config/kernel-blkdev-get.m4 b/config/kernel-blkdev-get.m4
deleted file mode 100644
index e31d71770511..000000000000
--- a/config/kernel-blkdev-get.m4
+++ /dev/null
@@ -1,19 +0,0 @@
-dnl #
-dnl # 2.6.37 API change
-dnl # Added 3rd argument for the active holder, previously this was
-dnl # hardcoded to NULL.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_3ARG_BLKDEV_GET], [
-	AC_MSG_CHECKING([whether blkdev_get() wants 3 args])
-	ZFS_LINUX_TRY_COMPILE([
-		#include <linux/fs.h>
-	],[
-		struct block_device *bdev = NULL;
-		(void) blkdev_get(bdev, 0, NULL);
-	],[
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_3ARG_BLKDEV_GET, 1, [blkdev_get() wants 3 args])
-	],[
-		AC_MSG_RESULT(no)
-	])
-])
diff --git a/config/kernel-get-gendisk.m4 b/config/kernel-get-gendisk.m4
deleted file mode 100644
index b0913770e43d..000000000000
--- a/config/kernel-get-gendisk.m4
+++ /dev/null
@@ -1,17 +0,0 @@
-dnl #
-dnl # 2.6.34 API change
-dnl # Verify the get_gendisk() symbol is available.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_GET_GENDISK],
-	[AC_MSG_CHECKING([whether get_gendisk() is available])
-	ZFS_LINUX_TRY_COMPILE_SYMBOL([
-		#include <linux/genhd.h>
-	], [
-		get_gendisk(0, NULL);
-	], [get_gendisk], [block/genhd.c], [
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_GET_GENDISK, 1, [get_gendisk() is available])
-	], [
-		AC_MSG_RESULT(no)
-	])
-])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 8c2998204cde..32abb81da39b 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -44,7 +44,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 	ZFS_AC_KERNEL_TYPE_FMODE_T
-	ZFS_AC_KERNEL_3ARG_BLKDEV_GET
 	ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH
 	ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE
 	ZFS_AC_KERNEL_LOOKUP_BDEV
@@ -73,7 +72,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG
 	ZFS_AC_KERNEL_GET_DISK_AND_MODULE
 	ZFS_AC_KERNEL_GET_DISK_RO
-	ZFS_AC_KERNEL_GET_GENDISK
 	ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS
 	ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL
 	ZFS_AC_KERNEL_DISCARD_GRANULARITY
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 7d2f0e903cce..e83d8ed888e0 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -145,6 +145,21 @@ zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
 		return (0);
 	}
 
+	/*
+	 * For volumes use the persistent /dev/zvol/dataset identifier
+	 */
+	entry = udev_device_get_devlinks_list_entry(dev);
+	while (entry != NULL) {
+		const char *name;
+
+		name = udev_list_entry_get_name(entry);
+		if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+			(void) strlcpy(bufptr, name, buflen);
+			return (0);
+		}
+		entry = udev_list_entry_get_next(entry);
+	}
+
 	/*
 	 * NVME 'by-id' symlinks are similar to bus case
 	 */
@@ -187,26 +202,44 @@ int
 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 {
 	const char *physpath = NULL;
+	struct udev_list_entry *entry;
 
 	/*
-	 * Normal disks use ID_PATH for their physical path. Device mapper
-	 * devices are virtual and don't have a physical path. For them we
-	 * use ID_VDEV instead, which is setup via the /etc/vdev_id.conf file.
-	 * ID_VDEV provides a persistent path to a virtual device. If you
-	 * don't have vdev_id.conf setup, you cannot use multipath autoreplace.
+	 * Normal disks use ID_PATH for their physical path.
 	 */
-	if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
-	    physpath[0])) {
-		if (!((physpath =
-		    udev_device_get_property_value(dev, "ID_VDEV")) &&
-		    physpath[0])) {
-			return (ENODATA);
-		}
+	physpath = udev_device_get_property_value(dev, "ID_PATH");
+	if (physpath != NULL && strlen(physpath) > 0) {
+		(void) strlcpy(bufptr, physpath, buflen);
+		return (0);
+	}
+
+	/*
+	 * Device mapper devices are virtual and don't have a physical
+	 * path. For them we use ID_VDEV instead, which is set up via the
+	 * /etc/vdev_id.conf file. ID_VDEV provides a persistent path
+	 * to a virtual device. If you don't have vdev_id.conf set up,
+	 * you cannot use multipath autoreplace with device mapper.
+	 */
+	physpath = udev_device_get_property_value(dev, "ID_VDEV");
+	if (physpath != NULL && strlen(physpath) > 0) {
+		(void) strlcpy(bufptr, physpath, buflen);
+		return (0);
 	}
 
-	(void) strlcpy(bufptr, physpath, buflen);
+	/*
+	 * For volumes use the persistent /dev/zvol/dataset identifier
+	 */
+	entry = udev_device_get_devlinks_list_entry(dev);
+	while (entry != NULL) {
+		physpath = udev_list_entry_get_name(entry);
+		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+			(void) strlcpy(bufptr, physpath, buflen);
+			return (0);
+		}
+		entry = udev_list_entry_get_next(entry);
+	}
 
-	return (0);
+	return (ENODATA);
 }
 
 boolean_t
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index 4986a3fa2350..df8309d8de39 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -671,19 +671,31 @@ zfs_zevent_wait(zfs_zevent_t *ze)
 	int error = 0;
 
 	mutex_enter(&zevent_lock);
+	zevent_waiters++;
 
-	if (zevent_flags & ZEVENT_SHUTDOWN) {
-		error = ESHUTDOWN;
-		goto out;
-	}
+	while (error == 0) {
+		if (zevent_flags & ZEVENT_SHUTDOWN) {
+			error = SET_ERROR(ESHUTDOWN);
+			break;
+		}
 
-	zevent_waiters++;
-	cv_wait_sig(&zevent_cv, &zevent_lock);
-	if (issig(JUSTLOOKING))
-		error = EINTR;
+		error = cv_timedwait_sig(&zevent_cv, &zevent_lock,
+		    ddi_get_lbolt() + hz);
+		if (signal_pending(current) || fatal_signal_pending(current)) {
+			error = SET_ERROR(EINTR);
+			break;
+		} else {
+			if (error == -1) {
+				error = 0;
+				continue;
+			} else {
+				error = 0;
+				break;
+			}
+		}
+	}
 
 	zevent_waiters--;
-out:
 	mutex_exit(&zevent_lock);
 
 	return (error);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5b67e5f5fe3f..49a1fd9849f3 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3097,7 +3097,8 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
-			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
+			    spa->spa_autoexpand);
 	}
 
 	vdev_reopen(tvd);
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 996bab43c6ce..f736908751ba 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -85,50 +85,54 @@ vdev_bdev_mode(int smode)
 }
 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
 
-/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
 static uint64_t
-bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
+bdev_capacity(struct block_device *bdev)
 {
-	struct hd_struct *part = bdev->bd_part;
-	uint64_t sectors = get_capacity(bdev->bd_disk);
-	/* If there are no paritions, return the entire device capacity */
-	if (part == NULL)
-		return (sectors << SECTOR_BITS);
+	if (bdev->bd_part != NULL)
+		return (bdev->bd_part->nr_sects << SECTOR_BITS);
+	else
+		return (i_size_read(bdev->bd_inode));
+}
 
-	/*
-	 * If there are partitions, decide if we are using a `wholedisk`
-	 * layout (composed of part1 and part9) or just a single partition.
-	 */
-	if (wholedisk) {
-		/* Verify the expected device layout */
-		ASSERT3P(bdev, !=, bdev->bd_contains);
-		/*
-		 * Sectors used by the EFI partition (part9) as well as
-		 * partion alignment.
-		 */
-		uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
-		    PARTITION_END_ALIGNMENT;
-
-		/* Space available to the vdev, i.e. the size of part1 */
-		if (sectors <= used)
-			return (0);
-		uint64_t available = sectors - used;
-		return (available << SECTOR_BITS);
+/*
+ * Returns the maximum expansion capacity of the block device. When the
+ * vdev has been created as a 'wholedisk', then expansion may be possible.
+ * Before any expansion is performed the partition layout is verified to
+ * confirm the original layout (-part1 and -part9). If everything checks
+ * out, the primary partition will be resized and the reserved partition
+ * relocated to the new end of the device as part of 'zpool online -e'.
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+	uint64_t psize;
+	int64_t available;
+
+	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
+		available = i_size_read(bdev->bd_contains->bd_inode) -
+		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+		if (available > 0)
+			psize = available;
+		else
+			psize = bdev_capacity(bdev);
 	} else {
-		/* The partition capacity referenced by the block device */
-		return (part->nr_sects << SECTOR_BITS);
+		psize = bdev_capacity(bdev);
 	}
+
+	return (psize);
 }
 
 static void
 vdev_disk_error(zio_t *zio)
 {
-#ifdef ZFS_DEBUG
-	printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu "
+	zfs_dbgmsg(KERN_WARNING "zio error=%d type=%d offset=%llu size=%llu "
 	    "flags=%x\n", zio->io_error, zio->io_type,
 	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
 	    zio->io_flags);
-#endif
 }
 
 /*
@@ -200,71 +204,6 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
 	}
 }
 
-/*
- * Expanding a whole disk vdev involves invoking BLKRRPART on the
- * whole disk device. This poses a problem, because BLKRRPART will
- * return EBUSY if one of the disk's partitions is open. That's why
- * we have to do it here, just before opening the data partition.
- * Unfortunately, BLKRRPART works by dropping all partitions and
- * recreating them, which means that for a short time window, all
- * /dev/sdxN device files disappear (until udev recreates them).
- * This means two things:
- *  - When we open the data partition just after a BLKRRPART, we
- *    can't do it using the normal device file path because of the
- *    obvious race condition with udev. Instead, we use reliable
- *    kernel APIs to get a handle to the new partition device from
- *    the whole disk device.
- *  - Because vdev_disk_open() initially needs to find the device
- *    using its path, multiple vdev_disk_open() invocations in
- *    short succession on the same disk with BLKRRPARTs in the
- *    middle have a high probability of failure (because of the
- *    race condition with udev). A typical situation where this
- *    might happen is when the zpool userspace tool does a
- *    TRYIMPORT immediately followed by an IMPORT. For this
- *    reason, we only invoke BLKRRPART in the module when strictly
- *    necessary (zpool online -e case), and rely on userspace to
- *    do it when possible.
- */
-static struct block_device *
-vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
-{
-#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
-	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
-	struct gendisk *disk;
-	int error, partno;
-
-	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
-	if (IS_ERR(bdev))
-		return (bdev);
-
-	disk = get_gendisk(bdev->bd_dev, &partno);
-	vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-
-	if (disk) {
-		bdev = bdget(disk_devt(disk));
-		if (bdev) {
-			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
-			if (error == 0)
-				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
-			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-		}
-
-		bdev = bdget_disk(disk, partno);
-		if (bdev) {
-			error = blkdev_get(bdev,
-			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
-			if (error == 0)
-				result = bdev;
-		}
-		put_disk(disk);
-	}
-
-	return (result);
-#else
-	return (ERR_PTR(-EOPNOTSUPP));
-#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
-}
-
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift)
@@ -281,28 +220,37 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 		return (SET_ERROR(EINVAL));
 	}
 
+	mode = spa_mode(v->vdev_spa);
+
 	/*
-	 * Reopen the device if it's not currently open. Otherwise,
-	 * just update the physical size of the device.
+	 * Reopen the device if it is not currently open. Otherwise, when
+	 * expanding a device, close and open it to trigger a re-scan of
+	 * the partition table in order to get an accurate size.
 	 */
 	if (v->vdev_tsd != NULL) {
 		ASSERT(v->vdev_reopening);
 		vd = v->vdev_tsd;
-		goto skip_open;
-	}
 
-	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
-	if (vd == NULL)
-		return (SET_ERROR(ENOMEM));
+		if (vd->vd_bdev && v->vdev_wholedisk && v->vdev_expanding) {
+			vdev_bdev_close(vd->vd_bdev, vdev_bdev_mode(mode));
+			vd->vd_bdev = NULL;
+		} else {
+			goto skip_open;
+		}
+	} else {
+		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+		if (vd == NULL)
+			return (SET_ERROR(ENOMEM));
+	}
 
 	/*
 	 * Devices are always opened by the path provided at configuration
 	 * time. This means that if the provided path is a udev by-id path
-	 * then drives may be recabled without an issue. If the provided
+	 * then drives may be re-cabled without an issue. If the provided
 	 * path is a udev by-path path, then the physical location information
 	 * will be preserved. This can be critical for more complicated
 	 * configurations where drives are located in specific physical
-	 * locations to maximize the systems tolerence to component failure.
+	 * locations to maximize the system's tolerance to component failure.
 	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit. It is not advised that you use the
 	 * /dev/[hd]d devices which may be reordered due to probing order.
@@ -317,10 +265,6 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	 * and it is reasonable to sleep and retry before giving up. In
 	 * practice delays have been observed to be on the order of 100ms.
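+	 * To allow for this, the open below is retried rather than
+	 * failing on the first attempt (up to 50 tries, per the loop
+	 * bound below).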
 	 */
-	mode = spa_mode(v->vdev_spa);
-	if (v->vdev_wholedisk && v->vdev_expanding)
-		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
-
 	while (IS_ERR(bdev) && count < 50) {
 		bdev = vdev_bdev_open(v->vdev_path,
 		    vdev_bdev_mode(mode), zfs_vdev_holder);
@@ -333,12 +277,13 @@
 	}
 
 	if (IS_ERR(bdev)) {
-		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
-		    v->vdev_path, -PTR_ERR(bdev), count);
+		vdev_dbgmsg(v, "failed open error=%d count=%d\n",
+		    -PTR_ERR(bdev), count);
 		kmem_free(vd, sizeof (vdev_disk_t));
 		return (SET_ERROR(-PTR_ERR(bdev)));
 	}
 
+	ASSERT3P(vd->vd_bdev, ==, NULL);
 	v->vdev_tsd = vd;
 	vd->vd_bdev = bdev;
 
@@ -352,9 +297,11 @@
 	/* Inform the ZIO pipeline that we are non-rotational */
 	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
 
-	/* Physical volume size in bytes */
-	*psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
-	*max_psize = *psize;
+	/* Physical volume size in bytes for the partition */
+	*psize = bdev_capacity(vd->vd_bdev);
+
+	/* Physical volume size in bytes including possible expansion space */
+	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
 
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
@@ -373,9 +320,10 @@ vdev_disk_close(vdev_t *v)
 	if (v->vdev_reopening || vd == NULL)
 		return;
 
-	if (vd->vd_bdev != NULL)
+	if (vd->vd_bdev != NULL) {
 		vdev_bdev_close(vd->vd_bdev,
 		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
+	}
 
 	kmem_free(vd, sizeof (vdev_disk_t));
 	v->vdev_tsd = NULL;
@@ -563,8 +511,8 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 	struct blk_plug plug;
 #endif
 
-	ASSERT(zio != NULL);
-	ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);
+	if (io_offset + io_size > bdev->bd_inode->i_size)
+		return (SET_ERROR(ENXIO));
 
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index ecba516fcc0d..1eeaded75d79 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -118,6 +118,7 @@ struct zvol_state {
 	objset_t *zv_objset;	/* objset handle */
 	uint32_t zv_flags;	/* ZVOL_* flags */
 	uint32_t zv_open_count;	/* open counts */
+	uint32_t zv_open_retry;	/* -ERESTARTSYS retry */
 	uint32_t zv_changed;	/* disk changed */
 	zilog_t *zv_zilog;	/* ZIL handle */
 	zfs_rlock_t zv_range_lock;	/* range lock */
@@ -1288,11 +1289,23 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly)
 	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
 	 * called again. This process can be repeated safely until both
 	 * locks are acquired.
+	 *
+	 * There is one exception, and that is for 2.6.35 and older kernels
+	 * which include the BKL. In order to ensure a deadlock can never
+	 * happen, even for these ancient kernels, a retry limit is included.
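+	 * The zv_open_retry counter is reset after each successful open.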
 	 */
 	if (!mutex_owned(&spa_namespace_lock)) {
 		locked = mutex_tryenter(&spa_namespace_lock);
-		if (!locked)
+		if (!locked) {
+#if defined(CONFIG_LOCK_KERNEL)
+			zv->zv_open_retry++;
+			if (zv->zv_open_retry > 100) {
+				zv->zv_open_retry = 0;
+				return (-SET_ERROR(ENXIO));
+			}
+#endif
 			return (-SET_ERROR(ERESTARTSYS));
+		}
 	}
 
 	ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
@@ -1384,6 +1397,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 	}
 
 	zv->zv_open_count++;
+	zv->zv_open_retry = 0;
 
 	mutex_exit(&zv->zv_state_lock);
 	if (drop_suspend)
@@ -1399,8 +1413,10 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 out_mutex:
 	mutex_exit(&zv->zv_state_lock);
+
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
+
 	if (error == -ERESTARTSYS)
 		schedule();
 
@@ -1672,6 +1688,7 @@ zvol_alloc(dev_t dev, const char *name)
 	zv->zv_queue->queuedata = zv;
 	zv->zv_dev = dev;
 	zv->zv_open_count = 0;
+	zv->zv_open_retry = 0;
 	strlcpy(zv->zv_name, name, MAXNAMELEN);
 
 	zfs_rlock_init(&zv->zv_range_lock);
diff --git a/tests/test-runner/bin/zts-report.py b/tests/test-runner/bin/zts-report.py
index 976ca1f8adb9..43695b73bb02 100755
--- a/tests/test-runner/bin/zts-report.py
+++ b/tests/test-runner/bin/zts-report.py
@@ -105,6 +105,14 @@
 #
 rewind_reason = 'Arbitrary pool rewind is not guaranteed'
 
+#
+# Some tests which depend on the pool autoexpand property will not work
+# with 2.6.37 and older kernels. These kernels do not set the
+# DISK_MEDIA_CHANGE udev property which is required by the ZED to detect
+# and perform the needed device expansion.
+#
+udev_reason = 'Udev DISK_MEDIA_CHANGE property required'
+
 #
 # Some tests are not applicable to Linux or need to be updated to operate
 # in the manor required by Linux. Any tests which are skipped for this
@@ -151,8 +159,6 @@
     'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason],
     'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason],
     'cli_root/zpool_create/zpool_create_016_pos': ['SKIP', na_reason],
-    'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', '5771'],
-    'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', '5771'],
     'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason],
     'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason],
     'inuse/inuse_001_pos': ['SKIP', na_reason],
@@ -212,6 +218,8 @@
     'cli_root/zpool_create/setup': ['SKIP', disk_reason],
     'cli_root/zpool_create/zpool_create_008_pos': ['FAIL', known_reason],
    'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', '6145'],
+    'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', udev_reason],
+    'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', udev_reason],
     'cli_root/zpool_export/setup': ['SKIP', disk_reason],
     'cli_root/zpool_import/setup': ['SKIP', disk_reason],
     'cli_root/zpool_import/import_rewind_device_replaced':
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
index 06ab1b84fd1c..59e4686406d7 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
@@ -48,9 +48,12 @@
 
 verify_runnable "global"
 
-# See issue: https://github.com/zfsonlinux/zfs/issues/5771
-if is_linux; then
-	log_unsupported "Requires autoexpand property support"
+#
+# The zpool autoexpand functionality depends on the DISK_MEDIA_CHANGE udev
+# property which wasn't added until after the 2.6.37 kernel.
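+# Without that event the ZED is never notified, so no expansion occurs.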
+#
+if [[ $(linux_version) -le $(linux_version "2.6.37") ]]; then
+	log_unsupported "Requires udev DISK_MEDIA_CHANGE property"
 fi
 
 function cleanup
@@ -73,6 +76,7 @@ log_assert "zpool can be autoexpanded after set autoexpand=on on LUN expansion"
 for i in 1 2 3; do
 	log_must zfs create -V $org_size $VFS/vol$i
 done
+block_device_wait
 
 for type in " " mirror raidz raidz2; do
@@ -105,8 +109,8 @@ for type in " " mirror raidz raidz2; do
 	log_note "$TESTPOOL1 $type has previous size: $prev_size and " \
 	    "expanded size: $expand_size"
 	# compare available pool size from zfs
-	if [[ $zfs_expand_size > $zfs_prev_size ]]; then
-	# check for zpool history for the pool size expansion
+	if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then
+		# check for zpool history for the pool size expansion
 		if [[ $type == " " ]]; then
 			typeset expansion_size=$(($exp_size-$org_size))
 			typeset size_addition=$(zpool history -il $TESTPOOL1 |\
 
 			if [[ $size_addition -ne $i ]]; then
 				log_fail "pool $TESTPOOL1 is not autoexpand " \
-				    "after LUN expansion"
+				    "after LUN expansion (stripe)"
 			fi
 		elif [[ $type == "mirror" ]]; then
 			typeset expansion_size=$(($exp_size-$org_size))
 
 			if [[ $? -ne 0 ]] ; then
 				log_fail "pool $TESTPOOL1 is not autoexpand " \
-				    "after LUN expansion"
+				    "after LUN expansion (mirror)"
 			fi
 		else
 			typeset expansion_size=$((3*($exp_size-$org_size)))
 
 			if [[ $? -ne 0 ]]; then
 				log_fail "pool $TESTPOOL is not autoexpand " \
-				    "after LUN expansion"
+				    "after LUN expansion (raidz)"
 			fi
 		fi
 	else
 		log_fail "pool $TESTPOOL1 is not autoexpanded after LUN " \
-		    "expansion"
+		    "expansion. Previous size: $zfs_prev_size and expanded " \
+		    "size: $zfs_expand_size"
 	fi
 
 	log_must zpool destroy $TESTPOOL1
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh
index 585dd050fd63..b89497ed9468 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh
@@ -48,9 +48,12 @@
 
 verify_runnable "global"
 
-# See issue: https://github.com/zfsonlinux/zfs/issues/5771
-if is_linux; then
-	log_unsupported "Requires autoexpand property support"
+#
+# The zpool autoexpand functionality depends on the DISK_MEDIA_CHANGE udev
+# property which wasn't added until after the 2.6.37 kernel.
+#
+if [[ $(linux_version) -le $(linux_version "2.6.37") ]]; then
+	log_unsupported "Requires udev DISK_MEDIA_CHANGE property"
 fi
 
 function cleanup