Skip to content

Commit

Permalink
net/mlx5: fix device removal event handling
Browse files Browse the repository at this point in the history
[ upstream commit 22dc56cfbd39692eb74fad93ff5ecc3df5fd0633 ]

On device removal, the kernel notifies the user space application
by queueing the IBV_EVENT_DEVICE_FATAL event and signaling the
corresponding file descriptor. The Mellanox kernel driver stack emits
this event twice, from different layers (mlx5 and uverbs). The IB port
index in the event structure is not applicable and should be ignored
for IBV_EVENT_DEVICE_FATAL events.

Also, on older kernels (at least from OFED 4.9), a race condition
might cause the event queue to be closed before the application
fetches the IBV_EVENT_DEVICE_FATAL message with the
ibv_get_async_event() API.

To provide reliable detection of the device removal event, the patch:

  - ignores the IB port index for the IBV_EVENT_DEVICE_FATAL event
  - introduces a flag to notify the PMD about the removal only once
  - acks the event with ibv_ack_async_event() after the actual handling
  - checks for the EIO error, which indicates that the event queue is
    already closed, and handles it as a device removal

Fixes: 40d9f90 ("net/mlx5: fix device removal handler for multiport")

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
  • Loading branch information
viacheslavo authored and bluca committed Jun 28, 2023
1 parent b67f103 commit 92f3908
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 10 deletions.
34 changes: 24 additions & 10 deletions drivers/net/mlx5/linux/mlx5_ethdev_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,7 @@ mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)

for (i = 0; i < sh->max_port; ++i) {
struct rte_eth_dev *dev;
struct mlx5_priv *priv;

if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
/*
Expand All @@ -725,9 +726,14 @@ mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
}
dev = &rte_eth_devices[sh->port[i].ih_port_id];
MLX5_ASSERT(dev);
if (dev->data->dev_conf.intr_conf.rmv)
priv = dev->data->dev_private;
MLX5_ASSERT(priv);
if (!priv->rmv_notified && dev->data->dev_conf.intr_conf.rmv) {
/* Notify driver about removal only once. */
priv->rmv_notified = 1;
rte_eth_dev_callback_process
(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
}
}
}

Expand Down Expand Up @@ -800,21 +806,29 @@ mlx5_dev_interrupt_handler(void *cb_arg)
struct rte_eth_dev *dev;
uint32_t tmp;

if (mlx5_glue->get_async_event(sh->ctx, &event))
if (mlx5_glue->get_async_event(sh->ctx, &event)) {
if (errno == EIO) {
DRV_LOG(DEBUG,
"IBV async event queue closed on: %s",
sh->ibdev_name);
mlx5_dev_interrupt_device_fatal(sh);
}
break;
/* Retrieve and check IB port index. */
tmp = (uint32_t)event.element.port_num;
if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
}
if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
/*
* The DEVICE_FATAL event is called once for
* entire device without port specifying.
* We should notify all existing ports.
* The DEVICE_FATAL event can be called by kernel
* twice - from mlx5 and uverbs layers, and port
* index is not applicable. We should notify all
* existing ports.
*/
mlx5_glue->ack_async_event(&event);
mlx5_dev_interrupt_device_fatal(sh);
mlx5_glue->ack_async_event(&event);
continue;
}
MLX5_ASSERT(tmp && (tmp <= sh->max_port));
/* Retrieve and check IB port index. */
tmp = (uint32_t)event.element.port_num;
MLX5_ASSERT(tmp <= sh->max_port);
if (!tmp) {
/* Unsupported device level event. */
mlx5_glue->ack_async_event(&event);
Expand Down
1 change: 1 addition & 0 deletions drivers/net/mlx5/mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -945,6 +945,7 @@ struct mlx5_priv {
unsigned int mtr_reg_share:1; /* Whether support meter REG_C share. */
unsigned int sampler_en:1; /* Whether support sampler. */
unsigned int lb_used:1; /* Loopback queue is referred to. */
unsigned int rmv_notified:1; /* Notified about removal event */
uint32_t mark_enabled:1; /* If mark action is enabled on rxqs. */
uint16_t domain_id; /* Switch domain identifier. */
uint16_t vport_id; /* Associated VF vport index (if any). */
Expand Down

0 comments on commit 92f3908

Please sign in to comment.