From 8662711a63b6dc714ec0c703fff0f78e317302e6 Mon Sep 17 00:00:00 2001 From: Jianwei Zhang Date: Thu, 30 Sep 2021 14:47:01 +0800 Subject: [PATCH] osd: wake up more shard workers when new osdmap is consumed Reproduce: (1) the ceph cluster is not running any client IO (2) only ceph osd operations on osd.14 are in progress Reason: (1) one shard-queue is served by three shard-threads (2) one or more PeeringOps have an epoch > the osdmap epoch held by the current osd, so these PeeringOps go through _add_slot_waiter() (3) the shard-queue becomes empty and all three shard-threads cond.wait() (4) a new osdmap is consumed and it calls _wake_pg_slot() The problem here: 1> OSDShard::consume_map() loops over all pg slots' waiters and requeues more than one PeeringOp to the shard-queue 2> but it only notifies one shard-thread to wake up, so the other two shard-threads continue to cond.wait() 3> OSD::ShardedOpWQ::_enqueue() finds the shard-queue non-empty and does not notify all shard-threads to wake up As a result, for a period of time only one of the three shard-threads is running. Fixes: https://tracker.ceph.com/issues/52781 Signed-off-by: Jianwei Zhang Change-Id: I4617db2fd95082007e6d9fa2b60f17f2a6296b5b (cherry picked from commit 566b60b3f6528f491da6c2b862142ca55c05646f) --- src/osd/OSD.cc | 17 ++++++++++++----- src/osd/OSD.h | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index e0d8d78650a1f..df88a28afd8fd 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -10624,7 +10624,7 @@ void OSDShard::consume_map( dout(10) << new_osdmap->get_epoch() << " (was " << (old_osdmap ?
old_osdmap->get_epoch() : 0) << ")" << dendl; - bool queued = false; + int queued = 0; // check slots auto p = pg_slots.begin(); @@ -10651,8 +10651,7 @@ void OSDShard::consume_map( dout(20) << __func__ << " " << pgid << " pending_peering first epoch " << first << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl; - _wake_pg_slot(pgid, slot); - queued = true; + queued += _wake_pg_slot(pgid, slot); } ++p; continue; @@ -10692,14 +10691,18 @@ void OSDShard::consume_map( } if (queued) { std::lock_guard l{sdata_wait_lock}; - sdata_cond.notify_one(); + if (queued == 1) + sdata_cond.notify_one(); + else + sdata_cond.notify_all(); } } -void OSDShard::_wake_pg_slot( +int OSDShard::_wake_pg_slot( spg_t pgid, OSDShardPGSlot *slot) { + int count = 0; dout(20) << __func__ << " " << pgid << " to_process " << slot->to_process << " waiting " << slot->waiting @@ -10708,12 +10711,14 @@ void OSDShard::_wake_pg_slot( i != slot->to_process.rend(); ++i) { scheduler->enqueue_front(std::move(*i)); + count++; } slot->to_process.clear(); for (auto i = slot->waiting.rbegin(); i != slot->waiting.rend(); ++i) { scheduler->enqueue_front(std::move(*i)); + count++; } slot->waiting.clear(); for (auto i = slot->waiting_peering.rbegin(); @@ -10724,10 +10729,12 @@ void OSDShard::_wake_pg_slot( // someday, if we decide this inefficiency matters for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) { scheduler->enqueue_front(std::move(*j)); + count++; } } slot->waiting_peering.clear(); ++slot->requeue_seq; + return count; } void OSDShard::identify_splits_and_merges( diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 0e9bc06a3b79d..2034ff6bed24f 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1094,7 +1094,7 @@ struct OSDShard { const OSDMapRef& osdmap, unsigned *pushes_to_free); - void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot); + int _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot); void identify_splits_and_merges( const OSDMapRef& as_of_osdmap,