From c5f0e2222cc55a92ebc46d12b618fadb3d2fb577 Mon Sep 17 00:00:00 2001 From: Guang G Yang Date: Wed, 1 Jul 2015 20:26:54 +0000 Subject: [PATCH 1/2] osd: Move IsRecoverablePredicate/IsReadablePredicate to osd_types.h Signed-off-by: Guang Yang (cherry picked from commit 466b083166231ec7e4c069fef8c9e07d38accab9) --- src/osd/ECBackend.h | 8 ++++---- src/osd/PG.cc | 4 ++-- src/osd/PG.h | 12 ++++++------ src/osd/PGBackend.h | 21 ++------------------- src/osd/ReplicatedBackend.h | 8 ++++---- src/osd/osd_types.h | 18 ++++++++++++++++++ 6 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index 4290de8da2561..d6e710d0be371 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -385,7 +385,7 @@ class ECBackend : public PGBackend { * * Determines the whether _have is suffient to recover an object */ - class ECRecPred : public IsRecoverablePredicate { + class ECRecPred : public IsPGRecoverablePredicate { set want; ErasureCodeInterfaceRef ec_impl; public: @@ -405,7 +405,7 @@ class ECBackend : public PGBackend { return ec_impl->minimum_to_decode(want, have, &min) == 0; } }; - IsRecoverablePredicate *get_is_recoverable_predicate() { + IsPGRecoverablePredicate *get_is_recoverable_predicate() { return new ECRecPred(ec_impl); } @@ -414,7 +414,7 @@ class ECBackend : public PGBackend { * * Determines the whether _have is suffient to read an object */ - class ECReadPred : public IsReadablePredicate { + class ECReadPred : public IsPGReadablePredicate { pg_shard_t whoami; ECRecPred rec_pred; public: @@ -425,7 +425,7 @@ class ECBackend : public PGBackend { return _have.count(whoami) && rec_pred(_have); } }; - IsReadablePredicate *get_is_readable_predicate() { + IsPGReadablePredicate *get_is_readable_predicate() { return new ECReadPred(get_parent()->whoami_shard(), ec_impl); } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 73010949711a2..28283f4e423f1 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1328,7 +1328,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id) } /* Check whether we have enough acting shards to later perform recovery */ - boost::scoped_ptr recoverable_predicate( + boost::scoped_ptr recoverable_predicate( get_pgbackend()->get_is_recoverable_predicate()); set have; for (int i = 0; i < (int)want.size(); ++i) { @@ -7458,7 +7458,7 @@ void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_ #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ") PG::PriorSet::PriorSet(bool ec_pool, - PGBackend::IsRecoverablePredicate *c, + IsPGRecoverablePredicate *c, const OSDMap &osdmap, const map &past_intervals, const vector &up, diff --git a/src/osd/PG.h b/src/osd/PG.h index f69d431123b1b..f675e69637a23 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -315,13 +315,13 @@ class PG { PG *pg; set empty_set; public: - boost::scoped_ptr is_readable; - boost::scoped_ptr is_recoverable; + boost::scoped_ptr is_readable; + boost::scoped_ptr is_recoverable; MissingLoc(PG *pg) : pg(pg) {} void set_backend_predicates( - PGBackend::IsReadablePredicate *_is_readable, - PGBackend::IsRecoverablePredicate *_is_recoverable) { + IsPGReadablePredicate *_is_readable, + IsPGRecoverablePredicate *_is_recoverable) { is_readable.reset(_is_readable); is_recoverable.reset(_is_recoverable); } @@ -492,9 +492,9 @@ class PG { map blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set bool pg_down; /// some down osds are included in @a cur; the DOWN pg state bit should be set. - boost::scoped_ptr pcontdec; + boost::scoped_ptr pcontdec; PriorSet(bool ec_pool, - PGBackend::IsRecoverablePredicate *c, + IsPGRecoverablePredicate *c, const OSDMap &osdmap, const map &past_intervals, const vector &up, diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 91b4d105fca6c..f050226723582 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -318,25 +318,8 @@ virtual void on_flushed() = 0; - class IsRecoverablePredicate { - public: - /** - * have encodes the shards available - */ - virtual bool operator()(const set &have) const = 0; - virtual ~IsRecoverablePredicate() {} - }; - virtual IsRecoverablePredicate *get_is_recoverable_predicate() = 0; - - class IsReadablePredicate { - public: - /** - * have encodes the shards available - */ - virtual bool operator()(const set &have) const = 0; - virtual ~IsReadablePredicate() {} - }; - virtual IsReadablePredicate *get_is_readable_predicate() = 0; + virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0; + virtual IsPGReadablePredicate *get_is_readable_predicate() = 0; void temp_colls(list *out) { if (temp_created) diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index 5ad22bfc0b22d..509065740d320 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -73,17 +73,17 @@ class ReplicatedBackend : public PGBackend { void clear_recovery_state(); void on_flushed(); - class RPCRecPred : public IsRecoverablePredicate { + class RPCRecPred : public IsPGRecoverablePredicate { public: bool operator()(const set &have) const { return !have.empty(); } }; - IsRecoverablePredicate *get_is_recoverable_predicate() { + IsPGRecoverablePredicate *get_is_recoverable_predicate() { return new RPCRecPred; } - class RPCReadPred : public IsReadablePredicate { + class RPCReadPred : public IsPGReadablePredicate { pg_shard_t whoami; public: RPCReadPred(pg_shard_t whoami) : whoami(whoami) {} @@ -91,7 +91,7 @@ class ReplicatedBackend : public PGBackend { return have.count(whoami); } }; - IsReadablePredicate *get_is_readable_predicate() { + IsPGReadablePredicate *get_is_readable_predicate() { return new RPCReadPred(get_parent()->whoami_shard()); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 6637ae4dc09db..cd8ee44836cfb 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -95,6 +95,24 @@ WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard) WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard) ostream &operator<<(ostream &lhs, const pg_shard_t &rhs); +class IsPGRecoverablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const set &have) const = 0; + virtual ~IsPGRecoverablePredicate() {} +}; + +class IsPGReadablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const set &have) const = 0; + virtual ~IsPGReadablePredicate() {} +}; + inline ostream& operator<<(ostream& out, const osd_reqid_t& r) { return out << r.name << "." << r.inc << ":" << r.tid; } From cd11b887c6c586085a7014ba44123b115370a462 Mon Sep 17 00:00:00 2001 From: Guang G Yang Date: Thu, 2 Jul 2015 05:29:47 +0000 Subject: [PATCH 2/2] osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine if the PG was active If the pool's min_size is set improperly, during peering, pg_interval_t::check_new_interval might wrongly determine the PG's state and cause the PG to stuck at down+peering forever Fixes: #12162 Signed-off-by: Guang Yang yguang@yahoo-inc.com (cherry picked from commit 684927442d81ea08f95878a8af69d08d3a14d973) Conflicts: src/osd/PG.cc because PG::start_peering_interval has an assert that is not found in hammer in the context src/test/osd/types.cc because include/stringify.h is not included by types.cc in hammer --- src/osd/OSD.cc | 3 +++ src/osd/PG.cc | 6 ++++++ src/osd/PG.h | 4 ++++ src/osd/osd_types.cc | 18 +++++++++++++++++- src/osd/osd_types.h | 4 ++++ src/test/osd/types.cc | 14 ++++++++++++++ 6 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7dbcfc5ae6030..1e89365197dac 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3012,6 +3012,8 @@ void OSD::build_past_intervals_parallel() } assert(last_map); + boost::scoped_ptr recoverable( + pg->get_is_recoverable_predicate()); std::stringstream debug; bool new_interval = pg_interval_t::check_new_interval( p.primary, @@ -3024,6 +3026,7 @@ void OSD::build_past_intervals_parallel() pg->info.history.last_epoch_clean, cur_map, last_map, pgid, + recoverable.get(), &pg->past_intervals, &debug); if (new_interval) { diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 28283f4e423f1..0cd900a5e23c2 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -694,6 +694,8 @@ void PG::generate_past_intervals() pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool())); cur_map->pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &primary); + boost::scoped_ptr recoverable( + get_is_recoverable_predicate()); std::stringstream debug; bool new_interval = pg_interval_t::check_new_interval( old_primary, @@ -709,6 +711,7 @@ void PG::generate_past_intervals() cur_map, last_map, pgid, + recoverable.get(), &past_intervals, &debug); if (new_interval) { @@ -4729,6 +4732,8 @@ void PG::start_peering_interval( info.history.same_interval_since = osdmap->get_epoch(); } else { std::stringstream debug; + boost::scoped_ptr recoverable( + get_is_recoverable_predicate()); bool new_interval = pg_interval_t::check_new_interval( old_acting_primary.osd, new_acting_primary, @@ -4741,6 +4746,7 @@ void PG::start_peering_interval( osdmap, lastmap, info.pgid.pgid, + recoverable.get(), &past_intervals, &debug); dout(10) << __func__ << ": check_new_interval output: " diff --git a/src/osd/PG.h b/src/osd/PG.h index f675e69637a23..ebbbf80496bd3 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -197,6 +197,10 @@ class PG { void update_snap_mapper_bits(uint32_t bits) { snap_mapper.update_bits(bits); } + /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done + IsPGRecoverablePredicate *get_is_recoverable_predicate() { + return get_pgbackend()->get_is_recoverable_predicate(); + } protected: // Ops waiting for map, should be queued at back Mutex map_lock; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 1fd8ee0350ecd..599094e932a0b 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -929,6 +929,16 @@ void pg_pool_t::dump(Formatter *f) const f->dump_unsigned("expected_num_objects", expected_num_objects); } +void pg_pool_t::convert_to_pg_shards(const vector &from, set* to) const { + for (size_t i = 0; i < from.size(); ++i) { + if (from[i] != CRUSH_ITEM_NONE) { + to->insert( + pg_shard_t( + from[i], + ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + } +} int pg_pool_t::calc_bits_of(int t) { @@ -2600,6 +2610,7 @@ bool pg_interval_t::check_new_interval( OSDMapRef osdmap, OSDMapRef lastmap, pg_t pgid, + IsPGRecoverablePredicate *could_have_gone_active, map *past_intervals, std::ostream *out) { @@ -2633,9 +2644,14 @@ bool pg_interval_t::check_new_interval( if (*p != CRUSH_ITEM_NONE) ++num_acting; + const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second; + set old_acting_shards; + old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards); + if (num_acting && i.primary != -1 && - num_acting >= lastmap->get_pools().find(pgid.pool())->second.min_size) { + num_acting >= old_pg_pool.min_size && + (*could_have_gone_active)(old_acting_shards)) { if (out) *out << "generate_past_intervals " << i << ": not rw," diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index cd8ee44836cfb..66e77f998c1bf 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -897,6 +897,9 @@ struct pg_pool_t { return 0; } + /// converts the acting/up vector to a set of pg shards + void convert_to_pg_shards(const vector &from, set* to) const; + typedef enum { CACHEMODE_NONE = 0, ///< no caching CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later @@ -1905,6 +1908,7 @@ struct pg_interval_t { ceph::shared_ptr osdmap, ///< [in] current map ceph::shared_ptr lastmap, ///< [in] last map pg_t pgid, ///< [in] pgid for pg + IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active map *past_intervals,///< [out] intervals ostream *out = 0 ///< [out] debug ostream ); diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc index 83d9c0fff479c..33324b278c74d 100644 --- a/src/test/osd/types.cc +++ b/src/test/osd/types.cc @@ -20,6 +20,7 @@ #include "osd/OSDMap.h" #include "gtest/gtest.h" #include "common/Thread.h" +#include "osd/ReplicatedBackend.h" #include @@ -139,6 +140,7 @@ TEST(pg_interval_t, check_new_interval) int64_t pool_id = 200; int pg_num = 4; __u8 min_size = 2; + boost::scoped_ptr recoverable(new ReplicatedBackend::RPCRecPred()); { OSDMap::Incremental inc(epoch + 1); inc.new_pools[pool_id].min_size = min_size; @@ -183,6 +185,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_TRUE(past_intervals.empty()); } @@ -212,6 +215,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -244,6 +248,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); old_primary = new_primary; ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -277,6 +282,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -308,6 +314,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -346,6 +353,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -384,6 +392,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -417,6 +426,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -468,6 +478,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -502,6 +513,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -546,6 +558,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -594,6 +607,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size());