From 1b6f6f27b77803727a523b4337cbad411e8321ed Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Tue, 26 Apr 2016 11:13:32 +0800 Subject: [PATCH 1/2] mon/OSDMonitor: improve reweight_by_utilization() logic By calling reweight_by_utilization() method, we are aiming at a more even result of utilization among all osds. To achieve this, we shall decrease weights of osds which are currently overloaded, and try to increase weights of osds which are currently underloaded when it is possible. However, we can't do this all at a time in order to avoid massive pg migrations between osds. Thus we introduce a max_osds limit to smooth the progress. The problem here is that we have sorted the utilization of all osds in a descending manner and we always try to decrease the weights of the most overloaded osds since they are most likely to encounter a nearfull/full transition soon, but we won't increase the weights from the most underloaded (least utilized by contrast) at the same time, which I think is not quite reasonable. Actually, the best thing would probably be to iterate over the low and high osds in parallel, and do the ones that are furthest from the average first. Signed-off-by: xie xingguo (cherry picked from commit e7a32534ebc9e27f955ff2d7a8d1db511383301e) Conflicts: src/mon/OSDMonitor.cc Resolved by picking the lambda implementation. NOTE: Because hammer does not support C++11, the lambda functionality from the current master has been moved into the "Sorter" function object. 
--- src/mon/OSDMonitor.cc | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 1f08548fa54c6..c647c60017d6d 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -463,10 +463,16 @@ void OSDMonitor::update_logger() } struct Sorter { - bool operator()(std::pair l, - std::pair r) { - return l.second > r.second; - } + + double average_util; + + Sorter(const double average_util_) + : average_util(average_util_) + {} + + bool operator()(std::pair l, std::pair r) { + return abs(l.second - average_util) > abs(r.second - average_util); + } }; /* Assign a lower weight to overloaded OSDs. @@ -595,8 +601,9 @@ int OSDMonitor::reweight_by_utilization(int oload, util_by_osd.push_back(osd_util); } - // sort and iterate from most to least utilized - std::sort(util_by_osd.begin(), util_by_osd.end(), Sorter()); + // sort by absolute deviation from the mean utilization, + // in descending order. + std::sort(util_by_osd.begin(), util_by_osd.end(), Sorter(average_util)); OSDMap::Incremental newinc; From 23498a9620f792cd099dba028c5bdf96b1a625be Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Tue, 31 May 2016 15:40:05 -0700 Subject: [PATCH 2/2] mon/OSDMonitor: avoid potential expensive grace calculation The grace calculation during check_failure() is now very complicated and time-consuming. Therefore we shall skip this when it is possible. Signed-off-by: xie xingguo (cherry picked from commit 3557903d5d57642179b2ae137bedc389974b1956) Conflicts: src/mon/OSDMonitor.cc Resolved by choosing the move-to-top implementation. Removed unused vars. 
--- src/mon/OSDMonitor.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c647c60017d6d..5c7ef72e7a3c5 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1494,6 +1494,13 @@ void OSDMonitor::check_failures(utime_t now) bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) { + // already pending failure? + if (pending_inc.new_state.count(target_osd) && + pending_inc.new_state[target_osd] & CEPH_OSD_UP) { + dout(10) << " already pending failure" << dendl; + return true; + } + utime_t orig_grace(g_conf->osd_heartbeat_grace, 0); utime_t max_failed_since = fi.get_failed_since(); utime_t failed_for = now - max_failed_since; @@ -1537,13 +1544,6 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) << grace << " grace (" << orig_grace << " + " << my_grace << " + " << peer_grace << "), max_failed_since " << max_failed_since << dendl; - // already pending failure? - if (pending_inc.new_state.count(target_osd) && - pending_inc.new_state[target_osd] & CEPH_OSD_UP) { - dout(10) << " already pending failure" << dendl; - return true; - } - if (failed_for >= grace && ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters) && (fi.num_reports >= g_conf->mon_osd_min_down_reports)) {