Skip to content

Commit

Permalink
Merge pull request #531 from TousakaRin/circuit_breaker
Browse files Browse the repository at this point in the history
Exposing the status of CircuitBreaker to the builtIn page
  • Loading branch information
jamesge committed Nov 9, 2018
2 parents 11e9156 + 1737f16 commit 2685cd8
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 37 deletions.
26 changes: 16 additions & 10 deletions src/brpc/builtin/connections_service.cpp
Expand Up @@ -108,16 +108,18 @@ static std::string BriefName(const std::string& cname) {

void ConnectionsService::PrintConnections(
std::ostream& os, const std::vector<SocketId>& conns,
bool use_html, const Server* server, bool need_local) const {
bool use_html, const Server* server, bool is_channel_conn) const {
if (conns.empty()) {
return;
}
if (use_html) {
os << "<table class=\"gridtable sortable\" border=\"1\"><tr>"
"<th>CreatedTime</th>"
"<th>RemoteSide</th>";
if (need_local) {
os << "<th>Local</th>";
if (is_channel_conn) {
os << "<th>Local</th>"
"<th>RecentErr</th>"
"<th>nbreak</th>";
}
os << "<th>SSL</th>"
"<th>Protocol</th>"
Expand All @@ -135,8 +137,8 @@ void ConnectionsService::PrintConnections(
"</tr>\n";
} else {
os << "CreatedTime |RemoteSide |";
if (need_local) {
os << "Local|";
if (is_channel_conn) {
os << "Local|RecentErr|nbreak|";
}
os << "SSL|Protocol |fd |"
"InBytes/s|In/s |InBytes/m |In/m |"
Expand Down Expand Up @@ -171,8 +173,10 @@ void ConnectionsService::PrintConnections(
if (failed) {
os << min_width("Broken", 26) << bar
<< min_width(NameOfPoint(ptr->remote_side()), 19) << bar;
if (need_local) {
os << min_width(ptr->local_side().port, 5) << bar;
if (is_channel_conn) {
os << min_width(ptr->local_side().port, 5) << bar
<< min_width(ptr->recent_error_count(), 10) << bar
<< min_width(ptr->isolated_times(), 7) << bar;
}
os << min_width("-", 3) << bar
<< min_width("-", 12) << bar
Expand Down Expand Up @@ -267,12 +271,14 @@ void ConnectionsService::PrintConnections(
strcpy(rtt_display, "-");
}
os << bar << min_width(NameOfPoint(ptr->remote_side()), 19) << bar;
if (need_local) {
if (is_channel_conn) {
if (ptr->local_side().port > 0) {
os << min_width(ptr->local_side().port, 5) << bar;
} else {
os << min_width("-", 5) << bar;
}
os << min_width(ptr->recent_error_count(), 10) << bar
<< min_width(ptr->isolated_times(), 7) << bar;
}
os << SSLStateToYesNo(ptr->ssl_state(), use_html) << bar;
char protname[32];
Expand Down Expand Up @@ -367,7 +373,7 @@ void ConnectionsService::default_method(
conns.insert(conns.end(), internal_conns.begin(), internal_conns.end());
}
os << "server_connection_count: " << num_conns << '\n';
PrintConnections(os, conns, use_html, server, false/*need_local*/);
PrintConnections(os, conns, use_html, server, false/*is_channel_conn*/);
if (has_uncopied) {
// Notice that we don't put the link of givemeall directly because
// people seeing the link are very likely to click it which may be
Expand All @@ -380,7 +386,7 @@ void ConnectionsService::default_method(
SocketMapList(&conns);
os << (use_html ? "<br>\n" : "\n")
<< "channel_connection_count: " << GetChannelConnectionCount() << '\n';
PrintConnections(os, conns, use_html, server, true/*need_local*/);
PrintConnections(os, conns, use_html, server, true/*is_channel_conn*/);

if (use_html) {
os << "</body></html>\n";
Expand Down
31 changes: 21 additions & 10 deletions src/brpc/circuit_breaker.cpp
Expand Up @@ -21,9 +21,9 @@

namespace brpc {

DEFINE_int32(circuit_breaker_short_window_size, 500,
DEFINE_int32(circuit_breaker_short_window_size, 1500,
"Short window sample size.");
DEFINE_int32(circuit_breaker_long_window_size, 1000,
DEFINE_int32(circuit_breaker_long_window_size, 3000,
"Long window sample size.");
DEFINE_int32(circuit_breaker_short_window_error_percent, 10,
"The maximum error rate allowed by the short window, ranging from 0-99.");
Expand All @@ -39,6 +39,8 @@ DEFINE_int32(circuit_breaker_min_isolation_duration_ms, 100,
"Minimum isolation duration in milliseconds");
DEFINE_int32(circuit_breaker_max_isolation_duration_ms, 30000,
"Maximum isolation duration in milliseconds");
DEFINE_double(circuit_breaker_epsilon_value, 0.02,
"ema_alpha = 1 - std::pow(epsilon, 1.0 / window_size)");

namespace {
// EPSILON is used to generate the smoothing coefficient when calculating EMA.
Expand All @@ -51,7 +53,9 @@ namespace {
// when window_size = 1000,
// EPSILON = 0.1, smooth = 0.9977
// EPSILON = 0.3, smooth = 0.9987
const double EPSILON = 0.1;

#define EPSILON (FLAGS_circuit_breaker_epsilon_value)

} // namespace

CircuitBreaker::EmaErrorRecorder::EmaErrorRecorder(int window_size,
Expand Down Expand Up @@ -115,8 +119,8 @@ bool CircuitBreaker::EmaErrorRecorder::UpdateErrorCost(int64_t error_cost,
int64_t ema_error_cost =
_ema_error_cost.fetch_add(error_cost, butil::memory_order_relaxed);
ema_error_cost += error_cost;
int64_t max_error_cost = ema_latency * _window_size *
(_max_error_percent / 100.0) * (1.0 + EPSILON);
const int64_t max_error_cost =
ema_latency * _window_size * (_max_error_percent / 100.0) * (1.0 + EPSILON);
return ema_error_cost <= max_error_cost;
}

Expand Down Expand Up @@ -147,8 +151,9 @@ CircuitBreaker::CircuitBreaker()
, _short_window(FLAGS_circuit_breaker_short_window_size,
FLAGS_circuit_breaker_short_window_error_percent)
, _last_reset_time_ms(butil::cpuwide_time_ms())
, _broken(false)
, _isolation_duration_ms(FLAGS_circuit_breaker_min_isolation_duration_ms) {
, _isolation_duration_ms(FLAGS_circuit_breaker_min_isolation_duration_ms)
, _isolated_times(0)
, _broken(false) {
}

bool CircuitBreaker::OnCallEnd(int error_code, int64_t latency) {
Expand All @@ -159,9 +164,7 @@ bool CircuitBreaker::OnCallEnd(int error_code, int64_t latency) {
_short_window.OnCallEnd(error_code, latency)) {
return true;
}
if (!_broken.exchange(true, butil::memory_order_acquire)) {
UpdateIsolationDuration();
}
MarkAsBroken();
return false;
}

Expand All @@ -172,6 +175,13 @@ void CircuitBreaker::Reset() {
_broken.store(false, butil::memory_order_release);
}

// Put the breaker into the broken state.
// The atomic exchange returns the previous value of _broken, so only the
// caller that actually flips it from false to true increments
// _isolated_times and calls UpdateIsolationDuration(). Every other caller
// sees the flag already set and returns without side effects.
void CircuitBreaker::MarkAsBroken() {
    const bool was_already_broken =
        _broken.exchange(true, butil::memory_order_acquire);
    if (was_already_broken) {
        return;
    }
    _isolated_times.fetch_add(1, butil::memory_order_relaxed);
    UpdateIsolationDuration();
}

void CircuitBreaker::UpdateIsolationDuration() {
int64_t now_time_ms = butil::cpuwide_time_ms();
int isolation_duration_ms = _isolation_duration_ms.load(butil::memory_order_relaxed);
Expand All @@ -188,4 +198,5 @@ void CircuitBreaker::UpdateIsolationDuration() {
_isolation_duration_ms.store(isolation_duration_ms, butil::memory_order_relaxed);
}


} // namespace brpc
19 changes: 15 additions & 4 deletions src/brpc/circuit_breaker.h
Expand Up @@ -38,12 +38,22 @@ class CircuitBreaker {

// Reset CircuitBreaker and clear history data. will erase the historical
// data and start sampling again. Before you call this method, you need to
// ensure that no one else is calling OnCallEnd.
// ensure that no one else is accessing CircuitBreaker.
void Reset();

// Mark the Socket as broken. Call this method when you want to isolate a
// node in advance. When this method is called multiple times in succession,
// only the first call will take effect.
void MarkAsBroken();

// Number of times this breaker has been marked as broken.
// Reads the counter with relaxed ordering.
int isolated_times() const {
    const int times = _isolated_times.load(butil::memory_order_relaxed);
    return times;
}

// The duration that should be isolated when the socket fails in milliseconds.
// The higher the frequency of socket errors, the longer the duration.
int isolation_duration_ms() {
int isolation_duration_ms() const {
return _isolation_duration_ms.load(butil::memory_order_relaxed);
}

Expand All @@ -55,7 +65,7 @@ class CircuitBreaker {
EmaErrorRecorder(int windows_size, int max_error_percent);
bool OnCallEnd(int error_code, int64_t latency);
void Reset();

private:
int64_t UpdateLatency(int64_t latency);
bool UpdateErrorCost(int64_t latency, int64_t ema_latency);
Expand All @@ -72,8 +82,9 @@ class CircuitBreaker {
EmaErrorRecorder _long_window;
EmaErrorRecorder _short_window;
int64_t _last_reset_time_ms;
butil::atomic<bool> _broken;
butil::atomic<int> _isolation_duration_ms;
butil::atomic<int> _isolated_times;
butil::atomic<bool> _broken;
};

} // namespace brpc
Expand Down
27 changes: 17 additions & 10 deletions src/brpc/controller.cpp
Expand Up @@ -696,11 +696,22 @@ inline bool does_error_affect_main_socket(int error_code) {
// entire RPC (specified by c->FailedInline()).
void Controller::Call::OnComplete(
Controller* c, int error_code/*note*/, bool responded, bool end_of_rpc) {
if (enable_circuit_breaker && sending_sock) {
sending_sock->FeedbackCircuitBreaker(error_code,
butil::gettimeofday_us() - begin_time_us);
if (stream_user_data) {
stream_user_data->DestroyStreamUserData(sending_sock, c, error_code, end_of_rpc);
stream_user_data = NULL;
}

if (sending_sock != NULL) {
if (error_code != 0) {
sending_sock->AddRecentError();
}

if (enable_circuit_breaker) {
sending_sock->FeedbackCircuitBreaker(error_code,
butil::gettimeofday_us() - begin_time_us);
}
}

switch (c->connection_type()) {
case CONNECTION_TYPE_UNKNOWN:
break;
Expand Down Expand Up @@ -758,25 +769,21 @@ void Controller::Call::OnComplete(
}
break;
}

if (ELOGOFF == error_code) {
SocketUniquePtr sock;
if (Socket::Address(peer_id, &sock) == 0) {
// Block this `Socket' while not closing the fd
sock->SetLogOff();
}
}

if (need_feedback) {
const LoadBalancer::CallInfo info =
{ begin_time_us, peer_id, error_code, c };
c->_lb->Feedback(info);
}

if (stream_user_data) {
stream_user_data->DestroyStreamUserData(sending_sock, c, error_code, end_of_rpc);
stream_user_data = NULL;
}

// Release the `Socket' we used to send/receive data
sending_sock.reset(NULL);
}
Expand Down
35 changes: 32 additions & 3 deletions src/brpc/socket.cpp
Expand Up @@ -178,6 +178,8 @@ class Socket::SharedPart : public SharedObject {

CircuitBreaker circuit_breaker;

butil::atomic<uint64_t> recent_error_count;

explicit SharedPart(SocketId creator_socket_id);
~SharedPart();

Expand All @@ -193,7 +195,8 @@ Socket::SharedPart::SharedPart(SocketId creator_socket_id2)
, in_num_messages(0)
, out_size(0)
, out_num_messages(0)
, extended_stat(NULL) {
, extended_stat(NULL)
, recent_error_count(0) {
}

Socket::SharedPart::~SharedPart() {
Expand Down Expand Up @@ -766,6 +769,7 @@ void Socket::Revive() {
SharedPart* sp = GetSharedPart();
if (sp) {
sp->circuit_breaker.Reset();
sp->recent_error_count.store(0, butil::memory_order_relaxed);
}
// Set this flag to true since we add additional ref again
_recycle_flag.store(false, butil::memory_order_relaxed);
Expand Down Expand Up @@ -802,6 +806,29 @@ int Socket::ReleaseAdditionalReference() {
return -1;
}

void Socket::AddRecentError() {
SharedPart* sp = GetSharedPart();
if (sp) {
sp->recent_error_count.fetch_add(1, butil::memory_order_relaxed);
}
}

int64_t Socket::recent_error_count() const {
SharedPart* sp = GetSharedPart();
if (sp) {
return sp->recent_error_count.load(butil::memory_order_relaxed);
}
return 0;
}

int Socket::isolated_times() const {
SharedPart* sp = GetSharedPart();
if (sp) {
return sp->circuit_breaker.isolated_times();
}
return 0;
}

int Socket::SetFailed(int error_code, const char* error_fmt, ...) {
if (error_code == 0) {
CHECK(false) << "error_code is 0";
Expand Down Expand Up @@ -836,6 +863,7 @@ int Socket::SetFailed(int error_code, const char* error_fmt, ...) {
// by Channel to revive never-connected socket when server side
// comes online.
if (_health_check_interval_s > 0) {
GetOrNewSharedPart( )->circuit_breaker.MarkAsBroken();
PeriodicTaskManager::StartTaskAt(
new HealthCheckTask(id()),
butil::milliseconds_from_now(GetOrNewSharedPart()->
Expand Down Expand Up @@ -876,8 +904,9 @@ int Socket::SetFailed() {

// Report the outcome of one call to this socket's circuit breaker.
//
// error_code and latency_us are forwarded unchanged to
// CircuitBreaker::OnCallEnd(). When OnCallEnd() returns false, the main
// socket is marked failed through SetFailed(main_socket_id()).
//
// SetFailed() is called exactly once, and the "isolated" message is logged
// only when that call returns 0. Logging unconditionally and calling
// SetFailed() a second time would repeat the work and emit the error line
// for sockets that SetFailed() did not accept.
// NOTE(review): presumably a return value of 0 means this call is the one
// that moved the socket to the failed state; confirm against
// Socket::SetFailed(SocketId).
void Socket::FeedbackCircuitBreaker(int error_code, int64_t latency_us) {
    if (GetOrNewSharedPart()->circuit_breaker.OnCallEnd(error_code, latency_us)) {
        return;
    }
    if (SetFailed(main_socket_id()) == 0) {
        LOG(ERROR) << "Socket[" << *this << "] isolated by circuit breaker";
    }
}

Expand Down
6 changes: 6 additions & 0 deletions src/brpc/socket.h
Expand Up @@ -317,6 +317,12 @@ friend class policy::H2GlobalStreamCreator;
__attribute__ ((__format__ (__printf__, 3, 4)));
static int SetFailed(SocketId id);

void AddRecentError();

int64_t recent_error_count() const;

int isolated_times() const;

void FeedbackCircuitBreaker(int error_code, int64_t latency_us);

bool Failed() const;
Expand Down

0 comments on commit 2685cd8

Please sign in to comment.