Skip to content

Commit

Permalink
Recover gRPC channel from connection dropouts. (#1406)
Browse files Browse the repository at this point in the history
- check and recover channel connection in `TryRecovery()`
- fixes an infinite loop in the unlimited retry strategy by adding `grpc::UNAVAILABLE`,
  `DEADLINE_EXCEEDED` to the unrecoverable status codes.
  - server restart leads to `UNAVAILABLE`
  - connection loss leads to `DEADLINE_EXCEEDED`
- fixes repeated recovery attempts
  • Loading branch information
MichaelGrupp authored and wally-the-cartographer committed Sep 4, 2018
1 parent 153952d commit b1dfa30
Showing 1 changed file with 22 additions and 1 deletion.
23 changes: 22 additions & 1 deletion cartographer/cloud/internal/local_trajectory_uploader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,18 @@ namespace {
using absl::make_unique;

constexpr int kConnectionTimeoutInSeconds = 10;
constexpr int kConnectionRecoveryTimeoutInSeconds = 60;
constexpr int kTokenRefreshIntervalInSeconds = 60;
const common::Duration kPopTimeout = common::FromMilliseconds(100);

// This defines the '::grpc::StatusCode's that are considered unrecoverable
// errors and hence no retries will be attempted by the client.
const std::set<::grpc::StatusCode> kUnrecoverableStatusCodes = {
::grpc::NOT_FOUND};
::grpc::DEADLINE_EXCEEDED,
::grpc::NOT_FOUND,
::grpc::UNAVAILABLE,
::grpc::UNKNOWN,
};

bool IsNewSubmap(const mapping::proto::Submap& submap) {
return (submap.has_submap_2d() && submap.submap_2d().num_range_data() == 1) ||
Expand Down Expand Up @@ -151,6 +156,19 @@ void LocalTrajectoryUploader::Shutdown() {
}

void LocalTrajectoryUploader::TryRecovery() {
if (client_channel_->GetState(false /* try_to_connect */) !=
grpc_connectivity_state::GRPC_CHANNEL_READY) {
LOG(INFO) << "Trying to re-connect to uplink...";
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() +
std::chrono::seconds(kConnectionRecoveryTimeoutInSeconds);
if (!client_channel_->WaitForConnected(deadline)) {
LOG(ERROR) << "Failed to re-connect to uplink prior to recovery attempt.";
return;
}
}
LOG(INFO) << "Uplink channel ready, trying recovery.";

// Wind the sensor_data_queue forward to the next new submap.
LOG(INFO) << "LocalTrajectoryUploader tries to recover with next submap.";
while (true) {
Expand Down Expand Up @@ -184,6 +202,9 @@ void LocalTrajectoryUploader::TryRecovery() {
if (!status.ok()) {
LOG(ERROR) << "Failed to create trajectory. Aborting recovery attempt. "
<< status.error_message();
// Restore the previous state for the next recovery attempt.
local_trajectory_id_to_trajectory_info_ =
local_trajectory_id_to_trajectory_info;
return;
}
}
Expand Down

0 comments on commit b1dfa30

Please sign in to comment.