Skip to content

Commit

Permalink
Returned "ServiceUnavailable" for slave's /state during recovery.
Browse files Browse the repository at this point in the history
This will help operators hitting the /state endpoint by not returning incomplete
data when slave is still recoverying.

Review: https://reviews.apache.org/r/43267
  • Loading branch information
vinodkone committed Feb 9, 2016
1 parent 748a60e commit 83b43d9
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 0 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG
@@ -1,3 +1,12 @@

Release Notes - Mesos - Version 0.28.0 (WIP)
--------------------------------------------

** API Changes:
* [MESOS-4066] - Agent should not return partial state when a request is made
to /state endpoint during recovery.


Release Notes - Mesos - Version 0.27.0
--------------------------------------------

Expand Down
4 changes: 4 additions & 0 deletions src/slave/http.cpp
Expand Up @@ -399,6 +399,10 @@ string Slave::Http::STATE_HELP() {

Future<Response> Slave::Http::state(const Request& request) const
{
if (slave->state == Slave::RECOVERING) {
return ServiceUnavailable("Agent has not finished recovery");
}

JSON::Object object;
object.values["version"] = MESOS_VERSION;

Expand Down
89 changes: 89 additions & 0 deletions src/tests/slave_tests.cpp
Expand Up @@ -75,6 +75,7 @@ using process::UPID;

using process::http::OK;
using process::http::Response;
using process::http::ServiceUnavailable;

using std::map;
using std::shared_ptr;
Expand Down Expand Up @@ -1184,9 +1185,15 @@ TEST_F(SlaveTest, StateEndpoint)
// Capture the start time deterministically.
Clock::pause();

Future<Nothing> __recover = FUTURE_DISPATCH(_, &Slave::__recover);

Try<PID<Slave>> slave = StartSlave(&containerizer, flags);
ASSERT_SOME(slave);

// Ensure slave has finished recovery.
AWAIT_READY(__recover);
Clock::settle();

Future<Response> response = process::http::get(slave.get(), "state");

AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
Expand Down Expand Up @@ -1320,6 +1327,88 @@ TEST_F(SlaveTest, StateEndpoint)
}


// This test checks that when a slave is in RECOVERING state it responds
// to HTTP requests for "/state" endpoint with ServiceUnavailable.
TEST_F(SlaveTest, StateEndpointUnavailableDuringRecovery)
{
Try<PID<Master>> master = StartMaster();
ASSERT_SOME(master);

MockExecutor exec(DEFAULT_EXECUTOR_ID);

TestContainerizer* containerizer1 = new TestContainerizer(&exec);

slave::Flags flags = CreateSlaveFlags();

Try<PID<Slave>> slave = StartSlave(containerizer1, flags);
ASSERT_SOME(slave);

// Launch a task so that slave has something to recover after restart.
MockScheduler sched;

// Enable checkpointing for the framework.
FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
frameworkInfo.set_checkpoint(true);

MesosSchedulerDriver driver(
&sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL);

EXPECT_CALL(sched, registered(&driver, _, _))
.Times(1);

EXPECT_CALL(sched, resourceOffers(&driver, _))
.WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 512, "*"))
.WillRepeatedly(Return()); // Ignore subsequent offers.

EXPECT_CALL(exec, registered(_, _, _, _));

EXPECT_CALL(exec, launchTask(_, _))
.WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

Future<TaskStatus> status;
EXPECT_CALL(sched, statusUpdate(&driver, _))
.WillOnce(FutureArg<1>(&status))
.WillRepeatedly(Return()); // Ignore subsequent updates.

driver.start();

AWAIT_READY(status);
EXPECT_EQ(TASK_RUNNING, status.get().state());

// Need this expectation here because `TestContainerizer` doesn't do recovery
// and hence sets `MESOS_RECOVERY_TIMEOUT` as '0s' causing the executor driver
// to exit immediately after slave exit.
EXPECT_CALL(exec, shutdown(_))
.Times(AtMost(1));

// Restart the slave.
Stop(slave.get());
delete containerizer1;

// Pause the clock to keep slave in RECOVERING state.
Clock::pause();

Future<Nothing> _recover = FUTURE_DISPATCH(_, &Slave::_recover);

TestContainerizer containerizer2;

slave = StartSlave(&containerizer2, flags);
ASSERT_SOME(slave);

// Ensure slave has setup the route for "/state".
AWAIT_READY(_recover);

Future<Response> response = process::http::get(slave.get(), "state");

AWAIT_EXPECT_RESPONSE_STATUS_EQ(ServiceUnavailable().status, response);

driver.stop();
driver.join();

Shutdown();
}


// This test ensures that when a slave is shutting down, it will not
// try to re-register with the master.
TEST_F(SlaveTest, DISABLED_TerminatingSlaveDoesNotReregister)
Expand Down

0 comments on commit 83b43d9

Please sign in to comment.