Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IGNITE-13510 Added snapshot status command to control.sh and JMX. #10202

Merged
merged 14 commits into from
Aug 18, 2022
36 changes: 32 additions & 4 deletions docs/_docs/snapshots/snapshots.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ control.(sh|bat) --snapshot restore snapshot_02092020 --start --groups snapshot-
--

==== Using CLI to control restore operation
The `control.sh|bat` script provides the ability to start, stop, and get the status of the restore operation.
The `control.sh|bat` script provides the ability to start and stop the restore operation.

[source,shell]
----
Expand All @@ -234,13 +234,41 @@ control.(sh|bat) --snapshot restore snapshot_09062021 --src /tmp/ignite/snapshot
# Start restoring only "cache-group1" and "cache-group2" from the snapshot "snapshot_09062021" in the background.
control.(sh|bat) --snapshot restore snapshot_09062021 --start --groups cache-group1,cache-group2

# Get the status of the restore operation for "snapshot_09062021".
control.(sh|bat) --snapshot restore snapshot_09062021 --status

# Cancel the restore operation for "snapshot_09062021".
control.(sh|bat) --snapshot restore snapshot_09062021 --cancel
----

== Getting Snapshot Operation Status

The status of the current snapshot operation in the cluster can be obtained using the `control.sh|bat` script or the JMX interface:

[tabs]
--
tab:Unix[]
[source,shell]
----
# Get the status of the snapshot operation.
control.sh --snapshot status
----

tab:Windows[]
[source,shell]
----
# Get the status of the snapshot operation.
control.bat --snapshot status
----

tab:JMX[]
You can also get the current snapshot status via the `SnapshotMXBean` interface:
[source,java]
----
SnapshotMXBean mxBean = ...;

// The status of the current snapshot operation in the cluster.
String status = mxBean.status();
----
--

== Consistency Guarantees

All snapshots are fully consistent in terms of concurrent cluster-wide operations as well as ongoing changes with Ignite.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import static org.apache.ignite.internal.commandline.snapshot.SnapshotRestoreCommandOption.SYNC;
import static org.apache.ignite.internal.commandline.snapshot.SnapshotSubcommands.RESTORE;
import static org.apache.ignite.internal.visor.snapshot.VisorSnapshotRestoreTaskAction.START;
import static org.apache.ignite.internal.visor.snapshot.VisorSnapshotRestoreTaskAction.STATUS;

/**
* Sub-command to restore snapshot.
Expand All @@ -49,6 +50,9 @@ protected SnapshotRestoreCommand() {

/** {@inheritDoc} */
@Override public Object execute(GridClientConfiguration clientCfg, Logger log) throws Exception {
if (cmdArg instanceof VisorSnapshotRestoreTaskArg && ((VisorSnapshotRestoreTaskArg)cmdArg).jobAction() == STATUS)
log.warning("Command deprecated. Use '" + SNAPSHOT + ' ' + SnapshotSubcommands.STATUS + "' instead.");

Object res = super.execute(clientCfg, log);

log.info(String.valueOf(res));
Expand Down Expand Up @@ -122,7 +126,8 @@ else if (option == SYNC) {

usage(log, "Restore snapshot:", SNAPSHOT, startParams, RESTORE.toString(), SNAPSHOT_NAME_ARG, "--start",
optional(GROUPS.argName(), GROUPS.arg()), optional(SOURCE.argName(), SOURCE.arg()), optional(SYNC.argName()));
usage(log, "Snapshot restore operation status:", SNAPSHOT, params, RESTORE.toString(), SNAPSHOT_NAME_ARG, "--status");
usage(log, "Snapshot restore operation status (Command deprecated. Use '" + SNAPSHOT + ' '
+ SnapshotSubcommands.STATUS + "' instead.):", SNAPSHOT, params, RESTORE.toString(), SNAPSHOT_NAME_ARG, "--status");
usage(log, "Cancel snapshot restore operation:", SNAPSHOT, params, RESTORE.toString(), SNAPSHOT_NAME_ARG, "--cancel");
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.ignite.internal.commandline.snapshot;

import java.text.DateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import org.apache.ignite.internal.commandline.CommandArgIterator;
import org.apache.ignite.internal.commandline.systemview.SystemViewCommand;
import org.apache.ignite.internal.util.GridStringBuilder;
import org.apache.ignite.internal.util.typedef.F;
import org.apache.ignite.internal.util.typedef.X;
import org.apache.ignite.internal.util.typedef.internal.U;
import org.apache.ignite.internal.visor.snapshot.VisorSnapshotStatusTask;
import org.apache.ignite.internal.visor.snapshot.VisorSnapshotStatusTask.SnapshotStatus;

import static org.apache.ignite.internal.commandline.CommandList.SNAPSHOT;
import static org.apache.ignite.internal.commandline.snapshot.SnapshotSubcommands.STATUS;
import static org.apache.ignite.internal.visor.systemview.VisorSystemViewTask.SimpleType.NUMBER;
import static org.apache.ignite.internal.visor.systemview.VisorSystemViewTask.SimpleType.STRING;

/**
* Command to get the status of the current snapshot operation in the cluster.
*/
public class SnapshotStatusCommand extends SnapshotSubcommand {
/** */
protected SnapshotStatusCommand() {
super("status", VisorSnapshotStatusTask.class);
}

/** {@inheritDoc} */
@Override protected void printResult(Object res, Logger log) {
if (res == null) {
log.info("There is no create or restore snapshot operation in progress.");

return;
}

SnapshotStatus status = (SnapshotStatus)res;

boolean isCreating = status.operation() == VisorSnapshotStatusTask.SnapshotOperation.CREATE;

GridStringBuilder s = new GridStringBuilder();

if (isCreating)
s.a("Create snapshot operation is in progress.").nl();
else
s.a("Restore snapshot operation is in progress.").nl();

s.a("Snapshot name: ").a(status.name()).nl();
s.a("Operation ID: ").a(status.requestId()).nl();
s.a("Started at: ").a(DateFormat.getDateTimeInstance().format(new Date(status.startTime()))).nl();
s.a("Duration: ").a(X.timeSpan2DHMSM(System.currentTimeMillis() - status.startTime())).nl()
.nl();
s.a("Estimated operation progress:").nl();

log.info(s.toString());

List<String> titles = isCreating ? F.asList("Node ID", "Processed, bytes", "Total, bytes", "Percent") :
F.asList("Node ID", "Processed, partitions", "Total, partitions", "Percent");

List<List<?>> rows = status.progress().entrySet().stream().sorted(Map.Entry.comparingByKey()).map(e -> {
UUID nodeId = e.getKey();
long processed = e.getValue().get1();
long total = e.getValue().get2();

if (total <= 0)
return F.asList(nodeId, "unknown", "unknown", "unknown");

String percent = (int)(processed * 100 / total) + "%";

if (isCreating)
return F.asList(nodeId, U.humanReadableByteCount(processed), U.humanReadableByteCount(total), percent);
else
return F.asList(nodeId, processed, total, percent);
}).collect(Collectors.toList());

SystemViewCommand.printTable(titles, F.asList(STRING, NUMBER, NUMBER, NUMBER),
rows, log);

log.info(U.nl());
}

/** {@inheritDoc} */
@Override public void parseArguments(CommandArgIterator argIter) {
// No-op.
NSAmelchev marked this conversation as resolved.
Show resolved Hide resolved
}

/** {@inheritDoc} */
@Override public void printUsage(Logger log) {
usage(log, "Get the status of the current snapshot operation:", SNAPSHOT, STATUS.toString());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ public enum SnapshotSubcommands {
CHECK(new SnapshotCheckCommand()),

/** Sub-command to restore snapshot. */
RESTORE(new SnapshotRestoreCommand());
RESTORE(new SnapshotRestoreCommand()),

/** Sub-command to get the status of the current snapshot operation. */
STATUS(new SnapshotStatusCommand());

/** Sub-command. */
private final SnapshotSubcommand cmd;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
import java.util.logging.Logger;
import java.util.stream.Collectors;
import org.apache.ignite.Ignite;
import org.apache.ignite.IgniteCache;
import org.apache.ignite.IgniteDataStreamer;
import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction;
import org.apache.ignite.cluster.ClusterNode;
import org.apache.ignite.configuration.AtomicConfiguration;
Expand Down Expand Up @@ -455,9 +455,10 @@ protected void createCacheAndPreload(

ignite.createCache(ccfg);

IgniteCache<Object, Object> cache = ignite.cache(cacheName);
for (int i = 0; i < countEntries; i++)
cache.put(i, i);
try (IgniteDataStreamer<Object, Object> streamer = ignite.dataStreamer(cacheName)) {
for (int i = 0; i < countEntries; i++)
streamer.addData(i, i);
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
import org.apache.ignite.internal.processors.cluster.ChangeGlobalStateFinishMessage;
import org.apache.ignite.internal.processors.cluster.GridClusterStateProcessor;
import org.apache.ignite.internal.util.BasicRateLimiter;
import org.apache.ignite.internal.util.distributed.SingleNodeMessage;
import org.apache.ignite.internal.util.future.IgniteFinishedFutureImpl;
import org.apache.ignite.internal.util.lang.GridAbsPredicate;
import org.apache.ignite.internal.util.lang.GridFunc;
Expand Down Expand Up @@ -3636,9 +3637,92 @@ public void testSnapshotRestoreCancelAndStatus() throws Exception {
assertNull(ig.cache(DEFAULT_CACHE_NAME));
}

/**
 * Checks the output of the snapshot status command before, during and after
 * the create and restore snapshot operations.
 *
 * @throws Exception If failed.
 */
@Test
public void testSnapshotStatus() throws Exception {
    String snpName = "snapshot1";

    IgniteEx ignite = startGrids(3);

    ignite.cluster().state(ACTIVE);

    createCacheAndPreload(ignite, 10_000);

    // Nothing has been started yet.
    checkSnapshotStatus(false, false, null);

    TestRecordingCommunicationSpi commSpi = TestRecordingCommunicationSpi.spi(grid(1));

    // Block single-node messages on grid(1) so the status can be queried while the operation is running.
    commSpi.blockMessages((node, msg) -> msg instanceof SingleNodeMessage);

    IgniteFuture<Void> createFut = ignite.snapshot().createSnapshot(snpName);

    commSpi.waitForBlocked();

    checkSnapshotStatus(true, false, snpName);

    commSpi.stopBlock();

    createFut.get(getTestTimeout());

    checkSnapshotStatus(false, false, null);

    // Same scenario for the restore operation: the cache is destroyed first.
    ignite.destroyCache(DEFAULT_CACHE_NAME);

    commSpi.blockMessages((node, msg) -> msg instanceof SingleNodeMessage);

    IgniteFuture<Void> restoreFut = ignite.snapshot().restoreSnapshot(snpName, F.asList(DEFAULT_CACHE_NAME));

    commSpi.waitForBlocked();

    checkSnapshotStatus(false, true, snpName);

    commSpi.stopBlock();

    restoreFut.get(getTestTimeout());

    checkSnapshotStatus(false, false, null);
}

/**
 * Waits until every started node reports the expected create/restore state, then runs
 * {@code --snapshot status} and verifies the command output.
 *
 * @param isCreating {@code True} if create snapshot operation is in progress.
 * @param isRestoring {@code True} if restore snapshot operation is in progress.
 * @param expName Expected snapshot name.
 * @throws Exception If failed.
 */
private void checkSnapshotStatus(boolean isCreating, boolean isRestoring, String expName) throws Exception {
    // The state is reached when no node disagrees with the expected flags.
    boolean stateReached = waitForCondition(
        () -> G.allGrids().stream().noneMatch(node -> {
            IgniteSnapshotManager snpMgr = ((IgniteEx)node).context().cache().context().snapshotMgr();

            return isCreating != snpMgr.isSnapshotCreating() || isRestoring != snpMgr.isRestoring();
        }),
        getTestTimeout());

    assertTrue(stateReached);

    injectTestSystemOut();

    int exitCode = execute("--snapshot", "status");

    // NOTE(review): if the captured output is not reset between calls, it may still hold text
    // from earlier invocations in the same test - confirm against the base test class.
    String out = testOut.toString();

    assertEquals(out, EXIT_CODE_OK, exitCode);

    if (!(isCreating || isRestoring)) {
        assertContains(log, out, "There is no create or restore snapshot operation in progress.");

        return;
    }

    String expOperationMsg = isCreating
        ? "Create snapshot operation is in progress."
        : "Restore snapshot operation is in progress.";

    assertContains(log, out, expOperationMsg);

    assertContains(log, out, "Snapshot name: " + expName);

    // The ID of every started node must appear in the output.
    G.allGrids().stream()
        .map(node -> node.cluster().localNode().id().toString())
        .forEach(nodeId -> assertContains(log, out, nodeId));
}

/** @throws Exception If failed. */
@Test
@WithSystemProperty(key = IGNITE_PDS_SKIP_CHECKPOINT_ON_NODE_STOP, value = "true")
public void testCleaningGarbageAfterCacheDestroyedAndNodeStop_ControlConsoleUtil() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,11 @@ public boolean isSnapshotCreating() {
}
}

/**
 * Gets the request of the create snapshot operation that is currently in progress.
 *
 * @return Current create snapshot request, or {@code null} if there is no create snapshot operation in progress.
 */
@Nullable public SnapshotOperationRequest currentCreateRequest() {
    SnapshotOperationRequest curReq = clusterSnpReq;

    return curReq;
}

/**
* Check if snapshot restore process is currently running.
*
Expand Down