Skip to content

Commit

Permalink
ignite-5125 Do not limit number of hanging objects dumps, instead inc…
Browse files Browse the repository at this point in the history
…rease time between dumps.
  • Loading branch information
zstan authored and sboikov committed Jun 6, 2017
1 parent ec5c3a2 commit 808ad9f
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 77 deletions.
Expand Up @@ -451,6 +451,9 @@ public final class IgniteSystemProperties {
/** Cache operations that take more time than value of this property will be output to log. Set to {@code 0} to disable. */ /** Cache operations that take more time than value of this property will be output to log. Set to {@code 0} to disable. */
public static final String IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT = "IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT"; public static final String IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT = "IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT";


/** Upper time limit between long running/hanging operations debug dumps. */
public static final String IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT = "IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT";

/** JDBC driver cursor remove delay. */ /** JDBC driver cursor remove delay. */
public static final String IGNITE_JDBC_DRIVER_CURSOR_REMOVE_DELAY = "IGNITE_JDBC_DRIVER_CURSOR_RMV_DELAY"; public static final String IGNITE_JDBC_DRIVER_CURSOR_REMOVE_DELAY = "IGNITE_JDBC_DRIVER_CURSOR_RMV_DELAY";


Expand Down
Expand Up @@ -110,6 +110,7 @@
import static org.apache.ignite.internal.GridTopic.TOPIC_CACHE; import static org.apache.ignite.internal.GridTopic.TOPIC_CACHE;
import static org.apache.ignite.internal.events.DiscoveryCustomEvent.EVT_DISCOVERY_CUSTOM_EVT; import static org.apache.ignite.internal.events.DiscoveryCustomEvent.EVT_DISCOVERY_CUSTOM_EVT;
import static org.apache.ignite.internal.managers.communication.GridIoPolicy.SYSTEM_POOL; import static org.apache.ignite.internal.managers.communication.GridIoPolicy.SYSTEM_POOL;
import static org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.*;
import static org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPreloader.DFLT_PRELOAD_RESEND_TIMEOUT; import static org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPreloader.DFLT_PRELOAD_RESEND_TIMEOUT;


/** /**
Expand Down Expand Up @@ -175,7 +176,10 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana
private volatile IgniteCheckedException stopErr; private volatile IgniteCheckedException stopErr;


/** */ /** */
private int longRunningOpsDumpCnt; private long nextLongRunningOpsDumpTime;

/** */
private int longRunningOpsDumpStep;


/** */ /** */
private DateFormat dateFormat = new SimpleDateFormat("HH:mm:ss.SSS"); private DateFormat dateFormat = new SimpleDateFormat("HH:mm:ss.SSS");
Expand Down Expand Up @@ -1405,84 +1409,83 @@ public void dumpDebugInfo(@Nullable AffinityTopologyVersion exchTopVer) throws E


/** /**
* @param timeout Operation timeout. * @param timeout Operation timeout.
* @return {@code True} if found long running operations.
*/ */
public void dumpLongRunningOperations(long timeout) { private boolean dumpLongRunningOperations0(long timeout) {
try { long curTime = U.currentTimeMillis();
GridDhtPartitionsExchangeFuture lastFut = lastInitializedFut;


// If exchange is in progress it will dump all hanging operations if any. boolean found = false;
if (lastFut != null && !lastFut.isDone())
return;

long curTime = U.currentTimeMillis();


boolean found = false; IgniteTxManager tm = cctx.tm();


IgniteTxManager tm = cctx.tm(); GridCacheMvccManager mvcc = cctx.mvcc();


if (tm != null) { if (tm != null) {
for (IgniteInternalTx tx : tm.activeTransactions()) { for (IgniteInternalTx tx : tm.activeTransactions()) {
if (curTime - tx.startTime() > timeout) { if (curTime - tx.startTime() > timeout) {
found = true; found = true;


if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { U.warn(log, "Found long running transaction [startTime=" + formatTime(tx.startTime()) +
U.warn(log, "Found long running transaction [startTime=" + formatTime(tx.startTime()) + ", curTime=" + formatTime(curTime) + ", tx=" + tx + ']');
", curTime=" + formatTime(curTime) + ", tx=" + tx + ']');
}
else
break;
}
} }
} }
}


GridCacheMvccManager mvcc = cctx.mvcc(); if (mvcc != null) {

for (GridCacheFuture<?> fut : mvcc.activeFutures()) {
if (mvcc != null) { if (curTime - fut.startTime() > timeout) {
for (GridCacheFuture<?> fut : mvcc.activeFutures()) { found = true;
if (curTime - fut.startTime() > timeout) {
found = true;


if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) +
U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) + ", curTime=" + formatTime(curTime) + ", fut=" + fut + ']');
", curTime=" + formatTime(curTime) + ", fut=" + fut + ']');
}
else
break;
}
} }
}


for (GridCacheFuture<?> fut : mvcc.atomicFutures()) { for (GridCacheFuture<?> fut : mvcc.atomicFutures()) {
if (curTime - fut.startTime() > timeout) { if (curTime - fut.startTime() > timeout) {
found = true; found = true;


if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) +
U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) + ", curTime=" + formatTime(curTime) + ", fut=" + fut + ']');
", curTime=" + formatTime(curTime) + ", fut=" + fut + ']');
}
else
break;
}
} }
} }
}


if (found) { return found;
if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { }
longRunningOpsDumpCnt++;


if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) { /**
U.warn(log, "Found long running cache operations, dump threads."); * @param timeout Operation timeout.
*/
public void dumpLongRunningOperations(long timeout) {
try {
GridDhtPartitionsExchangeFuture lastFut = lastInitializedFut;


U.dumpThreads(log); // If exchange is in progress it will dump all hanging operations if any.
} if (lastFut != null && !lastFut.isDone())
return;


U.warn(log, "Found long running cache operations, dump IO statistics."); if (U.currentTimeMillis() < nextLongRunningOpsDumpTime)
return;


// Dump IO manager statistics. if (dumpLongRunningOperations0(timeout)) {
cctx.gridIO().dumpStats(); nextLongRunningOpsDumpTime = U.currentTimeMillis() + nextDumpTimeout(longRunningOpsDumpStep++, timeout);

if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) {
U.warn(log, "Found long running cache operations, dump threads.");

U.dumpThreads(log);
} }

U.warn(log, "Found long running cache operations, dump IO statistics.");

// Dump IO manager statistics.
cctx.gridIO().dumpStats();
}
else {
nextLongRunningOpsDumpTime = 0;
longRunningOpsDumpStep = 0;
} }
else
longRunningOpsDumpCnt = 0;
} }
catch (Exception e) { catch (Exception e) {
U.error(log, "Failed to dump debug information: " + e, e); U.error(log, "Failed to dump debug information: " + e, e);
Expand Down Expand Up @@ -1774,11 +1777,15 @@ void dumpExchangeDebugInfo() {


exchFut.init(); exchFut.init();


int dumpedObjects = 0; int dumpCnt = 0;

final long futTimeout = 2 * cctx.gridConfig().getNetworkTimeout();

long nextDumpTime = 0;


while (true) { while (true) {
try { try {
exchFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); exchFut.get(futTimeout, TimeUnit.MILLISECONDS);


break; break;
} }
Expand All @@ -1788,7 +1795,7 @@ void dumpExchangeDebugInfo() {
", node=" + cctx.localNodeId() + "]. " + ", node=" + cctx.localNodeId() + "]. " +
"Dumping pending objects that might be the cause: "); "Dumping pending objects that might be the cause: ");


if (dumpedObjects < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { if (nextDumpTime <= U.currentTimeMillis()) {
try { try {
dumpDebugInfo(exchFut.topologyVersion()); dumpDebugInfo(exchFut.topologyVersion());
} }
Expand All @@ -1799,7 +1806,7 @@ void dumpExchangeDebugInfo() {
if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false))
U.dumpThreads(log); U.dumpThreads(log);


dumpedObjects++; nextDumpTime = U.currentTimeMillis() + nextDumpTimeout(dumpCnt++, futTimeout);
} }
} }
catch (Exception e) { catch (Exception e) {
Expand Down
Expand Up @@ -79,6 +79,10 @@ public class GridDhtPartitionTopologyImpl implements GridDhtPartitionTopology {
/** Flag to control amount of output for full map. */ /** Flag to control amount of output for full map. */
private static final boolean FULL_MAP_DEBUG = false; private static final boolean FULL_MAP_DEBUG = false;


/** */
private static final int DUMP_PENDING_OBJECTS_THRESHOLD =
IgniteSystemProperties.getInteger(IgniteSystemProperties.IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD, 10);

/** */ /** */
private static final Long ZERO = 0L; private static final Long ZERO = 0L;


Expand Down Expand Up @@ -234,7 +238,7 @@ private boolean waitForRent() throws IgniteCheckedException {
break; break;
} }
catch (IgniteFutureTimeoutCheckedException ignored) { catch (IgniteFutureTimeoutCheckedException ignored) {
if (dumpCnt++ < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { if (dumpCnt++ < DUMP_PENDING_OBJECTS_THRESHOLD) {
U.warn(log, "Failed to wait for partition eviction [" + U.warn(log, "Failed to wait for partition eviction [" +
"topVer=" + topVer + "topVer=" + topVer +
", group=" + grp.cacheOrGroupName() + ", group=" + grp.cacheOrGroupName() +
Expand Down
Expand Up @@ -35,7 +35,6 @@
import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReadWriteLock;
import org.apache.ignite.IgniteCheckedException; import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteLogger; import org.apache.ignite.IgniteLogger;
import org.apache.ignite.IgniteSystemProperties;
import org.apache.ignite.cache.PartitionLossPolicy; import org.apache.ignite.cache.PartitionLossPolicy;
import org.apache.ignite.cluster.ClusterNode; import org.apache.ignite.cluster.ClusterNode;
import org.apache.ignite.events.CacheEvent; import org.apache.ignite.events.CacheEvent;
Expand Down Expand Up @@ -90,7 +89,10 @@
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
import org.jsr166.ConcurrentHashMap8; import org.jsr166.ConcurrentHashMap8;


import static org.apache.ignite.IgniteSystemProperties.IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT;
import static org.apache.ignite.IgniteSystemProperties.IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT; import static org.apache.ignite.IgniteSystemProperties.IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT;
import static org.apache.ignite.IgniteSystemProperties.getBoolean;
import static org.apache.ignite.IgniteSystemProperties.getLong;
import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_ALL; import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_ALL;
import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_SAFE; import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_SAFE;
import static org.apache.ignite.cache.PartitionLossPolicy.READ_WRITE_ALL; import static org.apache.ignite.cache.PartitionLossPolicy.READ_WRITE_ALL;
Expand All @@ -107,10 +109,6 @@
@SuppressWarnings({"TypeMayBeWeakened", "unchecked"}) @SuppressWarnings({"TypeMayBeWeakened", "unchecked"})
public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityTopologyVersion> public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityTopologyVersion>
implements Comparable<GridDhtPartitionsExchangeFuture>, GridDhtTopologyFuture, CachePartitionExchangeWorkerTask { implements Comparable<GridDhtPartitionsExchangeFuture>, GridDhtTopologyFuture, CachePartitionExchangeWorkerTask {
/** */
public static final int DUMP_PENDING_OBJECTS_THRESHOLD =
IgniteSystemProperties.getInteger(IgniteSystemProperties.IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD, 10);

/** Dummy flag. */ /** Dummy flag. */
private final boolean dummy; private final boolean dummy;


Expand Down Expand Up @@ -880,20 +878,24 @@ private void waitPartitionRelease() throws IgniteCheckedException {
if (log.isDebugEnabled()) if (log.isDebugEnabled())
log.debug("Before waiting for partition release future: " + this); log.debug("Before waiting for partition release future: " + this);


int dumpedObjects = 0; int dumpCnt = 0;

long nextDumpTime = 0;

long futTimeout = 2 * cctx.gridConfig().getNetworkTimeout();


while (true) { while (true) {
try { try {
partReleaseFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); partReleaseFut.get(futTimeout, TimeUnit.MILLISECONDS);


break; break;
} }
catch (IgniteFutureTimeoutCheckedException ignored) { catch (IgniteFutureTimeoutCheckedException ignored) {
// Print pending transactions and locks that might have led to hang. // Print pending transactions and locks that might have led to hang.
if (dumpedObjects < DUMP_PENDING_OBJECTS_THRESHOLD) { if (nextDumpTime <= U.currentTimeMillis()) {
dumpPendingObjects(); dumpPendingObjects();


dumpedObjects++; nextDumpTime = U.currentTimeMillis() + nextDumpTimeout(dumpCnt++, futTimeout);
} }
} }
} }
Expand All @@ -903,16 +905,17 @@ private void waitPartitionRelease() throws IgniteCheckedException {


IgniteInternalFuture<?> locksFut = cctx.mvcc().finishLocks(exchId.topologyVersion()); IgniteInternalFuture<?> locksFut = cctx.mvcc().finishLocks(exchId.topologyVersion());


dumpedObjects = 0; nextDumpTime = 0;
dumpCnt = 0;


while (true) { while (true) {
try { try {
locksFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); locksFut.get(futTimeout, TimeUnit.MILLISECONDS);


break; break;
} }
catch (IgniteFutureTimeoutCheckedException ignored) { catch (IgniteFutureTimeoutCheckedException ignored) {
if (dumpedObjects < DUMP_PENDING_OBJECTS_THRESHOLD) { if (nextDumpTime <= U.currentTimeMillis()) {
U.warn(log, "Failed to wait for locks release future. " + U.warn(log, "Failed to wait for locks release future. " +
"Dumping pending objects that might be the cause: " + cctx.localNodeId()); "Dumping pending objects that might be the cause: " + cctx.localNodeId());


Expand All @@ -930,9 +933,9 @@ private void waitPartitionRelease() throws IgniteCheckedException {
for (Map.Entry<IgniteTxKey, Collection<GridCacheMvccCandidate>> e : locks.entrySet()) for (Map.Entry<IgniteTxKey, Collection<GridCacheMvccCandidate>> e : locks.entrySet())
U.warn(log, "Awaited locked entry [key=" + e.getKey() + ", mvcc=" + e.getValue() + ']'); U.warn(log, "Awaited locked entry [key=" + e.getKey() + ", mvcc=" + e.getValue() + ']');


dumpedObjects++; nextDumpTime = U.currentTimeMillis() + nextDumpTimeout(dumpCnt++, futTimeout);


if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) if (getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false))
U.dumpThreads(log); U.dumpThreads(log);
} }
} }
Expand Down Expand Up @@ -1026,7 +1029,7 @@ private void dumpPendingObjects() {
U.error(log, "Failed to dump debug information: " + e, e); U.error(log, "Failed to dump debug information: " + e, e);
} }


if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) if (getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false))
U.dumpThreads(log); U.dumpThreads(log);
} }


Expand Down Expand Up @@ -2195,4 +2198,27 @@ private CounterWithNodes(long cnt, UUID firstNode) {
return S.toString(CounterWithNodes.class, this); return S.toString(CounterWithNodes.class, this);
} }
} }

/**
* @param step Exponent coefficient.
* @param timeout Base timeout.
* @return Time to wait before next debug dump.
*/
public static long nextDumpTimeout(int step, long timeout) {
long limit = getLong(IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT, 30 * 60_000);

if (limit <= 0)
limit = 30 * 60_000;

assert step >= 0 : step;

long dumpFactor = Math.round(Math.pow(2, step));

long nextTimeout = timeout * dumpFactor;

if (nextTimeout <= 0)
return limit;

return nextTimeout <= limit ? nextTimeout : limit;
}
} }

0 comments on commit 808ad9f

Please sign in to comment.