From 97225b16e712640ba4009748ead1e1b52f30a97f Mon Sep 17 00:00:00 2001
From: Adam Lerman
Date: Thu, 6 Jun 2019 17:09:48 -0400
Subject: [PATCH] Add logging of top tablets to enable better hot spot detection

---
 .../apache/accumulo/core/conf/Property.java   |  3 ++
 .../apache/accumulo/tserver/TabletServer.java | 65 +++++++++++++++++++
 .../accumulo/tserver/tablet/Tablet.java       |  4 ++
 3 files changed, 72 insertions(+)

diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
index 7cca3c89c87..ab396a4c09c 100644
--- a/core/src/main/java/org/apache/accumulo/core/conf/Property.java
+++ b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
@@ -485,6 +485,9 @@ public enum Property {
       "The time between adjustments of the server thread pool."),
   TSERV_MAX_MESSAGE_SIZE("tserver.server.message.size.max", "1G", PropertyType.MEMORY,
       "The maximum size of a message that can be sent to a tablet server."),
+  TSERV_LOG_TOP_TABLETS_COUNT("tserver.log.top.tablets.count", "0", PropertyType.COUNT,
+      "Number of top tablets to log when saving tablet stats. If <= 0, logging "
+          + "of top tablets is disabled"),
   TSERV_HOLD_TIME_SUICIDE("tserver.hold.time.max", "5m", PropertyType.TIMEDURATION,
       "The maximum time for a tablet server to be in the \"memory full\" state."
           + " If the tablet server cannot write out memory in this much time, it will"
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index 45b2c9d24ca..ae1be4ce5f8 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -40,6 +40,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.PriorityQueue;
 import java.util.Random;
 import java.util.Set;
 import java.util.SortedMap;
@@ -358,17 +359,81 @@ public TabletServer(ServerConfigurationFactory confFactory, VolumeManager fs) th
     this.logSorter = new LogSorter(instance, fs, aconf);
     this.replWorker = new ReplicationWorker(this, fs);
     this.statsKeeper = new TabletStatsKeeper();
+    final int numTopTabletsToLog = aconf.getCount(Property.TSERV_LOG_TOP_TABLETS_COUNT);
+    final boolean logTopTablets = numTopTabletsToLog > 0;
     SimpleTimer.getInstance(aconf).schedule(new Runnable() {
       @Override
       public void run() {
+        // Build the queues only when top-tablet logging is enabled: PriorityQueue rejects an
+        // initial capacity below 1, which is what the default setting of 0 would request.
+        PriorityQueue<Pair<Tablet,Long>> topTabletsByIngestCount = null;
+        PriorityQueue<Pair<Tablet,Long>> topTabletsByQueryCount = null;
+        if (logTopTablets) {
+          topTabletsByIngestCount = newTopTabletQueue();
+          topTabletsByQueryCount = newTopTabletQueue();
+        }
+
         synchronized (onlineTablets) {
           long now = System.currentTimeMillis();
           for (Tablet tablet : onlineTablets.values())
             try {
               tablet.updateRates(now);
+              if (logTopTablets) {
+                addToTopTablets(tablet, tablet.totalIngest(), topTabletsByIngestCount,
+                    numTopTabletsToLog);
+                addToTopTablets(tablet, tablet.totalQueries(), topTabletsByQueryCount,
+                    numTopTabletsToLog);
+              }
             } catch (Exception ex) {
               log.error("Error updating rates for {}", tablet.getExtent(), ex);
             }
         }
+
+        // The queues are local to this run, so log them after leaving the synchronized block.
+        if (logTopTablets) {
+          logTopTabletsByCount(topTabletsByIngestCount, "INGEST");
+          logTopTabletsByCount(topTabletsByQueryCount, "QUERY");
+        }
       }
+
+      /**
+       * Creates an empty queue ordered by ascending count, so its head is the least busy
+       * tablet retained so far, i.e. the one to evict when a busier tablet shows up.
+       */
+      private PriorityQueue<Pair<Tablet,Long>> newTopTabletQueue() {
+        Comparator<Pair<Tablet,Long>> byAscendingCount = new Comparator<Pair<Tablet,Long>>() {
+          @Override
+          public int compare(Pair<Tablet,Long> first, Pair<Tablet,Long> second) {
+            return first.getSecond().compareTo(second.getSecond());
+          }
+        };
+        return new PriorityQueue<>(numTopTabletsToLog, byAscendingCount);
+      }
+
+      /**
+       * Offers a tablet to a queue holding at most maxSize entries. Once the queue is full, the
+       * tablet is kept only if its count beats the smallest retained count, which is evicted.
+       */
+      private void addToTopTablets(Tablet tablet, long count,
+          PriorityQueue<Pair<Tablet,Long>> topTablets, int maxSize) {
+        if (topTablets.size() < maxSize || topTablets.peek().getSecond() < count) {
+          if (topTablets.size() == maxSize) {
+            topTablets.remove();
+          }
+          topTablets.add(new Pair<Tablet,Long>(tablet, count));
+        }
+      }
+
+      /**
+       * Drains the queue, logging one line per tablet. Entries come out in ascending count
+       * order, so the rank counts down to 0, which marks the busiest tablet.
+       */
+      private void logTopTabletsByCount(PriorityQueue<Pair<Tablet,Long>> topTablets,
+          String label) {
+        for (int rank = topTablets.size() - 1; rank >= 0; rank--) {
+          Pair<Tablet,Long> pair = topTablets.poll();
+          log.debug("Top {} tablet by {} count -- extent: {} count: {}", rank, label,
+              pair.getFirst().getExtent(), pair.getSecond());
+        }
+      }
     }, 5000, 5000);
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
index 3366e9e4712..3af8423fb2f 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
@@ -2395,6 +2395,10 @@ public long totalQueries() {
     return this.queryCount;
   }
 
+  public long totalIngest() {
+    return this.ingestCount;
+  }
+
   // synchronized?
   public void updateRates(long now) {
     queryRate.update(now, queryCount);