From 8cd227848b7de0552712fab964cc59d4691c18ad Mon Sep 17 00:00:00 2001 From: Junzhi Peng <201250214@smail.nju.edu.cn> Date: Thu, 20 Jul 2023 09:55:38 +0800 Subject: [PATCH 01/22] Add jvm gc monitor and alerter --- .../iotdb/confignode/service/ConfigNode.java | 2 + .../metrics/DataNodeMetricsHelper.java | 4 + .../commons/service/metric/GcTimeAlerter.java | 26 ++ .../commons/service/metric/JvmGcMetrics.java | 241 ++++++++++++++++++ 4 files changed, 273 insertions(+) create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index e446bec812c8b..04740eb13aeee 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -28,6 +28,7 @@ import org.apache.iotdb.commons.exception.StartupException; import org.apache.iotdb.commons.service.JMXService; import org.apache.iotdb.commons.service.RegisterManager; +import org.apache.iotdb.commons.service.metric.JvmGcMetrics; import org.apache.iotdb.commons.service.metric.MetricService; import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.confignode.client.ConfigNodeRequestType; @@ -244,6 +245,7 @@ private void setUpMetricService() throws StartupException { MetricService.getInstance().addMetricSet(new SystemMetrics(false)); MetricService.getInstance().addMetricSet(new DiskMetrics(IoTDBConstant.CN_ROLE)); MetricService.getInstance().addMetricSet(new NetMetrics(IoTDBConstant.CN_ROLE)); + MetricService.getInstance().addMetricSet(JvmGcMetrics.getInstance()); initCpuMetrics(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java index c2853fb2fa75b..24a15985c9232 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java @@ -23,6 +23,7 @@ import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.concurrent.ThreadPoolMetrics; import org.apache.iotdb.commons.conf.IoTDBConstant; +import org.apache.iotdb.commons.service.metric.JvmGcMetrics; import org.apache.iotdb.commons.service.metric.MetricService; import org.apache.iotdb.commons.service.metric.PerformanceOverviewMetrics; import org.apache.iotdb.db.queryengine.metric.DataExchangeCostMetricSet; @@ -72,6 +73,9 @@ public static void bind() { // bind performance overview related metrics MetricService.getInstance().addMetricSet(PerformanceOverviewMetrics.getInstance()); + + // bind gc metrics + MetricService.getInstance().addMetricSet(JvmGcMetrics.getInstance()); } private static void initCpuMetrics() { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java new file mode 100644 index 0000000000000..d102ea2312870 --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java @@ -0,0 +1,26 @@ +package org.apache.iotdb.commons.service.metric; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.text.SimpleDateFormat; +import java.util.Date; + +public class GcTimeAlerter implements JvmGcMetrics.GcTimeAlertHandler { + private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private static final Logger logger = LoggerFactory.getLogger(GcTimeAlerter.class); + + /** + * Alert handler func + * User can tailor their handle logic here + * @param gcData + */ + @Override + public void alert(JvmGcMetrics.GcData gcData) { + logger.warn("Error metrics taken time: " + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp()))))); + logger.warn("Time since monitor has been started: " + gcData.getGcMonitorRunTime() + " ms"); + logger.warn("Gc Time Percentage: " + gcData.getGcTimePercentage() + "%"); + logger.warn("Accumulated GC time: " + gcData.getAccumulatedGcTime() + " ms"); + logger.warn("Accumulated GC count: " + gcData.getAccumulatedGcCount()); + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java new file mode 100644 index 0000000000000..06260bc8302a0 --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java @@ -0,0 +1,241 @@ +package org.apache.iotdb.commons.service.metric; + +import com.google.common.base.Preconditions; +import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; +import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; +import org.apache.iotdb.metrics.AbstractMetricService; +import org.apache.iotdb.metrics.metricsets.IMetricSet; +import org.apache.iotdb.metrics.utils.MetricLevel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.management.GarbageCollectorMXBean; +import java.lang.management.ManagementFactory; +import java.util.concurrent.Future; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +public class JvmGcMetrics implements IMetricSet { + private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); + private final ScheduledExecutorService scheduledGCInfoMonitor = + IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor( + "JVM-GC-Statistics-Monitor"); + private Future scheduledGcMonitorFuture; + // Ring buffers containing GC timings and timestamps when timings were taken + private final TsAndData[] gcDataBuf; +// private final long maxGcTimePercentage = 0L; + private final long maxGcTimePercentage = 70L; + private final long observationWindowMs = TimeUnit.MINUTES.toMillis(1); +// private final long observationWindowMs = TimeUnit.SECONDS.toMillis(1); + private final long sleepIntervalMs = TimeUnit.SECONDS.toMillis(5); +// private final long sleepIntervalMs = TimeUnit.MILLISECONDS.toMillis(1000); + private final int bufSize; + private int startIdx; + private int endIdx; + private long startTime; + private final GcData curData = new GcData(); + private boolean shouldRun = true; + private final GcTimeAlertHandler alertHandler; + + public JvmGcMetrics() { + bufSize = (int) (observationWindowMs / sleepIntervalMs + 2); + // Prevent the user from accidentally creating an abnormally big buffer, which will result in slow calculations and likely inaccuracy. + Preconditions.checkArgument(bufSize <= 128 * 1024); + gcDataBuf = new TsAndData[bufSize]; + for (int i = 0; i < bufSize; i++) { + gcDataBuf[i] = new TsAndData(); + } + + alertHandler = new GcTimeAlerter(); + } + + @Override + public void bindTo(AbstractMetricService metricService) { + metricService.createAutoGauge( + "jvm_gc_accumulated_time_percentage", MetricLevel.CORE, curData, GcData::getGcTimePercentage + ); + + startTime = System.currentTimeMillis(); + curData.timestamp.set(startTime); + gcDataBuf[startIdx].setValues(startTime, 0); + scheduledGcMonitorFuture = ScheduledExecutorUtil.safelyScheduleWithFixedDelay( + scheduledGCInfoMonitor, + this::scheduledMonitoring, + TimeUnit.MILLISECONDS.toMillis(50), // to prevent / ZERO exception + sleepIntervalMs, + TimeUnit.MILLISECONDS + ); + } + + @Override + public void unbindFrom(AbstractMetricService metricService) { + shouldRun = false; + if (scheduledGcMonitorFuture != null) { + scheduledGcMonitorFuture.cancel(false); + scheduledGcMonitorFuture = null; + logger.info("JVM GC scheduled monitor is stopped successfully."); + } + } + + private void scheduledMonitoring() { + if (shouldRun) { + calculateGCTimePercentageWithinObservedInterval(); + } + // TODO: we can add an alertHandler here to handle the abnormal GC case. + if (alertHandler != null && + curData.gcTimePercentage.get() > maxGcTimePercentage) { + alertHandler.alert(curData.clone()); + } + } + + private void calculateGCTimePercentageWithinObservedInterval() { + long prevTotalGcTime = curData.totalGcTime.get(); + long totalGcTime = 0; + long totalGcCount = 0; + for (GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans()) { + totalGcTime += gcBean.getCollectionTime(); + totalGcCount += gcBean.getCollectionCount(); + } + long gcTimeWithinSleepInterval = totalGcTime - prevTotalGcTime; + + long curTime = System.currentTimeMillis(); + long gcMonitorRunTime = curTime - startTime; + + endIdx = (endIdx + 1) % bufSize; + gcDataBuf[endIdx].setValues(curTime, gcTimeWithinSleepInterval); + + // Move startIdx forward until we reach the first buffer entry with + // timestamp within the observation window. + long startObsWindowTs = curTime - observationWindowMs; + while (gcDataBuf[startIdx].ts < startObsWindowTs && startIdx != endIdx) { + startIdx = (startIdx + 1) % bufSize; + } + + // Calculate total GC time within observationWindowMs. + // We should be careful about GC time that passed before the first timestamp + // in our observation window. + long gcTimeWithinObservationWindow = Math.min( + gcDataBuf[startIdx].gcPause, gcDataBuf[startIdx].ts - startObsWindowTs); + if (startIdx != endIdx) { + for (int i = (startIdx + 1) % bufSize; i != endIdx; + i = (i + 1) % bufSize) { + gcTimeWithinObservationWindow += gcDataBuf[i].gcPause; + } + } + + curData.update(curTime, gcMonitorRunTime, totalGcTime, totalGcCount, + (int) (gcTimeWithinObservationWindow * 100 / + Math.min(observationWindowMs, gcMonitorRunTime))); + } + + + /** + * Encapsulates data about GC pauses measured at the specific timestamp. + */ + public static class GcData implements Cloneable { + private final AtomicLong timestamp = new AtomicLong(); + private AtomicLong gcMonitorRunTime = new AtomicLong(); + private AtomicLong totalGcTime = new AtomicLong(); + private AtomicLong totalGcCount = new AtomicLong(); + private AtomicLong gcTimePercentage = new AtomicLong(); + + /** + * Returns the absolute timestamp when this measurement was taken. + * + * @return timestamp. + */ + public long getTimestamp() { + return timestamp.get(); + } + + /** + * Returns the time since the start of the associated GCTimeMonitor. + * + * @return GcMonitorRunTime. + */ + public long getGcMonitorRunTime() { + return gcMonitorRunTime.get(); + } + + /** + * Returns accumulated GC time since this JVM started. + * + * @return AccumulatedGcTime. + */ + public long getAccumulatedGcTime() { + return totalGcTime.get(); + } + + /** + * Returns the accumulated number of GC pauses since this JVM started. + * + * @return AccumulatedGcCount. + */ + public long getAccumulatedGcCount() { + return totalGcCount.get(); + } + + /** + * Returns the percentage (0..100) of time that the JVM spent in GC pauses + * within the observation window of the associated GCTimeMonitor. + * + * @return GcTimePercentage. + */ + public long getGcTimePercentage() { + return gcTimePercentage.get(); + } + + private synchronized void update(long inTimestamp, long inGcMonitorRunTime, + long inTotalGcTime, long inTotalGcCount, int inGcTimePercentage) { + this.timestamp.set(inTimestamp); + this.gcMonitorRunTime.set(inGcMonitorRunTime); + this.totalGcTime.set(inTotalGcTime); + this.totalGcCount.set(inTotalGcCount); + this.gcTimePercentage.set(inGcTimePercentage); + } + + @Override + public synchronized GcData clone() { + try { + return (GcData) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + } + + private static class TsAndData { + private long ts; // Timestamp when this measurement was taken + private long gcPause; // Total GC pause time within the interval between ts + + // and the timestamp of the previous measurement. + void setValues(long inTs, long inGcPause) { + this.ts = inTs; + this.gcPause = inGcPause; + } + } + + /** + * The user can provide an instance of a class implementing this interface + * when initializing a GCTimeMonitor to receive alerts when GC time + * percentage exceeds the specified threshold. + */ + public interface GcTimeAlertHandler { + void alert(GcData gcData); + } + + private static class JvmGcMetricsHolder { + + private static final JvmGcMetrics INSTANCE = new JvmGcMetrics(); + + private JvmGcMetricsHolder() { + // empty constructor + } + } + + public static JvmGcMetrics getInstance() { + return JvmGcMetricsHolder.INSTANCE; + } + +} From 6fc2282d5cedd52b66f83dca663d396ede38da9a Mon Sep 17 00:00:00 2001 From: Junzhi Peng <201250214@smail.nju.edu.cn> Date: Thu, 20 Jul 2023 09:57:53 +0800 Subject: [PATCH 02/22] Fix concurrent GC duration calculation --- .../metrics/metricsets/jvm/JvmGcMetrics.java | 177 ++++++++++++++---- 1 file changed, 143 insertions(+), 34 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index 77264c868a934..d8a97ed6fa913 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -55,6 +55,9 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable { private String oldGenPoolName; private String nonGenerationalMemoryPool; private final List notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>(); + private AtomicLong lastGcTotalDuration = new AtomicLong(); + private AtomicLong totalGcTimeSpend = new AtomicLong(); + private double throughout = 0.0d; public JvmGcMetrics() { for (MemoryPoolMXBean mbean : ManagementFactory.getMemoryPoolMXBeans()) { @@ -91,6 +94,13 @@ public void bindTo(AbstractMetricService metricService) { metricService.createAutoGauge( "jvm_gc_live_data_size_bytes", MetricLevel.CORE, liveDataSize, AtomicLong::get); + AtomicLong heapMemUsedPercentage = new AtomicLong(calculateMemoryUsagePercentage()); + metricService.createAutoGauge( + "jvm_gc_memory_used_percent", MetricLevel.CORE, heapMemUsedPercentage, AtomicLong::get); + + metricService.createAutoGauge( + "jvm_gc_throughout", MetricLevel.CORE, this, JvmGcMetrics::getThroughput); + Counter allocatedBytes = metricService.getOrCreateCounter("jvm_gc_memory_allocated_bytes", MetricLevel.CORE); @@ -116,17 +126,61 @@ public void bindTo(AbstractMetricService metricService) { String gcAction = notificationInfo.getGcAction(); GcInfo gcInfo = notificationInfo.getGcInfo(); long duration = gcInfo.getDuration(); + + // The duration supplied in the notification info includes more than just + // application stopped time for concurrent GCs (since the concurrent phase is not stop-the-world). + // Try and do a better job coming up with a good stopped time + // value by asking for and tracking cumulative time spent blocked in GC. + if (isPartiallyConcurrentGC(mbean)) { + long previousTotal = lastGcTotalDuration.get(); + long total = mbean.getCollectionTime(); + lastGcTotalDuration.set(total); + duration = total - previousTotal; // may be zero for a really fast collection + } + totalGcTimeSpend.set(totalGcTimeSpend.get() + duration); + String timerName; if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time"; + timerName = "jvm_gc_concurrent_phase_time_"; } else { - timerName = "jvm_gc_pause"; + timerName = "jvm_gc_pause_"; } + // create a timer for each cause, which binds gcCause with gcDuration + timerName += gcCause; Timer timer = metricService.getOrCreateTimer( timerName, MetricLevel.CORE, "action", gcAction, "cause", gcCause); timer.update(duration, TimeUnit.MILLISECONDS); + // add support for ZGC + if (mbean.getName().equals("ZGC Cycles")) { + Counter cyclesCount = metricService.getOrCreateCounter("jvm_zgc_cycles_count", MetricLevel.CORE); + cyclesCount.inc(); + } + else if (mbean.getName().equals("ZGC Pauses")) { + Counter pausesCount = metricService.getOrCreateCounter("jvm_zgc_pauses_count", MetricLevel.CORE); + pausesCount.inc(); + } + + // monitoring old/young GC count, which is helpful for users to locate GC exception. + if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.OLD) { + Counter oldGcCounter = metricService.getOrCreateCounter("jvm_gc_old_gc_count", MetricLevel.CORE); + oldGcCounter.inc(); + } + else if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.YOUNG) { + Counter youngGcCounter = metricService.getOrCreateCounter("jvm_gc_young_gc_count", MetricLevel.CORE); + youngGcCounter.inc(); + } + + // update memory usage percentage + heapMemUsedPercentage.set(calculateMemoryUsagePercentage()); + + // update throughput, which is the + // percentage of GC time as total JVM running time + throughout = (double)((gcInfo.getEndTime() - totalGcTimeSpend.get())) / gcInfo.getEndTime() * 100L; + // Update promotion and allocation counters final Map before = gcInfo.getMemoryUsageBeforeGc(); final Map after = gcInfo.getMemoryUsageAfterGc(); @@ -147,35 +201,39 @@ public void bindTo(AbstractMetricService metricService) { return; } - if (oldGenPoolName != null) { - final long oldBefore = before.get(oldGenPoolName).getUsed(); - final long oldAfter = after.get(oldGenPoolName).getUsed(); - final long delta = oldAfter - oldBefore; - if (delta > 0L && promotedBytes != null) { - promotedBytes.inc(delta); - } - - // Some GC implementations such as G1 can reduce the old gen size as part of a minor - // GC. To track the - // live data size we record the value if we see a reduction in the old gen heap size - // or - // after a major GC. - if (oldAfter < oldBefore - || GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.OLD) { - liveDataSize.set(oldAfter); - final long oldMaxAfter = after.get(oldGenPoolName).getMax(); - maxDataSize.set(oldMaxAfter); - } - } - - if (youngGenPoolName != null) { - countPoolSizeDelta( - gcInfo.getMemoryUsageBeforeGc(), - gcInfo.getMemoryUsageAfterGc(), - allocatedBytes, - heapPoolSizeAfterGc, - youngGenPoolName); + // should add `else` here, since there are only two + // cases: generational and non-generational + else { + if (oldGenPoolName != null) { + final long oldBefore = before.get(oldGenPoolName).getUsed(); + final long oldAfter = after.get(oldGenPoolName).getUsed(); + final long delta = oldAfter - oldBefore; + if (delta > 0L && promotedBytes != null) { + promotedBytes.inc(delta); + } + + // Some GC implementations such as G1 can reduce the old gen size as part of a minor + // GC. To track the + // live data size we record the value if we see a reduction in the old gen heap size + // or + // after a major GC. + if (oldAfter < oldBefore + || GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.OLD) { + liveDataSize.set(oldAfter); + final long oldMaxAfter = after.get(oldGenPoolName).getMax(); + maxDataSize.set(oldMaxAfter); + } + } + + if (youngGenPoolName != null) { + countPoolSizeDelta( + gcInfo.getMemoryUsageBeforeGc(), + gcInfo.getMemoryUsageAfterGc(), + allocatedBytes, + heapPoolSizeAfterGc, + youngGenPoolName); + } } }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; @@ -205,8 +263,11 @@ public void unbindFrom(AbstractMetricService metricService) { metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_max_data_size_bytes"); metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_live_data_size_bytes"); + metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_memory_used_percent"); + metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_throughout"); metricService.remove(MetricType.COUNTER, "jvm_gc_memory_allocated_bytes"); + if (oldGenPoolName != null) { metricService.remove(MetricType.COUNTER, "jvm_gc_memory_promoted_bytes"); } @@ -226,11 +287,26 @@ public void unbindFrom(AbstractMetricService metricService) { String gcAction = notificationInfo.getGcAction(); String timerName; if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time"; + timerName = "jvm_gc_concurrent_phase_time_"; } else { - timerName = "jvm_gc_pause"; + timerName = "jvm_gc_pause_"; } + timerName += gcCause; metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause); + if (mbean.getName().equals("ZGC Cycles")) { + metricService.remove(MetricType.COUNTER, "jvm_zgc_cycles_count"); + } + else if (mbean.getName().equals("ZGC Pauses")) { + metricService.remove(MetricType.COUNTER, "jvm_zgc_pauses_count"); + } + if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.OLD) { + metricService.remove(MetricType.COUNTER, "jvm_gc_old_gc_count"); + } + else if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.YOUNG) { + metricService.remove(MetricType.COUNTER, "jvm_gc_young_gc_count"); + } }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; notificationEmitter.addNotificationListener( @@ -294,6 +370,15 @@ public void close() { notificationListenerCleanUpRunnables.forEach(Runnable::run); } + public long calculateMemoryUsagePercentage() { + return (ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getUsed() * 100 / Runtime.getRuntime().maxMemory()); + } + + // We need to keep two decimal places accurate, so we maintain a double var here. + public double getThroughput() { + return throughout; + } + enum GcGenerationAge { OLD, YOUNG, @@ -317,12 +402,35 @@ static GcGenerationAge fromName(String name) { } } + private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) + { + switch (gc.getName()) + { + //First two are from the serial collector + case "Copy": + case "MarkSweepCompact": + //Parallel collector + case "PS MarkSweep": + case "PS Scavenge": + case "G1 Young Generation": + //CMS young generation collector + case "ParNew": + return false; + case "ConcurrentMarkSweep": + case "G1 Old Generation": + return true; + default: + //Assume possibly concurrent if unsure + return true; + } + } + private static boolean isConcurrentPhase(String cause, String name) { return "No GC".equals(cause) || "Shenandoah Cycles".equals(name); } private static boolean isYoungGenPool(String name) { - return name != null && name.endsWith("Eden Space"); + return name != null && name.endsWith("Eden Space") && name.endsWith("Survivor Space"); } private static boolean isOldGenPool(String name) { @@ -332,4 +440,5 @@ private static boolean isOldGenPool(String name) { private static boolean isNonGenerationalHeapPool(String name) { return "Shenandoah".equals(name) || "ZHeap".equals(name); } + } From 6f176470ddd89c85d14f1cfb139f872463519772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Thu, 20 Jul 2023 15:48:28 +0800 Subject: [PATCH 03/22] Reformat codes with IoTDB's standard. --- .../commons/service/metric/GcTimeAlerter.java | 32 +- .../commons/service/metric/JvmGcMetrics.java | 408 +++++++++--------- 2 files changed, 224 insertions(+), 216 deletions(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java index d102ea2312870..253771470f89b 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java @@ -7,20 +7,22 @@ import java.util.Date; public class GcTimeAlerter implements JvmGcMetrics.GcTimeAlertHandler { - private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - private static final Logger logger = LoggerFactory.getLogger(GcTimeAlerter.class); + private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private static final Logger logger = LoggerFactory.getLogger(GcTimeAlerter.class); - /** - * Alert handler func - * User can tailor their handle logic here - * @param gcData - */ - @Override - public void alert(JvmGcMetrics.GcData gcData) { - logger.warn("Error metrics taken time: " + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp()))))); - logger.warn("Time since monitor has been started: " + gcData.getGcMonitorRunTime() + " ms"); - logger.warn("Gc Time Percentage: " + gcData.getGcTimePercentage() + "%"); - logger.warn("Accumulated GC time: " + gcData.getAccumulatedGcTime() + " ms"); - logger.warn("Accumulated GC count: " + gcData.getAccumulatedGcCount()); - } + /** + * Alert handler func User can tailor their handle logic here + * + * @param gcData + */ + @Override + public void alert(JvmGcMetrics.GcData gcData) { + logger.warn( + "Error metrics taken time: " + + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp()))))); + logger.warn("Time since monitor has been started: " + gcData.getGcMonitorRunTime() + " ms"); + logger.warn("Gc Time Percentage: " + gcData.getGcTimePercentage() + "%"); + logger.warn("Accumulated GC time: " + gcData.getAccumulatedGcTime() + " ms"); + logger.warn("Accumulated GC count: " + gcData.getAccumulatedGcCount()); + } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java index 06260bc8302a0..1ede70d3bbccf 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java @@ -1,11 +1,12 @@ package org.apache.iotdb.commons.service.metric; -import com.google.common.base.Preconditions; import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.metrics.AbstractMetricService; import org.apache.iotdb.metrics.metricsets.IMetricSet; import org.apache.iotdb.metrics.utils.MetricLevel; + +import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,225 +18,230 @@ import java.util.concurrent.atomic.AtomicLong; public class JvmGcMetrics implements IMetricSet { - private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); - private final ScheduledExecutorService scheduledGCInfoMonitor = - IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor( - "JVM-GC-Statistics-Monitor"); - private Future scheduledGcMonitorFuture; - // Ring buffers containing GC timings and timestamps when timings were taken - private final TsAndData[] gcDataBuf; -// private final long maxGcTimePercentage = 0L; - private final long maxGcTimePercentage = 70L; - private final long observationWindowMs = TimeUnit.MINUTES.toMillis(1); -// private final long observationWindowMs = TimeUnit.SECONDS.toMillis(1); - private final long sleepIntervalMs = TimeUnit.SECONDS.toMillis(5); -// private final long sleepIntervalMs = TimeUnit.MILLISECONDS.toMillis(1000); - private final int bufSize; - private int startIdx; - private int endIdx; - private long startTime; - private final GcData curData = new GcData(); - private boolean shouldRun = true; - private final GcTimeAlertHandler alertHandler; - - public JvmGcMetrics() { - bufSize = (int) (observationWindowMs / sleepIntervalMs + 2); - // Prevent the user from accidentally creating an abnormally big buffer, which will result in slow calculations and likely inaccuracy. - Preconditions.checkArgument(bufSize <= 128 * 1024); - gcDataBuf = new TsAndData[bufSize]; - for (int i = 0; i < bufSize; i++) { - gcDataBuf[i] = new TsAndData(); - } - - alertHandler = new GcTimeAlerter(); + private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); + private final ScheduledExecutorService scheduledGCInfoMonitor = + IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor("JVM-GC-Statistics-Monitor"); + private Future scheduledGcMonitorFuture; + // Ring buffers containing GC timings and timestamps when timings were taken + private final TsAndData[] gcDataBuf; + // private final long maxGcTimePercentage = 0L; + private final long maxGcTimePercentage = 70L; + private final long observationWindowMs = TimeUnit.MINUTES.toMillis(1); + // private final long observationWindowMs = TimeUnit.SECONDS.toMillis(1); + private final long sleepIntervalMs = TimeUnit.SECONDS.toMillis(5); + // private final long sleepIntervalMs = TimeUnit.MILLISECONDS.toMillis(1000); + private final int bufSize; + private int startIdx; + private int endIdx; + private long startTime; + private final GcData curData = new GcData(); + private boolean shouldRun = true; + private final GcTimeAlertHandler alertHandler; + + public JvmGcMetrics() { + bufSize = (int) (observationWindowMs / sleepIntervalMs + 2); + // Prevent the user from accidentally creating an abnormally big buffer, which will result in + // slow calculations and likely inaccuracy. + Preconditions.checkArgument(bufSize <= 128 * 1024); + gcDataBuf = new TsAndData[bufSize]; + for (int i = 0; i < bufSize; i++) { + gcDataBuf[i] = new TsAndData(); } - @Override - public void bindTo(AbstractMetricService metricService) { - metricService.createAutoGauge( - "jvm_gc_accumulated_time_percentage", MetricLevel.CORE, curData, GcData::getGcTimePercentage - ); - - startTime = System.currentTimeMillis(); - curData.timestamp.set(startTime); - gcDataBuf[startIdx].setValues(startTime, 0); - scheduledGcMonitorFuture = ScheduledExecutorUtil.safelyScheduleWithFixedDelay( - scheduledGCInfoMonitor, - this::scheduledMonitoring, - TimeUnit.MILLISECONDS.toMillis(50), // to prevent / ZERO exception - sleepIntervalMs, - TimeUnit.MILLISECONDS - ); + alertHandler = new GcTimeAlerter(); + } + + @Override + public void bindTo(AbstractMetricService metricService) { + metricService.createAutoGauge( + "jvm_gc_accumulated_time_percentage", + MetricLevel.CORE, + curData, + GcData::getGcTimePercentage); + + startTime = System.currentTimeMillis(); + curData.timestamp.set(startTime); + gcDataBuf[startIdx].setValues(startTime, 0); + scheduledGcMonitorFuture = + ScheduledExecutorUtil.safelyScheduleWithFixedDelay( + scheduledGCInfoMonitor, + this::scheduledMonitoring, + TimeUnit.MILLISECONDS.toMillis(50), // to prevent / ZERO exception + sleepIntervalMs, + TimeUnit.MILLISECONDS); + } + + @Override + public void unbindFrom(AbstractMetricService metricService) { + shouldRun = false; + if (scheduledGcMonitorFuture != null) { + scheduledGcMonitorFuture.cancel(false); + scheduledGcMonitorFuture = null; + logger.info("JVM GC scheduled monitor is stopped successfully."); } + } - @Override - public void unbindFrom(AbstractMetricService metricService) { - shouldRun = false; - if (scheduledGcMonitorFuture != null) { - scheduledGcMonitorFuture.cancel(false); - scheduledGcMonitorFuture = null; - logger.info("JVM GC scheduled monitor is stopped successfully."); - } - } - - private void scheduledMonitoring() { - if (shouldRun) { - calculateGCTimePercentageWithinObservedInterval(); - } - // TODO: we can add an alertHandler here to handle the abnormal GC case. - if (alertHandler != null && - curData.gcTimePercentage.get() > maxGcTimePercentage) { - alertHandler.alert(curData.clone()); - } - } - - private void calculateGCTimePercentageWithinObservedInterval() { - long prevTotalGcTime = curData.totalGcTime.get(); - long totalGcTime = 0; - long totalGcCount = 0; - for (GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans()) { - totalGcTime += gcBean.getCollectionTime(); - totalGcCount += gcBean.getCollectionCount(); - } - long gcTimeWithinSleepInterval = totalGcTime - prevTotalGcTime; - - long curTime = System.currentTimeMillis(); - long gcMonitorRunTime = curTime - startTime; - - endIdx = (endIdx + 1) % bufSize; - gcDataBuf[endIdx].setValues(curTime, gcTimeWithinSleepInterval); - - // Move startIdx forward until we reach the first buffer entry with - // timestamp within the observation window. - long startObsWindowTs = curTime - observationWindowMs; - while (gcDataBuf[startIdx].ts < startObsWindowTs && startIdx != endIdx) { - startIdx = (startIdx + 1) % bufSize; - } - - // Calculate total GC time within observationWindowMs. - // We should be careful about GC time that passed before the first timestamp - // in our observation window. - long gcTimeWithinObservationWindow = Math.min( - gcDataBuf[startIdx].gcPause, gcDataBuf[startIdx].ts - startObsWindowTs); - if (startIdx != endIdx) { - for (int i = (startIdx + 1) % bufSize; i != endIdx; - i = (i + 1) % bufSize) { - gcTimeWithinObservationWindow += gcDataBuf[i].gcPause; - } - } - - curData.update(curTime, gcMonitorRunTime, totalGcTime, totalGcCount, - (int) (gcTimeWithinObservationWindow * 100 / - Math.min(observationWindowMs, gcMonitorRunTime))); + private void scheduledMonitoring() { + if (shouldRun) { + calculateGCTimePercentageWithinObservedInterval(); + } + // TODO: we can add an alertHandler here to handle the abnormal GC case. + if (alertHandler != null && curData.gcTimePercentage.get() > maxGcTimePercentage) { + alertHandler.alert(curData.clone()); + } + } + + private void calculateGCTimePercentageWithinObservedInterval() { + long prevTotalGcTime = curData.totalGcTime.get(); + long totalGcTime = 0; + long totalGcCount = 0; + for (GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans()) { + totalGcTime += gcBean.getCollectionTime(); + totalGcCount += gcBean.getCollectionCount(); + } + long gcTimeWithinSleepInterval = totalGcTime - prevTotalGcTime; + + long curTime = System.currentTimeMillis(); + long gcMonitorRunTime = curTime - startTime; + + endIdx = (endIdx + 1) % bufSize; + gcDataBuf[endIdx].setValues(curTime, gcTimeWithinSleepInterval); + + // Move startIdx forward until we reach the first buffer entry with + // timestamp within the observation window. + long startObsWindowTs = curTime - observationWindowMs; + while (gcDataBuf[startIdx].ts < startObsWindowTs && startIdx != endIdx) { + startIdx = (startIdx + 1) % bufSize; + } + + // Calculate total GC time within observationWindowMs. + // We should be careful about GC time that passed before the first timestamp + // in our observation window. + long gcTimeWithinObservationWindow = + Math.min(gcDataBuf[startIdx].gcPause, gcDataBuf[startIdx].ts - startObsWindowTs); + if (startIdx != endIdx) { + for (int i = (startIdx + 1) % bufSize; i != endIdx; i = (i + 1) % bufSize) { + gcTimeWithinObservationWindow += gcDataBuf[i].gcPause; + } + } + + curData.update( + curTime, + gcMonitorRunTime, + totalGcTime, + totalGcCount, + (int) + (gcTimeWithinObservationWindow + * 100 + / Math.min(observationWindowMs, gcMonitorRunTime))); + } + + /** Encapsulates data about GC pauses measured at the specific timestamp. */ + public static class GcData implements Cloneable { + private final AtomicLong timestamp = new AtomicLong(); + private AtomicLong gcMonitorRunTime = new AtomicLong(); + private AtomicLong totalGcTime = new AtomicLong(); + private AtomicLong totalGcCount = new AtomicLong(); + private AtomicLong gcTimePercentage = new AtomicLong(); + + /** + * Returns the absolute timestamp when this measurement was taken. + * + * @return timestamp. + */ + public long getTimestamp() { + return timestamp.get(); + } + + /** + * Returns the time since the start of the associated GCTimeMonitor. + * + * @return GcMonitorRunTime. + */ + public long getGcMonitorRunTime() { + return gcMonitorRunTime.get(); } + /** + * Returns accumulated GC time since this JVM started. + * + * @return AccumulatedGcTime. + */ + public long getAccumulatedGcTime() { + return totalGcTime.get(); + } /** - * Encapsulates data about GC pauses measured at the specific timestamp. + * Returns the accumulated number of GC pauses since this JVM started. + * + * @return AccumulatedGcCount. */ - public static class GcData implements Cloneable { - private final AtomicLong timestamp = new AtomicLong(); - private AtomicLong gcMonitorRunTime = new AtomicLong(); - private AtomicLong totalGcTime = new AtomicLong(); - private AtomicLong totalGcCount = new AtomicLong(); - private AtomicLong gcTimePercentage = new AtomicLong(); - - /** - * Returns the absolute timestamp when this measurement was taken. - * - * @return timestamp. - */ - public long getTimestamp() { - return timestamp.get(); - } - - /** - * Returns the time since the start of the associated GCTimeMonitor. - * - * @return GcMonitorRunTime. - */ - public long getGcMonitorRunTime() { - return gcMonitorRunTime.get(); - } - - /** - * Returns accumulated GC time since this JVM started. - * - * @return AccumulatedGcTime. - */ - public long getAccumulatedGcTime() { - return totalGcTime.get(); - } - - /** - * Returns the accumulated number of GC pauses since this JVM started. - * - * @return AccumulatedGcCount. - */ - public long getAccumulatedGcCount() { - return totalGcCount.get(); - } - - /** - * Returns the percentage (0..100) of time that the JVM spent in GC pauses - * within the observation window of the associated GCTimeMonitor. - * - * @return GcTimePercentage. - */ - public long getGcTimePercentage() { - return gcTimePercentage.get(); - } - - private synchronized void update(long inTimestamp, long inGcMonitorRunTime, - long inTotalGcTime, long inTotalGcCount, int inGcTimePercentage) { - this.timestamp.set(inTimestamp); - this.gcMonitorRunTime.set(inGcMonitorRunTime); - this.totalGcTime.set(inTotalGcTime); - this.totalGcCount.set(inTotalGcCount); - this.gcTimePercentage.set(inGcTimePercentage); - } - - @Override - public synchronized GcData clone() { - try { - return (GcData) super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); - } - } - } - - private static class TsAndData { - private long ts; // Timestamp when this measurement was taken - private long gcPause; // Total GC pause time within the interval between ts - - // and the timestamp of the previous measurement. - void setValues(long inTs, long inGcPause) { - this.ts = inTs; - this.gcPause = inGcPause; - } + public long getAccumulatedGcCount() { + return totalGcCount.get(); } /** - * The user can provide an instance of a class implementing this interface - * when initializing a GCTimeMonitor to receive alerts when GC time - * percentage exceeds the specified threshold. + * Returns the percentage (0..100) of time that the JVM spent in GC pauses within the + * observation window of the associated GCTimeMonitor. + * + * @return GcTimePercentage. */ - public interface GcTimeAlertHandler { - void alert(GcData gcData); + public long getGcTimePercentage() { + return gcTimePercentage.get(); + } + + private synchronized void update( + long inTimestamp, + long inGcMonitorRunTime, + long inTotalGcTime, + long inTotalGcCount, + int inGcTimePercentage) { + this.timestamp.set(inTimestamp); + this.gcMonitorRunTime.set(inGcMonitorRunTime); + this.totalGcTime.set(inTotalGcTime); + this.totalGcCount.set(inTotalGcCount); + this.gcTimePercentage.set(inGcTimePercentage); } - private static class JvmGcMetricsHolder { + @Override + public synchronized GcData clone() { + try { + return (GcData) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + } - private static final JvmGcMetrics INSTANCE = new JvmGcMetrics(); + private static class TsAndData { + private long ts; // Timestamp when this measurement was taken + private long gcPause; // Total GC pause time within the interval between ts - private JvmGcMetricsHolder() { - // empty constructor - } + // and the timestamp of the previous measurement. + void setValues(long inTs, long inGcPause) { + this.ts = inTs; + this.gcPause = inGcPause; } + } + + /** + * The user can provide an instance of a class implementing this interface when initializing a + * GCTimeMonitor to receive alerts when GC time percentage exceeds the specified threshold. + */ + public interface GcTimeAlertHandler { + void alert(GcData gcData); + } + + private static class JvmGcMetricsHolder { + + private static final JvmGcMetrics INSTANCE = new JvmGcMetrics(); - public static JvmGcMetrics getInstance() { - return JvmGcMetricsHolder.INSTANCE; + private JvmGcMetricsHolder() { + // empty constructor } + } + public static JvmGcMetrics getInstance() { + return JvmGcMetricsHolder.INSTANCE; + } } From 7b5247e0aa96b49648f42756e442e8f1f57fea8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Thu, 20 Jul 2023 15:49:13 +0800 Subject: [PATCH 04/22] Reformat gc_pause_{cause} naming style. --- .../metrics/metricsets/jvm/JvmGcMetrics.java | 220 +++++++++--------- 1 file changed, 110 insertions(+), 110 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index d8a97ed6fa913..77ef3980ffe05 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -51,12 +51,12 @@ /** This file is modified from io.micrometer.core.instrument.binder.jvm.JvmGcMetrics */ public class JvmGcMetrics implements IMetricSet, AutoCloseable { private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); + private final List notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>(); private String youngGenPoolName; private String oldGenPoolName; private String nonGenerationalMemoryPool; - private final List notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>(); - private AtomicLong lastGcTotalDuration = new AtomicLong(); - private AtomicLong totalGcTimeSpend = new AtomicLong(); + private final AtomicLong lastGcTotalDuration = new AtomicLong(); + private final AtomicLong totalGcTimeSpend = new AtomicLong(); private double throughout = 0.0d; public JvmGcMetrics() { @@ -72,6 +72,43 @@ public JvmGcMetrics() { } } + private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) { + switch (gc.getName()) { + // First two are from the serial collector + case "Copy": + case "MarkSweepCompact": + // Parallel collector + case "PS MarkSweep": + case "PS Scavenge": + case "G1 Young Generation": + // CMS young generation collector + case "ParNew": + return false; + case "ConcurrentMarkSweep": + case "G1 Old Generation": + return true; + default: + // Assume possibly concurrent if unsure + return true; + } + } + + private static boolean isConcurrentPhase(String cause, String name) { + return "No GC".equals(cause) || "Shenandoah Cycles".equals(name); + } + + private static boolean isYoungGenPool(String name) { + return name != null && name.endsWith("Eden Space") && name.endsWith("Survivor Space"); + } + + private static boolean isOldGenPool(String name) { + return name != null && (name.endsWith("Old Gen") || name.endsWith("Tenured Gen")); + } + + private static boolean isNonGenerationalHeapPool(String name) { + return "Shenandoah".equals(name) || "ZHeap".equals(name); + } + @Override public void bindTo(AbstractMetricService metricService) { if (!preCheck()) { @@ -96,10 +133,10 @@ public void bindTo(AbstractMetricService metricService) { AtomicLong heapMemUsedPercentage = new AtomicLong(calculateMemoryUsagePercentage()); metricService.createAutoGauge( - "jvm_gc_memory_used_percent", MetricLevel.CORE, heapMemUsedPercentage, AtomicLong::get); + "jvm_gc_memory_used_percent", MetricLevel.CORE, heapMemUsedPercentage, AtomicLong::get); metricService.createAutoGauge( - "jvm_gc_throughout", MetricLevel.CORE, this, JvmGcMetrics::getThroughput); + "jvm_gc_throughout", MetricLevel.CORE, this, JvmGcMetrics::getThroughput); Counter allocatedBytes = metricService.getOrCreateCounter("jvm_gc_memory_allocated_bytes", MetricLevel.CORE); @@ -128,14 +165,15 @@ public void bindTo(AbstractMetricService metricService) { long duration = gcInfo.getDuration(); // The duration supplied in the notification info includes more than just - // application stopped time for concurrent GCs (since the concurrent phase is not stop-the-world). + // application stopped time for concurrent GCs (since the concurrent phase is not + // stop-the-world). // Try and do a better job coming up with a good stopped time // value by asking for and tracking cumulative time spent blocked in GC. if (isPartiallyConcurrentGC(mbean)) { - long previousTotal = lastGcTotalDuration.get(); - long total = mbean.getCollectionTime(); - lastGcTotalDuration.set(total); - duration = total - previousTotal; // may be zero for a really fast collection + long previousTotal = lastGcTotalDuration.get(); + long total = mbean.getCollectionTime(); + lastGcTotalDuration.set(total); + duration = total - previousTotal; // may be zero for a really fast collection } totalGcTimeSpend.set(totalGcTimeSpend.get() + duration); @@ -145,8 +183,9 @@ public void bindTo(AbstractMetricService metricService) { } else { timerName = "jvm_gc_pause_"; } + String reformatCause = gcCause.replaceAll(" ", "_"); // create a timer for each cause, which binds gcCause with gcDuration - timerName += gcCause; + timerName += reformatCause; Timer timer = metricService.getOrCreateTimer( timerName, MetricLevel.CORE, "action", gcAction, "cause", gcCause); @@ -154,24 +193,25 @@ public void bindTo(AbstractMetricService metricService) { // add support for ZGC if (mbean.getName().equals("ZGC Cycles")) { - Counter cyclesCount = metricService.getOrCreateCounter("jvm_zgc_cycles_count", MetricLevel.CORE); - cyclesCount.inc(); - } - else if (mbean.getName().equals("ZGC Pauses")) { - Counter pausesCount = metricService.getOrCreateCounter("jvm_zgc_pauses_count", MetricLevel.CORE); - pausesCount.inc(); + Counter cyclesCount = + metricService.getOrCreateCounter("jvm_zgc_cycles_count", MetricLevel.CORE); + cyclesCount.inc(); + } else if (mbean.getName().equals("ZGC Pauses")) { + Counter pausesCount = + metricService.getOrCreateCounter("jvm_zgc_pauses_count", MetricLevel.CORE); + pausesCount.inc(); } // monitoring old/young GC count, which is helpful for users to locate GC exception. - if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.OLD) { - Counter oldGcCounter = metricService.getOrCreateCounter("jvm_gc_old_gc_count", MetricLevel.CORE); - oldGcCounter.inc(); - } - else if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.YOUNG) { - Counter youngGcCounter = metricService.getOrCreateCounter("jvm_gc_young_gc_count", MetricLevel.CORE); - youngGcCounter.inc(); + if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { + Counter oldGcCounter = + metricService.getOrCreateCounter("jvm_gc_old_gc_count", MetricLevel.CORE); + oldGcCounter.inc(); + } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.YOUNG) { + Counter youngGcCounter = + metricService.getOrCreateCounter("jvm_gc_young_gc_count", MetricLevel.CORE); + youngGcCounter.inc(); } // update memory usage percentage @@ -179,7 +219,10 @@ else if (GcGenerationAge.fromName(notificationInfo.getGcName()) // update throughput, which is the // percentage of GC time as total JVM running time - throughout = (double)((gcInfo.getEndTime() - totalGcTimeSpend.get())) / gcInfo.getEndTime() * 100L; + throughout = + (double) ((gcInfo.getEndTime() - totalGcTimeSpend.get())) + / gcInfo.getEndTime() + * 100L; // Update promotion and allocation counters final Map before = gcInfo.getMemoryUsageBeforeGc(); @@ -198,42 +241,41 @@ else if (GcGenerationAge.fromName(notificationInfo.getGcName()) final long longLivedMaxAfter = after.get(nonGenerationalMemoryPool).getMax(); maxDataSize.set(longLivedMaxAfter); } - return; } // should add `else` here, since there are only two // cases: generational and non-generational else { - if (oldGenPoolName != null) { - final long oldBefore = before.get(oldGenPoolName).getUsed(); - final long oldAfter = after.get(oldGenPoolName).getUsed(); - final long delta = oldAfter - oldBefore; - if (delta > 0L && promotedBytes != null) { - promotedBytes.inc(delta); - } - - // Some GC implementations such as G1 can reduce the old gen size as part of a minor - // GC. To track the - // live data size we record the value if we see a reduction in the old gen heap size - // or - // after a major GC. - if (oldAfter < oldBefore - || GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.OLD) { - liveDataSize.set(oldAfter); - final long oldMaxAfter = after.get(oldGenPoolName).getMax(); - maxDataSize.set(oldMaxAfter); - } + if (oldGenPoolName != null) { + final long oldBefore = before.get(oldGenPoolName).getUsed(); + final long oldAfter = after.get(oldGenPoolName).getUsed(); + final long delta = oldAfter - oldBefore; + if (delta > 0L && promotedBytes != null) { + promotedBytes.inc(delta); } - if (youngGenPoolName != null) { - countPoolSizeDelta( - gcInfo.getMemoryUsageBeforeGc(), - gcInfo.getMemoryUsageAfterGc(), - allocatedBytes, - heapPoolSizeAfterGc, - youngGenPoolName); + // Some GC implementations such as G1 can reduce the old gen size as part of a minor + // GC. To track the + // live data size we record the value if we see a reduction in the old gen heap size + // or + // after a major GC. + if (oldAfter < oldBefore + || GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.OLD) { + liveDataSize.set(oldAfter); + final long oldMaxAfter = after.get(oldGenPoolName).getMax(); + maxDataSize.set(oldMaxAfter); } + } + + if (youngGenPoolName != null) { + countPoolSizeDelta( + gcInfo.getMemoryUsageBeforeGc(), + gcInfo.getMemoryUsageAfterGc(), + allocatedBytes, + heapPoolSizeAfterGc, + youngGenPoolName); + } } }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; @@ -267,7 +309,6 @@ public void unbindFrom(AbstractMetricService metricService) { metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_throughout"); metricService.remove(MetricType.COUNTER, "jvm_gc_memory_allocated_bytes"); - if (oldGenPoolName != null) { metricService.remove(MetricType.COUNTER, "jvm_gc_memory_promoted_bytes"); } @@ -294,18 +335,15 @@ public void unbindFrom(AbstractMetricService metricService) { timerName += gcCause; metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause); if (mbean.getName().equals("ZGC Cycles")) { - metricService.remove(MetricType.COUNTER, "jvm_zgc_cycles_count"); + metricService.remove(MetricType.COUNTER, "jvm_zgc_cycles_count"); + } else if (mbean.getName().equals("ZGC Pauses")) { + metricService.remove(MetricType.COUNTER, "jvm_zgc_pauses_count"); } - else if (mbean.getName().equals("ZGC Pauses")) { - metricService.remove(MetricType.COUNTER, "jvm_zgc_pauses_count"); - } - if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.OLD) { - metricService.remove(MetricType.COUNTER, "jvm_gc_old_gc_count"); - } - else if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.YOUNG) { - metricService.remove(MetricType.COUNTER, "jvm_gc_young_gc_count"); + if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { + metricService.remove(MetricType.COUNTER, "jvm_gc_old_gc_count"); + } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.YOUNG) { + metricService.remove(MetricType.COUNTER, "jvm_gc_young_gc_count"); } }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; @@ -371,12 +409,14 @@ public void close() { } public long calculateMemoryUsagePercentage() { - return (ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getUsed() * 100 / Runtime.getRuntime().maxMemory()); + return (ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getUsed() + * 100 + / Runtime.getRuntime().maxMemory()); } // We need to keep two decimal places accurate, so we maintain a double var here. public double getThroughput() { - return throughout; + return throughout; } enum GcGenerationAge { @@ -384,7 +424,7 @@ enum GcGenerationAge { YOUNG, UNKNOWN; - private static Map knownCollectors = new HashMap<>(); + private static final Map knownCollectors = new HashMap<>(); static { knownCollectors.put("ConcurrentMarkSweep", OLD); @@ -401,44 +441,4 @@ static GcGenerationAge fromName(String name) { return knownCollectors.getOrDefault(name, UNKNOWN); } } - - private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) - { - switch (gc.getName()) - { - //First two are from the serial collector - case "Copy": - case "MarkSweepCompact": - //Parallel collector - case "PS MarkSweep": - case "PS Scavenge": - case "G1 Young Generation": - //CMS young generation collector - case "ParNew": - return false; - case "ConcurrentMarkSweep": - case "G1 Old Generation": - return true; - default: - //Assume possibly concurrent if unsure - return true; - } - } - - private static boolean isConcurrentPhase(String cause, String name) { - return "No GC".equals(cause) || "Shenandoah Cycles".equals(name); - } - - private static boolean isYoungGenPool(String name) { - return name != null && name.endsWith("Eden Space") && name.endsWith("Survivor Space"); - } - - private static boolean isOldGenPool(String name) { - return name != null && (name.endsWith("Old Gen") || name.endsWith("Tenured Gen")); - } - - private static boolean isNonGenerationalHeapPool(String name) { - return "Shenandoah".equals(name) || "ZHeap".equals(name); - } - } From d8af94d11e1b756f05776caa39530773872e32bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Thu, 20 Jul 2023 16:42:06 +0800 Subject: [PATCH 05/22] adding license. --- .../commons/service/metric/GcTimeAlerter.java | 19 +++++++++++++++++++ .../commons/service/metric/JvmGcMetrics.java | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java index 253771470f89b..fdb07dd295ba9 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.iotdb.commons.service.metric; import org.slf4j.Logger; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java index 1ede70d3bbccf..0caa1f382a2e8 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.iotdb.commons.service.metric; import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; From 1b4a6f5684f6ad3ea15d3ec2eb6bde9a2fcf34c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 21 Jul 2023 10:59:08 +0800 Subject: [PATCH 06/22] add metric remove and some comments --- .../commons/service/metric/JvmGcMetrics.java | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java index 0caa1f382a2e8..1500913888eae 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java @@ -24,6 +24,7 @@ import org.apache.iotdb.metrics.AbstractMetricService; import org.apache.iotdb.metrics.metricsets.IMetricSet; import org.apache.iotdb.metrics.utils.MetricLevel; +import org.apache.iotdb.metrics.utils.MetricType; import com.google.common.base.Preconditions; import org.slf4j.Logger; @@ -43,18 +44,25 @@ public class JvmGcMetrics implements IMetricSet { private Future scheduledGcMonitorFuture; // Ring buffers containing GC timings and timestamps when timings were taken private final TsAndData[] gcDataBuf; - // private final long maxGcTimePercentage = 0L; + // Max GC time threshold private final long maxGcTimePercentage = 70L; + // Duration of observation window private final long observationWindowMs = TimeUnit.MINUTES.toMillis(1); - // private final long observationWindowMs = TimeUnit.SECONDS.toMillis(1); + // Interval for data collection private final long sleepIntervalMs = TimeUnit.SECONDS.toMillis(5); - // private final long sleepIntervalMs = TimeUnit.MILLISECONDS.toMillis(1000); + // Buffer size private final int bufSize; + // Buffer start index private int startIdx; + // Buffer end index private int endIdx; + // The time when jvm start running private long startTime; + // Container to hold collected GC data private final GcData curData = new GcData(); + // Switch for GC monitor operation private boolean shouldRun = true; + // Hook function called with GC exception private final GcTimeAlertHandler alertHandler; public JvmGcMetrics() { @@ -79,7 +87,8 @@ public void bindTo(AbstractMetricService metricService) { GcData::getGcTimePercentage); startTime = System.currentTimeMillis(); - curData.timestamp.set(startTime); + // current collect time: startTime + start delay(50ms) + curData.timestamp.set(startTime + TimeUnit.MILLISECONDS.toMillis(50)); gcDataBuf[startIdx].setValues(startTime, 0); scheduledGcMonitorFuture = ScheduledExecutorUtil.safelyScheduleWithFixedDelay( @@ -93,6 +102,7 @@ public void bindTo(AbstractMetricService metricService) { @Override public void unbindFrom(AbstractMetricService metricService) { shouldRun = false; + metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_accumulated_time_percentage"); if (scheduledGcMonitorFuture != null) { scheduledGcMonitorFuture.cancel(false); scheduledGcMonitorFuture = null; From b841b558a3f6a73caeaf2895c49e36e19a7318a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 21 Jul 2023 11:14:23 +0800 Subject: [PATCH 07/22] remove end-of-line comment --- .../apache/iotdb/commons/service/metric/JvmGcMetrics.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java index 1500913888eae..ea0a4c6227c2f 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java @@ -243,10 +243,12 @@ public synchronized GcData clone() { } private static class TsAndData { - private long ts; // Timestamp when this measurement was taken - private long gcPause; // Total GC pause time within the interval between ts - + // Timestamp when this measurement was taken + private long ts; + // Total GC pause time within the interval between ts // and the timestamp of the previous measurement. + private long gcPause; + void setValues(long inTs, long inGcPause) { this.ts = inTs; this.gcPause = inGcPause; From 6042b27f722d7805e04c6e279b6b927d3fcfec94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 21 Jul 2023 20:59:07 +0800 Subject: [PATCH 08/22] merge base iotdb --- .../java/org/apache/iotdb/confignode/service/ConfigNode.java | 1 + 1 file changed, 1 insertion(+) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index b9d898cd9031d..3836ce219b445 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -22,6 +22,7 @@ import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TEndPoint; import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.client.ClientManagerMetrics; import org.apache.iotdb.commons.concurrent.ThreadModule; import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.conf.IoTDBConstant; From 771d35daa0dad007e19c728b4540db7a225d85ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Tue, 25 Jul 2023 22:43:36 +0800 Subject: [PATCH 09/22] add thread name --- .../java/org/apache/iotdb/commons/concurrent/ThreadName.java | 1 + .../apache/iotdb/commons/service/metric/JvmGcMetrics.java | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java index 6464d6129a37c..c751a9ab624a6 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java @@ -169,6 +169,7 @@ public enum ThreadName { UPGRADE_TASK("UpgradeThread"), REGION_MIGRATE("Region-Migrate-Pool"), STORAGE_ENGINE_RECOVER_TRIGGER("StorageEngine-RecoverTrigger"), + JVM_GC_STATISTICS_MONITOR("JVM-GC-Statistics-Monitor"), // the unknown thread name is used for metrics UNKOWN("UNKNOWN"); diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java index ea0a4c6227c2f..f361fc786e7ad 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java @@ -20,6 +20,7 @@ package org.apache.iotdb.commons.service.metric; import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; +import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.metrics.AbstractMetricService; import org.apache.iotdb.metrics.metricsets.IMetricSet; @@ -40,7 +41,8 @@ public class JvmGcMetrics implements IMetricSet { private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); private final ScheduledExecutorService scheduledGCInfoMonitor = - IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor("JVM-GC-Statistics-Monitor"); + IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor( + ThreadName.JVM_GC_STATISTICS_MONITOR.getName()); private Future scheduledGcMonitorFuture; // Ring buffers containing GC timings and timestamps when timings were taken private final TsAndData[] gcDataBuf; @@ -114,7 +116,6 @@ private void scheduledMonitoring() { if (shouldRun) { calculateGCTimePercentageWithinObservedInterval(); } - // TODO: we can add an alertHandler here to handle the abnormal GC case. if (alertHandler != null && curData.gcTimePercentage.get() > maxGcTimePercentage) { alertHandler.alert(curData.clone()); } From 4dff86d3002b1d81912ed9c88bc3130b105a7259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Tue, 25 Jul 2023 23:16:05 +0800 Subject: [PATCH 10/22] spotless apply --- .../metrics/metricsets/jvm/JvmGcMetrics.java | 392 +++++++++--------- 1 file changed, 196 insertions(+), 196 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index 4906deb1c1b28..bd57e18d4e5a2 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -51,16 +51,16 @@ /** This file is modified from io.micrometer.core.instrument.binder.jvm.JvmGcMetrics */ public class JvmGcMetrics implements IMetricSet, AutoCloseable { - private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); - private final List notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>(); - private String youngGenPoolName; - private String oldGenPoolName; - private String nonGenerationalMemoryPool; - private final AtomicLong lastGcTotalDuration = new AtomicLong(); - private final AtomicLong totalGcTimeSpend = new AtomicLong(); - private double throughout = 0.0d; - - public JvmGcMetrics() { + private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); + private final List notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>(); + private String youngGenPoolName; + private String oldGenPoolName; + private String nonGenerationalMemoryPool; + private final AtomicLong lastGcTotalDuration = new AtomicLong(); + private final AtomicLong totalGcTimeSpend = new AtomicLong(); + private double throughout = 0.0d; + + public JvmGcMetrics() { for (MemoryPoolMXBean mbean : ManagementFactory.getMemoryPoolMXBeans()) { String name = mbean.getName(); if (isYoungGenPool(name)) { @@ -73,42 +73,42 @@ public JvmGcMetrics() { } } - private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) { - switch (gc.getName()) { - // First two are from the serial collector - case "Copy": - case "MarkSweepCompact": - // Parallel collector - case "PS MarkSweep": - case "PS Scavenge": - case "G1 Young Generation": - // CMS young generation collector - case "ParNew": - return false; - case "ConcurrentMarkSweep": - case "G1 Old Generation": - return true; - default: - // Assume possibly concurrent if unsure - return true; - } + private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) { + switch (gc.getName()) { + // First two are from the serial collector + case "Copy": + case "MarkSweepCompact": + // Parallel collector + case "PS MarkSweep": + case "PS Scavenge": + case "G1 Young Generation": + // CMS young generation collector + case "ParNew": + return false; + case "ConcurrentMarkSweep": + case "G1 Old Generation": + return true; + default: + // Assume possibly concurrent if unsure + return true; } + } - private static boolean isConcurrentPhase(String cause, String name) { - return "No GC".equals(cause) || "Shenandoah Cycles".equals(name); - } + private static boolean isConcurrentPhase(String cause, String name) { + return "No GC".equals(cause) || "Shenandoah Cycles".equals(name); + } - private static boolean isYoungGenPool(String name) { - return name != null && name.endsWith("Eden Space") && name.endsWith("Survivor Space"); - } + private static boolean isYoungGenPool(String name) { + return name != null && name.endsWith("Eden Space") && name.endsWith("Survivor Space"); + } - private static boolean isOldGenPool(String name) { - return name != null && (name.endsWith("Old Gen") || name.endsWith("Tenured Gen")); - } + private static boolean isOldGenPool(String name) { + return name != null && (name.endsWith("Old Gen") || name.endsWith("Tenured Gen")); + } - private static boolean isNonGenerationalHeapPool(String name) { - return "Shenandoah".equals(name) || "ZHeap".equals(name); - } + private static boolean isNonGenerationalHeapPool(String name) { + return "Shenandoah".equals(name) || "ZHeap".equals(name); + } @Override public void bindTo(AbstractMetricService metricService) { @@ -125,38 +125,37 @@ public void bindTo(AbstractMetricService metricService) { .orElse(0.0); AtomicLong maxDataSize = new AtomicLong((long) maxLongLivedPoolBytes); - metricService.createAutoGauge( - SystemMetric.JVM_GC_MAX_DATA_SIZE_BYTES.toString(), - MetricLevel.CORE, - maxDataSize, - AtomicLong::get); - - AtomicLong liveDataSize = new AtomicLong(); - metricService.createAutoGauge( - SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString(), - MetricLevel.CORE, - liveDataSize, - AtomicLong::get); - - AtomicLong heapMemUsedPercentage = new AtomicLong(calculateMemoryUsagePercentage()); - metricService.createAutoGauge( - "jvm_gc_memory_used_percent", MetricLevel.CORE, heapMemUsedPercentage, AtomicLong::get); - - metricService.createAutoGauge( - "jvm_gc_throughout", MetricLevel.CORE, this, JvmGcMetrics::getThroughput); - - - Counter allocatedBytes = - metricService.getOrCreateCounter( - SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE); - - Counter promotedBytes = - (oldGenPoolName == null) - ? null - : metricService.getOrCreateCounter( - SystemMetric.JVM_GC_MEMORY_PROMOTED_BYTES.toString(), MetricLevel.CORE); - - // start watching for GC notifications + metricService.createAutoGauge( + SystemMetric.JVM_GC_MAX_DATA_SIZE_BYTES.toString(), + MetricLevel.CORE, + maxDataSize, + AtomicLong::get); + + AtomicLong liveDataSize = new AtomicLong(); + metricService.createAutoGauge( + SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString(), + MetricLevel.CORE, + liveDataSize, + AtomicLong::get); + + AtomicLong heapMemUsedPercentage = new AtomicLong(calculateMemoryUsagePercentage()); + metricService.createAutoGauge( + "jvm_gc_memory_used_percent", MetricLevel.CORE, heapMemUsedPercentage, AtomicLong::get); + + metricService.createAutoGauge( + "jvm_gc_throughout", MetricLevel.CORE, this, JvmGcMetrics::getThroughput); + + Counter allocatedBytes = + metricService.getOrCreateCounter( + SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE); + + Counter promotedBytes = + (oldGenPoolName == null) + ? null + : metricService.getOrCreateCounter( + SystemMetric.JVM_GC_MEMORY_PROMOTED_BYTES.toString(), MetricLevel.CORE); + + // start watching for GC notifications final AtomicLong heapPoolSizeAfterGc = new AtomicLong(); for (GarbageCollectorMXBean mbean : ManagementFactory.getGarbageCollectorMXBeans()) { @@ -174,67 +173,67 @@ public void bindTo(AbstractMetricService metricService) { GcInfo gcInfo = notificationInfo.getGcInfo(); long duration = gcInfo.getDuration(); - // The duration supplied in the notification info includes more than just - // application stopped time for concurrent GCs (since the concurrent phase is not - // stop-the-world). - // Try and do a better job coming up with a good stopped time - // value by asking for and tracking cumulative time spent blocked in GC. - if (isPartiallyConcurrentGC(mbean)) { - long previousTotal = lastGcTotalDuration.get(); - long total = mbean.getCollectionTime(); - lastGcTotalDuration.set(total); - duration = total - previousTotal; // may be zero for a really fast collection - } - totalGcTimeSpend.set(totalGcTimeSpend.get() + duration); + // The duration supplied in the notification info includes more than just + // application stopped time for concurrent GCs (since the concurrent phase is not + // stop-the-world). + // Try and do a better job coming up with a good stopped time + // value by asking for and tracking cumulative time spent blocked in GC. + if (isPartiallyConcurrentGC(mbean)) { + long previousTotal = lastGcTotalDuration.get(); + long total = mbean.getCollectionTime(); + lastGcTotalDuration.set(total); + duration = total - previousTotal; // may be zero for a really fast collection + } + totalGcTimeSpend.set(totalGcTimeSpend.get() + duration); - String timerName; - if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time_"; - } else { - timerName = "jvm_gc_pause_"; - } - String reformatCause = gcCause.replaceAll(" ", "_"); - // create a timer for each cause, which binds gcCause with gcDuration - timerName += reformatCause; - Timer timer = - metricService.getOrCreateTimer( - timerName, MetricLevel.CORE, "action", gcAction, "cause", gcCause); - timer.update(duration, TimeUnit.MILLISECONDS); - - // add support for ZGC - if (mbean.getName().equals("ZGC Cycles")) { - Counter cyclesCount = - metricService.getOrCreateCounter("jvm_zgc_cycles_count", MetricLevel.CORE); - cyclesCount.inc(); - } else if (mbean.getName().equals("ZGC Pauses")) { - Counter pausesCount = - metricService.getOrCreateCounter("jvm_zgc_pauses_count", MetricLevel.CORE); - pausesCount.inc(); - } + String timerName; + if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { + timerName = "jvm_gc_concurrent_phase_time_"; + } else { + timerName = "jvm_gc_pause_"; + } + String reformatCause = gcCause.replaceAll(" ", "_"); + // create a timer for each cause, which binds gcCause with gcDuration + timerName += reformatCause; + Timer timer = + metricService.getOrCreateTimer( + timerName, MetricLevel.CORE, "action", gcAction, "cause", gcCause); + timer.update(duration, TimeUnit.MILLISECONDS); + + // add support for ZGC + if (mbean.getName().equals("ZGC Cycles")) { + Counter cyclesCount = + metricService.getOrCreateCounter("jvm_zgc_cycles_count", MetricLevel.CORE); + cyclesCount.inc(); + } else if (mbean.getName().equals("ZGC Pauses")) { + Counter pausesCount = + metricService.getOrCreateCounter("jvm_zgc_pauses_count", MetricLevel.CORE); + pausesCount.inc(); + } - // monitoring old/young GC count, which is helpful for users to locate GC exception. - if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { - Counter oldGcCounter = - metricService.getOrCreateCounter("jvm_gc_old_gc_count", MetricLevel.CORE); - oldGcCounter.inc(); - } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.YOUNG) { - Counter youngGcCounter = - metricService.getOrCreateCounter("jvm_gc_young_gc_count", MetricLevel.CORE); - youngGcCounter.inc(); - } + // monitoring old/young GC count, which is helpful for users to locate GC exception. + if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { + Counter oldGcCounter = + metricService.getOrCreateCounter("jvm_gc_old_gc_count", MetricLevel.CORE); + oldGcCounter.inc(); + } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.YOUNG) { + Counter youngGcCounter = + metricService.getOrCreateCounter("jvm_gc_young_gc_count", MetricLevel.CORE); + youngGcCounter.inc(); + } - // update memory usage percentage - heapMemUsedPercentage.set(calculateMemoryUsagePercentage()); + // update memory usage percentage + heapMemUsedPercentage.set(calculateMemoryUsagePercentage()); - // update throughput, which is the - // percentage of GC time as total JVM running time - throughout = - (double) ((gcInfo.getEndTime() - totalGcTimeSpend.get())) - / gcInfo.getEndTime() - * 100L; + // update throughput, which is the + // percentage of GC time as total JVM running time + throughout = + (double) ((gcInfo.getEndTime() - totalGcTimeSpend.get())) + / gcInfo.getEndTime() + * 100L; - // Update promotion and allocation counters + // Update promotion and allocation counters final Map before = gcInfo.getMemoryUsageBeforeGc(); final Map after = gcInfo.getMemoryUsageAfterGc(); @@ -253,39 +252,40 @@ public void bindTo(AbstractMetricService metricService) { } } -// should add `else` here, since there are only two + // should add `else` here, since there are only two // cases: generational and non-generational else { - if (oldGenPoolName != null) { - final long oldBefore = before.get(oldGenPoolName).getUsed(); - final long oldAfter = after.get(oldGenPoolName).getUsed(); - final long delta = oldAfter - oldBefore; - if (delta > 0L && promotedBytes != null) { - promotedBytes.inc(delta); - } - - // Some GC implementations such as G1 can reduce the old gen size as part of a minor - // GC. To track the - // live data size we record the value if we see a reduction in the old gen heap size - // or - // after a major GC. - if (oldAfter < oldBefore - || GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.OLD) { - liveDataSize.set(oldAfter); - final long oldMaxAfter = after.get(oldGenPoolName).getMax(); - maxDataSize.set(oldMaxAfter); - } + if (oldGenPoolName != null) { + final long oldBefore = before.get(oldGenPoolName).getUsed(); + final long oldAfter = after.get(oldGenPoolName).getUsed(); + final long delta = oldAfter - oldBefore; + if (delta > 0L && promotedBytes != null) { + promotedBytes.inc(delta); } - if (youngGenPoolName != null) { - countPoolSizeDelta( - gcInfo.getMemoryUsageBeforeGc(), - gcInfo.getMemoryUsageAfterGc(), - allocatedBytes, - heapPoolSizeAfterGc, - youngGenPoolName); + // Some GC implementations such as G1 can reduce the old gen size as part of a minor + // GC. To track the + // live data size we record the value if we see a reduction in the old gen heap size + // or + // after a major GC. + if (oldAfter < oldBefore + || GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.OLD) { + liveDataSize.set(oldAfter); + final long oldMaxAfter = after.get(oldGenPoolName).getMax(); + maxDataSize.set(oldMaxAfter); } + } + + if (youngGenPoolName != null) { + countPoolSizeDelta( + gcInfo.getMemoryUsageBeforeGc(), + gcInfo.getMemoryUsageAfterGc(), + allocatedBytes, + heapPoolSizeAfterGc, + youngGenPoolName); + } + } }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; notificationEmitter.addNotificationListener( @@ -312,18 +312,17 @@ public void unbindFrom(AbstractMetricService metricService) { return; } - metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_MAX_DATA_SIZE_BYTES.toString()); - metricService.remove( - MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString()); - metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString()); - metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_memory_used_percent"); - metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_throughout"); + metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_MAX_DATA_SIZE_BYTES.toString()); + metricService.remove( + MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString()); + metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString()); + metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_memory_used_percent"); + metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_throughout"); - - if (oldGenPoolName != null) { - metricService.remove( - MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_PROMOTED_BYTES.toString()); - } + if (oldGenPoolName != null) { + metricService.remove( + MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_PROMOTED_BYTES.toString()); + } // start watching for GC notifications for (GarbageCollectorMXBean mbean : ManagementFactory.getGarbageCollectorMXBeans()) { @@ -339,25 +338,25 @@ public void unbindFrom(AbstractMetricService metricService) { String gcCause = notificationInfo.getGcCause(); String gcAction = notificationInfo.getGcAction(); String timerName; - if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time_"; - } else { - timerName = "jvm_gc_pause_"; - } - timerName += gcCause; - metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause); - if (mbean.getName().equals("ZGC Cycles")) { - metricService.remove(MetricType.COUNTER, "jvm_zgc_cycles_count"); - } else if (mbean.getName().equals("ZGC Pauses")) { - metricService.remove(MetricType.COUNTER, "jvm_zgc_pauses_count"); - } - if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { - metricService.remove(MetricType.COUNTER, "jvm_gc_old_gc_count"); - } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.YOUNG) { - metricService.remove(MetricType.COUNTER, "jvm_gc_young_gc_count"); - } - }; + if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { + timerName = "jvm_gc_concurrent_phase_time_"; + } else { + timerName = "jvm_gc_pause_"; + } + timerName += gcCause; + metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause); + if (mbean.getName().equals("ZGC Cycles")) { + metricService.remove(MetricType.COUNTER, "jvm_zgc_cycles_count"); + } else if (mbean.getName().equals("ZGC Pauses")) { + metricService.remove(MetricType.COUNTER, "jvm_zgc_pauses_count"); + } + if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { + metricService.remove(MetricType.COUNTER, "jvm_gc_old_gc_count"); + } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) + == GcGenerationAge.YOUNG) { + metricService.remove(MetricType.COUNTER, "jvm_gc_young_gc_count"); + } + }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; notificationEmitter.addNotificationListener( notificationListener, @@ -419,16 +418,17 @@ private void countPoolSizeDelta( public void close() { notificationListenerCleanUpRunnables.forEach(Runnable::run); } - public long calculateMemoryUsagePercentage() { - return (ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getUsed() - * 100 - / Runtime.getRuntime().maxMemory()); - } - // We need to keep two decimal places accurate, so we maintain a double var here. - public double getThroughput() { - return throughout; - } + public long calculateMemoryUsagePercentage() { + return (ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getUsed() + * 100 + / Runtime.getRuntime().maxMemory()); + } + + // We need to keep two decimal places accurate, so we maintain a double var here. + public double getThroughput() { + return throughout; + } enum GcGenerationAge { OLD, From c71517f24fb3d4579670c3ed59647f02850383bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Tue, 25 Jul 2023 23:54:14 +0800 Subject: [PATCH 11/22] rename and add some comments --- .../iotdb/confignode/service/ConfigNode.java | 4 +- .../metrics/DataNodeMetricsHelper.java | 4 +- .../metrics/metricsets/jvm/JvmGcMetrics.java | 49 +++++++++++++------ .../iotdb/metrics/utils/SystemMetric.java | 7 +++ .../commons/service/metric/GcTimeAlerter.java | 4 +- ...cMetrics.java => JvmGcMonitorMetrics.java} | 16 +++--- 6 files changed, 55 insertions(+), 29 deletions(-) rename iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/{JvmGcMetrics.java => JvmGcMonitorMetrics.java} (94%) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index a1b3f4792c01f..7499dc6bd57cc 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -30,7 +30,7 @@ import org.apache.iotdb.commons.exception.StartupException; import org.apache.iotdb.commons.service.JMXService; import org.apache.iotdb.commons.service.RegisterManager; -import org.apache.iotdb.commons.service.metric.JvmGcMetrics; +import org.apache.iotdb.commons.service.metric.JvmGcMonitorMetrics; import org.apache.iotdb.commons.service.metric.MetricService; import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.confignode.client.ConfigNodeRequestType; @@ -246,7 +246,7 @@ private void setUpMetricService() throws StartupException { MetricService.getInstance().addMetricSet(new ProcessMetrics()); MetricService.getInstance().addMetricSet(new DiskMetrics(IoTDBConstant.CN_ROLE)); MetricService.getInstance().addMetricSet(new NetMetrics(IoTDBConstant.CN_ROLE)); - MetricService.getInstance().addMetricSet(JvmGcMetrics.getInstance()); + MetricService.getInstance().addMetricSet(JvmGcMonitorMetrics.getInstance()); MetricService.getInstance().addMetricSet(ClientManagerMetrics.getInstance()); MetricService.getInstance().addMetricSet(ThreadPoolMetrics.getInstance()); initCpuMetrics(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java index 51f5a287eeb06..ea188be3cf8c4 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java @@ -25,7 +25,7 @@ import org.apache.iotdb.commons.concurrent.ThreadPoolMetrics; import org.apache.iotdb.commons.conf.CommonDescriptor; import org.apache.iotdb.commons.conf.IoTDBConstant; -import org.apache.iotdb.commons.service.metric.JvmGcMetrics; +import org.apache.iotdb.commons.service.metric.JvmGcMonitorMetrics; import org.apache.iotdb.commons.service.metric.MetricService; import org.apache.iotdb.commons.service.metric.PerformanceOverviewMetrics; import org.apache.iotdb.db.conf.IoTDBDescriptor; @@ -80,7 +80,7 @@ public static void bind() { MetricService.getInstance().addMetricSet(PerformanceOverviewMetrics.getInstance()); // bind gc metrics - MetricService.getInstance().addMetricSet(JvmGcMetrics.getInstance()); + MetricService.getInstance().addMetricSet(JvmGcMonitorMetrics.getInstance()); } private static void initSystemMetrics() { diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index bd57e18d4e5a2..f0c00d4f3235e 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -75,17 +75,21 @@ public JvmGcMetrics() { private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) { switch (gc.getName()) { - // First two are from the serial collector + // First two are from the `serial` collector case "Copy": case "MarkSweepCompact": - // Parallel collector + // The following 4 GCs do not contain concurrent execution phase case "PS MarkSweep": case "PS Scavenge": case "G1 Young Generation": - // CMS young generation collector case "ParNew": return false; + + // The following 2 GCs' execution process consists of concurrent phase, which means they can + // run simultaneously with the user threads in some phases. + // Concurrent mark and concurrent sweep case "ConcurrentMarkSweep": + // Concurrent mark case "G1 Old Generation": return true; default: @@ -99,7 +103,7 @@ private static boolean isConcurrentPhase(String cause, String name) { } private static boolean isYoungGenPool(String name) { - return name != null && name.endsWith("Eden Space") && name.endsWith("Survivor Space"); + return name != null && (name.endsWith("Eden Space") || name.endsWith("Survivor Space")); } private static boolean isOldGenPool(String name) { @@ -140,10 +144,16 @@ public void bindTo(AbstractMetricService metricService) { AtomicLong heapMemUsedPercentage = new AtomicLong(calculateMemoryUsagePercentage()); metricService.createAutoGauge( - "jvm_gc_memory_used_percent", MetricLevel.CORE, heapMemUsedPercentage, AtomicLong::get); + SystemMetric.JVM_GC_MEMORY_USED_PERCENT.toString(), + MetricLevel.CORE, + heapMemUsedPercentage, + AtomicLong::get); metricService.createAutoGauge( - "jvm_gc_throughout", MetricLevel.CORE, this, JvmGcMetrics::getThroughput); + SystemMetric.JVM_GC_THROUGHOUT.toString(), + MetricLevel.CORE, + this, + JvmGcMetrics::getThroughput); Counter allocatedBytes = metricService.getOrCreateCounter( @@ -203,23 +213,27 @@ public void bindTo(AbstractMetricService metricService) { // add support for ZGC if (mbean.getName().equals("ZGC Cycles")) { Counter cyclesCount = - metricService.getOrCreateCounter("jvm_zgc_cycles_count", MetricLevel.CORE); + metricService.getOrCreateCounter( + SystemMetric.JVM_ZGC_CYCLES_COUNT.toString(), MetricLevel.CORE); cyclesCount.inc(); } else if (mbean.getName().equals("ZGC Pauses")) { Counter pausesCount = - metricService.getOrCreateCounter("jvm_zgc_pauses_count", MetricLevel.CORE); + metricService.getOrCreateCounter( + SystemMetric.JVM_ZGC_PAUSES_COUNT.toString(), MetricLevel.CORE); pausesCount.inc(); } // monitoring old/young GC count, which is helpful for users to locate GC exception. if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { Counter oldGcCounter = - metricService.getOrCreateCounter("jvm_gc_old_gc_count", MetricLevel.CORE); + metricService.getOrCreateCounter( + SystemMetric.JVM_GC_YOUNG_GC_COUNT.toString(), MetricLevel.CORE); oldGcCounter.inc(); } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.YOUNG) { Counter youngGcCounter = - metricService.getOrCreateCounter("jvm_gc_young_gc_count", MetricLevel.CORE); + metricService.getOrCreateCounter( + SystemMetric.JVM_GC_OLD_GC_COUNT.toString(), MetricLevel.CORE); youngGcCounter.inc(); } @@ -316,8 +330,8 @@ public void unbindFrom(AbstractMetricService metricService) { metricService.remove( MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString()); metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString()); - metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_memory_used_percent"); - metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_throughout"); + metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_MEMORY_USED_PERCENT.toString()); + metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_THROUGHOUT.toString()); if (oldGenPoolName != null) { metricService.remove( @@ -346,15 +360,18 @@ public void unbindFrom(AbstractMetricService metricService) { timerName += gcCause; metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause); if (mbean.getName().equals("ZGC Cycles")) { - metricService.remove(MetricType.COUNTER, "jvm_zgc_cycles_count"); + metricService.remove( + MetricType.COUNTER, SystemMetric.JVM_ZGC_CYCLES_COUNT.toString()); } else if (mbean.getName().equals("ZGC Pauses")) { - metricService.remove(MetricType.COUNTER, "jvm_zgc_pauses_count"); + metricService.remove( + MetricType.COUNTER, SystemMetric.JVM_ZGC_PAUSES_COUNT.toString()); } if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { - metricService.remove(MetricType.COUNTER, "jvm_gc_old_gc_count"); + metricService.remove( + MetricType.COUNTER, SystemMetric.JVM_GC_YOUNG_GC_COUNT.toString()); } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.YOUNG) { - metricService.remove(MetricType.COUNTER, "jvm_gc_young_gc_count"); + metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_OLD_GC_COUNT.toString()); } }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java index 6898a1dab83ff..257bcd2173fbb 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java @@ -81,6 +81,13 @@ public enum SystemMetric { JVM_GC_LIVE_DATA_SIZE_BYTES("jvm_gc_live_data_size_bytes"), JVM_GC_MEMORY_ALLOCATED_BYTES("jvm_gc_memory_allocated_bytes"), JVM_GC_MEMORY_PROMOTED_BYTES("jvm_gc_memory_promoted_bytes"), + JVM_GC_MEMORY_USED_PERCENT("jvm_gc_memory_used_percent"), + JVM_GC_THROUGHOUT("jvm_gc_throughout"), + JVM_ZGC_CYCLES_COUNT("jvm_zgc_cycles_count"), + JVM_ZGC_PAUSES_COUNT("jvm_zgc_pauses_count"), + JVM_GC_YOUNG_GC_COUNT("jvm_gc_old_gc_count"), + JVM_GC_OLD_GC_COUNT("jvm_gc_young_gc_count"), + JVM_GC_ACCUMULATED_TIME_PERCENTAGE("jvm_gc_accumulated_time_percentage"), // net related RECEIVED_BYTES("received_bytes"), RECEIVED_PACKETS("received_packets"), diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java index fdb07dd295ba9..4a28c720ba19d 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java @@ -25,7 +25,7 @@ import java.text.SimpleDateFormat; import java.util.Date; -public class GcTimeAlerter implements JvmGcMetrics.GcTimeAlertHandler { +public class GcTimeAlerter implements JvmGcMonitorMetrics.GcTimeAlertHandler { private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); private static final Logger logger = LoggerFactory.getLogger(GcTimeAlerter.class); @@ -35,7 +35,7 @@ public class GcTimeAlerter implements JvmGcMetrics.GcTimeAlertHandler { * @param gcData */ @Override - public void alert(JvmGcMetrics.GcData gcData) { + public void alert(JvmGcMonitorMetrics.GcData gcData) { logger.warn( "Error metrics taken time: " + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp()))))); diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java similarity index 94% rename from iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java rename to iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java index f361fc786e7ad..16bc64fe58f13 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java @@ -26,6 +26,7 @@ import org.apache.iotdb.metrics.metricsets.IMetricSet; import org.apache.iotdb.metrics.utils.MetricLevel; import org.apache.iotdb.metrics.utils.MetricType; +import org.apache.iotdb.metrics.utils.SystemMetric; import com.google.common.base.Preconditions; import org.slf4j.Logger; @@ -38,8 +39,8 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; -public class JvmGcMetrics implements IMetricSet { - private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); +public class JvmGcMonitorMetrics implements IMetricSet { + private static final Logger logger = LoggerFactory.getLogger(JvmGcMonitorMetrics.class); private final ScheduledExecutorService scheduledGCInfoMonitor = IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor( ThreadName.JVM_GC_STATISTICS_MONITOR.getName()); @@ -67,7 +68,7 @@ public class JvmGcMetrics implements IMetricSet { // Hook function called with GC exception private final GcTimeAlertHandler alertHandler; - public JvmGcMetrics() { + public JvmGcMonitorMetrics() { bufSize = (int) (observationWindowMs / sleepIntervalMs + 2); // Prevent the user from accidentally creating an abnormally big buffer, which will result in // slow calculations and likely inaccuracy. @@ -83,7 +84,7 @@ public JvmGcMetrics() { @Override public void bindTo(AbstractMetricService metricService) { metricService.createAutoGauge( - "jvm_gc_accumulated_time_percentage", + SystemMetric.JVM_GC_ACCUMULATED_TIME_PERCENTAGE.toString(), MetricLevel.CORE, curData, GcData::getGcTimePercentage); @@ -104,7 +105,8 @@ public void bindTo(AbstractMetricService metricService) { @Override public void unbindFrom(AbstractMetricService metricService) { shouldRun = false; - metricService.remove(MetricType.AUTO_GAUGE, "jvm_gc_accumulated_time_percentage"); + metricService.remove( + MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_ACCUMULATED_TIME_PERCENTAGE.toString()); if (scheduledGcMonitorFuture != null) { scheduledGcMonitorFuture.cancel(false); scheduledGcMonitorFuture = null; @@ -266,14 +268,14 @@ public interface GcTimeAlertHandler { private static class JvmGcMetricsHolder { - private static final JvmGcMetrics INSTANCE = new JvmGcMetrics(); + private static final JvmGcMonitorMetrics INSTANCE = new JvmGcMonitorMetrics(); private JvmGcMetricsHolder() { // empty constructor } } - public static JvmGcMetrics getInstance() { + public static JvmGcMonitorMetrics getInstance() { return JvmGcMetricsHolder.INSTANCE; } } From ae796182c9469cf9b161d6d1a302cfc036865589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Wed, 26 Jul 2023 10:24:07 +0800 Subject: [PATCH 12/22] add some detailed comments --- .../metrics/metricsets/jvm/JvmGcMetrics.java | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index f0c00d4f3235e..2c8f9bb39fae5 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -75,10 +75,10 @@ public JvmGcMetrics() { private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) { switch (gc.getName()) { - // First two are from the `serial` collector + // First two are from the 'serial' collector which are not concurrent, obviously. case "Copy": case "MarkSweepCompact": - // The following 4 GCs do not contain concurrent execution phase + // The following 4 GCs do not contain concurrent execution phase. case "PS MarkSweep": case "PS Scavenge": case "G1 Young Generation": @@ -86,7 +86,8 @@ private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) { return false; // The following 2 GCs' execution process consists of concurrent phase, which means they can - // run simultaneously with the user threads in some phases. + // run simultaneously with the user thread in some phases. + // Concurrent mark and concurrent sweep case "ConcurrentMarkSweep": // Concurrent mark @@ -181,11 +182,15 @@ public void bindTo(AbstractMetricService metricService) { String gcCause = notificationInfo.getGcCause(); String gcAction = notificationInfo.getGcAction(); GcInfo gcInfo = notificationInfo.getGcInfo(); - long duration = gcInfo.getDuration(); // The duration supplied in the notification info includes more than just // application stopped time for concurrent GCs (since the concurrent phase is not // stop-the-world). + // E.g. For mixed GC or full GC in collector 'G1 old generation', the duration collected + // here is more than the actual pause time (the latter can be accessed by GC + // log/-XX:PrintGCDetails) + long duration = gcInfo.getDuration(); + // Try and do a better job coming up with a good stopped time // value by asking for and tracking cumulative time spent blocked in GC. if (isPartiallyConcurrentGC(mbean)) { @@ -198,13 +203,11 @@ public void bindTo(AbstractMetricService metricService) { String timerName; if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time_"; + timerName = "jvm_gc_concurrent_phase_time"; } else { - timerName = "jvm_gc_pause_"; + timerName = "jvm_gc_pause"; } - String reformatCause = gcCause.replaceAll(" ", "_"); - // create a timer for each cause, which binds gcCause with gcDuration - timerName += reformatCause; + // create a timer with tags named by gcCause, which binds gcCause with gcDuration Timer timer = metricService.getOrCreateTimer( timerName, MetricLevel.CORE, "action", gcAction, "cause", gcCause); @@ -224,6 +227,8 @@ public void bindTo(AbstractMetricService metricService) { } // monitoring old/young GC count, which is helpful for users to locate GC exception. + // Unfortunately, the JMX doesn't seem to provide an api for monitoring mixed gc in G1. + // In fact, JMX may treat mixed GCs as minor GCs. if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { Counter oldGcCounter = metricService.getOrCreateCounter( @@ -278,7 +283,8 @@ public void bindTo(AbstractMetricService metricService) { } // Some GC implementations such as G1 can reduce the old gen size as part of a minor - // GC. To track the + // GC (since in JMX, a minor GC of G1 may actually represent mixed GC, which collect + // some obj in old gen region). To track the // live data size we record the value if we see a reduction in the old gen heap size // or // after a major GC. @@ -353,11 +359,10 @@ public void unbindFrom(AbstractMetricService metricService) { String gcAction = notificationInfo.getGcAction(); String timerName; if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time_"; + timerName = "jvm_gc_concurrent_phase_time"; } else { - timerName = "jvm_gc_pause_"; + timerName = "jvm_gc_pause"; } - timerName += gcCause; metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause); if (mbean.getName().equals("ZGC Cycles")) { metricService.remove( From c477c80ec4bdf6b93f7378c749121edec739fc83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Wed, 26 Jul 2023 22:31:08 +0800 Subject: [PATCH 13/22] fix global previousTotal var --- .../metrics/metricsets/jvm/JvmGcMetrics.java | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index 2c8f9bb39fae5..ba9da4488902e 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -45,6 +45,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; @@ -56,8 +57,8 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable { private String youngGenPoolName; private String oldGenPoolName; private String nonGenerationalMemoryPool; - private final AtomicLong lastGcTotalDuration = new AtomicLong(); - private final AtomicLong totalGcTimeSpend = new AtomicLong(); + private final Map lastGcTotalDurationMap = new ConcurrentHashMap<>(); + private final Map totalGcTimeSpendMap = new ConcurrentHashMap<>(); private double throughout = 0.0d; public JvmGcMetrics() { @@ -194,12 +195,17 @@ public void bindTo(AbstractMetricService metricService) { // Try and do a better job coming up with a good stopped time // value by asking for and tracking cumulative time spent blocked in GC. if (isPartiallyConcurrentGC(mbean)) { - long previousTotal = lastGcTotalDuration.get(); + AtomicLong previousTotal = + lastGcTotalDurationMap.getOrDefault(mbean.getName(), new AtomicLong()); long total = mbean.getCollectionTime(); - lastGcTotalDuration.set(total); - duration = total - previousTotal; // may be zero for a really fast collection + duration = total - previousTotal.get(); // may be zero for a really fast collection + previousTotal.set(total); + lastGcTotalDurationMap.put(mbean.getName(), previousTotal); } + AtomicLong totalGcTimeSpend = + totalGcTimeSpendMap.getOrDefault(mbean.getName(), new AtomicLong()); totalGcTimeSpend.set(totalGcTimeSpend.get() + duration); + totalGcTimeSpendMap.put(mbean.getName(), totalGcTimeSpend); String timerName; if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { @@ -248,7 +254,11 @@ public void bindTo(AbstractMetricService metricService) { // update throughput, which is the // percentage of GC time as total JVM running time throughout = - (double) ((gcInfo.getEndTime() - totalGcTimeSpend.get())) + (double) + ((gcInfo.getEndTime() + - totalGcTimeSpendMap + .getOrDefault(mbean.getName(), new AtomicLong()) + .get())) / gcInfo.getEndTime() * 100L; From bc6f0bf1c2b3b3ef6bcb255e0c800426a85ed18c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Wed, 26 Jul 2023 22:35:13 +0800 Subject: [PATCH 14/22] fix review of ThreadName --- .../java/org/apache/iotdb/commons/concurrent/ThreadName.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java index c751a9ab624a6..47e45241d6cac 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java @@ -136,6 +136,7 @@ public enum ThreadName { // NOTICE: The thread name of jvm cannot be edited here! // We list the thread name here just for distinguishing what module the thread belongs to. JVM_PAUSE_MONITOR("JvmPauseMonitor"), + JVM_GC_STATISTICS_MONITOR("JVM-GC-Statistics-Monitor"), PARALLEL_GC("GC task thread"), G1_GC("GC Thread"), G1_MAIN_MARKER("G1 Main Marker"), @@ -169,7 +170,6 @@ public enum ThreadName { UPGRADE_TASK("UpgradeThread"), REGION_MIGRATE("Region-Migrate-Pool"), STORAGE_ENGINE_RECOVER_TRIGGER("StorageEngine-RecoverTrigger"), - JVM_GC_STATISTICS_MONITOR("JVM-GC-Statistics-Monitor"), // the unknown thread name is used for metrics UNKOWN("UNKNOWN"); @@ -269,6 +269,7 @@ public enum ThreadName { new HashSet<>( Arrays.asList( JVM_PAUSE_MONITOR, + JVM_GC_STATISTICS_MONITOR, PARALLEL_GC, G1_GC, G1_MAIN_MARKER, From fbb35622aeb5df8a3188a5874376408d2083e421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Wed, 26 Jul 2023 22:57:48 +0800 Subject: [PATCH 15/22] remove duplicate metrics `throughout` --- .../metrics/metricsets/jvm/JvmGcMetrics.java | 24 ------------------- .../iotdb/metrics/utils/SystemMetric.java | 1 - 2 files changed, 25 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index ba9da4488902e..42075190fd598 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -59,7 +59,6 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable { private String nonGenerationalMemoryPool; private final Map lastGcTotalDurationMap = new ConcurrentHashMap<>(); private final Map totalGcTimeSpendMap = new ConcurrentHashMap<>(); - private double throughout = 0.0d; public JvmGcMetrics() { for (MemoryPoolMXBean mbean : ManagementFactory.getMemoryPoolMXBeans()) { @@ -151,12 +150,6 @@ public void bindTo(AbstractMetricService metricService) { heapMemUsedPercentage, AtomicLong::get); - metricService.createAutoGauge( - SystemMetric.JVM_GC_THROUGHOUT.toString(), - MetricLevel.CORE, - this, - JvmGcMetrics::getThroughput); - Counter allocatedBytes = metricService.getOrCreateCounter( SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE); @@ -251,17 +244,6 @@ public void bindTo(AbstractMetricService metricService) { // update memory usage percentage heapMemUsedPercentage.set(calculateMemoryUsagePercentage()); - // update throughput, which is the - // percentage of GC time as total JVM running time - throughout = - (double) - ((gcInfo.getEndTime() - - totalGcTimeSpendMap - .getOrDefault(mbean.getName(), new AtomicLong()) - .get())) - / gcInfo.getEndTime() - * 100L; - // Update promotion and allocation counters final Map before = gcInfo.getMemoryUsageBeforeGc(); final Map after = gcInfo.getMemoryUsageAfterGc(); @@ -347,7 +329,6 @@ public void unbindFrom(AbstractMetricService metricService) { MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString()); metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString()); metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_MEMORY_USED_PERCENT.toString()); - metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_THROUGHOUT.toString()); if (oldGenPoolName != null) { metricService.remove( @@ -457,11 +438,6 @@ public long calculateMemoryUsagePercentage() { / Runtime.getRuntime().maxMemory()); } - // We need to keep two decimal places accurate, so we maintain a double var here. - public double getThroughput() { - return throughout; - } - enum GcGenerationAge { OLD, YOUNG, diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java index 257bcd2173fbb..645c2307eeaa4 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java @@ -82,7 +82,6 @@ public enum SystemMetric { JVM_GC_MEMORY_ALLOCATED_BYTES("jvm_gc_memory_allocated_bytes"), JVM_GC_MEMORY_PROMOTED_BYTES("jvm_gc_memory_promoted_bytes"), JVM_GC_MEMORY_USED_PERCENT("jvm_gc_memory_used_percent"), - JVM_GC_THROUGHOUT("jvm_gc_throughout"), JVM_ZGC_CYCLES_COUNT("jvm_zgc_cycles_count"), JVM_ZGC_PAUSES_COUNT("jvm_zgc_pauses_count"), JVM_GC_YOUNG_GC_COUNT("jvm_gc_old_gc_count"), From 84a78a3fb013c57d5681490a513c05416447aca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 28 Jul 2023 00:55:50 +0800 Subject: [PATCH 16/22] fix review 1-4 --- .../apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java | 8 +------- .../iotdb/commons/service/metric/GcTimeAlerter.java | 2 +- .../iotdb/commons/service/metric/JvmGcMonitorMetrics.java | 7 +------ 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index 42075190fd598..413ffba13a890 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -58,7 +58,6 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable { private String oldGenPoolName; private String nonGenerationalMemoryPool; private final Map lastGcTotalDurationMap = new ConcurrentHashMap<>(); - private final Map totalGcTimeSpendMap = new ConcurrentHashMap<>(); public JvmGcMetrics() { for (MemoryPoolMXBean mbean : ManagementFactory.getMemoryPoolMXBeans()) { @@ -189,16 +188,11 @@ public void bindTo(AbstractMetricService metricService) { // value by asking for and tracking cumulative time spent blocked in GC. if (isPartiallyConcurrentGC(mbean)) { AtomicLong previousTotal = - lastGcTotalDurationMap.getOrDefault(mbean.getName(), new AtomicLong()); + lastGcTotalDurationMap.computeIfAbsent(mbean.getName(), k -> new AtomicLong()); long total = mbean.getCollectionTime(); duration = total - previousTotal.get(); // may be zero for a really fast collection previousTotal.set(total); - lastGcTotalDurationMap.put(mbean.getName(), previousTotal); } - AtomicLong totalGcTimeSpend = - totalGcTimeSpendMap.getOrDefault(mbean.getName(), new AtomicLong()); - totalGcTimeSpend.set(totalGcTimeSpend.get() + duration); - totalGcTimeSpendMap.put(mbean.getName(), totalGcTimeSpend); String timerName; if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java index 4a28c720ba19d..c471bf526f83a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java @@ -39,9 +39,9 @@ public void alert(JvmGcMonitorMetrics.GcData gcData) { logger.warn( "Error metrics taken time: " + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp()))))); - logger.warn("Time since monitor has been started: " + gcData.getGcMonitorRunTime() + " ms"); logger.warn("Gc Time Percentage: " + gcData.getGcTimePercentage() + "%"); logger.warn("Accumulated GC time: " + gcData.getAccumulatedGcTime() + " ms"); logger.warn("Accumulated GC count: " + gcData.getAccumulatedGcCount()); + logger.warn("Detailed GC metrics can be accessed in Prometheus."); } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java index 16bc64fe58f13..51a799a2f593d 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java @@ -63,8 +63,6 @@ public class JvmGcMonitorMetrics implements IMetricSet { private long startTime; // Container to hold collected GC data private final GcData curData = new GcData(); - // Switch for GC monitor operation - private boolean shouldRun = true; // Hook function called with GC exception private final GcTimeAlertHandler alertHandler; @@ -104,7 +102,6 @@ public void bindTo(AbstractMetricService metricService) { @Override public void unbindFrom(AbstractMetricService metricService) { - shouldRun = false; metricService.remove( MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_ACCUMULATED_TIME_PERCENTAGE.toString()); if (scheduledGcMonitorFuture != null) { @@ -115,9 +112,7 @@ public void unbindFrom(AbstractMetricService metricService) { } private void scheduledMonitoring() { - if (shouldRun) { - calculateGCTimePercentageWithinObservedInterval(); - } + calculateGCTimePercentageWithinObservedInterval(); if (alertHandler != null && curData.gcTimePercentage.get() > maxGcTimePercentage) { alertHandler.alert(curData.clone()); } From 2fc18db024980eae62c9cf3d9add4b1fd75c2e3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 28 Jul 2023 01:09:45 +0800 Subject: [PATCH 17/22] remove duplicate --- .../service/metric/JvmGcMonitorMetrics.java | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java index 51a799a2f593d..930564e9e8428 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java @@ -154,7 +154,6 @@ private void calculateGCTimePercentageWithinObservedInterval() { curData.update( curTime, - gcMonitorRunTime, totalGcTime, totalGcCount, (int) @@ -166,7 +165,6 @@ private void calculateGCTimePercentageWithinObservedInterval() { /** Encapsulates data about GC pauses measured at the specific timestamp. */ public static class GcData implements Cloneable { private final AtomicLong timestamp = new AtomicLong(); - private AtomicLong gcMonitorRunTime = new AtomicLong(); private AtomicLong totalGcTime = new AtomicLong(); private AtomicLong totalGcCount = new AtomicLong(); private AtomicLong gcTimePercentage = new AtomicLong(); @@ -181,16 +179,7 @@ public long getTimestamp() { } /** - * Returns the time since the start of the associated GCTimeMonitor. - * - * @return GcMonitorRunTime. - */ - public long getGcMonitorRunTime() { - return gcMonitorRunTime.get(); - } - - /** - * Returns accumulated GC time since this JVM started. + * Returns accumulated GC time since the start of the latest time window. * * @return AccumulatedGcTime. */ @@ -199,7 +188,7 @@ public long getAccumulatedGcTime() { } /** - * Returns the accumulated number of GC pauses since this JVM started. + * Returns the accumulated number of GC pauses since the start of the latest time window. * * @return AccumulatedGcCount. */ @@ -218,13 +207,8 @@ public long getGcTimePercentage() { } private synchronized void update( - long inTimestamp, - long inGcMonitorRunTime, - long inTotalGcTime, - long inTotalGcCount, - int inGcTimePercentage) { + long inTimestamp, long inTotalGcTime, long inTotalGcCount, int inGcTimePercentage) { this.timestamp.set(inTimestamp); - this.gcMonitorRunTime.set(inGcMonitorRunTime); this.totalGcTime.set(inTotalGcTime); this.totalGcCount.set(inTotalGcCount); this.gcTimePercentage.set(inGcTimePercentage); From 6cfae460d038f9631e49c2033bc8b3a38382170d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 28 Jul 2023 11:14:58 +0800 Subject: [PATCH 18/22] remove duplicate fields and add accumulated gc time within obsWindows --- .../commons/service/metric/GcTimeAlerter.java | 6 ++-- .../service/metric/JvmGcMonitorMetrics.java | 30 +++++++++---------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java index c471bf526f83a..25011c0b5628a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java @@ -40,8 +40,10 @@ public void alert(JvmGcMonitorMetrics.GcData gcData) { "Error metrics taken time: " + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp()))))); logger.warn("Gc Time Percentage: " + gcData.getGcTimePercentage() + "%"); - logger.warn("Accumulated GC time: " + gcData.getAccumulatedGcTime() + " ms"); - logger.warn("Accumulated GC count: " + gcData.getAccumulatedGcCount()); + logger.warn( + "Accumulated GC time within current observation window: " + + gcData.getGcTimeWithinObsWindow() + + " ms"); logger.warn("Detailed GC metrics can be accessed in Prometheus."); } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java index 930564e9e8428..ac313a9b83e1d 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java @@ -119,12 +119,10 @@ private void scheduledMonitoring() { } private void calculateGCTimePercentageWithinObservedInterval() { - long prevTotalGcTime = curData.totalGcTime.get(); + long prevTotalGcTime = curData.getAccumulatedGcTime(); long totalGcTime = 0; - long totalGcCount = 0; for (GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans()) { totalGcTime += gcBean.getCollectionTime(); - totalGcCount += gcBean.getCollectionCount(); } long gcTimeWithinSleepInterval = totalGcTime - prevTotalGcTime; @@ -155,7 +153,7 @@ private void calculateGCTimePercentageWithinObservedInterval() { curData.update( curTime, totalGcTime, - totalGcCount, + gcTimeWithinObservationWindow, (int) (gcTimeWithinObservationWindow * 100 @@ -165,10 +163,9 @@ private void calculateGCTimePercentageWithinObservedInterval() { /** Encapsulates data about GC pauses measured at the specific timestamp. */ public static class GcData implements Cloneable { private final AtomicLong timestamp = new AtomicLong(); - private AtomicLong totalGcTime = new AtomicLong(); - private AtomicLong totalGcCount = new AtomicLong(); - private AtomicLong gcTimePercentage = new AtomicLong(); - + private final AtomicLong totalGcTime = new AtomicLong(); + private final AtomicLong gcTimePercentage = new AtomicLong(); + private final AtomicLong gcTimeWithinObsWindow = new AtomicLong(); /** * Returns the absolute timestamp when this measurement was taken. * @@ -179,7 +176,7 @@ public long getTimestamp() { } /** - * Returns accumulated GC time since the start of the latest time window. + * Returns accumulated GC time since the start of IoTDB. * * @return AccumulatedGcTime. */ @@ -188,12 +185,12 @@ public long getAccumulatedGcTime() { } /** - * Returns the accumulated number of GC pauses since the start of the latest time window. + * Returns accumulated GC time within the latest observation window. * - * @return AccumulatedGcCount. + * @return gcTimeWithinObsWindow. */ - public long getAccumulatedGcCount() { - return totalGcCount.get(); + public long getGcTimeWithinObsWindow() { + return gcTimeWithinObsWindow.get(); } /** @@ -207,10 +204,13 @@ public long getGcTimePercentage() { } private synchronized void update( - long inTimestamp, long inTotalGcTime, long inTotalGcCount, int inGcTimePercentage) { + long inTimestamp, + long inTotalGcTime, + long inGcTimeWithinObsWindow, + int inGcTimePercentage) { this.timestamp.set(inTimestamp); this.totalGcTime.set(inTotalGcTime); - this.totalGcCount.set(inTotalGcCount); + this.gcTimeWithinObsWindow.set(inGcTimeWithinObsWindow); this.gcTimePercentage.set(inGcTimePercentage); } From 15eaf437910af112b1ee6f5843c78ac93ebce994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 28 Jul 2023 11:15:50 +0800 Subject: [PATCH 19/22] adjust max GC time threshold --- .../iotdb/commons/service/metric/JvmGcMonitorMetrics.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java index ac313a9b83e1d..084ef78459c12 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java @@ -48,7 +48,7 @@ public class JvmGcMonitorMetrics implements IMetricSet { // Ring buffers containing GC timings and timestamps when timings were taken private final TsAndData[] gcDataBuf; // Max GC time threshold - private final long maxGcTimePercentage = 70L; + private final long maxGcTimePercentage = 40L; // Duration of observation window private final long observationWindowMs = TimeUnit.MINUTES.toMillis(1); // Interval for data collection From 17f60e793a0ac1b6127f1a401616c11f7ab7648d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 28 Jul 2023 13:41:12 +0800 Subject: [PATCH 20/22] fix review --- .../commons/service/metric/GcTimeAlerter.java | 7 +- .../service/metric/JvmGcMonitorMetrics.java | 66 ++++++++++++++----- 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java index 25011c0b5628a..e4396c09b1128 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java @@ -44,6 +44,11 @@ public void alert(JvmGcMonitorMetrics.GcData gcData) { "Accumulated GC time within current observation window: " + gcData.getGcTimeWithinObsWindow() + " ms"); - logger.warn("Detailed GC metrics can be accessed in Prometheus."); + logger.warn( + "The observation window is from: " + + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getStartObsWindowTs())))) + + " to: " + + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp()))))); + logger.warn("The observation window time is: " + gcData.getCurrentObsWindowTs() + " ms."); } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java index 084ef78459c12..06dfdf36d11eb 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java @@ -40,6 +40,14 @@ import java.util.concurrent.atomic.AtomicLong; public class JvmGcMonitorMetrics implements IMetricSet { + // Duration of observation window + public static final long OBSERVATION_WINDOW_MS = TimeUnit.MINUTES.toMillis(1); + // Interval for data collection + public static final long SLEEP_INTERVAL_MS = TimeUnit.SECONDS.toMillis(5); + // Max GC time threshold + public static final long MAX_GC_TIME_PERCENTAGE = 2L; + // The time when IoTDB start running + private static long startTime; private static final Logger logger = LoggerFactory.getLogger(JvmGcMonitorMetrics.class); private final ScheduledExecutorService scheduledGCInfoMonitor = IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor( @@ -47,27 +55,19 @@ public class JvmGcMonitorMetrics implements IMetricSet { private Future scheduledGcMonitorFuture; // Ring buffers containing GC timings and timestamps when timings were taken private final TsAndData[] gcDataBuf; - // Max GC time threshold - private final long maxGcTimePercentage = 40L; - // Duration of observation window - private final long observationWindowMs = TimeUnit.MINUTES.toMillis(1); - // Interval for data collection - private final long sleepIntervalMs = TimeUnit.SECONDS.toMillis(5); // Buffer size private final int bufSize; // Buffer start index private int startIdx; // Buffer end index private int endIdx; - // The time when jvm start running - private long startTime; // Container to hold collected GC data private final GcData curData = new GcData(); // Hook function called with GC exception private final GcTimeAlertHandler alertHandler; public JvmGcMonitorMetrics() { - bufSize = (int) (observationWindowMs / sleepIntervalMs + 2); + bufSize = (int) (OBSERVATION_WINDOW_MS / SLEEP_INTERVAL_MS + 2); // Prevent the user from accidentally creating an abnormally big buffer, which will result in // slow calculations and likely inaccuracy. Preconditions.checkArgument(bufSize <= 128 * 1024); @@ -89,14 +89,13 @@ public void bindTo(AbstractMetricService metricService) { startTime = System.currentTimeMillis(); // current collect time: startTime + start delay(50ms) - curData.timestamp.set(startTime + TimeUnit.MILLISECONDS.toMillis(50)); - gcDataBuf[startIdx].setValues(startTime, 0); + gcDataBuf[startIdx].setValues(startTime + TimeUnit.MILLISECONDS.toMillis(50), 0); scheduledGcMonitorFuture = ScheduledExecutorUtil.safelyScheduleWithFixedDelay( scheduledGCInfoMonitor, this::scheduledMonitoring, TimeUnit.MILLISECONDS.toMillis(50), // to prevent / ZERO exception - sleepIntervalMs, + SLEEP_INTERVAL_MS, TimeUnit.MILLISECONDS); } @@ -113,7 +112,7 @@ public void unbindFrom(AbstractMetricService metricService) { private void scheduledMonitoring() { calculateGCTimePercentageWithinObservedInterval(); - if (alertHandler != null && curData.gcTimePercentage.get() > maxGcTimePercentage) { + if (alertHandler != null && curData.gcTimePercentage.get() > MAX_GC_TIME_PERCENTAGE) { alertHandler.alert(curData.clone()); } } @@ -134,7 +133,7 @@ private void calculateGCTimePercentageWithinObservedInterval() { // Move startIdx forward until we reach the first buffer entry with // timestamp within the observation window. - long startObsWindowTs = curTime - observationWindowMs; + long startObsWindowTs = curTime - OBSERVATION_WINDOW_MS; while (gcDataBuf[startIdx].ts < startObsWindowTs && startIdx != endIdx) { startIdx = (startIdx + 1) % bufSize; } @@ -152,20 +151,40 @@ private void calculateGCTimePercentageWithinObservedInterval() { curData.update( curTime, + startObsWindowTs, totalGcTime, gcTimeWithinObservationWindow, (int) (gcTimeWithinObservationWindow * 100 - / Math.min(observationWindowMs, gcMonitorRunTime))); + / Math.min(OBSERVATION_WINDOW_MS, gcMonitorRunTime))); } /** Encapsulates data about GC pauses measured at the specific timestamp. */ public static class GcData implements Cloneable { + // The time when this object get updated. private final AtomicLong timestamp = new AtomicLong(); - private final AtomicLong totalGcTime = new AtomicLong(); + // The theoretical start time of the observation window, usually equal to `timestamp - + // OBSERVATION_WINDOW_MS` + private final AtomicLong startObsWindowTs = new AtomicLong(); + // Accumulated GC time since the start of IoTDB. + private final AtomicLong accumulatedGcTime = new AtomicLong(); + // The percentage (0..100) of time that the JVM spent in GC pauses within the observation window private final AtomicLong gcTimePercentage = new AtomicLong(); + // Accumulated GC time within the latest observation window. private final AtomicLong gcTimeWithinObsWindow = new AtomicLong(); + + /** + * Returns the length of current observation window, usually equal to OBSERVATION_WINDOW_MS. If + * IoTDB is started after the start of the theoretical time window, then IoTDB startup time is + * returned. + * + * @return current observation window time, millisecond. + */ + public long getCurrentObsWindowTs() { + return Math.min(timestamp.get() - startTime, timestamp.get() - startObsWindowTs.get()); + } + /** * Returns the absolute timestamp when this measurement was taken. * @@ -175,13 +194,22 @@ public long getTimestamp() { return timestamp.get(); } + /** + * Returns the start timestamp of the latest observation window. + * + * @return the actual start timestamp of the obs window. + */ + public long getStartObsWindowTs() { + return Math.max(startObsWindowTs.get(), startTime); + } + /** * Returns accumulated GC time since the start of IoTDB. * * @return AccumulatedGcTime. */ public long getAccumulatedGcTime() { - return totalGcTime.get(); + return accumulatedGcTime.get(); } /** @@ -205,11 +233,13 @@ public long getGcTimePercentage() { private synchronized void update( long inTimestamp, + long inStartObsWindowTs, long inTotalGcTime, long inGcTimeWithinObsWindow, int inGcTimePercentage) { this.timestamp.set(inTimestamp); - this.totalGcTime.set(inTotalGcTime); + this.startObsWindowTs.set(inStartObsWindowTs); + this.accumulatedGcTime.set(inTotalGcTime); this.gcTimeWithinObsWindow.set(inGcTimeWithinObsWindow); this.gcTimePercentage.set(inGcTimePercentage); } From df8caa51e43d2cdd004d52c6bb41f259079eb191 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Fri, 28 Jul 2023 13:43:12 +0800 Subject: [PATCH 21/22] fix threshold --- .../iotdb/commons/service/metric/JvmGcMonitorMetrics.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java index 06dfdf36d11eb..6624c3047de4a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java @@ -45,7 +45,7 @@ public class JvmGcMonitorMetrics implements IMetricSet { // Interval for data collection public static final long SLEEP_INTERVAL_MS = TimeUnit.SECONDS.toMillis(5); // Max GC time threshold - public static final long MAX_GC_TIME_PERCENTAGE = 2L; + public static final long MAX_GC_TIME_PERCENTAGE = 40L; // The time when IoTDB start running private static long startTime; private static final Logger logger = LoggerFactory.getLogger(JvmGcMonitorMetrics.class); From e45a8eda44ca664faf26be0b1d90c290e044ca7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BF=8A=E6=A4=8D?= Date: Sun, 30 Jul 2023 14:22:30 +0800 Subject: [PATCH 22/22] remove unnecessary metric and fix young pool and old pool allocated bytes' statisticians --- .../metrics/metricsets/jvm/JvmGcMetrics.java | 148 ++++++++---------- .../iotdb/metrics/utils/SystemMetric.java | 8 +- 2 files changed, 72 insertions(+), 84 deletions(-) diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java index 413ffba13a890..3aeaa4e65bbe7 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java @@ -54,7 +54,7 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable { private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class); private final List notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>(); - private String youngGenPoolName; + private String firstYoungGenPoolName; private String oldGenPoolName; private String nonGenerationalMemoryPool; private final Map lastGcTotalDurationMap = new ConcurrentHashMap<>(); @@ -62,8 +62,8 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable { public JvmGcMetrics() { for (MemoryPoolMXBean mbean : ManagementFactory.getMemoryPoolMXBeans()) { String name = mbean.getName(); - if (isYoungGenPool(name)) { - youngGenPoolName = name; + if (isFirstYoungGenPool(name)) { + firstYoungGenPoolName = name; } else if (isOldGenPool(name)) { oldGenPoolName = name; } else if (isNonGenerationalHeapPool(name)) { @@ -98,12 +98,8 @@ private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) { } } - private static boolean isConcurrentPhase(String cause, String name) { - return "No GC".equals(cause) || "Shenandoah Cycles".equals(name); - } - - private static boolean isYoungGenPool(String name) { - return name != null && (name.endsWith("Eden Space") || name.endsWith("Survivor Space")); + private static boolean isFirstYoungGenPool(String name) { + return name != null && name.endsWith("Eden Space"); } private static boolean isOldGenPool(String name) { @@ -142,26 +138,35 @@ public void bindTo(AbstractMetricService metricService) { liveDataSize, AtomicLong::get); - AtomicLong heapMemUsedPercentage = new AtomicLong(calculateMemoryUsagePercentage()); - metricService.createAutoGauge( - SystemMetric.JVM_GC_MEMORY_USED_PERCENT.toString(), - MetricLevel.CORE, - heapMemUsedPercentage, - AtomicLong::get); - - Counter allocatedBytes = - metricService.getOrCreateCounter( - SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE); - Counter promotedBytes = (oldGenPoolName == null) ? null : metricService.getOrCreateCounter( SystemMetric.JVM_GC_MEMORY_PROMOTED_BYTES.toString(), MetricLevel.CORE); - // start watching for GC notifications - final AtomicLong heapPoolSizeAfterGc = new AtomicLong(); + Counter nonGenAllocatedBytes = + (nonGenerationalMemoryPool == null) + ? null + : metricService.getOrCreateCounter( + SystemMetric.JVM_GC_NON_GEN_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE); + Counter oldGenAllocatedBytes = + (oldGenPoolName == null) + ? null + : metricService.getOrCreateCounter( + SystemMetric.JVM_GC_OLD_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE); + + Counter youngGenAllocatedBytes = + (firstYoungGenPoolName == null) + ? null + : metricService.getOrCreateCounter( + SystemMetric.JVM_GC_YOUNG_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE); + + final AtomicLong firstYoungHeapPoolSizeAfterGc = new AtomicLong(); + // long live heap pool includes old gen heap pool and non-generation heap pool. + final AtomicLong longLivedHeapPoolSizeAfterGc = new AtomicLong(); + + // start watching for GC notifications for (GarbageCollectorMXBean mbean : ManagementFactory.getGarbageCollectorMXBeans()) { if (!(mbean instanceof NotificationEmitter)) { continue; @@ -194,16 +199,15 @@ public void bindTo(AbstractMetricService metricService) { previousTotal.set(total); } - String timerName; - if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time"; - } else { - timerName = "jvm_gc_pause"; - } // create a timer with tags named by gcCause, which binds gcCause with gcDuration Timer timer = metricService.getOrCreateTimer( - timerName, MetricLevel.CORE, "action", gcAction, "cause", gcCause); + SystemMetric.JVM_GC_PAUSE.toString(), + MetricLevel.CORE, + "action", + gcAction, + "cause", + gcCause); timer.update(duration, TimeUnit.MILLISECONDS); // add support for ZGC @@ -219,25 +223,6 @@ public void bindTo(AbstractMetricService metricService) { pausesCount.inc(); } - // monitoring old/young GC count, which is helpful for users to locate GC exception. - // Unfortunately, the JMX doesn't seem to provide an api for monitoring mixed gc in G1. - // In fact, JMX may treat mixed GCs as minor GCs. - if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { - Counter oldGcCounter = - metricService.getOrCreateCounter( - SystemMetric.JVM_GC_YOUNG_GC_COUNT.toString(), MetricLevel.CORE); - oldGcCounter.inc(); - } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.YOUNG) { - Counter youngGcCounter = - metricService.getOrCreateCounter( - SystemMetric.JVM_GC_OLD_GC_COUNT.toString(), MetricLevel.CORE); - youngGcCounter.inc(); - } - - // update memory usage percentage - heapMemUsedPercentage.set(calculateMemoryUsagePercentage()); - // Update promotion and allocation counters final Map before = gcInfo.getMemoryUsageBeforeGc(); final Map after = gcInfo.getMemoryUsageAfterGc(); @@ -246,8 +231,8 @@ public void bindTo(AbstractMetricService metricService) { countPoolSizeDelta( gcInfo.getMemoryUsageBeforeGc(), gcInfo.getMemoryUsageAfterGc(), - allocatedBytes, - heapPoolSizeAfterGc, + nonGenAllocatedBytes, + longLivedHeapPoolSizeAfterGc, nonGenerationalMemoryPool); if (after.get(nonGenerationalMemoryPool).getUsed() < before.get(nonGenerationalMemoryPool).getUsed()) { @@ -272,8 +257,7 @@ public void bindTo(AbstractMetricService metricService) { // GC (since in JMX, a minor GC of G1 may actually represent mixed GC, which collect // some obj in old gen region). To track the // live data size we record the value if we see a reduction in the old gen heap size - // or - // after a major GC. + // or after a major GC. if (oldAfter < oldBefore || GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { @@ -281,15 +265,21 @@ public void bindTo(AbstractMetricService metricService) { final long oldMaxAfter = after.get(oldGenPoolName).getMax(); maxDataSize.set(oldMaxAfter); } + countPoolSizeDelta( + gcInfo.getMemoryUsageBeforeGc(), + gcInfo.getMemoryUsageAfterGc(), + oldGenAllocatedBytes, + longLivedHeapPoolSizeAfterGc, + oldGenPoolName); } - if (youngGenPoolName != null) { + if (firstYoungGenPoolName != null) { countPoolSizeDelta( gcInfo.getMemoryUsageBeforeGc(), gcInfo.getMemoryUsageAfterGc(), - allocatedBytes, - heapPoolSizeAfterGc, - youngGenPoolName); + youngGenAllocatedBytes, + firstYoungHeapPoolSizeAfterGc, + firstYoungGenPoolName); } } }; @@ -321,12 +311,22 @@ public void unbindFrom(AbstractMetricService metricService) { metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_MAX_DATA_SIZE_BYTES.toString()); metricService.remove( MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString()); - metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString()); - metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_MEMORY_USED_PERCENT.toString()); - if (oldGenPoolName != null) { + if (nonGenerationalMemoryPool != null) { metricService.remove( - MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_PROMOTED_BYTES.toString()); + MetricType.COUNTER, SystemMetric.JVM_GC_NON_GEN_MEMORY_ALLOCATED_BYTES.toString()); + } else { + if (oldGenPoolName != null) { + metricService.remove( + MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_PROMOTED_BYTES.toString()); + metricService.remove( + MetricType.COUNTER, SystemMetric.JVM_GC_OLD_MEMORY_ALLOCATED_BYTES.toString()); + } + + if (firstYoungGenPoolName != null) { + metricService.remove( + MetricType.COUNTER, SystemMetric.JVM_GC_YOUNG_MEMORY_ALLOCATED_BYTES.toString()); + } } // start watching for GC notifications @@ -342,13 +342,14 @@ public void unbindFrom(AbstractMetricService metricService) { String gcCause = notificationInfo.getGcCause(); String gcAction = notificationInfo.getGcAction(); - String timerName; - if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) { - timerName = "jvm_gc_concurrent_phase_time"; - } else { - timerName = "jvm_gc_pause"; - } - metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause); + metricService.remove( + MetricType.TIMER, + SystemMetric.JVM_GC_PAUSE.toString(), + "action", + gcAction, + "cause", + gcCause); + if (mbean.getName().equals("ZGC Cycles")) { metricService.remove( MetricType.COUNTER, SystemMetric.JVM_ZGC_CYCLES_COUNT.toString()); @@ -356,13 +357,6 @@ public void unbindFrom(AbstractMetricService metricService) { metricService.remove( MetricType.COUNTER, SystemMetric.JVM_ZGC_PAUSES_COUNT.toString()); } - if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) { - metricService.remove( - MetricType.COUNTER, SystemMetric.JVM_GC_YOUNG_GC_COUNT.toString()); - } else if (GcGenerationAge.fromName(notificationInfo.getGcName()) - == GcGenerationAge.YOUNG) { - metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_OLD_GC_COUNT.toString()); - } }; NotificationEmitter notificationEmitter = (NotificationEmitter) mbean; notificationEmitter.addNotificationListener( @@ -426,12 +420,6 @@ public void close() { notificationListenerCleanUpRunnables.forEach(Runnable::run); } - public long calculateMemoryUsagePercentage() { - return (ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getUsed() - * 100 - / Runtime.getRuntime().maxMemory()); - } - enum GcGenerationAge { OLD, YOUNG, diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java index 645c2307eeaa4..e04586c0f11bf 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java @@ -79,13 +79,13 @@ public enum SystemMetric { JVM_THREADS_STATUS_THREADS("jvm_threads_states_threads"), JVM_GC_MAX_DATA_SIZE_BYTES("jvm_gc_max_data_size_bytes"), JVM_GC_LIVE_DATA_SIZE_BYTES("jvm_gc_live_data_size_bytes"), - JVM_GC_MEMORY_ALLOCATED_BYTES("jvm_gc_memory_allocated_bytes"), + JVM_GC_YOUNG_MEMORY_ALLOCATED_BYTES("jvm_gc_young_memory_allocated_bytes"), + JVM_GC_OLD_MEMORY_ALLOCATED_BYTES("jvm_gc_old_memory_allocated_bytes"), + JVM_GC_NON_GEN_MEMORY_ALLOCATED_BYTES("jvm_gc_non_gen_memory_allocated_bytes"), JVM_GC_MEMORY_PROMOTED_BYTES("jvm_gc_memory_promoted_bytes"), - JVM_GC_MEMORY_USED_PERCENT("jvm_gc_memory_used_percent"), + JVM_GC_PAUSE("jvm_gc_pause"), JVM_ZGC_CYCLES_COUNT("jvm_zgc_cycles_count"), JVM_ZGC_PAUSES_COUNT("jvm_zgc_pauses_count"), - JVM_GC_YOUNG_GC_COUNT("jvm_gc_old_gc_count"), - JVM_GC_OLD_GC_COUNT("jvm_gc_young_gc_count"), JVM_GC_ACCUMULATED_TIME_PERCENTAGE("jvm_gc_accumulated_time_percentage"), // net related RECEIVED_BYTES("received_bytes"),