Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 46 additions & 32 deletions core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
Original file line number Diff line number Diff line change
Expand Up @@ -210,10 +210,10 @@ public enum Metric {
"Total number of minor compactions performed.", MetricDocSection.COMPACTION, "Minc Completed",
null, NUMBER),
TSERVER_TABLETS_ONLINE("accumulo.tablets.online", MetricType.GAUGE, "Number of online tablets.",
MetricDocSection.TABLET_SERVER, "Tablets Online", null, NUMBER),
MetricDocSection.TABLET_SERVER, "Tablets", null, NUMBER),
TSERVER_TABLETS_LONG_ASSIGNMENTS("accumulo.tablets.assignments.warning", MetricType.GAUGE,
"Number of tablet assignments that are taking longer than the configured warning duration.",
MetricDocSection.TABLET_SERVER, "Tablet Assignments Overdue", null, NUMBER),
MetricDocSection.TABLET_SERVER, "Slow Assignments", null, NUMBER),
TSERVER_TABLETS_OPENING("accumulo.tablets.opening", MetricType.GAUGE,
"Number of opening tablets.", MetricDocSection.TABLET_SERVER, "Tablets Opening", null,
NUMBER),
Expand All @@ -228,9 +228,10 @@ public enum Metric {
TSERVER_INGEST_BYTES("accumulo.ingest.bytes", MetricType.GAUGE,
"Ingest byte count. The rate can be derived from this metric.",
MetricDocSection.TABLET_SERVER, "Bytes Ingested", null, BYTES),
// TODO does this duration expect millis in javascript?
TSERVER_HOLD("accumulo.ingest.hold", MetricType.GAUGE,
"Duration for which commits have been held in milliseconds.", MetricDocSection.TABLET_SERVER,
"Ingest Commit Hold Time", null, NUMBER),
"Hold Time", null, DURATION),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The null parameter here is for the column description. It's used as the text displayed in the Monitor UI when hovering over the column header in a table. When null is present, the value defaults to the property description. In some cases we may want to be more descriptive in the Monitor, as "Hold Time" may not mean anything to a novice (or even an experienced) user. We could even be very descriptive in the Monitor, telling the user what the column value means and what may cause it. For example,

Duration for which the TabletServer has not been accepting new mutations. Acceptance of new mutations is held while the TabletServer waits for some other activity to complete. This is typically a sync of the write-ahead log or a minor compaction. Frequent small hold times are normal, whereas large hold times could indicate a problem that needs to be investigated.

TSERVER_TABLETS_ONLINE_ONDEMAND("accumulo.tablets.ondemand.online", MetricType.GAUGE,
"Number of online on-demand tablets", MetricDocSection.TABLET_SERVER,
"Online On-Demand Tablets", null, NUMBER),
Expand All @@ -253,46 +254,50 @@ public enum Metric {
null, NUMBER),

// Scan Metrics
SCAN_BUSY_TIMEOUT_COUNT("accumulo.scan.busy.timeout.count", MetricType.COUNTER,
SCAN_BUSY_TIMEOUT_COUNT("accumulo.scan.busy.timeout.count", MetricType.FUNCTION_COUNTER,
"Count of the scans where a busy timeout happened.", MetricDocSection.SCAN, "Scan Busy Count",
null, NUMBER),
SCAN_TIMES("accumulo.scan.times", MetricType.TIMER, "Scan session lifetime (creation to close).",
MetricDocSection.SCAN, "Scan Session Total Time", null, NUMBER),
SCAN_OPEN_FILES("accumulo.scan.files.open", MetricType.GAUGE, "Number of files open for scans.",
MetricDocSection.SCAN, "Scan Files Open", null, NUMBER),
SCAN_RESULTS("accumulo.scan.result", MetricType.GAUGE, "Results per scan.", MetricDocSection.SCAN,
"Scan Result Count", null, NUMBER),
SCAN_YIELDS("accumulo.scan.yields", MetricType.GAUGE, "Counts scans that have yielded.",
MetricDocSection.SCAN, "Scan Yield Count", null, NUMBER),
SCAN_START("accumulo.scan.start", MetricType.COUNTER,
SCAN_RESULTS("accumulo.scan.result", MetricType.DISTRIBUTION_SUMMARY, "Results per scan.",
MetricDocSection.SCAN, "Scan Result Count", null, NUMBER),
SCAN_YIELDS("accumulo.scan.yields", MetricType.DISTRIBUTION_SUMMARY,
"Counts scans that have yielded.", MetricDocSection.SCAN, "Scan Yield Count", null, NUMBER),
SCAN_START("accumulo.scan.start", MetricType.FUNCTION_COUNTER,
"Number of calls to start a scan or multiscan.", MetricDocSection.SCAN, "Scan Start Count",
null, NUMBER),
SCAN_CONTINUE("accumulo.scan.continue", MetricType.COUNTER,
SCAN_CONTINUE("accumulo.scan.continue", MetricType.FUNCTION_COUNTER,
"Number of calls to continue a scan or multiscan.", MetricDocSection.SCAN,
"Scan Continue Count", null, NUMBER),
SCAN_CLOSE("accumulo.scan.close", MetricType.COUNTER,
SCAN_CLOSE("accumulo.scan.close", MetricType.FUNCTION_COUNTER,
"Number of calls to close a scan or multiscan.", MetricDocSection.SCAN, "Scan Close Count",
null, NUMBER),
SCAN_QUERIES("accumulo.scan.queries", MetricType.GAUGE, "Number of queries made during scans.",
MetricDocSection.SCAN, "Tablet Lookup Count", null, NUMBER),
SCAN_SCANNED_ENTRIES("accumulo.scan.query.scanned.entries", MetricType.GAUGE,
SCAN_QUERIES("accumulo.scan.queries", MetricType.FUNCTION_COUNTER,
"Number of queries made during scans.", MetricDocSection.SCAN, "Tablet Lookup Count", null,
NUMBER),
SCAN_SCANNED_ENTRIES("accumulo.scan.query.scanned.entries", MetricType.FUNCTION_COUNTER,
"Count of scanned entries. The rate can be derived from this metric.", MetricDocSection.SCAN,
"Scanned Entry Count", null, NUMBER),
SCAN_QUERY_SCAN_RESULTS("accumulo.scan.query.results", MetricType.GAUGE,
"Scanned Entries", null, NUMBER),
SCAN_QUERY_SCAN_RESULTS("accumulo.scan.query.results", MetricType.FUNCTION_COUNTER,
"Query count. The rate can be derived from this metric.", MetricDocSection.SCAN,
"Returned Entry Count", null, NUMBER),
SCAN_QUERY_SCAN_RESULTS_BYTES("accumulo.scan.query.results.bytes", MetricType.GAUGE,
"Returned Entries", null, NUMBER),
SCAN_QUERY_SCAN_RESULTS_BYTES("accumulo.scan.query.results.bytes", MetricType.FUNCTION_COUNTER,
"Query byte count. The rate can be derived from this metric.", MetricDocSection.SCAN,
"Returned Bytes Count", null, BYTES),
SCAN_PAUSED_FOR_MEM("accumulo.scan.paused.for.memory", MetricType.COUNTER,
"Returned Bytes", null, BYTES),
SCAN_PAUSED_FOR_MEM("accumulo.scan.paused.for.memory", MetricType.FUNCTION_COUNTER,
"Count of scans paused due to server being low on memory.", MetricDocSection.SCAN,
"Scans Paused For Low Memory", null, NUMBER),
SCAN_RETURN_FOR_MEM("accumulo.scan.return.early.for.memory", MetricType.COUNTER,
SCAN_RETURN_FOR_MEM("accumulo.scan.return.early.for.memory", MetricType.FUNCTION_COUNTER,
"Count of scans that returned results early due to server being low on memory.",
MetricDocSection.SCAN, "Scans Returned Early For Low Memory", null, NUMBER),
SCAN_ZOMBIE_THREADS("accumulo.scan.zombie.threads", MetricType.GAUGE,
"Number of scan threads that have no associated client session.", MetricDocSection.SCAN,
"Scan Zombie Thread Count", null, NUMBER),
SCAN_ERRORS("accumulo.scan.errors", MetricType.FUNCTION_COUNTER,
"Number of scan task that had an exception.", MetricDocSection.SCAN, "Failed scans", null,
NUMBER),

// Major Compaction Metrics
MAJC_PAUSED("accumulo.compaction.majc.paused", MetricType.COUNTER,
Expand Down Expand Up @@ -334,23 +339,22 @@ public enum Metric {

// Block Cache Metrics
BLOCKCACHE_INDEX_HITCOUNT("accumulo.blockcache.index.hitcount", MetricType.FUNCTION_COUNTER,
"Index block cache hit count.", MetricDocSection.BLOCK_CACHE, "Index Block Cache Hit Count",
null, NUMBER),
"Index block cache hit count.", MetricDocSection.BLOCK_CACHE, "Index Cache Hit", null,
NUMBER),
BLOCKCACHE_INDEX_REQUESTCOUNT("accumulo.blockcache.index.requestcount",
MetricType.FUNCTION_COUNTER, "Index block cache request count.", MetricDocSection.BLOCK_CACHE,
"Index Block Cache Request Count", null, NUMBER),
"Index Cache Request", null, NUMBER),
BLOCKCACHE_INDEX_EVICTIONCOUNT("accumulo.blockcache.index.evictioncount",
MetricType.FUNCTION_COUNTER, "Index block cache eviction count.",
MetricDocSection.BLOCK_CACHE, "Index Block Cache Eviction Count", null, NUMBER),
MetricDocSection.BLOCK_CACHE, "Index Cache Eviction", null, NUMBER),
BLOCKCACHE_DATA_HITCOUNT("accumulo.blockcache.data.hitcount", MetricType.FUNCTION_COUNTER,
"Data block cache hit count.", MetricDocSection.BLOCK_CACHE, "Data Block Cache Hit Count",
null, NUMBER),
"Data block cache hit count.", MetricDocSection.BLOCK_CACHE, "Data Cache Hit", null, NUMBER),
BLOCKCACHE_DATA_REQUESTCOUNT("accumulo.blockcache.data.requestcount", MetricType.FUNCTION_COUNTER,
"Data block cache request count.", MetricDocSection.BLOCK_CACHE,
"Data Block Cache Request Count", null, NUMBER),
"Data block cache request count.", MetricDocSection.BLOCK_CACHE, "Data Cache Request", null,
NUMBER),
BLOCKCACHE_DATA_EVICTIONCOUNT("accumulo.blockcache.data.evictioncount",
MetricType.FUNCTION_COUNTER, "Data block cache eviction count.", MetricDocSection.BLOCK_CACHE,
"Data Block Cache Eviction Count", null, NUMBER),
"Data Cache Eviction", null, NUMBER),
BLOCKCACHE_SUMMARY_HITCOUNT("accumulo.blockcache.summary.hitcount", MetricType.FUNCTION_COUNTER,
"Summary block cache hit count.", MetricDocSection.BLOCK_CACHE,
"Summary Block Cache Hit Count", null, NUMBER),
Expand Down Expand Up @@ -387,16 +391,26 @@ public enum Metric {
MetricDocSection.GENERAL_SERVER, "Tablet Recovery Longest Time", null, DURATION),
RECOVERIES_AVG_PROGRESS("accumulo.recoveries.avg.progress", MetricType.GAUGE,
"The average percentage (0.0 - 99.9) of the in progress recoveries.",
MetricDocSection.GENERAL_SERVER, "Tablet Recovery Avg Percent Complete", null, PERCENT);
MetricDocSection.GENERAL_SERVER, "Tablet Recovery Avg Percent Complete", null, PERCENT),

// Executor metrics
EXECUTOR_COMPLETED("executor.completed", MetricType.FUNCTION_COUNTER,
"Task completed by a thread pool. Each thread pool emits this metric w/ a different tag.",
MetricDocSection.GENERAL_SERVER, "Completed task", null, NUMBER),
EXECUTOR_QUEUED("executor.queued", MetricType.GAUGE,
"Task queued for a thread pool. Each thread pool emits this metric w/ a different tag.",
MetricDocSection.GENERAL_SERVER, "Queued task", null, NUMBER);

public static enum MonitorCssClass {
public enum MonitorCssClass {
BYTES("big-size"),
BYTES_RATE("rate-size"),
DATE_END("end-date"),
DATE_START("start-date"),
DURATION("duration"),
IDLE_STATE("idle-state"),
MEMORY_STATE("memory-state"),
NUMBER("big-num"),
RATE("rate-num"),
PERCENT("percent");

private final String cssClass;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.accumulo.core.metrics;

import java.time.Duration;
import java.util.concurrent.TimeUnit;

import io.micrometer.core.instrument.Clock;
import io.micrometer.core.instrument.step.StepMeterRegistry;
import io.micrometer.core.instrument.step.StepRegistryConfig;

/**
 * A {@link StepMeterRegistry} with a fixed {@link #STEP} interval whose {@link #publish()} does
 * nothing: this registry never pushes measurements anywhere itself.
 *
 * <p>
 * NOTE(review): presumably the meters are read directly from the registry by whoever holds a
 * reference to it — confirm against the callers.
 */
public class MonitorMeterRegistry extends StepMeterRegistry {

  /** Step interval reported by this registry's configuration. */
  public static final Duration STEP = Duration.ofSeconds(30);

  /** Single shared, stateless configuration handed to every registry instance. */
  private static final StepRegistryConfig MONITOR_CONFIG = new MonitorStepConfig();

  /**
   * Creates a registry using the fixed monitor configuration and the system clock.
   */
  public MonitorMeterRegistry() {
    super(MONITOR_CONFIG, Clock.SYSTEM);
  }

  /**
   * Intentionally empty; nothing is exported when a publish is triggered.
   */
  @Override
  protected void publish() {
    // no-op
  }

  /**
   * @return {@link TimeUnit#MILLISECONDS}, the base time unit of this registry
   */
  @Override
  protected TimeUnit getBaseTimeUnit() {
    return TimeUnit.MILLISECONDS;
  }

  /**
   * Hard-coded configuration: prefix {@code "monitor"}, step of {@link #STEP}, and no value for
   * any looked-up key.
   */
  private static final class MonitorStepConfig implements StepRegistryConfig {

    @Override
    public String prefix() {
      return "monitor";
    }

    @Override
    public String get(String key) {
      // No externally supplied properties; every lookup yields null.
      return null;
    }

    @Override
    public Duration step() {
      return STEP;
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ public enum ThreadPoolNames {
MANAGER_UPGRADE_COORDINATOR_METADATA_POOL("accumulo.pool.manager.upgrade.metadata"),
METADATA_TABLET_MIGRATION_POOL("accumulo.pool.metadata.tablet.migration"),
METADATA_TABLET_ASSIGNMENT_POOL("accumulo.pool.metadata.tablet.assignment"),
RPC_POOL("accumulo.pool.rpc"),
SCAN_EXECUTOR_PREFIX("accumulo.pool.scan.exec."),
SCAN_SERVER_TABLET_METADATA_CACHE_POOL("accumulo.pool.scan.server.tablet.metadata.cache"),
SCANNER_READ_AHEAD_POOL("accumulo.pool.client.context.scanner.read.ahead"),
SCHED_FUTURE_CHECKER_POOL("accumulo.pool.scheduled.future.checker"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.apache.accumulo.core.util.threads.Threads;
import org.apache.accumulo.server.mem.LowMemoryDetector;
import org.apache.accumulo.server.metrics.MetricResponseWrapper;
import org.apache.accumulo.server.metrics.MetricsInfoImpl;
import org.apache.accumulo.server.metrics.ProcessMetrics;
import org.apache.accumulo.server.rpc.ServerAddress;
import org.apache.accumulo.server.security.SecurityUtil;
Expand All @@ -67,7 +68,6 @@
import com.google.flatbuffers.FlatBufferBuilder;

import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Metrics;

public abstract class AbstractServer
implements AutoCloseable, MetricsProducer, Runnable, ServerProcessService.Iface {
Expand Down Expand Up @@ -404,9 +404,12 @@ public MetricResponse getMetrics(TInfo tinfo, TCredentials credentials) throws T
response.setResourceGroup(getResourceGroup().canonical());
response.setTimestamp(System.currentTimeMillis());

if (context.getMetricsInfo().isMetricsEnabled()) {
Metrics.globalRegistry.getMeters().forEach(m -> {
if (m.getId().getName().startsWith("accumulo.")) {
var registry = MetricsInfoImpl.MONITOR_REGISTRY.get();
if (registry != null) {
registry.getMeters().forEach(m -> {
if (m.getId().getName().startsWith("accumulo.")
|| m.getId().getName().equals(Metric.EXECUTOR_COMPLETED.getName())
|| m.getId().getName().equals(Metric.EXECUTOR_QUEUED.getName())) {
if (!this.monitorMetricExclusions.contains(m.getId().getName())) {
m.match(response::writeMeter, response::writeMeter, response::writeTimer,
response::writeDistributionSummary, response::writeLongTaskTimer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.accumulo.core.classloader.ClassLoaderUtil;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.metrics.MetricsInfo;
import org.apache.accumulo.core.metrics.MetricsProducer;
import org.apache.accumulo.core.metrics.MonitorMeterRegistry;
import org.apache.accumulo.core.spi.metrics.MeterRegistryFactory;
import org.apache.accumulo.core.util.threads.ThreadPools;
import org.apache.accumulo.server.ServerContext;
Expand Down Expand Up @@ -67,6 +69,8 @@ public class MetricsInfoImpl implements MetricsInfo {

private final List<MetricsProducer> producers = new ArrayList<>();

public static final AtomicReference<MeterRegistry> MONITOR_REGISTRY = new AtomicReference<>();

public MetricsInfoImpl(final ServerContext context) {
this.context = context;
metricsEnabled = context.getConfiguration().getBoolean(Property.GENERAL_MICROMETER_ENABLED);
Expand Down Expand Up @@ -164,6 +168,9 @@ public synchronized void init(Collection<Tag> tags) {
}
}

MONITOR_REGISTRY.set(new MonitorMeterRegistry());
Metrics.globalRegistry.add(MONITOR_REGISTRY.get());

// Set the MeterRegistry on the ThreadPools
ThreadPools.getServerThreadPools().setMeterRegistry(Metrics.globalRegistry);

Expand Down
Loading