Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/operations/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ These metrics are for the Druid Coordinator and are reset each time the Coordina

|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
| `service/heartbeat` | Metric indicating the service is up. `ServiceStatusMonitor` must be enabled. |`leader` on the Overlord and Coordinator.|1|
| `service/heartbeat` | Metric indicating the service is up. `ServiceStatusMonitor` must be enabled. | `leader` on the Overlord and Coordinator.<br />`workerVersion`, `category`, `status` on the Middle Manager.<br />`taskId`, `groupId`, `taskType`, `dataSource` on the Peon |1|

### Historical

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,5 +175,5 @@
"namespace/cache/numEntries" : { "dimensions" : [], "type" : "gauge" },
"namespace/cache/heapSizeInBytes" : { "dimensions" : [], "type" : "gauge" },

"service/heartbeat" : { "dimensions" : ["leader"], "type" : "gauge" }
"service/heartbeat" : { "dimensions" : ["leader"], "type" : "count" }
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be gauge IMO, as it's not a cumulative value. E.g. it doesn't make sense to say the total number of heartbeat yesterday.

Copy link
Copy Markdown
Contributor Author

@suneet-s suneet-s Aug 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the datadog docs - it looks like a count metric reports the count of the metrics retrieved in that interval and is not cumulative across intervals.

Suppose you are submitting a COUNT metric, activeusers.basket_size, from a single host running the Datadog Agent. This host emits the following values in a flush time interval: [1,1,1,2,2,2,3,3].

The Agent adds all of the values received in one time interval. Then, it submits the total number, in this case 15, as the COUNT metric’s value.

From an analysis point, it is very useful to know how many heartbeats were sent across a time period as it gives us an idea of uptime if we know how many heartbeats are sent per minute.

What do you think?

}
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ public class DruidMetrics

public static final String TAGS = "tags";

public static final String CATEGORY = "category";
public static final String WORKER_VERSION = "workerVersion";

public static int findNumComplexAggs(List<AggregatorFactory> aggs)
{
int retVal = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@
*/
public class ServiceStatusMonitor extends AbstractMonitor
{
/**
* The named binding for tags that should be reported with the `service/heartbeat` metric.
*/
public static final String HEARTBEAT_TAGS_BINDING = "heartbeat";

@Named("heartbeat")
@Named(HEARTBEAT_TAGS_BINDING)
@Inject(optional = true)
Supplier<Map<String, Object>> heartbeatTagsSupplier = null;

Expand All @@ -43,9 +47,7 @@ public boolean doMonitor(ServiceEmitter emitter)
{
final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder();
if (heartbeatTagsSupplier != null && heartbeatTagsSupplier.get() != null) {
heartbeatTagsSupplier.get().forEach((k, v) -> {
builder.setDimension(k, v);
});
heartbeatTagsSupplier.get().forEach(builder::setDimension);
}

emitter.emit(builder.build("service/heartbeat", 1));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.druid.java.util.emitter.service.ServiceEmitter;
import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
import org.apache.druid.java.util.metrics.AbstractMonitor;
import org.apache.druid.query.DruidMetrics;

import java.util.Set;

Expand Down Expand Up @@ -71,8 +72,8 @@ private void emit(ServiceEmitter emitter, String metricName, Long value)
{
if (value != null) {
final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder();
builder.setDimension("category", workerCategory);
builder.setDimension("workerVersion", workerVersion);
builder.setDimension(DruidMetrics.CATEGORY, workerCategory);
builder.setDimension(DruidMetrics.WORKER_VERSION, workerVersion);
emitter.emit(builder.build(metricName, value));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
import org.apache.druid.server.initialization.jetty.JettyServerInitializer;
import org.apache.druid.server.lookup.cache.LookupCoordinatorManager;
import org.apache.druid.server.lookup.cache.LookupCoordinatorManagerConfig;
import org.apache.druid.server.metrics.ServiceStatusMonitor;
import org.apache.druid.server.router.TieredBrokerConfig;
import org.eclipse.jetty.server.Server;
import org.joda.time.Duration;
Expand Down Expand Up @@ -332,7 +333,7 @@ public void configure(Binder binder)
binder.bind(RowIngestionMetersFactory.class).toProvider(Providers.of(null));
// Bind HeartbeatSupplier only when the service operates independently of Overlord.
binder.bind(new TypeLiteral<Supplier<Map<String, Object>>>() {})
.annotatedWith(Names.named("heartbeat"))
.annotatedWith(Names.named(ServiceStatusMonitor.HEARTBEAT_TAGS_BINDING))
.toProvider(HeartbeatSupplier.class);
}

Expand Down
18 changes: 18 additions & 0 deletions services/src/main/java/org/apache/druid/cli/CliMiddleManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@
package org.apache.druid.cli;

import com.github.rvesse.airline.annotations.Command;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.inject.Binder;
import com.google.inject.Inject;
import com.google.inject.Key;
import com.google.inject.Module;
import com.google.inject.Provides;
import com.google.inject.multibindings.MapBinder;
import com.google.inject.name.Named;
import com.google.inject.name.Names;
import com.google.inject.util.Providers;
import org.apache.druid.curator.ZkEnablementConfig;
Expand Down Expand Up @@ -67,6 +70,7 @@
import org.apache.druid.indexing.worker.shuffle.ShuffleModule;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.metadata.input.InputSourceModule;
import org.apache.druid.query.DruidMetrics;
import org.apache.druid.query.lookup.LookupSerdeModule;
import org.apache.druid.segment.incremental.RowIngestionMetersFactory;
import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager;
Expand All @@ -76,11 +80,13 @@
import org.apache.druid.server.DruidNode;
import org.apache.druid.server.http.SelfDiscoveryResource;
import org.apache.druid.server.initialization.jetty.JettyServerInitializer;
import org.apache.druid.server.metrics.ServiceStatusMonitor;
import org.apache.druid.server.metrics.WorkerTaskCountStatsProvider;
import org.apache.druid.timeline.PruneLastCompactionState;
import org.eclipse.jetty.server.Server;

import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

Expand Down Expand Up @@ -195,6 +201,18 @@ private void configureIntermediaryData(Binder binder)
biddy.addBinding("deepstore").to(DeepStorageIntermediaryDataManager.class).in(LazySingleton.class);
}

@Provides
@LazySingleton
@Named(ServiceStatusMonitor.HEARTBEAT_TAGS_BINDING)
public Supplier<Map<String, Object>> heartbeatDimensions(WorkerConfig workerConfig, WorkerTaskManager workerTaskManager)
{
return () -> ImmutableMap.of(
DruidMetrics.WORKER_VERSION, workerConfig.getVersion(),
DruidMetrics.CATEGORY, workerConfig.getCategory(),
DruidMetrics.STATUS, workerTaskManager.isWorkerEnabled() ? "Enabled" : "Disabled"
);
}

@Provides
@LazySingleton
public Worker getWorker(@Self DruidNode node, WorkerConfig config)
Expand Down
3 changes: 2 additions & 1 deletion services/src/main/java/org/apache/druid/cli/CliOverlord.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
import org.apache.druid.server.initialization.ServerConfig;
import org.apache.druid.server.initialization.jetty.JettyServerInitUtils;
import org.apache.druid.server.initialization.jetty.JettyServerInitializer;
import org.apache.druid.server.metrics.ServiceStatusMonitor;
import org.apache.druid.server.metrics.TaskCountStatsProvider;
import org.apache.druid.server.metrics.TaskSlotCountStatsProvider;
import org.apache.druid.server.security.AuthConfig;
Expand Down Expand Up @@ -361,7 +362,7 @@ public TaskStorageDirTracker getTaskStorageDirTracker(WorkerConfig workerConfig,

@Provides
@LazySingleton
@Named("heartbeat")
@Named(ServiceStatusMonitor.HEARTBEAT_TAGS_BINDING)
public Supplier<Map<String, Object>> getHeartbeatSupplier(TaskMaster taskMaster)
{
return () -> {
Expand Down
21 changes: 21 additions & 0 deletions services/src/main/java/org/apache/druid/cli/CliPeon.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@
import com.github.rvesse.airline.annotations.Command;
import com.github.rvesse.airline.annotations.Option;
import com.github.rvesse.airline.annotations.restrictions.Required;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.inject.Binder;
import com.google.inject.Inject;
Expand Down Expand Up @@ -96,6 +99,7 @@
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.metadata.IndexerSQLMetadataStorageCoordinator;
import org.apache.druid.metadata.input.InputSourceModule;
import org.apache.druid.query.DruidMetrics;
import org.apache.druid.query.QuerySegmentWalker;
import org.apache.druid.query.lookup.LookupModule;
import org.apache.druid.segment.handoff.CoordinatorBasedSegmentHandoffNotifierConfig;
Expand Down Expand Up @@ -124,12 +128,14 @@
import org.apache.druid.server.initialization.jetty.ChatHandlerServerModule;
import org.apache.druid.server.initialization.jetty.JettyServerInitializer;
import org.apache.druid.server.metrics.DataSourceTaskIdHolder;
import org.apache.druid.server.metrics.ServiceStatusMonitor;
import org.eclipse.jetty.server.Server;

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

Expand Down Expand Up @@ -259,6 +265,21 @@ public void configure(Binder binder)
}
}

@Provides
@LazySingleton
@Named(ServiceStatusMonitor.HEARTBEAT_TAGS_BINDING)
public Supplier<Map<String, Object>> heartbeatDimensions(Task task)
{
return Suppliers.ofInstance(
ImmutableMap.of(
DruidMetrics.TASK_ID, task.getId(),
DruidMetrics.DATASOURCE, task.getDataSource(),
DruidMetrics.TASK_TYPE, task.getType(),
DruidMetrics.GROUP_ID, task.getGroupId()
)
);
}

@Provides
@LazySingleton
public Task readTask(@Json ObjectMapper mapper, ExecutorLifecycleConfig config)
Expand Down