From bf7da520260e23166210ee6b6ffe0a3e73fe5c82 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Mon, 20 Apr 2026 23:32:58 -0700 Subject: [PATCH] feat: MSQ: Log full stack trace when "debug" is set. Typically controllers and workers only log the full stack trace for unknown faults and for DruidExceptions that have a non-USER persona. The rationale is to avoid log spam. However, sometimes it is useful to see them. This patch changes things so the full stack traces are logged when the "debug" flag is set in the context. Test contexts are also updated such that the full stack trace is always logged in tests. Finally, this patch also calls "wasDeserialized()" on the DruidException builder when re-creating DruidExceptions on the controller. This avoids having a stack trace assigned that isn't useful. --- .../druid/msq/dart/controller/DartControllerContext.java | 6 ++++++ .../apache/druid/msq/dart/worker/DartWorkerContext.java | 6 ++++++ .../java/org/apache/druid/msq/exec/ControllerContext.java | 5 +++++ .../java/org/apache/druid/msq/exec/ControllerImpl.java | 4 ++-- .../src/main/java/org/apache/druid/msq/exec/MSQTasks.java | 7 ++++--- .../java/org/apache/druid/msq/exec/WorkerContext.java | 5 +++++ .../main/java/org/apache/druid/msq/exec/WorkerImpl.java | 2 +- .../druid/msq/indexing/IndexerControllerContext.java | 6 ++++++ .../apache/druid/msq/indexing/IndexerWorkerContext.java | 8 ++++++++ .../druid/msq/indexing/error/DruidExceptionFault.java | 1 + .../apache/druid/msq/test/MSQTestControllerContext.java | 6 ++++++ .../org/apache/druid/msq/test/MSQTestWorkerContext.java | 6 ++++++ .../msq/test/TestDartControllerContextFactoryImpl.java | 6 ++++++ .../main/java/org/apache/druid/error/DruidException.java | 2 +- 14 files changed, 63 insertions(+), 7 deletions(-) diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/dart/controller/DartControllerContext.java b/multi-stage-query/src/main/java/org/apache/druid/msq/dart/controller/DartControllerContext.java index a4a4ff8945a4..52936b8e0f87 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/dart/controller/DartControllerContext.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/dart/controller/DartControllerContext.java @@ -254,4 +254,10 @@ public int targetPartitionsPerWorker() DEFAULT_TARGET_PARTITIONS_PER_WORKER ); } + + @Override + public boolean isDebug() + { + return context.isDebug(); + } } diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/dart/worker/DartWorkerContext.java b/multi-stage-query/src/main/java/org/apache/druid/msq/dart/worker/DartWorkerContext.java index 6aaddba48d5a..bde9f8968a0d 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/dart/worker/DartWorkerContext.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/dart/worker/DartWorkerContext.java @@ -274,6 +274,12 @@ public boolean includeAllCounters() return true; } + @Override + public boolean isDebug() + { + return queryContext.isDebug(); + } + @Override public DruidNode selfNode() { diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java index 2d14c10788b2..e12c5c3b3362 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java @@ -135,4 +135,9 @@ default File taskTempDir() * shuffle specs that have {@link ShuffleSpec#isAdjustable()} set to true. */ int targetPartitionsPerWorker(); + + /** + * Whether the controller should log full stack traces on error. + */ + boolean isDebug(); } diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java index f616db7a032a..83568629be1d 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java @@ -487,11 +487,11 @@ private MSQTaskReportPayload runInternal(final QueryListener queryListener, fina // Log the errors we encountered. if (controllerError != null) { - log.warn("Controller: %s", MSQTasks.errorReportToLogMessage(controllerError)); + log.warn("Controller: %s", MSQTasks.errorReportToLogMessage(controllerError, context.isDebug())); } if (workerError != null) { - log.warn("Worker: %s", MSQTasks.errorReportToLogMessage(workerError)); + log.warn("Worker: %s", MSQTasks.errorReportToLogMessage(workerError, context.isDebug())); } } if (queryKernel != null && queryKernel.isSuccess()) { diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java index b47124aef6b4..ea47ded0c922 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MSQTasks.java @@ -203,9 +203,10 @@ static MSQErrorReport makeErrorReport( } /** - * Returns a string form of a {@link MSQErrorReport} suitable for logging. + * Returns a string form of a {@link MSQErrorReport} suitable for logging. When {@code forceFullStackTrace} is true, + * the full stack trace is always included (when available), regardless of fault type. */ - static String errorReportToLogMessage(final MSQErrorReport errorReport) + static String errorReportToLogMessage(final MSQErrorReport errorReport, final boolean forceFullStackTrace) { final StringBuilder logMessage = new StringBuilder("Work failed"); @@ -222,7 +223,7 @@ static String errorReportToLogMessage(final MSQErrorReport errorReport) logMessage.append(": ").append(MSQFaultUtils.generateMessageWithErrorCode(errorReport.getFault())); if (errorReport.getExceptionStackTrace() != null) { - if (logFullStackTrace(errorReport.getFault())) { + if (forceFullStackTrace || logFullStackTrace(errorReport.getFault())) { logMessage.append('\n').append(errorReport.getExceptionStackTrace()); } else { // Log first line only (error class, message) for known faults, to avoid polluting logs. diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java index 61c1dd9654e3..9e696a16cce6 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java @@ -116,6 +116,11 @@ public interface WorkerContext extends Closeable */ boolean includeAllCounters(); + /** + * Whether to log full stack traces for all errors. + */ + boolean isDebug(); + @Override void close(); } diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java index 0b3bfdc235ae..fa7d5c232ce6 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java @@ -195,7 +195,7 @@ public void run() if (maybeErrorReport.isPresent()) { final MSQErrorReport errorReport = maybeErrorReport.get(); - final String logMessage = MSQTasks.errorReportToLogMessage(errorReport); + final String logMessage = MSQTasks.errorReportToLogMessage(errorReport, context.isDebug()); log.warn("%s", logMessage); // Inform controller of any errors that occur, unless we were canceled. This prevents attempting to contact diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java b/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java index cd91376e5c9f..18aebe0bcd21 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java @@ -259,6 +259,12 @@ public int targetPartitionsPerWorker() ); } + @Override + public boolean isDebug() + { + return taskQuerySpecContext.isDebug(); + } + /** * Helper method for {@link #queryKernelConfig(MSQSpec)}. Also used in tests. */ diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java b/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java index 1e445f5e2c9b..3a50cdc71f20 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java @@ -93,6 +93,7 @@ public class IndexerWorkerContext implements WorkerContext private final int maxConcurrentStages; private final boolean liveReportCounters; private final boolean includeAllCounters; + private final boolean debug; private final int threadCount; // Written under synchronized(this) using double-checked locking. @@ -134,6 +135,7 @@ public IndexerWorkerContext( ); this.liveReportCounters = MultiStageQueryContext.getLiveReportCounters(queryContext, DEFAULT_LIVE_REPORT_COUNTERS); this.includeAllCounters = MultiStageQueryContext.getIncludeAllCounters(queryContext); + this.debug = queryContext.isDebug(); // Compute thread count once in constructor final int baseThreadCount = memoryIntrospector.numProcessingThreads(); @@ -327,6 +329,12 @@ public boolean includeAllCounters() return includeAllCounters; } + @Override + public boolean isDebug() + { + return debug; + } + public ServiceLocator controllerLocator() { return controllerLocator; diff --git a/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/DruidExceptionFault.java b/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/DruidExceptionFault.java index 32dded914ee3..d58578e7f29f 100644 --- a/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/DruidExceptionFault.java +++ b/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/DruidExceptionFault.java @@ -106,6 +106,7 @@ public DruidException toDruidException() return DruidException.forPersona(personaEnum) .ofCategory(categoryEnum) .withErrorCode(druidErrorCode) + .wasDeserialized() .build(getErrorMessage()) .withContext(context); } diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java b/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java index 5bae51c53e3f..6f09c43ff028 100644 --- a/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java +++ b/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java @@ -485,6 +485,12 @@ public int targetPartitionsPerWorker() return 1; } + @Override + public boolean isDebug() + { + return true; + } + @Override public ControllerContext newContext(QueryContext context) { diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java b/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java index a926aabee6f7..a8773635484c 100644 --- a/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java +++ b/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java @@ -206,6 +206,12 @@ public boolean includeAllCounters() return true; } + @Override + public boolean isDebug() + { + return true; + } + @Override public void close() { diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/test/TestDartControllerContextFactoryImpl.java b/multi-stage-query/src/test/java/org/apache/druid/msq/test/TestDartControllerContextFactoryImpl.java index 6032afc853bc..c3eaf6e9c9eb 100644 --- a/multi-stage-query/src/test/java/org/apache/druid/msq/test/TestDartControllerContextFactoryImpl.java +++ b/multi-stage-query/src/test/java/org/apache/druid/msq/test/TestDartControllerContextFactoryImpl.java @@ -108,6 +108,12 @@ public void emitMetric(MSQMetricEventBuilder metricBuilder) { serviceEmitter.emit(metricBuilder.build("controller", queryId())); } + + @Override + public boolean isDebug() + { + return true; + } }; } diff --git a/processing/src/main/java/org/apache/druid/error/DruidException.java b/processing/src/main/java/org/apache/druid/error/DruidException.java index 4883c8115627..54382f5274b6 100644 --- a/processing/src/main/java/org/apache/druid/error/DruidException.java +++ b/processing/src/main/java/org/apache/druid/error/DruidException.java @@ -477,7 +477,7 @@ public DruidExceptionBuilder ofCategory(Category category) * * @return the builder */ - DruidExceptionBuilder wasDeserialized() + public DruidExceptionBuilder wasDeserialized() { this.deserialized = true; return this;