-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-31440][SQL] Improve SQL Rest API #28208
Changes from all commits
c89c5a4
d4a78a7
06bc108
6c5c2f9
817ebab
b0a9149
295d727
507b6be
2a1e388
e685b94
6e9b55f
c0660b1
2743296
2f9522f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,46 +21,55 @@ import java.util.Date | |
import javax.ws.rs._ | ||
import javax.ws.rs.core.MediaType | ||
|
||
import scala.util.{Failure, Success, Try} | ||
|
||
import org.apache.spark.JobExecutionStatus | ||
import org.apache.spark.sql.execution.ui.{SQLAppStatusStore, SQLExecutionUIData, SQLPlanMetric} | ||
import org.apache.spark.sql.execution.ui.{SparkPlanGraph, SparkPlanGraphCluster, SparkPlanGraphNode, SQLAppStatusStore, SQLExecutionUIData} | ||
import org.apache.spark.status.api.v1.{BaseAppResource, NotFoundException} | ||
|
||
@Produces(Array(MediaType.APPLICATION_JSON)) | ||
private[v1] class SqlResource extends BaseAppResource { | ||
|
||
val WHOLE_STAGE_CODEGEN = "WholeStageCodegen" | ||
|
||
@GET | ||
def sqlList( | ||
@DefaultValue("false") @QueryParam("details") details: Boolean, | ||
@DefaultValue("true") @QueryParam("details") details: Boolean, | ||
@DefaultValue("true") @QueryParam("planDescription") planDescription: Boolean, | ||
@DefaultValue("0") @QueryParam("offset") offset: Int, | ||
@DefaultValue("20") @QueryParam("length") length: Int): Seq[ExecutionData] = { | ||
withUI { ui => | ||
val sqlStore = new SQLAppStatusStore(ui.store.store) | ||
sqlStore.executionsList(offset, length).map(prepareExecutionData(_, details)) | ||
sqlStore.executionsList(offset, length).map { exec => | ||
val graph = sqlStore.planGraph(exec.executionId) | ||
prepareExecutionData(exec, graph, details, planDescription) | ||
} | ||
} | ||
} | ||
|
||
@GET | ||
@Path("{executionId:\\d+}") | ||
def sql( | ||
@PathParam("executionId") execId: Long, | ||
@DefaultValue("false") @QueryParam("details") details: Boolean): ExecutionData = { | ||
@DefaultValue("true") @QueryParam("details") details: Boolean, | ||
@DefaultValue("true") @QueryParam("planDescription") | ||
planDescription: Boolean): ExecutionData = { | ||
withUI { ui => | ||
val sqlStore = new SQLAppStatusStore(ui.store.store) | ||
val graph = sqlStore.planGraph(execId) | ||
sqlStore | ||
.execution(execId) | ||
.map(prepareExecutionData(_, details)) | ||
.getOrElse(throw new NotFoundException("unknown id: " + execId)) | ||
.map(prepareExecutionData(_, graph, details, planDescription)) | ||
.getOrElse(throw new NotFoundException("unknown query execution id: " + execId)) | ||
} | ||
} | ||
|
||
private def printableMetrics( | ||
metrics: Seq[SQLPlanMetric], | ||
metricValues: Map[Long, String]): Seq[Metrics] = { | ||
metrics.map(metric => | ||
Metrics(metric.name, metricValues.get(metric.accumulatorId).getOrElse(""))) | ||
} | ||
private def prepareExecutionData( | ||
exec: SQLExecutionUIData, | ||
graph: SparkPlanGraph, | ||
details: Boolean, | ||
planDescription: Boolean): ExecutionData = { | ||
|
||
private def prepareExecutionData(exec: SQLExecutionUIData, details: Boolean): ExecutionData = { | ||
var running = Seq[Int]() | ||
var completed = Seq[Int]() | ||
var failed = Seq[Int]() | ||
|
@@ -84,18 +93,65 @@ private[v1] class SqlResource extends BaseAppResource { | |
} | ||
|
||
val duration = exec.completionTime.getOrElse(new Date()).getTime - exec.submissionTime | ||
val planDetails = if (details) exec.physicalPlanDescription else "" | ||
val metrics = if (details) printableMetrics(exec.metrics, exec.metricValues) else Seq.empty | ||
val planDetails = if (planDescription) exec.physicalPlanDescription else "" | ||
val nodes = if (details) printableMetrics(graph.allNodes, exec.metricValues) else Seq.empty | ||
val edges = if (details) graph.edges else Seq.empty | ||
|
||
new ExecutionData( | ||
exec.executionId, | ||
status, | ||
exec.description, | ||
planDetails, | ||
metrics, | ||
new Date(exec.submissionTime), | ||
duration, | ||
running, | ||
completed, | ||
failed) | ||
failed, | ||
nodes, | ||
edges) | ||
} | ||
|
||
private def printableMetrics(allNodes: Seq[SparkPlanGraphNode], | ||
metricValues: Map[Long, String]): Seq[Node] = { | ||
|
||
def getMetric(metricValues: Map[Long, String], accumulatorId: Long, | ||
metricName: String): Option[Metric] = { | ||
|
||
metricValues.get(accumulatorId).map( mv => { | ||
val metricValue = if (mv.startsWith("\n")) mv.substring(1, mv.length) else mv | ||
Metric(metricName, metricValue) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Reviewer comment: Actually, after #28037, part of the metric's name is in the value. But this is not strongly related to this PR; we can just fix it in another PR.

Reply: Yes, sounds good. |
||
}) | ||
} | ||
|
||
val nodeIdAndWSCGIdMap = getNodeIdAndWSCGIdMap(allNodes) | ||
val nodes = allNodes.map { node => | ||
val wholeStageCodegenId = nodeIdAndWSCGIdMap.get(node.id).flatten | ||
val metrics = | ||
node.metrics.flatMap(m => getMetric(metricValues, m.accumulatorId, m.name.trim)) | ||
Node(nodeId = node.id, nodeName = node.name.trim, wholeStageCodegenId, metrics) | ||
} | ||
|
||
nodes.sortBy(_.nodeId).reverse | ||
} | ||
|
||
/**
 * Builds a mapping from plan-graph node id to the id of the WholeStageCodegen
 * cluster containing that node, if any.
 *
 * Only nodes nested inside a `SparkPlanGraphCluster` whose trimmed name starts
 * with `WHOLE_STAGE_CODEGEN` appear as keys; nodes outside any codegen cluster
 * are simply absent from the map (callers use `.get(...).flatten` to get None).
 */
private def getNodeIdAndWSCGIdMap(allNodes: Seq[SparkPlanGraphNode]): Map[Long, Option[Long]] = {
  allNodes
    .filter(_.name.trim.startsWith(WHOLE_STAGE_CODEGEN))
    .flatMap {
      // Only clusters carry child nodes; any other node type contributes nothing.
      case cluster: SparkPlanGraphCluster =>
        cluster.nodes.map(_.id -> getWholeStageCodegenId(cluster.name.trim))
      case _ => Seq.empty
    }
    .toMap
}
|
||
/**
 * Extracts the codegen id from a WholeStageCodegen node name, e.g.
 * "WholeStageCodegen (3)" -> Some(3).
 *
 * Returns None when the name does not carry a parseable id (malformed or
 * truncated names make `substring`/`toLong` throw, which `Try` absorbs).
 */
private def getWholeStageCodegenId(wscgNodeName: String): Option[Long] = {
  // Try(...).toOption maps any failure to None; clearer than matching
  // on Success/Failure with an unused exception binding.
  Try(
    wscgNodeName.substring(s"$WHOLE_STAGE_CODEGEN (".length, wscgNodeName.length - 1).toLong
  ).toOption
}
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@erenavsarogullari one last comment: why do we have an extra option "planDescription" here? It seems reasonable for it to be covered by the option "details".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gengliangwang Thanks for the review.
Please find my comments as follows:
1- `planDescription` exposes the physical plan, which includes dataset column names. Column names can be considered customer-sensitive data, so with this option end-users can disable the plan description in light of their use cases while still accessing the metrics.

2- For complex queries, `planDescription` can be a big string and create network overhead. In such cases it can be disabled where it is not required but the metrics are (e.g. time-series monitoring: metrics need to be persisted and exposed, but the physical plan does not), if that makes sense.