diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala index 3949b67be99..d9a8a974073 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala @@ -109,6 +109,7 @@ import org.apache.texera.amber.operator.visualization.continuousErrorBands.Conti import org.apache.texera.amber.operator.visualization.contourPlot.ContourPlotOpDesc import org.apache.texera.amber.operator.visualization.dendrogram.DendrogramOpDesc import org.apache.texera.amber.operator.visualization.dumbbellPlot.DumbbellPlotOpDesc +import org.apache.texera.amber.operator.visualization.ecdfPlot.ECDFPlotOpDesc import org.apache.texera.amber.operator.visualization.figureFactoryTable.FigureFactoryTableOpDesc import org.apache.texera.amber.operator.visualization.filledAreaPlot.FilledAreaPlotOpDesc import org.apache.texera.amber.operator.visualization.funnelPlot.FunnelPlotOpDesc @@ -185,6 +186,7 @@ trait StateTransferFunc new Type(value = classOf[CandlestickChartOpDesc], name = "CandlestickChart"), new Type(value = classOf[SplitOpDesc], name = "Split"), new Type(value = classOf[ContourPlotOpDesc], name = "ContourPlot"), + new Type(value = classOf[ECDFPlotOpDesc], name = "ECDFPlot"), new Type(value = classOf[RegexOpDesc], name = "Regex"), new Type(value = classOf[SpecializedFilterOpDesc], name = "Filter"), new Type(value = classOf[ProjectionOpDesc], name = "Projection"), diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala new file mode 100644 index 00000000000..68625bdc4bf --- /dev/null +++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.visualization.ecdfPlot + +import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} +import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} +import org.apache.texera.amber.core.tuple.{AttributeType, Schema} +import org.apache.texera.amber.core.workflow.PortIdentity +import org.apache.texera.amber.operator.PythonOperatorDescriptor +import org.apache.texera.amber.operator.metadata.annotations.AutofillAttributeName +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString +import org.apache.texera.amber.pybuilder.PythonTemplateBuilder +import org.apache.texera.amber.pybuilder.PythonTemplateBuilder.PythonTemplateBuilderStringContext + +import javax.validation.constraints.NotNull + +/** Operator descriptor for an empirical cumulative distribution (ECDF) visualization. It exposes the value column and display options as annotated properties and generates Python source that cleans the input table, calls plotly.express.ecdf, and yields the rendered figure under the 'html-content' key. The injected schema lists integer, long and double as the attribute types for valueColumn. */ @JsonSchemaInject( + json = """{"attributeTypeRules":{"valueColumn":{"enum":["integer","long","double"]}}}""" +) +class ECDFPlotOpDesc extends 
PythonOperatorDescriptor { + + /* Configurable properties follow; valueColumn is the only one marked required, the others default to empty or to the first value of their schema enum. */ @JsonProperty(required = true) + @JsonSchemaTitle("Value Column") + @JsonPropertyDescription("Numeric column used to compute the empirical cumulative distribution.") + @AutofillAttributeName + @NotNull(message = "Value column cannot be empty") + var valueColumn: EncodableString = "" + + @JsonProperty(required = false) + @JsonSchemaTitle("Color Column") + @JsonPropertyDescription("Optional column for coloring ECDF lines by group.") + @AutofillAttributeName + var colorColumn: EncodableString = "" + + @JsonProperty(required = false) + @JsonSchemaTitle("Separate By Column") + @JsonPropertyDescription("Optional column for splitting ECDF plots into subplots.") + @AutofillAttributeName + var separateBy: EncodableString = "" + + @JsonProperty(required = false, defaultValue = "probability") + @JsonSchemaTitle("Y Axis Mode") + @JsonPropertyDescription("Display cumulative probability, raw count, or cumulative sum.") + @JsonSchemaInject( + json = """{ "enum": ["probability", "count", "sum"], "default": "probability" }""" + ) + var yAxisMode: String = "probability" + + @JsonProperty(required = false, defaultValue = "standard") + @JsonSchemaTitle("CDF Mode") + @JsonPropertyDescription( + "'standard' shows P(X ≤ x), 'reversed' shows P(X ≥ x), " + + "'complementary' shows 1 - P(X ≤ x)." 
+ ) + @JsonSchemaInject( + json = """{ "enum": ["standard", "reversed", "complementary"], "default": "standard" }""" + ) + var cdfMode: EncodableString = "standard" + + @JsonProperty(required = false, defaultValue = "vertical") + @JsonSchemaTitle("Orientation") + @JsonPropertyDescription("Plot ECDF vertically or horizontally.") + @JsonSchemaInject(json = """{ "enum": ["vertical", "horizontal"], "default": "vertical" }""") + var orientation: EncodableString = "vertical" + + @JsonProperty(required = false, defaultValue = "false") + @JsonSchemaTitle("Show Markers") + @JsonPropertyDescription("Display sample markers on the ECDF line.") + var showMarkers: Boolean = false + + @JsonProperty(required = false, defaultValue = "none") + @JsonSchemaTitle("Marginal Plot") + @JsonPropertyDescription("Optional marginal plot to display alongside the ECDF.") + @JsonSchemaInject( + json = """{ "enum": ["none", "histogram", "rug"], "default": "none" }""" + ) + var marginal: EncodableString = "none" + + /** Describes this operator as a visualization with its display name, a one-line description, and the statistical visualization group constant. */ override def operatorInfo: OperatorInfo = + OperatorInfo.forVisualization( + "Empirical Cumulative Distribution Plot", + "Visualize the empirical cumulative distribution of a numeric column.", + OperatorGroupConstants.VISUALIZATION_STATISTICAL_GROUP + ) + + /** Returns a single output schema, keyed by the id of the first output port, holding one STRING attribute named "html-content". The inputSchemas argument is not read. */ override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema().add("html-content", AttributeType.STRING) + Map(operatorInfo.outputPorts.head.id -> outputSchema) + } + + /** Builds the Python snippet that cleans the table before plotting: drops rows with missing values in the value column and in the color / separate-by columns when those are set, converts the value column with pd.to_numeric(errors='coerce'), then drops rows whose value is still missing. Fails an assertion when valueColumn is empty. */ def manipulateTable(): PythonTemplateBuilder = { + assert(valueColumn.nonEmpty) + val requiredCols = + List( + Some(pyb"$valueColumn"), + Option.when(colorColumn.nonEmpty)(pyb"$colorColumn"), + Option.when(separateBy.nonEmpty)(pyb"$separateBy") + ).flatten + val requiredColsExpr = requiredCols.mkString(", ") + + pyb""" + | required_cols = [$requiredColsExpr] + | table.dropna(subset=required_cols, inplace=True) + | table[$valueColumn] = pd.to_numeric(table[$valueColumn], errors='coerce') + | 
table.dropna(subset=[$valueColumn], inplace=True) + |""" + } + + /** Builds the Python snippet that creates `fig` through px.ecdf and then sets the layout margins. The argument list always starts with table and x=valueColumn; color, facet_col, ecdfnorm=None (for "count" and "sum"), y=valueColumn (for "sum" only), ecdfmode, orientation='h', markers=True and marginal are appended only when the matching property differs from its default. Fails an assertion when valueColumn is empty. NOTE(review): in horizontal mode the value column is still passed as x; confirm against the plotly.express.ecdf documentation that this gives the intended plot. */ def createPlotlyFigure(): PythonTemplateBuilder = { + assert(valueColumn.nonEmpty) + + val args = scala.collection.mutable.ArrayBuffer[PythonTemplateBuilder]( + pyb"table", + pyb"x=$valueColumn" + ) + if (colorColumn.nonEmpty) args += pyb"color=$colorColumn" + if (separateBy.nonEmpty) args += pyb"facet_col=$separateBy" + yAxisMode match { + case "count" => args += pyb"ecdfnorm=None" + case "sum" => args += pyb"ecdfnorm=None" + case _ => + } + if (yAxisMode == "sum") args += pyb"y=$valueColumn" + if (cdfMode != "standard") args += pyb"ecdfmode=$cdfMode" + if (orientation == "horizontal") args += pyb"orientation='h'" + if (showMarkers) args += pyb"markers=True" + if (marginal != "none") args += pyb"marginal=$marginal" + + val joinedArgs = args.mkString(", ") + pyb""" + | fig = px.ecdf($joinedArgs) + | fig.update_layout(margin=dict(l=0, r=0, t=30, b=0)) + |""" + } + + /** Assembles the complete Python operator source and returns the result of calling encode on the template. The generated ProcessTableOperator yields an error page when the table is empty on arrival or after the cleaning snippet has run; otherwise it builds the figure and yields its HTML from plotly.io.to_html under 'html-content'. */ override def generatePythonCode(): String = { + val finalCode = + pyb""" + |from pytexera import * + | + |import pandas as pd + |import plotly.express as px + |import plotly.io + | + |class ProcessTableOperator(UDFTableOperator): + | def render_error(self, error_msg): + | return '''
Reason is: {}
+ | '''.format(error_msg) + | + | @overrides + | def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: + | if table.empty: + | yield {'html-content': self.render_error("input table is empty.")} + | return + | ${manipulateTable()} + | if table.empty: + | yield {'html-content': self.render_error("no valid rows left after removing missing or non-numeric values.")} + | return + | ${createPlotlyFigure()} + | html = plotly.io.to_html(fig, include_plotlyjs='cdn', auto_play=False) + | yield {'html-content': html} + |""" + finalCode.encode + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala new file mode 100644 index 00000000000..bc565e4d3a3 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.visualization.ecdfPlot + +import org.scalatest.BeforeAndAfter +import org.scalatest.flatspec.AnyFlatSpec + +/** Unit tests for the Python snippets produced by ECDFPlotOpDesc; no Python is executed, only the generated text is inspected. */ class ECDFPlotOpDescSpec extends AnyFlatSpec with BeforeAndAfter { + + var opDesc: ECDFPlotOpDesc = _ + + /* Start every test from a freshly constructed descriptor with default property values. */ before { + opDesc = new ECDFPlotOpDesc() + } + + /* A new descriptor has an empty valueColumn, so building the cleaning snippet must fail its assertion. */ it should "throw assertion error if value column is empty" in { + assertThrows[AssertionError] { + opDesc.manipulateTable() + } + } + + /* Sets every optional property to a non-default value and checks that each corresponding px.ecdf argument shows up in the plain rendering; the column, ecdfmode and marginal values are expected as self.decode_python_template calls rather than as raw text. */ it should "generate a plotly ecdf figure with optional parameters" in { + opDesc.valueColumn = "score" + opDesc.colorColumn = "group" + opDesc.separateBy = "category" + opDesc.yAxisMode = "count" + opDesc.cdfMode = "reversed" + opDesc.orientation = "horizontal" + opDesc.showMarkers = true + opDesc.marginal = "histogram" + + val plain = opDesc.createPlotlyFigure().plain + + assert(plain.contains("fig = px.ecdf(table")) + assert(plain.contains("ecdfnorm=None")) + assert(plain.contains("ecdfmode=self.decode_python_template")) + assert(plain.contains("orientation='h'")) + assert(plain.contains("markers=True")) + assert(plain.contains("marginal=self.decode_python_template")) + assert(plain.contains("x=self.decode_python_template")) + assert(plain.contains("color=self.decode_python_template")) + assert(plain.contains("facet_col=self.decode_python_template")) + } +} diff --git a/frontend/src/assets/operator_images/ECDFPlot.png b/frontend/src/assets/operator_images/ECDFPlot.png new file mode 100644 index 00000000000..85c3bc5a956 Binary files /dev/null and b/frontend/src/assets/operator_images/ECDFPlot.png differ