apache · JingsongLi · Jan 23, 2024 · Jan 21, 2024 · Jan 22, 2024 · Jan 22, 2024
diff --git a/paimon-core/src/main/java/org/apache/paimon/stats/ColStats.java b/paimon-core/src/main/java/org/apache/paimon/stats/ColStats.java
@@ -89,6 +89,7 @@ public class ColStats<T> {
     @JsonProperty(FIELD_MAX_LEN)
     private final @Nullable Long maxLen;
 
+    // This constructor should only be used by Jackson.
     @JsonCreator
     public ColStats(
             @JsonProperty(FIELD_COL_ID) int colId,
@@ -107,7 +108,7 @@ public ColStats(
         this.maxLen = maxLen;
     }
 
-    public ColStats(
+    private ColStats(
             int colId,
             @Nullable Long distinctCount,
             @Nullable Comparable<T> min,
@@ -124,6 +125,17 @@ public ColStats(
         this.maxLen = maxLen;
     }
 
+    public static <T> ColStats<T> newColStats(
+            int colId,
+            @Nullable Long distinctCount,
+            @Nullable Comparable<T> min,
+            @Nullable Comparable<T> max,
+            @Nullable Long nullCount,
+            @Nullable Long avgLen,
+            @Nullable Long maxLen) {
+        return new ColStats<>(colId, distinctCount, min, max, nullCount, avgLen, maxLen);
+    }
+
     public int colId() {
         return colId;
     }
@@ -189,9 +201,7 @@ public boolean equals(Object o) {
         ColStats<?> colStats = (ColStats<?>) o;
         return colId == colStats.colId
                 && Objects.equals(distinctCount, colStats.distinctCount)
-                && Objects.equals(serializedMin, colStats.serializedMin)
                 && Objects.equals(min, colStats.min)
-                && Objects.equals(serializedMax, colStats.serializedMax)
                 && Objects.equals(max, colStats.max)
                 && Objects.equals(nullCount, colStats.nullCount)
                 && Objects.equals(avgLen, colStats.avgLen)
@@ -200,16 +210,7 @@ public boolean equals(Object o) {
 
     @Override
     public int hashCode() {
-        return Objects.hash(
-                colId,
-                distinctCount,
-                serializedMin,
-                min,
-                serializedMax,
-                max,
-                nullCount,
-                avgLen,
-                maxLen);
+        return Objects.hash(colId, distinctCount, min, max, nullCount, avgLen, maxLen);
     }
 
     @Override

diff --git a/paimon-core/src/main/java/org/apache/paimon/table/AbstractFileStoreTable.java b/paimon-core/src/main/java/org/apache/paimon/table/AbstractFileStoreTable.java
@@ -34,6 +34,7 @@
 import org.apache.paimon.schema.SchemaManager;
 import org.apache.paimon.schema.SchemaValidation;
 import org.apache.paimon.schema.TableSchema;
+import org.apache.paimon.stats.Stats;
 import org.apache.paimon.table.sink.CallbackUtils;
 import org.apache.paimon.table.sink.CommitCallback;
 import org.apache.paimon.table.sink.DynamicBucketRowKeyExtractor;
@@ -97,6 +98,18 @@ public AbstractFileStoreTable(
         this.catalogEnvironment = catalogEnvironment;
     }
 
+    /** Statistics read for the latest snapshot; empty for other startup modes or no snapshot. */
+    @Override
+    public Optional<Stats> statistics() {
+        // Only LATEST_FULL is served for now. TODO: support time travel
+        if (!coreOptions().startupMode().equals(CoreOptions.StartupMode.LATEST_FULL)) {
+            return Optional.empty();
+        }
+        // latestSnapshot() may return null, in which case there is nothing to read.
+        Snapshot latest = snapshotManager().latestSnapshot();
+        return latest == null ? Optional.empty() : store().newStatsFileHandler().readStats(latest);
+    }
+
     @Override
     public BucketMode bucketMode() {
         return store().bucketMode();

diff --git a/paimon-core/src/main/java/org/apache/paimon/table/ReadonlyTable.java b/paimon-core/src/main/java/org/apache/paimon/table/ReadonlyTable.java
@@ -18,6 +18,7 @@
 
 package org.apache.paimon.table;
 
+import org.apache.paimon.stats.Stats;
 import org.apache.paimon.table.sink.BatchWriteBuilder;
 import org.apache.paimon.table.sink.InnerTableCommit;
 import org.apache.paimon.table.sink.InnerTableWrite;
@@ -47,6 +48,11 @@ default Optional<String> comment() {
         return Optional.empty();
     }
 
+    @Override
+    default Optional<Stats> statistics() {
+        return Optional.empty();
+    }
+
     @Override
     default BatchWriteBuilder newBatchWriteBuilder() {
         throw new UnsupportedOperationException(

diff --git a/paimon-core/src/main/java/org/apache/paimon/table/Table.java b/paimon-core/src/main/java/org/apache/paimon/table/Table.java
@@ -20,6 +20,7 @@
 
 import org.apache.paimon.annotation.Experimental;
 import org.apache.paimon.annotation.Public;
+import org.apache.paimon.stats.Stats;
 import org.apache.paimon.table.sink.BatchWriteBuilder;
 import org.apache.paimon.table.sink.StreamWriteBuilder;
 import org.apache.paimon.table.source.ReadBuilder;
@@ -58,6 +59,9 @@ public interface Table extends Serializable {
     /** Optional comment of this table. */
     Optional<String> comment();
 
+    /** Optional statistics of this table. */
+    Optional<Stats> statistics();
+
     // ================= Table Operations ====================
 
     /** Copy this table with adding dynamic options. */

diff --git a/paimon-core/src/test/java/org/apache/paimon/operation/FileStoreCommitTest.java b/paimon-core/src/test/java/org/apache/paimon/operation/FileStoreCommitTest.java
@@ -790,7 +790,7 @@ public void testWriteStats() throws Exception {
 
         // Analyze and check
         HashMap<String, ColStats<?>> fakeColStatsMap = new HashMap<>();
-        fakeColStatsMap.put("orderId", new ColStats<>(3, 10L, 1L, 10L, 0L, 8L, 8L));
+        fakeColStatsMap.put("orderId", ColStats.newColStats(3, 10L, 1L, 10L, 0L, 8L, 8L));
         Stats fakeStats =
                 new Stats(
                         latestSnapshot.id(),
@@ -821,7 +821,7 @@ public void testWriteStats() throws Exception {
         // Then we need to analyze again
         latestSnapshot = store.snapshotManager().latestSnapshot();
         fakeColStatsMap = new HashMap<>();
-        fakeColStatsMap.put("orderId", new ColStats<>(3, 30L, 1L, 30L, 0L, 8L, 8L));
+        fakeColStatsMap.put("orderId", ColStats.newColStats(3, 30L, 1L, 30L, 0L, 8L, 8L));
         fakeStats =
                 new Stats(
                         latestSnapshot.id(),

diff --git a/...park-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAnalysis.scala b/...park-common/src/main/scala/org/apache/paimon/spark/catalyst/analysis/PaimonAnalysis.scala
@@ -18,15 +18,15 @@
 package org.apache.paimon.spark.catalyst.analysis
 
 import org.apache.paimon.spark.SparkTable
-import org.apache.paimon.spark.catalyst.plans.logical.{PaimonTableValuedFunctions, PaimonTableValueFunction}
-import org.apache.paimon.spark.commands.{PaimonDynamicPartitionOverwriteCommand, PaimonTruncateTableCommand}
+import org.apache.paimon.spark.catalyst.analysis.PaimonRelation.isPaimonTable
+import org.apache.paimon.spark.commands.{PaimonAnalyzeTableColumnCommand, PaimonDynamicPartitionOverwriteCommand, PaimonTruncateTableCommand}
 import org.apache.paimon.table.FileStoreTable
 
-import PaimonRelation.isPaimonTable
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.analysis.ResolvedPartitionSpec
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MergeIntoTable, OverwritePartitionsDynamic, TruncatePartition, TruncateTable}
+import org.apache.spark.sql.catalyst.analysis.ResolvedTable
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog}
 import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
 
 class PaimonAnalysis(session: SparkSession) extends Rule[LogicalPlan] {
@@ -49,6 +49,29 @@ case class PaimonPostHocResolutionRules(session: SparkSession) extends Rule[Logi
       case t @ TruncateTable(PaimonRelation(table)) if t.resolved =>
         PaimonTruncateTableCommand(table, Map.empty)
 
+      case a @ AnalyzeTable(
+            ResolvedTable(catalog: TableCatalog, identifier: Identifier, table: SparkTable, _),
+            partitionSpec,
+            noScan) if a.resolved =>
+        if (partitionSpec.nonEmpty) {
+          throw new UnsupportedOperationException("Analyze table partition is not supported")
+        } else if (noScan) {
+          throw new IllegalArgumentException("NOSCAN is ineffective with paimon")
+        } else {
+          PaimonAnalyzeTableColumnCommand(
+            catalog,
+            identifier,
+            table,
+            Option.apply(Seq()),
+            allColumns = false)
+        }
+
+      case a @ AnalyzeColumn(
+            ResolvedTable(catalog: TableCatalog, identifier: Identifier, table: SparkTable, _),
+            columnNames: Option[Seq[String]],
+            allColumns: Boolean) if a.resolved =>
+        PaimonAnalyzeTableColumnCommand(catalog, identifier, table, columnNames, allColumns)
+
       case _ => plan
     }
   }

diff --git a/...mon/src/main/scala/org/apache/paimon/spark/commands/PaimonAnalyzeTableColumnCommand.scala b/...mon/src/main/scala/org/apache/paimon/spark/commands/PaimonAnalyzeTableColumnCommand.scala
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.paimon.spark.commands
+
+import org.apache.paimon.schema.TableSchema
+import org.apache.paimon.spark.SparkTable
+import org.apache.paimon.spark.leafnode.PaimonLeafRunnableCommand
+import org.apache.paimon.stats.{ColStats, Stats}
+import org.apache.paimon.table.FileStoreTable
+import org.apache.paimon.table.sink.BatchWriteBuilder
+
+import org.apache.parquet.Preconditions
+import org.apache.spark.sql.{Row, SparkSession, Utils}
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.plans.logical.ColumnStat
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog}
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
+import org.apache.spark.sql.types.{DataType, Decimal, DecimalType, TimestampType}
+
+import java.util
+import java.util.UUID
+
+import scala.collection.JavaConverters._
+
+/** Command for analyze table and column. */
+case class PaimonAnalyzeTableColumnCommand(
+    catalog: TableCatalog,
+    identifier: Identifier,
+    v2Table: SparkTable,
+    columnNames: Option[Seq[String]],
+    allColumns: Boolean)
+  extends PaimonLeafRunnableCommand
+  with WithFileStoreTable {
+
+  override def table: FileStoreTable = v2Table.getTable.asInstanceOf[FileStoreTable]
+
+  /** Computes statistics with spark and commits them against the latest snapshot. */
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    val relation = DataSourceV2Relation.create(v2Table, Some(catalog), Some(identifier))
+    val attributes = getColumnsToAnalyze(relation, columnNames, allColumns)
+    val currentSnapshot = table.snapshotManager().latestSnapshot()
+    // Without a snapshot there is nothing to analyze; the columns were already validated above.
+    if (currentSnapshot == null) return Seq.empty[Row]
+
+    // compute stats
+    val totalSize = Utils.calculateTotalSize(
+      sparkSession.sessionState,
+      table.name(),
+      Some(table.location().toUri))
+    val (mergedRecordCount, colStats) = Utils.computeColumnStats(sparkSession, relation, attributes)
+    val totalRecordCount = currentSnapshot.totalRecordCount()
+    Preconditions.checkState(
+      totalRecordCount >= mergedRecordCount,
+      s"totalRecordCount: $totalRecordCount should be greater or equal than mergedRecordCount: $mergedRecordCount.")
+    // Scale the size before truncating; truncating the ratio first gives 0 for any ratio below 1.
+    val mergedRecordSize = (totalSize * (mergedRecordCount.toDouble / totalRecordCount)).toLong
+
+    // convert to paimon stats
+    val tableSchema = table.schema()
+    val colStatsMap: util.Map[String, ColStats[_]] = new util.HashMap
+    for ((attr, stats) <- colStats) {
+      colStatsMap.put(attr.name, toPaimonColStats(attr, stats, tableSchema))
+    }
+    val stats = new Stats(
+      currentSnapshot.id(),
+      currentSnapshot.schemaId(),
+      mergedRecordCount,
+      mergedRecordSize,
+      colStatsMap)
+
+    // commit stats
+    val commit = table.store.newCommit(UUID.randomUUID.toString)
+    commit.commitStatistics(stats, BatchWriteBuilder.COMMIT_IDENTIFIER)
+    Seq.empty[Row]
+  }
+
+  /**
+   * Resolves the attributes to analyze: the whole relation output when `allColumns` is set,
+   * otherwise the attributes matched to `columnNames` by `conf.resolver`. Throws when both
+   * selectors are given, a column is not found, or a column's data type cannot be analyzed.
+   */
+  private def getColumnsToAnalyze(
+      relation: DataSourceV2Relation,
+      columnNames: Option[Seq[String]],
+      allColumns: Boolean): Seq[Attribute] = {
+    if (allColumns && columnNames.isDefined) {
+      throw new UnsupportedOperationException(
+        "Parameter `columnNames` and `allColumns` are " +
+          "mutually exclusive. Only one of them should be specified.")
+    }
+    def resolve(col: String): Attribute =
+      relation.output
+        .find(attr => conf.resolver(attr.name, col))
+        .getOrElse(
+          throw new RuntimeException(s"Column $col not found in ${relation.table.name()}."))
+    val selected: Seq[Attribute] =
+      if (allColumns) relation.output else columnNames.get.map(resolve)
+    selected.find(attr => !Utils.analyzeSupportsType(attr.dataType)).foreach {
+      attr =>
+        throw new UnsupportedOperationException(
+          s"Analyzing on col: ${attr.name}, data type: ${attr.dataType} is not supported.")
+    }
+    selected
+  }
+
+  /**
+   * Converts the spark statistics of `attribute` into paimon [[ColStats]], taking the column id
+   * from the schema field with the same name. Statistics spark did not produce are passed as null.
+   */
+  private def toPaimonColStats(
+      attribute: Attribute,
+      columnStat: ColumnStat,
+      paimonSchema: TableSchema): ColStats[_] = {
+    def boxed(value: Option[Long]): java.lang.Long = value.map(Long.box).orNull
+    def bound(value: Option[Any]): Comparable[Any] =
+      value.map(v => toPaimonData(v, attribute.dataType).asInstanceOf[Comparable[Any]]).orNull
+    paimonSchema.fields.asScala.find(_.name() == attribute.name) match {
+      case Some(field) =>
+        ColStats.newColStats(
+          field.id(),
+          boxed(columnStat.distinctCount.map(_.longValue)),
+          bound(columnStat.min),
+          bound(columnStat.max),
+          boxed(columnStat.nullCount.map(_.longValue)),
+          boxed(columnStat.avgLen),
+          boxed(columnStat.maxLen))
+      case None =>
+        throw new RuntimeException(s"Column ${attribute.name} is not found in schema.")
+    }
+  }
+
+  /**
+   * Converts a spark value to paimon data; only types passing [[Utils.hasMinMax]] are supported.
+   */
+  private def toPaimonData(o: Any, dataType: DataType): Any = {
+    dataType match {
+      case d if !Utils.hasMinMax(d) =>
+        // Not expected to be reached: callers only pass min/max values (presumably hasMinMax types).
+        throw new UnsupportedOperationException(s"Unsupported data type $d, value is $o.")
+      case _: DecimalType =>
+        val d = o.asInstanceOf[Decimal]
+        org.apache.paimon.data.Decimal.fromBigDecimal(d.toJavaBigDecimal, d.precision, d.scale)
+      case _: TimestampType =>
+        val l = o.asInstanceOf[Long]
+        org.apache.paimon.data.Timestamp.fromSQLTimestamp(DateTimeUtils.toJavaTimestamp(l))
+      case _ => o
+    }
+  }
+}