[spark] Support ANALYZE table #2752

Merged · 3 commits · Jan 23, 2024

Changes from all commits:
@@ -22,7 +22,7 @@
import org.apache.paimon.fs.FileIO;
import org.apache.paimon.manifest.ManifestCommittable;
import org.apache.paimon.operation.metrics.CommitMetrics;
import org.apache.paimon.stats.Stats;
import org.apache.paimon.stats.Statistics;
import org.apache.paimon.table.sink.CommitMessage;
import org.apache.paimon.utils.FileStorePathFactory;

@@ -90,7 +90,7 @@ void overwrite(
* Commit new statistics. The {@link Snapshot.CommitKind} of generated snapshot is {@link
* Snapshot.CommitKind#ANALYZE}.
*/
void commitStatistics(Stats stats, long commitIdentifier);
void commitStatistics(Statistics stats, long commitIdentifier);

FileStorePathFactory pathFactory();

@@ -40,7 +40,7 @@
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.predicate.PredicateBuilder;
import org.apache.paimon.schema.SchemaManager;
import org.apache.paimon.stats.Stats;
import org.apache.paimon.stats.Statistics;
import org.apache.paimon.stats.StatsFileHandler;
import org.apache.paimon.table.sink.CommitMessage;
import org.apache.paimon.table.sink.CommitMessageImpl;
@@ -513,7 +513,7 @@ public FileStoreCommit withMetrics(CommitMetrics metrics) {
}

@Override
public void commitStatistics(Stats stats, long commitIdentifier) {
public void commitStatistics(Statistics stats, long commitIdentifier) {
String statsFileName = statsFileHandler.writeStats(stats);
tryCommit(
Collections.emptyList(),
@@ -786,7 +786,7 @@ public boolean tryCommitOnce(
if (newStatsFileName != null) {
statsFileName = newStatsFileName;
} else if (latestSnapshot != null) {
Optional<Stats> previousStatistic = statsFileHandler.readStats(latestSnapshot);
Optional<Statistics> previousStatistic = statsFileHandler.readStats(latestSnapshot);
if (previousStatistic.isPresent()) {
if (previousStatistic.get().schemaId() != latestSchemaId) {
LOG.warn("Schema changed, stats will not be inherited");
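
Taken together, the commit-side changes above mean an ANALYZE job builds a Statistics object (renamed from Stats) and hands it to FileStoreCommit#commitStatistics, which persists it through StatsFileHandler and records it in an ANALYZE snapshot. A minimal sketch of that flow, adapted from the test changes in this PR; the package of FileStoreCommit and the concrete snapshot/schema ids and counts are assumptions for illustration:

import org.apache.paimon.operation.FileStoreCommit;
import org.apache.paimon.stats.ColStats;
import org.apache.paimon.stats.Statistics;

import java.util.HashMap;
import java.util.Map;

class CommitStatisticsSketch {

    // Builds table-level and per-column statistics for a given snapshot/schema
    // and commits them; the commit identifier mirrors the test's Long.MAX_VALUE.
    static void commitAnalyzeResult(FileStoreCommit commit, long snapshotId, long schemaId) {
        Map<String, ColStats<?>> colStats = new HashMap<>();
        // Column id 3: 10 distinct values in [1, 10], no nulls, avg/max length 8.
        colStats.put("orderId", ColStats.newColStats(3, 10L, 1L, 10L, 0L, 8L, 8L));
        // Merged record count 10 and merged record size 1000 (illustrative numbers).
        Statistics stats = new Statistics(snapshotId, schemaId, 10L, 1000L, colStats);
        commit.commitStatistics(stats, Long.MAX_VALUE);
    }
}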
29 changes: 16 additions & 13 deletions paimon-core/src/main/java/org/apache/paimon/stats/ColStats.java
@@ -18,6 +18,7 @@

package org.apache.paimon.stats;

import org.apache.paimon.annotation.Experimental;
import org.apache.paimon.data.serializer.InternalSerializers;
import org.apache.paimon.data.serializer.Serializer;
import org.apache.paimon.types.DataType;
@@ -48,6 +49,7 @@
*
* @param <T> col internal data type
*/
@Experimental
public class ColStats<T> {

private static final String FIELD_COL_ID = "colId";
@@ -89,6 +91,7 @@ public class ColStats<T> {
@JsonProperty(FIELD_MAX_LEN)
private final @Nullable Long maxLen;

// This should only be used by jackson
@JsonCreator
public ColStats(
@JsonProperty(FIELD_COL_ID) int colId,
@@ -107,7 +110,7 @@ public ColStats(
this.maxLen = maxLen;
}

public ColStats(
private ColStats(
int colId,
@Nullable Long distinctCount,
@Nullable Comparable<T> min,
@@ -124,6 +127,17 @@ public ColStats(
this.maxLen = maxLen;
}

public static <T> ColStats<T> newColStats(
int colId,
@Nullable Long distinctCount,
@Nullable Comparable<T> min,
@Nullable Comparable<T> max,
@Nullable Long nullCount,
@Nullable Long avgLen,
@Nullable Long maxLen) {
return new ColStats<>(colId, distinctCount, min, max, nullCount, avgLen, maxLen);
}

public int colId() {
return colId;
}
@@ -189,9 +203,7 @@ public boolean equals(Object o) {
ColStats<?> colStats = (ColStats<?>) o;
return colId == colStats.colId
&& Objects.equals(distinctCount, colStats.distinctCount)
&& Objects.equals(serializedMin, colStats.serializedMin)
&& Objects.equals(min, colStats.min)
&& Objects.equals(serializedMax, colStats.serializedMax)
&& Objects.equals(max, colStats.max)
&& Objects.equals(nullCount, colStats.nullCount)
&& Objects.equals(avgLen, colStats.avgLen)
@@ -200,16 +212,7 @@

@Override
public int hashCode() {
return Objects.hash(
colId,
distinctCount,
serializedMin,
min,
serializedMax,
max,
nullCount,
avgLen,
maxLen);
return Objects.hash(colId, distinctCount, min, max, nullCount, avgLen, maxLen);
}

@Override
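
With the non-Jackson constructor made private, ColStats instances are now created through the static factory, while the @JsonCreator constructor stays public only for deserialization (per the new comment above it). A sketch of the factory call with the positional arguments labelled; the column id and values are illustrative, taken from the test in this PR:

import org.apache.paimon.stats.ColStats;

class ColStatsSketch {

    // The seven positional arguments of newColStats, spelled out for readability.
    static ColStats<Long> orderIdStats() {
        return ColStats.newColStats(
                3,    // colId
                10L,  // distinctCount
                1L,   // min
                10L,  // max
                0L,   // nullCount
                8L,   // avgLen
                8L);  // maxLen
    }
}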
@@ -18,6 +18,7 @@

package org.apache.paimon.stats;

import org.apache.paimon.annotation.Experimental;
import org.apache.paimon.fs.FileIO;
import org.apache.paimon.fs.Path;
import org.apache.paimon.schema.TableSchema;
@@ -46,7 +47,8 @@
* <li>colStats: column stats map
* </ul>
*/
public class Stats {
@Experimental
public class Statistics {

// ID of the snapshot this statistics collected from
private static final String FIELD_SNAPSHOT_ID = "snapshotId";
@@ -74,7 +76,7 @@ public class Stats {
private final Map<String, ColStats<?>> colStats;

@JsonCreator
public Stats(
public Statistics(
@JsonProperty(FIELD_SNAPSHOT_ID) long snapshotId,
@JsonProperty(FIELD_SCHEMA_ID) long schemaId,
@JsonProperty(FIELD_MERGED_RECORD_COUNT) @Nullable Long mergedRecordCount,
@@ -87,7 +89,8 @@ public Stats(
this.colStats = colStats;
}

public Stats(long snapshotId, long schemaId, Long mergedRecordCount, Long mergedRecordSize) {
public Statistics(
long snapshotId, long schemaId, Long mergedRecordCount, Long mergedRecordSize) {
this(snapshotId, schemaId, mergedRecordCount, mergedRecordSize, Collections.emptyMap());
}

@@ -161,14 +164,14 @@ public String toJson() {
return JsonSerdeUtil.toJson(this);
}

public static Stats fromJson(String json) {
return JsonSerdeUtil.fromJson(json, Stats.class);
public static Statistics fromJson(String json) {
return JsonSerdeUtil.fromJson(json, Statistics.class);
}

public static Stats fromPath(FileIO fileIO, Path path) {
public static Statistics fromPath(FileIO fileIO, Path path) {
try {
String json = fileIO.readFileUtf8(path);
return Stats.fromJson(json);
return Statistics.fromJson(json);
} catch (IOException e) {
throw new RuntimeException("Fails to read snapshot from path " + path, e);
}
@@ -182,7 +185,7 @@ public boolean equals(Object o) {
if (o == null || getClass() != o.getClass()) {
return false;
}
Stats stats = (Stats) o;
Statistics stats = (Statistics) o;
return snapshotId == stats.snapshotId
&& schemaId == stats.schemaId
&& Objects.equals(mergedRecordCount, stats.mergedRecordCount)
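
The renamed Statistics class keeps its JSON representation, so a trip through toJson/fromJson (or fromPath for a stats file on disk) yields an equivalent object. A brief sketch using the four-argument convenience constructor for stats without per-column detail; the snapshot and schema ids and counts are placeholders:

import org.apache.paimon.stats.Statistics;

class StatisticsJsonSketch {

    // Serializes table-level statistics to JSON and reads them back; with no
    // column stats attached, the round trip is a plain Jackson serde cycle.
    static Statistics roundTrip() {
        Statistics stats = new Statistics(1L, 0L, 10L, 1000L);
        String json = stats.toJson();
        return Statistics.fromJson(json);
    }
}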
@@ -41,16 +41,16 @@ public StatsFile(FileIO fileIO, PathFactory pathFactory) {
*
* @return stats
*/
public Stats read(String fileName) {
return Stats.fromPath(fileIO, pathFactory.toPath(fileName));
public Statistics read(String fileName) {
return Statistics.fromPath(fileIO, pathFactory.toPath(fileName));
}

/**
* Write stats to a stats file.
*
* @return the written file name
*/
public String write(Stats stats) {
public String write(Statistics stats) {
Path path = pathFactory.newPath();

try {
@@ -43,7 +43,7 @@ public StatsFileHandler(
*
* @return the written file name
*/
public String writeStats(Stats stats) {
public String writeStats(Statistics stats) {
stats.serializeFieldsToString(schemaManager.schema(stats.schemaId()));
return statsFile.write(stats);
}
@@ -53,7 +53,7 @@ public String writeStats(Stats stats) {
*
* @return stats
*/
public Optional<Stats> readStats() {
public Optional<Statistics> readStats() {
Long latestSnapshotId = snapshotManager.latestSnapshotId();
if (latestSnapshotId == null) {
throw new IllegalStateException("Unable to obtain the latest schema");
@@ -66,15 +66,15 @@
*
* @return stats
*/
public Optional<Stats> readStats(long snapshotId) {
public Optional<Statistics> readStats(long snapshotId) {
return readStats(snapshotManager.snapshot(snapshotId));
}

public Optional<Stats> readStats(Snapshot snapshot) {
public Optional<Statistics> readStats(Snapshot snapshot) {
if (snapshot.statistics() == null) {
return Optional.empty();
} else {
Stats stats = statsFile.read(snapshot.statistics());
Statistics stats = statsFile.read(snapshot.statistics());
stats.deserializeFieldsFromString(schemaManager.schema(stats.schemaId()));
return Optional.of(stats);
}
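
On the read side, StatsFileHandler resolves the stats file referenced by a snapshot and deserializes the column values against the matching schema. A small consumer sketch; how the handler instance is obtained (e.g. from the store) is an assumption here:

import org.apache.paimon.stats.Statistics;
import org.apache.paimon.stats.StatsFileHandler;

import java.util.Optional;

class ReadStatsSketch {

    // readStats() requires at least one snapshot to exist; it returns empty
    // when the latest snapshot carries no committed statistics.
    static void printLatestStats(StatsFileHandler handler) {
        Optional<Statistics> stats = handler.readStats();
        if (stats.isPresent()) {
            System.out.println(stats.get().toJson());
        } else {
            System.out.println("no statistics committed yet");
        }
    }
}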
@@ -34,6 +34,7 @@
import org.apache.paimon.schema.SchemaManager;
import org.apache.paimon.schema.SchemaValidation;
import org.apache.paimon.schema.TableSchema;
import org.apache.paimon.stats.Statistics;
import org.apache.paimon.table.sink.CallbackUtils;
import org.apache.paimon.table.sink.CommitCallback;
import org.apache.paimon.table.sink.DynamicBucketRowKeyExtractor;
@@ -97,6 +98,16 @@ public AbstractFileStoreTable(
this.catalogEnvironment = catalogEnvironment;
}

@Override
public Optional<Statistics> statistics() {
// todo: support time travel
Snapshot latestSnapshot = snapshotManager().latestSnapshot();
if (latestSnapshot != null) {
return store().newStatsFileHandler().readStats(latestSnapshot);
}
return Optional.empty();
}

@Override
public BucketMode bucketMode() {
return store().bucketMode();
@@ -18,6 +18,7 @@

package org.apache.paimon.table;

import org.apache.paimon.stats.Statistics;
import org.apache.paimon.table.sink.BatchWriteBuilder;
import org.apache.paimon.table.sink.InnerTableCommit;
import org.apache.paimon.table.sink.InnerTableWrite;
@@ -47,6 +48,11 @@ default Optional<String> comment() {
return Optional.empty();
}

@Override
default Optional<Statistics> statistics() {
return Optional.empty();
}

@Override
default BatchWriteBuilder newBatchWriteBuilder() {
throw new UnsupportedOperationException(
5 changes: 5 additions & 0 deletions paimon-core/src/main/java/org/apache/paimon/table/Table.java
@@ -20,6 +20,7 @@

import org.apache.paimon.annotation.Experimental;
import org.apache.paimon.annotation.Public;
import org.apache.paimon.stats.Statistics;
import org.apache.paimon.table.sink.BatchWriteBuilder;
import org.apache.paimon.table.sink.StreamWriteBuilder;
import org.apache.paimon.table.source.ReadBuilder;
@@ -58,6 +59,10 @@ public interface Table extends Serializable {
/** Optional comment of this table. */
Optional<String> comment();

/** Optional statistics of this table. */
@Experimental
Optional<Statistics> statistics();

// ================= Table Operations ====================

/** Copy this table with adding dynamic options. */
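
The new Table#statistics() contract above gives engines one uniform way to fetch the latest committed statistics: AbstractFileStoreTable serves them from the latest snapshot, while the read-only default added earlier returns empty. A hedged consumer sketch; where the table instance comes from is assumed:

import org.apache.paimon.stats.Statistics;
import org.apache.paimon.table.Table;

import java.util.Optional;

class TableStatsSketch {

    // Fetches the optional, experimental table statistics and reports them;
    // an empty result simply means no ANALYZE result has been committed yet.
    static void describe(Table table) {
        Optional<Statistics> statistics = table.statistics();
        if (statistics.isPresent()) {
            Statistics stats = statistics.get();
            System.out.println("schema " + stats.schemaId() + " stats: " + stats.toJson());
        } else {
            System.out.println("no statistics available yet");
        }
    }
}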
@@ -36,7 +36,7 @@
import org.apache.paimon.schema.SchemaManager;
import org.apache.paimon.schema.SchemaUtils;
import org.apache.paimon.stats.ColStats;
import org.apache.paimon.stats.Stats;
import org.apache.paimon.stats.Statistics;
import org.apache.paimon.stats.StatsFileHandler;
import org.apache.paimon.testutils.assertj.AssertionUtils;
import org.apache.paimon.types.DataField;
Expand Down Expand Up @@ -790,16 +790,16 @@ public void testWriteStats() throws Exception {

// Analyze and check
HashMap<String, ColStats<?>> fakeColStatsMap = new HashMap<>();
fakeColStatsMap.put("orderId", new ColStats<>(3, 10L, 1L, 10L, 0L, 8L, 8L));
Stats fakeStats =
new Stats(
fakeColStatsMap.put("orderId", ColStats.newColStats(3, 10L, 1L, 10L, 0L, 8L, 8L));
Statistics fakeStats =
new Statistics(
latestSnapshot.id(),
latestSnapshot.schemaId(),
10L,
1000L,
fakeColStatsMap);
fileStoreCommit.commitStatistics(fakeStats, Long.MAX_VALUE);
Optional<Stats> readStats = statsFileHandler.readStats();
Optional<Statistics> readStats = statsFileHandler.readStats();
assertThat(readStats).isPresent();
assertThat(readStats.get()).isEqualTo(fakeStats);

@@ -821,9 +821,9 @@ public void testWriteStats() throws Exception {
// Then we need to analyze again
latestSnapshot = store.snapshotManager().latestSnapshot();
fakeColStatsMap = new HashMap<>();
fakeColStatsMap.put("orderId", new ColStats<>(3, 30L, 1L, 30L, 0L, 8L, 8L));
fakeColStatsMap.put("orderId", ColStats.newColStats(3, 30L, 1L, 30L, 0L, 8L, 8L));
fakeStats =
new Stats(
new Statistics(
latestSnapshot.id(),
latestSnapshot.schemaId(),
30L,
@@ -836,7 +836,7 @@ public void testWriteStats() throws Exception {

// Analyze without col stats and check
latestSnapshot = store.snapshotManager().latestSnapshot();
fakeStats = new Stats(latestSnapshot.id(), latestSnapshot.schemaId(), 30L, 3000L);
fakeStats = new Statistics(latestSnapshot.id(), latestSnapshot.schemaId(), 30L, 3000L);
fileStoreCommit.commitStatistics(fakeStats, Long.MAX_VALUE);
readStats = statsFileHandler.readStats();
assertThat(readStats).isPresent();
@@ -55,7 +55,7 @@ public Path toPath(String fileName) {
StatsFile file = new StatsFile(LocalFileIO.create(), pathFactory);
HashMap<String, ColStats<?>> colStatsMap = new HashMap<>();
colStatsMap.put("orderId", new ColStats<>(0, 10L, "111", "222", 0L, 8L, 8L));
Stats stats = new Stats(1L, 0L, 10L, 1000L, colStatsMap);
Statistics stats = new Statistics(1L, 0L, 10L, 1000L, colStatsMap);
String fileName = file.write(stats);

assertThat(file.exists(fileName)).isTrue();
@@ -0,0 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.paimon.spark.sql

class AnalyzeTableTest extends AnalyzeTableTestBase {}
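
The new Spark test class above delegates its cases to AnalyzeTableTestBase, which is not shown in this diff. For orientation, the user-facing flow this PR targets presumably looks like the sketch below; the exact ANALYZE syntax and supported options are assumptions, not taken from the diff:

import org.apache.spark.sql.SparkSession;

class AnalyzeTableUsageSketch {

    // Issues an ANALYZE statement through Spark SQL; Paimon is expected to
    // collect the statistics and commit them as an ANALYZE snapshot, which
    // Table#statistics() can then serve back to the optimizer.
    static void analyze(SparkSession spark, String tableName) {
        spark.sql("ANALYZE TABLE " + tableName + " COMPUTE STATISTICS");
    }
}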