Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions docs/content/concepts/system-tables.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,30 +250,30 @@ You can query the files of the table with specific snapshot.
SELECT * FROM my_table$files;

/*
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+
| partition | bucket | file_path | file_format | schema_id | level | record_count | file_size_in_bytes | min_key | max_key | null_value_counts | min_value_stats | max_value_stats | min_sequence_number | max_sequence_number | creation_time |
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+
| {3} | 0 | data-8f64af95-29cc-4342-adc... | orc | 0 | 0 | 1 | 593 | [c] | [c] | {cnt=0, val=0, word=0} | {cnt=3, val=33, word=c} | {cnt=3, val=33, word=c} | 1691551246234 | 1691551246637 |2023-02-24T16:06:21.166|
| {2} | 0 | data-8b369068-0d37-4011-aa5... | orc | 0 | 0 | 1 | 593 | [b] | [b] | {cnt=0, val=0, word=0} | {cnt=2, val=22, word=b} | {cnt=2, val=22, word=b} | 1691551246233 | 1691551246732 |2023-02-24T16:06:21.166|
| {2} | 0 | data-83aa7973-060b-40b6-8c8... | orc | 0 | 0 | 1 | 605 | [d] | [d] | {cnt=0, val=0, word=0} | {cnt=2, val=32, word=d} | {cnt=2, val=32, word=d} | 1691551246267 | 1691551246798 |2023-02-24T16:06:21.166|
| {5} | 0 | data-3d304f4a-bcea-44dc-a13... | orc | 0 | 0 | 1 | 593 | [c] | [c] | {cnt=0, val=0, word=0} | {cnt=5, val=51, word=c} | {cnt=5, val=51, word=c} | 1691551246788 | 1691551246152 |2023-02-24T16:06:21.166|
| {1} | 0 | data-10abb5bc-0170-43ae-b6a... | orc | 0 | 0 | 1 | 595 | [a] | [a] | {cnt=0, val=0, word=0} | {cnt=1, val=11, word=a} | {cnt=1, val=11, word=a} | 1691551246722 | 1691551246273 |2023-02-24T16:06:21.166|
| {4} | 0 | data-2c9b7095-65b7-4013-a7a... | orc | 0 | 0 | 1 | 593 | [a] | [a] | {cnt=0, val=0, word=0} | {cnt=4, val=12, word=a} | {cnt=4, val=12, word=a} | 1691551246321 | 1691551246109 |2023-02-24T16:06:21.166|
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+--------------------------------+--------------------+
| partition | bucket | file_path | file_format | schema_id | level | record_count | file_size_in_bytes | min_key | max_key | null_value_counts | min_value_stats | max_value_stats | min_sequence_number | max_sequence_number | creation_time | file_name | clustering_columns |
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+--------------------------------+--------------------+
| {3} | 0 | data-8f64af95-29cc-4342-adc... | orc | 0 | 0 | 1 | 593 | [c] | [c] | {cnt=0, val=0, word=0} | {cnt=3, val=33, word=c} | {cnt=3, val=33, word=c} | 1691551246234 | 1691551246637 |2023-02-24T16:06:21.166| data-8f64af95-29cc-4342-adc... | NULL |
| {2} | 0 | data-8b369068-0d37-4011-aa5... | orc | 0 | 0 | 1 | 593 | [b] | [b] | {cnt=0, val=0, word=0} | {cnt=2, val=22, word=b} | {cnt=2, val=22, word=b} | 1691551246233 | 1691551246732 |2023-02-24T16:06:21.166| data-8b369068-0d37-4011-aa5... | NULL |
| {2} | 0 | data-83aa7973-060b-40b6-8c8... | orc | 0 | 0 | 1 | 605 | [d] | [d] | {cnt=0, val=0, word=0} | {cnt=2, val=32, word=d} | {cnt=2, val=32, word=d} | 1691551246267 | 1691551246798 |2023-02-24T16:06:21.166| data-83aa7973-060b-40b6-8c8... | NULL |
| {5} | 0 | data-3d304f4a-bcea-44dc-a13... | orc | 0 | 0 | 1 | 593 | [c] | [c] | {cnt=0, val=0, word=0} | {cnt=5, val=51, word=c} | {cnt=5, val=51, word=c} | 1691551246788 | 1691551246152 |2023-02-24T16:06:21.166| data-3d304f4a-bcea-44dc-a13... | NULL |
| {1} | 0 | data-10abb5bc-0170-43ae-b6a... | orc | 0 | 0 | 1 | 595 | [a] | [a] | {cnt=0, val=0, word=0} | {cnt=1, val=11, word=a} | {cnt=1, val=11, word=a} | 1691551246722 | 1691551246273 |2023-02-24T16:06:21.166| data-10abb5bc-0170-43ae-b6a... | NULL |
| {4} | 0 | data-2c9b7095-65b7-4013-a7a... | orc | 0 | 0 | 1 | 593 | [a] | [a] | {cnt=0, val=0, word=0} | {cnt=4, val=12, word=a} | {cnt=4, val=12, word=a} | 1691551246321 | 1691551246109 |2023-02-24T16:06:21.166| data-2c9b7095-65b7-4013-a7a... | NULL |
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+--------------------------------+--------------------+
6 rows in set
*/

-- You can also query the files of a specific snapshot
SELECT * FROM my_table$files /*+ OPTIONS('scan.snapshot-id'='1') */;

/*
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+
| partition | bucket | file_path | file_format | schema_id | level | record_count | file_size_in_bytes | min_key | max_key | null_value_counts | min_value_stats | max_value_stats | min_sequence_number | max_sequence_number | creation_time |
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+
| {3} | 0 | data-8f64af95-29cc-4342-adc... | orc | 0 | 0 | 1 | 593 | [c] | [c] | {cnt=0, val=0, word=0} | {cnt=3, val=33, word=c} | {cnt=3, val=33, word=c} | 1691551246234 | 1691551246637 |2023-02-24T16:06:21.166|
| {2} | 0 | data-8b369068-0d37-4011-aa5... | orc | 0 | 0 | 1 | 593 | [b] | [b] | {cnt=0, val=0, word=0} | {cnt=2, val=22, word=b} | {cnt=2, val=22, word=b} | 1691551246233 | 1691551246732 |2023-02-24T16:06:21.166|
| {1} | 0 | data-10abb5bc-0170-43ae-b6a... | orc | 0 | 0 | 1 | 595 | [a] | [a] | {cnt=0, val=0, word=0} | {cnt=1, val=11, word=a} | {cnt=1, val=11, word=a} | 1691551246267 | 1691551246798 |2023-02-24T16:06:21.166|
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+--------------------------------+--------------------+
| partition | bucket | file_path | file_format | schema_id | level | record_count | file_size_in_bytes | min_key | max_key | null_value_counts | min_value_stats | max_value_stats | min_sequence_number | max_sequence_number | creation_time | file_name | clustering_columns |
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+--------------------------------+--------------------+
| {3} | 0 | data-8f64af95-29cc-4342-adc... | orc | 0 | 0 | 1 | 593 | [c] | [c] | {cnt=0, val=0, word=0} | {cnt=3, val=33, word=c} | {cnt=3, val=33, word=c} | 1691551246234 | 1691551246637 |2023-02-24T16:06:21.166| data-8f64af95-29cc-4342-adc... | NULL |
| {2} | 0 | data-8b369068-0d37-4011-aa5... | orc | 0 | 0 | 1 | 593 | [b] | [b] | {cnt=0, val=0, word=0} | {cnt=2, val=22, word=b} | {cnt=2, val=22, word=b} | 1691551246233 | 1691551246732 |2023-02-24T16:06:21.166| data-8b369068-0d37-4011-aa5... | NULL |
| {1} | 0 | data-10abb5bc-0170-43ae-b6a... | orc | 0 | 0 | 1 | 595 | [a] | [a] | {cnt=0, val=0, word=0} | {cnt=1, val=11, word=a} | {cnt=1, val=11, word=a} | 1691551246267 | 1691551246798 |2023-02-24T16:06:21.166| data-10abb5bc-0170-43ae-b6a... | NULL |
+-----------+--------+--------------------------------+-------------+-----------+-------+--------------+--------------------+---------+---------+------------------------+-------------------------+-------------------------+---------------------+---------------------+-----------------------+--------------------------------+--------------------+
3 rows in set
*/
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package org.apache.paimon.table.system;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.casting.CastExecutor;
import org.apache.paimon.casting.CastExecutors;
import org.apache.paimon.data.BinaryRow;
Expand Down Expand Up @@ -120,7 +121,12 @@ public class FilesTable implements ReadonlyTable {
new DataField(16, "deleteRowCount", DataTypes.BIGINT()),
new DataField(17, "file_source", DataTypes.STRING()),
new DataField(18, "first_row_id", DataTypes.BIGINT()),
new DataField(19, "write_cols", DataTypes.ARRAY(DataTypes.STRING()))));
new DataField(19, "write_cols", DataTypes.ARRAY(DataTypes.STRING())),
new DataField(20, "file_name", SerializationUtils.newStringType(false)),
new DataField(
21,
"clustering_columns",
SerializationUtils.newStringType(true))));

private final FileStoreTable storeTable;

Expand Down Expand Up @@ -353,6 +359,24 @@ public RowDataToObjectArrayConverter apply(Long schemaId) {
});
}
};

// Resolves the clustering columns configured in the options of a given schema version.
// Results are memoized per schema id, so files sharing a schema reuse the same lookup.
Function<Long, List<String>> clusteringColumnsGetter =
        new Function<Long, List<String>>() {
            // schema id -> clustering columns read from that schema's options
            final Map<Long, List<String>> cache = new HashMap<>();

            /**
             * Returns the clustering columns declared by the schema with the given id.
             *
             * @param schemaId id of the schema to inspect
             * @return the value of {@code CoreOptions#clusteringColumns()} for that schema
             */
            @Override
            public List<String> apply(Long schemaId) {
                // computeIfAbsent does the lookup and the insert in one step instead of
                // the separate containsKey/get/put calls.
                return cache.computeIfAbsent(
                        schemaId,
                        id ->
                                new CoreOptions(schemaManager.schema(id).options())
                                        .clusteringColumns());
            }
        };

for (Split dataSplit : splits) {
iteratorList.add(
Iterators.transform(
Expand All @@ -362,6 +386,7 @@ public RowDataToObjectArrayConverter apply(Long schemaId) {
(DataSplit) dataSplit,
partitionCastExecutor,
keyConverters,
clusteringColumnsGetter,
file,
simpleStatsEvolutions)));
}
Expand All @@ -381,6 +406,7 @@ private LazyGenericRow toRow(
DataSplit dataSplit,
CastExecutor<InternalRow, BinaryString> partitionCastExecutor,
Function<Long, RowDataToObjectArrayConverter> keyConverters,
Function<Long, List<String>> clusteringColumnsGetter,
DataFileMeta file,
SimpleStatsEvolutions simpleStatsEvolutions) {
StatsLazyGetter statsGetter = new StatsLazyGetter(file, simpleStatsEvolutions);
Expand Down Expand Up @@ -441,6 +467,13 @@ private LazyGenericRow toRow(
return new GenericArray(
writeCols.stream().map(BinaryString::fromString).toArray());
},
() -> BinaryString.fromString(file.fileName()),
() -> {
List<String> cols = clusteringColumnsGetter.apply(file.schemaId());
return cols.isEmpty()
? null
: BinaryString.fromString(String.join(",", cols));
},
};

return new LazyGenericRow(fields);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,56 @@ public void testReadFilesFromNotExistSnapshot() {
.satisfies(anyCauseMatches(IllegalArgumentException.class));
}

/**
 * Creates a table with the "clustering.columns" option set, writes one row, and checks
 * that the files system table exposes the file name (field 20) and the comma-joined
 * clustering columns (field 21).
 */
@Test
public void testFileNameAndClusteringColumns() throws Exception {
    String tableName = "ClusterTable";
    FileIO schemaFileIO = LocalFileIO.create();
    Path tablePath = new Path(String.format("%s/%s.db/%s", warehouse, database, tableName));

    // Partitioned primary-key table with a single bucket and two clustering columns.
    Schema schema =
            Schema.newBuilder()
                    .column("pk", DataTypes.INT())
                    .column("pt", DataTypes.INT())
                    .column("col1", DataTypes.INT())
                    .column("col2", DataTypes.STRING())
                    .partitionKeys("pt")
                    .primaryKey("pk", "pt")
                    .option(CoreOptions.BUCKET.key(), "1")
                    .option(CoreOptions.SEQUENCE_FIELD.key(), "col1")
                    .option("clustering.columns", "col1,col2")
                    .build();
    TableSchema committedSchema =
            SchemaUtils.forceCommit(new SchemaManager(schemaFileIO, tablePath), schema);
    FileStoreTable table =
            FileStoreTableFactory.create(LocalFileIO.create(), tablePath, committedSchema);
    write(table, GenericRow.of(1, 1, 100, BinaryString.fromString("abc")));

    // Read the table back through its files system table.
    FilesTable systemTable =
            (FilesTable)
                    catalog.getTable(
                            identifier(tableName + SYSTEM_TABLE_SPLITTER + FilesTable.FILES));
    List<InternalRow> rows = read(systemTable);
    assertThat(rows).hasSize(1);

    InternalRow onlyRow = rows.get(0);

    // file_name must equal the last path segment of file_path (field 2).
    String path = onlyRow.getString(2).toString();
    String lastSegment = path.substring(path.lastIndexOf("/") + 1);
    assertThat(onlyRow.getString(20).toString()).isEqualTo(lastSegment);

    // clustering_columns is the configured column list joined with commas.
    assertThat(onlyRow.getString(21).toString()).isEqualTo("col1,col2");
}

/**
 * Reads the files system table of the default test table and checks that every row has a
 * non-empty file name (field 20) and a null clustering_columns value (field 21).
 */
@Test
public void testClusteringColumnsNull() throws Exception {
    List<InternalRow> rows = read(filesTable);
    assertThat(rows).isNotEmpty();
    rows.forEach(
            fileRow -> {
                assertThat(fileRow.getString(20).toString()).isNotEmpty();
                assertThat(fileRow.isNullAt(21)).isTrue();
            });
}

private List<InternalRow> getExpectedResult(long snapshotId) {
if (!snapshotManager.snapshotExists(snapshotId)) {
return Collections.emptyList();
Expand Down Expand Up @@ -279,6 +329,8 @@ private List<InternalRow> getExpectedResult(long snapshotId) {
BinaryString.fromString(
file.fileSource().map(Object::toString).orElse(null)),
file.firstRowId(),
null,
BinaryString.fromString(file.fileName()),
null));
}
return expectedRow;
Expand Down
Loading