diff --git a/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out index b1f13fa76b50..851cb106ac35 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out @@ -244,14 +244,14 @@ Stage-0 Stage-1 Reducer 2 vectorized File Output Operator [FS_8] - Select Operator [SEL_7] (rows=9 width=95) + Select Operator [SEL_7] (rows=9 width=192) Output:["_col0","_col1","_col2"] <-Map 1 [SIMPLE_EDGE] vectorized SHUFFLE [RS_6] - Select Operator [SEL_5] (rows=9 width=95) + Select Operator [SEL_5] (rows=9 width=192) Output:["_col0","_col1","_col2"] - TableScan [TS_0] (rows=9 width=95) - default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"] + TableScan [TS_0] (rows=9 width=192) + default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:NONE,Output:["a","b","c"] PREHOOK: query: drop table if exists tbl_ice_puffin PREHOOK: type: DROPTABLE @@ -339,17 +339,16 @@ POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@tbl_ice_puffin col_name a data_type int -min 1 -max 333 -num_nulls 0 -distinct_count 7 +min +max +num_nulls +distinct_count avg_col_len max_col_len num_trues num_falses -bit_vector HL +bit_vector comment -COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}} PREHOOK: query: drop table if exists tbl_ice PREHOOK: type: DROPTABLE POSTHOOK: query: drop table if exists tbl_ice diff --git a/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out b/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out index dfab2edec758..07e2a34e423f 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out @@ -225,7 +225,7 @@ Retention: 0 #### A masked pattern was here #### Table Type: EXTERNAL_TABLE Table Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"id\":\"true\",\"value\":\"true\"}} + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} EXTERNAL TRUE bucketing_version 2 current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"value\",\"required\":false,\"type\":\"string\"}]} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java index a228cca50458..8e2ca07b3845 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java @@ -20,10 +20,13 @@ import java.io.Serializable; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.Path; @@ -34,6 +37,7 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.ObjectDictionary; import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.metastore.api.SQLCheckConstraint; @@ -921,14 +925,23 @@ public Table toTable(HiveConf conf) throws HiveException { // When replicating the statistics for a table will be obtained from the source. Do not // reset it on replica. if (replicationSpec == null || !replicationSpec.isInReplicationScope()) { - if (!this.isCTAS && (tbl.getPath() == null || (!isExternal() && tbl.isEmpty()))) { - if (!tbl.isPartitioned() && conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { - StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), - MetaStoreUtils.getColumnNames(tbl.getCols()), StatsSetupConst.TRUE); - } - } else { - StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), null, - StatsSetupConst.FALSE); + // Remove COLUMN_STATS_ACCURATE=true from table's parameter, let the HMS determine if + // there is need to add column stats dependent on the table's location. + StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), null, + StatsSetupConst.FALSE); + if (!this.isCTAS && !tbl.isPartitioned() && !tbl.isTemporary() && + conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { + // Put the flag into the dictionary in order not to pollute the table, + // ObjectDictionary is meant to convey repeatitive messages. + ObjectDictionary dictionary = tbl.getTTable().isSetDictionary() ? + tbl.getTTable().getDictionary() : new ObjectDictionary(); + List buffers = new ArrayList<>(); + String statsSetup = StatsSetupConst.ColumnStatsSetup.getStatsSetupAsString(true, + storageHandler != null && storageHandler.isMetadataTableSupported() ? "metadata" : null, // Skip metadata directory for Iceberg table + MetaStoreUtils.getColumnNames(tbl.getCols())); + buffers.add(ByteBuffer.wrap(statsSetup.getBytes(StandardCharsets.UTF_8))); + dictionary.putToValues(StatsSetupConst.STATS_FOR_CREATE_TABLE, buffers); + tbl.getTTable().setDictionary(dictionary); } } diff --git a/ql/src/test/queries/clientpositive/stats_external_location.q b/ql/src/test/queries/clientpositive/stats_external_location.q new file mode 100644 index 000000000000..87985a68d785 --- /dev/null +++ b/ql/src/test/queries/clientpositive/stats_external_location.q @@ -0,0 +1,9 @@ +set hive.stats.column.autogather=true; +set hive.stats.autogather=true; +dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/test1; + +create external table test_custom(age int, name string) stored as orc location '/tmp/test1'; +insert into test_custom select 1, 'test'; +desc formatted test_custom age; + +drop table test_custom; diff --git a/ql/src/test/results/clientpositive/llap/default_file_format.q.out b/ql/src/test/results/clientpositive/llap/default_file_format.q.out index 0adf5ae7415a..df2621d79125 100644 --- a/ql/src/test/results/clientpositive/llap/default_file_format.q.out +++ b/ql/src/test/results/clientpositive/llap/default_file_format.q.out @@ -170,8 +170,13 @@ Retention: 0 #### A masked pattern was here #### Table Type: EXTERNAL_TABLE Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c\":\"true\"}} EXTERNAL TRUE bucketing_version 2 + numFiles 0 + numRows 0 + rawDataSize 0 + totalSize 0 #### A masked pattern was here #### # Storage Information @@ -234,9 +239,12 @@ Retention: 0 #### A masked pattern was here #### Table Type: EXTERNAL_TABLE Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c\":\"true\"}} EXTERNAL TRUE bucketing_version 2 numFiles 0 + numRows 0 + rawDataSize 0 totalSize 0 #### A masked pattern was here #### @@ -470,9 +478,12 @@ Retention: 0 #### A masked pattern was here #### Table Type: EXTERNAL_TABLE Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c\":\"true\"}} EXTERNAL TRUE bucketing_version 2 numFiles 0 + numRows 0 + rawDataSize 0 totalSize 0 #### A masked pattern was here #### @@ -536,9 +547,12 @@ Retention: 0 #### A masked pattern was here #### Table Type: EXTERNAL_TABLE Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c\":\"true\"}} EXTERNAL TRUE bucketing_version 2 numFiles 0 + numRows 0 + rawDataSize 0 totalSize 0 #### A masked pattern was here #### diff --git a/ql/src/test/results/clientpositive/llap/mm_exim.q.out b/ql/src/test/results/clientpositive/llap/mm_exim.q.out index c23d711534ea..37ff35659c4f 100644 --- a/ql/src/test/results/clientpositive/llap/mm_exim.q.out +++ b/ql/src/test/results/clientpositive/llap/mm_exim.q.out @@ -312,8 +312,8 @@ Table Type: MANAGED_TABLE Table Parameters: bucketing_version 2 numFiles 3 - numRows 0 - rawDataSize 0 + numRows 6 + rawDataSize 37 totalSize 43 transactional true transactional_properties insert_only diff --git a/ql/src/test/results/clientpositive/llap/stats_external_location.q.out b/ql/src/test/results/clientpositive/llap/stats_external_location.q.out new file mode 100644 index 000000000000..f7fc782fbf34 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/stats_external_location.q.out @@ -0,0 +1,47 @@ +#### A masked pattern was here #### +PREHOOK: type: CREATETABLE +#### A masked pattern was here #### +PREHOOK: Output: database:default +PREHOOK: Output: default@test_custom +#### A masked pattern was here #### +POSTHOOK: type: CREATETABLE +#### A masked pattern was here #### +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_custom +PREHOOK: query: insert into test_custom select 1, 'test' +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_custom +POSTHOOK: query: insert into test_custom select 1, 'test' +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_custom +POSTHOOK: Lineage: test_custom.age SIMPLE [] +POSTHOOK: Lineage: test_custom.name SIMPLE [] +PREHOOK: query: desc formatted test_custom age +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@test_custom +POSTHOOK: query: desc formatted test_custom age +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@test_custom +col_name age +data_type int +min 1 +max 1 +num_nulls 0 +distinct_count 1 +avg_col_len +max_col_len +num_trues +num_falses +bit_vector HL +comment from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"age\":\"true\",\"name\":\"true\"}} +PREHOOK: query: drop table test_custom +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_custom +PREHOOK: Output: default@test_custom +POSTHOOK: query: drop table test_custom +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_custom +POSTHOOK: Output: default@test_custom diff --git a/ql/src/test/results/clientpositive/llap/translated_external_rename3.q.out b/ql/src/test/results/clientpositive/llap/translated_external_rename3.q.out index c7c920d156b3..ff321fa86f0a 100644 --- a/ql/src/test/results/clientpositive/llap/translated_external_rename3.q.out +++ b/ql/src/test/results/clientpositive/llap/translated_external_rename3.q.out @@ -98,14 +98,11 @@ Retention: 0 #### A masked pattern was here #### Table Type: EXTERNAL_TABLE Table Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\"}} EXTERNAL TRUE TRANSLATED_TO_EXTERNAL TRUE bucketing_version 2 external.table.purge TRUE numFiles 2 - numRows 1 - rawDataSize 1 totalSize 4 #### A masked pattern was here #### diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java index 0ee6bcfbfa2f..7ca76bf3741b 100644 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java +++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java @@ -167,6 +167,8 @@ public String getAggregator(Configuration conf) { public static final String CASCADE = "CASCADE"; + public static final String STATS_FOR_CREATE_TABLE = "setStatsStateForCreateTable"; + public static final String TRUE = "true"; public static final String FALSE = "false"; @@ -219,6 +221,55 @@ public Boolean deserialize(JsonParser jsonParser, } + /** + * Class for marking the column statistics when creating tables. + */ + public static class ColumnStatsSetup { + private static ObjectReader objectReader; + private static ObjectWriter objectWriter; + static { + ObjectMapper objectMapper = new ObjectMapper(); + objectReader = objectMapper.readerFor(ColumnStatsSetup.class); + objectWriter = objectMapper.writerFor(ColumnStatsSetup.class); + } + + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public boolean enabled; + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public String fileToEscape; + @JsonInclude(JsonInclude.Include.NON_EMPTY) + public List columnNames = new ArrayList<>(); + + public static ColumnStatsSetup parseStatsSetup(String statsSetup) { + if (statsSetup == null) { + return new ColumnStatsSetup(); + } + try { + return objectReader.readValue(statsSetup); + } catch (Exception e) { + return new ColumnStatsSetup(); + } + } + + /** + * Get json representation of the ColumnStatsSetup + */ + public static String getStatsSetupAsString(boolean enabled, + String fileToEscape, + List columns) { + try { + ColumnStatsSetup statsSetup = new ColumnStatsSetup(); + statsSetup.enabled = enabled; + statsSetup.columnNames = new ArrayList<>(columns); + statsSetup.fileToEscape = fileToEscape; + return objectWriter.writeValueAsString(statsSetup); + } catch (Exception e) { + // this should not happen + throw new RuntimeException(e); + } + } + } + public static boolean areBasicStatsUptoDate(Map params) { if (params == null) { return false; diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java index 10c9fb26d227..2952276020c7 100755 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java +++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java @@ -32,6 +32,7 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hive.common.TableName; import org.apache.hadoop.hive.metastore.api.Catalog; import org.apache.hadoop.hive.metastore.api.DatabaseType; @@ -500,16 +501,23 @@ public void recycleDirToCmPath(Path f, boolean ifPurge) throws MetaException { } public boolean isEmptyDir(Path path) throws IOException, MetaException { + return isEmptyDir(path, null); + } + + public boolean isEmptyDir(Path path, PathFilter pathFilter) + throws IOException, MetaException { try { - int listCount = getFs(path).listStatus(path).length; - if (listCount == 0) { - return true; + final int listCount; + if (pathFilter == null) { + listCount = getFs(path).listStatus(path).length; + } else { + listCount = getFs(path).listStatus(path, pathFilter).length; } + return listCount == 0; } catch (FileNotFoundException fnfe) { // File named by path doesn't exist; nothing to validate. return false; } - return false; } public boolean isWritable(Path path) throws IOException { diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HMSHandler.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HMSHandler.java index 15bb8d822455..b1e431ee6bec 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HMSHandler.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HMSHandler.java @@ -2361,10 +2361,8 @@ private void create_table_core(final RawStore ms, final CreateTableRequest req) madeDir = true; } } - if (MetastoreConf.getBoolVar(conf, ConfVars.STATS_AUTO_GATHER) && - !MetaStoreUtils.isView(tbl)) { - MetaStoreServerUtils.updateTableStatsSlow(db, tbl, wh, madeDir, false, envContext); - } + + MetaStoreServerUtils.updateTableStatsForCreateTable(wh, db, tbl, envContext, conf, tblPath, madeDir); // set create time long time = System.currentTimeMillis() / 1000; diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java index 6a15b089cd60..9ef97f0c5782 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java @@ -27,6 +27,7 @@ import java.net.ServerSocket; import java.net.Socket; import java.net.UnknownHostException; +import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; @@ -34,7 +35,7 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.Iterator; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -65,9 +66,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.common.TableName; import org.apache.hadoop.hive.metastore.ColumnType; +import org.apache.hadoop.hive.metastore.ExceptionHandler; import org.apache.hadoop.hive.metastore.HiveMetaStore; import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.Warehouse; @@ -510,6 +513,52 @@ public static void clearQuickStats(Map params) { params.remove(StatsSetupConst.NUM_ERASURE_CODED_FILES); } + public static void updateTableStatsForCreateTable(Warehouse wh, Database db, Table tbl, + EnvironmentContext envContext, Configuration conf, Path tblPath, boolean newDir) + throws MetaException { + // If the created table is a view, skip generating the stats + if (MetaStoreUtils.isView(tbl)) { + return; + } + assert tblPath != null; + if (tbl.isSetDictionary() && tbl.getDictionary().getValues() != null) { + List values = tbl.getDictionary().getValues(). + remove(StatsSetupConst.STATS_FOR_CREATE_TABLE); + ByteBuffer buffer; + if (values != null && values.size() > 0 && (buffer = values.get(0)).hasArray()) { + String val = new String(buffer.array(), StandardCharsets.UTF_8); + StatsSetupConst.ColumnStatsSetup statsSetup = StatsSetupConst.ColumnStatsSetup.parseStatsSetup(val); + if (statsSetup.enabled) { + try { + PathFilter pathFilter = FileUtils.HIDDEN_FILES_PATH_FILTER; + if (StringUtils.isNotEmpty(statsSetup.fileToEscape)) { + final Set filesToEscape = new HashSet<>(); + for (String fileName : statsSetup.fileToEscape.split(",")) { + filesToEscape.add(fileName.trim()); + } + pathFilter = p -> !filesToEscape.contains(p.getName()); + } + // Set the column stats true in order to make it merge-able + if (newDir || wh.isEmptyDir(tblPath, pathFilter)) { + List columns = statsSetup.columnNames; + if (columns == null || columns.isEmpty()) { + columns = getColumnNames(tbl.getSd().getCols()); + } + StatsSetupConst.setStatsStateForCreateTable(tbl.getParameters(), columns, StatsSetupConst.TRUE); + } + } catch (IOException e) { + LOG.error("Error while checking the table directory: " + tblPath + " is empty or not", e); + throw ExceptionHandler.newMetaException(e); + } + } + } + } + + if (MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATS_AUTO_GATHER)) { + updateTableStatsSlow(db, tbl, wh, newDir, false, envContext); + } + } + /** * Compare the names, types and comments of two lists of {@link FieldSchema}. *