Skip to content

Commit

Permalink
HIVE-27455: Iceberg: Set COLUMN_STATS_ACCURATE after writing stats fo…
Browse files Browse the repository at this point in the history
…r Iceberg tables
  • Loading branch information
SourabhBadhya committed Jul 10, 2023
1 parent 0865ca7 commit c6c82d7
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -428,10 +428,11 @@ public boolean setColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTabl
PuffinCompressionCodec.NONE,
ImmutableMap.of()));
writer.finish();
return true;
} catch (IOException e) {
LOG.error(String.valueOf(e));
return false;
}
return false;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Types.NestedField;
import org.apache.thrift.TException;
import org.junit.Assert;
import org.junit.Assume;
Expand Down Expand Up @@ -207,6 +208,27 @@ public void testStatsRemoved() throws IOException {
checkColStat(identifier.name(), "customer_id", false);
}

@Test
public void testColumnStatsAccurate() throws Exception {
TableIdentifier identifier = TableIdentifier.of("default", "customers");

shell.setHiveSessionValue(HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname, true);
testTables.createTable(shell, identifier.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
PartitionSpec.unpartitioned(), fileFormat, ImmutableList.of());

String insert = testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, identifier, true);
shell.executeStatement(insert);

org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable("default", identifier.name());

// Assert whether basic stats and column stats are accurate.
Assert.assertTrue(hmsTable.getParameters().containsKey(StatsSetupConst.COLUMN_STATS_ACCURATE));
Assert.assertTrue(StatsSetupConst.areBasicStatsUptoDate(hmsTable.getParameters()));
for (NestedField nestedField : HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns()) {
Assert.assertTrue(StatsSetupConst.areColumnStatsUptoDate(hmsTable.getParameters(), nestedField.name()));
}
}

private void checkColStat(String tableName, String colName, boolean accurate) {
List<Object[]> rows = shell.executeStatement("DESCRIBE " + tableName + " " + colName);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"id\":\"true\",\"value\":\"true\"}}
EXTERNAL TRUE
bucketing_version 2
current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"value\",\"required\":false,\"type\":\"string\"}]}
Expand Down
3 changes: 3 additions & 0 deletions ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ public int execute() {
for (IStatsProcessor task : processors) {
task.setDpPartSpecs(dpPartSpecs);
ret = task.process(db, tbl);
if (task instanceof BasicStatsTask) {
tbl = ((BasicStatsTask) task).getTable();
}
if (ret != 0) {
return ret;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
package org.apache.hadoop.hive.ql.plan;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;

import org.apache.hadoop.hive.ql.plan.Explain.Level;

Expand All @@ -34,14 +37,14 @@ public class ColumnStatsDesc implements Serializable, Cloneable {
private int numBitVector;
private boolean needMerge;
private String tableName;
private List<String> colName;
private List<String> colName = new ArrayList<>();
private List<String> colType;


public ColumnStatsDesc(String tableName, List<String> colName,
List<String> colType, boolean isTblLevel, int numBitVector, FetchWork fWork1) {
this.tableName = tableName;
this.colName = colName;
colName.forEach(col -> this.colName.add(col.toLowerCase(Locale.ROOT)));
this.colType = colType;
this.isTblLevel = isTblLevel;
this.numBitVector = numBitVector;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ public String getName() {
return "STATS";
}

public Table getTable() { return table; }

private static class BasicStatsProcessor {

private Partish partish;
Expand Down Expand Up @@ -314,6 +316,11 @@ private int aggregateStats(Hive db) {
}
LOG.info("Table " + tableFullName + " stats: [" + toString(p.getPartParameters()) + ']');

// The table object is assigned to the latest table object.
// So that it can be used by ColStatsProcessor.
// This is only required for unpartitioned tables.
table = res;

} else {
// Partitioned table:
// Need to get the old stats of the partition
Expand Down
27 changes: 25 additions & 2 deletions ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;

import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.TableName;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
Expand All @@ -34,11 +38,13 @@
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.FetchOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.lockmgr.HiveTxnManager;
import org.apache.hadoop.hive.ql.metadata.Hive;
Expand All @@ -60,7 +66,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class ColStatsProcessor implements IStatsProcessor {
private static transient final Logger LOG = LoggerFactory.getLogger(ColStatsProcessor.class);

Expand Down Expand Up @@ -219,8 +224,12 @@ public int persistColumnStats(Hive db, Table tbl) throws HiveException, MetaExce

start = System. currentTimeMillis();
if (tbl != null && tbl.isNonNative() && tbl.getStorageHandler().canSetColStatistics(tbl)) {
tbl.getStorageHandler().setColStatistics(tbl, colStats);
boolean success = tbl.getStorageHandler().setColStatistics(tbl, colStats);
if (!(tbl.isMaterializedView() || tbl.isView() || tbl.isTemporary())) {
setOrRemoveColumnStatsAccurateProperty(db, tbl, colStatDesc.getColName(), success);
}
}
// TODO: Write stats for native tables only (See HIVE-27421)
db.setPartitionColumnStatistics(request);
end = System.currentTimeMillis();
LOG.info("Time taken to update " + colStats.size() + " stats : " + ((end - start)/1000F) + " seconds.");
Expand All @@ -232,6 +241,20 @@ public int persistColumnStats(Hive db, Table tbl) throws HiveException, MetaExce
public void setDpPartSpecs(Collection<Partition> dpPartSpecs) {
}

private void setOrRemoveColumnStatsAccurateProperty(Hive db, Table tbl, List<String> colNames, boolean success) throws HiveException {
if (CollectionUtils.isEmpty(colNames) || !colStatDesc.isTblLevel()) {
return;
}
EnvironmentContext environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);
if (success) {
StatsSetupConst.setColumnStatsState(tbl.getParameters(), colNames);
} else {
StatsSetupConst.removeColumnStatsState(tbl.getParameters(), colNames);
}
db.alterTable(tbl.getFullyQualifiedName(), tbl, environmentContext, AcidUtils.isTransactionalTable(tbl));
}

/**
* Enumeration of column stats fields that can currently
* be computed. Each one has a field name associated.
Expand Down

0 comments on commit c6c82d7

Please sign in to comment.