diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index ad807386f360..991c97e250a0 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2229,6 +2229,8 @@ public static enum ConfVars {
     HIVE_ICEBERG_MASK_DEFAULT_LOCATION("hive.iceberg.mask.default.location", false,
         "If this is set to true the URI for auth will have the default location masked with DEFAULT_TABLE_LOCATION"),
+    HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY("hive.iceberg.allow.datafiles.in.table.location.only", false,
+        "If this is set to true, then all the data files being read should be within the table location"),
     HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true,
         "If this is set the header for RCFiles will simply be RCF. If this is not\n" +
@@ -5575,7 +5577,8 @@ public static enum ConfVars {
         "hive.zookeeper.ssl.keystore.type," +
         "hive.zookeeper.ssl.truststore.location," +
         "hive.zookeeper.ssl.truststore.password," +
-        "hive.zookeeper.ssl.truststore.type",
+        "hive.zookeeper.ssl.truststore.type," +
+        "hive.iceberg.allow.datafiles.in.table.location.only",
         "Comma separated list of configuration options which are immutable at runtime"),
     HIVE_CONF_HIDDEN_LIST("hive.conf.hidden.list",
         METASTOREPWD.varname + "," + HIVE_SERVER2_SSL_KEYSTORE_PASSWORD.varname
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index 5f4f97b9f72b..e3336437c778 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -278,6 +278,13 @@ public void configureInputJobProperties(TableDesc tableDesc, Map<String, String> map) {
     overlayTableProperties(conf, tableDesc, map);
     // Until the vectorized reader can handle delete files, let's fall back to non-vector mode for V2 tables
     fallbackToNonVectorizedModeBasedOnProperties(tableDesc.getProperties());
+
+    boolean allowDataFilesWithinTableLocationOnly =
+        conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+            HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
+
+    map.put(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+        String.valueOf(allowDataFilesWithinTableLocationOnly));
   }
 
   @Override
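The two hunks above wire the new flag end to end: HiveConf.java declares it and pins it to the runtime-immutable (restricted) list, while HiveIcebergStorageHandler copies the session value into the per-table job properties so it survives into split planning. Below is a minimal sketch of that hand-off using only the Hadoop Configuration API; the class name and the two conf objects are illustrative stand-ins, not part of the patch:

```java
import org.apache.hadoop.conf.Configuration;

// Hypothetical demo of the flag's journey from session conf to job conf.
public class FlagPropagationSketch {
  private static final String FLAG = "hive.iceberg.allow.datafiles.in.table.location.only";

  public static void main(String[] args) {
    // Session-level configuration, standing in for HiveConf.
    Configuration sessionConf = new Configuration(false);
    sessionConf.setBoolean(FLAG, true);

    // configureInputJobProperties() copies the resolved value into the
    // job properties map, which Hive later overlays onto the JobConf.
    Configuration jobConf = new Configuration(false);
    jobConf.set(FLAG, String.valueOf(sessionConf.getBoolean(FLAG, false)));

    // IcebergInputFormat.getSplits() reads it back with the same default.
    System.out.println(jobConf.getBoolean(FLAG, false)); // prints: true
  }
}
```

Passing the value through the job properties matters because getSplits() can run in a context where the original session HiveConf is no longer in scope.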
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
index dc50a1e34010..3ec1a3b3b7a5 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
@@ -33,8 +33,11 @@
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.llap.LlapHiveUtils;
 import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.metadata.AuthorizationException;
 import org.apache.hadoop.hive.ql.metadata.HiveUtils;
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.mapred.JobConf;
@@ -218,6 +221,12 @@ public List<InputSplit> getSplits(JobContext context) {
       scan = applyConfig(conf, createTableScan(table, conf));
     }
 
+    boolean allowDataFilesWithinTableLocationOnly =
+        conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+            HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
+    Path tableLocation = new Path(conf.get(InputFormatConfig.TABLE_LOCATION));
+
+
     try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
       tasksIterable.forEach(task -> {
         if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE ||
@@ -225,6 +234,9 @@ public List<InputSplit> getSplits(JobContext context) {
           // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
           checkResiduals(task);
         }
+        if (allowDataFilesWithinTableLocationOnly) {
+          validateFileLocations(task, tableLocation);
+        }
         splits.add(new IcebergSplit(conf, task));
       });
     } catch (IOException e) {
@@ -241,6 +253,14 @@ public List<InputSplit> getSplits(JobContext context) {
     return splits;
   }
 
+  private static void validateFileLocations(CombinedScanTask split, Path tableLocation) {
+    for (FileScanTask fileScanTask : split.files()) {
+      if (!FileUtils.isPathWithinSubtree(new Path(fileScanTask.file().path().toString()), tableLocation)) {
+        throw new AuthorizationException("The table contains paths which are outside the table location");
+      }
+    }
+  }
+
   private static void checkResiduals(CombinedScanTask task) {
     task.files().forEach(fileScanTask -> {
       Expression residual = fileScanTask.residual();
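validateFileLocations rejects a split as soon as any backing data file falls outside the table location, delegating the containment test to Hive's FileUtils.isPathWithinSubtree. The self-contained sketch below re-implements that semantics with a simple walk up the parent chain to show the intended behavior; it is an illustration, not Hive's actual implementation:

```java
import org.apache.hadoop.fs.Path;

// Illustrative re-implementation of the containment check: a data file
// passes only if its path equals the table location or is a descendant of it.
public class PathContainmentSketch {
  static boolean isWithin(Path candidate, Path tree) {
    for (Path p = candidate; p != null; p = p.getParent()) {
      if (p.equals(tree)) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    Path tableLocation = new Path("hdfs://nn/warehouse/db/tab1");
    // A file under the table directory is accepted.
    System.out.println(isWithin(new Path("hdfs://nn/warehouse/db/tab1/data/f1.parquet"), tableLocation)); // true
    // A file anywhere else aborts the read.
    System.out.println(isWithin(new Path("hdfs://nn/warehouse/db/other/f2.parquet"), tableLocation));     // false
  }
}
```

Throwing AuthorizationException here, rather than silently skipping the offending file, reads as a deliberate fail-closed choice: a single foreign path aborts the whole query.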
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java
new file mode 100644
index 000000000000..e9d6950ef460
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.mr.hive;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import org.apache.commons.collections4.ListUtils;
+import org.apache.iceberg.AssertHelpers;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY;
+
+public class TestHiveIcebergRestrictDataFiles extends HiveIcebergStorageHandlerWithEngineBase {
+
+  @BeforeClass
+  public static void beforeClass() {
+    shell = HiveIcebergStorageHandlerTestUtils.shell(
+        Collections.singletonMap(HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname, "true"));
+  }
+
+  @Test
+  public void testRestrictDataFiles() throws IOException, InterruptedException {
+    TableIdentifier table1 = TableIdentifier.of("default", "tab1");
+    testTables.createTableWithVersions(shell, table1.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
+        fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
+
+    AssertHelpers.assertThrows("Should throw exception since there are files outside the table directory",
+        IllegalArgumentException.class, "The table contains paths which are outside the table location",
+        () -> shell.executeStatement("SELECT * FROM " + table1.name()));
+
+    // Create another table with files within the table location
+    TableIdentifier table2 = TableIdentifier.of("default", "tab2");
+    testTables.createTableWithVersions(shell, table2.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
+        fileFormat, null, 0);
+
+    shell.executeStatement(
+        testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, table2, false));
+
+    List<Object[]> result = shell.executeStatement("SELECT * FROM " + table2.name());
+
+    HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
+        HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, result), 0);
+
+    // Insert some more records to generate a new data file
+    shell.executeStatement(
+        testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1, table2, false));
+
+    result = shell.executeStatement("SELECT * FROM " + table2.name());
+
+    HiveIcebergTestUtils.validateData(ListUtils.union(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
+        HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1),
+        HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, result), 0);
+  }
+}
diff --git a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
index aeec57757c21..52be546dca8d 100644
--- a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
+++ b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
@@ -109,6 +109,7 @@ public static void startServices() throws Exception {
     addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.location");
     addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.password");
     addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.type");
+    addToExpectedRestrictedMap("hive.iceberg.allow.datafiles.in.table.location.only");
     checkRestrictedListMatch();
   }
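Because the property sits on the restricted list, a session cannot relax the check once HiveServer2 is running; only a server-side configuration change can. A hypothetical JDBC exchange illustrating this (the connection URL is a placeholder, and the exact error text depends on the HiveServer2 version):

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

// Sketch of a client attempting to override the restricted property.
public class RestrictedFlagSketch {
  public static void main(String[] args) throws SQLException {
    try (Connection conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default");
         Statement stmt = conn.createStatement()) {
      // Expected to fail: the option is immutable at runtime.
      stmt.execute("SET hive.iceberg.allow.datafiles.in.table.location.only=false");
    } catch (SQLException e) {
      System.out.println("Rejected as expected: " + e.getMessage());
    }
  }
}
```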