Skip to content

Commit 66b51d6

Browse files
authored
HIVE-27926: Iceberg: Allow restricting Iceberg data file reads to table location. (#4910). (Ayush Saxena, reviewed by Denys Kuzmenko)
1 parent 276b378 commit 66b51d6

File tree

5 files changed

+106
-1
lines changed

5 files changed

+106
-1
lines changed

common/src/java/org/apache/hadoop/hive/conf/HiveConf.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2229,6 +2229,8 @@ public static enum ConfVars {
22292229

22302230
HIVE_ICEBERG_MASK_DEFAULT_LOCATION("hive.iceberg.mask.default.location", false,
22312231
"If this is set to true the URI for auth will have the default location masked with DEFAULT_TABLE_LOCATION"),
2232+
HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY("hive.iceberg.allow.datafiles.in.table.location.only", false,
2233+
"If this is set to true, then all the data files being read should be withing the table location"),
22322234

22332235
HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true,
22342236
"If this is set the header for RCFiles will simply be RCF. If this is not\n" +
@@ -5575,7 +5577,8 @@ public static enum ConfVars {
55755577
"hive.zookeeper.ssl.keystore.type," +
55765578
"hive.zookeeper.ssl.truststore.location," +
55775579
"hive.zookeeper.ssl.truststore.password," +
5578-
"hive.zookeeper.ssl.truststore.type",
5580+
"hive.zookeeper.ssl.truststore.type," +
5581+
"hive.iceberg.allow.datafiles.in.table.location.only",
55795582
"Comma separated list of configuration options which are immutable at runtime"),
55805583
HIVE_CONF_HIDDEN_LIST("hive.conf.hidden.list",
55815584
METASTOREPWD.varname + "," + HIVE_SERVER2_SSL_KEYSTORE_PASSWORD.varname

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,13 @@ public void configureInputJobProperties(TableDesc tableDesc, Map<String, String>
278278
overlayTableProperties(conf, tableDesc, map);
279279
// Until the vectorized reader can handle delete files, let's fall back to non-vector mode for V2 tables
280280
fallbackToNonVectorizedModeBasedOnProperties(tableDesc.getProperties());
281+
282+
boolean allowDataFilesWithinTableLocationOnly =
283+
conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
284+
HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
285+
286+
map.put(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
287+
String.valueOf(allowDataFilesWithinTableLocationOnly));
281288
}
282289

283290
@Override

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,11 @@
3333
import org.apache.commons.lang3.StringUtils;
3434
import org.apache.hadoop.conf.Configuration;
3535
import org.apache.hadoop.fs.Path;
36+
import org.apache.hadoop.hive.common.FileUtils;
37+
import org.apache.hadoop.hive.conf.HiveConf;
3638
import org.apache.hadoop.hive.llap.LlapHiveUtils;
3739
import org.apache.hadoop.hive.ql.exec.Utilities;
40+
import org.apache.hadoop.hive.ql.metadata.AuthorizationException;
3841
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
3942
import org.apache.hadoop.hive.ql.plan.MapWork;
4043
import org.apache.hadoop.mapred.JobConf;
@@ -218,13 +221,22 @@ public List<InputSplit> getSplits(JobContext context) {
218221
scan = applyConfig(conf, createTableScan(table, conf));
219222
}
220223

224+
boolean allowDataFilesWithinTableLocationOnly =
225+
conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
226+
HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
227+
Path tableLocation = new Path(conf.get(InputFormatConfig.TABLE_LOCATION));
228+
229+
221230
try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
222231
tasksIterable.forEach(task -> {
223232
if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE ||
224233
model == InputFormatConfig.InMemoryDataModel.PIG)) {
225234
// TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
226235
checkResiduals(task);
227236
}
237+
if (allowDataFilesWithinTableLocationOnly) {
238+
validateFileLocations(task, tableLocation);
239+
}
228240
splits.add(new IcebergSplit(conf, task));
229241
});
230242
} catch (IOException e) {
@@ -241,6 +253,14 @@ public List<InputSplit> getSplits(JobContext context) {
241253
return splits;
242254
}
243255

256+
private static void validateFileLocations(CombinedScanTask split, Path tableLocation) {
257+
for (FileScanTask fileScanTask : split.files()) {
258+
if (!FileUtils.isPathWithinSubtree(new Path(fileScanTask.file().path().toString()), tableLocation)) {
259+
throw new AuthorizationException("The table contains paths which are outside the table location");
260+
}
261+
}
262+
}
263+
244264
private static void checkResiduals(CombinedScanTask task) {
245265
task.files().forEach(fileScanTask -> {
246266
Expression residual = fileScanTask.residual();
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.iceberg.mr.hive;
21+
22+
import java.io.IOException;
23+
import java.util.Collections;
24+
import java.util.List;
25+
import org.apache.commons.collections4.ListUtils;
26+
import org.apache.iceberg.AssertHelpers;
27+
import org.apache.iceberg.catalog.TableIdentifier;
28+
import org.junit.BeforeClass;
29+
import org.junit.Test;
30+
31+
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY;
32+
33+
public class TestHiveIcebergRestrictDataFiles extends HiveIcebergStorageHandlerWithEngineBase {
34+
35+
@BeforeClass
36+
public static void beforeClass() {
37+
shell = HiveIcebergStorageHandlerTestUtils.shell(
38+
Collections.singletonMap(HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname, "true"));
39+
}
40+
41+
@Test
42+
public void testRestrictDataFiles() throws IOException, InterruptedException {
43+
TableIdentifier table1 = TableIdentifier.of("default", "tab1");
44+
testTables.createTableWithVersions(shell, table1.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
45+
fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
46+
47+
AssertHelpers.assertThrows("Should throw exception since there are files outside the table directory",
48+
IllegalArgumentException.class, "The table contains paths which are outside the table location",
49+
() -> shell.executeStatement("SELECT * FROM " + table1.name()));
50+
51+
// Create another table with files within the table location
52+
TableIdentifier table2 = TableIdentifier.of("default", "tab2");
53+
testTables.createTableWithVersions(shell, table2.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
54+
fileFormat, null, 0);
55+
56+
shell.executeStatement(
57+
testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, table2, false));
58+
59+
List<Object[]> result = shell.executeStatement("SELECT * FROM " + table2.name());
60+
61+
HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
62+
HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, result), 0);
63+
64+
// Insert some more records to generate new Data file
65+
shell.executeStatement(
66+
testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1, table2, false));
67+
68+
result = shell.executeStatement("SELECT * FROM " + table2.name());
69+
70+
HiveIcebergTestUtils.validateData(ListUtils.union(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
71+
HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1),
72+
HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, result), 0);
73+
}
74+
}

itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ public static void startServices() throws Exception {
109109
addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.location");
110110
addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.password");
111111
addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.type");
112+
addToExpectedRestrictedMap("hive.iceberg.allow.datafiles.in.table.location.only");
112113

113114
checkRestrictedListMatch();
114115
}

0 commit comments

Comments
 (0)