Skip to content

Commit

Permalink
HIVE-21714: Insert overwrite on an acid/mm table is ineffective if the input is empty (Ivan Suller via Zoltan Haindrich)
Browse files Browse the repository at this point in the history

Signed-off-by: Zoltan Haindrich <kirk@rxd.hu>
  • Loading branch information
Ivan Suller authored and kgyrtkirk committed May 16, 2019
1 parent ac477b6 commit 9a10bc2
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 19 deletions.
Expand Up @@ -1363,7 +1363,7 @@ public void sqlInsertPartition() throws Exception {
// Test a limit higher than available events
testEventCounts(defaultDbName, firstEventId, null, 100, 13);
// Test toEventId lower than current eventId
testEventCounts(defaultDbName, firstEventId, (long) firstEventId + 5, null, 5);
testEventCounts(defaultDbName, firstEventId, firstEventId + 5, null, 5);

// Event 10, 11, 12
driver.run("insert into table " + tblName + " partition (ds = 'yesterday') values (2)");
Expand Down Expand Up @@ -1448,7 +1448,8 @@ public void sqlInsertPartition() throws Exception {
insertMsg = md.getInsertMessage(event.getMessage());
assertTrue(insertMsg.isReplace());
// replace-overwrite introduces no new files
assertTrue(event.getMessage().matches(".*\"files\":\\[\\].*"));
// the insert overwrite creates an empty file with the current change
//assertTrue(event.getMessage().matches(".*\"files\":\\[\\].*"));

event = rsp.getEvents().get(29);
assertEquals(firstEventId + 30, event.getEventId());
Expand All @@ -1464,11 +1465,11 @@ public void sqlInsertPartition() throws Exception {
// Test a limit within the available events
testEventCounts(defaultDbName, firstEventId, null, 10, 10);
// Test toEventId greater than current eventId
testEventCounts(defaultDbName, firstEventId, (long) firstEventId + 100, null, 31);
testEventCounts(defaultDbName, firstEventId, firstEventId + 100, null, 31);
// Test toEventId greater than current eventId with some limit within available events
testEventCounts(defaultDbName, firstEventId, (long) firstEventId + 100, 10, 10);
testEventCounts(defaultDbName, firstEventId, firstEventId + 100, 10, 10);
// Test toEventId greater than current eventId with some limit beyond available events
testEventCounts(defaultDbName, firstEventId, (long) firstEventId + 100, 50, 31);
testEventCounts(defaultDbName, firstEventId, firstEventId + 100, 50, 31);
}

private void verifyInsert(NotificationEvent event, String dbName, String tblName) throws Exception {
Expand Down
19 changes: 12 additions & 7 deletions ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
Expand Up @@ -98,7 +98,7 @@
**/
@SuppressWarnings("deprecation")
public class FileSinkOperator extends TerminalOperator<FileSinkDesc> implements
Serializable {
Serializable, IConfigureJobConf {

public static final Logger LOG = LoggerFactory.getLogger(FileSinkOperator.class);

Expand Down Expand Up @@ -1260,13 +1260,12 @@ public void closeOp(boolean abort) throws HiveException {
}

if (!bDynParts && !filesCreated) {
boolean skipFiles = "tez".equalsIgnoreCase(
boolean isTez = "tez".equalsIgnoreCase(
HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE));
if (skipFiles) {
Class<?> clazz = conf.getTableInfo().getOutputFileFormatClass();
skipFiles = !StreamingOutputFormat.class.isAssignableFrom(clazz);
}
if (!skipFiles) {
Class<?> clazz = conf.getTableInfo().getOutputFileFormatClass();
boolean isStreaming = StreamingOutputFormat.class.isAssignableFrom(clazz);

if (!isTez || isStreaming || this.isInsertOverwrite) {
createBucketFiles(fsp);
}
}
Expand Down Expand Up @@ -1607,4 +1606,10 @@ private boolean isNativeTable() {
return !conf.getTableInfo().isNonNative();
}

// NOTE(review): forces the execution engine to run this operator even when the
// source relation produces zero rows; otherwise an INSERT OVERWRITE with an
// empty input would skip the sink and leave the previous table contents in
// place (the defect tracked by HIVE-21714).
@Override
public void configureJobConf(JobConf job) {
  if (!conf.getInsertOverwrite()) {
    return;
  }
  job.setBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, true);
}
}
1 change: 1 addition & 0 deletions ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
Expand Up @@ -6060,3 +6060,4 @@ public StorageHandlerInfo getStorageHandlerInfo(Table table)
}
}
}

24 changes: 23 additions & 1 deletion ql/src/test/queries/clientpositive/insert_overwrite.q
Expand Up @@ -3,9 +3,15 @@ set hive.stats.column.autogather=false;
set hive.stats.autogather=false;
set hive.compute.query.using.stats=false;

set hive.create.as.insert.only=true;
set hive.default.fileformat.managed=ORC;
set hive.strict.managed.tables=true;
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;

CREATE EXTERNAL TABLE ext_non_part (col string);
INSERT INTO ext_non_part VALUES ('first'), ('second');
CREATE TABLE b LIKE ext_non_part;
CREATE TABLE b (col string);

INSERT OVERWRITE TABLE ext_non_part SELECT * FROM b;

Expand All @@ -23,6 +29,22 @@ INSERT OVERWRITE TABLE int_non_part SELECT * FROM b;
SELECT count(*) FROM int_non_part;

drop table int_non_part;

CREATE TABLE int_buck (col string)
CLUSTERED BY (col) INTO 4 BUCKETS;

INSERT INTO int_buck VALUES ('first'), ('second'), ('third'), ('fourth');

-- should be 4
SELECT count(*) FROM int_buck;

INSERT OVERWRITE TABLE int_buck SELECT col FROM b;

-- should be 0
SELECT count(*) FROM int_buck;
SELECT * FROM int_buck;

drop table int_buck;
drop table b;


Expand Down
66 changes: 64 additions & 2 deletions ql/src/test/results/clientpositive/llap/insert_overwrite.q.out
Expand Up @@ -15,11 +15,11 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ext_non_part
POSTHOOK: Lineage: ext_non_part.col SCRIPT []
PREHOOK: query: CREATE TABLE b LIKE ext_non_part
PREHOOK: query: CREATE TABLE b (col string)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@b
POSTHOOK: query: CREATE TABLE b LIKE ext_non_part
POSTHOOK: query: CREATE TABLE b (col string)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@b
Expand Down Expand Up @@ -92,6 +92,68 @@ POSTHOOK: query: drop table int_non_part
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@int_non_part
POSTHOOK: Output: default@int_non_part
PREHOOK: query: CREATE TABLE int_buck (col string)
CLUSTERED BY (col) INTO 4 BUCKETS
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@int_buck
POSTHOOK: query: CREATE TABLE int_buck (col string)
CLUSTERED BY (col) INTO 4 BUCKETS
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@int_buck
PREHOOK: query: INSERT INTO int_buck VALUES ('first'), ('second'), ('third'), ('fourth')
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@int_buck
POSTHOOK: query: INSERT INTO int_buck VALUES ('first'), ('second'), ('third'), ('fourth')
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@int_buck
POSTHOOK: Lineage: int_buck.col SCRIPT []
PREHOOK: query: SELECT count(*) FROM int_buck
PREHOOK: type: QUERY
PREHOOK: Input: default@int_buck
#### A masked pattern was here ####
POSTHOOK: query: SELECT count(*) FROM int_buck
POSTHOOK: type: QUERY
POSTHOOK: Input: default@int_buck
#### A masked pattern was here ####
4
PREHOOK: query: INSERT OVERWRITE TABLE int_buck SELECT col FROM b
PREHOOK: type: QUERY
PREHOOK: Input: default@b
PREHOOK: Output: default@int_buck
POSTHOOK: query: INSERT OVERWRITE TABLE int_buck SELECT col FROM b
POSTHOOK: type: QUERY
POSTHOOK: Input: default@b
POSTHOOK: Output: default@int_buck
POSTHOOK: Lineage: int_buck.col SIMPLE [(b)b.FieldSchema(name:col, type:string, comment:null), ]
PREHOOK: query: SELECT count(*) FROM int_buck
PREHOOK: type: QUERY
PREHOOK: Input: default@int_buck
#### A masked pattern was here ####
POSTHOOK: query: SELECT count(*) FROM int_buck
POSTHOOK: type: QUERY
POSTHOOK: Input: default@int_buck
#### A masked pattern was here ####
0
PREHOOK: query: SELECT * FROM int_buck
PREHOOK: type: QUERY
PREHOOK: Input: default@int_buck
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM int_buck
POSTHOOK: type: QUERY
POSTHOOK: Input: default@int_buck
#### A masked pattern was here ####
PREHOOK: query: drop table int_buck
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@int_buck
PREHOOK: Output: default@int_buck
POSTHOOK: query: drop table int_buck
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@int_buck
POSTHOOK: Output: default@int_buck
PREHOOK: query: drop table b
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@b
Expand Down
Expand Up @@ -732,7 +732,7 @@ STAGE PLANS:
columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
#### A masked pattern was here ####
name default.l3_monthly_dw_dimplan
numFiles 1
numFiles 64
numRows 180340
rawDataSize 269826156
serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_description, varchar(50) plan_status, varchar(50) period_start, varchar(50) period_end, varchar(1) plan_of_record, decimal(32,6) percentage, timestamp l3_created_date, varchar(30) bmo_cost_type, varchar(50) bmo_fiscal_year, timestamp clarity_updated_date, i64 is_latest_snapshot, i64 latest_fiscal_budget_plan, varchar(70) plan_category, varchar(250) last_updated_by}
Expand All @@ -755,7 +755,7 @@ STAGE PLANS:
columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
#### A masked pattern was here ####
name default.l3_monthly_dw_dimplan
numFiles 1
numFiles 64
numRows 180340
rawDataSize 269826156
serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_description, varchar(50) plan_status, varchar(50) period_start, varchar(50) period_end, varchar(1) plan_of_record, decimal(32,6) percentage, timestamp l3_created_date, varchar(30) bmo_cost_type, varchar(50) bmo_fiscal_year, timestamp clarity_updated_date, i64 is_latest_snapshot, i64 latest_fiscal_budget_plan, varchar(70) plan_category, varchar(250) last_updated_by}
Expand Down Expand Up @@ -1241,7 +1241,7 @@ STAGE PLANS:
columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
#### A masked pattern was here ####
name default.l3_monthly_dw_dimplan
numFiles 1
numFiles 64
numRows 180340
rawDataSize 269826156
serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_description, varchar(50) plan_status, varchar(50) period_start, varchar(50) period_end, varchar(1) plan_of_record, decimal(32,6) percentage, timestamp l3_created_date, varchar(30) bmo_cost_type, varchar(50) bmo_fiscal_year, timestamp clarity_updated_date, i64 is_latest_snapshot, i64 latest_fiscal_budget_plan, varchar(70) plan_category, varchar(250) last_updated_by}
Expand All @@ -1264,7 +1264,7 @@ STAGE PLANS:
columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
#### A masked pattern was here ####
name default.l3_monthly_dw_dimplan
numFiles 1
numFiles 64
numRows 180340
rawDataSize 269826156
serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_description, varchar(50) plan_status, varchar(50) period_start, varchar(50) period_end, varchar(1) plan_of_record, decimal(32,6) percentage, timestamp l3_created_date, varchar(30) bmo_cost_type, varchar(50) bmo_fiscal_year, timestamp clarity_updated_date, i64 is_latest_snapshot, i64 latest_fiscal_budget_plan, varchar(70) plan_category, varchar(250) last_updated_by}
Expand Down

0 comments on commit 9a10bc2

Please sign in to comment.