Skip to content

Commit

Permalink
IMPALA-3654: Parquet stats filtering for IN predicate
Browse files Browse the repository at this point in the history
This generates min/max predicates for InPredicates that
have only constant values in the IN list. It is only
used for statistics filtering on Parquet files.

Change-Id: I4a88963a7206f40a867e49eceeaf03fdd4f71997
Reviewed-on: http://gerrit.cloudera.org:8080/6810
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Impala Public Jenkins
  • Loading branch information
joemcdonnell authored and Impala Public Jenkins committed May 6, 2017
1 parent c26a485 commit aa05c64
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 36 deletions.
110 changes: 79 additions & 31 deletions fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@
import org.apache.impala.analysis.BinaryPredicate;
import org.apache.impala.analysis.DescriptorTable;
import org.apache.impala.analysis.Expr;
import org.apache.impala.analysis.FunctionCallExpr;
import org.apache.impala.analysis.InPredicate;
import org.apache.impala.analysis.LiteralExpr;
import org.apache.impala.analysis.NullLiteral;
import org.apache.impala.analysis.SlotDescriptor;
import org.apache.impala.analysis.SlotId;
import org.apache.impala.analysis.SlotRef;
Expand Down Expand Up @@ -326,6 +330,77 @@ private void buildStatsPredicate(Analyzer analyzer, SlotRef inputSlot,
minMaxConjuncts_.add(statsPred);
}

private void tryComputeBinaryMinMaxPredicate(Analyzer analyzer,
BinaryPredicate binaryPred) {
// We only support slot refs on the left hand side of the predicate, a rewriting
// rule makes sure that all compatible exprs are rewritten into this form. Only
// implicit casts are supported.
SlotRef slot = binaryPred.getChild(0).unwrapSlotRef(true);
if (slot == null) return;

// This node is a table scan, so this must be a scanning slot.
Preconditions.checkState(slot.getDesc().isScanSlot());
// If the column is null, then this can be a 'pos' scanning slot of a nested type.
if (slot.getDesc().getColumn() == null) return;

Expr constExpr = binaryPred.getChild(1);
// Only constant exprs can be evaluated against parquet::Statistics. This includes
// LiteralExpr, but can also be an expr like "1 + 2".
if (!constExpr.isConstant()) return;
if (constExpr.isNullLiteral()) return;

BinaryPredicate.Operator op = binaryPred.getOp();
if (op == BinaryPredicate.Operator.LT || op == BinaryPredicate.Operator.LE ||
op == BinaryPredicate.Operator.GE || op == BinaryPredicate.Operator.GT) {
minMaxOriginalConjuncts_.add(binaryPred);
buildStatsPredicate(analyzer, slot, binaryPred, op);
} else if (op == BinaryPredicate.Operator.EQ) {
minMaxOriginalConjuncts_.add(binaryPred);
// TODO: this could be optimized for boolean columns.
buildStatsPredicate(analyzer, slot, binaryPred, BinaryPredicate.Operator.LE);
buildStatsPredicate(analyzer, slot, binaryPred, BinaryPredicate.Operator.GE);
}
}

private void tryComputeInListMinMaxPredicate(Analyzer analyzer, InPredicate inPred) {
// Retrieve the left side of the IN predicate. It must be a simple slot to
// proceed.
SlotRef slot = inPred.getBoundSlot();
if (slot == null) return;
// This node is a table scan, so this must be a scanning slot.
Preconditions.checkState(slot.getDesc().isScanSlot());
// If the column is null, then this can be a 'pos' scanning slot of a nested type.
if (slot.getDesc().getColumn() == null) return;
if (inPred.isNotIn()) return;

ArrayList<Expr> children = inPred.getChildren();
LiteralExpr min = null;
LiteralExpr max = null;
for (int i = 1; i < children.size(); ++i) {
Expr child = children.get(i);

// If any child is not a literal, then nothing can be done
if (!child.isLiteral()) return;
LiteralExpr literalChild = (LiteralExpr) child;
// If any child is NULL, then there is not a valid min/max. Nothing can be done.
if (literalChild instanceof NullLiteral) return;

if (min == null || literalChild.compareTo(min) < 0) min = literalChild;
if (max == null || literalChild.compareTo(max) > 0) max = literalChild;
}
Preconditions.checkState(min != null);
Preconditions.checkState(max != null);

BinaryPredicate minBound = new BinaryPredicate(BinaryPredicate.Operator.GE,
children.get(0).clone(), min.clone());
BinaryPredicate maxBound = new BinaryPredicate(BinaryPredicate.Operator.LE,
children.get(0).clone(), max.clone());

minMaxOriginalConjuncts_.add(inPred);
buildStatsPredicate(analyzer, slot, minBound, minBound.getOp());
buildStatsPredicate(analyzer, slot, maxBound, maxBound.getOp());
}

/**
* Analyzes 'conjuncts_', populates 'minMaxTuple_' with slots for statistics values, and
* populates 'minMaxConjuncts_' with conjuncts pointing into the 'minMaxTuple_'. Only
Expand All @@ -340,38 +415,11 @@ private void computeMinMaxTupleAndConjuncts(Analyzer analyzer) throws ImpalaExce
minMaxTuple_.setPath(desc_.getPath());

for (Expr pred: conjuncts_) {
if (!(pred instanceof BinaryPredicate)) continue;
BinaryPredicate binaryPred = (BinaryPredicate) pred;

// We only support slot refs on the left hand side of the predicate, a rewriting
// rule makes sure that all compatible exprs are rewritten into this form. Only
// implicit casts are supported.
SlotRef slot = binaryPred.getChild(0).unwrapSlotRef(true);
if (slot == null) continue;

// This node is a table scan, so this must be a scanning slot.
Preconditions.checkState(slot.getDesc().isScanSlot());
// If the column is null, then this can be a 'pos' scanning slot of a nested type.
if (slot.getDesc().getColumn() == null) continue;

Expr constExpr = binaryPred.getChild(1);
// Only constant exprs can be evaluated against parquet::Statistics. This includes
// LiteralExpr, but can also be an expr like "1 + 2".
if (!constExpr.isConstant()) continue;
if (constExpr.isNullLiteral()) continue;

BinaryPredicate.Operator op = binaryPred.getOp();
if (op == BinaryPredicate.Operator.LT || op == BinaryPredicate.Operator.LE ||
op == BinaryPredicate.Operator.GE || op == BinaryPredicate.Operator.GT) {
minMaxOriginalConjuncts_.add(pred);
buildStatsPredicate(analyzer, slot, binaryPred, op);
} else if (op == BinaryPredicate.Operator.EQ) {
minMaxOriginalConjuncts_.add(pred);
// TODO: this could be optimized for boolean columns.
buildStatsPredicate(analyzer, slot, binaryPred, BinaryPredicate.Operator.LE);
buildStatsPredicate(analyzer, slot, binaryPred, BinaryPredicate.Operator.GE);
if (pred instanceof BinaryPredicate) {
tryComputeBinaryMinMaxPredicate(analyzer, (BinaryPredicate) pred);
} else if (pred instanceof InPredicate) {
tryComputeInListMinMaxPredicate(analyzer, (InPredicate) pred);
}

}
minMaxTuple_.computeMemLayout();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ PLAN-ROOT SINK
====
# Test a variety of types
select count(*) from functional_parquet.alltypes
where id = 1 and bool_col and tinyint_col < 50 and smallint_col > 50
where id = 1 and bool_col and tinyint_col < 50 and smallint_col in (1,2,3,4,5)
and mod(int_col,2) = 1 and bigint_col < 5000 and float_col > 50.00
and double_col > 100.00 and date_string_col > '1993-10-01' and string_col > 'aaaa'
and double_col > 100.00 and date_string_col > '1993-10-01'
and string_col in ('aaaa', 'bbbb', 'cccc')
and timestamp_cmp(timestamp_col, '2016-11-20 00:00:00') = 1
and year > 2000 and month < 12;
---- PLAN
Expand All @@ -45,11 +46,39 @@ PLAN-ROOT SINK
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=22/24 files=22 size=143.36KB
predicates: bool_col, bigint_col < 5000, double_col > 100.00, float_col > 50.00, id = 1, smallint_col > 50, tinyint_col < 50, string_col > 'aaaa', mod(int_col, 2) = 1, timestamp_cmp(timestamp_col, TIMESTAMP '2016-11-20 00:00:00') = 1, date_string_col > '1993-10-01'
predicates: bool_col, bigint_col < 5000, double_col > 100.00, float_col > 50.00, id = 1, tinyint_col < 50, string_col IN ('aaaa', 'bbbb', 'cccc'), smallint_col IN (1, 2, 3, 4, 5), mod(int_col, 2) = 1, timestamp_cmp(timestamp_col, TIMESTAMP '2016-11-20 00:00:00') = 1, date_string_col > '1993-10-01'
table stats: unavailable
columns missing stats: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col
parquet statistics predicates: bigint_col < 5000, double_col > 100.00, float_col > 50.00, id = 1, smallint_col > 50, tinyint_col < 50, string_col > 'aaaa', date_string_col > '1993-10-01'
parquet dictionary predicates: bool_col, bigint_col < 5000, double_col > 100.00, float_col > 50.00, id = 1, smallint_col > 50, tinyint_col < 50, string_col > 'aaaa', mod(int_col, 2) = 1, timestamp_cmp(timestamp_col, TIMESTAMP '2016-11-20 00:00:00') = 1, date_string_col > '1993-10-01'
parquet statistics predicates: bigint_col < 5000, double_col > 100.00, float_col > 50.00, id = 1, tinyint_col < 50, string_col IN ('aaaa', 'bbbb', 'cccc'), smallint_col IN (1, 2, 3, 4, 5), date_string_col > '1993-10-01'
parquet dictionary predicates: bool_col, bigint_col < 5000, double_col > 100.00, float_col > 50.00, id = 1, tinyint_col < 50, string_col IN ('aaaa', 'bbbb', 'cccc'), smallint_col IN (1, 2, 3, 4, 5), mod(int_col, 2) = 1, timestamp_cmp(timestamp_col, TIMESTAMP '2016-11-20 00:00:00') = 1, date_string_col > '1993-10-01'
mem-estimate=128.00MB mem-reservation=0B
tuple-ids=0 row-size=80B cardinality=unavailable
====
# Test negative cases for IN predicate min/max filtering
# - NOT IN
# - IN list with NULL
# - IN list contains non-Literals
# - complex expression on left side of IN
select count(*) from functional_parquet.alltypes
where id NOT IN (0,1,2) and string_col IN ('aaaa', 'bbbb', 'cccc', NULL)
and mod(int_col,50) IN (0,1)
and id IN (int_col);
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| mem-estimate=10.00MB mem-reservation=0B
| tuple-ids=1 row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=173.09KB
predicates: id IN (int_col), id NOT IN (0, 1, 2), string_col IN ('aaaa', 'bbbb', 'cccc', NULL), mod(int_col, 50) IN (0, 1)
table stats: unavailable
column stats: unavailable
parquet dictionary predicates: id NOT IN (0, 1, 2), string_col IN ('aaaa', 'bbbb', 'cccc', NULL), mod(int_col, 50) IN (0, 1)
mem-estimate=48.00MB mem-reservation=0B
tuple-ids=0 row-size=24B cardinality=unavailable
====
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,20 @@ select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
row_regex: .*NumRowGroups: 2 .*
row_regex: .*NumStatsFilteredRowGroups: 0 .*
====
---- QUERY
# Test the conversion of constant IN lists to min/max predicats
select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 24
====
---- QUERY
select count(*) from functional_parquet.alltypes where id IN (1,25,49);
---- RESULTS
3
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 23
====

0 comments on commit aa05c64

Please sign in to comment.