Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
176 commits
Select commit Hold shift + click to select a range
9540999
add datasketches udfs for hll
kgyrtkirk Feb 19, 2020
9b08b9c
add crap
kgyrtkirk Feb 19, 2020
83367d1
add sketches1
kgyrtkirk Feb 19, 2020
a3b87d8
updates to q
kgyrtkirk Feb 19, 2020
85182b4
add test/etc
kgyrtkirk Feb 19, 2020
47551f3
s2
kgyrtkirk Feb 19, 2020
48c738f
add 0
kgyrtkirk Feb 19, 2020
4952952
add
kgyrtkirk Feb 19, 2020
dedb374
fx
kgyrtkirk Feb 19, 2020
600bf83
s3
kgyrtkirk Feb 19, 2020
50aea64
add to conversion
kgyrtkirk Feb 19, 2020
c67f538
add sq way
kgyrtkirk Feb 20, 2020
c42114c
u
kgyrtkirk Feb 20, 2020
e95afb0
cok
kgyrtkirk Feb 20, 2020
3c5f794
add local clones of UDxFs
kgyrtkirk Feb 20, 2020
d9b2246
modify t struct1
kgyrtkirk Feb 20, 2020
5f7a36a
sketchtoestimate v0.1
kgyrtkirk Feb 20, 2020
dcde5cf
working sk2est
kgyrtkirk Feb 20, 2020
1056ee5
begin to split?
kgyrtkirk Feb 21, 2020
ef27ec1
IRoll
kgyrtkirk Feb 21, 2020
5e15ff4
rollup complete
kgyrtkirk Feb 21, 2020
a3f0283
ws change
kgyrtkirk Feb 21, 2020
301d47b
Merge remote-tracking branch 'apache/master' into HIVE-sketches
kgyrtkirk Feb 21, 2020
fc13dc0
Merge remote-tracking branch 'remotes/kgyrtkirk/HIVE-sketches' into H…
kgyrtkirk Feb 28, 2020
9d9319b
unpatch some parts
kgyrtkirk Feb 28, 2020
0aac13f
remove local crap
kgyrtkirk Feb 28, 2020
d9e5c34
add a real datasketches release
kgyrtkirk Feb 28, 2020
b52f961
prefi
kgyrtkirk Feb 28, 2020
fc5ee1e
register hll
kgyrtkirk Feb 28, 2020
5ad9c87
add fixme
kgyrtkirk Mar 2, 2020
7239510
renameX
kgyrtkirk Mar 2, 2020
d265d19
add a bunch
kgyrtkirk Mar 2, 2020
4e4d7c4
add more/fix/etc
kgyrtkirk Mar 2, 2020
dab0947
correct typo
kgyrtkirk Mar 2, 2020
b557a55
undo ws
kgyrtkirk Mar 2, 2020
7a50054
add/note/etc
kgyrtkirk Mar 2, 2020
aff4bec
UDF/UDAF name clash
kgyrtkirk Mar 2, 2020
3a363ce
hll example
kgyrtkirk Mar 2, 2020
7910a67
pom changes I
kgyrtkirk Mar 2, 2020
94030e3
remove preliminary qtests
kgyrtkirk Mar 2, 2020
c17c77e
fixme comment
kgyrtkirk Mar 2, 2020
ab34866
add theta
kgyrtkirk Mar 2, 2020
279d355
add theta
kgyrtkirk Mar 2, 2020
c81ef36
run tests with minillapolocal
kgyrtkirk Mar 2, 2020
c198f0f
Merge remote-tracking branch 'apache/master' into HIVE-22940-sketches…
kgyrtkirk Mar 10, 2020
94f211a
renames
kgyrtkirk Mar 10, 2020
14be625
more changes
kgyrtkirk Mar 10, 2020
25571c2
fix
kgyrtkirk Mar 10, 2020
9be818a
fix name
kgyrtkirk Mar 10, 2020
404a5ed
Merge remote-tracking branch 'apache/master' into HIVE-22940-sketches…
kgyrtkirk Mar 11, 2020
f68f110
Merge remote-tracking branch 'apache/master' into HIVE-22940-sketches…
kgyrtkirk Mar 12, 2020
b5ef5bb
cleanup/etc
kgyrtkirk Mar 16, 2020
6a808d0
Merge remote-tracking branch 'remotes/kgyrtkirk/HIVE-22940-sketches-f…
kgyrtkirk Mar 16, 2020
797d846
Merge remote-tracking branch 'remotes/kgyrtkirk/HIVE-22940-sketches-f…
kgyrtkirk Mar 16, 2020
379e6c7
add prototype codes
kgyrtkirk Mar 16, 2020
75dfaf5
Revert "add prototype codes"
kgyrtkirk Mar 16, 2020
4d4aff4
Revert "Revert "add prototype codes""
kgyrtkirk Mar 16, 2020
e59f913
use metrgable
kgyrtkirk Mar 17, 2020
4198bce
union2
kgyrtkirk Mar 17, 2020
2e97b3a
rollup0
kgyrtkirk Mar 17, 2020
cc092a8
there..it works
kgyrtkirk Mar 17, 2020
3622fd4
indent
kgyrtkirk Mar 17, 2020
ef5f5f1
remove
kgyrtkirk Mar 17, 2020
a771c4f
remove
kgyrtkirk Mar 17, 2020
0f8363c
add initial
kgyrtkirk Mar 17, 2020
09a5614
somewhat better
kgyrtkirk Mar 20, 2020
5769151
fx
kgyrtkirk Mar 20, 2020
8ef6215
Merge remote-tracking branch 'remotes/kgyrtkirk/HIVE-23030-rollup-uni…
kgyrtkirk Mar 20, 2020
328ead9
add
kgyrtkirk Mar 20, 2020
2fb0cd5
cleanup/etc
kgyrtkirk Mar 20, 2020
e65f192
HIVE-22998 : Dump partition info if hive.repl.dump.metadata.only.for.…
aasha Mar 13, 2020
c8d5191
HIVE-22964: MM table split computation is very slow (Aditya Shah revi…
Mar 13, 2020
4746cbb
HIVE-16355 HIVE-22893: addendum - missing ASF headers
kgyrtkirk Mar 13, 2020
0a73fce
HIVE-23008: UDAFExampleMaxMinNUtil.sortedMerge must be able to handle…
kgyrtkirk Mar 13, 2020
e1d9663
HIVE-22762: Leap day is incorrectly parsed during cast in Hive (Karen…
belugabehr Mar 13, 2020
54b5bba
HIVE-21778: CBO: "Struct is not null" gets evaluated as `nullable` al…
vineetgarg02 Mar 13, 2020
ab4aeb6
HIVE-21939 : protoc:2.5.0 dependence has broken building on aarch64. …
chinnaraolalam Mar 14, 2020
755e990
HIVE-22974: Metastore's table location check should be applied when l…
nrg4878 Mar 15, 2020
daae908
HIVE-23015: Fix HIVE_VECTORIZATION_GROUPBY_COMPLEX_TYPES_ENABLED defi…
pvargacl Mar 16, 2020
fc73fdf
HIVE-22985: Failed compaction always throws TxnAbortedException (Kare…
Mar 16, 2020
1c848b2
HIVE-22976: Oracle and MSSQL upgrade script missing the addition of W…
bmaidics Mar 16, 2020
fb30aaa
HIVE-22970: Add a qoption to enable tests to use transactional mode (…
kgyrtkirk Mar 16, 2020
fbc8a4c
HIVE-22959 : Extend storage-api to expose FilterContext (Panos G via …
Mar 16, 2020
8debe93
HIVE-23027: Fix syntax error in llap package.py (Rajesh Balamohan, re…
rbalamohan Mar 16, 2020
6b9170e
HIVE-23023: MR compaction ignores column schema evolution (Kare Coppa…
Mar 17, 2020
a8dcfb8
HIVE-23011: Shared work optimizer should check residual predicates wh…
jcamachor Mar 17, 2020
d9e005d
HIVE-22901: Variable substitution can lead to OOM on circular referen…
dvoros Mar 17, 2020
3c37e74
HIVE-22539: HiveServer2 SPNEGO authentication should skip if authoriz…
risdenk Mar 17, 2020
bba01c6
HIVE-22841: ThriftHttpServlet#getClientNameFromCookie should handle C…
risdenk Mar 17, 2020
be620a6
HIVE-23022 : Arrow deserializer should ensure size of hive vector equ…
Mar 18, 2020
7936a94
HIVE-22990 : Build acknowledgement mechanism for repl dump and load. …
aasha Mar 18, 2020
6ff297e
HIVE-23019: Fix TestTxnCommandsForMmTable test case (Peter Varga via …
pvargacl Mar 18, 2020
17ad636
HIVE-22955 PreUpgradeTool can fail because access to CharsetDecoder i…
ghanko Mar 18, 2020
ccde408
HIVE-23034 : Arrow serializer should not keep the reference of arrow …
Mar 19, 2020
43d2440
HIVE-23002: Optimise LazyBinaryUtils.writeVLong (Rajesh Balamohan, re…
rbalamohan Mar 19, 2020
bce3225
HIVE-23035: Scheduled query executor may hang in case TezAMs are laun…
kgyrtkirk Mar 19, 2020
4551045
HIVE-23033: MSSQL metastore schema init script doesn't initialize NOT…
nrg4878 Mar 20, 2020
ef301e3
HIVE-23059 In constraint name uniqueness query use the MTable instead…
miklosgergely Mar 20, 2020
325e2ea
HIVE-23063 Use the same PerfLogger all over Compiler (Miklos Gergely,…
miklosgergely Mar 21, 2020
8d27295
Merge remote-tracking branch 'apache/master' into HIVE-22940-sketches…
kgyrtkirk Mar 22, 2020
a3630aa
no-transform
kgyrtkirk Mar 23, 2020
7523c8e
aa
kgyrtkirk Mar 23, 2020
d0058ad
it does work
kgyrtkirk Mar 23, 2020
55c5362
cleanup
kgyrtkirk Mar 23, 2020
4dbce31
Merge remote-tracking branch 'kgyrtkirk/HIVE-22940-sketches-fns' into…
kgyrtkirk Mar 23, 2020
99145fc
Merge remote-tracking branch 'apache/master' into HIVE-23031-rewrite-…
kgyrtkirk Mar 23, 2020
5c28eff
Merge remote-tracking branch 'kgyrtkirk/HIVE-22940-sketches-fns' into…
kgyrtkirk Mar 23, 2020
fb87cac
Merge remote-tracking branch 'apache/master' into HIVE-23030-rollup-u…
kgyrtkirk Mar 23, 2020
25fdd21
SketchFn enum
kgyrtkirk Mar 23, 2020
45c27eb
back to string consts
kgyrtkirk Mar 23, 2020
0b95dc2
register/x
kgyrtkirk Mar 23, 2020
062403b
fix/inline
kgyrtkirk Mar 23, 2020
8e07be3
chanma
kgyrtkirk Mar 23, 2020
cd21596
add to testconf
kgyrtkirk Mar 23, 2020
5adb78f
use map
kgyrtkirk Mar 23, 2020
d5ae78d
rename/cleanup/etc
kgyrtkirk Mar 23, 2020
7f08d49
fixes
kgyrtkirk Mar 23, 2020
cadb274
cleanup
kgyrtkirk Mar 23, 2020
c1037fd
cleanup
kgyrtkirk Mar 23, 2020
2deb2fb
cleanup
kgyrtkirk Mar 23, 2020
7064cae
unpatch DSF
kgyrtkirk Mar 23, 2020
7e49d62
Merge remote-tracking branch 'kgyrtkirk/HIVE-23030-rollup-union' into…
kgyrtkirk Mar 23, 2020
38f3b6e
remove crap
kgyrtkirk Mar 23, 2020
b2d8bf3
cleanup
kgyrtkirk Mar 25, 2020
25fc17d
reanme -a
kgyrtkirk Mar 25, 2020
0a31af6
reanme
kgyrtkirk Mar 25, 2020
60dcc01
add round
kgyrtkirk Mar 25, 2020
865a7f4
remove redundant test
kgyrtkirk Mar 25, 2020
960d29d
add round
kgyrtkirk Mar 25, 2020
8258555
singleton/etc
kgyrtkirk Mar 25, 2020
27b8600
cleanup
kgyrtkirk Mar 25, 2020
8458667
plugin reg
kgyrtkirk Mar 25, 2020
e1fd7f4
cleanup
kgyrtkirk Mar 25, 2020
739db22
cleanuo
kgyrtkirk Mar 25, 2020
708968d
fac epalm
kgyrtkirk Mar 25, 2020
1669827
Merge remote-tracking branch 'kgyrtkirk/HIVE-23030-rollup-union' into…
kgyrtkirk Mar 25, 2020
e2641d0
cleanup
kgyrtkirk Mar 29, 2020
fee7f7f
address review comments
kgyrtkirk Mar 30, 2020
541b3f9
Merge remote-tracking branch 'apache/master' into HIVE-23030-rollup-u…
kgyrtkirk Mar 30, 2020
8498aca
Merge remote-tracking branch 'apache/master' into HIVE-23031-rewrite-…
kgyrtkirk Mar 30, 2020
c8f9fbe
Merge remote-tracking branch 'kgyrtkirk/HIVE-23030-rollup-union' into…
kgyrtkirk Mar 30, 2020
79ae119
fx
kgyrtkirk Mar 30, 2020
9bdd2d7
added explicit drop for rollup
kgyrtkirk Mar 31, 2020
b0bd2a8
Merge remote-tracking branch 'kgyrtkirk/HIVE-23030-rollup-union' into…
kgyrtkirk Apr 6, 2020
f4bf29e
Merge remote-tracking branch 'apache/master' into HIVE-23031-rewrite-…
kgyrtkirk Apr 6, 2020
81602f4
Merge remote-tracking branch 'apache/master' into HIVE-23031-rewrite-…
kgyrtkirk Apr 15, 2020
391eb98
Merge remote-tracking branch 'apache/master' into HIVE-23031-rewrite-…
kgyrtkirk Apr 16, 2020
3a84bed
remove ws changes
kgyrtkirk Apr 21, 2020
eb3abe3
Merge remote-tracking branch 'apache/master' into HIVE-23031-rewrite-…
kgyrtkirk Apr 21, 2020
a552736
cleanup
kgyrtkirk Apr 21, 2020
0e64fab
change dsf
kgyrtkirk Apr 21, 2020
cf61be4
cleaner fns
kgyrtkirk Apr 21, 2020
51dd712
add estimate to dsf
kgyrtkirk Apr 21, 2020
b9b7302
fix fixme/add estimate
kgyrtkirk Apr 21, 2020
2298c86
add options/etc
kgyrtkirk Apr 21, 2020
826d50a
fix test
kgyrtkirk Apr 21, 2020
4706018
add to conf
kgyrtkirk Apr 23, 2020
729cfe6
git statusMerge remote-tracking branch 'apache/master' into HIVE-2303…
kgyrtkirk Apr 24, 2020
306d7c3
remove hiveconf
kgyrtkirk Apr 24, 2020
4ffe943
cleanup
kgyrtkirk Apr 24, 2020
a9e9089
rename/etc
kgyrtkirk Apr 24, 2020
35bfd3e
add fixme
kgyrtkirk Apr 24, 2020
676c542
add comment
kgyrtkirk Apr 27, 2020
4474144
one-way to add return type...
kgyrtkirk Apr 28, 2020
011ed58
rename options
kgyrtkirk Apr 28, 2020
f64af08
cleanup
kgyrtkirk Apr 28, 2020
e2e89d9
correct comment
kgyrtkirk Apr 28, 2020
b856e9a
cleanup
kgyrtkirk Apr 28, 2020
e6ea2d4
cleanup
kgyrtkirk Apr 28, 2020
83367c3
\updates
kgyrtkirk Apr 28, 2020
e4b82a5
update q.out
kgyrtkirk Apr 28, 2020
35ad023
add test
kgyrtkirk Apr 30, 2020
6f707c0
add new test
kgyrtkirk Apr 30, 2020
a68917b
remove multitype
kgyrtkirk Apr 30, 2020
4d079d3
removed cpc/theta
kgyrtkirk Apr 30, 2020
5b460d6
dis
kgyrtkirk May 4, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
Original file line number Diff line number Diff line change
Expand Up @@ -2465,6 +2465,19 @@ public static enum ConfVars {
"If the number of references to a CTE clause exceeds this threshold, Hive will materialize it\n" +
"before executing the main query block. -1 will disable this feature."),

HIVE_OPTIMIZE_BI_ENABLED("hive.optimize.bi.enabled", false,
"Enables query rewrites based on approximate functions(sketches)."),

HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED("hive.optimize.bi.rewrite.countdistinct.enabled",
true,
"Enables to rewrite COUNT(DISTINCT(X)) queries to be rewritten to use sketch functions."),

HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH(
"hive.optimize.bi.rewrite.countdistinct.sketch", "hll",
new StringSet("hll", "cpc", "theta"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we limit the algorithm choices to a single one for the time being?
The reason I am asking this is that this will not work with materialized views. Since we are not storing in the SQL view definition the algorithm that we used to generate the column, if the property value changes, this would lead to errors.
Multi-algorithm support needs a little bit more work. One option would be to store this information in the MV table properties so we know how to interpret them when HS2 needs to load them (and thus parse them). What do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that would be necessary

  • it works correctly - even if we have 1 algo the interesting behaviour is still there - which is: if the rewrite is enabled the created MV will be a rewritten one
  • it won't get applied for different modes/etc so it doesn't lead to errors at all...

I've added a test(sketches_materialized_view_sketchtype.q)
which shows how it works when there is an MV for HLL ; in case the mode is not HLL the MV is ignored and computed directly

I think the real meaning of the MV should not change(I think we agree on this); we have 2 choices here:

  • ignore all rewriting during MV construction/rebuild - so this rewrite may not happen for an MV - and users have to use the expanded form of the sketch stuff to create an MV for that purpose
  • save a conf overlay alongside with the MV
    I think addressing this is outside of the scope of this change

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand for a single algorithm it will work. However, consider the following scenario:

  • A user enables BI mode and algorithm hll.
  • The user creates a MV with count distinct. The MV has stored the count distinct field using hll. The SQL statement still has count distinct.
  • We change default algorithm to cpc and restart HS2. Thus, when the MV is loaded by HS2, the count distinct is transformed to cpc.
  • The user runs a query with count distinct, which transforms to cpc, matches the MV... but fails at deserialization time because the sketch stored for the MV is hll.

That is why I suggested we could limit the options for algorithms till we have proper support. The risk I see if we do not do that now is that if anyone creates MVs using the different default algorithms, we will not have any way to distinguish between them anymore.

From the two choices that you mention above, I was suggesting the second option, since the main goal of the whole effort is to be able to use these algorithms seamlessly with the MVs. I agree it can be outside of the scope of this change, but let's limit the algorithm choices till then?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was not thinking about restarting HS2

sure...we can limit it to one - but if this incorrect behaviour does exist - then I think it could also be triggered with the main bi mode switch as well:

  • in one case there will be a sketch there
  • in the other some integer value

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've tried it out - I didn't see any exceptions; the MV match for a plain count(distinct id) didn't happen....
when I changed the default algo no exceptions happened; but matches were made incorrectly - so there could be dragons...

I've removed cpc/theta for now...we can add it back later

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, failure can still happen when the sketch is stored and the mode changes.

Thanks for making the changes in any case. Let's check in this patch and give priority to the overlay issue, it should not be too difficult to address and will fix all these issues.

"Defines which sketch type to use when rewriting COUNT(DISTINCT(X)) expressions. "
+ "Distinct counting can be done with: hll,cpc or theta"),

// Statistics
HIVE_STATS_ESTIMATE_STATS("hive.stats.estimate", true,
"Estimate statistics in absence of statistics."),
Expand Down
2 changes: 2 additions & 0 deletions itests/src/test/resources/testconfiguration.properties
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,9 @@ minillaplocal.query.files=\
schq_ingest.q,\
sketches_hll.q,\
sketches_theta.q,\
sketches_rewrite.q,\
sketches_materialized_view_rollup.q,\
sketches_materialized_view_rollup2.q,\
table_access_keys_stats.q,\
temp_table_llap_partitioned.q,\
tez_bmj_schema_evolution.q,\
Expand Down
110 changes: 91 additions & 19 deletions ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,28 @@

package org.apache.hadoop.hive.ql.exec;

import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import org.apache.calcite.jdbc.JavaTypeFactoryImpl;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeImpl;
import org.apache.calcite.rel.type.RelProtoDataType;
import org.apache.calcite.sql.SqlFunction;
import org.apache.calcite.sql.SqlFunctionCategory;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.type.InferTypes;
import org.apache.calcite.sql.type.OperandTypes;
import org.apache.calcite.sql.type.ReturnTypes;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl;
import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveMergeableAggregate;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSqlFunction;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver2;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hive.plugin.api.HiveUDFPlugin;
Expand All @@ -48,9 +55,9 @@ public final class DataSketchesFunctions implements HiveUDFPlugin {

private static final String DATASKETCHES_PREFIX = "ds";

private static final String DATA_TO_SKETCH = "sketch";
public static final String DATA_TO_SKETCH = "sketch";
public static final String SKETCH_TO_ESTIMATE = "estimate";
private static final String SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS = "estimate_bounds";
private static final String SKETCH_TO_ESTIMATE = "estimate";
private static final String SKETCH_TO_STRING = "stringify";
private static final String UNION_SKETCH = "union";
private static final String UNION_SKETCH1 = "union_f";
Expand All @@ -73,12 +80,12 @@ public final class DataSketchesFunctions implements HiveUDFPlugin {
private static final String SKETCH_TO_VARIANCES = "variances";
private static final String SKETCH_TO_PERCENTILE = "percentile";

private final List<SketchDescriptor> sketchClasses;
private final Map<String, SketchDescriptor> sketchClasses;
private final ArrayList<UDFDescriptor> descriptors;

private DataSketchesFunctions() {
this.sketchClasses = new ArrayList<SketchDescriptor>();
this.descriptors = new ArrayList<HiveUDFPlugin.UDFDescriptor>();
this.sketchClasses = new HashMap<>();
this.descriptors = new ArrayList<>();
registerHll();
registerCpc();
registerKll();
Expand All @@ -96,19 +103,31 @@ public Iterable<UDFDescriptor> getDescriptors() {
return descriptors;
}

/**
 * Looks up the descriptor registered for a function of a given sketch class.
 *
 * @param className key under which the sketch class was registered (e.g. "hll")
 * @param function key of the function within that sketch class (e.g. {@link #DATA_TO_SKETCH})
 * @return the descriptor stored for the requested function
 * @throws IllegalArgumentException if the sketch class is unknown or it has no such function
 */
public SketchFunctionDescriptor getSketchFunction(String className, String function) {
  if (!sketchClasses.containsKey(className)) {
    throw new IllegalArgumentException(String.format("Sketch-class '%s' doesn't exists", className));
  }
  SketchDescriptor sc = sketchClasses.get(className);
  if (!sc.fnMap.containsKey(function)) {
    // Both placeholders need an argument; supplying only one makes String.format throw
    // MissingFormatArgumentException and hides the real problem from the caller.
    throw new IllegalArgumentException(
        String.format("The Sketch-class '%s' doesn't have a '%s' method", className, function));
  }
  return sc.fnMap.get(function);
}

private void buildDescritors() {
for (SketchDescriptor sketchDescriptor : sketchClasses) {
for (SketchDescriptor sketchDescriptor : sketchClasses.values()) {
descriptors.addAll(sketchDescriptor.fnMap.values());
}
}

private void buildCalciteFns() {
for (SketchDescriptor sd : sketchClasses) {
for (SketchDescriptor sd : sketchClasses.values()) {
// Mergeability is exposed to Calcite, which enables it to be used during rollup.
RelProtoDataType sketchType = RelDataTypeImpl.proto(SqlTypeName.BINARY, true);

SketchFunctionDescriptor sketchSFD = sd.fnMap.get(DATA_TO_SKETCH);
SketchFunctionDescriptor unionSFD = sd.fnMap.get(UNION_SKETCH);
SketchFunctionDescriptor estimateSFD = sd.fnMap.get(SKETCH_TO_ESTIMATE);

if (sketchSFD == null || unionSFD == null) {
continue;
Expand All @@ -128,14 +147,27 @@ private void buildCalciteFns() {
OperandTypes.family(),
unionFn);


unionSFD.setCalciteFunction(unionFn);
sketchSFD.setCalciteFunction(sketchFn);
if (estimateSFD != null && estimateSFD.getReturnRelDataType().isPresent()) {

SqlFunction estimateFn = new HiveSqlFunction(estimateSFD.name,
SqlKind.OTHER_FUNCTION,
ReturnTypes.explicit(estimateSFD.getReturnRelDataType().get().getSqlTypeName()),
InferTypes.ANY_NULLABLE,
OperandTypes.family(),
SqlFunctionCategory.USER_DEFINED_FUNCTION,
true,
false);

estimateSFD.setCalciteFunction(estimateFn);
}
}
}


private void registerHiveFunctionsInternal(Registry system) {
for (SketchDescriptor sketchDescriptor : sketchClasses) {
for (SketchDescriptor sketchDescriptor : sketchClasses.values()) {
Collection<SketchFunctionDescriptor> functions = sketchDescriptor.fnMap.values();
for (SketchFunctionDescriptor fn : functions) {
if (UDF.class.isAssignableFrom(fn.udfClass)) {
Expand Down Expand Up @@ -165,6 +197,7 @@ private static class SketchFunctionDescriptor implements HiveUDFPlugin.UDFDescri
String name;
Class<?> udfClass;
private SqlFunction calciteFunction;
private Class<?> returnType;

public SketchFunctionDescriptor(String name, Class<?> udfClass) {
this.name = name;
Expand All @@ -181,6 +214,19 @@ public String getFunctionName() {
return name;
}

/**
 * Converts the recorded Java return type into a Calcite type.
 *
 * @return the Calcite type created for {@code returnType}, or an empty Optional
 *         when no return type has been set on this descriptor
 */
public Optional<RelDataType> getReturnRelDataType() {
  if (returnType != null) {
    RelDataType relType = new JavaTypeFactoryImpl(new HiveTypeSystemImpl()).createType(returnType);
    return Optional.of(relType);
  }
  return Optional.empty();
}

/**
 * Records the Java return type of the function.
 *
 * @param returnType the Java class the function returns; stored as-is and later
 *        read by {@code getReturnRelDataType()}
 */
public void setReturnType(Class<?> returnType) {
this.returnType = returnType;
}

@Override
public Optional<SqlFunction> getCalciteFunction() {
return Optional.ofNullable(calciteFunction);
Expand All @@ -189,6 +235,11 @@ public Optional<SqlFunction> getCalciteFunction() {
public void setCalciteFunction(SqlFunction calciteFunction) {
this.calciteFunction = calciteFunction;
}

/**
 * Renders this descriptor as its canonical class name followed by the
 * function name in square brackets.
 */
@Override
public String toString() {
  return String.format("%s[%s]", getClass().getCanonicalName(), name);
}
}

private static class SketchDescriptor {
Expand All @@ -201,7 +252,28 @@ public SketchDescriptor(String string) {
}

private void register(String name, Class<?> clazz) {
fnMap.put(name, new SketchFunctionDescriptor(functionPrefix + name, clazz));
SketchFunctionDescriptor value = new SketchFunctionDescriptor(functionPrefix + name, clazz);
if (UDF.class.isAssignableFrom(clazz)) {
Optional<Method> evaluateMethod = getEvaluateMethod(clazz);
if (evaluateMethod.isPresent()) {
value.setReturnType(evaluateMethod.get().getReturnType());
}
}
fnMap.put(name, value);
}

/**
 * Finds the public method named "evaluate" on the given class, but only when
 * there is exactly one such method.
 *
 * @param clazz the class whose public methods are inspected
 * @return the single "evaluate" method, or an empty Optional when the class
 *         exposes none or more than one
 */
private Optional<Method> getEvaluateMethod(Class<?> clazz) {
  Method candidate = null;
  int matches = 0;
  for (Method m : clazz.getMethods()) {
    if (m.getName().equals("evaluate")) {
      matches++;
      candidate = m;
    }
  }
  // With exactly one match, candidate is that sole method.
  return matches == 1 ? Optional.of(candidate) : Optional.empty();
}
}

Expand All @@ -214,7 +286,7 @@ private void registerHll() {
sd.register(SKETCH_TO_STRING, org.apache.datasketches.hive.hll.SketchToStringUDF.class);
sd.register(UNION_SKETCH1, org.apache.datasketches.hive.hll.UnionSketchUDF.class);
sd.register(UNION_SKETCH, org.apache.datasketches.hive.hll.UnionSketchUDAF.class);
sketchClasses.add(sd);
sketchClasses.put("hll", sd);
}

private void registerCpc() {
Expand All @@ -228,7 +300,7 @@ private void registerCpc() {
sd.register(SKETCH_TO_STRING, org.apache.datasketches.hive.cpc.SketchToStringUDF.class);
sd.register(UNION_SKETCH1, org.apache.datasketches.hive.cpc.UnionSketchUDF.class);
sd.register(UNION_SKETCH, org.apache.datasketches.hive.cpc.UnionSketchUDAF.class);
sketchClasses.add(sd);
sketchClasses.put("cpc", sd);
}

private void registerKll() {
Expand All @@ -244,7 +316,7 @@ private void registerKll() {
sd.register(GET_QUANTILES, org.apache.datasketches.hive.kll.GetQuantilesUDF.class);
sd.register(GET_QUANTILE, org.apache.datasketches.hive.kll.GetQuantileUDF.class);
sd.register(GET_RANK, org.apache.datasketches.hive.kll.GetRankUDF.class);
sketchClasses.add(sd);
sketchClasses.put("kll", sd);
}

private void registerTheta() {
Expand All @@ -258,7 +330,7 @@ private void registerTheta() {
sd.register(INTERSECT_SKETCH, org.apache.datasketches.hive.theta.IntersectSketchUDAF.class);
sd.register(SKETCH_TO_ESTIMATE, org.apache.datasketches.hive.theta.EstimateSketchUDF.class);
sd.register(EXCLUDE_SKETCH, org.apache.datasketches.hive.theta.ExcludeSketchUDF.class);
sketchClasses.add(sd);
sketchClasses.put("theta", sd);

}

Expand All @@ -284,7 +356,7 @@ private void registerTupleArrayOfDoubles() {
org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToQuantilesSketchUDF.class);
sd.register(SKETCH_TO_VALUES, org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToValuesUDTF.class);
sd.register(SKETCH_TO_VARIANCES, org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToVariancesUDF.class);
sketchClasses.add(sd);
sketchClasses.put("tuple_arrayofdouble", sd);
}

private void registerTupleDoubleSummary() {
Expand All @@ -294,7 +366,7 @@ private void registerTupleDoubleSummary() {
sd.register(UNION_SKETCH, org.apache.datasketches.hive.tuple.UnionDoubleSummarySketchUDAF.class);
sd.register(SKETCH_TO_ESTIMATE, org.apache.datasketches.hive.tuple.DoubleSummarySketchToEstimatesUDF.class);
sd.register(SKETCH_TO_PERCENTILE, org.apache.datasketches.hive.tuple.DoubleSummarySketchToPercentileUDF.class);
sketchClasses.add(sd);
sketchClasses.put("tuple_doublesummary", sd);
}

private void registerQuantiles() {
Expand All @@ -312,7 +384,7 @@ private void registerFrequencies() {
sd.register(UNION_SKETCH, org.apache.datasketches.hive.frequencies.UnionStringsSketchUDAF.class);
sd.register(GET_FREQUENT_ITEMS,
org.apache.datasketches.hive.frequencies.GetFrequentItemsFromStringsSketchUDTF.class);
sketchClasses.add(sd);
sketchClasses.put("freq", sd);
}

private void registerQuantilesString() {
Expand All @@ -327,7 +399,7 @@ private void registerQuantilesString() {
sd.register(GET_PMF, org.apache.datasketches.hive.quantiles.GetPmfFromStringsSketchUDF.class);
sd.register(GET_QUANTILE, org.apache.datasketches.hive.quantiles.GetQuantileFromStringsSketchUDF.class);
sd.register(GET_QUANTILES, org.apache.datasketches.hive.quantiles.GetQuantilesFromStringsSketchUDF.class);
sketchClasses.add(sd);
sketchClasses.put("quantile_strings", sd);
}

private void registerQuantilesDoubles() {
Expand All @@ -342,7 +414,7 @@ private void registerQuantilesDoubles() {
sd.register(GET_PMF, org.apache.datasketches.hive.quantiles.GetPmfFromDoublesSketchUDF.class);
sd.register(GET_QUANTILE, org.apache.datasketches.hive.quantiles.GetQuantileFromDoublesSketchUDF.class);
sd.register(GET_QUANTILES, org.apache.datasketches.hive.quantiles.GetQuantilesFromDoublesSketchUDF.class);
sketchClasses.add(sd);
sketchClasses.put("quantile_doubles", sd);
}

}
Loading