Skip to content

Commit

Permalink
Merge pull request #66 from okumin/udf-descriptions-warning
Browse files Browse the repository at this point in the history
Add @description to UDFs registered in Apache Hive
  • Loading branch information
AlexanderSaydakov committed Feb 22, 2023
2 parents 35c889e + e3bafbe commit 50ef98e
Show file tree
Hide file tree
Showing 11 changed files with 68 additions and 5 deletions.
Expand Up @@ -55,7 +55,7 @@
extended = "Example:\n"
+ "> SELECT dataToSketch(val, 12) FROM src;\n"
+ "The return value is a binary blob that can be operated on by other sketch related functions."
+ " The lgK parameter controls the sketch size and rlative error expected from the sketch."
+ " The lgK parameter controls the sketch size and relative error expected from the sketch."
+ " It is optional and must be from 4 to 26. The default is 11, which is expected to yield errors"
+ " of roughly +-1.5% in the estimation of uniques with 95% confidence."
+ " The seed parameter is optional")
Expand Down
Expand Up @@ -51,7 +51,7 @@
extended = "Example:\n"
+ "> SELECT UnionSketch(sketch) FROM src;\n"
+ "The return value is a binary blob that can be operated on by other sketch related functions."
+ " The lgK parameter controls the sketch size and rlative error expected from the sketch."
+ " The lgK parameter controls the sketch size and relative error expected from the sketch."
+ " It is optional and must be from 4 to 26. The default is 11, which is expected to yield errors"
+ " of roughly +-1.5% in the estimation of uniques with 95% confidence."
+ " The seed parameter is optional")
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/org/apache/datasketches/hive/cpc/UnionSketchUDF.java
Expand Up @@ -24,12 +24,22 @@
import org.apache.datasketches.cpc.CpcSketch;
import org.apache.datasketches.cpc.CpcUnion;
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

/**
* Hive union sketch UDF.
*/
@Description(
name = "unionSketch",
value = "_FUNC_(firstSketch, secondSketch[, lgK[, seed]]) - Compute the union of the given "
+ "sketches with the given size and seed",
extended = "The return value is a binary blob that can be operated on by other sketch related functions."
+ " The lgK parameter controls the sketch size and relative error expected from the sketch."
+ " It is optional and must be from 4 to 26. The default is 11, which is expected to yield errors"
+ " of roughly +-1.5% in the estimation of uniques with 95% confidence."
+ " The seed parameter is optional")
@SuppressWarnings("deprecation")
public class UnionSketchUDF extends UDF {

Expand Down
Expand Up @@ -54,7 +54,7 @@
extended = "Example:\n"
+ "> SELECT dataToSketch(val, 12) FROM src;\n"
+ "The return value is a binary blob that can be operated on by other sketch related functions."
+ " The lgK parameter controls the sketch size and rlative error expected from the sketch."
+ " The lgK parameter controls the sketch size and relative error expected from the sketch."
+ " It is optional and must be from 4 to 21. The default is 12, which is expected to yield errors"
+ " of roughly +-3% in the estimation of uniques with 95% confidence."
+ " The target type parameter is optional and must be 'HLL_4', 'HLL_6' or 'HLL_8'."
Expand Down
Expand Up @@ -50,7 +50,7 @@
extended = "Example:\n"
+ "> SELECT UnionSketch(sketch) FROM src;\n"
+ "The return value is a binary blob that can be operated on by other sketch related functions."
+ " The lgK parameter controls the sketch size and rlative error expected from the sketch."
+ " The lgK parameter controls the sketch size and relative error expected from the sketch."
+ " It is optional and must be from 4 to 21. The default is 12, which is expected to yield errors"
+ " of roughly +-3% in the estimation of uniques with 95% confidence."
+ " The target type parameter is optional and must be 'HLL_4', 'HLL_6' or 'HLL_8'."
Expand Down
11 changes: 11 additions & 0 deletions src/main/java/org/apache/datasketches/hive/hll/UnionSketchUDF.java
Expand Up @@ -23,12 +23,23 @@
import org.apache.datasketches.hll.HllSketch;
import org.apache.datasketches.hll.TgtHllType;
import org.apache.datasketches.hll.Union;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

/**
* Hive union sketch UDF.
*/
@Description(
name = "unionSketch",
value = "_FUNC_(firstSketch, secondSketch[, lgK[, type]]) - Compute the union of the given "
+ "sketches with the given size and target type",
extended = "The return value is a binary blob that can be operated on by other sketch related functions."
+ " The lgK parameter controls the sketch size and relative error expected from the sketch."
+ " It is optional and must be from 4 to 21. The default is 12, which is expected to yield errors"
+ " of roughly +-3% in the estimation of uniques with 95% confidence."
+ " The target type parameter is optional and must be 'HLL_4', 'HLL_6' or 'HLL_8'."
+ " The default is 'HLL_4'")
@SuppressWarnings("deprecation")
public class UnionSketchUDF extends UDF {

Expand Down
Expand Up @@ -24,13 +24,17 @@
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.theta.Sketch;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

/**
* Hive estimate sketch udf. V4
*
*/
@Description(
name = "estimateSketch",
value = "_FUNC_(sketch) - Return the estimated unique count of the given sketch",
extended = "The given sketch is a binary blob computed by other Theta Sketch UDFs")
@SuppressWarnings("deprecation")
public class EstimateSketchUDF extends UDF {

Expand Down
Expand Up @@ -25,13 +25,22 @@
import org.apache.datasketches.theta.AnotB;
import org.apache.datasketches.theta.SetOperation;
import org.apache.datasketches.theta.Sketch;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

/**
* Hive exclude sketch UDF. (i.e. in sketch a but not in sketch b)
*
*/
@Description(
name = "excludeSketch",
value = "_FUNC_(firstSketch, secondSketch[, seed]) - Computes the set difference, A-AND-NOT-B, "
+ "of the given sketches",
extended = "The return value is a binary blob that contains a compact sketch, which can "
+ "be operated on by the other sketch-related functions. "
+ "The seed is optional, "
+ "and using it is not recommended unless you really know why you need it.")
@SuppressWarnings("deprecation")
public class ExcludeSketchUDF extends UDF {

Expand Down
Expand Up @@ -25,13 +25,22 @@
import org.apache.datasketches.theta.Intersection;
import org.apache.datasketches.theta.SetOperation;
import org.apache.datasketches.theta.Sketch;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

/**
* Hive intersection sketch UDF.
*
*/
@Description(
name = "intersectSketch",
value = "_FUNC_(firstSketch, secondSketch[, seed]) - Compute the intersection of the "
+ "given sketches",
extended = "The return value is a binary blob that contains a compact sketch, which can "
+ "be operated on by the other sketch-related functions. "
+ "The seed is optional, "
+ "and using it is not recommended unless you really know why you need it.")
@SuppressWarnings("deprecation")
public class IntersectSketchUDF extends UDF {

Expand Down
Expand Up @@ -25,12 +25,25 @@
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.theta.SetOperation;
import org.apache.datasketches.theta.Union;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

/**
* Hive union sketch UDF.
*/
@Description(
name = "unionSketch",
value = "_FUNC_(firstSketch, secondSketch[, size[, seed]]) - Compute the union of the given "
+ "sketches with the given size and seed",
extended = "The return value is a binary blob that contains a compact sketch, which can "
+ "be operated on by the other sketch-related functions. The optional "
+ "size must be a power of 2, and controls the relative error of the expected "
+ "result. A size of 16384 can be expected to yield errors of roughly +-1.5% "
+ "in the estimation of uniques with 95% confidence. "
+ "The default size is defined in the sketches-core library and at the time of this writing "
+ "was 4096 (about 3% error). "
+ "The seed is optional, and using it is not recommended unless you really know why you need it")
@SuppressWarnings("deprecation")
public class UnionSketchUDF extends UDF {

Expand Down
Expand Up @@ -27,6 +27,7 @@
import org.apache.datasketches.tuple.arrayofdoubles.ArrayOfDoublesSketch;
import org.apache.datasketches.tuple.arrayofdoubles.ArrayOfDoublesSketchIterator;
import org.apache.datasketches.tuple.arrayofdoubles.ArrayOfDoublesSketches;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
Expand All @@ -38,6 +39,12 @@
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.BytesWritable;

@Description(
name = "ArrayOfDoublesSketchToValues",
value = "_FUNC_(sketch) - Return the list of tuple values",
extended = "Returns associated values of a given ArrayOfDoublesSketch as rows."
+ " Each row will be N double values, where N is the number of double values kept in the"
+ " sketch per key.")
public class ArrayOfDoublesSketchToValuesUDTF extends GenericUDTF {

PrimitiveObjectInspector inputObjectInspector;
Expand Down

0 comments on commit 50ef98e

Please sign in to comment.