Skip to content
This repository has been archived by the owner on Sep 20, 2022. It is now read-only.

[HIVEMALL-145] Merge Brickhouse functions #135

Closed
wants to merge 56 commits into from
Closed
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
c19fd7c
Added array UDFs (array_append, element_at, array_union, first_elemen…
myui Mar 20, 2018
8f0aeec
Added array_flatten UDF
myui Mar 20, 2018
5d386e0
Added try_cast UDF
myui Mar 20, 2018
391472c
Added to_json/from_json UDFs
myui Mar 27, 2018
56ced64
Added MovingAverage class
myui Apr 3, 2018
c23cc60
Moved MovingAverage and OnlineVariance to hivemall.utils.stats
myui Apr 4, 2018
b127d0d
Added moving_avg UDTF
myui Apr 4, 2018
a713d9b
Added conditional_emit UDTF
myui Apr 5, 2018
d1b1f09
Added array_slice UDF
myui Apr 5, 2018
da9f0fa
Added vector_add and vector_dot UDFs
myui Apr 6, 2018
0c79958
Fixed possible serialization errors
myui Apr 10, 2018
67615f7
Added Jerome to the Committer list
myui Apr 10, 2018
53c279d
Fixed a bug in vector_dot UDF
myui Apr 10, 2018
5bebe90
Fixed an error message of tree_predict
myui Apr 12, 2018
727775a
Fix array_append UDF behavior
myui Apr 12, 2018
49bf31a
Added map_key_values UDF
myui Apr 18, 2018
98c44da
Fixed UDFType of moving_avg UDF
myui Apr 20, 2018
1466de3
Added sessionize UDF
myui Apr 20, 2018
7b40218
Refactored generate_series UDTF to be more flexible
myui Apr 20, 2018
6bc0948
Added a new line in EoF
myui Apr 24, 2018
65fdffb
Added merge_maps UDAF
myui Apr 24, 2018
443f49c
Applied formatter
myui Apr 26, 2018
9ce97d1
Fixed warning for duplicate entry
myui Apr 27, 2018
3efefc4
Applied spotless-maven-plugin formatter
myui Apr 27, 2018
acea0f7
Fixed SSL related test error
myui May 14, 2018
b47468d
Moved package of moving_avg
myui May 14, 2018
052e45d
Updated UDF description of to_json UDF
myui May 14, 2018
8436dbe
Included timeseries doc generation
myui May 14, 2018
2e293a1
Updated function usage doc
myui May 16, 2018
d11735b
Add a script to generate function desc
myui May 17, 2018
6a507d4
Fixed markdown generation scheme for UDF descriptions
myui May 21, 2018
533140b
Fixed UDF descriptions
myui May 21, 2018
00a7e49
Added map_include_keys and map_exclude_keys UDFs
myui May 24, 2018
b7fd6a9
Updated DDLs
myui May 24, 2018
2982865
Added array_to_str UDF
myui May 25, 2018
91c1013
Added map_index UDF
myui May 25, 2018
85cc27f
Updated DDLs for map_key_values
myui May 25, 2018
bf70541
Updated DDLs
myui May 29, 2018
aae2450
Added assert and raise_error UDFs
myui May 31, 2018
09b246d
Fixed unit tests for raise_error UDF
myui May 31, 2018
099673f
Added moving_avg UDTF
myui May 31, 2018
7661efc
Fixed a typo in assert UDF
myui May 31, 2018
7726c8b
Added vector_add, vector_dot UDFs
myui May 31, 2018
1263b9a
Renamed BloomUDAF to BloomFilterUDAF
myui Jun 1, 2018
9ebf51a
Added DDLs for bloom filter
myui Jun 1, 2018
60be082
Fixed NPE bug in bloom UDAF
myui Jun 1, 2018
68e9511
Added bloom_contains_any UDF
myui Jun 1, 2018
2d95334
Supported contains all in bloom_contains
myui Jun 1, 2018
78c2dbd
Updated DDLs for bloom filter UDFs
myui Jun 1, 2018
512f930
Fixed bugs in bloom filter UDFs
myui Jun 5, 2018
e21606c
Updated UDF documents
myui Jun 5, 2018
906009a
Fixed DDLs for array_to_str UDF
myui Jun 6, 2018
e6bd7a5
formatted TD DDLs
myui Jun 6, 2018
bf81830
Formatted DDLs
myui Jun 6, 2018
5d73101
Updated DDL usage in tutorial
myui Jun 6, 2018
849940c
Updated user guide for bloom filter
myui Jun 6, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ the following organizations and individuals:
- Copyright 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST)
- Copyright 2015-2016 Makoto Yui
- Copyright 2015-2016 Treasure Data, Inc.
- Copyright 2012 Klout, Inc.
- Copyright 2012 Klout, Inc.
47 changes: 47 additions & 0 deletions bin/update_func_md.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/sh
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

if [ "$HIVEMALL_HOME" = "" ]; then
if [ -e ../bin/${0##*/} ]; then
HIVEMALL_HOME=".."
elif [ -e ./bin/${0##*/} ]; then
HIVEMALL_HOME="."
else
echo "env HIVEMALL_HOME not defined"
exit 1
fi
fi

cd $HIVEMALL_HOME
HIVEMALL_HOME=`pwd`

# Deploy to local Maven repos

export MAVEN_OPTS=-XX:MaxPermSize=256m
mvn clean install -DskipTests=true -Dmaven.test.skip=true -pl '.,core,nlp,xgboost,tools/hivemall-docs'

# Generate docs

mvn org.apache.hivemall:hivemall-docs:generate-funcs-list -pl '.,core,nlp,xgboost,tools/hivemall-docs' -X

# Run HTTP server on localhost:040

cd ${HIVEMALL_HOME}/docs/gitbook
gitbook install && gitbook serve
2 changes: 1 addition & 1 deletion core/src/main/java/hivemall/HivemallVersionUDF.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import org.apache.hadoop.io.Text;

@Description(name = "hivemall_version", value = "_FUNC_() - Returns the version of Hivemall",
extended = "Usage: SELECT hivemall_version();")
extended = "SELECT hivemall_version();")
@UDFType(deterministic = true, stateful = false)
public final class HivemallVersionUDF extends UDF {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import static hivemall.smile.utils.SmileExtUtils.resolveFeatureName;
import static hivemall.smile.utils.SmileExtUtils.resolveName;

import hivemall.annotations.VisibleForTesting;
import hivemall.math.matrix.Matrix;
import hivemall.math.matrix.ints.ColumnMajorIntMatrix;
Expand All @@ -36,6 +37,8 @@
import hivemall.utils.lang.StringUtils;
import hivemall.utils.lang.mutable.MutableInt;
import hivemall.utils.sampling.IntReservoirSampler;
import smile.classification.Classifier;
import smile.math.Math;

import java.io.Externalizable;
import java.io.IOException;
Expand All @@ -53,9 +56,6 @@
import org.roaringbitmap.IntConsumer;
import org.roaringbitmap.RoaringBitmap;

import smile.classification.Classifier;
import smile.math.Math;

/**
* Decision tree for classification. A decision tree can be learned by splitting the training set
* into subsets based on an attribute value test. This process is repeated on each derived subset in
Expand Down
10 changes: 5 additions & 5 deletions core/src/main/java/hivemall/smile/regression/RegressionTree.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package hivemall.smile.regression;

import static hivemall.smile.utils.SmileExtUtils.resolveFeatureName;

import hivemall.annotations.VisibleForTesting;
import hivemall.math.matrix.Matrix;
import hivemall.math.matrix.ints.ColumnMajorIntMatrix;
Expand All @@ -36,6 +37,10 @@
import hivemall.utils.lang.StringUtils;
import hivemall.utils.lang.mutable.MutableInt;
import hivemall.utils.math.MathUtils;
import smile.math.Math;
import smile.regression.GradientTreeBoost;
import smile.regression.RandomForest;
import smile.regression.Regression;

import java.io.Externalizable;
import java.io.IOException;
Expand All @@ -51,11 +56,6 @@

import org.apache.hadoop.hive.ql.metadata.HiveException;

import smile.math.Math;
import smile.regression.GradientTreeBoost;
import smile.regression.RandomForest;
import smile.regression.Regression;

/**
* Decision tree for regression. A decision tree can be learned by splitting the training set into
* subsets based on an attribute value test. This process is repeated on each derived subset in a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ public Object evaluate(@Nonnull DeferredObject[] arguments) throws HiveException

Object arg2 = arguments[2].get();
if (arg2 == null) {
throw new HiveException("array<double> features was null");
throw new HiveException("features was null");
}
this.featuresProbe = parseFeatures(arg2, featuresProbe);

Expand Down
190 changes: 150 additions & 40 deletions core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,77 +21,187 @@
import hivemall.utils.hadoop.HiveUtils;

import java.util.ArrayList;
import java.util.List;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;

// @formatter:off
@Description(name = "generate_series",
value = "_FUNC_(const int|bigint start, const int|bigint end) - "
+ "Generate a series of values, from start to end. A similar function to PostgreSQL's `generate_serics`. http://www.postgresql.org/docs/current/static/functions-srf.html",
extended = "select generate_series(1,9);\n\n" + "1\n" + "2\n" + "3\n" + "4\n" + "5\n"
+ "6\n" + "7\n" + "8\n" + "9")
+ "Generate a series of values, from start to end. " +
"A similar function to PostgreSQL's [generate_serics](http://www.postgresql.org/docs/current/static/functions-srf.html)",
extended = "SELECT generate_series(2,4);\n" +
"\n" +
" 2\n" +
" 3\n" +
" 4\n" +
"\n" +
"SELECT generate_series(5,1,-2);\n" +
"\n" +
" 5\n" +
" 3\n" +
" 1\n" +
"\n" +
"SELECT generate_series(4,3);\n" +
"\n" +
" (no return)\n" +
"\n" +
"SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t;\n" +
"\n" +
" 2018-04-21 1\n" +
" 2018-04-22 2\n" +
" 2018-04-23 3\n" +
"\n" +
"WITH input as (\n" +
" SELECT 1 as c1, 10 as c2, 3 as step\n" +
" UNION ALL\n" +
" SELECT 10, 2, -3\n" +
")\n" +
"SELECT generate_series(c1, c2, step) as series\n" +
"FROM input;\n" +
"\n" +
" 1\n" +
" 4\n" +
" 7\n" +
" 10\n" +
" 10\n" +
" 7\n" +
" 4")
// @formatter:on
public final class GenerateSeriesUDTF extends GenericUDTF {

private long start, end;
private boolean useBigInt;
private PrimitiveObjectInspector startOI, endOI;
@Nullable
private PrimitiveObjectInspector stepOI;

@Nonnull
private final Writable[] row = new Writable[1];
private boolean returnLong;

@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
if (argOIs.length != 2) {
throw new UDFArgumentException("Expected number of arguments is 2: " + argOIs.length);
if (argOIs.length != 2 && argOIs.length != 3) {
throw new UDFArgumentException(
"Expected number of arguments is 2 or 3: " + argOIs.length);
}
if (!HiveUtils.isIntegerOI(argOIs[0])) {
throw new UDFArgumentException(
"Expected Integer type for the first argument: " + argOIs[0].getTypeName());
}
if (!HiveUtils.isIntegerOI(argOIs[1])) {
throw new UDFArgumentException(
"Expected Integer type for the second argument: " + argOIs[1].getTypeName());
}
this.startOI = HiveUtils.asIntegerOI(argOIs[0]);
this.endOI = HiveUtils.asIntegerOI(argOIs[1]);

ArrayList<String> fieldNames = new ArrayList<String>(1);
if (argOIs.length == 3) {
if (!HiveUtils.isIntegerOI(argOIs[2])) {
throw new UDFArgumentException(
"Expected Integer type for the third argument: " + argOIs[2].getTypeName());
}
this.stepOI = HiveUtils.asIntegerOI(argOIs[2]);
}

this.returnLong = HiveUtils.isBigIntOI(startOI) || HiveUtils.isBigIntOI(endOI);

List<String> fieldNames = new ArrayList<>(1);
fieldNames.add("value");
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(1);
List<ObjectInspector> fieldOIs = new ArrayList<>(1);
if (returnLong) {
fieldOIs.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
} else {
fieldOIs.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector);
}
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
}

this.useBigInt = HiveUtils.isBigIntOI(argOIs[1]);
if (useBigInt) {
fieldOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
@Override
public void process(Object[] args) throws HiveException {
if (returnLong) {
generateLongSeries(args);
} else {
fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
generateIntSeries(args);
}
}

this.start = HiveUtils.getAsConstLong(argOIs[0]);
this.end = HiveUtils.getAsConstLong(argOIs[1]);
if (start > end) {
throw new UDFArgumentException(
"start '" + start + "' must be less than or equals to end '" + end + "'");
private void generateLongSeries(@Nonnull final Object[] args) throws HiveException {
final long start, end;
long step = 1L;
switch (args.length) {
case 3:
step = PrimitiveObjectInspectorUtils.getLong(args[2], stepOI);
if (step == 0) {
throw new UDFArgumentException("Step MUST NOT be zero");
}
// fall through
case 2:
start = PrimitiveObjectInspectorUtils.getLong(args[0], startOI);
end = PrimitiveObjectInspectorUtils.getLong(args[1], endOI);
break;
default:
throw new UDFArgumentException("Expected number of arguments: " + args.length);
}

return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
final LongWritable row0 = new LongWritable();
row[0] = row0;
if (step > 0) {
for (long i = start; i <= end; i += step) {
row0.set(i);
forward(row);
}
} else {
for (long i = start; i >= end; i += step) {
row0.set(i);
forward(row);
}
}
}

@Override
public void process(Object[] argOIs) throws HiveException {
final Object[] forwardObjs = new Object[1];
if (useBigInt) {
if (start == end) {
forwardObjs[0] = start;
forward(forwardObjs);
} else {
for (long i = start; i <= end; i++) {
forwardObjs[0] = i;
forward(forwardObjs);
private void generateIntSeries(@Nonnull final Object[] args) throws HiveException {
final int start, end;
int step = 1;
switch (args.length) {
case 3:
step = PrimitiveObjectInspectorUtils.getInt(args[2], stepOI);
if (step == 0) {
throw new UDFArgumentException("Step MUST NOT be zero");
}
// fall through
case 2:
start = PrimitiveObjectInspectorUtils.getInt(args[0], startOI);
end = PrimitiveObjectInspectorUtils.getInt(args[1], endOI);
break;
default:
throw new UDFArgumentException("Expected number of arguments: " + args.length);
}

final IntWritable row0 = new IntWritable();
row[0] = row0;
if (step > 0) {
for (int i = start; i <= end; i += step) {
row0.set(i);
forward(row);
}
} else {
int starti = (int) start;
int endi = (int) end;
if (starti == endi) {
forwardObjs[0] = starti;
forward(forwardObjs);
} else {
for (int i = starti; i <= endi; i++) {
forwardObjs[0] = i;
forward(forwardObjs);
}
for (int i = start; i >= end; i += step) {
row0.set(i);
forward(row);
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions core/src/main/java/hivemall/tools/TryCastUDF.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
@Description(name = "try_cast",
value = "_FUNC_(ANY src, const string typeName)"
+ " - Explicitly cast a value as a type. Returns null if cast fails.",
extended = "Usage: select try_cast(array(1.0,2.0,3.0), 'array<string>')\n"
+ " select try_cast(map('A',10,'B',20,'C',30), 'map<string,double>')")
extended = "SELECT try_cast(array(1.0,2.0,3.0), 'array<string>')\n"
+ "SELECT try_cast(map('A',10,'B',20,'C',30), 'map<string,double>')")
@UDFType(deterministic = true, stateful = false)
public final class TryCastUDF extends GenericUDF {

Expand Down
Loading