From c19fd7c6b3318bb694cf0052c0b62431fb1983e0 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 20 Mar 2018 17:12:42 +0900 Subject: [PATCH 01/56] Added array UDFs (array_append, element_at, array_union, first_element, last_element) --- core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java | 4 ++-- .../src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java index 921bbfa1b..33aadf8b3 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java @@ -95,9 +95,9 @@ public List evaluate(DeferredObject[] args) throws HiveException { for (int j = 0, len = oi.getListLength(undeferred); j < len; ++j) { Object nonStd = oi.getListElement(undeferred, j); - Object copied = ObjectInspectorUtils.copyToStandardObject(nonStd, elemOI, + Object copyed = ObjectInspectorUtils.copyToStandardObject(nonStd, elemOI, ObjectInspectorCopyOption.WRITABLE); - objectSet.add(copied); + objectSet.add(copyed); } } diff --git a/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java index ac0f73598..cc1703983 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java @@ -74,4 +74,5 @@ public void testSerialization() throws HiveException, IOException { PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)}, new Object[] {Arrays.asList(0.d, 1.d), Arrays.asList(2.d, 3.d)}); } + } From 8f0aeeccb87d143b3f1fb2cced86baa5c8c2ebab Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 20 Mar 2018 17:53:21 +0900 Subject: [PATCH 02/56] Added array_flatten UDF --- .../test/java/hivemall/tools/array/ArrayFlattenUDFTest.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java index f69cdd82b..11754aa71 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java @@ -18,11 +18,12 @@ */ package hivemall.tools.array; +import hivemall.TestUtils; + import java.io.IOException; import java.util.Arrays; import java.util.List; -import hivemall.TestUtils; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; @@ -64,4 +65,5 @@ public void testSerialization() throws HiveException, IOException { new Object[] {Arrays.asList(Arrays.asList(0, 1, 2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7))}); } + } From 5d386e0fded5b5b73c1482bf2e21b18b3a49cc25 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 20 Mar 2018 20:22:34 +0900 Subject: [PATCH 03/56] Added try_cast UDF --- core/src/main/java/hivemall/utils/hadoop/HiveUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index f3fe7039e..12b0e974a 100644 --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@ -1211,7 +1211,7 @@ public static LazyLong lazyLong(@Nonnull final long v) { @Nonnull public static ObjectInspector getObjectInspector(@Nonnull final String typeString, - boolean preferWritable) { + final boolean preferWritable) { TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeString); if (preferWritable) { return TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(typeInfo); From 391472c039e7dddc7e0f5eece6de8acb7577cfe2 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 27 Mar 2018 15:49:25 +0900 Subject: [PATCH 
04/56] Added to_json/from_json UDFs --- core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java | 1 + core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java index 8bb8db768..738a9390c 100644 --- a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java @@ -90,4 +90,5 @@ public void testSerialization() throws HiveException, IOException { HiveUtils.getConstStringObjectInspector("array")}, new Object[] {"[0.1,1.1,2.2]"}); } + } diff --git a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java index 39bd64f79..f7f698cb2 100644 --- a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java @@ -21,6 +21,9 @@ import hivemall.TestUtils; import hivemall.utils.hadoop.WritableUtils; +import java.io.IOException; +import java.util.Arrays; + import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; @@ -31,9 +34,6 @@ import org.junit.Assert; import org.junit.Test; -import java.io.IOException; -import java.util.Arrays; - public class ToJsonUDFTest { @Test From 56ced6420dd332f6c54b7adfb8b78bba1c91ec94 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 3 Apr 2018 19:41:05 +0900 Subject: [PATCH 05/56] Added MovingAverage class --- .../hivemall/utils/math/MovingAverage.java | 74 +++++++++++++++++++ .../utils/math/MovingAverageTest.java | 51 +++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 core/src/main/java/hivemall/utils/math/MovingAverage.java create mode 100644 core/src/test/java/hivemall/utils/math/MovingAverageTest.java diff --git 
a/core/src/main/java/hivemall/utils/math/MovingAverage.java b/core/src/main/java/hivemall/utils/math/MovingAverage.java new file mode 100644 index 000000000..f9a511d24 --- /dev/null +++ b/core/src/main/java/hivemall/utils/math/MovingAverage.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.utils.math; + +import hivemall.utils.collections.DoubleRingBuffer; +import hivemall.utils.lang.NumberUtils; +import hivemall.utils.lang.Preconditions; + +import javax.annotation.Nonnegative; +import javax.annotation.Nonnull; + +public final class MovingAverage { + + @Nonnull + private final DoubleRingBuffer ring; + + private double totalSum; + + public MovingAverage(@Nonnegative int windowSize) { + Preconditions.checkArgument(windowSize > 1, "Invalid window size: " + windowSize); + this.ring = new DoubleRingBuffer(windowSize); + this.totalSum = 0.d; + } + + public double add(final double x) { + if (!NumberUtils.isFinite(x)) { + throw new IllegalArgumentException("Detected Infinite input: " + x); + } + + if (ring.isFull()) { + double head = ring.head(); + this.totalSum -= head; + } + ring.add(x); + totalSum += x; + + final int size = ring.size(); + if (size == 0) { + return 0.d; + } + return totalSum / size; + } + + public double get() { + final int size = ring.size(); + if (size == 0) { + return 0.d; + } + return totalSum / size; + } + + @Override + public String toString() { + return "MovingAverage [ring=" + ring + ", total=" + totalSum + ", moving_avg=" + get() + + "]"; + } + +} diff --git a/core/src/test/java/hivemall/utils/math/MovingAverageTest.java b/core/src/test/java/hivemall/utils/math/MovingAverageTest.java new file mode 100644 index 000000000..575caa859 --- /dev/null +++ b/core/src/test/java/hivemall/utils/math/MovingAverageTest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.utils.math; + +import org.junit.Assert; +import org.junit.Test; + +public class MovingAverageTest { + + @Test + public void testAdd() { + MovingAverage movingAvg = new MovingAverage(3); + Assert.assertEquals(0.d, movingAvg.get(), 0.d); + Assert.assertEquals(1.d, movingAvg.add(1.d), 0.d); + Assert.assertEquals(1.5d, movingAvg.add(2.d), 0.d); // (1+2)/2 = 1.5 + Assert.assertEquals(2.d, movingAvg.add(3.d), 0.d); // (1+2+3)/3 = 2 + Assert.assertEquals(3.d, movingAvg.add(4.d), 0.d); // (2+3+4)/3 = 3 + Assert.assertEquals(4.d, movingAvg.add(5.d), 0.d); // (3+4+5)/3 = 4 + Assert.assertEquals(5.d, movingAvg.add(6.d), 0.d); // (4+5+6)/3 = 5 + Assert.assertEquals(6.d, movingAvg.add(7.d), 0.d); // (5+6+7)/3 = 6 + } + + @Test(expected = IllegalArgumentException.class) + public void testNaN() { + MovingAverage movingAvg = new MovingAverage(3); + movingAvg.add(Double.NaN); + } + + @Test(expected = IllegalArgumentException.class) + public void testInfinity() { + MovingAverage movingAvg = new MovingAverage(3); + movingAvg.add(Double.POSITIVE_INFINITY); + } + +} From c23cc601dea75e1037d8ee3b4318e6e3cdf52433 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 4 Apr 2018 14:27:07 +0900 Subject: [PATCH 06/56] Moved MovingAverage and OnlineVariance to hivemall.utils.stats --- .../hivemall/utils/math/MovingAverage.java | 74 ------------------- .../utils/math/MovingAverageTest.java | 51 ------------- 2 files changed, 125 deletions(-) delete mode 100644 core/src/main/java/hivemall/utils/math/MovingAverage.java delete mode 100644 
core/src/test/java/hivemall/utils/math/MovingAverageTest.java diff --git a/core/src/main/java/hivemall/utils/math/MovingAverage.java b/core/src/main/java/hivemall/utils/math/MovingAverage.java deleted file mode 100644 index f9a511d24..000000000 --- a/core/src/main/java/hivemall/utils/math/MovingAverage.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package hivemall.utils.math; - -import hivemall.utils.collections.DoubleRingBuffer; -import hivemall.utils.lang.NumberUtils; -import hivemall.utils.lang.Preconditions; - -import javax.annotation.Nonnegative; -import javax.annotation.Nonnull; - -public final class MovingAverage { - - @Nonnull - private final DoubleRingBuffer ring; - - private double totalSum; - - public MovingAverage(@Nonnegative int windowSize) { - Preconditions.checkArgument(windowSize > 1, "Invalid window size: " + windowSize); - this.ring = new DoubleRingBuffer(windowSize); - this.totalSum = 0.d; - } - - public double add(final double x) { - if (!NumberUtils.isFinite(x)) { - throw new IllegalArgumentException("Detected Infinite input: " + x); - } - - if (ring.isFull()) { - double head = ring.head(); - this.totalSum -= head; - } - ring.add(x); - totalSum += x; - - final int size = ring.size(); - if (size == 0) { - return 0.d; - } - return totalSum / size; - } - - public double get() { - final int size = ring.size(); - if (size == 0) { - return 0.d; - } - return totalSum / size; - } - - @Override - public String toString() { - return "MovingAverage [ring=" + ring + ", total=" + totalSum + ", moving_avg=" + get() - + "]"; - } - -} diff --git a/core/src/test/java/hivemall/utils/math/MovingAverageTest.java b/core/src/test/java/hivemall/utils/math/MovingAverageTest.java deleted file mode 100644 index 575caa859..000000000 --- a/core/src/test/java/hivemall/utils/math/MovingAverageTest.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.utils.math; - -import org.junit.Assert; -import org.junit.Test; - -public class MovingAverageTest { - - @Test - public void testAdd() { - MovingAverage movingAvg = new MovingAverage(3); - Assert.assertEquals(0.d, movingAvg.get(), 0.d); - Assert.assertEquals(1.d, movingAvg.add(1.d), 0.d); - Assert.assertEquals(1.5d, movingAvg.add(2.d), 0.d); // (1+2)/2 = 1.5 - Assert.assertEquals(2.d, movingAvg.add(3.d), 0.d); // (1+2+3)/3 = 2 - Assert.assertEquals(3.d, movingAvg.add(4.d), 0.d); // (2+3+4)/3 = 3 - Assert.assertEquals(4.d, movingAvg.add(5.d), 0.d); // (3+4+5)/3 = 4 - Assert.assertEquals(5.d, movingAvg.add(6.d), 0.d); // (4+5+6)/3 = 5 - Assert.assertEquals(6.d, movingAvg.add(7.d), 0.d); // (5+6+7)/3 = 6 - } - - @Test(expected = IllegalArgumentException.class) - public void testNaN() { - MovingAverage movingAvg = new MovingAverage(3); - movingAvg.add(Double.NaN); - } - - @Test(expected = IllegalArgumentException.class) - public void testInfinity() { - MovingAverage movingAvg = new MovingAverage(3); - movingAvg.add(Double.POSITIVE_INFINITY); - } - -} From b127d0dcefcf4fea9223ad0dddc278e7e52ded18 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 4 Apr 2018 17:47:18 +0900 Subject: [PATCH 07/56] Added moving_avg UDTF --- .../test/java/hivemall/statistics/MovingAverageUDTFTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/test/java/hivemall/statistics/MovingAverageUDTFTest.java b/core/src/test/java/hivemall/statistics/MovingAverageUDTFTest.java index e755e26af..bda2baa8a 100644 --- 
a/core/src/test/java/hivemall/statistics/MovingAverageUDTFTest.java +++ b/core/src/test/java/hivemall/statistics/MovingAverageUDTFTest.java @@ -18,11 +18,12 @@ */ package hivemall.statistics; +import hivemall.TestUtils; + import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import hivemall.TestUtils; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.Collector; import org.apache.hadoop.hive.serde2.io.DoubleWritable; From a713d9b9b79f6892f5ee22f9c0cc1ee97a7528cf Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 5 Apr 2018 18:34:04 +0900 Subject: [PATCH 08/56] Added conditional_emit UDTF --- .../java/hivemall/tools/array/ConditionalEmitUDTFTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java b/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java index ef459830d..7045235b3 100644 --- a/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java +++ b/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java @@ -18,12 +18,12 @@ */ package hivemall.tools.array; +import hivemall.TestUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import hivemall.TestUtils; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.Collector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -80,4 +80,5 @@ public void testSerialization() throws HiveException { {Arrays.asList(true, false, true), Arrays.asList("one", "two", "three")}, {Arrays.asList(true, true, false), Arrays.asList("one", "two", "three")}}); } + } From d1b1f09e98c9465e11745caa85b73eab0812a181 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 5 Apr 2018 20:03:48 +0900 Subject: [PATCH 09/56] Added array_slice UDF --- .../src/test/java/hivemall/tools/array/ArraySliceUDFTest.java | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java b/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java index a260936d0..fbc212aad 100644 --- a/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java @@ -18,12 +18,13 @@ */ package hivemall.tools.array; +import hivemall.TestUtils; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.List; -import hivemall.TestUtils; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; @@ -129,4 +130,5 @@ public void testSerialization() throws HiveException, IOException { new Object[] {Arrays.asList("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), 2, 5}); } + } From da9f0fac3891edaa6842a047b614469570866d36 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 6 Apr 2018 20:06:41 +0900 Subject: [PATCH 10/56] Aded vector_add and vector_dot UDFs --- core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java | 1 + core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java | 1 + 2 files changed, 2 insertions(+) diff --git a/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java b/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java index 0aa90e72a..fd70fcbb5 100644 --- a/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java +++ b/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java @@ -94,4 +94,5 @@ public void testSerialization() throws HiveException, IOException { PrimitiveObjectInspectorFactory.javaFloatObjectInspector)}, new Object[] {Arrays.asList(1.d, 2.d, 3.d), Arrays.asList(2.f, 3.f, 4.f)}); } + } diff --git a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java 
b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java index b13c447f0..650c005eb 100644 --- a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java +++ b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java @@ -92,4 +92,5 @@ public void testSerialization() throws HiveException, IOException { PrimitiveObjectInspectorFactory.javaFloatObjectInspector)}, new Object[] {Arrays.asList(1.d, 2.d, 3.d), Arrays.asList(2.f, 3.f, 4.f)}); } + } From 0c79958ce040332e1769d3b3ef3b860d238210fa Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 10 Apr 2018 14:11:55 +0900 Subject: [PATCH 11/56] Fixed possible serialization errors --- .../src/main/java/hivemall/statistics/MovingAverageUDTF.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java index 112c47fbd..ac59d1673 100644 --- a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java +++ b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Writable; @Description(name = "moving_avg", value = "_FUNC_(NUMBER value, const int windowSize)" + " - Returns moving average of a time series using a given window") @@ -43,7 +44,7 @@ public final class MovingAverageUDTF extends GenericUDTF { private MovingAverage movingAvg; - private Object[] forwardObjs; + private Writable[] forwardObjs; private DoubleWritable result; @Override @@ -59,7 +60,7 @@ public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgu this.movingAvg = new MovingAverage(windowSize); this.result = new DoubleWritable(); - this.forwardObjs = new Object[] {result}; + 
this.forwardObjs = new Writable[] {result}; List fieldNames = Arrays.asList("avg"); List fieldOIs = Arrays.asList( From 67615f7bc04d3512d12657bcf77036439cb1109a Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 10 Apr 2018 14:33:13 +0900 Subject: [PATCH 12/56] Added Jerome to the Committer list --- pom.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pom.xml b/pom.xml index 5a78bd1ba..aa682c66b 100644 --- a/pom.xml +++ b/pom.xml @@ -196,6 +196,18 @@ +9 + + jbanks + Jerome Banks + jbanks[at]apache.org + https://github.com/jeromebanks/ + Jumpshot Inc. + https://www.jumpshot.com/ + + Committer + + -8 + rvs From 53c279d0c9e5cb38f85ee8c9b48ac19f0b08abd8 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 10 Apr 2018 18:23:32 +0900 Subject: [PATCH 13/56] Fixed a bug in vector_dot UDF --- .../hivemall/tools/vector/VectorDotUDF.java | 18 +++++++++--------- .../tools/vector/VectorDotUDFTest.java | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java b/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java index 2aa3c0370..9c6cea094 100644 --- a/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java +++ b/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java @@ -65,19 +65,19 @@ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentEx ObjectInspector argOI1 = argOIs[1]; if (HiveUtils.isNumberListOI(argOI1)) { this.evaluator = new Dot2DVectors(xListOI, HiveUtils.asListOI(argOI1)); + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); } else if (HiveUtils.isNumberOI(argOI1)) { this.evaluator = new Multiply2D1D(xListOI, argOI1); + return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; } else { throw new UDFArgumentException( "Expected array or number for the send argument: " + argOI1.getTypeName()); } - - return 
ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); } @Override - public List evaluate(DeferredObject[] args) throws HiveException { + public Object evaluate(DeferredObject[] args) throws HiveException { final Object arg0 = args[0].get(); final Object arg1 = args[1].get(); if (arg0 == null || arg1 == null) { @@ -90,7 +90,7 @@ public List evaluate(DeferredObject[] args) throws HiveException { interface Evaluator extends Serializable { @Nonnull - List dot(@Nonnull Object x, @Nonnull Object y) throws HiveException; + Object dot(@Nonnull Object x, @Nonnull Object y) throws HiveException; } @@ -144,7 +144,7 @@ static final class Dot2DVectors implements Evaluator { } @Override - public List dot(@Nonnull Object x, @Nonnull Object y) throws HiveException { + public Double dot(@Nonnull Object x, @Nonnull Object y) throws HiveException { final int xLen = xListOI.getListLength(x); final int yLen = yListOI.getListLength(y); if (xLen != yLen) { @@ -152,7 +152,7 @@ public List dot(@Nonnull Object x, @Nonnull Object y) throws HiveExcepti + ", y=" + yListOI.getList(y)); } - final Double[] arr = new Double[xLen]; + double result = 0.d; for (int i = 0; i < xLen; i++) { Object xi = xListOI.getListElement(x, i); Object yi = yListOI.getListElement(y, i); @@ -162,10 +162,10 @@ public List dot(@Nonnull Object x, @Nonnull Object y) throws HiveExcepti double xd = PrimitiveObjectInspectorUtils.getDouble(xi, xElemOI); double yd = PrimitiveObjectInspectorUtils.getDouble(yi, yElemOI); double v = xd * yd; - arr[i] = Double.valueOf(v); + result += v; } - return Arrays.asList(arr); + return Double.valueOf(result); } } diff --git a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java index 650c005eb..eb5c08f63 100644 --- a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java +++ b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java @@ 
-52,8 +52,8 @@ public void testDotp() throws HiveException, IOException { new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new float[] {2, 3, 4}))}; - List actual = udf.evaluate(args); - List expected = Arrays.asList(2.d, 6.d, 12.d); + Object actual = udf.evaluate(args); + Double expected = Double.valueOf(1.d * 2.d + 2.d * 3.d + 3.d * 4.d); Assert.assertEquals(expected, actual); @@ -74,7 +74,7 @@ public void testDotpScalar() throws HiveException, IOException { WritableUtils.toWritableList(new double[] {1, 2, 3})), new GenericUDF.DeferredJavaObject(WritableUtils.val(2.f))}; - List actual = udf.evaluate(args); + Object actual = udf.evaluate(args); List expected = Arrays.asList(2.d, 4.d, 6.d); Assert.assertEquals(expected, actual); From 5bebe9009e67ab1c6ec253cbf3fe6ed625366acc Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 12 Apr 2018 11:38:57 +0900 Subject: [PATCH 14/56] Fixed an error message of tree_predict --- core/src/main/java/hivemall/smile/tools/TreePredictUDF.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java b/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java index 9b775bf4d..511944c30 100644 --- a/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java +++ b/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java @@ -158,7 +158,7 @@ public Object evaluate(@Nonnull DeferredObject[] arguments) throws HiveException Object arg2 = arguments[2].get(); if (arg2 == null) { - throw new HiveException("array features was null"); + throw new HiveException("features was null"); } this.featuresProbe = parseFeatures(arg2, featuresProbe); From 727775af1cc8e986895665851a836eae75a3610c Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 12 Apr 2018 15:21:22 +0900 Subject: [PATCH 15/56] Fix array_append UDF behavior --- .../src/main/java/hivemall/tools/array/ArrayAppendUDF.java | 7 +++++++ .../test/java/hivemall/tools/array/ArrayAppendUDFTest.java | 7 
+++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java index 25d0f4cb5..c344c01a4 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java @@ -21,6 +21,7 @@ import hivemall.utils.hadoop.HiveUtils; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import javax.annotation.Nonnull; @@ -71,6 +72,12 @@ public List evaluate(@Nonnull DeferredObject[] args) throws HiveExceptio Object arg0 = args[0].get(); if (arg0 == null) { + Object arg1 = args[1].get(); + if (arg1 != null) { + Object toAppend = returnWritables ? primInspector.getPrimitiveWritableObject(arg1) + : primInspector.getPrimitiveJavaObject(arg1); + return Arrays.asList(toAppend); + } return null; } diff --git a/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java index b376abed3..119dd264d 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java @@ -85,9 +85,8 @@ public void testEvaluateAvoidNullAppend() throws HiveException, IOException { udf.close(); } - @Test - public void testEvaluateReturnNull() throws HiveException, IOException { + public void testEvaluateNullList() throws HiveException, IOException { ArrayAppendUDF udf = new ArrayAppendUDF(); udf.initialize(new ObjectInspector[] { @@ -96,11 +95,11 @@ public void testEvaluateReturnNull() throws HiveException, IOException { PrimitiveObjectInspectorFactory.javaDoubleObjectInspector}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject(null), - new GenericUDF.DeferredJavaObject(new Double(3))}; + new GenericUDF.DeferredJavaObject(new Double(3d))}; List result = udf.evaluate(args); - Assert.assertNull(result); + 
Assert.assertEquals(Arrays.asList(new DoubleWritable(3d)), result); udf.close(); } From 49bf31aa44adb80643b6e49f0d8d3f5dcdaaf86a Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 18 Apr 2018 19:15:07 +0900 Subject: [PATCH 16/56] Added map_key_values UDF --- .../hivemall/tools/map/MapKeyValuesUDF.java | 95 +++++++++++++++++++ .../tools/map/MapKeyValuesUDFTest.java | 81 ++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java create mode 100644 core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java diff --git a/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java new file mode 100644 index 000000000..64065e9ed --- /dev/null +++ b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.map; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import javax.annotation.Nullable; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; + +@Description(name = "map_key_values", + value = "_FUNC_(map) - " + "Returns a array of key-value pairs.") +@UDFType(deterministic = true, stateful = false) +public final class MapKeyValuesUDF extends GenericUDF { + + private final ArrayList retArray = new ArrayList(); + + private MapObjectInspector mapOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException( + "The function MAP_KEYS only accepts one argument."); + } else if (!(arguments[0] instanceof MapObjectInspector)) { + throw new UDFArgumentTypeException(0, + "\"" + Category.MAP.toString().toLowerCase() + + "\" is expected at function MAP_KEYS, " + "but \"" + + arguments[0].getTypeName() + "\" is found"); + } + + this.mapOI = (MapObjectInspector) arguments[0]; + + List structFieldNames = new ArrayList(); + List structFieldObjectInspectors = new ArrayList(); + structFieldNames.add("key"); + structFieldObjectInspectors.add(mapOI.getMapKeyObjectInspector()); + structFieldNames.add("value"); + 
structFieldObjectInspectors.add(mapOI.getMapValueObjectInspector()); + + return ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, + structFieldObjectInspectors)); + } + + @Override + @Nullable + public List evaluate(DeferredObject[] arguments) throws HiveException { + Object mapObj = arguments[0].get(); + if (mapObj == null) { + return null; + } + retArray.clear(); + final Map map = mapOI.getMap(mapObj); + for (Map.Entry e : map.entrySet()) { + retArray.add(new Object[] {e.getKey(), e.getValue()}); + } + return retArray; + } + + @Override + public String getDisplayString(String[] children) { + return "map_key_values(" + StringUtils.join(children, ',') + ')'; + } + +} diff --git a/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java new file mode 100644 index 000000000..2164dc1ad --- /dev/null +++ b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.map; + +import hivemall.TestUtils; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; +import org.junit.Test; + +public class MapKeyValuesUDFTest { + + + @Test + public void testStringDouble() throws HiveException, IOException { + MapKeyValuesUDF udf = new MapKeyValuesUDF(); + + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + + Map input = new HashMap<>(); + for (int i = 0; i < 10; i++) { + input.put("k" + i, new DoubleWritable(i)); + } + + GenericUDF.DeferredObject[] arguments = + new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(input)}; + + List actual = udf.evaluate(arguments); + + Assert.assertEquals(input.size(), actual.size()); + for (Object[] e : actual) { + Assert.assertEquals(2, e.length); + Object v = input.get(e[0]); + Assert.assertEquals(e[1], v); + } + + udf.close(); + } + + @Test + public void testSerialization() throws UDFArgumentException { + MapKeyValuesUDF udf = new MapKeyValuesUDF(); + + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + + byte[] serialized = TestUtils.serializeObjectByKryo(udf); + 
TestUtils.deserializeObjectByKryo(serialized, MapKeyValuesUDF.class); + } + +} From 98c44dad69c5aa857fe05ade49c2754b24650bb7 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 20 Apr 2018 19:03:24 +0900 Subject: [PATCH 17/56] Fixed UDFType of moving_avg UDF --- core/src/main/java/hivemall/statistics/MovingAverageUDTF.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java index ac59d1673..105c86bf5 100644 --- a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java +++ b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -38,6 +39,7 @@ @Description(name = "moving_avg", value = "_FUNC_(NUMBER value, const int windowSize)" + " - Returns moving average of a time series using a given window") +@UDFType(deterministic = false, stateful = true) public final class MovingAverageUDTF extends GenericUDTF { private PrimitiveObjectInspector valueOI; From 1466de3c1464f5a8932fe74147ab4d887c49fe2d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 20 Apr 2018 19:12:06 +0900 Subject: [PATCH 18/56] Added sessionize UDF --- .../tools/datetime/SessionizeUDF.java | 91 +++++++++++++++++++ .../tools/datetime/SessionizeUDFTest.java | 76 ++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java create mode 100644 core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java diff --git a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java 
b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java new file mode 100644 index 000000000..4ceec7768 --- /dev/null +++ b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.datetime; + +import java.util.UUID; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +@Description(name = "sessionize", + value = "_FUNC_(long timeInSec, long thresholdInSec [, String subject])" + + "- Returns a UUID string of a session.", + extended = "SELECT sessionize(time, 3600, ip_addr) as session_id, time, ip_addr FROM (\n" + + "SELECT time, ipaddr FROM weblog DISTRIBUTE BY ip_addr, time SORT BY ip_addr, time DESC\n) t1") +@UDFType(deterministic = false, stateful = true) +public final class SessionizeUDF extends UDF { + + private long lastTime; + @Nullable + private Text lastSubject; + @Nonnull + private final Text sessionId = new Text(); + + @Nullable + public Text evaluate(@Nullable LongWritable time, @Nullable LongWritable threshold) { + if (time == null || threshold == null) { + return null; + } + + final long thisTime = time.get(); + final long diff = thisTime - lastTime; + if (diff < threshold.get()) { + this.lastTime = thisTime; + return sessionId; + } + + sessionId.set(UUID.randomUUID().toString()); + this.lastTime = time.get(); + return sessionId; + } + + @Nullable + public Text evaluate(@Nullable LongWritable time, @Nullable LongWritable threshold, + @Nullable Text subject) { + if (time == null || threshold == null || subject == null) { + return null; + } + + if (subject.equals(lastSubject)) { + final long thisTime = time.get(); + final long diff = thisTime - lastTime; + if (diff < threshold.get()) { + this.lastTime = thisTime; + return sessionId; + } + } else { + if (lastSubject == null) { + lastSubject = new Text(subject); + } else { + lastSubject.set(subject); + } + } + + sessionId.set(UUID.randomUUID().toString()); + this.lastTime = time.get(); + return sessionId; + } + +} 
diff --git a/core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java b/core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java new file mode 100644 index 000000000..2aca35123 --- /dev/null +++ b/core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.datetime; + +import static hivemall.utils.hadoop.WritableUtils.val; + +import hivemall.TestUtils; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +public class SessionizeUDFTest { + + @Test + public void testTwoArgs() { + SessionizeUDF udf = new SessionizeUDF(); + + Text session1 = new Text(udf.evaluate(val(30L), val(10L))); + Assert.assertNotNull(session1); + + Text session2 = new Text(udf.evaluate(val(35L), val(10L))); + Assert.assertEquals(session1, session2); + + Text session3 = new Text(udf.evaluate(val(40L), val(10L))); + Assert.assertEquals(session2, session3); + + Text session4 = new Text(udf.evaluate(val(50L), val(10L))); + Assert.assertNotEquals(session3, session4); + } + + @Test + public void testThreeArgs() { + SessionizeUDF udf = new SessionizeUDF(); + + Text session1 = new Text(udf.evaluate(val(30L), val(10L), val("subject1"))); + Assert.assertNotNull(session1); + + Text session2 = new Text(udf.evaluate(val(35L), val(10L), val("subject1"))); + Assert.assertEquals(session1, session2); + + Text session3 = new Text(udf.evaluate(val(40L), val(10L), val("subject2"))); + Assert.assertNotEquals(session2, session3); + + Text session4 = new Text(udf.evaluate(val(45L), val(10L), val("subject2"))); + Assert.assertEquals(session3, session4); + } + + @Test + public void testSerialization() throws HiveException { + SessionizeUDF udf = new SessionizeUDF(); + + udf.evaluate(val((long) (System.currentTimeMillis() / 1000.0d)), val(30L)); + udf.evaluate(val((long) (System.currentTimeMillis() / 1000.0d)), val(30L)); + + byte[] serialized = TestUtils.serializeObjectByKryo(udf); + TestUtils.deserializeObjectByKryo(serialized, SessionizeUDF.class); + } +} From 7b40218e932feeb5465d957cb52b2d60f275336c Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 20 Apr 2018 21:57:35 +0900 Subject: [PATCH 19/56] Refactored generate_series UDTF to be more 
flexible --- .../hivemall/tools/GenerateSeriesUDTF.java | 146 ++++++++--- .../tools/GenerateSeriesUDTFTest.java | 246 ++++++++++++++++++ 2 files changed, 355 insertions(+), 37 deletions(-) create mode 100644 core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java diff --git a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java index 2567ac782..fd87510ab 100644 --- a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java +++ b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java @@ -21,6 +21,10 @@ import hivemall.utils.hadoop.HiveUtils; import java.util.ArrayList; +import java.util.List; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -28,8 +32,13 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; @Description(name = "generate_series", value = "_FUNC_(const int|bigint start, const int|bigint end) - " @@ -38,60 +47,123 @@ + "6\n" + "7\n" + "8\n" + "9") public final class GenerateSeriesUDTF extends GenericUDTF { - private long start, end; - private boolean useBigInt; + private PrimitiveObjectInspector startOI, endOI; + @Nullable + private PrimitiveObjectInspector stepOI; + + @Nonnull + private final Writable[] row = new Writable[1]; + 
private boolean returnLong; @Override public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { - if (argOIs.length != 2) { - throw new UDFArgumentException("Expected number of arguments is 2: " + argOIs.length); + if (argOIs.length != 2 && argOIs.length != 3) { + throw new UDFArgumentException( + "Expected number of arguments is 2 or 3: " + argOIs.length); + } + if (!HiveUtils.isIntegerOI(argOIs[0])) { + throw new UDFArgumentException( + "Expected Integer type for the first argument: " + argOIs[0].getTypeName()); + } + if (!HiveUtils.isIntegerOI(argOIs[1])) { + throw new UDFArgumentException( + "Expected Integer type for the second argument: " + argOIs[1].getTypeName()); } + this.startOI = HiveUtils.asIntegerOI(argOIs[0]); + this.endOI = HiveUtils.asIntegerOI(argOIs[1]); - ArrayList fieldNames = new ArrayList(1); + if (argOIs.length == 3) { + if (!HiveUtils.isIntegerOI(argOIs[2])) { + throw new UDFArgumentException( + "Expected Integer type for the third argument: " + argOIs[2].getTypeName()); + } + this.stepOI = HiveUtils.asIntegerOI(argOIs[2]); + } + + this.returnLong = HiveUtils.isBigIntOI(startOI) || HiveUtils.isBigIntOI(endOI); + + List fieldNames = new ArrayList<>(1); fieldNames.add("value"); - ArrayList fieldOIs = new ArrayList(1); + List fieldOIs = new ArrayList<>(1); + if (returnLong) { + fieldOIs.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + } else { + fieldOIs.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector); + } + return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); + } - this.useBigInt = HiveUtils.isBigIntOI(argOIs[1]); - if (useBigInt) { - fieldOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector); + @Override + public void process(Object[] args) throws HiveException { + if (returnLong) { + generateLongSeries(args); } else { - fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + generateIntSeries(args); } + } - 
this.start = HiveUtils.getAsConstLong(argOIs[0]); - this.end = HiveUtils.getAsConstLong(argOIs[1]); - if (start > end) { - throw new UDFArgumentException( - "start '" + start + "' must be less than or equals to end '" + end + "'"); + private void generateLongSeries(@Nonnull final Object[] args) throws HiveException { + final long start, end; + long step = 1L; + switch (args.length) { + case 3: + step = PrimitiveObjectInspectorUtils.getLong(args[2], stepOI); + if (step == 0) { + throw new UDFArgumentException("Step MUST NOT be zero"); + } + // fall through + case 2: + start = PrimitiveObjectInspectorUtils.getLong(args[0], startOI); + end = PrimitiveObjectInspectorUtils.getLong(args[1], endOI); + break; + default: + throw new UDFArgumentException("Expected number of arguments: " + args.length); } - return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); + final LongWritable row0 = new LongWritable(); + row[0] = row0; + if (step > 0) { + for (long i = start; i <= end; i += step) { + row0.set(i); + forward(row); + } + } else { + for (long i = start; i >= end; i += step) { + row0.set(i); + forward(row); + } + } } - @Override - public void process(Object[] argOIs) throws HiveException { - final Object[] forwardObjs = new Object[1]; - if (useBigInt) { - if (start == end) { - forwardObjs[0] = start; - forward(forwardObjs); - } else { - for (long i = start; i <= end; i++) { - forwardObjs[0] = i; - forward(forwardObjs); + private void generateIntSeries(@Nonnull final Object[] args) throws HiveException { + final int start, end; + int step = 1; + switch (args.length) { + case 3: + step = PrimitiveObjectInspectorUtils.getInt(args[2], stepOI); + if (step == 0) { + throw new UDFArgumentException("Step MUST NOT be zero"); } + // fall through + case 2: + start = PrimitiveObjectInspectorUtils.getInt(args[0], startOI); + end = PrimitiveObjectInspectorUtils.getInt(args[1], endOI); + break; + default: + throw new UDFArgumentException("Expected number of 
arguments: " + args.length); + } + + final IntWritable row0 = new IntWritable(); + row[0] = row0; + if (step > 0) { + for (int i = start; i <= end; i += step) { + row0.set(i); + forward(row); } } else { - int starti = (int) start; - int endi = (int) end; - if (starti == endi) { - forwardObjs[0] = starti; - forward(forwardObjs); - } else { - for (int i = starti; i <= endi; i++) { - forwardObjs[0] = i; - forward(forwardObjs); - } + for (int i = start; i >= end; i += step) { + row0.set(i); + forward(row); } } } diff --git a/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java b/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java new file mode 100644 index 000000000..04f432cd8 --- /dev/null +++ b/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools; + +import hivemall.TestUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.Collector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.junit.Assert; +import org.junit.Test; + +public class GenerateSeriesUDTFTest { + + @Test + public void testTwoConstArgs() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.intTypeInfo, new IntWritable(1)), + PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.intTypeInfo, new IntWritable(3))}); + + final List actual = new ArrayList<>(); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object args) throws HiveException { + Object[] row = (Object[]) args; + IntWritable row0 = (IntWritable) row[0]; + actual.add(new IntWritable(row0.get())); + } + }); + + udtf.process(new Object[] {new IntWritable(1), new IntWritable(3)}); + + List expected = + Arrays.asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)); + Assert.assertEquals(expected, actual); + } + + @Test + public void testTwoIntArgs() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector}); + + final List actual = new ArrayList<>(); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object 
args) throws HiveException { + Object[] row = (Object[]) args; + IntWritable row0 = (IntWritable) row[0]; + actual.add(new IntWritable(row0.get())); + } + }); + + udtf.process(new Object[] {1, new IntWritable(3)}); + + List expected = + Arrays.asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)); + Assert.assertEquals(expected, actual); + } + + @Test + public void testTwoLongArgs() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableLongObjectInspector}); + + final List actual = new ArrayList<>(); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object args) throws HiveException { + Object[] row = (Object[]) args; + LongWritable row0 = (LongWritable) row[0]; + actual.add(new LongWritable(row0.get())); + } + }); + + udtf.process(new Object[] {1, new LongWritable(3)}); + + List expected = + Arrays.asList(new LongWritable(1), new LongWritable(2), new LongWritable(3)); + Assert.assertEquals(expected, actual); + } + + @Test + public void testThreeIntArgs() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + + final List actual = new ArrayList<>(); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object args) throws HiveException { + Object[] row = (Object[]) args; + IntWritable row0 = (IntWritable) row[0]; + actual.add(new IntWritable(row0.get())); + } + }); + + udtf.process(new Object[] {1, new IntWritable(7), 3L}); + + List expected = + Arrays.asList(new IntWritable(1), new IntWritable(4), new IntWritable(7)); + Assert.assertEquals(expected, actual); + } + + @Test + public void 
testThreeLongArgs() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaLongObjectInspector, + PrimitiveObjectInspectorFactory.writableLongObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + + final List actual = new ArrayList<>(); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object args) throws HiveException { + Object[] row = (Object[]) args; + LongWritable row0 = (LongWritable) row[0]; + actual.add(new LongWritable(row0.get())); + } + }); + + udtf.process(new Object[] {1L, new LongWritable(7), 3L}); + + List expected = + Arrays.asList(new LongWritable(1), new LongWritable(4), new LongWritable(7)); + Assert.assertEquals(expected, actual); + } + + @Test + public void testNegativeStepInt() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + + final List actual = new ArrayList<>(); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object args) throws HiveException { + Object[] row = (Object[]) args; + IntWritable row0 = (IntWritable) row[0]; + actual.add(new IntWritable(row0.get())); + } + }); + + udtf.process(new Object[] {5, new IntWritable(1), -2L}); + + List expected = + Arrays.asList(new IntWritable(5), new IntWritable(3), new IntWritable(1)); + Assert.assertEquals(expected, actual); + } + + @Test + public void testNegativeStepLong() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaLongObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector}); 
+ + final List actual = new ArrayList<>(); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object args) throws HiveException { + Object[] row = (Object[]) args; + LongWritable row0 = (LongWritable) row[0]; + actual.add(new LongWritable(row0.get())); + } + }); + + udtf.process(new Object[] {5L, new IntWritable(1), -2}); + + List expected = + Arrays.asList(new LongWritable(5), new LongWritable(3), new LongWritable(1)); + Assert.assertEquals(expected, actual); + } + + @Test + public void testSerialization() throws HiveException { + GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); + + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector}); + + udtf.setCollector(new Collector() { + @Override + public void collect(Object args) throws HiveException {} + }); + + udtf.process(new Object[] {1, new IntWritable(3)}); + + byte[] serialized = TestUtils.serializeObjectByKryo(udtf); + TestUtils.deserializeObjectByKryo(serialized, GenerateSeriesUDTF.class); + } + +} From 6bc09480531cc1cf9183c1c2f5e8ae50f94ab48b Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 24 Apr 2018 14:07:58 +0900 Subject: [PATCH 20/56] Added a new line in EoF --- NOTICE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index 385a19814..b4e5acebb 100644 --- a/NOTICE +++ b/NOTICE @@ -10,4 +10,4 @@ the following organizations and individuals: - Copyright 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) - Copyright 2015-2016 Makoto Yui - Copyright 2015-2016 Treasure Data, Inc. - - Copyright 2012 Klout, Inc. \ No newline at end of file + - Copyright 2012 Klout, Inc. 
From 65fdffb61dc5e83cd61b77b1b19833d69f6b307d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 24 Apr 2018 17:43:51 +0900 Subject: [PATCH 21/56] Added merge_maps UDAF --- .../hivemall/tools/map/MergeMapsUDAF.java | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java diff --git a/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java new file mode 100644 index 000000000..21250b133 --- /dev/null +++ b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.map; + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.lang.Preconditions; + +import java.util.HashMap; +import java.util.Map; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +@Description(name = "merge_maps", + value = "_FUNC_(x) - Returns a map which contains the union of an aggregation of maps." 
+ + " Note that an existing value of a key can be replaced with the other duplicate key entry.", + extended = "SELECT merge_maps(m) FROM ( " + + "SELECT map('A',10,'B',20,'C',30) UNION ALL SELECT map('A',10,'B',20,'C',30)) t") +public final class MergeMapsUDAF extends AbstractGenericUDAFResolver { + + @Override + public MergeMapsEvaluator getEvaluator(TypeInfo[] types) throws SemanticException { + if (types.length != 1) { + throw new UDFArgumentTypeException(types.length - 1, + "One argument is expected but got " + types.length); + } + TypeInfo paramType = types[0]; + if (paramType.getCategory() != Category.MAP) { + throw new UDFArgumentTypeException(0, "Only maps supported for now "); + } + return new MergeMapsEvaluator(); + } + + public static final class MergeMapsEvaluator extends GenericUDAFEvaluator { + + private transient MapObjectInspector inputMapOI, mergeMapOI; + private transient ObjectInspector inputKeyOI, inputValOI; + + @AggregationType(estimable = false) + static final class MapAggBuffer extends AbstractAggregationBuffer { + @Nonnull + final Map collectMap = new HashMap(); + } + + public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException { + Preconditions.checkArgument(parameters.length == 1); + super.init(mode, parameters); + + // initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {// from original data + this.inputMapOI = HiveUtils.asMapOI(parameters[0]); + this.inputKeyOI = inputMapOI.getMapKeyObjectInspector(); + this.inputValOI = inputMapOI.getMapValueObjectInspector(); + } else {// from partial aggregation + this.mergeMapOI = HiveUtils.asMapOI(parameters[0]); + this.inputKeyOI = mergeMapOI.getMapKeyObjectInspector(); + this.inputValOI = mergeMapOI.getMapValueObjectInspector(); + } + + return ObjectInspectorFactory.getStandardMapObjectInspector( + ObjectInspectorUtils.getStandardObjectInspector(inputKeyOI), + ObjectInspectorUtils.getStandardObjectInspector(inputValOI)); + } + + @Override + 
public MapAggBuffer getNewAggregationBuffer() throws HiveException { + MapAggBuffer buff = new MapAggBuffer(); + reset(buff); + return buff; + } + + @Override + public void reset(@SuppressWarnings("deprecation") AggregationBuffer buff) + throws HiveException { + MapAggBuffer aggrBuf = (MapAggBuffer) buff; + aggrBuf.collectMap.clear(); + } + + @Override + public void iterate(@SuppressWarnings("deprecation") AggregationBuffer agg, + Object[] parameters) throws HiveException { + Preconditions.checkArgument(parameters.length == 1); + + Object param0 = parameters[0]; + if (param0 == null) { + return; + } + + Map m = inputMapOI.getMap(param0); + MapAggBuffer myagg = (MapAggBuffer) agg; + putIntoSet(m, myagg.collectMap, inputMapOI); + } + + @Override + public void merge(@SuppressWarnings("deprecation") AggregationBuffer agg, Object partial) + throws HiveException { + if (partial == null) { + return; + } + + MapAggBuffer myagg = (MapAggBuffer) agg; + Map m = mergeMapOI.getMap(partial); + putIntoSet(m, myagg.collectMap, mergeMapOI); + } + + private static void putIntoSet(@Nonnull final Map m, + @Nonnull final Map dst, @Nonnull final MapObjectInspector mapOI) { + final ObjectInspector keyOI = mapOI.getMapKeyObjectInspector(); + final ObjectInspector valueOI = mapOI.getMapValueObjectInspector(); + + for (Map.Entry e : m.entrySet()) { + Object k = e.getKey(); + Object v = e.getValue(); + Object keyCopy = ObjectInspectorUtils.copyToStandardObject(k, keyOI); + Object valCopy = ObjectInspectorUtils.copyToStandardObject(v, valueOI); + dst.put(keyCopy, valCopy); + } + } + + @Override + @Nonnull + public Map terminatePartial( + @SuppressWarnings("deprecation") AggregationBuffer agg) throws HiveException { + MapAggBuffer myagg = (MapAggBuffer) agg; + return myagg.collectMap; + } + + @Override + public Object terminate(@SuppressWarnings("deprecation") AggregationBuffer agg) + throws HiveException { + MapAggBuffer myagg = (MapAggBuffer) agg; + return myagg.collectMap; + } + + } + +} 
From 443f49cbd69232cfaaca36066b10d603b0bab27d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Apr 2018 15:59:21 +0900 Subject: [PATCH 22/56] Applied formatter --- .../hivemall/sketch/bloom/BloomAndUDF.java | 3 +- .../sketch/bloom/BloomContainsUDF.java | 3 +- .../hivemall/sketch/bloom/BloomOrUDF.java | 3 +- .../statistics/MovingAverageUDTF.java | 3 +- .../main/java/hivemall/tools/TryCastUDF.java | 5 +- .../hivemall/tools/array/ArrayAppendUDF.java | 6 +- .../hivemall/tools/array/ArrayConcatUDF.java | 8 +- .../hivemall/tools/array/ArrayFlattenUDF.java | 11 ++- .../hivemall/tools/array/ArrayUnionUDF.java | 9 +-- .../tools/array/ConditionalEmitUDTF.java | 3 +- .../hivemall/tools/array/SelectKBestUDF.java | 13 ++-- .../tools/datetime/SessionizeUDF.java | 3 +- .../java/hivemall/tools/json/FromJsonUDF.java | 17 ++-- .../java/hivemall/tools/json/ToJsonUDF.java | 10 +-- .../hivemall/tools/map/MapKeyValuesUDF.java | 19 ++--- .../hivemall/tools/map/MergeMapsUDAF.java | 5 +- .../hivemall/tools/vector/VectorAddUDF.java | 10 +-- .../hivemall/utils/hadoop/JsonSerdeUtils.java | 64 +++++++-------- .../sketch/bloom/BloomAndUDFTest.java | 4 +- .../hivemall/sketch/bloom/BloomOrUDFTest.java | 4 +- .../tools/GenerateSeriesUDTFTest.java | 78 +++++++++---------- .../java/hivemall/tools/TryCastUDFTest.java | 3 +- .../tools/array/ArrayAppendUDFTest.java | 2 +- .../tools/array/ArrayElementAtUDFTest.java | 21 ++--- .../tools/array/ArrayFlattenUDFTest.java | 4 +- .../tools/array/ArraySliceUDFTest.java | 22 ++---- .../tools/array/ArrayUnionUDFTest.java | 20 ++--- .../tools/array/ConditionalEmitUDTFTest.java | 14 ++-- .../tools/array/FirstElementUDFTest.java | 10 +-- .../tools/array/LastElementUDFTest.java | 10 +-- .../hivemall/tools/json/FromJsonUDFTest.java | 8 +- .../hivemall/tools/json/ToJsonUDFTest.java | 4 +- .../tools/map/MapKeyValuesUDFTest.java | 4 +- .../tools/vector/VectorDotUDFTest.java | 18 ++--- .../utils/hadoop/JsonSerdeUtilsTest.java | 78 +++++++++---------- 35 files 
changed, 225 insertions(+), 274 deletions(-) diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java index 87769da4b..9b029d454 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java @@ -30,7 +30,8 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; -@Description(name = "bloom_and", +@Description( + name = "bloom_and", value = "_FUNC_(string bloom1, string bloom2) - Returns the logical AND of two bloom filters") @UDFType(deterministic = true, stateful = false) public final class BloomAndUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java index 2da65b33f..2aa751024 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java @@ -32,7 +32,8 @@ import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; -@Description(name = "bloom_contains", +@Description( + name = "bloom_contains", value = "_FUNC_(string bloom, string key) - Returns true if the bloom filter contains the given key") @UDFType(deterministic = true, stateful = false) public final class BloomContainsUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java index 7d2980e4d..7f60be456 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java @@ -30,7 +30,8 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; -@Description(name = "bloom_or", +@Description( + name = "bloom_or", value = "_FUNC_(string bloom1, string bloom2) - Returns the logical OR of two bloom filters") 
@UDFType(deterministic = true, stateful = false) public final class BloomOrUDF extends UDF { diff --git a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java index 105c86bf5..fe634ce81 100644 --- a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java +++ b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java @@ -65,8 +65,7 @@ public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgu this.forwardObjs = new Writable[] {result}; List fieldNames = Arrays.asList("avg"); - List fieldOIs = Arrays.asList( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + List fieldOIs = Arrays.asList(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); } diff --git a/core/src/main/java/hivemall/tools/TryCastUDF.java b/core/src/main/java/hivemall/tools/TryCastUDF.java index a0f3257d7..69ddc2faa 100644 --- a/core/src/main/java/hivemall/tools/TryCastUDF.java +++ b/core/src/main/java/hivemall/tools/TryCastUDF.java @@ -32,9 +32,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -@Description(name = "try_cast", - value = "_FUNC_(ANY src, const string typeName)" - + " - Explicitly cast a value as a type. Returns null if cast fails.", +@Description(name = "try_cast", value = "_FUNC_(ANY src, const string typeName)" + + " - Explicitly cast a value as a type. 
Returns null if cast fails.", extended = "Usage: select try_cast(array(1.0,2.0,3.0), 'array')\n" + " select try_cast(map('A',10,'B',20,'C',30), 'map')") @UDFType(deterministic = true, stateful = false) diff --git a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java index c344c01a4..8d3e26af3 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java @@ -53,16 +53,14 @@ public final class ArrayAppendUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { this.listInspector = HiveUtils.asListOI(argOIs[0]); - this.listElemInspector = - HiveUtils.asPrimitiveObjectInspector(listInspector.getListElementObjectInspector()); + this.listElemInspector = HiveUtils.asPrimitiveObjectInspector(listInspector.getListElementObjectInspector()); this.primInspector = HiveUtils.asPrimitiveObjectInspector(argOIs[1]); if (listElemInspector.getPrimitiveCategory() != primInspector.getPrimitiveCategory()) { throw new UDFArgumentException( "array_append expects the list type to match the type of the value being appended"); } this.returnWritables = listElemInspector.preferWritable(); - return ObjectInspectorFactory.getStandardListObjectInspector( - ObjectInspectorUtils.getStandardObjectInspector(listElemInspector)); + return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector(listElemInspector)); } @Nullable diff --git a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java index 62e3e3660..223d69a3d 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java @@ -65,10 +65,10 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen break; } default: - throw 
new UDFArgumentTypeException(0, - "Argument " + i + " of function CONCAT_ARRAY must be " + LIST_TYPE_NAME - + "<" + Category.PRIMITIVE + ">, but " + arguments[0].getTypeName() - + " was found."); + throw new UDFArgumentTypeException(0, "Argument " + i + + " of function CONCAT_ARRAY must be " + LIST_TYPE_NAME + "<" + + Category.PRIMITIVE + ">, but " + arguments[0].getTypeName() + + " was found."); } } diff --git a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java index 906d594d3..b35ad1e53 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java @@ -48,21 +48,20 @@ public final class ArrayFlattenUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { if (argOIs.length != 1) { - throw new UDFArgumentException( - "array_flatten expects exactly one argument: " + argOIs.length); + throw new UDFArgumentException("array_flatten expects exactly one argument: " + + argOIs.length); } this.listOI = HiveUtils.asListOI(argOIs[0]); ObjectInspector listElemOI = listOI.getListElementObjectInspector(); if (listElemOI.getCategory() != Category.LIST) { - throw new UDFArgumentException( - "array_flatten takes array of array for the argument: " + listOI.toString()); + throw new UDFArgumentException("array_flatten takes array of array for the argument: " + + listOI.toString()); } this.nextedListOI = HiveUtils.asListOI(listElemOI); this.elemOI = nextedListOI.getListElementObjectInspector(); - return ObjectInspectorFactory.getStandardListObjectInspector( - ObjectInspectorUtils.getStandardObjectInspector(elemOI)); + return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector(elemOI)); } @Override diff --git a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java 
b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java index 33aadf8b3..0b037779e 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java @@ -67,17 +67,16 @@ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentEx ListObjectInspector checkOI = HiveUtils.asListOI(argOIs[i]); if (!ObjectInspectorUtils.compareTypes(arg0ElemOI, checkOI.getListElementObjectInspector())) { - throw new UDFArgumentException("Array types does not match: " + arg0OI.getTypeName() - + " != " + checkOI.getTypeName()); + throw new UDFArgumentException("Array types does not match: " + + arg0OI.getTypeName() + " != " + checkOI.getTypeName()); } listOIs[i] = checkOI; } this._listOIs = listOIs; - return ObjectInspectorFactory.getStandardListObjectInspector( - ObjectInspectorUtils.getStandardObjectInspector(arg0ElemOI, - ObjectInspectorCopyOption.WRITABLE)); + return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector( + arg0ElemOI, ObjectInspectorCopyOption.WRITABLE)); } @Override diff --git a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java index a73a06f78..c7acde626 100644 --- a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java +++ b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java @@ -79,8 +79,7 @@ public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgu this.condElemOI = HiveUtils.asBooleanOI(conditionsOI.getListElementObjectInspector()); this.featuresOI = HiveUtils.asListOI(argOIs[1]); - this.featureElemOI = - HiveUtils.asPrimitiveObjectInspector(featuresOI.getListElementObjectInspector()); + this.featureElemOI = HiveUtils.asPrimitiveObjectInspector(featuresOI.getListElementObjectInspector()); List fieldNames = Arrays.asList("feature"); List fieldOIs = Arrays.asList(featureElemOI); diff --git 
a/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java b/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java index ff3721795..527060c14 100644 --- a/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java +++ b/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java @@ -82,8 +82,7 @@ public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentExcep this.featuresOI = HiveUtils.asListOI(OIs[0]); this.featureOI = HiveUtils.asDoubleCompatibleOI(featuresOI.getListElementObjectInspector()); this.importanceListOI = HiveUtils.asListOI(OIs[1]); - this.importanceElemOI = - HiveUtils.asDoubleCompatibleOI(importanceListOI.getListElementObjectInspector()); + this.importanceElemOI = HiveUtils.asDoubleCompatibleOI(importanceListOI.getListElementObjectInspector()); this._k = HiveUtils.getConstInt(OIs[2]); Preconditions.checkArgument(_k >= 1, UDFArgumentException.class); @@ -93,15 +92,14 @@ public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentExcep } this._result = result; - return ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); } @Override public List evaluate(DeferredObject[] dObj) throws HiveException { final double[] features = HiveUtils.asDoubleArray(dObj[0].get(), featuresOI, featureOI); - final double[] importanceList = - HiveUtils.asDoubleArray(dObj[1].get(), importanceListOI, importanceElemOI); + final double[] importanceList = HiveUtils.asDoubleArray(dObj[1].get(), importanceListOI, + importanceElemOI); Preconditions.checkNotNull(features, UDFArgumentException.class); Preconditions.checkNotNull(importanceList, UDFArgumentException.class); @@ -111,8 +109,7 @@ public List evaluate(DeferredObject[] dObj) throws HiveException int[] topKIndices = _topKIndices; if (topKIndices == null) { - final List> list = - new 
ArrayList>(); + final List> list = new ArrayList>(); for (int i = 0; i < importanceList.length; i++) { list.add(new AbstractMap.SimpleEntry(i, importanceList[i])); } diff --git a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java index 4ceec7768..8c90c811a 100644 --- a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java +++ b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java @@ -29,7 +29,8 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -@Description(name = "sessionize", +@Description( + name = "sessionize", value = "_FUNC_(long timeInSec, long thresholdInSec [, String subject])" + "- Returns a UUID string of a session.", extended = "SELECT sessionize(time, 3600, ip_addr) as session_id, time, ip_addr FROM (\n" diff --git a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java index 36c29cc8a..8ee2a2d36 100644 --- a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java @@ -43,7 +43,8 @@ import org.apache.hadoop.io.Text; import org.apache.hive.hcatalog.data.HCatRecordObjectInspectorFactory; -@Description(name = "from_json", +@Description( + name = "from_json", value = "_FUNC_(string jsonString, const string returnTypes [, const array|const string columnNames])" + " - Return Hive object.") @UDFType(deterministic = true, stateful = false) @@ -58,8 +59,8 @@ public final class FromJsonUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { if (argOIs.length != 2 && argOIs.length != 3) { - throw new UDFArgumentException( - "from_json takes two or three arguments: " + argOIs.length); + throw new UDFArgumentException("from_json takes two or three arguments: " + + argOIs.length); } this.jsonOI = HiveUtils.asStringOI(argOIs[0]); @@ -94,8 +95,7 @@ private static 
ObjectInspector getObjectInspector(@Nonnull final List final int numColumns = columnTypes.size(); if (numColumns == 1) { TypeInfo type = columnTypes.get(0); - returnOI = - HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo(type); + returnOI = HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo(type); } else { if (columnNames == null) { columnNames = new ArrayList<>(numColumns); @@ -111,9 +111,7 @@ private static ObjectInspector getObjectInspector(@Nonnull final List final ObjectInspector[] fieldOIs = new ObjectInspector[numColumns]; for (int i = 0; i < fieldOIs.length; i++) { TypeInfo type = columnTypes.get(i); - fieldOIs[i] = - HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo( - type); + fieldOIs[i] = HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo(type); } returnOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, Arrays.asList(fieldOIs)); @@ -134,8 +132,7 @@ public Object evaluate(DeferredObject[] args) throws HiveException { result = JsonSerdeUtils.deserialize(jsonString, columnNames, columnTypes); } catch (Throwable e) { throw new HiveException("Failed to deserialize Json: \n" + jsonString.toString() + '\n' - + ExceptionUtils.prettyPrintStackTrace(e), - e); + + ExceptionUtils.prettyPrintStackTrace(e), e); } return result; } diff --git a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java index 70c62b92d..416d0c948 100644 --- a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java @@ -37,7 +37,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.io.Text; -@Description(name = "to_json", +@Description( + name = "to_json", value = "_FUNC_(ANY object [, const array|const string columnNames]) - Returns Json string") @UDFType(deterministic = true, stateful = false) public 
final class ToJsonUDF extends GenericUDF { @@ -50,8 +51,7 @@ public final class ToJsonUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { if (argOIs.length != 1 && argOIs.length != 2) { - throw new UDFArgumentException( - "from_json takes one or two arguments: " + argOIs.length); + throw new UDFArgumentException("from_json takes one or two arguments: " + argOIs.length); } this.objOI = argOIs[0]; @@ -81,8 +81,8 @@ public Text evaluate(DeferredObject[] args) throws HiveException { try { return JsonSerdeUtils.serialize(obj, objOI, columnNames); } catch (Throwable e) { - throw new HiveException( - "Failed to serialize: " + obj + '\n' + ExceptionUtils.prettyPrintStackTrace(e), e); + throw new HiveException("Failed to serialize: " + obj + '\n' + + ExceptionUtils.prettyPrintStackTrace(e), e); } } diff --git a/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java index 64065e9ed..43f558246 100644 --- a/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java +++ b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java @@ -37,8 +37,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -@Description(name = "map_key_values", - value = "_FUNC_(map) - " + "Returns a array of key-value pairs.") +@Description(name = "map_key_values", value = "_FUNC_(map) - " + + "Returns a array of key-value pairs.") @UDFType(deterministic = true, stateful = false) public final class MapKeyValuesUDF extends GenericUDF { @@ -49,13 +49,11 @@ public final class MapKeyValuesUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { if (arguments.length != 1) { - throw new UDFArgumentLengthException( - "The function MAP_KEYS only accepts one argument."); + throw new 
UDFArgumentLengthException("The function MAP_KEYS only accepts one argument."); } else if (!(arguments[0] instanceof MapObjectInspector)) { - throw new UDFArgumentTypeException(0, - "\"" + Category.MAP.toString().toLowerCase() - + "\" is expected at function MAP_KEYS, " + "but \"" - + arguments[0].getTypeName() + "\" is found"); + throw new UDFArgumentTypeException(0, "\"" + Category.MAP.toString().toLowerCase() + + "\" is expected at function MAP_KEYS, " + "but \"" + + arguments[0].getTypeName() + "\" is found"); } this.mapOI = (MapObjectInspector) arguments[0]; @@ -67,9 +65,8 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen structFieldNames.add("value"); structFieldObjectInspectors.add(mapOI.getMapValueObjectInspector()); - return ObjectInspectorFactory.getStandardListObjectInspector( - ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, - structFieldObjectInspectors)); + return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorFactory.getStandardStructObjectInspector( + structFieldNames, structFieldObjectInspectors)); } @Override diff --git a/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java index 21250b133..c94764313 100644 --- a/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java +++ b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java @@ -39,9 +39,10 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -@Description(name = "merge_maps", +@Description( + name = "merge_maps", value = "_FUNC_(x) - Returns a map which contains the union of an aggregation of maps." 
- + " Note that an existing value of a key can be replaced with the other duplicate key entry.", + + " Note that an existing value of a key can be replaced with the other duplicate key entry.", extended = "SELECT merge_maps(m) FROM ( " + "SELECT map('A',10,'B',20,'C',30) UNION ALL SELECT map('A',10,'B',20,'C',30)) t") public final class MergeMapsUDAF extends AbstractGenericUDAFResolver { diff --git a/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java b/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java index 8442ae370..ecff2f4ed 100644 --- a/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java +++ b/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java @@ -63,12 +63,10 @@ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentEx if (HiveUtils.isIntegerOI(xElemOI) && HiveUtils.isIntegerOI(yElemOI)) { this.floatingPoints = false; - return ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaLongObjectInspector); + return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaLongObjectInspector); } else { this.floatingPoints = true; - return ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); } } @@ -84,8 +82,8 @@ public List evaluate(@Nonnull DeferredObject[] args) throws HiveException { final int xLen = xOI.getListLength(arg0); final int yLen = yOI.getListLength(arg1); if (xLen != yLen) { - throw new HiveException( - "vector lengths do not match. x=" + xOI.getList(arg0) + ", y=" + yOI.getList(arg1)); + throw new HiveException("vector lengths do not match. 
x=" + xOI.getList(arg0) + ", y=" + + yOI.getList(arg1)); } if (floatingPoints) { diff --git a/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java b/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java index 13155378f..34932bd39 100644 --- a/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java @@ -127,9 +127,9 @@ public static Text serialize(@Nullable final Object obj, @Nonnull final ObjectIn /** * Serialize Hive objects as Text. */ - private static void serializeStruct(@Nonnull final StringBuilder sb, @Nullable final Object obj, - @Nonnull final StructObjectInspector soi, @Nullable final List columnNames) - throws SerDeException { + private static void serializeStruct(@Nonnull final StringBuilder sb, + @Nullable final Object obj, @Nonnull final StructObjectInspector soi, + @Nullable final List columnNames) throws SerDeException { if (obj == null) { sb.append("null"); } else { @@ -273,8 +273,7 @@ private static void serializePrimitive(@Nonnull final StringBuilder sb, break; } case STRING: { - String s = SerDeUtils.escapeString( - ((StringObjectInspector) poi).getPrimitiveJavaObject(obj)); + String s = SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(obj)); appendWithQuotes(sb, s); break; } @@ -297,28 +296,30 @@ private static void serializePrimitive(@Nonnull final StringBuilder sb, sb.append(((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(obj)); break; case VARCHAR: { - String s = SerDeUtils.escapeString( - ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(obj).toString()); + String s = SerDeUtils.escapeString(((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject( + obj) + .toString()); appendWithQuotes(sb, s); break; } case CHAR: { //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13) // HiveChar.toString() returns getPaddedValue() - String s = SerDeUtils.escapeString( - ((HiveCharObjectInspector) 
poi).getPrimitiveJavaObject(obj).toString()); + String s = SerDeUtils.escapeString(((HiveCharObjectInspector) poi).getPrimitiveJavaObject( + obj) + .toString()); appendWithQuotes(sb, s); break; } default: - throw new SerDeException( - "Unknown primitive type: " + poi.getPrimitiveCategory()); + throw new SerDeException("Unknown primitive type: " + + poi.getPrimitiveCategory()); } } } - private static void buildJSONString(@Nonnull final StringBuilder sb, @Nullable final Object obj, - @Nonnull final ObjectInspector oi) throws SerDeException { + private static void buildJSONString(@Nonnull final StringBuilder sb, + @Nullable final Object obj, @Nonnull final ObjectInspector oi) throws SerDeException { switch (oi.getCategory()) { case PRIMITIVE: { PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; @@ -375,12 +376,13 @@ public static T deserialize(@Nonnull final Text t, @Nonnull TypeInfo columnT @SuppressWarnings("unchecked") @Nonnull - public static T deserialize(@Nonnull final Text t, @Nullable final List columnNames, - @Nullable final List columnTypes) throws SerDeException { + public static T deserialize(@Nonnull final Text t, + @Nullable final List columnNames, @Nullable final List columnTypes) + throws SerDeException { final Object result; try { - JsonParser p = - new JsonFactory().createJsonParser(new FastByteArrayInputStream(t.getBytes())); + JsonParser p = new JsonFactory().createJsonParser(new FastByteArrayInputStream( + t.getBytes())); final JsonToken token = p.nextToken(); if (token == JsonToken.START_OBJECT) { result = parseObject(p, columnNames, columnTypes); @@ -400,8 +402,8 @@ public static T deserialize(@Nonnull final Text t, @Nullable final List columnNames, - @CheckForNull final List columnTypes) - throws JsonParseException, IOException, SerDeException { + @CheckForNull final List columnTypes) throws JsonParseException, IOException, + SerDeException { Preconditions.checkNotNull(columnNames, "columnNames MUST NOT be null in parseObject", 
SerDeException.class); Preconditions.checkNotNull(columnTypes, "columnTypes MUST NOT be null in parseObject", @@ -435,8 +437,8 @@ private static Object parseObject(@Nonnull final JsonParser p, @Nonnull private static List parseArray(@Nonnull final JsonParser p, - @CheckForNull final List columnTypes) - throws HCatException, IOException, SerDeException { + @CheckForNull final List columnTypes) throws HCatException, IOException, + SerDeException { Preconditions.checkNotNull(columnTypes, "columnTypes MUST NOT be null", SerDeException.class); if (columnTypes.size() != 1) { @@ -457,8 +459,8 @@ private static List parseArray(@Nonnull final JsonParser p, } @Nonnull - private static Object parseValue(@Nonnull final JsonParser p) - throws JsonParseException, IOException { + private static Object parseValue(@Nonnull final JsonParser p) throws JsonParseException, + IOException { final JsonToken t = p.getCurrentToken(); switch (t) { case VALUE_FALSE: @@ -479,8 +481,8 @@ private static Object parseValue(@Nonnull final JsonParser p) } private static void populateRecord(@Nonnull final List r, - @Nonnull final JsonToken token, @Nonnull final JsonParser p, - @Nonnull final HCatSchema s) throws IOException { + @Nonnull final JsonToken token, @Nonnull final JsonParser p, @Nonnull final HCatSchema s) + throws IOException { if (token != JsonToken.FIELD_NAME) { throw new IOException("Field name expected"); } @@ -575,8 +577,8 @@ private static Object extractCurrentField(@Nonnull final JsonParser p, break; case VARCHAR: int vLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength(); - val = (valueToken == JsonToken.VALUE_NULL) ? null - : new HiveVarchar(p.getText(), vLen); + val = (valueToken == JsonToken.VALUE_NULL) ? 
null : new HiveVarchar(p.getText(), + vLen); break; case CHAR: int cLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength(); @@ -676,8 +678,8 @@ private static Object getObjectOfCorrespondingPrimitiveType(String s, case CHAR: return new HiveChar(s, ((BaseCharTypeInfo) mapKeyType).getLength()); default: - throw new IOException( - "Could not convert from string to map type " + mapKeyType.getTypeName()); + throw new IOException("Could not convert from string to map type " + + mapKeyType.getTypeName()); } } @@ -691,8 +693,8 @@ private static int getPositionFromHiveInternalColumnName(String internalName) { } } - private static void skipValue(@Nonnull final JsonParser p) - throws JsonParseException, IOException { + private static void skipValue(@Nonnull final JsonParser p) throws JsonParseException, + IOException { JsonToken valueToken = p.nextToken(); if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)) { // if the currently read token is a beginning of an array or object, move stream forward diff --git a/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java b/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java index 97ad7c63d..76f32e8f9 100644 --- a/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java +++ b/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java @@ -50,8 +50,8 @@ public void test() throws IOException, HiveException { Assert.assertEquals(expected, actual); - DynamicBloomFilter deserialized = - BloomFilterUtils.deserialize(actual, new DynamicBloomFilter()); + DynamicBloomFilter deserialized = BloomFilterUtils.deserialize(actual, + new DynamicBloomFilter()); assertNotContains(bf1, deserialized, 1L, 10000); assertNotContains(bf1, deserialized, 2L, 10000); } diff --git a/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java b/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java index 64f95e0d5..0179a3076 100644 --- 
a/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java +++ b/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java @@ -50,8 +50,8 @@ public void test() throws IOException, HiveException { Assert.assertEquals(expected, actual); - DynamicBloomFilter deserialized = - BloomFilterUtils.deserialize(actual, new DynamicBloomFilter()); + DynamicBloomFilter deserialized = BloomFilterUtils.deserialize(actual, + new DynamicBloomFilter()); assertEquals(bf1, deserialized, 1L, 10000); assertEquals(bf1, deserialized, 2L, 10000); } diff --git a/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java b/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java index 04f432cd8..31991065e 100644 --- a/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java +++ b/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java @@ -59,8 +59,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {new IntWritable(1), new IntWritable(3)}); - List expected = - Arrays.asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)); + List expected = Arrays.asList(new IntWritable(1), new IntWritable(2), + new IntWritable(3)); Assert.assertEquals(expected, actual); } @@ -68,9 +68,9 @@ public void collect(Object args) throws HiveException { public void testTwoIntArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize( - new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector}); + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector}); final List actual = new ArrayList<>(); @@ -85,8 +85,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {1, new IntWritable(3)}); - List expected = - Arrays.asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)); + List 
expected = Arrays.asList(new IntWritable(1), new IntWritable(2), + new IntWritable(3)); Assert.assertEquals(expected, actual); } @@ -94,9 +94,9 @@ public void collect(Object args) throws HiveException { public void testTwoLongArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize( - new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableLongObjectInspector}); + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableLongObjectInspector}); final List actual = new ArrayList<>(); @@ -111,8 +111,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {1, new LongWritable(3)}); - List expected = - Arrays.asList(new LongWritable(1), new LongWritable(2), new LongWritable(3)); + List expected = Arrays.asList(new LongWritable(1), new LongWritable(2), + new LongWritable(3)); Assert.assertEquals(expected, actual); } @@ -120,10 +120,10 @@ public void collect(Object args) throws HiveException { public void testThreeIntArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize( - new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector, - PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); final List actual = new ArrayList<>(); @@ -138,8 +138,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {1, new IntWritable(7), 3L}); - List expected = - Arrays.asList(new IntWritable(1), new IntWritable(4), new IntWritable(7)); + List expected = Arrays.asList(new IntWritable(1), new 
IntWritable(4), + new IntWritable(7)); Assert.assertEquals(expected, actual); } @@ -147,10 +147,10 @@ public void collect(Object args) throws HiveException { public void testThreeLongArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize( - new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaLongObjectInspector, - PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.javaLongObjectInspector, + PrimitiveObjectInspectorFactory.writableLongObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); final List actual = new ArrayList<>(); @@ -165,8 +165,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {1L, new LongWritable(7), 3L}); - List expected = - Arrays.asList(new LongWritable(1), new LongWritable(4), new LongWritable(7)); + List expected = Arrays.asList(new LongWritable(1), new LongWritable(4), + new LongWritable(7)); Assert.assertEquals(expected, actual); } @@ -174,10 +174,10 @@ public void collect(Object args) throws HiveException { public void testNegativeStepInt() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize( - new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector, - PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); final List actual = new ArrayList<>(); @@ -192,8 +192,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {5, new IntWritable(1), -2L}); - List expected = - Arrays.asList(new IntWritable(5), new 
IntWritable(3), new IntWritable(1)); + List expected = Arrays.asList(new IntWritable(5), new IntWritable(3), + new IntWritable(1)); Assert.assertEquals(expected, actual); } @@ -201,10 +201,10 @@ public void collect(Object args) throws HiveException { public void testNegativeStepLong() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize( - new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaLongObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector, - PrimitiveObjectInspectorFactory.javaIntObjectInspector}); + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.javaLongObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector}); final List actual = new ArrayList<>(); @@ -219,8 +219,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {5L, new IntWritable(1), -2}); - List expected = - Arrays.asList(new LongWritable(5), new LongWritable(3), new LongWritable(1)); + List expected = Arrays.asList(new LongWritable(5), new LongWritable(3), + new LongWritable(1)); Assert.assertEquals(expected, actual); } @@ -228,9 +228,9 @@ public void collect(Object args) throws HiveException { public void testSerialization() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize( - new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector}); + udtf.initialize(new ObjectInspector[] { + PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector}); udtf.setCollector(new Collector() { @Override diff --git a/core/src/test/java/hivemall/tools/TryCastUDFTest.java b/core/src/test/java/hivemall/tools/TryCastUDFTest.java index ddd79e8cc..8de3181d8 100644 --- a/core/src/test/java/hivemall/tools/TryCastUDFTest.java +++ 
b/core/src/test/java/hivemall/tools/TryCastUDFTest.java @@ -44,8 +44,7 @@ public void testList() throws IOException, HiveException { TryCastUDF udf = new TryCastUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( TypeInfoFactory.stringTypeInfo, new Text("array"))}); diff --git a/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java index 119dd264d..1e01274a3 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayAppendUDFTest.java @@ -91,7 +91,7 @@ public void testEvaluateNullList() throws HiveException, IOException { udf.initialize(new ObjectInspector[] { ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaDoubleObjectInspector), + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), PrimitiveObjectInspectorFactory.javaDoubleObjectInspector}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject(null), diff --git a/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java index 7678d6ae0..09f7cfcc1 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java @@ -41,27 +41,23 @@ public void testDouble() throws IOException, HiveException { ArrayElementAtUDF udf = new ArrayElementAtUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + 
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), PrimitiveObjectInspectorFactory.javaIntObjectInspector}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {0, 1, 2})), - new GenericUDF.DeferredJavaObject(new Integer(1))}; + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, + 2})), new GenericUDF.DeferredJavaObject(new Integer(1))}; Assert.assertEquals(new DoubleWritable(1), udf.evaluate(args)); args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {0, 1, 2})), - new GenericUDF.DeferredJavaObject(new Integer(4))}; + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, + 2})), new GenericUDF.DeferredJavaObject(new Integer(4))}; Assert.assertNull(udf.evaluate(args)); args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {0, 1, 2})), - new GenericUDF.DeferredJavaObject(new Integer(-2))}; + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, + 2})), new GenericUDF.DeferredJavaObject(new Integer(-2))}; Assert.assertEquals(new DoubleWritable(1), udf.evaluate(args)); udf.close(); @@ -72,8 +68,7 @@ public void testString() throws IOException, HiveException { ArrayElementAtUDF udf = new ArrayElementAtUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableStringObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector), PrimitiveObjectInspectorFactory.javaIntObjectInspector}); DeferredObject[] args = new DeferredObject[] { diff --git a/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java 
b/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java index 11754aa71..e1fba0d44 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java @@ -39,9 +39,7 @@ public class ArrayFlattenUDFTest { public void testEvaluate() throws HiveException, IOException { ArrayFlattenUDF udf = new ArrayFlattenUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaIntObjectInspector))}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector))}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( Arrays.asList(Arrays.asList(0, 1, 2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7)))}; diff --git a/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java b/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java index fbc212aad..23391048e 100644 --- a/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java @@ -42,8 +42,7 @@ public void testNonNullReturn() throws IOException, HiveException { ArraySliceUDF udf = new ArraySliceUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector), PrimitiveObjectInspectorFactory.writableIntObjectInspector, PrimitiveObjectInspectorFactory.writableIntObjectInspector}); @@ -53,11 +52,9 @@ public void testNonNullReturn() throws IOException, HiveException { DeferredObject arg2 = new GenericUDF.DeferredJavaObject(length); DeferredObject 
nullarg = new GenericUDF.DeferredJavaObject(null); - DeferredObject[] args = - new DeferredObject[] { - new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", - "three", "four", "five", "six", "seven", "eight", "nine", "ten")), - arg1, arg2}; + DeferredObject[] args = new DeferredObject[] { + new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", "three", + "four", "five", "six", "seven", "eight", "nine", "ten")), arg1, arg2}; offset.set(0); length.set(3); @@ -93,8 +90,7 @@ public void testNullReturn() throws IOException, HiveException { ArraySliceUDF udf = new ArraySliceUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector), PrimitiveObjectInspectorFactory.writableIntObjectInspector, PrimitiveObjectInspectorFactory.writableIntObjectInspector}); @@ -103,11 +99,9 @@ public void testNullReturn() throws IOException, HiveException { DeferredObject arg1 = new GenericUDF.DeferredJavaObject(offset); DeferredObject arg2 = new GenericUDF.DeferredJavaObject(length); - DeferredObject[] args = - new DeferredObject[] { - new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", - "three", "four", "five", "six", "seven", "eight", "nine", "ten")), - arg1, arg2}; + DeferredObject[] args = new DeferredObject[] { + new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", "three", + "four", "five", "six", "seven", "eight", "nine", "ten")), arg1, arg2}; offset.set(-12); diff --git a/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java index cc1703983..46206b77f 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java @@ -41,20 +41,16 @@ public void 
testUnion3() throws HiveException, IOException { ArrayUnionUDF udf = new ArrayUnionUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {0, 1, 2})), - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {2, 3, 4})), - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {4, 5}))}; + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, + 2})), + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {2, 3, + 4})), + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {4, 5}))}; List result = udf.evaluate(args); diff --git a/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java b/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java index 7045235b3..fa34db1e4 100644 --- a/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java +++ b/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java @@ -39,10 +39,8 @@ public void test() throws HiveException { ConditionalEmitUDTF udtf = new ConditionalEmitUDTF(); udtf.initialize(new ObjectInspector[] { - 
ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaBooleanObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector),}); + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector),}); final List actual = new ArrayList<>(); udtf.setCollector(new Collector() { @@ -54,15 +52,15 @@ public void collect(Object input) throws HiveException { } }); - udtf.process( - new Object[] {Arrays.asList(true, false, true), Arrays.asList("one", "two", "three")}); + udtf.process(new Object[] {Arrays.asList(true, false, true), + Arrays.asList("one", "two", "three")}); Assert.assertEquals(Arrays.asList("one", "three"), actual); actual.clear(); - udtf.process( - new Object[] {Arrays.asList(true, true, false), Arrays.asList("one", "two", "three")}); + udtf.process(new Object[] {Arrays.asList(true, true, false), + Arrays.asList("one", "two", "three")}); Assert.assertEquals(Arrays.asList("one", "two"), actual); udtf.close(); diff --git a/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java b/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java index c5cd5858a..b9c01e47c 100644 --- a/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java @@ -39,8 +39,7 @@ public class FirstElementUDFTest { public void test() throws IOException, HiveException { FirstElementUDF udf = new FirstElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] 
{ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new double[] {0, 1, 2}))}; @@ -54,11 +53,10 @@ public void test() throws IOException, HiveException { public void testNull() throws IOException, HiveException { FirstElementUDF udf = new FirstElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); - DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {}))}; + DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {}))}; Assert.assertNull(udf.evaluate(args)); diff --git a/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java b/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java index b61e9dacf..12d0f95ab 100644 --- a/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java @@ -39,8 +39,7 @@ public class LastElementUDFTest { public void test() throws IOException, HiveException { LastElementUDF udf = new LastElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new double[] {0, 1, 2}))}; @@ 
-54,11 +53,10 @@ public void test() throws IOException, HiveException { public void testNull() throws IOException, HiveException { LastElementUDF udf = new LastElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); - DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {}))}; + DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {}))}; Assert.assertNull(udf.evaluate(args)); diff --git a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java index 738a9390c..01d2657c2 100644 --- a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java @@ -47,8 +47,8 @@ public void testDoubleArray() throws Exception { ObjectInspector[] argOIs = new ObjectInspector[] { PrimitiveObjectInspectorFactory.writableStringObjectInspector, HiveUtils.getConstStringObjectInspector(types)}; - DeferredObject[] args = - new DeferredObject[] {new GenericUDF.DeferredJavaObject(new Text(json)), null}; + DeferredObject[] args = new DeferredObject[] { + new GenericUDF.DeferredJavaObject(new Text(json)), null}; udf.initialize(argOIs); Object result = udf.evaluate(args); @@ -70,8 +70,8 @@ public void testPersonStruct() throws Exception { PrimitiveObjectInspectorFactory.writableStringObjectInspector, HiveUtils.getConstStringObjectInspector(types), HiveUtils.getConstStringObjectInspector("person")}; - DeferredObject[] args = - new DeferredObject[] {new GenericUDF.DeferredJavaObject(new Text(json)), null}; + DeferredObject[] args = new DeferredObject[] { 
+ new GenericUDF.DeferredJavaObject(new Text(json)), null}; udf.initialize(argOIs); List result = (List) udf.evaluate(args); diff --git a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java index f7f698cb2..f005145de 100644 --- a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java @@ -40,9 +40,7 @@ public class ToJsonUDFTest { public void testDoubleArray() throws Exception { ToJsonUDF udf = new ToJsonUDF(); - ObjectInspector[] argOIs = - new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}; + ObjectInspector[] argOIs = new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}; DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new double[] {0.1, 1.1, 2.1}))}; diff --git a/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java index 2164dc1ad..51d7575fe 100644 --- a/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java +++ b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java @@ -51,8 +51,8 @@ public void testStringDouble() throws HiveException, IOException { input.put("k" + i, new DoubleWritable(i)); } - GenericUDF.DeferredObject[] arguments = - new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(input)}; + GenericUDF.DeferredObject[] arguments = new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject( + input)}; List actual = udf.evaluate(arguments); diff --git a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java index eb5c08f63..671db1383 100644 --- 
a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java +++ b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java @@ -41,14 +41,12 @@ public void testDotp() throws HiveException, IOException { VectorDotUDF udf = new VectorDotUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableFloatObjectInspector)}); + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableFloatObjectInspector)}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {1, 2, 3})), + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {1, 2, + 3})), new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new float[] {2, 3, 4}))}; @@ -65,14 +63,12 @@ public void testDotpScalar() throws HiveException, IOException { VectorDotUDF udf = new VectorDotUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), PrimitiveObjectInspectorFactory.writableFloatObjectInspector}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {1, 2, 3})), - new GenericUDF.DeferredJavaObject(WritableUtils.val(2.f))}; + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {1, 2, + 3})), new GenericUDF.DeferredJavaObject(WritableUtils.val(2.f))}; Object actual = udf.evaluate(args); List expected = Arrays.asList(2.d, 
4.d, 6.d); diff --git a/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java b/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java index a3e81d2cd..556cc71c8 100644 --- a/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java +++ b/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java @@ -80,17 +80,18 @@ public class JsonSerdeUtilsTest { @Test public void testLooseJsonReadability() throws Exception { List columnNames = Arrays.asList("s,k".split(",")); - List columnTypes = - TypeInfoUtils.getTypeInfosFromTypeString("struct,int"); + List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("struct,int"); - Text jsonText1 = new Text("{ \"x\" : \"abc\" , " - + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," - + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }"); + Text jsonText1 = new Text( + "{ \"x\" : \"abc\" , " + + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," + + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }"); - Text jsonText2 = new Text("{ \"x\" : \"abc\" , " - + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," - + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } , " + "\"k\" : 113 " - + "}"); + Text jsonText2 = new Text( + "{ \"x\" : \"abc\" , " + + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," + + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } , " + + "\"k\" : 113 " + "}"); List expected1 = Arrays.asList(Arrays.asList(2, "blah"), null); List expected2 = Arrays.asList(Arrays.asList(2, "blah"), 113); @@ -104,8 +105,7 @@ public void testLooseJsonReadability() throws Exception { @Test public void testMapValues() throws SerDeException { List columnNames = Arrays.asList("a,b".split(",")); - List columnTypes = - TypeInfoUtils.getTypeInfosFromTypeString("array,map"); + List 
columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("array,map"); Text text1 = new Text("{ \"a\":[\"aaa\"],\"b\":{\"bbb\":1}} "); Text text2 = new Text("{\"a\":[\"yyy\"],\"b\":{\"zzz\":123}}"); @@ -193,22 +193,18 @@ public void testRW() throws Exception { DefaultHCatRecord r = new DefaultHCatRecord(rlist); - List columnNames = - Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); - List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString( - "tinyint,smallint,int,bigint,double,float,string,string," - + "struct,array,map,boolean," - + "array,ii2:map>>>>," - + "decimal(5,2),char(10),varchar(20),date,timestamp,binary"); + List columnNames = Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); + List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("tinyint,smallint,int,bigint,double,float,string,string," + + "struct,array,map,boolean," + + "array,ii2:map>>>>," + + "decimal(5,2),char(10),varchar(20),date,timestamp,binary"); - StructTypeInfo rowTypeInfo = - (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); - HCatRecordObjectInspector objInspector = - HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); + StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo( + columnNames, columnTypes); + HCatRecordObjectInspector objInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); Text serialized = JsonSerdeUtils.serialize(r, objInspector, columnNames); - List deserialized = - JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); + List deserialized = JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); assertRecordEquals(rlist, deserialized); } @@ -240,22 +236,18 @@ public void testRWNull() throws Exception { DefaultHCatRecord r = new DefaultHCatRecord(nlist); - List columnNames = - Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); - List columnTypes = 
TypeInfoUtils.getTypeInfosFromTypeString( - "tinyint,smallint,int,bigint,double,float,string,string," - + "struct,array,map,boolean," - + "array,ii2:map>>>>," - + "decimal(5,2),char(10),varchar(20),date,timestamp,binary"); + List columnNames = Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); + List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("tinyint,smallint,int,bigint,double,float,string,string," + + "struct,array,map,boolean," + + "array,ii2:map>>>>," + + "decimal(5,2),char(10),varchar(20),date,timestamp,binary"); - StructTypeInfo rowTypeInfo = - (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); - HCatRecordObjectInspector objInspector = - HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); + StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo( + columnNames, columnTypes); + HCatRecordObjectInspector objInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); Text serialized = JsonSerdeUtils.serialize(r, objInspector, columnNames); - List deserialized = - JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); + List deserialized = JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); assertRecordEquals(nlist, deserialized); } @@ -266,8 +258,8 @@ public void testStructWithoutColumnNames() throws Exception { TypeInfo type1 = TypeInfoUtils.getTypeInfoFromTypeString("struct"); List expected1 = Arrays.asList("makoto", 37); - List deserialized1 = - JsonSerdeUtils.deserialize(json1, Arrays.asList("person"), Arrays.asList(type1)); + List deserialized1 = JsonSerdeUtils.deserialize(json1, Arrays.asList("person"), + Arrays.asList(type1)); assertRecordEquals(expected1, deserialized1); } @@ -337,15 +329,15 @@ private static void assertRecordEquals(@Nonnull final List first, int mySz = first.size(); int urSz = second.size(); if (mySz != urSz) { - throw new RuntimeException( - "#expected != #actual. 
#expected=" + mySz + ", #actual=" + urSz); + throw new RuntimeException("#expected != #actual. #expected=" + mySz + ", #actual=" + + urSz); } else { for (int i = 0; i < first.size(); i++) { int c = DataType.compare(first.get(i), second.get(i)); if (c != 0) { String msg = "first.get(" + i + "}='" + first.get(i) + "' second.get(" + i - + ")='" + second.get(i) + "' compared as " + c + "\n" + "Types 1st/2nd=" - + DataType.findType(first.get(i)) + "/" + + ")='" + second.get(i) + "' compared as " + c + "\n" + + "Types 1st/2nd=" + DataType.findType(first.get(i)) + "/" + DataType.findType(second.get(i)) + '\n' + "first='" + first.get(i) + "' second='" + second.get(i) + "'"; if (first.get(i) instanceof Date) { From 9ce97d106b381633e5327046291234c1f25a3b3f Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 27 Apr 2018 12:36:57 +0900 Subject: [PATCH 23/56] Fixed warning for duplicate entry --- spark/spark-2.2/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/spark-2.2/pom.xml b/spark/spark-2.2/pom.xml index 100de598e..5cb36091c 100644 --- a/spark/spark-2.2/pom.xml +++ b/spark/spark-2.2/pom.xml @@ -141,7 +141,7 @@ ${env.JAVA8_HOME} ${env.JAVA8_HOME}/bin:${env.PATH} - + From 3efefc43b7e38b7259df16548cca19abf5b77a02 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 27 Apr 2018 15:46:26 +0900 Subject: [PATCH 24/56] Applied spotless-maven-plugin formatter --- .../hivemall/sketch/bloom/BloomAndUDF.java | 3 +- .../sketch/bloom/BloomContainsUDF.java | 3 +- .../hivemall/sketch/bloom/BloomOrUDF.java | 3 +- .../smile/classification/DecisionTree.java | 6 +- .../smile/regression/RegressionTree.java | 10 +-- .../statistics/MovingAverageUDTF.java | 3 +- .../main/java/hivemall/tools/TryCastUDF.java | 5 +- .../hivemall/tools/array/ArrayAppendUDF.java | 6 +- .../hivemall/tools/array/ArrayConcatUDF.java | 8 +- .../hivemall/tools/array/ArrayFlattenUDF.java | 11 +-- .../hivemall/tools/array/ArrayUnionUDF.java | 9 ++- .../tools/array/ConditionalEmitUDTF.java | 3 
+- .../hivemall/tools/array/SelectKBestUDF.java | 13 ++-- .../tools/datetime/SessionizeUDF.java | 3 +- .../java/hivemall/tools/json/FromJsonUDF.java | 17 ++-- .../java/hivemall/tools/json/ToJsonUDF.java | 10 +-- .../hivemall/tools/map/MapKeyValuesUDF.java | 19 +++-- .../hivemall/tools/map/MergeMapsUDAF.java | 3 +- .../hivemall/tools/text/SplitWordsUDF.java | 2 +- .../hivemall/tools/vector/VectorAddUDF.java | 10 ++- .../hivemall/utils/hadoop/JsonSerdeUtils.java | 64 ++++++++------- .../java/hivemall/utils/math/MatrixUtils.java | 7 +- .../ftvec/hashing/FeatureHashingUDFTest.java | 6 +- .../sketch/bloom/BloomAndUDFTest.java | 4 +- .../hivemall/sketch/bloom/BloomOrUDFTest.java | 4 +- .../classification/DecisionTreeTest.java | 10 +-- .../smile/regression/RegressionTreeTest.java | 5 +- .../tools/GenerateSeriesUDTFTest.java | 78 +++++++++---------- .../java/hivemall/tools/TryCastUDFTest.java | 3 +- .../tools/array/ArrayElementAtUDFTest.java | 21 +++-- .../tools/array/ArrayFlattenUDFTest.java | 4 +- .../tools/array/ArraySliceUDFTest.java | 22 ++++-- .../tools/array/ArrayUnionUDFTest.java | 20 +++-- .../tools/array/ConditionalEmitUDTFTest.java | 14 ++-- .../tools/array/FirstElementUDFTest.java | 10 ++- .../tools/array/LastElementUDFTest.java | 10 ++- .../hivemall/tools/json/FromJsonUDFTest.java | 8 +- .../hivemall/tools/json/ToJsonUDFTest.java | 4 +- .../tools/map/MapKeyValuesUDFTest.java | 4 +- .../tools/vector/VectorDotUDFTest.java | 18 +++-- .../utils/hadoop/JsonSerdeUtilsTest.java | 78 ++++++++++--------- 41 files changed, 296 insertions(+), 245 deletions(-) diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java index 9b029d454..87769da4b 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java @@ -30,8 +30,7 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; 
-@Description( - name = "bloom_and", +@Description(name = "bloom_and", value = "_FUNC_(string bloom1, string bloom2) - Returns the logical AND of two bloom filters") @UDFType(deterministic = true, stateful = false) public final class BloomAndUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java index 2aa751024..2da65b33f 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java @@ -32,8 +32,7 @@ import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; -@Description( - name = "bloom_contains", +@Description(name = "bloom_contains", value = "_FUNC_(string bloom, string key) - Returns true if the bloom filter contains the given key") @UDFType(deterministic = true, stateful = false) public final class BloomContainsUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java index 7f60be456..7d2980e4d 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java @@ -30,8 +30,7 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; -@Description( - name = "bloom_or", +@Description(name = "bloom_or", value = "_FUNC_(string bloom1, string bloom2) - Returns the logical OR of two bloom filters") @UDFType(deterministic = true, stateful = false) public final class BloomOrUDF extends UDF { diff --git a/core/src/main/java/hivemall/smile/classification/DecisionTree.java b/core/src/main/java/hivemall/smile/classification/DecisionTree.java index cc92ae8da..a80a2994b 100644 --- a/core/src/main/java/hivemall/smile/classification/DecisionTree.java +++ b/core/src/main/java/hivemall/smile/classification/DecisionTree.java @@ -19,6 +19,7 @@ import static 
hivemall.smile.utils.SmileExtUtils.resolveFeatureName; import static hivemall.smile.utils.SmileExtUtils.resolveName; + import hivemall.annotations.VisibleForTesting; import hivemall.math.matrix.Matrix; import hivemall.math.matrix.ints.ColumnMajorIntMatrix; @@ -36,6 +37,8 @@ import hivemall.utils.lang.StringUtils; import hivemall.utils.lang.mutable.MutableInt; import hivemall.utils.sampling.IntReservoirSampler; +import smile.classification.Classifier; +import smile.math.Math; import java.io.Externalizable; import java.io.IOException; @@ -53,9 +56,6 @@ import org.roaringbitmap.IntConsumer; import org.roaringbitmap.RoaringBitmap; -import smile.classification.Classifier; -import smile.math.Math; - /** * Decision tree for classification. A decision tree can be learned by splitting the training set * into subsets based on an attribute value test. This process is repeated on each derived subset in diff --git a/core/src/main/java/hivemall/smile/regression/RegressionTree.java b/core/src/main/java/hivemall/smile/regression/RegressionTree.java index b8a3cc79c..61964ae1d 100755 --- a/core/src/main/java/hivemall/smile/regression/RegressionTree.java +++ b/core/src/main/java/hivemall/smile/regression/RegressionTree.java @@ -18,6 +18,7 @@ package hivemall.smile.regression; import static hivemall.smile.utils.SmileExtUtils.resolveFeatureName; + import hivemall.annotations.VisibleForTesting; import hivemall.math.matrix.Matrix; import hivemall.math.matrix.ints.ColumnMajorIntMatrix; @@ -36,6 +37,10 @@ import hivemall.utils.lang.StringUtils; import hivemall.utils.lang.mutable.MutableInt; import hivemall.utils.math.MathUtils; +import smile.math.Math; +import smile.regression.GradientTreeBoost; +import smile.regression.RandomForest; +import smile.regression.Regression; import java.io.Externalizable; import java.io.IOException; @@ -51,11 +56,6 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; -import smile.math.Math; -import smile.regression.GradientTreeBoost; -import 
smile.regression.RandomForest; -import smile.regression.Regression; - /** * Decision tree for regression. A decision tree can be learned by splitting the training set into * subsets based on an attribute value test. This process is repeated on each derived subset in a diff --git a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java index fe634ce81..105c86bf5 100644 --- a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java +++ b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java @@ -65,7 +65,8 @@ public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgu this.forwardObjs = new Writable[] {result}; List fieldNames = Arrays.asList("avg"); - List fieldOIs = Arrays.asList(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + List fieldOIs = Arrays.asList( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); } diff --git a/core/src/main/java/hivemall/tools/TryCastUDF.java b/core/src/main/java/hivemall/tools/TryCastUDF.java index 69ddc2faa..a0f3257d7 100644 --- a/core/src/main/java/hivemall/tools/TryCastUDF.java +++ b/core/src/main/java/hivemall/tools/TryCastUDF.java @@ -32,8 +32,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -@Description(name = "try_cast", value = "_FUNC_(ANY src, const string typeName)" - + " - Explicitly cast a value as a type. Returns null if cast fails.", +@Description(name = "try_cast", + value = "_FUNC_(ANY src, const string typeName)" + + " - Explicitly cast a value as a type. 
Returns null if cast fails.", extended = "Usage: select try_cast(array(1.0,2.0,3.0), 'array')\n" + " select try_cast(map('A',10,'B',20,'C',30), 'map')") @UDFType(deterministic = true, stateful = false) diff --git a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java index 8d3e26af3..c344c01a4 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java @@ -53,14 +53,16 @@ public final class ArrayAppendUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { this.listInspector = HiveUtils.asListOI(argOIs[0]); - this.listElemInspector = HiveUtils.asPrimitiveObjectInspector(listInspector.getListElementObjectInspector()); + this.listElemInspector = + HiveUtils.asPrimitiveObjectInspector(listInspector.getListElementObjectInspector()); this.primInspector = HiveUtils.asPrimitiveObjectInspector(argOIs[1]); if (listElemInspector.getPrimitiveCategory() != primInspector.getPrimitiveCategory()) { throw new UDFArgumentException( "array_append expects the list type to match the type of the value being appended"); } this.returnWritables = listElemInspector.preferWritable(); - return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector(listElemInspector)); + return ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorUtils.getStandardObjectInspector(listElemInspector)); } @Nullable diff --git a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java index 223d69a3d..62e3e3660 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java @@ -65,10 +65,10 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen break; } default: - throw 
new UDFArgumentTypeException(0, "Argument " + i - + " of function CONCAT_ARRAY must be " + LIST_TYPE_NAME + "<" - + Category.PRIMITIVE + ">, but " + arguments[0].getTypeName() - + " was found."); + throw new UDFArgumentTypeException(0, + "Argument " + i + " of function CONCAT_ARRAY must be " + LIST_TYPE_NAME + + "<" + Category.PRIMITIVE + ">, but " + arguments[0].getTypeName() + + " was found."); } } diff --git a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java index b35ad1e53..906d594d3 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java @@ -48,20 +48,21 @@ public final class ArrayFlattenUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { if (argOIs.length != 1) { - throw new UDFArgumentException("array_flatten expects exactly one argument: " - + argOIs.length); + throw new UDFArgumentException( + "array_flatten expects exactly one argument: " + argOIs.length); } this.listOI = HiveUtils.asListOI(argOIs[0]); ObjectInspector listElemOI = listOI.getListElementObjectInspector(); if (listElemOI.getCategory() != Category.LIST) { - throw new UDFArgumentException("array_flatten takes array of array for the argument: " - + listOI.toString()); + throw new UDFArgumentException( + "array_flatten takes array of array for the argument: " + listOI.toString()); } this.nextedListOI = HiveUtils.asListOI(listElemOI); this.elemOI = nextedListOI.getListElementObjectInspector(); - return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector(elemOI)); + return ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorUtils.getStandardObjectInspector(elemOI)); } @Override diff --git a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java 
b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java index 0b037779e..33aadf8b3 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java @@ -67,16 +67,17 @@ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentEx ListObjectInspector checkOI = HiveUtils.asListOI(argOIs[i]); if (!ObjectInspectorUtils.compareTypes(arg0ElemOI, checkOI.getListElementObjectInspector())) { - throw new UDFArgumentException("Array types does not match: " - + arg0OI.getTypeName() + " != " + checkOI.getTypeName()); + throw new UDFArgumentException("Array types does not match: " + arg0OI.getTypeName() + + " != " + checkOI.getTypeName()); } listOIs[i] = checkOI; } this._listOIs = listOIs; - return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector( - arg0ElemOI, ObjectInspectorCopyOption.WRITABLE)); + return ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorUtils.getStandardObjectInspector(arg0ElemOI, + ObjectInspectorCopyOption.WRITABLE)); } @Override diff --git a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java index c7acde626..a73a06f78 100644 --- a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java +++ b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java @@ -79,7 +79,8 @@ public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgu this.condElemOI = HiveUtils.asBooleanOI(conditionsOI.getListElementObjectInspector()); this.featuresOI = HiveUtils.asListOI(argOIs[1]); - this.featureElemOI = HiveUtils.asPrimitiveObjectInspector(featuresOI.getListElementObjectInspector()); + this.featureElemOI = + HiveUtils.asPrimitiveObjectInspector(featuresOI.getListElementObjectInspector()); List fieldNames = Arrays.asList("feature"); List fieldOIs = Arrays.asList(featureElemOI); diff --git 
a/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java b/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java index 527060c14..ff3721795 100644 --- a/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java +++ b/core/src/main/java/hivemall/tools/array/SelectKBestUDF.java @@ -82,7 +82,8 @@ public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentExcep this.featuresOI = HiveUtils.asListOI(OIs[0]); this.featureOI = HiveUtils.asDoubleCompatibleOI(featuresOI.getListElementObjectInspector()); this.importanceListOI = HiveUtils.asListOI(OIs[1]); - this.importanceElemOI = HiveUtils.asDoubleCompatibleOI(importanceListOI.getListElementObjectInspector()); + this.importanceElemOI = + HiveUtils.asDoubleCompatibleOI(importanceListOI.getListElementObjectInspector()); this._k = HiveUtils.getConstInt(OIs[2]); Preconditions.checkArgument(_k >= 1, UDFArgumentException.class); @@ -92,14 +93,15 @@ public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentExcep } this._result = result; - return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); } @Override public List evaluate(DeferredObject[] dObj) throws HiveException { final double[] features = HiveUtils.asDoubleArray(dObj[0].get(), featuresOI, featureOI); - final double[] importanceList = HiveUtils.asDoubleArray(dObj[1].get(), importanceListOI, - importanceElemOI); + final double[] importanceList = + HiveUtils.asDoubleArray(dObj[1].get(), importanceListOI, importanceElemOI); Preconditions.checkNotNull(features, UDFArgumentException.class); Preconditions.checkNotNull(importanceList, UDFArgumentException.class); @@ -109,7 +111,8 @@ public List evaluate(DeferredObject[] dObj) throws HiveException int[] topKIndices = _topKIndices; if (topKIndices == null) { - final List> list = new 
ArrayList>(); + final List> list = + new ArrayList>(); for (int i = 0; i < importanceList.length; i++) { list.add(new AbstractMap.SimpleEntry(i, importanceList[i])); } diff --git a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java index 8c90c811a..4ceec7768 100644 --- a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java +++ b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java @@ -29,8 +29,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -@Description( - name = "sessionize", +@Description(name = "sessionize", value = "_FUNC_(long timeInSec, long thresholdInSec [, String subject])" + "- Returns a UUID string of a session.", extended = "SELECT sessionize(time, 3600, ip_addr) as session_id, time, ip_addr FROM (\n" diff --git a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java index 8ee2a2d36..36c29cc8a 100644 --- a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java @@ -43,8 +43,7 @@ import org.apache.hadoop.io.Text; import org.apache.hive.hcatalog.data.HCatRecordObjectInspectorFactory; -@Description( - name = "from_json", +@Description(name = "from_json", value = "_FUNC_(string jsonString, const string returnTypes [, const array|const string columnNames])" + " - Return Hive object.") @UDFType(deterministic = true, stateful = false) @@ -59,8 +58,8 @@ public final class FromJsonUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { if (argOIs.length != 2 && argOIs.length != 3) { - throw new UDFArgumentException("from_json takes two or three arguments: " - + argOIs.length); + throw new UDFArgumentException( + "from_json takes two or three arguments: " + argOIs.length); } this.jsonOI = HiveUtils.asStringOI(argOIs[0]); @@ -95,7 +94,8 @@ private 
static ObjectInspector getObjectInspector(@Nonnull final List final int numColumns = columnTypes.size(); if (numColumns == 1) { TypeInfo type = columnTypes.get(0); - returnOI = HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo(type); + returnOI = + HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo(type); } else { if (columnNames == null) { columnNames = new ArrayList<>(numColumns); @@ -111,7 +111,9 @@ private static ObjectInspector getObjectInspector(@Nonnull final List final ObjectInspector[] fieldOIs = new ObjectInspector[numColumns]; for (int i = 0; i < fieldOIs.length; i++) { TypeInfo type = columnTypes.get(i); - fieldOIs[i] = HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo(type); + fieldOIs[i] = + HCatRecordObjectInspectorFactory.getStandardObjectInspectorFromTypeInfo( + type); } returnOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, Arrays.asList(fieldOIs)); @@ -132,7 +134,8 @@ public Object evaluate(DeferredObject[] args) throws HiveException { result = JsonSerdeUtils.deserialize(jsonString, columnNames, columnTypes); } catch (Throwable e) { throw new HiveException("Failed to deserialize Json: \n" + jsonString.toString() + '\n' - + ExceptionUtils.prettyPrintStackTrace(e), e); + + ExceptionUtils.prettyPrintStackTrace(e), + e); } return result; } diff --git a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java index 416d0c948..70c62b92d 100644 --- a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java @@ -37,8 +37,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.io.Text; -@Description( - name = "to_json", +@Description(name = "to_json", value = "_FUNC_(ANY object [, const array|const string columnNames]) - Returns Json string") @UDFType(deterministic = true, stateful = false) 
public final class ToJsonUDF extends GenericUDF { @@ -51,7 +50,8 @@ public final class ToJsonUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { if (argOIs.length != 1 && argOIs.length != 2) { - throw new UDFArgumentException("from_json takes one or two arguments: " + argOIs.length); + throw new UDFArgumentException( + "from_json takes one or two arguments: " + argOIs.length); } this.objOI = argOIs[0]; @@ -81,8 +81,8 @@ public Text evaluate(DeferredObject[] args) throws HiveException { try { return JsonSerdeUtils.serialize(obj, objOI, columnNames); } catch (Throwable e) { - throw new HiveException("Failed to serialize: " + obj + '\n' - + ExceptionUtils.prettyPrintStackTrace(e), e); + throw new HiveException( + "Failed to serialize: " + obj + '\n' + ExceptionUtils.prettyPrintStackTrace(e), e); } } diff --git a/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java index 43f558246..64065e9ed 100644 --- a/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java +++ b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java @@ -37,8 +37,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -@Description(name = "map_key_values", value = "_FUNC_(map) - " - + "Returns a array of key-value pairs.") +@Description(name = "map_key_values", + value = "_FUNC_(map) - " + "Returns a array of key-value pairs.") @UDFType(deterministic = true, stateful = false) public final class MapKeyValuesUDF extends GenericUDF { @@ -49,11 +49,13 @@ public final class MapKeyValuesUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { if (arguments.length != 1) { - throw new UDFArgumentLengthException("The function MAP_KEYS only accepts one argument."); + throw new 
UDFArgumentLengthException( + "The function MAP_KEYS only accepts one argument."); } else if (!(arguments[0] instanceof MapObjectInspector)) { - throw new UDFArgumentTypeException(0, "\"" + Category.MAP.toString().toLowerCase() - + "\" is expected at function MAP_KEYS, " + "but \"" - + arguments[0].getTypeName() + "\" is found"); + throw new UDFArgumentTypeException(0, + "\"" + Category.MAP.toString().toLowerCase() + + "\" is expected at function MAP_KEYS, " + "but \"" + + arguments[0].getTypeName() + "\" is found"); } this.mapOI = (MapObjectInspector) arguments[0]; @@ -65,8 +67,9 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen structFieldNames.add("value"); structFieldObjectInspectors.add(mapOI.getMapValueObjectInspector()); - return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorFactory.getStandardStructObjectInspector( - structFieldNames, structFieldObjectInspectors)); + return ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, + structFieldObjectInspectors)); } @Override diff --git a/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java index c94764313..a661b3416 100644 --- a/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java +++ b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java @@ -39,8 +39,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -@Description( - name = "merge_maps", +@Description(name = "merge_maps", value = "_FUNC_(x) - Returns a map which contains the union of an aggregation of maps." 
+ " Note that an existing value of a key can be replaced with the other duplicate key entry.", extended = "SELECT merge_maps(m) FROM ( " diff --git a/core/src/main/java/hivemall/tools/text/SplitWordsUDF.java b/core/src/main/java/hivemall/tools/text/SplitWordsUDF.java index 0b10c2f10..31d155d3a 100644 --- a/core/src/main/java/hivemall/tools/text/SplitWordsUDF.java +++ b/core/src/main/java/hivemall/tools/text/SplitWordsUDF.java @@ -28,7 +28,7 @@ import org.apache.hadoop.io.Text; @Description(name = "split_words", - value = "_FUNC_(string query [, string regex]) - Returns an array containing split strings") + value = "_FUNC_(string query [, string regex]) - Returns an array containing splitted strings") @UDFType(deterministic = true, stateful = false) public final class SplitWordsUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java b/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java index ecff2f4ed..8442ae370 100644 --- a/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java +++ b/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java @@ -63,10 +63,12 @@ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentEx if (HiveUtils.isIntegerOI(xElemOI) && HiveUtils.isIntegerOI(yElemOI)) { this.floatingPoints = false; - return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaLongObjectInspector); + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaLongObjectInspector); } else { this.floatingPoints = true; - return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); } } @@ -82,8 +84,8 @@ public List evaluate(@Nonnull DeferredObject[] args) throws HiveException { final int xLen = xOI.getListLength(arg0); final int yLen = 
yOI.getListLength(arg1); if (xLen != yLen) { - throw new HiveException("vector lengths do not match. x=" + xOI.getList(arg0) + ", y=" - + yOI.getList(arg1)); + throw new HiveException( + "vector lengths do not match. x=" + xOI.getList(arg0) + ", y=" + yOI.getList(arg1)); } if (floatingPoints) { diff --git a/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java b/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java index 34932bd39..13155378f 100644 --- a/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/JsonSerdeUtils.java @@ -127,9 +127,9 @@ public static Text serialize(@Nullable final Object obj, @Nonnull final ObjectIn /** * Serialize Hive objects as Text. */ - private static void serializeStruct(@Nonnull final StringBuilder sb, - @Nullable final Object obj, @Nonnull final StructObjectInspector soi, - @Nullable final List columnNames) throws SerDeException { + private static void serializeStruct(@Nonnull final StringBuilder sb, @Nullable final Object obj, + @Nonnull final StructObjectInspector soi, @Nullable final List columnNames) + throws SerDeException { if (obj == null) { sb.append("null"); } else { @@ -273,7 +273,8 @@ private static void serializePrimitive(@Nonnull final StringBuilder sb, break; } case STRING: { - String s = SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(obj)); + String s = SerDeUtils.escapeString( + ((StringObjectInspector) poi).getPrimitiveJavaObject(obj)); appendWithQuotes(sb, s); break; } @@ -296,30 +297,28 @@ private static void serializePrimitive(@Nonnull final StringBuilder sb, sb.append(((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(obj)); break; case VARCHAR: { - String s = SerDeUtils.escapeString(((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject( - obj) - .toString()); + String s = SerDeUtils.escapeString( + ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(obj).toString()); appendWithQuotes(sb, s); 
break; } case CHAR: { //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13) // HiveChar.toString() returns getPaddedValue() - String s = SerDeUtils.escapeString(((HiveCharObjectInspector) poi).getPrimitiveJavaObject( - obj) - .toString()); + String s = SerDeUtils.escapeString( + ((HiveCharObjectInspector) poi).getPrimitiveJavaObject(obj).toString()); appendWithQuotes(sb, s); break; } default: - throw new SerDeException("Unknown primitive type: " - + poi.getPrimitiveCategory()); + throw new SerDeException( + "Unknown primitive type: " + poi.getPrimitiveCategory()); } } } - private static void buildJSONString(@Nonnull final StringBuilder sb, - @Nullable final Object obj, @Nonnull final ObjectInspector oi) throws SerDeException { + private static void buildJSONString(@Nonnull final StringBuilder sb, @Nullable final Object obj, + @Nonnull final ObjectInspector oi) throws SerDeException { switch (oi.getCategory()) { case PRIMITIVE: { PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; @@ -376,13 +375,12 @@ public static T deserialize(@Nonnull final Text t, @Nonnull TypeInfo columnT @SuppressWarnings("unchecked") @Nonnull - public static T deserialize(@Nonnull final Text t, - @Nullable final List columnNames, @Nullable final List columnTypes) - throws SerDeException { + public static T deserialize(@Nonnull final Text t, @Nullable final List columnNames, + @Nullable final List columnTypes) throws SerDeException { final Object result; try { - JsonParser p = new JsonFactory().createJsonParser(new FastByteArrayInputStream( - t.getBytes())); + JsonParser p = + new JsonFactory().createJsonParser(new FastByteArrayInputStream(t.getBytes())); final JsonToken token = p.nextToken(); if (token == JsonToken.START_OBJECT) { result = parseObject(p, columnNames, columnTypes); @@ -402,8 +400,8 @@ public static T deserialize(@Nonnull final Text t, @Nonnull private static Object parseObject(@Nonnull final JsonParser p, @CheckForNull final List 
columnNames, - @CheckForNull final List columnTypes) throws JsonParseException, IOException, - SerDeException { + @CheckForNull final List columnTypes) + throws JsonParseException, IOException, SerDeException { Preconditions.checkNotNull(columnNames, "columnNames MUST NOT be null in parseObject", SerDeException.class); Preconditions.checkNotNull(columnTypes, "columnTypes MUST NOT be null in parseObject", @@ -437,8 +435,8 @@ private static Object parseObject(@Nonnull final JsonParser p, @Nonnull private static List parseArray(@Nonnull final JsonParser p, - @CheckForNull final List columnTypes) throws HCatException, IOException, - SerDeException { + @CheckForNull final List columnTypes) + throws HCatException, IOException, SerDeException { Preconditions.checkNotNull(columnTypes, "columnTypes MUST NOT be null", SerDeException.class); if (columnTypes.size() != 1) { @@ -459,8 +457,8 @@ private static List parseArray(@Nonnull final JsonParser p, } @Nonnull - private static Object parseValue(@Nonnull final JsonParser p) throws JsonParseException, - IOException { + private static Object parseValue(@Nonnull final JsonParser p) + throws JsonParseException, IOException { final JsonToken t = p.getCurrentToken(); switch (t) { case VALUE_FALSE: @@ -481,8 +479,8 @@ private static Object parseValue(@Nonnull final JsonParser p) throws JsonParseEx } private static void populateRecord(@Nonnull final List r, - @Nonnull final JsonToken token, @Nonnull final JsonParser p, @Nonnull final HCatSchema s) - throws IOException { + @Nonnull final JsonToken token, @Nonnull final JsonParser p, + @Nonnull final HCatSchema s) throws IOException { if (token != JsonToken.FIELD_NAME) { throw new IOException("Field name expected"); } @@ -577,8 +575,8 @@ private static Object extractCurrentField(@Nonnull final JsonParser p, break; case VARCHAR: int vLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength(); - val = (valueToken == JsonToken.VALUE_NULL) ? 
null : new HiveVarchar(p.getText(), - vLen); + val = (valueToken == JsonToken.VALUE_NULL) ? null + : new HiveVarchar(p.getText(), vLen); break; case CHAR: int cLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength(); @@ -678,8 +676,8 @@ private static Object getObjectOfCorrespondingPrimitiveType(String s, case CHAR: return new HiveChar(s, ((BaseCharTypeInfo) mapKeyType).getLength()); default: - throw new IOException("Could not convert from string to map type " - + mapKeyType.getTypeName()); + throw new IOException( + "Could not convert from string to map type " + mapKeyType.getTypeName()); } } @@ -693,8 +691,8 @@ private static int getPositionFromHiveInternalColumnName(String internalName) { } } - private static void skipValue(@Nonnull final JsonParser p) throws JsonParseException, - IOException { + private static void skipValue(@Nonnull final JsonParser p) + throws JsonParseException, IOException { JsonToken valueToken = p.nextToken(); if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)) { // if the currently read token is a beginning of an array or object, move stream forward diff --git a/core/src/main/java/hivemall/utils/math/MatrixUtils.java b/core/src/main/java/hivemall/utils/math/MatrixUtils.java index 6c08a6159..38329c157 100644 --- a/core/src/main/java/hivemall/utils/math/MatrixUtils.java +++ b/core/src/main/java/hivemall/utils/math/MatrixUtils.java @@ -239,7 +239,6 @@ public static RealMatrix[][] toeplitz(@Nonnull final RealMatrix[] c, final int d Preconditions.checkArgument(dim >= 1, "Invalid dimension: " + dim); Preconditions.checkArgument(c.length >= dim, "|c| must be greater than " + dim + ": " + c.length); - /* * Toeplitz matrix (symmetric, invertible, k*dimensions by k*dimensions) * @@ -511,8 +510,12 @@ public static RealMatrix solve(@Nonnull final RealMatrix L, @Nonnull final RealM } /** - * Find the first singular vector/value of a matrix A based on the Power method. 
+ * Find the first singular vector/value of a matrix A based on the Power method. * + * + * + * + * + * @see http * ://www.cs.yale.edu/homes/el327/datamining2013aFiles/07_singular_value_decomposition.pdf * @param A target matrix diff --git a/core/src/test/java/hivemall/ftvec/hashing/FeatureHashingUDFTest.java b/core/src/test/java/hivemall/ftvec/hashing/FeatureHashingUDFTest.java index d91eb6676..06277c2b0 100644 --- a/core/src/test/java/hivemall/ftvec/hashing/FeatureHashingUDFTest.java +++ b/core/src/test/java/hivemall/ftvec/hashing/FeatureHashingUDFTest.java @@ -21,6 +21,9 @@ import hivemall.TestUtils; import hivemall.utils.hashing.MurmurHash3; +import java.io.IOException; +import java.util.Arrays; + import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; @@ -29,9 +32,6 @@ import org.junit.Assert; import org.junit.Test; -import java.io.IOException; -import java.util.Arrays; - public class FeatureHashingUDFTest { @Test diff --git a/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java b/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java index 76f32e8f9..97ad7c63d 100644 --- a/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java +++ b/core/src/test/java/hivemall/sketch/bloom/BloomAndUDFTest.java @@ -50,8 +50,8 @@ public void test() throws IOException, HiveException { Assert.assertEquals(expected, actual); - DynamicBloomFilter deserialized = BloomFilterUtils.deserialize(actual, - new DynamicBloomFilter()); + DynamicBloomFilter deserialized = + BloomFilterUtils.deserialize(actual, new DynamicBloomFilter()); assertNotContains(bf1, deserialized, 1L, 10000); assertNotContains(bf1, deserialized, 2L, 10000); } diff --git a/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java 
b/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java index 0179a3076..64f95e0d5 100644 --- a/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java +++ b/core/src/test/java/hivemall/sketch/bloom/BloomOrUDFTest.java @@ -50,8 +50,8 @@ public void test() throws IOException, HiveException { Assert.assertEquals(expected, actual); - DynamicBloomFilter deserialized = BloomFilterUtils.deserialize(actual, - new DynamicBloomFilter()); + DynamicBloomFilter deserialized = + BloomFilterUtils.deserialize(actual, new DynamicBloomFilter()); assertEquals(bf1, deserialized, 1L, 10000); assertEquals(bf1, deserialized, 2L, 10000); } diff --git a/core/src/test/java/hivemall/smile/classification/DecisionTreeTest.java b/core/src/test/java/hivemall/smile/classification/DecisionTreeTest.java index 018de82e1..b789e71ca 100644 --- a/core/src/test/java/hivemall/smile/classification/DecisionTreeTest.java +++ b/core/src/test/java/hivemall/smile/classification/DecisionTreeTest.java @@ -19,6 +19,7 @@ package hivemall.smile.classification; import static org.junit.Assert.assertEquals; + import hivemall.math.matrix.Matrix; import hivemall.math.matrix.builders.CSRMatrixBuilder; import hivemall.math.matrix.dense.RowMajorDenseMatrix2d; @@ -29,6 +30,10 @@ import hivemall.smile.tools.TreeExportUDF.OutputType; import hivemall.smile.utils.SmileExtUtils; import hivemall.utils.codec.Base91; +import smile.data.AttributeDataset; +import smile.data.parser.ArffParser; +import smile.math.Math; +import smile.validation.LOOCV; import java.io.BufferedInputStream; import java.io.IOException; @@ -43,11 +48,6 @@ import org.junit.Assert; import org.junit.Test; -import smile.data.AttributeDataset; -import smile.data.parser.ArffParser; -import smile.math.Math; -import smile.validation.LOOCV; - public class DecisionTreeTest { private static final boolean DEBUG = false; diff --git a/core/src/test/java/hivemall/smile/regression/RegressionTreeTest.java 
b/core/src/test/java/hivemall/smile/regression/RegressionTreeTest.java index a4a7f05eb..9d24b5414 100644 --- a/core/src/test/java/hivemall/smile/regression/RegressionTreeTest.java +++ b/core/src/test/java/hivemall/smile/regression/RegressionTreeTest.java @@ -27,6 +27,8 @@ import hivemall.smile.tools.TreeExportUDF.Evaluator; import hivemall.smile.tools.TreeExportUDF.OutputType; import hivemall.utils.codec.Base91; +import smile.math.Math; +import smile.validation.LOOCV; import java.io.IOException; import java.text.ParseException; @@ -39,9 +41,6 @@ import org.junit.Assert; import org.junit.Test; -import smile.math.Math; -import smile.validation.LOOCV; - public class RegressionTreeTest { private static final boolean DEBUG = false; diff --git a/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java b/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java index 31991065e..04f432cd8 100644 --- a/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java +++ b/core/src/test/java/hivemall/tools/GenerateSeriesUDTFTest.java @@ -59,8 +59,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {new IntWritable(1), new IntWritable(3)}); - List expected = Arrays.asList(new IntWritable(1), new IntWritable(2), - new IntWritable(3)); + List expected = + Arrays.asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)); Assert.assertEquals(expected, actual); } @@ -68,9 +68,9 @@ public void collect(Object args) throws HiveException { public void testTwoIntArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize(new ObjectInspector[] { - PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector}); + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector}); final List actual = new ArrayList<>(); @@ -85,8 +85,8 @@ public void 
collect(Object args) throws HiveException { udtf.process(new Object[] {1, new IntWritable(3)}); - List expected = Arrays.asList(new IntWritable(1), new IntWritable(2), - new IntWritable(3)); + List expected = + Arrays.asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)); Assert.assertEquals(expected, actual); } @@ -94,9 +94,9 @@ public void collect(Object args) throws HiveException { public void testTwoLongArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize(new ObjectInspector[] { - PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableLongObjectInspector}); + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableLongObjectInspector}); final List actual = new ArrayList<>(); @@ -111,8 +111,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {1, new LongWritable(3)}); - List expected = Arrays.asList(new LongWritable(1), new LongWritable(2), - new LongWritable(3)); + List expected = + Arrays.asList(new LongWritable(1), new LongWritable(2), new LongWritable(3)); Assert.assertEquals(expected, actual); } @@ -120,10 +120,10 @@ public void collect(Object args) throws HiveException { public void testThreeIntArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize(new ObjectInspector[] { - PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector, - PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); final List actual = new ArrayList<>(); @@ -138,8 +138,8 @@ public void collect(Object args) throws HiveException { 
udtf.process(new Object[] {1, new IntWritable(7), 3L}); - List expected = Arrays.asList(new IntWritable(1), new IntWritable(4), - new IntWritable(7)); + List expected = + Arrays.asList(new IntWritable(1), new IntWritable(4), new IntWritable(7)); Assert.assertEquals(expected, actual); } @@ -147,10 +147,10 @@ public void collect(Object args) throws HiveException { public void testThreeLongArgs() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize(new ObjectInspector[] { - PrimitiveObjectInspectorFactory.javaLongObjectInspector, - PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaLongObjectInspector, + PrimitiveObjectInspectorFactory.writableLongObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); final List actual = new ArrayList<>(); @@ -165,8 +165,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {1L, new LongWritable(7), 3L}); - List expected = Arrays.asList(new LongWritable(1), new LongWritable(4), - new LongWritable(7)); + List expected = + Arrays.asList(new LongWritable(1), new LongWritable(4), new LongWritable(7)); Assert.assertEquals(expected, actual); } @@ -174,10 +174,10 @@ public void collect(Object args) throws HiveException { public void testNegativeStepInt() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize(new ObjectInspector[] { - PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector, - PrimitiveObjectInspectorFactory.javaLongObjectInspector}); + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaLongObjectInspector}); final List actual = new 
ArrayList<>(); @@ -192,8 +192,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {5, new IntWritable(1), -2L}); - List expected = Arrays.asList(new IntWritable(5), new IntWritable(3), - new IntWritable(1)); + List expected = + Arrays.asList(new IntWritable(5), new IntWritable(3), new IntWritable(1)); Assert.assertEquals(expected, actual); } @@ -201,10 +201,10 @@ public void collect(Object args) throws HiveException { public void testNegativeStepLong() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize(new ObjectInspector[] { - PrimitiveObjectInspectorFactory.javaLongObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector, - PrimitiveObjectInspectorFactory.javaIntObjectInspector}); + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaLongObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector}); final List actual = new ArrayList<>(); @@ -219,8 +219,8 @@ public void collect(Object args) throws HiveException { udtf.process(new Object[] {5L, new IntWritable(1), -2}); - List expected = Arrays.asList(new LongWritable(5), new LongWritable(3), - new LongWritable(1)); + List expected = + Arrays.asList(new LongWritable(5), new LongWritable(3), new LongWritable(1)); Assert.assertEquals(expected, actual); } @@ -228,9 +228,9 @@ public void collect(Object args) throws HiveException { public void testSerialization() throws HiveException { GenerateSeriesUDTF udtf = new GenerateSeriesUDTF(); - udtf.initialize(new ObjectInspector[] { - PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector}); + udtf.initialize( + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.writableIntObjectInspector}); udtf.setCollector(new Collector() { @Override diff --git 
a/core/src/test/java/hivemall/tools/TryCastUDFTest.java b/core/src/test/java/hivemall/tools/TryCastUDFTest.java index 8de3181d8..ddd79e8cc 100644 --- a/core/src/test/java/hivemall/tools/TryCastUDFTest.java +++ b/core/src/test/java/hivemall/tools/TryCastUDFTest.java @@ -44,7 +44,8 @@ public void testList() throws IOException, HiveException { TryCastUDF udf = new TryCastUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( TypeInfoFactory.stringTypeInfo, new Text("array"))}); diff --git a/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java index 09f7cfcc1..7678d6ae0 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayElementAtUDFTest.java @@ -41,23 +41,27 @@ public void testDouble() throws IOException, HiveException { ArrayElementAtUDF udf = new ArrayElementAtUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), PrimitiveObjectInspectorFactory.javaIntObjectInspector}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, - 2})), new GenericUDF.DeferredJavaObject(new Integer(1))}; + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {0, 1, 2})), + new GenericUDF.DeferredJavaObject(new Integer(1))}; Assert.assertEquals(new DoubleWritable(1), udf.evaluate(args)); args 
= new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, - 2})), new GenericUDF.DeferredJavaObject(new Integer(4))}; + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {0, 1, 2})), + new GenericUDF.DeferredJavaObject(new Integer(4))}; Assert.assertNull(udf.evaluate(args)); args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, - 2})), new GenericUDF.DeferredJavaObject(new Integer(-2))}; + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {0, 1, 2})), + new GenericUDF.DeferredJavaObject(new Integer(-2))}; Assert.assertEquals(new DoubleWritable(1), udf.evaluate(args)); udf.close(); @@ -68,7 +72,8 @@ public void testString() throws IOException, HiveException { ArrayElementAtUDF udf = new ArrayElementAtUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector), PrimitiveObjectInspectorFactory.javaIntObjectInspector}); DeferredObject[] args = new DeferredObject[] { diff --git a/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java index e1fba0d44..11754aa71 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayFlattenUDFTest.java @@ -39,7 +39,9 @@ public class ArrayFlattenUDFTest { public void testEvaluate() throws HiveException, IOException { ArrayFlattenUDF udf = new ArrayFlattenUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector))}); + udf.initialize(new 
ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaIntObjectInspector))}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( Arrays.asList(Arrays.asList(0, 1, 2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7)))}; diff --git a/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java b/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java index 23391048e..fbc212aad 100644 --- a/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArraySliceUDFTest.java @@ -42,7 +42,8 @@ public void testNonNullReturn() throws IOException, HiveException { ArraySliceUDF udf = new ArraySliceUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector), PrimitiveObjectInspectorFactory.writableIntObjectInspector, PrimitiveObjectInspectorFactory.writableIntObjectInspector}); @@ -52,9 +53,11 @@ public void testNonNullReturn() throws IOException, HiveException { DeferredObject arg2 = new GenericUDF.DeferredJavaObject(length); DeferredObject nullarg = new GenericUDF.DeferredJavaObject(null); - DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", "three", - "four", "five", "six", "seven", "eight", "nine", "ten")), arg1, arg2}; + DeferredObject[] args = + new DeferredObject[] { + new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", + "three", "four", "five", "six", "seven", "eight", "nine", "ten")), + arg1, arg2}; offset.set(0); length.set(3); @@ -90,7 +93,8 @@ public void testNullReturn() throws IOException, HiveException { ArraySliceUDF udf = new ArraySliceUDF(); udf.initialize(new 
ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector), PrimitiveObjectInspectorFactory.writableIntObjectInspector, PrimitiveObjectInspectorFactory.writableIntObjectInspector}); @@ -99,9 +103,11 @@ public void testNullReturn() throws IOException, HiveException { DeferredObject arg1 = new GenericUDF.DeferredJavaObject(offset); DeferredObject arg2 = new GenericUDF.DeferredJavaObject(length); - DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", "three", - "four", "five", "six", "seven", "eight", "nine", "ten")), arg1, arg2}; + DeferredObject[] args = + new DeferredObject[] { + new GenericUDF.DeferredJavaObject(Arrays.asList("zero", "one", "two", + "three", "four", "five", "six", "seven", "eight", "nine", "ten")), + arg1, arg2}; offset.set(-12); diff --git a/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java index 46206b77f..cc1703983 100644 --- a/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/ArrayUnionUDFTest.java @@ -41,16 +41,20 @@ public void testUnion3() throws HiveException, IOException { ArrayUnionUDF udf = new ArrayUnionUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + 
ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {0, 1, - 2})), - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {2, 3, - 4})), - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {4, 5}))}; + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {0, 1, 2})), + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {2, 3, 4})), + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {4, 5}))}; List result = udf.evaluate(args); diff --git a/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java b/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java index fa34db1e4..7045235b3 100644 --- a/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java +++ b/core/src/test/java/hivemall/tools/array/ConditionalEmitUDTFTest.java @@ -39,8 +39,10 @@ public void test() throws HiveException { ConditionalEmitUDTF udtf = new ConditionalEmitUDTF(); udtf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector),}); + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaBooleanObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector),}); final List actual = new ArrayList<>(); udtf.setCollector(new Collector() { @@ -52,15 +54,15 @@ public void collect(Object input) throws 
HiveException { } }); - udtf.process(new Object[] {Arrays.asList(true, false, true), - Arrays.asList("one", "two", "three")}); + udtf.process( + new Object[] {Arrays.asList(true, false, true), Arrays.asList("one", "two", "three")}); Assert.assertEquals(Arrays.asList("one", "three"), actual); actual.clear(); - udtf.process(new Object[] {Arrays.asList(true, true, false), - Arrays.asList("one", "two", "three")}); + udtf.process( + new Object[] {Arrays.asList(true, true, false), Arrays.asList("one", "two", "three")}); Assert.assertEquals(Arrays.asList("one", "two"), actual); udtf.close(); diff --git a/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java b/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java index b9c01e47c..c5cd5858a 100644 --- a/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/FirstElementUDFTest.java @@ -39,7 +39,8 @@ public class FirstElementUDFTest { public void test() throws IOException, HiveException { FirstElementUDF udf = new FirstElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new double[] {0, 1, 2}))}; @@ -53,10 +54,11 @@ public void test() throws IOException, HiveException { public void testNull() throws IOException, HiveException { FirstElementUDF udf = new FirstElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + 
PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); - DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {}))}; + DeferredObject[] args = new DeferredObject[] { + new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {}))}; Assert.assertNull(udf.evaluate(args)); diff --git a/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java b/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java index 12d0f95ab..b61e9dacf 100644 --- a/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java +++ b/core/src/test/java/hivemall/tools/array/LastElementUDFTest.java @@ -39,7 +39,8 @@ public class LastElementUDFTest { public void test() throws IOException, HiveException { LastElementUDF udf = new LastElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new double[] {0, 1, 2}))}; @@ -53,10 +54,11 @@ public void test() throws IOException, HiveException { public void testNull() throws IOException, HiveException { LastElementUDF udf = new LastElementUDF(); - udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); - DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( - WritableUtils.toWritableList(new double[] {}))}; + DeferredObject[] args = new DeferredObject[] { + new 
GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {}))}; Assert.assertNull(udf.evaluate(args)); diff --git a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java index 01d2657c2..738a9390c 100644 --- a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java @@ -47,8 +47,8 @@ public void testDoubleArray() throws Exception { ObjectInspector[] argOIs = new ObjectInspector[] { PrimitiveObjectInspectorFactory.writableStringObjectInspector, HiveUtils.getConstStringObjectInspector(types)}; - DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(new Text(json)), null}; + DeferredObject[] args = + new DeferredObject[] {new GenericUDF.DeferredJavaObject(new Text(json)), null}; udf.initialize(argOIs); Object result = udf.evaluate(args); @@ -70,8 +70,8 @@ public void testPersonStruct() throws Exception { PrimitiveObjectInspectorFactory.writableStringObjectInspector, HiveUtils.getConstStringObjectInspector(types), HiveUtils.getConstStringObjectInspector("person")}; - DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(new Text(json)), null}; + DeferredObject[] args = + new DeferredObject[] {new GenericUDF.DeferredJavaObject(new Text(json)), null}; udf.initialize(argOIs); List result = (List) udf.evaluate(args); diff --git a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java index f005145de..f7f698cb2 100644 --- a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java @@ -40,7 +40,9 @@ public class ToJsonUDFTest { public void testDoubleArray() throws Exception { ToJsonUDF udf = new ToJsonUDF(); - ObjectInspector[] argOIs = new ObjectInspector[] 
{ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}; + ObjectInspector[] argOIs = + new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}; DeferredObject[] args = new DeferredObject[] {new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new double[] {0.1, 1.1, 2.1}))}; diff --git a/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java index 51d7575fe..2164dc1ad 100644 --- a/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java +++ b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java @@ -51,8 +51,8 @@ public void testStringDouble() throws HiveException, IOException { input.put("k" + i, new DoubleWritable(i)); } - GenericUDF.DeferredObject[] arguments = new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject( - input)}; + GenericUDF.DeferredObject[] arguments = + new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(input)}; List actual = udf.evaluate(arguments); diff --git a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java index 671db1383..eb5c08f63 100644 --- a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java +++ b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java @@ -41,12 +41,14 @@ public void testDotp() throws HiveException, IOException { VectorDotUDF udf = new VectorDotUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableFloatObjectInspector)}); + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + 
ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableFloatObjectInspector)}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {1, 2, - 3})), + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {1, 2, 3})), new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new float[] {2, 3, 4}))}; @@ -63,12 +65,14 @@ public void testDotpScalar() throws HiveException, IOException { VectorDotUDF udf = new VectorDotUDF(); udf.initialize(new ObjectInspector[] { - ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), PrimitiveObjectInspectorFactory.writableFloatObjectInspector}); DeferredObject[] args = new DeferredObject[] { - new GenericUDF.DeferredJavaObject(WritableUtils.toWritableList(new double[] {1, 2, - 3})), new GenericUDF.DeferredJavaObject(WritableUtils.val(2.f))}; + new GenericUDF.DeferredJavaObject( + WritableUtils.toWritableList(new double[] {1, 2, 3})), + new GenericUDF.DeferredJavaObject(WritableUtils.val(2.f))}; Object actual = udf.evaluate(args); List expected = Arrays.asList(2.d, 4.d, 6.d); diff --git a/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java b/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java index 556cc71c8..a3e81d2cd 100644 --- a/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java +++ b/core/src/test/java/hivemall/utils/hadoop/JsonSerdeUtilsTest.java @@ -80,18 +80,17 @@ public class JsonSerdeUtilsTest { @Test public void testLooseJsonReadability() throws Exception { List columnNames = Arrays.asList("s,k".split(",")); - List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("struct,int"); + List columnTypes = + 
TypeInfoUtils.getTypeInfosFromTypeString("struct,int"); - Text jsonText1 = new Text( - "{ \"x\" : \"abc\" , " - + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," - + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }"); + Text jsonText1 = new Text("{ \"x\" : \"abc\" , " + + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," + + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }"); - Text jsonText2 = new Text( - "{ \"x\" : \"abc\" , " - + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," - + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } , " - + "\"k\" : 113 " + "}"); + Text jsonText2 = new Text("{ \"x\" : \"abc\" , " + + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ," + + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } , " + "\"k\" : 113 " + + "}"); List expected1 = Arrays.asList(Arrays.asList(2, "blah"), null); List expected2 = Arrays.asList(Arrays.asList(2, "blah"), 113); @@ -105,7 +104,8 @@ public void testLooseJsonReadability() throws Exception { @Test public void testMapValues() throws SerDeException { List columnNames = Arrays.asList("a,b".split(",")); - List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("array,map"); + List columnTypes = + TypeInfoUtils.getTypeInfosFromTypeString("array,map"); Text text1 = new Text("{ \"a\":[\"aaa\"],\"b\":{\"bbb\":1}} "); Text text2 = new Text("{\"a\":[\"yyy\"],\"b\":{\"zzz\":123}}"); @@ -193,18 +193,22 @@ public void testRW() throws Exception { DefaultHCatRecord r = new DefaultHCatRecord(rlist); - List columnNames = Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); - List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("tinyint,smallint,int,bigint,double,float,string,string," - + "struct,array,map,boolean," - + "array,ii2:map>>>>," - + 
"decimal(5,2),char(10),varchar(20),date,timestamp,binary"); + List columnNames = + Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); + List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString( + "tinyint,smallint,int,bigint,double,float,string,string," + + "struct,array,map,boolean," + + "array,ii2:map>>>>," + + "decimal(5,2),char(10),varchar(20),date,timestamp,binary"); - StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo( - columnNames, columnTypes); - HCatRecordObjectInspector objInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); + StructTypeInfo rowTypeInfo = + (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + HCatRecordObjectInspector objInspector = + HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); Text serialized = JsonSerdeUtils.serialize(r, objInspector, columnNames); - List deserialized = JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); + List deserialized = + JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); assertRecordEquals(rlist, deserialized); } @@ -236,18 +240,22 @@ public void testRWNull() throws Exception { DefaultHCatRecord r = new DefaultHCatRecord(nlist); - List columnNames = Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); - List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("tinyint,smallint,int,bigint,double,float,string,string," - + "struct,array,map,boolean," - + "array,ii2:map>>>>," - + "decimal(5,2),char(10),varchar(20),date,timestamp,binary"); + List columnNames = + Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(",")); + List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString( + "tinyint,smallint,int,bigint,double,float,string,string," + + "struct,array,map,boolean," + + "array,ii2:map>>>>," + + "decimal(5,2),char(10),varchar(20),date,timestamp,binary"); - StructTypeInfo rowTypeInfo = 
(StructTypeInfo) TypeInfoFactory.getStructTypeInfo( - columnNames, columnTypes); - HCatRecordObjectInspector objInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); + StructTypeInfo rowTypeInfo = + (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + HCatRecordObjectInspector objInspector = + HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); Text serialized = JsonSerdeUtils.serialize(r, objInspector, columnNames); - List deserialized = JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); + List deserialized = + JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes); assertRecordEquals(nlist, deserialized); } @@ -258,8 +266,8 @@ public void testStructWithoutColumnNames() throws Exception { TypeInfo type1 = TypeInfoUtils.getTypeInfoFromTypeString("struct"); List expected1 = Arrays.asList("makoto", 37); - List deserialized1 = JsonSerdeUtils.deserialize(json1, Arrays.asList("person"), - Arrays.asList(type1)); + List deserialized1 = + JsonSerdeUtils.deserialize(json1, Arrays.asList("person"), Arrays.asList(type1)); assertRecordEquals(expected1, deserialized1); } @@ -329,15 +337,15 @@ private static void assertRecordEquals(@Nonnull final List first, int mySz = first.size(); int urSz = second.size(); if (mySz != urSz) { - throw new RuntimeException("#expected != #actual. #expected=" + mySz + ", #actual=" - + urSz); + throw new RuntimeException( + "#expected != #actual. 
#expected=" + mySz + ", #actual=" + urSz); } else { for (int i = 0; i < first.size(); i++) { int c = DataType.compare(first.get(i), second.get(i)); if (c != 0) { String msg = "first.get(" + i + "}='" + first.get(i) + "' second.get(" + i - + ")='" + second.get(i) + "' compared as " + c + "\n" - + "Types 1st/2nd=" + DataType.findType(first.get(i)) + "/" + + ")='" + second.get(i) + "' compared as " + c + "\n" + "Types 1st/2nd=" + + DataType.findType(first.get(i)) + "/" + DataType.findType(second.get(i)) + '\n' + "first='" + first.get(i) + "' second='" + second.get(i) + "'"; if (first.get(i) instanceof Date) { From acea0f7875e8a57ae6d86ff077670122b53ffc01 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 14 May 2018 19:32:40 +0900 Subject: [PATCH 25/56] Fixed SSL related test error --- .../{statistics => tools/timeseries}/MovingAverageUDTF.java | 0 .../hivemall/fm/FieldAwareFactorizationMachineUDTFTest.java | 2 ++ .../{statistics => tools/timeseries}/MovingAverageUDTFTest.java | 0 3 files changed, 2 insertions(+) rename core/src/main/java/hivemall/{statistics => tools/timeseries}/MovingAverageUDTF.java (100%) rename core/src/test/java/hivemall/{statistics => tools/timeseries}/MovingAverageUDTFTest.java (100%) diff --git a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java b/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java similarity index 100% rename from core/src/main/java/hivemall/statistics/MovingAverageUDTF.java rename to core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java diff --git a/core/src/test/java/hivemall/fm/FieldAwareFactorizationMachineUDTFTest.java b/core/src/test/java/hivemall/fm/FieldAwareFactorizationMachineUDTFTest.java index 16196eb68..5b7aa8f5c 100644 --- a/core/src/test/java/hivemall/fm/FieldAwareFactorizationMachineUDTFTest.java +++ b/core/src/test/java/hivemall/fm/FieldAwareFactorizationMachineUDTFTest.java @@ -85,6 +85,7 @@ public void testFTRLNoCoeff() throws HiveException, IOException { @Test 
public void testSample() throws IOException, HiveException { + System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2"); run("[Sample.ffm] default option", "https://github.com/myui/ml_dataset/raw/master/ffm/sample.ffm.gz", "-classification -factors 2 -iters 10 -feature_hashing 20 -seed 43", 0.01f); @@ -92,6 +93,7 @@ public void testSample() throws IOException, HiveException { // TODO @Test public void testSampleEnableNorm() throws IOException, HiveException { + System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2"); run("[Sample.ffm] default option", "https://github.com/myui/ml_dataset/raw/master/ffm/sample.ffm.gz", "-classification -factors 2 -iters 10 -feature_hashing 20 -seed 43 -enable_norm", diff --git a/core/src/test/java/hivemall/statistics/MovingAverageUDTFTest.java b/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java similarity index 100% rename from core/src/test/java/hivemall/statistics/MovingAverageUDTFTest.java rename to core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java From b47468d5fd09fe80d576adea0145d205e32ac80d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 14 May 2018 19:35:21 +0900 Subject: [PATCH 26/56] Moved package of moving_avg --- .../tools/timeseries/MovingAverageUDTF.java | 16 +++++++++++++--- .../tools/timeseries/MovingAverageUDTFTest.java | 3 ++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java b/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java index 105c86bf5..e6c634cde 100644 --- a/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java +++ b/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package hivemall.statistics; +package hivemall.tools.timeseries; import hivemall.utils.hadoop.HiveUtils; import hivemall.utils.stats.MovingAverage; @@ -37,8 +37,18 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.io.Writable; -@Description(name = "moving_avg", value = "_FUNC_(NUMBER value, const int windowSize)" - + " - Returns moving average of a time series using a given window") +@Description(name = "moving_avg", + value = "_FUNC_(NUMBER value, const int windowSize)" + + " - Returns moving average of a time series using a given window", + extended = "SELECT moving_avg(x, 3) FROM (SELECT explode(array(1.0,2.0,3.0,4.0,5.0,6.0,7.0)) as x) series;\n" + + "> avg\n" + + "> 1.0\n" + + "> 1.5\n" + + "> 2.0\n" + + "> 3.0\n" + + "> 4.0\n" + + "> 5.0\n" + + "> 6.0") @UDFType(deterministic = false, stateful = true) public final class MovingAverageUDTF extends GenericUDTF { diff --git a/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java b/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java index bda2baa8a..815b56766 100644 --- a/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java +++ b/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ -package hivemall.statistics; +package hivemall.tools.timeseries; import hivemall.TestUtils; +import hivemall.tools.timeseries.MovingAverageUDTF; import java.util.ArrayList; import java.util.Arrays; From 052e45dec7e3c9fa875e0b454273e04f518938e2 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 14 May 2018 19:36:53 +0900 Subject: [PATCH 27/56] Updated UDF description of to_json UDF --- .../java/hivemall/tools/json/ToJsonUDF.java | 74 ++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java index 70c62b92d..54e447abc 100644 --- a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java @@ -37,8 +37,80 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.io.Text; +// @formatter:off @Description(name = "to_json", - value = "_FUNC_(ANY object [, const array|const string columnNames]) - Returns Json string") + value = "_FUNC_(ANY object [, const array|const string columnNames]) - Returns Json string", + extended = "SELECT \n" + + " NAMED_STRUCT(\"Name\", \"John\", \"age\", 31),\n" + + " to_json(\n" + + " NAMED_STRUCT(\"Name\", \"John\", \"age\", 31)\n" + + " ),\n" + + " to_json(\n" + + " NAMED_STRUCT(\"Name\", \"John\", \"age\", 31),\n" + + " array('Name', 'age')\n" + + " ),\n" + + " to_json(\n" + + " NAMED_STRUCT(\"Name\", \"John\", \"age\", 31),\n" + + " array('name', 'age')\n" + + " ),\n" + + " to_json(\n" + + " NAMED_STRUCT(\"Name\", \"John\", \"age\", 31),\n" + + " array('age')\n" + + " ),\n" + + " to_json(\n" + + " NAMED_STRUCT(\"Name\", \"John\", \"age\", 31),\n" + + " array()\n" + + " ),\n" + + " to_json(\n" + + " null,\n" + + " array()\n" + + " ),\n" + + " to_json(\n" + + " struct(\"123\", \"456\", 789, array(314,007)),\n" + + " array('ti','si','i','bi')\n" + + " ),\n" + + " to_json(\n" + + " 
struct(\"123\", \"456\", 789, array(314,007)),\n" + + " 'ti,si,i,bi'\n" + + " ),\n" + + " to_json(\n" + + " struct(\"123\", \"456\", 789, array(314,007))\n" + + " ),\n" + + " to_json(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\")\n" + + " ),\n" + + " to_json(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " array('city')\n" + + " ),\n" + + " to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " )\n" + + " ),\n" + + " to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " ),\n" + + " array('city')\n" + + " );\n" + + "> {\"name\":\"John\",\"age\":31} " + + "{\"name\":\"John\",\"age\":31} " + + "{\"Name\":\"John\",\"age\":31} " + + "{\"name\":\"John\",\"age\":31} " + + "{\"age\":31} " + + "{}" + + "NULL " + + "{\"ti\":\"123\",\"si\":\"456\",\"i\":789,\"bi\":[314,7]} " + + "{\"ti\":\"123\",\"si\":\"456\",\"i\":789,\"bi\":[314,7]} " + + "{\"col1\":\"123\",\"col2\":\"456\",\"col3\":789,\"col4\":[314,7]} " + + "{\"country\":\"japan\",\"city\":\"tokyo\"} " + + "{\"city\":\"tokyo\"} " + + "[{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}] " + + "[{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]") +// @formatter:on @UDFType(deterministic = true, stateful = false) public final class ToJsonUDF extends GenericUDF { From 8436dbe9f6e0d522c873547f95a2b862f11366e6 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 14 May 2018 19:38:35 +0900 Subject: [PATCH 28/56] Included timeseries doc generation --- .../src/main/java/hivemall/docs/FuncsListGenerator.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java 
b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java index 61fea68bd..019075116 100644 --- a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java +++ b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java @@ -87,6 +87,8 @@ public class FuncsListGenerator extends AbstractMojo { genericFuncsHeaders.put("# Matrix", Collections.singletonList("hivemall.tools.matrix")); genericFuncsHeaders.put("# Text processing", Collections.singletonList("hivemall.tools.text")); + genericFuncsHeaders.put("# Timeseries", + Collections.singletonList("hivemall.tools.timeseries")); genericFuncsHeaders.put("# Others", Collections.singletonList("hivemall.tools")); } From 2e293a1d7d707102e85009f12cab205b23a097ca Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 16 May 2018 20:20:19 +0900 Subject: [PATCH 29/56] Updated function usage doc --- .../java/hivemall/HivemallVersionUDF.java | 2 +- .../hivemall/tools/GenerateSeriesUDTF.java | 44 +- .../main/java/hivemall/tools/TryCastUDF.java | 4 +- .../hivemall/tools/array/ArrayConcatUDF.java | 2 +- .../tools/array/ArrayIntersectUDF.java | 2 +- .../hivemall/tools/array/ArrayRemoveUDF.java | 4 +- .../hivemall/tools/array/ArraySliceUDF.java | 41 +- .../tools/array/ConditionalEmitUDTF.java | 19 +- .../tools/array/SortAndUniqArrayUDF.java | 2 +- .../tools/array/SubarrayEndWithUDF.java | 2 +- .../tools/array/SubarrayStartWithUDF.java | 2 +- .../java/hivemall/tools/bits/BitsORUDF.java | 4 +- .../java/hivemall/tools/bits/ToBitsUDF.java | 2 +- .../java/hivemall/tools/bits/UnBitsUDF.java | 2 +- .../hivemall/tools/compress/DeflateUDF.java | 2 +- .../hivemall/tools/compress/InflateUDF.java | 4 +- .../tools/datetime/SessionizeUDF.java | 12 +- .../java/hivemall/tools/json/FromJsonUDF.java | 38 +- .../java/hivemall/tools/json/ToJsonUDF.java | 30 +- .../tools/list/UDAFToOrderedList.java | 16 +- .../hivemall/tools/map/MergeMapsUDAF.java | 11 +- .../hivemall/tools/mapred/RowNumberUDF.java | 4 +- 
.../java/hivemall/tools/text/Base91UDF.java | 2 +- .../tools/text/NormalizeUnicodeUDF.java | 4 +- .../hivemall/tools/text/SingularizeUDF.java | 2 +- .../java/hivemall/tools/text/Unbase91UDF.java | 4 +- .../hivemall/tools/text/WordNgramsUDF.java | 4 +- .../tools/timeseries/MovingAverageUDTF.java | 19 +- .../hivemall/tools/vector/VectorDotUDF.java | 3 +- docs/gitbook/misc/funcs.md | 2 +- docs/gitbook/misc/generic_funcs.md | 436 ++++++++++++++---- .../hivemall/docs/FuncsListGenerator.java | 7 +- 32 files changed, 566 insertions(+), 166 deletions(-) diff --git a/core/src/main/java/hivemall/HivemallVersionUDF.java b/core/src/main/java/hivemall/HivemallVersionUDF.java index c42701f94..92636d5dc 100644 --- a/core/src/main/java/hivemall/HivemallVersionUDF.java +++ b/core/src/main/java/hivemall/HivemallVersionUDF.java @@ -24,7 +24,7 @@ import org.apache.hadoop.io.Text; @Description(name = "hivemall_version", value = "_FUNC_() - Returns the version of Hivemall", - extended = "Usage: SELECT hivemall_version();") + extended = "SELECT hivemall_version();") @UDFType(deterministic = true, stateful = false) public final class HivemallVersionUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java index fd87510ab..ab79911fc 100644 --- a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java +++ b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java @@ -40,11 +40,49 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Writable; +// @formatter:off @Description(name = "generate_series", value = "_FUNC_(const int|bigint start, const int|bigint end) - " - + "Generate a series of values, from start to end. A similar function to PostgreSQL's `generate_serics`. 
http://www.postgresql.org/docs/current/static/functions-srf.html", - extended = "select generate_series(1,9);\n\n" + "1\n" + "2\n" + "3\n" + "4\n" + "5\n" - + "6\n" + "7\n" + "8\n" + "9") + + "Generate a series of values, from start to end. " + + "A similar function to PostgreSQL's [generate_serics](http://www.postgresql.org/docs/current/static/functions-srf.html)", + extended = "SELECT generate_series(2,4);\n" + + "\n" + + " 2\n" + + " 3\n" + + " 4\n" + + "\n" + + "SELECT generate_series(5,1,-2);\n" + + "\n" + + " 5\n" + + " 3\n" + + " 1\n" + + "\n" + + "SELECT generate_series(4,3);\n" + + "\n" + + "> (no return)\n" + + "\n" + + "SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t;\n" + + "\n" + + " 2018-04-21 1\n" + + " 2018-04-22 2\n" + + " 2018-04-23 3\n" + + "\n" + + "WITH input as (\n" + + " SELECT 1 as c1, 10 as c2, 3 as step\n" + + " UNION ALL\n" + + " SELECT 10, 2, -3\n" + + ")\n" + + "SELECT generate_series(c1, c2, step) as series\n" + + "FROM input;\n" + + "\n" + + " 1\n" + + " 4\n" + + " 7\n" + + " 10\n" + + " 10\n" + + " 7\n" + + " 4") +// @formatter:on public final class GenerateSeriesUDTF extends GenericUDTF { private PrimitiveObjectInspector startOI, endOI; diff --git a/core/src/main/java/hivemall/tools/TryCastUDF.java b/core/src/main/java/hivemall/tools/TryCastUDF.java index a0f3257d7..adb8328c8 100644 --- a/core/src/main/java/hivemall/tools/TryCastUDF.java +++ b/core/src/main/java/hivemall/tools/TryCastUDF.java @@ -35,8 +35,8 @@ @Description(name = "try_cast", value = "_FUNC_(ANY src, const string typeName)" + " - Explicitly cast a value as a type. 
Returns null if cast fails.", - extended = "Usage: select try_cast(array(1.0,2.0,3.0), 'array')\n" - + " select try_cast(map('A',10,'B',20,'C',30), 'map')") + extended = "SELECT try_cast(array(1.0,2.0,3.0), 'array')\n" + + "SELECT try_cast(map('A',10,'B',20,'C',30), 'map')") @UDFType(deterministic = true, stateful = false) public final class TryCastUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java index 62e3e3660..bdaa30aba 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java @@ -38,7 +38,7 @@ @Description(name = "array_concat", value = "_FUNC_(array x1, array x2, ..) - Returns a concatenated array", - extended = "select array_concat(array(1),array(2,3));\n" + "> [1,2,3]") + extended = "SELECT array_concat(array(1),array(2,3));\n" + " [1,2,3]") @UDFType(deterministic = true, stateful = false) public class ArrayConcatUDF extends GenericUDF { /** diff --git a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java index 909176a17..fb99f6d3a 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java @@ -41,7 +41,7 @@ @Description(name = "array_intersect", value = "_FUNC_(array x1, array x2, ..) 
- Returns an intersect of given arrays", - extended = "select array_intersect(array(1,3,4),array(2,3,4),array(3,5));\n" + "> [3]") + extended = "SELECT array_intersect(array(1,3,4),array(2,3,4),array(3,5));\n" + " [3]") @UDFType(deterministic = true, stateful = false) public final class ArrayIntersectUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java index 523093bd3..207c398f6 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java @@ -29,8 +29,8 @@ @Description(name = "array_remove", value = "_FUNC_(array original, int|text|array target)" + " - Returns an array that the target is removed " + "from the original array", - extended = "select array_remove(array(1,null,3),array(null));\n" + "> [3]\n" + "\n" - + "select array_remove(array(\"aaa\",\"bbb\"),\"bbb\");\n" + "> [\"aaa\"]") + extended = "SELECT array_remove(array(1,null,3),array(null));\n" + " [3]\n" + "\n" + + "SELECT array_remove(array(\"aaa\",\"bbb\"),\"bbb\");\n" + " [\"aaa\"]") @UDFType(deterministic = true, stateful = false) public class ArrayRemoveUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java index 2bc98b9ef..e842df6ef 100644 --- a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java @@ -39,9 +39,48 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +// @formatter:off @Description(name = "array_slice", value = "_FUNC_(array values, int offset [, int length]) - Slices the given array by the given offset and length parameters.", - extended = "select array_slice(array(1,2,3,4,5,6), 2,4);\n" + "> [3,4]") + extended = 
"SELECT \n" + + " array_slice(array(1,2,3,4,5,6), 2,4),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 0, -- offset\n" + + " 2 -- length\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 6, -- offset\n" + + " 3 -- length\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 6, -- offset\n" + + " 10 -- length\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 6 -- offset\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " -3 -- offset\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " -3, -- offset\n" + + " 2 -- length\n" + + " );\n" + + "\n" + + " [3,4]\n" + + " [\"zero\",\"one\"] \n" + + " [\"six\",\"seven\",\"eight\"]\n" + + " [\"six\",\"seven\",\"eight\",\"nine\",\"ten\"]\n" + + " [\"six\",\"seven\",\"eight\",\"nine\",\"ten\"]\n" + + " [\"eight\",\"nine\",\"ten\"]\n" + + " [\"eight\",\"nine\"]") +// @formatter:on @UDFType(deterministic = true, stateful = false) public final class ArraySliceUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java index a73a06f78..89862b93c 100644 --- a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java +++ b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java @@ -55,9 +55,26 @@ * table_to_scan_once * */ +// 
@formatter:off @Description(name = "conditional_emit", value = "_FUNC_(array conditions, array features)" - + " - Emit features of a row according to various conditions") + + " - Emit features of a row according to various conditions", + extended = "WITH input as (\n" + + " select array(true, false, true) as conditions, array(\"one\", \"two\", \"three\") as features\n" + + " UNION ALL\n" + + " select array(true, true, false), array(\"four\", \"five\", \"six\")\n" + + ")\n" + + "SELECT\n" + + " conditional_emit(\n" + + " conditions, features\n" + + " )\n" + + "FROM \n" + + " input;\n" + + " one\n" + + " three\n" + + " four\n" + + " five") +// @formatter:on @UDFType(deterministic = true, stateful = false) public final class ConditionalEmitUDTF extends GenericUDTF { diff --git a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java index e9c1cc52e..d9f265eac 100644 --- a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java +++ b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java @@ -31,7 +31,7 @@ @Description(name = "sort_and_uniq_array", value = "_FUNC_(array) - Takes array and " + "returns a sorted array with duplicate elements eliminated", - extended = "select sort_and_uniq_array(array(3,1,1,-2,10));\n" + "> [-2,1,3,10]") + extended = "SELECT sort_and_uniq_array(array(3,1,1,-2,10));\n" + " [-2,1,3,10]") @UDFType(deterministic = true, stateful = false) public class SortAndUniqArrayUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java index 901811755..36f23b50f 100644 --- a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java +++ b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java @@ -29,7 +29,7 @@ @Description(name = "subarray_endwith", value = "_FUNC_(array original, int|text key)" + " - Returns an array that ends with the specified 
key", - extended = "select subarray_endwith(array(1,2,3,4), 3);\n" + "> [1,2,3]") + extended = "SELECT subarray_endwith(array(1,2,3,4), 3);\n" + " [1,2,3]") @UDFType(deterministic = true, stateful = false) public class SubarrayEndWithUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java index ae0d4fbf6..95ae6cf0c 100644 --- a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java +++ b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java @@ -29,7 +29,7 @@ @Description(name = "subarray_startwith", value = "_FUNC_(array original, int|text key)" + " - Returns an array that starts with the specified key", - extended = "select subarray_startwith(array(1,2,3,4), 2);\n" + "> [2,3,4]") + extended = "SELECT subarray_startwith(array(1,2,3,4), 2);\n" + " [2,3,4]") @UDFType(deterministic = true, stateful = false) public class SubarrayStartWithUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java index d5497b6ce..40f7b5c3d 100644 --- a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java +++ b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java @@ -38,8 +38,8 @@ @Description(name = "bits_or", value = "_FUNC_(array b1, array b2, ..) 
- Returns a logical OR given bitsets", - extended = "select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));\n" - + "> [1,2,3,4]") + extended = "SELECT unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));\n" + + " [1,2,3,4]") public final class BitsORUDF extends GenericUDF { private ListObjectInspector[] _listOIs; diff --git a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java index 599d90ad3..c33bd3172 100644 --- a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java +++ b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java @@ -40,7 +40,7 @@ @Description(name = "to_bits", value = "_FUNC_(int[] indexes) - Returns an bitset representation if the given indexes in long[]", - extended = "select to_bits(array(1,2,3,128));\n" + "> [14,-9223372036854775808]") + extended = "SELECT to_bits(array(1,2,3,128));\n" + " [14,-9223372036854775808]") @UDFType(deterministic = true, stateful = false) public final class ToBitsUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java index e3f6bae6f..0ec37629f 100644 --- a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java +++ b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java @@ -40,7 +40,7 @@ @Description(name = "unbits", value = "_FUNC_(long[] bitset) - Returns an long array of the give bitset representation", - extended = "select unbits(to_bits(array(1,4,2,3)));\n" + "> [1,2,3,4]") + extended = "SELECT unbits(to_bits(array(1,4,2,3)));\n" + " [1,2,3,4]") @UDFType(deterministic = true, stateful = false) public final class UnBitsUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java index 4c82387e6..410ffd617 100644 --- a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java +++ b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java @@ -41,7 +41,7 @@ 
@Description(name = "deflate", value = "_FUNC_(TEXT data [, const int compressionLevel]) - " + "Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9]", - extended = "select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + "> AA+=kaIM|WTt!+wbGAA") + extended = "SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + " AA+=kaIM|WTt!+wbGAA") @UDFType(deterministic = true, stateful = false) public final class DeflateUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/compress/InflateUDF.java b/core/src/main/java/hivemall/tools/compress/InflateUDF.java index e2c4cc287..883d43238 100644 --- a/core/src/main/java/hivemall/tools/compress/InflateUDF.java +++ b/core/src/main/java/hivemall/tools/compress/InflateUDF.java @@ -40,8 +40,8 @@ @Description(name = "inflate", value = "_FUNC_(BINARY compressedData) - Returns a decompressed STRING by using Inflater", - extended = "select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" - + "> aaaaaaaaaaaaaaaabbbbccc") + extended = "SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" + + " aaaaaaaaaaaaaaaabbbbccc") @UDFType(deterministic = true, stateful = false) public final class InflateUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java index 4ceec7768..823584246 100644 --- a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java +++ b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java @@ -29,11 +29,19 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; +//@formatter:off @Description(name = "sessionize", value = "_FUNC_(long timeInSec, long thresholdInSec [, String subject])" + "- Returns a UUID string of a session.", - extended = "SELECT sessionize(time, 3600, ip_addr) as session_id, time, ip_addr FROM (\n" - + "SELECT time, ipaddr FROM weblog DISTRIBUTE BY ip_addr, time SORT 
BY ip_addr, time DESC\n) t1") + extended = "SELECT \n" + + " sessionize(time, 3600, ip_addr) as session_id, \n" + + " time, ip_addr\n" + + "FROM (\n" + + " SELECT time, ipaddr \n" + + " FROM weblog \n" + + " DISTRIBUTE BY ip_addr, time SORT BY ip_addr, time DESC\n" + + ") t1") +//@formatter:on @UDFType(deterministic = false, stateful = true) public final class SessionizeUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java index 36c29cc8a..76071e9f7 100644 --- a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java @@ -43,9 +43,45 @@ import org.apache.hadoop.io.Text; import org.apache.hive.hcatalog.data.HCatRecordObjectInspectorFactory; +// @formatter:off @Description(name = "from_json", value = "_FUNC_(string jsonString, const string returnTypes [, const array|const string columnNames])" - + " - Return Hive object.") + + " - Return Hive object.", + extended = "SELECT\n" + + " from_json(\n" + + " '{ \"person\" : { \"name\" : \"makoto\" , \"age\" : 37 } }',\n" + + " 'struct', \n" + + " array('person')\n" + + " ),\n" + + " from_json(\n" + + " '[0.1,1.1,2.2]',\n" + + " 'array'\n" + + " ),\n" + + " from_json(to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " )\n" + + " ),'array>'),\n" + + " from_json(to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " ),\n" + + " array('city')\n" + + " ), 'array>'),\n" + + " from_json(to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " )\n" + + " ),'array>');\n\n" + + " {\"name\":\"makoto\",\"age\":37}\n" + + " [0.1,1.1,2.2]\n" + + " 
[{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]\n" + + " [{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]\n" + + " [{\"city\":\"tokyo\"},{\"city\":\"osaka\"}]") +//@formatter:on @UDFType(deterministic = true, stateful = false) public final class FromJsonUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java index 54e447abc..8ba2d1b06 100644 --- a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java @@ -95,21 +95,21 @@ " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + " ),\n" + " array('city')\n" + - " );\n" + - "> {\"name\":\"John\",\"age\":31} " - + "{\"name\":\"John\",\"age\":31} " - + "{\"Name\":\"John\",\"age\":31} " - + "{\"name\":\"John\",\"age\":31} " - + "{\"age\":31} " - + "{}" - + "NULL " - + "{\"ti\":\"123\",\"si\":\"456\",\"i\":789,\"bi\":[314,7]} " - + "{\"ti\":\"123\",\"si\":\"456\",\"i\":789,\"bi\":[314,7]} " - + "{\"col1\":\"123\",\"col2\":\"456\",\"col3\":789,\"col4\":[314,7]} " - + "{\"country\":\"japan\",\"city\":\"tokyo\"} " - + "{\"city\":\"tokyo\"} " - + "[{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}] " - + "[{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]") + " );\n\n" + + " {\"name\":\"John\",\"age\":31}\n" + + " {\"name\":\"John\",\"age\":31}\n" + + " {\"Name\":\"John\",\"age\":31}\n" + + " {\"name\":\"John\",\"age\":31}\n" + + " {\"age\":31}\n" + + " {}\n" + + " NULL\n" + + " {\"ti\":\"123\",\"si\":\"456\",\"i\":789,\"bi\":[314,7]}\n" + + " {\"ti\":\"123\",\"si\":\"456\",\"i\":789,\"bi\":[314,7]}\n" + + " {\"col1\":\"123\",\"col2\":\"456\",\"col3\":789,\"col4\":[314,7]}\n" + + " {\"country\":\"japan\",\"city\":\"tokyo\"}\n" + + " {\"city\":\"tokyo\"}\n" + + " 
[{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]\n" + + " [{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]") // @formatter:on @UDFType(deterministic = true, stateful = false) public final class ToJsonUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java b/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java index 5ef6ddbfa..83adb0f5f 100644 --- a/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java +++ b/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java @@ -66,15 +66,16 @@ /** * Return list of values sorted by value itself or specific key. */ +//@formatter:off @Description(name = "to_ordered_list", value = "_FUNC_(PRIMITIVE value [, PRIMITIVE key, const string options])" + " - Return list of values sorted by value itself or specific key", - extended = "with t as (\n" + " select 5 as key, 'apple' as value\n" + " union all\n" - + " select 3 as key, 'banana' as value\n" + " union all\n" - + " select 4 as key, 'candy' as value\n" + " union all\n" - + " select 2 as key, 'donut' as value\n" + " union all\n" - + " select 3 as key, 'egg' as value\n" + ")\n" - + "select -- expected output\n" + extended = "WITH t as (\n" + " SELECT 5 as key, 'apple' as value\n" + " UNION ALL\n" + + " SELECT 3 as key, 'banana' as value\n" + " UNION ALL\n" + + " SELECT 4 as key, 'candy' as value\n" + " UNION ALL\n" + + " SELECT 2 as key, 'donut' as value\n" + " UNION ALL\n" + + " SELECT 3 as key, 'egg' as value\n" + ")\n" + + "SELECT -- expected output\n" + " to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order)\n" + " to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k)\n" + " to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), dunut]\n" @@ -86,7 +87,8 @@ + " to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically)\n" + " 
to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys)\n" + " to_ordered_list(key) -- [2, 3, 3, 4, 5] (natural ordered keys)\n" - + "from\n" + " t") + + "FROM\n" + " t") +//@formatter:on public final class UDAFToOrderedList extends AbstractGenericUDAFResolver { @Override diff --git a/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java index a661b3416..e4a251636 100644 --- a/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java +++ b/core/src/main/java/hivemall/tools/map/MergeMapsUDAF.java @@ -39,11 +39,18 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +//@formatter:off @Description(name = "merge_maps", value = "_FUNC_(x) - Returns a map which contains the union of an aggregation of maps." + " Note that an existing value of a key can be replaced with the other duplicate key entry.", - extended = "SELECT merge_maps(m) FROM ( " - + "SELECT map('A',10,'B',20,'C',30) UNION ALL SELECT map('A',10,'B',20,'C',30)) t") + extended = "SELECT \n" + + " merge_maps(m) \n" + + "FROM (\n" + + " SELECT map('A',10,'B',20,'C',30) \n" + + " UNION ALL \n" + + " SELECT map('A',10,'B',20,'C',30)\n" + + ") t") +//@formatter:on public final class MergeMapsUDAF extends AbstractGenericUDAFResolver { @Override diff --git a/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java b/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java index 3c92e30f0..95c97dc14 100644 --- a/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java +++ b/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java @@ -28,8 +28,8 @@ import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.io.LongWritable; -@Description(name = "rownum", value = "_FUNC_() - Returns a generated row number in long", - extended = "returns sprintf(`%d%04d`,sequence,taskId) as long") +@Description(name = "rownum", value = "_FUNC_() - Returns a generated row 
number `sprintf(`%d%04d`,sequence,taskId)` in long", + extended = "SELECT rownum() as rownum, xxx from ...") @UDFType(deterministic = false, stateful = true) public final class RowNumberUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/text/Base91UDF.java b/core/src/main/java/hivemall/tools/text/Base91UDF.java index 6f525997f..73f365bc2 100644 --- a/core/src/main/java/hivemall/tools/text/Base91UDF.java +++ b/core/src/main/java/hivemall/tools/text/Base91UDF.java @@ -40,7 +40,7 @@ @Description(name = "base91", value = "_FUNC_(BINARY bin) - Convert the argument from binary to a BASE91 string", - extended = "select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + "> AA+=kaIM|WTt!+wbGAA") + extended = "SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + " AA+=kaIM|WTt!+wbGAA") @UDFType(deterministic = true, stateful = false) public final class Base91UDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java b/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java index aefb4e23b..9a7b54746 100644 --- a/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java +++ b/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java @@ -29,8 +29,8 @@ @Description(name = "normalize_unicode", value = "_FUNC_(string str [, string form]) - Transforms `str` with the specified normalization form. 
" + "The `form` takes one of NFC (default), NFD, NFKC, or NFKD", - extended = "select normalize_unicode('ハンカクカナ','NFKC');\n" + "> ハンカクカナ\n" + "\n" - + "select normalize_unicode('㈱㌧㌦Ⅲ','NFKC');\n" + "> (株)トンドルIII") + extended = "SELECT normalize_unicode('ハンカクカナ','NFKC');\n" + " ハンカクカナ\n" + "\n" + + "SELECT normalize_unicode('㈱㌧㌦Ⅲ','NFKC');\n" + " (株)トンドルIII") @UDFType(deterministic = true, stateful = false) public final class NormalizeUnicodeUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java index 73d2d6329..3b1682876 100644 --- a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java +++ b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java @@ -40,7 +40,7 @@ // https://github.com/clips/pattern/blob/3eef00481a4555331cf9a099308910d977f6fc22/pattern/text/en/inflect.py#L445-L623 @Description(name = "singularize", value = "_FUNC_(string word) - Returns singular form of a given English word", - extended = "select singularize(lower(\"Apples\"));\n" + "\n" + "> \"apple\"") + extended = "SELECT singularize(lower(\"Apples\"));\n" + "\n" + " \"apple\"") @UDFType(deterministic = true, stateful = false) public final class SingularizeUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/text/Unbase91UDF.java b/core/src/main/java/hivemall/tools/text/Unbase91UDF.java index a96b3bf19..9c277cea9 100644 --- a/core/src/main/java/hivemall/tools/text/Unbase91UDF.java +++ b/core/src/main/java/hivemall/tools/text/Unbase91UDF.java @@ -39,8 +39,8 @@ import org.apache.hadoop.io.Text; @Description(name = "unbase91", value = "_FUNC_(string) - Convert a BASE91 string to a binary", - extended = "select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" - + "> aaaaaaaaaaaaaaaabbbbccc") + extended = "SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" + + " aaaaaaaaaaaaaaaabbbbccc") @UDFType(deterministic = true, stateful = false) public 
final class Unbase91UDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java index fa8308b5f..9b3658f55 100644 --- a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java +++ b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java @@ -37,8 +37,8 @@ @Description(name = "word_ngrams", value = "_FUNC_(array words, int minSize, int maxSize])" + " - Returns list of n-grams for given words, where `minSize <= n <= maxSize`", - extended = "select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);\n" + "\n" - + "> [\"machine\",\"machine learning\",\"learning\",\"learning is\",\"is\",\"is fun\",\"fun\"]") + extended = "SELECT word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);\n" + "\n" + + " [\"machine\",\"machine learning\",\"learning\",\"learning is\",\"is\",\"is fun\",\"fun\"]") @UDFType(deterministic = true, stateful = false) public final class WordNgramsUDF extends UDF { diff --git a/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java b/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java index e6c634cde..b5267a10a 100644 --- a/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java +++ b/core/src/main/java/hivemall/tools/timeseries/MovingAverageUDTF.java @@ -37,18 +37,19 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.io.Writable; +// @formatter:off @Description(name = "moving_avg", value = "_FUNC_(NUMBER value, const int windowSize)" + " - Returns moving average of a time series using a given window", - extended = "SELECT moving_avg(x, 3) FROM (SELECT explode(array(1.0,2.0,3.0,4.0,5.0,6.0,7.0)) as x) series;\n" - + "> avg\n" + - "> 1.0\n" + - "> 1.5\n" + - "> 2.0\n" + - "> 3.0\n" + - "> 4.0\n" + - "> 5.0\n" + - "> 6.0") + extended = "SELECT moving_avg(x, 3) FROM (SELECT explode(array(1.0,2.0,3.0,4.0,5.0,6.0,7.0)) as x) 
series;\n" + + " 1.0\n" + + " 1.5\n" + + " 2.0\n" + + " 3.0\n" + + " 4.0\n" + + " 5.0\n" + + " 6.0") +// @formatter:on @UDFType(deterministic = false, stateful = true) public final class MovingAverageUDTF extends GenericUDTF { diff --git a/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java b/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java index 9c6cea094..007cfcf87 100644 --- a/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java +++ b/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java @@ -42,8 +42,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; @Description(name = "vector_dot", - value = "_FUNC_(array x, array y) - Performs vector dot product.", - extended = "_FUNC_(array x, NUMBER y) - Performs vector multiplication") + value = "_FUNC_(array x, array y) - Performs vector dot product.") @UDFType(deterministic = true, stateful = false) public final class VectorDotUDF extends GenericUDF { diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md index 00d7bba1d..f98c82797 100644 --- a/docs/gitbook/misc/funcs.md +++ b/docs/gitbook/misc/funcs.md @@ -446,7 +446,7 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi - `hivemall_version()` - Returns the version of Hivemall ```sql - Usage: SELECT hivemall_version(); + SELECT hivemall_version(); ``` - `lr_datagen(options string)` - Generates a logistic regression dataset diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index d33ab213a..c695bcdcd 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -29,31 +29,67 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis - `array_concat(array x1, array x2, ..)` - Returns a concatenated array ```sql - select array_concat(array(1),array(2,3)); - > [1,2,3] + SELECT array_concat(array(1),array(2,3)); + [1,2,3] ``` - `array_flatten(array>)` - Returns an array with the elements flattened. - `array_intersect(array x1, array x2, ..)` - Returns an intersect of given arrays ```sql - select array_intersect(array(1,3,4),array(2,3,4),array(3,5)); - > [3] + SELECT array_intersect(array(1,3,4),array(2,3,4),array(3,5)); + [3] ``` - `array_remove(array original, int|text|array target)` - Returns an array that the target is removed from the original array ```sql - select array_remove(array(1,null,3),array(null)); - > [3] + SELECT array_remove(array(1,null,3),array(null)); + [3] - select array_remove(array("aaa","bbb"),"bbb"); - > ["aaa"] + SELECT array_remove(array("aaa","bbb"),"bbb"); + ["aaa"] ``` - `array_slice(array values, int offset [, int length])` - Slices the given array by the given offset and length parameters. 
```sql - select array_slice(array(1,2,3,4,5,6), 2,4); - > [3,4] + SELECT + array_slice(array(1,2,3,4,5,6), 2,4), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 0, -- offset + 2 -- length + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 6, -- offset + 3 -- length + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 6, -- offset + 10 -- length + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 6 -- offset + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + -3 -- offset + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + -3, -- offset + 2 -- length + ); + + [3,4] + ["zero","one"] + ["six","seven","eight"] + ["six","seven","eight","nine","ten"] + ["six","seven","eight","nine","ten"] + ["eight","nine","ten"] + ["eight","nine"] ``` - `array_sum(array)` - Returns an array<double> in which each element is summed up @@ -61,6 +97,23 @@ This page describes a list of useful Hivemall generic functions. See also a [lis - `array_union(array1, array2, ...)` - Returns the union of a set of arrays - `conditional_emit(array conditions, array features)` - Emit features of a row according to various conditions + ```sql + WITH input as ( + select array(true, false, true) as conditions, array("one", "two", "three") as features + UNION ALL + select array(true, true, false), array("four", "five", "six") + ) + SELECT + conditional_emit( + conditions, features + ) + FROM + input; + one + three + four + five + ``` - `element_at(array list, int pos)` - Returns an element at the given position @@ -74,38 +127,38 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis - `sort_and_uniq_array(array)` - Takes array<int> and returns a sorted array with duplicate elements eliminated ```sql - select sort_and_uniq_array(array(3,1,1,-2,10)); - > [-2,1,3,10] + SELECT sort_and_uniq_array(array(3,1,1,-2,10)); + [-2,1,3,10] ``` - `subarray_endwith(array original, int|text key)` - Returns an array that ends with the specified key ```sql - select subarray_endwith(array(1,2,3,4), 3); - > [1,2,3] + SELECT subarray_endwith(array(1,2,3,4), 3); + [1,2,3] ``` - `subarray_startwith(array original, int|text key)` - Returns an array that starts with the specified key ```sql - select subarray_startwith(array(1,2,3,4), 2); - > [2,3,4] + SELECT subarray_startwith(array(1,2,3,4), 2); + [2,3,4] ``` - `to_string_array(array)` - Returns an array of strings - `to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])` - Return list of values sorted by value itself or specific key ```sql - with t as ( - select 5 as key, 'apple' as value - union all - select 3 as key, 'banana' as value - union all - select 4 as key, 'candy' as value - union all - select 2 as key, 'donut' as value - union all - select 3 as key, 'egg' as value + WITH t as ( + SELECT 5 as key, 'apple' as value + UNION ALL + SELECT 3 as key, 'banana' as value + UNION ALL + SELECT 4 as key, 'candy' as value + UNION ALL + SELECT 2 as key, 'donut' as value + UNION ALL + SELECT 3 as key, 'egg' as value ) - select -- expected output + SELECT -- expected output to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order) to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k) to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), dunut] @@ -117,16 +170,195 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically) to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys) to_ordered_list(key) -- [2, 3, 3, 4, 5] (natural ordered keys) - from + FROM t ``` +# Bitset + +- `bits_collect(int|long x)` - Returns a bitset in array<long> + +- `bits_or(array b1, array b2, ..)` - Returns a logical OR given bitsets + ```sql + SELECT unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); + [1,2,3,4] + ``` + +- `to_bits(int[] indexes)` - Returns an bitset representation if the given indexes in long[] + ```sql + SELECT to_bits(array(1,2,3,128)); + [14,-9223372036854775808] + ``` + +- `unbits(long[] bitset)` - Returns an long array of the give bitset representation + ```sql + SELECT unbits(to_bits(array(1,4,2,3))); + [1,2,3,4] + ``` + +# Compression + +- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9] + ```sql + SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); + AA+=kaIM|WTt!+wbGAA + ``` + +- `inflate(BINARY compressedData)` - Returns a decompressed STRING by using Inflater + ```sql + SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); + aaaaaaaaaaaaaaaabbbbccc + ``` + +# Datetime + +- `sessionize(long timeInSec, long thresholdInSec [, String subject])`- Returns a UUID string of a session. + ```sql + SELECT + sessionize(time, 3600, ip_addr) as session_id, + time, ip_addr + FROM ( + SELECT time, ipaddr + FROM weblog + DISTRIBUTE BY ip_addr, time SORT BY ip_addr, time DESC + ) t1 + ``` + +# JSON + +- `from_json(string jsonString, const string returnTypes [, const array|const string columnNames])` - Return Hive object. 
+ ```sql + SELECT + from_json( + '{ "person" : { "name" : "makoto" , "age" : 37 } }', + 'struct', + array('person') + ), + from_json( + '[0.1,1.1,2.2]', + 'array' + ), + from_json(to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ) + ),'array>'), + from_json(to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ), + array('city') + ), 'array>'), + from_json(to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ) + ),'array>'); + + {"name":"makoto","age":37} + [0.1,1.1,2.2] + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + [{"city":"tokyo"},{"city":"osaka"}] + ``` + +- `to_json(ANY object [, const array|const string columnNames])` - Returns Json string + ```sql + SELECT + NAMED_STRUCT("Name", "John", "age", 31), + to_json( + NAMED_STRUCT("Name", "John", "age", 31) + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array('Name', 'age') + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array('name', 'age') + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array('age') + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array() + ), + to_json( + null, + array() + ), + to_json( + struct("123", "456", 789, array(314,007)), + array('ti','si','i','bi') + ), + to_json( + struct("123", "456", 789, array(314,007)), + 'ti,si,i,bi' + ), + to_json( + struct("123", "456", 789, array(314,007)) + ), + to_json( + NAMED_STRUCT("country", "japan", "city", "tokyo") + ), + to_json( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + array('city') + ), + to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ) + ), + to_json( + ARRAY( + NAMED_STRUCT("country", 
"japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ), + array('city') + ); + + {"name":"John","age":31} + {"name":"John","age":31} + {"Name":"John","age":31} + {"name":"John","age":31} + {"age":31} + {} + NULL + {"ti":"123","si":"456","i":789,"bi":[314,7]} + {"ti":"123","si":"456","i":789,"bi":[314,7]} + {"col1":"123","col2":"456","col3":789,"col4":[314,7]} + {"country":"japan","city":"tokyo"} + {"city":"tokyo"} + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + ``` + # Map - `map_get_sum(map src, array keys)` - Returns sum of values that are retrieved by keys +- `map_key_values(map)` - Returns a array of key-value pairs. + - `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted array of SRC +- `merge_maps(x)` - Returns a map which contains the union of an aggregation of maps. Note that an existing value of a key can be replaced with the other duplicate key entry. + ```sql + SELECT + merge_maps(m) + FROM ( + SELECT map('A',10,'B',20,'C',30) + UNION ALL + SELECT map('A',10,'B',20,'C',30) + ) t + ``` + - `to_map(key, value)` - Convert two aggregated columns into a key-value map - `to_ordered_map(key, value [, const int k|const boolean reverseOrder=false])` - Convert two aggregated columns into an ordered key-value map @@ -152,42 +384,6 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis from t ``` -# Bitset - -- `bits_collect(int|long x)` - Returns a bitset in array<long> - -- `bits_or(array b1, array b2, ..)` - Returns a logical OR given bitsets - ```sql - select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); - > [1,2,3,4] - ``` - -- `to_bits(int[] indexes)` - Returns an bitset representation if the given indexes in long[] - ```sql - select to_bits(array(1,2,3,128)); - > [14,-9223372036854775808] - ``` - -- `unbits(long[] bitset)` - Returns an long array of the give bitset representation - ```sql - select unbits(to_bits(array(1,4,2,3))); - > [1,2,3,4] - ``` - -# Compression - -- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9] - ```sql - select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); - > AA+=kaIM|WTt!+wbGAA - ``` - -- `inflate(BINARY compressedData)` - Returns a decompressed STRING by using Inflater - ```sql - select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); - > aaaaaaaaaaaaaaaabbbbccc - ``` - # MapReduce - `distcache_gets(filepath, key, default_value [, parseKey])` - Returns map<key_type, value_type>|value_type @@ -198,9 +394,9 @@ This page describes a list of useful Hivemall generic functions. See also a [lis - `rowid()` - Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER} -- `rownum()` - Returns a generated row number in long - ``` - returns sprintf(`%d%04d`,sequence,taskId) as long +- `rownum()` - Returns a generated row number `sprintf(`%d%04d`,sequence,taskId)` in long + ```sql + SELECT rownum() as rownum, xxx from ... ``` - `taskid()` - Returns the value of mapred.task.partition @@ -215,74 +411,126 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis - `transpose_and_dot(array matrix0_row, array matrix1_row)` - Returns dot(matrix0.T, matrix1) as array<array<double>>, shape = (matrix0.#cols, matrix1.#cols) +# Sanity Checks + +- `assert(boolean condition)` or _FUNC_(boolean condition, string errMsg)- Throws HiveException if condition is not met + +- `raise_error()` or _FUNC_(string msg) - Throws an error + # Text processing - `base91(BINARY bin)` - Convert the argument from binary to a BASE91 string ```sql - select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); - > AA+=kaIM|WTt!+wbGAA + SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); + AA+=kaIM|WTt!+wbGAA ``` - `is_stopword(string word)` - Returns whether English stopword or not - `normalize_unicode(string str [, string form])` - Transforms `str` with the specified normalization form. The `form` takes one of NFC (default), NFD, NFKC, or NFKD ```sql - select normalize_unicode('ハンカクカナ','NFKC'); - > ハンカクカナ + SELECT normalize_unicode('ハンカクカナ','NFKC'); + ハンカクカナ - select normalize_unicode('㈱㌧㌦Ⅲ','NFKC'); - > (株)トンドルIII + SELECT normalize_unicode('㈱㌧㌦Ⅲ','NFKC'); + (株)トンドルIII ``` - `singularize(string word)` - Returns singular form of a given English word ```sql - select singularize(lower("Apples")); + SELECT singularize(lower("Apples")); - > "apple" + "apple" ``` -- `split_words(string query [, string regex])` - Returns an array<text> containing split strings +- `split_words(string query [, string regex])` - Returns an array<text> containing splitted strings - `tokenize(string englishText [, boolean toLowerCase])` - Returns tokenized words in array<string> - `unbase91(string)` - Convert a BASE91 string to a binary ```sql - select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); - > aaaaaaaaaaaaaaaabbbbccc + SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); + aaaaaaaaaaaaaaaabbbbccc ``` - `word_ngrams(array words, int minSize, int maxSize])` - Returns list of n-grams for given words, where `minSize <= n <= maxSize` ```sql 
- select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); + SELECT word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); + + ["machine","machine learning","learning","learning is","is","is fun","fun"] + ``` + +# Timeseries - > ["machine","machine learning","learning","learning is","is","is fun","fun"] +- `moving_avg(NUMBER value, const int windowSize)` - Returns moving average of a time series using a given window + ```sql + SELECT moving_avg(x, 3) FROM (SELECT explode(array(1.0,2.0,3.0,4.0,5.0,6.0,7.0)) as x) series; + 1.0 + 1.5 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0 ``` +# Vector + +- `vector_add(array x, array y)` - Perform vector ADD operation. + +- `vector_dot(array x, array y)` - Performs vector dot product. + # Others - `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1 - `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values (or tail-K values when k is less than 0) -- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end. A similar function to PostgreSQL's `generate_serics`. http://www.postgresql.org/docs/current/static/functions-srf.html +- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end. 
A similar function to PostgreSQL's [generate_serics](http://www.postgresql.org/docs/current/static/functions-srf.html) ```sql - select generate_series(1,9); - - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 + SELECT generate_series(2,4); + + 2 + 3 + 4 + + SELECT generate_series(5,1,-2); + + 5 + 3 + 1 + + SELECT generate_series(4,3); + + > (no return) + + SELECT date_add(current_date(),value) as `date`,value from (SELECT generate_series(1,3)) t; + + 2018-04-21 1 + 2018-04-22 2 + 2018-04-23 3 + + WITH input as ( + SELECT 1 as c1, 10 as c2, 3 as step + UNION ALL + SELECT 10, 2, -3 + ) + SELECT generate_series(c1, c2, step) as series + FROM input; + + 1 + 4 + 7 + 10 + 10 + 7 + 4 ``` - `try_cast(ANY src, const string typeName)` - Explicitly cast a value as a type. Returns null if cast fails. ```sql - Usage: select try_cast(array(1.0,2.0,3.0), 'array') - select try_cast(map('A',10,'B',20,'C',30), 'map') + SELECT try_cast(array(1.0,2.0,3.0), 'array') + SELECT try_cast(map('A',10,'B',20,'C',30), 'map') ``` - `x_rank(KEY)` - Generates a pseudo sequence number starting from 1 for each key diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java index 019075116..f8bda4de4 100644 --- a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java +++ b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java @@ -78,17 +78,22 @@ public class FuncsListGenerator extends AbstractMojo { static { genericFuncsHeaders.put("# Array", Arrays.asList("hivemall.tools.array", "hivemall.tools.list")); - genericFuncsHeaders.put("# Map", Collections.singletonList("hivemall.tools.map")); genericFuncsHeaders.put("# Bitset", Collections.singletonList("hivemall.tools.bits")); genericFuncsHeaders.put("# Compression", Collections.singletonList("hivemall.tools.compress")); + genericFuncsHeaders.put("# Datetime", Collections.singletonList("hivemall.tools.datetime")); + 
genericFuncsHeaders.put("# JSON", Collections.singletonList("hivemall.tools.json")); + genericFuncsHeaders.put("# Map", Collections.singletonList("hivemall.tools.map")); genericFuncsHeaders.put("# MapReduce", Collections.singletonList("hivemall.tools.mapred")); genericFuncsHeaders.put("# Math", Collections.singletonList("hivemall.tools.math")); genericFuncsHeaders.put("# Matrix", Collections.singletonList("hivemall.tools.matrix")); + genericFuncsHeaders.put("# Sanity Checks", + Collections.singletonList("hivemall.tools.sanity")); genericFuncsHeaders.put("# Text processing", Collections.singletonList("hivemall.tools.text")); genericFuncsHeaders.put("# Timeseries", Collections.singletonList("hivemall.tools.timeseries")); + genericFuncsHeaders.put("# Vector", Collections.singletonList("hivemall.tools.vector")); genericFuncsHeaders.put("# Others", Collections.singletonList("hivemall.tools")); } From d11735bbbb54f485640b1bc8abfaf273cde8b1b9 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 17 May 2018 19:14:36 +0900 Subject: [PATCH 30/56] Add a script to generate function desc --- bin/update_func_md.sh | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100755 bin/update_func_md.sh diff --git a/bin/update_func_md.sh b/bin/update_func_md.sh new file mode 100755 index 000000000..d24e9bef1 --- /dev/null +++ b/bin/update_func_md.sh @@ -0,0 +1,47 @@ +#!/bin/sh +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +if [ "$HIVEMALL_HOME" = "" ]; then + if [ -e ../bin/${0##*/} ]; then + HIVEMALL_HOME=".." + elif [ -e ./bin/${0##*/} ]; then + HIVEMALL_HOME="." + else + echo "env HIVEMALL_HOME not defined" + exit 1 + fi +fi + +cd $HIVEMALL_HOME +HIVEMALL_HOME=`pwd` + +# Deploy to local Maven repos + +export MAVEN_OPTS=-XX:MaxPermSize=256m +mvn clean install -pl tools/hivemall-docs + +# Generate docs + +mvn org.apache.hivemall:hivemall-docs:generate-funcs-list + +# Run HTTP server on localhost:040 + +cd ${HIVEMALL_HOME}/docs/gitbook +gitbook install && gitbook serve From 6a507d48eb140333a4f80cdfc3d3397f67a565a1 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 21 May 2018 16:40:46 +0900 Subject: [PATCH 31/56] Fixed markdown generation scheme for UDF descriptions --- bin/update_func_md.sh | 4 +- docs/gitbook/misc/generic_funcs.md | 28 ++--- tools/hivemall-docs/pom.xml | 113 ++++++++++++++---- ...rator.java => FuncsListGeneratorMojo.java} | 17 ++- ...jectDependenciesComponentConfigurator.java | 100 ++++++++++++++++ 5 files changed, 219 insertions(+), 43 deletions(-) rename tools/hivemall-docs/src/main/java/hivemall/docs/{FuncsListGenerator.java => FuncsListGeneratorMojo.java} (94%) create mode 100644 tools/hivemall-docs/src/main/java/hivemall/docs/IncludeProjectDependenciesComponentConfigurator.java diff --git a/bin/update_func_md.sh b/bin/update_func_md.sh index d24e9bef1..bb0afda3a 100755 --- a/bin/update_func_md.sh +++ b/bin/update_func_md.sh @@ -35,11 +35,11 @@ HIVEMALL_HOME=`pwd` # Deploy to local Maven repos export MAVEN_OPTS=-XX:MaxPermSize=256m -mvn clean 
install -pl tools/hivemall-docs +mvn clean install -DskipTests=true -Dmaven.test.skip=true -pl '.,core,nlp,xgboost,tools/hivemall-docs' # Generate docs -mvn org.apache.hivemall:hivemall-docs:generate-funcs-list +mvn org.apache.hivemall:hivemall-docs:generate-funcs-list -pl '.,core,nlp,xgboost,tools/hivemall-docs' -X # Run HTTP server on localhost:040 diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index c695bcdcd..d4e7c1128 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -325,19 +325,19 @@ This page describes a list of useful Hivemall generic functions. See also a [lis ); {"name":"John","age":31} - {"name":"John","age":31} - {"Name":"John","age":31} - {"name":"John","age":31} - {"age":31} - {} - NULL - {"ti":"123","si":"456","i":789,"bi":[314,7]} - {"ti":"123","si":"456","i":789,"bi":[314,7]} - {"col1":"123","col2":"456","col3":789,"col4":[314,7]} - {"country":"japan","city":"tokyo"} - {"city":"tokyo"} - [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] - [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + {"name":"John","age":31} + {"Name":"John","age":31} + {"name":"John","age":31} + {"age":31} + {} + NULL + {"ti":"123","si":"456","i":789,"bi":[314,7]} + {"ti":"123","si":"456","i":789,"bi":[314,7]} + {"col1":"123","col2":"456","col3":789,"col4":[314,7]} + {"country":"japan","city":"tokyo"} + {"city":"tokyo"} + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] ``` # Map @@ -504,7 +504,7 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis > (no return) - SELECT date_add(current_date(),value) as `date`,value from (SELECT generate_series(1,3)) t; + SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t; 2018-04-21 1 2018-04-22 2 diff --git a/tools/hivemall-docs/pom.xml b/tools/hivemall-docs/pom.xml index 76994d75e..99cd13857 100644 --- a/tools/hivemall-docs/pom.xml +++ b/tools/hivemall-docs/pom.xml @@ -16,7 +16,9 @@ specific language governing permissions and limitations under the License. --> - + 4.0.0 @@ -56,46 +58,94 @@ provided - + + + org.apache.hive + hive-exec + compile + + + org.reflections + reflections + 0.9.10 + compile + + + org.apache.hivemall hivemall-core ${project.version} - compile + runtime org.apache.hivemall hivemall-nlp ${project.version} - compile + runtime org.apache.hivemall hivemall-xgboost ${project.version} - compile + runtime - - - org.apache.hive - hive-exec - compile - - - com.google.guava - guava - compile - - - org.reflections - reflections - 0.9.10 - compile - + + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.codehaus.plexus + plexus-component-metadata + [1.5.5,) + + generate-metadata + + + + + false + + + + + + + org.apache.maven.plugins + + + maven-plugin-plugin + + + [3.2,) + + + descriptor + + + + + + + + + + + + + org.apache.maven.plugins @@ -105,6 +155,12 @@ default-descriptor process-classes + + mojo-descriptor + + descriptor + + generate-helpmojo @@ -113,6 +169,19 @@ + + org.codehaus.plexus + plexus-component-metadata + 1.7.1 + + + + generate-metadata + + + + + diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java similarity index 94% rename from tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java rename to tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java index f8bda4de4..ba0e07224 100644 --- 
a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java +++ b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java @@ -24,8 +24,6 @@ import static hivemall.docs.utils.MarkdownUtils.indent; import static org.apache.commons.lang.StringEscapeUtils.escapeHtml; -import hivemall.utils.lang.StringUtils; - import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; @@ -49,8 +47,10 @@ import org.apache.maven.execution.MavenSession; import org.apache.maven.plugin.AbstractMojo; import org.apache.maven.plugin.MojoExecutionException; +import org.apache.maven.plugins.annotations.LifecyclePhase; import org.apache.maven.plugins.annotations.Mojo; import org.apache.maven.plugins.annotations.Parameter; +import org.apache.maven.plugins.annotations.ResolutionScope; import org.reflections.Reflections; /** @@ -59,8 +59,10 @@ * @link https://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html * @link https://hivemall.incubator.apache.org/userguide/misc/funcs.html */ -@Mojo(name = "generate-funcs-list") -public class FuncsListGenerator extends AbstractMojo { +@Mojo(name = "generate-funcs-list", defaultPhase = LifecyclePhase.PROCESS_CLASSES, + requiresDependencyResolution = ResolutionScope.COMPILE_PLUS_RUNTIME, + configurator = "include-project-dependencies") +public class FuncsListGeneratorMojo extends AbstractMojo { @Parameter(defaultValue = "${basedir}", readonly = true) private File basedir; @@ -213,7 +215,7 @@ private void generate(@Nonnull File outputFile, @Nonnull String preface, Set List = packages.get(packageName); List.add(sb.toString()); - StringUtils.clear(sb); + sb.setLength(0); } try (PrintWriter writer = new PrintWriter(outputFile)) { @@ -242,6 +244,11 @@ private void generate(@Nonnull File outputFile, @Nonnull String preface, writer.println(e.getKey() + "\n"); List packageNames = e.getValue(); for (String packageName : packageNames) { + if (!packages.containsKey(packageName)) { + 
writer.close(); + throw new MojoExecutionException( + "Failed to find package in the classpath: " + packageName); + } for (String desc : packages.get(packageName)) { writer.println(desc); } diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/IncludeProjectDependenciesComponentConfigurator.java b/tools/hivemall-docs/src/main/java/hivemall/docs/IncludeProjectDependenciesComponentConfigurator.java new file mode 100644 index 000000000..29774f0b3 --- /dev/null +++ b/tools/hivemall-docs/src/main/java/hivemall/docs/IncludeProjectDependenciesComponentConfigurator.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.docs; + +import java.io.File; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import javax.annotation.Nonnull; + +import org.codehaus.classworlds.ClassRealm; +import org.codehaus.plexus.component.annotations.Component; +import org.codehaus.plexus.component.configurator.AbstractComponentConfigurator; +import org.codehaus.plexus.component.configurator.ComponentConfigurationException; +import org.codehaus.plexus.component.configurator.ConfigurationListener; +import org.codehaus.plexus.component.configurator.converters.composite.ObjectWithFieldsConverter; +import org.codehaus.plexus.component.configurator.converters.special.ClassRealmConverter; +import org.codehaus.plexus.component.configurator.expression.ExpressionEvaluationException; +import org.codehaus.plexus.component.configurator.expression.ExpressionEvaluator; +import org.codehaus.plexus.configuration.PlexusConfiguration; + +/** + * A custom ComponentConfigurator which adds the project's runtime classpath elements to the MOJO + * classloader. 
+ */ +@SuppressWarnings("deprecation") +@Component(role = org.codehaus.plexus.component.configurator.ComponentConfigurator.class, + hint = "include-project-dependencies") +public class IncludeProjectDependenciesComponentConfigurator extends AbstractComponentConfigurator { + + public void configureComponent(final Object component, final PlexusConfiguration configuration, + final ExpressionEvaluator expressionEvaluator, final ClassRealm containerRealm, + final ConfigurationListener listener) throws ComponentConfigurationException { + addProjectDependenciesToClassRealm(expressionEvaluator, containerRealm); + + converterLookup.registerConverter(new ClassRealmConverter(containerRealm)); + + ObjectWithFieldsConverter converter = new ObjectWithFieldsConverter(); + + converter.processConfiguration(converterLookup, component, containerRealm.getClassLoader(), + configuration, expressionEvaluator, listener); + } + + @SuppressWarnings("unchecked") + private static void addProjectDependenciesToClassRealm( + final ExpressionEvaluator expressionEvaluator, final ClassRealm containerRealm) + throws ComponentConfigurationException { + final List runtimeClasspathElements; + try { + // noinspection unchecked + runtimeClasspathElements = (List) expressionEvaluator.evaluate( + "${project.runtimeClasspathElements}"); + } catch (ExpressionEvaluationException e) { + throw new ComponentConfigurationException( + "There was a problem evaluating: ${project.runtimeClasspathElements}", e); + } + + // Add the project dependencies to the ClassRealm + final URL[] urls = buildURLs(runtimeClasspathElements); + for (URL url : urls) { + containerRealm.addConstituent(url); + } + } + + @Nonnull + private static URL[] buildURLs(@Nonnull final List runtimeClasspathElements) + throws ComponentConfigurationException { + // Add the projects classes and dependencies + final List urls = new ArrayList<>(runtimeClasspathElements.size()); + for (String element : runtimeClasspathElements) { + try { + URL url = new 
File(element).toURI().toURL(); + urls.add(url); + } catch (MalformedURLException e) { + throw new ComponentConfigurationException( + "Unable to access project dependency: " + element, e); + } + } + return urls.toArray(new URL[urls.size()]); + } + +} From 533140b7a41b5bd0fab03ecb65d484f17fc0bb1d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 21 May 2018 17:00:04 +0900 Subject: [PATCH 32/56] Fixed UDF descriptions --- core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java | 2 +- core/src/main/java/hivemall/tools/json/FromJsonUDF.java | 4 +++- core/src/main/java/hivemall/tools/json/ToJsonUDF.java | 4 +++- docs/gitbook/misc/generic_funcs.md | 6 +++++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java index ab79911fc..ed9a7a23d 100644 --- a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java +++ b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java @@ -59,7 +59,7 @@ "\n" + "SELECT generate_series(4,3);\n" + "\n" + - "> (no return)\n" + + " (no return)\n" + "\n" + "SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t;\n" + "\n" + diff --git a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java index 76071e9f7..97409dd22 100644 --- a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java @@ -75,7 +75,9 @@ " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + " )\n" + - " ),'array>');\n\n" + + " ),'array>');\n" + + "```\n\n" + + "```\n" + " {\"name\":\"makoto\",\"age\":37}\n" + " [0.1,1.1,2.2]\n" + " [{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]\n" + diff --git a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java 
b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java index 8ba2d1b06..95d1af92f 100644 --- a/core/src/main/java/hivemall/tools/json/ToJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/ToJsonUDF.java @@ -95,7 +95,9 @@ " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + " ),\n" + " array('city')\n" + - " );\n\n" + + " );\n" + + "```\n\n" + + "```\n" + " {\"name\":\"John\",\"age\":31}\n" + " {\"name\":\"John\",\"age\":31}\n" + " {\"Name\":\"John\",\"age\":31}\n" diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index d4e7c1128..28bb341c5 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -257,7 +257,9 @@ This page describes a list of useful Hivemall generic functions. See also a [lis NAMED_STRUCT("country", "japan", "city", "osaka") ) ),'array>'); + ``` + ``` {"name":"makoto","age":37} [0.1,1.1,2.2] [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] @@ -323,7 +325,9 @@ This page describes a list of useful Hivemall generic functions. See also a [lis ), array('city') ); + ``` + ``` {"name":"John","age":31} {"name":"John","age":31} {"Name":"John","age":31} @@ -502,7 +506,7 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis SELECT generate_series(4,3); - > (no return) + (no return) SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t; From 00a7e49a5dc90fd4aa6077e175d30ff8d0b2df96 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 24 May 2018 18:52:46 +0900 Subject: [PATCH 33/56] Added map_include_keys and map_exclude_keys UDFs --- .../hivemall/tools/map/MapExcludeKeysUDF.java | 100 +++++++++++++++++ .../hivemall/tools/map/MapIncludeKeysUDF.java | 103 ++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 core/src/main/java/hivemall/tools/map/MapExcludeKeysUDF.java create mode 100644 core/src/main/java/hivemall/tools/map/MapIncludeKeysUDF.java diff --git a/core/src/main/java/hivemall/tools/map/MapExcludeKeysUDF.java b/core/src/main/java/hivemall/tools/map/MapExcludeKeysUDF.java new file mode 100644 index 000000000..6a462108f --- /dev/null +++ b/core/src/main/java/hivemall/tools/map/MapExcludeKeysUDF.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.map; + +import hivemall.utils.hadoop.HiveUtils; + +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; + +@Description(name = "map_exclude_keys", + value = "_FUNC_(Map map, array filteringKeys)" + + " - Returns the filtered entries of a map not having specified keys", + extended = "SELECT map_exclude_keys(map(1,'one',2,'two',3,'three'),array(2,3));\n" + + "{1:\"one\"}") +@UDFType(deterministic = true, stateful = false) +public final class MapExcludeKeysUDF extends GenericUDF { + + private MapObjectInspector mapOI; + private ListObjectInspector listOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { + if (argOIs.length != 2) { + throw new UDFArgumentLengthException( + "Expected two arguments for map_filter_keys: " + argOIs.length); + } + + this.mapOI = HiveUtils.asMapOI(argOIs[0]); + this.listOI = HiveUtils.asListOI(argOIs[1]); + + ObjectInspector mapKeyOI = mapOI.getMapKeyObjectInspector(); + ObjectInspector filterKeyOI = listOI.getListElementObjectInspector(); + + if (!ObjectInspectorUtils.compareTypes(mapKeyOI, filterKeyOI)) { + throw new UDFArgumentException("Element types does not match: mapKey " + + 
mapKeyOI.getTypeName() + ", filterKey" + filterKeyOI.getTypeName()); + } + + return ObjectInspectorUtils.getStandardObjectInspector(mapOI, + ObjectInspectorCopyOption.WRITABLE); + } + + @Override + public Map evaluate(DeferredObject[] arguments) throws HiveException { + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + final Map map = (Map) ObjectInspectorUtils.copyToStandardObject(arg0, mapOI, + ObjectInspectorCopyOption.WRITABLE); + + Object arg1 = arguments[1].get(); + if (arg1 == null) { + return map; + } + + final List filterKeys = (List) ObjectInspectorUtils.copyToStandardObject(arg1, listOI, + ObjectInspectorCopyOption.WRITABLE); + for (Object k : filterKeys) { + map.remove(k); + } + + return map; + } + + @Override + public String getDisplayString(String[] children) { + return "map_exclude_keys(" + StringUtils.join(children, ',') + ")"; + } + +} diff --git a/core/src/main/java/hivemall/tools/map/MapIncludeKeysUDF.java b/core/src/main/java/hivemall/tools/map/MapIncludeKeysUDF.java new file mode 100644 index 000000000..902569fd7 --- /dev/null +++ b/core/src/main/java/hivemall/tools/map/MapIncludeKeysUDF.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.map; + +import hivemall.utils.hadoop.HiveUtils; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; + +@Description(name = "map_include_keys", + value = "_FUNC_(Map map, array filteringKeys)" + + " - Returns the filtered entries of a map having specified keys", + extended = "SELECT map_include_keys(map(1,'one',2,'two',3,'three'),array(2,3));\n" + + "{2:\"two\",3:\"three\"}") +@UDFType(deterministic = true, stateful = false) +public final class MapIncludeKeysUDF extends GenericUDF { + + private MapObjectInspector mapOI; + private ListObjectInspector listOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { + if (argOIs.length != 2) { + throw new UDFArgumentLengthException( + "Expected two arguments for map_filter_keys: " + argOIs.length); + } + + this.mapOI = HiveUtils.asMapOI(argOIs[0]); + this.listOI = HiveUtils.asListOI(argOIs[1]); + + ObjectInspector mapKeyOI = mapOI.getMapKeyObjectInspector(); + ObjectInspector filterKeyOI = listOI.getListElementObjectInspector(); + if (!ObjectInspectorUtils.compareTypes(mapKeyOI, filterKeyOI)) { + throw new UDFArgumentException("Element types does 
not match: mapKey " + + mapKeyOI.getTypeName() + ", filterKey" + filterKeyOI.getTypeName()); + } + + return ObjectInspectorUtils.getStandardObjectInspector(mapOI, + ObjectInspectorCopyOption.WRITABLE); + } + + @Override + public Map evaluate(DeferredObject[] arguments) throws HiveException { + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + final Map map = (Map) ObjectInspectorUtils.copyToStandardObject(arg0, mapOI, + ObjectInspectorCopyOption.WRITABLE); + + Object arg1 = arguments[1].get(); + if (arg1 == null) { + return null; + } + final List filterKeys = (List) ObjectInspectorUtils.copyToStandardObject(arg1, listOI, + ObjectInspectorCopyOption.WRITABLE); + + final Map result = new HashMap<>(); + for (Object k : filterKeys) { + Object v = map.get(k); + if (v != null) { + result.put(k, v); + } + } + return result; + } + + @Override + public String getDisplayString(String[] children) { + return "map_include_keys(" + StringUtils.join(children, ',') + ")"; + } + +} From b7fd6a96269e47d87c20d747e5e0a957c83792fa Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 24 May 2018 18:53:32 +0900 Subject: [PATCH 34/56] Updated DDLs --- .../hivemall/tools/array/ArrayAppendUDF.java | 4 +- .../hivemall/tools/array/ArrayConcatUDF.java | 4 +- .../tools/array/ArrayElementAtUDF.java | 4 +- .../hivemall/tools/array/ArrayFlattenUDF.java | 39 +++++++++---------- .../tools/array/ArrayIntersectUDF.java | 5 +-- .../hivemall/tools/array/ArrayUnionUDF.java | 4 +- .../hivemall/tools/array/FirstElementUDF.java | 4 +- .../hivemall/tools/array/LastElementUDF.java | 3 +- resources/ddl/define-all-as-permanent.hive | 27 +++++++++++++ resources/ddl/define-all.hive | 27 +++++++++++++ resources/ddl/define-all.spark | 27 +++++++++++++ resources/ddl/define-udfs.td.hql | 18 +++++++++ 12 files changed, 136 insertions(+), 30 deletions(-) diff --git a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java index 
c344c01a4..b3768bb34 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java @@ -39,7 +39,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; @Description(name = "array_append", - value = "_FUNC_(array arr, T elem) - Append an element to the end of an array") + value = "_FUNC_(array arr, T elem) - Append an element to the end of an array", + extended = "SELECT array_append(array(1,2),3);\n 1,2,3\n\n" + + "SELECT array_append(array('a','b'),'c');\n \"a\",\"b\",\"c\"") @UDFType(deterministic = true, stateful = false) public final class ArrayAppendUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java index bdaa30aba..7781564ba 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java @@ -95,11 +95,11 @@ public List evaluate(DeferredObject[] arguments) throws HiveException { continue; } - final ListObjectInspector arrayOI = (ListObjectInspector) argumentOIs[i]; + final ListObjectInspector arrayOI = argumentOIs[i]; + final ObjectInspector elemOI = arrayOI.getListElementObjectInspector(); final int arraylength = arrayOI.getListLength(arrayObject); for (int j = 0; j < arraylength; j++) { Object rawObj = arrayOI.getListElement(arrayObject, j); - ObjectInspector elemOI = arrayOI.getListElementObjectInspector(); Object obj = ObjectInspectorUtils.copyToStandardObject(rawObj, elemOI, ObjectInspectorCopyOption.WRITABLE); ret.add(obj); diff --git a/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java b/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java index 631d92d5e..21be8d431 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java @@ -30,7 +30,9 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; @Description(name = "element_at", - value = "_FUNC_(array list, int pos) - Returns an element at the given position") + value = "_FUNC_(array list, int pos) - Returns an element at the given position", + extended = "SELECT element_at(array(1,2,3,4),0);\n 1\n\n" + + "SELECT element_at(array(1,2,3,4),-2);\n 3") @UDFType(deterministic = true, stateful = false) public final class ArrayElementAtUDF extends GenericUDF { private ListObjectInspector listInspector; diff --git a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java index 906d594d3..39d0110d9 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java @@ -19,6 +19,7 @@ package hivemall.tools.array; import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.lang.StringUtils; import java.util.ArrayList; import java.util.List; @@ -33,16 +34,16 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; @Description(name = "array_flatten", - value = "_FUNC_(array>) - Returns an array with the elements flattened.") + value = "_FUNC_(array>) - Returns an array with the elements flattened.", + extended = "SELECT array_flatten(array(array(1,2,3),array(4,5),array(6,7,8)));\n" + + " [1,2,3,4,5,6,7,8]") @UDFType(deterministic = true, stateful = false) public final class ArrayFlattenUDF extends GenericUDF { private ListObjectInspector listOI; - private ListObjectInspector nextedListOI; - private ObjectInspector elemOI; - private final List result = new ArrayList<>(); @Override @@ -58,11 +59,13 @@ public ObjectInspector 
initialize(ObjectInspector[] argOIs) throws UDFArgumentEx throw new UDFArgumentException( "array_flatten takes array of array for the argument: " + listOI.toString()); } - this.nextedListOI = HiveUtils.asListOI(listElemOI); - this.elemOI = nextedListOI.getListElementObjectInspector(); + + ListObjectInspector nestedListOI = HiveUtils.asListOI(listElemOI); + ObjectInspector elemOI = nestedListOI.getListElementObjectInspector(); return ObjectInspectorFactory.getStandardListObjectInspector( - ObjectInspectorUtils.getStandardObjectInspector(elemOI)); + ObjectInspectorUtils.getStandardObjectInspector(elemOI, + ObjectInspectorCopyOption.WRITABLE)); } @Override @@ -81,12 +84,17 @@ public List evaluate(DeferredObject[] args) throws HiveException { continue; } - final int subarrayLength = nextedListOI.getListLength(subarray); + final ListObjectInspector subarrayOI = + HiveUtils.asListOI(listOI.getListElementObjectInspector()); + final ObjectInspector elemOI = subarrayOI.getListElementObjectInspector(); + final int subarrayLength = subarrayOI.getListLength(subarray); for (int j = 0; j < subarrayLength; j++) { - Object elem = nextedListOI.getListElement(subarray, j); - if (elem == null) { + Object rawElem = subarrayOI.getListElement(subarray, j); + if (rawElem == null) { continue; } + Object elem = ObjectInspectorUtils.copyToStandardObject(rawElem, elemOI, + ObjectInspectorCopyOption.WRITABLE); result.add(elem); } } @@ -96,16 +104,7 @@ public List evaluate(DeferredObject[] args) throws HiveException { @Override public String getDisplayString(String[] args) { - final StringBuffer buf = new StringBuffer(); - buf.append("array_flatten("); - for (int i = 0, len = args.length; i < len; i++) { - if (i != 0) { - buf.append(", "); - } - buf.append(args[i]); - } - buf.append(")"); - return buf.toString(); + return "array_flatten(" + StringUtils.join(args, ',') + ")"; } } diff --git a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java 
b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java index fb99f6d3a..4b7f91cab 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java @@ -88,7 +88,7 @@ public List evaluate(@Nonnull DeferredObject[] args) throws HiveExceptio return Collections.emptyList(); } - Set checkSet = new HashSet(); + Set checkSet = new HashSet(); final ListObjectInspector arg0ListOI = argListOIs[0]; final ObjectInspector arg0ElemOI = arg0ListOI.getListElementObjectInspector(); final int arg0size = arg0ListOI.getListLength(arg0); @@ -106,8 +106,7 @@ public List evaluate(@Nonnull DeferredObject[] args) throws HiveExceptio if (argI == null) { continue; } - final Set newSet = - new HashSet(); + final Set newSet = new HashSet(); final ListObjectInspector argIListOI = argListOIs[i]; final ObjectInspector argIElemOI = argIListOI.getListElementObjectInspector(); for (int j = 0, j_size = argIListOI.getListLength(argI); j < j_size; j++) { diff --git a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java index 33aadf8b3..91ae53f1c 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java @@ -46,7 +46,9 @@ * */ @Description(name = "array_union", - value = "_FUNC_(array1, array2, ...) - Returns the union of a set of arrays") + value = "_FUNC_(array1, array2, ...) 
- Returns the union of a set of arrays", + extended = "SELECT array_union(array(1,2),array(1,2));\n" + "[1,2]\n\n" + + "SELECT array_union(array(1,2),array(2,3),array(2,5));\n" + "[1,2,3,5]") @UDFType(deterministic = true, stateful = false) public final class ArrayUnionUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/array/FirstElementUDF.java b/core/src/main/java/hivemall/tools/array/FirstElementUDF.java index 957e724d9..a7085291b 100644 --- a/core/src/main/java/hivemall/tools/array/FirstElementUDF.java +++ b/core/src/main/java/hivemall/tools/array/FirstElementUDF.java @@ -31,7 +31,9 @@ /** * Return the first element in an array. */ -@Description(name = "first_element", value = "_FUNC_(x) - Returns the first element in an array ") +@Description(name = "first_element", value = "_FUNC_(x) - Returns the first element in an array", + extended = "SELECT first_element(array('a','b','c'));\n a\n\n" + + "SELECT first_element(array());\n NULL") @UDFType(deterministic = true, stateful = false) public class FirstElementUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/array/LastElementUDF.java b/core/src/main/java/hivemall/tools/array/LastElementUDF.java index 91050b801..f36a6aa91 100644 --- a/core/src/main/java/hivemall/tools/array/LastElementUDF.java +++ b/core/src/main/java/hivemall/tools/array/LastElementUDF.java @@ -31,7 +31,8 @@ /** * Return the last element in an array. 
*/ -@Description(name = "last_element", value = "_FUNC_(x) - Return the last element in an array") +@Description(name = "last_element", value = "_FUNC_(x) - Return the last element in an array", + extended = "SELECT last_element(array('a','b','c'));\n c") @UDFType(deterministic = true, stateful = false) public class LastElementUDF extends GenericUDF { diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index e7da8e350..59cf638ce 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -767,3 +767,30 @@ CREATE FUNCTION xgboost_predict AS 'hivemall.xgboost.tools.XGBoostPredictUDTF' U DROP FUNCTION xgboost_multiclass_predict; CREATE FUNCTION xgboost_multiclass_predict AS 'hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS try_cast; +CREATE FUNCTION try_cast as 'hivemall.tools.TryCastUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_append; +CREATE FUNCTION array_append as 'hivemall.tools.array.ArrayAppendUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS element_at; +CREATE FUNCTION element_at as 'hivemall.tools.array.ArrayElementAtUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_union; +CREATE FUNCTION array_union as 'hivemall.tools.array.ArrayUnionUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS first_element; +CREATE FUNCTION first_element as 'hivemall.tools.array.FirstElementUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS last_element; +CREATE FUNCTION last_element as 'hivemall.tools.array.LastElementUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_flatten; +CREATE FUNCTION array_flatten as 'hivemall.tools.array.ArrayFlattenUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_include_keys; +CREATE FUNCTION map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION 
IF EXISTS map_exclude_keys; +CREATE FUNCTION map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 9228ce946..a0bd91756 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -773,3 +773,30 @@ log(10, n_docs / max2(1,df_t)) + 1.0; create temporary macro tfidf(tf FLOAT, df_t DOUBLE, n_docs DOUBLE) tf * (log(10, n_docs / max2(1,df_t)) + 1.0); + +drop temporary function if exists try_cast; +create temporary function try_cast as 'hivemall.tools.TryCastUDF'; + +drop temporary function if exists array_append; +create temporary function array_append as 'hivemall.tools.array.ArrayAppendUDF'; + +drop temporary function if exists element_at; +create temporary function element_at as 'hivemall.tools.array.ArrayElementAtUDF'; + +drop temporary function if exists array_union; +create temporary function array_union as 'hivemall.tools.array.ArrayUnionUDF'; + +drop temporary function if exists first_element; +create temporary function first_element as 'hivemall.tools.array.FirstElementUDF'; + +drop temporary function if exists last_element; +create temporary function last_element as 'hivemall.tools.array.LastElementUDF'; + +drop temporary function if exists array_flatten; +create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF'; + +drop temporary function if exists map_include_keys; +create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; + +drop temporary function if exists map_exclude_keys; +create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 3764ca268..6da8c993d 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -726,3 +726,30 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION train_slim AS 'hivemall.recommend.Slim sqlContext.sql("DROP 
TEMPORARY FUNCTION IF EXISTS approx_count_distinct") sqlContext.sql("CREATE TEMPORARY FUNCTION approx_count_distinct AS 'hivemall.sketch.hll.ApproxCountDistinctUDAF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS try_cast") +sqlContext.sql("CREATE TEMPORARY FUNCTION try_cast AS 'hivemall.tools.TryCastUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_append") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_append AS 'hivemall.tools.array.ArrayAppendUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS element_at") +sqlContext.sql("CREATE TEMPORARY FUNCTION element_at AS 'hivemall.tools.array.ArrayElementAtUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_union") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_union AS 'hivemall.tools.array.ArrayUnionUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS first_element") +sqlContext.sql("CREATE TEMPORARY FUNCTION first_element AS 'hivemall.tools.array.FirstElementUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS last_element") +sqlContext.sql("CREATE TEMPORARY FUNCTION last_element AS 'hivemall.tools.array.LastElementUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_flatten") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_flatten AS 'hivemall.tools.array.ArrayFlattenUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_include_keys") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_include_keys AS 'hivemall.tools.map.MapIncludeKeysUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_exclude_keys") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_exclude_keys AS 'hivemall.tools.map.MapExcludeKeysUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index b106eda53..706a47ba5 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -200,3 +200,21 @@ create temporary function subarray as 'hivemall.tools.array.ArraySliceUDF'; -- alias for TD create temporary 
function approx_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; + +create temporary function try_cast as 'hivemall.tools.TryCastUDF'; + +create temporary function array_append as 'hivemall.tools.array.ArrayAppendUDF'; + +create temporary function element_at as 'hivemall.tools.array.ArrayElementAtUDF'; + +create temporary function array_union as 'hivemall.tools.array.ArrayUnionUDF'; + +create temporary function first_element as 'hivemall.tools.array.FirstElementUDF'; + +create temporary function last_element as 'hivemall.tools.array.LastElementUDF'; + +create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF'; + +create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; + +create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; From 2982865c089567febe62e8e502fb96fe6144b90d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 25 May 2018 15:08:02 +0900 Subject: [PATCH 35/56] Added array_to_str UDF --- .../hivemall/tools/array/ArrayToStrUDF.java | 96 ++++++++++++++++ .../tools/array/ArrayToStrUDFTest.java | 103 ++++++++++++++++++ resources/ddl/define-all-as-permanent.hive | 3 + resources/ddl/define-all.hive | 3 + resources/ddl/define-all.spark | 3 + resources/ddl/define-udfs.td.hql | 2 + 6 files changed, 210 insertions(+) create mode 100644 core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java create mode 100644 core/src/test/java/hivemall/tools/array/ArrayToStrUDFTest.java diff --git a/core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java b/core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java new file mode 100644 index 000000000..185abc74d --- /dev/null +++ b/core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.tools.array; + +import hivemall.utils.hadoop.HiveUtils; + +import javax.annotation.Nullable; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; + +@Description(name = "array_to_str", + value = "_FUNC_(array arr [, string sep=',']) - Convert array to string using a sperator", + extended = "SELECT array_to_str(array(1,2,3),'-');\n" + "1-2-3") +@UDFType(deterministic = true, stateful = false) +public final class ArrayToStrUDF extends GenericUDF { + + private ListObjectInspector listOI; + @Nullable + private StringObjectInspector sepOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { + if (argOIs.length != 1 && argOIs.length != 2) { + throw new 
UDFArgumentLengthException( + "array_to_str(array, string sep) expects one or two arguments: " + argOIs.length); + } + + this.listOI = HiveUtils.asListOI(argOIs[0]); + if (argOIs.length == 2) { + this.sepOI = HiveUtils.asStringOI(argOIs[1]); + } + + return PrimitiveObjectInspectorFactory.javaStringObjectInspector; + } + + @Override + public String evaluate(DeferredObject[] arguments) throws HiveException { + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + + String sep = ","; + if (arguments.length == 2) { + Object arg1 = arguments[1].get(); + if (arg1 != null) { + sep = sepOI.getPrimitiveJavaObject(arg1); + } + } + + final StringBuilder buf = new StringBuilder(); + final int len = listOI.getListLength(arg0); + for (int i = 0; i < len; i++) { + Object e = listOI.getListElement(arg0, i); + if (e != null) { + if (i != 0 && buf.length() > 0) { + buf.append(sep); + } + buf.append(e.toString()); + } + } + return buf.toString(); + } + + @Override + public String getDisplayString(String[] children) { + return "array_to_str(" + StringUtils.join(children, ',') + ")"; + } + +} diff --git a/core/src/test/java/hivemall/tools/array/ArrayToStrUDFTest.java b/core/src/test/java/hivemall/tools/array/ArrayToStrUDFTest.java new file mode 100644 index 000000000..373217f2f --- /dev/null +++ b/core/src/test/java/hivemall/tools/array/ArrayToStrUDFTest.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.tools.array; + +import hivemall.TestUtils; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +public class ArrayToStrUDFTest { + + @Test + public void testSimpleCase() throws HiveException, IOException { + ArrayToStrUDF udf = new ArrayToStrUDF(); + + udf.initialize(new ObjectInspector[] { + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaIntObjectInspector), + PrimitiveObjectInspectorFactory.writableStringObjectInspector}); + + Text sep = new Text("#"); + DeferredObject[] args = + new DeferredObject[] {new GenericUDF.DeferredJavaObject(Arrays.asList(1, 2, 3)), + new GenericUDF.DeferredJavaObject(sep)}; + Assert.assertEquals("1#2#3", udf.evaluate(args)); + + args = new DeferredObject[] {new GenericUDF.DeferredJavaObject(Arrays.asList(1, 2, 3)), + new GenericUDF.DeferredJavaObject(null)}; + Assert.assertEquals("1,2,3", udf.evaluate(args)); + + udf.close(); + } + + @Test + public void testNoSep() throws HiveException, IOException { + ArrayToStrUDF udf = new 
ArrayToStrUDF(); + + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaIntObjectInspector)}); + + DeferredObject[] args = + new DeferredObject[] {new GenericUDF.DeferredJavaObject(Arrays.asList(1, 2, 3))}; + + Assert.assertEquals("1,2,3", udf.evaluate(args)); + + udf.close(); + } + + @Test + public void testNull() throws HiveException, IOException { + ArrayToStrUDF udf = new ArrayToStrUDF(); + + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaIntObjectInspector)}); + + DeferredObject[] args = + new DeferredObject[] {new GenericUDF.DeferredJavaObject(Arrays.asList(1, null, 3))}; + + Assert.assertEquals("1,3", udf.evaluate(args)); + + args = new DeferredObject[] {new GenericUDF.DeferredJavaObject(Arrays.asList(null, 2, 3))}; + + Assert.assertEquals("2,3", udf.evaluate(args)); + + udf.close(); + } + + @Test + public void testSerialization() throws HiveException, IOException { + TestUtils.testGenericUDFSerialization(ArrayToStrUDF.class, + new ObjectInspector[] { + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaIntObjectInspector), + PrimitiveObjectInspectorFactory.javaStringObjectInspector}, + new Object[] {Arrays.asList(1, 2, 3), "-"}); + } +} diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 59cf638ce..28af90c22 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -794,3 +794,6 @@ CREATE FUNCTION map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF' USING DROP FUNCTION IF EXISTS map_exclude_keys; CREATE FUNCTION map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_to_str; +CREATE FUNCTION array_to_str as 'hivemall.tools.array.ArrayToStrUDF' USING JAR '${hivemall_jar}'; diff --git 
a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index a0bd91756..96574f47e 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -800,3 +800,6 @@ create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeys drop temporary function if exists map_exclude_keys; create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; + +drop temporary function if exists array_to_str; +create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 6da8c993d..be88e9ba2 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -753,3 +753,6 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION map_include_keys AS 'hivemall.tools.ma sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_exclude_keys") sqlContext.sql("CREATE TEMPORARY FUNCTION map_exclude_keys AS 'hivemall.tools.map.MapExcludeKeysUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_to_str") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hivemall.tools.array.ArrayToStrUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 706a47ba5..0355720d1 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -218,3 +218,5 @@ create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; + +create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; From 91c1013ce10f475a038a579b4c18112203e60a9a Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 25 May 2018 15:52:50 +0900 Subject: [PATCH 36/56] Added map_index UDF --- .../java/hivemall/tools/map/MapIndexUDF.java | 104 ++++++++++++++++++ resources/ddl/define-all-as-permanent.hive | 3 + 
resources/ddl/define-all.hive | 3 + resources/ddl/define-all.spark | 3 + resources/ddl/define-udfs.td.hql | 2 + 5 files changed, 115 insertions(+) create mode 100644 core/src/main/java/hivemall/tools/map/MapIndexUDF.java diff --git a/core/src/main/java/hivemall/tools/map/MapIndexUDF.java b/core/src/main/java/hivemall/tools/map/MapIndexUDF.java new file mode 100644 index 000000000..73ffa36e0 --- /dev/null +++ b/core/src/main/java/hivemall/tools/map/MapIndexUDF.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.map; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; + +//@formatter:off +@Description(name = "map_index", + value = "_FUNC_(a, n) - Returns the n-th element of the given array", + extended = "WITH tmp as (\n" + + " SELECT \"one\" as key\n" + + " UNION ALL\n" + + " SELECT \"two\" as key\n" + + ")\n" + + "SELECT map_index(map(\"one\",1,\"two\",2),key)\n" + + "FROM tmp;\n\n" + + "1\n" + + "2") +//@formatter:on +@UDFType(deterministic = true, stateful = false) +public final class MapIndexUDF extends GenericUDF { + + private transient MapObjectInspector mapOI; + private transient Converter converter; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 2) { + throw new UDFArgumentLengthException("The function INDEX accepts exactly 2 arguments."); + } + + if (arguments[0] instanceof MapObjectInspector) { + this.mapOI = (MapObjectInspector) arguments[0]; + } else { + throw new UDFArgumentTypeException(0, "\"map\" is expected at function INDEX, but \"" + + arguments[0].getTypeName() + "\" is found"); + } + + // index has to be a primitive + if (!(arguments[1] instanceof PrimitiveObjectInspector)) { + throw new 
UDFArgumentTypeException(1, + "Primitive Type is expected but " + arguments[1].getTypeName() + "\" is found"); + } + + PrimitiveObjectInspector inputOI = (PrimitiveObjectInspector) arguments[1]; + ObjectInspector indexOI = + ObjectInspectorConverters.getConvertedOI(inputOI, mapOI.getMapKeyObjectInspector()); + this.converter = ObjectInspectorConverters.getConverter(inputOI, indexOI); + + return mapOI.getMapValueObjectInspector(); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 2); + Object index = arguments[1].get(); + + Object indexObject = converter.convert(index); + if (indexObject == null) { + return null; + } + + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + + return mapOI.getMapValueElement(arg0, indexObject); + } + + @Override + public String getDisplayString(String[] children) { + assert (children.length == 2); + return children[0] + "[" + children[1] + "]"; + } + +} diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 28af90c22..f644d4266 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -797,3 +797,6 @@ CREATE FUNCTION map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF' USING DROP FUNCTION IF EXISTS array_to_str; CREATE FUNCTION array_to_str as 'hadoop.tools.array.ArrayToStrUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_index; +CREATE FUNCTION map_index as 'hivemall.tools.map.MapIndexUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 96574f47e..1c322f120 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -803,3 +803,6 @@ create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeys drop temporary function if exists array_to_str; create temporary function array_to_str as 
'hadoop.tools.array.ArrayToStrUDF'; + +drop temporary function if exists map_index; +create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index be88e9ba2..96ae8ad04 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -756,3 +756,6 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION map_exclude_keys AS 'hivemall.tools.ma sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_to_str") sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hadoop.tools.array.ArrayToStrUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_index") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_index AS 'hivemall.tools.map.MapIndexUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 0355720d1..0d6290e5b 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -220,3 +220,5 @@ create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeys create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; create temporary function array_to_str as 'hadoop.tools.array.ArrayToStrUDF'; + +create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; From 85cc27f8b63812a7ae235802e6bcd7b1eb0610f6 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 25 May 2018 15:56:57 +0900 Subject: [PATCH 37/56] Updated DDLs for map_key_values --- core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java | 4 +++- resources/ddl/define-all-as-permanent.hive | 3 +++ resources/ddl/define-all.hive | 3 +++ resources/ddl/define-all.spark | 3 +++ resources/ddl/define-udfs.td.hql | 2 ++ 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java index 64065e9ed..3992f9e6d 100644 --- a/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java +++ 
b/core/src/main/java/hivemall/tools/map/MapKeyValuesUDF.java @@ -38,7 +38,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; @Description(name = "map_key_values", - value = "_FUNC_(map) - " + "Returns a array of key-value pairs.") + value = "_FUNC_(map) - " + "Returns a array of key-value pairs.", + extended = "SELECT map_key_values(map(\"one\",1,\"two\",2));\n\n" + + "[{\"key\":\"one\",\"value\":1},{\"key\":\"two\",\"value\":2}]") @UDFType(deterministic = true, stateful = false) public final class MapKeyValuesUDF extends GenericUDF { diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index f644d4266..b380d5cd2 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -800,3 +800,6 @@ CREATE FUNCTION array_to_str as 'hadoop.tools.array.ArrayToStrUDF' USING JAR '${ DROP FUNCTION IF EXISTS map_index; CREATE FUNCTION map_index as 'hivemall.tools.map.MapIndexUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_key_values; +CREATE FUNCTION map_key_values as 'hivemall.tools.map.MapKeyValuesUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 1c322f120..5c2f39c32 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -806,3 +806,6 @@ create temporary function array_to_str as 'hadoop.tools.array.ArrayToStrUDF'; drop temporary function if exists map_index; create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; + +drop temporary function if exists map_key_values; +create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 96ae8ad04..fda94ccb4 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -759,3 +759,6 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hadoop.tools.array.Ar 
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_index") sqlContext.sql("CREATE TEMPORARY FUNCTION map_index AS 'hivemall.tools.map.MapIndexUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_key_values") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_key_values AS 'hivemall.tools.map.MapKeyValuesUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 0d6290e5b..d6e947290 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -222,3 +222,5 @@ create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeys create temporary function array_to_str as 'hadoop.tools.array.ArrayToStrUDF'; create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; + +create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; From bf70541bf0eab52396ca2b23db8d97d6f61091bb Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 29 May 2018 12:53:51 +0900 Subject: [PATCH 38/56] Updated DDLs --- resources/ddl/define-all-as-permanent.hive | 9 +++++++++ resources/ddl/define-all.hive | 9 +++++++++ resources/ddl/define-all.spark | 9 +++++++++ resources/ddl/define-udfs.td.hql | 6 ++++++ 4 files changed, 33 insertions(+) diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index b380d5cd2..1e1a20a7a 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -803,3 +803,12 @@ CREATE FUNCTION map_index as 'hivemall.tools.map.MapIndexUDF' USING JAR '${hivem DROP FUNCTION IF EXISTS map_key_values; CREATE FUNCTION map_key_values as 'hivemall.tools.map.MapKeyValuesUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS sessionize; +CREATE FUNCTION sessionize as 'hivemall.tools.datetime.SessionizeUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS to_json; +CREATE FUNCTION to_json as 'hivemall.tools.json.ToJsonUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION 
IF EXISTS from_json; +CREATE FUNCTION from_json as 'hivemall.tools.json.FromJsonUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 5c2f39c32..2baf0b3a8 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -809,3 +809,12 @@ create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; drop temporary function if exists map_key_values; create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; + +drop temporary function if exists sessionize; +create temporary function sessionize as 'hivemall.tools.datetime.SessionizeUDF'; + +drop temporary function if exists to_json; +create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; + +drop temporary function if exists from_json; +create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index fda94ccb4..7766b6dfb 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -762,3 +762,12 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION map_index AS 'hivemall.tools.map.MapIn sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_key_values") sqlContext.sql("CREATE TEMPORARY FUNCTION map_key_values AS 'hivemall.tools.map.MapKeyValuesUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS sessionize") +sqlContext.sql("CREATE TEMPORARY FUNCTION sessionize AS 'hivemall.tools.datetime.SessionizeUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS to_json") +sqlContext.sql("CREATE TEMPORARY FUNCTION to_json AS 'hivemall.tools.json.ToJsonUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS from_json") +sqlContext.sql("CREATE TEMPORARY FUNCTION from_json AS 'hivemall.tools.json.FromJsonUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index d6e947290..27a098e3e 100644 --- a/resources/ddl/define-udfs.td.hql +++ 
b/resources/ddl/define-udfs.td.hql @@ -224,3 +224,9 @@ create temporary function array_to_str as 'hadoop.tools.array.ArrayToStrUDF'; create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; + +create temporary function sessionize as 'hivemall.tools.datetime.SessionizeUDF'; + +create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; + +create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; From aae2450188f4963afb0c012bf1156409060215d6 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 31 May 2018 14:56:11 +0900 Subject: [PATCH 39/56] Added assert and raise_error UDFs --- .../java/hivemall/tools/sanity/AssertUDF.java | 6 ++- .../hivemall/tools/sanity/RaiseErrorUDF.java | 43 +++++++++++++++---- resources/ddl/define-all-as-permanent.hive | 6 +++ resources/ddl/define-all.hive | 6 +++ resources/ddl/define-all.spark | 6 +++ resources/ddl/define-udfs.td.hql | 4 ++ 6 files changed, 61 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/hivemall/tools/sanity/AssertUDF.java b/core/src/main/java/hivemall/tools/sanity/AssertUDF.java index d34cd2073..fb8f59386 100644 --- a/core/src/main/java/hivemall/tools/sanity/AssertUDF.java +++ b/core/src/main/java/hivemall/tools/sanity/AssertUDF.java @@ -25,8 +25,10 @@ @Description(name = "assert", value = "_FUNC_(boolean condition) or _FUNC_(boolean condition, string errMsg)" - + "- Throws HiveException if condition is not met") -@UDFType(deterministic = true, stateful = false) + + "- Throws HiveException if condition is not met", + extended = "SELECT count(1) FROM stock_price WHERE assert(price > 0.0);\n" + + "SELECT count(1) FROM stock_price WHRE assert(price > 0.0, 'price MUST be more than 0.0')") +@UDFType(deterministic = false, stateful = false) public final class AssertUDF extends UDF { public boolean evaluate(boolean condition) throws HiveException { diff --git 
a/core/src/main/java/hivemall/tools/sanity/RaiseErrorUDF.java b/core/src/main/java/hivemall/tools/sanity/RaiseErrorUDF.java index 194085ce2..fb6b8eb42 100644 --- a/core/src/main/java/hivemall/tools/sanity/RaiseErrorUDF.java +++ b/core/src/main/java/hivemall/tools/sanity/RaiseErrorUDF.java @@ -18,21 +18,48 @@ */ package hivemall.tools.sanity; +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -@Description(name = "raise_error", value = "_FUNC_() or _FUNC_(string msg) - Throws an error") -@UDFType(deterministic = true, stateful = false) -public final class RaiseErrorUDF extends UDF { +@Description(name = "raise_error", value = "_FUNC_() or _FUNC_(string msg) - Throws an error", + extended = "SELECT product_id, price, raise_error('Found an invalid record') FROM xxx WHERE price < 0.0") +@UDFType(deterministic = false, stateful = false) +public class RaiseErrorUDF extends GenericUDF { - public boolean evaluate() throws HiveException { - throw new HiveException(); + @Override + public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { + if (argOIs.length != 0 && argOIs.length != 1) { + throw new UDFArgumentLengthException( + "Expected one or two arguments for raise_error UDF: " + argOIs.length); + } + + return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + if (arguments.length == 1) { + Object 
arg0 = arguments[0].get(); + if (arg0 == null) { + throw new HiveException(); + } + String msg = arg0.toString(); + throw new HiveException(msg); + } else { + throw new HiveException(); + } } - public boolean evaluate(String errorMessage) throws HiveException { - throw new HiveException(errorMessage); + @Override + public String getDisplayString(String[] children) { + return "raise_error(" + StringUtils.join(children, ',') + ')'; } } diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 1e1a20a7a..11d2777d3 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -812,3 +812,9 @@ CREATE FUNCTION to_json as 'hivemall.tools.json.ToJsonUDF' USING JAR '${hivemall DROP FUNCTION IF EXISTS from_json; CREATE FUNCTION from_json as 'hivemall.tools.json.FromJsonUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS assert; +CREATE FUNCTION assert as 'hivemall.tools.sanity.AssertUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS raise_error; +CREATE FUNCTION raise_error as 'hivemall.tools.sanity.RaiseErrorUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 2baf0b3a8..7b7f85b64 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -818,3 +818,9 @@ create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; drop temporary function if exists from_json; create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; + +drop temporary function if exists assert; +create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; + +drop temporary function if exists raise_error; +create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 7766b6dfb..0a1c3d234 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -771,3 
+771,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION to_json AS 'hivemall.tools.json.ToJson sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS from_json") sqlContext.sql("CREATE TEMPORARY FUNCTION from_json AS 'hivemall.tools.json.FromJsonUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS assert") +sqlContext.sql("CREATE TEMPORARY FUNCTION assert AS 'hivemall.tools.sanity.AssertUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS raise_error") +sqlContext.sql("CREATE TEMPORARY FUNCTION raise_error AS 'hivemall.tools.sanity.RaiseErrorUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 27a098e3e..cdf358913 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -230,3 +230,7 @@ create temporary function sessionize as 'hivemall.tools.datetime.SessionizeUDF'; create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; + +create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; + +create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; From 09b246ddc7d2796600d1bbdb54bed5b067aba12e Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 31 May 2018 15:06:58 +0900 Subject: [PATCH 40/56] Fixed unit tests for raise_error UDF --- .../java/hivemall/tools/sanity/RaiseErrorUDFTest.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java b/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java index 004ba26d8..a96ea57ea 100644 --- a/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java +++ b/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java @@ -18,15 +18,21 @@ */ package hivemall.tools.sanity; +import java.io.IOException; + import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; 
import org.junit.Test; public class RaiseErrorUDFTest { @Test(expected = HiveException.class) - public void test() throws HiveException { + public void test() throws HiveException, IOException { RaiseErrorUDF udf = new RaiseErrorUDF(); - udf.evaluate(); + + udf.evaluate(new DeferredObject[] {}); + + udf.close(); } } From 099673f45c7001544692af58d5845d6e2717f77c Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 31 May 2018 15:26:43 +0900 Subject: [PATCH 41/56] Added moving_avg UDTF --- resources/ddl/define-all-as-permanent.hive | 3 +++ resources/ddl/define-all.hive | 3 +++ resources/ddl/define-all.spark | 3 +++ resources/ddl/define-udfs.td.hql | 2 ++ 4 files changed, 11 insertions(+) diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 11d2777d3..b1e527d87 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -818,3 +818,6 @@ CREATE FUNCTION assert as 'hivemall.tools.sanity.AssertUDF' USING JAR '${hivemal DROP FUNCTION IF EXISTS raise_error; CREATE FUNCTION raise_error as 'hivemall.tools.sanity.RaiseErrorUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS moving_avg; +CREATE FUNCTION moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 7b7f85b64..b1074bc47 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -824,3 +824,6 @@ create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; drop temporary function if exists raise_error; create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; + +drop temporary function if exists moving_avg; +create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 0a1c3d234..44d5fe552 100644 --- a/resources/ddl/define-all.spark +++ 
b/resources/ddl/define-all.spark @@ -777,3 +777,6 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION assert AS 'hivemall.tools.sanity.Asser sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS raise_error") sqlContext.sql("CREATE TEMPORARY FUNCTION raise_error AS 'hivemall.tools.sanity.RaiseErrorUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS moving_avg") +sqlContext.sql("CREATE TEMPORARY FUNCTION moving_avg AS 'hivemall.tools.timeseries.MovingAverageUDTF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index cdf358913..93376e092 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -234,3 +234,5 @@ create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; + +create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; From 7661efcf8304b2443c1d248c696285ac76897395 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 31 May 2018 17:34:23 +0900 Subject: [PATCH 42/56] Fixed a typo in assert UDF --- core/src/main/java/hivemall/tools/sanity/AssertUDF.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/hivemall/tools/sanity/AssertUDF.java b/core/src/main/java/hivemall/tools/sanity/AssertUDF.java index fb8f59386..c74aeb8ee 100644 --- a/core/src/main/java/hivemall/tools/sanity/AssertUDF.java +++ b/core/src/main/java/hivemall/tools/sanity/AssertUDF.java @@ -27,7 +27,7 @@ value = "_FUNC_(boolean condition) or _FUNC_(boolean condition, string errMsg)" + "- Throws HiveException if condition is not met", extended = "SELECT count(1) FROM stock_price WHERE assert(price > 0.0);\n" - + "SELECT count(1) FROM stock_price WHRE assert(price > 0.0, 'price MUST be more than 0.0')") + + "SELECT count(1) FROM stock_price WHERE assert(price > 0.0, 'price MUST be more than 0.0')") 
@UDFType(deterministic = false, stateful = false) public final class AssertUDF extends UDF { From 7726c8b355afd0e76a06981175c06f0a84c26af5 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 31 May 2018 18:18:26 +0900 Subject: [PATCH 43/56] Added vector_add, vector_dot UDFs --- .../main/java/hivemall/tools/vector/VectorAddUDF.java | 3 ++- .../main/java/hivemall/tools/vector/VectorDotUDF.java | 10 ++++++---- resources/ddl/define-all-as-permanent.hive | 6 ++++++ resources/ddl/define-all.hive | 6 ++++++ resources/ddl/define-all.spark | 6 ++++++ resources/ddl/define-udfs.td.hql | 4 ++++ 6 files changed, 30 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java b/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java index 8442ae370..98fab992f 100644 --- a/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java +++ b/core/src/main/java/hivemall/tools/vector/VectorAddUDF.java @@ -42,7 +42,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; @Description(name = "vector_add", - value = "_FUNC_(array x, array y) - Perform vector ADD operation.") + value = "_FUNC_(array x, array y) - Perform vector ADD operation.", + extended = "SELECT vector_add(array(1.0,2.0,3.0), array(2, 3, 4));\n" + "[3.0,5.0,7.0]") @UDFType(deterministic = true, stateful = false) public final class VectorAddUDF extends GenericUDF { diff --git a/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java b/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java index 007cfcf87..b43b562da 100644 --- a/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java +++ b/core/src/main/java/hivemall/tools/vector/VectorDotUDF.java @@ -42,7 +42,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; @Description(name = "vector_dot", - value = "_FUNC_(array x, array y) - Performs vector dot product.") + value = "_FUNC_(array x, array y) - Performs vector dot product.", + 
extended = "SELECT vector_dot(array(1.0,2.0,3.0),array(2.0,3.0,4.0));\n20\n\n" + + "SELECT vector_dot(array(1.0,2.0,3.0),2);\n[2.0,4.0,6.0]") @UDFType(deterministic = true, stateful = false) public final class VectorDotUDF extends GenericUDF { @@ -64,11 +66,11 @@ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentEx ObjectInspector argOI1 = argOIs[1]; if (HiveUtils.isNumberListOI(argOI1)) { this.evaluator = new Dot2DVectors(xListOI, HiveUtils.asListOI(argOI1)); - return ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; } else if (HiveUtils.isNumberOI(argOI1)) { this.evaluator = new Multiply2D1D(xListOI, argOI1); - return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); } else { throw new UDFArgumentException( "Expected array or number for the send argument: " + argOI1.getTypeName()); diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index b1e527d87..8627240b0 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -821,3 +821,9 @@ CREATE FUNCTION raise_error as 'hivemall.tools.sanity.RaiseErrorUDF' USING JAR ' DROP FUNCTION IF EXISTS moving_avg; CREATE FUNCTION moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS vector_add; +CREATE FUNCTION vector_add as 'hivemall.tools.vector.VectorAddUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS vector_dot; +CREATE FUNCTION vector_dot as 'hivemall.tools.vector.VectorDotUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index b1074bc47..5cee101ce 100644 --- a/resources/ddl/define-all.hive +++ 
b/resources/ddl/define-all.hive @@ -827,3 +827,9 @@ create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; drop temporary function if exists moving_avg; create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; + +drop temporary function if exists vector_add; +create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; + +drop temporary function if exists vector_dot; +create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 44d5fe552..3d9a0a3f9 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -780,3 +780,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION raise_error AS 'hivemall.tools.sanity. sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS moving_avg") sqlContext.sql("CREATE TEMPORARY FUNCTION moving_avg AS 'hivemall.tools.timeseries.MovingAverageUDTF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_add") +sqlContext.sql("CREATE TEMPORARY FUNCTION vector_add AS 'hivemall.tools.vector.VectorAddUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_dot") +sqlContext.sql("CREATE TEMPORARY FUNCTION vector_dot AS 'hivemall.tools.vector.VectorDotUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 93376e092..4f7e883fb 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -236,3 +236,7 @@ create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; + +create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; + +create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; From 1263b9acf7aa09af7da8b172ae8631108d25748e Mon Sep 17 00:00:00 2001 From: Makoto Yui 
Date: Fri, 1 Jun 2018 15:07:56 +0900 Subject: [PATCH 44/56] Renamed BloomUDAF to BloomFilterUDAF --- .../sketch/bloom/{BloomUDAF.java => BloomFilterUDAF.java} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename core/src/main/java/hivemall/sketch/bloom/{BloomUDAF.java => BloomFilterUDAF.java} (97%) diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomUDAF.java b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java similarity index 97% rename from core/src/main/java/hivemall/sketch/bloom/BloomUDAF.java rename to core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java index cb09b9347..966e84603 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomUDAF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java @@ -32,10 +32,10 @@ import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; -@Description(name = "bloom", +@Description(name = "bloom_filter", value = "_FUNC_(string key) - Constructs a BloomFilter by aggregating a set of keys") @SuppressWarnings("deprecation") -public final class BloomUDAF extends UDAF { +public final class BloomFilterUDAF extends UDAF { public static class Evaluator implements UDAFEvaluator { From 9ebf51a1be3f78adbd5e3949d99a3dd12f152ed0 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jun 2018 15:08:44 +0900 Subject: [PATCH 45/56] Added DDLs for bloom filter --- resources/ddl/define-all-as-permanent.hive | 15 +++++++++++++++ resources/ddl/define-all.hive | 15 +++++++++++++++ resources/ddl/define-all.spark | 15 +++++++++++++++ resources/ddl/define-udfs.td.hql | 10 ++++++++++ 4 files changed, 55 insertions(+) diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 8627240b0..de99f4141 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -827,3 +827,18 @@ CREATE FUNCTION vector_add as 'hivemall.tools.vector.VectorAddUDF' USING JAR '${ DROP FUNCTION IF 
EXISTS vector_dot; CREATE FUNCTION vector_dot as 'hivemall.tools.vector.VectorDotUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_filter; +CREATE FUNCTION bloom_filter as 'hivemall.sketch.bloom.BloomFilterUDAF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_and; +CREATE FUNCTION bloom_and as 'hivemall.sketch.bloom.BloomAndUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_contains; +CREATE FUNCTION bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_not; +CREATE FUNCTION bloom_not as 'hivemall.sketch.bloom.BloomNotUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_or; +CREATE FUNCTION bloom_or as 'hivemall.sketch.bloom.BloomOrUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 5cee101ce..8497369c6 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -833,3 +833,18 @@ create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; drop temporary function if exists vector_dot; create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; + +drop temporary function if exists bloom_filter; +create temporary function bloom_filter as 'hivemall.sketch.bloom.BloomFilterUDAF'; + +drop temporary function if exists bloom_and; +create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; + +drop temporary function if exists bloom_contains; +create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF'; + +drop temporary function if exists bloom_not; +create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; + +drop temporary function if exists bloom_or; +create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 3d9a0a3f9..b806daf32 100644 --- 
a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -786,3 +786,18 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION vector_add AS 'hivemall.tools.vector.V sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_dot") sqlContext.sql("CREATE TEMPORARY FUNCTION vector_dot AS 'hivemall.tools.vector.VectorDotUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_filter") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_filter AS 'hivemall.sketch.bloom.BloomFilterUDAF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_and") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_and AS 'hivemall.sketch.bloom.BloomAndUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_contains") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_contains AS 'hivemall.sketch.bloom.BloomContainsUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_not") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_not AS 'hivemall.sketch.bloom.BloomNotUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_or") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_or AS 'hivemall.sketch.bloom.BloomOrUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 4f7e883fb..4bf30f672 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -240,3 +240,13 @@ create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverage create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; + +create temporary function bloom_filter as 'hivemall.sketch.bloom.BloomFilterUDAF'; + +create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; + +create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF'; + +create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; + +create temporary function bloom_or as 
'hivemall.sketch.bloom.BloomOrUDF'; From 60be0825646d43b190fadbd89a2270eac8917d93 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jun 2018 19:14:33 +0900 Subject: [PATCH 46/56] Fixed NPE bug in bloom UDAF --- .../main/java/hivemall/sketch/bloom/BloomFilterUDAF.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java index 966e84603..8d37e8925 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java @@ -32,7 +32,7 @@ import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; -@Description(name = "bloom_filter", +@Description(name = "bloom", value = "_FUNC_(string key) - Constructs a BloomFilter by aggregating a set of keys") @SuppressWarnings("deprecation") public final class BloomFilterUDAF extends UDAF { @@ -52,8 +52,11 @@ public boolean iterate(@Nullable Text keyStr) { if (keyStr == null) { return true; } - key.set(keyStr.getBytes(), 1.0d); + if (filter == null) { + init(); + } + key.set(keyStr.getBytes(), 1.0d); filter.add(key); return true; From 68e95113eef5a32368df676bbbb57bfaf9512d79 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jun 2018 19:14:55 +0900 Subject: [PATCH 47/56] Added bloom_contains_any UDF --- .../sketch/bloom/BloomContainsAnyUDF.java | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java new file mode 100644 index 000000000..cc937c7f6 --- /dev/null +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.sketch.bloom; + +import java.io.IOException; +import java.util.List; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.bloom.DynamicBloomFilter; +import org.apache.hadoop.util.bloom.Filter; +import org.apache.hadoop.util.bloom.Key; + +@Description(name = "bloom_contains_any", + value = "_FUNC_(string bloom, string key) - Returns true if the bloom filter contains any of the given key") +@UDFType(deterministic = true, stateful = false) +public final class BloomContainsAnyUDF extends UDF { + + @Nonnull + private final Key key = new Key(); + + @Nullable + private Text prevBfStr; + @Nullable + private Filter prevBf; + + @Nullable + public Boolean evaluate(@Nullable Text bloomStr, @Nullable List keys) + throws HiveException { + if (bloomStr == null) { + return null; + } + if (keys == null) { + return Boolean.FALSE; + } + + final Filter bloom = getFilter(bloomStr); + + for (Text keyStr : keys) { + if (keyStr == null) { + continue; + } + key.set(keyStr.getBytes(), 1.0d); + if 
(bloom.membershipTest(key)) { + return Boolean.TRUE; + } + } + + return Boolean.FALSE; + } + + @Nonnull + private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException { + final Filter bloom; + if (prevBf != null && prevBfStr.equals(bloomStr)) { + bloom = prevBf; + } else { + try { + bloom = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter()); + } catch (IOException e) { + throw new HiveException(e); + } + this.prevBfStr = bloomStr; + this.prevBf = bloom; + } + return bloom; + } + +} From 2d953345a2da4b90ec7b4e54572333b4fc5ade11 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jun 2018 19:15:43 +0900 Subject: [PATCH 48/56] Supported contains all in bloom_contains --- .../sketch/bloom/BloomContainsUDF.java | 57 +++++++++++++++---- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java index 2da65b33f..6afd0cf41 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java @@ -19,6 +19,7 @@ package hivemall.sketch.bloom; import java.io.IOException; +import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -33,7 +34,8 @@ import org.apache.hadoop.util.bloom.Key; @Description(name = "bloom_contains", - value = "_FUNC_(string bloom, string key) - Returns true if the bloom filter contains the given key") + value = "_FUNC_(string bloom, string key) or _FUNC_(string bloom, array keys)" + + " - Returns true if the bloom filter contains all the given key(s). 
Returns false if key is null.") @UDFType(deterministic = true, stateful = false) public final class BloomContainsUDF extends UDF { @@ -41,31 +43,64 @@ public final class BloomContainsUDF extends UDF { private final Key key = new Key(); @Nullable - private Text prevKey; + private Text prevBfStr; @Nullable - private Filter prevFilter; + private Filter prevBf; @Nullable public Boolean evaluate(@Nullable Text bloomStr, @Nullable Text keyStr) throws HiveException { - if (bloomStr == null || key == null) { + if (bloomStr == null) { return null; } + if (keyStr == null) { + return Boolean.FALSE; + } + + Filter bloom = getFilter(bloomStr); + key.set(keyStr.getBytes(), 1.0d); + return Boolean.valueOf(bloom.membershipTest(key)); + } + @Nullable + public Boolean evaluate(@Nullable Text bloomStr, @Nullable List keys) + throws HiveException { + if (bloomStr == null) { + return null; + } + if (keys == null) { + return Boolean.FALSE; + } + + final Filter bloom = getFilter(bloomStr); + + for (Text keyStr : keys) { + if (keyStr == null) { + continue; + } + key.set(keyStr.getBytes(), 1.0d); + if (bloom.membershipTest(key) == false) { + return Boolean.FALSE; + } + } + + return Boolean.TRUE; + } + + @Nonnull + private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException { final Filter bloom; - if (prevFilter != null && prevKey.equals(keyStr)) { - bloom = prevFilter; + if (prevBf != null && prevBfStr.equals(bloomStr)) { + bloom = prevBf; } else { try { bloom = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter()); } catch (IOException e) { throw new HiveException(e); } - this.prevKey = keyStr; - this.prevFilter = bloom; - key.set(keyStr.getBytes(), 1.0d); + this.prevBfStr = bloomStr; + this.prevBf = bloom; } - - return Boolean.valueOf(bloom.membershipTest(key)); + return bloom; } } From 78c2dbdefeab2ad33dfdb976285303d2e10e0fed Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 1 Jun 2018 19:16:10 +0900 Subject: [PATCH 49/56] Updated DDLs for bloom filter UDFs 
--- resources/ddl/define-all-as-permanent.hive | 7 +++++-- resources/ddl/define-all.hive | 7 +++++-- resources/ddl/define-all.spark | 7 +++++-- resources/ddl/define-udfs.td.hql | 4 +++- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index de99f4141..4138f26fa 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -828,8 +828,8 @@ CREATE FUNCTION vector_add as 'hivemall.tools.vector.VectorAddUDF' USING JAR '${ DROP FUNCTION IF EXISTS vector_dot; CREATE FUNCTION vector_dot as 'hivemall.tools.vector.VectorDotUDF' USING JAR '${hivemall_jar}'; -DROP FUNCTION IF EXISTS bloom_filter; -CREATE FUNCTION bloom_filter as 'hivemall.sketch.bloom.BloomFilterUDAF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS bloom; +CREATE FUNCTION bloom as 'hivemall.sketch.bloom.BloomFilterUDAF' USING JAR '${hivemall_jar}'; DROP FUNCTION IF EXISTS bloom_and; CREATE FUNCTION bloom_and as 'hivemall.sketch.bloom.BloomAndUDF' USING JAR '${hivemall_jar}'; @@ -842,3 +842,6 @@ CREATE FUNCTION bloom_not as 'hivemall.sketch.bloom.BloomNotUDF' USING JAR '${hi DROP FUNCTION IF EXISTS bloom_or; CREATE FUNCTION bloom_or as 'hivemall.sketch.bloom.BloomOrUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_contains_any; +CREATE FUNCTION bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 8497369c6..1d9791504 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -834,8 +834,8 @@ create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; drop temporary function if exists vector_dot; create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; -drop temporary function if exists bloom_filter; -create temporary function bloom_filter as 
'hivemall.sketch.bloom.BloomFilterUDAF'; +drop temporary function if exists bloom; +create temporary function bloom as 'hivemall.sketch.bloom.BloomFilterUDAF'; drop temporary function if exists bloom_and; create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; @@ -848,3 +848,6 @@ create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; drop temporary function if exists bloom_or; create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; + +drop temporary function if exists bloom_contains_any; +create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index b806daf32..bb787bb20 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -787,8 +787,8 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION vector_add AS 'hivemall.tools.vector.V sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_dot") sqlContext.sql("CREATE TEMPORARY FUNCTION vector_dot AS 'hivemall.tools.vector.VectorDotUDF'") -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_filter") -sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_filter AS 'hivemall.sketch.bloom.BloomFilterUDAF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom AS 'hivemall.sketch.bloom.BloomFilterUDAF'") sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_and") sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_and AS 'hivemall.sketch.bloom.BloomAndUDF'") @@ -801,3 +801,6 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_not AS 'hivemall.sketch.bloom.Bl sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_or") sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_or AS 'hivemall.sketch.bloom.BloomOrUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_contains_any") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_contains_any AS 
'hivemall.sketch.bloom.BloomContainsAnyUDF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 4bf30f672..d6aa0bded 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -241,7 +241,7 @@ create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; -create temporary function bloom_filter as 'hivemall.sketch.bloom.BloomFilterUDAF'; +create temporary function bloom as 'hivemall.sketch.bloom.BloomFilterUDAF'; create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; @@ -250,3 +250,5 @@ create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContains create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; + +create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; From 512f930b3f48717562cc501c63aa872c9feb8e47 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 5 Jun 2018 18:00:31 +0900 Subject: [PATCH 50/56] Fixed bugs in bloom filter UDFs --- .../java/hivemall/sketch/bloom/BloomContainsAnyUDF.java | 4 ++-- .../main/java/hivemall/sketch/bloom/BloomContainsUDF.java | 6 +++--- .../main/java/hivemall/sketch/bloom/BloomFilterUDAF.java | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java index cc937c7f6..c10c52791 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java @@ -62,7 +62,7 @@ public Boolean evaluate(@Nullable Text bloomStr, @Nullable List keys) if (keyStr == null) { continue; } - key.set(keyStr.getBytes(), 1.0d); + key.set(keyStr.copyBytes(), 1.0d); if (bloom.membershipTest(key)) { return 
Boolean.TRUE; } @@ -82,7 +82,7 @@ private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException { } catch (IOException e) { throw new HiveException(e); } - this.prevBfStr = bloomStr; + this.prevBfStr = new Text(bloomStr); this.prevBf = bloom; } return bloom; diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java index 6afd0cf41..449364e2d 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java @@ -57,7 +57,7 @@ public Boolean evaluate(@Nullable Text bloomStr, @Nullable Text keyStr) throws H } Filter bloom = getFilter(bloomStr); - key.set(keyStr.getBytes(), 1.0d); + key.set(keyStr.copyBytes(), 1.0d); return Boolean.valueOf(bloom.membershipTest(key)); } @@ -77,7 +77,7 @@ public Boolean evaluate(@Nullable Text bloomStr, @Nullable List keys) if (keyStr == null) { continue; } - key.set(keyStr.getBytes(), 1.0d); + key.set(keyStr.copyBytes(), 1.0d); if (bloom.membershipTest(key) == false) { return Boolean.FALSE; } @@ -97,7 +97,7 @@ private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException { } catch (IOException e) { throw new HiveException(e); } - this.prevBfStr = bloomStr; + this.prevBfStr = new Text(bloomStr); this.prevBf = bloom; } return bloom; diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java index 8d37e8925..5276ab2fb 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java @@ -56,7 +56,7 @@ public boolean iterate(@Nullable Text keyStr) { init(); } - key.set(keyStr.getBytes(), 1.0d); + key.set(keyStr.copyBytes(), 1.0d); filter.add(key); return true; From e21606cc4d7ec141f5339f1083a67d33d2688e35 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 5 Jun 2018 18:42:32 +0900 Subject: [PATCH 
51/56] Updated UDF documents --- .../hivemall/sketch/bloom/BloomAndUDF.java | 3 +- .../sketch/bloom/BloomContainsAnyUDF.java | 22 +++++++++++++- .../sketch/bloom/BloomContainsUDF.java | 30 ++++++++++++++++++- .../sketch/bloom/BloomFilterUDAF.java | 12 +++++++- .../hivemall/sketch/bloom/BloomNotUDF.java | 3 +- .../hivemall/sketch/bloom/BloomOrUDF.java | 3 +- 6 files changed, 67 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java index 87769da4b..f723acd79 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java @@ -31,7 +31,8 @@ import org.apache.hadoop.util.bloom.Filter; @Description(name = "bloom_and", - value = "_FUNC_(string bloom1, string bloom2) - Returns the logical AND of two bloom filters") + value = "_FUNC_(string bloom1, string bloom2) - Returns the logical AND of two bloom filters", + extended = "SELECT bloom_and(bf1, bf2) FROM xxx;") @UDFType(deterministic = true, stateful = false) public final class BloomAndUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java index c10c52791..d8ff28b41 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java @@ -33,8 +33,28 @@ import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; +//@formatter:off @Description(name = "bloom_contains_any", - value = "_FUNC_(string bloom, string key) - Returns true if the bloom filter contains any of the given key") + value = "_FUNC_(string bloom, string key) or _FUNC_(string bloom, array keys)" + + "- Returns true if the bloom filter contains any of the given key", + extended = "WITH data1 as (\n" + + " SELECT explode(array(1,2,3,4,5)) as id\n" + + "),\n" + + "data2 as 
(\n" + + " SELECT explode(array(1,3,5,6,8)) as id\n" + + "),\n" + + "bloom as (\n" + + " SELECT bloom(id) as bf\n" + + " FROM data1\n" + + ")\n" + + "SELECT \n" + + " l.* \n" + + "FROM \n" + + " data2 l\n" + + " CROSS JOIN bloom r\n" + + "WHERE\n" + + " bloom_contains_any(r.bf, array(l.id))") +//@formatter:on @UDFType(deterministic = true, stateful = false) public final class BloomContainsAnyUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java index 449364e2d..f76b85dcf 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java @@ -33,9 +33,37 @@ import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; +//@formatter:off @Description(name = "bloom_contains", value = "_FUNC_(string bloom, string key) or _FUNC_(string bloom, array keys)" - + " - Returns true if the bloom filter contains all the given key(s). Returns false if key is null.") + + " - Returns true if the bloom filter contains all the given key(s). 
Returns false if key is null.", + extended = "WITH satisfied_movies as (\n" + + " SELECT bloom(movieid) as movies\n" + + " FROM (\n" + + " SELECT movieid\n" + + " FROM ratings\n" + + " GROUP BY movieid\n" + + " HAVING avg(rating) >= 4.0\n" + + " ) t\n" + + ")\n" + + "SELECT\n" + + " l.rating,\n" + + " count(distinct l.userid) as cnt\n" + + "FROM\n" + + " ratings l \n" + + " CROSS JOIN satisfied_movies r\n" + + "WHERE\n" + + " bloom_contains(r.movies, l.movieid) -- includes false positive\n" + + "GROUP BY \n" + + " l.rating;\n" + + "\n" + + "l.rating cnt\n" + + "1 1296\n" + + "2 2770\n" + + "3 5008\n" + + "4 5824\n" + + "5 5925") +//@formatter:on @UDFType(deterministic = true, stateful = false) public final class BloomContainsUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java index 5276ab2fb..a76a60c49 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java @@ -32,8 +32,18 @@ import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; +//@formatter:off @Description(name = "bloom", - value = "_FUNC_(string key) - Constructs a BloomFilter by aggregating a set of keys") + value = "_FUNC_(string key) - Constructs a BloomFilter by aggregating a set of keys", + extended = "CREATE TABLE satisfied_movies AS \n" + + " SELECT bloom(movieid) as movies\n" + + " FROM (\n" + + " SELECT movieid\n" + + " FROM ratings\n" + + " GROUP BY movieid\n" + + " HAVING avg(rating) >= 4.0\n" + + " ) t;") +//@formatter:on @SuppressWarnings("deprecation") public final class BloomFilterUDAF extends UDAF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java index cd385e357..1a074b015 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java +++ 
b/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java @@ -31,7 +31,8 @@ import org.apache.hadoop.util.bloom.Filter; @Description(name = "bloom_not", - value = "_FUNC_(string bloom) - Returns the logical NOT of a bloom filters") + value = "_FUNC_(string bloom) - Returns the logical NOT of a bloom filters", + extended = "SELECT bloom_not(bf) FROM xxx;") @UDFType(deterministic = true, stateful = false) public final class BloomNotUDF extends UDF { diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java index 7d2980e4d..0fcb53902 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java @@ -31,7 +31,8 @@ import org.apache.hadoop.util.bloom.Filter; @Description(name = "bloom_or", - value = "_FUNC_(string bloom1, string bloom2) - Returns the logical OR of two bloom filters") + value = "_FUNC_(string bloom1, string bloom2) - Returns the logical OR of two bloom filters", + extended = "SELECT bloom_or(bf1, bf2) FROM xxx;") @UDFType(deterministic = true, stateful = false) public final class BloomOrUDF extends UDF { From 906009abcfcc6283204216948106442a8c136aa3 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 6 Jun 2018 16:18:45 +0900 Subject: [PATCH 52/56] Fixed DDLs for array_to_str UDF --- resources/ddl/define-all-as-permanent.hive | 5 ++++- resources/ddl/define-all.hive | 5 ++++- resources/ddl/define-all.spark | 5 ++++- resources/ddl/define-udfs.td.hql | 4 +++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 4138f26fa..fcca58370 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -796,7 +796,7 @@ DROP FUNCTION IF EXISTS map_exclude_keys; CREATE FUNCTION map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF' USING JAR '${hivemall_jar}'; DROP FUNCTION IF 
EXISTS array_to_str; -CREATE FUNCTION array_to_str as 'hadoop.tools.array.ArrayToStrUDF' USING JAR '${hivemall_jar}'; +CREATE FUNCTION array_to_str as 'hivemall.tools.array.ArrayToStrUDF' USING JAR '${hivemall_jar}'; DROP FUNCTION IF EXISTS map_index; CREATE FUNCTION map_index as 'hivemall.tools.map.MapIndexUDF' USING JAR '${hivemall_jar}'; @@ -845,3 +845,6 @@ CREATE FUNCTION bloom_or as 'hivemall.sketch.bloom.BloomOrUDF' USING JAR '${hive DROP FUNCTION IF EXISTS bloom_contains_any; CREATE FUNCTION bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS conditional_emit; +CREATE FUNCTION conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 1d9791504..1c71ea558 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -802,7 +802,7 @@ drop temporary function if exists map_exclude_keys; create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; drop temporary function if exists array_to_str; -create temporary function array_to_str as 'hadoop.tools.array.ArrayToStrUDF'; +create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; drop temporary function if exists map_index; create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; @@ -851,3 +851,6 @@ create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; drop temporary function if exists bloom_contains_any; create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; + +drop temporary function if exists conditional_emit; +create temporary function conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index bb787bb20..d41a61a6d 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -755,7 
+755,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_exclude_keys") sqlContext.sql("CREATE TEMPORARY FUNCTION map_exclude_keys AS 'hivemall.tools.map.MapExcludeKeysUDF'") sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_to_str") -sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hadoop.tools.array.ArrayToStrUDF'") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hivemall.tools.array.ArrayToStrUDF'") sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_index") sqlContext.sql("CREATE TEMPORARY FUNCTION map_index AS 'hivemall.tools.map.MapIndexUDF'") @@ -804,3 +804,6 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_or AS 'hivemall.sketch.bloom.Blo sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_contains_any") sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_contains_any AS 'hivemall.sketch.bloom.BloomContainsAnyUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS conditional_emit") +sqlContext.sql("CREATE TEMPORARY FUNCTION conditional_emit AS 'hivemall.tools.array.ConditionalEmitUDTF'") diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index d6aa0bded..cb5b21ad0 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -219,7 +219,7 @@ create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeys create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; -create temporary function array_to_str as 'hadoop.tools.array.ArrayToStrUDF'; +create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; @@ -252,3 +252,5 @@ create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; + +create temporary function conditional_emit as 
'hivemall.tools.array.ConditionalEmitUDTF'; From e6bd7a5f32b3529975568c758283c907397887d3 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 6 Jun 2018 17:01:14 +0900 Subject: [PATCH 53/56] formatted TD DDLs --- resources/ddl/define-udfs.td.hql | 62 ++++++++++---------------------- 1 file changed, 18 insertions(+), 44 deletions(-) diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index cb5b21ad0..23523906a 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -155,7 +155,7 @@ create temporary function train_randomforest_regr as 'hivemall.smile.regression. create temporary function tree_predict as 'hivemall.smile.tools.TreePredictUDF'; create temporary function rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF'; create temporary function guess_attribute_types as 'hivemall.smile.tools.GuessAttributesUDF'; --- since Hivemall v0.5 +-- since Hivemall v0.5.0 create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF'; create temporary function sst as 'hivemall.anomaly.SingularSpectrumTransformUDF'; create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF'; @@ -183,74 +183,48 @@ create temporary function train_slim as 'hivemall.recommend.SlimUDTF'; create temporary function hitrate as 'hivemall.evaluation.HitRateUDAF'; create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF'; create temporary function approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; +-- since Hivemall v0.5.2 create temporary function array_slice as 'hivemall.tools.array.ArraySliceUDF'; - --- NLP features -create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; -create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF'; - --- Backward compatibilities -create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF'; -create temporary function pa2a_regress as 
'hivemall.regression.PassiveAggressiveRegressionUDTF$PA2a'; -create temporary function arow_regress as 'hivemall.regression.AROWRegressionUDTF'; -create temporary function addBias as 'hivemall.ftvec.AddBiasUDF'; -create temporary function tree_predict_v1 as 'hivemall.smile.tools.TreePredictUDFv1'; -create temporary function add_field_indicies as 'hivemall.ftvec.trans.AddFieldIndicesUDF'; -create temporary function subarray as 'hivemall.tools.array.ArraySliceUDF'; - --- alias for TD -create temporary function approx_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; - create temporary function try_cast as 'hivemall.tools.TryCastUDF'; - create temporary function array_append as 'hivemall.tools.array.ArrayAppendUDF'; - create temporary function element_at as 'hivemall.tools.array.ArrayElementAtUDF'; - create temporary function array_union as 'hivemall.tools.array.ArrayUnionUDF'; - create temporary function first_element as 'hivemall.tools.array.FirstElementUDF'; - create temporary function last_element as 'hivemall.tools.array.LastElementUDF'; - create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF'; - create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; - create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; - create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; - create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; - create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; - create temporary function sessionize as 'hivemall.tools.datetime.SessionizeUDF'; - create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; - create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; - create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; - create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; - create temporary function moving_avg as 
'hivemall.tools.timeseries.MovingAverageUDTF'; - create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; - create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; - create temporary function bloom as 'hivemall.sketch.bloom.BloomFilterUDAF'; - create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; - create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF'; - create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; - create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; - create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; - create temporary function conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF'; + +-- NLP features +create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; +create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF'; + +-- Backward compatibilities +create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF'; +create temporary function pa2a_regress as 'hivemall.regression.PassiveAggressiveRegressionUDTF$PA2a'; +create temporary function arow_regress as 'hivemall.regression.AROWRegressionUDTF'; +create temporary function addBias as 'hivemall.ftvec.AddBiasUDF'; +create temporary function tree_predict_v1 as 'hivemall.smile.tools.TreePredictUDFv1'; +create temporary function add_field_indicies as 'hivemall.ftvec.trans.AddFieldIndicesUDF'; +create temporary function subarray as 'hivemall.tools.array.ArraySliceUDF'; + +-- alias for TD +create temporary function approx_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; From bf8183021e368c435eee7a4d2a32f77de0feb848 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 6 Jun 2018 17:15:09 +0900 Subject: [PATCH 54/56] Formatted DDLs --- resources/ddl/define-all-as-permanent.hive | 178 ++++++++++---------- resources/ddl/define-all.hive | 180 
+++++++++++---------- resources/ddl/define-all.spark | 138 ++++++++-------- 3 files changed, 266 insertions(+), 230 deletions(-) diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index fcca58370..5c257c584 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -435,6 +435,30 @@ CREATE FUNCTION array_intersect as 'hivemall.tools.array.ArrayIntersectUDF' USIN DROP FUNCTION IF EXISTS select_k_best; CREATE FUNCTION select_k_best as 'hivemall.tools.array.SelectKBestUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS array_append; +CREATE FUNCTION array_append as 'hivemall.tools.array.ArrayAppendUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS element_at; +CREATE FUNCTION element_at as 'hivemall.tools.array.ArrayElementAtUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_union; +CREATE FUNCTION array_union as 'hivemall.tools.array.ArrayUnionUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS first_element; +CREATE FUNCTION first_element as 'hivemall.tools.array.FirstElementUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS last_element; +CREATE FUNCTION last_element as 'hivemall.tools.array.LastElementUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_flatten; +CREATE FUNCTION array_flatten as 'hivemall.tools.array.ArrayFlattenUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_to_str; +CREATE FUNCTION array_to_str as 'hivemall.tools.array.ArrayToStrUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS conditional_emit; +CREATE FUNCTION conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF' USING JAR '${hivemall_jar}'; + ----------------------------- -- bit operation functions -- ----------------------------- @@ -477,6 +501,18 @@ CREATE FUNCTION to_map as 'hivemall.tools.map.UDAFToMap' USING JAR '${hivemall_j DROP FUNCTION IF EXISTS to_ordered_map; CREATE FUNCTION 
to_ordered_map as 'hivemall.tools.map.UDAFToOrderedMap' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS map_include_keys; +CREATE FUNCTION map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_exclude_keys; +CREATE FUNCTION map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_index; +CREATE FUNCTION map_index as 'hivemall.tools.map.MapIndexUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_key_values; +CREATE FUNCTION map_key_values as 'hivemall.tools.map.MapKeyValuesUDF' USING JAR '${hivemall_jar}'; + --------------------- -- list functions -- --------------------- @@ -494,13 +530,19 @@ CREATE FUNCTION sigmoid as 'hivemall.tools.math.SigmoidGenericUDF' USING JAR '${ DROP FUNCTION IF EXISTS l2_norm; CREATE FUNCTION l2_norm as 'hivemall.tools.math.L2NormUDAF' USING JAR '${hivemall_jar}'; ----------------------- --- Matrix functions -- ----------------------- +----------------------------- +-- Matrix/Vector functions -- +----------------------------- DROP FUNCTION IF EXISTS transpose_and_dot; CREATE FUNCTION transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS vector_add; +CREATE FUNCTION vector_add as 'hivemall.tools.vector.VectorAddUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS vector_dot; +CREATE FUNCTION vector_dot as 'hivemall.tools.vector.VectorDotUDF' USING JAR '${hivemall_jar}'; + ---------------------- -- mapred functions -- ---------------------- @@ -523,6 +565,26 @@ CREATE FUNCTION distcache_gets as 'hivemall.tools.mapred.DistributedCacheLookupU DROP FUNCTION IF EXISTS jobconf_gets; CREATE FUNCTION jobconf_gets as 'hivemall.tools.mapred.JobConfGetsUDF' USING JAR '${hivemall_jar}'; +-------------------- +-- JSON functions -- +-------------------- + +DROP FUNCTION IF EXISTS to_json; +CREATE FUNCTION to_json as 
'hivemall.tools.json.ToJsonUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS from_json; +CREATE FUNCTION from_json as 'hivemall.tools.json.FromJsonUDF' USING JAR '${hivemall_jar}'; + +---------------------------- +-- Sanity Check functions -- +---------------------------- + +DROP FUNCTION IF EXISTS assert; +CREATE FUNCTION assert as 'hivemall.tools.sanity.AssertUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS raise_error; +CREATE FUNCTION raise_error as 'hivemall.tools.sanity.RaiseErrorUDF' USING JAR '${hivemall_jar}'; + -------------------- -- misc functions -- -------------------- @@ -539,6 +601,15 @@ CREATE FUNCTION x_rank as 'hivemall.tools.RankSequenceUDF' USING JAR '${hivemall DROP FUNCTION IF EXISTS each_top_k; CREATE FUNCTION each_top_k as 'hivemall.tools.EachTopKUDTF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS try_cast; +CREATE FUNCTION try_cast as 'hivemall.tools.TryCastUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS sessionize; +CREATE FUNCTION sessionize as 'hivemall.tools.datetime.SessionizeUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS moving_avg; +CREATE FUNCTION moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF' USING JAR '${hivemall_jar}'; + ------------------------------- -- Text processing functions -- ------------------------------- @@ -749,84 +820,9 @@ CREATE FUNCTION train_slim as 'hivemall.recommend.SlimUDTF' USING JAR '${hivemal DROP FUNCTION IF EXISTS approx_count_distinct; CREATE FUNCTION approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF' USING JAR '${hivemall_jar}'; ------------------------------- --- XGBoost related features -- ------------------------------- - -DROP FUNCTION train_xgboost_regr; -CREATE FUNCTION train_xgboost_regr AS 'hivemall.xgboost.regression.XGBoostRegressionUDTF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION train_xgboost_classifier; -CREATE FUNCTION train_xgboost_classifier AS 
'hivemall.xgboost.classification.XGBoostBinaryClassifierUDTF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION train_multiclass_xgboost_classifier; -CREATE FUNCTION train_multiclass_xgboost_classifier AS 'hivemall.xgboost.classification.XGBoostMulticlassClassifierUDTF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION xgboost_predict; -CREATE FUNCTION xgboost_predict AS 'hivemall.xgboost.tools.XGBoostPredictUDTF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION xgboost_multiclass_predict; -CREATE FUNCTION xgboost_multiclass_predict AS 'hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS try_cast; -CREATE FUNCTION try_cast as 'hivemall.tools.TryCastUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS array_append; -CREATE FUNCTION array_append as 'hivemall.tools.array.ArrayAppendUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS element_at; -CREATE FUNCTION element_at as 'hivemall.tools.array.ArrayElementAtUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS array_union; -CREATE FUNCTION array_union as 'hivemall.tools.array.ArrayUnionUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS first_element; -CREATE FUNCTION first_element as 'hivemall.tools.array.FirstElementUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS last_element; -CREATE FUNCTION last_element as 'hivemall.tools.array.LastElementUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS array_flatten; -CREATE FUNCTION array_flatten as 'hivemall.tools.array.ArrayFlattenUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS map_include_keys; -CREATE FUNCTION map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS map_exclude_keys; -CREATE FUNCTION map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS array_to_str; -CREATE FUNCTION array_to_str as 'hivemall.tools.array.ArrayToStrUDF' 
USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS map_index; -CREATE FUNCTION map_index as 'hivemall.tools.map.MapIndexUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS map_key_values; -CREATE FUNCTION map_key_values as 'hivemall.tools.map.MapKeyValuesUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS sessionize; -CREATE FUNCTION sessionize as 'hivemall.tools.datetime.SessionizeUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS to_json; -CREATE FUNCTION to_json as 'hivemall.tools.json.ToJsonUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS from_json; -CREATE FUNCTION from_json as 'hivemall.tools.json.FromJsonUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS assert; -CREATE FUNCTION assert as 'hivemall.tools.sanity.AssertUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS raise_error; -CREATE FUNCTION raise_error as 'hivemall.tools.sanity.RaiseErrorUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS moving_avg; -CREATE FUNCTION moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS vector_add; -CREATE FUNCTION vector_add as 'hivemall.tools.vector.VectorAddUDF' USING JAR '${hivemall_jar}'; - -DROP FUNCTION IF EXISTS vector_dot; -CREATE FUNCTION vector_dot as 'hivemall.tools.vector.VectorDotUDF' USING JAR '${hivemall_jar}'; +------------------ +-- Bloom Filter -- +------------------ DROP FUNCTION IF EXISTS bloom; CREATE FUNCTION bloom as 'hivemall.sketch.bloom.BloomFilterUDAF' USING JAR '${hivemall_jar}'; @@ -846,5 +842,21 @@ CREATE FUNCTION bloom_or as 'hivemall.sketch.bloom.BloomOrUDF' USING JAR '${hive DROP FUNCTION IF EXISTS bloom_contains_any; CREATE FUNCTION bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF' USING JAR '${hivemall_jar}'; -DROP FUNCTION IF EXISTS conditional_emit; -CREATE FUNCTION conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF' USING JAR '${hivemall_jar}'; 
+------------------------------ +-- XGBoost related features -- +------------------------------ + +DROP FUNCTION train_xgboost_regr; +CREATE FUNCTION train_xgboost_regr AS 'hivemall.xgboost.regression.XGBoostRegressionUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION train_xgboost_classifier; +CREATE FUNCTION train_xgboost_classifier AS 'hivemall.xgboost.classification.XGBoostBinaryClassifierUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION train_multiclass_xgboost_classifier; +CREATE FUNCTION train_multiclass_xgboost_classifier AS 'hivemall.xgboost.classification.XGBoostMulticlassClassifierUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION xgboost_predict; +CREATE FUNCTION xgboost_predict AS 'hivemall.xgboost.tools.XGBoostPredictUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION xgboost_multiclass_predict; +CREATE FUNCTION xgboost_multiclass_predict AS 'hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF' USING JAR '${hivemall_jar}'; diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 1c71ea558..fbb3ed211 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -427,6 +427,30 @@ create temporary function array_intersect as 'hivemall.tools.array.ArrayIntersec drop temporary function if exists select_k_best; create temporary function select_k_best as 'hivemall.tools.array.SelectKBestUDF'; +drop temporary function if exists array_append; +create temporary function array_append as 'hivemall.tools.array.ArrayAppendUDF'; + +drop temporary function if exists element_at; +create temporary function element_at as 'hivemall.tools.array.ArrayElementAtUDF'; + +drop temporary function if exists array_union; +create temporary function array_union as 'hivemall.tools.array.ArrayUnionUDF'; + +drop temporary function if exists first_element; +create temporary function first_element as 'hivemall.tools.array.FirstElementUDF'; + +drop temporary function if exists last_element; +create temporary function last_element as 
'hivemall.tools.array.LastElementUDF'; + +drop temporary function if exists array_flatten; +create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF'; + +drop temporary function if exists array_to_str; +create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; + +drop temporary function if exists conditional_emit; +create temporary function conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF'; + ----------------------------- -- bit operation functions -- ----------------------------- @@ -469,6 +493,18 @@ create temporary function to_map as 'hivemall.tools.map.UDAFToMap'; drop temporary function if exists to_ordered_map; create temporary function to_ordered_map as 'hivemall.tools.map.UDAFToOrderedMap'; +drop temporary function if exists map_include_keys; +create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; + +drop temporary function if exists map_exclude_keys; +create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; + +drop temporary function if exists map_index; +create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; + +drop temporary function if exists map_key_values; +create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; + --------------------- -- list functions -- --------------------- @@ -486,13 +522,19 @@ create temporary function sigmoid as 'hivemall.tools.math.SigmoidGenericUDF'; drop temporary function if exists l2_norm; create temporary function l2_norm as 'hivemall.tools.math.L2NormUDAF'; ----------------------- --- Matrix functions -- ----------------------- +----------------------------- +-- Matrix/Vector functions -- +----------------------------- drop temporary function if exists transpose_and_dot; create temporary function transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF'; +drop temporary function if exists vector_add; +create temporary function vector_add as 
'hivemall.tools.vector.VectorAddUDF'; + +drop temporary function if exists vector_dot; +create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; + ---------------------- -- mapred functions -- ---------------------- @@ -515,6 +557,26 @@ create temporary function distcache_gets as 'hivemall.tools.mapred.DistributedCa drop temporary function if exists jobconf_gets; create temporary function jobconf_gets as 'hivemall.tools.mapred.JobConfGetsUDF'; +-------------------- +-- JSON functions -- +-------------------- + +drop temporary function if exists to_json; +create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; + +drop temporary function if exists from_json; +create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; + +---------------------------- +-- Sanity Check functions -- +---------------------------- + +drop temporary function if exists assert; +create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; + +drop temporary function if exists raise_error; +create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; + -------------------- -- misc functions -- -------------------- @@ -531,6 +593,15 @@ create temporary function x_rank as 'hivemall.tools.RankSequenceUDF'; drop temporary function if exists each_top_k; create temporary function each_top_k as 'hivemall.tools.EachTopKUDTF'; +drop temporary function if exists try_cast; +create temporary function try_cast as 'hivemall.tools.TryCastUDF'; + +drop temporary function if exists sessionize; +create temporary function sessionize as 'hivemall.tools.datetime.SessionizeUDF'; + +drop temporary function if exists moving_avg; +create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; + ------------------------------- -- Text processing functions -- ------------------------------- @@ -741,6 +812,28 @@ create temporary function train_slim as 'hivemall.recommend.SlimUDTF'; drop temporary function if exists 
approx_count_distinct; create temporary function approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; +------------------ +-- Bloom Filter -- +------------------ + +drop temporary function if exists bloom; +create temporary function bloom as 'hivemall.sketch.bloom.BloomFilterUDAF'; + +drop temporary function if exists bloom_and; +create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; + +drop temporary function if exists bloom_contains; +create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF'; + +drop temporary function if exists bloom_not; +create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; + +drop temporary function if exists bloom_or; +create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; + +drop temporary function if exists bloom_contains_any; +create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; + -------------------------------------------------------------------------------------------------- -- macros available from hive 0.12.0 -- see https://issues.apache.org/jira/browse/HIVE-2655 @@ -773,84 +866,3 @@ log(10, n_docs / max2(1,df_t)) + 1.0; create temporary macro tfidf(tf FLOAT, df_t DOUBLE, n_docs DOUBLE) tf * (log(10, n_docs / max2(1,df_t)) + 1.0); - -drop temporary function if exists try_cast; -create temporary function try_cast as 'hivemall.tools.TryCastUDF'; - -drop temporary function if exists array_append; -create temporary function array_append as 'hivemall.tools.array.ArrayAppendUDF'; - -drop temporary function if exists element_at; -create temporary function element_at as 'hivemall.tools.array.ArrayElementAtUDF'; - -drop temporary function if exists array_union; -create temporary function array_union as 'hivemall.tools.array.ArrayUnionUDF'; - -drop temporary function if exists first_element; -create temporary function first_element as 'hivemall.tools.array.FirstElementUDF'; - -drop temporary 
function if exists last_element; -create temporary function last_element as 'hivemall.tools.array.LastElementUDF'; - -drop temporary function if exists array_flatten; -create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF'; - -drop temporary function if exists map_include_keys; -create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; - -drop temporary function if exists map_exclude_keys; -create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; - -drop temporary function if exists array_to_str; -create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; - -drop temporary function if exists map_index; -create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; - -drop temporary function if exists map_key_values; -create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; - -drop temporary function if exists sessionize; -create temporary function sessionize as 'hivemall.tools.datetime.SessionizeUDF'; - -drop temporary function if exists to_json; -create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; - -drop temporary function if exists from_json; -create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; - -drop temporary function if exists assert; -create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; - -drop temporary function if exists raise_error; -create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; - -drop temporary function if exists moving_avg; -create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; - -drop temporary function if exists vector_add; -create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; - -drop temporary function if exists vector_dot; -create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; - -drop temporary function if exists bloom; -create temporary 
function bloom as 'hivemall.sketch.bloom.BloomFilterUDAF'; - -drop temporary function if exists bloom_and; -create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; - -drop temporary function if exists bloom_contains; -create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF'; - -drop temporary function if exists bloom_not; -create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; - -drop temporary function if exists bloom_or; -create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; - -drop temporary function if exists bloom_contains_any; -create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; - -drop temporary function if exists conditional_emit; -create temporary function conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF'; diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index d41a61a6d..e78a966eb 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -425,6 +425,30 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION array_intersect AS 'hivemall.tools.arr sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS select_k_best") sqlContext.sql("CREATE TEMPORARY FUNCTION select_k_best AS 'hivemall.tools.array.SelectKBestUDF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_append") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_append AS 'hivemall.tools.array.ArrayAppendUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS element_at") +sqlContext.sql("CREATE TEMPORARY FUNCTION element_at AS 'hivemall.tools.array.ArrayElementAtUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_union") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_union AS 'hivemall.tools.array.ArrayUnionUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS first_element") +sqlContext.sql("CREATE TEMPORARY FUNCTION first_element AS 'hivemall.tools.array.FirstElementUDF'") + 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS last_element") +sqlContext.sql("CREATE TEMPORARY FUNCTION last_element AS 'hivemall.tools.array.LastElementUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_flatten") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_flatten AS 'hivemall.tools.array.ArrayFlattenUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_to_str") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hivemall.tools.array.ArrayToStrUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS conditional_emit") +sqlContext.sql("CREATE TEMPORARY FUNCTION conditional_emit AS 'hivemall.tools.array.ConditionalEmitUDTF'") + /** * Bit operation functions */ @@ -467,6 +491,18 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION to_map AS 'hivemall.tools.map.UDAFToMa sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS to_ordered_map") sqlContext.sql("CREATE TEMPORARY FUNCTION to_ordered_map AS 'hivemall.tools.map.UDAFToOrderedMap'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_include_keys") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_include_keys AS 'hivemall.tools.map.MapIncludeKeysUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_exclude_keys") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_exclude_keys AS 'hivemall.tools.map.MapExcludeKeysUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_index") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_index AS 'hivemall.tools.map.MapIndexUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_key_values") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_key_values AS 'hivemall.tools.map.MapKeyValuesUDF'") + /** * List functions */ @@ -485,12 +521,18 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS l2_norm") sqlContext.sql("CREATE TEMPORARY FUNCTION l2_norm AS 'hivemall.tools.math.L2NormUDAF'") /** - * Matrix functions + * Matrix/Vector functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS transpose_and_dot") 
sqlContext.sql("CREATE TEMPORARY FUNCTION transpose_and_dot AS 'hivemall.tools.matrix.TransposeAndDotUDAF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_add") +sqlContext.sql("CREATE TEMPORARY FUNCTION vector_add AS 'hivemall.tools.vector.VectorAddUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_dot") +sqlContext.sql("CREATE TEMPORARY FUNCTION vector_dot AS 'hivemall.tools.vector.VectorDotUDF'") + /** * MAPRED functions */ @@ -498,6 +540,26 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION transpose_and_dot AS 'hivemall.tools.m sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS rowid") sqlContext.sql("CREATE TEMPORARY FUNCTION rowid AS 'hivemall.tools.mapred.RowIdUDFWrapper'") +/** + * JSON functions + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS to_json") +sqlContext.sql("CREATE TEMPORARY FUNCTION to_json AS 'hivemall.tools.json.ToJsonUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS from_json") +sqlContext.sql("CREATE TEMPORARY FUNCTION from_json AS 'hivemall.tools.json.FromJsonUDF'") + +/** + * Sanity Check functions + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS assert") +sqlContext.sql("CREATE TEMPORARY FUNCTION assert AS 'hivemall.tools.sanity.AssertUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS raise_error") +sqlContext.sql("CREATE TEMPORARY FUNCTION raise_error AS 'hivemall.tools.sanity.RaiseErrorUDF'") + /** * MISC functions */ @@ -514,6 +576,15 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION x_rank AS 'hivemall.tools.RankSequence sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS each_top_k") sqlContext.sql("CREATE TEMPORARY FUNCTION each_top_k AS 'hivemall.tools.EachTopKUDTF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS try_cast") +sqlContext.sql("CREATE TEMPORARY FUNCTION try_cast AS 'hivemall.tools.TryCastUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS sessionize") +sqlContext.sql("CREATE TEMPORARY FUNCTION sessionize AS 'hivemall.tools.datetime.SessionizeUDF'") + 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS moving_avg") +sqlContext.sql("CREATE TEMPORARY FUNCTION moving_avg AS 'hivemall.tools.timeseries.MovingAverageUDTF'") + /** * Text processing functions */ @@ -727,65 +798,9 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS approx_count_distinct") sqlContext.sql("CREATE TEMPORARY FUNCTION approx_count_distinct AS 'hivemall.sketch.hll.ApproxCountDistinctUDAF'") -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS try_cast") -sqlContext.sql("CREATE TEMPORARY FUNCTION try_cast AS 'hivemall.tools.TryCastUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_append") -sqlContext.sql("CREATE TEMPORARY FUNCTION array_append AS 'hivemall.tools.array.ArrayAppendUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS element_at") -sqlContext.sql("CREATE TEMPORARY FUNCTION element_at AS 'hivemall.tools.array.ArrayElementAtUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_union") -sqlContext.sql("CREATE TEMPORARY FUNCTION array_union AS 'hivemall.tools.array.ArrayUnionUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS first_element") -sqlContext.sql("CREATE TEMPORARY FUNCTION first_element AS 'hivemall.tools.array.FirstElementUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS last_element") -sqlContext.sql("CREATE TEMPORARY FUNCTION last_element AS 'hivemall.tools.array.LastElementUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_flatten") -sqlContext.sql("CREATE TEMPORARY FUNCTION array_flatten AS 'hivemall.tools.array.ArrayFlattenUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_include_keys") -sqlContext.sql("CREATE TEMPORARY FUNCTION map_include_keys AS 'hivemall.tools.map.MapIncludeKeysUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_exclude_keys") -sqlContext.sql("CREATE TEMPORARY FUNCTION map_exclude_keys AS 'hivemall.tools.map.MapExcludeKeysUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_to_str") 
-sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hivemall.tools.array.ArrayToStrUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_index") -sqlContext.sql("CREATE TEMPORARY FUNCTION map_index AS 'hivemall.tools.map.MapIndexUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_key_values") -sqlContext.sql("CREATE TEMPORARY FUNCTION map_key_values AS 'hivemall.tools.map.MapKeyValuesUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS sessionize") -sqlContext.sql("CREATE TEMPORARY FUNCTION sessionize AS 'hivemall.tools.datetime.SessionizeUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS to_json") -sqlContext.sql("CREATE TEMPORARY FUNCTION to_json AS 'hivemall.tools.json.ToJsonUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS from_json") -sqlContext.sql("CREATE TEMPORARY FUNCTION from_json AS 'hivemall.tools.json.FromJsonUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS assert") -sqlContext.sql("CREATE TEMPORARY FUNCTION assert AS 'hivemall.tools.sanity.AssertUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS raise_error") -sqlContext.sql("CREATE TEMPORARY FUNCTION raise_error AS 'hivemall.tools.sanity.RaiseErrorUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS moving_avg") -sqlContext.sql("CREATE TEMPORARY FUNCTION moving_avg AS 'hivemall.tools.timeseries.MovingAverageUDTF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_add") -sqlContext.sql("CREATE TEMPORARY FUNCTION vector_add AS 'hivemall.tools.vector.VectorAddUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_dot") -sqlContext.sql("CREATE TEMPORARY FUNCTION vector_dot AS 'hivemall.tools.vector.VectorDotUDF'") +/** + * Bloom Filter + */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom") sqlContext.sql("CREATE TEMPORARY FUNCTION bloom AS 'hivemall.sketch.bloom.BloomFilterUDAF'") @@ -804,6 +819,3 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_or AS 'hivemall.sketch.bloom.Blo 
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_contains_any") sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_contains_any AS 'hivemall.sketch.bloom.BloomContainsAnyUDF'") - -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS conditional_emit") -sqlContext.sql("CREATE TEMPORARY FUNCTION conditional_emit AS 'hivemall.tools.array.ConditionalEmitUDTF'") From 5d731012111a9dda775223f5a497ff0fb3777cd5 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 6 Jun 2018 17:24:19 +0900 Subject: [PATCH 55/56] Updated DDL usage in tutorial --- docs/gitbook/misc/generic_funcs.md | 93 +++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index 28bb341c5..343a64a62 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -24,6 +24,13 @@ This page describes a list of useful Hivemall generic functions. See also a [lis # Array - `array_append(array arr, T elem)` - Append an element to the end of an array + ```sql + SELECT array_append(array(1,2),3); + 1,2,3 + + SELECT array_append(array('a','b'),'c'); + "a","b","c" + ``` - `array_avg(array)` - Returns an array<double> in which each element is the mean of a set of numbers @@ -34,6 +41,10 @@ This page describes a list of useful Hivemall generic functions. See also a [lis ``` - `array_flatten(array>)` - Returns an array with the elements flattened. + ```sql + SELECT array_flatten(array(array(1,2,3),array(4,5),array(6,7,8))); + [1,2,3,4,5,6,7,8] + ``` - `array_intersect(array x1, array x2, ..)` - Returns an intersect of given arrays ```sql @@ -94,7 +105,20 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis - `array_sum(array)` - Returns an array<double> in which each element is summed up +- `array_to_str(array arr [, string sep=','])` - Convert array to string using a separator + ```sql + SELECT array_to_str(array(1,2,3),'-'); + 1-2-3 + ``` + - `array_union(array1, array2, ...)` - Returns the union of a set of arrays + ```sql + SELECT array_union(array(1,2),array(1,2)); + [1,2] + + SELECT array_union(array(1,2),array(2,3),array(2,5)); + [1,2,3,5] + ``` - `conditional_emit(array conditions, array features)` - Emit features of a row according to various conditions ```sql @@ -116,12 +140,30 @@ This page describes a list of useful Hivemall generic functions. See also a [lis ``` - `element_at(array list, int pos)` - Returns an element at the given position + ```sql + SELECT element_at(array(1,2,3,4),0); + 1 + + SELECT element_at(array(1,2,3,4),-2); + 3 + ``` -- `first_element(x)` - Returns the first element in an array +- `first_element(x)` - Returns the first element in an array + ```sql + SELECT first_element(array('a','b','c')); + a + + SELECT first_element(array()); + NULL + ``` - `float_array(nDims)` - Returns an array<float> of nDims elements - `last_element(x)` - Return the last element in an array + ```sql + SELECT last_element(array('a','b','c')); + c + ``` - `select_k_best(array array, const array importance, const int k)` - Returns selected top-k elements as array<double>
See also a [lis # Map +- `map_exclude_keys(Map map, array filteringKeys)` - Returns the filtered entries of a map not having specified keys + ```sql + SELECT map_exclude_keys(map(1,'one',2,'two',3,'three'),array(2,3)); + {1:"one"} + ``` + - `map_get_sum(map src, array keys)` - Returns sum of values that are retrieved by keys +- `map_include_keys(Map map, array filteringKeys)` - Returns the filtered entries of a map having specified keys + ```sql + SELECT map_include_keys(map(1,'one',2,'two',3,'three'),array(2,3)); + {2:"two",3:"three"} + ``` + +- `map_index(a, n)` - Returns the n-th element of the given array + ```sql + WITH tmp as ( + SELECT "one" as key + UNION ALL + SELECT "two" as key + ) + SELECT map_index(map("one",1,"two",2),key) + FROM tmp; + + 1 + 2 + ``` + - `map_key_values(map)` - Returns a array of key-value pairs. + ```sql + SELECT map_key_values(map("one",1,"two",2)); + + [{"key":"one","value":1},{"key":"two","value":2}] + ``` - `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted array of SRC @@ -418,8 +491,15 @@ This page describes a list of useful Hivemall generic functions. See also a [lis # Sanity Checks - `assert(boolean condition)` or _FUNC_(boolean condition, string errMsg)- Throws HiveException if condition is not met + ```sql + SELECT count(1) FROM stock_price WHERE assert(price > 0.0); + SELECT count(1) FROM stock_price WHERE assert(price > 0.0, 'price MUST be more than 0.0') + ``` - `raise_error()` or _FUNC_(string msg) - Throws an error + ```sql + SELECT product_id, price, raise_error('Found an invalid record') FROM xxx WHERE price < 0.0 + ``` # Text processing @@ -481,8 +561,19 @@ This page describes a list of useful Hivemall generic functions. See also a [lis # Vector - `vector_add(array x, array y)` - Perform vector ADD operation. + ```sql + SELECT vector_add(array(1.0,2.0,3.0), array(2, 3, 4)); + [3.0,5.0,7.0] + ``` - `vector_dot(array x, array y)` - Performs vector dot product. 
+ ```sql + SELECT vector_dot(array(1.0,2.0,3.0),array(2.0,3.0,4.0)); + 20 + + SELECT vector_dot(array(1.0,2.0,3.0),2); + [2.0,4.0,6.0] + ``` # Others From 849940cf4b9f555eb31f32079b490c537a0bfc2d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 6 Jun 2018 17:57:16 +0900 Subject: [PATCH 56/56] Updated user guide for bloom filter --- docs/gitbook/misc/funcs.md | 78 +++++++++++++++++++ .../hivemall/docs/FuncsListGeneratorMojo.java | 3 +- 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md index f98c82797..3449419f4 100644 --- a/docs/gitbook/misc/funcs.md +++ b/docs/gitbook/misc/funcs.md @@ -393,6 +393,84 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi - `approx_count_distinct(expr x [, const string options])` - Returns an approximation of count(DISTINCT x) using HyperLogLogPlus algorithm +- `bloom(string key)` - Constructs a BloomFilter by aggregating a set of keys + ```sql + CREATE TABLE satisfied_movies AS + SELECT bloom(movieid) as movies + FROM ( + SELECT movieid + FROM ratings + GROUP BY movieid + HAVING avg(rating) >= 4.0 + ) t; + ``` + +- `bloom_and(string bloom1, string bloom2)` - Returns the logical AND of two bloom filters + ```sql + SELECT bloom_and(bf1, bf2) FROM xxx; + ``` + +- `bloom_contains(string bloom, string key)` or _FUNC_(string bloom, array<string> keys) - Returns true if the bloom filter contains all the given key(s). Returns false if key is null. 
+ ```sql + WITH satisfied_movies as ( + SELECT bloom(movieid) as movies + FROM ( + SELECT movieid + FROM ratings + GROUP BY movieid + HAVING avg(rating) >= 4.0 + ) t + ) + SELECT + l.rating, + count(distinct l.userid) as cnt + FROM + ratings l + CROSS JOIN satisfied_movies r + WHERE + bloom_contains(r.movies, l.movieid) -- includes false positive + GROUP BY + l.rating; + + l.rating cnt + 1 1296 + 2 2770 + 3 5008 + 4 5824 + 5 5925 + ``` + +- `bloom_contains_any(string bloom, string key)` or _FUNC_(string bloom, array<string> keys)- Returns true if the bloom filter contains any of the given key + ```sql + WITH data1 as ( + SELECT explode(array(1,2,3,4,5)) as id + ), + data2 as ( + SELECT explode(array(1,3,5,6,8)) as id + ), + bloom as ( + SELECT bloom(id) as bf + FROM data1 + ) + SELECT + l.* + FROM + data2 l + CROSS JOIN bloom r + WHERE + bloom_contains_any(r.bf, array(l.id)) + ``` + +- `bloom_not(string bloom)` - Returns the logical NOT of a bloom filters + ```sql + SELECT bloom_not(bf) FROM xxx; + ``` + +- `bloom_or(string bloom1, string bloom2)` - Returns the logical OR of two bloom filters + ```sql + SELECT bloom_or(bf1, bf2) FROM xxx; + ``` + # Ensemble learning - `argmin_kld(float mean, float covar)` - Returns mean or covar that minimize a KL-distance among distributions diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java index ba0e07224..0d58b3f9d 100644 --- a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java +++ b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java @@ -134,7 +134,8 @@ public class FuncsListGeneratorMojo extends AbstractMojo { funcsHeaders.put("# Similarity measures", Collections.singletonList("hivemall.knn.similarity")); funcsHeaders.put("# Evaluation", Collections.singletonList("hivemall.evaluation")); - funcsHeaders.put("# Sketching", 
Collections.singletonList("hivemall.sketch.hll")); + funcsHeaders.put("# Sketching", + Arrays.asList("hivemall.sketch.hll", "hivemall.sketch.bloom")); funcsHeaders.put("# Ensemble learning", Collections.singletonList("hivemall.ensemble")); funcsHeaders.put("## Bagging", Collections.singletonList("hivemall.ensemble.bagging")); funcsHeaders.put("# Decision trees and RandomForest", Arrays.asList(