From e0b8f49ed6bb81e0570cdd8bb4976b48b9c27573 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Mon, 23 Jul 2018 20:30:27 +0800 Subject: [PATCH 01/16] add some simple math functions --- pom.xml | 2 +- .../functions/math/UDFMathFromBase.java | 39 +++++++++++++++++++ .../functions/math/UDFMathInfinity.java | 24 ++++++++++++ .../functions/math/UDFMathIsFinite.java | 29 ++++++++++++++ .../functions/math/UDFMathIsInfinite.java | 29 ++++++++++++++ .../functions/math/UDFMathIsNaN.java | 29 ++++++++++++++ .../functions/math/UDFMathNaN.java | 24 ++++++++++++ .../functions/math/UDFMathToBase.java | 33 ++++++++++++++++ .../shanruifeng/functions/utils/Failures.java | 16 ++++++++ .../functions/utils/MathUtils.java | 14 +++++++ 10 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathFromBase.java create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathInfinity.java create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathIsFinite.java create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathIsInfinite.java create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathIsNaN.java create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathNaN.java create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathToBase.java create mode 100644 src/main/java/cc/shanruifeng/functions/utils/Failures.java create mode 100644 src/main/java/cc/shanruifeng/functions/utils/MathUtils.java diff --git a/pom.xml b/pom.xml index fdebeb0..546c029 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ cc.shanruifeng hive-third-functions - 2.1.3 + 3.0.0 UTF-8 diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathFromBase.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathFromBase.java new file mode 100644 index 0000000..a61a223 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathFromBase.java @@ -0,0 +1,39 @@ +package 
cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import static cc.shanruifeng.functions.utils.MathUtils.checkRadix; +import static java.lang.String.format; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "from_base" + , value = "_FUNC_(string, long) - convert a string in the given base to a number." + , extended = "Example:\n > select _FUNC_(string, long) from src;") +public class UDFMathFromBase extends UDF { + private LongWritable result = new LongWritable(); + + public UDFMathFromBase() { + } + + public LongWritable evaluate(Text value, LongWritable radix) throws HiveException { + if (value == null || radix == null) { + return null; + } + + checkRadix(radix.get()); + try { + result.set(Long.parseLong(value.toString(), (int) radix.get())); + } + catch (NumberFormatException e) { + throw new HiveException(format("Not a valid base-%d number: %s", radix.get(), value.toString()), e); // pass the primitive long: %d rejects a LongWritable argument + } + return result; + } +} diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathInfinity.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathInfinity.java new file mode 100644 index 0000000..6c9799c --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathInfinity.java @@ -0,0 +1,24 @@ +package cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "infinity" + , value = "_FUNC_() - Infinity."
+ , extended = "Example:\n > select _FUNC_() from src;") +public class UDFMathInfinity extends UDF { + private DoubleWritable result = new DoubleWritable(); + + public UDFMathInfinity() { + } + + public DoubleWritable evaluate() { + result.set(Double.POSITIVE_INFINITY); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathIsFinite.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathIsFinite.java new file mode 100644 index 0000000..f0243a0 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathIsFinite.java @@ -0,0 +1,29 @@ +package cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "is_finite" + , value = "_FUNC_(double) - test if value is finite." + , extended = "Example:\n > select _FUNC_(double) from src;") +public class UDFMathIsFinite extends UDF { + BooleanWritable result = new BooleanWritable(); + + public UDFMathIsFinite() { + } + + public BooleanWritable evaluate(DoubleWritable num) { + if (num == null) { + result.set(false); + } else { + result.set(Double.isFinite(num.get())); + } + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathIsInfinite.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathIsInfinite.java new file mode 100644 index 0000000..765bce3 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathIsInfinite.java @@ -0,0 +1,29 @@ +package cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = 
"is_infinite" + , value = "_FUNC_(double) - test if value is infinite." + , extended = "Example:\n > select _FUNC_(double) from src;") +public class UDFMathIsInfinite extends UDF { + BooleanWritable result = new BooleanWritable(); + + public UDFMathIsInfinite() { + } + + public BooleanWritable evaluate(DoubleWritable num) { + if (num == null) { + result.set(false); + } else { + result.set(Double.isInfinite(num.get())); + } + return result; + } +} diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathIsNaN.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathIsNaN.java new file mode 100644 index 0000000..4338c85 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathIsNaN.java @@ -0,0 +1,29 @@ +package cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "is_nan" + , value = "_FUNC_(double) - test if value is nan." 
+ , extended = "Example:\n > select _FUNC_(double) from src;") +public class UDFMathIsNaN extends UDF { + BooleanWritable result = new BooleanWritable(); + + public UDFMathIsNaN() { + } + + public BooleanWritable evaluate(DoubleWritable num) { + if (num == null) { + result.set(false); + } else { + result.set(Double.isNaN(num.get())); + } + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathNaN.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathNaN.java new file mode 100644 index 0000000..1da5ce1 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathNaN.java @@ -0,0 +1,24 @@ +package cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "NaN" + , value = "_FUNC_() - constant representing not-a-number." + , extended = "Example:\n > select _FUNC_() from src;") +public class UDFMathNaN extends UDF { + private DoubleWritable result = new DoubleWritable(); + + public UDFMathNaN() { + } + + public DoubleWritable evaluate() { + result.set(Double.NaN); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathToBase.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathToBase.java new file mode 100644 index 0000000..ed2e2f9 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathToBase.java @@ -0,0 +1,33 @@ +package cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import static cc.shanruifeng.functions.utils.MathUtils.checkRadix; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = 
"to_base" + , value = "_FUNC_(long, long) - convert a number to a string in the given base." + , extended = "Example:\n > select _FUNC_(long, long) from src;") +public class UDFMathToBase extends UDF { + private Text result = new Text(); + + public UDFMathToBase() { + } + + public Text evaluate(LongWritable value, LongWritable radix) throws HiveException { + if (value == null || radix == null) { + return null; + } + + checkRadix(radix.get()); + result.set(Long.toString(value.get(), (int) radix.get())); + return result; + } +} diff --git a/src/main/java/cc/shanruifeng/functions/utils/Failures.java b/src/main/java/cc/shanruifeng/functions/utils/Failures.java new file mode 100644 index 0000000..354927c --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/utils/Failures.java @@ -0,0 +1,16 @@ +package cc.shanruifeng.functions.utils; + +import org.apache.hadoop.hive.ql.metadata.HiveException; + +import static com.google.common.collect.Sets.newIdentityHashSet; +import static java.lang.String.format; + +public class Failures { + private Failures() {} + + public static void checkCondition(boolean condition, String formatString, Object... 
args) throws HiveException { + if (!condition) { + throw new HiveException(format(formatString, args)); + } + } +} diff --git a/src/main/java/cc/shanruifeng/functions/utils/MathUtils.java b/src/main/java/cc/shanruifeng/functions/utils/MathUtils.java new file mode 100644 index 0000000..0fc88ae --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/utils/MathUtils.java @@ -0,0 +1,14 @@ +package cc.shanruifeng.functions.utils; + +import org.apache.hadoop.hive.ql.metadata.HiveException; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static java.lang.Character.MAX_RADIX; +import static java.lang.Character.MIN_RADIX; + +public class MathUtils { + public static void checkRadix(long radix) throws HiveException { + checkCondition(radix >= MIN_RADIX && radix <= MAX_RADIX, "Radix must be between %d and %d", MIN_RADIX, MAX_RADIX); + } + +} From 0057ce43673e99d89051c8e6b6b6407b1655f327 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Tue, 24 Jul 2018 13:58:49 +0800 Subject: [PATCH 02/16] add math cosine similarity functions, revise readme doc. --- README-zh.md | 26 +++ README.md | 26 ++- .../math/UDFMathCosineSimilarity.java | 151 ++++++++++++++++++ .../math/UDFMathCosineSimilarityTest.java | 51 ++++++ 4 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarity.java create mode 100644 src/test/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarityTest.java diff --git a/README-zh.md b/README-zh.md index fb060d2..6667c4c 100644 --- a/README-zh.md +++ b/README-zh.md @@ -145,6 +145,20 @@ mvn clean package -DskipTests |url_encode(value) -> string | escapes value by encoding it so that it can be safely included in URL query parameter names and values| |url_decode(value) -> string | unescape the URL encoded value. This function is the inverse of `url_encode`. | +### 10. 
数学函数 + +| function| description | +|:--|:--| +|infinity() -> double | 获取正无穷常数| +|is_finite(x) -> boolean | 判断x是否为有限数值| +|is_infinite(x) -> boolean |判断x是否为无穷数值| +|is_nan(x) -> boolean | 判断x是否为NaN(not-a-number)| +|nan() -> double | 获取一个表示NAN(not-a-number)的常数 | +|from_base(string, radix) -> bigint | 获取字面量的值,该值的基数为radix| +|to_base(x, radix) -> varchar | 返回x以radix为基数的字面量| +|cosine_similarity(x, y) -> double | 返回两个稀疏向量的余弦相似度| + + ## 用法 将下面这些内容写入 `${HOME}/.hiverc` 文件, 或者也可以按需在hive命令行环境中执行. @@ -205,6 +219,14 @@ create temporary function gcj_to_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjT create temporary function gcj_extract_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjExtractWgs'; create temporary function url_encode as 'cc.shanruifeng.functions.url.UDFUrlEncode'; create temporary function url_decode as 'cc.shanruifeng.functions.url.UDFUrlDecode'; +create temporary function infinity as 'cc.shanruifeng.functions.math.UDFMathInfinity'; +create temporary function is_finite as 'cc.shanruifeng.functions.math.UDFMathIsFinite'; +create temporary function is_infinite as 'cc.shanruifeng.functions.math.UDFMathIsInfinite'; +create temporary function is_nan as 'cc.shanruifeng.functions.math.UDFMathIsNaN'; +create temporary function nan as 'cc.shanruifeng.functions.math.UDFMathNaN'; +create temporary function from_base as 'cc.shanruifeng.functions.math.UDFMathFromBase'; +create temporary function to_base as 'cc.shanruifeng.functions.math.UDFMathToBase'; +create temporary function cosine_similarity as 'cc.shanruifeng.functions.math.UDFMathCosineSimilarity'; ``` 你可以在hive的命令杭中使用下面的语句来查看函数的细节.
@@ -302,3 +324,7 @@ select gcj_extract_wgs(39.915, 116.404) => {"lng":116.39775549316407,"lat":39.91 ``` select url_encode('http://shanruifeng.cc/') => http%3A%2F%2Fshanruifeng.cc%2F ``` + +``` +SELECT cosine_similarity(map_build(array['a'], array[1.0]), map_build(array['a'], array[2.0])); => 1.0 +``` \ No newline at end of file diff --git a/README.md b/README.md index 3cce4ac..f97026c 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,19 @@ You can also directly download file from [release page](https://github.com/aaron |url_encode(value) -> string | escapes value by encoding it so that it can be safely included in URL query parameter names and values| |url_decode(value) -> string | unescape the URL encoded value. This function is the inverse of `url_encode`. | +### 10. math functions + +| function| description | +|:--|:--| +|infinity() -> double | Returns the constant representing positive infinity.| +|is_finite(x) -> boolean | Determine if x is finite.| +|is_infinite(x) -> boolean |Determine if x is infinite.| +|is_nan(x) -> boolean | Determine if x is not-a-number.| +|nan() -> double | Returns the constant representing not-a-number. | +|from_base(string, radix) -> bigint | Returns the value of string interpreted as a base-radix number.| +|to_base(x, radix) -> varchar | Returns the base-radix representation of x.| +|cosine_similarity(x, y) -> double | Returns the cosine similarity between the sparse vectors x and y| + ## Use Put these statements into `${HOME}/.hiverc` or exec its on hive cli env. 
@@ -204,7 +217,14 @@ create temporary function wgs_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoWgsT create temporary function gcj_to_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjToWgs'; create temporary function gcj_extract_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjExtractWgs'; create temporary function url_encode as 'cc.shanruifeng.functions.url.UDFUrlEncode'; -create temporary function url_decode as 'cc.shanruifeng.functions.url.UDFUrlDecode'; +create temporary function infinity as 'cc.shanruifeng.functions.math.UDFMathInfinity'; +create temporary function is_finite as 'cc.shanruifeng.functions.math.UDFMathIsFinite'; +create temporary function is_infinite as 'cc.shanruifeng.functions.math.UDFMathIsInfinite'; +create temporary function is_nan as 'cc.shanruifeng.functions.math.UDFMathIsNaN'; +create temporary function nan as 'cc.shanruifeng.functions.math.UDFMathNaN'; +create temporary function from_base as 'cc.shanruifeng.functions.math.UDFMathFromBase'; +create temporary function to_base as 'cc.shanruifeng.functions.math.UDFMathToBase'; +create temporary function cosine_similarity as 'cc.shanruifeng.functions.math.UDFMathCosineSimilarity'; ``` You can use these statements on hive cli env get detail of function.
@@ -302,3 +322,7 @@ select gcj_extract_wgs(39.915, 116.404) => {"lng":116.39775549316407,"lat":39.91 ``` select url_encode('http://shanruifeng.cc/') => http%3A%2F%2Fshanruifeng.cc%2F ``` + +``` +SELECT cosine_similarity(map_build(array['a'], array[1.0]), map_build(array['a'], array[2.0])); => 1.0 +``` \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarity.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarity.java new file mode 100644 index 0000000..a4084a5 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarity.java @@ -0,0 +1,151 @@ +package cc.shanruifeng.functions.math; + +import org.apache.hadoop.hive.ql.exec.*; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import java.util.Map; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "cosine_similarity" + , value = "_FUNC_(map(varchar,double), map(varchar,double)) - cosine similarity between the given sparse vectors." 
+ , extended = "Example:\n > select _FUNC_(map(varchar,double), map(varchar,double)) from src;") +public class UDFMathCosineSimilarity extends GenericUDF { + private static final int ARG_COUNT = 2; // Number of arguments to this UDF + private transient MapObjectInspector leftMapOI; + private transient MapObjectInspector rightMapOI; + + public UDFMathCosineSimilarity() { + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function cosine_similarity(map, map) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of category LIST + for (int i = 0; i < 2; i++) { + if (!arguments[i].getCategory().equals(ObjectInspector.Category.MAP)) { + throw new UDFArgumentTypeException(i, + "\"" + serdeConstants.MAP_TYPE_NAME + "\" " + + "expected at function cosine_similarity, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + leftMapOI = (MapObjectInspector) arguments[0]; + rightMapOI = (MapObjectInspector) arguments[1]; + + ObjectInspector leftMapKeyOI = leftMapOI.getMapKeyObjectInspector(); + ObjectInspector leftMapValueOI = leftMapOI.getMapValueObjectInspector(); + ObjectInspector rightMapKeyOI = rightMapOI.getMapKeyObjectInspector(); + ObjectInspector rightMapValueOI = rightMapOI.getMapValueObjectInspector(); + + // Check if two map are of same key and value type + if (!ObjectInspectorUtils.compareTypes(leftMapKeyOI, rightMapKeyOI)) { + throw new UDFArgumentTypeException(1, + "\"" + leftMapKeyOI.getTypeName() + "\"" + + " expected at function cosine_similarity key, but " + + "\"" + rightMapKeyOI.getTypeName() + "\"" + + " is found"); + } + + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, leftMapKeyOI)) { + throw new UDFArgumentTypeException(1, + "\"" + 
PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\"" + + " expected at function cosine_similarity key, but " + + "\"" + leftMapKeyOI.getTypeName() + "\"" + + " is found"); + } + + if (!ObjectInspectorUtils.compareTypes(leftMapValueOI, rightMapValueOI)) { + throw new UDFArgumentTypeException(1, + "\"" + leftMapValueOI.getTypeName() + "\"" + + " expected at function cosine_similarity value, but " + + "\"" + rightMapValueOI.getTypeName() + "\"" + + " is found"); + } + + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector, leftMapValueOI)) { + throw new UDFArgumentTypeException(1, + "\"" + PrimitiveObjectInspectorFactory.javaDoubleObjectInspector.getTypeName() + "\"" + + " expected at function cosine_similarity value, but " + + "\"" + leftMapValueOI.getTypeName() + "\"" + + " is found"); + } + + return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; // evaluate() returns a DoubleWritable, not a map + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object leftMapObj = arguments[0].get(); + Object rightMapObj = arguments[1].get(); + + if (leftMapObj == null || rightMapObj == null) { + return null; + } + + Map leftMap = leftMapOI.getMap(leftMapObj); + Map rightMap = rightMapOI.getMap(rightMapObj); // each argument is read through its own inspector + + Double normLeftMap = mapL2Norm(leftMap); + Double normRightMap = mapL2Norm(rightMap); + + if (normLeftMap == null || normRightMap == null) { + return null; + } + + double dotProduct = mapDotProduct(leftMap, rightMap); + return new DoubleWritable(dotProduct / (normLeftMap * normRightMap)); + } + + private double mapDotProduct(Map leftMap, Map rightMap) { + double result = 0.0; + + for (Map.Entry entry : rightMap.entrySet()) { + if (leftMap.containsKey(entry.getKey())) { + Double leftValue = (Double) leftMap.get(entry.getKey()); + Double rightValue = (Double) entry.getValue(); + result += leftValue * rightValue; + } + } + + return result; + } + + private Double mapL2Norm(Map
map) { + double norm = 0.0; + for (Map.Entry entry : map.entrySet()) { + if (entry.getValue() == null) { + return null; + } + + Double value = (Double) entry.getValue(); + norm += value * value; + } + + return Math.sqrt(norm); + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "cosine_similarity(" + strings[0] + ", " + + strings[1] + ")"; + } +} diff --git a/src/test/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarityTest.java b/src/test/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarityTest.java new file mode 100644 index 0000000..188a643 --- /dev/null +++ b/src/test/java/cc/shanruifeng/functions/math/UDFMathCosineSimilarityTest.java @@ -0,0 +1,51 @@ +package cc.shanruifeng.functions.math; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Test; + +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.*; + +public class UDFMathCosineSimilarityTest { + + @Test + public void testCosineSimilarity() throws HiveException { + Double result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0), ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, 2 * 3 / (Math.sqrt(5) * Math.sqrt(10)), 0.0); + result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0, "c", -1.0), ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, (2 * 3 + (-1) * 1) / (Math.sqrt(1 + 4 + 1) * Math.sqrt(1 + 9)), 0.0); + result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0, "c", -1.0), ImmutableMap.of("d", 1.0, "e", 
3.0)); + assertEquals(result, 0.0, 0.0); + result = getResult(null, ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, null); + LinkedHashMap leftMap = Maps.newLinkedHashMap(); + leftMap.put("a", 1.0); + leftMap.put("b", null); + result = getResult(leftMap, ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, null); + } + + public Double getResult(Map leftMap, Map rightMap) throws HiveException { + UDFMathCosineSimilarity udf = new UDFMathCosineSimilarity(); + + ObjectInspector leftMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + ObjectInspector rightMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + ObjectInspector[] arguments = {leftMapOI, rightMapOI}; + udf.initialize(arguments); + + GenericUDF.DeferredObject leftMapObj = new GenericUDF.DeferredJavaObject(leftMap); + GenericUDF.DeferredObject rightMapObj = new GenericUDF.DeferredJavaObject(rightMap); + GenericUDF.DeferredObject[] args = {leftMapObj, rightMapObj}; + DoubleWritable output = (DoubleWritable) udf.evaluate(args); + return output == null ? null : output.get(); + } +} \ No newline at end of file From 7dd88c42c11154f55ff93438f37d3096c7373fd8 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Wed, 25 Jul 2018 17:42:20 +0800 Subject: [PATCH 03/16] add array filter function, revise readme doc. 
--- README-zh.md | 13 +- README.md | 11 +- pom.xml | 17 ++- .../functions/array/UDFArrayFilter.java | 113 ++++++++++++++++++ .../functions/array/UDFArrayValueCount.java | 2 +- .../functions/utils/LambdaUtils.java | 55 +++++++++ .../functions/array/UDFArrayFilterTest.java | 54 +++++++++ 7 files changed, 257 insertions(+), 8 deletions(-) create mode 100644 src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java create mode 100644 src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java create mode 100644 src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java diff --git a/README-zh.md b/README-zh.md index 6667c4c..046dffa 100644 --- a/README-zh.md +++ b/README-zh.md @@ -10,7 +10,8 @@ hive-third-functions 包含了一些很有用的hive udf函数,特别是数组和json函数. > 注意: -> hive-third-functions支持hive-0.11.0或更高版本. +> 1. hive-third-functions支持hive-0.11.0或更高版本. +> 2. 运行`3.0.0`及以上版本需要Java8及以上 ## 编译 @@ -40,7 +41,7 @@ mvn clean package -DskipTests 你也可以直接在发布页下载打包好了最新版本 [发布页](https://github.com/aaronshan/hive-third-functions/releases). -> 当前最新的版本是 `2.1.3` +> 当前最新的版本是 `3.0.0` ## 函数 @@ -71,6 +72,7 @@ mvn clean package -DskipTests |array_value_count(array<E>, E) -> long | 统计数组中包含给定元素的个数.| |array_slice(array, start, length) -> array | 对数组进行分片操作,start为正数从前开始分片, start为负数从后开始分片, 长度为指定的长度.| |array_element_at(array<E>, index) -> E | 返回指定位置的数组元素. 如果索引位置 < 0, 则从尾部开始计数并返回.| +|array_filter(array<E>, function) -> array<E> | 根据一个返回值为boolean类型的lambda表达式函数来对数组元素进行过滤.| ### 3.
map函数 | 函数| 描述 | @@ -180,6 +182,7 @@ create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArr create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; +create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; @@ -279,6 +282,10 @@ select array_concat(array(16,12,18,9,null), array(14,9,6,18,null)) => [16,12,18, select array_value_count(array(16,13,12,13,18,16,9,18), 13) => 2 select array_slice(array(16,13,12,13,18,16,9,18), -2, 3) => [9,18] select array_element_at(array(16,13,12,13,18,16,9,18), -1) => 18 +select array_filter(array(16,13), 'x -> x > 15') => [16] +select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] +select array_filter(array(true, false, NULL), 'x -> x != null && x') => [true] +select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] ``` ``` @@ -326,5 +333,5 @@ select url_encode('http://shanruifeng.cc/') => http%3A%2F%2Fshanruifeng.cc%2F ``` ``` -SELECT cosine_similarity(map_build(array['a'], array[1.0]), map_build(array['a'], array[2.0])); => 1.0 +select cosine_similarity(map_build(array('a'), array(1.0)), map_build(array('a'), array(2.0))); => 1.0 ``` \ No newline at end of file diff --git a/README.md b/README.md index f97026c..a481412 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,8 @@ Some useful custom hive udf functions, especial array and json functions.
> Note: -> hive-third-functions support hive-0.11.0 or higher. +> 1. hive-third-functions supports hive-0.11.0 or higher. +> 2. hive-third-functions `3.0.0` or higher requires Java 8 or higher. ## Build @@ -71,6 +72,7 @@ You can also directly download file from [release page](https://github.com/aaron |array_value_count(array<E>, E) -> long | count array's element number that element value equals given value.| |array_slice(array, start, length) -> array | subsets array starting from index start (or starting from the end if start is negative) with a length of length.| |array_element_at(array<E>, index) -> E | returns element of array at given index. If index < 0, element_at accesses elements from the last to the first.| +|array_filter(array<E>, function) -> array<E> | constructs an array from those elements of array for which function returns true.| ### 3. map functions | function| description | @@ -179,6 +181,7 @@ create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArr create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; +create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; @@ -277,6 +280,10 @@ select array_concat(array(16,12,18,9,null), array(14,9,6,18,null)) => [16,12,18, select array_value_count(array(16,13,12,13,18,16,9,18), 13) => 2 select array_slice(array(16,13,12,13,18,16,9,18), -2, 3) => [9,18] select array_element_at(array(16,13,12,13,18,16,9,18), -1) => 18 +select array_filter(array(16,13), 'x -> x > 15') => [16]
+select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] +select array_filter(array(true, false, NULL), 'x -> x != null && x') => [true] +select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] ``` ``` @@ -324,5 +331,5 @@ select url_encode('http://shanruifeng.cc/') => http%3A%2F%2Fshanruifeng.cc%2F ``` ``` -SELECT cosine_similarity(map_build(array['a'], array[1.0]), map_build(array['a'], array[2.0])); => 1.0 +select cosine_similarity(map_build(array['a'], array[1.0]), map_build(array['a'], array[2.0])); => 1.0 ``` \ No newline at end of file diff --git a/pom.xml b/pom.xml index 546c029..97eba4d 100644 --- a/pom.xml +++ b/pom.xml @@ -24,6 +24,7 @@ 2.4.4 1.9.3 4.12 + 1.6 @@ -87,6 +88,13 @@ junit ${junit.version} + + + pl.joegreen + lambda-from-string + ${lambda.from.string.version} + + @@ -138,6 +146,11 @@ jackson-databind + + pl.joegreen + lambda-from-string + + junit junit @@ -176,8 +189,8 @@ maven-compiler-plugin 3.1 - ${project.build.targetJdk} - ${project.build.targetJdk} + 8 + 8 ${project.build.sourceEncoding} true diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java b/src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java new file mode 100644 index 0000000..5b23295 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java @@ -0,0 +1,113 @@ +package cc.shanruifeng.functions.array; + +import cc.shanruifeng.functions.utils.LambdaUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import pl.joegreen.lambdaFromString.LambdaCreationException; +import pl.joegreen.lambdaFromString.LambdaFactory; + +import java.util.ArrayList; +import java.util.function.Function; + +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.compareTypes; + +/** + * @author ruifeng.shan + * @date 2018-07-25 下午1:37 + */ +@Description(name = "array_filter" + , value = "_FUNC_(array, function) - constructs an array from those elements of array for which function returns true." + , extended = "Example:\n > select _FUNC_(array, x -> true) from src;") +public class UDFArrayFilter extends GenericUDF { + + private static final int ARRAY_IDX = 0; + private static final int LAMBDA_FUNCTION_IDX = 1; + private static final int ARG_COUNT = 2; // Number of arguments to this UDF + private transient ObjectInspector lambdaStringOI; + private transient ListObjectInspector arrayOI; + private transient ObjectInspector arrayElementOI; + private transient ArrayList result = new ArrayList(); + private static final LambdaFactory LAMBDA_FACTORY = LambdaFactory.get(); + private Function applyFunction; + + public UDFArrayFilter() { + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function array_filter(array, function) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if ARRAY_IDX argument is of category LIST + if (!arguments[ARRAY_IDX].getCategory().equals(ObjectInspector.Category.LIST)) { + throw new UDFArgumentTypeException(ARRAY_IDX, + "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " + + "expected at 
function array_filter, but " + + "\"" + arguments[ARRAY_IDX].getTypeName() + "\" " + + "is found"); + } + + arrayOI = (ListObjectInspector) arguments[ARRAY_IDX]; + arrayElementOI = arrayOI.getListElementObjectInspector(); + + lambdaStringOI = arguments[LAMBDA_FUNCTION_IDX]; + + // Check that the lambda argument is a string + if (!compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, lambdaStringOI)) { + throw new UDFArgumentTypeException(LAMBDA_FUNCTION_IDX, + "\"" + org.apache.hadoop.hive.serde.serdeConstants.STRING_TYPE_NAME + "\"" + + " expected at function array_filter, but " + + "\"" + lambdaStringOI.getTypeName() + "\"" + + " is found"); + } + + return ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object array = arguments[ARRAY_IDX].get(); + String lambdaString = (String) arguments[LAMBDA_FUNCTION_IDX].get(); + + if (applyFunction == null) { + try { + applyFunction = LambdaUtils.getFilterFunction(LAMBDA_FACTORY, lambdaString, arrayElementOI); + } catch (LambdaCreationException e) { + throw new UDFArgumentTypeException(LAMBDA_FUNCTION_IDX, e.getMessage()); + } + } + + int arrayLength = arrayOI.getListLength(array); + // Return the input unchanged when the array is null or empty + if (array == null || arrayLength <= 0) { + return array; + } + result.clear(); // result is reused across rows; drop the previous row's matches + for (int i = 0; i < arrayLength; ++i) { + Object listElement = arrayOI.getListElement(array, i); + if ((boolean)applyFunction.apply(listElement)) { + result.add(listElement); + } + } + + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "array_filter(" + strings[ARRAY_IDX] + ", " + + strings[LAMBDA_FUNCTION_IDX] + ")"; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java b/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java index 4984e15..8674576 100644 ---
a/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java +++ b/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java @@ -67,7 +67,7 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen // Check if the comparison is supported for this type if (!ObjectInspectorUtils.compareSupported(valueOI)) { - throw new UDFArgumentException("The function array_contains" + throw new UDFArgumentException("The function array_value_count" + " does not support comparison for " + "\"" + valueOI.getTypeName() + "\"" + " types"); diff --git a/src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java b/src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java new file mode 100644 index 0000000..40ec104 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java @@ -0,0 +1,55 @@ +package cc.shanruifeng.functions.utils; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import pl.joegreen.lambdaFromString.LambdaCreationException; +import pl.joegreen.lambdaFromString.LambdaFactory; +import pl.joegreen.lambdaFromString.TypeReference; + +import java.sql.Timestamp; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.compareTypes; + +/** + * @author aaron02 + * @date 2018-07-25 下午4:29 + */ +public class LambdaUtils { + public static Function getFilterFunction(LambdaFactory lambdaFactory, String lambdaString, ObjectInspector oi) throws LambdaCreationException { + if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaBooleanObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaByteObjectInspector)) { + return 
lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaShortObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaIntObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaLongObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaFloatObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaStringObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaVoidObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaTimestampObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>() {}); + }else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } else if (oi.getCategory().equals(ObjectInspector.Category.LIST)) { + return lambdaFactory.createLambda(lambdaString, new TypeReference, Boolean>>(){}); + } else if (oi.getCategory().equals(ObjectInspector.Category.MAP)) { + return 
lambdaFactory.createLambda(lambdaString, new TypeReference, Boolean>>(){}); + } else { + return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); + } + } +} diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java b/src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java new file mode 100644 index 0000000..0c0eda7 --- /dev/null +++ b/src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java @@ -0,0 +1,54 @@ +package cc.shanruifeng.functions.array; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import static java.util.Arrays.asList; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.*; + +public class UDFArrayFilterTest { + @Test + public void test() throws HiveException { + UDFArrayFilter udf = new UDFArrayFilter(); + + ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector[] arguments = {arrayOI, valueOI}; + + udf.initialize(arguments); + List> array = ImmutableList.of(asList("abc", null, "123"), ImmutableList.of("a", "b")); + GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); + GenericUDF.DeferredObject valueObj = new GenericUDF.DeferredJavaObject("s -> s.get(1) == null"); + GenericUDF.DeferredObject[] args = {arrayObj, valueObj}; + ArrayList output = (ArrayList) 
udf.evaluate(args); + + assertTrue(Iterables.elementsEqual(ImmutableList.of(asList("abc", null, "123")), output)); + } + + @Test + public void test1() throws HiveException { + UDFArrayFilter udf = new UDFArrayFilter(); + + ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector); + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector[] arguments = {arrayOI, valueOI}; + + udf.initialize(arguments); + List array = asList(true, false, null); // asList(null) would pass a null varargs array and throw NPE + GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); + GenericUDF.DeferredObject valueObj = new GenericUDF.DeferredJavaObject("x -> x != null && x"); + GenericUDF.DeferredObject[] args = {arrayObj, valueObj}; + ArrayList output = (ArrayList) udf.evaluate(args); + + assertTrue(Iterables.elementsEqual(asList(true), output)); + } +} \ No newline at end of file From 084c501de990af026132304cd01f22d90641f4be Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 27 Jul 2018 00:50:44 +0800 Subject: [PATCH 04/16] add some useful math functions and string functions --- README.md | 2 +- pom.xml | 22 ++++ .../math/UDFMathInverseNormalCdf.java | 32 +++++ .../functions/math/UDFMathNormalCdf.java | 30 +++++ .../functions/string/UDFCodePoint.java | 47 ++++++++ .../string/UDFStringHammingDistance.java | 66 ++++++++++ .../string/UDFStringLevenshteinDistance.java | 113 ++++++++++++++++++ .../functions/string/UDFStringSplitToMap.java | 83 +++++++++++++ .../string/UDFStringSplitToMultimap.java | 99 +++++++++++++++ 9 files changed, 493 insertions(+), 1 deletion(-) create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathInverseNormalCdf.java create mode 100644 src/main/java/cc/shanruifeng/functions/math/UDFMathNormalCdf.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFCodePoint.java create mode 100644
src/main/java/cc/shanruifeng/functions/string/UDFStringHammingDistance.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMap.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMultimap.java diff --git a/README.md b/README.md index a481412..9c7b6fd 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ It will generate hive-third-functions-${version}-shaded.jar in target directory. You can also directly download file from [release page](https://github.com/aaronshan/hive-third-functions/releases). -> current latest version is `2.1.3` +> current latest version is `3.0.0` ## Functions diff --git a/pom.xml b/pom.xml index 97eba4d..0e3d3e3 100644 --- a/pom.xml +++ b/pom.xml @@ -71,6 +71,12 @@ ${dep.airlift.version} + + io.airlift + slice + 0.35 + + com.fasterxml.jackson.core jackson-core @@ -95,6 +101,12 @@ ${lambda.from.string.version} + + org.apache.commons + commons-math3 + 3.6.1 + + @@ -136,6 +148,11 @@ json + + io.airlift + slice + + com.fasterxml.jackson.core jackson-core @@ -151,6 +168,11 @@ lambda-from-string + + org.apache.commons + commons-math3 + + junit junit diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathInverseNormalCdf.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathInverseNormalCdf.java new file mode 100644 index 0000000..036ccbe --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathInverseNormalCdf.java @@ -0,0 +1,32 @@ +package cc.shanruifeng.functions.math; + +import org.apache.commons.math3.special.Erf; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.DoubleWritable; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition; + +/** + * @author ruifeng.shan + * @date 2018-07-26 + * @time 23:04 + 
*/ +@Description(name = "inverse_normal_cdf" + , value = "_FUNC_(mean, sd, p) - inverse of normal cdf given a mean, std, and probability." + , extended = "Example:\n > select _FUNC_(mean, sd, p) from src;") +public class UDFMathInverseNormalCdf extends UDF { + private DoubleWritable result = new DoubleWritable(); + + public UDFMathInverseNormalCdf() { + } + + public DoubleWritable evaluate(double mean, double sd, double p) throws HiveException { + checkCondition(p > 0 && p < 1, "p must be 0 < p < 1"); + checkCondition(sd > 0, "sd must > 0"); + + result.set(mean + sd * 1.4142135623730951 * Erf.erfInv(2 * p - 1)); // 1.4142135623730951 == Math.sqrt(2) + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/math/UDFMathNormalCdf.java b/src/main/java/cc/shanruifeng/functions/math/UDFMathNormalCdf.java new file mode 100644 index 0000000..b666b5a --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/math/UDFMathNormalCdf.java @@ -0,0 +1,30 @@ +package cc.shanruifeng.functions.math; + +import org.apache.commons.math3.special.Erf; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.DoubleWritable; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition; + +/** + * @author ruifeng.shan + * @date 2018-07-26 + * @time 23:03 + */ +@Description(name = "normal_cdf" + , value = "_FUNC_(mean, sd, v) - normal cdf given a mean, standard deviation, and value."
+ , extended = "Example:\n > select _FUNC_(mean, sd, v) from src;") +public class UDFMathNormalCdf extends UDF { + private DoubleWritable result = new DoubleWritable(); + + public UDFMathNormalCdf() { + } + + public DoubleWritable evaluate(double mean, double standardDeviation, double value) throws HiveException { + checkCondition(standardDeviation > 0, "standardDeviation must > 0"); + result.set(0.5 * (1 + Erf.erf((value - mean) / (standardDeviation * Math.sqrt(2))))); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFCodePoint.java b/src/main/java/cc/shanruifeng/functions/string/UDFCodePoint.java new file mode 100644 index 0000000..d6a95bd --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFCodePoint.java @@ -0,0 +1,47 @@ +package cc.shanruifeng.functions.string; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static io.airlift.slice.SliceUtf8.getCodePointAt; +import static io.airlift.slice.SliceUtf8.countCodePoints; + +/** + * @author ruifeng.shan + * @date 2018-07-26 + * @time 23:23 + */ +@Description(name = "codepoint" + , value = "_FUNC_(string) - returns Unicode code point of a single character string." + , extended = "Example:\n > select _FUNC_(string) from src;") +public class UDFCodePoint extends UDF { + private LongWritable result = new LongWritable(); + + public UDFCodePoint() { + } + + /** + * codepoint. 
+ * + * @param text + * @return + */ + public LongWritable evaluate(Text text) throws HiveException { + if (text == null) { + return null; + } + + Slice slice = Slices.utf8Slice(text.toString()); + checkCondition(countCodePoints(slice) == 1, "Input string must be a single character string"); + + result.set(getCodePointAt(slice, 0)); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringHammingDistance.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringHammingDistance.java new file mode 100644 index 0000000..e465922 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringHammingDistance.java @@ -0,0 +1,66 @@ +package cc.shanruifeng.functions.string; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static io.airlift.slice.SliceUtf8.lengthOfCodePoint; +import static io.airlift.slice.SliceUtf8.tryGetCodePointAt; + +/** + * @author ruifeng.shan + * @date 2018-07-26 + * @time 23:43 + */ +@Description(name = "hamming_distance" + , value = "_FUNC_(string, string) - computes Hamming distance between two strings." + , extended = "Example:\n > select _FUNC_(string, string) from src;") +public class UDFStringHammingDistance extends UDF { + private LongWritable result = new LongWritable(0); + + public UDFStringHammingDistance() { + } + + /** + * hamming distance. 
+ * + * @param leftText + * @param rightText + * @return + */ + public LongWritable evaluate(Text leftText, Text rightText) throws HiveException { + if (leftText == null || rightText == null) { + return null; // NULL in, NULL out; never hand back the previous row's distance + } + + Slice left = Slices.utf8Slice(leftText.toString()); + Slice right = Slices.utf8Slice(rightText.toString()); + int distance = 0; + int leftPosition = 0; + int rightPosition = 0; + while (leftPosition < left.length() && rightPosition < right.length()) { + int codePointLeft = tryGetCodePointAt(left, leftPosition); + int codePointRight = tryGetCodePointAt(right, rightPosition); + + // if both code points are invalid, we do not care if they are equal + // the following code treats them as equal if they happen to be of the same length + if (codePointLeft != codePointRight) { + distance++; + } + + leftPosition += codePointLeft >= 0 ? lengthOfCodePoint(codePointLeft) : -codePointLeft; // code point 0 is valid; only negative values mark an invalid sequence + rightPosition += codePointRight >= 0 ? lengthOfCodePoint(codePointRight) : -codePointRight; + } + + checkCondition(leftPosition == left.length() && rightPosition == right.length(), + "The input strings to hamming_distance function must have the same length"); + result.set(distance); + + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java new file mode 100644 index 0000000..a37060f --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java @@ -0,0 +1,113 @@ +package cc.shanruifeng.functions.string; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition;
+import static io.airlift.slice.SliceUtf8.getCodePointAt; +import static io.airlift.slice.SliceUtf8.lengthOfCodePoint; +import static io.airlift.slice.SliceUtf8.tryGetCodePointAt; + +/** + * @author ruifeng.shan + * @date 2018-07-26 + * @time 23:53 + */ +@Description(name = "levenshtein_distance" + , value = "_FUNC_(string, string) - computes Levenshtein distance between two strings." + , extended = "Example:\n > select _FUNC_(string, string) from src;") +public class UDFStringLevenshteinDistance extends UDF { + private LongWritable result = new LongWritable(0); + + public UDFStringLevenshteinDistance() { + } + + /** + * levenshtein distance. + * + * @param leftText + * @param rightText + * @return + */ + public LongWritable evaluate(Text leftText, Text rightText) throws HiveException { + if (leftText == null || rightText == null) { + return null; + } + + Slice left = Slices.utf8Slice(leftText.toString()); + Slice right = Slices.utf8Slice(rightText.toString()); + int[] leftCodePoints = castToCodePoints(left); + int[] rightCodePoints = castToCodePoints(right); + + if (leftCodePoints.length < rightCodePoints.length) { + int[] tempCodePoints = leftCodePoints; + leftCodePoints = rightCodePoints; + rightCodePoints = tempCodePoints; + } + + if (rightCodePoints.length == 0) { + result.set(leftCodePoints.length); + return result; + } + + checkCondition(((long) leftCodePoints.length * (rightCodePoints.length - 1)) <= 1_000_000, // widen to long so the product cannot overflow int and slip past the limit + "The combined inputs for Levenshtein distance are too large"); + + int[] distances = new int[rightCodePoints.length]; + for (int i = 0; i < rightCodePoints.length; i++) { + distances[i] = i + 1; + } + + for (int i = 0; i < leftCodePoints.length; i++) { + int leftUpDistance = distances[0]; + if (leftCodePoints[i] == rightCodePoints[0]) { + distances[0] = i; + } + else { + distances[0] = Math.min(i, distances[0]) + 1; + } + for (int j = 1; j < rightCodePoints.length; j++) { + int leftUpDistanceNext = distances[j]; + if (leftCodePoints[i] == rightCodePoints[j])
{ + distances[j] = leftUpDistance; + } + else { + distances[j] = Math.min(distances[j - 1], Math.min(leftUpDistance, distances[j])) + 1; + } + leftUpDistance = leftUpDistanceNext; + } + } + + result.set(distances[rightCodePoints.length - 1]); + + return result; + } + + private static int[] castToCodePoints(Slice slice) throws HiveException { + int[] codePoints = new int[safeCountCodePoints(slice)]; + int position = 0; + for (int index = 0; index < codePoints.length; index++) { + codePoints[index] = getCodePointAt(slice, position); + position += lengthOfCodePoint(slice, position); + } + return codePoints; + } + + private static int safeCountCodePoints(Slice slice) throws HiveException { + int codePoints = 0; + for (int position = 0; position < slice.length(); ) { + int codePoint = tryGetCodePointAt(slice, position); + if (codePoint < 0) { + throw new HiveException("Invalid UTF-8 encoding in characters: " + slice.toStringUtf8()); + } + position += lengthOfCodePoint(codePoint); + codePoints++; + } + return codePoints; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMap.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMap.java new file mode 100644 index 0000000..63bb45f --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMap.java @@ -0,0 +1,83 @@ +package cc.shanruifeng.functions.string; + +import com.google.common.base.Splitter; +import java.util.HashMap; +import java.util.Map; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 00:04 + */ +@Description(name = "split_to_map" + , value = "_FUNC_(string, string, string) - returns a map created using the given key/value arrays." + , extended = "Example:\n > select _FUNC_('a=123,b=.4,c=,=d', ',', '=') from src;") +public class UDFStringSplitToMap extends GenericUDF { + private static final int ARG_COUNT = 3; // Number of arguments to this UDF + HashMap result = new HashMap(); + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function split_to_map(string, string, string) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of string + for (int i = 0; i < 3; i++) { + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { + throw new UDFArgumentTypeException(i, + "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " + + "expected at function split_to_map, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + ObjectInspector mapKeyOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector mapValueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + + return ObjectInspectorFactory.getStandardMapObjectInspector(mapKeyOI, mapValueOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + String string = (String) arguments[0].get(); + String entryDelimiter = (String) 
arguments[1].get(); + String keyValueDelimiter = (String) arguments[2].get(); + + checkCondition(entryDelimiter.length() > 0, "entryDelimiter is empty"); + checkCondition(keyValueDelimiter.length() > 0, "keyValueDelimiter is empty"); + checkCondition(!entryDelimiter.equals(keyValueDelimiter), "entryDelimiter and keyValueDelimiter must not be the same"); + + if (string == null) { + return null; + } + + result.clear(); + Map map = Splitter.on(entryDelimiter).withKeyValueSeparator(keyValueDelimiter).split(string); + result.putAll(map); + + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "split_to_map(" + strings[0] + ", " + + strings[1] + ", " + strings[2] + ")"; + } +} diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMultimap.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMultimap.java new file mode 100644 index 0000000..036dd73 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringSplitToMultimap.java @@ -0,0 +1,99 @@ +package cc.shanruifeng.functions.string; + +import com.google.common.base.Splitter; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Lists; +import com.google.common.collect.Multimap; +import java.util.HashMap; +import java.util.List; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import static cc.shanruifeng.functions.utils.Failures.checkCondition; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 00:04 + */ +@Description(name = "split_to_multimap" + , value = "_FUNC_(string, string, string) - creates a multimap by splitting a string into key/value pairs." + , extended = "Example:\n > select _FUNC_('a=123,b=.4,c=,=d', ',', '=') from src;") +public class UDFStringSplitToMultimap extends GenericUDF { + private static final int ARG_COUNT = 3; // Number of arguments to this UDF + HashMap> result = new HashMap>(); + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function split_to_multimap(string, string, string) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of string + for (int i = 0; i < 3; i++) { + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { + throw new UDFArgumentTypeException(i, + "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " + + "expected at function split_to_multimap, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + ObjectInspector mapKeyOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector mapValueOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); + + return ObjectInspectorFactory.getStandardMapObjectInspector(mapKeyOI, mapValueOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + String string = (String) arguments[0].get(); + String entryDelimiter = (String) arguments[1].get(); + String keyValueDelimiter = (String) arguments[2].get(); 
+ + checkCondition(entryDelimiter.length() > 0, "entryDelimiter is empty"); + checkCondition(keyValueDelimiter.length() > 0, "keyValueDelimiter is empty"); + checkCondition(!entryDelimiter.equals(keyValueDelimiter), "entryDelimiter and keyValueDelimiter must not be the same"); + + if (string == null) { + return null; + } + + Multimap multimap = ArrayListMultimap.create(); + + result.clear(); + List list = Splitter.on(entryDelimiter).splitToList(string); + for (String str : list) { + List<String> fields = Splitter.on(keyValueDelimiter).splitToList(str); // literal split: String.split treats the delimiter as a regex and drops a trailing empty value such as "c=" + if (fields.size() != 2) { + throw new HiveException("Key-value delimiter must appear exactly once in each entry. Bad input: " + string); + } + multimap.put(fields.get(0), fields.get(1)); + + } + + for (String key : multimap.keySet()) { + result.put(key, Lists.newArrayList(multimap.get(key))); + } + + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "split_to_multimap(" + strings[0] + ", " + + strings[1] + ", " + strings[2] + ")"; + } +} \ No newline at end of file From 9768ea4057032086274fd8f2015f8401a2aef092 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 27 Jul 2018 15:35:58 +0800 Subject: [PATCH 05/16] add some string functions. fix sha256 udf error.
--- pom.xml | 17 ++++ .../functions/string/UDFSha256.java | 3 +- .../functions/string/UDFStringFromUTF8.java | 91 +++++++++++++++++++ .../functions/string/UDFStringNormalize.java | 39 ++++++++ .../functions/string/UDFStringPosition.java | 40 ++++++++ .../functions/string/UDFStringToUTF8.java | 30 ++++++ .../functions/string/UDFWordStem.java | 89 ++++++++++++++++++ 7 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFStringFromUTF8.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFStringNormalize.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFStringPosition.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFStringToUTF8.java create mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java diff --git a/pom.xml b/pom.xml index 0e3d3e3..638450d 100644 --- a/pom.xml +++ b/pom.xml @@ -107,6 +107,18 @@ 3.6.1 + + org.apache.lucene + lucene-analyzers-common + 7.2.1 + + + org.apache.lucene + lucene-core + + + + @@ -173,6 +185,11 @@ commons-math3 + + org.apache.lucene + lucene-analyzers-common + + junit junit diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java b/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java index df4d05d..63ac3ea 100644 --- a/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java +++ b/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java @@ -2,6 +2,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; /** @@ -12,7 +13,7 @@ @Description(name = "sha256" , value = "_FUNC_(string) - get sha256 hash code by given input string." 
, extended = "Example:\n > select _FUNC_(string) from src;") -public class UDFSha256 { +public class UDFSha256 extends UDF { private Text result = new Text(); public UDFSha256() { diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringFromUTF8.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringFromUTF8.java new file mode 100644 index 0000000..1dd6a86 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringFromUTF8.java @@ -0,0 +1,91 @@ +package cc.shanruifeng.functions.string; + +import io.airlift.slice.InvalidUtf8Exception; +import io.airlift.slice.SliceUtf8; +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import java.util.OptionalInt; + +import static java.lang.Character.MAX_CODE_POINT; +import static java.lang.Character.SURROGATE; + +/** + * @author ruifeng.shan + * @date 2018-07-27 下午2:02 + */ +@Description(name = "from_utf8" + , value = "_FUNC_(string) - decodes the UTF-8 encoded string\n" + + "_FUNC_(string, string) - decodes the UTF-8 encoded string\n" + + "_FUNC_(string, long) - decodes the UTF-8 encoded string." 
+ , extended = "Example:\n > select _FUNC_(string) from src;") +public class UDFStringFromUTF8 extends UDF { + private Text result = new Text(); + + public UDFStringFromUTF8() { + } + + public Text evaluate(Text text) { + if (text == null) { + return null; + } + + result.set(SliceUtf8.fixInvalidUtf8(Slices.utf8Slice(text.toString())).toString()); + return result; + } + + public Text evaluate(Text text, Text replacementCharacter) throws HiveException { + if (text == null) { + return null; + } + + if (replacementCharacter == null) { + throw new HiveException("Replacement character string must empty or a single character"); + } + + int count = replacementCharacter.getLength(); + if (count > 1) { + throw new HiveException("Replacement character string must empty or a single character"); + } + + OptionalInt replacementCodePoint; + if (count == 1) { + try { + replacementCodePoint = OptionalInt.of(replacementCharacter.charAt(0)); + } + catch (InvalidUtf8Exception e) { + throw new HiveException("Invalid replacement character"); + } + } + else { + replacementCodePoint = OptionalInt.empty(); + } + + result.set(SliceUtf8.fixInvalidUtf8(Slices.utf8Slice(text.toString()), replacementCodePoint).toString()); + return result; + } + + public Text evaluate(Text text, LongWritable replacementCodePoint) throws HiveException { + if (text == null) { + return null; + } + + if (replacementCodePoint == null) { + throw new HiveException("replacement long value cannot be null!"); + } + + long codePoint = replacementCodePoint.get(); + + if (replacementCodePoint.get() > MAX_CODE_POINT || Character.getType((int) codePoint) == SURROGATE) { + throw new HiveException("Invalid replacement character"); + } + + result.set(SliceUtf8.fixInvalidUtf8(Slices.utf8Slice(text.toString()), OptionalInt.of((int) codePoint)).toString()); + return result; + } + +} diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringNormalize.java 
b/src/main/java/cc/shanruifeng/functions/string/UDFStringNormalize.java new file mode 100644 index 0000000..4b15d8f --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringNormalize.java @@ -0,0 +1,39 @@ +package cc.shanruifeng.functions.string; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; + +import java.text.Normalizer; + +/** + * @author ruifeng.shan + * @date 2018-07-27 下午12:37 + */ +@Description(name = "normalize" + , value = "_FUNC_(string, string) - transforms the string to normalized form." + , extended = "Example:\n > select _FUNC_(string, form_str) from src;") +public class UDFStringNormalize extends UDF { + private Text result = new Text(); + + public UDFStringNormalize() { + } + + public Text evaluate(Text text, Text form) throws HiveException { + if (text == null) { + return null; + } + + Normalizer.Form targetForm; + try { + targetForm = Normalizer.Form.valueOf(form.toString()); + } + catch (IllegalArgumentException e) { + throw new HiveException("Normalization form must be one of [NFD, NFC, NFKD, NFKC]"); + } + + result.set(Normalizer.normalize(text.toString(), targetForm)); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringPosition.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringPosition.java new file mode 100644 index 0000000..38def10 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringPosition.java @@ -0,0 +1,40 @@ +package cc.shanruifeng.functions.string; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +/** + * @author ruifeng.shan + * @date 2018-07-27 下午12:08 + */ +@Description(name = "strpos" + , 
value = "_FUNC_(string, substring) - returns index of first occurrence of a substring (or 0 if not found)." + , extended = "Example:\n > select _FUNC_(string, substring) from src;") +public class UDFStringPosition extends UDF { + private LongWritable result = new LongWritable(0); + + public UDFStringPosition() { + } + + public LongWritable evaluate(Text text, Text subText) { + if (text == null || subText == null) { + return result; + } + + if (subText.getLength() == 1) { + result.set(1); + return result; + } + + int index = text.toString().indexOf(subText.toString()); + if (index < 0) { + return result; + } + + result.set(index + 1); + return result; + } +} diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringToUTF8.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringToUTF8.java new file mode 100644 index 0000000..070b13c --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringToUTF8.java @@ -0,0 +1,30 @@ +package cc.shanruifeng.functions.string; + +import io.airlift.slice.SliceUtf8; +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.Text; + +/** + * @author aaron02 + * @date 2018-07-27 下午2:02 + */ +@Description(name = "to_utf8" + , value = "_FUNC_(string) - encodes the string to UTF-8." 
+ , extended = "Example:\n > select _FUNC_(string) from src;") +public class UDFStringToUTF8 extends UDF { + private Text result = new Text(); + + public UDFStringToUTF8() { + } + + public Text evaluate(Text text) { + if (text == null) { + return null; + } + + result.set(Slices.utf8Slice(text.toString()).toString()); + return result; + } +} diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java b/src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java new file mode 100644 index 0000000..7c58703 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java @@ -0,0 +1,89 @@ +package cc.shanruifeng.functions.string; + +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; +import org.tartarus.snowball.SnowballProgram; +import org.tartarus.snowball.ext.*; + +import java.util.Map; +import java.util.function.Supplier; + +import static io.airlift.slice.Slices.utf8Slice; + +/** + * @author ruifeng.shan + * @date 2018-07-27 下午12:22 + */ +@Description(name = "word_stem" + , value = "_FUNC_(word) - returns the stem of a word in the English language\n" + + "_FUNC_(word, language) - returns the stem of a word in the given language." 
+ , extended = "Example:\n > select _FUNC_(string, language_str) from src;") +public class UDFWordStem extends UDF { + private Text result = new Text(); + + private static final Map> STEMMERS = ImmutableMap.>builder() + .put(utf8Slice("ca"), () -> new CatalanStemmer()) + .put(utf8Slice("da"), DanishStemmer::new) + .put(utf8Slice("de"), German2Stemmer::new) + .put(utf8Slice("en"), EnglishStemmer::new) + .put(utf8Slice("es"), SpanishStemmer::new) + .put(utf8Slice("eu"), BasqueStemmer::new) + .put(utf8Slice("fi"), FinnishStemmer::new) + .put(utf8Slice("fr"), FrenchStemmer::new) + .put(utf8Slice("hu"), HungarianStemmer::new) + .put(utf8Slice("hy"), ArmenianStemmer::new) + .put(utf8Slice("ir"), IrishStemmer::new) + .put(utf8Slice("it"), ItalianStemmer::new) + .put(utf8Slice("lt"), LithuanianStemmer::new) + .put(utf8Slice("nl"), DutchStemmer::new) + .put(utf8Slice("no"), NorwegianStemmer::new) + .put(utf8Slice("pt"), PortugueseStemmer::new) + .put(utf8Slice("ro"), RomanianStemmer::new) + .put(utf8Slice("ru"), RussianStemmer::new) + .put(utf8Slice("sv"), SwedishStemmer::new) + .put(utf8Slice("tr"), TurkishStemmer::new) + .build(); + + + public UDFWordStem() { + } + + public Text evaluate(Text text) { + if (text == null) { + return null; + } + + String string = wordStem(Slices.utf8Slice(text.toString()), new EnglishStemmer()).toString(); + result.set(string); + + return result; + } + + public Text evaluate(Text text, Text language) throws HiveException { + if (text == null) { + return null; + } + + Supplier stemmer = STEMMERS.get(language.toString()); + if (stemmer == null) { + throw new HiveException("Unknown stemmer language: " + language.toString()); + } + + String string = wordStem(Slices.utf8Slice(text.toString()), stemmer.get()).toString(); + result.set(string); + + return result; + } + + private static Slice wordStem(Slice slice, SnowballProgram stemmer) + { + stemmer.setCurrent(slice.toStringUtf8()); + return stemmer.stem() ? 
utf8Slice(stemmer.getCurrent()) : slice; + } +} \ No newline at end of file From e942b0815db0cce19e4fc116b584f134fb2f3c3c Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 27 Jul 2018 23:45:52 +0800 Subject: [PATCH 06/16] add some useful regexp functions --- pom.xml | 11 ++ .../functions/regexp/Re2JRegexp.java | 139 ++++++++++++++++++ .../regexp/UDFRe2JRegexpExtract.java | 42 ++++++ .../regexp/UDFRe2JRegexpExtractAll.java | 100 +++++++++++++ .../functions/regexp/UDFRe2JRegexpLike.java | 35 +++++ .../regexp/UDFRe2JRegexpReplace.java | 44 ++++++ .../functions/regexp/UDFRe2JRegexpSplit.java | 81 ++++++++++ 7 files changed, 452 insertions(+) create mode 100644 src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java create mode 100644 src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtract.java create mode 100644 src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtractAll.java create mode 100644 src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpLike.java create mode 100644 src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpReplace.java create mode 100644 src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpSplit.java diff --git a/pom.xml b/pom.xml index 638450d..a38cfd4 100644 --- a/pom.xml +++ b/pom.xml @@ -107,6 +107,12 @@ 3.6.1 + + com.teradata + re2j-td + 1.4 + + org.apache.lucene lucene-analyzers-common @@ -185,6 +191,11 @@ commons-math3 + + com.teradata + re2j-td + + org.apache.lucene lucene-analyzers-common diff --git a/src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java b/src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java new file mode 100644 index 0000000..3466e7b --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java @@ -0,0 +1,139 @@ +package cc.shanruifeng.functions.regexp; + +import com.google.common.collect.Lists; +import com.google.re2j.Matcher; +import com.google.re2j.Options; +import com.google.re2j.Pattern; +import io.airlift.slice.Slice; +import 
java.util.List; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static com.google.common.base.Preconditions.checkState; +import static com.google.re2j.Options.Algorithm.DFA_FALLBACK_TO_NFA; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 22:20 + */ +public final class Re2JRegexp { + private static final Logger log = LoggerFactory.getLogger(Re2JRegexp.class); + + private static final java.util.regex.Pattern DOT_STAR_PREFIX_PATTERN = java.util.regex.Pattern.compile("(?s)^(\\.\\*\\??)?(.*)"); + private static final int CORE_PATTERN_INDEX = 2; + + public final int dfaStatesLimit; + public final int dfaRetries; + + public final Pattern re2jPattern; + public final Pattern re2jPatternWithoutDotStartPrefix; + + public Re2JRegexp(int dfaStatesLimit, int dfaRetries, Slice pattern) { + this.dfaStatesLimit = dfaStatesLimit; + this.dfaRetries = dfaRetries; + + Options options = Options.builder() + .setAlgorithm(DFA_FALLBACK_TO_NFA) + .setMaximumNumberOfDFAStates(dfaStatesLimit) + .setNumberOfDFARetries(dfaRetries) + .setEventsListener(new RE2JEventsListener()) + .build(); + + String patternString = pattern.toStringUtf8(); + re2jPattern = Pattern.compile(patternString, options); + + // Remove .*? prefix. DFA has optimization which does fast lookup for first byte of a potential match. + // When pattern is prefixed with .*? this optimization doesn't work in Pattern.find() function. 
+ java.util.regex.Matcher dotStarPrefixMatcher = DOT_STAR_PREFIX_PATTERN.matcher(patternString); + checkState(dotStarPrefixMatcher.matches()); + String patternStringWithoutDotStartPrefix = dotStarPrefixMatcher.group(CORE_PATTERN_INDEX); + + if (!patternStringWithoutDotStartPrefix.equals(patternString)) { + re2jPatternWithoutDotStartPrefix = Pattern.compile(patternStringWithoutDotStartPrefix, options); + } else { + re2jPatternWithoutDotStartPrefix = re2jPattern; + } + } + + private static void validateGroup(int group, int groupCount) throws HiveException { + if (group < 0) { + throw new HiveException("Group cannot be negative"); + } + if (group > groupCount) { + throw new HiveException(format("Pattern has %d groups. Cannot access group %d", groupCount, group)); + } + } + + public boolean matches(Slice source) { + return re2jPatternWithoutDotStartPrefix.find(source); + } + + public Slice replace(Slice source, Slice replacement) throws HiveException { + Matcher matcher = re2jPattern.matcher(source); + try { + return matcher.replaceAll(replacement); + } catch (IndexOutOfBoundsException | IllegalArgumentException e) { + throw new HiveException("Illegal replacement sequence: " + replacement.toStringUtf8()); + } + } + + public List extractAll(Slice source, long groupIndex) throws HiveException { + Matcher matcher = re2jPattern.matcher(source); + int group = toIntExact(groupIndex); + validateGroup(group, matcher.groupCount()); + + List list = Lists.newArrayList(); + while (true) { + if (!matcher.find()) { + break; + } + + Slice searchedGroup = matcher.group(group); + if (searchedGroup == null) { + list.add(null); + continue; + } + list.add(searchedGroup.toString()); + } + return list; + } + + public Slice extract(Slice source, long groupIndex) throws HiveException { + Matcher matcher = re2jPattern.matcher(source); + int group = toIntExact(groupIndex); + validateGroup(group, matcher.groupCount()); + + if (!matcher.find()) { + return null; + } + + return 
matcher.group(group); + } + + public List split(Slice source) { + Matcher matcher = re2jPattern.matcher(source); + List list = Lists.newArrayList(); + + int lastEnd = 0; + while (matcher.find()) { + Slice slice = source.slice(lastEnd, matcher.start() - lastEnd); + lastEnd = matcher.end(); + list.add(slice.toString()); + } + + list.add(source.slice(lastEnd, source.length() - lastEnd).toString()); + return list; + } + + private class RE2JEventsListener + implements Options.EventsListener { + @Override + public void fallbackToNFA() { + log.debug("Fallback to NFA, pattern: %s, DFA states limit: %d, DFA retries: %d", re2jPattern.pattern(), dfaStatesLimit, dfaRetries); + } + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtract.java b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtract.java new file mode 100644 index 0000000..a6afc0c --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtract.java @@ -0,0 +1,42 @@ +package cc.shanruifeng.functions.regexp; + +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 22:38 + */ +@Description(name = "regexp_like" + , value = "_FUNC_(string, string) - returns substrings matching a regular expression." 
+ , extended = "Example:\n > select _FUNC_(string, pattern) from src;") +public class UDFRe2JRegexpExtract extends UDF { + private static Re2JRegexp re2JRegexp; + private Text result = new Text(); + + public UDFRe2JRegexpExtract() { + + } + + public Text evaluate(Text source, Text pattern) throws HiveException { + return evaluate(source, pattern, new LongWritable(0)); + } + + public Text evaluate(Text source, Text pattern, LongWritable groupIndex) throws HiveException { + if (source == null) { + return null; + } + + if (re2JRegexp == null) { + re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern.toString())); + } + + result.set(re2JRegexp.extract(Slices.utf8Slice(source.toString()), groupIndex.get()).toStringUtf8()); + return result; + } +} diff --git a/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtractAll.java b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtractAll.java new file mode 100644 index 0000000..416a68a --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpExtractAll.java @@ -0,0 +1,100 @@ +package cc.shanruifeng.functions.regexp; + +import io.airlift.slice.Slices; +import java.util.ArrayList; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 22:38 + */ +@Description(name = "regexp_extract_all" 
+ , value = "_FUNC_(string, string) - string(s) extracted using the given pattern\n" + + "_FUNC_(string, string, long) - group(s) extracted using the given pattern." + , extended = "Example:\n > select _FUNC_(string, pattern) from src;") +public class UDFRe2JRegexpExtractAll extends GenericUDF { + private transient ArrayList result = new ArrayList(); + private transient Re2JRegexp re2JRegexp; + + public UDFRe2JRegexpExtractAll() { + + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != 2 || arguments.length != 3) { + throw new UDFArgumentLengthException( + "The function regexp_extract_all takes exactly 2 or 3 arguments."); + } + + for (int i = 0; i < 2; i++) { + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { + throw new UDFArgumentTypeException(i, + "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " + + "expected at function regexp_extract_all, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + if (arguments.length == 3) { + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaLongObjectInspector, arguments[2])) { + throw new UDFArgumentTypeException(2, + "\"" + PrimitiveObjectInspectorFactory.javaLongObjectInspector.getTypeName() + "\" " + + "expected at function regexp_extract_all, but " + + "\"" + arguments[2].getTypeName() + "\" " + + "is found"); + } + } + + ObjectInspector expectOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + + return ObjectInspectorFactory.getStandardListObjectInspector(expectOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + String source = (String) arguments[0].get(); + String pattern = (String) arguments[1].get(); + Long groupIndex = 0L; + if (arguments.length == 3) { + groupIndex = (Long) 
arguments[2].get(); + } + + if (source == null) { + return null; + } + + if (re2JRegexp == null) { + re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern)); + } + + result.clear(); + result.addAll(re2JRegexp.extractAll(Slices.utf8Slice(source), groupIndex)); + + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == 2 || strings.length == 3); + if (strings.length == 2) { + return "regexp_extract_all(" + strings[0] + ", " + + strings[1] + ")"; + } else { + return "regexp_extract_all(" + strings[0] + ", " + + strings[1] + ", " + strings[2] + ")"; + } + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpLike.java b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpLike.java new file mode 100644 index 0000000..b5cdd13 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpLike.java @@ -0,0 +1,35 @@ +package cc.shanruifeng.functions.regexp; + +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.Text; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 22:36 + */ +@Description(name = "regexp_like" + , value = "_FUNC_(string, string) - returns substrings matching a regular expression." 
+ , extended = "Example:\n > select _FUNC_(string, pattern) from src;") +public class UDFRe2JRegexpLike extends UDF { + private static Re2JRegexp re2JRegexp; + + public UDFRe2JRegexpLike() { + + } + + public boolean evaluate(Text text, Text pattern) { + if (text == null) { + return false; + } + + if (re2JRegexp == null) { + re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern.toString())); + } + + return re2JRegexp.matches(Slices.utf8Slice(text.toString())); + } + +} diff --git a/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpReplace.java b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpReplace.java new file mode 100644 index 0000000..13694a5 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpReplace.java @@ -0,0 +1,44 @@ +package cc.shanruifeng.functions.regexp; + +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 22:37 + */ +@Description(name = "regexp_replace" + , value = "_FUNC_(string, string) - removes substrings matching a regular expression\n" + + "_FUNC_(string, string, string) - replaces substrings matching a regular expression by given string." 
+ , extended = "Example:\n > select _FUNC_(string, pattern) from src;\n" + + "select _FUNC_(string, pattern, replacement) from src;") +public class UDFRe2JRegexpReplace extends UDF { + private static Re2JRegexp re2JRegexp; + private Text result = new Text(); + + public UDFRe2JRegexpReplace() { + + } + + public Text evaluate(Text source, Text pattern) throws HiveException { + return evaluate(source, pattern, new Text(Slices.EMPTY_SLICE.toStringUtf8())); + } + + public Text evaluate(Text source, Text pattern, Text replacement) throws HiveException { + if (source == null) { + return null; + } + + if (re2JRegexp == null) { + re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern.toString())); + } + + result.set(re2JRegexp.replace(Slices.utf8Slice(source.toString()), Slices.utf8Slice(replacement.toString())).toStringUtf8()); + return result; + } + +} diff --git a/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpSplit.java b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpSplit.java new file mode 100644 index 0000000..a0cf629 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/regexp/UDFRe2JRegexpSplit.java @@ -0,0 +1,81 @@ +package cc.shanruifeng.functions.regexp; + +import io.airlift.slice.Slices; +import java.util.ArrayList; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** + * @author 
ruifeng.shan + * @date 2018-07-27 + * @time 22:38 + */ +@Description(name = "regexp_split" + , value = "_FUNC_(string, string) - returns array of strings split by pattern." + , extended = "Example:\n > select _FUNC_(string, pattern) from src;") +public class UDFRe2JRegexpSplit extends GenericUDF { + private static final int ARG_COUNT = 2; + private transient ArrayList result = new ArrayList(); + private transient Re2JRegexp re2JRegexp; + + public UDFRe2JRegexpSplit() { + + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function regexp_split(string, pattern) takes exactly " + ARG_COUNT + " arguments."); + } + + for (int i = 0; i < ARG_COUNT; i++) { + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { + throw new UDFArgumentTypeException(i, + "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " + + "expected at function regexp_split, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + ObjectInspector expectOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + + return ObjectInspectorFactory.getStandardListObjectInspector(expectOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + String source = (String) arguments[0].get(); + String pattern = (String) arguments[1].get(); + + if (source == null) { + return null; + } + + if (re2JRegexp == null) { + re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern)); + } + + result.clear(); + result.addAll(re2JRegexp.split(Slices.utf8Slice(source))); + + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "regexp_split(" + strings[0] + ", " + + strings[1] + 
")"; + } +} \ No newline at end of file From 933ed2fc11b77cc322cf00ca7efc19ef4f26cb0b Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 27 Jul 2018 23:46:38 +0800 Subject: [PATCH 07/16] revise travis file --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index c462890..08c124e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,5 @@ install: - mvn install:install-file -DgroupId=javax.jdo -DartifactId=jdo2-api -Dversion=2.3-ec -Dpackaging=jar -Dfile=$HOME/jdo2-api-2.3-ec.jar script: - - jdk_switcher use openjdk7 - - mvn clean package - jdk_switcher use oraclejdk8 - mvn clean package From 8ca2e406796aad70978d0b9688643f497568c4e7 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Sat, 18 Aug 2018 09:11:51 +0800 Subject: [PATCH 08/16] add array_shuffle functions. --- README-zh.md | 5 +- README.md | 5 +- .../functions/array/UDFArrayShuffle.java | 112 ++++++++++++++++++ .../functions/array/UDFArrayShuffleTest.java | 32 +++++ 4 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 src/main/java/cc/shanruifeng/functions/array/UDFArrayShuffle.java create mode 100644 src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java diff --git a/README-zh.md b/README-zh.md index 046dffa..6183176 100644 --- a/README-zh.md +++ b/README-zh.md @@ -73,6 +73,7 @@ mvn clean package -DskipTests |array_slice(array, start, length) -> array | 对数组进行分片操作,start为正数从前开始分片, start为负数从后开始分片, 长度为指定的长度.| |array_element_at(array<E>, index) -> E | 返回指定位置的数组元素. 如果索引位置 < 0, 则从尾部开始计数并返回.| |array_filter(array<E>, function)) -> E | 根据一个返回值为boolean类型的lambda表达式函数来对数组元素进行过滤.| +|array_shuffle(array) -> array | 对数组shuffle.| ### 3. 
map函数 | 函数| 描述 | @@ -182,7 +183,8 @@ create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArr create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayFilter'; +create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; +create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; @@ -286,6 +288,7 @@ select array_filter(array(16,13), 'x -> x > 15') => [16] select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] select array_filter(array(true, false, NULL), 'x -> x != null && x') => [true] select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] +select array_shuffle(array(16,12,18,9)) ``` ``` diff --git a/README.md b/README.md index 9c7b6fd..d035d49 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ You can also directly download file from [release page](https://github.com/aaron |array_slice(array, start, length) -> array | subsets array starting from index start (or starting from the end if start is negative) with a length of length.| |array_element_at(array<E>, index) -> E | returns element of array at given index. 
If index < 0, element_at accesses elements from the last to the first.| |array_filter(array<E>, function)) -> E | constructs an array from those elements of array for which function returns true.| +|array_shuffle(array) -> array | Generate a random permutation of the given array x.| ### 3. map functions | function| description | @@ -181,7 +182,8 @@ create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArr create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayFilter'; +create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; +create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; @@ -284,6 +286,7 @@ select array_filter(array(16,13), 'x -> x > 15') => [16] select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] select array_filter(array(true, false, NULL), 'x -> x != null && x') => [true] select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] +select array_shuffle(array(16,12,18,9)) ``` ``` diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayShuffle.java b/src/main/java/cc/shanruifeng/functions/array/UDFArrayShuffle.java new file mode 100644 index 0000000..3b2c7c9 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/array/UDFArrayShuffle.java @@ -0,0 +1,112 @@ +package 
cc.shanruifeng.functions.array; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; + +import java.util.ArrayList; +import java.util.concurrent.ThreadLocalRandom; + +/** + * @author aaron02 + * @date 2018-08-18 上午8:52 + */ +@Description(name = "array_shuffle" + , value = "_FUNC_(array) - Generates a random permutation of the given array." + , extended = "Example:\n > select _FUNC_(array) from src;") +public class UDFArrayShuffle extends GenericUDF { + private static final int ARG_COUNT = 1; // Number of arguments to this UDF + private transient ListObjectInspector arrayOI; + private transient ObjectInspector arrayElementOI; + + private transient ObjectInspectorConverters.Converter converter; + private transient ArrayList result = new ArrayList(); + + private static final int INITIAL_LENGTH = 128; + private int[] positions = new int[INITIAL_LENGTH]; + + public UDFArrayShuffle() { + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function array_shuffle(array) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of category LIST + if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) { + throw new UDFArgumentTypeException(0, + "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " + + "expected at function array_shuffle, but " + + "\"" + arguments[0].getTypeName() + "\" " + + "is found"); + } + + arrayOI = (ListObjectInspector) arguments[0]; + arrayElementOI 
= arrayOI.getListElementObjectInspector(); + + // Check if the comparison is supported for this type + if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) { + throw new UDFArgumentException("The function array_shuffle" + + " does not support comparison for " + + "\"" + arrayElementOI.getTypeName() + "\"" + + " types"); + } + + converter = ObjectInspectorConverters.getConverter(arrayElementOI, arrayElementOI); + + return ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI); + } + + @Override + public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException { + Object array = arguments[0].get(); + int arrayLength = arrayOI.getListLength(array); + + // Check if array is null or empty + if (array == null || arrayLength <= 0) { + return null; + } + + if (arrayLength == 1) { + return array; + } + + result.clear(); + + if (positions.length < arrayLength) { + positions = new int[arrayLength]; + } + for (int i = 0; i < arrayLength; i++) { + positions[i] = i; + } + + // Fisher-Yates shuffle + // Randomly swap a pair of positions + for (int i = arrayLength - 1; i > 0; i--) { + int index = ThreadLocalRandom.current().nextInt(i + 1); + int swap = positions[i]; + positions[i] = positions[index]; + positions[index] = swap; + } + + for (int i = 0; i < arrayLength; i++) { + Object arrayElement = arrayOI.getListElement(array, positions[i]); + result.add(arrayElement); + } + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "array_shuffle(" + strings[0] + ")"; + } +} \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java b/src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java new file mode 100644 index 0000000..5a1c021 --- /dev/null +++ b/src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java @@ -0,0 +1,32 @@ +package cc.shanruifeng.functions.array; + +import 
com.google.common.collect.ImmutableList; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Test; + +import java.util.List; + +/** + * @author aaron02 + * @date 2018-08-18 上午8:59 + */ +public class UDFArrayShuffleTest { + @Test + public void testArrayShuffle() throws HiveException { + UDFArrayShuffle udf = new UDFArrayShuffle(); + + ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + ObjectInspector[] arguments = {arrayOI}; + + udf.initialize(arguments); + + List array = ImmutableList.of(1,2,5,6); + GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); + GenericUDF.DeferredObject[] args = {arrayObj}; + System.out.println(udf.evaluate(args)); + } +} From 7ee3f788b6295889fd96d06536c527220c400aaf Mon Sep 17 00:00:00 2001 From: aaronshan Date: Sat, 18 Aug 2018 10:11:13 +0800 Subject: [PATCH 09/16] add sequence functions. --- README-zh.md | 8 ++ README.md | 8 ++ .../functions/array/UDFSequence.java | 86 +++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 src/main/java/cc/shanruifeng/functions/array/UDFSequence.java diff --git a/README-zh.md b/README-zh.md index 6183176..a887188 100644 --- a/README-zh.md +++ b/README-zh.md @@ -74,6 +74,9 @@ mvn clean package -DskipTests |array_element_at(array<E>, index) -> E | 返回指定位置的数组元素. 
如果索引位置 < 0, 则从尾部开始计数并返回.| |array_filter(array<E>, function)) -> E | 根据一个返回值为boolean类型的lambda表达式函数来对数组元素进行过滤.| |array_shuffle(array) -> array | 对数组shuffle.| +|sequence(start, end) -> array | 生成数组序列.| +|sequence(start, end, step) -> array | 生成数组序列.| +|sequence(start_date_string, end_data_string, step) -> array | 生成日期数组序列.| ### 3. map函数 | 函数| 描述 | @@ -185,6 +188,7 @@ create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArra create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; +create temporary function sequence as 'cc.shanruifeng.functions.array.UDFSequence'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; @@ -289,6 +293,10 @@ select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] select array_filter(array(true, false, NULL), 'x -> x != null && x') => [true] select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] select array_shuffle(array(16,12,18,9)) +select sequence(1, 5) => [1, 2, 3, 4, 5] +select sequence(5, 1) => [5, 4, 3, 2, 1] +select sequence(1, 9, 4) => [1, 5, 9] +select sequence('2016-04-12 00:00:00', '2016-04-14 00:00:00', 24*3600*1000) => ['2016-04-12 00:00:00', '2016-04-13 00:00:00', '2016-04-14 00:00:00'] ``` ``` diff --git a/README.md b/README.md index d035d49..0faaec6 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,9 @@ You can also directly download file from [release page](https://github.com/aaron |array_element_at(array<E>, index) -> E | returns element of array at given index. 
If index < 0, element_at accesses elements from the last to the first.| |array_filter(array<E>, function)) -> E | constructs an array from those elements of array for which function returns true.| |array_shuffle(array) -> array | Generate a random permutation of the given array x.| +|sequence(start, end) -> array | Generate a sequence of integers from start to stop.| +|sequence(start, end, step) -> array | Generate a sequence of integers from start to stop, incrementing by step.| +|sequence(start_date_string, end_data_string, step) -> array | Generate a sequence of date string from start to stop, incrementing by step.| ### 3. map functions | function| description | @@ -184,6 +187,7 @@ create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArra create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; +create temporary function sequence as 'cc.shanruifeng.functions.array.UDFSequence'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; @@ -287,6 +291,10 @@ select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] select array_filter(array(true, false, NULL), 'x -> x != null && x') => [true] select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] select array_shuffle(array(16,12,18,9)) +select sequence(1, 5) => [1, 2, 3, 4, 5] +select sequence(5, 1) => [5, 4, 3, 2, 1] +select sequence(1, 9, 4) => [1, 5, 9] +select sequence('2016-04-12 00:00:00', '2016-04-14 00:00:00', 24*3600*1000) => ['2016-04-12 00:00:00', '2016-04-13 00:00:00', 
'2016-04-14 00:00:00'] ``` ``` diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFSequence.java b/src/main/java/cc/shanruifeng/functions/array/UDFSequence.java new file mode 100644 index 0000000..93ef141 --- /dev/null +++ b/src/main/java/cc/shanruifeng/functions/array/UDFSequence.java @@ -0,0 +1,86 @@ +package cc.shanruifeng.functions.array; + +import com.google.common.collect.Lists; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.joda.time.DateTime; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + +import java.util.List; + +import static java.lang.Math.toIntExact; +import static cc.shanruifeng.functions.utils.Failures.checkCondition; +/** + * @author aaron02 + * @date 2018-08-18 上午9:23 + */ +@Description(name = "sequence" + , value = "_FUNC_(start, stop) - Generate a sequence of integers from start to stop.\n" + + "_FUNC_(start, stop, step) - Generate a sequence of integers from start to stop, incrementing by step." + , extended = "Example:\n > select _FUNC_(1, 5) from src;\n > select _FUNC_(1, 9, 4) from src;\n" + + " > select _FUNC_('2016-04-12', '2016-04-14') from src;") +public class UDFSequence extends UDF { + public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); + private static final long MAX_RESULT_ENTRIES = 10_000; + + public UDFSequence() { + + } + + public Object evaluate(LongWritable start, LongWritable stop) throws HiveException { + return fixedWidthSequence(start.get(), stop.get(), stop.get() >= start.get() ? 
1 : -1, Long.class); + } + + public Object evaluate(LongWritable start, LongWritable stop, LongWritable step) throws HiveException { + return fixedWidthSequence(start.get(), stop.get(), step.get(), Long.class); + } + + public Object evaluate(Text start, Text stop, long step) throws HiveException { + long startMillis = DateTime.parse(start.toString(), DEFAULT_DATE_FORMATTER).getMillis(); + long stopMillis = DateTime.parse(stop.toString(), DEFAULT_DATE_FORMATTER).getMillis(); + return fixedWidthSequence(startMillis, stopMillis, step, String.class); + } + + private static Object fixedWidthSequence(long start, long stop, long step, Class type) throws HiveException { + checkValidStep(start, stop, step); + + int length = toIntExact((stop - start) / step + 1L); + checkMaxEntry(length); + + if (type == long.class || type == Long.class) { + List result = Lists.newArrayList(); + for (long i = 0, value = start; i < length; ++i, value += step) { + result.add(value); + } + return result; + } else if (type == String.class){ + List result = Lists.newArrayList(); + for (long i = 0, value = start; i < length; ++i, value += step) { + DateTime dateTime = new DateTime(value); + result.add(dateTime.toString(DEFAULT_DATE_FORMATTER)); + } + return result; + } else { + throw new HiveException("Don't support this class type!" + type); + } + } + + private static void checkValidStep(long start, long stop, long step) throws HiveException { + checkCondition( + step != 0, + "step must not be zero"); + checkCondition( + step > 0 ? 
stop >= start : stop <= start, + "sequence stop value should be greater than or equal to start value if step is greater than zero otherwise stop should be less than or equal to start"); + } + + private static void checkMaxEntry(int length) throws HiveException { + checkCondition( + length <= MAX_RESULT_ENTRIES, + "result of sequence function must not have more than 10000 entries"); + } +} From 72a2d1357e4405cabf31b16ccc7f50ef47f2ab00 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 7 Dec 2018 17:09:12 +0800 Subject: [PATCH 10/16] update jackson version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a38cfd4..55163f7 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ 18.0 1.10 0.131 - 2.4.4 + 2.8.11.1 1.9.3 4.12 1.6 From 68783e0ebfaadd93f544b7dedb85700d01f8d983 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 7 Dec 2018 17:15:37 +0800 Subject: [PATCH 11/16] remove lambda functions --- README-zh.md | 6 - README.md | 6 - pom.xml | 6 +- .../functions/array/UDFArrayFilter.java | 113 ------------------ .../functions/utils/LambdaUtils.java | 55 --------- 5 files changed, 3 insertions(+), 183 deletions(-) delete mode 100644 src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java delete mode 100644 src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java diff --git a/README-zh.md b/README-zh.md index a887188..7e1e093 100644 --- a/README-zh.md +++ b/README-zh.md @@ -72,7 +72,6 @@ mvn clean package -DskipTests |array_value_count(array<E>, E) -> long | 统计数组中包含给定元素的个数.| |array_slice(array, start, length) -> array | 对数组进行分片操作,start为正数从前开始分片, start为负数从后开始分片, 长度为指定的长度.| |array_element_at(array<E>, index) -> E | 返回指定位置的数组元素. 
如果索引位置 < 0, 则从尾部开始计数并返回.| -|array_filter(array<E>, function)) -> E | 根据一个返回值为boolean类型的lambda表达式函数来对数组元素进行过滤.| |array_shuffle(array) -> array | 对数组shuffle.| |sequence(start, end) -> array | 生成数组序列.| |sequence(start, end, step) -> array | 生成数组序列.| @@ -186,7 +185,6 @@ create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArr create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; create temporary function sequence as 'cc.shanruifeng.functions.array.UDFSequence'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; @@ -288,10 +286,6 @@ select array_concat(array(16,12,18,9,null), array(14,9,6,18,null)) => [16,12,18, select array_value_count(array(16,13,12,13,18,16,9,18), 13) => 2 select array_slice(array(16,13,12,13,18,16,9,18), -2, 3) => [9,18] select array_element_at(array(16,13,12,13,18,16,9,18), -1) => 18 -select array_filter(array(16,13), 'x -> x > 15') => [16] -select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] -select array_filter(array(true, false, NULL), 'x -> x != null && x') => [true] -select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] select array_shuffle(array(16,12,18,9)) select sequence(1, 5) => [1, 2, 3, 4, 5] select sequence(5, 1) => [5, 4, 3, 2, 1] diff --git a/README.md b/README.md index 0faaec6..ce98821 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,6 @@ You can also directly download file from [release page](https://github.com/aaron |array_value_count(array<E>, E) -> long | count array's 
element number that element value equals given value.| |array_slice(array, start, length) -> array | subsets array starting from index start (or starting from the end if start is negative) with a length of length.| |array_element_at(array<E>, index) -> E | returns element of array at given index. If index < 0, element_at accesses elements from the last to the first.| -|array_filter(array<E>, function)) -> E | constructs an array from those elements of array for which function returns true.| |array_shuffle(array) -> array | Generate a random permutation of the given array x.| |sequence(start, end) -> array | Generate a sequence of integers from start to stop.| |sequence(start, end, step) -> array | Generate a sequence of integers from start to stop, incrementing by step.| @@ -185,7 +184,6 @@ create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArr create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function array_filter as 'cc.shanruifeng.functions.array.UDFArrayFilter'; create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; create temporary function sequence as 'cc.shanruifeng.functions.array.UDFSequence'; create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; @@ -286,10 +284,6 @@ select array_concat(array(16,12,18,9,null), array(14,9,6,18,null)) => [16,12,18, select array_value_count(array(16,13,12,13,18,16,9,18), 13) => 2 select array_slice(array(16,13,12,13,18,16,9,18), -2, 3) => [9,18] select array_element_at(array(16,13,12,13,18,16,9,18), -1) => 18 -select array_filter(array(16,13), 'x -> x > 15') => [16] -select array_filter(array('a','b'), 'x -> x == \'a\'') => [a] -select array_filter(array(true, false, NULL), 'x -> 
x != null && x') => [true] -select array_filter(array(array('abc', null, '123'), array ('def', 'x', '456')), 'x -> x.get(1) == null') => [['abc', null, '123']] select array_shuffle(array(16,12,18,9)) select sequence(1, 5) => [1, 2, 3, 4, 5] select sequence(5, 1) => [5, 4, 3, 2, 1] diff --git a/pom.xml b/pom.xml index 55163f7..289dc8d 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ cc.shanruifeng hive-third-functions - 3.0.0 + 2.2.0 UTF-8 @@ -239,8 +239,8 @@ maven-compiler-plugin 3.1 - 8 - 8 + ${project.build.targetJdk} + ${project.build.targetJdk} ${project.build.sourceEncoding} true diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java b/src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java deleted file mode 100644 index 5b23295..0000000 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayFilter.java +++ /dev/null @@ -1,113 +0,0 @@ -package cc.shanruifeng.functions.array; - -import cc.shanruifeng.functions.utils.LambdaUtils; -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import pl.joegreen.lambdaFromString.LambdaCreationException; -import pl.joegreen.lambdaFromString.LambdaFactory; - -import java.util.ArrayList; -import java.util.function.Function; - -import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.compareTypes; - -/** - * @author ruifeng.shan - * @date 2018-07-25 
下午1:37 - */ -@Description(name = "array_filter" - , value = "_FUNC_(array, function) - constructs an array from those elements of array for which function returns true." - , extended = "Example:\n > select _FUNC_(array, x -> true) from src;") -public class UDFArrayFilter extends GenericUDF { - - private static final int ARRAY_IDX = 0; - private static final int LAMBDA_FUNCTION_IDX = 1; - private static final int ARG_COUNT = 2; // Number of arguments to this UDF - private transient ObjectInspector lambdaStringOI; - private transient ListObjectInspector arrayOI; - private transient ObjectInspector arrayElementOI; - private transient ArrayList result = new ArrayList(); - private static final LambdaFactory LAMBDA_FACTORY = LambdaFactory.get(); - private Function applyFunction; - - public UDFArrayFilter() { - } - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - // Check if two arguments were passed - if (arguments.length != ARG_COUNT) { - throw new UDFArgumentLengthException( - "The function array_filter(array, function) takes exactly " + ARG_COUNT + " arguments."); - } - - // Check if ARRAY_IDX argument is of category LIST - if (!arguments[ARRAY_IDX].getCategory().equals(ObjectInspector.Category.LIST)) { - throw new UDFArgumentTypeException(ARRAY_IDX, - "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " - + "expected at function array_filter, but " - + "\"" + arguments[ARRAY_IDX].getTypeName() + "\" " - + "is found"); - } - - arrayOI = (ListObjectInspector) arguments[ARRAY_IDX]; - arrayElementOI = arrayOI.getListElementObjectInspector(); - - lambdaStringOI = arguments[LAMBDA_FUNCTION_IDX]; - - // Check if list element and value are of same type - if (!compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, lambdaStringOI)) { - throw new UDFArgumentTypeException(LAMBDA_FUNCTION_IDX, - "\"" + arrayElementOI.getTypeName() + "\"" - + " expected at function array_filter, but " 
- + "\"" + lambdaStringOI.getTypeName() + "\"" - + " is found"); - } - - return ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI); - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - Object array = arguments[ARRAY_IDX].get(); - String lambdaString = (String) arguments[LAMBDA_FUNCTION_IDX].get(); - - if (applyFunction == null) { - try { - applyFunction = LambdaUtils.getFilterFunction(LAMBDA_FACTORY, lambdaString, arrayElementOI); - } catch (LambdaCreationException e) { - throw new UDFArgumentTypeException(LAMBDA_FUNCTION_IDX, e.getMessage()); - } - } - - int arrayLength = arrayOI.getListLength(array); - // Check if array is null or empty or value is null - if (array == null || arrayLength <= 0) { - return array; - } - - for (int i = 0; i < arrayLength; ++i) { - Object listElement = arrayOI.getListElement(array, i); - if ((boolean)applyFunction.apply(listElement)) { - result.add(listElement); - } - } - - return result; - } - - @Override - public String getDisplayString(String[] strings) { - assert (strings.length == ARG_COUNT); - return "array_filter(" + strings[ARRAY_IDX] + ", " - + strings[LAMBDA_FUNCTION_IDX] + ")"; - } -} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java b/src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java deleted file mode 100644 index 40ec104..0000000 --- a/src/main/java/cc/shanruifeng/functions/utils/LambdaUtils.java +++ /dev/null @@ -1,55 +0,0 @@ -package cc.shanruifeng.functions.utils; - -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import pl.joegreen.lambdaFromString.LambdaCreationException; -import pl.joegreen.lambdaFromString.LambdaFactory; -import pl.joegreen.lambdaFromString.TypeReference; - -import java.sql.Timestamp; -import 
java.util.List; -import java.util.Map; -import java.util.function.Function; - -import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.compareTypes; - -/** - * @author aaron02 - * @date 2018-07-25 下午4:29 - */ -public class LambdaUtils { - public static Function getFilterFunction(LambdaFactory lambdaFactory, String lambdaString, ObjectInspector oi) throws LambdaCreationException { - if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaBooleanObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaByteObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaShortObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaIntObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaLongObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaFloatObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaStringObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaVoidObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaTimestampObjectInspector)) { - return 
lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>() {}); - }else if (compareTypes(oi, PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } else if (oi.getCategory().equals(ObjectInspector.Category.LIST)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference, Boolean>>(){}); - } else if (oi.getCategory().equals(ObjectInspector.Category.MAP)) { - return lambdaFactory.createLambda(lambdaString, new TypeReference, Boolean>>(){}); - } else { - return lambdaFactory.createLambda(lambdaString, new TypeReference>(){}); - } - } -} From a2ba1b3fd5d2d68935a0e9ccc230227d2432a647 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 7 Dec 2018 17:16:53 +0800 Subject: [PATCH 12/16] remove array filter functions --- .../functions/array/UDFArrayFilterTest.java | 54 ------------------- 1 file changed, 54 deletions(-) delete mode 100644 src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java b/src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java deleted file mode 100644 index 0c0eda7..0000000 --- a/src/test/java/cc/shanruifeng/functions/array/UDFArrayFilterTest.java +++ /dev/null @@ -1,54 +0,0 @@ -package cc.shanruifeng.functions.array; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterables; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import 
static java.util.Arrays.asList; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.List; - -import static org.junit.Assert.*; - -public class UDFArrayFilterTest { - @Test - public void test() throws HiveException { - UDFArrayFilter udf = new UDFArrayFilter(); - - ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector)); - ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ObjectInspector[] arguments = {arrayOI, valueOI}; - - udf.initialize(arguments); - List> array = ImmutableList.of(asList("abc", null, "123"), ImmutableList.of("a", "b")); - GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); - GenericUDF.DeferredObject valueObj = new GenericUDF.DeferredJavaObject("s -> s.get(1) == null"); - GenericUDF.DeferredObject[] args = {arrayObj, valueObj}; - ArrayList output = (ArrayList) udf.evaluate(args); - - assertTrue(Iterables.elementsEqual(ImmutableList.of(asList("abc", null, "123")), output)); - } - - @Test - public void test1() throws HiveException { - UDFArrayFilter udf = new UDFArrayFilter(); - - ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector); - ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ObjectInspector[] arguments = {arrayOI, valueOI}; - - udf.initialize(arguments); - List array = asList(null); - GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); - GenericUDF.DeferredObject valueObj = new GenericUDF.DeferredJavaObject("x -> x != null && x"); - GenericUDF.DeferredObject[] args = {arrayObj, valueObj}; - ArrayList output = (ArrayList) udf.evaluate(args); - - assertTrue(Iterables.elementsEqual(asList(true), output)); - } -} \ No newline at end of file From 
19234c87581db83e825c3cf84aa03adca4cfcace Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 7 Dec 2018 17:39:31 +0800 Subject: [PATCH 13/16] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E7=A8=8B=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- pom.xml | 8 +- .../functions/array/UDFSequence.java | 2 +- .../functions/regexp/Re2JRegexp.java | 11 ++- .../string/UDFStringLevenshteinDistance.java | 2 +- .../functions/string/UDFWordStem.java | 89 ------------------- 6 files changed, 17 insertions(+), 97 deletions(-) delete mode 100644 src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java diff --git a/README.md b/README.md index ce98821..5b24d3e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Some useful custom hive udf functions, especial array and json functions. > Note: > 1. hive-third-functions support hive-0.11.0 or higher. -> 2. hive-third-functions `3.0.0` need java8 or higher. +> 2. hive-third-functions `2.2.0` need java8 or higher. 
## Build diff --git a/pom.xml b/pom.xml index 289dc8d..d4d9f61 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ 18.0 1.10 0.131 - 2.8.11.1 + 2.9.0 1.9.3 4.12 1.6 @@ -89,6 +89,12 @@ ${dep.jackson.version} + + com.fasterxml.jackson.core + jackson-annotations + ${dep.jackson.version} + + junit junit diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFSequence.java b/src/main/java/cc/shanruifeng/functions/array/UDFSequence.java index 93ef141..51cf631 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFSequence.java +++ b/src/main/java/cc/shanruifeng/functions/array/UDFSequence.java @@ -25,7 +25,7 @@ " > select _FUNC_('2016-04-12', '2016-04-14') from src;") public class UDFSequence extends UDF { public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); - private static final long MAX_RESULT_ENTRIES = 10_000; + private static final long MAX_RESULT_ENTRIES = 10000; public UDFSequence() { diff --git a/src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java b/src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java index 3466e7b..7720238 100644 --- a/src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java +++ b/src/main/java/cc/shanruifeng/functions/regexp/Re2JRegexp.java @@ -5,11 +5,12 @@ import com.google.re2j.Options; import com.google.re2j.Pattern; import io.airlift.slice.Slice; -import java.util.List; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.List; + import static com.google.common.base.Preconditions.checkState; import static com.google.re2j.Options.Algorithm.DFA_FALLBACK_TO_NFA; import static java.lang.Math.toIntExact; @@ -76,14 +77,16 @@ public Slice replace(Slice source, Slice replacement) throws HiveException { Matcher matcher = re2jPattern.matcher(source); try { return matcher.replaceAll(replacement); - } catch (IndexOutOfBoundsException | IllegalArgumentException e) { + } catch 
(IndexOutOfBoundsException e) { + throw new HiveException("Illegal replacement sequence: " + replacement.toStringUtf8()); + } catch (IllegalArgumentException e) { throw new HiveException("Illegal replacement sequence: " + replacement.toStringUtf8()); } } public List extractAll(Slice source, long groupIndex) throws HiveException { Matcher matcher = re2jPattern.matcher(source); - int group = toIntExact(groupIndex); + int group = (int)(groupIndex); validateGroup(group, matcher.groupCount()); List list = Lists.newArrayList(); @@ -104,7 +107,7 @@ public List extractAll(Slice source, long groupIndex) throws HiveExcepti public Slice extract(Slice source, long groupIndex) throws HiveException { Matcher matcher = re2jPattern.matcher(source); - int group = toIntExact(groupIndex); + int group = (int)(groupIndex); validateGroup(group, matcher.groupCount()); if (!matcher.find()) { diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java b/src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java index a37060f..7bef2b9 100644 --- a/src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java +++ b/src/main/java/cc/shanruifeng/functions/string/UDFStringLevenshteinDistance.java @@ -55,7 +55,7 @@ public LongWritable evaluate(Text leftText, Text rightText) throws HiveException return result; } - checkCondition((leftCodePoints.length * (rightCodePoints.length - 1)) <= 1_000_000, + checkCondition((leftCodePoints.length * (rightCodePoints.length - 1)) <= 1000000, "The combined inputs for Levenshtein distance are too large"); int[] distances = new int[rightCodePoints.length]; diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java b/src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java deleted file mode 100644 index 7c58703..0000000 --- a/src/main/java/cc/shanruifeng/functions/string/UDFWordStem.java +++ /dev/null @@ -1,89 +0,0 @@ -package cc.shanruifeng.functions.string; - -import 
com.google.common.collect.ImmutableMap; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import org.apache.commons.codec.digest.DigestUtils; -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDF; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.Text; -import org.tartarus.snowball.SnowballProgram; -import org.tartarus.snowball.ext.*; - -import java.util.Map; -import java.util.function.Supplier; - -import static io.airlift.slice.Slices.utf8Slice; - -/** - * @author ruifeng.shan - * @date 2018-07-27 下午12:22 - */ -@Description(name = "word_stem" - , value = "_FUNC_(word) - returns the stem of a word in the English language\n" + - "_FUNC_(word, language) - returns the stem of a word in the given language." - , extended = "Example:\n > select _FUNC_(string, language_str) from src;") -public class UDFWordStem extends UDF { - private Text result = new Text(); - - private static final Map> STEMMERS = ImmutableMap.>builder() - .put(utf8Slice("ca"), () -> new CatalanStemmer()) - .put(utf8Slice("da"), DanishStemmer::new) - .put(utf8Slice("de"), German2Stemmer::new) - .put(utf8Slice("en"), EnglishStemmer::new) - .put(utf8Slice("es"), SpanishStemmer::new) - .put(utf8Slice("eu"), BasqueStemmer::new) - .put(utf8Slice("fi"), FinnishStemmer::new) - .put(utf8Slice("fr"), FrenchStemmer::new) - .put(utf8Slice("hu"), HungarianStemmer::new) - .put(utf8Slice("hy"), ArmenianStemmer::new) - .put(utf8Slice("ir"), IrishStemmer::new) - .put(utf8Slice("it"), ItalianStemmer::new) - .put(utf8Slice("lt"), LithuanianStemmer::new) - .put(utf8Slice("nl"), DutchStemmer::new) - .put(utf8Slice("no"), NorwegianStemmer::new) - .put(utf8Slice("pt"), PortugueseStemmer::new) - .put(utf8Slice("ro"), RomanianStemmer::new) - .put(utf8Slice("ru"), RussianStemmer::new) - .put(utf8Slice("sv"), SwedishStemmer::new) - .put(utf8Slice("tr"), TurkishStemmer::new) - .build(); - - - public UDFWordStem() { - } - - public Text 
evaluate(Text text) { - if (text == null) { - return null; - } - - String string = wordStem(Slices.utf8Slice(text.toString()), new EnglishStemmer()).toString(); - result.set(string); - - return result; - } - - public Text evaluate(Text text, Text language) throws HiveException { - if (text == null) { - return null; - } - - Supplier stemmer = STEMMERS.get(language.toString()); - if (stemmer == null) { - throw new HiveException("Unknown stemmer language: " + language.toString()); - } - - String string = wordStem(Slices.utf8Slice(text.toString()), stemmer.get()).toString(); - result.set(string); - - return result; - } - - private static Slice wordStem(Slice slice, SnowballProgram stemmer) - { - stemmer.setCurrent(slice.toStringUtf8()); - return stemmer.stem() ? utf8Slice(stemmer.getCurrent()) : slice; - } -} \ No newline at end of file From 99143a639bf15af84c9fa3e29473b9c5978725bd Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 1 Feb 2019 16:12:11 +0800 Subject: [PATCH 14/16] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E7=A8=8B=E5=BA=8F?= =?UTF-8?q?=E5=8C=85=E5=90=8D=EF=BC=8C=E4=BB=A5=E4=BE=BF=E5=92=8Cmaven?= =?UTF-8?q?=E4=B8=AD=E5=A4=AE=E4=BB=93=E5=BA=93=E7=9A=84groupid=E4=B8=80?= =?UTF-8?q?=E8=87=B4=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-zh.md | 108 +++++++++--------- README.md | 108 +++++++++--------- pom.xml | 67 ++++++++++- .../functions/array/UDFArrayConcat.java | 2 +- .../functions/array/UDFArrayContains.java | 2 +- .../functions/array/UDFArrayDistinct.java | 6 +- .../functions/array/UDFArrayElementAt.java | 2 +- .../functions/array/UDFArrayEquals.java | 4 +- .../functions/array/UDFArrayIntersect.java | 6 +- .../functions/array/UDFArrayJoin.java | 2 +- .../functions/array/UDFArrayMax.java | 6 +- .../functions/array/UDFArrayMin.java | 6 +- .../functions/array/UDFArrayPosition.java | 2 +- .../functions/array/UDFArrayRemove.java | 2 +- .../functions/array/UDFArrayReverse.java | 2 +- 
.../functions/array/UDFArraySlice.java | 2 +- .../functions/array/UDFArraySort.java | 10 +- .../functions/array/UDFArrayValueCount.java | 2 +- .../functions/bitwise/UDFBitCount.java | 2 +- .../functions/bitwise/UDFBitwiseAnd.java | 2 +- .../functions/bitwise/UDFBitwiseNot.java | 2 +- .../functions/bitwise/UDFBitwiseOr.java | 2 +- .../functions/bitwise/UDFBitwiseXor.java | 2 +- .../functions/card/UDFChinaIdCardArea.java | 4 +- .../card/UDFChinaIdCardBirthday.java | 4 +- .../functions/card/UDFChinaIdCardCity.java | 4 +- .../functions/card/UDFChinaIdCardGender.java | 4 +- .../functions/card/UDFChinaIdCardInfo.java | 4 +- .../card/UDFChinaIdCardProvince.java | 4 +- .../functions/card/UDFChinaIdCardValid.java | 4 +- .../functions/date/UDFDayOfWeek.java | 2 +- .../functions/date/UDFDayOfYear.java | 2 +- .../functions/date/UDFTypeOfDay.java | 4 +- .../functions/date/UDFZodiacSignCn.java | 2 +- .../functions/date/UDFZodiacSignEn.java | 2 +- .../fastuitl/ints/AbstractIntComparator.java | 2 +- .../functions/fastuitl/ints/IntArrays.java | 2 +- .../fastuitl/ints/IntComparator.java | 2 +- .../functions/geo/UDFGeoBdToGcj.java | 4 +- .../functions/geo/UDFGeoGcjExtractWgs.java | 4 +- .../functions/geo/UDFGeoGcjToBd.java | 4 +- .../functions/geo/UDFGeoGcjToWgs.java | 4 +- .../functions/geo/UDFGeoWgsDistance.java | 4 +- .../functions/geo/UDFGeoWgsToGcj.java | 4 +- .../functions/json/UDFJsonArrayExtract.java | 8 +- .../json/UDFJsonArrayExtractScalar.java | 8 +- .../functions/json/UDFJsonArrayGet.java | 4 +- .../functions/json/UDFJsonArrayLength.java | 4 +- .../functions/json/UDFJsonExtract.java | 6 +- .../functions/json/UDFJsonExtractScalar.java | 6 +- .../functions/json/UDFJsonSize.java | 6 +- .../aaronshan}/functions/map/UDFMapBuild.java | 2 +- .../functions/map/UDFMapConcat.java | 2 +- .../functions/map/UDFMapElementAt.java | 2 +- .../functions/map/UDFMapEquals.java | 4 +- .../functions/model/ChinaIdArea.java | 2 +- .../functions/string/UDFChineseToPinYin.java | 2 +- 
.../aaronshan}/functions/string/UDFMd5.java | 2 +- .../functions/string/UDFSha256.java | 2 +- .../functions/url/UDFUrlDecode.java | 2 +- .../functions/url/UDFUrlEncode.java | 2 +- .../functions/utils/ArrayUtils.java | 6 +- .../aaronshan}/functions/utils/CardUtils.java | 4 +- .../functions/utils/ConfigUtils.java | 4 +- .../aaronshan}/functions/utils/GeoUtils.java | 2 +- .../aaronshan}/functions/utils/MapUtils.java | 2 +- .../functions/utils/json/JsonExtract.java | 2 +- .../functions/utils/json/JsonPath.java | 2 +- .../utils/json/JsonPathTokenizer.java | 2 +- .../functions/utils/json/JsonUtils.java | 2 +- .../functions/array/UDFArrayContainsTest.java | 2 +- .../array/UDFArrayIntersectTest.java | 2 +- .../functions/bitwise/UDFBitCountTest.java | 2 +- .../functions/date/UDFDayOfYearTest.java | 2 +- .../functions/geo/UDFGeoBdToGcjTest.java | 2 +- .../functions/map/UDFMapBuildTest.java | 8 +- .../functions/map/UDFMapConcatTest.java | 8 +- .../functions/map/UDFMapElementAtTest.java | 4 +- .../functions/map/UDFMapEqualsTest.java | 4 +- .../functions/url/UDFUrlDecodeTest.java | 2 +- .../functions/url/UDFUrlEncodeTest.java | 4 +- 81 files changed, 310 insertions(+), 239 deletions(-) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayConcat.java (99%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayContains.java (99%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayDistinct.java (95%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayElementAt.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayEquals.java (97%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayIntersect.java (97%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayJoin.java (99%) rename src/main/java/{cc/shanruifeng => 
com/github/aaronshan}/functions/array/UDFArrayMax.java (94%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayMin.java (94%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayPosition.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayRemove.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayReverse.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArraySlice.java (99%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArraySort.java (91%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayValueCount.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/bitwise/UDFBitCount.java (96%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/bitwise/UDFBitwiseAnd.java (92%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/bitwise/UDFBitwiseNot.java (92%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/bitwise/UDFBitwiseOr.java (92%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/bitwise/UDFBitwiseXor.java (92%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/card/UDFChinaIdCardArea.java (87%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/card/UDFChinaIdCardBirthday.java (87%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/card/UDFChinaIdCardCity.java (87%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/card/UDFChinaIdCardGender.java (87%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/card/UDFChinaIdCardInfo.java (87%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/card/UDFChinaIdCardProvince.java (87%) rename 
src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/card/UDFChinaIdCardValid.java (88%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/date/UDFDayOfWeek.java (97%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/date/UDFDayOfYear.java (97%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/date/UDFTypeOfDay.java (96%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/date/UDFZodiacSignCn.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/date/UDFZodiacSignEn.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/fastuitl/ints/AbstractIntComparator.java (88%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/fastuitl/ints/IntArrays.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/fastuitl/ints/IntComparator.java (82%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/geo/UDFGeoBdToGcj.java (85%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/geo/UDFGeoGcjExtractWgs.java (85%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/geo/UDFGeoGcjToBd.java (84%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/geo/UDFGeoGcjToWgs.java (84%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/geo/UDFGeoWgsDistance.java (86%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/geo/UDFGeoWgsToGcj.java (84%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/json/UDFJsonArrayExtract.java (93%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/json/UDFJsonArrayExtractScalar.java (93%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/json/UDFJsonArrayGet.java (88%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/json/UDFJsonArrayLength.java (89%) 
rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/json/UDFJsonExtract.java (84%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/json/UDFJsonExtractScalar.java (85%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/json/UDFJsonSize.java (87%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapBuild.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapConcat.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapElementAt.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapEquals.java (97%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/model/ChinaIdArea.java (91%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/string/UDFChineseToPinYin.java (97%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/string/UDFMd5.java (94%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/string/UDFSha256.java (94%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/url/UDFUrlDecode.java (95%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/url/UDFUrlEncode.java (96%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/ArrayUtils.java (92%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/CardUtils.java (97%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/ConfigUtils.java (96%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/GeoUtils.java (99%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/MapUtils.java (96%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/json/JsonExtract.java (99%) rename src/main/java/{cc/shanruifeng => 
com/github/aaronshan}/functions/utils/json/JsonPath.java (95%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/json/JsonPathTokenizer.java (98%) rename src/main/java/{cc/shanruifeng => com/github/aaronshan}/functions/utils/json/JsonUtils.java (98%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayContainsTest.java (97%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/array/UDFArrayIntersectTest.java (97%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/bitwise/UDFBitCountTest.java (95%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/date/UDFDayOfYearTest.java (94%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/geo/UDFGeoBdToGcjTest.java (93%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapBuildTest.java (87%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapConcatTest.java (88%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapElementAtTest.java (94%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/map/UDFMapEqualsTest.java (96%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/url/UDFUrlDecodeTest.java (94%) rename src/test/java/{cc/shanruifeng => com/github/aaronshan}/functions/url/UDFUrlEncodeTest.java (87%) diff --git a/README-zh.md b/README-zh.md index fb060d2..3c5d2ee 100644 --- a/README-zh.md +++ b/README-zh.md @@ -151,60 +151,60 @@ mvn clean package -DskipTests ``` add jar ${jar_location_dir}/hive-third-functions-${version}-shaded.jar -create temporary function array_contains as 'cc.shanruifeng.functions.array.UDFArrayContains'; -create temporary function array_equals as 'cc.shanruifeng.functions.array.UDFArrayEquals'; -create temporary function array_intersect as 'cc.shanruifeng.functions.array.UDFArrayIntersect'; -create temporary function 
array_max as 'cc.shanruifeng.functions.array.UDFArrayMax'; -create temporary function array_min as 'cc.shanruifeng.functions.array.UDFArrayMin'; -create temporary function array_join as 'cc.shanruifeng.functions.array.UDFArrayJoin'; -create temporary function array_distinct as 'cc.shanruifeng.functions.array.UDFArrayDistinct'; -create temporary function array_position as 'cc.shanruifeng.functions.array.UDFArrayPosition'; -create temporary function array_remove as 'cc.shanruifeng.functions.array.UDFArrayRemove'; -create temporary function array_reverse as 'cc.shanruifeng.functions.array.UDFArrayReverse'; -create temporary function array_sort as 'cc.shanruifeng.functions.array.UDFArraySort'; -create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArrayConcat'; -create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; -create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; -create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; -create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; -create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; -create temporary function bitwise_or as 'cc.shanruifeng.functions.bitwise.UDFBitwiseOr'; -create temporary function bitwise_xor as 'cc.shanruifeng.functions.bitwise.UDFBitwiseXor'; -create temporary function map_build as 'cc.shanruifeng.functions.map.UDFMapBuild'; -create temporary function map_concat as 'cc.shanruifeng.functions.map.UDFMapConcat'; -create temporary function map_element_at as 'cc.shanruifeng.functions.map.UDFMapElementAt'; -create temporary function map_equals as 'cc.shanruifeng.functions.map.UDFMapEquals'; -create temporary function day_of_week as 'cc.shanruifeng.functions.date.UDFDayOfWeek'; -create temporary function 
day_of_year as 'cc.shanruifeng.functions.date.UDFDayOfYear'; -create temporary function type_of_day as 'cc.shanruifeng.functions.date.UDFTypeOfDay'; -create temporary function zodiac_cn as 'cc.shanruifeng.functions.date.UDFZodiacSignCn'; -create temporary function zodiac_en as 'cc.shanruifeng.functions.date.UDFZodiacSignEn'; -create temporary function pinyin as 'cc.shanruifeng.functions.string.UDFChineseToPinYin'; -create temporary function md5 as 'cc.shanruifeng.functions.string.UDFMd5'; -create temporary function sha256 as 'cc.shanruifeng.functions.string.UDFSha256'; -create temporary function json_array_get as 'cc.shanruifeng.functions.json.UDFJsonArrayGet'; -create temporary function json_array_length as 'cc.shanruifeng.functions.json.UDFJsonArrayLength'; -create temporary function json_array_extract as 'cc.shanruifeng.functions.json.UDFJsonArrayExtract'; -create temporary function json_array_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonArrayExtractScalar'; -create temporary function json_extract as 'cc.shanruifeng.functions.json.UDFJsonExtract'; -create temporary function json_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonExtractScalar'; -create temporary function json_size as 'cc.shanruifeng.functions.json.UDFJsonSize'; -create temporary function id_card_province as 'cc.shanruifeng.functions.card.UDFChinaIdCardProvince'; -create temporary function id_card_city as 'cc.shanruifeng.functions.card.UDFChinaIdCardCity'; -create temporary function id_card_area as 'cc.shanruifeng.functions.card.UDFChinaIdCardArea'; -create temporary function id_card_birthday as 'cc.shanruifeng.functions.card.UDFChinaIdCardBirthday'; -create temporary function id_card_gender as 'cc.shanruifeng.functions.card.UDFChinaIdCardGender'; -create temporary function is_valid_id_card as 'cc.shanruifeng.functions.card.UDFChinaIdCardValid'; -create temporary function id_card_info as 'cc.shanruifeng.functions.card.UDFChinaIdCardInfo'; -create temporary function wgs_distance as 
'cc.shanruifeng.functions.geo.UDFGeoWgsDistance'; -create temporary function gcj_to_bd as 'cc.shanruifeng.functions.geo.UDFGeoGcjToBd'; -create temporary function bd_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoBdToGcj'; -create temporary function wgs_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoWgsToGcj'; -create temporary function gcj_to_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjToWgs'; -create temporary function gcj_extract_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjExtractWgs'; -create temporary function url_encode as 'cc.shanruifeng.functions.url.UDFUrlEncode'; -create temporary function url_decode as 'cc.shanruifeng.functions.url.UDFUrlDecode'; +create temporary function array_contains as 'com.github.aaronshan.functions.array.UDFArrayContains'; +create temporary function array_equals as 'com.github.aaronshan.functions.array.UDFArrayEquals'; +create temporary function array_intersect as 'com.github.aaronshan.functions.array.UDFArrayIntersect'; +create temporary function array_max as 'com.github.aaronshan.functions.array.UDFArrayMax'; +create temporary function array_min as 'com.github.aaronshan.functions.array.UDFArrayMin'; +create temporary function array_join as 'com.github.aaronshan.functions.array.UDFArrayJoin'; +create temporary function array_distinct as 'com.github.aaronshan.functions.array.UDFArrayDistinct'; +create temporary function array_position as 'com.github.aaronshan.functions.array.UDFArrayPosition'; +create temporary function array_remove as 'com.github.aaronshan.functions.array.UDFArrayRemove'; +create temporary function array_reverse as 'com.github.aaronshan.functions.array.UDFArrayReverse'; +create temporary function array_sort as 'com.github.aaronshan.functions.array.UDFArraySort'; +create temporary function array_concat as 'com.github.aaronshan.functions.array.UDFArrayConcat'; +create temporary function array_value_count as 'com.github.aaronshan.functions.array.UDFArrayValueCount'; +create temporary function array_slice as 
'com.github.aaronshan.functions.array.UDFArraySlice'; +create temporary function array_element_at as 'com.github.aaronshan.functions.array.UDFArrayElementAt'; +create temporary function bit_count as 'com.github.aaronshan.functions.bitwise.UDFBitCount'; +create temporary function bitwise_and as 'com.github.aaronshan.functions.bitwise.UDFBitwiseAnd'; +create temporary function bitwise_not as 'com.github.aaronshan.functions.bitwise.UDFBitwiseNot'; +create temporary function bitwise_or as 'com.github.aaronshan.functions.bitwise.UDFBitwiseOr'; +create temporary function bitwise_xor as 'com.github.aaronshan.functions.bitwise.UDFBitwiseXor'; +create temporary function map_build as 'com.github.aaronshan.functions.map.UDFMapBuild'; +create temporary function map_concat as 'com.github.aaronshan.functions.map.UDFMapConcat'; +create temporary function map_element_at as 'com.github.aaronshan.functions.map.UDFMapElementAt'; +create temporary function map_equals as 'com.github.aaronshan.functions.map.UDFMapEquals'; +create temporary function day_of_week as 'com.github.aaronshan.functions.date.UDFDayOfWeek'; +create temporary function day_of_year as 'com.github.aaronshan.functions.date.UDFDayOfYear'; +create temporary function type_of_day as 'com.github.aaronshan.functions.date.UDFTypeOfDay'; +create temporary function zodiac_cn as 'com.github.aaronshan.functions.date.UDFZodiacSignCn'; +create temporary function zodiac_en as 'com.github.aaronshan.functions.date.UDFZodiacSignEn'; +create temporary function pinyin as 'com.github.aaronshan.functions.string.UDFChineseToPinYin'; +create temporary function md5 as 'com.github.aaronshan.functions.string.UDFMd5'; +create temporary function sha256 as 'com.github.aaronshan.functions.string.UDFSha256'; +create temporary function json_array_get as 'com.github.aaronshan.functions.json.UDFJsonArrayGet'; +create temporary function json_array_length as 'com.github.aaronshan.functions.json.UDFJsonArrayLength'; +create temporary function 
json_array_extract as 'com.github.aaronshan.functions.json.UDFJsonArrayExtract'; +create temporary function json_array_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonArrayExtractScalar'; +create temporary function json_extract as 'com.github.aaronshan.functions.json.UDFJsonExtract'; +create temporary function json_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonExtractScalar'; +create temporary function json_size as 'com.github.aaronshan.functions.json.UDFJsonSize'; +create temporary function id_card_province as 'com.github.aaronshan.functions.card.UDFChinaIdCardProvince'; +create temporary function id_card_city as 'com.github.aaronshan.functions.card.UDFChinaIdCardCity'; +create temporary function id_card_area as 'com.github.aaronshan.functions.card.UDFChinaIdCardArea'; +create temporary function id_card_birthday as 'com.github.aaronshan.functions.card.UDFChinaIdCardBirthday'; +create temporary function id_card_gender as 'com.github.aaronshan.functions.card.UDFChinaIdCardGender'; +create temporary function is_valid_id_card as 'com.github.aaronshan.functions.card.UDFChinaIdCardValid'; +create temporary function id_card_info as 'com.github.aaronshan.functions.card.UDFChinaIdCardInfo'; +create temporary function wgs_distance as 'com.github.aaronshan.functions.geo.UDFGeoWgsDistance'; +create temporary function gcj_to_bd as 'com.github.aaronshan.functions.geo.UDFGeoGcjToBd'; +create temporary function bd_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoBdToGcj'; +create temporary function wgs_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoWgsToGcj'; +create temporary function gcj_to_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjToWgs'; +create temporary function gcj_extract_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjExtractWgs'; +create temporary function url_encode as 'com.github.aaronshan.functions.url.UDFUrlEncode'; +create temporary function url_decode as 'com.github.aaronshan.functions.url.UDFUrlDecode'; ``` 
你可以在hive的命令杭中使用下面的语句来查看函数的细节. diff --git a/README.md b/README.md index 3cce4ac..0ebb4be 100644 --- a/README.md +++ b/README.md @@ -151,60 +151,60 @@ Put these statements into `${HOME}/.hiverc` or exec its on hive cli env. ``` add jar ${jar_location_dir}/hive-third-functions-${version}-shaded.jar -create temporary function array_contains as 'cc.shanruifeng.functions.array.UDFArrayContains'; -create temporary function array_equals as 'cc.shanruifeng.functions.array.UDFArrayEquals'; -create temporary function array_intersect as 'cc.shanruifeng.functions.array.UDFArrayIntersect'; -create temporary function array_max as 'cc.shanruifeng.functions.array.UDFArrayMax'; -create temporary function array_min as 'cc.shanruifeng.functions.array.UDFArrayMin'; -create temporary function array_join as 'cc.shanruifeng.functions.array.UDFArrayJoin'; -create temporary function array_distinct as 'cc.shanruifeng.functions.array.UDFArrayDistinct'; -create temporary function array_position as 'cc.shanruifeng.functions.array.UDFArrayPosition'; -create temporary function array_remove as 'cc.shanruifeng.functions.array.UDFArrayRemove'; -create temporary function array_reverse as 'cc.shanruifeng.functions.array.UDFArrayReverse'; -create temporary function array_sort as 'cc.shanruifeng.functions.array.UDFArraySort'; -create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArrayConcat'; -create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; -create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; -create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; -create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; -create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; -create temporary function 
bitwise_or as 'cc.shanruifeng.functions.bitwise.UDFBitwiseOr'; -create temporary function bitwise_xor as 'cc.shanruifeng.functions.bitwise.UDFBitwiseXor'; -create temporary function map_build as 'cc.shanruifeng.functions.map.UDFMapBuild'; -create temporary function map_concat as 'cc.shanruifeng.functions.map.UDFMapConcat'; -create temporary function map_element_at as 'cc.shanruifeng.functions.map.UDFMapElementAt'; -create temporary function map_equals as 'cc.shanruifeng.functions.map.UDFMapEquals'; -create temporary function day_of_week as 'cc.shanruifeng.functions.date.UDFDayOfWeek'; -create temporary function day_of_year as 'cc.shanruifeng.functions.date.UDFDayOfYear'; -create temporary function type_of_day as 'cc.shanruifeng.functions.date.UDFTypeOfDay'; -create temporary function zodiac_cn as 'cc.shanruifeng.functions.date.UDFZodiacSignCn'; -create temporary function zodiac_en as 'cc.shanruifeng.functions.date.UDFZodiacSignEn'; -create temporary function pinyin as 'cc.shanruifeng.functions.string.UDFChineseToPinYin'; -create temporary function md5 as 'cc.shanruifeng.functions.string.UDFMd5'; -create temporary function sha256 as 'cc.shanruifeng.functions.string.UDFSha256'; -create temporary function json_array_get as 'cc.shanruifeng.functions.json.UDFJsonArrayGet'; -create temporary function json_array_length as 'cc.shanruifeng.functions.json.UDFJsonArrayLength'; -create temporary function json_array_extract as 'cc.shanruifeng.functions.json.UDFJsonArrayExtract'; -create temporary function json_array_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonArrayExtractScalar'; -create temporary function json_extract as 'cc.shanruifeng.functions.json.UDFJsonExtract'; -create temporary function json_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonExtractScalar'; -create temporary function json_size as 'cc.shanruifeng.functions.json.UDFJsonSize'; -create temporary function id_card_province as 'cc.shanruifeng.functions.card.UDFChinaIdCardProvince'; -create 
temporary function id_card_city as 'cc.shanruifeng.functions.card.UDFChinaIdCardCity'; -create temporary function id_card_area as 'cc.shanruifeng.functions.card.UDFChinaIdCardArea'; -create temporary function id_card_birthday as 'cc.shanruifeng.functions.card.UDFChinaIdCardBirthday'; -create temporary function id_card_gender as 'cc.shanruifeng.functions.card.UDFChinaIdCardGender'; -create temporary function is_valid_id_card as 'cc.shanruifeng.functions.card.UDFChinaIdCardValid'; -create temporary function id_card_info as 'cc.shanruifeng.functions.card.UDFChinaIdCardInfo'; -create temporary function wgs_distance as 'cc.shanruifeng.functions.geo.UDFGeoWgsDistance'; -create temporary function gcj_to_bd as 'cc.shanruifeng.functions.geo.UDFGeoGcjToBd'; -create temporary function bd_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoBdToGcj'; -create temporary function wgs_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoWgsToGcj'; -create temporary function gcj_to_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjToWgs'; -create temporary function gcj_extract_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjExtractWgs'; -create temporary function url_encode as 'cc.shanruifeng.functions.url.UDFUrlEncode'; -create temporary function url_decode as 'cc.shanruifeng.functions.url.UDFUrlDecode'; +create temporary function array_contains as 'com.github.aaronshan.functions.array.UDFArrayContains'; +create temporary function array_equals as 'com.github.aaronshan.functions.array.UDFArrayEquals'; +create temporary function array_intersect as 'com.github.aaronshan.functions.array.UDFArrayIntersect'; +create temporary function array_max as 'com.github.aaronshan.functions.array.UDFArrayMax'; +create temporary function array_min as 'com.github.aaronshan.functions.array.UDFArrayMin'; +create temporary function array_join as 'com.github.aaronshan.functions.array.UDFArrayJoin'; +create temporary function array_distinct as 'com.github.aaronshan.functions.array.UDFArrayDistinct'; +create temporary 
function array_position as 'com.github.aaronshan.functions.array.UDFArrayPosition'; +create temporary function array_remove as 'com.github.aaronshan.functions.array.UDFArrayRemove'; +create temporary function array_reverse as 'com.github.aaronshan.functions.array.UDFArrayReverse'; +create temporary function array_sort as 'com.github.aaronshan.functions.array.UDFArraySort'; +create temporary function array_concat as 'com.github.aaronshan.functions.array.UDFArrayConcat'; +create temporary function array_value_count as 'com.github.aaronshan.functions.array.UDFArrayValueCount'; +create temporary function array_slice as 'com.github.aaronshan.functions.array.UDFArraySlice'; +create temporary function array_element_at as 'com.github.aaronshan.functions.array.UDFArrayElementAt'; +create temporary function bit_count as 'com.github.aaronshan.functions.bitwise.UDFBitCount'; +create temporary function bitwise_and as 'com.github.aaronshan.functions.bitwise.UDFBitwiseAnd'; +create temporary function bitwise_not as 'com.github.aaronshan.functions.bitwise.UDFBitwiseNot'; +create temporary function bitwise_or as 'com.github.aaronshan.functions.bitwise.UDFBitwiseOr'; +create temporary function bitwise_xor as 'com.github.aaronshan.functions.bitwise.UDFBitwiseXor'; +create temporary function map_build as 'com.github.aaronshan.functions.map.UDFMapBuild'; +create temporary function map_concat as 'com.github.aaronshan.functions.map.UDFMapConcat'; +create temporary function map_element_at as 'com.github.aaronshan.functions.map.UDFMapElementAt'; +create temporary function map_equals as 'com.github.aaronshan.functions.map.UDFMapEquals'; +create temporary function day_of_week as 'com.github.aaronshan.functions.date.UDFDayOfWeek'; +create temporary function day_of_year as 'com.github.aaronshan.functions.date.UDFDayOfYear'; +create temporary function type_of_day as 'com.github.aaronshan.functions.date.UDFTypeOfDay'; +create temporary function zodiac_cn as 
'com.github.aaronshan.functions.date.UDFZodiacSignCn'; +create temporary function zodiac_en as 'com.github.aaronshan.functions.date.UDFZodiacSignEn'; +create temporary function pinyin as 'com.github.aaronshan.functions.string.UDFChineseToPinYin'; +create temporary function md5 as 'com.github.aaronshan.functions.string.UDFMd5'; +create temporary function sha256 as 'com.github.aaronshan.functions.string.UDFSha256'; +create temporary function json_array_get as 'com.github.aaronshan.functions.json.UDFJsonArrayGet'; +create temporary function json_array_length as 'com.github.aaronshan.functions.json.UDFJsonArrayLength'; +create temporary function json_array_extract as 'com.github.aaronshan.functions.json.UDFJsonArrayExtract'; +create temporary function json_array_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonArrayExtractScalar'; +create temporary function json_extract as 'com.github.aaronshan.functions.json.UDFJsonExtract'; +create temporary function json_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonExtractScalar'; +create temporary function json_size as 'com.github.aaronshan.functions.json.UDFJsonSize'; +create temporary function id_card_province as 'com.github.aaronshan.functions.card.UDFChinaIdCardProvince'; +create temporary function id_card_city as 'com.github.aaronshan.functions.card.UDFChinaIdCardCity'; +create temporary function id_card_area as 'com.github.aaronshan.functions.card.UDFChinaIdCardArea'; +create temporary function id_card_birthday as 'com.github.aaronshan.functions.card.UDFChinaIdCardBirthday'; +create temporary function id_card_gender as 'com.github.aaronshan.functions.card.UDFChinaIdCardGender'; +create temporary function is_valid_id_card as 'com.github.aaronshan.functions.card.UDFChinaIdCardValid'; +create temporary function id_card_info as 'com.github.aaronshan.functions.card.UDFChinaIdCardInfo'; +create temporary function wgs_distance as 'com.github.aaronshan.functions.geo.UDFGeoWgsDistance'; +create temporary 
function gcj_to_bd as 'com.github.aaronshan.functions.geo.UDFGeoGcjToBd'; +create temporary function bd_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoBdToGcj'; +create temporary function wgs_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoWgsToGcj'; +create temporary function gcj_to_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjToWgs'; +create temporary function gcj_extract_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjExtractWgs'; +create temporary function url_encode as 'com.github.aaronshan.functions.url.UDFUrlEncode'; +create temporary function url_decode as 'com.github.aaronshan.functions.url.UDFUrlDecode'; ``` You can use these statements on hive cli env get detail of function. diff --git a/pom.xml b/pom.xml index fdebeb0..ef57e1e 100644 --- a/pom.xml +++ b/pom.xml @@ -4,10 +4,15 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - cc.shanruifeng + com.github.aaronshan hive-third-functions 2.1.3 + + https://github.com/aaronshan + https://github.com/aaronshan/hive-third-functions.git + + UTF-8 1.6 @@ -194,5 +199,65 @@ + + + + aaronshan + aaronshan + shanruifeng@gmail.com + + Developer + + +8 + + + + + + release + + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + package + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + verify + + sign + + + + + + + + + oss + https://oss.sonatype.org/content/repositories/snapshots/ + + + oss + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayConcat.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayConcat.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayConcat.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayConcat.java index b5e689a..47c53ec 100644 --- 
a/src/main/java/cc/shanruifeng/functions/array/UDFArrayConcat.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayConcat.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayContains.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayContains.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayContains.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayContains.java index 6a48726..9b6177d 100755 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayContains.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayContains.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayDistinct.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayDistinct.java similarity index 95% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayDistinct.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayDistinct.java index 51b718b..ca230bd 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayDistinct.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayDistinct.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -10,7 +10,7 @@ import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.*; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayElementAt.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayElementAt.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayElementAt.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayElementAt.java index bd4009b..5524708 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayElementAt.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayElementAt.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayEquals.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayEquals.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayEquals.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayEquals.java index f9811d7..9bc41bf 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayEquals.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayEquals.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.utils.ArrayUtils; +import com.github.aaronshan.functions.utils.ArrayUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; diff --git 
a/src/main/java/cc/shanruifeng/functions/array/UDFArrayIntersect.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayIntersect.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayIntersect.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayIntersect.java index 1991a82..b4cd0f7 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayIntersect.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayIntersect.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import java.util.ArrayList; import java.util.Arrays; @@ -13,7 +13,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.*; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayJoin.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayJoin.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayJoin.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayJoin.java index 9321286..dc370cb 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayJoin.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayJoin.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMax.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMax.java 
similarity index 94% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayMax.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayMax.java index f2101f5..c4d5da9 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMax.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMax.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -11,7 +11,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMin.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMin.java similarity index 94% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayMin.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayMin.java index 992b090..a8d022b 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMin.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMin.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -11,7 +11,7 
@@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayPosition.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayPosition.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayPosition.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayPosition.java index f916cb3..89d293c 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayPosition.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayPosition.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayRemove.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayRemove.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayRemove.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayRemove.java index af1f303..fac4613 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayRemove.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayRemove.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayReverse.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayReverse.java similarity index 98% rename from 
src/main/java/cc/shanruifeng/functions/array/UDFArrayReverse.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayReverse.java index e449e34..f9874a7 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayReverse.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayReverse.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArraySlice.java b/src/main/java/com/github/aaronshan/functions/array/UDFArraySlice.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArraySlice.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArraySlice.java index 4f1feb2..0bed6e5 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArraySlice.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArraySlice.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArraySort.java b/src/main/java/com/github/aaronshan/functions/array/UDFArraySort.java similarity index 91% rename from src/main/java/cc/shanruifeng/functions/array/UDFArraySort.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArraySort.java index 3a0313f..7378a16 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArraySort.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArraySort.java @@ -1,7 +1,9 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import java.util.ArrayList; + +import 
com.github.aaronshan.functions.utils.ArrayUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -10,7 +12,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.*; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan @@ -88,7 +90,7 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { positions[i] = i; } - IntArrays.quickSort(positions, 0, arrayLength, IntArrayCompare(array, arrayOI)); + IntArrays.quickSort(positions, 0, arrayLength, ArrayUtils.IntArrayCompare(array, arrayOI)); result.clear(); for (int i = 0; i < arrayLength; i++) { diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayValueCount.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayValueCount.java index 4984e15..46d3ff5 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayValueCount.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitCount.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitCount.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitCount.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitCount.java index b4dbb84..bb39515 100644 --- 
a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitCount.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitCount.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseAnd.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseAnd.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseAnd.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseAnd.java index 3df2ce1..acfc013 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseAnd.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseAnd.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseNot.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseNot.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseNot.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseNot.java index a23d238..b50d4b5 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseNot.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseNot.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseOr.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseOr.java similarity index 92% rename from 
src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseOr.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseOr.java index f7fdc8c..8a9bc75 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseOr.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseOr.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseXor.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseXor.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseXor.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseXor.java index 977c03a..1693c4e 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseXor.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseXor.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardArea.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardArea.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardArea.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardArea.java index d47e1dc..902dc81 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardArea.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardArea.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import 
org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardBirthday.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardBirthday.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardBirthday.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardBirthday.java index 56d274a..ea43b27 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardBirthday.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardBirthday.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardCity.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardCity.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardCity.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardCity.java index 807fa6a..31cc99a 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardCity.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardCity.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardGender.java 
b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardGender.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardGender.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardGender.java index cba80c9..06d7686 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardGender.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardGender.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardInfo.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardInfo.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardInfo.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardInfo.java index 209a941..5a79b4a 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardInfo.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardInfo.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardProvince.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardProvince.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardProvince.java rename to 
src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardProvince.java index a6fea25..5ffaa55 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardProvince.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardProvince.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardValid.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardValid.java similarity index 88% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardValid.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardValid.java index 239a094..f6d9d77 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardValid.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardValid.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BooleanWritable; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfWeek.java b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfWeek.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/date/UDFDayOfWeek.java rename to src/main/java/com/github/aaronshan/functions/date/UDFDayOfWeek.java index 982e776..a7034f4 100755 --- a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfWeek.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfWeek.java @@ -1,4 +1,4 @@ -package 
cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfYear.java b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfYear.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/date/UDFDayOfYear.java rename to src/main/java/com/github/aaronshan/functions/date/UDFDayOfYear.java index d446f7b..f482355 100644 --- a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfYear.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfYear.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import java.util.Calendar; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFTypeOfDay.java b/src/main/java/com/github/aaronshan/functions/date/UDFTypeOfDay.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/date/UDFTypeOfDay.java rename to src/main/java/com/github/aaronshan/functions/date/UDFTypeOfDay.java index 9fde811..6dc6851 100755 --- a/src/main/java/cc/shanruifeng/functions/date/UDFTypeOfDay.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFTypeOfDay.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; -import cc.shanruifeng.functions.utils.ConfigUtils; +import com.github.aaronshan.functions.utils.ConfigUtils; import java.util.Calendar; import java.util.Map; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignCn.java b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignCn.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignCn.java rename to src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignCn.java index 7a5bdd2..fecee21 
100644 --- a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignCn.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignCn.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignEn.java b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignEn.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignEn.java rename to src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignEn.java index 64281bd..db2a0c2 100644 --- a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignEn.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignEn.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/AbstractIntComparator.java b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/AbstractIntComparator.java similarity index 88% rename from src/main/java/cc/shanruifeng/functions/fastuitl/ints/AbstractIntComparator.java rename to src/main/java/com/github/aaronshan/functions/fastuitl/ints/AbstractIntComparator.java index 3c83a45..972ae97 100644 --- a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/AbstractIntComparator.java +++ b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/AbstractIntComparator.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.fastuitl.ints; +package com.github.aaronshan.functions.fastuitl.ints; // Note: this code was forked from fastutil (http://fastutil.di.unimi.it/) // Copyright (C) 2010-2013 Sebastiano Vigna diff --git a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntArrays.java 
b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntArrays.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntArrays.java rename to src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntArrays.java index 833ed56..9341566 100644 --- a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntArrays.java +++ b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntArrays.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.fastuitl.ints; +package com.github.aaronshan.functions.fastuitl.ints; // Note: this code was forked from fastutil (http://fastutil.di.unimi.it/) // Copyright (C) 2010-2013 Sebastiano Vigna diff --git a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntComparator.java b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntComparator.java similarity index 82% rename from src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntComparator.java rename to src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntComparator.java index 37eba9f..67f9d03 100644 --- a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntComparator.java +++ b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntComparator.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.fastuitl.ints; +package com.github.aaronshan.functions.fastuitl.ints; import java.util.Comparator; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcj.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcj.java similarity index 85% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcj.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcj.java index de9d05a..07b6323 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcj.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcj.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import 
cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjExtractWgs.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjExtractWgs.java similarity index 85% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjExtractWgs.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjExtractWgs.java index 34dca23..4d0fee4 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjExtractWgs.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjExtractWgs.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToBd.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToBd.java similarity index 84% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToBd.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToBd.java index a1c422a..e2eee04 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToBd.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToBd.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToWgs.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToWgs.java similarity index 84% rename from 
src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToWgs.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToWgs.java index 10278ae..e019a49 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToWgs.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToWgs.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsDistance.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsDistance.java similarity index 86% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsDistance.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsDistance.java index 000bb3b..0ed7237 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsDistance.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsDistance.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.serde2.io.DoubleWritable; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsToGcj.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsToGcj.java similarity index 84% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsToGcj.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsToGcj.java index 1081356..f2e4eb3 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsToGcj.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsToGcj.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package 
com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtract.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtract.java similarity index 93% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtract.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtract.java index 266245f..f2ca590 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtract.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtract.java @@ -1,8 +1,8 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonUtils; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtractScalar.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtractScalar.java similarity index 93% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtractScalar.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtractScalar.java index 713bee0..bf82984 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtractScalar.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtractScalar.java @@ -1,8 +1,8 @@ -package cc.shanruifeng.functions.json; +package 
com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonUtils; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayGet.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayGet.java similarity index 88% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayGet.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayGet.java index 8987b82..2c1eb91 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayGet.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayGet.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayLength.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayLength.java similarity index 89% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayLength.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayLength.java index 4bfa91b..284b368 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayLength.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayLength.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.json; +package 
com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.LongWritable; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtract.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtract.java similarity index 84% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonExtract.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonExtract.java index 3f57c7e..6be5cf8 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtract.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtract.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtractScalar.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtractScalar.java similarity index 85% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonExtractScalar.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonExtractScalar.java index 6af6884..ab51588 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtractScalar.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtractScalar.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import 
cc.shanruifeng.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonSize.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonSize.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonSize.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonSize.java index 4ed1dd2..7286413 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonSize.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonSize.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapBuild.java b/src/main/java/com/github/aaronshan/functions/map/UDFMapBuild.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapBuild.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapBuild.java index 5446550..f96b3e4 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapBuild.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapBuild.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; import java.util.LinkedHashMap; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapConcat.java 
b/src/main/java/com/github/aaronshan/functions/map/UDFMapConcat.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapConcat.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapConcat.java index 6a257d8..8b4acf7 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapConcat.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapConcat.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; import java.util.LinkedHashMap; import java.util.Map; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapElementAt.java b/src/main/java/com/github/aaronshan/functions/map/UDFMapElementAt.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapElementAt.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapElementAt.java index cd9f12e..56627ad 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapElementAt.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapElementAt.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; import java.util.Map; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapEquals.java b/src/main/java/com/github/aaronshan/functions/map/UDFMapEquals.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapEquals.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapEquals.java index 0fead58..97cf2f2 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapEquals.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapEquals.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import java.util.Map; import org.apache.hadoop.hive.ql.exec.Description; 
import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/model/ChinaIdArea.java b/src/main/java/com/github/aaronshan/functions/model/ChinaIdArea.java similarity index 91% rename from src/main/java/cc/shanruifeng/functions/model/ChinaIdArea.java rename to src/main/java/com/github/aaronshan/functions/model/ChinaIdArea.java index b45684f..9d9e145 100644 --- a/src/main/java/cc/shanruifeng/functions/model/ChinaIdArea.java +++ b/src/main/java/com/github/aaronshan/functions/model/ChinaIdArea.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.model; +package com.github.aaronshan.functions.model; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFChineseToPinYin.java b/src/main/java/com/github/aaronshan/functions/string/UDFChineseToPinYin.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/string/UDFChineseToPinYin.java rename to src/main/java/com/github/aaronshan/functions/string/UDFChineseToPinYin.java index b468748..66ddb11 100755 --- a/src/main/java/cc/shanruifeng/functions/string/UDFChineseToPinYin.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFChineseToPinYin.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFMd5.java b/src/main/java/com/github/aaronshan/functions/string/UDFMd5.java similarity index 94% rename from src/main/java/cc/shanruifeng/functions/string/UDFMd5.java rename to src/main/java/com/github/aaronshan/functions/string/UDFMd5.java index 807b6bf..dd9060a 100644 --- a/src/main/java/cc/shanruifeng/functions/string/UDFMd5.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFMd5.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package 
com.github.aaronshan.functions.string; import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java b/src/main/java/com/github/aaronshan/functions/string/UDFSha256.java similarity index 94% rename from src/main/java/cc/shanruifeng/functions/string/UDFSha256.java rename to src/main/java/com/github/aaronshan/functions/string/UDFSha256.java index df4d05d..a5e1ab1 100644 --- a/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFSha256.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/url/UDFUrlDecode.java b/src/main/java/com/github/aaronshan/functions/url/UDFUrlDecode.java similarity index 95% rename from src/main/java/cc/shanruifeng/functions/url/UDFUrlDecode.java rename to src/main/java/com/github/aaronshan/functions/url/UDFUrlDecode.java index 21d1f2f..42b70fe 100644 --- a/src/main/java/cc/shanruifeng/functions/url/UDFUrlDecode.java +++ b/src/main/java/com/github/aaronshan/functions/url/UDFUrlDecode.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; diff --git a/src/main/java/cc/shanruifeng/functions/url/UDFUrlEncode.java b/src/main/java/com/github/aaronshan/functions/url/UDFUrlEncode.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/url/UDFUrlEncode.java rename to src/main/java/com/github/aaronshan/functions/url/UDFUrlEncode.java index 7c8a876..bae9d7f 100644 --- a/src/main/java/cc/shanruifeng/functions/url/UDFUrlEncode.java +++ b/src/main/java/com/github/aaronshan/functions/url/UDFUrlEncode.java @@ -1,4 +1,4 @@ 
-package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; import com.google.common.escape.Escaper; import com.google.common.net.UrlEscapers; diff --git a/src/main/java/cc/shanruifeng/functions/utils/ArrayUtils.java b/src/main/java/com/github/aaronshan/functions/utils/ArrayUtils.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/utils/ArrayUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/ArrayUtils.java index e04f3b3..9a31194 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/ArrayUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/ArrayUtils.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; -import cc.shanruifeng.functions.fastuitl.ints.AbstractIntComparator; -import cc.shanruifeng.functions.fastuitl.ints.IntComparator; +import com.github.aaronshan.functions.fastuitl.ints.AbstractIntComparator; +import com.github.aaronshan.functions.fastuitl.ints.IntComparator; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; diff --git a/src/main/java/cc/shanruifeng/functions/utils/CardUtils.java b/src/main/java/com/github/aaronshan/functions/utils/CardUtils.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/utils/CardUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/CardUtils.java index 41c5d23..226a2fc 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/CardUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/CardUtils.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; -import cc.shanruifeng.functions.model.ChinaIdArea; +import com.github.aaronshan.functions.model.ChinaIdArea; import 
com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Strings; diff --git a/src/main/java/cc/shanruifeng/functions/utils/ConfigUtils.java b/src/main/java/com/github/aaronshan/functions/utils/ConfigUtils.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/utils/ConfigUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/ConfigUtils.java index 6dfca16..5eb7b9e 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/ConfigUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/ConfigUtils.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; -import cc.shanruifeng.functions.model.ChinaIdArea; +import com.github.aaronshan.functions.model.ChinaIdArea; import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.google.common.collect.Maps; diff --git a/src/main/java/cc/shanruifeng/functions/utils/GeoUtils.java b/src/main/java/com/github/aaronshan/functions/utils/GeoUtils.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/utils/GeoUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/GeoUtils.java index a499e6f..951efdd 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/GeoUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/GeoUtils.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/src/main/java/cc/shanruifeng/functions/utils/MapUtils.java b/src/main/java/com/github/aaronshan/functions/utils/MapUtils.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/utils/MapUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/MapUtils.java index b8f2320..1b8caf6 100644 --- 
a/src/main/java/cc/shanruifeng/functions/utils/MapUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/MapUtils.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; import java.util.Map; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonExtract.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonExtract.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonExtract.java rename to src/main/java/com/github/aaronshan/functions/utils/json/JsonExtract.java index 3dbb603..c04bd32 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonExtract.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonExtract.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; import com.fasterxml.jackson.core.*; import com.fasterxml.jackson.core.io.SerializedString; diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPath.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPath.java similarity index 95% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonPath.java rename to src/main/java/com/github/aaronshan/functions/utils/json/JsonPath.java index 70a09ce..43f8073 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPath.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPath.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPathTokenizer.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPathTokenizer.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonPathTokenizer.java rename to 
src/main/java/com/github/aaronshan/functions/utils/json/JsonPathTokenizer.java index c150594..4be89ed 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPathTokenizer.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPathTokenizer.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; import com.google.common.collect.AbstractIterator; diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonUtils.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonUtils.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/json/JsonUtils.java index 9031319..633a089 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonUtils.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; import com.fasterxml.jackson.core.JsonFactory; diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayContainsTest.java b/src/test/java/com/github/aaronshan/functions/array/UDFArrayContainsTest.java similarity index 97% rename from src/test/java/cc/shanruifeng/functions/array/UDFArrayContainsTest.java rename to src/test/java/com/github/aaronshan/functions/array/UDFArrayContainsTest.java index ac9de3e..a5efc4b 100644 --- a/src/test/java/cc/shanruifeng/functions/array/UDFArrayContainsTest.java +++ b/src/test/java/com/github/aaronshan/functions/array/UDFArrayContainsTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import com.google.common.collect.ImmutableList; import java.util.List; diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayIntersectTest.java b/src/test/java/com/github/aaronshan/functions/array/UDFArrayIntersectTest.java 
similarity index 97% rename from src/test/java/cc/shanruifeng/functions/array/UDFArrayIntersectTest.java rename to src/test/java/com/github/aaronshan/functions/array/UDFArrayIntersectTest.java index 639384a..37c1143 100644 --- a/src/test/java/cc/shanruifeng/functions/array/UDFArrayIntersectTest.java +++ b/src/test/java/com/github/aaronshan/functions/array/UDFArrayIntersectTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; diff --git a/src/test/java/cc/shanruifeng/functions/bitwise/UDFBitCountTest.java b/src/test/java/com/github/aaronshan/functions/bitwise/UDFBitCountTest.java similarity index 95% rename from src/test/java/cc/shanruifeng/functions/bitwise/UDFBitCountTest.java rename to src/test/java/com/github/aaronshan/functions/bitwise/UDFBitCountTest.java index d064cba..c344a33 100644 --- a/src/test/java/cc/shanruifeng/functions/bitwise/UDFBitCountTest.java +++ b/src/test/java/com/github/aaronshan/functions/bitwise/UDFBitCountTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.LongWritable; diff --git a/src/test/java/cc/shanruifeng/functions/date/UDFDayOfYearTest.java b/src/test/java/com/github/aaronshan/functions/date/UDFDayOfYearTest.java similarity index 94% rename from src/test/java/cc/shanruifeng/functions/date/UDFDayOfYearTest.java rename to src/test/java/com/github/aaronshan/functions/date/UDFDayOfYearTest.java index cc35d19..d70cdd6 100644 --- a/src/test/java/cc/shanruifeng/functions/date/UDFDayOfYearTest.java +++ b/src/test/java/com/github/aaronshan/functions/date/UDFDayOfYearTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.io.IntWritable; import 
org.apache.hadoop.io.Text; diff --git a/src/test/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcjTest.java b/src/test/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcjTest.java similarity index 93% rename from src/test/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcjTest.java rename to src/test/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcjTest.java index 2e9fec3..a720e53 100644 --- a/src/test/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcjTest.java +++ b/src/test/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcjTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; import org.apache.hadoop.io.Text; import org.junit.Assert; diff --git a/src/test/java/cc/shanruifeng/functions/map/UDFMapBuildTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapBuildTest.java similarity index 87% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapBuildTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapBuildTest.java index 630635e..0fc2ed6 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapBuildTest.java +++ b/src/test/java/com/github/aaronshan/functions/map/UDFMapBuildTest.java @@ -1,6 +1,7 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; @@ -11,6 +12,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; import org.junit.Test; import static org.junit.Assert.*; @@ -38,6 +40,6 @@ public void testMapBuild() throws 
Exception { LinkedHashMap expect = Maps.newLinkedHashMap(); expect.putAll(ImmutableMap.of("key1", "value1", "key2", "value2", "key3", "value3")); - assertEquals("map_build() test", true, MapUtils.mapEquals(output, expect)); + Assert.assertEquals("map_build() test", true, MapUtils.mapEquals(output, expect)); } } \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/map/UDFMapConcatTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapConcatTest.java similarity index 88% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapConcatTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapConcatTest.java index e9fcc1b..0094f43 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapConcatTest.java +++ b/src/test/java/com/github/aaronshan/functions/map/UDFMapConcatTest.java @@ -1,6 +1,7 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import java.util.LinkedHashMap; @@ -9,6 +10,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; import org.junit.Test; import static org.junit.Assert.*; @@ -39,6 +41,6 @@ public void testMapConcat() throws Exception { LinkedHashMap expect = Maps.newLinkedHashMap(); expect.putAll(ImmutableMap.of("key1", "11", "key2", "12", "key3", "21", "key4", "22", "key5", "23")); - assertEquals("map_concat() test", true, MapUtils.mapEquals(output, expect)); + Assert.assertEquals("map_concat() test", true, MapUtils.mapEquals(output, expect)); } } \ No newline at end of file diff --git 
a/src/test/java/cc/shanruifeng/functions/map/UDFMapElementAtTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapElementAtTest.java similarity index 94% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapElementAtTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapElementAtTest.java index d542414..b98259d 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapElementAtTest.java +++ b/src/test/java/com/github/aaronshan/functions/map/UDFMapElementAtTest.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; diff --git a/src/test/java/cc/shanruifeng/functions/map/UDFMapEqualsTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapEqualsTest.java similarity index 96% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapEqualsTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapEqualsTest.java index 3c841a1..af8917f 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapEqualsTest.java +++ b/src/test/java/com/github/aaronshan/functions/map/UDFMapEqualsTest.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import java.util.LinkedHashMap; diff --git a/src/test/java/cc/shanruifeng/functions/url/UDFUrlDecodeTest.java b/src/test/java/com/github/aaronshan/functions/url/UDFUrlDecodeTest.java similarity index 94% rename from src/test/java/cc/shanruifeng/functions/url/UDFUrlDecodeTest.java rename to 
src/test/java/com/github/aaronshan/functions/url/UDFUrlDecodeTest.java index 56cd312..a771cd1 100644 --- a/src/test/java/cc/shanruifeng/functions/url/UDFUrlDecodeTest.java +++ b/src/test/java/com/github/aaronshan/functions/url/UDFUrlDecodeTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; import org.apache.hadoop.io.Text; import org.junit.Assert; diff --git a/src/test/java/cc/shanruifeng/functions/url/UDFUrlEncodeTest.java b/src/test/java/com/github/aaronshan/functions/url/UDFUrlEncodeTest.java similarity index 87% rename from src/test/java/cc/shanruifeng/functions/url/UDFUrlEncodeTest.java rename to src/test/java/com/github/aaronshan/functions/url/UDFUrlEncodeTest.java index a7c894d..3cec438 100644 --- a/src/test/java/cc/shanruifeng/functions/url/UDFUrlEncodeTest.java +++ b/src/test/java/com/github/aaronshan/functions/url/UDFUrlEncodeTest.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; -import cc.shanruifeng.functions.date.UDFDayOfYear; +import com.github.aaronshan.functions.date.UDFDayOfYear; import org.apache.hadoop.io.Text; import org.junit.Assert; import org.junit.Test; From 1135d8a6eba5c135810d5a8f31f6eef86b3c6369 Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 1 Feb 2019 17:40:34 +0800 Subject: [PATCH 15/16] =?UTF-8?q?=E5=AE=8C=E5=96=842.2.0=E7=89=88=E6=9C=AC?= =?UTF-8?q?=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-zh.md | 20 ++--- README.md | 77 ++++++++++++++----- .../functions/array/UDFArrayShuffle.java | 7 +- .../functions/array/UDFSequence.java | 14 +++- .../math/UDFMathCosineSimilarity.java | 2 +- .../functions/math/UDFMathFromBase.java | 4 +- .../functions/math/UDFMathInfinity.java | 2 +- .../math/UDFMathInverseNormalCdf.java | 4 +- .../functions/math/UDFMathIsFinite.java | 2 +- .../functions/math/UDFMathIsInfinite.java | 2 +- 
.../functions/math/UDFMathIsNaN.java | 2 +- .../aaronshan/functions/math/UDFMathNaN.java | 2 +- .../functions/math/UDFMathNormalCdf.java | 4 +- .../functions/math/UDFMathToBase.java | 4 +- .../functions/regexp/Re2JRegexp.java | 14 +++- .../regexp/UDFRe2JRegexpExtract.java | 4 +- .../regexp/UDFRe2JRegexpExtractAll.java | 2 +- .../functions/regexp/UDFRe2JRegexpLike.java | 2 +- .../regexp/UDFRe2JRegexpReplace.java | 2 +- .../functions/regexp/UDFRe2JRegexpSplit.java | 2 +- .../functions/string/UDFCodePoint.java | 5 +- .../string/UDFStringHammingDistance.java | 4 +- .../string/UDFStringLevenshteinDistance.java | 6 +- .../functions/string/UDFStringNormalize.java | 2 +- .../functions/string/UDFStringPosition.java | 2 +- .../functions/string/UDFStringSplitToMap.java | 4 +- .../string/UDFStringSplitToMultimap.java | 4 +- .../aaronshan/functions/utils/Failures.java | 2 +- .../aaronshan/functions/utils/MathUtils.java | 4 +- src/main/resources/china_day_type.config | 33 +++++++- .../functions/array/UDFArrayShuffleTest.java | 1 + .../math/UDFMathCosineSimilarityTest.java | 2 +- 32 files changed, 164 insertions(+), 77 deletions(-) diff --git a/README-zh.md b/README-zh.md index 5d7ea8e..3ba181a 100644 --- a/README-zh.md +++ b/README-zh.md @@ -187,8 +187,8 @@ create temporary function array_slice as 'com.github.aaronshan.functions.array.U create temporary function array_element_at as 'com.github.aaronshan.functions.array.UDFArrayElementAt'; create temporary function bit_count as 'com.github.aaronshan.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'com.github.aaronshan.functions.bitwise.UDFBitwiseAnd'; -create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; -create temporary function sequence as 'cc.shanruifeng.functions.array.UDFSequence'; +create temporary function array_shuffle as 'com.github.aaronshan.functions.array.UDFArrayShuffle'; +create temporary function sequence as 
'com.github.aaronshan.functions.array.UDFSequence'; create temporary function bitwise_not as 'com.github.aaronshan.functions.bitwise.UDFBitwiseNot'; create temporary function bitwise_or as 'com.github.aaronshan.functions.bitwise.UDFBitwiseOr'; create temporary function bitwise_xor as 'com.github.aaronshan.functions.bitwise.UDFBitwiseXor'; @@ -226,14 +226,14 @@ create temporary function gcj_to_wgs as 'com.github.aaronshan.functions.geo.UDFG create temporary function gcj_extract_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjExtractWgs'; create temporary function url_encode as 'com.github.aaronshan.functions.url.UDFUrlEncode'; create temporary function url_decode as 'com.github.aaronshan.functions.url.UDFUrlDecode'; -create temporary function infinity as 'cc.shanruifeng.functions.math.UDFMathInfinity'; -create temporary function is_finite as 'cc.shanruifeng.functions.math.UDFMathIsFinite'; -create temporary function is_infinite as 'cc.shanruifeng.functions.math.UDFMathIsInfinite'; -create temporary function is_nan as 'cc.shanruifeng.functions.math.UDFMathIsNaN'; -create temporary function nan as 'cc.shanruifeng.functions.math.UDFMathIsNaN'; -create temporary function from_base as 'cc.shanruifeng.functions.math.UDFMathFromBase'; -create temporary function to_base as 'cc.shanruifeng.functions.math.UDFMathToBase'; -create temporary function cosine_similarity as 'cc.shanruifeng.functions.math.UDFMathCosineSimilarity'; +create temporary function infinity as 'com.github.aaronshan.functions.math.UDFMathInfinity'; +create temporary function is_finite as 'com.github.aaronshan.functions.math.UDFMathIsFinite'; +create temporary function is_infinite as 'com.github.aaronshan.functions.math.UDFMathIsInfinite'; +create temporary function is_nan as 'com.github.aaronshan.functions.math.UDFMathIsNaN'; +create temporary function nan as 'com.github.aaronshan.functions.math.UDFMathNaN'; +create temporary function from_base as 'com.github.aaronshan.functions.math.UDFMathFromBase';
+create temporary function to_base as 'com.github.aaronshan.functions.math.UDFMathToBase'; +create temporary function cosine_similarity as 'com.github.aaronshan.functions.math.UDFMathCosineSimilarity'; ``` 你可以在hive的命令杭中使用下面的语句来查看函数的细节. diff --git a/README.md b/README.md index b2e9adf..b66d2f5 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,7 @@ Some useful custom hive udf functions, especial array and json functions. > Note: -> 1. hive-third-functions support hive-0.11.0 or higher. -> 2. hive-third-functions `3.0.0` need java8 or higher. +> hive-third-functions support hive-0.11.0 or higher. ## Build @@ -41,7 +40,7 @@ It will generate hive-third-functions-${version}-shaded.jar in target directory. You can also directly download file from [release page](https://github.com/aaronshan/hive-third-functions/releases). -> current latest version is `3.0.0` +> current latest version is `2.2.0` ## Functions @@ -52,6 +51,21 @@ You can also directly download file from [release page](https://github.com/aaron |pinyin(string) -> string | convert chinese to pinyin| |md5(string) -> string | md5 hash| |sha256(string) -> string |sha256 hash| +|codepoint(string) -> integer | Returns the Unicode code point of the only character of string.| +|hamming_distance(string1, string2) -> bigint | Returns the Hamming distance of string1 and string2.| +|levenshtein_distance(string1, string2) -> bigint | Returns the Levenshtein edit distance of string1 and string2.| +|normalize(string, form) -> varchar | Transforms string with the specified normalization form. form must be one of the following keywords: Normalize Form Description | +|strpos(string, substring) -> bigint | Returns the starting position of the first instance of substring in string. Positions start with 1. If not found, 0 is returned.| +|split_to_map(string, entryDelimiter, keyValueDelimiter) -> map<varchar, varchar> | Splits string by entryDelimiter and keyValueDelimiter and returns a map.
entryDelimiter splits string into key-value pairs. keyValueDelimiter splits each pair into key and value.| +|split_to_multimap(string, entryDelimiter, keyValueDelimiter) -> map(varchar, array(varchar)) | Splits string by entryDelimiter and keyValueDelimiter and returns a map containing an array of values for each unique key. entryDelimiter splits string into key-value pairs. keyValueDelimiter splits each pair into key and value. The values for each key will be in the same order as they appeared in string.| + +[Normalize Form Description](#jump) +| Form | Description | +|:--|:--| +| NFD | Canonical Decomposition | +| NFC | Canonical Decomposition, followed by Canonical Composition | +| NFKD | Compatibility Decomposition | +| NFKC | Compatibility Decomposition, followed by Canonical Composition | ### 2. array functions @@ -76,12 +90,13 @@ You can also directly download file from [release page](https://github.com/aaron |sequence(start, end) -> array | Generate a sequence of integers from start to stop.| |sequence(start, end, step) -> array | Generate a sequence of integers from start to stop, incrementing by step.| |sequence(start_date_string, end_data_string, step) -> array | Generate a sequence of date string from start to stop, incrementing by step.| +|array_value_count(array<E>, E) -> long | count the number of array elements whose value equals the given value.| ### 3. map functions | function| description | |:--|:--| |map_build(x<K>, y<V>) -> map<K, V>| returns a map created using the given key/value arrays.| -|map_concat(x<K, V>, y<K, V>) -> map<K,V> | returns the union of two maps.
If a key is found in both `x` and `y`, that key’s value in the resulting map comes from `y`.| |map_element_at(map<K, V>, key) -> V | returns value for given `key`, or `NULL` if the key is not contained in the map.| |map_equals(x<K, V>, y<K, V>) -> boolean | whether map x equals with map y or not.| @@ -92,7 +107,7 @@ You can also directly download file from [release page](https://github.com/aaron |day_of_week(date_string \| date) -> int | day of week,if monday,return 1, sunday return 7, error return null.| |day_of_year(date_string \| date) -> int | day of year. The value ranges from 1 to 366.| |zodiac_en(date_string \| date) -> string | convert date to zodiac| -|zodiac_cn(date_string \| date) -> string | convert date to zodiac chinese | +|zodiac_cn(date_string \| date) -> string | convert date to zodiac chinese | |type_of_day(date_string \| date) -> string | for chinese. 获取日期的类型(1: 法定节假日, 2: 正常周末, 3: 正常工作日 4:攒假的工作日),错误返回-1. | ### 5. json functions @@ -113,9 +128,9 @@ You can also directly download file from [release page](https://github.com/aaron |:--|:--| |bit_count(x, bits) -> bigint | count the number of bits set in `x` (treated as bits-bit signed integer) in 2’s complement representation | |bitwise_and(x, y) -> bigint | returns the bitwise AND of `x` and `y` in 2’s complement arithmetic.| -|bitwise_not(x) -> bigint | returns the bitwise NOT of `x` in 2’s complement arithmetic. | +|bitwise_not(x) -> bigint | returns the bitwise NOT of `x` in 2’s complement arithmetic. | |bitwise_or(x, y) -> bigint | returns the bitwise OR of `x` and `y` in 2’s complement arithmetic.| -|bitwise_xor(x, y) -> bigint | returns the bitwise XOR of `x` and `y` in 2’s complement arithmetic. | +|bitwise_xor(x, y) -> bigint | returns the bitwise XOR of `x` and `y` in 2’s complement arithmetic. | ### 7. 
china id card functions @@ -148,7 +163,7 @@ You can also directly download file from [release page](https://github.com/aaron | function| description | |:--|:--| |url_encode(value) -> string | escapes value by encoding it so that it can be safely included in URL query parameter names and values| -|url_decode(value) -> string | unescape the URL encoded value. This function is the inverse of `url_encode`. | +|url_decode(value) -> string | unescape the URL encoded value. This function is the inverse of `url_encode`. | ### 10. math functions @@ -162,6 +177,17 @@ You can also directly download file from [release page](https://github.com/aaron |from_base(string, radix) -> bigint | Returns the value of string interpreted as a base-radix number.| |to_base(x, radix) -> varchar | Returns the base-radix representation of x.| |cosine_similarity(x, y) -> double | Returns the cosine similarity between the sparse vectors x and y| +|inverse_normal_cdf(mean, sd, p) -> double | Compute the inverse of the Normal cdf with given mean and standard deviation (sd) for the cumulative probability (p): P(N < n). The mean must be a real value and the standard deviation must be a real and positive value. The probability p must lie on the interval (0, 1). | +|normal_cdf(mean, sd, v) -> double | Compute the Normal cdf with given mean and standard deviation (sd): P(N < v; mean, sd). The mean and value v must be real values and the standard deviation must be a real and positive value.| + +### 11. regexp functions +| function| description | +|:--|:--| +|regexp_like(string, pattern) -> boolean | Evaluates the regular expression pattern and determines if it is contained within string.| +|regexp_extract_all(string, pattern) -> array(varchar) | Returns the substring(s) matched by the regular expression pattern in string. 
| +|regexp_extract(string, pattern) -> varchar | Returns the first substring matched by the regular expression pattern in string.| +|regexp_replace(string, pattern) -> varchar | Removes every instance of the substring matched by the regular expression pattern from string.| +|regexp_replace(string, pattern, replacement) -> varchar | Replaces every instance of the substring matched by the regular expression pattern in string with replacement. | ## Use @@ -186,8 +212,9 @@ create temporary function array_slice as 'com.github.aaronshan.functions.array.U create temporary function array_element_at as 'com.github.aaronshan.functions.array.UDFArrayElementAt'; create temporary function bit_count as 'com.github.aaronshan.functions.bitwise.UDFBitCount'; create temporary function bitwise_and as 'com.github.aaronshan.functions.bitwise.UDFBitwiseAnd'; -create temporary function array_shuffle as 'cc.shanruifeng.functions.array.UDFArrayShuffle'; -create temporary function sequence as 'cc.shanruifeng.functions.array.UDFSequence'; +create temporary function array_shuffle as 'com.github.aaronshan.functions.array.UDFArrayShuffle'; +create temporary function sequence as 'com.github.aaronshan.functions.array.UDFSequence'; +create temporary function array_value_count as 'com.github.aaronshan.functions.array.UDFArrayValueCount'; create temporary function bitwise_not as 'com.github.aaronshan.functions.bitwise.UDFBitwiseNot'; create temporary function bitwise_or as 'com.github.aaronshan.functions.bitwise.UDFBitwiseOr'; create temporary function bitwise_xor as 'com.github.aaronshan.functions.bitwise.UDFBitwiseXor'; @@ -203,6 +230,13 @@ create temporary function zodiac_en as 'com.github.aaronshan.functions.date.UDFZ create temporary function pinyin as 'com.github.aaronshan.functions.string.UDFChineseToPinYin'; create temporary function md5 as 'com.github.aaronshan.functions.string.UDFMd5'; create temporary function sha256 as 'com.github.aaronshan.functions.string.UDFSha256'; +create temporary 
function codepoint as 'com.github.aaronshan.functions.string.UDFCodePoint'; +create temporary function hamming_distance as 'com.github.aaronshan.functions.string.UDFStringHammingDistance'; +create temporary function levenshtein_distance as 'com.github.aaronshan.functions.string.UDFStringLevenshteinDistance'; +create temporary function normalize as 'com.github.aaronshan.functions.string.UDFStringNormalize'; +create temporary function strpos as 'com.github.aaronshan.functions.string.UDFStringPosition'; +create temporary function split_to_map as 'com.github.aaronshan.functions.string.UDFStringSplitToMap'; +create temporary function split_to_multimap as 'com.github.aaronshan.functions.string.UDFStringSplitToMultimap'; create temporary function json_array_get as 'com.github.aaronshan.functions.json.UDFJsonArrayGet'; create temporary function json_array_length as 'com.github.aaronshan.functions.json.UDFJsonArrayLength'; create temporary function json_array_extract as 'com.github.aaronshan.functions.json.UDFJsonArrayExtract'; @@ -225,14 +259,21 @@ create temporary function gcj_to_wgs as 'com.github.aaronshan.functions.geo.UDFG create temporary function gcj_extract_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjExtractWgs'; create temporary function url_encode as 'com.github.aaronshan.functions.url.UDFUrlEncode'; create temporary function url_decode as 'com.github.aaronshan.functions.url.UDFUrlDecode'; -create temporary function infinity as 'cc.shanruifeng.functions.math.UDFMathInfinity'; -create temporary function is_finite as 'cc.shanruifeng.functions.math.UDFMathIsFinite'; -create temporary function is_infinite as 'cc.shanruifeng.functions.math.UDFMathIsInfinite'; -create temporary function is_nan as 'cc.shanruifeng.functions.math.UDFMathIsNaN'; -create temporary function nan as 'cc.shanruifeng.functions.math.UDFMathIsNaN'; -create temporary function from_base as 'cc.shanruifeng.functions.math.UDFMathFromBase'; -create temporary function to_base as 
'cc.shanruifeng.functions.math.UDFMathToBase'; -create temporary function cosine_similarity as 'cc.shanruifeng.functions.math.UDFMathCosineSimilarity'; +create temporary function infinity as 'com.github.aaronshan.functions.math.UDFMathInfinity'; +create temporary function is_finite as 'com.github.aaronshan.functions.math.UDFMathIsFinite'; +create temporary function is_infinite as 'com.github.aaronshan.functions.math.UDFMathIsInfinite'; +create temporary function nan as 'com.github.aaronshan.functions.math.UDFMathNaN'; +create temporary function is_nan as 'com.github.aaronshan.functions.math.UDFMathIsNaN'; +create temporary function from_base as 'com.github.aaronshan.functions.math.UDFMathFromBase'; +create temporary function to_base as 'com.github.aaronshan.functions.math.UDFMathToBase'; +create temporary function cosine_similarity as 'com.github.aaronshan.functions.math.UDFMathCosineSimilarity'; +create temporary function normal_cdf as 'com.github.aaronshan.functions.math.UDFMathNormalCdf'; +create temporary function inverse_normal_cdf as 'com.github.aaronshan.functions.math.UDFMathInverseNormalCdf'; +create temporary function regexp_extract as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpExtract'; +create temporary function regexp_extract_all as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpExtractAll'; +create temporary function regexp_like as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpLike'; +create temporary function regexp_replace as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpReplace'; +create temporary function regexp_split as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpSplit'; ``` You can use these statements on hive cli env get detail of function. 
diff --git a/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java index 3b2c7c9..460b9ca 100644 --- a/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -9,7 +9,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.*; import java.util.ArrayList; -import java.util.concurrent.ThreadLocalRandom; +import java.util.Random; /** * @author aaron02 @@ -91,7 +91,8 @@ public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveExcepti // Fisher-Yates shuffle // Randomly swap a pair of positions for (int i = arrayLength - 1; i > 0; i--) { - int index = ThreadLocalRandom.current().nextInt(i + 1); + Random random = new Random(); + int index = random.nextInt(i + 1); int swap = positions[i]; positions[i] = positions[index]; positions[index] = swap; diff --git a/src/main/java/com/github/aaronshan/functions/array/UDFSequence.java b/src/main/java/com/github/aaronshan/functions/array/UDFSequence.java index 93ef141..cf1c975 100644 --- a/src/main/java/com/github/aaronshan/functions/array/UDFSequence.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFSequence.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import com.google.common.collect.Lists; import org.apache.hadoop.hive.ql.exec.Description; @@ -12,8 +12,7 @@ import java.util.List; -import static java.lang.Math.toIntExact; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; /** * @author aaron02 * @date 2018-08-18 上午9:23 @@ -25,7 +24,7 @@ " > 
select _FUNC_('2016-04-12', '2016-04-14') from src;") public class UDFSequence extends UDF { public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); - private static final long MAX_RESULT_ENTRIES = 10_000; + private static final long MAX_RESULT_ENTRIES = 10000; public UDFSequence() { @@ -45,6 +44,13 @@ public Object evaluate(Text start, Text stop, long step) throws HiveException { return fixedWidthSequence(startMillis, stopMillis, step, String.class); } + public static int toIntExact(long value) { + if ((int)value != value) { + throw new ArithmeticException("integer overflow"); + } + return (int)value; + } + private static Object fixedWidthSequence(long start, long stop, long step, Class type) throws HiveException { checkValidStep(start, stop, step); diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarity.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarity.java index a4084a5..e0acea6 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarity.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarity.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.*; import org.apache.hadoop.hive.ql.metadata.HiveException; diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java index a61a223..d7313bf 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; @@ -6,7 +6,7 @@ import org.apache.hadoop.io.LongWritable; import 
org.apache.hadoop.io.Text; -import static cc.shanruifeng.functions.utils.MathUtils.checkRadix; +import static com.github.aaronshan.functions.utils.MathUtils.checkRadix; import static java.lang.String.format; /** diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java index 6c9799c..bf6b0c0 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java index 036ccbe..c192549 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.commons.math3.special.Erf; import org.apache.hadoop.hive.ql.exec.Description; @@ -6,7 +6,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.DoubleWritable; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; /** * @author ruifeng.shan diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java index f0243a0..597906e 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java @@ -1,4 +1,4 @@ -package 
cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java index 765bce3..84f8abe 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java index 4338c85..efef890 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java index 1da5ce1..9d6d55f 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java index b666b5a..a6cc43a 100644 --- 
a/src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.commons.math3.special.Erf; import org.apache.hadoop.hive.ql.exec.Description; @@ -6,7 +6,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.DoubleWritable; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; /** * @author ruifeng.shan diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java index ed2e2f9..0368843 100644 --- a/src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; @@ -6,7 +6,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -import static cc.shanruifeng.functions.utils.MathUtils.checkRadix; +import static com.github.aaronshan.functions.utils.MathUtils.checkRadix; /** * @author ruifeng.shan diff --git a/src/main/java/com/github/aaronshan/functions/regexp/Re2JRegexp.java b/src/main/java/com/github/aaronshan/functions/regexp/Re2JRegexp.java index 3466e7b..f19a70f 100644 --- a/src/main/java/com/github/aaronshan/functions/regexp/Re2JRegexp.java +++ b/src/main/java/com/github/aaronshan/functions/regexp/Re2JRegexp.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.regexp; +package com.github.aaronshan.functions.regexp; import com.google.common.collect.Lists; import com.google.re2j.Matcher; @@ -12,7 +12,6 @@ import static 
com.google.common.base.Preconditions.checkState; import static com.google.re2j.Options.Algorithm.DFA_FALLBACK_TO_NFA; -import static java.lang.Math.toIntExact; import static java.lang.String.format; /** @@ -76,9 +75,18 @@ public Slice replace(Slice source, Slice replacement) throws HiveException { Matcher matcher = re2jPattern.matcher(source); try { return matcher.replaceAll(replacement); - } catch (IndexOutOfBoundsException | IllegalArgumentException e) { + } catch (IndexOutOfBoundsException e) { throw new HiveException("Illegal replacement sequence: " + replacement.toStringUtf8()); + } catch (IllegalArgumentException e) { + throw new HiveException("Illegal replacement sequence: " + replacement.toStringUtf8()); + } + } + + public static int toIntExact(long value) { + if ((int)value != value) { + throw new ArithmeticException("integer overflow"); } + return (int)value; } public List extractAll(Slice source, long groupIndex) throws HiveException { diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java index a6afc0c..5041729 100644 --- a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java +++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.regexp; +package com.github.aaronshan.functions.regexp; import io.airlift.slice.Slices; import org.apache.hadoop.hive.ql.exec.Description; @@ -12,7 +12,7 @@ * @date 2018-07-27 * @time 22:38 */ -@Description(name = "regexp_like" +@Description(name = "regexp_extract" , value = "_FUNC_(string, string) - returns substrings matching a regular expression." 
, extended = "Example:\n > select _FUNC_(string, pattern) from src;") public class UDFRe2JRegexpExtract extends UDF { diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java index 416a68a..fa03a5b 100644 --- a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java +++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.regexp; +package com.github.aaronshan.functions.regexp; import io.airlift.slice.Slices; import java.util.ArrayList; diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java index b5cdd13..683e6f8 100644 --- a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java +++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.regexp; +package com.github.aaronshan.functions.regexp; import io.airlift.slice.Slices; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java index 13694a5..6c74f52 100644 --- a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java +++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.regexp; +package com.github.aaronshan.functions.regexp; import io.airlift.slice.Slices; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java index a0cf629..1022a8f 100644 --- 
a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java +++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.regexp; +package com.github.aaronshan.functions.regexp; import io.airlift.slice.Slices; import java.util.ArrayList; diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java b/src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java index d6a95bd..bd0324f 100644 --- a/src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java @@ -1,15 +1,14 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import io.airlift.slice.Slice; import io.airlift.slice.Slices; -import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; import static io.airlift.slice.SliceUtf8.getCodePointAt; import static io.airlift.slice.SliceUtf8.countCodePoints; diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java index e465922..ecf7a49 100644 --- a/src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import io.airlift.slice.Slice; import io.airlift.slice.Slices; @@ -8,7 +8,7 @@ import org.apache.hadoop.io.LongWritable; import 
org.apache.hadoop.io.Text; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; import static io.airlift.slice.SliceUtf8.lengthOfCodePoint; import static io.airlift.slice.SliceUtf8.tryGetCodePointAt; diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java index a37060f..08cd10d 100644 --- a/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import io.airlift.slice.Slice; import io.airlift.slice.Slices; @@ -8,7 +8,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; import static io.airlift.slice.SliceUtf8.getCodePointAt; import static io.airlift.slice.SliceUtf8.lengthOfCodePoint; import static io.airlift.slice.SliceUtf8.tryGetCodePointAt; @@ -55,7 +55,7 @@ public LongWritable evaluate(Text leftText, Text rightText) throws HiveException return result; } - checkCondition((leftCodePoints.length * (rightCodePoints.length - 1)) <= 1_000_000, + checkCondition((leftCodePoints.length * (rightCodePoints.length - 1)) <= 1000000, "The combined inputs for Levenshtein distance are too large"); int[] distances = new int[rightCodePoints.length]; diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java index 4b15d8f..cf2ce98 100644 --- a/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java +++ 
b/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java index 38def10..59ab65c 100644 --- a/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java index 63bb45f..d57ae8e 100644 --- a/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import com.google.common.base.Splitter; import java.util.HashMap; @@ -14,7 +14,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; /** * @author ruifeng.shan diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java index 036dd73..2714dcc 100644 --- 
a/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.string; +package com.github.aaronshan.functions.string; import com.google.common.base.Splitter; import com.google.common.collect.ArrayListMultimap; @@ -17,7 +17,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; /** * @author ruifeng.shan diff --git a/src/main/java/com/github/aaronshan/functions/utils/Failures.java b/src/main/java/com/github/aaronshan/functions/utils/Failures.java index 354927c..6437bea 100644 --- a/src/main/java/com/github/aaronshan/functions/utils/Failures.java +++ b/src/main/java/com/github/aaronshan/functions/utils/Failures.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; import org.apache.hadoop.hive.ql.metadata.HiveException; diff --git a/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java b/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java index 0fc88ae..4adb505 100644 --- a/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java @@ -1,8 +1,8 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; import org.apache.hadoop.hive.ql.metadata.HiveException; -import static cc.shanruifeng.functions.utils.Failures.checkCondition; +import static com.github.aaronshan.functions.utils.Failures.checkCondition; import static java.lang.Character.MAX_RADIX; import static java.lang.Character.MIN_RADIX; diff --git a/src/main/resources/china_day_type.config 
b/src/main/resources/china_day_type.config index 8ae306a..bae8705 100644 --- a/src/main/resources/china_day_type.config +++ b/src/main/resources/china_day_type.config @@ -199,4 +199,35 @@ 2018-10-04 holiday 2018-10-05 holiday 2018-10-06 holiday -2018-10-07 holiday \ No newline at end of file +2018-10-07 holiday +2018-12-29 workday +2018-12-30 holiday +2018-12-31 holiday +2019-01-01 holiday +2019-02-02 workday +2019-02-03 workday +2019-02-04 holiday +2019-02-05 holiday +2019-02-06 holiday +2019-02-07 holiday +2019-02-08 holiday +2019-02-09 holiday +2019-02-10 holiday +2019-04-05 holiday +2019-04-06 holiday +2019-04-07 holiday +2019-05-01 holiday +2019-06-07 holiday +2019-06-08 holiday +2019-06-09 holiday +2019-09-13 holiday +2019-09-14 holiday +2019-09-15 holiday +2019-09-29 workday +2019-10-01 holiday +2019-10-02 holiday +2019-10-03 holiday +2019-10-04 holiday +2019-10-05 holiday +2019-10-06 holiday +2019-10-07 holiday \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java b/src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java index 5a1c021..dc3578a 100644 --- a/src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java +++ b/src/test/java/cc/shanruifeng/functions/array/UDFArrayShuffleTest.java @@ -1,5 +1,6 @@ package cc.shanruifeng.functions.array; +import com.github.aaronshan.functions.array.UDFArrayShuffle; import com.google.common.collect.ImmutableList; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; diff --git a/src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java b/src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java index 188a643..ecff4ea 100644 --- a/src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java +++ b/src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java @@ -1,4 +1,4 @@ -package 
cc.shanruifeng.functions.math; +package com.github.aaronshan.functions.math; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; From 2724c149dd9455c99ad208b90b9970132558afbe Mon Sep 17 00:00:00 2001 From: aaronshan Date: Fri, 1 Feb 2019 17:42:11 +0800 Subject: [PATCH 16/16] =?UTF-8?q?=E4=BF=AE=E6=94=B9readme=E6=A0=BC?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b66d2f5..cdeb7f8 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ You can also directly download file from [release page](https://github.com/aaron |split_to_multimap(string, entryDelimiter, keyValueDelimiter) -> map(varchar, array(varchar)) | Splits string by entryDelimiter and keyValueDelimiter and returns a map containing an array of values for each unique key. entryDelimiter splits string into key-value pairs. keyValueDelimiter splits each pair into key and value. The values for each key will be in the same order as they appeared in string.| [Normalize Form Description](#jump) + | Form | Description | |:--|:--| | NFD | Canonical Decomposition |