diff --git a/.travis.yml b/.travis.yml index c462890..08c124e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,5 @@ install: - mvn install:install-file -DgroupId=javax.jdo -DartifactId=jdo2-api -Dversion=2.3-ec -Dpackaging=jar -Dfile=$HOME/jdo2-api-2.3-ec.jar script: - - jdk_switcher use openjdk7 - - mvn clean package - jdk_switcher use oraclejdk8 - mvn clean package diff --git a/README-zh.md b/README-zh.md index fb060d2..3ba181a 100644 --- a/README-zh.md +++ b/README-zh.md @@ -10,7 +10,8 @@ hive-third-functions 包含了一些很有用的hive udf函数,特别是数组和json函数. > 注意: -> hive-third-functions支持hive-0.11.0或更高版本. +> 1. hive-third-functions支持hive-0.11.0或更高版本. +> 2. 运行`3.0.0`及以上版本需要Java8及以上 ## 编译 @@ -40,7 +41,7 @@ mvn clean package -DskipTests 你也可以直接在发布页下载打包好了最新版本 [发布页](https://github.com/aaronshan/hive-third-functions/releases). -> 当前最新的版本是 `2.1.3` +> 当前最新的版本是 `3.0.0` ## 函数 @@ -71,6 +72,10 @@ mvn clean package -DskipTests |array_value_count(array<E>, E) -> long | 统计数组中包含给定元素的个数.| |array_slice(array, start, length) -> array | 对数组进行分片操作,start为正数从前开始分片, start为负数从后开始分片, 长度为指定的长度.| |array_element_at(array<E>, index) -> E | 返回指定位置的数组元素. 如果索引位置 < 0, 则从尾部开始计数并返回.| +|array_shuffle(array) -> array | 对数组shuffle.| +|sequence(start, end) -> array | 生成数组序列.| +|sequence(start, end, step) -> array | 生成数组序列.| +|sequence(start_date_string, end_date_string, step) -> array | 生成日期数组序列.| ### 3. map函数 | 函数| 描述 | @@ -145,66 +150,90 @@ mvn clean package -DskipTests |url_encode(value) -> string | escapes value by encoding it so that it can be safely included in URL query parameter names and values| |url_decode(value) -> string | unescape the URL encoded value. This function is the inverse of `url_encode`. | +### 10. 
数学函数 + +| function| description | +|:--|:--| +|infinity() -> double | 获取正无穷常数| +|is_finite(x) -> boolean | 判断x是否为有限数值| +|is_infinite(x) -> boolean |判断x是否为无穷数值| +|is_nan(x) -> boolean | 判断x是否不是一个数值类型的变量| +|nan() -> double | 获取一个表示NAN(not-a-number)的常数 | +|from_base(string, radix) -> bigint | 获取字面量的值,该值的基数为radix| +|to_base(x, radix) -> varchar | 返回x以radix为基数的字面量| +|cosine_similarity(x, y) -> double | 返回两个稀疏向量的余弦相似度| + + ## 用法 将下面这些内容写入 `${HOME}/.hiverc` 文件, 或者也可以按需在hive命令行环境中执行. ``` add jar ${jar_location_dir}/hive-third-functions-${version}-shaded.jar -create temporary function array_contains as 'cc.shanruifeng.functions.array.UDFArrayContains'; -create temporary function array_equals as 'cc.shanruifeng.functions.array.UDFArrayEquals'; -create temporary function array_intersect as 'cc.shanruifeng.functions.array.UDFArrayIntersect'; -create temporary function array_max as 'cc.shanruifeng.functions.array.UDFArrayMax'; -create temporary function array_min as 'cc.shanruifeng.functions.array.UDFArrayMin'; -create temporary function array_join as 'cc.shanruifeng.functions.array.UDFArrayJoin'; -create temporary function array_distinct as 'cc.shanruifeng.functions.array.UDFArrayDistinct'; -create temporary function array_position as 'cc.shanruifeng.functions.array.UDFArrayPosition'; -create temporary function array_remove as 'cc.shanruifeng.functions.array.UDFArrayRemove'; -create temporary function array_reverse as 'cc.shanruifeng.functions.array.UDFArrayReverse'; -create temporary function array_sort as 'cc.shanruifeng.functions.array.UDFArraySort'; -create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArrayConcat'; -create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; -create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; -create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function bit_count as 
'cc.shanruifeng.functions.bitwise.UDFBitCount'; -create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; -create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; -create temporary function bitwise_or as 'cc.shanruifeng.functions.bitwise.UDFBitwiseOr'; -create temporary function bitwise_xor as 'cc.shanruifeng.functions.bitwise.UDFBitwiseXor'; -create temporary function map_build as 'cc.shanruifeng.functions.map.UDFMapBuild'; -create temporary function map_concat as 'cc.shanruifeng.functions.map.UDFMapConcat'; -create temporary function map_element_at as 'cc.shanruifeng.functions.map.UDFMapElementAt'; -create temporary function map_equals as 'cc.shanruifeng.functions.map.UDFMapEquals'; -create temporary function day_of_week as 'cc.shanruifeng.functions.date.UDFDayOfWeek'; -create temporary function day_of_year as 'cc.shanruifeng.functions.date.UDFDayOfYear'; -create temporary function type_of_day as 'cc.shanruifeng.functions.date.UDFTypeOfDay'; -create temporary function zodiac_cn as 'cc.shanruifeng.functions.date.UDFZodiacSignCn'; -create temporary function zodiac_en as 'cc.shanruifeng.functions.date.UDFZodiacSignEn'; -create temporary function pinyin as 'cc.shanruifeng.functions.string.UDFChineseToPinYin'; -create temporary function md5 as 'cc.shanruifeng.functions.string.UDFMd5'; -create temporary function sha256 as 'cc.shanruifeng.functions.string.UDFSha256'; -create temporary function json_array_get as 'cc.shanruifeng.functions.json.UDFJsonArrayGet'; -create temporary function json_array_length as 'cc.shanruifeng.functions.json.UDFJsonArrayLength'; -create temporary function json_array_extract as 'cc.shanruifeng.functions.json.UDFJsonArrayExtract'; -create temporary function json_array_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonArrayExtractScalar'; -create temporary function json_extract as 'cc.shanruifeng.functions.json.UDFJsonExtract'; -create temporary function 
json_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonExtractScalar'; -create temporary function json_size as 'cc.shanruifeng.functions.json.UDFJsonSize'; -create temporary function id_card_province as 'cc.shanruifeng.functions.card.UDFChinaIdCardProvince'; -create temporary function id_card_city as 'cc.shanruifeng.functions.card.UDFChinaIdCardCity'; -create temporary function id_card_area as 'cc.shanruifeng.functions.card.UDFChinaIdCardArea'; -create temporary function id_card_birthday as 'cc.shanruifeng.functions.card.UDFChinaIdCardBirthday'; -create temporary function id_card_gender as 'cc.shanruifeng.functions.card.UDFChinaIdCardGender'; -create temporary function is_valid_id_card as 'cc.shanruifeng.functions.card.UDFChinaIdCardValid'; -create temporary function id_card_info as 'cc.shanruifeng.functions.card.UDFChinaIdCardInfo'; -create temporary function wgs_distance as 'cc.shanruifeng.functions.geo.UDFGeoWgsDistance'; -create temporary function gcj_to_bd as 'cc.shanruifeng.functions.geo.UDFGeoGcjToBd'; -create temporary function bd_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoBdToGcj'; -create temporary function wgs_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoWgsToGcj'; -create temporary function gcj_to_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjToWgs'; -create temporary function gcj_extract_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjExtractWgs'; -create temporary function url_encode as 'cc.shanruifeng.functions.url.UDFUrlEncode'; -create temporary function url_decode as 'cc.shanruifeng.functions.url.UDFUrlDecode'; +create temporary function array_contains as 'com.github.aaronshan.functions.array.UDFArrayContains'; +create temporary function array_equals as 'com.github.aaronshan.functions.array.UDFArrayEquals'; +create temporary function array_intersect as 'com.github.aaronshan.functions.array.UDFArrayIntersect'; +create temporary function array_max as 'com.github.aaronshan.functions.array.UDFArrayMax'; +create temporary function array_min as 
'com.github.aaronshan.functions.array.UDFArrayMin'; +create temporary function array_join as 'com.github.aaronshan.functions.array.UDFArrayJoin'; +create temporary function array_distinct as 'com.github.aaronshan.functions.array.UDFArrayDistinct'; +create temporary function array_position as 'com.github.aaronshan.functions.array.UDFArrayPosition'; +create temporary function array_remove as 'com.github.aaronshan.functions.array.UDFArrayRemove'; +create temporary function array_reverse as 'com.github.aaronshan.functions.array.UDFArrayReverse'; +create temporary function array_sort as 'com.github.aaronshan.functions.array.UDFArraySort'; +create temporary function array_concat as 'com.github.aaronshan.functions.array.UDFArrayConcat'; +create temporary function array_value_count as 'com.github.aaronshan.functions.array.UDFArrayValueCount'; +create temporary function array_slice as 'com.github.aaronshan.functions.array.UDFArraySlice'; +create temporary function array_element_at as 'com.github.aaronshan.functions.array.UDFArrayElementAt'; +create temporary function bit_count as 'com.github.aaronshan.functions.bitwise.UDFBitCount'; +create temporary function bitwise_and as 'com.github.aaronshan.functions.bitwise.UDFBitwiseAnd'; +create temporary function array_shuffle as 'com.github.aaronshan.functions.array.UDFArrayShuffle'; +create temporary function sequence as 'com.github.aaronshan.functions.array.UDFSequence'; +create temporary function bitwise_not as 'com.github.aaronshan.functions.bitwise.UDFBitwiseNot'; +create temporary function bitwise_or as 'com.github.aaronshan.functions.bitwise.UDFBitwiseOr'; +create temporary function bitwise_xor as 'com.github.aaronshan.functions.bitwise.UDFBitwiseXor'; +create temporary function map_build as 'com.github.aaronshan.functions.map.UDFMapBuild'; +create temporary function map_concat as 'com.github.aaronshan.functions.map.UDFMapConcat'; +create temporary function map_element_at as 
'com.github.aaronshan.functions.map.UDFMapElementAt'; +create temporary function map_equals as 'com.github.aaronshan.functions.map.UDFMapEquals'; +create temporary function day_of_week as 'com.github.aaronshan.functions.date.UDFDayOfWeek'; +create temporary function day_of_year as 'com.github.aaronshan.functions.date.UDFDayOfYear'; +create temporary function type_of_day as 'com.github.aaronshan.functions.date.UDFTypeOfDay'; +create temporary function zodiac_cn as 'com.github.aaronshan.functions.date.UDFZodiacSignCn'; +create temporary function zodiac_en as 'com.github.aaronshan.functions.date.UDFZodiacSignEn'; +create temporary function pinyin as 'com.github.aaronshan.functions.string.UDFChineseToPinYin'; +create temporary function md5 as 'com.github.aaronshan.functions.string.UDFMd5'; +create temporary function sha256 as 'com.github.aaronshan.functions.string.UDFSha256'; +create temporary function json_array_get as 'com.github.aaronshan.functions.json.UDFJsonArrayGet'; +create temporary function json_array_length as 'com.github.aaronshan.functions.json.UDFJsonArrayLength'; +create temporary function json_array_extract as 'com.github.aaronshan.functions.json.UDFJsonArrayExtract'; +create temporary function json_array_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonArrayExtractScalar'; +create temporary function json_extract as 'com.github.aaronshan.functions.json.UDFJsonExtract'; +create temporary function json_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonExtractScalar'; +create temporary function json_size as 'com.github.aaronshan.functions.json.UDFJsonSize'; +create temporary function id_card_province as 'com.github.aaronshan.functions.card.UDFChinaIdCardProvince'; +create temporary function id_card_city as 'com.github.aaronshan.functions.card.UDFChinaIdCardCity'; +create temporary function id_card_area as 'com.github.aaronshan.functions.card.UDFChinaIdCardArea'; +create temporary function id_card_birthday as 
'com.github.aaronshan.functions.card.UDFChinaIdCardBirthday'; +create temporary function id_card_gender as 'com.github.aaronshan.functions.card.UDFChinaIdCardGender'; +create temporary function is_valid_id_card as 'com.github.aaronshan.functions.card.UDFChinaIdCardValid'; +create temporary function id_card_info as 'com.github.aaronshan.functions.card.UDFChinaIdCardInfo'; +create temporary function wgs_distance as 'com.github.aaronshan.functions.geo.UDFGeoWgsDistance'; +create temporary function gcj_to_bd as 'com.github.aaronshan.functions.geo.UDFGeoGcjToBd'; +create temporary function bd_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoBdToGcj'; +create temporary function wgs_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoWgsToGcj'; +create temporary function gcj_to_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjToWgs'; +create temporary function gcj_extract_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjExtractWgs'; +create temporary function url_encode as 'com.github.aaronshan.functions.url.UDFUrlEncode'; +create temporary function url_decode as 'com.github.aaronshan.functions.url.UDFUrlDecode'; +create temporary function infinity as 'com.github.aaronshan.functions.math.UDFMathInfinity'; +create temporary function is_finite as 'com.github.aaronshan.functions.math.UDFMathIsFinite'; +create temporary function is_infinite as 'com.github.aaronshan.functions.math.UDFMathIsInfinite'; +create temporary function is_nan as 'com.github.aaronshan.functions.math.UDFMathIsNaN'; +create temporary function nan as 'com.github.aaronshan.functions.math.UDFMathNaN'; +create temporary function from_base as 'com.github.aaronshan.functions.math.UDFMathFromBase'; +create temporary function to_base as 'com.github.aaronshan.functions.math.UDFMathToBase'; +create temporary function cosine_similarity as 'com.github.aaronshan.functions.math.UDFMathCosineSimilarity'; ``` 你可以在hive的命令杭中使用下面的语句来查看函数的细节. 
@@ -257,6 +286,11 @@ select array_concat(array(16,12,18,9,null), array(14,9,6,18,null)) => [16,12,18, select array_value_count(array(16,13,12,13,18,16,9,18), 13) => 2 select array_slice(array(16,13,12,13,18,16,9,18), -2, 3) => [9,18] select array_element_at(array(16,13,12,13,18,16,9,18), -1) => 18 +select array_shuffle(array(16,12,18,9)) +select sequence(1, 5) => [1, 2, 3, 4, 5] +select sequence(5, 1) => [5, 4, 3, 2, 1] +select sequence(1, 9, 4) => [1, 5, 9] +select sequence('2016-04-12 00:00:00', '2016-04-14 00:00:00', 24*3600*1000) => ['2016-04-12 00:00:00', '2016-04-13 00:00:00', '2016-04-14 00:00:00'] ``` ``` @@ -302,3 +336,7 @@ select gcj_extract_wgs(39.915, 116.404) => {"lng":116.39775549316407,"lat":39.91 ``` select url_encode('http://shanruifeng.cc/') => http%3A%2F%2Fshanruifeng.cc%2F ``` + +``` +select cosine_similarity(map_build(array('a'), array(1.0)), map_build(array('a'), array(2.0))) => 1.0 +``` \ No newline at end of file diff --git a/README.md b/README.md index 3cce4ac..cdeb7f8 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ It will generate hive-third-functions-${version}-shaded.jar in target directory. You can also directly download file from [release page](https://github.com/aaronshan/hive-third-functions/releases). -> current latest version is `2.1.3` +> current latest version is `2.2.0` ## Functions @@ -51,6 +51,22 @@ You can also directly download file from [release page](https://github.com/aaron |pinyin(string) -> string | convert chinese to pinyin| |md5(string) -> string | md5 hash| |sha256(string) -> string |sha256 hash| +|codepoint(string) -> integer | Returns the Unicode code point of the only character of string.| +|hamming_distance(string1, string2) -> bigint | Returns the Hamming distance of string1 and string2.| +|levenshtein_distance(string1, string2) -> bigint | Returns the Levenshtein edit distance of string1 and string2.| +|normalize(string, form) -> varchar | Transforms string with the specified normalization form. 
form must be one of the keywords listed in the Normalize Form Description table below. | +|strpos(string, substring) -> bigint | Returns the starting position of the first instance of substring in string. Positions start with 1. If not found, 0 is returned.| +|split_to_map(string, entryDelimiter, keyValueDelimiter) -> map<varchar, varchar> | Splits string by entryDelimiter and keyValueDelimiter and returns a map. entryDelimiter splits string into key-value pairs. keyValueDelimiter splits each pair into key and value.| +|split_to_multimap(string, entryDelimiter, keyValueDelimiter) -> map(varchar, array(varchar)) | Splits string by entryDelimiter and keyValueDelimiter and returns a map containing an array of values for each unique key. entryDelimiter splits string into key-value pairs. keyValueDelimiter splits each pair into key and value. The values for each key will be in the same order as they appeared in string.| + +[Normalize Form Description](#jump) + +| Form | Description | +|:--|:--| +| NFD | Canonical Decomposition | +| NFC | Canonical Decomposition, followed by Canonical Composition | +| NFKD | Compatibility Decomposition | +| NFKC | Compatibility Decomposition, followed by Canonical Composition | ### 2. array functions @@ -71,12 +87,17 @@ You can also directly download file from [release page](https://github.com/aaron |array_value_count(array<E>, E) -> long | count array's element number that element value equals given value.| |array_slice(array, start, length) -> array | subsets array starting from index start (or starting from the end if start is negative) with a length of length.| |array_element_at(array<E>, index) -> E | returns element of array at given index. 
If index < 0, element_at accesses elements from the last to the first.| +|array_shuffle(array) -> array | Generate a random permutation of the given array x.| +|sequence(start, end) -> array | Generate a sequence of integers from start to stop.| +|sequence(start, end, step) -> array | Generate a sequence of integers from start to stop, incrementing by step.| +|sequence(start_date_string, end_data_string, step) -> array | Generate a sequence of date string from start to stop, incrementing by step.| +|array_value_count(array<E>, E) -> long | count array's element number that element value equals given value..| ### 3. map functions | function| description | |:--|:--| |map_build(x<K>, y<V>) -> map<K, V>| returns a map created using the given key/value arrays.| -|map_concat(x<K, V>, y<K, V>) -> map<K,V> | returns the union of two maps. If a key is found in both `x` and `y`, that key’s value in the resulting map comes from `y`.| +|map_concat(x<K, V>, y<K, V>) -> map<K,V> | returns the union of two maps. If a key is found in both `x` and `y`, that key’s value in the resulting map comes from `y`.| |map_element_at(map<K, V>, key) -> V | returns value for given `key`, or `NULL` if the key is not contained in the map.| |map_equals(x<K, V>, y<K, V>) -> boolean | whether map x equals with map y or not.| @@ -87,7 +108,7 @@ You can also directly download file from [release page](https://github.com/aaron |day_of_week(date_string \| date) -> int | day of week,if monday,return 1, sunday return 7, error return null.| |day_of_year(date_string \| date) -> int | day of year. The value ranges from 1 to 366.| |zodiac_en(date_string \| date) -> string | convert date to zodiac| -|zodiac_cn(date_string \| date) -> string | convert date to zodiac chinese | +|zodiac_cn(date_string \| date) -> string | convert date to zodiac chinese | |type_of_day(date_string \| date) -> string | for chinese. 获取日期的类型(1: 法定节假日, 2: 正常周末, 3: 正常工作日 4:攒假的工作日),错误返回-1. | ### 5. 
json functions @@ -108,9 +129,9 @@ You can also directly download file from [release page](https://github.com/aaron |:--|:--| |bit_count(x, bits) -> bigint | count the number of bits set in `x` (treated as bits-bit signed integer) in 2’s complement representation | |bitwise_and(x, y) -> bigint | returns the bitwise AND of `x` and `y` in 2’s complement arithmetic.| -|bitwise_not(x) -> bigint | returns the bitwise NOT of `x` in 2’s complement arithmetic. | +|bitwise_not(x) -> bigint | returns the bitwise NOT of `x` in 2’s complement arithmetic. | |bitwise_or(x, y) -> bigint | returns the bitwise OR of `x` and `y` in 2’s complement arithmetic.| -|bitwise_xor(x, y) -> bigint | returns the bitwise XOR of `x` and `y` in 2’s complement arithmetic. | +|bitwise_xor(x, y) -> bigint | returns the bitwise XOR of `x` and `y` in 2’s complement arithmetic. | ### 7. china id card functions @@ -143,7 +164,31 @@ You can also directly download file from [release page](https://github.com/aaron | function| description | |:--|:--| |url_encode(value) -> string | escapes value by encoding it so that it can be safely included in URL query parameter names and values| -|url_decode(value) -> string | unescape the URL encoded value. This function is the inverse of `url_encode`. | +|url_decode(value) -> string | unescape the URL encoded value. This function is the inverse of `url_encode`. | + +### 10. math functions + +| function| description | +|:--|:--| +|infinity() -> double | Returns the constant representing positive infinity.| +|is_finite(x) -> boolean | Determine if x is finite.| +|is_infinite(x) -> boolean |Determine if x is infinite.| +|is_nan(x) -> boolean | Determine if x is not-a-number.| +|nan() -> double | Returns the constant representing not-a-number. 
| +|from_base(string, radix) -> bigint | Returns the value of string interpreted as a base-radix number.| +|to_base(x, radix) -> varchar | Returns the base-radix representation of x.| +|cosine_similarity(x, y) -> double | Returns the cosine similarity between the sparse vectors x and y| +|inverse_normal_cdf(mean, sd, p) -> double | Compute the inverse of the Normal cdf with given mean and standard deviation (sd) for the cumulative probability (p): P(N < n). The mean must be a real value and the standard deviation must be a real and positive value. The probability p must lie on the interval (0, 1). | +|normal_cdf(mean, sd, v) -> double | Compute the Normal cdf with given mean and standard deviation (sd): P(N < v; mean, sd). The mean and value v must be real values and the standard deviation must be a real and positive value.| + +### 11. regexp functions +| function| description | +|:--|:--| +|regexp_like(string, pattern) -> boolean | Evaluates the regular expression pattern and determines if it is contained within string.| +|regexp_extract_all(string, pattern) -> array(varchar) | Returns the substring(s) matched by the regular expression pattern in string. | +|regexp_extract(string, pattern) -> varchar | Returns the first substring matched by the regular expression pattern in string.| +|regexp_replace(string, pattern) -> varchar | Removes every instance of the substring matched by the regular expression pattern from string.| +|regexp_replace(string, pattern, replacement) -> varchar | Replaces every instance of the substring matched by the regular expression pattern in string with replacement. | ## Use @@ -151,60 +196,85 @@ Put these statements into `${HOME}/.hiverc` or exec its on hive cli env. 
``` add jar ${jar_location_dir}/hive-third-functions-${version}-shaded.jar -create temporary function array_contains as 'cc.shanruifeng.functions.array.UDFArrayContains'; -create temporary function array_equals as 'cc.shanruifeng.functions.array.UDFArrayEquals'; -create temporary function array_intersect as 'cc.shanruifeng.functions.array.UDFArrayIntersect'; -create temporary function array_max as 'cc.shanruifeng.functions.array.UDFArrayMax'; -create temporary function array_min as 'cc.shanruifeng.functions.array.UDFArrayMin'; -create temporary function array_join as 'cc.shanruifeng.functions.array.UDFArrayJoin'; -create temporary function array_distinct as 'cc.shanruifeng.functions.array.UDFArrayDistinct'; -create temporary function array_position as 'cc.shanruifeng.functions.array.UDFArrayPosition'; -create temporary function array_remove as 'cc.shanruifeng.functions.array.UDFArrayRemove'; -create temporary function array_reverse as 'cc.shanruifeng.functions.array.UDFArrayReverse'; -create temporary function array_sort as 'cc.shanruifeng.functions.array.UDFArraySort'; -create temporary function array_concat as 'cc.shanruifeng.functions.array.UDFArrayConcat'; -create temporary function array_value_count as 'cc.shanruifeng.functions.array.UDFArrayValueCount'; -create temporary function array_slice as 'cc.shanruifeng.functions.array.UDFArraySlice'; -create temporary function array_element_at as 'cc.shanruifeng.functions.array.UDFArrayElementAt'; -create temporary function bit_count as 'cc.shanruifeng.functions.bitwise.UDFBitCount'; -create temporary function bitwise_and as 'cc.shanruifeng.functions.bitwise.UDFBitwiseAnd'; -create temporary function bitwise_not as 'cc.shanruifeng.functions.bitwise.UDFBitwiseNot'; -create temporary function bitwise_or as 'cc.shanruifeng.functions.bitwise.UDFBitwiseOr'; -create temporary function bitwise_xor as 'cc.shanruifeng.functions.bitwise.UDFBitwiseXor'; -create temporary function map_build as 
'cc.shanruifeng.functions.map.UDFMapBuild'; -create temporary function map_concat as 'cc.shanruifeng.functions.map.UDFMapConcat'; -create temporary function map_element_at as 'cc.shanruifeng.functions.map.UDFMapElementAt'; -create temporary function map_equals as 'cc.shanruifeng.functions.map.UDFMapEquals'; -create temporary function day_of_week as 'cc.shanruifeng.functions.date.UDFDayOfWeek'; -create temporary function day_of_year as 'cc.shanruifeng.functions.date.UDFDayOfYear'; -create temporary function type_of_day as 'cc.shanruifeng.functions.date.UDFTypeOfDay'; -create temporary function zodiac_cn as 'cc.shanruifeng.functions.date.UDFZodiacSignCn'; -create temporary function zodiac_en as 'cc.shanruifeng.functions.date.UDFZodiacSignEn'; -create temporary function pinyin as 'cc.shanruifeng.functions.string.UDFChineseToPinYin'; -create temporary function md5 as 'cc.shanruifeng.functions.string.UDFMd5'; -create temporary function sha256 as 'cc.shanruifeng.functions.string.UDFSha256'; -create temporary function json_array_get as 'cc.shanruifeng.functions.json.UDFJsonArrayGet'; -create temporary function json_array_length as 'cc.shanruifeng.functions.json.UDFJsonArrayLength'; -create temporary function json_array_extract as 'cc.shanruifeng.functions.json.UDFJsonArrayExtract'; -create temporary function json_array_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonArrayExtractScalar'; -create temporary function json_extract as 'cc.shanruifeng.functions.json.UDFJsonExtract'; -create temporary function json_extract_scalar as 'cc.shanruifeng.functions.json.UDFJsonExtractScalar'; -create temporary function json_size as 'cc.shanruifeng.functions.json.UDFJsonSize'; -create temporary function id_card_province as 'cc.shanruifeng.functions.card.UDFChinaIdCardProvince'; -create temporary function id_card_city as 'cc.shanruifeng.functions.card.UDFChinaIdCardCity'; -create temporary function id_card_area as 'cc.shanruifeng.functions.card.UDFChinaIdCardArea'; -create 
temporary function id_card_birthday as 'cc.shanruifeng.functions.card.UDFChinaIdCardBirthday'; -create temporary function id_card_gender as 'cc.shanruifeng.functions.card.UDFChinaIdCardGender'; -create temporary function is_valid_id_card as 'cc.shanruifeng.functions.card.UDFChinaIdCardValid'; -create temporary function id_card_info as 'cc.shanruifeng.functions.card.UDFChinaIdCardInfo'; -create temporary function wgs_distance as 'cc.shanruifeng.functions.geo.UDFGeoWgsDistance'; -create temporary function gcj_to_bd as 'cc.shanruifeng.functions.geo.UDFGeoGcjToBd'; -create temporary function bd_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoBdToGcj'; -create temporary function wgs_to_gcj as 'cc.shanruifeng.functions.geo.UDFGeoWgsToGcj'; -create temporary function gcj_to_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjToWgs'; -create temporary function gcj_extract_wgs as 'cc.shanruifeng.functions.geo.UDFGeoGcjExtractWgs'; -create temporary function url_encode as 'cc.shanruifeng.functions.url.UDFUrlEncode'; -create temporary function url_decode as 'cc.shanruifeng.functions.url.UDFUrlDecode'; +create temporary function array_contains as 'com.github.aaronshan.functions.array.UDFArrayContains'; +create temporary function array_equals as 'com.github.aaronshan.functions.array.UDFArrayEquals'; +create temporary function array_intersect as 'com.github.aaronshan.functions.array.UDFArrayIntersect'; +create temporary function array_max as 'com.github.aaronshan.functions.array.UDFArrayMax'; +create temporary function array_min as 'com.github.aaronshan.functions.array.UDFArrayMin'; +create temporary function array_join as 'com.github.aaronshan.functions.array.UDFArrayJoin'; +create temporary function array_distinct as 'com.github.aaronshan.functions.array.UDFArrayDistinct'; +create temporary function array_position as 'com.github.aaronshan.functions.array.UDFArrayPosition'; +create temporary function array_remove as 'com.github.aaronshan.functions.array.UDFArrayRemove'; +create 
temporary function array_reverse as 'com.github.aaronshan.functions.array.UDFArrayReverse'; +create temporary function array_sort as 'com.github.aaronshan.functions.array.UDFArraySort'; +create temporary function array_concat as 'com.github.aaronshan.functions.array.UDFArrayConcat'; +create temporary function array_value_count as 'com.github.aaronshan.functions.array.UDFArrayValueCount'; +create temporary function array_slice as 'com.github.aaronshan.functions.array.UDFArraySlice'; +create temporary function array_element_at as 'com.github.aaronshan.functions.array.UDFArrayElementAt'; +create temporary function bit_count as 'com.github.aaronshan.functions.bitwise.UDFBitCount'; +create temporary function bitwise_and as 'com.github.aaronshan.functions.bitwise.UDFBitwiseAnd'; +create temporary function array_shuffle as 'com.github.aaronshan.functions.array.UDFArrayShuffle'; +create temporary function sequence as 'com.github.aaronshan.functions.array.UDFSequence'; +create temporary function array_value_count as 'com.github.aaronshan.functions.array.UDFArrayValueCount'; +create temporary function bitwise_not as 'com.github.aaronshan.functions.bitwise.UDFBitwiseNot'; +create temporary function bitwise_or as 'com.github.aaronshan.functions.bitwise.UDFBitwiseOr'; +create temporary function bitwise_xor as 'com.github.aaronshan.functions.bitwise.UDFBitwiseXor'; +create temporary function map_build as 'com.github.aaronshan.functions.map.UDFMapBuild'; +create temporary function map_concat as 'com.github.aaronshan.functions.map.UDFMapConcat'; +create temporary function map_element_at as 'com.github.aaronshan.functions.map.UDFMapElementAt'; +create temporary function map_equals as 'com.github.aaronshan.functions.map.UDFMapEquals'; +create temporary function day_of_week as 'com.github.aaronshan.functions.date.UDFDayOfWeek'; +create temporary function day_of_year as 'com.github.aaronshan.functions.date.UDFDayOfYear'; +create temporary function type_of_day as 
'com.github.aaronshan.functions.date.UDFTypeOfDay'; +create temporary function zodiac_cn as 'com.github.aaronshan.functions.date.UDFZodiacSignCn'; +create temporary function zodiac_en as 'com.github.aaronshan.functions.date.UDFZodiacSignEn'; +create temporary function pinyin as 'com.github.aaronshan.functions.string.UDFChineseToPinYin'; +create temporary function md5 as 'com.github.aaronshan.functions.string.UDFMd5'; +create temporary function sha256 as 'com.github.aaronshan.functions.string.UDFSha256'; +create temporary function codepoint as 'com.github.aaronshan.functions.string.UDFCodePoint'; +create temporary function hamming_distance as 'com.github.aaronshan.functions.string.UDFStringHammingDistance'; +create temporary function levenshtein_distance as 'com.github.aaronshan.functions.string.UDFStringLevenshteinDistance'; +create temporary function normalize as 'com.github.aaronshan.functions.string.UDFStringNormalize'; +create temporary function strpos as 'com.github.aaronshan.functions.string.UDFStringPosition'; +create temporary function split_to_map as 'com.github.aaronshan.functions.string.UDFStringSplitToMap'; +create temporary function split_to_multimap as 'com.github.aaronshan.functions.string.UDFStringSplitToMultimap'; +create temporary function json_array_get as 'com.github.aaronshan.functions.json.UDFJsonArrayGet'; +create temporary function json_array_length as 'com.github.aaronshan.functions.json.UDFJsonArrayLength'; +create temporary function json_array_extract as 'com.github.aaronshan.functions.json.UDFJsonArrayExtract'; +create temporary function json_array_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonArrayExtractScalar'; +create temporary function json_extract as 'com.github.aaronshan.functions.json.UDFJsonExtract'; +create temporary function json_extract_scalar as 'com.github.aaronshan.functions.json.UDFJsonExtractScalar'; +create temporary function json_size as 'com.github.aaronshan.functions.json.UDFJsonSize'; +create 
temporary function id_card_province as 'com.github.aaronshan.functions.card.UDFChinaIdCardProvince'; +create temporary function id_card_city as 'com.github.aaronshan.functions.card.UDFChinaIdCardCity'; +create temporary function id_card_area as 'com.github.aaronshan.functions.card.UDFChinaIdCardArea'; +create temporary function id_card_birthday as 'com.github.aaronshan.functions.card.UDFChinaIdCardBirthday'; +create temporary function id_card_gender as 'com.github.aaronshan.functions.card.UDFChinaIdCardGender'; +create temporary function is_valid_id_card as 'com.github.aaronshan.functions.card.UDFChinaIdCardValid'; +create temporary function id_card_info as 'com.github.aaronshan.functions.card.UDFChinaIdCardInfo'; +create temporary function wgs_distance as 'com.github.aaronshan.functions.geo.UDFGeoWgsDistance'; +create temporary function gcj_to_bd as 'com.github.aaronshan.functions.geo.UDFGeoGcjToBd'; +create temporary function bd_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoBdToGcj'; +create temporary function wgs_to_gcj as 'com.github.aaronshan.functions.geo.UDFGeoWgsToGcj'; +create temporary function gcj_to_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjToWgs'; +create temporary function gcj_extract_wgs as 'com.github.aaronshan.functions.geo.UDFGeoGcjExtractWgs'; +create temporary function url_encode as 'com.github.aaronshan.functions.url.UDFUrlEncode'; +create temporary function url_decode as 'com.github.aaronshan.functions.url.UDFUrlDecode'; +create temporary function infinity as 'com.github.aaronshan.functions.math.UDFMathInfinity'; +create temporary function is_finite as 'com.github.aaronshan.functions.math.UDFMathIsFinite'; +create temporary function is_infinite as 'com.github.aaronshan.functions.math.UDFMathIsInfinite'; +create temporary function nan as 'com.github.aaronshan.functions.math.UDFMathNaN'; +create temporary function is_nan as 'com.github.aaronshan.functions.math.UDFMathIsNaN'; +create temporary function from_base as 
'com.github.aaronshan.functions.math.UDFMathFromBase'; +create temporary function to_base as 'com.github.aaronshan.functions.math.UDFMathToBase'; +create temporary function cosine_similarity as 'com.github.aaronshan.functions.math.UDFMathCosineSimilarity'; +create temporary function normal_cdf as 'com.github.aaronshan.functions.math.UDFMathNormalCdf'; +create temporary function inverse_normal_cdf as 'com.github.aaronshan.functions.math.UDFMathInverseNormalCdf'; +create temporary function regexp_extract as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpExtract'; +create temporary function regexp_extract_all as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpExtractAll'; +create temporary function regexp_like as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpLike'; +create temporary function regexp_replace as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpReplace'; +create temporary function regexp_split as 'com.github.aaronshan.functions.regexp.UDFRe2JRegexpSplit'; ``` You can use these statements on hive cli env get detail of function. 
@@ -257,6 +327,11 @@ select array_concat(array(16,12,18,9,null), array(14,9,6,18,null)) => [16,12,18, select array_value_count(array(16,13,12,13,18,16,9,18), 13) => 2 select array_slice(array(16,13,12,13,18,16,9,18), -2, 3) => [9,18] select array_element_at(array(16,13,12,13,18,16,9,18), -1) => 18 +select array_shuffle(array(16,12,18,9)) +select sequence(1, 5) => [1, 2, 3, 4, 5] +select sequence(5, 1) => [5, 4, 3, 2, 1] +select sequence(1, 9, 4) => [1, 5, 9] +select sequence('2016-04-12 00:00:00', '2016-04-14 00:00:00', 24*3600*1000) => ['2016-04-12 00:00:00', '2016-04-13 00:00:00', '2016-04-14 00:00:00'] ``` ``` @@ -302,3 +377,7 @@ select gcj_extract_wgs(39.915, 116.404) => {"lng":116.39775549316407,"lat":39.91 ``` select url_encode('http://shanruifeng.cc/') => http%3A%2F%2Fshanruifeng.cc%2F ``` + +``` +select cosine_similarity(map_build(array['a'], array[1.0]), map_build(array['a'], array[2.0])); => 1.0 +``` \ No newline at end of file diff --git a/pom.xml b/pom.xml index fdebeb0..95f9a63 100644 --- a/pom.xml +++ b/pom.xml @@ -4,9 +4,14 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - cc.shanruifeng + com.github.aaronshan hive-third-functions - 2.1.3 + 2.2.0 + + + https://github.com/aaronshan + https://github.com/aaronshan/hive-third-functions.git + UTF-8 @@ -21,9 +26,10 @@ 18.0 1.10 0.131 - 2.4.4 + 2.9.0 1.9.3 4.12 + 1.6 @@ -70,6 +76,12 @@ ${dep.airlift.version} + + io.airlift + slice + 0.35 + + com.fasterxml.jackson.core jackson-core @@ -82,11 +94,48 @@ ${dep.jackson.version} + + com.fasterxml.jackson.core + jackson-annotations + ${dep.jackson.version} + + junit junit ${junit.version} + + + pl.joegreen + lambda-from-string + ${lambda.from.string.version} + + + + org.apache.commons + commons-math3 + 3.6.1 + + + + com.teradata + re2j-td + 1.4 + + + + org.apache.lucene + lucene-analyzers-common + 7.2.1 + + + org.apache.lucene + lucene-core + + + + @@ -128,6 +177,11 @@ json + + io.airlift + slice + + 
com.fasterxml.jackson.core jackson-core @@ -138,6 +192,26 @@ jackson-databind + + pl.joegreen + lambda-from-string + + + + org.apache.commons + commons-math3 + + + + com.teradata + re2j-td + + + + org.apache.lucene + lucene-analyzers-common + + junit junit @@ -194,5 +268,65 @@ + + + + aaronshan + aaronshan + shanruifeng@gmail.com + + Developer + + +8 + + + + + + release + + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + package + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + verify + + sign + + + + + + + + + oss + https://oss.sonatype.org/content/repositories/snapshots/ + + + oss + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayConcat.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayConcat.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayConcat.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayConcat.java index b5e689a..47c53ec 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayConcat.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayConcat.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayContains.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayContains.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayContains.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayContains.java index 6a48726..9b6177d 100755 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayContains.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayContains.java @@ -1,4 +1,4 @@ -package 
cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayDistinct.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayDistinct.java similarity index 95% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayDistinct.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayDistinct.java index 51b718b..ca230bd 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayDistinct.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayDistinct.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -10,7 +10,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.*; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayElementAt.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayElementAt.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayElementAt.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayElementAt.java index bd4009b..5524708 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayElementAt.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayElementAt.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package 
com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayEquals.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayEquals.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayEquals.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayEquals.java index f9811d7..9bc41bf 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayEquals.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayEquals.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.utils.ArrayUtils; +import com.github.aaronshan.functions.utils.ArrayUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayIntersect.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayIntersect.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayIntersect.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayIntersect.java index 1991a82..b4cd0f7 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayIntersect.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayIntersect.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import java.util.ArrayList; import java.util.Arrays; @@ -13,7 +13,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.*; import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayJoin.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayJoin.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayJoin.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayJoin.java index 9321286..dc370cb 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayJoin.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayJoin.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMax.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMax.java similarity index 94% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayMax.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayMax.java index f2101f5..c4d5da9 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMax.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMax.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -11,7 +11,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMin.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMin.java similarity index 94% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayMin.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayMin.java index 992b090..a8d022b 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayMin.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayMin.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -11,7 +11,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayPosition.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayPosition.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayPosition.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayPosition.java index f916cb3..89d293c 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayPosition.java +++ 
b/src/main/java/com/github/aaronshan/functions/array/UDFArrayPosition.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayRemove.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayRemove.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayRemove.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayRemove.java index af1f303..fac4613 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayRemove.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayRemove.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArrayReverse.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayReverse.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayReverse.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayReverse.java index e449e34..f9874a7 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayReverse.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayReverse.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java new file mode 100644 index 0000000..460b9ca --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java @@ -0,0 +1,113 @@ +package 
com.github.aaronshan.functions.array; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; + +import java.util.ArrayList; +import java.util.Random; + +/** + * @author aaron02 + * @date 2018-08-18 上午8:52 + */ +@Description(name = "array_shuffle" + , value = "_FUNC_(array) - Generates a random permutation of the given array." + , extended = "Example:\n > select _FUNC_(array) from src;") +public class UDFArrayShuffle extends GenericUDF { + private static final int ARG_COUNT = 1; // Number of arguments to this UDF + private transient ListObjectInspector arrayOI; + private transient ObjectInspector arrayElementOI; + + private transient ObjectInspectorConverters.Converter converter; + private transient ArrayList result = new ArrayList(); + + private static final int INITIAL_LENGTH = 128; + private int[] positions = new int[INITIAL_LENGTH]; + + public UDFArrayShuffle() { + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function array_shuffle(array) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of category LIST + if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) { + throw new UDFArgumentTypeException(0, + "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " + + "expected at function array_shuffle, but " + + "\"" + arguments[0].getTypeName() + "\" " + + "is found"); + } + + arrayOI = (ListObjectInspector) arguments[0]; + arrayElementOI = 
arrayOI.getListElementObjectInspector(); + + // Check if the comparison is supported for this type + if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) { + throw new UDFArgumentException("The function array_shuffle" + + " does not support comparison for " + + "\"" + arrayElementOI.getTypeName() + "\"" + + " types"); + } + + converter = ObjectInspectorConverters.getConverter(arrayElementOI, arrayElementOI); + + return ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI); + } + + @Override + public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException { + Object array = arguments[0].get(); + int arrayLength = arrayOI.getListLength(array); + + // Check if array is null or empty + if (array == null || arrayLength <= 0) { + return null; + } + + if (arrayLength == 1) { + return array; + } + + result.clear(); + + if (positions.length < arrayLength) { + positions = new int[arrayLength]; + } + for (int i = 0; i < arrayLength; i++) { + positions[i] = i; + } + + // Fisher-Yates shuffle + // Randomly swap a pair of positions + for (int i = arrayLength - 1; i > 0; i--) { + Random random = new Random(); + int index = random.nextInt(i + 1); + int swap = positions[i]; + positions[i] = positions[index]; + positions[index] = swap; + } + + for (int i = 0; i < arrayLength; i++) { + Object arrayElement = arrayOI.getListElement(array, positions[i]); + result.add(arrayElement); + } + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "array_shuffle(" + strings[0] + ")"; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArraySlice.java b/src/main/java/com/github/aaronshan/functions/array/UDFArraySlice.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/array/UDFArraySlice.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArraySlice.java index 4f1feb2..0bed6e5 100644 --- 
a/src/main/java/cc/shanruifeng/functions/array/UDFArraySlice.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArraySlice.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/array/UDFArraySort.java b/src/main/java/com/github/aaronshan/functions/array/UDFArraySort.java similarity index 91% rename from src/main/java/cc/shanruifeng/functions/array/UDFArraySort.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArraySort.java index 3a0313f..7378a16 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArraySort.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArraySort.java @@ -1,7 +1,9 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; -import cc.shanruifeng.functions.fastuitl.ints.IntArrays; +import com.github.aaronshan.functions.fastuitl.ints.IntArrays; import java.util.ArrayList; + +import com.github.aaronshan.functions.utils.ArrayUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -10,7 +12,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.*; -import static cc.shanruifeng.functions.utils.ArrayUtils.IntArrayCompare; +import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; /** * @author ruifeng.shan @@ -88,7 +90,7 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { positions[i] = i; } - IntArrays.quickSort(positions, 0, arrayLength, IntArrayCompare(array, arrayOI)); + IntArrays.quickSort(positions, 0, arrayLength, ArrayUtils.IntArrayCompare(array, arrayOI)); result.clear(); for (int i = 0; i < arrayLength; i++) { diff --git 
a/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java b/src/main/java/com/github/aaronshan/functions/array/UDFArrayValueCount.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java rename to src/main/java/com/github/aaronshan/functions/array/UDFArrayValueCount.java index 4984e15..3b1c643 100644 --- a/src/main/java/cc/shanruifeng/functions/array/UDFArrayValueCount.java +++ b/src/main/java/com/github/aaronshan/functions/array/UDFArrayValueCount.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -67,7 +67,7 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen // Check if the comparison is supported for this type if (!ObjectInspectorUtils.compareSupported(valueOI)) { - throw new UDFArgumentException("The function array_contains" + throw new UDFArgumentException("The function array_value_count" + " does not support comparison for " + "\"" + valueOI.getTypeName() + "\"" + " types"); diff --git a/src/main/java/com/github/aaronshan/functions/array/UDFSequence.java b/src/main/java/com/github/aaronshan/functions/array/UDFSequence.java new file mode 100644 index 0000000..cf1c975 --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/array/UDFSequence.java @@ -0,0 +1,92 @@ +package com.github.aaronshan.functions.array; + +import com.google.common.collect.Lists; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.joda.time.DateTime; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + +import java.util.List; + +import static 
com.github.aaronshan.functions.utils.Failures.checkCondition; +/** + * @author aaron02 + * @date 2018-08-18 上午9:23 + */ +@Description(name = "sequence" + , value = "_FUNC_(start, stop) - Generate a sequence of integers from start to stop.\n" + + "_FUNC_(start, stop, step) - Generate a sequence of integers from start to stop, incrementing by step." + , extended = "Example:\n > select _FUNC_(1, 5) from src;\n > select _FUNC_(1, 9, 4) from src;\n" + + " > select _FUNC_('2016-04-12', '2016-04-14') from src;") +public class UDFSequence extends UDF { + public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); + private static final long MAX_RESULT_ENTRIES = 10000; + + public UDFSequence() { + + } + + public Object evaluate(LongWritable start, LongWritable stop) throws HiveException { + return fixedWidthSequence(start.get(), stop.get(), stop.get() >= start.get() ? 1 : -1, Long.class); + } + + public Object evaluate(LongWritable start, LongWritable stop, LongWritable step) throws HiveException { + return fixedWidthSequence(start.get(), stop.get(), step.get(), Long.class); + } + + public Object evaluate(Text start, Text stop, long step) throws HiveException { + long startMillis = DateTime.parse(start.toString(), DEFAULT_DATE_FORMATTER).getMillis(); + long stopMillis = DateTime.parse(stop.toString(), DEFAULT_DATE_FORMATTER).getMillis(); + return fixedWidthSequence(startMillis, stopMillis, step, String.class); + } + + public static int toIntExact(long value) { + if ((int)value != value) { + throw new ArithmeticException("integer overflow"); + } + return (int)value; + } + + private static Object fixedWidthSequence(long start, long stop, long step, Class type) throws HiveException { + checkValidStep(start, stop, step); + + int length = toIntExact((stop - start) / step + 1L); + checkMaxEntry(length); + + if (type == long.class || type == Long.class) { + List result = Lists.newArrayList(); + for (long i = 0, value = start; i 
< length; ++i, value += step) { + result.add(value); + } + return result; + } else if (type == String.class){ + List result = Lists.newArrayList(); + for (long i = 0, value = start; i < length; ++i, value += step) { + DateTime dateTime = new DateTime(value); + result.add(dateTime.toString(DEFAULT_DATE_FORMATTER)); + } + return result; + } else { + throw new HiveException("Don't support this class type!" + type); + } + } + + private static void checkValidStep(long start, long stop, long step) throws HiveException { + checkCondition( + step != 0, + "step must not be zero"); + checkCondition( + step > 0 ? stop >= start : stop <= start, + "sequence stop value should be greater than or equal to start value if step is greater than zero otherwise stop should be less than or equal to start"); + } + + private static void checkMaxEntry(int length) throws HiveException { + checkCondition( + length <= MAX_RESULT_ENTRIES, + "result of sequence function must not have more than 10000 entries"); + } +} diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitCount.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitCount.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitCount.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitCount.java index b4dbb84..bb39515 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitCount.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitCount.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseAnd.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseAnd.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseAnd.java rename to 
src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseAnd.java index 3df2ce1..acfc013 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseAnd.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseAnd.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseNot.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseNot.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseNot.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseNot.java index a23d238..b50d4b5 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseNot.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseNot.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseOr.java b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseOr.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseOr.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseOr.java index f7fdc8c..8a9bc75 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseOr.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseOr.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseXor.java 
b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseXor.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseXor.java rename to src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseXor.java index 977c03a..1693c4e 100644 --- a/src/main/java/cc/shanruifeng/functions/bitwise/UDFBitwiseXor.java +++ b/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseXor.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardArea.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardArea.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardArea.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardArea.java index d47e1dc..902dc81 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardArea.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardArea.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardBirthday.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardBirthday.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardBirthday.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardBirthday.java index 56d274a..ea43b27 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardBirthday.java +++ 
b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardBirthday.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardCity.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardCity.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardCity.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardCity.java index 807fa6a..31cc99a 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardCity.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardCity.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardGender.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardGender.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardGender.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardGender.java index cba80c9..06d7686 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardGender.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardGender.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import 
com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardInfo.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardInfo.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardInfo.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardInfo.java index 209a941..5a79b4a 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardInfo.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardInfo.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardProvince.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardProvince.java similarity index 87% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardProvince.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardProvince.java index a6fea25..5ffaa55 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardProvince.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardProvince.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git 
a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardValid.java b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardValid.java similarity index 88% rename from src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardValid.java rename to src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardValid.java index 239a094..f6d9d77 100644 --- a/src/main/java/cc/shanruifeng/functions/card/UDFChinaIdCardValid.java +++ b/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardValid.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.card; +package com.github.aaronshan.functions.card; -import cc.shanruifeng.functions.utils.CardUtils; +import com.github.aaronshan.functions.utils.CardUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BooleanWritable; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfWeek.java b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfWeek.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/date/UDFDayOfWeek.java rename to src/main/java/com/github/aaronshan/functions/date/UDFDayOfWeek.java index 982e776..a7034f4 100755 --- a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfWeek.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfWeek.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfYear.java b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfYear.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/date/UDFDayOfYear.java rename to src/main/java/com/github/aaronshan/functions/date/UDFDayOfYear.java index d446f7b..f482355 100644 --- a/src/main/java/cc/shanruifeng/functions/date/UDFDayOfYear.java +++ 
b/src/main/java/com/github/aaronshan/functions/date/UDFDayOfYear.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import java.util.Calendar; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFTypeOfDay.java b/src/main/java/com/github/aaronshan/functions/date/UDFTypeOfDay.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/date/UDFTypeOfDay.java rename to src/main/java/com/github/aaronshan/functions/date/UDFTypeOfDay.java index 9fde811..6dc6851 100755 --- a/src/main/java/cc/shanruifeng/functions/date/UDFTypeOfDay.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFTypeOfDay.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; -import cc.shanruifeng.functions.utils.ConfigUtils; +import com.github.aaronshan.functions.utils.ConfigUtils; import java.util.Calendar; import java.util.Map; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignCn.java b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignCn.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignCn.java rename to src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignCn.java index 7a5bdd2..fecee21 100644 --- a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignCn.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignCn.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignEn.java b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignEn.java similarity index 98% rename from 
src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignEn.java rename to src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignEn.java index 64281bd..db2a0c2 100644 --- a/src/main/java/cc/shanruifeng/functions/date/UDFZodiacSignEn.java +++ b/src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignEn.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; diff --git a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/AbstractIntComparator.java b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/AbstractIntComparator.java similarity index 88% rename from src/main/java/cc/shanruifeng/functions/fastuitl/ints/AbstractIntComparator.java rename to src/main/java/com/github/aaronshan/functions/fastuitl/ints/AbstractIntComparator.java index 3c83a45..972ae97 100644 --- a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/AbstractIntComparator.java +++ b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/AbstractIntComparator.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.fastuitl.ints; +package com.github.aaronshan.functions.fastuitl.ints; // Note: this code was forked from fastutil (http://fastutil.di.unimi.it/) // Copyright (C) 2010-2013 Sebastiano Vigna diff --git a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntArrays.java b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntArrays.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntArrays.java rename to src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntArrays.java index 833ed56..9341566 100644 --- a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntArrays.java +++ b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntArrays.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.fastuitl.ints; +package 
com.github.aaronshan.functions.fastuitl.ints; // Note: this code was forked from fastutil (http://fastutil.di.unimi.it/) // Copyright (C) 2010-2013 Sebastiano Vigna diff --git a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntComparator.java b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntComparator.java similarity index 82% rename from src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntComparator.java rename to src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntComparator.java index 37eba9f..67f9d03 100644 --- a/src/main/java/cc/shanruifeng/functions/fastuitl/ints/IntComparator.java +++ b/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntComparator.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.fastuitl.ints; +package com.github.aaronshan.functions.fastuitl.ints; import java.util.Comparator; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcj.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcj.java similarity index 85% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcj.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcj.java index de9d05a..07b6323 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcj.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcj.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjExtractWgs.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjExtractWgs.java similarity index 85% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjExtractWgs.java rename to 
src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjExtractWgs.java index 34dca23..4d0fee4 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjExtractWgs.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjExtractWgs.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToBd.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToBd.java similarity index 84% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToBd.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToBd.java index a1c422a..e2eee04 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToBd.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToBd.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToWgs.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToWgs.java similarity index 84% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToWgs.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToWgs.java index 10278ae..e019a49 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoGcjToWgs.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToWgs.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import 
com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsDistance.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsDistance.java similarity index 86% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsDistance.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsDistance.java index 000bb3b..0ed7237 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsDistance.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsDistance.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.serde2.io.DoubleWritable; diff --git a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsToGcj.java b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsToGcj.java similarity index 84% rename from src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsToGcj.java rename to src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsToGcj.java index 1081356..f2e4eb3 100644 --- a/src/main/java/cc/shanruifeng/functions/geo/UDFGeoWgsToGcj.java +++ b/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsToGcj.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; -import cc.shanruifeng.functions.utils.GeoUtils; +import com.github.aaronshan.functions.utils.GeoUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtract.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtract.java similarity index 93% rename from 
src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtract.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtract.java index 266245f..f2ca590 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtract.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtract.java @@ -1,8 +1,8 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonUtils; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtractScalar.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtractScalar.java similarity index 93% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtractScalar.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtractScalar.java index 713bee0..bf82984 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayExtractScalar.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtractScalar.java @@ -1,8 +1,8 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonUtils; import java.util.ArrayList; import 
org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayGet.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayGet.java similarity index 88% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayGet.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayGet.java index 8987b82..2c1eb91 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayGet.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayGet.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayLength.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayLength.java similarity index 89% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayLength.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayLength.java index 4bfa91b..284b368 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonArrayLength.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayLength.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonUtils; +import com.github.aaronshan.functions.utils.json.JsonUtils; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.LongWritable; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtract.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtract.java similarity 
index 84% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonExtract.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonExtract.java index 3f57c7e..6be5cf8 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtract.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtract.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtractScalar.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtractScalar.java similarity index 85% rename from src/main/java/cc/shanruifeng/functions/json/UDFJsonExtractScalar.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonExtractScalar.java index 6af6884..ab51588 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonExtractScalar.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtractScalar.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; diff --git a/src/main/java/cc/shanruifeng/functions/json/UDFJsonSize.java b/src/main/java/com/github/aaronshan/functions/json/UDFJsonSize.java similarity index 87% rename from 
src/main/java/cc/shanruifeng/functions/json/UDFJsonSize.java rename to src/main/java/com/github/aaronshan/functions/json/UDFJsonSize.java index 4ed1dd2..7286413 100644 --- a/src/main/java/cc/shanruifeng/functions/json/UDFJsonSize.java +++ b/src/main/java/com/github/aaronshan/functions/json/UDFJsonSize.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.json; +package com.github.aaronshan.functions.json; -import cc.shanruifeng.functions.utils.json.JsonExtract; -import cc.shanruifeng.functions.utils.json.JsonPath; +import com.github.aaronshan.functions.utils.json.JsonExtract; +import com.github.aaronshan.functions.utils.json.JsonPath; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapBuild.java b/src/main/java/com/github/aaronshan/functions/map/UDFMapBuild.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapBuild.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapBuild.java index 5446550..f96b3e4 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapBuild.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapBuild.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; import java.util.LinkedHashMap; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapConcat.java b/src/main/java/com/github/aaronshan/functions/map/UDFMapConcat.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapConcat.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapConcat.java index 6a257d8..8b4acf7 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapConcat.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapConcat.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.map; +package 
com.github.aaronshan.functions.map; import java.util.LinkedHashMap; import java.util.Map; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapElementAt.java b/src/main/java/com/github/aaronshan/functions/map/UDFMapElementAt.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapElementAt.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapElementAt.java index cd9f12e..56627ad 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapElementAt.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapElementAt.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; import java.util.Map; import org.apache.hadoop.hive.ql.exec.Description; diff --git a/src/main/java/cc/shanruifeng/functions/map/UDFMapEquals.java b/src/main/java/com/github/aaronshan/functions/map/UDFMapEquals.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/map/UDFMapEquals.java rename to src/main/java/com/github/aaronshan/functions/map/UDFMapEquals.java index 0fead58..97cf2f2 100644 --- a/src/main/java/cc/shanruifeng/functions/map/UDFMapEquals.java +++ b/src/main/java/com/github/aaronshan/functions/map/UDFMapEquals.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import java.util.Map; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarity.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarity.java new file mode 100644 index 0000000..e0acea6 --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarity.java @@ -0,0 +1,151 @@ +package com.github.aaronshan.functions.math; + +import 
org.apache.hadoop.hive.ql.exec.*; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import java.util.Map; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "cosine_similarity" + , value = "_FUNC_(map(varchar,double), map(varchar,double)) - cosine similarity between the given sparse vectors." + , extended = "Example:\n > select _FUNC_(map(varchar,double), map(varchar,double)) from src;") +public class UDFMathCosineSimilarity extends GenericUDF { + private static final int ARG_COUNT = 2; // Number of arguments to this UDF + private transient MapObjectInspector leftMapOI; + private transient MapObjectInspector rightMapOI; + + public UDFMathCosineSimilarity() { + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function cosine_similarity(map, map) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of category LIST + for (int i = 0; i < 2; i++) { + if (!arguments[i].getCategory().equals(ObjectInspector.Category.MAP)) { + throw new UDFArgumentTypeException(i, + "\"" + serdeConstants.MAP_TYPE_NAME + "\" " + + "expected at function cosine_similarity, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + leftMapOI = (MapObjectInspector) 
arguments[0]; + rightMapOI = (MapObjectInspector) arguments[1]; + + ObjectInspector leftMapKeyOI = leftMapOI.getMapKeyObjectInspector(); + ObjectInspector leftMapValueOI = leftMapOI.getMapValueObjectInspector(); + ObjectInspector rightMapKeyOI = rightMapOI.getMapKeyObjectInspector(); + ObjectInspector rightMapValueOI = rightMapOI.getMapValueObjectInspector(); + + // Check if two map are of same key and value type + if (!ObjectInspectorUtils.compareTypes(leftMapKeyOI, rightMapKeyOI)) { + throw new UDFArgumentTypeException(1, + "\"" + leftMapKeyOI.getTypeName() + "\"" + + " expected at function cosine_similarity key, but " + + "\"" + rightMapKeyOI.getTypeName() + "\"" + + " is found"); + } + + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, leftMapKeyOI)) { + throw new UDFArgumentTypeException(1, + "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\"" + + " expected at function cosine_similarity key, but " + + "\"" + leftMapKeyOI.getTypeName() + "\"" + + " is found"); + } + + if (!ObjectInspectorUtils.compareTypes(leftMapValueOI, rightMapValueOI)) { + throw new UDFArgumentTypeException(1, + "\"" + leftMapValueOI.getTypeName() + "\"" + + " expected at function cosine_similarity value, but " + + "\"" + rightMapValueOI.getTypeName() + "\"" + + " is found"); + } + + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector, leftMapValueOI)) { + throw new UDFArgumentTypeException(1, + "\"" + PrimitiveObjectInspectorFactory.javaDoubleObjectInspector.getTypeName() + "\"" + + " expected at function cosine_similarity value, but " + + "\"" + leftMapValueOI.getTypeName() + "\"" + + " is found"); + } + + return ObjectInspectorFactory.getStandardMapObjectInspector(leftMapKeyOI, leftMapValueOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object leftMapObj = arguments[0].get(); + Object rightMapObj = 
arguments[1].get(); + + if (leftMapObj == null || rightMapObj == null) { + return null; + } + + Map leftMap = leftMapOI.getMap(leftMapObj); + Map rightMap = leftMapOI.getMap(rightMapObj); + + Double normLeftMap = mapL2Norm(leftMap); + Double normRightMap = mapL2Norm(rightMap); + + if (normLeftMap == null || normRightMap == null) { + return null; + } + + double dotProduct = mapDotProduct(leftMap, rightMap); + return new DoubleWritable(dotProduct / (normLeftMap * normRightMap)); + } + + private double mapDotProduct(Map leftMap, Map rightMap) { + double result = 0.0; + + for (Map.Entry entry : rightMap.entrySet()) { + if (leftMap.containsKey(entry.getKey())) { + Double leftValue = (Double) leftMap.get(entry.getKey()); + Double rightValue = (Double) entry.getValue(); + result += leftValue * rightValue; + } + } + + return result; + } + + private Double mapL2Norm(Map map) { + double norm = 0.0; + for (Map.Entry entry : map.entrySet()) { + if (entry.getValue() == null) { + return null; + } + + Double value = (Double) entry.getValue(); + norm += value * value; + } + + return Math.sqrt(norm); + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "cosine_similarity(" + strings[0] + ", " + + strings[1] + ")"; + } +} diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java new file mode 100644 index 0000000..d7313bf --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java @@ -0,0 +1,39 @@ +package com.github.aaronshan.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import static com.github.aaronshan.functions.utils.MathUtils.checkRadix; +import static java.lang.String.format; + 
+/** Hive UDF from_base: parses a string as an integer written in the given radix and returns it as a long. + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "from_base" + , value = "_FUNC_(string, long) - parse the string as a number in the given base and return it as a bigint." + , extended = "Example:\n > select _FUNC_(string, long) from src;") +public class UDFMathFromBase extends UDF { + private LongWritable result = new LongWritable(); + + public UDFMathFromBase() { + } + + public LongWritable evaluate(Text value, LongWritable radix) throws HiveException { /* Returns null when either argument is null; throws HiveException when value is not a valid number in that radix. */ + if (value == null || radix == null) { + return null; + } + + checkRadix(radix.get()); + try { + result.set(Long.parseLong(value.toString(), (int) radix.get())); + } + catch (NumberFormatException e) { + throw new HiveException(format("Not a valid base-%d number: %s", radix.get(), value.toString()), e); /* %d needs the primitive long; passing the LongWritable itself makes format() throw */ + } + return result; + } +} diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java new file mode 100644 index 0000000..bf6b0c0 --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java @@ -0,0 +1,24 @@ +package com.github.aaronshan.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "infinity" + , value = "_FUNC_() - Infinity." 
+ , extended = "Example:\n > select _FUNC_() from src;") +public class UDFMathInfinity extends UDF { /* Hive UDF infinity(): returns the double constant positive infinity. */ + private DoubleWritable result = new DoubleWritable(); /* holder instance handed back by every evaluate() call */ + + public UDFMathInfinity() { + } + + public DoubleWritable evaluate() { /* Stores Double.POSITIVE_INFINITY in the shared holder and returns that holder. */ + result.set(Double.POSITIVE_INFINITY); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java new file mode 100644 index 0000000..c192549 --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java @@ -0,0 +1,32 @@ +package com.github.aaronshan.functions.math; + +import org.apache.commons.math3.special.Erf; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.DoubleWritable; + +import static com.github.aaronshan.functions.utils.Failures.checkCondition; + +/** + * @author ruifeng.shan + * @date 2018-07-26 + * @time 23:04 + */ +@Description(name = "inverse_normal_cdf" + , value = "_FUNC_(mean, sd, p) - inverse of normal cdf given a mean, std, and probability." 
+ , extended = "Example:\n > select _FUNC_(mean, sd, p) from src;") +public class UDFMathInverseNormalCdf extends UDF { /* Hive UDF inverse_normal_cdf(mean, sd, p): quantile of the normal distribution with the given mean and standard deviation. */ + private DoubleWritable result = new DoubleWritable(); + + public UDFMathInverseNormalCdf() { + } + + public DoubleWritable evaluate(double mean, double sd, double p) throws HiveException { /* Requires p strictly between 0 and 1 and sd above 0; checkCondition is a project helper, presumably throwing HiveException when a check fails (confirm in Failures). */ + checkCondition(p > 0 && p < 1, "p must be 0 < p < 1"); + checkCondition(sd > 0, "sd must > 0"); + + result.set(mean + sd * 1.4142135623730951 * Erf.erfInv(2 * p - 1)); /* the literal 1.4142135623730951 is sqrt(2) */ + return result; + } +} \ No newline at end of file diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java new file mode 100644 index 0000000..597906e --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java @@ -0,0 +1,29 @@ +package com.github.aaronshan.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "is_finite" + , value = "_FUNC_(double) - test if value is finite." 
+ , extended = "Example:\n > select _FUNC_(double) from src;") +public class UDFMathIsFinite extends UDF { /* Hive UDF is_finite(double): reports whether the argument is a finite number. */ + BooleanWritable result = new BooleanWritable(); /* holder instance handed back by every evaluate() call */ + + public UDFMathIsFinite() { + } + + public BooleanWritable evaluate(DoubleWritable num) { /* Returns the shared holder set to Double.isFinite of the argument. */ + if (num == null) { /* a null argument yields false, not null */ + result.set(false); + } else { + result.set(Double.isFinite(num.get())); + } + return result; + } +} \ No newline at end of file diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java new file mode 100644 index 0000000..84f8abe --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java @@ -0,0 +1,29 @@ +package com.github.aaronshan.functions.math; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.DoubleWritable; + +/** + * @author ruifeng.shan + * @date 18-7-23 + */ +@Description(name = "is_infinite" + , value = "_FUNC_(double) - test if value is infinite." 
+        , extended = "Example:\n > select _FUNC_(double) from src;")
+public class UDFMathIsInfinite extends UDF {
+    BooleanWritable result = new BooleanWritable();
+
+    public UDFMathIsInfinite() {}
+
+    /** Tests whether {@code num} is positive or negative infinity; a NULL argument yields false, not NULL. */
+    public BooleanWritable evaluate(DoubleWritable num) {
+        boolean infinite = false;
+        if (num != null) {
+            infinite = Double.isInfinite(num.get());
+        }
+        result.set(infinite);
+        return result;
+    }
+}
diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java
new file mode 100644
index 0000000..efef890
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java
@@ -0,0 +1,29 @@
+package com.github.aaronshan.functions.math;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.DoubleWritable;
+
+/**
+ * @author ruifeng.shan
+ * @date 18-7-23
+ */
+@Description(name = "is_nan"
+        , value = "_FUNC_(double) - test if value is nan."
+        , extended = "Example:\n > select _FUNC_(double) from src;")
+public class UDFMathIsNaN extends UDF {
+    BooleanWritable result = new BooleanWritable();
+
+    public UDFMathIsNaN() {}
+
+    /** Tests whether {@code num} is NaN; a NULL argument yields false, not NULL. */
+    public BooleanWritable evaluate(DoubleWritable num) {
+        boolean notANumber = false;
+        if (num != null) {
+            notANumber = Double.isNaN(num.get());
+        }
+        result.set(notANumber);
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java
new file mode 100644
index 0000000..9d6d55f
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java
@@ -0,0 +1,24 @@
+package com.github.aaronshan.functions.math;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.io.DoubleWritable;
+
+/**
+ * @author ruifeng.shan
+ * @date 18-7-23
+ */
+@Description(name = "NaN"
+        , value = "_FUNC_() - constant representing not-a-number."
+        , extended = "Example:\n > select _FUNC_() from src;")
+public class UDFMathNaN extends UDF {
+    private DoubleWritable result = new DoubleWritable();
+
+    public UDFMathNaN() {}
+
+    public DoubleWritable evaluate() { // takes no arguments; always answers Double.NaN
+        final double notANumber = Double.NaN;
+        result.set(notANumber);
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java
new file mode 100644
index 0000000..a6cc43a
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java
@@ -0,0 +1,30 @@
+package com.github.aaronshan.functions.math;
+
+import org.apache.commons.math3.special.Erf;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.DoubleWritable;
+
+import static com.github.aaronshan.functions.utils.Failures.checkCondition;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-26
+ * @time 23:03
+ */
+@Description(name = "normal_cdf"
+        , value = "_FUNC_(mean, sd, v) - normal cdf given a mean, standard deviation, and value."
+        , extended = "Example:\n > select _FUNC_(mean, sd, v) from src;")
+public class UDFMathNormalCdf extends UDF {
+    private DoubleWritable result = new DoubleWritable();
+
+    public UDFMathNormalCdf() {}
+
+    public DoubleWritable evaluate(double mean, double standardDeviation, double value) throws HiveException {
+        checkCondition(standardDeviation > 0, "standardDeviation must > 0");
+        final double scaled = (value - mean) / (standardDeviation * Math.sqrt(2)); // argument handed to erf
+        result.set(0.5 * (1 + Erf.erf(scaled)));
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java b/src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java
new file mode 100644
index 0000000..0368843
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java
@@ -0,0 +1,33 @@
+package com.github.aaronshan.functions.math;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+import static com.github.aaronshan.functions.utils.MathUtils.checkRadix;
+
+/**
+ * @author ruifeng.shan
+ * @date 18-7-23
+ */
+@Description(name = "to_base"
+        , value = "_FUNC_(long, long) - convert a number to a string in the given base."
+        , extended = "Example:\n > select _FUNC_(long, long) from src;")
+public class UDFMathToBase extends UDF {
+    private Text result = new Text();
+
+    public UDFMathToBase() {}
+
+    /** Renders {@code value} as text in base {@code radix}; a NULL in either argument gives NULL. */
+    public Text evaluate(LongWritable value, LongWritable radix) throws HiveException {
+        if (value == null || radix == null) {
+            return null;
+        }
+        final long base = radix.get();
+        checkRadix(base); // range validation is delegated to MathUtils.checkRadix
+        result.set(Long.toString(value.get(), (int) base));
+        return result;
+    }
+}
diff --git a/src/main/java/cc/shanruifeng/functions/model/ChinaIdArea.java b/src/main/java/com/github/aaronshan/functions/model/ChinaIdArea.java
similarity index 91%
rename from src/main/java/cc/shanruifeng/functions/model/ChinaIdArea.java
rename to src/main/java/com/github/aaronshan/functions/model/ChinaIdArea.java
index b45684f..9d9e145 100644
--- a/src/main/java/cc/shanruifeng/functions/model/ChinaIdArea.java
+++ b/src/main/java/com/github/aaronshan/functions/model/ChinaIdArea.java
@@ -1,4 +1,4 @@
-package cc.shanruifeng.functions.model;
+package com.github.aaronshan.functions.model;
 
 /**
  * @author ruifeng.shan
diff --git a/src/main/java/com/github/aaronshan/functions/regexp/Re2JRegexp.java b/src/main/java/com/github/aaronshan/functions/regexp/Re2JRegexp.java
new file mode 100644
index 0000000..585fae1
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/regexp/Re2JRegexp.java
@@ -0,0 +1,148 @@
+package com.github.aaronshan.functions.regexp;
+
+import com.google.common.collect.Lists;
+import com.google.re2j.Matcher;
+import com.google.re2j.Options;
+import com.google.re2j.Pattern;
+import io.airlift.slice.Slice;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.re2j.Options.Algorithm.DFA_FALLBACK_TO_NFA;
+import static java.lang.String.format;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-27
+ * @time 22:20
+ */
+public final class
Re2JRegexp {
+    private static final Logger log = LoggerFactory.getLogger(Re2JRegexp.class);
+
+    private static final java.util.regex.Pattern DOT_STAR_PREFIX_PATTERN = java.util.regex.Pattern.compile("(?s)^(\\.\\*\\??)?(.*)");
+    private static final int CORE_PATTERN_INDEX = 2;
+
+    public final int dfaStatesLimit;
+    public final int dfaRetries;
+
+    public final Pattern re2jPattern;
+    public final Pattern re2jPatternWithoutDotStartPrefix;
+
+    public Re2JRegexp(int dfaStatesLimit, int dfaRetries, Slice pattern) {
+        this.dfaStatesLimit = dfaStatesLimit;
+        this.dfaRetries = dfaRetries;
+
+        Options options = Options.builder()
+                .setAlgorithm(DFA_FALLBACK_TO_NFA)
+                .setMaximumNumberOfDFAStates(dfaStatesLimit)
+                .setNumberOfDFARetries(dfaRetries)
+                .setEventsListener(new RE2JEventsListener())
+                .build();
+
+        String patternString = pattern.toStringUtf8();
+        re2jPattern = Pattern.compile(patternString, options);
+
+        // Remove .*? prefix. DFA has optimization which does fast lookup for first byte of a potential match.
+        // When pattern is prefixed with .*? this optimization doesn't work in Pattern.find() function.
+        java.util.regex.Matcher dotStarPrefixMatcher = DOT_STAR_PREFIX_PATTERN.matcher(patternString);
+        checkState(dotStarPrefixMatcher.matches());
+        String patternStringWithoutDotStartPrefix = dotStarPrefixMatcher.group(CORE_PATTERN_INDEX);
+
+        if (!patternStringWithoutDotStartPrefix.equals(patternString)) {
+            re2jPatternWithoutDotStartPrefix = Pattern.compile(patternStringWithoutDotStartPrefix, options);
+        } else {
+            re2jPatternWithoutDotStartPrefix = re2jPattern;
+        }
+    }
+
+    /** Rejects group numbers outside [0, groupCount]; takes a long so the check runs before any narrowing cast. */
+    private static void validateGroup(long group, int groupCount) throws HiveException {
+        if (group < 0) {
+            throw new HiveException("Group cannot be negative");
+        }
+        if (group > groupCount) {
+            throw new HiveException(format("Pattern has %d groups. Cannot access group %d", groupCount, group));
+        }
+    }
+
+    /** Returns true when the pattern is found anywhere in {@code source} (a search, not a full match). */
+    public boolean matches(Slice source) {
+        return re2jPatternWithoutDotStartPrefix.find(source);
+    }
+
+    /** Replaces every match in {@code source} with {@code replacement}. */
+    public Slice replace(Slice source, Slice replacement) throws HiveException {
+        Matcher matcher = re2jPattern.matcher(source);
+        try {
+            return matcher.replaceAll(replacement);
+        } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
+            // Both are reported to the caller as the same HiveException; the original exception is kept as the cause.
+            throw new HiveException("Illegal replacement sequence: " + replacement.toStringUtf8(), e);
+        }
+    }
+
+    public static int toIntExact(long value) {
+        if ((int)value != value) {
+            throw new ArithmeticException("integer overflow");
+        }
+        return (int)value;
+    }
+
+    /** Collects the text of group {@code groupIndex} for every match found in {@code source}. */
+    public List extractAll(Slice source, long groupIndex) throws HiveException {
+        Matcher matcher = re2jPattern.matcher(source);
+        validateGroup(groupIndex, matcher.groupCount());
+        int group = (int) groupIndex; // lossless: validateGroup bounded it by an int group count
+
+        List list = Lists.newArrayList();
+        while (matcher.find()) {
+            Slice searchedGroup = matcher.group(group);
+            // A null group is kept as a null element so the result has one entry per match.
+            // toStringUtf8() decodes the matched bytes; Slice.toString() only describes the Slice object.
+            list.add(searchedGroup == null ? null : searchedGroup.toStringUtf8());
+        }
+        return list;
+    }
+
+    /** Returns group {@code groupIndex} of the first match, or null when the pattern does not match. */
+    public Slice extract(Slice source, long groupIndex) throws HiveException {
+        Matcher matcher = re2jPattern.matcher(source);
+        validateGroup(groupIndex, matcher.groupCount());
+        int group = (int) groupIndex; // lossless: validateGroup bounded it by an int group count
+
+        if (!matcher.find()) {
+            return null;
+        }
+
+        return matcher.group(group);
+    }
+
+    /** Splits {@code source} around every match; the text after the last match is always the final element. */
+    public List split(Slice source) {
+        Matcher matcher = re2jPattern.matcher(source);
+        List list = Lists.newArrayList();
+
+        int lastEnd = 0;
+        while (matcher.find()) {
+            Slice slice = source.slice(lastEnd, matcher.start() - lastEnd);
+            lastEnd = matcher.end();
+            list.add(slice.toStringUtf8());
+        }
+
+        list.add(source.slice(lastEnd, source.length() - lastEnd).toStringUtf8());
+        return list;
+    }
+
+    /** Logs when the matcher falls back from the DFA to the NFA; SLF4J substitutes {} placeholders, not %s/%d. */
+    private class RE2JEventsListener
+            implements Options.EventsListener {
+        @Override
+        public void fallbackToNFA() {
+            log.debug("Fallback to NFA, pattern: {}, DFA states limit: {}, DFA retries: {}", re2jPattern.pattern(), dfaStatesLimit, dfaRetries);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java
new file mode 100644
index 0000000..5041729
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java
@@ -0,0 +1,42 @@
+package com.github.aaronshan.functions.regexp;
+
+import io.airlift.slice.Slices;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-27
+ * @time 22:38
+ */
+@Description(name = "regexp_extract"
+        , value = "_FUNC_(string, string) - returns substrings matching a regular expression."
+        , extended = "Example:\n > select _FUNC_(string, pattern) from src;")
+public class UDFRe2JRegexpExtract extends UDF {
+    private transient Re2JRegexp re2JRegexp; // compiled form of the most recently used pattern (per instance, not static)
+    private Text result = new Text();
+
+    public UDFRe2JRegexpExtract() {
+    }
+
+    public Text evaluate(Text source, Text pattern) throws HiveException {
+        return evaluate(source, pattern, new LongWritable(0)); // no group given: use group 0
+    }
+
+    public Text evaluate(Text source, Text pattern, LongWritable groupIndex) throws HiveException {
+        if (source == null || pattern == null || groupIndex == null) {
+            return null;
+        }
+        String patternString = pattern.toString();
+        if (re2JRegexp == null || !patternString.equals(re2JRegexp.re2jPattern.pattern())) { // recompile only on change
+            re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(patternString));
+        }
+        io.airlift.slice.Slice match = re2JRegexp.extract(Slices.utf8Slice(source.toString()), groupIndex.get());
+        if (match == null) { return null; } // extract() gave nothing back: answer NULL instead of dereferencing null
+        result.set(match.toStringUtf8());
+        return result;
+    }
+}
diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java
new file mode 100644
index 0000000..fa03a5b
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java
@@ -0,0 +1,100 @@
+package com.github.aaronshan.functions.regexp;
+
+import io.airlift.slice.Slices;
+import java.util.ArrayList;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-27
+ * @time 22:38
+ */
+@Description(name = "regexp_extract_all"
+        , value = "_FUNC_(string, string) - string(s) extracted using the given pattern\n" +
+        "_FUNC_(string, string, long) - group(s) extracted using the given pattern."
+        , extended = "Example:\n > select _FUNC_(string, pattern) from src;")
+public class UDFRe2JRegexpExtractAll extends GenericUDF {
+    private transient ArrayList result = new ArrayList();
+    private transient Re2JRegexp re2JRegexp; // compiled form of the most recently seen pattern
+    // One converter per argument, so evaluate() receives String/Long whatever representation the caller's inspectors use.
+    private transient org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter[] converters;
+
+    public UDFRe2JRegexpExtractAll() {
+    }
+
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+        // Accept exactly (string, pattern) or (string, pattern, group); every other arity is rejected.
+        if (arguments.length != 2 && arguments.length != 3) {
+            throw new UDFArgumentLengthException(
+                    "The function regexp_extract_all takes exactly 2 or 3 arguments.");
+        }
+
+        for (int i = 0; i < 2; i++) {
+            if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) {
+                throw new UDFArgumentTypeException(i,
+                        "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " +
+                        "expected at function regexp_extract_all, but " +
+                        "\"" + arguments[i].getTypeName() + "\" " +
+                        "is found");
+            }
+        }
+
+        if (arguments.length == 3) {
+            if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaLongObjectInspector, arguments[2])) {
+                throw new UDFArgumentTypeException(2,
+                        "\"" + PrimitiveObjectInspectorFactory.javaLongObjectInspector.getTypeName() + "\" " +
+                        "expected at function regexp_extract_all, but " +
+                        "\"" + arguments[2].getTypeName() + "\" " +
+                        "is found");
+            }
+        }
+
+        converters = new org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter[arguments.length];
+        for (int i = 0; i < arguments.length; i++) {
+            ObjectInspector target = i < 2 ? PrimitiveObjectInspectorFactory.javaStringObjectInspector : PrimitiveObjectInspectorFactory.javaLongObjectInspector;
+            converters[i] = org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.getConverter(arguments[i], target);
+        }
+        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+    }
+
+    @Override
+    public Object evaluate(DeferredObject[] arguments) throws HiveException {
+        String source = (String) converters[0].convert(arguments[0].get());
+        String pattern = (String) converters[1].convert(arguments[1].get());
+        Long groupIndex = arguments.length == 3 ? (Long) converters[2].convert(arguments[2].get()) : Long.valueOf(0L);
+        // A NULL in any argument yields NULL rather than a NullPointerException.
+        if (source == null || pattern == null || groupIndex == null) {
+            return null;
+        }
+        // The pattern may change between rows, so recompile whenever it differs from the cached one.
+        if (re2JRegexp == null || !pattern.equals(re2JRegexp.re2jPattern.pattern())) {
+            re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern));
+        }
+
+        result.clear();
+        result.addAll(re2JRegexp.extractAll(Slices.utf8Slice(source), groupIndex));
+        return result;
+    }
+
+    @Override
+    public String getDisplayString(String[] strings) {
+        assert (strings.length == 2 || strings.length == 3);
+        if (strings.length == 2) {
+            return "regexp_extract_all(" + strings[0] + ", " +
+                    strings[1] + ")";
+        } else {
+            return "regexp_extract_all(" + strings[0] + ", " +
+                    strings[1] + ", " + strings[2] + ")";
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java
new file mode 100644
index 0000000..683e6f8
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java
@@ -0,0 +1,35 @@
+package com.github.aaronshan.functions.regexp;
+
+import io.airlift.slice.Slices;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.io.Text;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-27
+ * @time 22:36
+ */
+@Description(name = "regexp_like"
+        , value = "_FUNC_(string, string) - returns whether the pattern is contained within the string."
+        , extended = "Example:\n > select _FUNC_(string, pattern) from src;")
+public class UDFRe2JRegexpLike extends UDF {
+    private transient Re2JRegexp re2JRegexp; // compiled form of the most recently used pattern (per instance, not static)
+
+    public UDFRe2JRegexpLike() {
+    }
+
+    /** Returns true when {@code pattern} is found in {@code text}; a NULL in either argument yields false. */
+    public boolean evaluate(Text text, Text pattern) {
+        if (text == null || pattern == null) {
+            return false;
+        }
+
+        String patternString = pattern.toString();
+        if (re2JRegexp == null || !patternString.equals(re2JRegexp.re2jPattern.pattern())) { // recompile only on change
+            re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(patternString));
+        }
+
+        return re2JRegexp.matches(Slices.utf8Slice(text.toString()));
+    }
+}
diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java
new file mode 100644
index 0000000..6c74f52
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java
@@ -0,0 +1,44 @@
+package com.github.aaronshan.functions.regexp;
+
+import io.airlift.slice.Slices;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.Text;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-27
+ * @time 22:37
+ */
+@Description(name = "regexp_replace"
+        , value = "_FUNC_(string, string) - removes substrings matching a regular expression\n" +
+        "_FUNC_(string, string, string) - replaces substrings matching a regular expression by given string."
+        , extended = "Example:\n > select _FUNC_(string, pattern) from src;\n" +
+        "select _FUNC_(string, pattern, replacement) from src;")
+public class UDFRe2JRegexpReplace extends UDF {
+    private transient Re2JRegexp re2JRegexp; // compiled form of the most recently used pattern (per instance, not static)
+    private Text result = new Text();
+
+    public UDFRe2JRegexpReplace() {
+    }
+
+    /** Two-argument form: removes every match by replacing it with the empty string. */
+    public Text evaluate(Text source, Text pattern) throws HiveException {
+        return evaluate(source, pattern, new Text(Slices.EMPTY_SLICE.toStringUtf8()));
+    }
+
+    /** Replaces every match of {@code pattern} in {@code source}; a NULL in any argument yields NULL. */
+    public Text evaluate(Text source, Text pattern, Text replacement) throws HiveException {
+        if (source == null || pattern == null || replacement == null) {
+            return null;
+        }
+
+        String patternString = pattern.toString();
+        if (re2JRegexp == null || !patternString.equals(re2JRegexp.re2jPattern.pattern())) { // recompile only on change
+            re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(patternString));
+        }
+        result.set(re2JRegexp.replace(Slices.utf8Slice(source.toString()), Slices.utf8Slice(replacement.toString())).toStringUtf8());
+        return result;
+    }
+}
diff --git a/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java
new file mode 100644
index 0000000..1022a8f
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java
@@ -0,0 +1,81 @@
+package com.github.aaronshan.functions.regexp;
+
+import io.airlift.slice.Slices;
+import java.util.ArrayList;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-27
+ * @time 22:38
+ */
+@Description(name = "regexp_split"
+        , value = "_FUNC_(string, string) - returns array of strings split by pattern."
+        , extended = "Example:\n > select _FUNC_(string, pattern) from src;")
+public class UDFRe2JRegexpSplit extends GenericUDF {
+    private static final int ARG_COUNT = 2;
+    private transient ArrayList result = new ArrayList();
+    private transient Re2JRegexp re2JRegexp; // compiled form of the most recently seen pattern
+    // One converter per argument, so evaluate() receives String whatever representation the caller's inspectors use.
+    private transient org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter[] converters;
+
+    public UDFRe2JRegexpSplit() {
+    }
+
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+        // Exactly (string, pattern) is accepted.
+        if (arguments.length != ARG_COUNT) {
+            throw new UDFArgumentLengthException(
+                    "The function regexp_split(string, pattern) takes exactly " + ARG_COUNT + " arguments.");
+        }
+
+        converters = new org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter[ARG_COUNT];
+        for (int i = 0; i < ARG_COUNT; i++) {
+            if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) {
+                throw new UDFArgumentTypeException(i,
+                        "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " +
+                        "expected at function regexp_split, but " +
+                        "\"" + arguments[i].getTypeName() + "\" " +
+                        "is found");
+            }
+            converters[i] = org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.getConverter(arguments[i], PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+        }
+
+        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+    }
+
+    @Override
+    public Object evaluate(DeferredObject[] arguments) throws HiveException {
+        String source = (String) converters[0].convert(arguments[0].get());
+        String pattern = (String) converters[1].convert(arguments[1].get());
+        // A NULL in either argument yields NULL rather than a NullPointerException.
+        if (source == null || pattern == null) {
+            return null;
+        }
+
+        // The pattern may change between rows, so recompile whenever it differs from the cached one.
+        if (re2JRegexp == null || !pattern.equals(re2JRegexp.re2jPattern.pattern())) {
+            re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern));
+        }
+
+        result.clear();
+        result.addAll(re2JRegexp.split(Slices.utf8Slice(source)));
+        return result;
+    }
+
+    @Override
+    public String getDisplayString(String[] strings) {
+        assert (strings.length == ARG_COUNT);
+        return "regexp_split(" + strings[0] + ", " + strings[1] + ")";
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFChineseToPinYin.java b/src/main/java/com/github/aaronshan/functions/string/UDFChineseToPinYin.java
similarity index 97%
rename from src/main/java/cc/shanruifeng/functions/string/UDFChineseToPinYin.java
rename to src/main/java/com/github/aaronshan/functions/string/UDFChineseToPinYin.java
index b468748..66ddb11 100755
--- a/src/main/java/cc/shanruifeng/functions/string/UDFChineseToPinYin.java
+++ b/src/main/java/com/github/aaronshan/functions/string/UDFChineseToPinYin.java
@@ -1,4 +1,4 @@
-package cc.shanruifeng.functions.string;
+package com.github.aaronshan.functions.string;
 
 import net.sourceforge.pinyin4j.PinyinHelper;
 import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java b/src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java
new file mode 100644
index 0000000..bd0324f
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java
@@ -0,0 +1,46 @@
+package com.github.aaronshan.functions.string;
+
+import io.airlift.slice.Slice;
+import io.airlift.slice.Slices;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+import static com.github.aaronshan.functions.utils.Failures.checkCondition;
+import static io.airlift.slice.SliceUtf8.getCodePointAt;
+import static io.airlift.slice.SliceUtf8.countCodePoints;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-26
+ * @time 23:23
+ */
+@Description(name = "codepoint"
+        , value = "_FUNC_(string) - returns Unicode code point of a single character string."
+        , extended = "Example:\n > select _FUNC_(string) from src;")
+public class UDFCodePoint extends UDF {
+    private LongWritable result = new LongWritable();
+
+    public UDFCodePoint() {}
+
+    /**
+     * Returns the Unicode code point of a string that consists of exactly one character.
+     *
+     * @param text the one-character string; may be null
+     * @return the code point, or null when {@code text} is null
+     * @throws HiveException presumably raised by checkCondition when the string is not one code point long
+     */
+    public LongWritable evaluate(Text text) throws HiveException {
+        if (text == null) {
+            return null;
+        }
+        final Slice utf8 = Slices.utf8Slice(text.toString());
+        final boolean singleCharacter = countCodePoints(utf8) == 1;
+        checkCondition(singleCharacter, "Input string must be a single character string");
+        final int codePoint = getCodePointAt(utf8, 0);
+        result.set(codePoint);
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFMd5.java b/src/main/java/com/github/aaronshan/functions/string/UDFMd5.java
similarity index 94%
rename from src/main/java/cc/shanruifeng/functions/string/UDFMd5.java
rename to src/main/java/com/github/aaronshan/functions/string/UDFMd5.java
index 807b6bf..dd9060a 100644
--- a/src/main/java/cc/shanruifeng/functions/string/UDFMd5.java
+++ b/src/main/java/com/github/aaronshan/functions/string/UDFMd5.java
@@ -1,4 +1,4 @@
-package cc.shanruifeng.functions.string;
+package com.github.aaronshan.functions.string;
 
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.hadoop.hive.ql.exec.Description;
diff --git a/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java b/src/main/java/com/github/aaronshan/functions/string/UDFSha256.java
similarity index 85%
rename from src/main/java/cc/shanruifeng/functions/string/UDFSha256.java
rename to src/main/java/com/github/aaronshan/functions/string/UDFSha256.java
index df4d05d..b8e942f 100644
--- a/src/main/java/cc/shanruifeng/functions/string/UDFSha256.java
+++ b/src/main/java/com/github/aaronshan/functions/string/UDFSha256.java
@@ -1,7 +1,8 @@
-package cc.shanruifeng.functions.string;
+package com.github.aaronshan.functions.string;
 
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.hadoop.hive.ql.exec.Description;
+import
org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.io.Text;
 
 /**
@@ -12,7 +13,7 @@
 @Description(name = "sha256"
         , value = "_FUNC_(string) - get sha256 hash code by given input string."
         , extended = "Example:\n > select _FUNC_(string) from src;")
-public class UDFSha256 {
+public class UDFSha256 extends UDF {
     private Text result = new Text();
 
     public UDFSha256() {
diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java
new file mode 100644
index 0000000..ecf7a49
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java
@@ -0,0 +1,66 @@
+package com.github.aaronshan.functions.string;
+
+import io.airlift.slice.Slice;
+import io.airlift.slice.Slices;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+import static com.github.aaronshan.functions.utils.Failures.checkCondition;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
+import static io.airlift.slice.SliceUtf8.tryGetCodePointAt;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-26
+ * @time 23:43
+ */
+@Description(name = "hamming_distance"
+        , value = "_FUNC_(string, string) - computes Hamming distance between two strings."
+        , extended = "Example:\n > select _FUNC_(string, string) from src;")
+public class UDFStringHammingDistance extends UDF {
+    private LongWritable result = new LongWritable(0);
+
+    public UDFStringHammingDistance() {
+    }
+
+    /**
+     * Hamming distance: the number of positions at which the two strings hold different code points.
+     *
+     * @param leftText  first string; may be null
+     * @param rightText second string, which must contain as many characters as the first; may be null
+     * @return the distance, or null when either argument is null (the reused writable is never returned stale)
+     */
+    public LongWritable evaluate(Text leftText, Text rightText) throws HiveException {
+        if (leftText == null || rightText == null) {
+            return null;
+        }
+
+        Slice left = Slices.utf8Slice(leftText.toString());
+        Slice right = Slices.utf8Slice(rightText.toString());
+        int distance = 0;
+        int leftPosition = 0;
+        int rightPosition = 0;
+        while (leftPosition < left.length() && rightPosition < right.length()) {
+            int codePointLeft = tryGetCodePointAt(left, leftPosition);
+            int codePointRight = tryGetCodePointAt(right, rightPosition);
+
+            // if both code points are invalid, we do not care if they are equal
+            // the following code treats them as equal if they happen to be of the same length
+            if (codePointLeft != codePointRight) {
+                distance++;
+            }
+
+            leftPosition += codePointLeft >= 0 ? lengthOfCodePoint(codePointLeft) : -codePointLeft; // >= 0 so U+0000 still advances
+            rightPosition += codePointRight >= 0 ? lengthOfCodePoint(codePointRight) : -codePointRight; // negative = invalid sequence length
+        }
+
+        checkCondition(leftPosition == left.length() && rightPosition == right.length(),
+                "The input strings to hamming_distance function must have the same length");
+        result.set(distance);
+
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java
new file mode 100644
index 0000000..08cd10d
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java
@@ -0,0 +1,113 @@
+package com.github.aaronshan.functions.string;
+
+import io.airlift.slice.Slice;
+import io.airlift.slice.Slices;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+import static
com.github.aaronshan.functions.utils.Failures.checkCondition;
+import static io.airlift.slice.SliceUtf8.getCodePointAt;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
+import static io.airlift.slice.SliceUtf8.tryGetCodePointAt;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-26
+ * @time 23:53
+ */
+@Description(name = "levenshtein_distance"
+        , value = "_FUNC_(string, string) - computes Levenshtein distance between two strings."
+        , extended = "Example:\n > select _FUNC_(string, string) from src;")
+public class UDFStringLevenshteinDistance extends UDF {
+    private LongWritable result = new LongWritable(0);
+
+    public UDFStringLevenshteinDistance() {
+    }
+
+    /**
+     * Levenshtein (edit) distance between two strings, counted in code points.
+     *
+     * @param leftText  first string; may be null
+     * @param rightText second string; may be null
+     * @return the distance, or null when either argument is null
+     */
+    public LongWritable evaluate(Text leftText, Text rightText) throws HiveException {
+        if (leftText == null || rightText == null) {
+            return null;
+        }
+
+        Slice left = Slices.utf8Slice(leftText.toString());
+        Slice right = Slices.utf8Slice(rightText.toString());
+        int[] leftCodePoints = castToCodePoints(left);
+        int[] rightCodePoints = castToCodePoints(right);
+
+        if (leftCodePoints.length < rightCodePoints.length) { // keep the shorter input on the right: it sizes the work array
+            int[] tempCodePoints = leftCodePoints;
+            leftCodePoints = rightCodePoints;
+            rightCodePoints = tempCodePoints;
+        }
+
+        if (rightCodePoints.length == 0) {
+            result.set(leftCodePoints.length);
+            return result;
+        }
+
+        checkCondition(((long) leftCodePoints.length * (rightCodePoints.length - 1)) <= 1000000, // long math: an int product could wrap past the limit
+                "The combined inputs for Levenshtein distance are too large");
+
+        int[] distances = new int[rightCodePoints.length];
+        for (int i = 0; i < rightCodePoints.length; i++) {
+            distances[i] = i + 1;
+        }
+
+        for (int i = 0; i < leftCodePoints.length; i++) {
+            int leftUpDistance = distances[0];
+            if (leftCodePoints[i] == rightCodePoints[0]) {
+                distances[0] = i;
+            }
+            else {
+                distances[0] = Math.min(i, distances[0]) + 1;
+            }
+            for (int j = 1; j < rightCodePoints.length; j++) {
+                int leftUpDistanceNext = distances[j];
+                if (leftCodePoints[i] == rightCodePoints[j]) {
+                    distances[j] = leftUpDistance;
+                }
+                else {
+                    distances[j] = Math.min(distances[j - 1], Math.min(leftUpDistance, distances[j])) + 1;
+                }
+                leftUpDistance = leftUpDistanceNext;
+            }
+        }
+
+        result.set(distances[rightCodePoints.length - 1]);
+
+        return result;
+    }
+
+    private static int[] castToCodePoints(Slice slice) throws HiveException {
+        int[] codePoints = new int[safeCountCodePoints(slice)];
+        int position = 0;
+        for (int index = 0; index < codePoints.length; index++) {
+            codePoints[index] = getCodePointAt(slice, position);
+            position += lengthOfCodePoint(slice, position);
+        }
+        return codePoints;
+    }
+
+    private static int safeCountCodePoints(Slice slice) throws HiveException {
+        int codePoints = 0;
+        for (int position = 0; position < slice.length(); ) {
+            int codePoint = tryGetCodePointAt(slice, position);
+            if (codePoint < 0) {
+                throw new HiveException("Invalid UTF-8 encoding in characters: " + slice.toStringUtf8());
+            }
+            position += lengthOfCodePoint(codePoint);
+            codePoints++;
+        }
+        return codePoints;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java
new file mode 100644
index 0000000..cf2ce98
--- /dev/null
+++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java
@@ -0,0 +1,39 @@
+package com.github.aaronshan.functions.string;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.Text;
+
+import java.text.Normalizer;
+
+/**
+ * @author ruifeng.shan
+ * @date 2018-07-27 12:37 PM
+ */
+@Description(name = "normalize"
+        , value = "_FUNC_(string, string) - transforms the string to normalized form."
+ , extended = "Example:\n > select _FUNC_(string, form_str) from src;") +public class UDFStringNormalize extends UDF { + private Text result = new Text(); + + public UDFStringNormalize() { + } + + public Text evaluate(Text text, Text form) throws HiveException { + if (text == null) { + return null; + } + + Normalizer.Form targetForm; + try { + targetForm = Normalizer.Form.valueOf(form.toString()); + } + catch (IllegalArgumentException e) { + throw new HiveException("Normalization form must be one of [NFD, NFC, NFKD, NFKC]"); + } + + result.set(Normalizer.normalize(text.toString(), targetForm)); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java new file mode 100644 index 0000000..59ab65c --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java @@ -0,0 +1,40 @@ +package com.github.aaronshan.functions.string; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +/** + * @author ruifeng.shan + * @date 2018-07-27 下午12:08 + */ +@Description(name = "strpos" + , value = "_FUNC_(string, substring) - returns index of first occurrence of a substring (or 0 if not found)." 
+ , extended = "Example:\n > select _FUNC_(string, substring) from src;") +public class UDFStringPosition extends UDF { + private LongWritable result = new LongWritable(0); + + public UDFStringPosition() { + } + + public LongWritable evaluate(Text text, Text subText) { + if (text == null || subText == null) { + return result; + } + + if (subText.getLength() == 1) { + result.set(1); + return result; + } + + int index = text.toString().indexOf(subText.toString()); + if (index < 0) { + return result; + } + + result.set(index + 1); + return result; + } +} diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java new file mode 100644 index 0000000..d57ae8e --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java @@ -0,0 +1,83 @@ +package com.github.aaronshan.functions.string; + +import com.google.common.base.Splitter; +import java.util.HashMap; +import java.util.Map; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import static com.github.aaronshan.functions.utils.Failures.checkCondition; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 00:04 + */ +@Description(name = "split_to_map" + , value = "_FUNC_(string, string, string) - returns a map created using the given key/value 
arrays." + , extended = "Example:\n > select _FUNC_('a=123,b=.4,c=,=d', ',', '=') from src;") +public class UDFStringSplitToMap extends GenericUDF { + private static final int ARG_COUNT = 3; // Number of arguments to this UDF + HashMap result = new HashMap(); + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function split_to_map(string, string, string) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of string + for (int i = 0; i < 3; i++) { + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { + throw new UDFArgumentTypeException(i, + "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " + + "expected at function split_to_map, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + ObjectInspector mapKeyOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector mapValueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + + return ObjectInspectorFactory.getStandardMapObjectInspector(mapKeyOI, mapValueOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + String string = (String) arguments[0].get(); + String entryDelimiter = (String) arguments[1].get(); + String keyValueDelimiter = (String) arguments[2].get(); + + checkCondition(entryDelimiter.length() > 0, "entryDelimiter is empty"); + checkCondition(keyValueDelimiter.length() > 0, "keyValueDelimiter is empty"); + checkCondition(!entryDelimiter.equals(keyValueDelimiter), "entryDelimiter and keyValueDelimiter must not be the same"); + + if (string == null) { + return null; + } + + result.clear(); + Map map = Splitter.on(entryDelimiter).withKeyValueSeparator(keyValueDelimiter).split(string); + 
result.putAll(map); + + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "split_to_map(" + strings[0] + ", " + + strings[1] + ", " + strings[2] + ")"; + } +} diff --git a/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java new file mode 100644 index 0000000..2714dcc --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java @@ -0,0 +1,99 @@ +package com.github.aaronshan.functions.string; + +import com.google.common.base.Splitter; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Lists; +import com.google.common.collect.Multimap; +import java.util.HashMap; +import java.util.List; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import static com.github.aaronshan.functions.utils.Failures.checkCondition; + +/** + * @author ruifeng.shan + * @date 2018-07-27 + * @time 00:04 + */ +@Description(name = "split_to_multimap" + , value = "_FUNC_(string, string, string) - creates a multimap by splitting a string into key/value pairs." 
+ , extended = "Example:\n > select _FUNC_('a=123,b=.4,c=,=d', ',', '=') from src;") +public class UDFStringSplitToMultimap extends GenericUDF { + private static final int ARG_COUNT = 3; // Number of arguments to this UDF + HashMap> result = new HashMap>(); + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + // Check if two arguments were passed + if (arguments.length != ARG_COUNT) { + throw new UDFArgumentLengthException( + "The function split_to_multimap(string, string, string) takes exactly " + ARG_COUNT + " arguments."); + } + + // Check if two argument is of string + for (int i = 0; i < 3; i++) { + if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { + throw new UDFArgumentTypeException(i, + "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " + + "expected at function split_to_multimap, but " + + "\"" + arguments[i].getTypeName() + "\" " + + "is found"); + } + } + + ObjectInspector mapKeyOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector mapValueOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); + + return ObjectInspectorFactory.getStandardMapObjectInspector(mapKeyOI, mapValueOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + String string = (String) arguments[0].get(); + String entryDelimiter = (String) arguments[1].get(); + String keyValueDelimiter = (String) arguments[2].get(); + + checkCondition(entryDelimiter.length() > 0, "entryDelimiter is empty"); + checkCondition(keyValueDelimiter.length() > 0, "keyValueDelimiter is empty"); + checkCondition(!entryDelimiter.equals(keyValueDelimiter), "entryDelimiter and keyValueDelimiter must not be the same"); + + if (string == null) { + return null; + } + + Multimap multimap = ArrayListMultimap.create(); + + 
result.clear(); + List list = Splitter.on(entryDelimiter).splitToList(string); + for (String str : list) { + String[] fields = str.split(keyValueDelimiter); + if (fields.length != 2) { + throw new HiveException("Key-value delimiter must appear exactly once in each entry. Bad input: " + string); + } + multimap.put(fields[0], fields[1]); + + } + + for (String key : multimap.keySet()) { + result.put(key, Lists.newArrayList(multimap.get(key))); + } + + return result; + } + + @Override + public String getDisplayString(String[] strings) { + assert (strings.length == ARG_COUNT); + return "split_to_multimap(" + strings[0] + ", " + + strings[1] + ", " + strings[2] + ")"; + } +} \ No newline at end of file diff --git a/src/main/java/cc/shanruifeng/functions/url/UDFUrlDecode.java b/src/main/java/com/github/aaronshan/functions/url/UDFUrlDecode.java similarity index 95% rename from src/main/java/cc/shanruifeng/functions/url/UDFUrlDecode.java rename to src/main/java/com/github/aaronshan/functions/url/UDFUrlDecode.java index 21d1f2f..42b70fe 100644 --- a/src/main/java/cc/shanruifeng/functions/url/UDFUrlDecode.java +++ b/src/main/java/com/github/aaronshan/functions/url/UDFUrlDecode.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; diff --git a/src/main/java/cc/shanruifeng/functions/url/UDFUrlEncode.java b/src/main/java/com/github/aaronshan/functions/url/UDFUrlEncode.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/url/UDFUrlEncode.java rename to src/main/java/com/github/aaronshan/functions/url/UDFUrlEncode.java index 7c8a876..bae9d7f 100644 --- a/src/main/java/cc/shanruifeng/functions/url/UDFUrlEncode.java +++ b/src/main/java/com/github/aaronshan/functions/url/UDFUrlEncode.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; import com.google.common.escape.Escaper; import 
com.google.common.net.UrlEscapers; diff --git a/src/main/java/cc/shanruifeng/functions/utils/ArrayUtils.java b/src/main/java/com/github/aaronshan/functions/utils/ArrayUtils.java similarity index 92% rename from src/main/java/cc/shanruifeng/functions/utils/ArrayUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/ArrayUtils.java index e04f3b3..9a31194 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/ArrayUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/ArrayUtils.java @@ -1,7 +1,7 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; -import cc.shanruifeng.functions.fastuitl.ints.AbstractIntComparator; -import cc.shanruifeng.functions.fastuitl.ints.IntComparator; +import com.github.aaronshan.functions.fastuitl.ints.AbstractIntComparator; +import com.github.aaronshan.functions.fastuitl.ints.IntComparator; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; diff --git a/src/main/java/cc/shanruifeng/functions/utils/CardUtils.java b/src/main/java/com/github/aaronshan/functions/utils/CardUtils.java similarity index 97% rename from src/main/java/cc/shanruifeng/functions/utils/CardUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/CardUtils.java index 41c5d23..226a2fc 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/CardUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/CardUtils.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; -import cc.shanruifeng.functions.model.ChinaIdArea; +import com.github.aaronshan.functions.model.ChinaIdArea; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Strings; diff --git 
a/src/main/java/cc/shanruifeng/functions/utils/ConfigUtils.java b/src/main/java/com/github/aaronshan/functions/utils/ConfigUtils.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/utils/ConfigUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/ConfigUtils.java index 6dfca16..5eb7b9e 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/ConfigUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/ConfigUtils.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; -import cc.shanruifeng.functions.model.ChinaIdArea; +import com.github.aaronshan.functions.model.ChinaIdArea; import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.google.common.collect.Maps; diff --git a/src/main/java/com/github/aaronshan/functions/utils/Failures.java b/src/main/java/com/github/aaronshan/functions/utils/Failures.java new file mode 100644 index 0000000..6437bea --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/utils/Failures.java @@ -0,0 +1,16 @@ +package com.github.aaronshan.functions.utils; + +import org.apache.hadoop.hive.ql.metadata.HiveException; + +import static com.google.common.collect.Sets.newIdentityHashSet; +import static java.lang.String.format; + +public class Failures { + private Failures() {} + + public static void checkCondition(boolean condition, String formatString, Object... 
args) throws HiveException { + if (!condition) { + throw new HiveException(format(formatString, args)); + } + } +} diff --git a/src/main/java/cc/shanruifeng/functions/utils/GeoUtils.java b/src/main/java/com/github/aaronshan/functions/utils/GeoUtils.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/utils/GeoUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/GeoUtils.java index a499e6f..951efdd 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/GeoUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/GeoUtils.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/src/main/java/cc/shanruifeng/functions/utils/MapUtils.java b/src/main/java/com/github/aaronshan/functions/utils/MapUtils.java similarity index 96% rename from src/main/java/cc/shanruifeng/functions/utils/MapUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/MapUtils.java index b8f2320..1b8caf6 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/MapUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/MapUtils.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils; +package com.github.aaronshan.functions.utils; import java.util.Map; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; diff --git a/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java b/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java new file mode 100644 index 0000000..4adb505 --- /dev/null +++ b/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java @@ -0,0 +1,14 @@ +package com.github.aaronshan.functions.utils; + +import org.apache.hadoop.hive.ql.metadata.HiveException; + +import static com.github.aaronshan.functions.utils.Failures.checkCondition; +import static java.lang.Character.MAX_RADIX; 
+import static java.lang.Character.MIN_RADIX; + +public class MathUtils { + public static void checkRadix(long radix) throws HiveException { + checkCondition(radix >= MIN_RADIX && radix <= MAX_RADIX, "Radix must be between %d and %d", MIN_RADIX, MAX_RADIX); + } + +} diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonExtract.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonExtract.java similarity index 99% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonExtract.java rename to src/main/java/com/github/aaronshan/functions/utils/json/JsonExtract.java index 3dbb603..c04bd32 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonExtract.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonExtract.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; import com.fasterxml.jackson.core.*; import com.fasterxml.jackson.core.io.SerializedString; diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPath.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPath.java similarity index 95% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonPath.java rename to src/main/java/com/github/aaronshan/functions/utils/json/JsonPath.java index 70a09ce..43f8073 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPath.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPath.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; /** * @author ruifeng.shan diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPathTokenizer.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPathTokenizer.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonPathTokenizer.java rename to src/main/java/com/github/aaronshan/functions/utils/json/JsonPathTokenizer.java 
index c150594..4be89ed 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonPathTokenizer.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonPathTokenizer.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; import com.google.common.collect.AbstractIterator; diff --git a/src/main/java/cc/shanruifeng/functions/utils/json/JsonUtils.java b/src/main/java/com/github/aaronshan/functions/utils/json/JsonUtils.java similarity index 98% rename from src/main/java/cc/shanruifeng/functions/utils/json/JsonUtils.java rename to src/main/java/com/github/aaronshan/functions/utils/json/JsonUtils.java index 9031319..633a089 100644 --- a/src/main/java/cc/shanruifeng/functions/utils/json/JsonUtils.java +++ b/src/main/java/com/github/aaronshan/functions/utils/json/JsonUtils.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.utils.json; +package com.github.aaronshan.functions.utils.json; import com.fasterxml.jackson.core.JsonFactory; diff --git a/src/main/resources/china_day_type.config b/src/main/resources/china_day_type.config index 8ae306a..bae8705 100644 --- a/src/main/resources/china_day_type.config +++ b/src/main/resources/china_day_type.config @@ -199,4 +199,35 @@ 2018-10-04 holiday 2018-10-05 holiday 2018-10-06 holiday -2018-10-07 holiday \ No newline at end of file +2018-10-07 holiday +2018-12-29 workday +2018-12-30 holiday +2018-12-31 holiday +2019-01-01 holiday +2019-02-02 workday +2019-02-03 workday +2019-02-04 holiday +2019-02-05 holiday +2019-02-06 holiday +2019-02-07 holiday +2019-02-08 holiday +2019-02-09 holiday +2019-02-10 holiday +2019-04-05 holiday +2019-04-06 holiday +2019-04-07 holiday +2019-05-01 holiday +2019-06-07 holiday +2019-06-08 holiday +2019-06-09 holiday +2019-09-13 holiday +2019-09-14 holiday +2019-09-15 holiday +2019-09-29 workday +2019-10-01 holiday +2019-10-02 holiday +2019-10-03 holiday +2019-10-04 holiday +2019-10-05 holiday +2019-10-06 holiday 
+2019-10-07 holiday \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayContainsTest.java b/src/test/java/com/github/aaronshan/functions/array/UDFArrayContainsTest.java similarity index 97% rename from src/test/java/cc/shanruifeng/functions/array/UDFArrayContainsTest.java rename to src/test/java/com/github/aaronshan/functions/array/UDFArrayContainsTest.java index ac9de3e..a5efc4b 100644 --- a/src/test/java/cc/shanruifeng/functions/array/UDFArrayContainsTest.java +++ b/src/test/java/com/github/aaronshan/functions/array/UDFArrayContainsTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import com.google.common.collect.ImmutableList; import java.util.List; diff --git a/src/test/java/cc/shanruifeng/functions/array/UDFArrayIntersectTest.java b/src/test/java/com/github/aaronshan/functions/array/UDFArrayIntersectTest.java similarity index 97% rename from src/test/java/cc/shanruifeng/functions/array/UDFArrayIntersectTest.java rename to src/test/java/com/github/aaronshan/functions/array/UDFArrayIntersectTest.java index 639384a..37c1143 100644 --- a/src/test/java/cc/shanruifeng/functions/array/UDFArrayIntersectTest.java +++ b/src/test/java/com/github/aaronshan/functions/array/UDFArrayIntersectTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.array; +package com.github.aaronshan.functions.array; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; diff --git a/src/test/java/com/github/aaronshan/functions/array/UDFArrayShuffleTest.java b/src/test/java/com/github/aaronshan/functions/array/UDFArrayShuffleTest.java new file mode 100644 index 0000000..0f8a654 --- /dev/null +++ b/src/test/java/com/github/aaronshan/functions/array/UDFArrayShuffleTest.java @@ -0,0 +1,33 @@ +package com.github.aaronshan.functions.array; + +import com.github.aaronshan.functions.array.UDFArrayShuffle; +import com.google.common.collect.ImmutableList; 
+import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Test; + +import java.util.List; + +/** + * @author aaronshan + * @date 2018-08-18 上午8:59 + */ +public class UDFArrayShuffleTest { + @Test + public void testArrayShuffle() throws HiveException { + UDFArrayShuffle udf = new UDFArrayShuffle(); + + ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + ObjectInspector[] arguments = {arrayOI}; + + udf.initialize(arguments); + + List array = ImmutableList.of(1,2,5,6); + GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); + GenericUDF.DeferredObject[] args = {arrayObj}; + System.out.println(udf.evaluate(args)); + } +} diff --git a/src/test/java/com/github/aaronshan/functions/array/UDFArrayValueCountTest.java b/src/test/java/com/github/aaronshan/functions/array/UDFArrayValueCountTest.java new file mode 100644 index 0000000..41c1acb --- /dev/null +++ b/src/test/java/com/github/aaronshan/functions/array/UDFArrayValueCountTest.java @@ -0,0 +1,39 @@ +package com.github.aaronshan.functions.array; + +import com.google.common.collect.ImmutableList; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.LongWritable; +import org.junit.Test; + +import java.util.List; + +import static org.junit.Assert.*; + +public class 
UDFArrayValueCountTest { + @Test + public void testArrayValueCount() throws Exception { + UDFArrayValueCount udf = new UDFArrayValueCount(); + + ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector[] arguments = {arrayOI, valueOI}; + + udf.initialize(arguments); + List array = ImmutableList.of("a", "b", "c", "a"); + GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); + GenericUDF.DeferredObject valueObj = new GenericUDF.DeferredJavaObject("a"); + GenericUDF.DeferredObject[] args = {arrayObj, valueObj}; + LongWritable output = (LongWritable) udf.evaluate(args); + + assertEquals("array_value_count() test", new LongWritable(2).get(), output.get()); + + // Try with null args + GenericUDF.DeferredObject[] nullArgs = { new GenericUDF.DeferredJavaObject(null), new GenericUDF.DeferredJavaObject(null) }; + output = (LongWritable) udf.evaluate(nullArgs); + assertEquals("array_value_count() test", new LongWritable(0).get(), output.get()); + } +} \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/bitwise/UDFBitCountTest.java b/src/test/java/com/github/aaronshan/functions/bitwise/UDFBitCountTest.java similarity index 95% rename from src/test/java/cc/shanruifeng/functions/bitwise/UDFBitCountTest.java rename to src/test/java/com/github/aaronshan/functions/bitwise/UDFBitCountTest.java index d064cba..c344a33 100644 --- a/src/test/java/cc/shanruifeng/functions/bitwise/UDFBitCountTest.java +++ b/src/test/java/com/github/aaronshan/functions/bitwise/UDFBitCountTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.bitwise; +package com.github.aaronshan.functions.bitwise; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.LongWritable; diff --git 
a/src/test/java/cc/shanruifeng/functions/date/UDFDayOfYearTest.java b/src/test/java/com/github/aaronshan/functions/date/UDFDayOfYearTest.java similarity index 94% rename from src/test/java/cc/shanruifeng/functions/date/UDFDayOfYearTest.java rename to src/test/java/com/github/aaronshan/functions/date/UDFDayOfYearTest.java index cc35d19..d70cdd6 100644 --- a/src/test/java/cc/shanruifeng/functions/date/UDFDayOfYearTest.java +++ b/src/test/java/com/github/aaronshan/functions/date/UDFDayOfYearTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.date; +package com.github.aaronshan.functions.date; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; diff --git a/src/test/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcjTest.java b/src/test/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcjTest.java similarity index 93% rename from src/test/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcjTest.java rename to src/test/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcjTest.java index 2e9fec3..a720e53 100644 --- a/src/test/java/cc/shanruifeng/functions/geo/UDFGeoBdToGcjTest.java +++ b/src/test/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcjTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.geo; +package com.github.aaronshan.functions.geo; import org.apache.hadoop.io.Text; import org.junit.Assert; diff --git a/src/test/java/cc/shanruifeng/functions/map/UDFMapBuildTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapBuildTest.java similarity index 87% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapBuildTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapBuildTest.java index 630635e..0fc2ed6 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapBuildTest.java +++ b/src/test/java/com/github/aaronshan/functions/map/UDFMapBuildTest.java @@ -1,6 +1,7 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import 
cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; @@ -11,6 +12,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; import org.junit.Test; import static org.junit.Assert.*; @@ -38,6 +40,6 @@ public void testMapBuild() throws Exception { LinkedHashMap expect = Maps.newLinkedHashMap(); expect.putAll(ImmutableMap.of("key1", "value1", "key2", "value2", "key3", "value3")); - assertEquals("map_build() test", true, MapUtils.mapEquals(output, expect)); + Assert.assertEquals("map_build() test", true, MapUtils.mapEquals(output, expect)); } } \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/map/UDFMapConcatTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapConcatTest.java similarity index 88% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapConcatTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapConcatTest.java index e9fcc1b..0094f43 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapConcatTest.java +++ b/src/test/java/com/github/aaronshan/functions/map/UDFMapConcatTest.java @@ -1,6 +1,7 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import java.util.LinkedHashMap; @@ -9,6 +10,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; import org.junit.Test; import static org.junit.Assert.*; @@ -39,6 +41,6 @@ public void testMapConcat() throws Exception { LinkedHashMap expect = Maps.newLinkedHashMap(); expect.putAll(ImmutableMap.of("key1", "11", "key2", "12", "key3", "21", "key4", "22", "key5", "23")); - assertEquals("map_concat() test", true, MapUtils.mapEquals(output, expect)); + Assert.assertEquals("map_concat() test", true, MapUtils.mapEquals(output, expect)); } } \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/map/UDFMapElementAtTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapElementAtTest.java similarity index 94% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapElementAtTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapElementAtTest.java index d542414..b98259d 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapElementAtTest.java +++ b/src/test/java/com/github/aaronshan/functions/map/UDFMapElementAtTest.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; diff --git a/src/test/java/cc/shanruifeng/functions/map/UDFMapEqualsTest.java b/src/test/java/com/github/aaronshan/functions/map/UDFMapEqualsTest.java similarity index 96% rename from src/test/java/cc/shanruifeng/functions/map/UDFMapEqualsTest.java rename to src/test/java/com/github/aaronshan/functions/map/UDFMapEqualsTest.java index 3c841a1..af8917f 100644 --- a/src/test/java/cc/shanruifeng/functions/map/UDFMapEqualsTest.java +++ 
b/src/test/java/com/github/aaronshan/functions/map/UDFMapEqualsTest.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.map; +package com.github.aaronshan.functions.map; -import cc.shanruifeng.functions.utils.MapUtils; +import com.github.aaronshan.functions.utils.MapUtils; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import java.util.LinkedHashMap; diff --git a/src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java b/src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java new file mode 100644 index 0000000..ecff4ea --- /dev/null +++ b/src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java @@ -0,0 +1,51 @@ +package com.github.aaronshan.functions.math; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Test; + +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.*; + +public class UDFMathCosineSimilarityTest { + + @Test + public void testCosineSimilarity() throws HiveException { + Double result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0), ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, 2 * 3 / (Math.sqrt(5) * Math.sqrt(10)), 0.0); + result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0, "c", -1.0), ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, (2 * 3 + (-1) * 1) / (Math.sqrt(1 + 4 + 1) * Math.sqrt(1 + 9)), 0.0); + result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0, "c", -1.0), ImmutableMap.of("d", 1.0, 
"e", 3.0)); + assertEquals(result, 0.0, 0.0); + result = getResult(null, ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, null); + LinkedHashMap leftMap = Maps.newLinkedHashMap(); + leftMap.put("a", 1.0); + leftMap.put("b", null); + result = getResult(leftMap, ImmutableMap.of("c", 1.0, "b", 3.0)); + assertEquals(result, null); + } + + public Double getResult(Map leftMap, Map rightMap) throws HiveException { + UDFMathCosineSimilarity udf = new UDFMathCosineSimilarity(); + + ObjectInspector leftMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + ObjectInspector rightMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + ObjectInspector[] arguments = {leftMapOI, rightMapOI}; + udf.initialize(arguments); + + GenericUDF.DeferredObject leftMapObj = new GenericUDF.DeferredJavaObject(leftMap); + GenericUDF.DeferredObject rightMapObj = new GenericUDF.DeferredJavaObject(rightMap); + GenericUDF.DeferredObject[] args = {leftMapObj, rightMapObj}; + DoubleWritable output = (DoubleWritable) udf.evaluate(args); + return output == null ? 
null : output.get(); + } +} \ No newline at end of file diff --git a/src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMapTest.java b/src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMapTest.java new file mode 100644 index 0000000..a1e82e6 --- /dev/null +++ b/src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMapTest.java @@ -0,0 +1,29 @@ +package com.github.aaronshan.functions.string; + +import com.github.aaronshan.functions.utils.MapUtils; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; + +public class UDFStringSplitToMapTest { + @Test + public void testStringSplitToMap() throws Exception { + UDFStringSplitToMap udf = new UDFStringSplitToMap(); + + GenericUDF.DeferredObject string = new GenericUDF.DeferredJavaObject("a=123,b=0.4"); + GenericUDF.DeferredObject entryDelimiter = new GenericUDF.DeferredJavaObject(","); + GenericUDF.DeferredObject keyValueDelimiter = new GenericUDF.DeferredJavaObject("="); + GenericUDF.DeferredObject[] args = {string, entryDelimiter, keyValueDelimiter}; + + HashMap output = (HashMap) udf.evaluate(args); + + HashMap expect = Maps.newHashMap(); + expect.putAll(ImmutableMap.of("a", "123", "b", "0.4")); + + Assert.assertEquals("split_to_map() test", true, MapUtils.mapEquals(output, expect)); + } +} \ No newline at end of file diff --git a/src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimapTest.java b/src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimapTest.java new file mode 100644 index 0000000..d7fe5fd --- /dev/null +++ b/src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimapTest.java @@ -0,0 +1,31 @@ +package com.github.aaronshan.functions.string; + +import com.github.aaronshan.functions.utils.MapUtils; +import 
com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; +import java.util.List; + +public class UDFStringSplitToMultimapTest { + @Test + public void testStringSplitToMultimap() throws Exception { + UDFStringSplitToMultimap udf = new UDFStringSplitToMultimap(); + + GenericUDF.DeferredObject string = new GenericUDF.DeferredJavaObject("a=123,b=0.4,a=124"); + GenericUDF.DeferredObject entryDelimiter = new GenericUDF.DeferredJavaObject(","); + GenericUDF.DeferredObject keyValueDelimiter = new GenericUDF.DeferredJavaObject("="); + GenericUDF.DeferredObject[] args = {string, entryDelimiter, keyValueDelimiter}; + + HashMap> output = (HashMap>) udf.evaluate(args); + + HashMap> expect = Maps.newHashMap(); + expect.putAll(ImmutableMap.>of("a", ImmutableList.of("123", "124"), "b", ImmutableList.of("0.4"))); + + Assert.assertEquals("split_to_multimap() test", true, MapUtils.mapEquals(output, expect)); + } +} \ No newline at end of file diff --git a/src/test/java/cc/shanruifeng/functions/url/UDFUrlDecodeTest.java b/src/test/java/com/github/aaronshan/functions/url/UDFUrlDecodeTest.java similarity index 94% rename from src/test/java/cc/shanruifeng/functions/url/UDFUrlDecodeTest.java rename to src/test/java/com/github/aaronshan/functions/url/UDFUrlDecodeTest.java index 56cd312..a771cd1 100644 --- a/src/test/java/cc/shanruifeng/functions/url/UDFUrlDecodeTest.java +++ b/src/test/java/com/github/aaronshan/functions/url/UDFUrlDecodeTest.java @@ -1,4 +1,4 @@ -package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; import org.apache.hadoop.io.Text; import org.junit.Assert; diff --git a/src/test/java/cc/shanruifeng/functions/url/UDFUrlEncodeTest.java b/src/test/java/com/github/aaronshan/functions/url/UDFUrlEncodeTest.java similarity index 87% rename from 
src/test/java/cc/shanruifeng/functions/url/UDFUrlEncodeTest.java rename to src/test/java/com/github/aaronshan/functions/url/UDFUrlEncodeTest.java index a7c894d..3cec438 100644 --- a/src/test/java/cc/shanruifeng/functions/url/UDFUrlEncodeTest.java +++ b/src/test/java/com/github/aaronshan/functions/url/UDFUrlEncodeTest.java @@ -1,6 +1,6 @@ -package cc.shanruifeng.functions.url; +package com.github.aaronshan.functions.url; -import cc.shanruifeng.functions.date.UDFDayOfYear; +import com.github.aaronshan.functions.date.UDFDayOfYear; import org.apache.hadoop.io.Text; import org.junit.Assert; import org.junit.Test;