Skip to content

Commit

Permalink
HIVE-2279. Implement sort(array) UDF (Zhenxiao Luo via cws)
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/hive/trunk@1234146 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
cwsteinbach committed Jan 20, 2012
1 parent 589dd5a commit 54660a6
Show file tree
Hide file tree
Showing 11 changed files with 248 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFCase;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFCoalesce;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcatWS;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSortArray;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFEWAHBitmapAnd;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFEWAHBitmapEmpty;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFEWAHBitmapOr;
Expand Down Expand Up @@ -446,6 +447,7 @@ public final class FunctionRegistry {
registerGenericUDF("locate", GenericUDFLocate.class);
registerGenericUDF("elt", GenericUDFElt.class);
registerGenericUDF("concat_ws", GenericUDFConcatWS.class);
registerGenericUDF("sort_array", GenericUDFSortArray.class);
registerGenericUDF("array_contains", GenericUDFArrayContains.class);
registerGenericUDF("sentences", GenericUDFSentences.class);
registerGenericUDF("map_keys", GenericUDFMapKeys.class);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.udf.generic;

import java.util.ArrayList;
import java.util.Collections;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;

/**
 * Generic UDF for array sort
 * <code>SORT_ARRAY(array(obj1, obj2, obj3...))</code>.
 *
 * <p>Accepts exactly one argument, which must be a list whose elements belong to the
 * {@code PRIMITIVE} category, and returns a list holding the same elements in ascending
 * natural order. The elements handed out by the list's object inspector must therefore be
 * mutually {@link Comparable}.
 *
 * <p>The list returned by {@link #evaluate} is a buffer owned by this instance and is reused
 * across calls.
 *
 * @see org.apache.hadoop.hive.ql.udf.generic.GenericUDF
 */
@Description(name = "sort_array",
    value = "_FUNC_(array(obj1, obj2,...)) - "
    + "Sorts the input array in ascending order according to the natural ordering"
    + " of the array elements.",
    extended = "Example:\n"
    + " > SELECT _FUNC_(array('b', 'd', 'c', 'a')) FROM src LIMIT 1;\n"
    + " 'a', 'b', 'c', 'd'")
public class GenericUDFSortArray extends GenericUDF {
  // Only slot 0 is used: converts an input element to the returned element representation.
  private Converter[] converters;
  // Result buffer, cleared and refilled on every evaluate() call.
  private ArrayList<Object> ret = new ArrayList<Object>();
  // Inspectors captured in initialize(); slot 0 describes the input list.
  private ObjectInspector[] argumentOIs;

  /**
   * Validates the argument list and prepares the element converter.
   *
   * @param arguments inspectors of the call arguments; exactly one list of primitives is expected
   * @return a standard list inspector over the resolved element inspector
   * @throws UDFArgumentLengthException if the number of arguments is not one
   * @throws UDFArgumentTypeException if the argument is not a list of primitive elements
   */
  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver =
        new GenericUDFUtils.ReturnObjectInspectorResolver(true);

    if (arguments.length != 1) {
      throw new UDFArgumentLengthException(
          "The function SORT_ARRAY(array(obj1, obj2,...)) needs one argument.");
    }

    // Explicit check instead of a switch that relied on falling through from LIST to default.
    boolean isListOfPrimitives = arguments[0].getCategory() == Category.LIST
        && ((ListObjectInspector) arguments[0]).getListElementObjectInspector()
            .getCategory() == Category.PRIMITIVE;
    if (!isListOfPrimitives) {
      throw new UDFArgumentTypeException(0, "Argument 1"
          + " of function SORT_ARRAY must be " + Constants.LIST_TYPE_NAME
          + "<" + Category.PRIMITIVE + ">, but " + arguments[0].getTypeName()
          + " was found.");
    }

    ObjectInspector elementObjectInspector =
        ((ListObjectInspector) arguments[0]).getListElementObjectInspector();
    argumentOIs = arguments;
    converters = new Converter[arguments.length];

    // NOTE(review): the resolver is never fed an inspector before get() is called, so this
    // presumably always falls back to the element inspector - confirm against
    // ReturnObjectInspectorResolver before simplifying.
    ObjectInspector returnOI = returnOIResolver.get();
    if (returnOI == null) {
      returnOI = elementObjectInspector;
    }
    converters[0] = ObjectInspectorConverters.getConverter(elementObjectInspector, returnOI);

    return ObjectInspectorFactory.getStandardListObjectInspector(returnOI);
  }

  /**
   * Sorts the elements of the input array.
   *
   * @param arguments deferred call arguments; only the first one is read
   * @return the reused result list with the converted elements in ascending natural order,
   *         or {@code null} when the input array is null
   * @throws HiveException if the deferred argument cannot be resolved
   */
  @Override
  @SuppressWarnings({"unchecked", "rawtypes"})
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    // Resolve the deferred argument a single time and reuse the value.
    Object array = arguments[0].get();
    if (array == null) {
      return null;
    }

    ListObjectInspector arrayOI = (ListObjectInspector) argumentOIs[0];
    java.util.List<?> elements = arrayOI.getList(array);
    if (elements == null) {
      return null;
    }

    // Sort a private copy: the inspector may hand back the row's own backing list (which must
    // not be reordered under the caller) or a List implementation other than ArrayList.
    // Raw type is deliberate - element comparability is only known at runtime.
    ArrayList sorted = new ArrayList(elements);
    Collections.sort(sorted);

    ret.clear();
    for (Object element : sorted) {
      ret.add(converters[0].convert(element));
    }
    return ret;
  }

  /**
   * Renders this call for plan/EXPLAIN output.
   *
   * @param children display strings of the arguments; exactly one is expected
   * @return {@code sort_array(<child>)}
   */
  @Override
  public String getDisplayString(String[] children) {
    assert (children.length == 1);
    return "sort_array(" + children[0] + ")";
  }
}
2 changes: 2 additions & 0 deletions ql/src/test/queries/clientnegative/udf_sort_array_wrong1.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- invalid argument number
SELECT sort_array(array(2, 5, 4), 3) FROM src LIMIT 1;
2 changes: 2 additions & 0 deletions ql/src/test/queries/clientnegative/udf_sort_array_wrong2.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- invalid argument type
SELECT sort_array("Invalid") FROM src LIMIT 1;
2 changes: 2 additions & 0 deletions ql/src/test/queries/clientnegative/udf_sort_array_wrong3.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- invalid argument type
SELECT sort_array(array(array(10, 20), array(5, 15), array(3, 13))) FROM src LIMIT 1;
19 changes: 19 additions & 0 deletions ql/src/test/queries/clientpositive/udf_sort_array.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use default;
-- Test sort_array() UDF

DESCRIBE FUNCTION sort_array;
DESCRIBE FUNCTION EXTENDED sort_array;

-- Evaluate function against STRING valued keys
EXPLAIN
SELECT sort_array(array("b", "d", "c", "a")) FROM src LIMIT 1;

SELECT sort_array(array("f", "a", "g", "c", "b", "d", "e")) FROM src LIMIT 1;
SELECT sort_array(sort_array(array("hadoop distributed file system", "enterprise databases", "hadoop map-reduce"))) FROM src LIMIT 1;

-- Evaluate function against INT valued keys
SELECT sort_array(array(2, 9, 7, 3, 5, 4, 1, 6, 8)) FROM src LIMIT 1;

-- Evaluate function against FLOAT valued keys
SELECT sort_array(sort_array(array(2.333, 9, 1.325, 2.003, 0.777, -3.445, 1))) FROM src LIMIT 1;

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
FAILED: Error in semantic analysis: Line 2:7 Arguments length mismatch '3': The function SORT_ARRAY(array(obj1, obj2,...)) needs one argument.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
FAILED: Error in semantic analysis: Line 2:18 Argument type mismatch '"Invalid"': Argument 1 of function SORT_ARRAY must be array<PRIMITIVE>, but string was found.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
FAILED: Error in semantic analysis: Line 2:18 Argument type mismatch '13': Argument 1 of function SORT_ARRAY must be array<PRIMITIVE>, but array<array<int>> was found.
1 change: 1 addition & 0 deletions ql/src/test/results/clientpositive/show_functions.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ sentences
sign
sin
size
sort_array
space
split
sqrt
Expand Down
101 changes: 101 additions & 0 deletions ql/src/test/results/clientpositive/udf_sort_array.q.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
PREHOOK: query: use default
PREHOOK: type: SWITCHDATABASE
POSTHOOK: query: use default
POSTHOOK: type: SWITCHDATABASE
PREHOOK: query: -- Test sort_array() UDF

DESCRIBE FUNCTION sort_array
PREHOOK: type: DESCFUNCTION
POSTHOOK: query: -- Test sort_array() UDF

DESCRIBE FUNCTION sort_array
POSTHOOK: type: DESCFUNCTION
sort_array(array(obj1, obj2,...)) - Sorts the input array in ascending order according to the natural ordering of the array elements.
PREHOOK: query: DESCRIBE FUNCTION EXTENDED sort_array
PREHOOK: type: DESCFUNCTION
POSTHOOK: query: DESCRIBE FUNCTION EXTENDED sort_array
POSTHOOK: type: DESCFUNCTION
sort_array(array(obj1, obj2,...)) - Sorts the input array in ascending order according to the natural ordering of the array elements.
Example:
> SELECT sort_array(array('b', 'd', 'c', 'a')) FROM src LIMIT 1;
'a', 'b', 'c', 'd'
PREHOOK: query: -- Evaluate function against STRING valued keys
EXPLAIN
SELECT sort_array(array("b", "d", "c", "a")) FROM src LIMIT 1
PREHOOK: type: QUERY
POSTHOOK: query: -- Evaluate function against STRING valued keys
EXPLAIN
SELECT sort_array(array("b", "d", "c", "a")) FROM src LIMIT 1
POSTHOOK: type: QUERY
ABSTRACT SYNTAX TREE:
(TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sort_array (TOK_FUNCTION array "b" "d" "c" "a")))) (TOK_LIMIT 1)))

STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 is a root stage

STAGE PLANS:
Stage: Stage-1
Map Reduce
Alias -> Map Operator Tree:
src
TableScan
alias: src
Select Operator
expressions:
expr: sort_array(array('b','d','c','a'))
type: array<string>
outputColumnNames: _col0
Limit
File Output Operator
compressed: false
GlobalTableId: 0
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

Stage: Stage-0
Fetch Operator
limit: 1


PREHOOK: query: SELECT sort_array(array("f", "a", "g", "c", "b", "d", "e")) FROM src LIMIT 1
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: SELECT sort_array(array("f", "a", "g", "c", "b", "d", "e")) FROM src LIMIT 1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
["a","b","c","d","e","f","g"]
PREHOOK: query: SELECT sort_array(sort_array(array("hadoop distributed file system", "enterprise databases", "hadoop map-reduce"))) FROM src LIMIT 1
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: SELECT sort_array(sort_array(array("hadoop distributed file system", "enterprise databases", "hadoop map-reduce"))) FROM src LIMIT 1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
["enterprise databases","hadoop distributed file system","hadoop map-reduce"]
PREHOOK: query: -- Evaluate function against INT valued keys
SELECT sort_array(array(2, 9, 7, 3, 5, 4, 1, 6, 8)) FROM src LIMIT 1
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: -- Evaluate function against INT valued keys
SELECT sort_array(array(2, 9, 7, 3, 5, 4, 1, 6, 8)) FROM src LIMIT 1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
[1,2,3,4,5,6,7,8,9]
PREHOOK: query: -- Evaluate function against FLOAT valued keys
SELECT sort_array(sort_array(array(2.333, 9, 1.325, 2.003, 0.777, -3.445, 1))) FROM src LIMIT 1
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: -- Evaluate function against FLOAT valued keys
SELECT sort_array(sort_array(array(2.333, 9, 1.325, 2.003, 0.777, -3.445, 1))) FROM src LIMIT 1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
[-3.445,0.777,1.0,1.325,2.003,2.333,9.0]

0 comments on commit 54660a6

Please sign in to comment.