Permalink
Browse files

HIVE-3985 : Update new UDAFs introduced for Windowing to work with new Decimal Type (Brock Noland via Ashutosh Chauhan)

git-svn-id: https://svn.apache.org/repos/asf/hive/trunk@1465263 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent ba34b99 commit b42d8324f321f14096550ae8cb3e6c8ebe9c67e4 @ashutoshc ashutoshc committed Apr 6, 2013
@@ -19,6 +19,7 @@
package org.apache.hadoop.hive.ql.udf.generic;
import java.util.ArrayList;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -55,29 +56,49 @@ protected GenericUDAFRankEvaluator createEvaluator()
return new GenericUDAFCumeDistEvaluator();
}
- public static class GenericUDAFCumeDistEvaluator extends GenericUDAFRankEvaluator
- {
- @Override
- public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException
- {
- super.init(m, parameters);
- return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
- }
-
- @Override
- public Object terminate(AggregationBuffer agg) throws HiveException
- {
- ArrayList<IntWritable> ranks = ((RankBuffer) agg).rowNums;
- double sz = ranks.size();
- ArrayList<DoubleWritable> pranks = new ArrayList<DoubleWritable>(ranks.size());
+ public static class GenericUDAFCumeDistEvaluator extends GenericUDAFRankEvaluator
+ {
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException
+ {
+ super.init(m, parameters); // delegate setup to the parent evaluator; whatever it returns is not used here
+ return ObjectInspectorFactory
+ .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); // declared result: a standard list whose elements use the writable-double inspector
+ }
- for(IntWritable i : ranks)
- {
- double pr = ((double)i.get())/sz;
- pranks.add(new DoubleWritable(pr));
- }
-
- return pranks;
- }
- }
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException
+ { // returns a list with one DoubleWritable per entry of the buffer's rowNums list
+ List<IntWritable> ranks = ((RankBuffer) agg).rowNums; // values are compared as ranks below
+ int ranksSize = ranks.size();
+ double ranksSizeDouble = ranksSize; // divisor for every ratio computed below
+ List<DoubleWritable> distances = new ArrayList<DoubleWritable>(ranksSize);
+ int last = -1; // value shared by the run of equal entries currently being counted
+ int current = -1; // value read at the current index
+ // tracks the number of elements with the same rank at the current time
+ int elementsAtRank = 1;
+ for (int index = 0; index < ranksSize; index++) { // NOTE(review): run counting presumes equal values are adjacent - confirm rowNums ordering
+ current = ranks.get(index).get();
+ if (index == 0) {
+ last = current; // first entry opens the first run; the count is already 1
+ } else if (last == current) {
+ elementsAtRank++; // same value as the previous entry: the run grows
+ } else {
+ last = current; // value changed: the previous run is complete
+ double distance = ((double) index) / ranksSizeDouble; // index == number of entries before this one, i.e. up to and including the finished run
+ while (elementsAtRank-- > 0) { // emit the same ratio once for each entry of the finished run
+ distances.add(new DoubleWritable(distance));
+ }
+ elementsAtRank = 1; // the current entry starts a new run
+ }
+ }
+ if (ranksSize > 0 && last == current) { // flush the final run; last == current always holds once the loop has executed
+ double distance = ((double) ranksSize) / ranksSizeDouble; // evaluates to 1.0
+ while (elementsAtRank-- > 0) {
+ distances.add(new DoubleWritable(distance));
+ }
+ }
+ return distances;
+ }
+ }
}
@@ -0,0 +1,49 @@
+DROP TABLE IF EXISTS part;
+
+-- data setup
+CREATE TABLE part(
+ p_partkey INT,
+ p_name STRING,
+ p_mfgr STRING,
+ p_brand STRING,
+ p_type STRING,
+ p_size INT,
+ p_container STRING,
+ p_retailprice DECIMAL,
+ p_comment STRING
+);
+
+LOAD DATA LOCAL INPATH '../data/files/part_tiny.txt' overwrite into table part;
+
+-- 1. aggregate functions with decimal type
+
+select p_mfgr, p_retailprice,
+lead(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c1,
+lag(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c2,
+first_value(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c3,
+last_value(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c4
+from part;
+
+-- 2. ranking functions with decimal type
+
+select p_mfgr, p_retailprice,
+row_number() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c1,
+rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c2,
+dense_rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c3,
+percent_rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c4,
+cume_dist() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c5,
+ntile(5) over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c6
+from part;
+
+-- 3. order by decimal
+
+select p_mfgr, p_retailprice,
+lag(p_retailprice) over (partition by p_mfgr ORDER BY p_retailprice desc) as c1
+from part;
+
+-- 4. partition by decimal
+
+select p_mfgr, p_retailprice,
+lag(p_retailprice) over (partition by p_retailprice) as c1
+from part;
+
@@ -979,8 +979,8 @@ POSTHOOK: Lineage: part_5.p_size SCRIPT [(part)part.FieldSchema(name:p_partkey,
POSTHOOK: Lineage: part_5.r SCRIPT [(part)part.FieldSchema(name:p_partkey, type:int, comment:null), (part)part.FieldSchema(name:p_name, type:string, comment:null), (part)part.FieldSchema(name:p_mfgr, type:string, comment:null), (part)part.FieldSchema(name:p_brand, type:string, comment:null), (part)part.FieldSchema(name:p_type, type:string, comment:null), (part)part.FieldSchema(name:p_size, type:int, comment:null), (part)part.FieldSchema(name:p_container, type:string, comment:null), (part)part.FieldSchema(name:p_retailprice, type:double, comment:null), (part)part.FieldSchema(name:p_comment, type:string, comment:null), ]
POSTHOOK: Lineage: part_5.s1 SCRIPT [(part)part.FieldSchema(name:p_partkey, type:int, comment:null), (part)part.FieldSchema(name:p_name, type:string, comment:null), (part)part.FieldSchema(name:p_mfgr, type:string, comment:null), (part)part.FieldSchema(name:p_brand, type:string, comment:null), (part)part.FieldSchema(name:p_type, type:string, comment:null), (part)part.FieldSchema(name:p_size, type:int, comment:null), (part)part.FieldSchema(name:p_container, type:string, comment:null), (part)part.FieldSchema(name:p_retailprice, type:double, comment:null), (part)part.FieldSchema(name:p_comment, type:string, comment:null), ]
POSTHOOK: Lineage: part_5.s2 SCRIPT [(part)part.FieldSchema(name:p_partkey, type:int, comment:null), (part)part.FieldSchema(name:p_name, type:string, comment:null), (part)part.FieldSchema(name:p_mfgr, type:string, comment:null), (part)part.FieldSchema(name:p_brand, type:string, comment:null), (part)part.FieldSchema(name:p_type, type:string, comment:null), (part)part.FieldSchema(name:p_size, type:int, comment:null), (part)part.FieldSchema(name:p_container, type:string, comment:null), (part)part.FieldSchema(name:p_retailprice, type:double, comment:null), (part)part.FieldSchema(name:p_comment, type:string, comment:null), ]
-Manufacturer#1 almond antique burnished rose metallic 2 4 4 1 1 0.16666666666666666 2
-Manufacturer#1 almond antique burnished rose metallic 2 2 4 1 1 0.16666666666666666 2
+Manufacturer#1 almond antique burnished rose metallic 2 4 4 1 1 0.3333333333333333 2
+Manufacturer#1 almond antique burnished rose metallic 2 2 4 1 1 0.3333333333333333 2
Manufacturer#1 almond antique salmon chartreuse burlywood 6 44 10 4 3 0.6666666666666666 2
Manufacturer#1 almond aquamarine burnished black steel 28 72 28 5 4 0.8333333333333334 34
Manufacturer#1 almond antique chartreuse lavender yellow 34 38 34 3 2 0.5 2
@@ -0,0 +1,221 @@
+PREHOOK: query: DROP TABLE IF EXISTS part
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS part
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: -- data setup
+CREATE TABLE part(
+ p_partkey INT,
+ p_name STRING,
+ p_mfgr STRING,
+ p_brand STRING,
+ p_type STRING,
+ p_size INT,
+ p_container STRING,
+ p_retailprice DECIMAL,
+ p_comment STRING
+)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- data setup
+CREATE TABLE part(
+ p_partkey INT,
+ p_name STRING,
+ p_mfgr STRING,
+ p_brand STRING,
+ p_type STRING,
+ p_size INT,
+ p_container STRING,
+ p_retailprice DECIMAL,
+ p_comment STRING
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@part
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/part_tiny.txt' overwrite into table part
+PREHOOK: type: LOAD
+PREHOOK: Output: default@part
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/part_tiny.txt' overwrite into table part
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@part
+PREHOOK: query: -- 1. aggregate functions with decimal type
+
+select p_mfgr, p_retailprice,
+lead(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c1,
+lag(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c2,
+first_value(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c3,
+last_value(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c4
+from part
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: -- 1. aggregate functions with decimal type
+
+select p_mfgr, p_retailprice,
+lead(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c1,
+lag(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c2,
+first_value(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c3,
+last_value(p_retailprice) over (partition by p_mfgr ORDER BY p_name) as c4
+from part
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+Manufacturer#1 1173.15 1173.15 NULL 1173.15 1173.15
+Manufacturer#1 1173.15 1753.76 1173.15 1173.15 1173.15
+Manufacturer#1 1753.76 1602.59 1173.15 1173.15 1753.76
+Manufacturer#1 1602.59 1414.42 1753.76 1173.15 1602.59
+Manufacturer#1 1414.42 1632.66 1602.59 1173.15 1414.42
+Manufacturer#1 1632.66 NULL 1414.42 1173.15 1632.66
+Manufacturer#2 1690.68 1800.7 NULL 1690.68 1690.68
+Manufacturer#2 1800.7 2031.98 1690.68 1690.68 1800.7
+Manufacturer#2 2031.98 1698.66 1800.7 1690.68 2031.98
+Manufacturer#2 1698.66 1701.6 2031.98 1690.68 1698.66
+Manufacturer#2 1701.6 NULL 1698.66 1690.68 1701.6
+Manufacturer#3 1671.68 1190.27 NULL 1671.68 1671.68
+Manufacturer#3 1190.27 1410.39 1671.68 1671.68 1190.27
+Manufacturer#3 1410.39 1922.98 1190.27 1671.68 1410.39
+Manufacturer#3 1922.98 1337.29 1410.39 1671.68 1922.98
+Manufacturer#3 1337.29 NULL 1922.98 1671.68 1337.29
+Manufacturer#4 1620.67 1375.42 NULL 1620.67 1620.67
+Manufacturer#4 1375.42 1206.26 1620.67 1620.67 1375.42
+Manufacturer#4 1206.26 1844.92 1375.42 1620.67 1206.26
+Manufacturer#4 1844.92 1290.35 1206.26 1620.67 1844.92
+Manufacturer#4 1290.35 NULL 1844.92 1620.67 1290.35
+Manufacturer#5 1789.69 1611.66 NULL 1789.69 1789.69
+Manufacturer#5 1611.66 1788.73 1789.69 1789.69 1611.66
+Manufacturer#5 1788.73 1018.1 1611.66 1789.69 1788.73
+Manufacturer#5 1018.1 1464.48 1788.73 1789.69 1018.1
+Manufacturer#5 1464.48 NULL 1018.1 1789.69 1464.48
+PREHOOK: query: -- 2. ranking functions with decimal type
+
+select p_mfgr, p_retailprice,
+row_number() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c1,
+rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c2,
+dense_rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c3,
+percent_rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c4,
+cume_dist() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c5,
+ntile(5) over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c6
+from part
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: -- 2. ranking functions with decimal type
+
+select p_mfgr, p_retailprice,
+row_number() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c1,
+rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c2,
+dense_rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c3,
+percent_rank() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c4,
+cume_dist() over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c5,
+ntile(5) over (PARTITION BY p_mfgr ORDER BY p_retailprice) as c6
+from part
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+Manufacturer#1 1173.15 1 1 1 0.0 0.3333333333333333 1
+Manufacturer#1 1173.15 2 1 1 0.0 0.3333333333333333 1
+Manufacturer#1 1414.42 3 3 2 0.4 0.5 2
+Manufacturer#1 1602.59 4 4 3 0.6 0.6666666666666666 3
+Manufacturer#1 1632.66 5 5 4 0.8 0.8333333333333334 4
+Manufacturer#1 1753.76 6 6 5 1.0 1.0 5
+Manufacturer#2 1690.68 1 1 1 0.0 0.2 1
+Manufacturer#2 1698.66 2 2 2 0.25 0.4 2
+Manufacturer#2 1701.6 3 3 3 0.5 0.6 3
+Manufacturer#2 1800.7 4 4 4 0.75 0.8 4
+Manufacturer#2 2031.98 5 5 5 1.0 1.0 5
+Manufacturer#3 1190.27 1 1 1 0.0 0.2 1
+Manufacturer#3 1337.29 2 2 2 0.25 0.4 2
+Manufacturer#3 1410.39 3 3 3 0.5 0.6 3
+Manufacturer#3 1671.68 4 4 4 0.75 0.8 4
+Manufacturer#3 1922.98 5 5 5 1.0 1.0 5
+Manufacturer#4 1206.26 1 1 1 0.0 0.2 1
+Manufacturer#4 1290.35 2 2 2 0.25 0.4 2
+Manufacturer#4 1375.42 3 3 3 0.5 0.6 3
+Manufacturer#4 1620.67 4 4 4 0.75 0.8 4
+Manufacturer#4 1844.92 5 5 5 1.0 1.0 5
+Manufacturer#5 1018.1 1 1 1 0.0 0.2 1
+Manufacturer#5 1464.48 2 2 2 0.25 0.4 2
+Manufacturer#5 1611.66 3 3 3 0.5 0.6 3
+Manufacturer#5 1788.73 4 4 4 0.75 0.8 4
+Manufacturer#5 1789.69 5 5 5 1.0 1.0 5
+PREHOOK: query: -- 3. order by decimal
+
+select p_mfgr, p_retailprice,
+lag(p_retailprice) over (partition by p_mfgr ORDER BY p_retailprice desc) as c1
+from part
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: -- 3. order by decimal
+
+select p_mfgr, p_retailprice,
+lag(p_retailprice) over (partition by p_mfgr ORDER BY p_retailprice desc) as c1
+from part
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+Manufacturer#1 1753.76 NULL
+Manufacturer#1 1632.66 1753.76
+Manufacturer#1 1602.59 1632.66
+Manufacturer#1 1414.42 1602.59
+Manufacturer#1 1173.15 1414.42
+Manufacturer#1 1173.15 1173.15
+Manufacturer#2 2031.98 NULL
+Manufacturer#2 1800.7 2031.98
+Manufacturer#2 1701.6 1800.7
+Manufacturer#2 1698.66 1701.6
+Manufacturer#2 1690.68 1698.66
+Manufacturer#3 1922.98 NULL
+Manufacturer#3 1671.68 1922.98
+Manufacturer#3 1410.39 1671.68
+Manufacturer#3 1337.29 1410.39
+Manufacturer#3 1190.27 1337.29
+Manufacturer#4 1844.92 NULL
+Manufacturer#4 1620.67 1844.92
+Manufacturer#4 1375.42 1620.67
+Manufacturer#4 1290.35 1375.42
+Manufacturer#4 1206.26 1290.35
+Manufacturer#5 1789.69 NULL
+Manufacturer#5 1788.73 1789.69
+Manufacturer#5 1611.66 1788.73
+Manufacturer#5 1464.48 1611.66
+Manufacturer#5 1018.1 1464.48
+PREHOOK: query: -- 4. partition by decimal
+
+select p_mfgr, p_retailprice,
+lag(p_retailprice) over (partition by p_retailprice) as c1
+from part
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: -- 4. partition by decimal
+
+select p_mfgr, p_retailprice,
+lag(p_retailprice) over (partition by p_retailprice) as c1
+from part
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+Manufacturer#5 1018.1 NULL
+Manufacturer#1 1173.15 NULL
+Manufacturer#1 1173.15 1173.15
+Manufacturer#3 1190.27 NULL
+Manufacturer#4 1206.26 NULL
+Manufacturer#4 1290.35 NULL
+Manufacturer#3 1337.29 NULL
+Manufacturer#4 1375.42 NULL
+Manufacturer#3 1410.39 NULL
+Manufacturer#1 1414.42 NULL
+Manufacturer#5 1464.48 NULL
+Manufacturer#1 1602.59 NULL
+Manufacturer#5 1611.66 NULL
+Manufacturer#4 1620.67 NULL
+Manufacturer#1 1632.66 NULL
+Manufacturer#3 1671.68 NULL
+Manufacturer#2 1690.68 NULL
+Manufacturer#2 1698.66 NULL
+Manufacturer#2 1701.6 NULL
+Manufacturer#1 1753.76 NULL
+Manufacturer#5 1788.73 NULL
+Manufacturer#5 1789.69 NULL
+Manufacturer#2 1800.7 NULL
+Manufacturer#4 1844.92 NULL
+Manufacturer#3 1922.98 NULL
+Manufacturer#2 2031.98 NULL
Oops, something went wrong.

0 comments on commit b42d832

Please sign in to comment.