Skip to content
Permalink
Browse files
resolved conflicts
  • Loading branch information
myui committed Oct 19, 2016
1 parent 9c13afc commit f22de1bb6fe7bff193e253f6d47976d39b98ce22
Showing 50 changed files with 1,070 additions and 2,239 deletions.
@@ -18,8 +18,6 @@ env:

language: java
jdk:
# kuromoji requires Java 7 or later
# - openjdk6
- openjdk7
- oraclejdk7
- oraclejdk8
@@ -36,9 +34,9 @@ notifications:
email: false

script:
- mvn -q test -Pspark-2.0
- mvn -q scalastyle:check test -Pspark-2.0
# test the spark-1.6 module only in this second run
- mvn -q test -Pspark-1.6 -Dtest=org.apache.spark.*
- mvn -q scalastyle:check clean -Pspark-1.6 -pl spark/spark-1.6 -am test -Dtest=none

after_success:
- mvn clean cobertura:cobertura coveralls:report
@@ -120,6 +120,10 @@ System requirements

* Java 7 or later

* Spark 1.6 or 2.0 for Hivemall on Spark

* Pig 0.15 or later for Hivemall on Pig

Basic Usage
------------

Large diffs are not rendered by default.

@@ -18,47 +18,41 @@
package hivemall.ftvec.binning;

import hivemall.utils.hadoop.HiveUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.util.*;

@Description(
name = "feature_binning",
value = "_FUNC_(array<features::string> features, const map<string, array<double>> quantiles_map)"
+ " / _FUNC(int|bigint|float|double weight, const array<double> quantiles)"
value = "_FUNC_(array<features::string> features, const map<string, array<number>> quantiles_map)"
+ " / _FUNC(number weight, const array<number> quantiles)"
+ " - Returns binned features as an array<features::string> / bin ID as int")
@UDFType(deterministic = true, stateful = false)
public final class FeatureBinningUDF extends GenericUDF {
private boolean multiple = true;

private StandardListObjectInspector featuresOI;
private WritableStringObjectInspector featuresElOI;
private StandardMapObjectInspector quantilesMapOI;
private WritableStringObjectInspector keyOI;
private StandardListObjectInspector quantilesOI;
private WritableDoubleObjectInspector quantileOI;
private ListObjectInspector featuresOI;
private StringObjectInspector featureOI;
private MapObjectInspector quantilesMapOI;
private StringObjectInspector keyOI;
private ListObjectInspector quantilesOI;
private PrimitiveObjectInspector quantileOI;

private PrimitiveObjectInspector weightOI;

@@ -67,104 +61,88 @@ public final class FeatureBinningUDF extends GenericUDF {

@Override
public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException {
if (OIs.length != 2)
if (OIs.length != 2) {
throw new UDFArgumentLengthException("Specify two arguments");
}

if (HiveUtils.isListOI(OIs[0]) && HiveUtils.isMapOI(OIs[1])) {
// for (array<features::string> features, const map<string, array<number>> quantiles_map)

Category arg0Category = OIs[0].getCategory();
Category arg1Category = OIs[1].getCategory();

if (arg0Category == Category.LIST && arg1Category == Category.MAP) {
featuresOI = (StandardListObjectInspector) OIs[0];
isStringOI(featuresOI.getListElementObjectInspector(), 0, "features");
featuresElOI = (WritableStringObjectInspector) featuresOI.getListElementObjectInspector();
quantilesMapOI = (StandardMapObjectInspector) OIs[1];
isStringOI(quantilesMapOI.getMapKeyObjectInspector(), 1, "key of quantiles_map");
keyOI = (WritableStringObjectInspector) quantilesMapOI.getMapKeyObjectInspector();
isListOI(quantilesMapOI.getMapValueObjectInspector(), 1, "value of quantiles_map");
quantilesOI = (StandardListObjectInspector) quantilesMapOI.getMapValueObjectInspector();
isNumberOI(quantilesOI.getListElementObjectInspector(), 1, "value of quantiles");
quantileOI = (WritableDoubleObjectInspector) quantilesOI.getListElementObjectInspector();
if (!HiveUtils.isStringOI(((ListObjectInspector) OIs[0]).getListElementObjectInspector())) {
throw new UDFArgumentTypeException(0,
"Only array<string> type argument is acceptable but " + OIs[0].getTypeName()
+ " was passed as `features`");
}
featuresOI = HiveUtils.asListOI(OIs[0]);
featureOI = HiveUtils.asStringOI(featuresOI.getListElementObjectInspector());

quantilesMapOI = HiveUtils.asMapOI(OIs[1]);
if (!HiveUtils.isStringOI(quantilesMapOI.getMapKeyObjectInspector())
|| !HiveUtils.isListOI(quantilesMapOI.getMapValueObjectInspector())
|| !HiveUtils.isNumberOI(((ListObjectInspector) quantilesMapOI.getMapValueObjectInspector()).getListElementObjectInspector())) {
throw new UDFArgumentTypeException(1,
"Only map<string, array<number>> type argument is acceptable but "
+ OIs[1].getTypeName() + " was passed as `quantiles_map`");
}
keyOI = HiveUtils.asStringOI(quantilesMapOI.getMapKeyObjectInspector());
quantilesOI = HiveUtils.asListOI(quantilesMapOI.getMapValueObjectInspector());
quantileOI = HiveUtils.asDoubleCompatibleOI(quantilesOI.getListElementObjectInspector());

multiple = true;

return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
} else if (arg0Category == Category.PRIMITIVE && arg1Category == Category.LIST) {
isNumberOI(OIs[0], 0, "weight");
weightOI = (PrimitiveObjectInspector) OIs[0];
quantilesOI = (StandardListObjectInspector) OIs[1];
isNumberOI(quantilesOI.getListElementObjectInspector(), 1, "value of quantiles");
quantileOI = (WritableDoubleObjectInspector) quantilesOI.getListElementObjectInspector();
} else if (HiveUtils.isPrimitiveOI(OIs[0]) && HiveUtils.isListOI(OIs[1])) {
// for (number weight, const array<number> quantiles)

weightOI = HiveUtils.asDoubleCompatibleOI(OIs[0]);

quantilesOI = HiveUtils.asListOI(OIs[1]);
if (!HiveUtils.isNumberOI(quantilesOI.getListElementObjectInspector())) {
throw new UDFArgumentTypeException(1,
"Only array<number> type argument is acceptable but " + OIs[1].getTypeName()
+ " was passed as `quantiles`");
}
quantileOI = HiveUtils.asDoubleCompatibleOI(quantilesOI.getListElementObjectInspector());

multiple = false;

return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
} else {
throw new UDFArgumentTypeException(
0,
"Only <array<features::string>, map<string, array<double>>> "
+ "or <int|bigint|float|double, array<double>> type arguments are accepted but <"
throw new UDFArgumentTypeException(0,
"Only <array<features::string>, map<string, array<number>>> "
+ "or <number, array<number>> type arguments are accepted but <"
+ OIs[0].getTypeName() + ", " + OIs[1].getTypeName() + "> was passed.");
}
}

/**
 * Checks the given inspector with {@code HiveUtils.isStringOI} and fails loudly when the
 * check does not pass.
 *
 * @param OI the object inspector to check
 * @param index forwarded as the first constructor argument of the thrown exception
 * @param name label of the checked argument, embedded in the error message
 * @return {@code true} whenever the check passes; the method never returns {@code false}
 * @throws UDFArgumentTypeException if {@code HiveUtils.isStringOI(OI)} is false
 */
private boolean isStringOI(ObjectInspector OI, int index, String name)
        throws UDFArgumentTypeException {
    if (!HiveUtils.isStringOI(OI)) {
        final String message = "Only string type arguments are accepted but " + OI.getTypeName()
                + " was passed as `" + name + "`.";
        throw new UDFArgumentTypeException(index, message);
    }
    return true;
}

/**
 * Checks the given inspector with {@code HiveUtils.isListOI} and fails loudly when the
 * check does not pass.
 *
 * @param OI the object inspector to check
 * @param index forwarded as the first constructor argument of the thrown exception
 * @param name label of the checked argument, embedded in the error message
 * @return {@code true} whenever the check passes; the method never returns {@code false}
 * @throws UDFArgumentTypeException if {@code HiveUtils.isListOI(OI)} is false
 */
private boolean isListOI(ObjectInspector OI, int index, String name)
        throws UDFArgumentTypeException {
    if (!HiveUtils.isListOI(OI)) {
        final String message = "Only array type arguments are accepted but " + OI.getTypeName()
                + " was passed as `" + name + "`.";
        throw new UDFArgumentTypeException(index, message);
    }
    return true;
}

/**
 * Checks the given inspector with {@code HiveUtils.isNumberOI} and fails loudly when the
 * check does not pass.
 *
 * @param OI the object inspector to check
 * @param index forwarded as the first constructor argument of the thrown exception
 * @param name label of the checked argument, embedded in the error message
 * @return {@code true} whenever the check passes; the method never returns {@code false}
 * @throws UDFArgumentTypeException if {@code HiveUtils.isNumberOI(OI)} is false
 */
private boolean isNumberOI(ObjectInspector OI, int index, String name)
        throws UDFArgumentTypeException {
    if (!HiveUtils.isNumberOI(OI)) {
        final String message = "Only numeric type arguments are accepted but " + OI.getTypeName()
                + " was passed as `" + name + "`.";
        throw new UDFArgumentTypeException(index, message);
    }
    return true;
}

@Override
public Object evaluate(DeferredObject[] dObj) throws HiveException {
if (multiple) {
// init quantilesMap
if (quantilesMap == null) {
quantilesMap = new HashMap<Text, double[]>();
Map<?, ?> _quantilesMap = quantilesMapOI.getMap(dObj[1].get());
final Map<?, ?> _quantilesMap = quantilesMapOI.getMap(dObj[1].get());

for (Object _key : _quantilesMap.keySet()) {
Text key = new Text(keyOI.getPrimitiveJavaObject(_key));
double[] val = HiveUtils.asDoubleArray(_quantilesMap.get(key), quantilesOI,
quantileOI);
final Text key = new Text(keyOI.getPrimitiveJavaObject(_key));
final double[] val = HiveUtils.asDoubleArray(_quantilesMap.get(key),
quantilesOI, quantileOI);
quantilesMap.put(key, val);
}
}

List<?> fs = featuresOI.getList(dObj[0].get());
List<Text> result = new ArrayList<Text>();
final List<?> fs = featuresOI.getList(dObj[0].get());
final List<Text> result = new ArrayList<Text>();
for (Object f : fs) {
String entry = featuresElOI.getPrimitiveJavaObject(f);
int pos = entry.indexOf(":");
final String entry = featureOI.getPrimitiveJavaObject(f);
final int pos = entry.indexOf(":");

if (pos < 0) {
// categorical
result.add(new Text(entry));
} else {
// quantitative
Text key = new Text(entry.substring(0, pos));
final Text key = new Text(entry.substring(0, pos));
String val = entry.substring(pos + 1);

// binning
@@ -187,30 +165,20 @@ public Object evaluate(DeferredObject[] dObj) throws HiveException {
}
}

private int findBin(double[] _quantiles, double target) throws HiveException {
if (_quantiles.length < 3)
private int findBin(double[] _quantiles, double d) throws HiveException {
if (_quantiles.length < 3) {
throw new HiveException(
"Length of `quantiles` should be greater than or equal to three but "
+ _quantiles.length + ".");

int left = 0;
int right = _quantiles.length - 1;
int p = (left + right) / 2;
while (left + 1 != right) {
if (_quantiles[p] < target) {
left = p;
p = (left + right) / 2;
} else {
right = p;
p = (left + right) / 2;
}
}
return p;

int res = Arrays.binarySearch(_quantiles, d);
return (res < 0) ? ~res - 1 : (res == 0) ? 0 : res - 1;
}

@Override
public String getDisplayString(String[] children) {
StringBuilder sb = new StringBuilder();
final StringBuilder sb = new StringBuilder();
sb.append("feature_binning");
sb.append("(");
if (children.length > 0) {
@@ -18,15 +18,14 @@
package hivemall.ftvec.binning;

import hivemall.utils.lang.SizeOf;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;

/**
* **THIS CLASS IS IMPORTED FROM HIVE 2.1.0 FOR COMPATIBILITY**
*
@@ -51,6 +51,7 @@
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
@@ -161,13 +162,14 @@ public static List<String> asStringList(@Nonnull final DeferredObject arg,
}
return Arrays.asList(ary);
}

@Nonnull
public static StructObjectInspector asStructOI(@Nonnull final ObjectInspector oi) throws UDFArgumentException {
if(oi.getCategory() != Category.STRUCT) {
public static StructObjectInspector asStructOI(@Nonnull final ObjectInspector oi)
throws UDFArgumentException {
if (oi.getCategory() != Category.STRUCT) {
throw new UDFArgumentException("Expected Struct OI but got: " + oi.getTypeName());
}
return (StructObjectInspector) oi;
return (StructObjectInspector) oi;
}

public static boolean isPrimitiveOI(@Nonnull final ObjectInspector oi) {
@@ -240,6 +242,10 @@ public static boolean isListOI(@Nonnull final ObjectInspector oi) {
return category == Category.LIST;
}

/**
 * Tells whether the given object inspector reports the {@code MAP} category.
 *
 * @param oi the object inspector to examine
 * @return {@code true} if {@code oi.getCategory()} is {@code Category.MAP}
 */
public static boolean isMapOI(@Nonnull final ObjectInspector oi) {
    final Category category = oi.getCategory();
    return category == Category.MAP;
}

/**
 * Tells whether the given type info reports the {@code PRIMITIVE} category.
 *
 * @param typeInfo the type info to examine
 * @return {@code true} if {@code typeInfo.getCategory()} is {@code Category.PRIMITIVE}
 */
public static boolean isPrimitiveTypeInfo(@Nonnull TypeInfo typeInfo) {
    final ObjectInspector.Category category = typeInfo.getCategory();
    return category == ObjectInspector.Category.PRIMITIVE;
}
@@ -787,6 +793,15 @@ public static ListObjectInspector asListOI(@Nonnull final ObjectInspector oi)
return (ListObjectInspector) oi;
}

/**
 * Casts the given object inspector to a {@link MapObjectInspector} after verifying that it
 * reports the {@code MAP} category.
 *
 * @param oi the object inspector to cast
 * @return the same instance, typed as a {@link MapObjectInspector}
 * @throws UDFArgumentException if {@code oi.getCategory()} is not {@code Category.MAP}
 */
@Nonnull
public static MapObjectInspector asMapOI(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    if (oi.getCategory() != Category.MAP) {
        // Report the type name (as asStructOI does) rather than concatenating the
        // inspector object itself into the message.
        throw new UDFArgumentException("Expected Map OI but was: " + oi.getTypeName());
    }
    return (MapObjectInspector) oi;
}

public static void validateFeatureOI(@Nonnull final ObjectInspector oi)
throws UDFArgumentException {
final String typeName = oi.getTypeName();

0 comments on commit f22de1b

Please sign in to comment.