Permalink
Show file tree
Hide file tree
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
new tuple sketch UDFs
- Loading branch information
1 parent
7448100
commit 909ce0f1d997ad6de8de26b2104934699ec5ae9a
Showing
6 changed files
with
366 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
* Copyright 2017, Yahoo! Inc. | ||
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms. | ||
*/ | ||
|
||
package com.yahoo.sketches.pig.tuple; | ||
|
||
import java.io.IOException; | ||
import java.util.Arrays; | ||
|
||
import org.apache.pig.EvalFunc; | ||
import org.apache.pig.data.DataByteArray; | ||
import org.apache.pig.data.Tuple; | ||
import org.apache.pig.data.TupleFactory; | ||
|
||
import com.yahoo.memory.Memory; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesSketches; | ||
|
||
/** | ||
* This is a User Defined Function (UDF) for obtaining the unique count estimate | ||
* along with a lower and upper bound from an ArrayOfDoublesSketch. | ||
* | ||
* <p>The result is a tuple with three double values: estimate, lower bound and upper bound. | ||
* The bounds are given at 95.5% confidence. | ||
* | ||
* @author Alexander Saydakov | ||
*/ | ||
public class ArrayOfDoublesSketchToEstimateAndErrorBounds extends EvalFunc<Tuple> { | ||
|
||
@Override | ||
public Tuple exec(final Tuple input) throws IOException { | ||
if ((input == null) || (input.size() == 0)) { | ||
return null; | ||
} | ||
|
||
final DataByteArray dba = (DataByteArray) input.get(0); | ||
final ArrayOfDoublesSketch sketch = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(dba.get())); | ||
|
||
return TupleFactory.getInstance().newTuple(Arrays.asList( | ||
sketch.getEstimate(), | ||
sketch.getLowerBound(2), | ||
sketch.getUpperBound(2) | ||
)); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/* | ||
* Copyright 2017, Yahoo! Inc. | ||
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms. | ||
*/ | ||
|
||
package com.yahoo.sketches.pig.tuple; | ||
|
||
import java.io.IOException; | ||
|
||
import org.apache.pig.EvalFunc; | ||
import org.apache.pig.data.DataByteArray; | ||
import org.apache.pig.data.Tuple; | ||
|
||
import com.yahoo.memory.Memory; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesSketches; | ||
|
||
/** | ||
* This is a User Defined Function (UDF) for obtaining the number of retained entries | ||
* from an ArrayOfDoublesSketch. | ||
* | ||
* <p>The result is an integer value. | ||
* | ||
* @author Alexander Saydakov | ||
*/ | ||
public class ArrayOfDoublesSketchToNumberOfRetainedEntries extends EvalFunc<Integer> { | ||
|
||
@Override | ||
public Integer exec(final Tuple input) throws IOException { | ||
if ((input == null) || (input.size() == 0)) { | ||
return null; | ||
} | ||
|
||
final DataByteArray dba = (DataByteArray) input.get(0); | ||
final ArrayOfDoublesSketch sketch = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(dba.get())); | ||
|
||
return sketch.getRetainedEntries(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
/* | ||
* Copyright 2017, Yahoo! Inc. | ||
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms. | ||
*/ | ||
|
||
package com.yahoo.sketches.pig.tuple; | ||
|
||
import java.io.IOException; | ||
|
||
import org.apache.pig.EvalFunc; | ||
import org.apache.pig.data.DataByteArray; | ||
import org.apache.pig.data.Tuple; | ||
|
||
import com.yahoo.memory.Memory; | ||
import com.yahoo.sketches.quantiles.DoublesSketchBuilder; | ||
import com.yahoo.sketches.quantiles.UpdateDoublesSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesSketchIterator; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesSketches; | ||
|
||
/** | ||
* This UDF converts a given column of double values from an ArrayOfDoubles sketch | ||
* to a quantiles DoublesSketch to further analyze the distribution of these values. | ||
* The result will be a DataByteArray with serialized quantiles sketch. | ||
*/ | ||
public class ArrayOfDoublesSketchToQuantilesSketch extends EvalFunc<DataByteArray> { | ||
|
||
private final int k; | ||
|
||
/** | ||
* Constructor with default parameter k for quantiles sketch | ||
*/ | ||
public ArrayOfDoublesSketchToQuantilesSketch() { | ||
k = 0; | ||
} | ||
|
||
/** | ||
* Constructor with a given parameter k for quantiles sketch | ||
* @param k parameter that determines the accuracy and size of the quantiles sketch | ||
*/ | ||
public ArrayOfDoublesSketchToQuantilesSketch(final int k) { | ||
this.k = k; | ||
} | ||
|
||
@Override | ||
public DataByteArray exec(final Tuple input) throws IOException { | ||
if ((input == null) || (input.size() == 0)) { | ||
return null; | ||
} | ||
|
||
final DataByteArray dba = (DataByteArray) input.get(0); | ||
final ArrayOfDoublesSketch sketch = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(dba.get())); | ||
|
||
int column = 1; | ||
if (input.size() > 1) { | ||
column = (int) input.get(1); | ||
if (column < 1 || column > sketch.getNumValues()) { | ||
throw new IllegalArgumentException("Column number out of range. The given sketch has " | ||
+ sketch.getNumValues() + " columns"); | ||
} | ||
} | ||
|
||
final DoublesSketchBuilder builder = UpdateDoublesSketch.builder(); | ||
if (k > 0) { | ||
builder.setK(k); | ||
} | ||
final UpdateDoublesSketch qs = builder.build(); | ||
final ArrayOfDoublesSketchIterator it = sketch.iterator(); | ||
while (it.next()) { | ||
qs.update(it.getValues()[column - 1]); | ||
} | ||
return new DataByteArray(qs.compact().toByteArray()); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
/* | ||
* Copyright 2017, Yahoo! Inc. | ||
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms. | ||
*/ | ||
|
||
package com.yahoo.sketches.pig.tuple; | ||
|
||
import org.apache.pig.EvalFunc; | ||
import org.apache.pig.data.DataByteArray; | ||
import org.apache.pig.data.Tuple; | ||
import org.apache.pig.data.TupleFactory; | ||
import org.junit.Assert; | ||
import org.testng.annotations.Test; | ||
|
||
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder; | ||
|
||
public class ArrayOfDoublesSketchToEstimateAndErrorBoundsTest { | ||
|
||
static final TupleFactory tupleFactory = TupleFactory.getInstance(); | ||
|
||
@Test | ||
public void nullInput() throws Exception { | ||
EvalFunc<Tuple> func = new ArrayOfDoublesSketchToEstimateAndErrorBounds(); | ||
Tuple resultTuple = func.exec(null); | ||
Assert.assertNull(resultTuple); | ||
} | ||
|
||
@Test | ||
public void emptyInputTuple() throws Exception { | ||
EvalFunc<Tuple> func = new ArrayOfDoublesSketchToEstimateAndErrorBounds(); | ||
Tuple resultTuple = func.exec(tupleFactory.newTuple()); | ||
Assert.assertNull(resultTuple); | ||
} | ||
|
||
@Test | ||
public void emptyInputSketch() throws Exception { | ||
EvalFunc<Tuple> func = new ArrayOfDoublesSketchToEstimateAndErrorBounds(); | ||
ArrayOfDoublesUpdatableSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().build(); | ||
Tuple resultTuple = func.exec(tupleFactory.newTuple(new DataByteArray(sketch.compact().toByteArray()))); | ||
Assert.assertNotNull(resultTuple); | ||
Assert.assertEquals(resultTuple.size(), 3); | ||
Assert.assertEquals(resultTuple.get(0), 0.0); | ||
Assert.assertEquals(resultTuple.get(1), 0.0); | ||
Assert.assertEquals(resultTuple.get(2), 0.0); | ||
} | ||
|
||
@Test | ||
public void nonEmptyInputSketchExactMode() throws Exception { | ||
EvalFunc<Tuple> func = new ArrayOfDoublesSketchToEstimateAndErrorBounds(); | ||
ArrayOfDoublesUpdatableSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().build(); | ||
sketch.update(1, new double[] {0}); | ||
Tuple resultTuple = func.exec(tupleFactory.newTuple(new DataByteArray(sketch.compact().toByteArray()))); | ||
Assert.assertNotNull(resultTuple); | ||
Assert.assertEquals(resultTuple.size(), 3); | ||
Assert.assertEquals(resultTuple.get(0), 1.0); | ||
Assert.assertEquals(resultTuple.get(1), 1.0); | ||
Assert.assertEquals(resultTuple.get(2), 1.0); | ||
} | ||
|
||
@Test | ||
public void nonEmptyInputSketchEstimationMode() throws Exception { | ||
EvalFunc<Tuple> func = new ArrayOfDoublesSketchToEstimateAndErrorBounds(); | ||
ArrayOfDoublesUpdatableSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().build(); | ||
int numKeys = 10000; // to saturate the sketch with default number of nominal entries (4K) | ||
for (int i = 0; i < numKeys; i++ ) { | ||
sketch.update(i, new double[] {0}); | ||
} | ||
Tuple resultTuple = func.exec(tupleFactory.newTuple(new DataByteArray(sketch.compact().toByteArray()))); | ||
Assert.assertNotNull(resultTuple); | ||
Assert.assertEquals(resultTuple.size(), 3); | ||
double estimate = (double) resultTuple.get(0); | ||
double lowerBound = (double) resultTuple.get(1); | ||
double upperBound = (double) resultTuple.get(2); | ||
Assert.assertEquals(estimate, numKeys, numKeys * 0.04); | ||
Assert.assertEquals(lowerBound, numKeys, numKeys * 0.04); | ||
Assert.assertEquals(upperBound, numKeys, numKeys * 0.04); | ||
Assert.assertTrue(lowerBound < estimate); | ||
Assert.assertTrue(upperBound > estimate); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
/* | ||
* Copyright 2017, Yahoo! Inc. | ||
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms. | ||
*/ | ||
|
||
package com.yahoo.sketches.pig.tuple; | ||
|
||
import org.apache.pig.EvalFunc; | ||
import org.apache.pig.data.DataByteArray; | ||
import org.apache.pig.data.TupleFactory; | ||
import org.junit.Assert; | ||
import org.testng.annotations.Test; | ||
|
||
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder; | ||
|
||
public class ArrayOfDoublesSketchToNumberOfRetainedEntriesTest { | ||
|
||
static final TupleFactory tupleFactory = TupleFactory.getInstance(); | ||
|
||
@Test | ||
public void nullInput() throws Exception { | ||
EvalFunc<Integer> func = new ArrayOfDoublesSketchToNumberOfRetainedEntries(); | ||
Integer result = func.exec(null); | ||
Assert.assertNull(result); | ||
} | ||
|
||
@Test | ||
public void emptyInputTuple() throws Exception { | ||
EvalFunc<Integer> func = new ArrayOfDoublesSketchToNumberOfRetainedEntries(); | ||
Integer result = func.exec(tupleFactory.newTuple()); | ||
Assert.assertNull(result); | ||
} | ||
|
||
@Test | ||
public void emptyInputSketch() throws Exception { | ||
EvalFunc<Integer> func = new ArrayOfDoublesSketchToNumberOfRetainedEntries(); | ||
ArrayOfDoublesUpdatableSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().build(); | ||
Integer result = func.exec(tupleFactory.newTuple(new DataByteArray(sketch.compact().toByteArray()))); | ||
Assert.assertNotNull(result); | ||
Assert.assertEquals((int) result, 0); | ||
} | ||
|
||
@Test | ||
public void nonEmptyInputSketch() throws Exception { | ||
EvalFunc<Integer> func = new ArrayOfDoublesSketchToNumberOfRetainedEntries(); | ||
ArrayOfDoublesUpdatableSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().build(); | ||
sketch.update(1, new double[] {0}); | ||
Integer result = func.exec(tupleFactory.newTuple(new DataByteArray(sketch.compact().toByteArray()))); | ||
Assert.assertNotNull(result); | ||
Assert.assertEquals((int) result, 1); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
/* | ||
* Copyright 2017, Yahoo! Inc. | ||
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms. | ||
*/ | ||
|
||
package com.yahoo.sketches.pig.tuple; | ||
|
||
import java.util.Arrays; | ||
|
||
import org.apache.pig.EvalFunc; | ||
import org.apache.pig.data.DataByteArray; | ||
import org.apache.pig.data.TupleFactory; | ||
import org.testng.Assert; | ||
import org.testng.annotations.Test; | ||
|
||
import com.yahoo.memory.Memory; | ||
import com.yahoo.sketches.quantiles.DoublesSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch; | ||
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder; | ||
|
||
public class ArrayOfDoublesSketchToQuantilesSketchTest { | ||
|
||
static final TupleFactory tupleFactory = TupleFactory.getInstance(); | ||
|
||
@Test | ||
public void nullInput() throws Exception { | ||
EvalFunc<DataByteArray> func = new ArrayOfDoublesSketchToQuantilesSketch(); | ||
DataByteArray result = func.exec(null); | ||
Assert.assertNull(result); | ||
} | ||
|
||
@Test | ||
public void emptyInputTuple() throws Exception { | ||
EvalFunc<DataByteArray> func = new ArrayOfDoublesSketchToQuantilesSketch(); | ||
DataByteArray result = func.exec(TupleFactory.getInstance().newTuple()); | ||
Assert.assertNull(result); | ||
} | ||
|
||
@Test | ||
public void emptyInputSketch() throws Exception { | ||
EvalFunc<DataByteArray> func = new ArrayOfDoublesSketchToQuantilesSketch(); | ||
ArrayOfDoublesUpdatableSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().build(); | ||
DataByteArray result = func.exec(tupleFactory.newTuple(new DataByteArray(sketch.compact().toByteArray()))); | ||
Assert.assertNotNull(result); | ||
DoublesSketch quantilesSketch = DoublesSketch.wrap(Memory.wrap(result.get())); | ||
Assert.assertTrue(quantilesSketch.isEmpty()); | ||
} | ||
|
||
@Test | ||
public void nonEmptyInputSketchWithTwoColumnsExplicitK() throws Exception { | ||
int k = 256; | ||
EvalFunc<DataByteArray> func = new ArrayOfDoublesSketchToQuantilesSketch(k); | ||
ArrayOfDoublesUpdatableSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().setNumberOfValues(2).build(); | ||
sketch.update(1, new double[] {1.0, 2.0}); | ||
sketch.update(2, new double[] {10.0, 20.0}); | ||
DataByteArray result = func.exec(tupleFactory.newTuple(Arrays.asList( | ||
new DataByteArray(sketch.compact().toByteArray()), | ||
2 | ||
))); | ||
Assert.assertNotNull(result); | ||
DoublesSketch quantilesSketch = DoublesSketch.wrap(Memory.wrap(result.get())); | ||
Assert.assertFalse(quantilesSketch.isEmpty()); | ||
Assert.assertEquals(quantilesSketch.getK(), k); | ||
Assert.assertEquals(quantilesSketch.getMinValue(), 2.0); | ||
Assert.assertEquals(quantilesSketch.getMaxValue(), 20.0); | ||
} | ||
|
||
} |