Skip to content
Permalink
Browse files
new tuple sketch UDFs
  • Loading branch information
AlexanderSaydakov committed Sep 8, 2017
1 parent 7448100 commit 909ce0f1d997ad6de8de26b2104934699ec5ae9a
Showing 6 changed files with 366 additions and 0 deletions.
@@ -0,0 +1,47 @@
/*
* Copyright 2017, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/

package com.yahoo.sketches.pig.tuple;

import java.io.IOException;
import java.util.Arrays;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.tuple.ArrayOfDoublesSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesSketches;

/**
 * This is a User Defined Function (UDF) for obtaining the unique count estimate
 * along with a lower and upper bound from an ArrayOfDoublesSketch.
 *
 * <p>The input is a tuple whose first field is a DataByteArray holding a serialized
 * ArrayOfDoublesSketch.
 *
 * <p>The result is a tuple with three double values: estimate, lower bound and upper bound.
 * The bounds are given at 95.5% confidence.
 * The result is null if the input tuple is null or empty, or if its first field is null.
 *
 * @author Alexander Saydakov
 */
public class ArrayOfDoublesSketchToEstimateAndErrorBounds extends EvalFunc<Tuple> {

  // Argument passed to getLowerBound() and getUpperBound(); selects the documented confidence level.
  private static final int NUM_STD_DEVS = 2;

  /**
   * Computes the estimate and its error bounds for the sketch in the first field of the input.
   *
   * @param input tuple with a serialized ArrayOfDoublesSketch (DataByteArray) in field 0
   * @return tuple of (estimate, lower bound, upper bound), or null if there is no sketch to read
   * @throws IOException if the field cannot be read from the input tuple
   */
  @Override
  public Tuple exec(final Tuple input) throws IOException {
    if ((input == null) || (input.size() == 0)) {
      return null;
    }

    final DataByteArray dba = (DataByteArray) input.get(0);
    if (dba == null) {
      // A null field is ordinary in Pig data; answer with null instead of failing on dba.get()
      return null;
    }
    final ArrayOfDoublesSketch sketch = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(dba.get()));

    return TupleFactory.getInstance().newTuple(Arrays.asList(
      sketch.getEstimate(),
      sketch.getLowerBound(NUM_STD_DEVS),
      sketch.getUpperBound(NUM_STD_DEVS)
    ));
  }

}
@@ -0,0 +1,40 @@
/*
* Copyright 2017, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/

package com.yahoo.sketches.pig.tuple;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.tuple.ArrayOfDoublesSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesSketches;

/**
 * This is a User Defined Function (UDF) for obtaining the number of retained entries
 * from an ArrayOfDoublesSketch.
 *
 * <p>The input is a tuple whose first field is a DataByteArray holding a serialized
 * ArrayOfDoublesSketch.
 *
 * <p>The result is an integer value.
 * The result is null if the input tuple is null or empty, or if its first field is null.
 *
 * @author Alexander Saydakov
 */
public class ArrayOfDoublesSketchToNumberOfRetainedEntries extends EvalFunc<Integer> {

  /**
   * Reads the sketch from the first field of the input and reports how many entries it retains.
   *
   * @param input tuple with a serialized ArrayOfDoublesSketch (DataByteArray) in field 0
   * @return number of retained entries, or null if there is no sketch to read
   * @throws IOException if the field cannot be read from the input tuple
   */
  @Override
  public Integer exec(final Tuple input) throws IOException {
    if ((input == null) || (input.size() == 0)) {
      return null;
    }

    final DataByteArray dba = (DataByteArray) input.get(0);
    if (dba == null) {
      // A null field is ordinary in Pig data; answer with null instead of failing on dba.get()
      return null;
    }
    final ArrayOfDoublesSketch sketch = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(dba.get()));

    return sketch.getRetainedEntries();
  }

}
@@ -0,0 +1,75 @@
/*
* Copyright 2017, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/

package com.yahoo.sketches.pig.tuple;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.quantiles.DoublesSketchBuilder;
import com.yahoo.sketches.quantiles.UpdateDoublesSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesSketchIterator;
import com.yahoo.sketches.tuple.ArrayOfDoublesSketches;

/**
 * This UDF converts a given column of double values from an ArrayOfDoubles sketch
 * to a quantiles DoublesSketch to further analyze the distribution of these values.
 *
 * <p>The input is a tuple whose first field is a DataByteArray holding a serialized
 * ArrayOfDoublesSketch, optionally followed by a second field with the 1-based column
 * number (the first column is used if the second field is absent).
 *
 * <p>The result will be a DataByteArray with serialized quantiles sketch.
 * The result is null if the input tuple is null or empty, or if its first field is null.
 */
public class ArrayOfDoublesSketchToQuantilesSketch extends EvalFunc<DataByteArray> {

  // Values that are not positive mean "do not call setK(), keep the builder's own k"
  private final int k;

  /**
   * Constructor with default parameter k for quantiles sketch
   */
  public ArrayOfDoublesSketchToQuantilesSketch() {
    k = 0;
  }

  /**
   * Constructor with a given parameter k for quantiles sketch, given as a string.
   * Pig passes UDF constructor arguments as strings, so this is the form usable
   * from a Pig script.
   * @param k string representation of the parameter that determines the accuracy
   * and size of the quantiles sketch
   * @throws NumberFormatException if the string is not a parsable integer
   */
  public ArrayOfDoublesSketchToQuantilesSketch(final String k) {
    this(Integer.parseInt(k));
  }

  /**
   * Constructor with a given parameter k for quantiles sketch
   * @param k parameter that determines the accuracy and size of the quantiles sketch
   */
  public ArrayOfDoublesSketchToQuantilesSketch(final int k) {
    this.k = k;
  }

  /**
   * Feeds the selected column of every retained entry of the input sketch into a new
   * quantiles sketch and returns that sketch in compact serialized form.
   *
   * @param input tuple with a serialized ArrayOfDoublesSketch (DataByteArray) in field 0
   * and an optional 1-based column number in field 1
   * @return serialized compact quantiles sketch, or null if there is no sketch to read
   * @throws IOException if a field cannot be read from the input tuple
   * @throws IllegalArgumentException if the column number is less than 1 or greater than
   * the number of values in the given sketch
   */
  @Override
  public DataByteArray exec(final Tuple input) throws IOException {
    if ((input == null) || (input.size() == 0)) {
      return null;
    }

    final DataByteArray dba = (DataByteArray) input.get(0);
    if (dba == null) {
      // A null field is ordinary in Pig data; answer with null instead of failing on dba.get()
      return null;
    }
    final ArrayOfDoublesSketch sketch = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(dba.get()));

    int column = 1;
    if (input.size() > 1) {
      column = (int) input.get(1);
      if ((column < 1) || (column > sketch.getNumValues())) {
        throw new IllegalArgumentException("Column number out of range. The given sketch has "
          + sketch.getNumValues() + " columns");
      }
    }

    final DoublesSketchBuilder builder = UpdateDoublesSketch.builder();
    if (k > 0) {
      builder.setK(k);
    }
    final UpdateDoublesSketch qs = builder.build();

    // Column numbers are 1-based for the caller, array positions are 0-based
    final int valueIndex = column - 1;
    final ArrayOfDoublesSketchIterator it = sketch.iterator();
    while (it.next()) {
      qs.update(it.getValues()[valueIndex]);
    }
    return new DataByteArray(qs.compact().toByteArray());
  }

}
@@ -0,0 +1,82 @@
/*
* Copyright 2017, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/

package com.yahoo.sketches.pig.tuple;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.junit.Assert;
import org.testng.annotations.Test;

import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder;

/**
 * Tests for ArrayOfDoublesSketchToEstimateAndErrorBounds: null and empty inputs,
 * an empty sketch, a sketch with one key, and a sketch fed more keys than its
 * default nominal number of entries.
 */
public class ArrayOfDoublesSketchToEstimateAndErrorBoundsTest {

  static final TupleFactory tupleFactory = TupleFactory.getInstance();

  // Compacts and serializes the sketch the way it would arrive as a UDF input field
  private static DataByteArray serialize(final ArrayOfDoublesUpdatableSketch sketch) {
    final byte[] bytes = sketch.compact().toByteArray();
    return new DataByteArray(bytes);
  }

  @Test
  public void nullInput() throws Exception {
    final EvalFunc<Tuple> udf = new ArrayOfDoublesSketchToEstimateAndErrorBounds();
    final Tuple output = udf.exec(null);
    Assert.assertNull(output);
  }

  @Test
  public void emptyInputTuple() throws Exception {
    final EvalFunc<Tuple> udf = new ArrayOfDoublesSketchToEstimateAndErrorBounds();
    final Tuple output = udf.exec(tupleFactory.newTuple());
    Assert.assertNull(output);
  }

  @Test
  public void emptyInputSketch() throws Exception {
    final EvalFunc<Tuple> udf = new ArrayOfDoublesSketchToEstimateAndErrorBounds();
    final ArrayOfDoublesUpdatableSketch emptySketch = new ArrayOfDoublesUpdatableSketchBuilder().build();
    final Tuple output = udf.exec(tupleFactory.newTuple(serialize(emptySketch)));
    Assert.assertNotNull(output);
    Assert.assertEquals(output.size(), 3);
    // estimate, lower bound, upper bound
    Assert.assertEquals(output.get(0), 0.0);
    Assert.assertEquals(output.get(1), 0.0);
    Assert.assertEquals(output.get(2), 0.0);
  }

  @Test
  public void nonEmptyInputSketchExactMode() throws Exception {
    final EvalFunc<Tuple> udf = new ArrayOfDoublesSketchToEstimateAndErrorBounds();
    final ArrayOfDoublesUpdatableSketch oneKeySketch = new ArrayOfDoublesUpdatableSketchBuilder().build();
    oneKeySketch.update(1, new double[] {0});
    final Tuple output = udf.exec(tupleFactory.newTuple(serialize(oneKeySketch)));
    Assert.assertNotNull(output);
    Assert.assertEquals(output.size(), 3);
    // estimate, lower bound, upper bound
    Assert.assertEquals(output.get(0), 1.0);
    Assert.assertEquals(output.get(1), 1.0);
    Assert.assertEquals(output.get(2), 1.0);
  }

  @Test
  public void nonEmptyInputSketchEstimationMode() throws Exception {
    final EvalFunc<Tuple> udf = new ArrayOfDoublesSketchToEstimateAndErrorBounds();
    final ArrayOfDoublesUpdatableSketch bigSketch = new ArrayOfDoublesUpdatableSketchBuilder().build();
    // to saturate the sketch with default number of nominal entries (4K)
    final int numKeys = 10000;
    int key = 0;
    while (key < numKeys) {
      bigSketch.update(key, new double[] {0});
      key++;
    }
    final Tuple output = udf.exec(tupleFactory.newTuple(serialize(bigSketch)));
    Assert.assertNotNull(output);
    Assert.assertEquals(output.size(), 3);

    final double est = (double) output.get(0);
    final double lb = (double) output.get(1);
    final double ub = (double) output.get(2);
    final double tolerance = numKeys * 0.04;
    Assert.assertEquals(est, numKeys, tolerance);
    Assert.assertEquals(lb, numKeys, tolerance);
    Assert.assertEquals(ub, numKeys, tolerance);
    // the bounds must strictly bracket the estimate
    Assert.assertTrue(est > lb);
    Assert.assertTrue(est < ub);
  }

}
@@ -0,0 +1,54 @@
/*
* Copyright 2017, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/

package com.yahoo.sketches.pig.tuple;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.TupleFactory;
import org.junit.Assert;
import org.testng.annotations.Test;

import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder;

/**
 * Tests for ArrayOfDoublesSketchToNumberOfRetainedEntries: null and empty inputs,
 * an empty sketch and a sketch with one key.
 */
public class ArrayOfDoublesSketchToNumberOfRetainedEntriesTest {

  static final TupleFactory tupleFactory = TupleFactory.getInstance();

  // Compacts and serializes the sketch the way it would arrive as a UDF input field
  private static DataByteArray serialize(final ArrayOfDoublesUpdatableSketch sketch) {
    final byte[] bytes = sketch.compact().toByteArray();
    return new DataByteArray(bytes);
  }

  @Test
  public void nullInput() throws Exception {
    final EvalFunc<Integer> udf = new ArrayOfDoublesSketchToNumberOfRetainedEntries();
    final Integer retained = udf.exec(null);
    Assert.assertNull(retained);
  }

  @Test
  public void emptyInputTuple() throws Exception {
    final EvalFunc<Integer> udf = new ArrayOfDoublesSketchToNumberOfRetainedEntries();
    final Integer retained = udf.exec(tupleFactory.newTuple());
    Assert.assertNull(retained);
  }

  @Test
  public void emptyInputSketch() throws Exception {
    final EvalFunc<Integer> udf = new ArrayOfDoublesSketchToNumberOfRetainedEntries();
    final ArrayOfDoublesUpdatableSketch emptySketch = new ArrayOfDoublesUpdatableSketchBuilder().build();
    final Integer retained = udf.exec(tupleFactory.newTuple(serialize(emptySketch)));
    Assert.assertNotNull(retained);
    Assert.assertEquals(retained.intValue(), 0);
  }

  @Test
  public void nonEmptyInputSketch() throws Exception {
    final EvalFunc<Integer> udf = new ArrayOfDoublesSketchToNumberOfRetainedEntries();
    final ArrayOfDoublesUpdatableSketch oneKeySketch = new ArrayOfDoublesUpdatableSketchBuilder().build();
    oneKeySketch.update(1, new double[] {0});
    final Integer retained = udf.exec(tupleFactory.newTuple(serialize(oneKeySketch)));
    Assert.assertNotNull(retained);
    Assert.assertEquals(retained.intValue(), 1);
  }

}
@@ -0,0 +1,68 @@
/*
* Copyright 2017, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/

package com.yahoo.sketches.pig.tuple;

import java.util.Arrays;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.TupleFactory;
import org.testng.Assert;
import org.testng.annotations.Test;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.quantiles.DoublesSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch;
import com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder;

/**
 * Tests for ArrayOfDoublesSketchToQuantilesSketch: null and empty inputs,
 * an empty sketch, and a two-column sketch converted with an explicit k
 * and an explicit column number.
 */
public class ArrayOfDoublesSketchToQuantilesSketchTest {

  static final TupleFactory tupleFactory = TupleFactory.getInstance();

  // Compacts and serializes the sketch the way it would arrive as a UDF input field
  private static DataByteArray serialize(final ArrayOfDoublesUpdatableSketch sketch) {
    final byte[] bytes = sketch.compact().toByteArray();
    return new DataByteArray(bytes);
  }

  // Wraps the bytes produced by the UDF as a quantiles sketch for inspection
  private static DoublesSketch toQuantilesSketch(final DataByteArray serialized) {
    return DoublesSketch.wrap(Memory.wrap(serialized.get()));
  }

  @Test
  public void nullInput() throws Exception {
    final EvalFunc<DataByteArray> udf = new ArrayOfDoublesSketchToQuantilesSketch();
    final DataByteArray output = udf.exec(null);
    Assert.assertNull(output);
  }

  @Test
  public void emptyInputTuple() throws Exception {
    final EvalFunc<DataByteArray> udf = new ArrayOfDoublesSketchToQuantilesSketch();
    final DataByteArray output = udf.exec(tupleFactory.newTuple());
    Assert.assertNull(output);
  }

  @Test
  public void emptyInputSketch() throws Exception {
    final EvalFunc<DataByteArray> udf = new ArrayOfDoublesSketchToQuantilesSketch();
    final ArrayOfDoublesUpdatableSketch emptySketch = new ArrayOfDoublesUpdatableSketchBuilder().build();
    final DataByteArray output = udf.exec(tupleFactory.newTuple(serialize(emptySketch)));
    Assert.assertNotNull(output);
    final DoublesSketch quantiles = toQuantilesSketch(output);
    Assert.assertTrue(quantiles.isEmpty());
  }

  @Test
  public void nonEmptyInputSketchWithTwoColumnsExplicitK() throws Exception {
    final int k = 256;
    final EvalFunc<DataByteArray> udf = new ArrayOfDoublesSketchToQuantilesSketch(k);

    final ArrayOfDoublesUpdatableSketchBuilder sketchBuilder = new ArrayOfDoublesUpdatableSketchBuilder();
    final ArrayOfDoublesUpdatableSketch twoColumnSketch = sketchBuilder.setNumberOfValues(2).build();
    twoColumnSketch.update(1, new double[] {1.0, 2.0});
    twoColumnSketch.update(2, new double[] {10.0, 20.0});

    // second field selects column 2, i.e. the values 2.0 and 20.0
    final DataByteArray output = udf.exec(tupleFactory.newTuple(Arrays.asList(
      serialize(twoColumnSketch),
      2
    )));
    Assert.assertNotNull(output);

    final DoublesSketch quantiles = toQuantilesSketch(output);
    Assert.assertFalse(quantiles.isEmpty());
    Assert.assertEquals(quantiles.getK(), k);
    Assert.assertEquals(quantiles.getMinValue(), 2.0);
    Assert.assertEquals(quantiles.getMaxValue(), 20.0);
  }

}

0 comments on commit 909ce0f

Please sign in to comment.