From 8efe5597a553eaab4f4ed8724e7fbae820adbb71 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Wed, 13 Aug 2014 18:38:53 +0900 Subject: [PATCH 01/18] initial work. --- .../java/org/apache/tajo/datum/Datum.java | 4 + .../java/org/apache/tajo/datum/TextDatum.java | 7 ++ .../org/apache/tajo/util/StringUtils.java | 8 ++ .../planner/RangePartitionAlgorithm.java | 62 ++++++++++++--- .../engine/planner/UniformRangePartition.java | 23 +++++- .../planner/TestUniformRangePartition.java | 79 +++++++++++++++++-- .../org/apache/tajo/jdbc/MetaDataTuple.java | 5 ++ .../org/apache/tajo/storage/FrameTuple.java | 5 ++ .../org/apache/tajo/storage/LazyTuple.java | 5 ++ .../java/org/apache/tajo/storage/Tuple.java | 2 + .../java/org/apache/tajo/storage/VTuple.java | 5 ++ 11 files changed, 181 insertions(+), 24 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java b/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java index 874004b5ba..227cedf219 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java @@ -92,6 +92,10 @@ public String asChars() { throw new InvalidCastException(type, Type.TEXT); } + public char [] asUnicodeChars() { + throw new InvalidCastException(type, Type.TEXT); + } + public byte[] asTextBytes() { return toString().getBytes(); } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index e8424b3147..bdef182292 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -20,6 +20,7 @@ import com.google.common.primitives.UnsignedBytes; import com.google.gson.annotations.Expose; +import com.sun.tools.javac.util.Convert; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; @@ -84,10 +85,16 @@ public byte[] asByteArray() { return this.bytes; } + @Override public String asChars() { return new String(this.bytes); } + @Override + public char[] asUnicodeChars() { + return Convert.utf2chars(this.bytes); + } + @Override public byte[] asTextBytes() { return this.bytes; diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java index 90391a8e2a..1237493763 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java @@ -25,6 +25,8 @@ import org.apache.hadoop.util.ShutdownHookManager; import org.apache.hadoop.util.SignalLogger; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; import java.util.Arrays; import java.util.BitSet; @@ -71,6 +73,12 @@ public static String formatTime(long timeDiff){ return buf.toString(); } + static CharsetEncoder asciiEncoder = Charset.forName("US-ASCII").newEncoder(); // or "ISO-8859-1" for ISO Latin 1 + + public static boolean isPureAscii(String v) { + return asciiEncoder.canEncode(v); + } + public static String quote(String str) { return "'" + str + "'"; } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java index db53cd771f..d7d0b758a4 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java @@ -18,14 +18,18 @@ package org.apache.tajo.engine.planner; +import com.sun.tools.javac.util.Convert; import org.apache.tajo.catalog.SortSpec; import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.datum.Datum; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.TupleRange; import org.apache.tajo.util.Bytes; +import org.apache.tajo.util.StringUtils; +import sun.text.CollatorUtilities; import java.math.BigInteger; +import java.text.Collator; public abstract class RangePartitionAlgorithm { protected SortSpec [] sortSpecs; @@ -115,21 +119,53 @@ public static BigInteger computeCardinality(DataType dataType, Datum start, Datu } break; case TEXT: { - byte [] a; - byte [] b; - if (isAscending) { - a = start.asByteArray(); - b = end.asByteArray(); + boolean isPureAscii = StringUtils.isPureAscii(start.asChars()) && StringUtils.isPureAscii(end.asChars()); + + if (isPureAscii) { + byte[] a; + byte[] b; + if (isAscending) { + a = start.asByteArray(); + b = end.asByteArray(); + } else { + b = start.asByteArray(); + a = end.asByteArray(); + } + + byte[] prependHeader = {1, 0}; + final BigInteger startBI = new BigInteger(Bytes.add(prependHeader, a)); + final BigInteger stopBI = new BigInteger(Bytes.add(prependHeader, b)); + BigInteger diffBI = stopBI.subtract(startBI); + columnCard = diffBI; } else { - b = start.asByteArray(); - a = end.asByteArray(); - } + char [] a; + char [] b; - byte [] prependHeader = {1, 0}; - final BigInteger startBI = new BigInteger(Bytes.add(prependHeader, a)); - final BigInteger stopBI = new BigInteger(Bytes.add(prependHeader, b)); - BigInteger diffBI = stopBI.subtract(startBI); - columnCard = diffBI; + if (isAscending) { + a = start.asUnicodeChars(); + b = end.asUnicodeChars(); + } else { + b = start.asUnicodeChars(); + a = end.asUnicodeChars(); + } + + BigInteger startBI = BigInteger.ZERO; + BigInteger stopBI = BigInteger.ZERO; + + for (int i = 0; i < a.length; i++) { + BigInteger charVal = BigInteger.valueOf(a[i]); + BigInteger base = charVal.multiply(BigInteger.valueOf(2^16)); + startBI = startBI.add(base.pow(i + 1)); + } + + for (int i = 0; i < b.length; i++) { + BigInteger charVal = BigInteger.valueOf(b[i]); + BigInteger base = charVal.multiply(BigInteger.valueOf(2^16)); + stopBI = stopBI.add(base.pow(i + 1)); + } + BigInteger diffBI = stopBI.subtract(startBI); + columnCard = diffBI; + } break; } case DATE: diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 0a1389a0b0..43b9254a76 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -32,6 +32,7 @@ import org.apache.tajo.storage.VTuple; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.BytesUtils; +import org.apache.tajo.util.StringUtils; import java.math.BigDecimal; import java.math.BigInteger; @@ -43,6 +44,7 @@ public class UniformRangePartition extends RangePartitionAlgorithm { private int variableId; private BigInteger[] cardForEachDigit; private BigInteger[] colCards; + private boolean [] isPureAscii; /** * @@ -52,13 +54,19 @@ public class UniformRangePartition extends RangePartitionAlgorithm { */ public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortSpecs, boolean inclusive) { super(sortSpecs, totalRange, inclusive); - colCards = new BigInteger[sortSpecs.length]; + isPureAscii = new boolean[sortSpecs.length]; + colCards = new BigInteger[sortSpecs.length]; normalize(sortSpecs, this.mergedRange); for (int i = 0; i < sortSpecs.length; i++) { - colCards[i] = computeCardinality(sortSpecs[i].getSortKey().getDataType(), totalRange.getStart().get(i), - totalRange.getEnd().get(i), inclusive, sortSpecs[i].isAscending()); + Datum startValue = totalRange.getStart().get(i); + Datum endValue = totalRange.getEnd().get(i); + + isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); + + colCards[i] = computeCardinality(sortSpecs[i].getSortKey().getDataType(), startValue, endValue, + inclusive, sortSpecs[i].isAscending()); } cardForEachDigit = new BigInteger[colCards.length]; @@ -529,4 +537,13 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi return end; } + + public static BigInteger charsToBigInteger(char [] chars) { + BigInteger lastPoint = BigInteger.ZERO; + for (int i = chars.length - 1; i >= 0; i--) { + lastPoint = lastPoint.or(BigInteger.valueOf(Character.codePointAt(chars, i))); + lastPoint = lastPoint.shiftLeft(16); + } + return lastPoint; + } } diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index 58653d1cdc..71d211928d 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -18,6 +18,8 @@ package org.apache.tajo.engine.planner; +import com.google.common.primitives.UnsignedBytes; +import com.sun.tools.javac.util.Convert; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.SortSpec; import org.apache.tajo.common.TajoDataTypes.Type; @@ -28,16 +30,17 @@ import org.junit.Test; import java.math.BigInteger; +import java.nio.charset.Charset; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class TestUniformRangePartition { /** - * It verify overflow and increment. + * It verify overflow and increment in normal case. */ @Test - public void testIncrement1() { + public void testIncrementOfText() { Schema schema = new Schema() .addColumn("l_returnflag", Type.TEXT) .addColumn("l_linestatus", Type.TEXT); @@ -84,7 +87,7 @@ public void testIncrement1() { * It verify overflow with the number that exceeds the last digit. */ @Test - public void testIncrement2() { + public void testIncrementOfText2() { Schema schema = new Schema() .addColumn("l_returnflag", Type.TEXT) .addColumn("l_linestatus", Type.TEXT); @@ -129,7 +132,7 @@ public void testIncrement2() { * It verify the case where two or more digits are overflow. */ @Test - public void testIncrement3() { + public void testIncrementOfText3() { Schema schema = new Schema() .addColumn("l_returnflag", Type.TEXT) .addColumn("l_linestatus", Type.TEXT) @@ -162,7 +165,67 @@ public void testIncrement3() { } @Test - public void testIncrement4() { + public void testIncrementOfUnicode2() { + String abc = "갹a"; + byte [] b = Convert.chars2utf(abc.toCharArray()); + + for (int i = 0; i < b.length; i++) { + System.out.println(UnsignedBytes.toInt(b[i])); + } + } + + @Test + public void testIncrementOfUnicode() { + char [] a = new String("가").toCharArray(); + System.out.println(Character.codePointAt(a, 0)); + System.out.println(((int)a[0])); + + char [] b = new String("갸").toCharArray(); + System.out.println(Character.codePointAt(b, 0)); + System.out.println(new String(new char[] {(char) (Character.codePointAt(b, 0) + 1)})); + + char [] c = new String("나").toCharArray(); + System.out.println(Character.codePointAt(c, 0)); + + char [] en = new String("a").toCharArray(); + System.out.println(Character.getName(Character.codePointAt(en, 0))); + System.out.println(Character.getName(Character.codePointAt(c, 0))); + } + + @Test + public void testIncrementOfUnicodeText() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + s.put(0, DatumFactory.createText("가")); + Tuple e = new VTuple(1); + e.put(0, DatumFactory.createText("마")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 3; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev == null) { + prev = r; + } else { + assertTrue(prev.compareTo(r) < 0); + } + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + + } + + @Test + public void testIncrementOfInt8() { Schema schema = new Schema() .addColumn("l_orderkey", Type.INT8) .addColumn("l_linenumber", Type.INT8); @@ -189,7 +252,7 @@ public void testIncrement4() { assertEquals(39, range3.get(1).asInt4()); } - @Test public void testIncrement5() { + @Test public void testIncrementOfInt8AndFinal() { Schema schema = new Schema() .addColumn("l_orderkey", Type.INT8) .addColumn("l_linenumber", Type.INT8) @@ -222,7 +285,7 @@ public void testIncrement4() { } @Test - public void testIncrement6() { + public void testIncrementOfFloat8() { Schema schema = new Schema() .addColumn("l_orderkey", Type.FLOAT8) .addColumn("l_linenumber", Type.FLOAT8) @@ -255,7 +318,7 @@ public void testIncrement6() { } @Test - public void testIncrement7() { + public void testIncrementOfInet4() { Schema schema = new Schema() .addColumn("l_orderkey", Type.INET4) .addColumn("l_linenumber", Type.INET4) diff --git a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java index a88a791989..291b62874d 100644 --- a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java +++ b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java @@ -139,6 +139,11 @@ public String getText(int fieldId) { return values.get(fieldId).toString(); } + @Override + public char[] getChars(int fieldId) { + return values.get(fieldId).asUnicodeChars(); + } + @Override public Tuple clone() throws CloneNotSupportedException { throw new UnsupportedException("clone"); diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java index 5e2f28c07b..412dab3741 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java @@ -170,6 +170,11 @@ public String getText(int fieldId) { return get(fieldId).asChars(); } + @Override + public char [] getChars(int fieldId) { + return get(fieldId).asUnicodeChars(); + } + @Override public Tuple clone() throws CloneNotSupportedException { FrameTuple frameTuple = (FrameTuple) super.clone(); diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java index 27d26917d2..c7713c5398 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java @@ -191,6 +191,11 @@ public String getText(int fieldId) { return get(fieldId).asChars(); } + @Override + public char[] getChars(int fieldId) { + return get(fieldId).asUnicodeChars(); + } + public String toString() { boolean first = true; StringBuilder str = new StringBuilder(); diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java index a05dc71a6a..f806286321 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java @@ -64,6 +64,8 @@ public interface Tuple extends Cloneable { public String getText(int fieldId); + public char [] getChars(int fieldId); + public Tuple clone() throws CloneNotSupportedException; public Datum[] getValues(); diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java index 6d602f8bcb..121f7ceda7 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java @@ -172,6 +172,11 @@ public String getText(int fieldId) { return values[fieldId].asChars(); } + @Override + public char[] getChars(int fieldId) { + return values[fieldId].asUnicodeChars(); + } + @Override public Tuple clone() throws CloneNotSupportedException { VTuple tuple = (VTuple) super.clone(); From 1a938220a3887ba80031a178d01bf47634cb0339 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 10:24:09 +0900 Subject: [PATCH 02/18] Changing calculate partition num. --- .../planner/RangePartitionAlgorithm.java | 15 +--- .../engine/planner/UniformRangePartition.java | 74 +++++++++++++++---- .../planner/TestUniformRangePartition.java | 4 +- .../org/apache/tajo/jdbc/MetaDataTuple.java | 2 +- .../org/apache/tajo/storage/FrameTuple.java | 2 +- .../org/apache/tajo/storage/LazyTuple.java | 2 +- .../java/org/apache/tajo/storage/Tuple.java | 2 +- .../java/org/apache/tajo/storage/VTuple.java | 2 +- 8 files changed, 70 insertions(+), 33 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java index d7d0b758a4..f3d2160bd5 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java @@ -149,20 +149,9 @@ public static BigInteger computeCardinality(DataType dataType, Datum start, Datu a = end.asUnicodeChars(); } - BigInteger startBI = BigInteger.ZERO; - BigInteger stopBI = BigInteger.ZERO; + BigInteger startBI = UniformRangePartition.charsToBigInteger(a); + BigInteger stopBI = UniformRangePartition.charsToBigInteger(b); - for (int i = 0; i < a.length; i++) { - BigInteger charVal = BigInteger.valueOf(a[i]); - BigInteger base = charVal.multiply(BigInteger.valueOf(2^16)); - startBI = startBI.add(base.pow(i + 1)); - } - - for (int i = 0; i < b.length; i++) { - BigInteger charVal = BigInteger.valueOf(b[i]); - BigInteger base = charVal.multiply(BigInteger.valueOf(2^16)); - stopBI = stopBI.add(base.pow(i + 1)); - } BigInteger diffBI = stopBI.subtract(startBI); columnCard = diffBI; } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 43b9254a76..b77e2e9f6c 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -266,17 +266,35 @@ public boolean isOverflow(int colId, Datum last, BigInteger inc, SortSpec [] sor } case TEXT: { - byte [] lastBytes = last.asByteArray(); - byte [] endBytes = mergedRange.getEnd().getBytes(colId); + if (isPureAscii[colId]) { + byte[] lastBytes = last.asByteArray(); + byte[] endBytes = mergedRange.getEnd().getBytes(colId); - Preconditions.checkState(lastBytes.length == endBytes.length); + Preconditions.checkState(lastBytes.length == endBytes.length); - if (sortSpecs[colId].isAscending()) { - candidate = incDecimal.add(new BigDecimal(new BigInteger(lastBytes))); - return new BigDecimal(new BigInteger(endBytes)).compareTo(candidate) < 0; + if (sortSpecs[colId].isAscending()) { + candidate = incDecimal.add(new BigDecimal(new BigInteger(lastBytes))); + return new BigDecimal(new BigInteger(endBytes)).compareTo(candidate) < 0; + } else { + candidate = new BigDecimal(new BigInteger(lastBytes)).subtract(incDecimal); + return candidate.compareTo(new BigDecimal(new BigInteger(endBytes))) < 0; + } } else { - candidate = new BigDecimal(new BigInteger(lastBytes)).subtract(incDecimal); - return candidate.compareTo(new BigDecimal(new BigInteger(endBytes))) < 0; + char[] lastChars = last.asUnicodeChars(); + char[] endChars = mergedRange.getEnd().getUnicodeChars(colId); + + Preconditions.checkState(lastChars.length == endChars.length); + + BigInteger lastBi = charsToBigInteger(lastChars); + BigInteger endBi = charsToBigInteger(endChars); + + if (sortSpecs[colId].isAscending()) { + candidate = incDecimal.add(new BigDecimal(lastBi)); + return new BigDecimal(endBi).compareTo(candidate) < 0; + } else { + candidate = new BigDecimal(lastBi).subtract(incDecimal); + return candidate.compareTo(new BigDecimal(endBi)) < 0; + } } } case INET4: { @@ -490,7 +508,30 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi if (last.isNull(i)) { lastBigInt = BigInteger.valueOf(0); } else { - lastBigInt = UnsignedLong.valueOf(new BigInteger(last.get(i).asByteArray())).bigIntegerValue(); + if (isPureAscii[i]) { + lastBigInt = UnsignedLong.valueOf(new BigInteger(last.get(i).asByteArray())).bigIntegerValue(); + } else { + char[] lastChars = last.getUnicodeChars(i); + int [] charIncs = new int[lastChars.length]; + + BigInteger remain = incs[i]; + for (int k = lastChars.length - 1; k > 0 && remain.compareTo(BigInteger.ZERO) > 0; k--) { + BigInteger charVal = BigInteger.valueOf(lastChars[k]); + BigInteger digitBase = charVal.multiply(BigInteger.valueOf(2 ^ 16)).pow(k); + + if (remain.compareTo(digitBase) > 0) { + charIncs[k] = remain.divide(digitBase).intValue(); + BigInteger sub = digitBase.multiply(BigInteger.valueOf(charIncs[k])); + remain = remain.subtract(sub); + } + } + charIncs[charIncs.length - 1] = remain.intValue(); + for (int k = 0; k < lastChars.length; k++) { + lastChars[i] += charIncs[i]; + } + + lastBigInt = charsToBigInteger(lastChars); + } } end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); } @@ -539,11 +580,18 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi } public static BigInteger charsToBigInteger(char [] chars) { - BigInteger lastPoint = BigInteger.ZERO; + BigInteger digitBase = null; + BigInteger sum = BigInteger.ZERO; for (int i = chars.length - 1; i >= 0; i--) { - lastPoint = lastPoint.or(BigInteger.valueOf(Character.codePointAt(chars, i))); - lastPoint = lastPoint.shiftLeft(16); + BigInteger charVal = BigInteger.valueOf(chars[i]); + if (i > 0) { + digitBase = charVal.multiply(BigInteger.valueOf(2 ^ 16).pow(i)); + sum = sum.add(digitBase); + } else { + sum = sum.add(charVal); + } + } - return lastPoint; + return digitBase; } } diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index 71d211928d..d1847f3819 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -200,9 +200,9 @@ public void testIncrementOfUnicodeText() { SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); Tuple s = new VTuple(1); - s.put(0, DatumFactory.createText("가")); + s.put(0, DatumFactory.createText("가나")); Tuple e = new VTuple(1); - e.put(0, DatumFactory.createText("마")); + e.put(0, DatumFactory.createText("마바")); TupleRange expected = new TupleRange(sortSpecs, s, e); diff --git a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java index 291b62874d..04338ca170 100644 --- a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java +++ b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/MetaDataTuple.java @@ -140,7 +140,7 @@ public String getText(int fieldId) { } @Override - public char[] getChars(int fieldId) { + public char[] getUnicodeChars(int fieldId) { return values.get(fieldId).asUnicodeChars(); } diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java index 412dab3741..1376a05656 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/FrameTuple.java @@ -171,7 +171,7 @@ public String getText(int fieldId) { } @Override - public char [] getChars(int fieldId) { + public char [] getUnicodeChars(int fieldId) { return get(fieldId).asUnicodeChars(); } diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java index c7713c5398..d8dca0ed9c 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/LazyTuple.java @@ -192,7 +192,7 @@ public String getText(int fieldId) { } @Override - public char[] getChars(int fieldId) { + public char[] getUnicodeChars(int fieldId) { return get(fieldId).asUnicodeChars(); } diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java index f806286321..5a173f7532 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/Tuple.java @@ -64,7 +64,7 @@ public interface Tuple extends Cloneable { public String getText(int fieldId); - public char [] getChars(int fieldId); + public char [] getUnicodeChars(int fieldId); public Tuple clone() throws CloneNotSupportedException; diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java b/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java index 121f7ceda7..326fb6d645 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/VTuple.java @@ -173,7 +173,7 @@ public String getText(int fieldId) { } @Override - public char[] getChars(int fieldId) { + public char[] getUnicodeChars(int fieldId) { return values[fieldId].asUnicodeChars(); } From 7b5a689036c37633e1a47e0ab3b4aa92305b0d99 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 11:33:12 +0900 Subject: [PATCH 03/18] Improved unicode partition. --- .../engine/planner/UniformRangePartition.java | 78 +++++++++++++------ .../planner/TestUniformRangePartition.java | 63 ++++++++++++++- 2 files changed, 118 insertions(+), 23 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index b77e2e9f6c..aba654ab35 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -20,6 +20,7 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; +import com.google.common.primitives.Chars; import com.google.common.primitives.UnsignedLong; import org.apache.tajo.catalog.Column; import org.apache.tajo.catalog.SortSpec; @@ -141,27 +142,51 @@ public TupleRange[] partition(int partNum) { * @param sortSpecs The sort specs * @param range Tuple range to be normalize */ - public static void normalize(final SortSpec [] sortSpecs, TupleRange range) { + public void normalize(final SortSpec [] sortSpecs, TupleRange range) { // normalize text fields to have same bytes length for (int i = 0; i < sortSpecs.length; i++) { if (sortSpecs[i].getSortKey().getDataType().getType() == TajoDataTypes.Type.TEXT) { - byte [] startBytes; - byte [] endBytes; - if (range.getStart().isNull(i)) { - startBytes = BigInteger.ZERO.toByteArray(); - } else { - startBytes = range.getStart().getBytes(i); - } + if (isPureAscii[i]) { + byte[] startBytes; + byte[] endBytes; + if (range.getStart().isNull(i)) { + startBytes = BigInteger.ZERO.toByteArray(); + } else { + startBytes = range.getStart().getBytes(i); + } + + if (range.getEnd().isNull(i)) { + endBytes = BigInteger.ZERO.toByteArray(); + } else { + endBytes = range.getEnd().getBytes(i); + } + + byte[][] padded = BytesUtils.padBytes(startBytes, endBytes); + range.getStart().put(i, DatumFactory.createText(padded[0])); + range.getEnd().put(i, DatumFactory.createText(padded[1])); - if (range.getEnd().isNull(i)) { - endBytes = BigInteger.ZERO.toByteArray(); } else { - endBytes = range.getEnd().getBytes(i); - } + char[] startChars; + char[] endChars; + if (range.getStart().isNull(i)) { + startChars = new char[] {0}; + } else { + startChars = range.getStart().getUnicodeChars(i); + } + + if (range.getEnd().isNull(i)) { + endChars = new char[] {0}; + } else { + endChars = range.getEnd().getUnicodeChars(i); + } - byte [][] padded = BytesUtils.padBytes(startBytes, endBytes); - range.getStart().put(i, DatumFactory.createText(padded[0])); - range.getEnd().put(i, DatumFactory.createText(padded[1])); + char[][] padded = new char[2][]; + int max = Math.max(startChars.length, endChars.length); + padded[0] = Chars.ensureCapacity(startChars, startChars.length, max - startChars.length); + padded[1] = Chars.ensureCapacity(endChars, endChars.length, max - endChars.length); + range.getStart().put(i, DatumFactory.createText(new String(padded[0]))); + range.getEnd().put(i, DatumFactory.createText(new String(padded[1]))); + } } } } @@ -507,16 +532,18 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi BigInteger lastBigInt; if (last.isNull(i)) { lastBigInt = BigInteger.valueOf(0); + end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); } else { if (isPureAscii[i]) { lastBigInt = UnsignedLong.valueOf(new BigInteger(last.get(i).asByteArray())).bigIntegerValue(); + end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); } else { char[] lastChars = last.getUnicodeChars(i); int [] charIncs = new int[lastChars.length]; BigInteger remain = incs[i]; for (int k = lastChars.length - 1; k > 0 && remain.compareTo(BigInteger.ZERO) > 0; k--) { - BigInteger charVal = BigInteger.valueOf(lastChars[k]); + BigInteger charVal = BigInteger.valueOf(lastChars[(lastChars.length -1) - k]); BigInteger digitBase = charVal.multiply(BigInteger.valueOf(2 ^ 16)).pow(k); if (remain.compareTo(digitBase) > 0) { @@ -526,14 +553,22 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi } } charIncs[charIncs.length - 1] = remain.intValue(); + for (int k = 0; k < lastChars.length; k++) { - lastChars[i] += charIncs[i]; + long sum = (long)lastChars[k] + (long)charIncs[k]; + if (sum > 65536) { + charIncs[k] = 65536 - charIncs[k]; + lastChars[k - 1]++; + + lastChars[i] += charIncs[i]; + } else { + lastChars[i] += charIncs[i]; + } } - lastBigInt = charsToBigInteger(lastChars); + end.put(i, DatumFactory.createText(new String(lastChars))); } } - end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); } break; case DATE: @@ -583,15 +618,14 @@ public static BigInteger charsToBigInteger(char [] chars) { BigInteger digitBase = null; BigInteger sum = BigInteger.ZERO; for (int i = chars.length - 1; i >= 0; i--) { - BigInteger charVal = BigInteger.valueOf(chars[i]); + BigInteger charVal = BigInteger.valueOf(chars[(chars.length - 1) - i]); if (i > 0) { digitBase = charVal.multiply(BigInteger.valueOf(2 ^ 16).pow(i)); sum = sum.add(digitBase); } else { sum = sum.add(charVal); } - } - return digitBase; + return sum; } } diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index d1847f3819..6f5d38db5e 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -192,6 +192,68 @@ public void testIncrementOfUnicode() { System.out.println(Character.getName(Character.codePointAt(c, 0))); } + @Test + public void testIncrementOfUnicodeOneCharSinglePartition() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + s.put(0, DatumFactory.createText("가")); + Tuple e = new VTuple(1); + e.put(0, DatumFactory.createText("다")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 1; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev == null) { + prev = r; + } else { + assertTrue(prev.compareTo(r) < 0); + } + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testIncrementOfUnicodeOneCharMultiPartition() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + s.put(0, DatumFactory.createText("가")); + Tuple e = new VTuple(1); + e.put(0, DatumFactory.createText("다")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 2; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev == null) { + prev = r; + } else { + assertTrue(prev.compareTo(r) < 0); + } + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + @Test public void testIncrementOfUnicodeText() { Schema schema = new Schema() @@ -221,7 +283,6 @@ public void testIncrementOfUnicodeText() { assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); assertTrue(ranges[partNum - 1].getEnd().equals(e)); - } @Test From 262c88cfe3edd22ef638f7f077f27110cd93d860 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 13:21:10 +0900 Subject: [PATCH 04/18] Completed and add unit tests. --- .../planner/RangePartitionAlgorithm.java | 3 - .../engine/planner/UniformRangePartition.java | 22 +++-- .../planner/TestUniformRangePartition.java | 86 +++++++++++++------ .../tajo/engine/query/TestSortQuery.java | 71 ++++++++++++--- .../TestSortQuery/testSortOnNullColumn.sql | 13 +++ .../testSortOnUnicodeTextAsc.sql | 6 ++ .../testSortOnUnicodeTextDesc.sql | 6 ++ .../TestSortQuery/testSortOnNullColumn.result | 6 ++ .../testSortOnUnicodeTextAsc.result | 6 ++ .../testSortOnUnicodeTextDesc.result | 6 ++ 10 files changed, 172 insertions(+), 53 deletions(-) create mode 100644 tajo-core/src/test/resources/queries/TestSortQuery/testSortOnNullColumn.sql create mode 100644 tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextAsc.sql create mode 100644 tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextDesc.sql create mode 100644 tajo-core/src/test/resources/results/TestSortQuery/testSortOnNullColumn.result create mode 100644 tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextAsc.result create mode 100644 tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextDesc.result diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java index f3d2160bd5..d129236aa8 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java @@ -18,7 +18,6 @@ package org.apache.tajo.engine.planner; -import com.sun.tools.javac.util.Convert; import org.apache.tajo.catalog.SortSpec; import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.datum.Datum; @@ -26,10 +25,8 @@ import org.apache.tajo.storage.TupleRange; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.StringUtils; -import sun.text.CollatorUtilities; import java.math.BigInteger; -import java.text.Collator; public abstract class RangePartitionAlgorithm { protected SortSpec [] sortSpecs; diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index aba654ab35..06450f2f95 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -57,6 +57,12 @@ public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortS super(sortSpecs, totalRange, inclusive); isPureAscii = new boolean[sortSpecs.length]; + for (int i = 0; i < sortSpecs.length; i++) { + Datum startValue = totalRange.getStart().get(i); + Datum endValue = totalRange.getEnd().get(i); + isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); + } + colCards = new BigInteger[sortSpecs.length]; normalize(sortSpecs, this.mergedRange); @@ -64,8 +70,6 @@ public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortS Datum startValue = totalRange.getStart().get(i); Datum endValue = totalRange.getEnd().get(i); - isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); - colCards[i] = computeCardinality(sortSpecs[i].getSortKey().getDataType(), startValue, endValue, inclusive, sortSpecs[i].isAscending()); } @@ -543,8 +547,7 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi BigInteger remain = incs[i]; for (int k = lastChars.length - 1; k > 0 && remain.compareTo(BigInteger.ZERO) > 0; k--) { - BigInteger charVal = BigInteger.valueOf(lastChars[(lastChars.length -1) - k]); - BigInteger digitBase = charVal.multiply(BigInteger.valueOf(2 ^ 16)).pow(k); + BigInteger digitBase = BigInteger.valueOf(65536).pow(k); if (remain.compareTo(digitBase) > 0) { charIncs[k] = remain.divide(digitBase).intValue(); @@ -555,14 +558,15 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi charIncs[charIncs.length - 1] = remain.intValue(); for (int k = 0; k < lastChars.length; k++) { - long sum = (long)lastChars[k] + (long)charIncs[k]; + int sum = (int)lastChars[k] + (int)charIncs[k]; if (sum > 65536) { - charIncs[k] = 65536 - charIncs[k]; - lastChars[k - 1]++; + charIncs[k] = sum - 65536; + charIncs[k-1] += 1; - lastChars[i] += charIncs[i]; + lastChars[k-1] += 1; + lastChars[k] += charIncs[k]; } else { - lastChars[i] += charIncs[i]; + lastChars[k] += charIncs[k]; } } diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index 6f5d38db5e..50061d0380 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -18,19 +18,17 @@ package org.apache.tajo.engine.planner; -import com.google.common.primitives.UnsignedBytes; -import com.sun.tools.javac.util.Convert; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.SortSpec; import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.datum.DatumFactory; import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.TupleComparator; import org.apache.tajo.storage.TupleRange; import org.apache.tajo.storage.VTuple; import org.junit.Test; import java.math.BigInteger; -import java.nio.charset.Charset; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -165,31 +163,33 @@ public void testIncrementOfText3() { } @Test - public void testIncrementOfUnicode2() { - String abc = "갹a"; - byte [] b = Convert.chars2utf(abc.toCharArray()); + public void testIncrementOfUnicode() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); - for (int i = 0; i < b.length; i++) { - System.out.println(UnsignedBytes.toInt(b[i])); - } - } + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); - @Test - public void testIncrementOfUnicode() { - char [] a = new String("가").toCharArray(); - System.out.println(Character.codePointAt(a, 0)); - System.out.println(((int)a[0])); + Tuple s = new VTuple(1); + s.put(0, DatumFactory.createText("가가가")); + Tuple e = new VTuple(1); + e.put(0, DatumFactory.createText("하하하")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); - char [] b = new String("갸").toCharArray(); - System.out.println(Character.codePointAt(b, 0)); - System.out.println(new String(new char[] {(char) (Character.codePointAt(b, 0) + 1)})); + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + TupleComparator comp = new TupleComparator(schema, sortSpecs); - char [] c = new String("나").toCharArray(); - System.out.println(Character.codePointAt(c, 0)); + Tuple tuple = s; + Tuple prevTuple = null; + for (int i = 0; i < 100; i++) { + tuple = partitioner.increment(tuple, BigInteger.valueOf(30000), 0); - char [] en = new String("a").toCharArray(); - System.out.println(Character.getName(Character.codePointAt(en, 0))); - System.out.println(Character.getName(Character.codePointAt(c, 0))); + if (prevTuple == null) { + prevTuple = tuple; + } else { + assertTrue(comp.compare(prevTuple, tuple) < 0); + } + } } @Test @@ -255,21 +255,21 @@ public void testIncrementOfUnicodeOneCharMultiPartition() { } @Test - public void testIncrementOfUnicodeText() { + public void testPartitionForUnicodeTextAsc() { Schema schema = new Schema() .addColumn("col1", Type.TEXT); SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); Tuple s = new VTuple(1); - s.put(0, DatumFactory.createText("가나")); Tuple e = new VTuple(1); - e.put(0, DatumFactory.createText("마바")); + s.put(0, DatumFactory.createText("가가가")); + e.put(0, DatumFactory.createText("하하하")); TupleRange expected = new TupleRange(sortSpecs, s, e); UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); - int partNum = 3; + int partNum = 64; TupleRange [] ranges = partitioner.partition(partNum); TupleRange prev = null; @@ -285,6 +285,38 @@ public void testIncrementOfUnicodeText() { assertTrue(ranges[partNum - 1].getEnd().equals(e)); } + @Test + public void testPartitionForUnicodeTextDesc() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createText("하하하")); + e.put(0, DatumFactory.createText("가가가")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev == null) { + prev = r; + } else { + assertTrue(prev.compareTo(r) > 0); + } + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + @Test public void testIncrementOfInt8() { Schema schema = new Schema() diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSortQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSortQuery.java index 7d2c5d2a5a..5213826106 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSortQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSortQuery.java @@ -179,7 +179,7 @@ public final void testTopkWithJson() throws Exception { } @Test - public final void testSortNullColumn() throws Exception { + public final void testSortOnNullColumn() throws Exception { try { testingCluster.setAllTajoDaemonConfValue(ConfVars.$TEST_MIN_TASK_NUM.varname, "2"); KeyValueSet tableOptions = new KeyValueSet(); @@ -197,25 +197,68 @@ public final void testSortNullColumn() throws Exception { }; TajoTestingCluster.createTable("nullsort", schema, tableOptions, data, 2); - ResultSet res = executeString( - "select * from (" + - "select case when id > 2 then null else id end as col1, name as col2 from nullsort) a " + - "order by col1, col2" - ); + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + testingCluster.setAllTajoDaemonConfValue(ConfVars.$TEST_MIN_TASK_NUM.varname, "0"); + executeString("DROP TABLE nullsort PURGE;").close(); + } + } - String expected = "col1,col2\n" + - "-------------------------------\n" + - "1,BRAZIL\n" + - "2,ALGERIA\n" + - "null,ARGENTINA\n" + - "null,CANADA\n"; + @Test + public final void testSortOnUnicodeTextAsc() throws Exception { + try { + testingCluster.setAllTajoDaemonConfValue(ConfVars.$TEST_MIN_TASK_NUM.varname, "2"); + KeyValueSet tableOptions = new KeyValueSet(); + tableOptions.set(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER); + tableOptions.set(StorageConstants.CSVFILE_NULL, "\\\\N"); - assertEquals(expected, resultSetToString(res)); + Schema schema = new Schema(); + schema.addColumn("col1", Type.INT4); + schema.addColumn("col2", Type.TEXT); + String[] data = new String[]{ + "1|하하하", + "2|캬캬캬", + "3|가가가", + "4|냐하하" + }; + TajoTestingCluster.createTable("unicode_sort1", schema, tableOptions, data, 2); + ResultSet res = executeQuery(); + assertResultSet(res); cleanupQuery(res); } finally { testingCluster.setAllTajoDaemonConfValue(ConfVars.$TEST_MIN_TASK_NUM.varname, "0"); - executeString("DROP TABLE nullsort PURGE;").close(); + executeString("DROP TABLE unicode_sort1 PURGE;").close(); + } + } + + @Test + public final void testSortOnUnicodeTextDesc() throws Exception { + try { + testingCluster.setAllTajoDaemonConfValue(ConfVars.$TEST_MIN_TASK_NUM.varname, "2"); + KeyValueSet tableOptions = new KeyValueSet(); + tableOptions.set(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER); + tableOptions.set(StorageConstants.CSVFILE_NULL, "\\\\N"); + + Schema schema = new Schema(); + schema.addColumn("col1", Type.INT4); + schema.addColumn("col2", Type.TEXT); + String[] data = new String[]{ + "1|하하하", + "2|캬캬캬", + "3|가가가", + "4|냐하하" + }; + TajoTestingCluster.createTable("unicode_sort2", schema, tableOptions, data, 2); + + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + testingCluster.setAllTajoDaemonConfValue(ConfVars.$TEST_MIN_TASK_NUM.varname, "0"); + executeString("DROP TABLE unicode_sort2 PURGE;").close(); } } diff --git a/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnNullColumn.sql b/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnNullColumn.sql new file mode 100644 index 0000000000..6707d270a2 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnNullColumn.sql @@ -0,0 +1,13 @@ +select + * +from ( + select + case when id > 2 then null else id end as col1, + name as col2 + from + nullsort +) a + +order by + col1, + col2; \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextAsc.sql b/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextAsc.sql new file mode 100644 index 0000000000..6ed18479a6 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextAsc.sql @@ -0,0 +1,6 @@ +select + * +from + unicode_sort1 +order by + col2 asc; \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextDesc.sql b/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextDesc.sql new file mode 100644 index 0000000000..efbb684fce --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSortQuery/testSortOnUnicodeTextDesc.sql @@ -0,0 +1,6 @@ +select + * +from + unicode_sort2 +order by + col2 desc; \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestSortQuery/testSortOnNullColumn.result b/tajo-core/src/test/resources/results/TestSortQuery/testSortOnNullColumn.result new file mode 100644 index 0000000000..0ee6535f69 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSortQuery/testSortOnNullColumn.result @@ -0,0 +1,6 @@ +col1,col2 +------------------------------- +1,BRAZIL +2,ALGERIA +null,ARGENTINA +null,CANADA \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextAsc.result b/tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextAsc.result new file mode 100644 index 0000000000..eca4f68fa6 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextAsc.result @@ -0,0 +1,6 @@ +col1,col2 +------------------------------- +3,가가가 +4,냐하하 +2,캬캬캬 +1,하하하 \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextDesc.result b/tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextDesc.result new file mode 100644 index 0000000000..9f5313696a --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSortQuery/testSortOnUnicodeTextDesc.result @@ -0,0 +1,6 @@ +col1,col2 +------------------------------- +1,하하하 +2,캬캬캬 +4,냐하하 +3,가가가 \ No newline at end of file From 4239e165315dc375850bc0d445b904aea14f7664 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 13:52:52 +0900 Subject: [PATCH 05/18] Refactored trivial things. --- .../java/org/apache/tajo/datum/TextDatum.java | 1 + .../planner/RangePartitionAlgorithm.java | 2 +- .../engine/planner/UniformRangePartition.java | 20 ++++++++++++------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index bdef182292..f2ab1dff2b 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -32,6 +32,7 @@ public class TextDatum extends Datum { @Expose private final int size; @Expose private final byte[] bytes; + public static final int UNICODE_CHAR_BITS_NUM = 65536; // bits number for 2 bytes public static final TextDatum EMPTY_TEXT = new TextDatum(""); public static final Comparator COMPARATOR = UnsignedBytes.lexicographicalComparator(); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java index d129236aa8..c37551f182 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java @@ -31,7 +31,7 @@ public abstract class RangePartitionAlgorithm { protected SortSpec [] sortSpecs; protected TupleRange mergedRange; - protected final BigInteger totalCard; + protected final BigInteger totalCard; // total cardinality /** true if the end of the range is inclusive. Otherwise, it should be false. */ protected final boolean inclusive; diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 06450f2f95..9b4aa61989 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -27,6 +27,7 @@ import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.datum.Datum; import org.apache.tajo.datum.DatumFactory; +import org.apache.tajo.datum.TextDatum; import org.apache.tajo.engine.exception.RangeOverflowException; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.TupleRange; @@ -40,12 +41,15 @@ import java.math.RoundingMode; import java.util.List; - +/** + * It serializes multiple sort key spaces into one dimension space by regarding key spaces as + * arbitrary base number Systems respectively. + */ public class UniformRangePartition extends RangePartitionAlgorithm { private int variableId; private BigInteger[] cardForEachDigit; private BigInteger[] colCards; - private boolean [] isPureAscii; + private boolean [] isPureAscii; // flags to indicate if i'th key contains pure ascii characters. /** * @@ -56,6 +60,7 @@ public class UniformRangePartition extends RangePartitionAlgorithm { public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortSpecs, boolean inclusive) { super(sortSpecs, totalRange, inclusive); + // filling pure ascii flags isPureAscii = new boolean[sortSpecs.length]; for (int i = 0; i < sortSpecs.length; i++) { Datum startValue = totalRange.getStart().get(i); @@ -538,6 +543,7 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi lastBigInt = BigInteger.valueOf(0); end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); } else { + if (isPureAscii[i]) { lastBigInt = UnsignedLong.valueOf(new BigInteger(last.get(i).asByteArray())).bigIntegerValue(); end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); @@ -547,7 +553,7 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi BigInteger remain = incs[i]; for (int k = lastChars.length - 1; k > 0 && remain.compareTo(BigInteger.ZERO) > 0; k--) { - BigInteger digitBase = BigInteger.valueOf(65536).pow(k); + BigInteger digitBase = BigInteger.valueOf(TextDatum.UNICODE_CHAR_BITS_NUM).pow(k); if (remain.compareTo(digitBase) > 0) { charIncs[k] = remain.divide(digitBase).intValue(); @@ -559,8 +565,8 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi for (int k = 0; k < lastChars.length; k++) { int sum = (int)lastChars[k] + (int)charIncs[k]; - if (sum > 65536) { - charIncs[k] = sum - 65536; + if (sum > TextDatum.UNICODE_CHAR_BITS_NUM) { + charIncs[k] = sum - TextDatum.UNICODE_CHAR_BITS_NUM; charIncs[k-1] += 1; lastChars[k-1] += 1; @@ -619,12 +625,12 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi } public static BigInteger charsToBigInteger(char [] chars) { - BigInteger digitBase = null; + BigInteger digitBase; BigInteger sum = BigInteger.ZERO; for (int i = chars.length - 1; i >= 0; i--) { BigInteger charVal = BigInteger.valueOf(chars[(chars.length - 1) - i]); if (i > 0) { - digitBase = charVal.multiply(BigInteger.valueOf(2 ^ 16).pow(i)); + digitBase = charVal.multiply(BigInteger.valueOf(TextDatum.UNICODE_CHAR_BITS_NUM).pow(i)); sum = sum.add(digitBase); } else { sum = sum.add(charVal); From 4a02285955db2aef3184de2c336e0e45aa042718 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 17:28:30 +0900 Subject: [PATCH 06/18] Fixed failed unit tests. --- .../java/org/apache/tajo/datum/TextDatum.java | 9 ++- .../java/org/apache/tajo/util/BytesUtils.java | 27 +++++++++ .../engine/planner/UniformRangePartition.java | 18 +++--- .../planner/TestUniformRangePartition.java | 59 +++++++++---------- 4 files changed, 71 insertions(+), 42 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index f2ab1dff2b..730974c5c9 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -18,6 +18,7 @@ package org.apache.tajo.datum; +import com.google.common.primitives.Chars; import com.google.common.primitives.UnsignedBytes; import com.google.gson.annotations.Expose; import com.sun.tools.javac.util.Convert; @@ -32,7 +33,7 @@ public class TextDatum extends Datum { @Expose private final int size; @Expose private final byte[] bytes; - public static final int UNICODE_CHAR_BITS_NUM = 65536; // bits number for 2 bytes + public static final int UNICODE_CHAR_BITS_NUM = Character.MAX_VALUE; // bits number for 2 bytes public static final TextDatum EMPTY_TEXT = new TextDatum(""); public static final Comparator COMPARATOR = UnsignedBytes.lexicographicalComparator(); @@ -112,7 +113,8 @@ public int compareTo(Datum datum) { case TEXT: case CHAR: case BLOB: - return COMPARATOR.compare(bytes, datum.asByteArray()); + //return COMPARATOR.compare(bytes, datum.asByteArray()); + return Chars.lexicographicalComparator().compare(Convert.utf2chars(bytes), Convert.utf2chars(datum.asByteArray())); case NULL_TYPE: return -1; @@ -138,7 +140,8 @@ public Datum equalsTo(Datum datum) { case TEXT: case CHAR: case BLOB: - return DatumFactory.createBool(COMPARATOR.compare(bytes, datum.asByteArray()) == 0); + //return DatumFactory.createBool(COMPARATOR.compare(bytes, datum.asByteArray()) == 0); + return DatumFactory.createBool(Chars.lexicographicalComparator().compare(Convert.utf2chars(bytes), Convert.utf2chars(datum.asByteArray()))); case NULL_TYPE: return datum; default: diff --git a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java index 59ed4fb673..d7823239ab 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java @@ -18,6 +18,7 @@ package org.apache.tajo.util; +import com.google.common.primitives.Chars; import org.apache.hadoop.io.WritableUtils; import java.io.ByteArrayOutputStream; @@ -211,6 +212,32 @@ public static byte[][] padBytes(byte []...bytes) { return padded; } + public static char[][] padChars(char []...bytes) { + char[] startChars = bytes[0]; + char[] endChars = bytes[1]; + + char[][] padded = new char[2][]; + int max = Math.max(startChars.length, endChars.length); + + padded[0] = new char[max]; + padded[1] = new char[max]; + + for (int i = 0; i < startChars.length; i++) { + padded[0][i] = startChars[i]; + } + for (int i = startChars.length; i < max; i++) { + padded[0][i] = 0; + } + for (int i = 0; i < endChars.length; i++) { + padded[1][i] = endChars[i]; + } + for (int i = endChars.length; i < max; i++) { + padded[1][i] = 0; + } + + return padded; + } + public static byte [] trimBytes(byte [] bytes) { return new String(bytes).trim().getBytes(); } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 9b4aa61989..c449164cb5 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -22,6 +22,7 @@ import com.google.common.collect.Lists; import com.google.common.primitives.Chars; import com.google.common.primitives.UnsignedLong; +import com.sun.tools.javac.util.Convert; import org.apache.tajo.catalog.Column; import org.apache.tajo.catalog.SortSpec; import org.apache.tajo.common.TajoDataTypes; @@ -65,7 +66,8 @@ public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortS for (int i = 0; i < sortSpecs.length; i++) { Datum startValue = totalRange.getStart().get(i); Datum endValue = totalRange.getEnd().get(i); - isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); +// isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); + isPureAscii[i] = false; } colCards = new BigInteger[sortSpecs.length]; @@ -189,10 +191,7 @@ public void normalize(final SortSpec [] sortSpecs, TupleRange range) { endChars = range.getEnd().getUnicodeChars(i); } - char[][] padded = new char[2][]; - int max = Math.max(startChars.length, endChars.length); - padded[0] = Chars.ensureCapacity(startChars, startChars.length, max - startChars.length); - padded[1] = Chars.ensureCapacity(endChars, endChars.length, max - endChars.length); + char[][] padded = BytesUtils.padChars(startChars, endChars); range.getStart().put(i, DatumFactory.createText(new String(padded[0]))); range.getEnd().put(i, DatumFactory.createText(new String(padded[1]))); } @@ -564,7 +563,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi charIncs[charIncs.length - 1] = remain.intValue(); for (int k = 0; k < lastChars.length; k++) { - int sum = (int)lastChars[k] + (int)charIncs[k]; + if (charIncs[k] == 0) { + continue; + } + + int sum = (int)lastChars[k] + charIncs[k]; if (sum > TextDatum.UNICODE_CHAR_BITS_NUM) { charIncs[k] = sum - TextDatum.UNICODE_CHAR_BITS_NUM; charIncs[k-1] += 1; @@ -573,10 +576,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi lastChars[k] += charIncs[k]; } else { lastChars[k] += charIncs[k]; + } } - end.put(i, DatumFactory.createText(new String(lastChars))); + end.put(i, DatumFactory.createText(Convert.chars2utf(lastChars))); } } } diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index 50061d0380..f9c562d91c 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -184,11 +184,13 @@ public void testIncrementOfUnicode() { for (int i = 0; i < 100; i++) { tuple = partitioner.increment(tuple, BigInteger.valueOf(30000), 0); - if (prevTuple == null) { - prevTuple = tuple; - } else { - assertTrue(comp.compare(prevTuple, tuple) < 0); + if (prevTuple != null) { + if (!(comp.compare(prevTuple, tuple) < 0)) { + System.out.println("AAAA"); + } + assertTrue("prev=" + prevTuple + ", current=" + tuple, comp.compare(prevTuple, tuple) < 0); } + prevTuple = tuple; } } @@ -212,11 +214,10 @@ public void testIncrementOfUnicodeOneCharSinglePartition() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); } + prev = r; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -243,10 +244,10 @@ public void testIncrementOfUnicodeOneCharMultiPartition() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); + } else { + prev = r; } } assertEquals(partNum, ranges.length); @@ -274,11 +275,10 @@ public void testPartitionForUnicodeTextAsc() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); } + prev = r; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -465,11 +465,10 @@ public void testPartition() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); } + prev = r; } } @@ -535,12 +534,11 @@ public void testPartitionForMultipleChars() { TupleRange [] ranges = partitioner.partition(48); TupleRange prev = null; - for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { - assertTrue(prev.compareTo(r) < 0); + for (int i = 0; i < ranges.length; i++) { + if (prev != null) { + assertTrue(i + "th,prev=" + prev + ",cur=" + ranges[i], prev.compareTo(ranges[i]) < 0); } + prev = ranges[i]; } assertEquals(48, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -568,11 +566,10 @@ public void testPartitionForMultipleChars2() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); } + prev = r; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -600,12 +597,11 @@ public void testPartitionForMultipleChars2Desc() { TupleRange [] ranges = partitioner.partition(partNum); TupleRange prev = null; - for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { - assertTrue(prev.compareTo(r) > 0); + for (int i = 0; i < ranges.length; i++) { + if (prev != null) { + assertTrue(i + "th,prev=" + prev + ",cur=" + ranges[i], prev.compareTo(ranges[i]) > 0); } + prev = ranges[i]; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -717,11 +713,10 @@ public void testPartitionWithINET4() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); } + prev = r; } } } From bef6d216edc61954c4a09bf41bd1b6528a99fca4 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 19:32:53 +0900 Subject: [PATCH 07/18] Fixed unit tests and fixed some potential problems. --- .../java/org/apache/tajo/datum/TextDatum.java | 7 +- .../planner/RangePartitionAlgorithm.java | 5 + .../engine/planner/UniformRangePartition.java | 27 ++- .../planner/TestUniformRangePartition.java | 156 +++++++++++++++--- .../org/apache/tajo/storage/TupleRange.java | 12 +- 5 files changed, 176 insertions(+), 31 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index 730974c5c9..8f5edf5b2c 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -113,8 +113,7 @@ public int compareTo(Datum datum) { case TEXT: case CHAR: case BLOB: - //return COMPARATOR.compare(bytes, datum.asByteArray()); - return Chars.lexicographicalComparator().compare(Convert.utf2chars(bytes), Convert.utf2chars(datum.asByteArray())); + return COMPARATOR.compare(bytes, datum.asByteArray()); case NULL_TYPE: return -1; @@ -140,8 +139,8 @@ public Datum equalsTo(Datum datum) { case TEXT: case CHAR: case BLOB: - //return DatumFactory.createBool(COMPARATOR.compare(bytes, datum.asByteArray()) == 0); - return DatumFactory.createBool(Chars.lexicographicalComparator().compare(Convert.utf2chars(bytes), Convert.utf2chars(datum.asByteArray()))); + return DatumFactory.createBool(COMPARATOR.compare(bytes, datum.asByteArray()) == 0); + //return DatumFactory.createBool(Chars.lexicographicalComparator().compare(Convert.utf2chars(bytes), Convert.utf2chars(datum.asByteArray()))); case NULL_TYPE: return datum; default: diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java index c37551f182..fe41c91a9f 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java @@ -24,6 +24,7 @@ import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.TupleRange; import org.apache.tajo.util.Bytes; +import org.apache.tajo.util.BytesUtils; import org.apache.tajo.util.StringUtils; import java.math.BigInteger; @@ -129,6 +130,10 @@ public static BigInteger computeCardinality(DataType dataType, Datum start, Datu a = end.asByteArray(); } + byte [][] padded = BytesUtils.padBytes(a, b); + a = padded[0]; + b = padded[1]; + byte[] prependHeader = {1, 0}; final BigInteger startBI = new BigInteger(Bytes.add(prependHeader, a)); final BigInteger stopBI = new BigInteger(Bytes.add(prependHeader, b)); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index c449164cb5..ab4c5bae78 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -66,8 +66,7 @@ public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortS for (int i = 0; i < sortSpecs.length; i++) { Datum startValue = totalRange.getStart().get(i); Datum endValue = totalRange.getEnd().get(i); -// isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); - isPureAscii[i] = false; + isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); } colCards = new BigInteger[sortSpecs.length]; @@ -127,6 +126,7 @@ public TupleRange[] partition(int partNum) { BigInteger reminder = reverseCardsForDigit[0]; Tuple last = mergedRange.getStart(); TupleRange tupleRange; + while(reminder.compareTo(BigInteger.ZERO) > 0) { if (reminder.compareTo(term) <= 0) { // final one is inclusive tupleRange = new TupleRange(sortSpecs, last, mergedRange.getEnd()); @@ -138,10 +138,29 @@ public TupleRange[] partition(int partNum) { ranges.add(tupleRange); last = ranges.get(ranges.size() - 1).getEnd(); reminder = reminder.subtract(term); + + if (ranges.size() > 1) { + if (sortSpecs[0].isAscending()) { + assert (ranges.get(ranges.size() - 2).compareTo(ranges.get(ranges.size() - 1)) < 0); + } else { + assert (ranges.get(ranges.size() - 2).compareTo(ranges.get(ranges.size() - 1)) > 0); + } + } } - for (TupleRange r : ranges) { - denormalize(sortSpecs, r); + // Recovering the transformed same bytes tuples into the original start and end keys + ranges.get(0).setStart(mergedRange.getStart()); + ranges.get(ranges.size() - 1).setEnd(mergedRange.getEnd()); + + // Guarantee the keys are totally ordered correctly + for (int i = 0; i < ranges.size(); i++) { + if (i > 1) { + if (sortSpecs[0].isAscending()) { + assert (ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0); + } else { + assert (ranges.get(i - 2).compareTo(ranges.get(i - 1)) > 0); + } + } } return ranges.toArray(new TupleRange[ranges.size()]); diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index f9c562d91c..dacf9d9244 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -183,11 +183,7 @@ public void testIncrementOfUnicode() { Tuple prevTuple = null; for (int i = 0; i < 100; i++) { tuple = partitioner.increment(tuple, BigInteger.valueOf(30000), 0); - if (prevTuple != null) { - if (!(comp.compare(prevTuple, tuple) < 0)) { - System.out.println("AAAA"); - } assertTrue("prev=" + prevTuple + ", current=" + tuple, comp.compare(prevTuple, tuple) < 0); } prevTuple = tuple; @@ -234,21 +230,20 @@ public void testIncrementOfUnicodeOneCharMultiPartition() { Tuple s = new VTuple(1); s.put(0, DatumFactory.createText("가")); Tuple e = new VTuple(1); - e.put(0, DatumFactory.createText("다")); + e.put(0, DatumFactory.createText("꽥")); TupleRange expected = new TupleRange(sortSpecs, s, e); UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); - int partNum = 2; + int partNum = 8; TupleRange [] ranges = partitioner.partition(partNum); TupleRange prev = null; for (TupleRange r : ranges) { if (prev != null) { assertTrue(prev.compareTo(r) < 0); - } else { - prev = r; } + prev = r; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -285,6 +280,66 @@ public void testPartitionForUnicodeTextAsc() { assertTrue(ranges[partNum - 1].getEnd().equals(e)); } + @Test + public void testPartitionForUnicodeDiffLenBeginTextAsc() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createText("가")); + e.put(0, DatumFactory.createText("하하하")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForUnicodeDiffLenEndTextAsc() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createText("가가가")); + e.put(0, DatumFactory.createText("하")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + @Test public void testPartitionForUnicodeTextDesc() { Schema schema = new Schema() @@ -306,11 +361,72 @@ public void testPartitionForUnicodeTextDesc() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) > 0); } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForUnicodeDiffLenBeginTextDesc() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createText("하")); + e.put(0, DatumFactory.createText("가가가")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) > 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForUnicodeDiffLenEndTextDesc() { + Schema schema = new Schema() + .addColumn("col1", Type.TEXT); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createText("하")); + e.put(0, DatumFactory.createText("가가가")); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) > 0); + } + prev = r; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -536,7 +652,7 @@ public void testPartitionForMultipleChars() { TupleRange prev = null; for (int i = 0; i < ranges.length; i++) { if (prev != null) { - assertTrue(i + "th,prev=" + prev + ",cur=" + ranges[i], prev.compareTo(ranges[i]) < 0); + assertTrue(i + "th, prev=" + prev + ",cur=" + ranges[i], prev.compareTo(ranges[i]) < 0); } prev = ranges[i]; } @@ -597,11 +713,11 @@ public void testPartitionForMultipleChars2Desc() { TupleRange [] ranges = partitioner.partition(partNum); TupleRange prev = null; - for (int i = 0; i < ranges.length; i++) { + for (TupleRange r : ranges) { if (prev != null) { - assertTrue(i + "th,prev=" + prev + ",cur=" + ranges[i], prev.compareTo(ranges[i]) > 0); + assertTrue(prev.compareTo(r) > 0); } - prev = ranges[i]; + prev = r; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -629,11 +745,10 @@ public void testPartitionForMultipleCharsWithSameFirstChar() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); } + prev = r; } assertEquals(partNum, ranges.length); assertTrue(ranges[0].getStart().equals(s)); @@ -684,11 +799,10 @@ public void testPartitionWithNull() { TupleRange prev = null; for (TupleRange r : ranges) { - if (prev == null) { - prev = r; - } else { + if (prev != null) { assertTrue(prev.compareTo(r) < 0); } + prev = r; } } diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java b/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java index 323205644b..208da5f76b 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java @@ -28,8 +28,8 @@ * It represents a pair of start and end tuples. */ public class TupleRange implements Comparable { - private final Tuple start; - private final Tuple end; + private Tuple start; + private Tuple end; private final TupleComparator comp; public TupleRange(final SortSpec [] sortSpecs, final Tuple start, final Tuple end) { @@ -48,10 +48,18 @@ public static Schema sortSpecsToSchema(SortSpec[] sortSpecs) { return schema; } + public void setStart(Tuple tuple) { + this.start = tuple; + } + public final Tuple getStart() { return this.start; } + public void setEnd(Tuple tuple) { + this.end = tuple; + } + public final Tuple getEnd() { return this.end; } From 5c763059f2d9b61349cd179f778784993b750077 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 19:33:54 +0900 Subject: [PATCH 08/18] Moved padChars to StringUtils. --- .../java/org/apache/tajo/util/BytesUtils.java | 26 ------------------- .../org/apache/tajo/util/StringUtils.java | 26 +++++++++++++++++++ .../engine/planner/UniformRangePartition.java | 3 +-- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java index d7823239ab..b2c085a985 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java @@ -212,32 +212,6 @@ public static byte[][] padBytes(byte []...bytes) { return padded; } - public static char[][] padChars(char []...bytes) { - char[] startChars = bytes[0]; - char[] endChars = bytes[1]; - - char[][] padded = new char[2][]; - int max = Math.max(startChars.length, endChars.length); - - padded[0] = new char[max]; - padded[1] = new char[max]; - - for (int i = 0; i < startChars.length; i++) { - padded[0][i] = startChars[i]; - } - for (int i = startChars.length; i < max; i++) { - padded[0][i] = 0; - } - for (int i = 0; i < endChars.length; i++) { - padded[1][i] = endChars[i]; - } - for (int i = endChars.length; i < max; i++) { - padded[1][i] = 0; - } - - return padded; - } - public static byte [] trimBytes(byte [] bytes) { return new String(bytes).trim().getBytes(); } diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java index 1237493763..96118ac1a1 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java @@ -284,4 +284,30 @@ public static String unescapePathName(String path) { } return sb.toString(); } + + public static char[][] padChars(char []...bytes) { + char[] startChars = bytes[0]; + char[] endChars = bytes[1]; + + char[][] padded = new char[2][]; + int max = Math.max(startChars.length, endChars.length); + + padded[0] = new char[max]; + padded[1] = new char[max]; + + for (int i = 0; i < startChars.length; i++) { + padded[0][i] = startChars[i]; + } + for (int i = startChars.length; i < max; i++) { + padded[0][i] = 0; + } + for (int i = 0; i < endChars.length; i++) { + padded[1][i] = endChars[i]; + } + for (int i = endChars.length; i < max; i++) { + padded[1][i] = 0; + } + + return padded; + } } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index ab4c5bae78..9b8b63c0e7 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -20,7 +20,6 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; -import com.google.common.primitives.Chars; import com.google.common.primitives.UnsignedLong; import com.sun.tools.javac.util.Convert; import org.apache.tajo.catalog.Column; @@ -210,7 +209,7 @@ public void normalize(final SortSpec [] sortSpecs, TupleRange range) { endChars = range.getEnd().getUnicodeChars(i); } - char[][] padded = BytesUtils.padChars(startChars, endChars); + char[][] padded = StringUtils.padChars(startChars, endChars); range.getStart().put(i, DatumFactory.createText(new String(padded[0]))); range.getEnd().put(i, DatumFactory.createText(new String(padded[1]))); } From 5cb21e5e626208a373e5a9b47f6adf1bf068d684 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 19:35:48 +0900 Subject: [PATCH 09/18] Removed commented out lines. --- tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java | 1 - 1 file changed, 1 deletion(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index 8f5edf5b2c..33ec1d476b 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -140,7 +140,6 @@ public Datum equalsTo(Datum datum) { case CHAR: case BLOB: return DatumFactory.createBool(COMPARATOR.compare(bytes, datum.asByteArray()) == 0); - //return DatumFactory.createBool(Chars.lexicographicalComparator().compare(Convert.utf2chars(bytes), Convert.utf2chars(datum.asByteArray()))); case NULL_TYPE: return datum; default: From 1d002e6986cfb84485b16b7eccce26058f3db2bb Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 19:38:56 +0900 Subject: [PATCH 10/18] Ensuring the totally ordered ranges. --- .../engine/planner/UniformRangePartition.java | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 9b8b63c0e7..ccedd9f702 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -137,27 +137,21 @@ public TupleRange[] partition(int partNum) { ranges.add(tupleRange); last = ranges.get(ranges.size() - 1).getEnd(); reminder = reminder.subtract(term); - - if (ranges.size() > 1) { - if (sortSpecs[0].isAscending()) { - assert (ranges.get(ranges.size() - 2).compareTo(ranges.get(ranges.size() - 1)) < 0); - } else { - assert (ranges.get(ranges.size() - 2).compareTo(ranges.get(ranges.size() - 1)) > 0); - } - } } // Recovering the transformed same bytes tuples into the original start and end keys ranges.get(0).setStart(mergedRange.getStart()); ranges.get(ranges.size() - 1).setEnd(mergedRange.getEnd()); - // Guarantee the keys are totally ordered correctly + // Ensure all keys are totally ordered correctly. for (int i = 0; i < ranges.size(); i++) { if (i > 1) { if (sortSpecs[0].isAscending()) { - assert (ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0); + Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0, + "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); } else { - assert (ranges.get(i - 2).compareTo(ranges.get(i - 1)) > 0); + Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) > 0, + "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); } } } From 5b123884e0d59834dc080ec9f3f241edd89d4b1a Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 19:49:39 +0900 Subject: [PATCH 11/18] Added more comments. --- .../apache/tajo/engine/planner/UniformRangePartition.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index ccedd9f702..d4fd7b2d42 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -559,6 +559,10 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi lastBigInt = UnsignedLong.valueOf(new BigInteger(last.get(i).asByteArray())).bigIntegerValue(); end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); } else { + + // We consider an array of chars as a 2^16 base number system because each char is 2^16 bits. + // See Character.MAX_NUMBER. Then, we increase some number to the last array of chars. + char[] lastChars = last.getUnicodeChars(i); int [] charIncs = new int[lastChars.length]; @@ -580,7 +584,7 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi } int sum = (int)lastChars[k] + charIncs[k]; - if (sum > TextDatum.UNICODE_CHAR_BITS_NUM) { + if (sum > TextDatum.UNICODE_CHAR_BITS_NUM) { // if carry occurs in the current digit charIncs[k] = sum - TextDatum.UNICODE_CHAR_BITS_NUM; charIncs[k-1] += 1; From 57b9e2cf4479a656f18f25235754499c04724fc6 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 14 Aug 2014 19:56:48 +0900 Subject: [PATCH 12/18] Fixed some comment. --- .../org/apache/tajo/engine/planner/UniformRangePartition.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index d4fd7b2d42..4eee47ec0f 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -43,7 +43,7 @@ /** * It serializes multiple sort key spaces into one dimension space by regarding key spaces as - * arbitrary base number Systems respectively. + * arbitrary base number systems respectively. */ public class UniformRangePartition extends RangePartitionAlgorithm { private int variableId; From a649064b4c8731c18d2a82e87aa65ed6af6ebf78 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Wed, 20 Aug 2014 14:25:00 +0900 Subject: [PATCH 13/18] Fixed range clone. --- .../engine/planner/UniformRangePartition.java | 24 +++++++------------ .../org/apache/tajo/storage/TupleRange.java | 17 +++++++++---- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 4eee47ec0f..40e3f66997 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -46,11 +46,13 @@ * arbitrary base number systems respectively. */ public class UniformRangePartition extends RangePartitionAlgorithm { + private TupleRange originalRange; private int variableId; private BigInteger[] cardForEachDigit; private BigInteger[] colCards; private boolean [] isPureAscii; // flags to indicate if i'th key contains pure ascii characters. + /** * * @param totalRange @@ -60,6 +62,12 @@ public class UniformRangePartition extends RangePartitionAlgorithm { public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortSpecs, boolean inclusive) { super(sortSpecs, totalRange, inclusive); + try { + originalRange = totalRange.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + // filling pure ascii flags isPureAscii = new boolean[sortSpecs.length]; for (int i = 0; i < sortSpecs.length; i++) { @@ -211,22 +219,6 @@ public void normalize(final SortSpec [] sortSpecs, TupleRange range) { } } - /** - * Normalized keys have padding values, but it will cause the key mismatch in pull server. - * So, it denormalize the normalized keys again. - * - * @param sortSpecs The sort specs - * @param range Tuple range to be denormalized - */ - public static void denormalize(SortSpec [] sortSpecs, TupleRange range) { - for (int i = 0; i < sortSpecs.length; i++) { - if (sortSpecs[i].getSortKey().getDataType().getType() == TajoDataTypes.Type.TEXT) { - range.getStart().put(i,DatumFactory.createText(BytesUtils.trimBytes(range.getStart().getBytes(i)))); - range.getEnd().put(i,DatumFactory.createText(BytesUtils.trimBytes(range.getEnd().getBytes(i)))); - } - } - } - /** * Check whether an overflow occurs or not. * diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java b/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java index 208da5f76b..6cc09d41dc 100644 --- a/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java +++ b/tajo-storage/src/main/java/org/apache/tajo/storage/TupleRange.java @@ -27,12 +27,12 @@ /** * It represents a pair of start and end tuples. */ -public class TupleRange implements Comparable { +public class TupleRange implements Comparable, Cloneable { private Tuple start; private Tuple end; private final TupleComparator comp; - public TupleRange(final SortSpec [] sortSpecs, final Tuple start, final Tuple end) { + public TupleRange(final SortSpec[] sortSpecs, final Tuple start, final Tuple end) { this.comp = new TupleComparator(sortSpecsToSchema(sortSpecs), sortSpecs); // if there is only one value, start == end this.start = start; @@ -65,7 +65,7 @@ public final Tuple getEnd() { } public String toString() { - return "[" + this.start + ", " + this.end+")"; + return "[" + this.start + ", " + this.end + ")"; } @Override @@ -75,7 +75,7 @@ public int hashCode() { @Override public boolean equals(Object obj) { - if (obj instanceof TupleRange) { + if (obj instanceof TupleRange) { TupleRange other = (TupleRange) obj; return this.start.equals(other.start) && this.end.equals(other.end); } else { @@ -102,4 +102,11 @@ public int compare(TupleRange left, TupleRange right) { return right.compareTo(left); } } -} + + public TupleRange clone() throws CloneNotSupportedException { + TupleRange newRange = (TupleRange) super.clone(); + newRange.setStart(start.clone()); + newRange.setEnd(end.clone()); + return newRange; + } +} \ No newline at end of file From bf000738d27bd7f87d7acde7a744b1fdc99788bb Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Wed, 20 Aug 2014 14:31:15 +0900 Subject: [PATCH 14/18] Committed. --- .../apache/tajo/engine/planner/RangePartitionAlgorithm.java | 6 +++++- .../apache/tajo/engine/planner/UniformRangePartition.java | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java index fe41c91a9f..38aa9281bf 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/RangePartitionAlgorithm.java @@ -44,7 +44,11 @@ public abstract class RangePartitionAlgorithm { */ public RangePartitionAlgorithm(SortSpec [] sortSpecs, TupleRange totalRange, boolean inclusive) { this.sortSpecs = sortSpecs; - this.mergedRange = totalRange; + try { + this.mergedRange = totalRange.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } this.inclusive = inclusive; this.totalCard = computeCardinalityForAllColumns(sortSpecs, totalRange, inclusive); } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 40e3f66997..885e704c43 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -148,8 +148,8 @@ public TupleRange[] partition(int partNum) { } // Recovering the transformed same bytes tuples into the original start and end keys - ranges.get(0).setStart(mergedRange.getStart()); - ranges.get(ranges.size() - 1).setEnd(mergedRange.getEnd()); + ranges.get(0).setStart(originalRange.getStart()); + ranges.get(ranges.size() - 1).setEnd(originalRange.getEnd()); // Ensure all keys are totally ordered correctly. for (int i = 0; i < ranges.size(); i++) { From 97320198160a24b79f9dadf4c474b8810fae23c2 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Wed, 20 Aug 2014 15:44:40 +0900 Subject: [PATCH 15/18] Fixed some bugs. --- .../engine/planner/UniformRangePartition.java | 58 +++++++++++-------- .../tajo/master/querymaster/SubQuery.java | 4 +- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index 885e704c43..e74b90df64 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -27,6 +27,7 @@ import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.datum.Datum; import org.apache.tajo.datum.DatumFactory; +import org.apache.tajo.datum.NullDatum; import org.apache.tajo.datum.TextDatum; import org.apache.tajo.engine.exception.RangeOverflowException; import org.apache.tajo.storage.Tuple; @@ -46,42 +47,43 @@ * arbitrary base number systems respectively. */ public class UniformRangePartition extends RangePartitionAlgorithm { - private TupleRange originalRange; + private final TupleRange originalRange; private int variableId; private BigInteger[] cardForEachDigit; private BigInteger[] colCards; private boolean [] isPureAscii; // flags to indicate if i'th key contains pure ascii characters. - + private boolean [] beginNulls; // flags to indicate if i'th begin value is null. + private boolean [] endNulls; // flags to indicate if i'th begin value is null. /** * - * @param totalRange + * @param entireRange * @param sortSpecs The description of sort keys * @param inclusive true if the end of the range is inclusive */ - public UniformRangePartition(final TupleRange totalRange, final SortSpec[] sortSpecs, boolean inclusive) { - super(sortSpecs, totalRange, inclusive); + public UniformRangePartition(final TupleRange entireRange, final SortSpec[] sortSpecs, boolean inclusive) { + super(sortSpecs, entireRange, inclusive); - try { - originalRange = totalRange.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); - } + this.originalRange = entireRange; + beginNulls = new boolean[sortSpecs.length]; + endNulls = new boolean[sortSpecs.length]; // filling pure ascii flags isPureAscii = new boolean[sortSpecs.length]; for (int i = 0; i < sortSpecs.length; i++) { - Datum startValue = totalRange.getStart().get(i); - Datum endValue = totalRange.getEnd().get(i); + Datum startValue = entireRange.getStart().get(i); + Datum endValue = entireRange.getEnd().get(i); isPureAscii[i] = StringUtils.isPureAscii(startValue.asChars()) && StringUtils.isPureAscii(endValue.asChars()); + beginNulls[i] = startValue.isNull(); + endNulls[i] = endValue.isNull(); } colCards = new BigInteger[sortSpecs.length]; normalize(sortSpecs, this.mergedRange); for (int i = 0; i < sortSpecs.length; i++) { - Datum startValue = totalRange.getStart().get(i); - Datum endValue = totalRange.getEnd().get(i); + Datum startValue = entireRange.getStart().get(i); + Datum endValue = entireRange.getEnd().get(i); colCards[i] = computeCardinality(sortSpecs[i].getSortKey().getDataType(), startValue, endValue, inclusive, sortSpecs[i].isAscending()); @@ -152,17 +154,17 @@ public TupleRange[] partition(int partNum) { ranges.get(ranges.size() - 1).setEnd(originalRange.getEnd()); // Ensure all keys are totally ordered correctly. - for (int i = 0; i < ranges.size(); i++) { - if (i > 1) { - if (sortSpecs[0].isAscending()) { - Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0, - "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); - } else { - Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) > 0, - "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); - } - } - } +// for (int i = 0; i < ranges.size(); i++) { +// if (i > 1) { +// if (sortSpecs[0].isAscending()) { +// Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0, +// "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); +// } else { +// Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) > 0, +// "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); +// } +// } +// } return ranges.toArray(new TupleRange[ranges.size()]); } @@ -631,6 +633,12 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi default: throw new UnsupportedOperationException(column.getDataType() + " is not supported yet"); } + + // replace i'th end value by NULL if begin and end are all NULL + if (beginNulls[i] && endNulls[i]) { + end.put(i, NullDatum.get()); + continue; + } } return end; diff --git a/tajo-core/src/main/java/org/apache/tajo/master/querymaster/SubQuery.java b/tajo-core/src/main/java/org/apache/tajo/master/querymaster/SubQuery.java index 8eff8a4f9a..97a935653b 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/querymaster/SubQuery.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/querymaster/SubQuery.java @@ -668,7 +668,7 @@ public void run() { allocateContainers(subQuery); } - } catch (Exception e) { + } catch (Throwable e) { LOG.error("SubQuery (" + subQuery.getId() + ") ERROR: ", e); subQuery.setFinishTime(); subQuery.eventHandler.handle(new SubQueryDiagnosticsUpdateEvent(subQuery.getId(), e.getMessage())); @@ -679,7 +679,7 @@ public void run() { ); state = SubQueryState.INITED; } - } catch (Exception e) { + } catch (Throwable e) { LOG.error("SubQuery (" + subQuery.getId() + ") ERROR: ", e); subQuery.setFinishTime(); subQuery.eventHandler.handle(new SubQueryDiagnosticsUpdateEvent(subQuery.getId(), e.getMessage())); From 68ea6be54384c25f0d0a832fe571da334dc30f0d Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Wed, 20 Aug 2014 17:21:41 +0900 Subject: [PATCH 16/18] Fixing descending order bug. --- .../java/org/apache/tajo/datum/TextDatum.java | 4 +- .../engine/planner/UniformRangePartition.java | 122 ++++++++++++----- .../planner/TestUniformRangePartition.java | 125 +++++++++++++++++- 3 files changed, 213 insertions(+), 38 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index 3090ca454b..ef4da68b4f 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -18,10 +18,8 @@ package org.apache.tajo.datum; -import com.google.common.primitives.Chars; import com.google.common.primitives.UnsignedBytes; import com.google.gson.annotations.Expose; -import com.sun.tools.javac.util.Convert; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; @@ -98,7 +96,7 @@ public String asChars() { @Override public char[] asUnicodeChars() { - return Convert.utf2chars(this.bytes); + return new String(this.bytes, defaultCharset).toCharArray(); } @Override diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index e74b90df64..ad90e9d7ef 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -154,17 +154,12 @@ public TupleRange[] partition(int partNum) { ranges.get(ranges.size() - 1).setEnd(originalRange.getEnd()); // Ensure all keys are totally ordered correctly. -// for (int i = 0; i < ranges.size(); i++) { -// if (i > 1) { -// if (sortSpecs[0].isAscending()) { -// Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0, -// "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); -// } else { -// Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) > 0, -// "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); -// } -// } -// } + for (int i = 0; i < ranges.size(); i++) { + if (i > 1) { + Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0, + "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); + } + } return ranges.toArray(new TupleRange[ranges.size()]); } @@ -483,7 +478,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi if (overflowFlag[i]) { end.put(i, DatumFactory.createChar((char) (mergedRange.getStart().get(i).asChar() + incs[i].longValue()))); } else { - end.put(i, DatumFactory.createChar((char) (last.get(i).asChar() + incs[i].longValue()))); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createChar((char) (last.get(i).asChar() + incs[i].longValue()))); + } else { + end.put(i, DatumFactory.createChar((char) (last.get(i).asChar() - incs[i].longValue()))); + } } break; case BIT: @@ -491,7 +490,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi end.put(i, DatumFactory.createBit( (byte) (mergedRange.getStart().get(i).asByte() + incs[i].longValue()))); } else { - end.put(i, DatumFactory.createBit((byte) (last.get(i).asByte() + incs[i].longValue()))); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createBit((byte) (last.get(i).asByte() + incs[i].longValue()))); + } else { + end.put(i, DatumFactory.createBit((byte) (last.get(i).asByte() - incs[i].longValue()))); + } } break; case INT2: @@ -499,7 +502,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi end.put(i, DatumFactory.createInt2( (short) (mergedRange.getStart().get(i).asInt2() + incs[i].longValue()))); } else { - end.put(i, DatumFactory.createInt2((short) (last.get(i).asInt2() + incs[i].longValue()))); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createInt2((short) (last.get(i).asInt2() + incs[i].longValue()))); + } else { + end.put(i, DatumFactory.createInt2((short) (last.get(i).asInt2() - incs[i].longValue()))); + } } break; case INT4: @@ -519,7 +526,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi end.put(i, DatumFactory.createInt8( mergedRange.getStart().get(i).asInt8() + incs[i].longValue())); } else { - end.put(i, DatumFactory.createInt8(last.get(i).asInt8() + incs[i].longValue())); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createInt8(last.get(i).asInt8() + incs[i].longValue())); + } else { + end.put(i, DatumFactory.createInt8(last.get(i).asInt8() - incs[i].longValue())); + } } break; case FLOAT4: @@ -527,7 +538,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi end.put(i, DatumFactory.createFloat4( mergedRange.getStart().get(i).asFloat4() + incs[i].longValue())); } else { - end.put(i, DatumFactory.createFloat4(last.get(i).asFloat4() + incs[i].longValue())); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createFloat4(last.get(i).asFloat4() + incs[i].longValue())); + } else { + end.put(i, DatumFactory.createFloat4(last.get(i).asFloat4() - incs[i].longValue())); + } } break; case FLOAT8: @@ -535,7 +550,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi end.put(i, DatumFactory.createFloat8( mergedRange.getStart().get(i).asFloat8() + incs[i].longValue())); } else { - end.put(i, DatumFactory.createFloat8(last.get(i).asFloat8() + incs[i].longValue())); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createFloat8(last.get(i).asFloat8() + incs[i].longValue())); + } else { + end.put(i, DatumFactory.createFloat8(last.get(i).asFloat8() - incs[i].longValue())); + } } break; case TEXT: @@ -551,7 +570,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi if (isPureAscii[i]) { lastBigInt = UnsignedLong.valueOf(new BigInteger(last.get(i).asByteArray())).bigIntegerValue(); - end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createText(lastBigInt.add(incs[i]).toByteArray())); + } else { + end.put(i, DatumFactory.createText(lastBigInt.subtract(incs[i]).toByteArray())); + } } else { // We consider an array of chars as a 2^16 base number system because each char is 2^16 bits. @@ -577,16 +600,28 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi continue; } - int sum = (int)lastChars[k] + charIncs[k]; - if (sum > TextDatum.UNICODE_CHAR_BITS_NUM) { // if carry occurs in the current digit - charIncs[k] = sum - TextDatum.UNICODE_CHAR_BITS_NUM; - charIncs[k-1] += 1; - - lastChars[k-1] += 1; - lastChars[k] += charIncs[k]; + if (sortSpecs[i].isAscending()) { + int sum = (int) lastChars[k] + charIncs[k]; + if (sum > TextDatum.UNICODE_CHAR_BITS_NUM) { // if carry occurs in the current digit + charIncs[k] = sum - TextDatum.UNICODE_CHAR_BITS_NUM; + charIncs[k - 1] += 1; + + lastChars[k - 1] += 1; + lastChars[k] += charIncs[k]; + } else { + lastChars[k] += charIncs[k]; + } } else { - lastChars[k] += charIncs[k]; - + int sum = (int) lastChars[k] - charIncs[k]; + if (0 > TextDatum.UNICODE_CHAR_BITS_NUM) { // if carry occurs in the current digit + charIncs[k] = TextDatum.UNICODE_CHAR_BITS_NUM - sum; + charIncs[k - 1] -= 1; + + lastChars[k - 1] -= 1; + lastChars[k] += charIncs[k]; + } else { + lastChars[k] -= charIncs[k]; + } } } @@ -599,14 +634,22 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi if (overflowFlag[i]) { end.put(i, DatumFactory.createDate((int) (mergedRange.getStart().get(i).asInt4() + incs[i].longValue()))); } else { - end.put(i, DatumFactory.createDate((int) (last.get(i).asInt4() + incs[i].longValue()))); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createDate((int) (last.get(i).asInt4() + incs[i].longValue()))); + } else { + end.put(i, DatumFactory.createDate((int) (last.get(i).asInt4() - incs[i].longValue()))); + } } break; case TIME: if (overflowFlag[i]) { end.put(i, DatumFactory.createTime(mergedRange.getStart().get(i).asInt8() + incs[i].longValue())); } else { - end.put(i, DatumFactory.createTime(last.get(i).asInt8() + incs[i].longValue())); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createTime(last.get(i).asInt8() + incs[i].longValue())); + } else { + end.put(i, DatumFactory.createTime(last.get(i).asInt8() - incs[i].longValue())); + } } break; case TIMESTAMP: @@ -614,7 +657,11 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi end.put(i, DatumFactory.createTimestmpDatumWithJavaMillis( mergedRange.getStart().get(i).asInt8() + incs[i].longValue())); } else { - end.put(i, DatumFactory.createTimestmpDatumWithJavaMillis(last.get(i).asInt8() + incs[i].longValue())); + if (sortSpecs[i].isAscending()) { + end.put(i, DatumFactory.createTimestmpDatumWithJavaMillis(last.get(i).asInt8() + incs[i].longValue())); + } else { + end.put(i, DatumFactory.createTimestmpDatumWithJavaMillis(last.get(i).asInt8() - incs[i].longValue())); + } } break; case INET4: @@ -624,10 +671,17 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi assert ipBytes.length == 4; end.put(i, DatumFactory.createInet4(ipBytes)); } else { - int lastVal = last.get(i).asInt4() + incs[i].intValue(); - ipBytes = new byte[4]; - Bytes.putInt(ipBytes, 0, lastVal); - end.put(i, DatumFactory.createInet4(ipBytes)); + if (sortSpecs[i].isAscending()) { + int lastVal = last.get(i).asInt4() + incs[i].intValue(); + ipBytes = new byte[4]; + Bytes.putInt(ipBytes, 0, lastVal); + end.put(i, DatumFactory.createInet4(ipBytes)); + } else { + int lastVal = last.get(i).asInt4() - incs[i].intValue(); + ipBytes = new byte[4]; + Bytes.putInt(ipBytes, 0, lastVal); + end.put(i, DatumFactory.createInet4(ipBytes)); + } } break; default: diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index dacf9d9244..9ea4840627 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -34,6 +34,129 @@ import static org.junit.Assert.assertTrue; public class TestUniformRangePartition { + + @Test + public void testPartitionForINT4Asc() { + Schema schema = new Schema() + .addColumn("col1", Type.INT4); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createInt4(1)); + e.put(0, DatumFactory.createInt4(10000)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForINT4Desc() { + Schema schema = new Schema() + .addColumn("col1", Type.INT4); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createInt4(10000)); + e.put(0, DatumFactory.createInt4(1)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForINT8Asc() { + Schema schema = new Schema() + .addColumn("col1", Type.INT8); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createInt8(1)); + e.put(0, DatumFactory.createInt8(10000)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForINT8Desc() { + Schema schema = new Schema() + .addColumn("col1", Type.INT8); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createInt8(10000)); + e.put(0, DatumFactory.createInt8(1)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + /** * It verify overflow and increment in normal case. */ @@ -715,7 +838,7 @@ public void testPartitionForMultipleChars2Desc() { TupleRange prev = null; for (TupleRange r : ranges) { if (prev != null) { - assertTrue(prev.compareTo(r) > 0); + assertTrue(prev.compareTo(r) < 0); } prev = r; } From 987494763f14b0594edcfa347817d2c5dc7e848e Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Wed, 20 Aug 2014 17:45:24 +0900 Subject: [PATCH 17/18] Added more comments and fixed all bugs. --- .../engine/planner/UniformRangePartition.java | 8 +- .../planner/TestUniformRangePartition.java | 191 +++++++++++++++++- 2 files changed, 192 insertions(+), 7 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java index ad90e9d7ef..db1228509e 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java @@ -153,11 +153,12 @@ public TupleRange[] partition(int partNum) { ranges.get(0).setStart(originalRange.getStart()); ranges.get(ranges.size() - 1).setEnd(originalRange.getEnd()); - // Ensure all keys are totally ordered correctly. + // Ensure all keys are totally ordered in a right order. for (int i = 0; i < ranges.size(); i++) { if (i > 1) { Preconditions.checkState(ranges.get(i - 2).compareTo(ranges.get(i - 1)) < 0, - "Sort ranges are not totally ordered: prev key-" + ranges.get(i - 2) + ", cur key-" + ranges.get(i - 1)); + "Computed ranges are not totally ordered. Previous key=" + ranges.get(i - 2) + ", Current Key=" + + ranges.get(i - 1)); } } @@ -451,6 +452,7 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi if (i == 0) { throw new RangeOverflowException(mergedRange, last, incs[i].longValue(), sortSpecs[i].isAscending()); } + // increment some volume of the serialized one-dimension key space long rem = incrementAndGetReminder(i, last.get(i), value.longValue()); incs[i] = BigInteger.valueOf(rem); incs[i - 1] = incs[i-1].add(BigInteger.ONE); @@ -613,7 +615,7 @@ public Tuple increment(final Tuple last, BigInteger interval, final int baseDigi } } else { int sum = (int) lastChars[k] - charIncs[k]; - if (0 > TextDatum.UNICODE_CHAR_BITS_NUM) { // if carry occurs in the current digit + if (sum < 0) { // if carry occurs in the current digit charIncs[k] = TextDatum.UNICODE_CHAR_BITS_NUM - sum; charIncs[k - 1] -= 1; diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java index 9ea4840627..22944247bc 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestUniformRangePartition.java @@ -35,6 +35,67 @@ public class TestUniformRangePartition { + @Test + public void testPartitionForINT2Asc() { + Schema schema = new Schema() + .addColumn("col1", Type.INT2); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createInt2((short) 1)); + e.put(0, DatumFactory.createInt2((short) 30000)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForINT2Desc() { + Schema schema = new Schema() + .addColumn("col1", Type.INT2); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createInt2((short) 30000)); + e.put(0, DatumFactory.createInt2((short) 1)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + @Test public void testPartitionForINT4Asc() { Schema schema = new Schema() @@ -127,7 +188,7 @@ public void testPartitionForINT8Asc() { } @Test - public void testPartitionForINT8Desc() { + public void testPartitionForInt8Desc() { Schema schema = new Schema() .addColumn("col1", Type.INT8); @@ -157,6 +218,128 @@ public void testPartitionForINT8Desc() { assertTrue(ranges[partNum - 1].getEnd().equals(e)); } + @Test + public void testPartitionForFloat4Asc() { + Schema schema = new Schema() + .addColumn("col1", Type.FLOAT4); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createFloat4((float) 1.0)); + e.put(0, DatumFactory.createFloat4((float) 10000.0)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForFloat4Desc() { + Schema schema = new Schema() + .addColumn("col1", Type.FLOAT4); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createFloat4((float) 10000.0)); + e.put(0, DatumFactory.createFloat4((float) 1.0)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForFloat8Asc() { + Schema schema = new Schema() + .addColumn("col1", Type.FLOAT8); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createFloat8(1.0)); + e.put(0, DatumFactory.createFloat8(10000.0)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + + @Test + public void testPartitionForFloat8Desc() { + Schema schema = new Schema() + .addColumn("col1", Type.FLOAT8); + + SortSpec [] sortSpecs = PlannerUtil.schemaToSortSpecs(schema); + sortSpecs[0].setDescOrder(); + + Tuple s = new VTuple(1); + Tuple e = new VTuple(1); + s.put(0, DatumFactory.createFloat8((float) 10000.0)); + e.put(0, DatumFactory.createFloat8((float) 1.0)); + + TupleRange expected = new TupleRange(sortSpecs, s, e); + + UniformRangePartition partitioner = new UniformRangePartition(expected, sortSpecs); + int partNum = 64; + TupleRange [] ranges = partitioner.partition(partNum); + + TupleRange prev = null; + for (TupleRange r : ranges) { + if (prev != null) { + assertTrue(prev.compareTo(r) < 0); + } + prev = r; + } + assertEquals(partNum, ranges.length); + assertTrue(ranges[0].getStart().equals(s)); + assertTrue(ranges[partNum - 1].getEnd().equals(e)); + } + /** * It verify overflow and increment in normal case. */ @@ -485,7 +668,7 @@ public void testPartitionForUnicodeTextDesc() { TupleRange prev = null; for (TupleRange r : ranges) { if (prev != null) { - assertTrue(prev.compareTo(r) > 0); + assertTrue(prev.compareTo(r) < 0); } prev = r; } @@ -516,7 +699,7 @@ public void testPartitionForUnicodeDiffLenBeginTextDesc() { TupleRange prev = null; for (TupleRange r : ranges) { if (prev != null) { - assertTrue(prev.compareTo(r) > 0); + assertTrue(prev.compareTo(r) < 0); } prev = r; } @@ -547,7 +730,7 @@ public void testPartitionForUnicodeDiffLenEndTextDesc() { TupleRange prev = null; for (TupleRange r : ranges) { if (prev != null) { - assertTrue(prev.compareTo(r) > 0); + assertTrue(prev.compareTo(r) < 0); } prev = r; } From e7aed022707eea79a08cb4c050ed3f53af8b49bc Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 21 Aug 2014 15:03:26 +0900 Subject: [PATCH 18/18] Use Convert. --- tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java | 3 ++- .../src/main/java/org/apache/tajo/util/StringUtils.java | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index ef4da68b4f..ca76ed24ec 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -20,6 +20,7 @@ import com.google.common.primitives.UnsignedBytes; import com.google.gson.annotations.Expose; +import com.sun.tools.javac.util.Convert; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; @@ -96,7 +97,7 @@ public String asChars() { @Override public char[] asUnicodeChars() { - return new String(this.bytes, defaultCharset).toCharArray(); + return Convert.utf2chars(this.bytes); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java index 96118ac1a1..232c15dd1c 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java @@ -18,6 +18,7 @@ package org.apache.tajo.util; +import org.apache.commons.codec.StringDecoder; import org.apache.commons.lang.CharUtils; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.SystemUtils; @@ -25,6 +26,8 @@ import org.apache.hadoop.util.ShutdownHookManager; import org.apache.hadoop.util.SignalLogger; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.Arrays;