From 00fe7c284b75bc69755a922144f3c9e784b887cb Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 29 Oct 2015 14:58:39 +0900 Subject: [PATCH 01/28] TAJO-1271: Improve memory usage in HashShuffleFileWriteExec --- .../org/apache/tajo/TajoTestingCluster.java | 5 +- .../java/org/apache/tajo/SessionVars.java | 4 +- .../java/org/apache/tajo/conf/TajoConf.java | 12 +- .../org/apache/tajo/storage/BufferPool.java | 2 +- .../apache/tajo/tuple/BaseTupleBuilder.java | 11 +- .../tuple/memory/OffHeapRowBlockWriter.java | 19 +- .../tajo/tuple/memory/OffHeapRowWriter.java | 2 +- .../tajo/tuple/memory/ResizableLimitSpec.java | 2 +- .../tuple/memory/ResizableMemoryBlock.java | 2 +- .../apache/tajo/tuple/memory/RowWriter.java | 4 +- .../apache/tajo/tuple/memory/UnSafeTuple.java | 8 +- .../tajo/tuple/memory/TestMemoryRowBlock.java | 2 +- .../physical/TestExternalSortExec.java | 2 +- .../querymaster/TestTaskStatusUpdate.java | 12 +- .../TestTajoCli/testHelpSessionVars.result | 3 +- .../engine/planner/global/DataChannel.java | 2 +- .../planner/physical/ExternalSortExec.java | 38 +-- .../physical/HashShuffleFileWriteExec.java | 213 +++++++++---- .../engine/planner/physical/SortExec.java | 3 +- .../engine/planner/physical/TupleSorter.java | 5 +- .../planner/physical/UnsafeTupleList.java | 69 +++++ .../planner/physical/VectorizedSorter.java | 3 +- .../NonForwardQueryResultFileScanner.java | 2 +- .../NonForwardQueryResultSystemScanner.java | 2 +- .../tajo/master/exec/QueryExecutor.java | 2 +- .../tajo/querymaster/Repartitioner.java | 4 +- .../org/apache/tajo/worker/TajoWorker.java | 4 + .../java/org/apache/tajo/worker/TaskImpl.java | 10 +- .../org/apache/tajo/jdbc/TestResultSet.java | 2 +- .../tajo/plan/function/stream/BufferPool.java | 2 +- tajo-project/pom.xml | 2 +- .../tajo/storage/HashShuffleAppender.java | 111 ++++--- .../storage/HashShuffleAppenderManager.java | 159 ++++++---- .../java/org/apache/tajo/storage/RawFile.java | 284 +++++++++--------- 
.../storage/rawfile/DirectRawFileWriter.java | 39 ++- 35 files changed, 650 insertions(+), 396 deletions(-) create mode 100644 tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java index a4fff574e2..646f29a611 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java @@ -157,8 +157,9 @@ void initPropertiesAndConfigs() { // Python function path conf.setStrings(ConfVars.PYTHON_CODE_DIR.varname, getClass().getResource("/python").toString()); - // Query output file - conf.setVar(ConfVars.QUERY_OUTPUT_DEFAULT_FILE_FORMAT, BuiltinStorages.DRAW); + // Buffer size + conf.setInt(ConfVars.$EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE.varname, 1); + conf.setInt(ConfVars.$EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE.varname, 1); /* Since Travis CI limits the size of standard output log up to 4MB */ if (!StringUtils.isEmpty(LOG_LEVEL)) { diff --git a/tajo-common/src/main/java/org/apache/tajo/SessionVars.java b/tajo-common/src/main/java/org/apache/tajo/SessionVars.java index 7e419f0221..8b89bebc5e 100644 --- a/tajo-common/src/main/java/org/apache/tajo/SessionVars.java +++ b/tajo-common/src/main/java/org/apache/tajo/SessionVars.java @@ -117,7 +117,9 @@ public enum SessionVars implements ConfigKey { // for physical Executors EXTSORT_BUFFER_SIZE(ConfVars.$EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE, "sort buffer size for external sort (mb)", DEFAULT, - Long.class, Validators.min("0")), + Integer.class, Validators.min("0")), + HASH_SHUFFLE_BUFFER_SIZE(ConfVars.$EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE, "hash-shuffle buffer size for local disk I/O (mb)" + , DEFAULT, Integer.class, Validators.min("1")), HASH_JOIN_SIZE_LIMIT(ConfVars.$EXECUTOR_HASH_JOIN_SIZE_THRESHOLD, "limited size for hash join (mb)", DEFAULT, Long.class, 
Validators.min("0")), INNER_HASH_JOIN_SIZE_LIMIT(ConfVars.$EXECUTOR_INNER_HASH_JOIN_SIZE_THRESHOLD, diff --git a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java index 2abd99629a..f6d6502a22 100644 --- a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java +++ b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java @@ -206,19 +206,18 @@ public static enum ConfVars implements ConfigKey { // Shuffle Configuration -------------------------------------------------- PULLSERVER_PORT("tajo.pullserver.port", 0, Validators.range("0", "65535")), SHUFFLE_SSL_ENABLED_KEY("tajo.pullserver.ssl.enabled", false, Validators.bool()), - SHUFFLE_FILE_FORMAT("tajo.shuffle.file-format", BuiltinStorages.RAW, Validators.javaString()), + SHUFFLE_FILE_FORMAT("tajo.shuffle.file-format", BuiltinStorages.DRAW, Validators.javaString()), SHUFFLE_FETCHER_PARALLEL_EXECUTION_MAX_NUM("tajo.shuffle.fetcher.parallel-execution.max-num", 2, Validators.min("1")), SHUFFLE_FETCHER_CHUNK_MAX_SIZE("tajo.shuffle.fetcher.chunk.max-size", 8192), SHUFFLE_FETCHER_CONNECT_TIMEOUT("tajo.shuffle.fetcher.connect.timeout-sec", 60, Validators.min("1")), SHUFFLE_FETCHER_READ_TIMEOUT("tajo.shuffle.fetcher.read.timeout-sec", 60, Validators.min("1")), SHUFFLE_FETCHER_READ_RETRY_MAX_NUM("tajo.shuffle.fetcher.read.retry.max-num", 2, Validators.min("0")), - SHUFFLE_HASH_APPENDER_BUFFER_SIZE("tajo.shuffle.hash.appender.buffer.size", 10000), - SHUFFLE_HASH_APPENDER_PAGE_VOLUME("tajo.shuffle.hash.appender.page.volumn-mb", 30), - HASH_SHUFFLE_PARENT_DIRS("tajo.hash.shuffle.parent.dirs.count", 10), + SHUFFLE_HASH_APPENDER_PAGE_VOLUME("tajo.shuffle.hash.appender.page.volume-mb", 30), + SHUFFLE_HASH_PARENT_DIRS("tajo.shuffle.hash.parent.dirs.count", 64), // Query output Configuration -------------------------------------------------- - QUERY_OUTPUT_DEFAULT_FILE_FORMAT("tajo.query.output.file-format", BuiltinStorages.TEXT, 
Validators.javaString()), + QUERY_OUTPUT_DEFAULT_FILE_FORMAT("tajo.query.output.file-format", BuiltinStorages.DRAW, Validators.javaString()), // Storage Configuration -------------------------------------------------- ROWFILE_SYNC_INTERVAL("rowfile.sync.interval", 100), @@ -334,7 +333,7 @@ public static enum ConfVars implements ConfigKey { $QUERY_EXECUTE_PARALLEL_MAX("tajo.query.execute.parallel.max", 10), // for physical Executors - $EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE("tajo.executor.external-sort.buffer-mb", 200L), + $EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE("tajo.executor.external-sort.buffer-mb", 200), $EXECUTOR_HASH_JOIN_SIZE_THRESHOLD("tajo.executor.join.common.in-memory-hash-threshold-mb", 64l, Validators.min("0")), $EXECUTOR_INNER_HASH_JOIN_SIZE_THRESHOLD("tajo.executor.join.inner.in-memory-hash-threshold-mb", 64l, Validators.min("0")), @@ -342,6 +341,7 @@ public static enum ConfVars implements ConfigKey { Validators.min("0")), $EXECUTOR_GROUPBY_INMEMORY_HASH_THRESHOLD("tajo.executor.groupby.in-memory-hash-threshold-mb", 64l, Validators.min("0")), + $EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE("tajo.executor.hash-shuffle.buffer-mb", 200, Validators.min("1")), $MAX_OUTPUT_FILE_SIZE("tajo.query.max-outfile-size-mb", 0), // zero means infinite $CODEGEN("tajo.executor.codegen.enabled", false), // Runtime code generation (todo this is broken) diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java b/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java index 403d789e37..356183ca84 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java @@ -52,7 +52,7 @@ private BufferPool() { ResourceLeakDetector.setLevel(ResourceLeakDetector.Level.ADVANCED); } else { TajoConf tajoConf = new TajoConf(); - ALLOCATOR = createPooledByteBufAllocator(true, tajoConf.getBoolean(ALLOW_CACHE, false), 0); + ALLOCATOR = createPooledByteBufAllocator(true, 
tajoConf.getBoolean(ALLOW_CACHE, true), 0); } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java b/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java index cb417f32d0..00328829a2 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java @@ -19,6 +19,8 @@ package org.apache.tajo.tuple; import org.apache.tajo.common.TajoDataTypes.DataType; +import org.apache.tajo.exception.NotImplementedException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.storage.Tuple; import org.apache.tajo.tuple.memory.*; import org.apache.tajo.unit.StorageUnit; @@ -65,15 +67,20 @@ public void endRow() { } @Override - public void addTuple(Tuple tuple) { + public void putTuple(Tuple tuple) { if (tuple instanceof UnSafeTuple) { UnSafeTuple unSafeTuple = TUtil.checkTypeAndGet(tuple, UnSafeTuple.class); - addTuple(unSafeTuple); + putTuple(unSafeTuple); } else { OffHeapRowBlockUtils.convert(tuple, this); } } + @Override + public ZeroCopyTuple addTuple(Tuple tuple) { + throw new TajoRuntimeException(new NotImplementedException()); + } + @Override public Tuple build() { return buildToHeapTuple(); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java index 9f3d8a2c36..bf27d2ae84 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java @@ -65,13 +65,28 @@ public TajoDataTypes.DataType[] dataTypes() { @Override - public void addTuple(Tuple tuple) { + public void putTuple(Tuple tuple) { if (tuple instanceof UnSafeTuple) { UnSafeTuple unSafeTuple = TUtil.checkTypeAndGet(tuple, UnSafeTuple.class); - addTuple(unSafeTuple); + putTuple(unSafeTuple); 
rowBlock.setRows(rowBlock.rows() + 1); } else { OffHeapRowBlockUtils.convert(tuple, this); } } + + @Override + public ZeroCopyTuple addTuple(Tuple tuple) { + int prevPos = rowBlock.getMemory().writerPosition(); + if (tuple instanceof UnSafeTuple) { + UnSafeTuple unSafeTuple = TUtil.checkTypeAndGet(tuple, UnSafeTuple.class); + putTuple(unSafeTuple); + rowBlock.setRows(rowBlock.rows() + 1); + } else { + OffHeapRowBlockUtils.convert(tuple, this); + } + UnSafeTuple unSafeTuple = new UnSafeTuple(); + unSafeTuple.set(rowBlock.getMemory(), prevPos, rowBlock.getMemory().writerPosition() - prevPos, dataTypes()); + return unSafeTuple; + } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java index f082762ca9..3bb26a147b 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java @@ -289,7 +289,7 @@ public void putProtoDatum(ProtobufDatum val) { } - protected void addTuple(UnSafeTuple tuple) { + protected void putTuple(UnSafeTuple tuple) { int length = tuple.getLength(); ensureSize(length); PlatformDependent.copyMemory(tuple.address(), address() + position(), length); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java index 614b3fb01f..79cc1c58f6 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java @@ -29,7 +29,7 @@ * due to ByteBuffer. 
*/ public class ResizableLimitSpec { - private final Log LOG = LogFactory.getLog(ResizableLimitSpec.class); + private static final Log LOG = LogFactory.getLog(ResizableLimitSpec.class); public static final int MAX_SIZE_BYTES = Integer.MAX_VALUE; public static final ResizableLimitSpec DEFAULT_LIMIT = new ResizableLimitSpec(Integer.MAX_VALUE); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java index 22c25617f4..0d09af1a9f 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java @@ -154,7 +154,7 @@ private void resize(int newSize) { @Override public void release() { - buffer.release(); + if(buffer.refCnt() > 0) buffer.release(); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java index 0393714d3c..d49636e1b5 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java @@ -76,5 +76,7 @@ public interface RowWriter { void putProtoDatum(ProtobufDatum datum); - void addTuple(Tuple tuple); + void putTuple(Tuple tuple); + + ZeroCopyTuple addTuple(Tuple tuple); } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java index ec167f8237..b0cce1f29d 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java @@ -42,20 +42,20 @@ public class UnSafeTuple extends ZeroCopyTuple { private static final Unsafe UNSAFE = UnsafeUtil.unsafe; - private long address; + private MemoryBlock memoryBlock; private DataType[] types; 
@Override public void set(MemoryBlock memoryBlock, int relativePos, int length, DataType[] types) { Preconditions.checkArgument(memoryBlock.hasAddress()); - this.address = memoryBlock.address(); + this.memoryBlock = memoryBlock; this.types = types; super.set(relativePos, length); } public void set(UnSafeTuple tuple) { - this.address = tuple.address; + this.memoryBlock = tuple.memoryBlock; this.types = tuple.types; super.set(tuple.getRelativePos(), tuple.getLength()); } @@ -93,7 +93,7 @@ public void writeTo(ByteBuffer bb) { } public long address() { - return address + getRelativePos(); + return memoryBlock.address() + getRelativePos(); } public HeapTuple toHeapTuple() { diff --git a/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java index a6003c773d..15f0054922 100644 --- a/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java +++ b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java @@ -274,7 +274,7 @@ public void testVTuplePutAndGetBenchmarkViaDirectRowEncoder() { VTuple tuple = new VTuple(schema.length); for (int i = 0; i < rowNum; i++) { fillVTuple(i, tuple); - rowBlock.getWriter().addTuple(tuple); + rowBlock.getWriter().putTuple(tuple); } long writeEnd = System.currentTimeMillis(); LOG.info("Writing takes " + (writeEnd - writeStart) + " msec"); diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java index e796bad443..10aa47df62 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java @@ -122,7 +122,7 @@ public void tearDown() throws Exception { public final void testNext() throws IOException, 
TajoException { conf.setIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT, 2); QueryContext queryContext = LocalTajoTestingUtility.createDummyContext(conf); - queryContext.setLong(SessionVars.EXTSORT_BUFFER_SIZE, 1024*1024); + queryContext.setLong(SessionVars.EXTSORT_BUFFER_SIZE, 1); FileFragment[] frags = FileTablespace.splitNG(conf, "default.employee", employee.getMeta(), new Path(employee.getUri()), Integer.MAX_VALUE); diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java b/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java index f845bb36e7..e1e10ffcf4 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java @@ -59,8 +59,8 @@ public final void case1() throws Exception { // tpch/lineitem.tbl long[] expectedNumRows = new long[]{5, 2, 2, 2}; - long[] expectedNumBytes = new long[]{604, 18, 18, 48}; - long[] expectedReadBytes = new long[]{604, 604, 18, 0}; + long[] expectedNumBytes = new long[]{604, 48, 48, 48}; + long[] expectedReadBytes = new long[]{604, 604, 48, 0}; QueryId queryId = getQueryId(res); assertStatus(queryId, 2, expectedNumRows, expectedNumBytes, expectedReadBytes); @@ -78,8 +78,8 @@ public final void case2() throws Exception { // tpch/lineitem.tbl long[] expectedNumRows = new long[]{5, 2, 2, 2, 2, 2}; - long[] expectedNumBytes = new long[]{604, 162, 162, 138, 138, 236}; - long[] expectedReadBytes = new long[]{604, 604, 162, 0, 138, 0}; + long[] expectedNumBytes = new long[]{604, 278, 278, 236, 236, 236}; + long[] expectedReadBytes = new long[]{604, 604, 278, 0, 236, 0}; QueryId queryId = getQueryId(res); assertStatus(queryId, 3, expectedNumRows, expectedNumBytes, expectedReadBytes); @@ -107,8 +107,8 @@ public final void case3() throws Exception { // in/out * stage(4) long[] expectedNumRows = new long[]{5, 5, 2, 2, 7, 2, 2, 2}; - long[] expectedNumBytes 
= new long[]{20, 75, 8, 34, 109, 34, 34, 64}; - long[] expectedReadBytes = new long[]{20, 20, 8, 8, 109, 0, 34, 0}; + long[] expectedNumBytes = new long[]{20, 80, 8, 64, 144, 64, 64, 64}; + long[] expectedReadBytes = new long[]{20, 20, 8, 8, 144, 0, 64, 0}; QueryId queryId = getQueryId(res); assertStatus(queryId, 4, expectedNumRows, expectedNumBytes, expectedReadBytes); diff --git a/tajo-core-tests/src/test/resources/results/TestTajoCli/testHelpSessionVars.result b/tajo-core-tests/src/test/resources/results/TestTajoCli/testHelpSessionVars.result index 2d87b561b9..b075e60569 100644 --- a/tajo-core-tests/src/test/resources/results/TestTajoCli/testHelpSessionVars.result +++ b/tajo-core-tests/src/test/resources/results/TestTajoCli/testHelpSessionVars.result @@ -28,7 +28,8 @@ Available Session Variables: \set TABLE_PARTITION_PER_SHUFFLE_SIZE [int value] - shuffle output size for partition table write (mb) \set GROUPBY_MULTI_LEVEL_ENABLED [true or false] - Multiple level groupby enabled \set QUERY_EXECUTE_PARALLEL [int value] - Maximum parallel running of execution blocks for a query -\set EXTSORT_BUFFER_SIZE [long value] - sort buffer size for external sort (mb) +\set EXTSORT_BUFFER_SIZE [int value] - sort buffer size for external sort (mb) +\set HASH_SHUFFLE_BUFFER_SIZE [int value] - hash-shuffle buffer size for local disk I/O (mb) \set HASH_JOIN_SIZE_LIMIT [long value] - limited size for hash join (mb) \set INNER_HASH_JOIN_SIZE_LIMIT [long value] - limited size for hash inner join (mb) \set OUTER_HASH_JOIN_SIZE_LIMIT [long value] - limited size for hash outer join (mb) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java index c779d2f362..38fdc246a9 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java @@ -40,7 +40,7 @@ public 
class DataChannel { private Schema schema; - private String dataFormat = BuiltinStorages.RAW; + private String dataFormat = BuiltinStorages.DRAW; public DataChannel(ExecutionBlockId srcId, ExecutionBlockId targetId) { this.srcId = srcId; diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java index 42d99bb113..8944fb86a5 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java @@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RawLocalFileSystem; import org.apache.hadoop.io.IOUtils; +import org.apache.tajo.BuiltinStorages; import org.apache.tajo.SessionVars; import org.apache.tajo.catalog.CatalogUtil; import org.apache.tajo.catalog.Schema; @@ -36,6 +37,8 @@ import org.apache.tajo.storage.*; import org.apache.tajo.storage.fragment.FileFragment; import org.apache.tajo.storage.fragment.FragmentConvertor; +import org.apache.tajo.storage.rawfile.DirectRawFileScanner; +import org.apache.tajo.storage.rawfile.DirectRawFileWriter; import org.apache.tajo.unit.StorageUnit; import org.apache.tajo.util.FileUtil; import org.apache.tajo.util.TUtil; @@ -48,9 +51,6 @@ import java.util.List; import java.util.concurrent.*; -import static org.apache.tajo.storage.RawFile.RawFileAppender; -import static org.apache.tajo.storage.RawFile.RawFileScanner; - /** * This external sort algorithm can be characterized by the followings: * @@ -79,7 +79,7 @@ public class ExternalSortExec extends SortExec { /** If there are available multiple cores, it tries parallel merge. */ private ExecutorService executorService; /** used for in-memory sort of each chunk. 
*/ - private TupleList inMemoryTable; + private UnsafeTupleList inMemoryTable; /** temporal dir */ private final Path sortTmpDir; /** It enables round-robin disks allocation */ @@ -108,17 +108,18 @@ private ExternalSortExec(final TaskAttemptContext context, final SortNode plan) super(context, plan.getInSchema(), plan.getOutSchema(), null, plan.getSortKeys()); this.plan = plan; - this.meta = CatalogUtil.newTableMeta("ROWFILE"); + this.meta = CatalogUtil.newTableMeta(BuiltinStorages.DRAW); this.defaultFanout = context.getConf().getIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT); if (defaultFanout < 2) { throw new PhysicalPlanningException(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT.varname + " cannot be lower than 2"); } // TODO - sort buffer and core num should be changed to use the allocated container resource. - this.sortBufferBytesNum = context.getQueryContext().getLong(SessionVars.EXTSORT_BUFFER_SIZE) * StorageUnit.MB; + long bufferSize = context.getQueryContext().getInt(SessionVars.EXTSORT_BUFFER_SIZE) * StorageUnit.MB; + this.sortBufferBytesNum = (int) (bufferSize * 0.8); this.allocatedCoreNum = context.getConf().getIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_THREAD_NUM); this.executorService = Executors.newFixedThreadPool(this.allocatedCoreNum); - this.inMemoryTable = new TupleList(100000); + this.inMemoryTable = new UnsafeTupleList(outSchema, (int) Math.min(bufferSize, 16 * StorageUnit.MB)); this.sortTmpDir = getExecutorTmpDir(); localDirAllocator = new LocalDirAllocator(ConfVars.WORKER_TEMPORAL_DIR.varname); @@ -154,9 +155,9 @@ public SortNode getPlan() { /** * Sort a tuple block and store them into a chunk file */ - private Path sortAndStoreChunk(int chunkId, TupleList tupleBlock) + private Path sortAndStoreChunk(int chunkId, List tupleBlock) throws IOException { - TableMeta meta = CatalogUtil.newTableMeta("RAW"); + TableMeta meta = CatalogUtil.newTableMeta(BuiltinStorages.DRAW); int rowNum = tupleBlock.size(); long sortStart = System.currentTimeMillis(); @@ -165,7 
+166,7 @@ private Path sortAndStoreChunk(int chunkId, TupleList tupleBlock) long chunkWriteStart = System.currentTimeMillis(); Path outputPath = getChunkPathForWrite(0, chunkId); - final RawFileAppender appender = new RawFileAppender(context.getConf(), null, inSchema, meta, outputPath); + final FileAppender appender = new DirectRawFileWriter(context.getConf(), null, inSchema, meta, outputPath); appender.init(); for (Tuple t : sorted) { appender.addTuple(t); @@ -190,26 +191,24 @@ private Path sortAndStoreChunk(int chunkId, TupleList tupleBlock) */ private List sortAndStoreAllChunks() throws IOException { Tuple tuple; - long memoryConsumption = 0; List chunkPaths = TUtil.newList(); int chunkId = 0; long runStartTime = System.currentTimeMillis(); while (!context.isStopped() && (tuple = child.next()) != null) { // partition sort start inMemoryTable.add(tuple); - memoryConsumption += MemoryUtil.calculateMemorySize(tuple); - if (memoryConsumption > sortBufferBytesNum) { + if (inMemoryTable.getUsedMem() > sortBufferBytesNum) { long runEndTime = System.currentTimeMillis(); info(LOG, chunkId + " run loading time: " + (runEndTime - runStartTime) + " msec"); runStartTime = runEndTime; - info(LOG, "Memory consumption exceeds " + sortBufferBytesNum + " bytes"); + info(LOG, "Memory consumption exceeds " + FileUtil.humanReadableByteCount(inMemoryTable.getUsedMem(), false)); memoryResident = false; chunkPaths.add(sortAndStoreChunk(chunkId, inMemoryTable)); - memoryConsumption = 0; + inMemoryTable.clear(); chunkId++; // When the volume of sorting data once exceed the size of sort buffer, @@ -248,7 +247,8 @@ private List sortAndStoreAllChunks() throws IOException { * Get a local path from all temporal paths in round-robin manner. 
*/ private synchronized Path getChunkPathForWrite(int level, int chunkId) throws IOException { - return localDirAllocator.getLocalPathForWrite(sortTmpDir + "/" + level +"_" + chunkId, context.getConf()); + return localFS.makeQualified(localDirAllocator.getLocalPathForWrite( + sortTmpDir + "/" + level + "_" + chunkId, context.getConf())); } @Override @@ -459,7 +459,7 @@ public FileFragment call() throws Exception { final Path outputPath = getChunkPathForWrite(level + 1, nextRunId); info(LOG, mergeFanout + " files are being merged to an output file " + outputPath.getName()); long mergeStartTime = System.currentTimeMillis(); - final RawFileAppender output = new RawFileAppender(context.getConf(), null, inSchema, meta, outputPath); + final FileAppender output = new DirectRawFileWriter(context.getConf(), null, inSchema, meta, outputPath); output.init(); final Scanner merger = createKWayMerger(inputFiles, startIdx, mergeFanout); merger.init(); @@ -499,7 +499,7 @@ private Scanner createFinalMerger(List inputs) throws IOException } private Scanner getFileScanner(FileFragment frag) throws IOException { - return new RawFileScanner(context.getConf(), plan.getInSchema(), meta, frag); + return new DirectRawFileScanner(context.getConf(), plan.getInSchema(), meta, frag); } private Scanner createKWayMerger(List inputs, final int startChunkId, final int num) throws IOException { @@ -794,7 +794,7 @@ public void close() throws IOException { } if(inMemoryTable != null){ - inMemoryTable.clear(); + inMemoryTable.release(); inMemoryTable = null; } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java index a72a375251..026db3ff64 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java @@ -19,37 
+19,59 @@ package org.apache.tajo.engine.planner.physical; import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tajo.SessionVars; import org.apache.tajo.catalog.CatalogUtil; import org.apache.tajo.catalog.Column; +import org.apache.tajo.catalog.SchemaUtil; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.catalog.statistics.TableStats; -import org.apache.tajo.conf.TajoConf.ConfVars; +import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.plan.logical.ShuffleFileWriteNode; -import org.apache.tajo.storage.HashShuffleAppender; import org.apache.tajo.storage.HashShuffleAppenderManager; import org.apache.tajo.storage.Tuple; +import org.apache.tajo.tuple.memory.MemoryRowBlock; +import org.apache.tajo.tuple.memory.RowBlock; +import org.apache.tajo.tuple.memory.RowWriter; +import org.apache.tajo.unit.StorageUnit; +import org.apache.tajo.util.FileUtil; import org.apache.tajo.worker.TaskAttemptContext; import java.io.IOException; -import java.util.HashMap; +import java.util.ArrayList; +import java.util.List; import java.util.Map; +import java.util.concurrent.Future; /** * HashShuffleFileWriteExec is a physical executor to store intermediate data into a number of * file outputs associated with shuffle keys. The file outputs are stored on local disks. 
*/ public final class HashShuffleFileWriteExec extends UnaryPhysicalExec { - private static Log LOG = LogFactory.getLog(HashShuffleFileWriteExec.class); - private ShuffleFileWriteNode plan; + private static final Log LOG = LogFactory.getLog(HashShuffleFileWriteExec.class); + private static final int MAXIMUM_INITIAL_BUFFER_SIZE = StorageUnit.MB; + private static final int MINIMUM_INITIAL_BUFFER_SIZE = 4 * StorageUnit.KB; + // When buffer usage exceeds this fraction of the maximum buffer size, the buffers are flushed to local storage + private static final float BUFFER_THRESHOLD_FACTOR = 0.8f; + + private final ShuffleFileWriteNode plan; private final TableMeta meta; - private Partitioner partitioner; - private Map appenderMap = new HashMap<>(); + private final Partitioner partitioner; private final int numShuffleOutputs; - private final int [] shuffleKeyIds; - private HashShuffleAppenderManager hashShuffleAppenderManager; - private int numHashShuffleBufferTuples; + private final int[] shuffleKeyIds; + private final HashShuffleAppenderManager hashShuffleAppenderManager; + private final int maxBufferSize; + private final int bufferThreshold; + private final int initialBufferSize; + private final DataType[] dataTypes; + + private final Map partitionMemoryMap; + private long writtenBytes = 0; + private long usedBufferSize = 0; + private long totalBufferCapacity = 0; public HashShuffleFileWriteExec(TaskAttemptContext context, final ShuffleFileWriteNode plan, final PhysicalExec child) throws IOException { @@ -71,78 +93,147 @@ public HashShuffleFileWriteExec(TaskAttemptContext context, } this.partitioner = new HashPartitioner(shuffleKeyIds, numShuffleOutputs); this.hashShuffleAppenderManager = context.getHashShuffleAppenderManager(); - this.numHashShuffleBufferTuples = context.getConf().getIntVar(ConfVars.SHUFFLE_HASH_APPENDER_BUFFER_SIZE); + this.maxBufferSize = context.getQueryContext().getInt(SessionVars.HASH_SHUFFLE_BUFFER_SIZE) * StorageUnit.MB; + this.bufferThreshold = (int) (maxBufferSize * 
BUFFER_THRESHOLD_FACTOR); + this.dataTypes = SchemaUtil.toDataTypes(outSchema); + + if(numShuffleOutputs > 0){ + this.initialBufferSize = Math.min(MAXIMUM_INITIAL_BUFFER_SIZE, + Math.max(maxBufferSize / numShuffleOutputs, MINIMUM_INITIAL_BUFFER_SIZE)); + } else { + this.initialBufferSize = MINIMUM_INITIAL_BUFFER_SIZE; + } + + this.partitionMemoryMap = Maps.newHashMap(); } @Override public void init() throws IOException { super.init(); } - - private HashShuffleAppender getAppender(int partId) throws IOException { - HashShuffleAppender appender = appenderMap.get(partId); - if (appender == null) { - appender = hashShuffleAppenderManager.getAppender(context.getConf(), - context.getTaskId().getTaskId().getExecutionBlockId(), partId, meta, outSchema); - appenderMap.put(partId, appender); - } - return appender; - } - - Map partitionTuples = new HashMap<>(); - long writtenBytes = 0L; @Override public Tuple next() throws IOException { try { Tuple tuple; int partId; - int tupleCount = 0; long numRows = 0; while (!context.isStopped() && (tuple = child.next()) != null) { - tupleCount++; numRows++; partId = partitioner.getPartition(tuple); - TupleList partitionTupleList = partitionTuples.get(partId); - if (partitionTupleList == null) { - partitionTupleList = new TupleList(1000); - partitionTuples.put(partId, partitionTupleList); + MemoryRowBlock rowBlock = partitionMemoryMap.get(partId); + if (rowBlock == null) { + rowBlock = new MemoryRowBlock(dataTypes, initialBufferSize); + partitionMemoryMap.put(partId, rowBlock); + totalBufferCapacity += rowBlock.capacity(); } - partitionTupleList.add(tuple); - if (tupleCount >= numHashShuffleBufferTuples) { - for (Map.Entry entry : partitionTuples.entrySet()) { + + RowWriter writer = rowBlock.getWriter(); + long prevUsedMem = rowBlock.getMemory().writerPosition(); + totalBufferCapacity -= rowBlock.capacity(); + writer.putTuple(tuple); + totalBufferCapacity += rowBlock.capacity(); // calculate resizeable buffer capacity + usedBufferSize += 
(rowBlock.getMemory().writerPosition() - prevUsedMem); + + if (totalBufferCapacity > maxBufferSize) { + LOG.warn(String.format("Buffer-Capacity threshold(%s) exceeded(%s). usage: %s", + FileUtil.humanReadableByteCount(bufferThreshold, false), + FileUtil.humanReadableByteCount(totalBufferCapacity, false), + FileUtil.humanReadableByteCount(usedBufferSize, false))); + + List> resultList = Lists.newArrayList(); + for (Map.Entry entry : partitionMemoryMap.entrySet()) { int appendPartId = entry.getKey(); - HashShuffleAppender appender = getAppender(appendPartId); - int appendedSize = appender.addTuples(context.getTaskId(), entry.getValue()); - writtenBytes += appendedSize; - entry.getValue().clear(); + + MemoryRowBlock memoryRowBlock = entry.getValue(); + if(memoryRowBlock.getMemory().isReadable()) { + //flush and release buffer + resultList.add(hashShuffleAppenderManager. + writePartitions(meta, outSchema, context.getTaskId(), appendPartId, memoryRowBlock, true)); + } else { + // release the unused buffer + memoryRowBlock.release(); + } + } + + // wait for flush to storage + for (Future future : resultList) { + future.get(); + } + + writtenBytes += usedBufferSize; + totalBufferCapacity = usedBufferSize = 0; + partitionMemoryMap.clear(); + + } else if (usedBufferSize > bufferThreshold) { + ArrayList releaseList = Lists.newArrayList(); + List> resultList = Lists.newArrayList(); + for (Map.Entry entry : partitionMemoryMap.entrySet()) { + + int appendPartId = entry.getKey(); + MemoryRowBlock memoryRowBlock = entry.getValue(); + if(memoryRowBlock.getMemory().isReadable()) { + + //flush and reuse buffer + resultList.add(hashShuffleAppenderManager. 
+ writePartitions(meta, outSchema, context.getTaskId(), appendPartId, memoryRowBlock, false)); + } else { + releaseList.add(appendPartId); + } + } + + // wait for flush to storage + for (Future future : resultList) { + future.get(); + } + + writtenBytes += usedBufferSize; + usedBufferSize = 0; + + // release the unused partition + for (Integer id : releaseList) { + MemoryRowBlock memoryRowBlock = partitionMemoryMap.remove(id); + LOG.warn("release unused buffer" + memoryRowBlock.capacity()); + memoryRowBlock.release(); } - tupleCount = 0; } } - // processing remained tuples - for (Map.Entry entry : partitionTuples.entrySet()) { + // write the remaining partition buffers + List> resultList = Lists.newArrayList(); + for (Map.Entry entry : partitionMemoryMap.entrySet()) { + int appendPartId = entry.getKey(); - HashShuffleAppender appender = getAppender(appendPartId); - int appendedSize = appender.addTuples(context.getTaskId(), entry.getValue()); - writtenBytes += appendedSize; - entry.getValue().clear(); + MemoryRowBlock memoryRowBlock = entry.getValue(); + if(memoryRowBlock.getMemory().isReadable()) { + //flush and release buffer + resultList.add(hashShuffleAppenderManager. 
+ writePartitions(meta, outSchema, context.getTaskId(), appendPartId, memoryRowBlock, true)); + } else { + // release the unused buffer + memoryRowBlock.release(); + } + } + + // wait for flush to storage + for (Future future : resultList) { + future.get(); } - TableStats aggregated = (TableStats) child.getInputStats().clone(); + writtenBytes += usedBufferSize; + TableStats aggregated = (TableStats)child.getInputStats().clone(); aggregated.setNumBytes(writtenBytes); aggregated.setNumRows(numRows); context.setResultStats(aggregated); - partitionTuples.clear(); - + usedBufferSize = totalBufferCapacity = 0; + partitionMemoryMap.clear(); return null; } catch (RuntimeException e) { LOG.error(e.getMessage(), e); throw new IOException(e); - } catch (Exception e) { + } catch (Throwable e) { LOG.error(e.getMessage(), e); throw new IOException(e); } @@ -150,26 +241,24 @@ public Tuple next() throws IOException { @Override public void rescan() throws IOException { - // nothing to do + if (partitionMemoryMap.size() > 0) { + for (RowBlock rowBlock : partitionMemoryMap.values()) { + rowBlock.release(); + } + partitionMemoryMap.clear(); + } } @Override public void close() throws IOException{ - super.close(); - if (appenderMap != null) { - appenderMap.clear(); - appenderMap = null; - } - - for (TupleList eachList : partitionTuples.values()) { - eachList.clear(); + if (partitionMemoryMap.size() > 0) { + for (RowBlock rowBlock : partitionMemoryMap.values()) { + rowBlock.release(); + } + partitionMemoryMap.clear(); } - partitionTuples.clear(); - partitionTuples = null; - - partitioner = null; - plan = null; progress = 1.0f; + super.close(); } } \ No newline at end of file diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortExec.java index b652b0aaaf..cabbf4ccac 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortExec.java +++ 
b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortExec.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.util.Comparator; +import java.util.List; public abstract class SortExec extends UnaryPhysicalExec { @@ -39,7 +40,7 @@ public SortExec(TaskAttemptContext context, Schema inSchema, this.comparator = new BaseTupleComparator(inSchema, sortSpecs); } - protected TupleSorter getSorter(TupleList tupleSlots) { + protected TupleSorter getSorter(List tupleSlots) { if (!tupleSlots.isEmpty() && ComparableVector.isVectorizable(sortSpecs)) { return new VectorizedSorter(tupleSlots, sortSpecs, comparator.getSortKeyIds()); } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/TupleSorter.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/TupleSorter.java index abf2808335..39c67d6e2b 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/TupleSorter.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/TupleSorter.java @@ -22,6 +22,7 @@ import org.apache.tajo.storage.TupleComparator; import java.util.Collections; +import java.util.List; public interface TupleSorter { @@ -29,10 +30,10 @@ public interface TupleSorter { class DefaultSorter implements TupleSorter { - private final TupleList target; + private final List target; private final TupleComparator comparator; - public DefaultSorter(TupleList target, TupleComparator comparator) { + public DefaultSorter(List target, TupleComparator comparator) { this.target = target; this.comparator = comparator; } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java new file mode 100644 index 0000000000..8db34ebeca --- /dev/null +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.engine.planner.physical; + +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.SchemaUtil; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.tuple.memory.MemoryRowBlock; + +import java.util.ArrayList; + +/** + * In TupleList, input tuples are automatically cloned whenever the add() method is called. + * This data structure is usually used in physical operators like hash join or hash aggregation. 
+ */ +public class UnsafeTupleList extends ArrayList { + + private MemoryRowBlock rowBlock; + + public UnsafeTupleList(Schema schema) { + super(); + this.rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema)); + } + + public UnsafeTupleList(Schema schema, int initialCapacity) { + super(10000); + this.rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), initialCapacity); + } + + @Override + public boolean add(Tuple tuple) { + return super.add(rowBlock.getWriter().addTuple(tuple)); + } + + public void release() { + rowBlock.release(); + super.clear(); + } + + public long getUsedMem() { + return rowBlock.getMemory().writerPosition(); + } + + public long getCapacity() { + return rowBlock.getMemory().capacity(); + } + + @Override + public void clear() { + super.clear(); + rowBlock.clear(); + } +} diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/VectorizedSorter.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/VectorizedSorter.java index 82f7153a2a..44649cd100 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/VectorizedSorter.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/VectorizedSorter.java @@ -26,6 +26,7 @@ import org.apache.tajo.storage.Tuple; import java.util.Iterator; +import java.util.List; /** * Extract raw level values (primitive or String/byte[]) from each of key columns before sorting @@ -35,7 +36,7 @@ public class VectorizedSorter extends ComparableVector implements IndexedSortabl private final int[] mappings; // index indirection - public VectorizedSorter(TupleList source, SortSpec[] sortKeys, int[] keyIndex) { + public VectorizedSorter(List source, SortSpec[] sortKeys, int[] keyIndex) { super(source.size(), sortKeys, keyIndex); source.toArray(tuples); // wish it's array list mappings = new int[tuples.length]; diff --git a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java 
b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java index ce692704a3..a653e62ef1 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java @@ -287,7 +287,7 @@ public void run() { eof = true; break; } else { - rowBlock.getWriter().addTuple(tuple); + rowBlock.getWriter().putTuple(tuple); currentNumRows++; if (currentNumRows >= maxRow) { eof = true; diff --git a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java index 7f6db9bf59..5d3d178021 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java @@ -646,7 +646,7 @@ public SerializedResultSet nextRowBlock(int fetchRowNum) throws IOException { rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(tableDesc.getLogicalSchema())); } - rowBlock.getWriter().addTuple(currentTuple); + rowBlock.getWriter().putTuple(currentTuple); currentRow++; rowCount++; diff --git a/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java b/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java index e260c003fd..47807445ef 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java @@ -342,7 +342,7 @@ public void execNonFromQuery(QueryContext queryContext, Session session, String MemoryRowBlock rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema)); try { - rowBlock.getWriter().addTuple(outTuple); + rowBlock.getWriter().putTuple(outTuple); MemoryBlock memoryBlock = rowBlock.getMemory(); ByteBuffer uncompressed = memoryBlock.getBuffer().nioBuffer(0, 
memoryBlock.readableBytes()); diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java index 815c44b26b..acc8dbee3b 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java @@ -24,6 +24,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.Path; +import org.apache.tajo.BuiltinStorages; import org.apache.tajo.ExecutionBlockId; import org.apache.tajo.SessionVars; import org.apache.tajo.algebra.JoinType; @@ -195,7 +196,8 @@ public static void scheduleFragmentsForJoinQuery(TaskSchedulerContext schedulerC int maxStatsScanIdx = -1; StringBuilder nonLeafScanNamesBuilder = new StringBuilder(); for (int i = 0; i < scans.length; i++) { - if (scans[i].getTableDesc().getMeta().getDataFormat().equalsIgnoreCase("RAW")) { + + if (scans[i].getTableDesc().getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.DRAW)) { // Intermediate data scan hasNonLeafNode = true; largeScanIndexList.add(i); diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java b/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java index 4b0dac9c67..c7cac4f2e3 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java @@ -338,6 +338,10 @@ public void serviceStop() throws Exception { return; } + if (hashShuffleAppenderManager != null) { + hashShuffleAppenderManager.shutdown(); + } + if(webServer != null) { try { webServer.stop(); diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java b/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java index be920a1a7b..a3b1deec2f 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java 
@@ -25,10 +25,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.CommonConfigurationKeysPublic; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.*; import org.apache.tajo.TajoProtos; import org.apache.tajo.TajoProtos.TaskAttemptState; import org.apache.tajo.TaskAttemptId; @@ -548,6 +545,7 @@ public boolean equals(Object obj) { private FileFragment[] localizeFetchedData(File file, String name, TableMeta meta) throws IOException { + Configuration c = new Configuration(systemConf); c.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, "file:///"); FileSystem fs = FileSystem.get(c); @@ -561,7 +559,7 @@ private FileFragment[] localizeFetchedData(File file, String name, TableMeta met if (f.getLen() == 0) { continue; } - tablet = new FileFragment(name, f.getPath(), 0l, f.getLen()); + tablet = new FileFragment(name, fs.makeQualified(f.getPath()), 0l, f.getLen()); listTablets.add(tablet); } @@ -569,7 +567,7 @@ private FileFragment[] localizeFetchedData(File file, String name, TableMeta met synchronized (localChunks) { for (FileChunk chunk : localChunks) { if (name.equals(chunk.getEbId())) { - tablet = new FileFragment(name, new Path(chunk.getFile().getPath()), chunk.startOffset(), chunk.length()); + tablet = new FileFragment(name, fs.makeQualified(new Path(chunk.getFile().getPath())), chunk.startOffset(), chunk.length()); listTablets.add(tablet); } } diff --git a/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java b/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java index 4c926bbf2c..341d676732 100644 --- a/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java +++ b/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java @@ -92,7 +92,7 @@ public static void setup() throws Exception { tuple.put(1, DatumFactory.createInt4(i 
+ 1)); written += key.length() + Integer.SIZE; appender.addTuple(tuple); - rowBlock.getWriter().addTuple(tuple); + rowBlock.getWriter().putTuple(tuple); } appender.close(); stats.setNumRows(tupleNum); diff --git a/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java b/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java index 55fa45d88b..d5d2c29fc3 100644 --- a/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java +++ b/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java @@ -53,7 +53,7 @@ private BufferPool() { ResourceLeakDetector.setLevel(ResourceLeakDetector.Level.ADVANCED); } else { TajoConf tajoConf = new TajoConf(); - ALLOCATOR = createPooledByteBufAllocator(true, tajoConf.getBoolean(ALLOW_CACHE, false), 0); + ALLOCATOR = createPooledByteBufAllocator(true, tajoConf.getBoolean(ALLOW_CACHE, true), 0); } } diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index ab7370c6ee..72e99d118e 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -38,7 +38,7 @@ 0.12.0-SNAPSHOT 1.1.1 1.1.0 - 4.0.29.Final + 4.0.32.Final 2.6 6.1.14 ${project.parent.relativePath}/.. 
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppender.java index a82c7ec12c..3b29f24477 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppender.java @@ -23,6 +23,8 @@ import org.apache.tajo.ExecutionBlockId; import org.apache.tajo.TaskAttemptId; import org.apache.tajo.catalog.statistics.TableStats; +import org.apache.tajo.storage.rawfile.DirectRawFileWriter; +import org.apache.tajo.tuple.memory.MemoryRowBlock; import org.apache.tajo.util.Pair; import java.io.IOException; @@ -35,9 +37,10 @@ public class HashShuffleAppender implements Appender { private static Log LOG = LogFactory.getLog(HashShuffleAppender.class); - private FileAppender appender; + private DirectRawFileWriter appender; private AtomicBoolean closed = new AtomicBoolean(false); private int partId; + private int volumeId; private TableStats tableStats; @@ -59,11 +62,12 @@ public class HashShuffleAppender implements Appender { private ExecutionBlockId ebId; - public HashShuffleAppender(ExecutionBlockId ebId, int partId, int pageSize, FileAppender appender) { + public HashShuffleAppender(ExecutionBlockId ebId, int partId, int pageSize, DirectRawFileWriter appender, int volumeId) { this.ebId = ebId; this.partId = partId; this.appender = appender; this.pageSize = pageSize; + this.volumeId = volumeId; } @Override @@ -77,42 +81,38 @@ public void init() throws IOException { * Write multiple tuples. Each tuple is written by a FileAppender which is responsible specified partition. * After writing if a current page exceeds pageSize, pageOffset will be added. 
* @param taskId - * @param tuples + * @param rowBlock * @return written bytes * @throws java.io.IOException */ - public int addTuples(TaskAttemptId taskId, List tuples) throws IOException { - synchronized(appender) { - if (closed.get()) { - return 0; - } - long currentPos = appender.getOffset(); + public MemoryRowBlock writeRowBlock(TaskAttemptId taskId, MemoryRowBlock rowBlock) throws IOException { + if (closed.get()) { + return rowBlock; + } - for (Tuple eachTuple: tuples) { - appender.addTuple(eachTuple); - } - long posAfterWritten = appender.getOffset(); + appender.writeRowBlock(rowBlock); + appender.flush(); - int writtenBytes = (int)(posAfterWritten - currentPos); + int rows = rowBlock.rows(); + long posAfterWritten = appender.getOffset(); - int nextRowNum = rowNumInPage + tuples.size(); - List>> taskIndexes = taskTupleIndexes.get(taskId); - if (taskIndexes == null) { - taskIndexes = new ArrayList<>(); - taskTupleIndexes.put(taskId, taskIndexes); - } - taskIndexes.add( - new Pair<>(currentPage.getFirst(), new Pair(rowNumInPage, nextRowNum))); - rowNumInPage = nextRowNum; - - if (posAfterWritten - currentPage.getFirst() > pageSize) { - nextPage(posAfterWritten); - rowNumInPage = 0; - } + int nextRowNum = rowNumInPage + rows; + List>> taskIndexes = taskTupleIndexes.get(taskId); + if (taskIndexes == null) { + taskIndexes = new ArrayList<>(); + taskTupleIndexes.put(taskId, taskIndexes); + } + taskIndexes.add( + new Pair<>(currentPage.getFirst(), new Pair(rowNumInPage, nextRowNum))); + rowNumInPage = nextRowNum; - totalRows += tuples.size(); - return writtenBytes; + if (posAfterWritten - currentPage.getFirst() > pageSize) { + nextPage(posAfterWritten); + rowNumInPage = 0; } + + totalRows += rows; + return rowBlock; } public long getOffset() throws IOException { @@ -136,12 +136,10 @@ public void addTuple(Tuple t) throws IOException { @Override public void flush() throws IOException { - synchronized(appender) { - if (closed.get()) { - return; - } - appender.flush(); 
+ if (closed.get()) { + return; } + appender.flush(); } @Override @@ -151,27 +149,24 @@ public long getEstimatedOutputSize() throws IOException { @Override public void close() throws IOException { - synchronized(appender) { - if (closed.get()) { - return; - } - appender.flush(); - offset = appender.getOffset(); - if (offset > currentPage.getFirst()) { - nextPage(offset); - } - appender.close(); - if (LOG.isDebugEnabled()) { - if (!pages.isEmpty()) { - LOG.info(ebId + ",partId=" + partId + " Appender closed: fileLen=" + offset + ", pages=" + pages.size() - + ", lastPage=" + pages.get(pages.size() - 1)); - } else { - LOG.info(ebId + ",partId=" + partId + " Appender closed: fileLen=" + offset + ", pages=" + pages.size()); - } + if (closed.getAndSet(true)) { + return; + } + appender.flush(); + offset = appender.getOffset(); + if (offset > currentPage.getFirst()) { + nextPage(offset); + } + appender.close(); + if (LOG.isDebugEnabled()) { + if (!pages.isEmpty()) { + LOG.info(ebId + ",partId=" + partId + " Appender closed: fileLen=" + offset + ", pages=" + pages.size() + + ", lastPage=" + pages.get(pages.size() - 1)); + } else { + LOG.info(ebId + ",partId=" + partId + " Appender closed: fileLen=" + offset + ", pages=" + pages.size()); } - closed.set(true); - tableStats = appender.getStats(); } + tableStats = appender.getStats(); } @Override @@ -180,9 +175,7 @@ public void enableStats() { @Override public TableStats getStats() { - synchronized(appender) { - return appender.getStats(); - } + return appender.getStats(); } public List> getPages() { @@ -206,4 +199,8 @@ public List>> getMergedTupleIndexes() { public void taskFinished(TaskAttemptId taskId) { taskTupleIndexes.remove(taskId); } + + public int getVolumeId() { + return volumeId; + } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java index 
4297e4d1ba..aeadca2725 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java @@ -18,6 +18,9 @@ package org.apache.tajo.storage; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileStatus; @@ -30,6 +33,9 @@ import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.conf.TajoConf.ConfVars; +import org.apache.tajo.storage.rawfile.DirectRawFileWriter; +import org.apache.tajo.tuple.memory.MemoryRowBlock; +import org.apache.tajo.unit.StorageUnit; import org.apache.tajo.util.Pair; import java.io.IOException; @@ -37,13 +43,15 @@ import java.util.Collection; import java.util.List; import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.*; public class HashShuffleAppenderManager { private static final Log LOG = LogFactory.getLog(HashShuffleAppenderManager.class); - private Map> appenderMap = - new ConcurrentHashMap<>(); + private ConcurrentMap> appenderMap = Maps.newConcurrentMap(); + private ConcurrentMap executors = Maps.newConcurrentMap(); // for parallel writing + private List temporalPaths = Lists.newArrayList(); + private TajoConf systemConf; private FileSystem defaultFS; private FileSystem localFS; @@ -59,66 +67,86 @@ public HashShuffleAppenderManager(TajoConf systemConf) throws IOException { // initialize DFS and LocalFileSystems defaultFS = TajoConf.getTajoRootDir(systemConf).getFileSystem(systemConf); localFS = FileSystem.getLocal(systemConf); - pageSize = systemConf.getIntVar(ConfVars.SHUFFLE_HASH_APPENDER_PAGE_VOLUME) * 1024 * 1024; + pageSize = systemConf.getIntVar(ConfVars.SHUFFLE_HASH_APPENDER_PAGE_VOLUME) * 
StorageUnit.MB; + + Iterable allLocalPath = lDirAllocator.getAllLocalPathsToRead(".", systemConf); + + //add async hash shuffle writer + for (Path path : allLocalPath) { + temporalPaths.add(localFS.makeQualified(path).toString()); + executors.put(temporalPaths.size() - 1, Executors.newSingleThreadExecutor()); + } } - public HashShuffleAppender getAppender(TajoConf tajoConf, ExecutionBlockId ebId, int partId, - TableMeta meta, Schema outSchema) throws IOException { - synchronized (appenderMap) { - Map partitionAppenderMap = appenderMap.get(ebId); + protected int getVolumeId(Path path) { + int i = 0; + for (String rootPath : temporalPaths) { + if (path.toString().startsWith(rootPath)) { + break; + } + i++; + } + Preconditions.checkPositionIndex(i, temporalPaths.size() - 1); + return i; + } - if (partitionAppenderMap == null) { - partitionAppenderMap = new ConcurrentHashMap<>(); - appenderMap.put(ebId, partitionAppenderMap); + public synchronized HashShuffleAppender getAppender(MemoryRowBlock memoryRowBlock, ExecutionBlockId ebId, int partId, + TableMeta meta, Schema outSchema) throws IOException { + Map partitionAppenderMap = appenderMap.get(ebId); + + if (partitionAppenderMap == null) { + partitionAppenderMap = new ConcurrentHashMap<>(); + appenderMap.put(ebId, partitionAppenderMap); + } + + PartitionAppenderMeta partitionAppenderMeta = partitionAppenderMap.get(partId); + if (partitionAppenderMeta == null) { + Path dataFile = getDataFile(ebId, partId); + FileSystem fs = dataFile.getFileSystem(systemConf); + if (fs.exists(dataFile)) { + FileStatus status = fs.getFileStatus(dataFile); + LOG.info("File " + dataFile + " already exists, size=" + status.getLen()); } - PartitionAppenderMeta partitionAppenderMeta = partitionAppenderMap.get(partId); - if (partitionAppenderMeta == null) { - Path dataFile = getDataFile(ebId, partId); - FileSystem fs = dataFile.getFileSystem(systemConf); - if (fs.exists(dataFile)) { - FileStatus status = fs.getFileStatus(dataFile); - 
LOG.info("File " + dataFile + " already exists, size=" + status.getLen()); - } - - if (!fs.exists(dataFile.getParent())) { - fs.mkdirs(dataFile.getParent()); - } - - FileTablespace space = (FileTablespace) TablespaceManager.get(dataFile.toUri()); - FileAppender appender = (FileAppender) space.getAppender(meta, outSchema, dataFile); - appender.enableStats(); - appender.init(); - - partitionAppenderMeta = new PartitionAppenderMeta(); - partitionAppenderMeta.partId = partId; - partitionAppenderMeta.dataFile = dataFile; - partitionAppenderMeta.appender = new HashShuffleAppender(ebId, partId, pageSize, appender); - partitionAppenderMeta.appender.init(); - partitionAppenderMap.put(partId, partitionAppenderMeta); - - if (LOG.isDebugEnabled()) { - LOG.debug("Create Hash shuffle file(partId=" + partId + "): " + dataFile); - } + if (!fs.exists(dataFile.getParent())) { + fs.mkdirs(dataFile.getParent()); } - return partitionAppenderMeta.appender; + DirectRawFileWriter appender = + new DirectRawFileWriter(systemConf, null, outSchema, meta, dataFile, memoryRowBlock); + appender.enableStats(); + appender.init(); + + partitionAppenderMeta = new PartitionAppenderMeta(); + partitionAppenderMeta.partId = partId; + partitionAppenderMeta.dataFile = dataFile; + partitionAppenderMeta.appender = + new HashShuffleAppender(ebId, partId, pageSize, appender, getVolumeId(dataFile)); + partitionAppenderMeta.appender.init(); + partitionAppenderMap.put(partId, partitionAppenderMeta); + + if (LOG.isDebugEnabled()) { + LOG.debug("Create Hash shuffle file(partId=" + partId + "): " + dataFile); + } } + + return partitionAppenderMeta.appender; } public static int getPartParentId(int partId, TajoConf tajoConf) { - return partId % tajoConf.getIntVar(TajoConf.ConfVars.HASH_SHUFFLE_PARENT_DIRS); + return partId % tajoConf.getIntVar(ConfVars.SHUFFLE_HASH_PARENT_DIRS); } private Path getDataFile(ExecutionBlockId ebId, int partId) throws IOException { try { // the base dir for an output dir String 
executionBlockBaseDir = ebId.getQueryId().toString() + "/output" + "/" + ebId.getId() + "/hash-shuffle"; - Path baseDirPath = localFS.makeQualified(lDirAllocator.getLocalPathForWrite(executionBlockBaseDir, systemConf)); + Path baseDirPath = lDirAllocator.getLocalPathForWrite(executionBlockBaseDir, systemConf); //LOG.info(ebId + "'s basedir is created (" + baseDirPath + ")"); // If EB has many partition, too many shuffle file are in single directory. - return StorageUtil.concatPath(baseDirPath, "" + getPartParentId(partId, systemConf), "" + partId); + return localFS.makeQualified( + StorageUtil.concatPath(baseDirPath, "" + getPartParentId(partId, systemConf), "" + partId)); } catch (Exception e) { LOG.error(e.getMessage(), e); throw new IOException(e); @@ -126,10 +154,7 @@ private Path getDataFile(ExecutionBlockId ebId, int partId) throws IOException { } public List close(ExecutionBlockId ebId) throws IOException { - Map partitionAppenderMap = null; - synchronized (appenderMap) { - partitionAppenderMap = appenderMap.remove(ebId); - } + Map partitionAppenderMap = appenderMap.remove(ebId); if (partitionAppenderMap == null) { LOG.info("Close HashShuffleAppender:" + ebId + ", not a hash shuffle"); @@ -158,16 +183,42 @@ public List close(ExecutionBlockId ebId) throws IOExcep } public void finalizeTask(TaskAttemptId taskId) { - synchronized (appenderMap) { - Map partitionAppenderMap = + Map partitionAppenderMap = appenderMap.get(taskId.getTaskId().getExecutionBlockId()); - if (partitionAppenderMap == null) { - return; - } + if (partitionAppenderMap == null) { + return; + } + + for (PartitionAppenderMeta eachAppender: partitionAppenderMap.values()) { + eachAppender.appender.taskFinished(taskId); + } + } + + /** + * Asynchronously write partitions. 
+ */ + public Future writePartitions(TableMeta meta, Schema schema, final TaskAttemptId taskId, int partId, + final MemoryRowBlock rowBlock, + final boolean release) throws IOException { + + HashShuffleAppender appender = getAppender(rowBlock, taskId.getTaskId().getExecutionBlockId(), partId, meta, schema); + ExecutorService executor = executors.get(appender.getVolumeId()); + return executor.submit(new Callable() { + @Override + public MemoryRowBlock call() throws Exception { + appender.writeRowBlock(taskId, rowBlock); - for (PartitionAppenderMeta eachAppender: partitionAppenderMap.values()) { - eachAppender.appender.taskFinished(taskId); + if (release) rowBlock.release(); + else rowBlock.clear(); + + return rowBlock; } + }); + } + + public void shutdown() { + for (ExecutorService service : executors.values()) { + service.shutdownNow(); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java index fba12ddadf..65ec05ed34 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java @@ -281,96 +281,94 @@ public Tuple next() throws IOException { } switch (columnTypes[i].getType()) { - case BOOLEAN : - outTuple.put(i, DatumFactory.createBool(buffer.get())); - break; - - case BIT : - outTuple.put(i, DatumFactory.createBit(buffer.get())); - break; - - case CHAR : - int realLen = readRawVarint32(); - byte[] buf = new byte[realLen]; - buffer.get(buf); - outTuple.put(i, DatumFactory.createChar(buf)); - break; - - case INT2 : - outTuple.put(i, DatumFactory.createInt2(buffer.getShort())); - break; - - case INT4 : - outTuple.put(i, DatumFactory.createInt4(decodeZigZag32(readRawVarint32()))); - break; - - case INT8 : - outTuple.put(i, DatumFactory.createInt8(decodeZigZag64(readRawVarint64()))); - break; - - case FLOAT4 : - 
outTuple.put(i, DatumFactory.createFloat4(buffer.getFloat())); - break; - - case FLOAT8 : - outTuple.put(i, DatumFactory.createFloat8(buffer.getDouble())); - break; - - case TEXT : { - int len = readRawVarint32(); - byte [] strBytes = new byte[len]; - buffer.get(strBytes); - outTuple.put(i, DatumFactory.createText(strBytes)); - break; - } + case BOOLEAN: + outTuple.put(i, DatumFactory.createBool(buffer.get())); + break; - case BLOB : { - int len = readRawVarint32(); - byte [] rawBytes = new byte[len]; - buffer.get(rawBytes); - outTuple.put(i, DatumFactory.createBlob(rawBytes)); - break; - } + case BIT: + outTuple.put(i, DatumFactory.createBit(buffer.get())); + break; - case PROTOBUF: { - int len = readRawVarint32(); - byte [] rawBytes = new byte[len]; - buffer.get(rawBytes); + case CHAR: + int realLen = readRawVarint32(); + byte[] buf = new byte[realLen]; + buffer.get(buf); + outTuple.put(i, DatumFactory.createChar(buf)); + break; - outTuple.put(i, ProtobufDatumFactory.createDatum(columnTypes[i], rawBytes)); - break; - } + case INT2: + outTuple.put(i, DatumFactory.createInt2(buffer.getShort())); + break; - case INET4 : - byte [] ipv4Bytes = new byte[4]; - buffer.get(ipv4Bytes); - outTuple.put(i, DatumFactory.createInet4(ipv4Bytes)); - break; - - case DATE: { - int val = buffer.getInt(); - if (val < Integer.MIN_VALUE + 1) { - outTuple.put(i, DatumFactory.createNullDatum()); - } else { - outTuple.put(i, DatumFactory.createFromInt4(columnTypes[i], val)); - } - break; + case INT4: + outTuple.put(i, DatumFactory.createInt4(decodeZigZag32(readRawVarint32()))); + break; + + case INT8: + outTuple.put(i, DatumFactory.createInt8(decodeZigZag64(readRawVarint64()))); + break; + + case FLOAT4: + outTuple.put(i, DatumFactory.createFloat4(buffer.getFloat())); + break; + + case FLOAT8: + outTuple.put(i, DatumFactory.createFloat8(buffer.getDouble())); + break; + + case TEXT: { + int len = readRawVarint32(); + byte[] strBytes = new byte[len]; + buffer.get(strBytes); + 
outTuple.put(i, DatumFactory.createText(strBytes)); + break; + } + + case BLOB: { + int len = readRawVarint32(); + byte[] rawBytes = new byte[len]; + buffer.get(rawBytes); + outTuple.put(i, DatumFactory.createBlob(rawBytes)); + break; + } + + case PROTOBUF: { + int len = readRawVarint32(); + byte[] rawBytes = new byte[len]; + buffer.get(rawBytes); + + outTuple.put(i, ProtobufDatumFactory.createDatum(columnTypes[i], rawBytes)); + break; + } + + case INET4: + outTuple.put(i, DatumFactory.createInet4(buffer.getInt())); + break; + + case DATE: { + int val = buffer.getInt(); + if (val < Integer.MIN_VALUE + 1) { + outTuple.put(i, DatumFactory.createNullDatum()); + } else { + outTuple.put(i, DatumFactory.createFromInt4(columnTypes[i], val)); } - case TIME: - case TIMESTAMP: { - long val = buffer.getLong(); - if (val < Long.MIN_VALUE + 1) { - outTuple.put(i, DatumFactory.createNullDatum()); - } else { - outTuple.put(i, DatumFactory.createFromInt8(columnTypes[i], val)); - } - break; + break; + } + case TIME: + case TIMESTAMP: { + long val = buffer.getLong(); + if (val < Long.MIN_VALUE + 1) { + outTuple.put(i, DatumFactory.createNullDatum()); + } else { + outTuple.put(i, DatumFactory.createFromInt8(columnTypes[i], val)); } - case NULL_TYPE: - outTuple.put(i, NullDatum.get()); - break; + break; + } + case NULL_TYPE: + outTuple.put(i, NullDatum.get()); + break; - default: + default: } } @@ -658,46 +656,46 @@ public void addTuple(Tuple t) throws IOException { recordOffset = 0; } - switch(columnTypes[i].getType()) { - case NULL_TYPE: - nullFlags.set(i); - continue; - - case BOOLEAN: - case BIT: - buffer.put(t.getByte(i)); - break; - - case INT2 : - buffer.putShort(t.getInt2(i)); - break; - - case INT4 : - writeRawVarint32(encodeZigZag32(t.getInt4(i))); - break; - - case INT8 : - writeRawVarint64(encodeZigZag64(t.getInt8(i))); - break; - - case FLOAT4 : - buffer.putFloat(t.getFloat4(i)); - break; - - case FLOAT8 : - buffer.putDouble(t.getFloat8(i)); - break; - - case CHAR: - case 
TEXT: { - byte [] strBytes = t.getBytes(i); - if (flushBufferAndReplace(recordOffset, strBytes.length + computeRawVarint32Size(strBytes.length))) { - recordOffset = 0; - } - writeRawVarint32(strBytes.length); - buffer.put(strBytes); - break; + switch (columnTypes[i].getType()) { + case NULL_TYPE: + nullFlags.set(i); + continue; + + case BOOLEAN: + case BIT: + buffer.put(t.getByte(i)); + break; + + case INT2: + buffer.putShort(t.getInt2(i)); + break; + + case INT4: + writeRawVarint32(encodeZigZag32(t.getInt4(i))); + break; + + case INT8: + writeRawVarint64(encodeZigZag64(t.getInt8(i))); + break; + + case FLOAT4: + buffer.putFloat(t.getFloat4(i)); + break; + + case FLOAT8: + buffer.putDouble(t.getFloat8(i)); + break; + + case CHAR: + case TEXT: { + byte[] strBytes = t.getBytes(i); + if (flushBufferAndReplace(recordOffset, strBytes.length + computeRawVarint32Size(strBytes.length))) { + recordOffset = 0; } + writeRawVarint32(strBytes.length); + buffer.put(strBytes); + break; + } case DATE: buffer.putInt(t.getInt4(i)); @@ -708,32 +706,32 @@ public void addTuple(Tuple t) throws IOException { buffer.putLong(t.getInt8(i)); break; - case BLOB : { - byte [] rawBytes = t.getBytes(i); - if (flushBufferAndReplace(recordOffset, rawBytes.length + computeRawVarint32Size(rawBytes.length))) { - recordOffset = 0; - } - writeRawVarint32(rawBytes.length); - buffer.put(rawBytes); - break; + case BLOB: { + byte[] rawBytes = t.getBytes(i); + if (flushBufferAndReplace(recordOffset, rawBytes.length + computeRawVarint32Size(rawBytes.length))) { + recordOffset = 0; } + writeRawVarint32(rawBytes.length); + buffer.put(rawBytes); + break; + } - case PROTOBUF: { - byte [] rawBytes = t.getBytes(i); - if (flushBufferAndReplace(recordOffset, rawBytes.length + computeRawVarint32Size(rawBytes.length))) { - recordOffset = 0; - } - writeRawVarint32(rawBytes.length); - buffer.put(rawBytes); - break; + case PROTOBUF: { + byte[] rawBytes = t.getBytes(i); + if (flushBufferAndReplace(recordOffset, 
rawBytes.length + computeRawVarint32Size(rawBytes.length))) { + recordOffset = 0; } + writeRawVarint32(rawBytes.length); + buffer.put(rawBytes); + break; + } - case INET4 : - buffer.put(t.getBytes(i)); - break; + case INET4: + buffer.putInt(t.getInt4(i)); + break; - default: - throw new IOException("Cannot support data type: " + columnTypes[i].getType()); + default: + throw new IOException("Cannot support data type: " + columnTypes[i].getType()); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java index 03642a7a24..7d8ceffe0e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java @@ -58,11 +58,19 @@ public class DirectRawFileWriter extends FileAppender { private TableStatistics stats; private ShuffleType shuffleType; - private MemoryRowBlock memoryRowBlock; + private MemoryRowBlock rowBlock; + private boolean hasExternalBuf; public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, final Schema schema, final TableMeta meta, final Path path) throws IOException { + this(conf, taskAttemptId, schema, meta, path, null); + } + + public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, + final Schema schema, final TableMeta meta, final Path path, + MemoryRowBlock rowBlock) throws IOException { super(conf, taskAttemptId, schema, meta, path); + this.rowBlock = rowBlock; } @Override @@ -96,14 +104,19 @@ public void init() throws IOException { PlannerUtil.getShuffleType(ShuffleType.NONE_SHUFFLE))); } - memoryRowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), BUFFER_SIZE, true); + if (rowBlock == null) { + rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), BUFFER_SIZE, true); + } else { 
+ hasExternalBuf = true; + } + pos = 0; super.init(); } @Override public long getOffset() throws IOException { - return pos + memoryRowBlock.getMemory().writerPosition(); + return hasExternalBuf ? pos : pos + rowBlock.getMemory().writerPosition(); } public void writeRowBlock(MemoryRowBlock rowBlock) throws IOException { @@ -113,10 +126,8 @@ public void writeRowBlock(MemoryRowBlock rowBlock) throws IOException { pos += rowBlock.getMemory().writeTo(fos); } - rowBlock.getMemory().clear(); - if (enabledStats) { - stats.incrementRows(rowBlock.rows() - stats.getNumRows()); + stats.incrementRows(rowBlock.rows()); } } @@ -129,17 +140,19 @@ public void addTuple(Tuple t) throws IOException { } } - memoryRowBlock.getWriter().addTuple(t); + rowBlock.getWriter().putTuple(t); - if(memoryRowBlock.getMemory().readableBytes() >= BUFFER_SIZE) { - writeRowBlock(memoryRowBlock); + if(rowBlock.getMemory().readableBytes() >= BUFFER_SIZE) { + writeRowBlock(rowBlock); + rowBlock.clear(); } } @Override public void flush() throws IOException { - if(memoryRowBlock.getMemory().isReadable()) { - writeRowBlock(memoryRowBlock); + if(!hasExternalBuf && rowBlock.getMemory().isReadable()) { + writeRowBlock(rowBlock); + rowBlock.clear(); } } @@ -155,7 +168,9 @@ public void close() throws IOException { } IOUtils.cleanup(LOG, channel, randomAccessFile, fos); - memoryRowBlock.release(); + if(!hasExternalBuf && rowBlock != null) { + rowBlock.release(); + } } @Override From 8768d2f85e25ce0f5a12bae0a2c557d15ab59c41 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 29 Oct 2015 15:10:04 +0900 Subject: [PATCH 02/28] cleanup --- .../org/apache/tajo/tuple/memory/ResizableMemoryBlock.java | 4 ++-- .../tajo/engine/planner/physical/ExternalSortExec.java | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java index 
0d09af1a9f..0876eaa917 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java @@ -127,11 +127,11 @@ public int writerPosition() { @Override public void ensureSize(int size) { if (!buffer.isWritable(size)) { - if (!limitSpec.canIncrease(buffer.capacity())) { + if (!limitSpec.canIncrease(size)) { throw new RuntimeException("Cannot increase RowBlock anymore."); } - int newBlockSize = limitSpec.increasedSize(buffer.capacity()); + int newBlockSize = limitSpec.increasedSize(Math.max(buffer.capacity(), size)); resize(newBlockSize); LOG.info("Increase DirectRowBlock to " + FileUtil.humanReadableByteCount(newBlockSize, false)); } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java index 8944fb86a5..bbcc2c9331 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java @@ -115,11 +115,10 @@ private ExternalSortExec(final TaskAttemptContext context, final SortNode plan) throw new PhysicalPlanningException(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT.varname + " cannot be lower than 2"); } // TODO - sort buffer and core num should be changed to use the allocated container resource. 
- long bufferSize = context.getQueryContext().getInt(SessionVars.EXTSORT_BUFFER_SIZE) * StorageUnit.MB; - this.sortBufferBytesNum = (int) (bufferSize * 0.8); + this.sortBufferBytesNum = context.getQueryContext().getInt(SessionVars.EXTSORT_BUFFER_SIZE) * StorageUnit.MB; this.allocatedCoreNum = context.getConf().getIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_THREAD_NUM); this.executorService = Executors.newFixedThreadPool(this.allocatedCoreNum); - this.inMemoryTable = new UnsafeTupleList(outSchema, (int) Math.min(bufferSize, 16 * StorageUnit.MB)); + this.inMemoryTable = new UnsafeTupleList(outSchema, (int) Math.min(sortBufferBytesNum, 16 * StorageUnit.MB)); this.sortTmpDir = getExecutorTmpDir(); localDirAllocator = new LocalDirAllocator(ConfVars.WORKER_TEMPORAL_DIR.varname); From 2dce0f44868f62df2970526d5608ef6b1b4f711e Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 29 Oct 2015 17:02:47 +0900 Subject: [PATCH 03/28] fix buffer overflow --- .../apache/tajo/tuple/memory/MemoryRowBlock.java | 14 ++++++++++++++ .../tajo/tuple/memory/ResizableMemoryBlock.java | 1 - .../org/apache/tajo/tuple/memory/RowBlock.java | 4 ++++ .../engine/planner/physical/ExternalSortExec.java | 6 +++--- .../planner/physical/HashShuffleFileWriteExec.java | 4 ++-- .../engine/planner/physical/UnsafeTupleList.java | 8 ++++---- .../main/java/org/apache/tajo/worker/TaskImpl.java | 5 ++--- .../tajo/storage/rawfile/DirectRawFileScanner.java | 6 +++++- .../tajo/storage/rawfile/DirectRawFileWriter.java | 12 ++++++++---- 9 files changed, 42 insertions(+), 18 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java index 922fc689a1..27e54f2fcd 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java @@ -91,6 +91,20 @@ public int capacity() { return memory.capacity(); } + 
@Override + public int usedMem() { + return memory.writerPosition(); + } + + @Override + public float usage() { + if (usedMem() > 0) { + return (usedMem() / (float) capacity()); + } else { + return 0.0f; + } + } + public int maxRowNum() { return maxRowNum; } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java index 0876eaa917..5163ecb923 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java @@ -123,7 +123,6 @@ public int writerPosition() { return buffer.writerIndex(); } - @Override public void ensureSize(int size) { if (!buffer.isWritable(size)) { diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java index 68902fbfea..f916351d84 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java @@ -30,6 +30,10 @@ public interface RowBlock { int capacity(); + int usedMem(); + + float usage(); + void setRows(int rowNum); int rows(); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java index bbcc2c9331..a8af4a8349 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java @@ -118,7 +118,7 @@ private ExternalSortExec(final TaskAttemptContext context, final SortNode plan) this.sortBufferBytesNum = context.getQueryContext().getInt(SessionVars.EXTSORT_BUFFER_SIZE) * StorageUnit.MB; this.allocatedCoreNum = context.getConf().getIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_THREAD_NUM); 
this.executorService = Executors.newFixedThreadPool(this.allocatedCoreNum); - this.inMemoryTable = new UnsafeTupleList(outSchema, (int) Math.min(sortBufferBytesNum, 16 * StorageUnit.MB)); + this.inMemoryTable = new UnsafeTupleList(outSchema, (int) Math.min(sortBufferBytesNum, Integer.MAX_VALUE)); this.sortTmpDir = getExecutorTmpDir(); localDirAllocator = new LocalDirAllocator(ConfVars.WORKER_TEMPORAL_DIR.varname); @@ -197,12 +197,12 @@ private List sortAndStoreAllChunks() throws IOException { while (!context.isStopped() && (tuple = child.next()) != null) { // partition sort start inMemoryTable.add(tuple); - if (inMemoryTable.getUsedMem() > sortBufferBytesNum) { + if (inMemoryTable.usage() > 0.9f || inMemoryTable.usedMem() > sortBufferBytesNum) { long runEndTime = System.currentTimeMillis(); info(LOG, chunkId + " run loading time: " + (runEndTime - runStartTime) + " msec"); runStartTime = runEndTime; - info(LOG, "Memory consumption exceeds " + FileUtil.humanReadableByteCount(inMemoryTable.getUsedMem(), false)); + info(LOG, "Memory consumption exceeds " + FileUtil.humanReadableByteCount(inMemoryTable.usedMem(), false)); memoryResident = false; chunkPaths.add(sortAndStoreChunk(chunkId, inMemoryTable)); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java index 026db3ff64..f6cca604d5 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java @@ -137,8 +137,8 @@ public Tuple next() throws IOException { usedBufferSize += (rowBlock.getMemory().writerPosition() - prevUsedMem); if (totalBufferCapacity > maxBufferSize) { - LOG.warn(String.format("Buffer-Capacity threshold(%s) exceeded(%s). 
usage: %s", - FileUtil.humanReadableByteCount(bufferThreshold, false), + LOG.warn(String.format("Too low buffer usage. threshold: %s, total capacity: %s, used: %s", + FileUtil.humanReadableByteCount(maxBufferSize, false), FileUtil.humanReadableByteCount(totalBufferCapacity, false), FileUtil.humanReadableByteCount(usedBufferSize, false))); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java index 8db34ebeca..804572499f 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java @@ -53,12 +53,12 @@ public void release() { super.clear(); } - public long getUsedMem() { - return rowBlock.getMemory().writerPosition(); + public int usedMem() { + return rowBlock.usedMem(); } - public long getCapacity() { - return rowBlock.getMemory().capacity(); + public float usage() { + return rowBlock.usage(); } @Override diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java b/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java index a3b1deec2f..fbd1948923 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TaskImpl.java @@ -609,10 +609,9 @@ public void run() { try { FileChunk fetched = fetcher.get(); if (fetcher.getState() == TajoProtos.FetcherState.FETCH_FINISHED && fetched != null - && fetched.getFile() != null) { + && fetched.getFile() != null) { if (fetched.fromRemote() == false) { - localChunks.add(fetched); - LOG.info("Add a new FileChunk to local chunk list"); + localChunks.add(fetched); } break; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java 
index 0172484c5d..84a3aefb63 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java @@ -45,6 +45,9 @@ public class DirectRawFileScanner extends FileScanner implements SeekableScanner { private static final Log LOG = LogFactory.getLog(DirectRawFileScanner.class); + public static final String READ_BUFFER_SIZE = "tajo.storage.raw.io.read-buffer.bytes"; + public static final int DEFAULT_BUFFER_SIZE = 128 * StorageUnit.KB; + private SeekableInputChannel channel; private boolean eos = false; @@ -64,7 +67,8 @@ public DirectRawFileScanner(Configuration conf, Schema schema, TableMeta meta, F public void init() throws IOException { initChannel(); - tupleBuffer = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), 64 * StorageUnit.KB, true); + tupleBuffer = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), + conf.getInt(READ_BUFFER_SIZE, DEFAULT_BUFFER_SIZE), true); reader = tupleBuffer.getReader(); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java index 7d8ceffe0e..9dbb7bab77 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java @@ -46,10 +46,13 @@ import java.nio.channels.FileChannel; public class DirectRawFileWriter extends FileAppender { - public static final String FILE_EXTENSION = "draw"; - private static final int BUFFER_SIZE = 64 * StorageUnit.KB; private static final Log LOG = LogFactory.getLog(DirectRawFileWriter.class); + public static final String FILE_EXTENSION = "draw"; + public static final String WRITE_BUFFER_SIZE = "tajo.storage.raw.io.write-buffer.bytes"; + 
public static final int DEFAULT_BUFFER_SIZE = 128 * StorageUnit.KB; + private static final float BUFFER_THRESHHOLD = 0.9f; + private FileChannel channel; private RandomAccessFile randomAccessFile; private FSDataOutputStream fos; @@ -105,7 +108,8 @@ public void init() throws IOException { } if (rowBlock == null) { - rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), BUFFER_SIZE, true); + int bufferSize = conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE); + rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize); } else { hasExternalBuf = true; } @@ -142,7 +146,7 @@ public void addTuple(Tuple t) throws IOException { rowBlock.getWriter().putTuple(t); - if(rowBlock.getMemory().readableBytes() >= BUFFER_SIZE) { + if(rowBlock.usage() > BUFFER_THRESHHOLD) { writeRowBlock(rowBlock); rowBlock.clear(); } From 9042093ff26c8735d35c6c24b72d863bfb193a1e Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 29 Oct 2015 21:22:40 +0900 Subject: [PATCH 04/28] fix broken tests --- .../java/org/apache/tajo/TajoTestingCluster.java | 4 ++-- .../physical/TestProgressExternalSortExec.java | 16 +++++++++------- .../planner/physical/ExternalSortExec.java | 2 +- .../storage/rawfile/DirectRawFileScanner.java | 2 ++ 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java index 646f29a611..496a1ef794 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java @@ -158,8 +158,8 @@ void initPropertiesAndConfigs() { conf.setStrings(ConfVars.PYTHON_CODE_DIR.varname, getClass().getResource("/python").toString()); // Buffer size - conf.setInt(ConfVars.$EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE.varname, 1); - conf.setInt(ConfVars.$EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE.varname, 1); + 
conf.setInt(ConfVars.$EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE.varname, 10); + conf.setInt(ConfVars.$EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE.varname, 10); /* Since Travis CI limits the size of standard output log up to 4MB */ if (!StringUtils.isEmpty(LOG_LEVEL)) { diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java index ec41b0da33..a23d5aeaa3 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java @@ -20,10 +20,7 @@ import org.apache.hadoop.fs.Path; -import org.apache.tajo.LocalTajoTestingUtility; -import org.apache.tajo.SessionVars; -import org.apache.tajo.TajoConstants; -import org.apache.tajo.TajoTestingCluster; +import org.apache.tajo.*; import org.apache.tajo.algebra.Expr; import org.apache.tajo.catalog.*; import org.apache.tajo.catalog.statistics.TableStats; @@ -42,6 +39,7 @@ import org.apache.tajo.engine.query.QueryContext; import org.apache.tajo.storage.*; import org.apache.tajo.storage.fragment.FileFragment; +import org.apache.tajo.unit.StorageUnit; import org.apache.tajo.util.CommonTestingUtil; import org.apache.tajo.worker.TaskAttemptContext; import org.junit.After; @@ -65,7 +63,7 @@ public class TestProgressExternalSortExec { private LogicalPlanner planner; private Path testDir; - private final int numTuple = 5000; + private final int numTuple = 50000; private Random rnd = new Random(System.currentTimeMillis()); private TableDesc employee; @@ -87,7 +85,7 @@ public void setUp() throws Exception { schema.addColumn("empid", TajoDataTypes.Type.INT4); schema.addColumn("deptname", TajoDataTypes.Type.TEXT); - TableMeta employeeMeta = CatalogUtil.newTableMeta("RAW"); + TableMeta employeeMeta = 
CatalogUtil.newTableMeta(BuiltinStorages.DRAW); Path employeePath = new Path(testDir, "employee.csv"); Appender appender = ((FileTablespace) TablespaceManager.getLocalFs()) .getAppender(employeeMeta, schema, employeePath); @@ -137,7 +135,11 @@ public void testExternalSortExecProgressWithPairWiseMerger() throws Exception { private void testProgress(long sortBufferBytesNum) throws Exception { conf.setIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT, 2); QueryContext queryContext = LocalTajoTestingUtility.createDummyContext(conf); - queryContext.setLong(SessionVars.EXTSORT_BUFFER_SIZE, sortBufferBytesNum); + if(sortBufferBytesNum > StorageUnit.MB) { + queryContext.setInt(SessionVars.EXTSORT_BUFFER_SIZE, (int)(sortBufferBytesNum / StorageUnit.MB)); + } else { + queryContext.setInt(SessionVars.EXTSORT_BUFFER_SIZE, 1); + } FileFragment[] frags = FileTablespace.splitNG(conf, "default.employee", employee.getMeta(), new Path(employee.getUri()), Integer.MAX_VALUE); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java index a8af4a8349..fd9f4c2b5e 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java @@ -772,6 +772,7 @@ public TableStats getInputStats() { @Override public void close() throws IOException { + super.close(); if (result != null) { result.close(); try { @@ -803,7 +804,6 @@ public void close() throws IOException { } plan = null; - super.close(); } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java index 84a3aefb63..191b35d3b4 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java @@ -64,6 +64,7 @@ public DirectRawFileScanner(Configuration conf, Schema schema, TableMeta meta, F super(conf, schema, meta, fragment); } + @Override public void init() throws IOException { initChannel(); @@ -168,6 +169,7 @@ public Tuple next() throws IOException { public void reset() throws IOException { // reload initial buffer filePosition = fragment.getStartKey(); + recordCount = 0; seek(filePosition); eos = false; reader.reset(); From 78a5f774c9bcc270aa8ff856cdf3d314d8157f49 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 14:42:02 +0900 Subject: [PATCH 05/28] fix mismatched data format and invalid tests --- .../apache/tajo/tuple/memory/UnSafeTuple.java | 5 +- .../physical/TestExternalSortExec.java | 2 +- .../planner/physical/TestPhysicalPlanner.java | 3 + .../TestProgressExternalSortExec.java | 17 +-- .../querymaster/TestTaskStatusUpdate.java | 2 +- .../engine/planner/PhysicalPlannerImpl.java | 4 +- .../planner/physical/ExternalSortExec.java | 130 ++++++++++-------- .../java/org/apache/tajo/storage/RawFile.java | 1 + .../storage/rawfile/DirectRawFileScanner.java | 5 +- 9 files changed, 97 insertions(+), 72 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java index b0cce1f29d..4781507686 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java @@ -110,8 +110,9 @@ private int getFieldOffset(int fieldId) { public long getFieldAddr(int fieldId) { int fieldOffset = getFieldOffset(fieldId); - if (fieldOffset == -1) { - throw new RuntimeException("Invalid Field Access: " + fieldId); + if (fieldOffset < 0 || fieldOffset > 
length) { + throw new RuntimeException("Invalid Access. Field : " + fieldId + + ", Offset:" + fieldOffset + ", Record length:" + length); } return address() + fieldOffset; } diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java index 10aa47df62..0500c07de5 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestExternalSortExec.java @@ -122,7 +122,7 @@ public void tearDown() throws Exception { public final void testNext() throws IOException, TajoException { conf.setIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT, 2); QueryContext queryContext = LocalTajoTestingUtility.createDummyContext(conf); - queryContext.setLong(SessionVars.EXTSORT_BUFFER_SIZE, 1); + queryContext.setInt(SessionVars.EXTSORT_BUFFER_SIZE, 1); FileFragment[] frags = FileTablespace.splitNG(conf, "default.employee", employee.getMeta(), new Path(employee.getUri()), Integer.MAX_VALUE); diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestPhysicalPlanner.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestPhysicalPlanner.java index 70c19b204d..735f234513 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestPhysicalPlanner.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestPhysicalPlanner.java @@ -574,6 +574,7 @@ public final void testEnforceForDefaultColumnPartitionStorePlan() throws IOExcep LogicalNode rootNode = optimizer.optimize(plan); PhysicalPlanner phyPlanner = new PhysicalPlannerImpl(conf); PhysicalExec exec = phyPlanner.createPlan(ctx, rootNode); + exec.close(); assertTrue(exec instanceof SortBasedColPartitionStoreExec); } @@ -598,6 +599,7 @@ public final void 
testEnforceForHashBasedColumnPartitionStorePlan() throws IOExc PhysicalPlanner phyPlanner = new PhysicalPlannerImpl(conf); PhysicalExec exec = phyPlanner.createPlan(ctx, rootNode); + exec.close(); assertTrue(exec instanceof HashBasedColPartitionStoreExec); } @@ -622,6 +624,7 @@ public final void testEnforceForSortBasedColumnPartitionStorePlan() throws IOExc PhysicalPlanner phyPlanner = new PhysicalPlannerImpl(conf); PhysicalExec exec = phyPlanner.createPlan(ctx, rootNode); + exec.close(); assertTrue(exec instanceof SortBasedColPartitionStoreExec); } diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java index a23d5aeaa3..e10a5dfb23 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java @@ -85,8 +85,8 @@ public void setUp() throws Exception { schema.addColumn("empid", TajoDataTypes.Type.INT4); schema.addColumn("deptname", TajoDataTypes.Type.TEXT); - TableMeta employeeMeta = CatalogUtil.newTableMeta(BuiltinStorages.DRAW); - Path employeePath = new Path(testDir, "employee.csv"); + TableMeta employeeMeta = CatalogUtil.newTableMeta(BuiltinStorages.RAW); + Path employeePath = new Path(testDir, "employee.raw"); Appender appender = ((FileTablespace) TablespaceManager.getLocalFs()) .getAppender(employeeMeta, schema, employeePath); appender.enableStats(); @@ -203,9 +203,9 @@ private void testProgress(long sortBufferBytesNum) throws Exception { TableStats tableStats = exec.getInputStats(); assertNotNull(tableStats); assertEquals(testDataStats.getNumBytes().longValue(), tableStats.getNumBytes().longValue()); - assertEquals(cnt, testDataStats.getNumRows().longValue()); - assertEquals(cnt, tableStats.getNumRows().longValue()); - 
assertEquals(testDataStats.getNumBytes().longValue(), tableStats.getReadBytes().longValue()); + assertEquals(testDataStats.getNumRows().longValue(), cnt); + assertEquals(testDataStats.getNumRows().longValue(), tableStats.getNumRows().longValue()); + assertTrue(testDataStats.getNumBytes().longValue() <= tableStats.getReadBytes().longValue()); // for rescan test preVal = null; @@ -228,9 +228,10 @@ private void testProgress(long sortBufferBytesNum) throws Exception { tableStats = exec.getInputStats(); assertNotNull(tableStats); assertEquals(testDataStats.getNumBytes().longValue(), tableStats.getNumBytes().longValue()); - assertEquals(cnt, testDataStats.getNumRows().longValue()); - assertEquals(cnt, tableStats.getNumRows().longValue()); - assertEquals(testDataStats.getNumBytes().longValue(), tableStats.getReadBytes().longValue()); + assertEquals(testDataStats.getNumRows().longValue(), cnt); + assertEquals(testDataStats.getNumRows().longValue(), tableStats.getNumRows().longValue()); + //'ReadBytes' is actual read bytes + assertTrue(testDataStats.getNumBytes().longValue() <= tableStats.getReadBytes().longValue()); conf.setIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT, ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT.defaultIntVal); } diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java b/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java index e1e10ffcf4..b954b09863 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java @@ -106,7 +106,7 @@ public final void case3() throws Exception { res = executeQuery(); // in/out * stage(4) - long[] expectedNumRows = new long[]{5, 5, 2, 2, 7, 2, 2, 2}; + long[] expectedNumRows = new long[]{5, 5, 2, 2, 2, 2, 2, 2}; long[] expectedNumBytes = new long[]{20, 80, 8, 64, 144, 64, 64, 64}; long[] expectedReadBytes = new long[]{20, 20, 8, 8, 144, 
0, 64, 0}; diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/PhysicalPlannerImpl.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/PhysicalPlannerImpl.java index 52e3b89b5f..255c1ad113 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/PhysicalPlannerImpl.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/PhysicalPlannerImpl.java @@ -904,14 +904,12 @@ private boolean checkIfSortEquivalance(TaskAttemptContext ctx, ScanNode scanNode public PhysicalExec createScanPlan(TaskAttemptContext ctx, ScanNode scanNode, Stack node) throws IOException { // check if an input is sorted in the same order to the subsequence sort operator. - // TODO - it works only if input files are raw files. We should check the file format. - // Since the default intermediate file format is raw file, it is not problem right now. if (checkIfSortEquivalance(ctx, scanNode, node)) { if (ctx.getTable(scanNode.getCanonicalName()) == null) { return new SeqScanExec(ctx, scanNode, null); } FragmentProto [] fragments = ctx.getTables(scanNode.getCanonicalName()); - return new ExternalSortExec(ctx, (SortNode) node.peek(), fragments); + return new ExternalSortExec(ctx, (SortNode) node.peek(), scanNode, fragments); } else { Enforcer enforcer = ctx.getEnforcer(); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java index fd9f4c2b5e..9a56305bf3 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/ExternalSortExec.java @@ -33,12 +33,11 @@ import org.apache.tajo.catalog.statistics.TableStats; import org.apache.tajo.conf.TajoConf.ConfVars; import org.apache.tajo.engine.planner.PhysicalPlanningException; +import org.apache.tajo.plan.logical.ScanNode; import org.apache.tajo.plan.logical.SortNode; import 
org.apache.tajo.storage.*; import org.apache.tajo.storage.fragment.FileFragment; import org.apache.tajo.storage.fragment.FragmentConvertor; -import org.apache.tajo.storage.rawfile.DirectRawFileScanner; -import org.apache.tajo.storage.rawfile.DirectRawFileWriter; import org.apache.tajo.unit.StorageUnit; import org.apache.tajo.util.FileUtil; import org.apache.tajo.util.TUtil; @@ -51,6 +50,8 @@ import java.util.List; import java.util.concurrent.*; +import static org.apache.tajo.storage.RawFile.RawFileAppender; + /** * This external sort algorithm can be characterized by the followings: * @@ -69,7 +70,8 @@ public class ExternalSortExec extends SortExec { private static final String INTERMEDIATE_FILE_PREFIX = "@interFile_"; private SortNode plan; - private final TableMeta meta; + /** the data format of intermediate file*/ + private TableMeta intermediateMeta; /** the defaultFanout of external sort */ private final int defaultFanout; /** It's the size of in-memory table. If memory consumption exceeds it, store the memory table into a disk. */ @@ -79,7 +81,7 @@ public class ExternalSortExec extends SortExec { /** If there are available multiple cores, it tries parallel merge. */ private ExecutorService executorService; /** used for in-memory sort of each chunk. 
*/ - private UnsafeTupleList inMemoryTable; + private TupleList inMemoryTable; /** temporal dir */ private final Path sortTmpDir; /** It enables round-robin disks allocation */ @@ -87,9 +89,9 @@ public class ExternalSortExec extends SortExec { /** local file system */ private final RawLocalFileSystem localFS; /** final output files which are used for cleaning */ - private List finalOutputFiles = null; + private List finalOutputFiles = null; /** for directly merging sorted inputs */ - private List mergedInputFragments = null; + private List mergedInputFragments = null; /////////////////////////////////////////////////// // transient variables @@ -108,31 +110,30 @@ private ExternalSortExec(final TaskAttemptContext context, final SortNode plan) super(context, plan.getInSchema(), plan.getOutSchema(), null, plan.getSortKeys()); this.plan = plan; - this.meta = CatalogUtil.newTableMeta(BuiltinStorages.DRAW); - this.defaultFanout = context.getConf().getIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT); if (defaultFanout < 2) { throw new PhysicalPlanningException(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT.varname + " cannot be lower than 2"); } // TODO - sort buffer and core num should be changed to use the allocated container resource. 
- this.sortBufferBytesNum = context.getQueryContext().getInt(SessionVars.EXTSORT_BUFFER_SIZE) * StorageUnit.MB; + this.sortBufferBytesNum = context.getQueryContext().getLong(SessionVars.EXTSORT_BUFFER_SIZE) * StorageUnit.MB; this.allocatedCoreNum = context.getConf().getIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_THREAD_NUM); this.executorService = Executors.newFixedThreadPool(this.allocatedCoreNum); - this.inMemoryTable = new UnsafeTupleList(outSchema, (int) Math.min(sortBufferBytesNum, Integer.MAX_VALUE)); + this.inMemoryTable = new TupleList(100000); this.sortTmpDir = getExecutorTmpDir(); - localDirAllocator = new LocalDirAllocator(ConfVars.WORKER_TEMPORAL_DIR.varname); - localFS = new RawLocalFileSystem(); + this.localDirAllocator = new LocalDirAllocator(ConfVars.WORKER_TEMPORAL_DIR.varname); + this.localFS = new RawLocalFileSystem(); + this.intermediateMeta = CatalogUtil.newTableMeta(BuiltinStorages.RAW); //TODO change to SHUFFLE_FILE_FORMAT } - public ExternalSortExec(final TaskAttemptContext context,final SortNode plan, + public ExternalSortExec(final TaskAttemptContext context,final SortNode plan, final ScanNode scanNode, final CatalogProtos.FragmentProto[] fragments) throws PhysicalPlanningException { this(context, plan); mergedInputFragments = TUtil.newList(); for (CatalogProtos.FragmentProto proto : fragments) { FileFragment fragment = FragmentConvertor.convert(FileFragment.class, proto); - mergedInputFragments.add(fragment); + mergedInputFragments.add(new Chunk(fragment, scanNode.getTableDesc().getMeta())); } } @@ -154,9 +155,9 @@ public SortNode getPlan() { /** * Sort a tuple block and store them into a chunk file */ - private Path sortAndStoreChunk(int chunkId, List tupleBlock) + private Chunk sortAndStoreChunk(int chunkId, TupleList tupleBlock) throws IOException { - TableMeta meta = CatalogUtil.newTableMeta(BuiltinStorages.DRAW); + int rowNum = tupleBlock.size(); long sortStart = System.currentTimeMillis(); @@ -165,7 +166,9 @@ private Path 
sortAndStoreChunk(int chunkId, List tupleBlock) long chunkWriteStart = System.currentTimeMillis(); Path outputPath = getChunkPathForWrite(0, chunkId); - final FileAppender appender = new DirectRawFileWriter(context.getConf(), null, inSchema, meta, outputPath); + final RawFileAppender appender = + new RawFileAppender(context.getConf(), null, inSchema, intermediateMeta, outputPath); + appender.init(); for (Tuple t : sorted) { appender.addTuple(t); @@ -179,7 +182,10 @@ private Path sortAndStoreChunk(int chunkId, List tupleBlock) FileUtil.humanReadableByteCount(appender.getOffset(), false) + " bytes, " + rowNum + " rows, " + "sort time: " + (sortEnd - sortStart) + " msec, " + "write time: " + (chunkWriteEnd - chunkWriteStart) + " msec)"); - return outputPath; + + FileFragment frag = new FileFragment("", outputPath, 0, + new File(localFS.makeQualified(outputPath).toUri()).length()); + return new Chunk(frag, intermediateMeta); } /** @@ -188,26 +194,28 @@ private Path sortAndStoreChunk(int chunkId, List tupleBlock) * @return All paths of chunks * @throws java.io.IOException */ - private List sortAndStoreAllChunks() throws IOException { + private List sortAndStoreAllChunks() throws IOException { Tuple tuple; - List chunkPaths = TUtil.newList(); + long memoryConsumption = 0; + List chunkPaths = TUtil.newList(); int chunkId = 0; long runStartTime = System.currentTimeMillis(); while (!context.isStopped() && (tuple = child.next()) != null) { // partition sort start inMemoryTable.add(tuple); + memoryConsumption += MemoryUtil.calculateMemorySize(tuple); - if (inMemoryTable.usage() > 0.9f || inMemoryTable.usedMem() > sortBufferBytesNum) { + if (memoryConsumption > sortBufferBytesNum) { long runEndTime = System.currentTimeMillis(); info(LOG, chunkId + " run loading time: " + (runEndTime - runStartTime) + " msec"); runStartTime = runEndTime; - info(LOG, "Memory consumption exceeds " + FileUtil.humanReadableByteCount(inMemoryTable.usedMem(), false)); + info(LOG, "Memory consumption 
exceeds " + sortBufferBytesNum + " bytes"); memoryResident = false; chunkPaths.add(sortAndStoreChunk(chunkId, inMemoryTable)); - inMemoryTable.clear(); + memoryConsumption = 0; chunkId++; // When the volume of sorting data once exceed the size of sort buffer, @@ -265,7 +273,7 @@ public Tuple next() throws IOException { } else { // Try to sort all data, and store them as multiple chunks if memory exceeds long startTimeOfChunkSplit = System.currentTimeMillis(); - List chunks = sortAndStoreAllChunks(); + List chunks = sortAndStoreAllChunks(); long endTimeOfChunkSplit = System.currentTimeMillis(); info(LOG, "Chunks creation time: " + (endTimeOfChunkSplit - startTimeOfChunkSplit) + " msec"); @@ -275,14 +283,7 @@ public Tuple next() throws IOException { } else { // if input data exceeds main-memory at least once try { - List fragments = TUtil.newList(); - for (Path chunk : chunks) { - FileFragment frag = new FileFragment("", chunk, 0, - new File(localFS.makeQualified(chunk).toUri()).length()); - fragments.add(frag); - } - - this.result = externalMergeAndSort(fragments); + this.result = externalMergeAndSort(chunks); } catch (Exception e) { throw new PhysicalPlanningException(e); } @@ -323,11 +324,11 @@ private int calculateFanout(int remainInputChunks, int inputNum, int outputNum, return computedFanout; } - private Scanner externalMergeAndSort(List chunks) + private Scanner externalMergeAndSort(List chunks) throws IOException, ExecutionException, InterruptedException { int level = 0; - final List inputFiles = TUtil.newList(chunks); - final List outputFiles = TUtil.newList(); + final List inputFiles = TUtil.newList(chunks); + final List outputFiles = TUtil.newList(); int remainRun = inputFiles.size(); int chunksSize = chunks.size(); @@ -340,7 +341,7 @@ private Scanner externalMergeAndSort(List chunks) int remainInputRuns = inputFiles.size(); int outChunkId = 0; int outputFileNum = 0; - List> futures = TUtil.newList(); + List> futures = TUtil.newList(); // the number of 
files being merged in threads. List numberOfMergingFiles = TUtil.newList(); @@ -363,7 +364,7 @@ private Scanner externalMergeAndSort(List chunks) info(LOG, "Unbalanced merge possibility detected: number of remain input (" + remainInputRuns + ") and output files (" + outputFileNum + ") <= " + defaultFanout); - List switched = TUtil.newList(); + List switched = TUtil.newList(); // switch the remain inputs to the next outputs for (int j = startIdx; j < inputFiles.size(); j++) { switched.add(inputFiles.get(j)); @@ -378,7 +379,7 @@ private Scanner externalMergeAndSort(List chunks) // wait for all sort runners int finishedMerger = 0; int index = 0; - for (Future future : futures) { + for (Future future : futures) { outputFiles.add(future.get()); // Getting the number of merged files finishedMerger += numberOfMergingFiles.get(index++); @@ -404,12 +405,12 @@ private Scanner externalMergeAndSort(List chunks) * deleted at this point. However, for the ease of future code maintenance, we delete only type-C fragments here */ int numDeletedFiles = 0; - for (FileFragment frag : inputFiles) { - if (frag.getTableName().contains(INTERMEDIATE_FILE_PREFIX)) { - localFS.delete(frag.getPath(), true); + for (Chunk chunk : inputFiles) { + if (chunk.getFragment().getTableName().contains(INTERMEDIATE_FILE_PREFIX)) { + localFS.delete(chunk.getFragment().getPath(), true); numDeletedFiles++; - if(LOG.isDebugEnabled()) LOG.debug("Delete merged intermediate file: " + frag); + if(LOG.isDebugEnabled()) LOG.debug("Delete merged intermediate file: " + chunk.getFragment()); } } info(LOG, numDeletedFiles + " merged intermediate files deleted"); @@ -435,15 +436,15 @@ private Scanner externalMergeAndSort(List chunks) /** * Merge Thread */ - private class KWayMergerCaller implements Callable { + private class KWayMergerCaller implements Callable { final int level; final int nextRunId; - final List inputFiles; + final List inputFiles; final int startIdx; final int mergeFanout; final boolean 
updateInputStats; - public KWayMergerCaller(final int level, final int nextRunId, final List inputFiles, + public KWayMergerCaller(final int level, final int nextRunId, final List inputFiles, final int startIdx, final int mergeFanout, final boolean updateInputStats) { this.level = level; this.nextRunId = nextRunId; @@ -454,11 +455,12 @@ public KWayMergerCaller(final int level, final int nextRunId, final List inputs) throws IOException { + private Scanner createFinalMerger(List inputs) throws IOException { if (inputs.size() == 1) { this.result = getFileScanner(inputs.get(0)); } else { @@ -497,11 +499,11 @@ private Scanner createFinalMerger(List inputs) throws IOException return result; } - private Scanner getFileScanner(FileFragment frag) throws IOException { - return new DirectRawFileScanner(context.getConf(), plan.getInSchema(), meta, frag); + private Scanner getFileScanner(Chunk chunk) throws IOException { + return TablespaceManager.getLocalFs().getScanner(chunk.getMeta(), inSchema, chunk.getFragment(), outSchema); } - private Scanner createKWayMerger(List inputs, final int startChunkId, final int num) throws IOException { + private Scanner createKWayMerger(List inputs, final int startChunkId, final int num) throws IOException { final Scanner [] sources = new Scanner[num]; for (int i = 0; i < num; i++) { sources[i] = getFileScanner(inputs.get(startChunkId + i)); @@ -773,6 +775,7 @@ public TableStats getInputStats() { @Override public void close() throws IOException { super.close(); + if (result != null) { result.close(); try { @@ -784,7 +787,8 @@ public void close() throws IOException { } if (finalOutputFiles != null) { - for (FileFragment frag : finalOutputFiles) { + for (Chunk chunk : finalOutputFiles) { + FileFragment frag = chunk.getFragment(); File tmpFile = new File(localFS.makeQualified(frag.getPath()).toUri()); if (frag.getStartKey() == 0 && frag.getLength() == tmpFile.length()) { localFS.delete(frag.getPath(), true); @@ -794,7 +798,7 @@ public void 
close() throws IOException { } if(inMemoryTable != null){ - inMemoryTable.release(); + inMemoryTable.clear(); inMemoryTable = null; } @@ -832,4 +836,22 @@ public TableStats getInputStats() { return inputStats; } } + + private static class Chunk { + private FileFragment fragment; + private TableMeta meta; + + public Chunk(FileFragment fragment, TableMeta meta) { + this.fragment = fragment; + this.meta = meta; + } + + public FileFragment getFragment() { + return fragment; + } + + public TableMeta getMeta() { + return meta; + } + } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java index 65ec05ed34..3a52e4b0ed 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java @@ -397,6 +397,7 @@ public void reset() throws IOException { buffer.clear(); forceFillBuffer = true; filePosition = fragment.getStartKey(); + recordCount = 0; channel.position(filePosition); eos = false; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java index 191b35d3b4..984e7b660d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java @@ -35,7 +35,6 @@ import org.apache.tajo.tuple.memory.MemoryRowBlock; import org.apache.tajo.tuple.memory.RowBlock; import org.apache.tajo.tuple.memory.UnSafeTuple; -import org.apache.tajo.tuple.memory.ZeroCopyTuple; import org.apache.tajo.unit.StorageUnit; import java.io.File; @@ -56,7 +55,7 @@ public class DirectRawFileScanner extends FileScanner implements SeekableScanner private long 
filePosition; private long endOffset; - private ZeroCopyTuple unSafeTuple = new UnSafeTuple(); + private UnSafeTuple unSafeTuple = new UnSafeTuple(); private RowBlock tupleBuffer; private RowBlockReader reader; @@ -141,7 +140,7 @@ public boolean next(RowBlock rowblock) throws IOException { private boolean fetchNeeded = true; @Override - public Tuple next() throws IOException { + public UnSafeTuple next() throws IOException { if(eos) { return null; } From 7eb78e4e1f67f5e376014957a392bf04914e007c Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 15:43:13 +0900 Subject: [PATCH 06/28] claenup --- .../org/apache/tajo/storage/rawfile/DirectRawFileScanner.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java index 984e7b660d..dcf48c4eb3 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java @@ -125,7 +125,7 @@ public long getNextOffset() throws IOException { public void seek(long offset) throws IOException { channel.seek(offset); filePosition = channel.position(); - tupleBuffer.getMemory().clear(); + tupleBuffer.clear(); fetchNeeded = true; } @@ -171,7 +171,6 @@ public void reset() throws IOException { recordCount = 0; seek(filePosition); eos = false; - reader.reset(); } @Override From 34840f8f1f9bb8821b3a58ef193425c0a7ca3146 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 17:40:13 +0900 Subject: [PATCH 07/28] lazy read to buffer --- .../storage/rawfile/DirectRawFileScanner.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java index dcf48c4eb3..0e8808bf32 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java @@ -67,13 +67,15 @@ public DirectRawFileScanner(Configuration conf, Schema schema, TableMeta meta, F public void init() throws IOException { initChannel(); - tupleBuffer = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), - conf.getInt(READ_BUFFER_SIZE, DEFAULT_BUFFER_SIZE), true); - - reader = tupleBuffer.getReader(); - - fetchNeeded = !next(tupleBuffer); + if (tupleBuffer == null) { + tupleBuffer = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), + conf.getInt(READ_BUFFER_SIZE, DEFAULT_BUFFER_SIZE), true); + } else { + tupleBuffer.clear(); + } + fetchNeeded = true; + eos = false; super.init(); } @@ -130,7 +132,7 @@ public void seek(long offset) throws IOException { } public boolean next(RowBlock rowblock) throws IOException { - long reamin = reader.remainForRead(); + long reamin = reader == null ? 
0 : reader.remainForRead(); boolean ret = rowblock.copyFromChannel(channel); reader = rowblock.getReader(); filePosition += rowblock.getMemory().writerPosition() - reamin; From 3976210d063c4372659a81781ada401797f7ce2f Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 20:45:38 +0900 Subject: [PATCH 08/28] remove duplicates code --- .../physical/HashShuffleFileWriteExec.java | 138 ++++++++---------- 1 file changed, 60 insertions(+), 78 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java index f6cca604d5..b2ca571623 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java @@ -44,6 +44,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; /** @@ -119,7 +120,6 @@ public Tuple next() throws IOException { int partId; long numRows = 0; while (!context.isStopped() && (tuple = child.next()) != null) { - numRows++; partId = partitioner.getPartition(tuple); MemoryRowBlock rowBlock = partitionMemoryMap.get(partId); @@ -130,105 +130,46 @@ public Tuple next() throws IOException { } RowWriter writer = rowBlock.getWriter(); - long prevUsedMem = rowBlock.getMemory().writerPosition(); + long prevUsedMem = rowBlock.usedMem(); totalBufferCapacity -= rowBlock.capacity(); + writer.putTuple(tuple); + numRows++; + totalBufferCapacity += rowBlock.capacity(); // calculate resizeable buffer capacity - usedBufferSize += (rowBlock.getMemory().writerPosition() - prevUsedMem); + usedBufferSize += (rowBlock.usedMem() - prevUsedMem); if (totalBufferCapacity > maxBufferSize) { - LOG.warn(String.format("Too low buffer usage. 
threshold: %s, total capacity: %s, used: %s", - FileUtil.humanReadableByteCount(maxBufferSize, false), - FileUtil.humanReadableByteCount(totalBufferCapacity, false), - FileUtil.humanReadableByteCount(usedBufferSize, false))); - - List> resultList = Lists.newArrayList(); - for (Map.Entry entry : partitionMemoryMap.entrySet()) { - int appendPartId = entry.getKey(); - - MemoryRowBlock memoryRowBlock = entry.getValue(); - if(memoryRowBlock.getMemory().isReadable()) { - //flush and release buffer - resultList.add(hashShuffleAppenderManager. - writePartitions(meta, outSchema, context.getTaskId(), appendPartId, memoryRowBlock, true)); - } else { - // release the unused buffer - memoryRowBlock.release(); - } - } - - // wait for flush to storage - for (Future future : resultList) { - future.get(); + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Too low buffer usage. threshold: %s, total capacity: %s, used: %s", + FileUtil.humanReadableByteCount(maxBufferSize, false), + FileUtil.humanReadableByteCount(totalBufferCapacity, false), + FileUtil.humanReadableByteCount(usedBufferSize, false))); } + //flush and release buffer + flushBuffer(partitionMemoryMap, true); writtenBytes += usedBufferSize; totalBufferCapacity = usedBufferSize = 0; - partitionMemoryMap.clear(); } else if (usedBufferSize > bufferThreshold) { - ArrayList releaseList = Lists.newArrayList(); - List> resultList = Lists.newArrayList(); - for (Map.Entry entry : partitionMemoryMap.entrySet()) { - - int appendPartId = entry.getKey(); - MemoryRowBlock memoryRowBlock = entry.getValue(); - if(memoryRowBlock.getMemory().isReadable()) { - - //flush and reuse buffer - resultList.add(hashShuffleAppenderManager. 
- writePartitions(meta, outSchema, context.getTaskId(), appendPartId, memoryRowBlock, false)); - } else { - releaseList.add(appendPartId); - } - } - - // wait for flush to storage - for (Future future : resultList) { - future.get(); - } - + //flush and reuse buffer + flushBuffer(partitionMemoryMap, false); writtenBytes += usedBufferSize; usedBufferSize = 0; - - // release the unused partition - for (Integer id : releaseList) { - MemoryRowBlock memoryRowBlock = partitionMemoryMap.remove(id); - LOG.warn("release unused buffer" + memoryRowBlock.capacity()); - memoryRowBlock.release(); - } - } - } - - // write the remaining partition buffers - List> resultList = Lists.newArrayList(); - for (Map.Entry entry : partitionMemoryMap.entrySet()) { - - int appendPartId = entry.getKey(); - MemoryRowBlock memoryRowBlock = entry.getValue(); - if(memoryRowBlock.getMemory().isReadable()) { - //flush and release buffer - resultList.add(hashShuffleAppenderManager. - writePartitions(meta, outSchema, context.getTaskId(), appendPartId, memoryRowBlock, true)); - } else { - // release the unused buffer - memoryRowBlock.release(); } } - // wait for flush to storage - for (Future future : resultList) { - future.get(); - } + // flush remaining buffers + flushBuffer(partitionMemoryMap, true); writtenBytes += usedBufferSize; - TableStats aggregated = (TableStats)child.getInputStats().clone(); + usedBufferSize = totalBufferCapacity = 0; + TableStats aggregated = (TableStats) child.getInputStats().clone(); aggregated.setNumBytes(writtenBytes); aggregated.setNumRows(numRows); context.setResultStats(aggregated); - usedBufferSize = totalBufferCapacity = 0; - partitionMemoryMap.clear(); return null; } catch (RuntimeException e) { LOG.error(e.getMessage(), e); @@ -239,6 +180,47 @@ public Tuple next() throws IOException { } } + /** + * flush all buffer to local storage + */ + private void flushBuffer(Map partitionMemoryMap, boolean releaseBuffer) + throws IOException, ExecutionException, 
InterruptedException { + List> resultList = Lists.newArrayList(); + ArrayList unusedBuffer = Lists.newArrayList(); + + for (Map.Entry entry : partitionMemoryMap.entrySet()) { + int appendPartId = entry.getKey(); + + MemoryRowBlock memoryRowBlock = entry.getValue(); + if (memoryRowBlock.getMemory().isReadable()) { + //flush and release buffer + resultList.add(hashShuffleAppenderManager. + writePartitions(meta, outSchema, context.getTaskId(), appendPartId, memoryRowBlock, releaseBuffer)); + } else { + if (releaseBuffer) { + memoryRowBlock.release(); + } else { + unusedBuffer.add(appendPartId); + } + } + } + + // wait for flush to storage + for (Future future : resultList) { + future.get(); + } + + if (releaseBuffer) { + partitionMemoryMap.clear(); + } else { + // release the unused partition + for (Integer id : unusedBuffer) { + MemoryRowBlock memoryRowBlock = partitionMemoryMap.remove(id); + memoryRowBlock.release(); + } + } + } + @Override public void rescan() throws IOException { if (partitionMemoryMap.size() > 0) { From 1ec40a2ce13eb4daa19cdbb6cf6d670a9c0cf666 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 21:14:12 +0900 Subject: [PATCH 09/28] print usage after ran --- .../java/org/apache/tajo/QueryTestCaseBase.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java index ffa2dab007..313f5f7dbb 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -54,10 +54,7 @@ import org.apache.tajo.plan.verifier.VerificationState; import org.apache.tajo.storage.StorageUtil; import org.apache.tajo.util.FileUtil; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; +import org.junit.*; import org.junit.rules.TestName; 
import org.junit.rules.TestRule; import org.junit.rules.TestWatcher; @@ -245,12 +242,14 @@ public static void tearDownClass() throws Exception { client.close(); } - @Before + @After public void printTestName() { /* protect a travis stalled build */ - System.out.println("Run: " + name.getMethodName() + - " Used memory: " + ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) - / (1024 * 1024)) + " MBytes, Active Threads:" + Thread.activeCount()); + System.out.println(String.format("\t\tRan: %s.%s, Used Memory: %s, Active Threads: %d", + getClass().getSimpleName(), + name.getMethodName(), + FileUtil.humanReadableByteCount(Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(), false), + Thread.activeCount())); } public QueryTestCaseBase() { From bf3a939847341feec96a42c0386a4afd897e22c2 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 21:28:13 +0900 Subject: [PATCH 10/28] remove duplicates code --- .../tajo/tuple/memory/OffHeapRowBlockWriter.java | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java index bf27d2ae84..57a1e89a3f 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java @@ -78,13 +78,9 @@ public void putTuple(Tuple tuple) { @Override public ZeroCopyTuple addTuple(Tuple tuple) { int prevPos = rowBlock.getMemory().writerPosition(); - if (tuple instanceof UnSafeTuple) { - UnSafeTuple unSafeTuple = TUtil.checkTypeAndGet(tuple, UnSafeTuple.class); - putTuple(unSafeTuple); - rowBlock.setRows(rowBlock.rows() + 1); - } else { - OffHeapRowBlockUtils.convert(tuple, this); - } + + putTuple(tuple); + UnSafeTuple unSafeTuple = new UnSafeTuple(); unSafeTuple.set(rowBlock.getMemory(), prevPos, 
rowBlock.getMemory().writerPosition() - prevPos, dataTypes()); return unSafeTuple; From cada315496fac29f513340cd1bee569508d5dcf3 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 21:34:12 +0900 Subject: [PATCH 11/28] trigger CI --- .../src/test/java/org/apache/tajo/QueryTestCaseBase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java index 313f5f7dbb..b4c7a7d3d7 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -245,7 +245,7 @@ public static void tearDownClass() throws Exception { @After public void printTestName() { /* protect a travis stalled build */ - System.out.println(String.format("\t\tRan: %s.%s, Used Memory: %s, Active Threads: %d", + System.out.println(String.format(" Ran: %s.%s, Used Memory: %s, Active Threads: %d", getClass().getSimpleName(), name.getMethodName(), FileUtil.humanReadableByteCount(Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(), false), From 6959db20ca8485e806df0d516336b4dba83bfe7e Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 21:37:35 +0900 Subject: [PATCH 12/28] trigger CI --- .../src/test/java/org/apache/tajo/QueryTestCaseBase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java index b4c7a7d3d7..44d4e7eb1e 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -245,7 +245,7 @@ public static void tearDownClass() throws Exception { @After public void printTestName() { /* protect a travis stalled build */ - 
System.out.println(String.format(" Ran: %s.%s, Used Memory: %s, Active Threads: %d", + System.out.println(String.format("Ran: %s.%s, Used Memory: %s, Active Threads: %d", getClass().getSimpleName(), name.getMethodName(), FileUtil.humanReadableByteCount(Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(), false), From 66779e581ec23c079cd9ea0b9b8871f10762211d Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 30 Oct 2015 21:54:25 +0900 Subject: [PATCH 13/28] trigger CI --- .../src/test/java/org/apache/tajo/QueryTestCaseBase.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java index 44d4e7eb1e..02f5f1bb5c 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -245,11 +245,11 @@ public static void tearDownClass() throws Exception { @After public void printTestName() { /* protect a travis stalled build */ - System.out.println(String.format("Ran: %s.%s, Used Memory: %s, Active Threads: %d", - getClass().getSimpleName(), - name.getMethodName(), + System.out.println(String.format("Used Memory: %s, Active Threads: %d, Ran: %s.%s", FileUtil.humanReadableByteCount(Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(), false), - Thread.activeCount())); + Thread.activeCount(), + getClass().getSimpleName(), + name.getMethodName())); } public QueryTestCaseBase() { From 252bd703524d763fe3f56388ba46836f48409bdc Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 2 Nov 2015 11:09:16 +0900 Subject: [PATCH 14/28] fix invalid verification in TestHAServiceHDFSImpl --- .../java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java 
b/tajo-core-tests/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java index f0f01bfb54..279fce7329 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java @@ -31,6 +31,7 @@ import org.apache.tajo.master.TajoMaster; import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.service.ServiceTrackerFactory; +import org.junit.Assert; import org.junit.Test; import static junit.framework.Assert.assertTrue; @@ -150,9 +151,9 @@ private void verifyDataBaseAndTable(ServiceTracker tracker) throws Exception { TajoClient client = null; try { client = new TajoClientImpl(tracker); - client.existDatabase("default"); - client.existTable("default.ha_test1"); - client.existTable("default.ha_test2"); + Assert.assertTrue(client.existDatabase("default")); + Assert.assertTrue(client.existTable("default.ha_test1")); + Assert.assertTrue(client.existTable("default.ha_test2")); } finally { IOUtils.cleanup(null, client); } From 6ad44729cbc69a5a3b54f7470ad40f26f5e2bc3b Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 2 Nov 2015 15:20:27 +0900 Subject: [PATCH 15/28] add missing close --- .../physical/DistinctGroupbySortAggregationExec.java | 7 +++---- .../planner/physical/SortBasedColPartitionStoreExec.java | 1 + .../tajo/master/exec/NonForwardQueryResultFileScanner.java | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbySortAggregationExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbySortAggregationExec.java index 58cfca4a7c..8a8b14536b 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbySortAggregationExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbySortAggregationExec.java @@ -68,10 +68,6 @@ public DistinctGroupbySortAggregationExec(final 
TaskAttemptContext context, Dist for(int i = 0; i < resultColumnIds.length; i++) { resultColumnIdIndexes[resultColumnIds[i]] = i; } - - for (SortAggregateExec eachExec: aggregateExecs) { - eachExec.init(); - } } boolean first = true; @@ -172,6 +168,9 @@ public void close() throws IOException { @Override public void init() throws IOException { + for (SortAggregateExec eachExec: aggregateExecs) { + eachExec.init(); + } } @Override diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortBasedColPartitionStoreExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortBasedColPartitionStoreExec.java index 607dff763e..d7fa917200 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortBasedColPartitionStoreExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/SortBasedColPartitionStoreExec.java @@ -102,6 +102,7 @@ public void close() throws IOException { StatisticsUtil.aggregateTableStat(aggregatedStats, appender.getStats()); context.setResultStats(aggregatedStats); } + super.close(); } @Override diff --git a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java index a653e62ef1..53235727cc 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java @@ -252,6 +252,7 @@ public SerializedResultSet nextRowBlock(int fetchRowNum) throws IOException { } return resultSetBuilder.build(); } catch (Throwable t) { + close(); throw new TajoInternalError(t.getCause()); } } From fda204e990f7198ea378777760308dfeb8d6ebb9 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 9 Nov 2015 18:25:53 +0900 Subject: [PATCH 16/28] add CompactRowBlockWriter --- .../java/org/apache/tajo/conf/TajoConf.java | 2 +- 
.../tuple/memory/CompactRowBlockWriter.java | 406 ++++++++++++++++++ .../tajo/tuple/memory/MemoryRowBlock.java | 62 ++- .../tuple/memory/OffHeapRowBlockUtils.java | 14 +- .../java/org/apache/tajo/util/BitArray.java | 5 +- .../querymaster/TestTaskStatusUpdate.java | 12 +- .../engine/planner/global/DataChannel.java | 2 +- .../physical/HashShuffleFileWriteExec.java | 2 +- .../tajo/querymaster/Repartitioner.java | 5 +- .../java/org/apache/tajo/storage/RawFile.java | 293 +++---------- .../storage/rawfile/DirectRawFileScanner.java | 2 +- .../storage/rawfile/DirectRawFileWriter.java | 10 +- 12 files changed, 532 insertions(+), 283 deletions(-) create mode 100644 tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java diff --git a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java index a8a0dbec67..6143650a02 100644 --- a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java +++ b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java @@ -206,7 +206,7 @@ public static enum ConfVars implements ConfigKey { // Shuffle Configuration -------------------------------------------------- PULLSERVER_PORT("tajo.pullserver.port", 0, Validators.range("0", "65535")), SHUFFLE_SSL_ENABLED_KEY("tajo.pullserver.ssl.enabled", false, Validators.bool()), - SHUFFLE_FILE_FORMAT("tajo.shuffle.file-format", BuiltinStorages.DRAW, Validators.javaString()), + SHUFFLE_FILE_FORMAT("tajo.shuffle.file-format", BuiltinStorages.RAW, Validators.javaString()), SHUFFLE_FETCHER_PARALLEL_EXECUTION_MAX_NUM("tajo.shuffle.fetcher.parallel-execution.max-num", 2, Validators.min("1")), SHUFFLE_FETCHER_CHUNK_MAX_SIZE("tajo.shuffle.fetcher.chunk.max-size", 8192), diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java new file mode 100644 index 0000000000..e6a649b984 --- /dev/null +++ 
b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java @@ -0,0 +1,406 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.tuple.memory; + +import io.netty.util.internal.PlatformDependent; +import org.apache.tajo.common.TajoDataTypes.DataType; +import org.apache.tajo.datum.BooleanDatum; +import org.apache.tajo.datum.IntervalDatum; +import org.apache.tajo.datum.ProtobufDatum; +import org.apache.tajo.datum.TextDatum; +import org.apache.tajo.exception.TajoInternalError; +import org.apache.tajo.exception.TajoRuntimeException; +import org.apache.tajo.exception.UnsupportedException; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.util.BitArray; +import org.apache.tajo.util.SizeOf; +import org.apache.tajo.util.UnsafeUtil; + +public class CompactRowBlockWriter implements RowWriter { + private static final int RECORD_FIELD_SIZE = 4; + // Maximum variant int32 size is 5 + private static final short MAXIMUM_VARIANT_INT32 = 5; + // Maximum variant int64 size is 10 + private static final short MAXIMUM_VARIANT_INT64 = 10; + + private final RowBlock rowBlock; + private final BitArray nullFlags; + + /** record capacity + offset list */ + private final int 
headerSize; + + private final DataType[] dataTypes; + + private int curFieldIdx; + private int curOffset; + + + public CompactRowBlockWriter(RowBlock rowBlock) { + this.dataTypes = rowBlock.getDataTypes(); + this.rowBlock = rowBlock; + + // compute the number of bytes, representing the null flags + nullFlags = new BitArray(dataTypes.length); + headerSize = RECORD_FIELD_SIZE + SizeOf.SIZE_OF_SHORT + nullFlags.bytesLength(); + + if (!rowBlock.getMemory().hasAddress()) { + throw new TajoInternalError(rowBlock.getMemory().getClass().getSimpleName() + + " does not support to direct memory access"); + } + } + + + /** + * Encode a ZigZag-encoded 32-bit value. ZigZag encodes signed integers + * into values that can be efficiently encoded with varint. (Otherwise, + * negative values must be sign-extended to 64 bits to be varint encoded, + * thus always taking 10 bytes on the wire.) + * + * @param n A signed 32-bit integer. + * @return An unsigned 32-bit integer, stored in a signed int because + * Java has no explicit unsigned support. + */ + public static int encodeZigZag32(final int n) { + // Note: the right-shift must be arithmetic + return (n << 1) ^ (n >> 31); + } + + /** + * Encode a ZigZag-encoded 64-bit value. ZigZag encodes signed integers + * into values that can be efficiently encoded with varint. (Otherwise, + * negative values must be sign-extended to 64 bits to be varint encoded, + * thus always taking 10 bytes on the wire.) + * + * @param n A signed 64-bit integer. + * @return An unsigned 64-bit integer, stored in a signed int because + * Java has no explicit unsigned support. + */ + public static long encodeZigZag64(final long n) { + // Note: the right-shift must be arithmetic + return (n << 1) ^ (n >> 63); + } + + /** + * Encode and write a varint. {@code value} is treated as + * unsigned, so it won't be sign-extended if negative. 
+ */ + public static short writeRawVarint32(long address, int value) { + short length = 0; + while (true) { + if ((value & ~0x7F) == 0) { + PlatformDependent.putByte(address + length, (byte) value); + length++; + return length; + } else { + PlatformDependent.putByte(address + length, (byte) ((value & 0x7F) | 0x80)); + value >>>= 7; + length++; + } + } + } + + /** + * Encode and write a varint64. + */ + public static short writeRawVarint64(long address, long value) { + short length = 0; + while (true) { + if ((value & ~0x7FL) == 0) { + PlatformDependent.putByte(address + length, (byte) value); + length++; + return length; + } else { + PlatformDependent.putByte(address + length, (byte) ((value & 0x7F) | 0x80)); + value >>>= 7; + length++; + } + } + } + + /** + * Compute the number of bytes that would be needed to encode a varint. + * {@code value} is treated as unsigned, so it won't be sign-extended if + * negative. + */ + public static int computeRawVarint32Size(final int value) { + if ((value & (0xffffffff << 7)) == 0) return 1; + if ((value & (0xffffffff << 14)) == 0) return 2; + if ((value & (0xffffffff << 21)) == 0) return 3; + if ((value & (0xffffffff << 28)) == 0) return 4; + return 5; + } + + /** + * Current memory address of the buffer + * + * @return The memory address + */ + public long address() { + return rowBlock.getMemory().address(); + } + + /** + * Current position + * + * @return The position + */ + public int position() { + return rowBlock.getMemory().writerPosition(); + } + + + /** + * Forward the address; + * + * @param length Length to be forwarded + */ + public void forward(int length) { + rowBlock.getMemory().writerPosition(rowBlock.getMemory().writerPosition() + length); + } + + public void ensureSize(int size) { + rowBlock.getMemory().ensureSize(size); + } + + @Override + public DataType[] dataTypes() { + return rowBlock.getDataTypes(); + } + + /** + * Current memory address of the row + * + * @return The memory address + */ + public long 
recordStartAddr() { + return currentAddr() - curOffset; + } + + /** + * Memory address of the current write position (buffer base address plus writer position) + * + * @return The memory address of the next byte to be written + */ + private long currentAddr() { + return address() + position(); + } + + public int offset() { + return position(); + } + + + @Override + public void clear() { + curOffset = 0; + curFieldIdx = 0; + nullFlags.clear(); + } + + @Override + public boolean startRow() { + ensureSize(headerSize); + nullFlags.clear(); + + curOffset = headerSize; + curFieldIdx = 0; + forward(headerSize); + return true; + } + + + public void endRow() { + long rowHeaderPos = recordStartAddr(); + // curOffset is equivalent to a byte length of this row. + PlatformDependent.putInt(rowHeaderPos, curOffset); + rowHeaderPos += SizeOf.SIZE_OF_INT; + + //set null flags + byte [] flags = nullFlags.toArray(); + PlatformDependent.putShort(rowHeaderPos, (short) flags.length); + rowHeaderPos += SizeOf.SIZE_OF_SHORT; + PlatformDependent.copyMemory(flags, 0, rowHeaderPos, flags.length); + + rowBlock.setRows(rowBlock.rows() + 1); + } + + @Override + public void skipField() { + // set null flag + nullFlags.set(curFieldIdx); + curFieldIdx++; + } + + /** + * Advance the writer position and the running byte length of the current row by the field just written + * @param fieldLength number of bytes occupied by the field just written + */ + private void forwardField(int fieldLength) { + forward(fieldLength); + curOffset += fieldLength; + + } + + @Override + public void putByte(byte val) { + ensureSize(SizeOf.SIZE_OF_BYTE); + long addr = currentAddr(); + + PlatformDependent.putByte(addr, val); + curFieldIdx++; + forwardField(SizeOf.SIZE_OF_BYTE); + } + + @Override + public void putBool(boolean val) { + putByte(val ?
BooleanDatum.TRUE_INT : BooleanDatum.FALSE_INT); + } + + @Override + public void putInt2(short val) { + ensureSize(SizeOf.SIZE_OF_SHORT); + long addr = currentAddr(); + + PlatformDependent.putShort(addr, val); + curFieldIdx++; + forwardField(SizeOf.SIZE_OF_SHORT); + } + + @Override + public void putInt4(int val) { + ensureSize(MAXIMUM_VARIANT_INT32); + + curFieldIdx++; + forwardField(writeRawVarint32(currentAddr(), encodeZigZag32(val))); + } + + @Override + public void putInt8(long val) { + ensureSize(MAXIMUM_VARIANT_INT64); + + curFieldIdx++; + forwardField(writeRawVarint64(currentAddr(), encodeZigZag64(val))); + } + + @Override + public void putFloat4(float val) { + ensureSize(SizeOf.SIZE_OF_FLOAT); + long addr = currentAddr(); + + UnsafeUtil.unsafe.putFloat(addr, val); + curFieldIdx++; + forwardField(SizeOf.SIZE_OF_FLOAT); + } + + @Override + public void putFloat8(double val) { + ensureSize(SizeOf.SIZE_OF_DOUBLE); + long addr = currentAddr(); + + UnsafeUtil.unsafe.putDouble(addr, val); + curFieldIdx++; + forwardField(SizeOf.SIZE_OF_DOUBLE); + } + + @Override + public void putText(String val) { + putText(val.getBytes(TextDatum.DEFAULT_CHARSET)); + } + + @Override + public void putText(byte[] val) { + putBlob(val); + } + + @Override + public void putBlob(byte[] val) { + int bytesLen = val.length; + + ensureSize(MAXIMUM_VARIANT_INT32 + bytesLen); + long addr = currentAddr(); + + short length = writeRawVarint32(addr, bytesLen); + PlatformDependent.copyMemory(val, 0, addr + length, bytesLen); + curFieldIdx++; + forwardField(length + bytesLen); + } + + @Override + public void putDate(int val) { + ensureSize(SizeOf.SIZE_OF_INT); + long addr = currentAddr(); + + PlatformDependent.putInt(addr, val); + curFieldIdx++; + forwardField(SizeOf.SIZE_OF_INT); + } + + @Override + public void putTime(long val) { + ensureSize(SizeOf.SIZE_OF_LONG); + long addr = currentAddr(); + + PlatformDependent.putLong(addr, val); + curFieldIdx++; + forwardField(SizeOf.SIZE_OF_LONG); + } + + 
@Override + public void putTimestamp(long val) { + putTime(val); + } + + @Override + public void putInterval(IntervalDatum val) { + ensureSize(MAXIMUM_VARIANT_INT32 + MAXIMUM_VARIANT_INT64); + long addr = currentAddr(); + + short length = writeRawVarint32(addr, encodeZigZag32(val.getMonths())); + length += writeRawVarint64(addr + length, encodeZigZag64(val.getMilliSeconds())); // placed right after the months varint, not on top of it + + curFieldIdx++; + forwardField(length); + } + + @Override + public void putInet4(int val) { + ensureSize(SizeOf.SIZE_OF_INT); + long addr = currentAddr(); + + PlatformDependent.putInt(addr, val); + curFieldIdx++; + forwardField(SizeOf.SIZE_OF_INT); + } + + @Override + public void putProtoDatum(ProtobufDatum val) { + putBlob(val.asByteArray()); + } + + + @Override + public void putTuple(Tuple tuple) { + OffHeapRowBlockUtils.convert(tuple, this); + } + + @Override + public ZeroCopyTuple addTuple(Tuple tuple) { + throw new TajoRuntimeException(new UnsupportedException()); + } +} diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java index 27e54f2fcd..57d522ab98 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java @@ -19,6 +19,8 @@ package org.apache.tajo.tuple.memory; import io.netty.util.internal.PlatformDependent; +import org.apache.tajo.BuiltinStorages; +import org.apache.tajo.annotation.NotThreadSafe; import org.apache.tajo.exception.NotImplementedException; import org.apache.tajo.exception.TajoInternalError; import org.apache.tajo.tuple.RowBlockReader; @@ -32,10 +34,12 @@ import static org.apache.tajo.common.TajoDataTypes.DataType; +@NotThreadSafe public class MemoryRowBlock implements RowBlock, Deallocatable { public static final int NULL_FIELD_OFFSET = -1; - private DataType[] dataTypes; + private final DataType[] dataTypes; + private final String dataFormat; // Basic
States private int maxRowNum = Integer.MAX_VALUE; // optional @@ -45,20 +49,20 @@ public class MemoryRowBlock implements RowBlock, Deallocatable { private MemoryBlock memory; public MemoryRowBlock(DataType[] dataTypes, ResizableLimitSpec limitSpec, boolean isDirect) { + this(dataTypes, limitSpec, isDirect, BuiltinStorages.DRAW); + } + + public MemoryRowBlock(DataType[] dataTypes, ResizableLimitSpec limitSpec, boolean isDirect, String dataFormat) { this.memory = new ResizableMemoryBlock(limitSpec, isDirect); this.dataTypes = dataTypes; + this.dataFormat = dataFormat; } public MemoryRowBlock(MemoryRowBlock rowBlock) { this.memory = TUtil.checkTypeAndGet(rowBlock.getMemory().duplicate(), ResizableMemoryBlock.class); this.rowNum = rowBlock.rowNum; this.dataTypes = rowBlock.dataTypes; - } - - public MemoryRowBlock(MemoryBlock memory, DataType[] dataTypes, int rowNum) { - this.memory = memory; - this.rowNum = rowNum; - this.dataTypes = dataTypes; + this.dataFormat = rowBlock.dataFormat; } public MemoryRowBlock(DataType[] dataTypes) { @@ -69,8 +73,8 @@ public MemoryRowBlock(DataType[] dataTypes, int bytes) { this(dataTypes, new ResizableLimitSpec(bytes), true); } - public MemoryRowBlock(DataType[] dataTypes, int bytes, boolean isDirect) { - this(dataTypes, new ResizableLimitSpec(bytes), isDirect); + public MemoryRowBlock(DataType[] dataTypes, int bytes, boolean isDirect, String dataFormat) { + this(dataTypes, new ResizableLimitSpec(bytes), isDirect, dataFormat); } @Override @@ -126,6 +130,15 @@ public DataType[] getDataTypes() { @Override public boolean copyFromChannel(ScatteringByteChannel channel) throws IOException { + switch (dataFormat) { + case BuiltinStorages.DRAW: + return fillDrawBuffer(channel); + default: + throw new TajoInternalError(new NotImplementedException("Heap memory writer not implemented yet")); + } + } + + protected boolean fillDrawBuffer(ScatteringByteChannel channel) throws IOException { reset(); int readBytes = memory.writeBytes(channel); @@ 
-157,13 +170,23 @@ public boolean copyFromChannel(ScatteringByteChannel channel) throws IOException @Override public RowWriter getWriter() { + if (!getMemory().hasAddress()) { + throw new TajoInternalError(new NotImplementedException("Heap memory writer not implemented yet")); + } + if (builder == null) { - if (!getMemory().hasAddress()) { - throw new TajoInternalError(new NotImplementedException("Heap memory writer not implemented yet")); - } else { + switch (dataFormat) { + case BuiltinStorages.DRAW: this.builder = new OffHeapRowBlockWriter(this); + break; + case BuiltinStorages.RAW: + this.builder = new CompactRowBlockWriter(this); + break; + default: + throw new TajoInternalError(new NotImplementedException(dataFormat + " memory writer not implemented yet")); } } + return builder; } @@ -179,10 +202,17 @@ public void release() { @Override public RowBlockReader getReader() { - if (!getMemory().hasAddress()) { - return new HeapRowBlockReader(this); - } else { - return new OffHeapRowBlockReader(this); + + switch (dataFormat) { + case BuiltinStorages.DRAW: { + if (!getMemory().hasAddress()) { + return new HeapRowBlockReader(this); + } else { + return new OffHeapRowBlockReader(this); + } + } + default: + throw new TajoInternalError(new NotImplementedException(dataFormat + " memory reader not implemented yet")); } } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java index e8f219cebc..f8430c094e 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java @@ -100,14 +100,19 @@ public static void convert(Tuple tuple, RowWriter writer) { writer.putInt2(tuple.getInt2(i)); break; case INT4: - case DATE: - case INET4: writer.putInt4(tuple.getInt4(i)); break; + case DATE: + writer.putDate(tuple.getInt4(i)); + break; case INT8: +
writer.putInt8(tuple.getInt8(i)); + break; case TIMESTAMP: + writer.putTimestamp(tuple.getInt8(i)); + break; case TIME: - writer.putInt8(tuple.getInt8(i)); + writer.putTime(tuple.getInt8(i)); break; case FLOAT4: writer.putFloat4(tuple.getFloat4(i)); @@ -128,6 +133,9 @@ public static void convert(Tuple tuple, RowWriter writer) { case PROTOBUF: writer.putProtoDatum((ProtobufDatum) tuple.getProtobufDatum(i)); break; + case INET4: + writer.putInet4(tuple.getInt4(i)); + break; case NULL_TYPE: writer.skipField(); break; diff --git a/tajo-common/src/main/java/org/apache/tajo/util/BitArray.java b/tajo-common/src/main/java/org/apache/tajo/util/BitArray.java index e62496a75b..973266b2b4 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/BitArray.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/BitArray.java @@ -19,6 +19,7 @@ package org.apache.tajo.util; import java.nio.ByteBuffer; +import java.util.Arrays; public class BitArray { private byte [] data; @@ -60,9 +61,7 @@ public boolean get(int idx) { } public void clear() { - for (int i = 0; i < data.length; i++) { - data[i] = 0; - } + Arrays.fill(data, (byte) 0); } public int bytesLength() { diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java b/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java index b954b09863..9f5ecb50ca 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/querymaster/TestTaskStatusUpdate.java @@ -59,8 +59,8 @@ public final void case1() throws Exception { // tpch/lineitem.tbl long[] expectedNumRows = new long[]{5, 2, 2, 2}; - long[] expectedNumBytes = new long[]{604, 48, 48, 48}; - long[] expectedReadBytes = new long[]{604, 604, 48, 0}; + long[] expectedNumBytes = new long[]{604, 18, 18, 48}; + long[] expectedReadBytes = new long[]{604, 604, 18, 0}; QueryId queryId = getQueryId(res); assertStatus(queryId, 2, 
expectedNumRows, expectedNumBytes, expectedReadBytes); @@ -78,8 +78,8 @@ public final void case2() throws Exception { // tpch/lineitem.tbl long[] expectedNumRows = new long[]{5, 2, 2, 2, 2, 2}; - long[] expectedNumBytes = new long[]{604, 278, 278, 236, 236, 236}; - long[] expectedReadBytes = new long[]{604, 604, 278, 0, 236, 0}; + long[] expectedNumBytes = new long[]{604, 162, 162, 138, 138, 236}; + long[] expectedReadBytes = new long[]{604, 604, 162, 0, 138, 0}; QueryId queryId = getQueryId(res); assertStatus(queryId, 3, expectedNumRows, expectedNumBytes, expectedReadBytes); @@ -107,8 +107,8 @@ public final void case3() throws Exception { // in/out * stage(4) long[] expectedNumRows = new long[]{5, 5, 2, 2, 2, 2, 2, 2}; - long[] expectedNumBytes = new long[]{20, 80, 8, 64, 144, 64, 64, 64}; - long[] expectedReadBytes = new long[]{20, 20, 8, 8, 144, 0, 64, 0}; + long[] expectedNumBytes = new long[]{20, 75, 8, 34, 109, 34, 34, 64}; + long[] expectedReadBytes = new long[]{20, 20, 8, 8, 109, 0, 34, 0}; QueryId queryId = getQueryId(res); assertStatus(queryId, 4, expectedNumRows, expectedNumBytes, expectedReadBytes); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java index 38fdc246a9..c779d2f362 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/global/DataChannel.java @@ -40,7 +40,7 @@ public class DataChannel { private Schema schema; - private String dataFormat = BuiltinStorages.DRAW; + private String dataFormat = BuiltinStorages.RAW; public DataChannel(ExecutionBlockId srcId, ExecutionBlockId targetId) { this.srcId = srcId; diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java index 
b2ca571623..271c52f198 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java @@ -124,7 +124,7 @@ public Tuple next() throws IOException { partId = partitioner.getPartition(tuple); MemoryRowBlock rowBlock = partitionMemoryMap.get(partId); if (rowBlock == null) { - rowBlock = new MemoryRowBlock(dataTypes, initialBufferSize); + rowBlock = new MemoryRowBlock(dataTypes, initialBufferSize, true, plan.getStorageType()); partitionMemoryMap.put(partId, rowBlock); totalBufferCapacity += rowBlock.capacity(); } diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java index acc8dbee3b..ac087a7070 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java @@ -196,8 +196,9 @@ public static void scheduleFragmentsForJoinQuery(TaskSchedulerContext schedulerC int maxStatsScanIdx = -1; StringBuilder nonLeafScanNamesBuilder = new StringBuilder(); for (int i = 0; i < scans.length; i++) { - - if (scans[i].getTableDesc().getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.DRAW)) { + String dataFormat = scans[i].getTableDesc().getMeta().getDataFormat(); + //TODO add flag for intermediate data format + if (dataFormat.equalsIgnoreCase(BuiltinStorages.DRAW) || dataFormat.equalsIgnoreCase(BuiltinStorages.RAW)) { // Intermediate data scan hasNonLeafNode = true; largeScanIndexList.add(i); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java index 2986e3ef23..c50a3d1540 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java @@ -24,8 +24,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; +import org.apache.tajo.BuiltinStorages; import org.apache.tajo.TaskAttemptId; import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.SchemaUtil; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.catalog.statistics.TableStats; import org.apache.tajo.common.TajoDataTypes.DataType; @@ -38,6 +40,7 @@ import org.apache.tajo.plan.serder.PlanProto.ShuffleType; import org.apache.tajo.plan.util.PlannerUtil; import org.apache.tajo.storage.fragment.Fragment; +import org.apache.tajo.tuple.memory.MemoryRowBlock; import org.apache.tajo.unit.StorageUnit; import org.apache.tajo.util.BitArray; @@ -46,6 +49,7 @@ import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.channels.FileChannel; public class RawFile { @@ -101,7 +105,7 @@ public void init() throws IOException { } if(buf == null) { - buf = BufferPool.directBuffer(conf.getInt(READ_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)); + buf = BufferPool.directBuffer(conf.getInt(READ_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)).order(ByteOrder.LITTLE_ENDIAN); buffer = buf.nioBuffer(0, buf.capacity()); } @@ -462,24 +466,30 @@ public float getProgress() { } } + @Deprecated public static class RawFileAppender extends FileAppender { + private static final float BUFFER_THRESHHOLD = 0.9f; + private FileChannel channel; private RandomAccessFile randomAccessFile; private DataType[] columnTypes; - private ByteBuffer buffer; - private ByteBuf buf; - private BitArray nullFlags; - private int headerSize = 0; - private static final int RECORD_SIZE = 4; private long pos; - private ShuffleType shuffleType; + private MemoryRowBlock rowBlock; + private boolean analyzeField; + private boolean hasExternalBuf; private TableStatistics stats; - public 
RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, - Schema schema, TableMeta meta, Path workDir) throws IOException { + public RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, + TableMeta meta, Path workDir) throws IOException { + this(conf, taskAttemptId, schema, meta, workDir, null); + } + + public RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, + TableMeta meta, Path workDir, MemoryRowBlock rowBlock) throws IOException { super(conf, taskAttemptId, schema, meta, workDir); + this.rowBlock = rowBlock; } public void init() throws IOException { @@ -496,269 +506,67 @@ public void init() throws IOException { randomAccessFile = new RandomAccessFile(file, "rw"); channel = randomAccessFile.getChannel(); - pos = 0; columnTypes = new DataType[schema.size()]; for (int i = 0; i < schema.size(); i++) { columnTypes[i] = schema.getColumn(i).getDataType(); } - buf = BufferPool.directBuffer(conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)); - buffer = buf.nioBuffer(0, buf.capacity()); - - // comput the number of bytes, representing the null flags - - nullFlags = new BitArray(schema.size()); - headerSize = RECORD_SIZE + 2 + nullFlags.bytesLength(); + if (rowBlock == null) { + int bufferSize = conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE); + rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize, true, BuiltinStorages.RAW); + } else { + hasExternalBuf = true; + } if (enabledStats) { this.stats = new TableStatistics(this.schema); - this.shuffleType = PlannerUtil.getShuffleType( + if (ShuffleType.RANGE_SHUFFLE == PlannerUtil.getShuffleType( meta.getOption(StorageConstants.SHUFFLE_TYPE, - PlannerUtil.getShuffleType(ShuffleType.NONE_SHUFFLE))); + PlannerUtil.getShuffleType(ShuffleType.NONE_SHUFFLE)))) { + this.analyzeField = true; + } } + pos = 0; super.init(); } @Override public long getOffset() throws IOException { - return pos; - } - - private void flushBuffer() throws IOException 
{ - buffer.flip(); - channel.write(buffer); - buffer.clear(); - } - - private boolean flushBufferAndReplace(int recordOffset, int sizeToBeWritten) - throws IOException { - - // if the buffer reaches the limit, - // write the bytes from 0 to the previous record. - if (buffer.remaining() < sizeToBeWritten) { - - int limit = buffer.position(); - buffer.limit(recordOffset); - buffer.flip(); - channel.write(buffer); - buffer.position(recordOffset); - buffer.limit(limit); - buffer.compact(); - - //increase the write-buffer - if(buffer.remaining() < sizeToBeWritten) { - buf.setIndex(buffer.position(), buffer.limit()); - buf.ensureWritable(sizeToBeWritten); - buffer = buf.nioBuffer(0, buf.capacity()); - buffer.position(buf.readerIndex()); - } - return true; - } else { - return false; - } - } - - /** - * Encode a ZigZag-encoded 32-bit value. ZigZag encodes signed integers - * into values that can be efficiently encoded with varint. (Otherwise, - * negative values must be sign-extended to 64 bits to be varint encoded, - * thus always taking 10 bytes on the wire.) - * - * @param n A signed 32-bit integer. - * @return An unsigned 32-bit integer, stored in a signed int because - * Java has no explicit unsigned support. - */ - public static int encodeZigZag32(final int n) { - // Note: the right-shift must be arithmetic - return (n << 1) ^ (n >> 31); - } - - /** - * Encode a ZigZag-encoded 64-bit value. ZigZag encodes signed integers - * into values that can be efficiently encoded with varint. (Otherwise, - * negative values must be sign-extended to 64 bits to be varint encoded, - * thus always taking 10 bytes on the wire.) - * - * @param n A signed 64-bit integer. - * @return An unsigned 64-bit integer, stored in a signed int because - * Java has no explicit unsigned support. - */ - public static long encodeZigZag64(final long n) { - // Note: the right-shift must be arithmetic - return (n << 1) ^ (n >> 63); + return hasExternalBuf ? 
pos : pos + rowBlock.getMemory().writerPosition(); } - /** - * Encode and write a varint. {@code value} is treated as - * unsigned, so it won't be sign-extended if negative. - */ - public void writeRawVarint32(int value) throws IOException { - while (true) { - if ((value & ~0x7F) == 0) { - buffer.put((byte) value); - return; - } else { - buffer.put((byte) ((value & 0x7F) | 0x80)); - value >>>= 7; - } - } - } + public void writeRowBlock(MemoryRowBlock rowBlock) throws IOException { + pos += rowBlock.getMemory().writeTo(channel); - /** - * Compute the number of bytes that would be needed to encode a varint. - * {@code value} is treated as unsigned, so it won't be sign-extended if - * negative. - */ - public static int computeRawVarint32Size(final int value) { - if ((value & (0xffffffff << 7)) == 0) return 1; - if ((value & (0xffffffff << 14)) == 0) return 2; - if ((value & (0xffffffff << 21)) == 0) return 3; - if ((value & (0xffffffff << 28)) == 0) return 4; - return 5; - } - - /** Encode and write a varint. */ - public void writeRawVarint64(long value) throws IOException { - while (true) { - if ((value & ~0x7FL) == 0) { - buffer.put((byte) value); - return; - } else { - buffer.put((byte) ((value & 0x7F) | 0x80)); - value >>>= 7; - } + if (enabledStats) { + stats.incrementRows(rowBlock.rows()); } } @Override public void addTuple(Tuple t) throws IOException { - - if (buffer.remaining() < headerSize) { - flushBuffer(); - } - - // skip the row header - int recordOffset = buffer.position(); - buffer.position(recordOffset + headerSize); - // reset the null flags - nullFlags.clear(); - for (int i = 0; i < schema.size(); i++) { - if (shuffleType == ShuffleType.RANGE_SHUFFLE) { - // it is to calculate min/max values, and it is only used for the intermediate file. + if (analyzeField) { + // it is to calculate min/max values, and it is only used for the intermediate file. 
+ for (int i = 0; i < schema.size(); i++) { stats.analyzeField(i, t); } - - if (t.isBlankOrNull(i)) { - nullFlags.set(i); - continue; - } - - // 10 is the maximum bytes size of all types - if (flushBufferAndReplace(recordOffset, 10)) { - recordOffset = 0; - } - - switch (columnTypes[i].getType()) { - case NULL_TYPE: - nullFlags.set(i); - continue; - - case BOOLEAN: - case BIT: - buffer.put(t.getByte(i)); - break; - - case INT2: - buffer.putShort(t.getInt2(i)); - break; - - case INT4: - writeRawVarint32(encodeZigZag32(t.getInt4(i))); - break; - - case INT8: - writeRawVarint64(encodeZigZag64(t.getInt8(i))); - break; - - case FLOAT4: - buffer.putFloat(t.getFloat4(i)); - break; - - case FLOAT8: - buffer.putDouble(t.getFloat8(i)); - break; - - case CHAR: - case TEXT: { - byte[] strBytes = t.getBytes(i); - if (flushBufferAndReplace(recordOffset, strBytes.length + computeRawVarint32Size(strBytes.length))) { - recordOffset = 0; - } - writeRawVarint32(strBytes.length); - buffer.put(strBytes); - break; - } - - case DATE: - buffer.putInt(t.getInt4(i)); - break; - - case TIME: - case TIMESTAMP: - buffer.putLong(t.getInt8(i)); - break; - - case BLOB: { - byte[] rawBytes = t.getBytes(i); - if (flushBufferAndReplace(recordOffset, rawBytes.length + computeRawVarint32Size(rawBytes.length))) { - recordOffset = 0; - } - writeRawVarint32(rawBytes.length); - buffer.put(rawBytes); - break; - } - - case PROTOBUF: { - byte[] rawBytes = t.getBytes(i); - if (flushBufferAndReplace(recordOffset, rawBytes.length + computeRawVarint32Size(rawBytes.length))) { - recordOffset = 0; - } - writeRawVarint32(rawBytes.length); - buffer.put(rawBytes); - break; - } - - case INET4: - buffer.putInt(t.getInt4(i)); - break; - - default: - throw new IOException("Cannot support data type: " + columnTypes[i].getType()); - } } - // write a record header - int bufferPos = buffer.position(); - buffer.position(recordOffset); - buffer.putInt(bufferPos - recordOffset); - byte [] flags = nullFlags.toArray(); - 
buffer.putShort((short) flags.length); - buffer.put(flags); + rowBlock.getWriter().putTuple(t); - pos += bufferPos - recordOffset; - buffer.position(bufferPos); - - if (enabledStats) { - stats.incrementRow(); + if(rowBlock.usage() > BUFFER_THRESHHOLD) { + writeRowBlock(rowBlock); + rowBlock.clear(); } } @Override public void flush() throws IOException { - if(buffer != null){ - flushBuffer(); + if(!hasExternalBuf && rowBlock.getMemory().isReadable()) { + writeRowBlock(rowBlock); + rowBlock.clear(); } } @@ -772,15 +580,10 @@ public void close() throws IOException { LOG.debug("RawFileAppender written: " + getOffset() + " bytes, path: " + path); } - if(buf != null){ - buffer.clear(); - buffer = null; - - buf.release(); - buf = null; - } - IOUtils.cleanup(LOG, channel, randomAccessFile); + if(!hasExternalBuf && rowBlock != null) { + rowBlock.release(); + } } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java index 0e8808bf32..550de63d75 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileScanner.java @@ -69,7 +69,7 @@ public void init() throws IOException { if (tupleBuffer == null) { tupleBuffer = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), - conf.getInt(READ_BUFFER_SIZE, DEFAULT_BUFFER_SIZE), true); + conf.getInt(READ_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)); } else { tupleBuffer.clear(); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java index 9dbb7bab77..79b0f71cb5 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java @@ -60,8 +60,8 @@ public class DirectRawFileWriter extends FileAppender { private long pos; private TableStatistics stats; - private ShuffleType shuffleType; private MemoryRowBlock rowBlock; + private boolean analyzeField; private boolean hasExternalBuf; public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, @@ -102,9 +102,11 @@ public void init() throws IOException { if (enabledStats) { this.stats = new TableStatistics(this.schema); - this.shuffleType = PlannerUtil.getShuffleType( + if (ShuffleType.RANGE_SHUFFLE == PlannerUtil.getShuffleType( meta.getOption(StorageConstants.SHUFFLE_TYPE, - PlannerUtil.getShuffleType(ShuffleType.NONE_SHUFFLE))); + PlannerUtil.getShuffleType(ShuffleType.NONE_SHUFFLE)))) { + this.analyzeField = true; + } } if (rowBlock == null) { @@ -137,7 +139,7 @@ public void writeRowBlock(MemoryRowBlock rowBlock) throws IOException { @Override public void addTuple(Tuple t) throws IOException { - if (shuffleType == ShuffleType.RANGE_SHUFFLE) { + if (analyzeField) { // it is to calculate min/max values, and it is only used for the intermediate file. 
for (int i = 0; i < schema.size(); i++) { stats.analyzeField(i, t); From 4705aec5087828a37b7b568dabaa4b0ebb4232de Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Tue, 10 Nov 2015 12:31:26 +0900 Subject: [PATCH 17/28] add tuple converter --- .../tuple/memory/CompactRowBlockWriter.java | 11 +- .../tajo/tuple/memory/MemoryRowBlock.java | 5 + .../tuple/memory/OffHeapRowBlockUtils.java | 138 ++++++++++-------- .../apache/tajo/tuple/memory/RowBlock.java | 2 + .../apache/tajo/tuple/memory/RowWriter.java | 2 +- .../storage/HashShuffleAppenderManager.java | 4 +- .../java/org/apache/tajo/storage/RawFile.java | 128 +--------------- .../storage/rawfile/DirectRawFileWriter.java | 118 ++++++++++++--- 8 files changed, 194 insertions(+), 214 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java index e6a649b984..f2c7961eb1 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java @@ -25,8 +25,6 @@ import org.apache.tajo.datum.ProtobufDatum; import org.apache.tajo.datum.TextDatum; import org.apache.tajo.exception.TajoInternalError; -import org.apache.tajo.exception.TajoRuntimeException; -import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.storage.Tuple; import org.apache.tajo.util.BitArray; import org.apache.tajo.util.SizeOf; @@ -400,7 +398,12 @@ public void putTuple(Tuple tuple) { } @Override - public ZeroCopyTuple addTuple(Tuple tuple) { - throw new TajoRuntimeException(new UnsupportedException()); + public Tuple addTuple(Tuple tuple) { + putTuple(tuple); + try { + return tuple.clone(); + } catch (CloneNotSupportedException e) { + throw new TajoInternalError(e); + } } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java 
b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java index 57d522ab98..3d02f9abc7 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/MemoryRowBlock.java @@ -77,6 +77,11 @@ public MemoryRowBlock(DataType[] dataTypes, int bytes, boolean isDirect, String this(dataTypes, new ResizableLimitSpec(bytes), isDirect, dataFormat); } + @Override + public String getDataFormat() { + return dataFormat; + } + @Override public void clear() { reset(); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java index f8430c094e..0838e44db0 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java @@ -32,6 +32,11 @@ import java.util.List; public class OffHeapRowBlockUtils { + private static TupleConverter tupleConverter; + + static { + tupleConverter = new TupleConverter(); + } public static List sort(MemoryRowBlock rowBlock, Comparator comparator) { List tupleList = Lists.newArrayList(); @@ -80,70 +85,83 @@ public static Tuple[] sortToArray(MemoryRowBlock rowBlock, Comparator com return tuples; } - public static void convert(Tuple tuple, RowWriter writer) { - writer.startRow(); + public static class TupleConverter { - for (int i = 0; i < writer.dataTypes().length; i++) { - if (tuple.isBlankOrNull(i)) { - writer.skipField(); - continue; + public void convert(Tuple tuple, RowWriter writer) { + writer.startRow(); + + for (int i = 0; i < writer.dataTypes().length; i++) { + writeField(i, tuple, writer); } - switch (writer.dataTypes()[i].getType()) { - case BOOLEAN: - writer.putBool(tuple.getBool(i)); - break; - case BIT: - writer.putByte(tuple.getByte(i)); - break; - case INT1: - case INT2: - writer.putInt2(tuple.getInt2(i)); - 
break; - case INT4: - writer.putInt4(tuple.getInt4(i)); - break; - case DATE: - writer.putDate(tuple.getInt4(i)); - break; - case INT8: - writer.putInt8(tuple.getInt8(i)); - break; - case TIMESTAMP: - writer.putTimestamp(tuple.getInt8(i)); - break; - case TIME: - writer.putTime(tuple.getInt8(i)); - break; - case FLOAT4: - writer.putFloat4(tuple.getFloat4(i)); - break; - case FLOAT8: - writer.putFloat8(tuple.getFloat8(i)); - break; - case CHAR: - case TEXT: - writer.putText(tuple.getBytes(i)); - break; - case BLOB: - writer.putBlob(tuple.getBytes(i)); - break; - case INTERVAL: - writer.putInterval((IntervalDatum) tuple.getInterval(i)); - break; - case PROTOBUF: - writer.putProtoDatum((ProtobufDatum) tuple.getProtobufDatum(i)); - break; - case INET4: - writer.putInet4(tuple.getInt4(i)); - break; - case NULL_TYPE: + + writer.endRow(); + } + + protected void writeField(int colIdx, Tuple tuple, RowWriter writer) { + + if (tuple.isBlankOrNull(colIdx)) { writer.skipField(); - break; - default: - throw new TajoRuntimeException( - new UnsupportedException("unknown data type '" + writer.dataTypes()[i].getType().name() + "'")); + } else { + switch (writer.dataTypes()[colIdx].getType()) { + case BOOLEAN: + writer.putBool(tuple.getBool(colIdx)); + break; + case BIT: + writer.putByte(tuple.getByte(colIdx)); + break; + case INT1: + case INT2: + writer.putInt2(tuple.getInt2(colIdx)); + break; + case INT4: + writer.putInt4(tuple.getInt4(colIdx)); + break; + case DATE: + writer.putDate(tuple.getInt4(colIdx)); + break; + case INT8: + writer.putInt8(tuple.getInt8(colIdx)); + break; + case TIMESTAMP: + writer.putTimestamp(tuple.getInt8(colIdx)); + break; + case TIME: + writer.putTime(tuple.getInt8(colIdx)); + break; + case FLOAT4: + writer.putFloat4(tuple.getFloat4(colIdx)); + break; + case FLOAT8: + writer.putFloat8(tuple.getFloat8(colIdx)); + break; + case CHAR: + case TEXT: + writer.putText(tuple.getBytes(colIdx)); + break; + case BLOB: + writer.putBlob(tuple.getBytes(colIdx)); + 
break; + case INTERVAL: + writer.putInterval((IntervalDatum) tuple.getInterval(colIdx)); + break; + case PROTOBUF: + writer.putProtoDatum((ProtobufDatum) tuple.getProtobufDatum(colIdx)); + break; + case INET4: + writer.putInet4(tuple.getInt4(colIdx)); + break; + case NULL_TYPE: + writer.skipField(); + break; + default: + throw new TajoRuntimeException( + new UnsupportedException("unknown data type '" + writer.dataTypes()[colIdx].getType().name() + "'")); + } } } - writer.endRow(); + } + + public static void convert(Tuple tuple, RowWriter writer) { + tupleConverter.convert(tuple, writer); } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java index f916351d84..c9865d5ea1 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java @@ -26,6 +26,8 @@ public interface RowBlock { + String getDataFormat(); + void clear(); int capacity(); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java index d49636e1b5..93a15c5016 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java @@ -78,5 +78,5 @@ public interface RowWriter { void putTuple(Tuple tuple); - ZeroCopyTuple addTuple(Tuple tuple); + Tuple addTuple(Tuple tuple); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java index aeadca2725..5988685157 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java 
@@ -112,8 +112,8 @@ public synchronized HashShuffleAppender getAppender(MemoryRowBlock memoryRowBloc fs.mkdirs(dataFile.getParent()); } - DirectRawFileWriter appender = - new DirectRawFileWriter(systemConf, null, outSchema, meta, dataFile, memoryRowBlock); + DirectRawFileWriter appender = new DirectRawFileWriter(systemConf, null, outSchema, meta, dataFile, + memoryRowBlock, memoryRowBlock.getDataFormat()); appender.enableStats(); appender.init(); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java index c50a3d1540..d7cfe0a3a0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java @@ -27,7 +27,6 @@ import org.apache.tajo.BuiltinStorages; import org.apache.tajo.TaskAttemptId; import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.SchemaUtil; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.catalog.statistics.TableStats; import org.apache.tajo.common.TajoDataTypes.DataType; @@ -37,9 +36,8 @@ import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.plan.expr.EvalNode; -import org.apache.tajo.plan.serder.PlanProto.ShuffleType; -import org.apache.tajo.plan.util.PlannerUtil; import org.apache.tajo.storage.fragment.Fragment; +import org.apache.tajo.storage.rawfile.DirectRawFileWriter; import org.apache.tajo.tuple.memory.MemoryRowBlock; import org.apache.tajo.unit.StorageUnit; import org.apache.tajo.util.BitArray; @@ -47,7 +45,6 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.FileChannel; @@ -467,133 +464,16 @@ public float getProgress() { } @Deprecated - public static 
class RawFileAppender extends FileAppender { - private static final float BUFFER_THRESHHOLD = 0.9f; - - private FileChannel channel; - private RandomAccessFile randomAccessFile; - private DataType[] columnTypes; - - private long pos; - private MemoryRowBlock rowBlock; - private boolean analyzeField; - private boolean hasExternalBuf; - - private TableStatistics stats; + public static class RawFileAppender extends DirectRawFileWriter { public RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta, Path workDir) throws IOException { - this(conf, taskAttemptId, schema, meta, workDir, null); + super(conf, taskAttemptId, schema, meta, workDir, null, BuiltinStorages.RAW); } public RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta, Path workDir, MemoryRowBlock rowBlock) throws IOException { - super(conf, taskAttemptId, schema, meta, workDir); - this.rowBlock = rowBlock; - } - - public void init() throws IOException { - File file; - try { - if (path.toUri().getScheme() != null) { - file = new File(path.toUri()); - } else { - file = new File(path.toString()); - } - } catch (IllegalArgumentException iae) { - throw new IOException(iae); - } - - randomAccessFile = new RandomAccessFile(file, "rw"); - channel = randomAccessFile.getChannel(); - - columnTypes = new DataType[schema.size()]; - for (int i = 0; i < schema.size(); i++) { - columnTypes[i] = schema.getColumn(i).getDataType(); - } - - if (rowBlock == null) { - int bufferSize = conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE); - rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize, true, BuiltinStorages.RAW); - } else { - hasExternalBuf = true; - } - - if (enabledStats) { - this.stats = new TableStatistics(this.schema); - if (ShuffleType.RANGE_SHUFFLE == PlannerUtil.getShuffleType( - meta.getOption(StorageConstants.SHUFFLE_TYPE, - PlannerUtil.getShuffleType(ShuffleType.NONE_SHUFFLE)))) { - this.analyzeField = true; - 
} - } - - pos = 0; - super.init(); - } - - @Override - public long getOffset() throws IOException { - return hasExternalBuf ? pos : pos + rowBlock.getMemory().writerPosition(); - } - - public void writeRowBlock(MemoryRowBlock rowBlock) throws IOException { - pos += rowBlock.getMemory().writeTo(channel); - - if (enabledStats) { - stats.incrementRows(rowBlock.rows()); - } - } - - @Override - public void addTuple(Tuple t) throws IOException { - if (analyzeField) { - // it is to calculate min/max values, and it is only used for the intermediate file. - for (int i = 0; i < schema.size(); i++) { - stats.analyzeField(i, t); - } - } - - rowBlock.getWriter().putTuple(t); - - if(rowBlock.usage() > BUFFER_THRESHHOLD) { - writeRowBlock(rowBlock); - rowBlock.clear(); - } - } - - @Override - public void flush() throws IOException { - if(!hasExternalBuf && rowBlock.getMemory().isReadable()) { - writeRowBlock(rowBlock); - rowBlock.clear(); - } - } - - @Override - public void close() throws IOException { - flush(); - if (enabledStats) { - stats.setNumBytes(getOffset()); - } - if (LOG.isDebugEnabled()) { - LOG.debug("RawFileAppender written: " + getOffset() + " bytes, path: " + path); - } - - IOUtils.cleanup(LOG, channel, randomAccessFile); - if(!hasExternalBuf && rowBlock != null) { - rowBlock.release(); - } - } - - @Override - public TableStats getStats() { - if (enabledStats) { - stats.setNumBytes(pos); - return stats.getTableStat(); - } else { - return null; - } + super(conf, taskAttemptId, schema, meta, workDir, rowBlock, BuiltinStorages.RAW); } } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java index 79b0f71cb5..68482f056a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java @@ -26,11 +26,14 @@ import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; +import org.apache.tajo.BuiltinStorages; import org.apache.tajo.TaskAttemptId; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.SchemaUtil; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.catalog.statistics.TableStats; +import org.apache.tajo.exception.TajoInternalError; +import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.plan.serder.PlanProto.ShuffleType; import org.apache.tajo.plan.util.PlannerUtil; import org.apache.tajo.storage.FileAppender; @@ -38,6 +41,9 @@ import org.apache.tajo.storage.TableStatistics; import org.apache.tajo.storage.Tuple; import org.apache.tajo.tuple.memory.MemoryRowBlock; +import org.apache.tajo.tuple.memory.OffHeapRowBlockUtils.TupleConverter; +import org.apache.tajo.tuple.memory.RowWriter; +import org.apache.tajo.tuple.memory.UnSafeTuple; import org.apache.tajo.unit.StorageUnit; import java.io.File; @@ -48,32 +54,42 @@ public class DirectRawFileWriter extends FileAppender { private static final Log LOG = LogFactory.getLog(DirectRawFileWriter.class); - public static final String FILE_EXTENSION = "draw"; public static final String WRITE_BUFFER_SIZE = "tajo.storage.raw.io.write-buffer.bytes"; public static final int DEFAULT_BUFFER_SIZE = 128 * StorageUnit.KB; - private static final float BUFFER_THRESHHOLD = 0.9f; + public static final float BUFFER_THRESHHOLD = 0.9f; - private FileChannel channel; - private RandomAccessFile randomAccessFile; - private FSDataOutputStream fos; - private boolean isLocal; - private long pos; + protected FileChannel channel; - private TableStatistics stats; - private MemoryRowBlock rowBlock; - private boolean analyzeField; - private boolean hasExternalBuf; + protected RandomAccessFile randomAccessFile; + 
protected FSDataOutputStream fos; + protected long pos; + protected final String dataFormat; + protected TableStatistics stats; + + protected TupleConverter tupleConverter; + protected MemoryRowBlock rowBlock; + protected boolean analyzeField; + protected boolean hasExternalBuf; + protected boolean isLocal; public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, final Schema schema, final TableMeta meta, final Path path) throws IOException { - this(conf, taskAttemptId, schema, meta, path, null); + this(conf, taskAttemptId, schema, meta, path, null, BuiltinStorages.DRAW); + } + + public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, + final Schema schema, final TableMeta meta, final Path path, String dataFormat) + throws IOException { + this(conf, taskAttemptId, schema, meta, path, null, dataFormat); } public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, final Schema schema, final TableMeta meta, final Path path, - MemoryRowBlock rowBlock) throws IOException { + MemoryRowBlock rowBlock, String dataFormat) throws IOException { super(conf, taskAttemptId, schema, meta, path); this.rowBlock = rowBlock; + this.hasExternalBuf = rowBlock != null; + this.dataFormat = dataFormat; } @Override @@ -111,15 +127,77 @@ public void init() throws IOException { if (rowBlock == null) { int bufferSize = conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE); - rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize); - } else { - hasExternalBuf = true; + rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize, true, dataFormat); } + tupleConverter = initConverter(); + pos = 0; super.init(); } + public TupleConverter initConverter() { + switch (dataFormat) { + case BuiltinStorages.DRAW: + return getDrawConverter(); + case BuiltinStorages.RAW: + return getRawConverter(); + default: + throw new TajoInternalError(new UnsupportedException()); + } + } + + private TupleConverter getDrawConverter() 
{ + return new TupleConverter() { + + @Override + public void convert(Tuple tuple, RowWriter writer) { + if (analyzeField) { + if (tuple instanceof UnSafeTuple) { + + for (int i = 0; i < writer.dataTypes().length; i++) { + // it is to calculate min/max values, and it is only used for the intermediate file. + stats.analyzeField(i, tuple); + } + // write direct to memory + writer.putTuple(tuple); + } else { + writer.startRow(); + + for (int i = 0; i < writer.dataTypes().length; i++) { + // it is to calculate min/max values, and it is only used for the intermediate file. + stats.analyzeField(i, tuple); + writeField(i, tuple, writer); + } + writer.endRow(); + } + } else { + // write direct to memory + writer.putTuple(tuple); + } + } + }; + } + + private TupleConverter getRawConverter() { + return new TupleConverter() { + + @Override + public void convert(Tuple tuple, RowWriter writer) { + writer.startRow(); + + for (int i = 0; i < writer.dataTypes().length; i++) { + // it is to calculate min/max values, and it is only used for the intermediate file. + if (analyzeField) { + stats.analyzeField(i, tuple); + } + writeField(i, tuple, writer); + } + writer.endRow(); + } + }; + } + @Override public long getOffset() throws IOException { return hasExternalBuf ? pos : pos + rowBlock.getMemory().writerPosition(); @@ -139,14 +217,8 @@ public void writeRowBlock(MemoryRowBlock rowBlock) throws IOException { @Override public void addTuple(Tuple t) throws IOException { - if (analyzeField) { - // it is to calculate min/max values, and it is only used for the intermediate file. 
- for (int i = 0; i < schema.size(); i++) { - stats.analyzeField(i, t); - } - } - rowBlock.getWriter().putTuple(t); + tupleConverter.convert(t, rowBlock.getWriter()); if(rowBlock.usage() > BUFFER_THRESHHOLD) { writeRowBlock(rowBlock); From 1cf0347fa5a0568541d095294c945a7888a68f35 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Tue, 10 Nov 2015 12:37:20 +0900 Subject: [PATCH 18/28] remove unused class --- .../planner/physical/UnsafeTupleList.java | 69 ------------------- 1 file changed, 69 deletions(-) delete mode 100644 tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java deleted file mode 100644 index 804572499f..0000000000 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/UnsafeTupleList.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.engine.planner.physical; - -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.SchemaUtil; -import org.apache.tajo.storage.Tuple; -import org.apache.tajo.tuple.memory.MemoryRowBlock; - -import java.util.ArrayList; - -/** - * In TupleList, input tuples are automatically cloned whenever the add() method is called. - * This data structure is usually used in physical operators like hash join or hash aggregation. - */ -public class UnsafeTupleList extends ArrayList { - - private MemoryRowBlock rowBlock; - - public UnsafeTupleList(Schema schema) { - super(); - this.rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema)); - } - - public UnsafeTupleList(Schema schema, int initialCapacity) { - super(10000); - this.rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), initialCapacity); - } - - @Override - public boolean add(Tuple tuple) { - return super.add(rowBlock.getWriter().addTuple(tuple)); - } - - public void release() { - rowBlock.release(); - super.clear(); - } - - public int usedMem() { - return rowBlock.usedMem(); - } - - public float usage() { - return rowBlock.usage(); - } - - @Override - public void clear() { - super.clear(); - rowBlock.clear(); - } -} From 94aa071166d074c9ca79eacec54da18b1df51a52 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Tue, 10 Nov 2015 13:48:49 +0900 Subject: [PATCH 19/28] cleanup --- .../java/org/apache/tajo/querymaster/Repartitioner.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java index ac087a7070..e64cd51d6e 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/Repartitioner.java @@ -24,7 +24,6 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import 
org.apache.hadoop.fs.Path; -import org.apache.tajo.BuiltinStorages; import org.apache.tajo.ExecutionBlockId; import org.apache.tajo.SessionVars; import org.apache.tajo.algebra.JoinType; @@ -195,10 +194,11 @@ public static void scheduleFragmentsForJoinQuery(TaskSchedulerContext schedulerC long maxStats = Long.MIN_VALUE; int maxStatsScanIdx = -1; StringBuilder nonLeafScanNamesBuilder = new StringBuilder(); + + String intermediateDataFormat = schedulerContext.getMasterContext().getConf().getVar(ConfVars.SHUFFLE_FILE_FORMAT); for (int i = 0; i < scans.length; i++) { - String dataFormat = scans[i].getTableDesc().getMeta().getDataFormat(); - //TODO add flag for intermediate data format - if (dataFormat.equalsIgnoreCase(BuiltinStorages.DRAW) || dataFormat.equalsIgnoreCase(BuiltinStorages.RAW)) { + + if (scans[i].getTableDesc().getMeta().getDataFormat().equalsIgnoreCase(intermediateDataFormat)) { // Intermediate data scan hasNonLeafNode = true; largeScanIndexList.add(i); From dbf2f96027d029f5685f1b3a4aaaeb5b10c837ee Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Tue, 10 Nov 2015 17:12:24 +0900 Subject: [PATCH 20/28] cleanup and add comments --- .../org/apache/tajo/storage/BufferPool.java | 4 ---- .../tuple/memory/CompactRowBlockWriter.java | 10 ++++++++++ .../tuple/memory/OffHeapRowBlockUtils.java | 3 +++ .../apache/tajo/tuple/memory/RowBlock.java | 16 ++++++++++++++- .../TestProgressExternalSortExec.java | 20 ++++++++++--------- .../physical/HashShuffleFileWriteExec.java | 12 +++++------ .../tajo/plan/function/stream/BufferPool.java | 5 ----- .../java/org/apache/tajo/storage/RawFile.java | 1 - 8 files changed, 45 insertions(+), 26 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java b/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java index 05b07b97e0..4913d3ba6a 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/BufferPool.java @@ 
-42,10 +42,6 @@ private BufferPool() { } static { - /* TODO Enable thread cache - * Create a pooled ByteBuf allocator but disables the thread-local cache. - * Because the TaskRunner thread is newly created - * */ if (TajoConstants.IS_TEST_MODE) { /* Disable pooling buffers for memory usage */ diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java index f2c7961eb1..5bfde6d910 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java @@ -30,6 +30,16 @@ import org.apache.tajo.util.SizeOf; import org.apache.tajo.util.UnsafeUtil; +/** + * This class represent serialization of RawFile + * + * Row Record Structure + * + * | row length | null flags length | null flags | field 1 | field 2| ... | field N |; + * + * | (4 bytes) (2 bytes) (N bytes) | |; + * Header values + */ public class CompactRowBlockWriter implements RowWriter { private static final int RECORD_FIELD_SIZE = 4; // Maximum variant int32 size is 5 diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java index 0838e44db0..1aca22f3e8 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockUtils.java @@ -85,6 +85,9 @@ public static Tuple[] sortToArray(MemoryRowBlock rowBlock, Comparator com return tuples; } + /** + * This class is tuple converter to the RowBlock + */ public static class TupleConverter { public void convert(Tuple tuple, RowWriter writer) { diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java index c9865d5ea1..1ab1042a87 
100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowBlock.java @@ -25,15 +25,29 @@ import java.nio.channels.ScatteringByteChannel; public interface RowBlock { - + /** + * a data format for de/serialization + */ String getDataFormat(); + /** + * reset the memory and writer + */ void clear(); + /** + * @return the number of bytes this memory block can contain. + */ int capacity(); + /** + * @return the number of written bytes in this memory block + */ int usedMem(); + /** + * @return the percentage of written bytes in this memory block + */ float usage(); void setRows(int rowNum); diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java index 365b6d7c60..b1f53dad49 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestProgressExternalSortExec.java @@ -124,22 +124,24 @@ public void tearDown() throws Exception { @Test public void testExternalSortExecProgressWithMemTableScanner() throws Exception { - testProgress(testDataStats.getNumBytes() * 20); //multiply 20 for memory fit + QueryContext queryContext = LocalTajoTestingUtility.createDummyContext(conf); + int bufferSize = (int) (testDataStats.getNumBytes() * 20) / StorageUnit.MB; //multiply 2 for memory fit + queryContext.setInt(SessionVars.EXTSORT_BUFFER_SIZE, bufferSize); + + testProgress(queryContext); } @Test public void testExternalSortExecProgressWithPairWiseMerger() throws Exception { - testProgress(testDataStats.getNumBytes()); + QueryContext queryContext = LocalTajoTestingUtility.createDummyContext(conf); + int bufferSize = (int) Math.max((testDataStats.getNumBytes() / StorageUnit.MB), 1); + 
queryContext.setInt(SessionVars.EXTSORT_BUFFER_SIZE, bufferSize); + + testProgress(queryContext); } - private void testProgress(long sortBufferBytesNum) throws Exception { + private void testProgress(QueryContext queryContext) throws Exception { conf.setIntVar(ConfVars.EXECUTOR_EXTERNAL_SORT_FANOUT, 2); - QueryContext queryContext = LocalTajoTestingUtility.createDummyContext(conf); - if(sortBufferBytesNum > StorageUnit.MB) { - queryContext.setInt(SessionVars.EXTSORT_BUFFER_SIZE, (int)(sortBufferBytesNum / StorageUnit.MB)); - } else { - queryContext.setInt(SessionVars.EXTSORT_BUFFER_SIZE, 1); - } FileFragment[] frags = FileTablespace.splitNG(conf, "default.employee", employee.getMeta(), new Path(employee.getUri()), Integer.MAX_VALUE); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java index 271c52f198..f8a3778d2b 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java @@ -30,6 +30,8 @@ import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.catalog.statistics.TableStats; import org.apache.tajo.common.TajoDataTypes.DataType; +import org.apache.tajo.exception.TajoRuntimeException; +import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.plan.logical.ShuffleFileWriteNode; import org.apache.tajo.storage.HashShuffleAppenderManager; import org.apache.tajo.storage.Tuple; @@ -99,6 +101,7 @@ public HashShuffleFileWriteExec(TaskAttemptContext context, this.dataTypes = SchemaUtil.toDataTypes(outSchema); if(numShuffleOutputs > 0){ + //calculate initial buffer by total partition. 
a buffer size will be 4Kb ~ 1MB this.initialBufferSize = Math.min(MAXIMUM_INITIAL_BUFFER_SIZE, Math.max(maxBufferSize / numShuffleOutputs, MINIMUM_INITIAL_BUFFER_SIZE)); } else { @@ -139,6 +142,8 @@ public Tuple next() throws IOException { totalBufferCapacity += rowBlock.capacity(); // calculate resizeable buffer capacity usedBufferSize += (rowBlock.usedMem() - prevUsedMem); + // if total buffer capacity are required more than maxBufferSize, + // all partitions are flushed and the buffers are released if (totalBufferCapacity > maxBufferSize) { if (LOG.isDebugEnabled()) { LOG.debug(String.format("Too low buffer usage. threshold: %s, total capacity: %s, used: %s", @@ -223,12 +228,7 @@ private void flushBuffer(Map partitionMemoryMap, boolea @Override public void rescan() throws IOException { - if (partitionMemoryMap.size() > 0) { - for (RowBlock rowBlock : partitionMemoryMap.values()) { - rowBlock.release(); - } - partitionMemoryMap.clear(); - } + throw new TajoRuntimeException(new UnsupportedException()); } @Override diff --git a/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java b/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java index d5d2c29fc3..b4d4f2b775 100644 --- a/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java +++ b/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/BufferPool.java @@ -40,11 +40,6 @@ private BufferPool() { } static { - /* TODO Enable thread cache - * Create a pooled ByteBuf allocator but disables the thread-local cache. 
- * Because the TaskRunner thread is newly created - * */ - if (TajoConstants.IS_TEST_MODE) { /* Disable pooling buffers for memory usage */ ALLOCATOR = UnpooledByteBufAllocator.DEFAULT; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java index d7cfe0a3a0..b7add6a33a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java @@ -463,7 +463,6 @@ public float getProgress() { } } - @Deprecated public static class RawFileAppender extends DirectRawFileWriter { public RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, From 6253b9a94eee1ed8c8ae9a569c0b8378990b2031 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Wed, 11 Nov 2015 17:45:37 +0900 Subject: [PATCH 21/28] remove unused codes --- .../tajo/tuple/memory/ResizableLimitSpec.java | 9 ++++++--- .../tajo/tuple/memory/ResizableMemoryBlock.java | 4 ++++ .../storage/HashShuffleAppenderManager.java | 4 ++-- .../java/org/apache/tajo/storage/RawFile.java | 9 +-------- .../storage/rawfile/DirectRawFileWriter.java | 17 +++++------------ 5 files changed, 18 insertions(+), 25 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java index 79cc1c58f6..ddf50ab6c1 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableLimitSpec.java @@ -21,6 +21,8 @@ import com.google.common.base.Preconditions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tajo.exception.TajoRuntimeException; +import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.util.FileUtil; /** 
@@ -114,10 +116,11 @@ public int increasedSize(int currentSize) { return (int) initSize; } - if (currentSize > Integer.MAX_VALUE) { - LOG.warn("Current size already exceeds the maximum size (" + Integer.MAX_VALUE + " bytes)"); - return Integer.MAX_VALUE; + if (currentSize == Integer.MAX_VALUE) { + throw new TajoRuntimeException(new UnsupportedException( + "Current size already exceeds the maximum size (" + Integer.MAX_VALUE + " bytes)")); } + long nextSize = (long) (currentSize + ((float) currentSize * incRatio)); if (nextSize > limitBytes) { diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java index 5163ecb923..6d8d09e792 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java @@ -39,6 +39,7 @@ public class ResizableMemoryBlock implements MemoryBlock { protected ByteBuf buffer; protected ResizableLimitSpec limitSpec; + private long memoryAddress; public ResizableMemoryBlock(ByteBuf buffer, ResizableLimitSpec limitSpec) { this.buffer = buffer.order(ByteOrder.LITTLE_ENDIAN); @@ -51,12 +52,14 @@ public ResizableMemoryBlock(ByteBuf buffer) { public ResizableMemoryBlock(ByteBuffer buffer) { this.buffer = Unpooled.wrappedBuffer(buffer).order(ByteOrder.LITTLE_ENDIAN); + this.memoryAddress = this.buffer.hasMemoryAddress() ? 
this.buffer.memoryAddress() : 0; this.limitSpec = new ResizableLimitSpec(buffer.capacity()); } public ResizableMemoryBlock(ResizableLimitSpec limitSpec, boolean isDirect) { if (isDirect) { this.buffer = BufferPool.directBuffer((int) limitSpec.initialSize(), (int) limitSpec.limit()); + this.memoryAddress = buffer.memoryAddress(); } else { this.buffer = BufferPool.heapBuffer((int) limitSpec.initialSize(), (int) limitSpec.limit()); } @@ -149,6 +152,7 @@ private void resize(int newSize) { int newBlockSize = UnsafeUtil.alignedSize(newSize); buffer = BufferPool.ensureWritable(buffer, newBlockSize); + memoryAddress = buffer.memoryAddress(); } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java index 5988685157..aeadca2725 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java @@ -112,8 +112,8 @@ public synchronized HashShuffleAppender getAppender(MemoryRowBlock memoryRowBloc fs.mkdirs(dataFile.getParent()); } - DirectRawFileWriter appender = new DirectRawFileWriter(systemConf, null, outSchema, meta, dataFile, - memoryRowBlock, memoryRowBlock.getDataFormat()); + DirectRawFileWriter appender = + new DirectRawFileWriter(systemConf, null, outSchema, meta, dataFile, memoryRowBlock); appender.enableStats(); appender.init(); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java index b7add6a33a..26bd135e6e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java @@ -24,7 +24,6 @@ import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; -import org.apache.tajo.BuiltinStorages; import org.apache.tajo.TaskAttemptId; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; @@ -38,7 +37,6 @@ import org.apache.tajo.plan.expr.EvalNode; import org.apache.tajo.storage.fragment.Fragment; import org.apache.tajo.storage.rawfile.DirectRawFileWriter; -import org.apache.tajo.tuple.memory.MemoryRowBlock; import org.apache.tajo.unit.StorageUnit; import org.apache.tajo.util.BitArray; @@ -467,12 +465,7 @@ public static class RawFileAppender extends DirectRawFileWriter { public RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta, Path workDir) throws IOException { - super(conf, taskAttemptId, schema, meta, workDir, null, BuiltinStorages.RAW); - } - - public RawFileAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, - TableMeta meta, Path workDir, MemoryRowBlock rowBlock) throws IOException { - super(conf, taskAttemptId, schema, meta, workDir, rowBlock, BuiltinStorages.RAW); + super(conf, taskAttemptId, schema, meta, workDir, null); } } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java index 68482f056a..8ad0749301 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java @@ -63,7 +63,6 @@ public class DirectRawFileWriter extends FileAppender { protected RandomAccessFile randomAccessFile; protected FSDataOutputStream fos; protected long pos; - protected final String dataFormat; protected TableStatistics stats; protected TupleConverter tupleConverter; @@ -73,23 +72,17 @@ public class 
DirectRawFileWriter extends FileAppender { protected boolean isLocal; public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, - final Schema schema, final TableMeta meta, final Path path) throws IOException { - this(conf, taskAttemptId, schema, meta, path, null, BuiltinStorages.DRAW); - } - - public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, - final Schema schema, final TableMeta meta, final Path path, String dataFormat) + final Schema schema, final TableMeta meta, final Path path) throws IOException { - this(conf, taskAttemptId, schema, meta, path, null, dataFormat); + this(conf, taskAttemptId, schema, meta, path, null); } public DirectRawFileWriter(Configuration conf, TaskAttemptId taskAttemptId, final Schema schema, final TableMeta meta, final Path path, - MemoryRowBlock rowBlock, String dataFormat) throws IOException { + MemoryRowBlock rowBlock) throws IOException { super(conf, taskAttemptId, schema, meta, path); this.rowBlock = rowBlock; this.hasExternalBuf = rowBlock != null; - this.dataFormat = dataFormat; } @Override @@ -127,7 +120,7 @@ public void init() throws IOException { if (rowBlock == null) { int bufferSize = conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE); - rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize, true, dataFormat); + rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize, true, meta.getDataFormat()); } tupleConverter = initConverter(); @@ -137,7 +130,7 @@ public void init() throws IOException { } public TupleConverter initConverter() { - switch (dataFormat) { + switch (meta.getDataFormat()) { case BuiltinStorages.DRAW: return getDrawConverter(); case BuiltinStorages.RAW: From d64600f2c050fb076e67a7c39724262bea588dee Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 12 Nov 2015 12:24:11 +0900 Subject: [PATCH 22/28] decrease resizing buffer --- .../apache/tajo/tuple/memory/ResizableMemoryBlock.java | 9 ++++----- 
.../apache/tajo/storage/rawfile/DirectRawFileWriter.java | 6 ++---- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java index 6d8d09e792..85f94de9ea 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java @@ -44,6 +44,7 @@ public class ResizableMemoryBlock implements MemoryBlock { public ResizableMemoryBlock(ByteBuf buffer, ResizableLimitSpec limitSpec) { this.buffer = buffer.order(ByteOrder.LITTLE_ENDIAN); this.limitSpec = limitSpec; + this.memoryAddress = this.buffer.hasMemoryAddress() ? this.buffer.memoryAddress() : 0; } public ResizableMemoryBlock(ByteBuf buffer) { @@ -51,9 +52,7 @@ public ResizableMemoryBlock(ByteBuf buffer) { } public ResizableMemoryBlock(ByteBuffer buffer) { - this.buffer = Unpooled.wrappedBuffer(buffer).order(ByteOrder.LITTLE_ENDIAN); - this.memoryAddress = this.buffer.hasMemoryAddress() ? 
this.buffer.memoryAddress() : 0; - this.limitSpec = new ResizableLimitSpec(buffer.capacity()); + this(Unpooled.wrappedBuffer(buffer), new ResizableLimitSpec(buffer.capacity(), buffer.capacity())); } public ResizableMemoryBlock(ResizableLimitSpec limitSpec, boolean isDirect) { @@ -68,7 +67,7 @@ public ResizableMemoryBlock(ResizableLimitSpec limitSpec, boolean isDirect) { @Override public long address() { - return buffer.memoryAddress(); + return memoryAddress; } @Override @@ -133,7 +132,7 @@ public void ensureSize(int size) { throw new RuntimeException("Cannot increase RowBlock anymore."); } - int newBlockSize = limitSpec.increasedSize(Math.max(buffer.capacity(), size)); + int newBlockSize = limitSpec.increasedSize(size); resize(newBlockSize); LOG.info("Increase DirectRowBlock to " + FileUtil.humanReadableByteCount(newBlockSize, false)); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java index 8ad0749301..b5bae95ee9 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java @@ -56,8 +56,6 @@ public class DirectRawFileWriter extends FileAppender { public static final String WRITE_BUFFER_SIZE = "tajo.storage.raw.io.write-buffer.bytes"; public static final int DEFAULT_BUFFER_SIZE = 128 * StorageUnit.KB; - public static final float BUFFER_THRESHHOLD = 0.9f; - protected FileChannel channel; protected RandomAccessFile randomAccessFile; @@ -119,7 +117,7 @@ public void init() throws IOException { } if (rowBlock == null) { - int bufferSize = conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE); + int bufferSize = (int) (conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE) * 1.1f); rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), 
bufferSize, true, meta.getDataFormat()); } @@ -213,7 +211,7 @@ public void addTuple(Tuple t) throws IOException { tupleConverter.convert(t, rowBlock.getWriter()); - if(rowBlock.usage() > BUFFER_THRESHHOLD) { + if(rowBlock.usedMem() > DEFAULT_BUFFER_SIZE) { writeRowBlock(rowBlock); rowBlock.clear(); } From 6ea3f097a8ed05048b9af2a8959097aa03ec4c44 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 12 Nov 2015 12:26:23 +0900 Subject: [PATCH 23/28] missing code --- .../java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java index 85f94de9ea..09faff948f 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ResizableMemoryBlock.java @@ -145,7 +145,7 @@ private void resize(int newSize) { throw new RuntimeException("Resize cannot exceed the capacity limit"); } - if (newSize < buffer.capacity()) { + if (newSize < buffer.writableBytes()) { LOG.warn("The capacity reduction is ignored."); } From 7d59eb19dba9fe523e3b62588d06400de1f46fc2 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 12 Nov 2015 15:34:15 +0900 Subject: [PATCH 24/28] add MaxMetaspaceFreeRatio for the free memory --- .../src/test/java/org/apache/tajo/TajoTestingCluster.java | 4 ++-- tajo-core-tests/pom.xml | 2 +- .../apache/tajo/engine/planner/physical/TestHashJoinExec.java | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java index 3ce4fee47a..9e0e0605f2 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/TajoTestingCluster.java @@ 
-158,8 +158,8 @@ void initPropertiesAndConfigs() { conf.setStrings(ConfVars.PYTHON_CODE_DIR.varname, getClass().getResource("/python").toString()); // Buffer size - conf.setInt(ConfVars.$EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE.varname, 10); - conf.setInt(ConfVars.$EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE.varname, 10); + conf.setInt(ConfVars.$EXECUTOR_EXTERNAL_SORT_BUFFER_SIZE.varname, 1); + conf.setInt(ConfVars.$EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE.varname, 1); /** decrease Hbase thread and memory cache for testing */ //server handler diff --git a/tajo-core-tests/pom.xml b/tajo-core-tests/pom.xml index 2037f0be30..6de5546335 100644 --- a/tajo-core-tests/pom.xml +++ b/tajo-core-tests/pom.xml @@ -375,7 +375,7 @@ ${maven.fork.count} true false - -Xms128m -Xmx800m -XX:+CMSClassUnloadingEnabled -Dfile.encoding=UTF-8 -Dderby.storage.pageSize=1024 -Dderby.stream.error.file=/dev/null + -Xms128m -Xmx800m -XX:MinMetaspaceFreeRatio=10 -XX:MaxMetaspaceFreeRatio=10 -XX:+CMSClassUnloadingEnabled -Dfile.encoding=UTF-8 -Dderby.storage.pageSize=1024 -Dderby.stream.error.file=/dev/null true true diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestHashJoinExec.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestHashJoinExec.java index a4afa7fae1..2f4d66f090 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestHashJoinExec.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/planner/physical/TestHashJoinExec.java @@ -216,6 +216,7 @@ public final void testCheckIfInMemoryInnerJoinIsPossible() throws IOException, T HashJoinExec joinExec = proj.getChild(); assertCheckInnerJoinRelatedFunctions(ctx, phyPlanner, joinNode, joinExec); + exec.close(); } /** From fa59d089910bb2f66d234e382a1889993c43669b Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 12 Nov 2015 15:50:59 +0900 Subject: [PATCH 25/28] Trigger CI (Connect to repo.maven.apache.org) --- CHANGES | 1 + 1 file changed, 1 insertion(+) diff 
--git a/CHANGES b/CHANGES index c36ced1773..3605786281 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,6 @@ Tajo Change Log + Release 0.12.0 - unreleased NEW FEATURES From 07dd470ce4ecdc52ecf8e95774cc29e5b8a30693 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Thu, 12 Nov 2015 16:07:56 +0900 Subject: [PATCH 26/28] Trigger CI (Connect to repo.maven.apache.org) --- CHANGES | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGES b/CHANGES index 3605786281..c36ced1773 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,5 @@ Tajo Change Log - Release 0.12.0 - unreleased NEW FEATURES From 65840ed3384d182b031d3f082942e34b5c5bb681 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 13 Nov 2015 11:51:34 +0900 Subject: [PATCH 27/28] change to get length directly --- .../main/java/org/apache/tajo/conf/TajoConf.java | 2 +- .../org/apache/tajo/tuple/BaseTupleBuilder.java | 2 +- .../apache/tajo/tuple/memory/DirectBufTuple.java | 6 +++--- .../tajo/tuple/memory/HeapRowBlockReader.java | 5 ++--- .../org/apache/tajo/tuple/memory/HeapTuple.java | 11 ++++++++--- .../tajo/tuple/memory/OffHeapRowBlockReader.java | 8 ++------ .../tajo/tuple/memory/OffHeapRowBlockWriter.java | 2 +- .../org/apache/tajo/tuple/memory/UnSafeTuple.java | 15 ++++++++++----- .../apache/tajo/tuple/memory/ZeroCopyTuple.java | 10 +++------- 9 files changed, 31 insertions(+), 30 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java index 9d6af9f3fc..a2c1fb820e 100644 --- a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java +++ b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java @@ -341,7 +341,7 @@ public static enum ConfVars implements ConfigKey { Validators.min("0")), $EXECUTOR_GROUPBY_INMEMORY_HASH_THRESHOLD("tajo.executor.groupby.in-memory-hash-threshold-mb", 64l, Validators.min("0")), - $EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE("tajo.executor.hash-shuffle.buffer-mb", 200, Validators.min("1")), + 
$EXECUTOR_HASH_SHUFFLE_BUFFER_SIZE("tajo.executor.hash-shuffle.buffer-mb", 100, Validators.min("1")), $MAX_OUTPUT_FILE_SIZE("tajo.query.max-outfile-size-mb", 0), // zero means infinite $CODEGEN("tajo.executor.codegen.enabled", false), // Runtime code generation (todo this is broken) $AGG_HASH_TABLE_SIZE("tajo.executor.aggregate.hash-table.size", 10000), diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java b/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java index 00328829a2..a594898245 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java @@ -92,7 +92,7 @@ public HeapTuple buildToHeapTuple() { public UnSafeTuple buildToZeroCopyTuple() { UnSafeTuple zcTuple = new UnSafeTuple(); - zcTuple.set(memoryBlock, memoryBlock.readerPosition(), memoryBlock.readableBytes(), dataTypes()); + zcTuple.set(memoryBlock, memoryBlock.readerPosition(), dataTypes()); return zcTuple; } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/DirectBufTuple.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/DirectBufTuple.java index 10e493f2e8..1852e6de3e 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/DirectBufTuple.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/DirectBufTuple.java @@ -28,11 +28,11 @@ public class DirectBufTuple extends UnSafeTuple implements Deallocatable { private MemoryBlock memoryBlock; - public DirectBufTuple(int length, DataType[] types) { - ByteBuffer bb = ByteBuffer.allocateDirect(length).order(ByteOrder.LITTLE_ENDIAN); + public DirectBufTuple(DataType[] types) { + ByteBuffer bb = ByteBuffer.allocateDirect(getLength()).order(ByteOrder.LITTLE_ENDIAN); memoryBlock = new ResizableMemoryBlock(bb); - set(memoryBlock, 0, length, types); + set(memoryBlock, 0, types); } @Override diff --git 
a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapRowBlockReader.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapRowBlockReader.java index dd377cff0e..ec5033b2f1 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapRowBlockReader.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapRowBlockReader.java @@ -48,10 +48,9 @@ public long remainForRead() { public boolean next(HeapTuple tuple) { if (curRowIdxForRead < rows) { - int recordLen = memoryBlock.getInt(curPosForRead); - tuple.set(memoryBlock, curPosForRead, recordLen, dataTypes); + tuple.set(memoryBlock, curPosForRead, dataTypes); - curPosForRead += recordLen; + curPosForRead += tuple.getLength(); curRowIdxForRead++; memoryBlock.readerPosition(curPosForRead); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapTuple.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapTuple.java index 9f508b3a2a..c6c7daf0b2 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapTuple.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/HeapTuple.java @@ -41,16 +41,16 @@ public class HeapTuple extends ZeroCopyTuple implements Cloneable { private DataType[] types; @Override - public void set(MemoryBlock memoryBlock, int relativePos, int length, DataType[] types) { + public void set(MemoryBlock memoryBlock, int relativePos, DataType[] types) { this.buffer = memoryBlock.getBuffer(); this.types = types; - super.set(relativePos, length); + super.set(relativePos); } protected void set(final byte[] bytes, final DataType[] types) { this.buffer = Unpooled.wrappedBuffer(bytes).order(ByteOrder.LITTLE_ENDIAN); this.types = types; - super.set(0, bytes.length); + super.set(0); } @Override @@ -58,6 +58,11 @@ public int size() { return types.length; } + @Override + public int getLength() { + return buffer.getInt(getRelativePos()); + } + @Override public TajoDataTypes.Type type(int fieldId) { return 
types[fieldId].getType(); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockReader.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockReader.java index ccaeffc217..c5673e3d2b 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockReader.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockReader.java @@ -18,7 +18,6 @@ package org.apache.tajo.tuple.memory; -import io.netty.util.internal.PlatformDependent; import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.exception.TajoInternalError; import org.apache.tajo.tuple.RowBlockReader; @@ -54,11 +53,8 @@ public long remainForRead() { public boolean next(ZeroCopyTuple tuple) { if (curRowIdxForRead < rows) { - long recordStartPtr = memoryBlock.address() + curPosForRead; - int recordLen = PlatformDependent.getInt(recordStartPtr); - tuple.set(memoryBlock, curPosForRead, recordLen, dataTypes); - - curPosForRead += recordLen; + tuple.set(memoryBlock, curPosForRead, dataTypes); + curPosForRead += tuple.getLength(); curRowIdxForRead++; memoryBlock.readerPosition(curPosForRead); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java index 57a1e89a3f..8bbc8884ca 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java @@ -82,7 +82,7 @@ public ZeroCopyTuple addTuple(Tuple tuple) { putTuple(tuple); UnSafeTuple unSafeTuple = new UnSafeTuple(); - unSafeTuple.set(rowBlock.getMemory(), prevPos, rowBlock.getMemory().writerPosition() - prevPos, dataTypes()); + unSafeTuple.set(rowBlock.getMemory(), prevPos, dataTypes()); return unSafeTuple; } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java 
b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java index 4781507686..26f7df3447 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTuple.java @@ -46,18 +46,18 @@ public class UnSafeTuple extends ZeroCopyTuple { private DataType[] types; @Override - public void set(MemoryBlock memoryBlock, int relativePos, int length, DataType[] types) { + public void set(MemoryBlock memoryBlock, int relativePos, DataType[] types) { Preconditions.checkArgument(memoryBlock.hasAddress()); this.memoryBlock = memoryBlock; this.types = types; - super.set(relativePos, length); + super.set(relativePos); } public void set(UnSafeTuple tuple) { this.memoryBlock = tuple.memoryBlock; this.types = tuple.types; - super.set(tuple.getRelativePos(), tuple.getLength()); + super.set(tuple.getRelativePos()); } @Override @@ -65,6 +65,11 @@ public int size() { return types.length; } + @Override + public int getLength() { + return PlatformDependent.getInt(address()); + } + @Override public TajoDataTypes.Type type(int fieldId) { return types[fieldId].getType(); @@ -110,9 +115,9 @@ private int getFieldOffset(int fieldId) { public long getFieldAddr(int fieldId) { int fieldOffset = getFieldOffset(fieldId); - if (fieldOffset < 0 || fieldOffset > length) { + if (fieldOffset < 0 || fieldOffset > getLength()) { throw new RuntimeException("Invalid Access. 
Field : " + fieldId - + ", Offset:" + fieldOffset + ", Record length:" + length); + + ", Offset:" + fieldOffset + ", Record length:" + getLength()); } return address() + fieldOffset; } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ZeroCopyTuple.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ZeroCopyTuple.java index 1f4f57eab9..e9108f2007 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ZeroCopyTuple.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/ZeroCopyTuple.java @@ -24,22 +24,18 @@ public abstract class ZeroCopyTuple implements Tuple { protected int relativePos; - protected int length; - public abstract void set(MemoryBlock memoryBlock, int relativePos, int length, DataType[] types); + public abstract void set(MemoryBlock memoryBlock, int relativePos, DataType[] types); - void set(int relativePos, int length) { + void set(int relativePos) { this.relativePos = relativePos; - this.length = length; } public int getRelativePos() { return relativePos; } - public int getLength() { - return length; - } + public abstract int getLength(); @Override public Tuple clone() throws CloneNotSupportedException { From bf5b86a65ad7885b136444f4368122e9abb8a51b Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 16 Nov 2015 12:20:02 +0900 Subject: [PATCH 28/28] remove unused codes --- .../org/apache/tajo/tuple/BaseTupleBuilder.java | 11 ++--------- .../tajo/tuple/memory/CompactRowBlockWriter.java | 13 +------------ .../tajo/tuple/memory/OffHeapRowBlockWriter.java | 15 ++------------- .../tajo/tuple/memory/OffHeapRowWriter.java | 2 +- .../org/apache/tajo/tuple/memory/RowWriter.java | 4 +--- .../tajo/tuple/memory/TestMemoryRowBlock.java | 2 +- .../physical/HashShuffleFileWriteExec.java | 2 +- .../exec/NonForwardQueryResultFileScanner.java | 2 +- .../exec/NonForwardQueryResultSystemScanner.java | 2 +- .../apache/tajo/master/exec/QueryExecutor.java | 2 +- .../java/org/apache/tajo/jdbc/TestResultSet.java | 2 +- 
.../tajo/storage/rawfile/DirectRawFileWriter.java | 7 ++++--- 12 files changed, 17 insertions(+), 47 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java b/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java index a594898245..ebdcc26a22 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/BaseTupleBuilder.java @@ -19,8 +19,6 @@ package org.apache.tajo.tuple; import org.apache.tajo.common.TajoDataTypes.DataType; -import org.apache.tajo.exception.NotImplementedException; -import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.storage.Tuple; import org.apache.tajo.tuple.memory.*; import org.apache.tajo.unit.StorageUnit; @@ -67,20 +65,15 @@ public void endRow() { } @Override - public void putTuple(Tuple tuple) { + public void addTuple(Tuple tuple) { if (tuple instanceof UnSafeTuple) { UnSafeTuple unSafeTuple = TUtil.checkTypeAndGet(tuple, UnSafeTuple.class); - putTuple(unSafeTuple); + addTuple(unSafeTuple); } else { OffHeapRowBlockUtils.convert(tuple, this); } } - @Override - public ZeroCopyTuple addTuple(Tuple tuple) { - throw new TajoRuntimeException(new NotImplementedException()); - } - @Override public Tuple build() { return buildToHeapTuple(); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java index 5bfde6d910..a88d2f1a5c 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/CompactRowBlockWriter.java @@ -401,19 +401,8 @@ public void putProtoDatum(ProtobufDatum val) { putBlob(val.asByteArray()); } - @Override - public void putTuple(Tuple tuple) { + public void addTuple(Tuple tuple) { OffHeapRowBlockUtils.convert(tuple, this); } - - @Override - public Tuple addTuple(Tuple 
tuple) { - putTuple(tuple); - try { - return tuple.clone(); - } catch (CloneNotSupportedException e) { - throw new TajoInternalError(e); - } - } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java index 8bbc8884ca..9f3d8a2c36 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowBlockWriter.java @@ -65,24 +65,13 @@ public TajoDataTypes.DataType[] dataTypes() { @Override - public void putTuple(Tuple tuple) { + public void addTuple(Tuple tuple) { if (tuple instanceof UnSafeTuple) { UnSafeTuple unSafeTuple = TUtil.checkTypeAndGet(tuple, UnSafeTuple.class); - putTuple(unSafeTuple); + addTuple(unSafeTuple); rowBlock.setRows(rowBlock.rows() + 1); } else { OffHeapRowBlockUtils.convert(tuple, this); } } - - @Override - public ZeroCopyTuple addTuple(Tuple tuple) { - int prevPos = rowBlock.getMemory().writerPosition(); - - putTuple(tuple); - - UnSafeTuple unSafeTuple = new UnSafeTuple(); - unSafeTuple.set(rowBlock.getMemory(), prevPos, dataTypes()); - return unSafeTuple; - } } diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java index 3bb26a147b..f082762ca9 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/OffHeapRowWriter.java @@ -289,7 +289,7 @@ public void putProtoDatum(ProtobufDatum val) { } - protected void putTuple(UnSafeTuple tuple) { + protected void addTuple(UnSafeTuple tuple) { int length = tuple.getLength(); ensureSize(length); PlatformDependent.copyMemory(tuple.address(), address() + position(), length); diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java 
b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java index 93a15c5016..0393714d3c 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/RowWriter.java @@ -76,7 +76,5 @@ public interface RowWriter { void putProtoDatum(ProtobufDatum datum); - void putTuple(Tuple tuple); - - Tuple addTuple(Tuple tuple); + void addTuple(Tuple tuple); } diff --git a/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java index 15f0054922..a6003c773d 100644 --- a/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java +++ b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestMemoryRowBlock.java @@ -274,7 +274,7 @@ public void testVTuplePutAndGetBenchmarkViaDirectRowEncoder() { VTuple tuple = new VTuple(schema.length); for (int i = 0; i < rowNum; i++) { fillVTuple(i, tuple); - rowBlock.getWriter().putTuple(tuple); + rowBlock.getWriter().addTuple(tuple); } long writeEnd = System.currentTimeMillis(); LOG.info("Writing takes " + (writeEnd - writeStart) + " msec"); diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java index f8a3778d2b..49b0e1112e 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/HashShuffleFileWriteExec.java @@ -136,7 +136,7 @@ public Tuple next() throws IOException { long prevUsedMem = rowBlock.usedMem(); totalBufferCapacity -= rowBlock.capacity(); - writer.putTuple(tuple); + writer.addTuple(tuple); numRows++; totalBufferCapacity += rowBlock.capacity(); // calculate resizeable buffer capacity diff --git 
a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java index bf0e08f94f..80275ce6cd 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultFileScanner.java @@ -288,7 +288,7 @@ public void run() { eof = true; break; } else { - rowBlock.getWriter().putTuple(tuple); + rowBlock.getWriter().addTuple(tuple); currentNumRows++; if (currentNumRows >= maxRow) { eof = true; diff --git a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java index 5d3d178021..7f6db9bf59 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/exec/NonForwardQueryResultSystemScanner.java @@ -646,7 +646,7 @@ public SerializedResultSet nextRowBlock(int fetchRowNum) throws IOException { rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(tableDesc.getLogicalSchema())); } - rowBlock.getWriter().putTuple(currentTuple); + rowBlock.getWriter().addTuple(currentTuple); currentRow++; rowCount++; diff --git a/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java b/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java index 47807445ef..e260c003fd 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/exec/QueryExecutor.java @@ -342,7 +342,7 @@ public void execNonFromQuery(QueryContext queryContext, Session session, String MemoryRowBlock rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema)); try { - rowBlock.getWriter().putTuple(outTuple); + rowBlock.getWriter().addTuple(outTuple); MemoryBlock memoryBlock 
= rowBlock.getMemory(); ByteBuffer uncompressed = memoryBlock.getBuffer().nioBuffer(0, memoryBlock.readableBytes()); diff --git a/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java b/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java index 341d676732..4c926bbf2c 100644 --- a/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java +++ b/tajo-jdbc/src/test/java/org/apache/tajo/jdbc/TestResultSet.java @@ -92,7 +92,7 @@ public static void setup() throws Exception { tuple.put(1, DatumFactory.createInt4(i + 1)); written += key.length() + Integer.SIZE; appender.addTuple(tuple); - rowBlock.getWriter().putTuple(tuple); + rowBlock.getWriter().addTuple(tuple); } appender.close(); stats.setNumRows(tupleNum); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java index a6a38e259e..9cbb7a029a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rawfile/DirectRawFileWriter.java @@ -56,6 +56,7 @@ public class DirectRawFileWriter extends FileAppender { public static final String WRITE_BUFFER_SIZE = "tajo.storage.raw.io.write-buffer.bytes"; public static final int DEFAULT_BUFFER_SIZE = 128 * StorageUnit.KB; + private static final float OVERFLOW_RATIO = 1.1f; protected FileChannel channel; protected RandomAccessFile randomAccessFile; @@ -117,7 +118,7 @@ public void init() throws IOException { } if (rowBlock == null) { - int bufferSize = (int) (conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE) * 1.1f); + int bufferSize = (int) (conf.getInt(WRITE_BUFFER_SIZE, DEFAULT_BUFFER_SIZE) * OVERFLOW_RATIO); rowBlock = new MemoryRowBlock(SchemaUtil.toDataTypes(schema), bufferSize, true, meta.getDataFormat()); } @@ -151,7 +152,7 @@ public void convert(Tuple tuple, 
RowWriter writer) { stats.analyzeField(i, tuple); } // write direct to memory - writer.putTuple(tuple); + writer.addTuple(tuple); } else { writer.startRow(); @@ -164,7 +165,7 @@ public void convert(Tuple tuple, RowWriter writer) { } } else { // write direct to memory - writer.putTuple(tuple); + writer.addTuple(tuple); } } };