From 8d6218ca98f6d80e5f63a5aa9839f0397d8a50ac Mon Sep 17 00:00:00 2001
From: jhkim
Date: Tue, 9 Dec 2014 12:21:34 +0900
Subject: [PATCH] TAJO-1236: Remove slow 'new String' operation in parquet
 format

---
 .../storage/parquet/TajoRecordConverter.java  | 34 ++++++++-----------
 .../storage/parquet/TajoWriteSupport.java     | 23 ++++++-------
 2 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoRecordConverter.java b/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoRecordConverter.java
index 7c3d79d778..a091eac29d 100644
--- a/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoRecordConverter.java
+++ b/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoRecordConverter.java
@@ -18,29 +18,23 @@
  */
 package org.apache.tajo.storage.parquet;
 
-import com.google.protobuf.Message;
 import com.google.protobuf.InvalidProtocolBufferException;
-
-import java.nio.ByteBuffer;
-
-import parquet.io.api.GroupConverter;
-import parquet.io.api.Converter;
-import parquet.io.api.PrimitiveConverter;
-import parquet.io.api.Binary;
-import parquet.schema.Type;
-import parquet.schema.GroupType;
-
+import com.google.protobuf.Message;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.Schema;
 import org.apache.tajo.common.TajoDataTypes;
 import org.apache.tajo.common.TajoDataTypes.DataType;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.catalog.Column;
+import org.apache.tajo.datum.*;
 import org.apache.tajo.storage.Tuple;
 import org.apache.tajo.storage.VTuple;
-import org.apache.tajo.datum.DatumFactory;
-import org.apache.tajo.datum.Datum;
-import org.apache.tajo.datum.BlobDatum;
-import org.apache.tajo.datum.NullDatum;
-import org.apache.tajo.datum.ProtobufDatumFactory;
+import parquet.io.api.Binary;
+import parquet.io.api.Converter;
+import parquet.io.api.GroupConverter;
+import parquet.io.api.PrimitiveConverter;
+import parquet.schema.GroupType;
+import parquet.schema.Type;
+
+import java.nio.ByteBuffer;
 
 /**
  * Converter to convert a Parquet record into a Tajo Tuple.
@@ -222,7 +216,7 @@ public FieldCharConverter(ParentValueContainer parent) {
 
     @Override
     final public void addBinary(Binary value) {
-      parent.add(DatumFactory.createChar(value.toStringUsingUTF8()));
+      parent.add(DatumFactory.createChar(value.getBytes()));
     }
   }
 
@@ -343,7 +337,7 @@ public FieldTextConverter(ParentValueContainer parent) {
 
     @Override
     final public void addBinary(Binary value) {
-      parent.add(DatumFactory.createText(value.toStringUsingUTF8()));
+      parent.add(DatumFactory.createText(value.getBytes()));
    }
  }
 
diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java b/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java
index 35165de791..86511315df 100644
--- a/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java
+++ b/tajo-storage/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java
@@ -18,10 +18,12 @@
  */
 package org.apache.tajo.storage.parquet;
 
-import java.util.Map;
-import java.util.HashMap;
-import java.util.List;
-
+import org.apache.hadoop.conf.Configuration;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.common.TajoDataTypes;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.storage.Tuple;
 import parquet.hadoop.api.WriteSupport;
 import parquet.io.api.Binary;
 import parquet.io.api.RecordConsumer;
@@ -29,12 +31,9 @@
 import parquet.schema.MessageType;
 import parquet.schema.Type;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.catalog.Column;
-import org.apache.tajo.common.TajoDataTypes;
-import org.apache.tajo.storage.Tuple;
-import org.apache.tajo.datum.Datum;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 /**
  * Tajo implementation of {@link WriteSupport} for {@link Tuple}s.
@@ -116,7 +115,7 @@ private void writeRecordFields(GroupType schema, Schema tajoSchema,
   private void writeValue(Type fieldType, Column column, Datum datum) {
     switch (column.getDataType().getType()) {
       case BOOLEAN:
-        recordConsumer.addBoolean((Boolean) datum.asBool());
+        recordConsumer.addBoolean(datum.asBool());
         break;
       case BIT:
       case INT2:
@@ -134,7 +133,7 @@ private void writeValue(Type fieldType, Column column, Datum datum) {
         break;
       case CHAR:
       case TEXT:
-        recordConsumer.addBinary(Binary.fromString(datum.asChars()));
+        recordConsumer.addBinary(Binary.fromByteArray(datum.asTextBytes()));
         break;
       case PROTOBUF:
      case BLOB: