From 31ba465ae8cacea201f1259b2f2ba3febacad98e Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 17 Aug 2015 14:36:00 +0900 Subject: [PATCH] TAJO-1777: JsonLineDeserializer returns invalid unicode text, if contains control character --- .../storage/json/JsonLineDeserializer.java | 15 ++++----- .../tajo/storage/json/TestJsonSerDe.java | 32 +++++++++++++++++++ .../testUnicodeWithControlChar.json | 1 + 3 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java index c720118e13..9216025c20 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java @@ -21,23 +21,22 @@ import com.facebook.presto.hive.shaded.com.google.common.collect.Lists; import io.netty.buffer.ByteBuf; +import io.netty.util.CharsetUtil; import net.minidev.json.JSONObject; import net.minidev.json.parser.JSONParser; import net.minidev.json.parser.ParseException; -import org.apache.tajo.catalog.*; import org.apache.commons.net.util.Base64; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.*; import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.datum.DatumFactory; import org.apache.tajo.datum.NullDatum; -import org.apache.tajo.datum.TextDatum; import org.apache.tajo.exception.NotImplementedException; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.text.TextLineDeserializer; import org.apache.tajo.storage.text.TextLineParsingError; import java.io.IOException; +import java.nio.charset.CharsetDecoder; import java.util.Map; public class JsonLineDeserializer extends TextLineDeserializer { @@ -46,6 +45,7 @@ public class JsonLineDeserializer extends TextLineDeserializer { // Full Path -> Type private final Map types; private final String [] projectedPaths; + private final CharsetDecoder decoder = CharsetUtil.getDecoder(CharsetUtil.UTF_8); public JsonLineDeserializer(Schema schema, TableMeta meta, Column [] projected) { super(schema, meta); @@ -214,17 +214,16 @@ private void getValue(JSONObject object, @Override public void deserialize(ByteBuf buf, Tuple output) throws IOException, TextLineParsingError { - byte[] line = new byte[buf.readableBytes()]; - buf.readBytes(line); + String line = decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString(); JSONObject object; try { object = (JSONObject) parser.parse(line); } catch (ParseException pe) { - throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe); + throw new TextLineParsingError(line, pe); } catch (ArrayIndexOutOfBoundsException ae) { // truncated value - throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), ae); + throw new TextLineParsingError(line, ae); } for (int i = 0; i < projectedPaths.length; i++) { diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java index 809508147e..88d7536732 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java @@ -93,4 +93,36 @@ public void testVarioutType() throws IOException { assertEquals(baseTuple, tuple); } + + @Test + public void testUnicodeWithControlChar() throws IOException { + TajoConf conf = new TajoConf(); + + TableMeta meta = CatalogUtil.newTableMeta("JSON"); + Path tablePath = new Path(getResourcePath("dataset", "TestJsonSerDe"), "testUnicodeWithControlChar.json"); + FileSystem fs = FileSystem.getLocal(conf); + FileStatus status = fs.getFileStatus(tablePath); + FileFragment fragment = new FileFragment("table", tablePath, 0, status.getLen()); + + Schema schema = new Schema(); + schema.addColumn("col1", TajoDataTypes.Type.TEXT); + schema.addColumn("col2", TajoDataTypes.Type.TEXT); + schema.addColumn("col3", TajoDataTypes.Type.TEXT); + Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, fragment); + scanner.init(); + + Tuple tuple = scanner.next(); + assertNotNull(tuple); + assertNull(scanner.next()); + scanner.close(); + + + Tuple baseTuple = new VTuple(new Datum[] { + DatumFactory.createText("tajo"), + DatumFactory.createText("타조"), + DatumFactory.createText("타\n조") + }); + + assertEquals(baseTuple, tuple); + } } diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json new file mode 100644 index 0000000000..5446469887 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json @@ -0,0 +1 @@ +{"col1": "tajo", "col2":"타조", "col3":"타\n조"}