From 5ab1d216ed1875073f44c58f3d63940e453a0077 Mon Sep 17 00:00:00 2001 From: Andy Schlaikjer Date: Thu, 29 Mar 2012 23:17:23 -0700 Subject: [PATCH 1/6] Bumps junit version to 4.10 --- libraries.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries.properties b/libraries.properties index 9befeccbf..ac882e249 100644 --- a/libraries.properties +++ b/libraries.properties @@ -10,7 +10,7 @@ hadoop-core.version=0.20.2 hadoop-lzo.version=0.4.15 hive-serde.version=0.8.0 json-simple.version=1.1 -junit.version=4.5 +junit.version=4.10 libthrift.version=0.5.0 log4j.version=1.2.15 mahout-collections.version=1.0 From a05237854dc306c49ef0ddb1a3dc3de0bf279870 Mon Sep 17 00:00:00 2001 From: Andy Schlaikjer Date: Thu, 29 Mar 2012 23:17:52 -0700 Subject: [PATCH 2/6] Adds outputSchema() impl to JsonStringToMap This ensures that output schema type is properly reported to pig, reducing possible confusion between actual return value type and user specified schema type. --- .../pig/piggybank/JsonStringToMap.java | 19 +++++++++++++++---- .../pig/piggybank/TestJsonStringToMap.java | 16 +++++++++++++--- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java b/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java index fded38194..8ea04c40e 100644 --- a/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java +++ b/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java @@ -8,6 +8,9 @@ import org.apache.pig.EvalFunc; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.Tuple; +import org.apache.pig.impl.logicalLayer.schema.Schema; +import org.apache.pig.impl.util.Utils; +import org.apache.pig.parser.ParserException; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; @@ -16,15 +19,23 @@ import com.twitter.elephantbird.pig.util.PigCounterHelper; /** - *

Transforms a Json string into a Pig map.
- * Only goes 1 level deep -- all value representations are their toString() representations.

+ * Transforms a Json string into a Pig map whose value type is chararray. Only goes one level deep; + * All input map values are converted to strings via {@link Object#toString()}. */ -@SuppressWarnings("rawtypes") -public class JsonStringToMap extends EvalFunc { +public class JsonStringToMap extends EvalFunc> { private static final Logger LOG = LogManager.getLogger(JsonStringToMap.class); private final JSONParser jsonParser = new JSONParser(); private final PigCounterHelper counterHelper = new PigCounterHelper(); + @Override + public Schema outputSchema(Schema input) { + try { + return Utils.getSchemaFromString("map: [chararray]"); + } catch (ParserException e) { + throw new RuntimeException(e); + } + } + @Override public Map exec(Tuple input) throws IOException { try { diff --git a/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java b/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java index 61b32a285..7e33f24d2 100644 --- a/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java +++ b/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java @@ -1,20 +1,30 @@ package com.twitter.elephantbird.pig.piggybank; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + import java.io.IOException; import java.util.Arrays; import java.util.Map; -import junit.framework.TestCase; - import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; +import org.apache.pig.impl.logicalLayer.schema.Schema; import org.junit.Test; -public class TestJsonStringToMap extends TestCase { +public class TestJsonStringToMap { private static final TupleFactory tupleFactory_ = TupleFactory.getInstance(); JsonStringToMap udf_ = new JsonStringToMap(); + @Test + public void testSchema() { + Schema schema = udf_.outputSchema(null); + assertNotNull(schema); + assertEquals("{map: map[chararray]}", schema.toString()); + } + @Test public final void testStandard() throws IOException, ExecException { Tuple input = tupleFactory_.newTuple(Arrays.asList("{\"name\": \"value\", \"number\": 2}")); From 202b22e1eb2e7cc32ffa9422ec62b07b98c0bdb0 Mon Sep 17 00:00:00 2001 From: Andy Schlaikjer Date: Fri, 30 Mar 2012 08:35:10 -0700 Subject: [PATCH 3/6] Whitespace in TestJsonStringToMap --- .../elephantbird/pig/piggybank/TestJsonStringToMap.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java b/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java index 7e33f24d2..f8a768093 100644 --- a/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java +++ b/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java @@ -29,9 +29,7 @@ public void testSchema() { public final void testStandard() throws IOException, ExecException { Tuple input = tupleFactory_.newTuple(Arrays.asList("{\"name\": \"value\", \"number\": 2}")); Map result = udf_.exec(input); - assertTrue("It should return a Map", result instanceof Map); - assertEquals("value", result.get("name")); assertEquals("It is expected to return numbers as strings", "2", result.get("number")); } @@ -40,8 +38,6 @@ public final void testStandard() throws IOException, ExecException { public final void testNestedJson() throws IOException, ExecException { Tuple input = tupleFactory_.newTuple(Arrays.asList("{\"name\": \"value\", \"nestedJson\": {\"json\": \"ihazit\"}}")); Map result = udf_.exec(input); - assertTrue("Nested Json should just return as a String", result.get("nestedJson") instanceof String); } - } From 9ebd2df33ede8e1c2dc831c97cb7418c28861dde Mon Sep 17 00:00:00 2001 From: Andy Schlaikjer Date: Fri, 30 Mar 2012 08:35:50 -0700 Subject: [PATCH 4/6] Output schema name changed from map to m --- .../com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java b/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java index 8ea04c40e..13b156a77 100644 --- a/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java +++ b/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java @@ -30,7 +30,7 @@ public class JsonStringToMap extends EvalFunc> { @Override public Schema outputSchema(Schema input) { try { - return Utils.getSchemaFromString("map: [chararray]"); + return Utils.getSchemaFromString("m: [chararray]"); } catch (ParserException e) { throw new RuntimeException(e); } From 1c463f7c9fa2201071eaf6203629c490d32a35f0 Mon Sep 17 00:00:00 2001 From: Andy Schlaikjer Date: Fri, 30 Mar 2012 09:08:42 -0700 Subject: [PATCH 5/6] Renames output schema name from m to json --- .../com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java b/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java index 13b156a77..fbe144b3f 100644 --- a/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java +++ b/src/java/com/twitter/elephantbird/pig/piggybank/JsonStringToMap.java @@ -30,7 +30,7 @@ public class JsonStringToMap extends EvalFunc> { @Override public Schema outputSchema(Schema input) { try { - return Utils.getSchemaFromString("m: [chararray]"); + return Utils.getSchemaFromString("json: [chararray]"); } catch (ParserException e) { throw new RuntimeException(e); } From debcbe970c03bcdfa284172a49424bf08bc228c4 Mon Sep 17 00:00:00 2001 From: Andy Schlaikjer Date: Fri, 30 Mar 2012 09:08:52 -0700 Subject: [PATCH 6/6] Adds unit test for JsonStringToMap evaled within PigServer --- .../pig/piggybank/TestJsonStringToMap.java | 53 +++++++++++++++++-- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java b/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java index f8a768093..9df86863b 100644 --- a/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java +++ b/src/test/com/twitter/elephantbird/pig/piggybank/TestJsonStringToMap.java @@ -4,25 +4,32 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import java.io.File; import java.io.IOException; +import java.io.PrintWriter; import java.util.Arrays; +import java.util.Iterator; import java.util.Map; +import org.apache.pig.ExecType; +import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.junit.Test; +import com.google.common.collect.ImmutableMap; + public class TestJsonStringToMap { private static final TupleFactory tupleFactory_ = TupleFactory.getInstance(); - JsonStringToMap udf_ = new JsonStringToMap(); + private final JsonStringToMap udf_ = new JsonStringToMap(); @Test public void testSchema() { Schema schema = udf_.outputSchema(null); assertNotNull(schema); - assertEquals("{map: map[chararray]}", schema.toString()); + assertEquals("{json: map[chararray]}", schema.toString()); } @Test @@ -36,8 +43,46 @@ public final void testStandard() throws IOException, ExecException { @Test public final void testNestedJson() throws IOException, ExecException { - Tuple input = tupleFactory_.newTuple(Arrays.asList("{\"name\": \"value\", \"nestedJson\": {\"json\": \"ihazit\"}}")); + Tuple input = tupleFactory_.newTuple(Arrays + .asList("{\"name\": \"value\", \"nestedJson\": {\"json\": \"ihazit\"}}")); Map result = udf_.exec(input); - assertTrue("Nested Json should just return as a String", result.get("nestedJson") instanceof String); + assertTrue("Nested Json should just return as a String", + result.get("nestedJson") instanceof String); + } + + @Test + public final void testInThePig() throws IOException { + File tempFile = File.createTempFile("test", ".txt"); + String tempFilename = tempFile.getAbsolutePath(); + PrintWriter pw = new PrintWriter(tempFile); + pw.println("1\t{\"name\": \"bob\", \"number\": 2}"); + pw.close(); + PigServer pig = new PigServer(ExecType.LOCAL); + try { + pig.registerQuery(String.format("DEFINE JsonStringToMap %s();", + JsonStringToMap.class.getName())); + pig.registerQuery(String + .format("x = LOAD '%s' AS (id: int, value: chararray);", tempFilename)); + pig.registerQuery(String.format("x = FOREACH x GENERATE id, JsonStringToMap(value);", + tempFilename)); + Schema schema = pig.dumpSchema("x"); + assertNotNull(schema); + assertEquals("{id: int,json: map[chararray]}", schema.toString()); + Iterator x = pig.openIterator("x"); + assertNotNull(x); + assertTrue(x.hasNext()); + Tuple t = x.next(); + assertNotNull(t); + assertEquals(2, t.size()); + Map actual = (Map) t.get(1); + assertNotNull(actual); + Map expected = ImmutableMap. of("name", "bob", "number", "2"); + assertEquals(expected.size(), actual.size()); + for (Map.Entry e : expected.entrySet()) { + assertEquals(e.getValue(), actual.get(e.getKey())); + } + } finally { + pig.shutdown(); + } } }