Skip to content

Commit

Permalink
Merge pull request twitter#172 from sagemintblue/hazen_json_string_to…
Browse files Browse the repository at this point in the history
…_map_output_schema

Add outputSchema impl to JsonStringToMap UDF
  • Loading branch information
rangadi committed Mar 30, 2012
2 parents 4ee12ef + debcbe9 commit fe8fa0c
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 14 deletions.
2 changes: 1 addition & 1 deletion libraries.properties
Expand Up @@ -10,7 +10,7 @@ hadoop-core.version=0.20.2
hadoop-lzo.version=0.4.15
hive-serde.version=0.8.0
json-simple.version=1.1
junit.version=4.5
junit.version=4.10
libthrift.version=0.5.0
log4j.version=1.2.15
mahout-collections.version=1.0
Expand Down
Expand Up @@ -8,6 +8,9 @@
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
Expand All @@ -16,15 +19,23 @@
import com.twitter.elephantbird.pig.util.PigCounterHelper;

/**
* <p>Transforms a Json string into a Pig map.<br>
* Only goes 1 level deep -- all value representations are their toString() representations.</p>
* Transforms a Json string into a Pig map whose value type is chararray. Only goes one level deep;
* All input map values are converted to strings via {@link Object#toString()}.
*/
@SuppressWarnings("rawtypes")
public class JsonStringToMap extends EvalFunc<Map> {
public class JsonStringToMap extends EvalFunc<Map<String, String>> {
private static final Logger LOG = LogManager.getLogger(JsonStringToMap.class);
private final JSONParser jsonParser = new JSONParser();
private final PigCounterHelper counterHelper = new PigCounterHelper();

@Override
public Schema outputSchema(Schema input) {
try {
return Utils.getSchemaFromString("json: [chararray]");
} catch (ParserException e) {
throw new RuntimeException(e);
}
}

@Override
public Map<String, String> exec(Tuple input) throws IOException {
try {
Expand Down
@@ -1,37 +1,88 @@
package com.twitter.elephantbird.pig.piggybank;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;

import junit.framework.TestCase;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.junit.Test;

public class TestJsonStringToMap extends TestCase {
import com.google.common.collect.ImmutableMap;

public class TestJsonStringToMap {
private static final TupleFactory tupleFactory_ = TupleFactory.getInstance();
JsonStringToMap udf_ = new JsonStringToMap();
private final JsonStringToMap udf_ = new JsonStringToMap();

@Test
public void testSchema() {
Schema schema = udf_.outputSchema(null);
assertNotNull(schema);
assertEquals("{json: map[chararray]}", schema.toString());
}

@Test
public final void testStandard() throws IOException, ExecException {
Tuple input = tupleFactory_.newTuple(Arrays.asList("{\"name\": \"value\", \"number\": 2}"));
Map<String, String> result = udf_.exec(input);

assertTrue("It should return a Map", result instanceof Map<?, ?>);

assertEquals("value", result.get("name"));
assertEquals("It is expected to return numbers as strings", "2", result.get("number"));
}

@Test
public final void testNestedJson() throws IOException, ExecException {
Tuple input = tupleFactory_.newTuple(Arrays.asList("{\"name\": \"value\", \"nestedJson\": {\"json\": \"ihazit\"}}"));
Tuple input = tupleFactory_.newTuple(Arrays
.asList("{\"name\": \"value\", \"nestedJson\": {\"json\": \"ihazit\"}}"));
Map<String, String> result = udf_.exec(input);

assertTrue("Nested Json should just return as a String", result.get("nestedJson") instanceof String);
assertTrue("Nested Json should just return as a String",
result.get("nestedJson") instanceof String);
}

@Test
public final void testInThePig() throws IOException {
File tempFile = File.createTempFile("test", ".txt");
String tempFilename = tempFile.getAbsolutePath();
PrintWriter pw = new PrintWriter(tempFile);
pw.println("1\t{\"name\": \"bob\", \"number\": 2}");
pw.close();
PigServer pig = new PigServer(ExecType.LOCAL);
try {
pig.registerQuery(String.format("DEFINE JsonStringToMap %s();",
JsonStringToMap.class.getName()));
pig.registerQuery(String
.format("x = LOAD '%s' AS (id: int, value: chararray);", tempFilename));
pig.registerQuery(String.format("x = FOREACH x GENERATE id, JsonStringToMap(value);",
tempFilename));
Schema schema = pig.dumpSchema("x");
assertNotNull(schema);
assertEquals("{id: int,json: map[chararray]}", schema.toString());
Iterator<Tuple> x = pig.openIterator("x");
assertNotNull(x);
assertTrue(x.hasNext());
Tuple t = x.next();
assertNotNull(t);
assertEquals(2, t.size());
Map<?, ?> actual = (Map<?, ?>) t.get(1);
assertNotNull(actual);
Map<String, String> expected = ImmutableMap.<String, String> of("name", "bob", "number", "2");
assertEquals(expected.size(), actual.size());
for (Map.Entry<String, String> e : expected.entrySet()) {
assertEquals(e.getValue(), actual.get(e.getKey()));
}
} finally {
pig.shutdown();
}
}
}

0 comments on commit fe8fa0c

Please sign in to comment.