Skip to content

Commit

Permalink
Merge pull request #2075 from jon-wei/regex_extract
Browse files Browse the repository at this point in the history
Configurable value replacement on match failure for RegexExtractionFn
  • Loading branch information
fjy committed Dec 15, 2015
2 parents 6bf6644 + c88f75d commit e7f06cf
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 33 deletions.
13 changes: 12 additions & 1 deletion docs/content/querying/dimensionspecs.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,23 @@ Returns the first matching group for the given regular expression.
If there is no match, it returns the dimension value as is, unless `replaceMissingValues` is set to true (described below).

```json
{ "type" : "regex", "expr" : <regular_expression> }
{
"type" : "regex", "expr" : <regular_expression>,
"replaceMissingValues" : true,
"replaceMissingValuesWith" : "foobar"
}
```

For example, using `"expr" : "(\\w\\w\\w).*"` will transform
`'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`.

If the `replaceMissingValues` property is true, the extraction function will transform dimension values that do not match the regex pattern to a user-specified String. Default value is `false`.

The `replaceMissingValuesWith` property sets the String that unmatched dimension values will be replaced with, if `replaceMissingValues` is true. If `replaceMissingValuesWith` is not specified, unmatched dimension values will be replaced with nulls.

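For example, if `expr` is `"^(a\\w+)"` in the example JSON above, a regex that matches values starting with the letter `a`, the extraction function will convert a dimension value like `banana` to `foobar`. Note that the `^` anchor matters: the pattern is searched for anywhere within the dimension value, so an unanchored `"(a\\w+)"` would match inside `banana` and extract `anana` instead of replacing the value.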


### Partial Extraction Function

Returns the dimension value unchanged if the regular expression matches, otherwise returns null.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,28 +34,53 @@
public class RegexDimExtractionFn extends DimExtractionFn
{
private static final byte CACHE_TYPE_ID = 0x1;
private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF;

private final String expr;
private final Pattern pattern;
private final boolean replaceMissingValues;
private final String replaceMissingValuesWith;

@JsonCreator
public RegexDimExtractionFn(
@JsonProperty("expr") String expr
@JsonProperty("expr") String expr,
@JsonProperty("replaceMissingValues") Boolean replaceMissingValues,
@JsonProperty("replaceMissingValuesWith") String replaceMissingValuesWith
)
{
Preconditions.checkNotNull(expr, "expr must not be null");

this.expr = expr;
this.pattern = Pattern.compile(expr);
this.replaceMissingValues = replaceMissingValues == null ? false : replaceMissingValues;
this.replaceMissingValuesWith = replaceMissingValuesWith;
}

@Override
public byte[] getCacheKey()
{
byte[] exprBytes = StringUtils.toUtf8(expr);
return ByteBuffer.allocate(1 + exprBytes.length)
byte[] replaceBytes = replaceMissingValues ? new byte[]{1} : new byte[]{0};
byte[] replaceStrBytes;
if (replaceMissingValuesWith == null) {
replaceStrBytes = new byte[]{};
} else {
replaceStrBytes = StringUtils.toUtf8(replaceMissingValuesWith);
}

int totalLen = 1
+ exprBytes.length
+ replaceBytes.length
+ replaceStrBytes.length; // fields
totalLen += 2; // separators

return ByteBuffer.allocate(totalLen)
.put(CACHE_TYPE_ID)
.put(exprBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceStrBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceBytes)
.array();
}

Expand All @@ -65,8 +90,14 @@ public String apply(String dimValue)
if (dimValue == null) {
return null;
}
String retVal;
Matcher matcher = pattern.matcher(dimValue);
return Strings.emptyToNull(matcher.find() ? matcher.group(1) : dimValue);
if (matcher.find()) {
retVal = matcher.group(1);
} else {
retVal = replaceMissingValues ? replaceMissingValuesWith : dimValue;
}
return Strings.emptyToNull(retVal);
}

@JsonProperty("expr")
Expand All @@ -75,6 +106,18 @@ public String getExpr()
return expr;
}

/**
 * Returns whether dimension values that do not match the regex are replaced
 * instead of being passed through unchanged.
 *
 * Exposed for serialization as the JSON property {@code replaceMissingValues}.
 * The constructor stores {@code false} when the property is omitted (null).
 *
 * @return true if unmatched values are replaced, false if they are returned as is
 */
@JsonProperty("replaceMissingValues")
public boolean isReplaceMissingValues()
{
return replaceMissingValues;
}

/**
 * Returns the string substituted for dimension values that do not match the
 * regex when replacement is enabled.
 *
 * Exposed for serialization as the JSON property {@code replaceMissingValuesWith}.
 * The value is stored exactly as passed to the constructor, so it may be null;
 * in that case (and for the empty string, which {@code apply} maps to null)
 * unmatched values become null when replacement is enabled.
 *
 * @return the replacement string, possibly null
 */
@JsonProperty("replaceMissingValuesWith")
public String getReplaceMissingValuesWith()
{
return replaceMissingValuesWith;
}

@Override
public boolean preservesOrdering()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package io.druid.query.extraction;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import io.druid.jackson.DefaultObjectMapper;
import org.junit.Assert;
Expand Down Expand Up @@ -55,59 +56,58 @@ public class RegexDimExtractionFnTest
public void testPathExtraction()
{
String regex = "/([^/]+)/";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();

for (String path : paths) {
extracted.add(extractionFn.apply(path));
}

Assert.assertEquals(2, extracted.size());
Assert.assertTrue(extracted.contains("druid"));
Assert.assertTrue(extracted.contains("dash"));
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("druid", "dash"));
Assert.assertEquals(expected, extracted);
}

@Test
public void testDeeperPathExtraction()
{
String regex = "^/([^/]+/[^/]+)(/|$)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();

for (String path : paths) {
extracted.add(extractionFn.apply(path));
}

Assert.assertEquals(4, extracted.size());
Assert.assertTrue(extracted.contains("druid/prod"));
Assert.assertTrue(extracted.contains("druid/demo"));
Assert.assertTrue(extracted.contains("dash/aloe"));
Assert.assertTrue(extracted.contains("dash/baloo"));
Set<String> expected = Sets.newLinkedHashSet(
ImmutableList.of(
"druid/prod", "druid/demo",
"dash/aloe", "dash/baloo"
)
);
Assert.assertEquals(expected, extracted);
}

@Test
public void testStringExtraction()
{
String regex = "(.)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();

for (String testString : testStrings) {
extracted.add(extractionFn.apply(testString));
}

Assert.assertEquals(3, extracted.size());
Assert.assertTrue(extracted.contains("a"));
Assert.assertTrue(extracted.contains("b"));
Assert.assertTrue(extracted.contains("c"));
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("a", "b", "c"));
Assert.assertEquals(expected, extracted);
}


@Test
public void testNullAndEmpty()
{
String regex = "(.*)/.*/.*";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
// no match, map empty input value to null
Assert.assertEquals(null, extractionFn.apply(""));
// null value, returns null
Expand All @@ -116,14 +116,54 @@ public void testNullAndEmpty()
Assert.assertEquals(null, extractionFn.apply("/a/b"));
}

/**
 * Exercises replacement of unmatched values: once with an explicit replacement
 * string and once with a null replacement, checking both the extracted values
 * and the exact cache key bytes in each case.
 */
@Test
public void testMissingValueReplacement()
{
  final String pattern = "(a\\w*)";

  // Case 1: values without a match are replaced with the literal "foobar".
  final ExtractionFn replacingFn = new RegexDimExtractionFn(pattern, true, "foobar");
  final Set<String> replacedResults = Sets.newLinkedHashSet();
  for (String input : testStrings) {
    replacedResults.add(replacingFn.apply(input));
  }
  Assert.assertEquals(
      Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum", "foobar")),
      replacedResults
  );

  // Key layout: type id 0x01, UTF-8 bytes of the pattern, 0xFF separator,
  // UTF-8 bytes of "foobar", 0xFF separator, then 0x01 for replaceMissingValues == true.
  Assert.assertArrayEquals(
      new byte[]{
          0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF,
          0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72, (byte) 0xFF, 0x01
      },
      replacingFn.getCacheKey()
  );

  // Case 2: replacement enabled but no replacement string, so unmatched values become null.
  final ExtractionFn nullingFn = new RegexDimExtractionFn(pattern, true, null);
  final Set<String> nulledResults = Sets.newLinkedHashSet();
  for (String input : testStrings) {
    nulledResults.add(nullingFn.apply(input));
  }
  // ImmutableList rejects null elements, so null is added to the expected set afterwards.
  final Set<String> expectedWithNull = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum"));
  expectedWithNull.add(null);
  Assert.assertEquals(expectedWithNull, nulledResults);

  // Same layout as above, with nothing between the two 0xFF separators.
  Assert.assertArrayEquals(
      new byte[]{0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, (byte) 0xFF, 0x01},
      nullingFn.getCacheKey()
  );
}

@Test
public void testSerde() throws Exception
{
final ObjectMapper objectMapper = new DefaultObjectMapper();
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" }";
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" , " +
"\"replaceMissingValues\": true, \"replaceMissingValuesWith\":\"foobar\"}";
RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class);

Assert.assertEquals(".(...)?", extractionFn.getExpr());
Assert.assertTrue(extractionFn.isReplaceMissingValues());
Assert.assertEquals("foobar", extractionFn.getReplaceMissingValuesWith());

// round trip
Assert.assertEquals(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ public void testGroupByWithCardinality()
@Test
public void testGroupByWithNullProducingDimExtractionFn()
{
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})")
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{
@Override
public byte[] getCacheKey()
Expand Down Expand Up @@ -797,7 +797,7 @@ public String apply(String dimValue)
*/
public void testGroupByWithEmptyStringProducingDimExtractionFn()
{
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})")
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{
@Override
public byte[] getCacheKey()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1519,7 +1519,7 @@ public void testTopNCollapsingDimExtraction()
.dimension(
new ExtractionDimensionSpec(
QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension,
new RegexDimExtractionFn(".(.)"), null
new RegexDimExtractionFn(".(.)", false, null), null
)
)
.metric("index")
Expand Down Expand Up @@ -1568,7 +1568,7 @@ public void testTopNDimExtraction()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2074,7 +2074,7 @@ public void testTopNLexicographicDimExtraction()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2128,7 +2128,7 @@ public void testInvertedTopNLexicographicDimExtraction2()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"),
new RegexDimExtractionFn("..(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2182,7 +2182,7 @@ public void testTopNLexicographicDimExtractionWithPreviousStop()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2300,7 +2300,7 @@ public void testInvertedTopNLexicographicDimExtractionWithPreviousStop()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2347,7 +2347,7 @@ public void testInvertedTopNLexicographicDimExtractionWithPreviousStop2()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"),
new RegexDimExtractionFn("..(.)", false, null),
null
)
)
Expand Down

0 comments on commit e7f06cf

Please sign in to comment.