From e3280ec23c7ec4a4a69197a776e8cc1b32c53630 Mon Sep 17 00:00:00 2001 From: Rajkumar singh Date: Tue, 29 May 2018 15:14:51 -0700 Subject: [PATCH] HIVE-19661: switch Hive UDFs to use Re2J regex engine. --- LICENSE | 30 ++++++++++ .../org/apache/hadoop/hive/conf/HiveConf.java | 2 +- pom.xml | 6 ++ ql/pom.xml | 6 ++ .../hive/ql/udf/generic/GenericUDFRegExp.java | 59 +++++++++++++++---- 5 files changed, 90 insertions(+), 13 deletions(-) diff --git a/LICENSE b/LICENSE index 3e7dc6b98cfc..316afc629b85 100644 --- a/LICENSE +++ b/LICENSE @@ -404,4 +404,34 @@ products or services of Licensee, or any third party. agrees to be bound by the terms and conditions of this License Agreement. +For google re2j (https://github.com/google/re2j/blob/master/LICENSE): + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 931533a556d3..d07444754b5a 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -3636,7 +3636,7 @@ public static enum ConfVars { "Time to wait to finish prewarming spark executors"), HIVESTAGEIDREARRANGE("hive.stageid.rearrange", "none", new StringSet("none", "idonly", "traverse", "execution"), ""), HIVEEXPLAINDEPENDENCYAPPENDTASKTYPES("hive.explain.dependency.append.tasktype", false, ""), - + HIVEUSEGOOGLEREGEXENGINE("hive.use.googleregex.engine",false,"whether to use google regex engine or not, default regex engine is java.util.regex"), HIVECOUNTERGROUP("hive.counters.group.name", "HIVE", "The name of counter group for internal Hive variables (CREATED_FILE, FATAL_ERROR, etc.)"), diff --git a/pom.xml b/pom.xml index 1f43c416db30..264f1b14367a 100644 --- a/pom.xml +++ b/pom.xml @@ -214,6 +214,7 @@ 3.0.0 0.6.0 2.2.4 + 1.2 @@ -985,6 +986,11 @@ snappy-java ${snappy.version} + + com.google.re2j + re2j + ${re2j.version} + diff --git a/ql/pom.xml b/ql/pom.xml index 06124f738779..b4537f3fdd50 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -758,6 +758,11 @@ ${powermock.version} test + + com.google.re2j + re2j + ${re2j.version} + @@ -945,6 +950,7 @@ org.apache.orc:orc-core org.apache.orc:orc-tools joda-time:joda-time + com.google.re2j:re2j diff 
--git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java index d309c37cc151..f5e877650351 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java @@ -23,6 +23,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.MapredContext; +import org.apache.hadoop.hive.ql.session.SessionState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.exec.Description; @@ -54,11 +57,26 @@ public class GenericUDFRegExp extends GenericUDF { private final BooleanWritable output = new BooleanWritable(); private transient boolean isRegexConst; private transient String regexConst; - private transient Pattern patternConst; + private transient java.util.regex.Pattern patternConst; + private transient com.google.re2j.Pattern patternConstR2j; private transient boolean warned; + private MapredContext context; + private boolean useGoogleRegexEngine=false; + + @Override + public void configure(MapredContext context) { + this.context = context; + } @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (context != null) { + if("true".equals(context.getJobConf().get("hive.use.googleregex.engine"))){ // constant-first compare: get() returns null when the key is unset in the JobConf + this.useGoogleRegexEngine=true; + } + }else { + this.useGoogleRegexEngine = SessionState.getSessionConf().getBoolVar(HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE); + } checkArgsSize(arguments, 2, 2); checkArgPrimitive(arguments, 0); @@ -73,7 +91,12 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen if (arguments[1] instanceof ConstantObjectInspector) { regexConst = getConstantStringValue(arguments, 1); if (regexConst != null) { - patternConst = Pattern.compile(regexConst); + 
if(!useGoogleRegexEngine){ + //if(!HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE)){ + patternConst = Pattern.compile(regexConst); + }else{ + patternConstR2j = com.google.re2j.Pattern.compile(regexConst); + } } isRegexConst = true; } @@ -103,22 +126,34 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { if (!warned) { warned = true; LOG.warn(getClass().getSimpleName() + " regex is empty. Additional " - + "warnings for an empty regex will be suppressed."); + + "warnings for an empty regex will be suppressed."); } output.set(false); return output; } + if(!useGoogleRegexEngine){ + Pattern p; + if (isRegexConst) { + p = patternConst; + } else { + p = Pattern.compile(regex); + } - Pattern p; - if (isRegexConst) { - p = patternConst; - } else { - p = Pattern.compile(regex); - } + Matcher m = p.matcher(s); + output.set(m.find(0)); + return output; + }else{ + com.google.re2j.Pattern patternR2j; + if (isRegexConst) { + patternR2j = patternConstR2j; + } else { + patternR2j = com.google.re2j.Pattern.compile(regex); + } - Matcher m = p.matcher(s); - output.set(m.find(0)); - return output; + com.google.re2j.Matcher m = patternR2j.matcher(s); + output.set(m.find(0)); + return output; + } } @Override