Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1606,7 +1606,10 @@
<value>plugins</value>
<description>Directories where Nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
as is. If relative, it is searched for on the classpath.</description>
as is. If relative, it is searched for on the classpath.
For secure deployments, treat these directories as trusted code: use
read-only filesystem permissions or immutable images so untrusted
parties cannot add or replace plugin JARs or plugin.xml files.</description>
</property>

<property>
Expand Down Expand Up @@ -2146,6 +2149,19 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
each property value is always an array of Strings (so if you expect one value, use [0])
* doc - contains all the NutchFields from the NutchDocument.
each property value is always an array of Objects.
Expressions are evaluated in a sandboxed JEXL engine (see also
nutch.jexl.disable.sandbox).
</description>
</property>

<property>
<name>nutch.jexl.disable.sandbox</name>
<value>false</value>
<description>If true, disables the Commons JEXL sandbox and the restriction
on the JEXL "new" operator for all Nutch JEXL expressions (index filter,
generator, hostdb filter, CrawlDbReader, exchange-jexl, etc.). This is
unsafe and should only be used in fully trusted environments when a
legitimate expression cannot be written within the default sandbox.
</description>
</property>

Expand Down
6 changes: 6 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ $(boot2docker shellinit | grep export) #may not be necessary
docker build -t apache/nutch . --build-arg BUILD_MODE=2 --build-arg SERVER_PORT=8081 --build-arg SERVER_HOST=0.0.0.0 --build-arg WEBAPP_PORT=8080
```

## Security and plugin directories

Nutch loads executable code from the directories configured as `plugin.folders` (see `nutch-default.xml`). For production and shared images, treat those paths as **trusted**: mount them read-only where possible, rebuild images to change plugins, and run the crawl process under a dedicated low-privilege user so the filesystem cannot be abused to drop unexpected JARs or `plugin.xml` files into that tree.

User-defined JEXL in configuration (for example `index.jexl.filter`, generator expressions, and `hostdb.filter.expression`) is evaluated in a **sandboxed** engine by default. The property `nutch.jexl.disable.sandbox` disables that protection and must not be set in untrusted environments.

## Usage

If not already running, start docker
Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/CrawlDbReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -906,7 +906,7 @@ public void setup(
retry = config.getInt("retry", -1);

if (config.get("expr", null) != null) {
expr = JexlUtil.parseExpression(config.get("expr", null));
expr = JexlUtil.parseExpression(config, config.get("expr", null));
}
sample = config.getFloat("sample", 1);
}
Expand Down
14 changes: 7 additions & 7 deletions src/java/org/apache/nutch/crawl/Generator.java
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ public void setup(
if (!restrictStatusString.isEmpty()) {
restrictStatus = CrawlDatum.getStatusByName(restrictStatusString);
}
expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null));
expr = JexlUtil.parseExpression(conf, conf.get(GENERATOR_EXPR, null));
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
// Initialize cached counter references
Expand Down Expand Up @@ -453,10 +453,10 @@ public void setup(Context context) throws IOException {
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);

if (conf.get(GENERATOR_HOSTDB) != null) {
maxCountExpr = JexlUtil
.parseExpression(conf.get(GENERATOR_MAX_COUNT_EXPR, null));
fetchDelayExpr = JexlUtil
.parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
maxCountExpr = JexlUtil.parseExpression(conf,
conf.get(GENERATOR_MAX_COUNT_EXPR, null));
fetchDelayExpr = JexlUtil.parseExpression(conf,
conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
}
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
Expand Down Expand Up @@ -871,7 +871,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* maximum number of segments to generate
* @param expr
* a Jexl expression to use in the Generator job.
* @see JexlUtil#parseExpression(String)
* @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
Expand Down Expand Up @@ -922,7 +922,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* @param hostdb
* name of a hostdb from which to execute Jexl expressions in a bid
* to determine the maximum URL count and/or fetch delay per host.
* @see JexlUtil#parseExpression(String)
* @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
Expand Down
9 changes: 2 additions & 7 deletions src/java/org/apache/nutch/hostdb/ReadHostDb.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,10 @@
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;

import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.JexlScript;
import org.apache.commons.jexl3.JexlEngine;
import org.apache.commons.jexl3.MapContext;
import org.apache.nutch.util.JexlUtil;

/**
* @see <a href='https://commons.apache.org/proper/commons-jexl/reference/syntax.html'>Commons</a>
Expand Down Expand Up @@ -77,11 +76,7 @@ public void setup(Context context) {
fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true);
String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION);
if (expr != null) {
// Create or retrieve a JexlEngine
JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();

// Create an expression object
this.expr = jexl.createScript(expr);
this.expr = JexlUtil.parseExpression(context.getConfiguration(), expr);
}
}

Expand Down
151 changes: 138 additions & 13 deletions src/java/org/apache/nutch/util/JexlUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,56 +23,181 @@

import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlEngine;
import org.apache.commons.jexl3.JexlFeatures;
import org.apache.commons.jexl3.JexlScript;
import org.apache.commons.jexl3.introspection.JexlSandbox;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.hadoop.conf.Configuration;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Utility methods for handling JEXL expressions used in crawl and index
* pipelines. Unless {@link #DISABLE_SANDBOX_KEY} is set to {@code true},
* expressions are evaluated under a {@link JexlSandbox} with
* {@link JexlFeatures#newInstance(boolean)} disabled so arbitrary classes
* cannot be instantiated from user-supplied configuration.
*/
public class JexlUtil {

private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());

/**
* When {@code true}, JEXL parsing skips the sandbox (unsafe). For trusted
* environments only; not recommended.
*/
public static final String DISABLE_SANDBOX_KEY = "nutch.jexl.disable.sandbox";

/** Supported format for date parsing yyyy-MM-ddTHH:mm:ssZ */
private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
private static final Pattern DATE_PATTERN = Pattern
.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");

/**
* Classes and interfaces that may be introspected when evaluating Nutch JEXL
* scripts. Default-deny sandbox: anything not listed is blocked.
* <p>
* Entries are fully qualified class names; each one is passed to
* {@link JexlSandbox#allow(String)} when the sandbox is built.
*/
private static final String[] SANDBOX_ALLOW_CLASSES = {
// java.lang: strings, boxed primitives, numeric helpers
"java.lang.String",
"java.lang.Boolean",
"java.lang.Byte",
"java.lang.Character",
"java.lang.Short",
"java.lang.Integer",
"java.lang.Long",
"java.lang.Float",
"java.lang.Double",
"java.lang.Number",
"java.lang.Math",
"java.lang.Comparable",
"java.lang.CharSequence",
// collection interfaces
"java.util.Map",
"java.util.List",
"java.util.Collection",
"java.util.Set",
"java.util.SortedMap",
"java.util.SortedSet",
"java.util.Iterator",
"java.lang.Iterable",
// abstract collection base classes
"java.util.AbstractList",
"java.util.AbstractCollection",
"java.util.AbstractMap",
"java.util.AbstractSet",
// concrete collection implementations
"java.util.ArrayList",
"java.util.LinkedList",
"java.util.HashMap",
"java.util.LinkedHashMap",
"java.util.HashSet",
"java.util.LinkedHashSet",
"java.util.TreeMap",
"java.util.TreeSet",
// static collection/array helpers
"java.util.Collections",
"java.util.Arrays",
// regular expressions
"java.util.regex.Pattern",
"java.util.regex.Matcher",
// JEXL context and Nutch document types
"org.apache.commons.jexl3.MapContext",
"org.apache.nutch.indexer.NutchDocument",
"org.apache.nutch.indexer.NutchField",
};

// Engines are created on first use by getSandboxedEngine() and
// getLegacyEngine(); both fields start out null and are declared volatile.
private static volatile JexlEngine sandboxedEngine;
private static volatile JexlEngine legacyEngine;

/** Private constructor: this class only exposes static helpers. */
private JexlUtil() {
}

/**
 * Builds a sandbox constructed with {@code new JexlSandbox(false)} and
 * registers every entry of {@link #SANDBOX_ALLOW_CLASSES} on it.
 *
 * @return a new sandbox with all configured class names allowed
 */
private static JexlSandbox createSandbox() {
  final JexlSandbox allowListed = new JexlSandbox(false);
  for (int i = 0; i < SANDBOX_ALLOW_CLASSES.length; i++) {
    allowListed.allow(SANDBOX_ALLOW_CLASSES[i]);
  }
  return allowListed;
}

/**
 * Copies the default JEXL feature set and switches off the {@code new}
 * operator via {@link JexlFeatures#newInstance(boolean)}.
 *
 * @return the feature set used by the sandboxed engine
 */
private static JexlFeatures createFeatures() {
  final JexlFeatures defaults = JexlFeatures.createDefault();
  final JexlFeatures restricted = new JexlFeatures(defaults);
  return restricted.newInstance(false);
}

/**
 * Returns the shared sandboxed engine, creating it on first use.
 * Creation is guarded by double-checked locking on {@code JexlUtil.class}.
 *
 * @return the silent, strict engine configured with the sandbox and the
 *         restricted feature set
 */
private static JexlEngine getSandboxedEngine() {
  JexlEngine engine = sandboxedEngine;
  if (engine != null) {
    return engine;
  }
  synchronized (JexlUtil.class) {
    engine = sandboxedEngine;
    if (engine == null) {
      engine = new JexlBuilder()
          .silent(true)
          .strict(true)
          .sandbox(createSandbox())
          .features(createFeatures())
          .create();
      sandboxedEngine = engine;
    }
    return engine;
  }
}

/**
 * Returns the shared engine that has no sandbox and no feature
 * restrictions, creating it on first use. Creation is guarded by
 * double-checked locking on {@code JexlUtil.class}.
 *
 * @return the silent, strict, unsandboxed engine
 */
private static JexlEngine getLegacyEngine() {
  JexlEngine engine = legacyEngine;
  if (engine != null) {
    return engine;
  }
  synchronized (JexlUtil.class) {
    engine = legacyEngine;
    if (engine == null) {
      engine = new JexlBuilder()
          .silent(true)
          .strict(true)
          .create();
      legacyEngine = engine;
    }
    return engine;
  }
}

/**
 * Selects the engine to use for the given configuration.
 *
 * @param conf configuration to consult; may be null
 * @return the unsandboxed engine when {@link #DISABLE_SANDBOX_KEY} is
 *         {@code true} in {@code conf}, otherwise the sandboxed engine
 */
private static JexlEngine engineFor(Configuration conf) {
  final boolean sandboxDisabled = conf != null
      && conf.getBoolean(DISABLE_SANDBOX_KEY, false);
  if (!sandboxDisabled) {
    return getSandboxedEngine();
  }
  // A warning is emitted on every call that selects the unsandboxed engine.
  LOG.warn("{}=true: JEXL sandbox is disabled; only use in fully trusted environments.",
      DISABLE_SANDBOX_KEY);
  return getLegacyEngine();
}

/**
 * Parses a JEXL expression without a {@link Configuration}, which means the
 * sandboxed engine is always used. Prefer
 * {@link #parseExpression(Configuration, String)} when a configuration is
 * available so that {@link #DISABLE_SANDBOX_KEY} can be honored.
 *
 * @param expr string JEXL expression; may be null
 * @return parsed JEXL expression, or null if {@code expr} is null or cannot
 *         be parsed
 */
public static JexlScript parseExpression(String expr) {
  return expr == null ? null : parseExpression(null, expr);
}

/**
 * Parses a JEXL expression. Unless {@link #DISABLE_SANDBOX_KEY} is set to
 * {@code true} in {@code conf}, the expression is parsed for execution under
 * a restrictive sandbox.
 * <p>
 * Before parsing, every date literal matching {@code yyyy-MM-ddTHH:mm:ssZ}
 * (for example {@code 2016-03-20T00:00:00Z}) is replaced by its epoch
 * milliseconds value so that dates can be compared numerically.
 *
 * @param conf Hadoop configuration, or null to always use the sandbox
 * @param expr string JEXL expression; may be null
 * @return parsed JEXL expression, or null if {@code expr} is null or a date
 *         literal or the expression itself cannot be parsed
 */
public static JexlScript parseExpression(Configuration conf, String expr) {
  if (expr == null) {
    return null;
  }

  try {
    // Translate ALL date literals, not just the first one: an expression
    // comparing against two different dates would otherwise keep a raw
    // date string that JEXL cannot parse.
    Matcher matcher = DATE_PATTERN.matcher(expr);
    StringBuilder translated = new StringBuilder(expr.length());
    int copiedUpTo = 0;
    while (matcher.find()) {
      // NOTE(review): 'Z' is a quoted literal in this pattern, so the value
      // appears to be interpreted in the JVM default time zone rather than
      // UTC - confirm whether that is intended.
      Date parsedDate = DateUtils.parseDateStrictly(matcher.group(),
          new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" });
      translated.append(expr, copiedUpTo, matcher.start())
          .append(parsedDate.getTime());
      copiedUpTo = matcher.end();
    }
    translated.append(expr, copiedUpTo, expr.length());

    return engineFor(conf).createScript(translated.toString());
  } catch (Exception e) {
    // Callers rely on a null return for unparsable input; keep that
    // contract but log the expression and the full cause.
    LOG.error("Failed to parse JEXL expression '{}'", expr, e);
  }

  return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ public class JexlExchange implements Exchange {
*/
@Override
public void open(Map<String, String> parameters) {
  // Parse once, passing the plugin configuration so that
  // nutch.jexl.disable.sandbox is honored; the earlier config-less parse
  // was immediately overwritten and only duplicated work and error logs.
  expression = JexlUtil.parseExpression(getConf(),
      parameters.get(EXPRESSION_KEY));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ public void setConf(Configuration conf) {
"The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
}

expr = JexlUtil.parseExpression(strExpr);
expr = JexlUtil.parseExpression(conf, strExpr);

if (expr == null) {
LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr);
Expand Down
Loading
Loading