apache · echeipesh · Mar 23, 2015 · Mar 25, 2015 · Mar 25, 2015 · Mar 25, 2015
diff --git a/core/src/main/findbugs/exclude-filter.xml b/core/src/main/findbugs/exclude-filter.xml
@@ -49,6 +49,7 @@
       <Package name="org.apache.accumulo.core.iterators" />
       <Package name="org.apache.accumulo.core.trace" />
       <Class name="org.apache.accumulo.core.client.mapred.RangeInputSplit" />
+      <Class name="org.apache.accumulo.core.client.mapred.impl.BatchInputSplit" />
       <Class name="org.apache.accumulo.core.util.AddressUtil" />
       <Class name="org.apache.accumulo.core.zookeeper.ZooUtil" />
     </Or>

diff --git a/core/src/main/java/org/apache/accumulo/core/client/mapred/AbstractInputFormat.java b/core/src/main/java/org/apache/accumulo/core/client/mapred/AbstractInputFormat.java
diff --git a/core/src/main/java/org/apache/accumulo/core/client/mapred/AccumuloInputFormat.java b/core/src/main/java/org/apache/accumulo/core/client/mapred/AccumuloInputFormat.java
@@ -54,12 +54,14 @@ public RecordReader<Key,Value> getRecordReader(InputSplit split, JobConf job, Re
     log.setLevel(getLogLevel(job));
 
     // Override the log level from the configuration as if the RangeInputSplit has one it's the more correct one to use.
-    if (split instanceof org.apache.accumulo.core.client.mapreduce.RangeInputSplit) {
-      org.apache.accumulo.core.client.mapreduce.RangeInputSplit risplit = (org.apache.accumulo.core.client.mapreduce.RangeInputSplit) split;
-      Level level = risplit.getLogLevel();
+    if (split instanceof org.apache.accumulo.core.client.mapreduce.impl.AccumuloInputSplit) {
+      org.apache.accumulo.core.client.mapreduce.impl.AccumuloInputSplit accSplit = (org.apache.accumulo.core.client.mapreduce.impl.AccumuloInputSplit) split;
+      Level level = accSplit.getLogLevel();
       if (null != level) {
         log.setLevel(level);
       }
+    } else {
+      throw new IllegalArgumentException("No RecordReader for " + split.getClass().toString());
     }
 
     RecordReaderBase<Key,Value> recordReader = new RecordReaderBase<Key,Value>() {

diff --git a/core/src/main/java/org/apache/accumulo/core/client/mapred/InputFormatBase.java b/core/src/main/java/org/apache/accumulo/core/client/mapred/InputFormatBase.java
@@ -178,6 +178,7 @@ public static void setAutoAdjustRanges(JobConf job, boolean enableFeature) {
 
   /**
    * Determines whether a configuration has auto-adjust ranges enabled.
+   * Must be enabled when {@link #setBatchScan(JobConf, boolean)} is true.
    *
    * @param job
    *          the Hadoop context for the configured job
@@ -296,6 +297,48 @@ protected static boolean isOfflineScan(JobConf job) {
     return InputConfigurator.isOfflineScan(CLASS, job);
   }
 
+  /**
+   * Controls the use of the {@link org.apache.accumulo.core.client.BatchScanner} in this job.
+   * Using this feature will group Ranges by their source tablet, producing an InputSplit per tablet
+   * rather than per Range. This batching helps to reduce overhead when querying a large number of small ranges
+   * (e.g. when doing quad-tree decomposition for spatial queries).
+   * <p>
+   * In order to achieve good locality of InputSplits, this option always clips the input Ranges to tablet boundaries.
+   * This may result in one input Range contributing to several InputSplits.
+   * <p>
+   * Note that the value of {@link #setAutoAdjustRanges(JobConf, boolean)} is ignored and is assumed to be true when the BatchScan option is enabled.
+   * <p>
+   * This configuration is incompatible with:
+   * <ul>
+   *   <li>{@link #setOfflineTableScan(JobConf, boolean)}</li>
+   *   <li>{@link #setLocalIterators(JobConf, boolean)}</li>
+   *   <li>{@link #setScanIsolation(JobConf, boolean)}</li>
+   * </ul>
+   * <p>
+   * By default, this feature is <b>disabled</b>.
+   *
+   * @param job
+   *          the Hadoop job instance to be configured
+   * @param enableFeature
+   *          the feature is enabled if true, disabled otherwise
+   * @since 1.7.0
+   */
+  public static void setBatchScan(JobConf job, boolean enableFeature) {
+    InputConfigurator.setBatchScan(CLASS, job, enableFeature);
+  }
+
+  /**
+   * Determines whether a configuration has the {@link org.apache.accumulo.core.client.BatchScanner} feature enabled.
+   *
+   * @param job
+   *          the Hadoop context for the configured job
+   * @return the value reported by {@code InputConfigurator.isBatchScan} for this job: true if the feature is enabled, false otherwise
+   * @since 1.7.0
+   * @see #setBatchScan(JobConf, boolean)
+   */
+  public static boolean isBatchScan(JobConf job) {
+    return InputConfigurator.isBatchScan(CLASS, job);
+  }
+
   /**
    * Initializes an Accumulo {@link org.apache.accumulo.core.client.impl.TabletLocator} based on the configuration.
    *
@@ -315,19 +358,8 @@ protected static TabletLocator getTabletLocator(JobConf job) throws TableNotFoun
   protected abstract static class RecordReaderBase<K,V> extends AbstractRecordReader<K,V> {
 
     @Override
-    protected void setupIterators(JobConf job, Scanner scanner, String tableName, org.apache.accumulo.core.client.mapred.RangeInputSplit split) {
-      List<IteratorSetting> iterators = null;
-
-      if (null == split) {
-        iterators = getIterators(job);
-      } else {
-        iterators = split.getIterators();
-        if (null == iterators) {
-          iterators = getIterators(job);
-        }
-      }
-
-      setupIterators(iterators, scanner);
+    protected List<IteratorSetting> jobIterators(JobConf job, String tableName) {
+      return getIterators(job); // tableName is not used by this override; the iterators are looked up from the job alone
     }
 
     /**
@@ -337,7 +369,9 @@ protected void setupIterators(JobConf job, Scanner scanner, String tableName, or
      *          the iterators to set
      * @param scanner
      *          the scanner to configure
+     * @deprecated since 1.7.0; Use {@link #jobIterators} instead.
      */
+    @Deprecated
     protected void setupIterators(List<IteratorSetting> iterators, Scanner scanner) {
       for (IteratorSetting iterator : iterators) {
         scanner.addScanIterator(iterator);

diff --git a/core/src/main/java/org/apache/accumulo/core/client/mapred/impl/BatchInputSplit.java b/core/src/main/java/org/apache/accumulo/core/client/mapred/impl/BatchInputSplit.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.core.client.mapred.impl;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.accumulo.core.data.Range;
+import org.apache.hadoop.mapred.InputSplit;
+
+/**
+ * An input split for the {@code org.apache.hadoop.mapred} API that encapsulates Accumulo ranges for use in MapReduce jobs.
+ * Can contain several Ranges per InputSplit.
+ * <p>
+ * This class declares no fields or methods of its own: it extends
+ * {@link org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit} and additionally implements
+ * {@link org.apache.hadoop.mapred.InputSplit}. Each constructor (no-argument, copy, and
+ * table/tableId/ranges/location) only forwards its arguments to the matching superclass constructor;
+ * the copy constructor declares {@link java.io.IOException} in its throws clause.
+ */
+public class BatchInputSplit extends org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit implements InputSplit {
+
+  public BatchInputSplit() {
+    super();
+  }
+
+  public BatchInputSplit(BatchInputSplit split) throws IOException {
+    super(split);
+  }
+
+  public BatchInputSplit(String table, String tableId, Collection<Range> ranges, String[] location) {
+    super(table, tableId, ranges, location);
+  }
+}