apache · xuchuanyin · Jul 7, 2018 · Jul 8, 2018
diff --git a/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapBuilder.java b/core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMapBuilder.java
@@ -35,4 +35,8 @@ public interface DataMapBuilder {
   void finish() throws IOException;
 
   void close() throws IOException;
+  /**
+   * Whether to index internal carbon bytes (e.g. dictionary encoded) instead of original values.
+   */
+  boolean isIndexForCarbonRawBytes();
 }
diff --git a/core/src/main/java/org/apache/carbondata/core/scan/collector/ResultCollectorFactory.java b/core/src/main/java/org/apache/carbondata/core/scan/collector/ResultCollectorFactory.java
@@ -26,6 +26,7 @@
 import org.apache.carbondata.core.scan.collector.impl.RestructureBasedRawResultCollector;
 import org.apache.carbondata.core.scan.collector.impl.RestructureBasedVectorResultCollector;
 import org.apache.carbondata.core.scan.collector.impl.RowIdBasedResultCollector;
+import org.apache.carbondata.core.scan.collector.impl.RowIdRawBasedResultCollector;
 import org.apache.carbondata.core.scan.executor.infos.BlockExecutionInfo;
 
 /**
@@ -52,6 +53,9 @@ public static AbstractScannedResultCollector getScannedResultCollector(
       if (blockExecutionInfo.isRestructuredBlock()) {
         LOGGER.info("Restructure based raw collector is used to scan and collect the data");
         scannerResultAggregator = new RestructureBasedRawResultCollector(blockExecutionInfo);
+      } else if (blockExecutionInfo.isRequiredRowId()) {
+        LOGGER.info("RowId based raw collector is used to scan and collect the data");
+        scannerResultAggregator = new RowIdRawBasedResultCollector(blockExecutionInfo);
       } else {
         LOGGER.info("Row based raw collector is used to scan and collect the data");
         scannerResultAggregator = new RawBasedResultCollector(blockExecutionInfo);

diff --git a/...ain/java/org/apache/carbondata/core/scan/collector/impl/RowIdRawBasedResultCollector.java b/...ain/java/org/apache/carbondata/core/scan/collector/impl/RowIdRawBasedResultCollector.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.carbondata.core.scan.collector.impl;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.carbondata.common.annotations.InterfaceAudience;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.scan.executor.infos.BlockExecutionInfo;
+import org.apache.carbondata.core.scan.model.ProjectionMeasure;
+import org.apache.carbondata.core.scan.result.BlockletScannedResult;
+import org.apache.carbondata.core.scan.wrappers.ByteArrayWrapper;
+import org.apache.carbondata.core.stats.QueryStatistic;
+import org.apache.carbondata.core.stats.QueryStatisticsConstants;
+
+/**
+ * It is not a collector it is just a scanned result holder.
+ * Most of the lines are copied from `RawBasedResultCollector`, the difference in function is that
+ * this class returns all the dimensions in a ByteArrayWrapper and appends
+ * blockletNo/PageId/RowId at the end of the row.
+ * This implementation refers to `RawBasedResultCollector` and `RowIdBasedResultCollector`.
+ */
+@InterfaceAudience.Internal
+public class RowIdRawBasedResultCollector extends AbstractScannedResultCollector {
+
+  /**
+   * Creates the collector; the block execution info is handed straight to the superclass.
+   */
+  public RowIdRawBasedResultCollector(BlockExecutionInfo blockExecutionInfos) {
+    super(blockExecutionInfos);
+  }
+
+  /**
+   * Collects a batch of rows from the blocklet and adds the time taken to RESULT_PREP_TIME.
+   *
+   * @param scannedResult blocklet result to read rows from
+   * @param batchSize     number of rows requested
+   * @return rows shaped as [ByteArrayWrapper, measures..., blockletNo, pageId, rowId]
+   */
+  @Override
+  public List<Object[]> collectResultInRow(BlockletScannedResult scannedResult,
+      int batchSize) {
+    long startTime = System.currentTimeMillis();
+    List<Object[]> listBasedResult = new ArrayList<>(batchSize);
+    ProjectionMeasure[] queryMeasures = executionInfo.getProjectionMeasures();
+    scanAndFillData(scannedResult, batchSize, listBasedResult, queryMeasures);
+    QueryStatistic resultPrepTime = queryStatisticsModel.getStatisticsTypeAndObjMap()
+        .get(QueryStatisticsConstants.RESULT_PREP_TIME);
+    resultPrepTime.addCountStatistic(QueryStatisticsConstants.RESULT_PREP_TIME,
+        resultPrepTime.getCount() + (System.currentTimeMillis() - startTime));
+    return listBasedResult;
+  }
+
+  /**
+   * Collects up to batchSize rows from the blocklet into listBasedResult, skipping empty pages.
+   *
+   * @param scannedResult   blocklet result being read; its counters advance as rows are consumed
+   * @param batchSize       number of rows requested by the caller
+   * @param listBasedResult output list the collected rows are appended to
+   * @param queryMeasures   projected measures, used to size each row
+   */
+  protected void scanAndFillData(BlockletScannedResult scannedResult, int batchSize,
+      List<Object[]> listBasedResult, ProjectionMeasure[] queryMeasures) {
+    int numberOfPages = scannedResult.numberOfpages();
+    // loop will exit once the batchSize data has been read or the pages have been exhausted
+    while (scannedResult.getCurrentPageCounter() < numberOfPages) {
+      int currentPageRowCount = scannedResult.getCurrentPageRowCount();
+      if (currentPageRowCount == 0) {
+        scannedResult.incrementPageCounter();
+        continue;
+      }
+      // rows of the current page that have not been processed yet
+      int availableRows = currentPageRowCount - scannedResult.getRowCounter();
+      // never take more than the caller still wants
+      int availableBatchRowCount = Math.min(batchSize, availableRows);
+      // true when the batch is already full or the current page has no unread rows
+      if (availableBatchRowCount < 1) {
+        break;
+      }
+      // rows still wanted after this pass, assuming every requested row is returned
+      batchSize = Math.max(batchSize - availableRows, 0);
+      // collect into a fresh list per pass so that dimension and measure filling work on
+      // exactly the same rows
+      List<Object[]> collectedData = new ArrayList<>(availableBatchRowCount);
+      fillDimensionData(scannedResult, collectedData, queryMeasures, availableBatchRowCount);
+      fillMeasureData(scannedResult, collectedData);
+      // no bulk counter increment here: fillDimensionData advances the row counter per row
+      // in IUD cases deleted rows can make a pass return fewer rows than requested; put the
+      // shortfall of this pass back into batchSize so the following pass can make up for it
+      if (collectedData.size() < availableBatchRowCount) {
+        batchSize += availableBatchRowCount - collectedData.size();
+      }
+      listBasedResult.addAll(collectedData);
+    }
+  }
+
+  /**
+   * Creates one row per entry of the dictionary key batch and appends it to listBasedResult.
+   * Row layout: [ByteArrayWrapper, one slot per projected measure, blockletNo, pageId, rowId];
+   * the measure slots are left empty here.
+   *
+   * @param scannedResult   blocklet result to read keys and positions from
+   * @param listBasedResult list the new rows are appended to
+   * @param queryMeasures   projected measures, only their count is used
+   * @param batchSize       number of rows to request from the scanned result
+   */
+  private void fillDimensionData(BlockletScannedResult scannedResult,
+      List<Object[]> listBasedResult, ProjectionMeasure[] queryMeasures, int batchSize) {
+    long startTime = System.currentTimeMillis();
+    List<byte[]> dictionaryKeyArrayBatch = scannedResult.getDictionaryKeyArrayBatch(batchSize);
+    List<byte[][]> noDictionaryKeyArrayBatch =
+        scannedResult.getNoDictionaryKeyArrayBatch(batchSize);
+    List<byte[][]> complexTypeKeyArrayBatch = scannedResult.getComplexTypeKeyArrayBatch(batchSize);
+    // it is the same for the whole blocklet, so it is computed only once
+    byte[] implicitColumnByteArray = scannedResult.getBlockletId()
+        .getBytes(Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
+    // loop on dictionaryKeyArrayBatch: it can be smaller than batchSize in IUD scenarios
+    for (int i = 0; i < dictionaryKeyArrayBatch.size(); i++) {
+      int rowIdInfoOffset = 1 + queryMeasures.length;
+      Object[] row = new Object[rowIdInfoOffset + 3];
+      // advance the counter before the page counter and row id of this row are read
+      scannedResult.incrementCounter();
+      row[rowIdInfoOffset] = scannedResult.getBlockletNumber();
+      row[rowIdInfoOffset + 1] = scannedResult.getCurrentPageCounter();
+      ByteArrayWrapper wrapper = new ByteArrayWrapper();
+      wrapper.setDictionaryKey(dictionaryKeyArrayBatch.get(i));
+      wrapper.setNoDictionaryKeys(noDictionaryKeyArrayBatch.get(i));
+      wrapper.setComplexTypesKeys(complexTypeKeyArrayBatch.get(i));
+      wrapper.setImplicitColumnByteArray(implicitColumnByteArray);
+      row[0] = wrapper;
+      row[rowIdInfoOffset + 2] = scannedResult.getCurrentRowId();
+      listBasedResult.add(row);
+    }
+    QueryStatistic keyColumnFillingTime = queryStatisticsModel.getStatisticsTypeAndObjMap()
+        .get(QueryStatisticsConstants.KEY_COLUMN_FILLING_TIME);
+    keyColumnFillingTime.addCountStatistic(QueryStatisticsConstants.KEY_COLUMN_FILLING_TIME,
+        keyColumnFillingTime.getCount() + (System.currentTimeMillis() - startTime));
+  }
+
+  /**
+   * Fills the measure values into the rows created by fillDimensionData (nothing to do when the
+   * list is empty) and adds the elapsed time to the MEASURE_FILLING_TIME query statistic.
+   */
+  private void fillMeasureData(BlockletScannedResult scannedResult,
+      List<Object[]> listBasedResult) {
+    long startTime = System.currentTimeMillis();
+    // if list is not empty after filling the dimension data then only fill the measure data
+    if (!listBasedResult.isEmpty()) {
+      fillMeasureDataBatch(listBasedResult, 1, scannedResult);
+    }
+    QueryStatistic measureFillingTime = queryStatisticsModel.getStatisticsTypeAndObjMap()
+        .get(QueryStatisticsConstants.MEASURE_FILLING_TIME);
+    measureFillingTime.addCountStatistic(QueryStatisticsConstants.MEASURE_FILLING_TIME,
+        measureFillingTime.getCount() + (System.currentTimeMillis() - startTime));
+  }
+}