Fix bugs for deferred rebuild for bloomfilter datamap
Previously, when we implemented ISSUE-2633, deferred rebuild was disabled
for the bloomfilter datamap due to unhandled bugs.
In this commit, we fix those bugs and bring the feature back.
xuchuanyin committed Jun 28, 2018
1 parent 61168b5 commit 1aa79e0
Showing 10 changed files with 659 additions and 51 deletions.
@@ -35,4 +35,9 @@ public interface DataMapBuilder {
void finish() throws IOException;

void close() throws IOException;

/**
* whether to create the index on internal carbon bytes (such as dictionary-encoded values) or on the original value
*/
boolean isIndexForCarbonRawBytes();
}
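A minimal sketch of how a rebuild task might branch on this new flag. RebuildTaskSketch and its scanWith* methods are illustrative assumptions, not CarbonData APIs; the builder interface is stubbed to the methods visible in this diff.

import java.io.IOException;

// Illustrative sketch: choose how to feed a DataMapBuilder based on whether it
// indexes carbon's internal bytes (bloom) or decoded original values (lucene).
public class RebuildTaskSketch {
  interface DataMapBuilder {
    void addRow(int blockletId, int pageId, int rowId, Object[] values) throws IOException;
    void finish() throws IOException;
    void close() throws IOException;
    boolean isIndexForCarbonRawBytes();
  }

  void rebuild(DataMapBuilder builder) throws IOException {
    if (builder.isIndexForCarbonRawBytes()) {
      // bloomfilter case: scan with a raw collector so the builder receives the
      // same internal bytes (dictionary surrogates, LV-stripped values) that the
      // load-time writer indexed
      scanWithRawCollector(builder);
    } else {
      // lucene case: scan with a decoding collector that yields original values
      scanWithDecodedCollector(builder);
    }
    builder.finish();
    builder.close();
  }

  void scanWithRawCollector(DataMapBuilder b) { /* elided */ }
  void scanWithDecodedCollector(DataMapBuilder b) { /* elided */ }
}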
@@ -26,6 +26,7 @@
import org.apache.carbondata.core.scan.collector.impl.RestructureBasedRawResultCollector;
import org.apache.carbondata.core.scan.collector.impl.RestructureBasedVectorResultCollector;
import org.apache.carbondata.core.scan.collector.impl.RowIdBasedResultCollector;
import org.apache.carbondata.core.scan.collector.impl.RowIdRawBasedResultCollector;
import org.apache.carbondata.core.scan.executor.infos.BlockExecutionInfo;

/**
@@ -52,6 +53,9 @@ public static AbstractScannedResultCollector getScannedResultCollector(
if (blockExecutionInfo.isRestructuredBlock()) {
LOGGER.info("Restructure based raw collector is used to scan and collect the data");
scannerResultAggregator = new RestructureBasedRawResultCollector(blockExecutionInfo);
} else if (blockExecutionInfo.isRequiredRowId()) {
LOGGER.info("RowId based raw collector is used to scan and collect the data");
scannerResultAggregator = new RowIdRawBasedResultCollector(blockExecutionInfo);
} else {
LOGGER.info("Row based raw collector is used to scan and collect the data");
scannerResultAggregator = new RawBasedResultCollector(blockExecutionInfo);
@@ -0,0 +1,170 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.core.scan.collector.impl;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.scan.executor.infos.BlockExecutionInfo;
import org.apache.carbondata.core.scan.model.ProjectionMeasure;
import org.apache.carbondata.core.scan.result.BlockletScannedResult;
import org.apache.carbondata.core.scan.wrappers.ByteArrayWrapper;
import org.apache.carbondata.core.stats.QueryStatistic;
import org.apache.carbondata.core.stats.QueryStatisticsConstants;

/**
* It is not a collector; it is just a scanned result holder.
* Most of the lines are copied from `RawBasedResultCollector`; the difference is that this
* class returns all the dimensions in a ByteArrayWrapper and appends blockletNo/pageId/rowId
* at the end of each row.
* This implementation refers to `RawBasedResultCollector` and `RowIdBasedResultCollector`.
*/
public class RowIdRawBasedResultCollector extends AbstractScannedResultCollector {

public RowIdRawBasedResultCollector(BlockExecutionInfo blockExecutionInfos) {
super(blockExecutionInfos);
}

/**
* This method will add a record, both key and value, to the list object.
* It will keep track of how many records are processed, to handle the limit scenario.
*/
@Override
public List<Object[]> collectResultInRow(BlockletScannedResult scannedResult,
int batchSize) {
long startTime = System.currentTimeMillis();
List<Object[]> listBasedResult = new ArrayList<>(batchSize);
ProjectionMeasure[] queryMeasures = executionInfo.getProjectionMeasures();
// scan the record and add to list
scanAndFillData(scannedResult, batchSize, listBasedResult, queryMeasures);
QueryStatistic resultPrepTime = queryStatisticsModel.getStatisticsTypeAndObjMap()
.get(QueryStatisticsConstants.RESULT_PREP_TIME);
resultPrepTime.addCountStatistic(QueryStatisticsConstants.RESULT_PREP_TIME,
resultPrepTime.getCount() + (System.currentTimeMillis() - startTime));
return listBasedResult;
}

/**
* This method will scan and fill dimension and measure data
*
* @param scannedResult
* @param batchSize
* @param listBasedResult
* @param queryMeasures
*/
protected void scanAndFillData(BlockletScannedResult scannedResult, int batchSize,
List<Object[]> listBasedResult, ProjectionMeasure[] queryMeasures) {
int numberOfPages = scannedResult.numberOfpages();
// loop will exit once the batchSize data has been read or the pages have been exhausted
while (scannedResult.getCurrentPageCounter() < numberOfPages) {
int currentPageRowCount = scannedResult.getCurrentPageRowCount();
if (currentPageRowCount == 0) {
scannedResult.incrementPageCounter();
continue;
}
int rowCounter = scannedResult.getRowCounter();
// getRowCounter holds the total number of rows processed. Calculate the
// left over space through getRowCounter only.
int availableRows = currentPageRowCount - rowCounter;
// rows available in current page that can be processed from current page
int availableBatchRowCount = Math.min(batchSize, availableRows);
// this condition will be true if no data left in the current block/blocklet to be scanned
if (availableBatchRowCount < 1) {
break;
}
if (batchSize > availableRows) {
batchSize = batchSize - availableRows;
} else {
// this is done because in IUD cases the actual rows fetched can be less than the batch
// size, as some of the rows could have been deleted. So in those cases batchSize needs
// to be re-initialized with the leftover value
batchSize = 0;
}
// for every iteration of available rows, fill a newly created list of Object[] and add it
// to the final list, so there is no mismatch in the counter while filling dimension and
// measure data
List<Object[]> collectedData = new ArrayList<>(availableBatchRowCount);
// fill dimension data
fillDimensionData(scannedResult, collectedData, queryMeasures, availableBatchRowCount);
fillMeasureData(scannedResult, collectedData);
// the scanned-result row counter is already incremented per row inside fillDimensionData,
// so the batch-wise increment below stays disabled
// incrementScannedResultRowCounter(scannedResult, availableBatchRowCount);
// assign the left over rows to batch size if the number of rows fetched is lesser
// than batchSize
if (collectedData.size() < availableBatchRowCount) {
batchSize += availableBatchRowCount - collectedData.size();
}
// add the collected data to the final list
listBasedResult.addAll(collectedData);
}
}

private void fillDimensionData(BlockletScannedResult scannedResult,
List<Object[]> listBasedResult, ProjectionMeasure[] queryMeasures, int batchSize) {
long startTime = System.currentTimeMillis();
List<byte[]> dictionaryKeyArrayBatch = scannedResult.getDictionaryKeyArrayBatch(batchSize);
List<byte[][]> noDictionaryKeyArrayBatch =
scannedResult.getNoDictionaryKeyArrayBatch(batchSize);
List<byte[][]> complexTypeKeyArrayBatch = scannedResult.getComplexTypeKeyArrayBatch(batchSize);
// it will be the same for one blocklet, so it can be computed only once
byte[] implicitColumnByteArray = scannedResult.getBlockletId()
.getBytes(Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
// Note: the size check in the for loop is on dictionaryKeyArrayBatch, as its size can be
// less than the batch size in IUD scenarios
for (int i = 0; i < dictionaryKeyArrayBatch.size(); i++) {
// 1 for ByteArrayWrapper object which will contain dictionary and no dictionary data
// 3 for blockletId, pageId, rowId
Object[] row = new Object[1 + queryMeasures.length + 3];
scannedResult.incrementCounter();
row[1 + queryMeasures.length] = scannedResult.getBlockletNumber();
row[1 + queryMeasures.length + 1] = scannedResult.getCurrentPageCounter();
ByteArrayWrapper wrapper = new ByteArrayWrapper();
wrapper.setDictionaryKey(dictionaryKeyArrayBatch.get(i));
wrapper.setNoDictionaryKeys(noDictionaryKeyArrayBatch.get(i));
wrapper.setComplexTypesKeys(complexTypeKeyArrayBatch.get(i));
wrapper.setImplicitColumnByteArray(implicitColumnByteArray);
row[0] = wrapper;
row[1 + queryMeasures.length + 2] = scannedResult.getCurrentRowId();
listBasedResult.add(row);
}
QueryStatistic keyColumnFillingTime = queryStatisticsModel.getStatisticsTypeAndObjMap()
.get(QueryStatisticsConstants.KEY_COLUMN_FILLING_TIME);
keyColumnFillingTime.addCountStatistic(QueryStatisticsConstants.KEY_COLUMN_FILLING_TIME,
keyColumnFillingTime.getCount() + (System.currentTimeMillis() - startTime));
}

private void fillMeasureData(BlockletScannedResult scannedResult,
List<Object[]> listBasedResult) {
long startTime = System.currentTimeMillis();
// fill the measure data only if the list is not empty after filling the dimension data
if (!listBasedResult.isEmpty()) {
fillMeasureDataBatch(listBasedResult, 1, scannedResult);
}
QueryStatistic measureFillingTime = queryStatisticsModel.getStatisticsTypeAndObjMap()
.get(QueryStatisticsConstants.MEASURE_FILLING_TIME);
measureFillingTime.addCountStatistic(QueryStatisticsConstants.MEASURE_FILLING_TIME,
measureFillingTime.getCount() + (System.currentTimeMillis() - startTime));
}

private void incrementScannedResultRowCounter(BlockletScannedResult scannedResult,
int batchSize) {
// increment the row counter by the batch size, as that many rows have been processed at once
scannedResult.incrementCounter(batchSize);
}
}
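For reference, each row this collector emits is laid out as [ByteArrayWrapper, measure values..., blockletId, pageId, rowId]. A self-contained sketch of building and reading that layout, with a plain Object standing in for ByteArrayWrapper:

// Sketch of the row layout RowIdRawBasedResultCollector produces: index 0 holds
// the wrapper with all dimension bytes, indexes 1..m hold the m projected
// measures, and the last three slots hold blockletId, pageId and rowId.
public class RowLayoutSketch {
  public static void main(String[] args) {
    int measureCount = 2;
    Object[] row = new Object[1 + measureCount + 3];
    row[0] = "dimension-bytes";          // ByteArrayWrapper in the real collector
    row[1] = 42L;                        // measure 0
    row[2] = 3.14d;                      // measure 1
    row[1 + measureCount] = 7;           // blockletId
    row[1 + measureCount + 1] = 0;       // pageId
    row[1 + measureCount + 2] = 1024;    // rowId

    // a consumer (such as the datamap rebuild path that feeds addRow) reads the
    // position columns from the tail of the row
    int blockletId = (int) row[row.length - 3];
    int pageId = (int) row[row.length - 2];
    int rowId = (int) row[row.length - 1];
    System.out.printf("blocklet=%d page=%d row=%d%n", blockletId, pageId, rowId);
  }
}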
@@ -19,7 +19,6 @@

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
@@ -162,7 +161,7 @@ public List<Blocklet> prune(FilterResolverIntf filterExp, SegmentProperties segm
this.dictCol2MdkIdx = new HashMap<>(indexedColumn.size());
int idx = 0;
for (CarbonDimension dimension : segmentProperties.getDimensions()) {
if (dimension.isDirectDictionaryEncoding() || dimension.isGlobalDictionaryEncoding()) {
if (dimension.isGlobalDictionaryEncoding()) {
if (name2Col.containsKey(dimension.getColName())) {
this.dictCol2MdkIdx.put(dimension.getColName(), idx++);
} else {
@@ -273,14 +272,16 @@ private BloomQueryModel createQueryModel(CarbonColumn carbonColumn, Object filte
try {
if (carbonColumn.isMeasure()) {
if (convertedValue == null) {
convertedValue = getNullValueForMeasure(carbonColumn.getDataType());
convertedValue = DataConvertUtil.getNullValueForMeasure(carbonColumn.getDataType());
}
internalFilterValue =
CarbonUtil.getValueAsBytes(carbonColumn.getDataType(), convertedValue);
} else if (carbonColumn.getEncoder().contains(Encoding.DICTIONARY)) {
// only the indexed dictionary column exists in this MDK, at its corresponding position
byte[] fakeMdk = this.keyGenerator.generateKey(new int[] { (int) convertedValue });
byte[][] fakeKeys = this.columnarSplitter.splitKey(fakeMdk);
int[] fakeMdkOrigin = new int[this.columnarSplitter.getBlockKeySize().length];
fakeMdkOrigin[this.dictCol2MdkIdx.get(carbonColumn.getColName())] = (int) convertedValue;
byte[] fakeMdkBytes = this.keyGenerator.generateKey(fakeMdkOrigin);
byte[][] fakeKeys = this.columnarSplitter.splitKey(fakeMdkBytes);

internalFilterValue = fakeKeys[this.dictCol2MdkIdx.get(carbonColumn.getColName())];
} else {
@@ -295,19 +296,6 @@ private BloomQueryModel createQueryModel(CarbonColumn carbonColumn, Object filte
return new BloomQueryModel(carbonColumn.getColName(), internalFilterValue);
}

/**
* return default null value based on datatype. This method refers to ColumnPage.putNull
*/
private Object getNullValueForMeasure(DataType dataType) {
if (dataType == DataTypes.BOOLEAN) {
return false;
} else if (DataTypes.isDecimal(dataType)) {
return BigDecimal.ZERO;
} else {
return 0;
}
}

@Override
public boolean isScanRequired(FilterResolverIntf filterExp) {
return true;
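The createQueryModel change above contains the core bug fix: the old code built the fake MDK from a single-element array, so for tables with more than one dictionary column the surrogate key could land in the wrong key slot when splitting; the fixed code allocates the full-width array and places the surrogate at the column's own index. A toy illustration of the difference, assuming a simplified one-byte-per-column key generator rather than CarbonData's KeyGenerator:

// Toy illustration (not CarbonData's KeyGenerator) of why the surrogate key must
// sit at the column's own position inside the multi-dimensional key (MDK).
// Assume three dictionary columns, each occupying one byte of the packed key.
public class FakeMdkSketch {
  static byte[] generateKey(int[] surrogates) {
    byte[] key = new byte[surrogates.length];
    for (int i = 0; i < surrogates.length; i++) {
      key[i] = (byte) surrogates[i];
    }
    return key;
  }

  public static void main(String[] args) {
    int surrogate = 5;   // filter value for the third dictionary column
    int colIdx = 2;

    // buggy approach: key built from a single-element array, so the value lands
    // at position 0 and splitting at colIdx would read the wrong byte
    byte[] buggy = generateKey(new int[] { surrogate });

    // fixed approach: full-width array with the surrogate at the column's index
    int[] origin = new int[3];
    origin[colIdx] = surrogate;
    byte[] fixed = generateKey(origin);

    System.out.println(buggy.length);   // 1 -- too short to split per column
    System.out.println(fixed[colIdx]);  // 5 -- correct slot for this column
  }
}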
@@ -25,7 +25,6 @@
import org.apache.carbondata.core.datamap.Segment;
import org.apache.carbondata.core.datamap.dev.DataMapBuilder;
import org.apache.carbondata.core.metadata.datatype.DataType;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.core.util.CarbonUtil;

@@ -42,8 +41,6 @@ public class BloomDataMapBuilder extends BloomDataMapWriter implements DataMapBu
boolean bloomCompress) throws IOException {
super(tablePath, dataMapName, indexColumns, segment, shardName, bloomFilterSize, bloomFilterFpp,
bloomCompress);
throw new RuntimeException(
"Deferred rebuild for bloomfilter datamap is currently not supported");
}

@Override
@@ -63,17 +60,21 @@ public void addRow(int blockletId, int pageId, int rowId, Object[] values) {
for (int i = 0; i < indexColumns.size(); i++) {
Object data = values[i];
DataType dataType = indexColumns.get(i).getDataType();
// todo: the index value should refer to that in BloomDataMapWriter
// the index value must be consistent with the one generated in BloomDataMapWriter
byte[] indexValue;
if (DataTypes.STRING == dataType) {
indexValue = getStringData(data);
} else if (DataTypes.BYTE_ARRAY == dataType) {
byte[] originValue = (byte[]) data;
// String and byte array is LV encoded, L is short type
indexValue = new byte[originValue.length - 2];
System.arraycopy(originValue, 2, indexValue, 0, originValue.length - 2);
if (indexColumns.get(i).isMeasure()) {
if (data == null) {
indexValue = CarbonUtil.getValueAsBytes(dataType,
DataConvertUtil.getNullValueForMeasure(dataType));
} else {
indexValue = CarbonUtil.getValueAsBytes(dataType, data);
}
} else {
indexValue = CarbonUtil.getValueAsBytes(dataType, data);
assert data instanceof byte[];
indexValue = (byte[]) data;
}
if (indexValue.length == 0) {
indexValue = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
}
indexBloomFilters.get(i).add(new Key(indexValue));
}
@@ -93,4 +94,12 @@ public void close() throws IOException {
protected byte[] getStringData(Object data) {
return ((String) data).getBytes(CarbonCommonConstants.DEFAULT_CHARSET_CLASS);
}

/**
* currently the bloom filter index is built on carbon internal bytes, not on the original value
*/
@Override
public boolean isIndexForCarbonRawBytes() {
return true;
}
}
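A condensed sketch of the branching addRow now performs: measure values (possibly null) are converted to the same byte form the load-time writer used, while dimension values already arrive as carbon raw bytes from the raw collector. toBytes and the int-only measure handling here are simplifying assumptions standing in for CarbonUtil.getValueAsBytes and the full datatype handling:

import java.nio.ByteBuffer;

// Condensed, self-contained sketch of addRow's measure/dimension branching.
// The real code also substitutes MEMBER_DEFAULT_VAL_ARRAY for empty values.
public class AddRowSketch {
  static byte[] indexValueFor(boolean isMeasure, Object data) {
    if (isMeasure) {
      // null measures fall back to the datatype default (0 here for int),
      // as in DataConvertUtil.getNullValueForMeasure
      Object value = (data == null) ? 0 : data;
      return toBytes((Integer) value);
    }
    // dimensions already arrive as carbon internal bytes from the raw collector
    return (byte[]) data;
  }

  static byte[] toBytes(int v) {
    return ByteBuffer.allocate(4).putInt(v).array();
  }

  public static void main(String[] args) {
    System.out.println(indexValueFor(true, null).length);               // 4
    System.out.println(indexValueFor(false, new byte[] { 1, 2 }).length); // 2
  }
}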
@@ -17,6 +17,11 @@

package org.apache.carbondata.datamap.bloom;

import java.math.BigDecimal;

import org.apache.carbondata.core.metadata.datatype.DataType;
import org.apache.carbondata.core.metadata.datatype.DataTypes;

public class DataConvertUtil {
public static byte[] getRawBytes(byte[] lvData) {
byte[] indexValue = new byte[lvData.length - 2];
@@ -29,4 +34,17 @@ public static byte[] getRawBytesForVarchar(byte[] lvData) {
System.arraycopy(lvData, 4, indexValue, 0, lvData.length - 4);
return indexValue;
}

/**
* return default null value based on datatype. This method refers to ColumnPage.putNull
*/
public static Object getNullValueForMeasure(DataType dataType) {
if (dataType == DataTypes.BOOLEAN) {
return false;
} else if (DataTypes.isDecimal(dataType)) {
return BigDecimal.ZERO;
} else {
return 0;
}
}
}
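These helpers strip the length prefix from LV (length-value) encoded bytes: a 2-byte short for regular strings and byte arrays, a 4-byte int for varchar. A round-trip sketch, with lvEncode as an illustrative stand-in for the write path that produces such bytes:

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

// Sketch of the LV layout that getRawBytes/getRawBytesForVarchar undo. Stripping
// the prefix recovers the value bytes the bloom filter hashes at load time.
public class LvEncodingSketch {
  static byte[] lvEncode(byte[] value) {
    // 2-byte short length prefix followed by the value bytes
    return ByteBuffer.allocate(2 + value.length)
        .putShort((short) value.length).put(value).array();
  }

  static byte[] getRawBytes(byte[] lvData) { // mirrors DataConvertUtil.getRawBytes
    byte[] indexValue = new byte[lvData.length - 2];
    System.arraycopy(lvData, 2, indexValue, 0, lvData.length - 2);
    return indexValue;
  }

  public static void main(String[] args) {
    byte[] raw = "carbon".getBytes(StandardCharsets.UTF_8);
    byte[] lv = lvEncode(raw);
    // round trip: the stripped bytes equal the original value bytes
    System.out.println(new String(getRawBytes(lv), StandardCharsets.UTF_8)); // carbon
  }
}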
@@ -172,4 +172,8 @@ public void close() throws IOException {
}
}

@Override
public boolean isIndexForCarbonRawBytes() {
return false;
}
}
