From 4d0a0d0ced055f80a2f006a154c1b06cf5c6ee24 Mon Sep 17 00:00:00 2001 From: xuchuanyin Date: Fri, 13 Apr 2018 16:18:23 +0800 Subject: [PATCH 1/2] Fix bugs in mapping blocklet to UnsafeDMStore In BlockletDataMap, carbondata stores DMRow in an array for each blocklet. But currently carbondata accesses the DMRow only by blockletId(0, 1, etc.), which will cause problem since different block can have same blockletId. This PR adds a map to map the blockId#blockletId to array index, carbondata can access the DMRow by blockId and blockletId. --- .../blockletindex/BlockletDataMap.java | 18 +++++++++++++++--- .../blockletindex/BlockletDataMapFactory.java | 6 +++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java index 3ca9c5a91ef..48394bc2d56 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java @@ -21,6 +21,7 @@ import java.io.DataInputStream; import java.io.DataOutput; import java.io.DataOutputStream; +import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.math.BigDecimal; @@ -28,7 +29,9 @@ import java.util.ArrayList; import java.util.BitSet; import java.util.Comparator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.carbondata.common.logging.LogService; import org.apache.carbondata.common.logging.LogServiceFactory; @@ -125,6 +128,10 @@ public class BlockletDataMap extends CoarseGrainDataMap implements Cacheable { private SegmentProperties segmentProperties; private int[] columnCardinality; + /** + * map blocklet (blockFileName#blockletNo) to index in UnsafeMemoryDMStore rows + */ + private Map blocklet2DMRowIdx = new HashMap<>(); @Override public void init(DataMapModel dataMapModel) throws IOException, MemoryException { @@ -265,10 +272,14 @@ private DataMapRowImpl loadToUnsafe(DataFileFooter fileFooter, setLocations(blockMetaInfo.getLocationInfo(), row, ordinal); ordinal++; // for relative blockelt id i.e blocklet id that belongs to a particular part file - row.setShort((short) relativeBlockletId++, ordinal++); + row.setShort((short) relativeBlockletId, ordinal++); // Store block size row.setLong(blockMetaInfo.getSize(), ordinal); + String blockFileName = filePath.substring(filePath.lastIndexOf(File.separator) + 1); + this.blocklet2DMRowIdx.put(blockFileName + '#' + relativeBlockletId, + unsafeMemoryDMStore.getRowCount()); unsafeMemoryDMStore.addIndexRowToUnsafe(row); + relativeBlockletId++; } catch (Exception e) { throw new RuntimeException(e); } @@ -731,8 +742,8 @@ private boolean addBlockBasedOnMinMaxValue(FilterExecuter filterExecuter, byte[] } } - public ExtendedBlocklet getDetailedBlocklet(String blockletId) { - int index = Integer.parseInt(blockletId); + public ExtendedBlocklet getDetailedBlocklet(String blockFileName, String blockletId) { + int index = this.blocklet2DMRowIdx.get(blockFileName + '#' + blockletId); DataMapRow safeRow = unsafeMemoryDMStore.getUnsafeRow(index).convertToSafeRow(); return createBlocklet(safeRow, safeRow.getShort(BLOCKLET_ID_INDEX)); } @@ -930,6 +941,7 @@ private byte[] convertSchemaToBinary(List columnSchemas) throws IO @Override public void clear() { + blocklet2DMRowIdx.clear(); if (unsafeMemoryDMStore != null) { unsafeMemoryDMStore.freeMemory(); unsafeMemoryDMStore = null; diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java index caac73325d3..c6c89179165 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java @@ -16,6 +16,7 @@ */ package org.apache.carbondata.core.indexstore.blockletindex; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -149,7 +150,10 @@ private ExtendedBlocklet getExtendedBlocklet(List Date: Sat, 21 Apr 2018 09:55:22 +0800 Subject: [PATCH 2/2] optimize storage for unique blocklet add a class to hold unique blocklet instead of plain string --- .../blockletindex/BlockletDataMap.java | 54 ++++++++++++++++--- .../blockletindex/BlockletDataMapFactory.java | 15 ++++-- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java index 48394bc2d56..1fd7144ec69 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java @@ -27,6 +27,7 @@ import java.math.BigDecimal; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.BitSet; import java.util.Comparator; import java.util.HashMap; @@ -129,9 +130,41 @@ public class BlockletDataMap extends CoarseGrainDataMap implements Cacheable { private int[] columnCardinality; /** - * map blocklet (blockFileName#blockletNo) to index in UnsafeMemoryDMStore rows + * map blocklet (segId,blockNo,blockletNo) to index in UnsafeMemoryDMStore rows */ - private Map blocklet2DMRowIdx = new HashMap<>(); + private Map blocklet2DMRowIdx = new HashMap<>(); + + /** + * class to store simple information about a blocklet for less memory footprint. Since part number + * is always 0, so skip it. + */ + private static class SimpleBlockletInfo { + byte[] segmentId; + short blockNo; + short blockletNo; + + SimpleBlockletInfo(byte[] segmentId, short blockNo, short blockletNo) { + this.segmentId = segmentId; + this.blockNo = blockNo; + this.blockletNo = blockletNo; + } + + @Override + public int hashCode() { + int result = Arrays.hashCode(segmentId); + result = 31 * result + blockNo; + result = 31 * result + blockletNo; + return result; + } + + @Override + public boolean equals(Object obj) { + return obj instanceof SimpleBlockletInfo + && Arrays.equals(segmentId, ((SimpleBlockletInfo) obj).segmentId) + && blockNo == (((SimpleBlockletInfo) obj).blockNo) + && blockletNo == (((SimpleBlockletInfo) obj).blockletNo); + } + } @Override public void init(DataMapModel dataMapModel) throws IOException, MemoryException { @@ -184,7 +217,7 @@ public void init(DataMapModel dataMapModel) throws IOException, MemoryException } summaryRow = loadToUnsafe(fileFooter, segmentProperties, blockInfo.getFilePath(), summaryRow, - blockMetaInfo, relativeBlockletId); + blockMetaInfo, relativeBlockletId, segmentId); // this is done because relative blocklet id need to be incremented based on the // total number of blocklets relativeBlockletId += fileFooter.getBlockletList().size(); @@ -210,7 +243,7 @@ public void init(DataMapModel dataMapModel) throws IOException, MemoryException private DataMapRowImpl loadToUnsafe(DataFileFooter fileFooter, SegmentProperties segmentProperties, String filePath, DataMapRowImpl summaryRow, - BlockMetaInfo blockMetaInfo, int relativeBlockletId) { + BlockMetaInfo blockMetaInfo, int relativeBlockletId, byte[] segId) { int[] minMaxLen = segmentProperties.getColumnsValueSize(); List blockletList = fileFooter.getBlockletList(); CarbonRowSchema[] schema = unsafeMemoryDMStore.getSchema(); @@ -276,7 +309,10 @@ private DataMapRowImpl loadToUnsafe(DataFileFooter fileFooter, // Store block size row.setLong(blockMetaInfo.getSize(), ordinal); String blockFileName = filePath.substring(filePath.lastIndexOf(File.separator) + 1); - this.blocklet2DMRowIdx.put(blockFileName + '#' + relativeBlockletId, + short blockNo = Short.parseShort(blockFileName.substring("part-".length(), + blockFileName.indexOf("-", "part-".length() + 1))); + this.blocklet2DMRowIdx.put( + new SimpleBlockletInfo(segId, blockNo, (short) relativeBlockletId), unsafeMemoryDMStore.getRowCount()); unsafeMemoryDMStore.addIndexRowToUnsafe(row); relativeBlockletId++; @@ -742,8 +778,12 @@ private boolean addBlockBasedOnMinMaxValue(FilterExecuter filterExecuter, byte[] } } - public ExtendedBlocklet getDetailedBlocklet(String blockFileName, String blockletId) { - int index = this.blocklet2DMRowIdx.get(blockFileName + '#' + blockletId); + public ExtendedBlocklet getDetailedBlocklet(byte[] segId, String blockFileName, + String blockletId) { + short blockNo = Short.parseShort(blockFileName.substring("part-".length(), + blockFileName.indexOf("-", "part-".length() + 1))); + int index = this.blocklet2DMRowIdx.get( + new SimpleBlockletInfo(segId, blockNo, Short.parseShort(blockletId))); DataMapRow safeRow = unsafeMemoryDMStore.getUnsafeRow(index).convertToSafeRow(); return createBlocklet(safeRow, safeRow.getShort(BLOCKLET_ID_INDEX)); } diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java index c6c89179165..6df07d4cf23 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMapFactory.java @@ -26,6 +26,7 @@ import org.apache.carbondata.core.cache.Cache; import org.apache.carbondata.core.cache.CacheProvider; import org.apache.carbondata.core.cache.CacheType; +import org.apache.carbondata.core.constants.CarbonCommonConstants; import org.apache.carbondata.core.datamap.DataMapDistributable; import org.apache.carbondata.core.datamap.DataMapMeta; import org.apache.carbondata.core.datamap.Segment; @@ -126,9 +127,10 @@ public List getExtendedBlocklets(List blocklets, Seg } List identifiers = getTableBlockIndexUniqueIdentifiers(segment); + byte[] segId = segment.getSegmentNo().getBytes(CarbonCommonConstants.DEFAULT_CHARSET); // Retrieve each blocklets detail information from blocklet datamap for (Blocklet blocklet : blocklets) { - detailedBlocklets.add(getExtendedBlocklet(identifiers, blocklet)); + detailedBlocklets.add(getExtendedBlocklet(segId, identifiers, blocklet)); } return detailedBlocklets; } @@ -141,18 +143,21 @@ public ExtendedBlocklet getExtendedBlocklet(Blocklet blocklet, Segment segment) } List identifiers = getTableBlockIndexUniqueIdentifiers(segment); - return getExtendedBlocklet(identifiers, blocklet); + return getExtendedBlocklet( + segment.getSegmentNo().getBytes(CarbonCommonConstants.DEFAULT_CHARSET), + identifiers, + blocklet); } - private ExtendedBlocklet getExtendedBlocklet(List identifiers, - Blocklet blocklet) throws IOException { + private ExtendedBlocklet getExtendedBlocklet(byte[] segId, + List identifiers, Blocklet blocklet) throws IOException { String carbonIndexFileName = CarbonTablePath.getCarbonIndexFileName(blocklet.getBlockId()); for (TableBlockIndexUniqueIdentifier identifier : identifiers) { if (identifier.getIndexFileName().equals(carbonIndexFileName)) { DataMap dataMap = cache.get(identifier); String blockFileName = blocklet.getBlockId().substring( blocklet.getBlockId().lastIndexOf(File.separatorChar) + 1); - return ((BlockletDataMap) dataMap).getDetailedBlocklet(blockFileName, + return ((BlockletDataMap) dataMap).getDetailedBlocklet(segId, blockFileName, blocklet.getBlockletId()); } }