aoblkdir: remove hole filling mechanism
Background:

(1) Block directory rowCount hole filling: firstRowNum is allocated on
the basis of gp_fastsequence and is not always contiguous with the last
minipage entry's (firstRowNum + rowCount). Before we insert the next
minipage entry, we "fill" the rowCount of the previous one so that the
ranges of successive minipage entries appear contiguous.

(2) Hole filling in action:
insert into foo values(1);
 -> inserts entry A with (firstRowNum, rowCount) = (1, 1)
insert into foo values(1);
 -> the new row gets firstRowNum = 101 from gp_fastsequence, so we first
    "fill" the previous entry A by updating it to (firstRowNum, rowCount) =
    (1, 100), and then insert entry B with (firstRowNum, rowCount) = (101, 1),
    ensuring that there are no holes between successive entries.

Motivation:

There is no apparent reason why this mechanism is necessary, and it becomes
a hindrance to future work to support unique indexes. That work depends on
the continuity of a block directory entry's range when resolving unique
index lookups.

Changes:

(1) Removing this mechanism establishes the invariant: fetching physical
rows anywhere in the continuous range between a block directory entry's
first and last row numbers will always succeed.

(2) We enforce this invariant with suitable ERRORs inside the fetch
machinery. However, since hole filling will still exist in tables written
by older versions, we bump AORelationVersion and ERROR out only for the
current version and onwards.
Note: We use the formatversion attribute of the ao(cs)seg relations instead
of the unused Minipage->version member. Even though Minipage->version makes
more semantic sense and constitutes an equivalent condition for the ERROR,
it would mean more work for the code that bans unique index creation on
older-version tables: we would have to check every minipage in the block
directory, which is likely more expensive than checking the limited number
of ao(cs)seg tuples.
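
In the fetch paths, the enforcement boils down to the following condensed
sketch (abridged from the appendonlyam.c/aocsam.c hunks below; errdetail
omitted):

    if (!fetched && AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum))
    {
        /*
         * rowNum falls inside the resolved block directory entry's range but
         * the physical fetch missed it, i.e. a hole. Only tolerated for
         * relations written before AORelationVersion_PG12.
         */
        ereportif(storageRead->formatVersion >= AORelationVersion_PG12,
                  ERROR,
                  (errcode(ERRCODE_INTERNAL_ERROR),
                   errmsg("tuple with row number %ld not found in block directory entry range",
                          rowNum)));
    }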

Co-authored-by: Ashwin Agrawal <aashwin@vmware.com>
soumyadeep2007 and ashwinstar committed Sep 18, 2022
1 parent ec10c4d commit 258ec96
Showing 18 changed files with 1,208 additions and 406 deletions.
52 changes: 48 additions & 4 deletions src/backend/access/aocs/aocsam.c
@@ -1118,6 +1118,10 @@ positionSkipCurrentBlock(DatumStreamFetchDesc datumStreamFetchDesc)
datumStreamFetchDesc->currentBlock.lastRowNum + 1;
}

/*
* Fetch the tuple's datum from the block indicated by the block directory entry
* that covers the tuple, given the colno.
*/
static void
fetchFromCurrentBlock(AOCSFetchDesc aocsFetchDesc,
int64 rowNum,
@@ -1177,14 +1181,49 @@ scanToFetchValue(AOCSFetchDesc aocsFetchDesc,
TupleTableSlot *slot,
int colno)
{
DatumStreamFetchDesc datumStreamFetchDesc = aocsFetchDesc->datumStreamFetchDesc[colno];
DatumStreamRead *datumStream = datumStreamFetchDesc->datumStream;
bool found;
DatumStreamFetchDesc datumStreamFetchDesc = aocsFetchDesc->datumStreamFetchDesc[colno];
DatumStreamRead *datumStream = datumStreamFetchDesc->datumStream;
AOFetchBlockMetadata *currentBlock = &datumStreamFetchDesc->currentBlock;
AppendOnlyBlockDirectoryEntry *entry = &currentBlock->blockDirectoryEntry;
bool found;

found = datumstreamread_find_block(datumStream,
datumStreamFetchDesc,
rowNum);
if (found)
if (!found)
{
if (AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum))
{
/*
* We fell into a hole inside the resolved block directory entry
* we obtained from AppendOnlyBlockDirectory_GetEntry().
* This should not be happening for versions >= PG12. Scream
* appropriately. See AppendOnlyBlockDirectoryEntry for details.
*/
ereportif(datumStream->ao_read.formatVersion >= AORelationVersion_PG12,
ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("datum with row number %ld and col no %d not found in block directory entry range", rowNum, colno),
errdetail("block directory entry: (fileOffset = %ld, firstRowNum = %ld, "
"afterFileOffset = %ld, lastRowNum = %ld)",
entry->range.fileOffset,
entry->range.firstRowNum,
entry->range.afterFileOffset,
entry->range.lastRowNum)));
}
else
{
/*
* The resolved block directory entry we obtained from
* AppendOnlyBlockDirectory_GetEntry() has range s.t.
* firstRowNum < lastRowNum < rowNum
* This can happen when rowNum maps to an aborted transaction, and
* we find an earlier committed block directory row due to the
* <= scan condition in AppendOnlyBlockDirectory_GetEntry().
*/
}
}
else
fetchFromCurrentBlock(aocsFetchDesc, rowNum, slot, colno);

return found;
@@ -1258,6 +1297,11 @@ openFetchSegmentFile(AOCSFetchDesc aocsFetchDesc,
return true;
}

/*
* Note: we don't reset the block directory entry here. This is crucial, so we
* can use the block directory entry later on. See comment in AOFetchBlockMetadata
* FIXME: reset other fields here.
*/
static void
resetCurrentBlockInfo(AOFetchBlockMetadata *currentBlock)
{
71 changes: 57 additions & 14 deletions src/backend/access/appendonly/appendonlyam.c
@@ -1889,30 +1889,69 @@ fetchNextBlock(AppendOnlyFetchDesc aoFetchDesc)
return true;
}

static bool
/*
* Fetch the tuple from the block indicated by the block directory entry that
* covers the tuple.
*/
static void
fetchFromCurrentBlock(AppendOnlyFetchDesc aoFetchDesc,
int64 rowNum,
TupleTableSlot *slot)
{
Assert(aoFetchDesc->currentBlock.valid);
Assert(rowNum >= aoFetchDesc->currentBlock.firstRowNum);
Assert(rowNum <= aoFetchDesc->currentBlock.lastRowNum);
bool fetched;
AOFetchBlockMetadata *currentBlock = &aoFetchDesc->currentBlock;
AppendOnlyExecutorReadBlock *executorReadBlock = &aoFetchDesc->executorReadBlock;
AppendOnlyStorageRead *storageRead = &aoFetchDesc->storageRead;
AppendOnlyBlockDirectoryEntry *entry = &currentBlock->blockDirectoryEntry;

if (!aoFetchDesc->currentBlock.gotContents)
if (!currentBlock->gotContents)
{
/*
* Do decompression if necessary and get contents.
*/
AppendOnlyExecutorReadBlock_GetContents(&aoFetchDesc->executorReadBlock);
AppendOnlyExecutorReadBlock_GetContents(executorReadBlock);

aoFetchDesc->currentBlock.gotContents = true;
currentBlock->gotContents = true;
}

return AppendOnlyExecutorReadBlock_FetchTuple(&aoFetchDesc->executorReadBlock,
rowNum,
/* nkeys */ 0,
/* key */ NULL,
slot);
fetched = AppendOnlyExecutorReadBlock_FetchTuple(executorReadBlock,
rowNum,
/* nkeys */ 0,
/* key */ NULL,
slot);
if (!fetched)
{
if (AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum))
{
/*
* We fell into a hole inside the resolved block directory entry
* we obtained from AppendOnlyBlockDirectory_GetEntry().
* This should not be happening for versions >= PG12. Scream
* appropriately. See AppendOnlyBlockDirectoryEntry for details.
*/
ereportif(storageRead->formatVersion >= AORelationVersion_PG12,
ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("tuple with row number %ld not found in block directory entry range", rowNum),
errdetail("block directory entry: (fileOffset = %ld, firstRowNum = %ld, "
"afterFileOffset = %ld, lastRowNum = %ld)",
entry->range.fileOffset,
entry->range.firstRowNum,
entry->range.afterFileOffset,
entry->range.lastRowNum)));
}
else
{
/*
* The resolved block directory entry we obtained from
* AppendOnlyBlockDirectory_GetEntry() has range s.t.
* firstRowNum < lastRowNum < rowNum
* This can happen when rowNum maps to an aborted transaction, and
* we find an earlier committed block directory row due to the
* <= scan condition in AppendOnlyBlockDirectory_GetEntry().
*/
}
}
}

static void
@@ -2017,7 +2056,10 @@ scanToFetchTuple(AppendOnlyFetchDesc aoFetchDesc,
}

if (rowNum <= aoFetchDesc->currentBlock.lastRowNum)
return fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
{
fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
return true;
}

/*
* Update information to get next block.
@@ -2265,7 +2307,8 @@ appendonly_fetch(AppendOnlyFetchDesc aoFetchDesc,
}
return false; /* row has been deleted or updated. */
}
return fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
return true;
}

/*
46 changes: 15 additions & 31 deletions src/backend/access/appendonly/appendonlyblockdirectory.c
@@ -578,7 +578,12 @@ AppendOnlyBlockDirectory_GetEntry(
/* Ignore columns that are not projected. */
continue;
}
/* Setup the scan keys for the scan. */
/*
* Set up the scan key values. The keys have already been set up in
* init_internal() with the following strategy:
* (=segmentFileNum, =columnGroupNo, <=rowNum)
* See init_internal().
*/
Assert(scanKeys != NULL);
scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum);
scanKeys[1].sk_argument = Int32GetDatum(tmpGroupNo);
@@ -641,6 +646,15 @@
/*
* Since the last few blocks may not be logged in the block
* directory, we always use the last entry.
*
* FIXME: If we didn't find a suitable entry, why even use the last
* entry? As it stands, we would most likely return true from this
* function, forcing the layer above (see scanToFetchTuple()) to fetch
* the tuple from the physical file, only to ultimately find it missing.
* Would it be correct to set the directory entry here to be the last
* one (for caching purposes) and return false, in order to avoid this
* physical file read?
*/
entry_no = minipageInfo->numMinipageEntries - 1;
}
@@ -702,7 +716,6 @@ insert_new_entry(
MinipageEntry *entry = NULL;
MinipagePerColumnGroup *minipageInfo;
int minipageIndex;
int lastEntryNo;

if (rowCount == 0)
return false;
@@ -732,35 +745,6 @@
minipageInfo = &blockDirectory->minipages[minipageIndex];
Assert(minipageInfo->numMinipageEntries <= (uint32) NUM_MINIPAGE_ENTRIES);

lastEntryNo = minipageInfo->numMinipageEntries - 1;
if (lastEntryNo >= 0)
{
entry = &(minipageInfo->minipage->entry[lastEntryNo]);

Assert(entry->firstRowNum < firstRowNum);
Assert(entry->fileOffset < fileOffset);

if (gp_blockdirectory_entry_min_range > 0 &&
fileOffset - entry->fileOffset < gp_blockdirectory_entry_min_range)
return true;

/* Update the rowCount in the latest entry */
Assert(entry->rowCount <= firstRowNum - entry->firstRowNum);

ereportif(Debug_appendonly_print_blockdirectory, LOG,
(errmsg("Append-only block directory update entry: "
"(firstRowNum, columnGroupNo, fileOffset, rowCount) = (" INT64_FORMAT
", %d, " INT64_FORMAT ", " INT64_FORMAT ") at index %d to "
"(firstRowNum, columnGroupNo, fileOffset, rowCount) = (" INT64_FORMAT
", %d, " INT64_FORMAT ", " INT64_FORMAT ")",
entry->firstRowNum, columnGroupNo, entry->fileOffset, entry->rowCount,
minipageInfo->numMinipageEntries - 1,
entry->firstRowNum, columnGroupNo, entry->fileOffset,
firstRowNum - entry->firstRowNum)));

entry->rowCount = firstRowNum - entry->firstRowNum;
}

if (minipageInfo->numMinipageEntries >= (uint32) gp_blockdirectory_minipage_size)
{
write_minipage(blockDirectory, columnGroupNo, minipageInfo);
3 changes: 2 additions & 1 deletion src/include/catalog/pg_appendonly.h
@@ -69,10 +69,11 @@ typedef enum AORelationVersion
* were introduced, see MPP-7251 and MPP-7372. */
AORelationVersion_PG83 = 3, /* Same as Aligned64bit, but numerics are stored
* in the PostgreSQL 8.3 format. */
AORelationVersion_PG12 = 4, /* version that removed block directory hole filling. */
MaxAORelationVersion /* must always be last */
} AORelationVersion;

#define AORelationVersion_GetLatest() AORelationVersion_PG83
#define AORelationVersion_GetLatest() AORelationVersion_PG12

#define AORelationVersion_IsValid(version) \
(version > AORelationVersion_None && version < MaxAORelationVersion)
19 changes: 18 additions & 1 deletion src/include/cdb/cdbappendonlyblockdirectory.h
@@ -23,10 +23,21 @@
extern int gp_blockdirectory_entry_min_range;
extern int gp_blockdirectory_minipage_size;

/*
* In-memory equivalent of on-disk data structure MinipageEntry, used to
* represent a block directory entry.
*/
typedef struct AppendOnlyBlockDirectoryEntry
{
/*
* The range of blocks covered by the Block Directory entry.
* The range of blocks covered by the Block Directory entry, which is the
* continuous range [firstRowNum, lastRowNum]. There are no gaps (or holes)
* within this range. However, there may be gaps between successive block
* directory entries. For example, entry0 could have range [1,50] and entry1
* could have range [100,150]. Gaps arise between successive entries because
* we allocate row numbers using the gp_fastsequence mechanism, which hands
* out blocks of row numbers of a pre-determined size (which may be larger
* than the number of rows being inserted).
*/
struct range
{
@@ -126,6 +137,12 @@ typedef struct AppendOnlyBlockDirectory

typedef struct AOFetchBlockMetadata
{
/*
* Current cached block directory entry.
* FIXME: At times, we rely upon the values in this struct to be valid even
* when AOFetchBlockMetadata->valid = false. This indicates that this should
* live elsewhere.
*/
AppendOnlyBlockDirectoryEntry blockDirectoryEntry;

/*