aoblkdir: remove hole filling mechanism
Background:

(1) Block directory rowCount hole filling: firstRowNum is allocated on
the basis of gp_fastsequence and is not always contiguous with the last
minipage entry's (firstRowNum + rowCount). Before we insert the next
minipage entry, we "fill" the rowCount of the previous one so that the
ranges of successive minipage entries appear contiguous.

(2) Hole filling in action:
insert into foo values(1);
 -> inserts entry A with (firstRowNum, rowCount) = (1, 1)
insert into foo values(1);
 -> the new row gets firstRowNum = 101 from gp_fastsequence, so we first
    "fill" the previous entry A by updating it to (firstRowNum, rowCount) =
    (1, 100), and then insert entry B with (firstRowNum, rowCount) = (101, 1),
    ensuring that there are no holes between successive entries.

Motivation:

There is no apparent reason why this mechanism is necessary, and it becomes
a hindrance to future work to support unique indexes. That work depends on
the continuity of a block directory entry's range when resolving unique
index lookups.

Changes:

(1) Removing this mechanism establishes the invariant: fetching physical
rows anywhere in the continuous range between a block directory entry's
first and last row numbers will always succeed.

(2) We enforce this invariant with suitable ERRORs inside the fetch
machinery. However, since hole filling will still exist in tables written
by older versions, we bump AORelationVersion and ERROR out only for the
current version and onwards.
Note: We use the formatversion attribute of the ao(cs)seg relations instead
of the unused Minipage->version member. Even though Minipage->version makes
more semantic sense and constitutes an equivalent condition for the ERROR,
it would mean more work for the code that bans unique index creation on
older-version tables: we would have to check every minipage in the block
directory, which is likely more expensive than checking the limited number
of ao(cs)seg tuples.
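
In the fetch paths, the enforcement boils down to the following condensed
sketch (abridged from the appendonlyam.c/aocsam.c hunks below; errdetail
omitted):

    if (!fetched && AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum))
    {
        /*
         * rowNum falls inside the resolved block directory entry's range but
         * the physical fetch missed it, i.e. a hole. Only tolerated for
         * relations written before AORelationVersion_PG12.
         */
        ereportif(storageRead->formatVersion >= AORelationVersion_PG12,
                  ERROR,
                  (errcode(ERRCODE_INTERNAL_ERROR),
                   errmsg("tuple with row number %ld not found in block directory entry range",
                          rowNum)));
    }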

Co-authored-by: Ashwin Agrawal <aashwin@vmware.com>
soumyadeep2007 and ashwinstar committed Sep 18, 2022
1 parent ec10c4d commit 258ec96
Showing 18 changed files with 1,208 additions and 406 deletions.
52 changes: 48 additions & 4 deletions src/backend/access/aocs/aocsam.c
@@ -1118,6 +1118,10 @@ positionSkipCurrentBlock(DatumStreamFetchDesc datumStreamFetchDesc)
datumStreamFetchDesc->currentBlock.lastRowNum + 1;
}

/*
* Fetch the tuple's datum from the block indicated by the block directory entry
* that covers the tuple, given the colno.
*/
static void
fetchFromCurrentBlock(AOCSFetchDesc aocsFetchDesc,
int64 rowNum,
@@ -1177,14 +1181,49 @@ scanToFetchValue(AOCSFetchDesc aocsFetchDesc,
TupleTableSlot *slot,
int colno)
{
DatumStreamFetchDesc datumStreamFetchDesc = aocsFetchDesc->datumStreamFetchDesc[colno];
DatumStreamRead *datumStream = datumStreamFetchDesc->datumStream;
bool found;
DatumStreamFetchDesc datumStreamFetchDesc = aocsFetchDesc->datumStreamFetchDesc[colno];
DatumStreamRead *datumStream = datumStreamFetchDesc->datumStream;
AOFetchBlockMetadata *currentBlock = &datumStreamFetchDesc->currentBlock;
AppendOnlyBlockDirectoryEntry *entry = &currentBlock->blockDirectoryEntry;
bool found;

found = datumstreamread_find_block(datumStream,
datumStreamFetchDesc,
rowNum);
if (found)
if (!found)
{
if (AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum))
{
/*
* We fell into a hole inside the resolved block directory entry
* we obtained from AppendOnlyBlockDirectory_GetEntry().
* This should not be happening for versions >= PG12. Scream
* appropriately. See AppendOnlyBlockDirectoryEntry for details.
*/
ereportif(datumStream->ao_read.formatVersion >= AORelationVersion_PG12,
ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("datum with row number %ld and col no %d not found in block directory entry range", rowNum, colno),
errdetail("block directory entry: (fileOffset = %ld, firstRowNum = %ld, "
"afterFileOffset = %ld, lastRowNum = %ld)",
entry->range.fileOffset,
entry->range.firstRowNum,
entry->range.afterFileOffset,
entry->range.lastRowNum)));
}
else
{
/*
* The resolved block directory entry we obtained from
* AppendOnlyBlockDirectory_GetEntry() has range s.t.
* firstRowNum < lastRowNum < rowNum
* This can happen when rowNum maps to an aborted transaction, and
* we find an earlier committed block directory row due to the
* <= scan condition in AppendOnlyBlockDirectory_GetEntry().
*/
}
}
else
fetchFromCurrentBlock(aocsFetchDesc, rowNum, slot, colno);

return found;
@@ -1258,6 +1297,11 @@ openFetchSegmentFile(AOCSFetchDesc aocsFetchDesc,
return true;
}

/*
* Note: we don't reset the block directory entry here. This is crucial, so we
* can use the block directory entry later on. See comment in AOFetchBlockMetadata
* FIXME: reset other fields here.
*/
static void
resetCurrentBlockInfo(AOFetchBlockMetadata *currentBlock)
{
71 changes: 57 additions & 14 deletions src/backend/access/appendonly/appendonlyam.c
@@ -1889,30 +1889,69 @@ fetchNextBlock(AppendOnlyFetchDesc aoFetchDesc)
return true;
}

static bool
/*
* Fetch the tuple from the block indicated by the block directory entry that
* covers the tuple.
*/
static void
fetchFromCurrentBlock(AppendOnlyFetchDesc aoFetchDesc,
int64 rowNum,
TupleTableSlot *slot)
{
Assert(aoFetchDesc->currentBlock.valid);
Assert(rowNum >= aoFetchDesc->currentBlock.firstRowNum);
Assert(rowNum <= aoFetchDesc->currentBlock.lastRowNum);
bool fetched;
AOFetchBlockMetadata *currentBlock = &aoFetchDesc->currentBlock;
AppendOnlyExecutorReadBlock *executorReadBlock = &aoFetchDesc->executorReadBlock;
AppendOnlyStorageRead *storageRead = &aoFetchDesc->storageRead;
AppendOnlyBlockDirectoryEntry *entry = &currentBlock->blockDirectoryEntry;

if (!aoFetchDesc->currentBlock.gotContents)
if (!currentBlock->gotContents)
{
/*
* Do decompression if necessary and get contents.
*/
AppendOnlyExecutorReadBlock_GetContents(&aoFetchDesc->executorReadBlock);
AppendOnlyExecutorReadBlock_GetContents(executorReadBlock);

aoFetchDesc->currentBlock.gotContents = true;
currentBlock->gotContents = true;
}

return AppendOnlyExecutorReadBlock_FetchTuple(&aoFetchDesc->executorReadBlock,
rowNum,
/* nkeys */ 0,
/* key */ NULL,
slot);
fetched = AppendOnlyExecutorReadBlock_FetchTuple(executorReadBlock,
rowNum,
/* nkeys */ 0,
/* key */ NULL,
slot);
if (!fetched)
{
if (AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum))
{
/*
* We fell into a hole inside the resolved block directory entry
* we obtained from AppendOnlyBlockDirectory_GetEntry().
* This should not be happening for versions >= PG12. Scream
* appropriately. See AppendOnlyBlockDirectoryEntry for details.
*/
ereportif(storageRead->formatVersion >= AORelationVersion_PG12,
ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("tuple with row number %ld not found in block directory entry range", rowNum),
errdetail("block directory entry: (fileOffset = %ld, firstRowNum = %ld, "
"afterFileOffset = %ld, lastRowNum = %ld)",
entry->range.fileOffset,
entry->range.firstRowNum,
entry->range.afterFileOffset,
entry->range.lastRowNum)));
}
else
{
/*
* The resolved block directory entry we obtained from
* AppendOnlyBlockDirectory_GetEntry() has range s.t.
* firstRowNum < lastRowNum < rowNum
* This can happen when rowNum maps to an aborted transaction, and
* we find an earlier committed block directory row due to the
* <= scan condition in AppendOnlyBlockDirectory_GetEntry().
*/
}
}
}

static void
@@ -2017,7 +2056,10 @@ scanToFetchTuple(AppendOnlyFetchDesc aoFetchDesc,
}

if (rowNum <= aoFetchDesc->currentBlock.lastRowNum)
return fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
{
fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
return true;
}

/*
* Update information to get next block.
@@ -2265,7 +2307,8 @@ appendonly_fetch(AppendOnlyFetchDesc aoFetchDesc,
}
return false; /* row has been deleted or updated. */
}
return fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
fetchFromCurrentBlock(aoFetchDesc, rowNum, slot);
return true;
}

/*
46 changes: 15 additions & 31 deletions src/backend/access/appendonly/appendonlyblockdirectory.c
@@ -578,7 +578,12 @@ AppendOnlyBlockDirectory_GetEntry(
/* Ignore columns that are not projected. */
continue;
}
/* Setup the scan keys for the scan. */
/*
* Set up the scan key values. The keys have already been set up in
* init_internal() with the following strategy:
* (=segmentFileNum, =columnGroupNo, <=rowNum)
* See init_internal().
*/
Assert(scanKeys != NULL);
scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum);
scanKeys[1].sk_argument = Int32GetDatum(tmpGroupNo);
@@ -641,6 +646,15 @@
/*
* Since the last few blocks may not be logged in the block
* directory, we always use the last entry.
*
* FIXME: If we didn't find a suitable entry, why even use the last
* entry? As it stands, we would most likely return true from this
* function, forcing the layer above (see scanToFetchTuple()) to fetch
* the tuple from the physical file, only to ultimately find it missing.
* Would it be correct to set the directory entry here to be the last
* one (for caching purposes) and return false, in order to avoid this
* physical file read?
*/
entry_no = minipageInfo->numMinipageEntries - 1;
}
@@ -702,7 +716,6 @@ insert_new_entry(
MinipageEntry *entry = NULL;
MinipagePerColumnGroup *minipageInfo;
int minipageIndex;
int lastEntryNo;

if (rowCount == 0)
return false;
@@ -732,35 +745,6 @@
minipageInfo = &blockDirectory->minipages[minipageIndex];
Assert(minipageInfo->numMinipageEntries <= (uint32) NUM_MINIPAGE_ENTRIES);

lastEntryNo = minipageInfo->numMinipageEntries - 1;
if (lastEntryNo >= 0)
{
entry = &(minipageInfo->minipage->entry[lastEntryNo]);

Assert(entry->firstRowNum < firstRowNum);
Assert(entry->fileOffset < fileOffset);

if (gp_blockdirectory_entry_min_range > 0 &&
fileOffset - entry->fileOffset < gp_blockdirectory_entry_min_range)
return true;

/* Update the rowCount in the latest entry */
Assert(entry->rowCount <= firstRowNum - entry->firstRowNum);

ereportif(Debug_appendonly_print_blockdirectory, LOG,
(errmsg("Append-only block directory update entry: "
"(firstRowNum, columnGroupNo, fileOffset, rowCount) = (" INT64_FORMAT
", %d, " INT64_FORMAT ", " INT64_FORMAT ") at index %d to "
"(firstRowNum, columnGroupNo, fileOffset, rowCount) = (" INT64_FORMAT
", %d, " INT64_FORMAT ", " INT64_FORMAT ")",
entry->firstRowNum, columnGroupNo, entry->fileOffset, entry->rowCount,
minipageInfo->numMinipageEntries - 1,
entry->firstRowNum, columnGroupNo, entry->fileOffset,
firstRowNum - entry->firstRowNum)));

entry->rowCount = firstRowNum - entry->firstRowNum;
}

if (minipageInfo->numMinipageEntries >= (uint32) gp_blockdirectory_minipage_size)
{
write_minipage(blockDirectory, columnGroupNo, minipageInfo);
3 changes: 2 additions & 1 deletion src/include/catalog/pg_appendonly.h
@@ -69,10 +69,11 @@ typedef enum AORelationVersion
* were introduced, see MPP-7251 and MPP-7372. */
AORelationVersion_PG83 = 3, /* Same as Aligned64bit, but numerics are stored
* in the PostgreSQL 8.3 format. */
AORelationVersion_PG12 = 4, /* version that removed block directory hole filling. */
MaxAORelationVersion /* must always be last */
} AORelationVersion;

#define AORelationVersion_GetLatest() AORelationVersion_PG83
#define AORelationVersion_GetLatest() AORelationVersion_PG12

#define AORelationVersion_IsValid(version) \
(version > AORelationVersion_None && version < MaxAORelationVersion)
19 changes: 18 additions & 1 deletion src/include/cdb/cdbappendonlyblockdirectory.h
@@ -23,10 +23,21 @@
extern int gp_blockdirectory_entry_min_range;
extern int gp_blockdirectory_minipage_size;

/*
* In-memory equivalent of on-disk data structure MinipageEntry, used to
* represent a block directory entry.
*/
typedef struct AppendOnlyBlockDirectoryEntry
{
/*
* The range of blocks covered by the Block Directory entry.
* The range of blocks covered by the Block Directory entry, which is the
* continuous range [firstRowNum, lastRowNum]. There are no gaps (or holes)
* within this range. However, there may be gaps between successive block
* directory entries. For example, entry0 could have range [1,50] and entry1
* could have range [100,150]. Gaps arise between successive entries because
* we allocate row numbers using the gp_fastsequence mechanism, which hands
* out blocks of row numbers of a pre-determined size (which may be larger
* than the number of rows being inserted).
*/
struct range
{
@@ -126,6 +137,12 @@ typedef struct AppendOnlyBlockDirectory

typedef struct AOFetchBlockMetadata
{
/*
* Current cached block directory entry.
* FIXME: At times, we rely upon the values in this struct to be valid even
* when AOFetchBlockMetadata->valid = false. This indicates that this should
* live elsewhere.
*/
AppendOnlyBlockDirectoryEntry blockDirectoryEntry;

/*