diff --git a/faiss_vector_posting.go b/faiss_vector_posting.go index bf35f99..338e05d 100644 --- a/faiss_vector_posting.go +++ b/faiss_vector_posting.go @@ -409,14 +409,9 @@ func (sb *SegmentBase) UpdateFieldStats(stats segment.FieldStats) { continue } - _, _, numVecs, _, minDocID, maxDocID, indexBytesLen, _ := getVectorSectionContentOffsets(sb, pos) + _, _, numVecs, _, indexBytesLen, _ := getVectorSectionContentOffsets(sb, pos) stats.Store("num_vectors", fieldName, numVecs) stats.Store("vector_index_bytes", fieldName, indexBytesLen) - - if minDocID != math.MaxUint64 && maxDocID != 0 { - stats.Store("vector_min_doc_id", fieldName, minDocID) - stats.Store("vector_max_doc_id", fieldName, maxDocID) - } } } diff --git a/faiss_vector_test.go b/faiss_vector_test.go index b1b61fe..f1ad691 100644 --- a/faiss_vector_test.go +++ b/faiss_vector_test.go @@ -407,7 +407,7 @@ func TestVectorSegment(t *testing.T) { fieldsSectionsMap := vecSegBase.fieldsSectionsMap stubVecFieldStartAddr := fieldsSectionsMap[vecSegBase.fieldsMap["stubVec"]-1][SectionFaissVectorIndex] - docValueStart, docValueEnd, numVecs, _, _, _, indexBytesLen, + docValueStart, docValueEnd, numVecs, _, indexBytesLen, _ := getVectorSectionContentOffsets(vecSegBase, stubVecFieldStartAddr) if docValueStart != fieldNotUninverted { diff --git a/section_faiss_vector_index.go b/section_faiss_vector_index.go index 0bee8d4..95a90db 100644 --- a/section_faiss_vector_index.go +++ b/section_faiss_vector_index.go @@ -771,8 +771,6 @@ func getVectorSectionContentOffsets(sb *SegmentBase, offset uint64) ( docValueEnd, numVecs, vecDocIDsMappingOffset, - minDocID, - maxDocID, indexBytesLen, indexBytesOffset uint64, ) { @@ -791,19 +789,15 @@ func getVectorSectionContentOffsets(sb *SegmentBase, offset uint64) ( pos += uint64(n) vecDocIDsMappingOffset = pos - minDocID = uint64(math.MaxUint64) - maxDocID = uint64(0) - var docID uint64 + // Just based on the number of vectors, we can't skip the vecID:docID mapping + // This is because the 
vectorIDs and docIDs are encoded in a variable-length + // format, the exact size of which varies based on the value of the ID. + // So, we need to iterate over the vectorID:docID mapping to determine the + // exact offset of the index bytes. for i := 0; i < int(numVecs); i++ { _, n := binary.Varint(sb.mem[pos : pos+binary.MaxVarintLen64]) pos += uint64(n) - docID, n = binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) - if docID < minDocID { - minDocID = docID - } - if docID > maxDocID { - maxDocID = docID - } + _, n = binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) pos += uint64(n) }