diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index abf067a8a2b8..448656c8c6e2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -255,6 +255,8 @@ Bug Fixes * GITHUB#15380: Fix potential EOF introduced by optimized filter iterations in MaxScoreBulkScorer. The approximation and the match verification could get out of sync, resulting in an EOFException. (Ben Trent) +* GITHUB#15478: Fix duplicate neighbor issue by checking node existence before making reverse link. (Pulkit Gupta) + Other --------------------- * GITHUB#15237: Fix SmartChinese to only deserialize dictionary data from classpath with a native array diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java index 70f607d3b5d0..2e55e3e06649 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java @@ -349,7 +349,7 @@ void addDiverseNeighbors( int node, NeighborArray candidates, UpdateableRandomVectorScorer scorer, - boolean outOfOrderInsertion) + boolean isLinkRepair) throws IOException { /* For each of the beamWidth nearest candidates (going from best to worst), select it only if it * is closer to target than it is to any of the already-selected neighbors (ie selected in this method, @@ -358,8 +358,7 @@ void addDiverseNeighbors( NeighborArray neighbors = hnsw.getNeighbors(level, node); int maxConnOnLevel = level == 0 ? M * 2 : M; boolean[] mask = - selectAndLinkDiverse( - node, neighbors, candidates, maxConnOnLevel, scorer, outOfOrderInsertion); + selectAndLinkDiverse(node, neighbors, candidates, maxConnOnLevel, scorer, isLinkRepair); // Link the selected nodes to the new node, and the new node to the selected nodes (again // applying diversity heuristic) @@ -374,16 +373,44 @@ void addDiverseNeighbors( if (hnswLock != null) { Lock lock = hnswLock.write(level, nbr); try { - NeighborArray nbrsOfNbr = getGraph().getNeighbors(level, nbr); - nbrsOfNbr.addAndEnsureDiversity(node, candidates.getScores(i), nbr, scorer); + updateNeighbor( + getGraph().getNeighbors(level, nbr), + node, + candidates.getScores(i), + nbr, + scorer, + isLinkRepair); } finally { lock.unlock(); } } else { - NeighborArray nbrsOfNbr = hnsw.getNeighbors(level, nbr); - nbrsOfNbr.addAndEnsureDiversity(node, candidates.getScores(i), nbr, scorer); + updateNeighbor( + hnsw.getNeighbors(level, nbr), + node, + candidates.getScores(i), + nbr, + scorer, + isLinkRepair); + } + } + } + + private void updateNeighbor( + NeighborArray nbrsOfNbr, + int node, + float score, + int nbr, + UpdateableRandomVectorScorer scorer, + boolean isLinkRepair) + throws IOException { + // Only check for duplicates during link repair to avoid performance overhead during normal + // construction + if (isLinkRepair) { + for (int j = 0; j < nbrsOfNbr.size(); j++) { + if (nbrsOfNbr.nodes()[j] == node) return; } } + nbrsOfNbr.addAndEnsureDiversity(node, score, nbr, scorer); } /** @@ -396,7 +423,7 @@ private boolean[] selectAndLinkDiverse( NeighborArray candidates, int maxConnOnLevel, UpdateableRandomVectorScorer scorer, - boolean outOfOrderInsertion) + boolean isLinkRepair) throws IOException { boolean[] mask = new boolean[candidates.size()]; // Select the best maxConnOnLevel neighbors of the new node, applying the diversity heuristic @@ -414,7 +441,7 @@ private boolean[] selectAndLinkDiverse( mask[i] = true; // here we don't need to lock, because there's no incoming link so no others is able to // discover this node such that no others will modify this neighbor array as well - if (outOfOrderInsertion) { + if (isLinkRepair) { neighbors.addOutOfOrder(cNode, cScore); } else { neighbors.addInOrder(cNode, cScore);