From e974311d91fd3f2f3aa015c9e56cf2d689290f41 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 13 Dec 2018 16:05:47 +0100 Subject: [PATCH 1/2] LUCENE-8609: Allow getting consistent docstats from IndexWriter Today we have #numDocs() and #maxDoc() on IndexWriter. This is enough to get all stats for the current index but it's subject to concurrency and might return numbers that are not consistent i.e. some cases can return maxDoc < numDocs which is undesirable. This change adds a getDocStats() method to index writer to allow fetching consistent numbers for these stats. This change also deprecates IndexWriter#numDocs() and IndexWriter#maxDoc() and replaces all their usages with IndexWriter#getDocStats() --- lucene/CHANGES.txt | 4 ++ .../miscellaneous/TestEmptyTokenStream.java | 2 +- .../index/TestBackwardsCompatibility.java | 8 +-- .../org/apache/lucene/index/IndexWriter.java | 52 +++++++++++++++- .../perfield/TestPerFieldPostingsFormat2.java | 12 ++-- .../apache/lucene/index/TestAddIndexes.java | 62 +++++++++---------- .../index/TestConcurrentMergeScheduler.java | 2 +- .../org/apache/lucene/index/TestCrash.java | 2 +- .../lucene/index/TestDeletionPolicy.java | 8 +-- .../index/TestFlushByRamOrCountsPolicy.java | 16 ++--- .../lucene/index/TestIndexManyDocuments.java | 2 +- .../apache/lucene/index/TestIndexWriter.java | 45 ++++++++++---- .../lucene/index/TestIndexWriterCommit.java | 2 +- .../lucene/index/TestIndexWriterDelete.java | 2 +- .../index/TestIndexWriterFromReader.java | 20 +++--- .../lucene/index/TestIndexWriterMaxDocs.java | 6 +- .../index/TestIndexWriterMergePolicy.java | 2 +- .../lucene/index/TestIndexWriterMerging.java | 8 +-- .../lucene/index/TestIndexWriterReader.java | 4 +- .../apache/lucene/index/TestIsCurrent.java | 8 +-- .../lucene/index/TestRollingUpdates.java | 2 +- .../TestSoftDeletesRetentionMergePolicy.java | 20 +++--- .../lucene/index/TestThreadedForceMerge.java | 4 +- .../lucene/index/TestTieredMergePolicy.java | 28 ++++----- 
.../apache/lucene/search/TestBoolean2.java | 2 +- .../lucene/search/TestSearcherManager.java | 2 +- .../store/TestByteBuffersDirectory.java | 2 +- .../apache/lucene/store/TestRAMDirectory.java | 2 +- .../directory/DirectoryTaxonomyWriter.java | 4 +- .../lucene/queries/mlt/TestMoreLikeThis.java | 2 +- .../lucene/index/RandomIndexWriter.java | 8 +-- .../ThreadedIndexingAndSearchingTestCase.java | 2 +- .../ClassificationUpdateProcessorTest.java | 2 +- 33 files changed, 209 insertions(+), 138 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d6276aa2e636..b4dd696b4e43 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -282,6 +282,10 @@ Other * LUCENE-8605: Separate bounding box spatial logic from query logic on LatLonShapeBoundingBoxQuery. (Ignacio Vera) +* LUCENE-8609: Deprecated IndexWriter#numDocs() and IndexWriter#maxDoc() in favor of IndexWriter#getDocStats() + that allows to get consistent numDocs and maxDoc stats that are not subject to concurrent changes. 
+ (Simon Willnauer, Nhat Nguyen) + ======================= Lucene 7.6.0 ======================= Build diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java index 556aff0c4b74..92d4049e87f4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java @@ -62,7 +62,7 @@ public void testIndexWriter_LUCENE4656() throws IOException { // this should not fail because we have no TermToBytesRefAttribute writer.addDocument(doc); - assertEquals(1, writer.numDocs()); + assertEquals(1, writer.getDocStats().numDocs); writer.close(); directory.close(); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 23fac474e191..b2ffc1f7c386 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -1071,7 +1071,7 @@ public void changeIndexWithAdds(Random random, Directory dir, Version nameVersio // make sure writer sees right total -- writer seems not to know about deletes in .del? 
final int expected = 45; - assertEquals("wrong doc count", expected, writer.numDocs()); + assertEquals("wrong doc count", expected, writer.getDocStats().numDocs); writer.close(); // make sure searching sees right # hits @@ -1139,7 +1139,7 @@ public void createIndex(String dirName, boolean doCFS, boolean fullyMerged) thro for(int i=0;i<35;i++) { addDoc(writer, i); } - assertEquals("wrong doc count", 35, writer.maxDoc()); + assertEquals("wrong doc count", 35, writer.getDocStats().maxDoc); if (fullyMerged) { writer.forceMerge(1); } @@ -1599,7 +1599,7 @@ public void testSoftDeletes() throws Exception { verifyUsesDefaultCodec(dir, dvUpdatesIndex); IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random())).setSoftDeletesField("__soft_delete"); IndexWriter writer = new IndexWriter(dir, conf); - int maxDoc = writer.maxDoc(); + int maxDoc = writer.getDocStats().maxDoc; writer.updateDocValues(new Term("id", "1"),new NumericDocValuesField("__soft_delete", 1)); if (random().nextBoolean()) { @@ -1607,7 +1607,7 @@ public void testSoftDeletes() throws Exception { } writer.forceMerge(1); writer.commit(); - assertEquals(maxDoc-1, writer.maxDoc()); + assertEquals(maxDoc-1, writer.getDocStats().maxDoc); writer.close(); dir.close(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index f8415829cfd5..4771f3df681a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -1134,7 +1134,10 @@ public Analyzer getAnalyzer() { /** Returns total number of docs in this index, including * docs not yet flushed (still in the RAM buffer), * not counting deletions. 
- * @see #numDocs */ + * @see #numDocs + * @deprecated use {@link #getDocStats()} instead + * */ + @Deprecated public synchronized int maxDoc() { ensureOpen(); return docWriter.getNumDocs() + segmentInfos.totalMaxDoc(); @@ -1156,7 +1159,10 @@ public synchronized void advanceSegmentInfosVersion(long newVersion) { * including deletions. NOTE: buffered deletions * are not counted. If you really need these to be * counted you should call {@link #commit()} first. - * @see #numDocs */ + * @see #maxDoc + * @deprecated use {@link #getDocStats()} instead + * */ + @Deprecated public synchronized int numDocs() { ensureOpen(); int count = docWriter.getNumDocs(); @@ -5289,4 +5295,46 @@ final synchronized List listOfSegmentCommitInfos() { final synchronized SegmentInfos cloneSegmentInfos() { return segmentInfos.clone(); } + + /** + * Returns accurate {@link DocStats} from this writer. This is equivalent to calling {@link #numDocs()} and {@link #maxDoc()} + * but is not subject to race conditions. The numDocs for instance can change after maxDoc is fetched, causing numDocs to be + * greater than maxDoc which makes it hard to get accurate document stats from IndexWriter. + */ + public synchronized DocStats getDocStats() { + ensureOpen(); + int numDocs = docWriter.getNumDocs(); + int maxDoc = numDocs; + for (final SegmentCommitInfo info : segmentInfos) { + maxDoc += info.info.maxDoc(); + numDocs += info.info.maxDoc() - numDeletedDocs(info); + } + assert maxDoc >= numDocs : "maxDoc is less than numDocs: " + maxDoc + " < " + numDocs; + return new DocStats(maxDoc, numDocs); + } + + /** + * DocStats for this index + */ + public static final class DocStats { + /** + * The total number of docs in this index, including + * docs not yet flushed (still in the RAM buffer), + * not counting deletions. + */ + public final int maxDoc; + /** + * The total number of docs in this index, including + * docs not yet flushed (still in the RAM buffer), and + * including deletions. 
NOTE: buffered deletions + * are not counted. If you really need these to be + * counted you should call {@link IndexWriter#commit()} first. + */ + public final int numDocs; + + private DocStats(int maxDoc, int numDocs) { + this.maxDoc = maxDoc; + this.numDocs = numDocs; + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java index b7cc2bf2b2fc..df18a7448e1c 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java @@ -120,10 +120,10 @@ public void testMergeUnusedPerFieldCodec() throws IOException { writer.commit(); addDocs2(writer, 10); writer.commit(); - assertEquals(30, writer.maxDoc()); + assertEquals(30, writer.getDocStats().maxDoc); TestUtil.checkIndex(dir); writer.forceMerge(1); - assertEquals(30, writer.maxDoc()); + assertEquals(30, writer.getDocStats().maxDoc); writer.close(); dir.close(); } @@ -173,7 +173,7 @@ public void testChangeCodecAndMerge() throws IOException { addDocs2(writer, 10); writer.commit(); codec = iwconf.getCodec(); - assertEquals(30, writer.maxDoc()); + assertEquals(30, writer.getDocStats().maxDoc); assertQuery(new Term("content", "bbb"), dir, 10); assertQuery(new Term("content", "ccc"), dir, 10); //// assertQuery(new Term("content", "aaa"), dir, 10); @@ -186,13 +186,13 @@ public void testChangeCodecAndMerge() throws IOException { assertQuery(new Term("content", "ccc"), dir, 10); assertQuery(new Term("content", "bbb"), dir, 20); assertQuery(new Term("content", "aaa"), dir, 10); - assertEquals(40, writer.maxDoc()); + assertEquals(40, writer.getDocStats().maxDoc); if (VERBOSE) { System.out.println("TEST: now optimize"); } writer.forceMerge(1); - assertEquals(40, writer.maxDoc()); + assertEquals(40, writer.getDocStats().maxDoc); writer.close(); assertQuery(new 
Term("content", "ccc"), dir, 10); assertQuery(new Term("content", "bbb"), dir, 20); @@ -258,7 +258,7 @@ public void testStressPerFieldCodec() throws IOException { writer.forceMerge(1); } writer.commit(); - assertEquals((i + 1) * docsPerRound, writer.maxDoc()); + assertEquals((i + 1) * docsPerRound, writer.getDocStats().maxDoc); writer.close(); } dir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 1d0a3190bd80..7e5f0087a09b 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -66,7 +66,7 @@ public void testSimpleCase() throws IOException { .setOpenMode(OpenMode.CREATE)); // add 100 documents addDocs(writer, 100); - assertEquals(100, writer.maxDoc()); + assertEquals(100, writer.getDocStats().maxDoc); writer.close(); TestUtil.checkIndex(dir); @@ -78,20 +78,20 @@ public void testSimpleCase() throws IOException { ); // add 40 documents in separate files addDocs(writer, 40); - assertEquals(40, writer.maxDoc()); + assertEquals(40, writer.getDocStats().maxDoc); writer.close(); writer = newWriter(aux2, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.CREATE)); // add 50 documents in compound files addDocs2(writer, 50); - assertEquals(50, writer.maxDoc()); + assertEquals(50, writer.getDocStats().maxDoc); writer.close(); // test doc count before segments are merged writer = newWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND)); - assertEquals(100, writer.maxDoc()); + assertEquals(100, writer.getDocStats().maxDoc); writer.addIndexes(aux, aux2); - assertEquals(190, writer.maxDoc()); + assertEquals(190, writer.getDocStats().maxDoc); writer.close(); TestUtil.checkIndex(dir); @@ -106,14 +106,14 @@ public void testSimpleCase() throws IOException { writer = newWriter(aux3, newIndexWriterConfig(new 
MockAnalyzer(random()))); // add 40 documents addDocs(writer, 40); - assertEquals(40, writer.maxDoc()); + assertEquals(40, writer.getDocStats().maxDoc); writer.close(); // test doc count before segments are merged writer = newWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND)); - assertEquals(190, writer.maxDoc()); + assertEquals(190, writer.getDocStats().maxDoc); writer.addIndexes(aux3); - assertEquals(230, writer.maxDoc()); + assertEquals(230, writer.getDocStats().maxDoc); writer.close(); // make sure the new index is correct @@ -142,9 +142,9 @@ public void testSimpleCase() throws IOException { writer.close(); writer = newWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND)); - assertEquals(230, writer.maxDoc()); + assertEquals(230, writer.getDocStats().maxDoc); writer.addIndexes(aux4); - assertEquals(231, writer.maxDoc()); + assertEquals(231, writer.getDocStats().maxDoc); writer.close(); verifyNumDocs(dir, 231); @@ -284,7 +284,7 @@ public void testAddSelf() throws IOException { writer = newWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); // add 100 documents addDocs(writer, 100); - assertEquals(100, writer.maxDoc()); + assertEquals(100, writer.getDocStats().maxDoc); writer.close(); writer = newWriter( @@ -312,7 +312,7 @@ public void testAddSelf() throws IOException { expectThrows(IllegalArgumentException.class, () -> { writer2.addIndexes(aux, dir); }); - assertEquals(100, writer2.maxDoc()); + assertEquals(100, writer2.getDocStats().maxDoc); writer2.close(); // make sure the index is correct @@ -342,7 +342,7 @@ public void testNoTailSegments() throws IOException { addDocs(writer, 10); writer.addIndexes(aux); - assertEquals(1040, writer.maxDoc()); + assertEquals(1040, writer.getDocStats().maxDoc); assertEquals(1000, writer.maxDoc(0)); writer.close(); @@ -371,7 +371,7 @@ public void testNoCopySegments() throws IOException { addDocs(writer, 2); writer.addIndexes(aux); - 
assertEquals(1032, writer.maxDoc()); + assertEquals(1032, writer.getDocStats().maxDoc); assertEquals(1000, writer.maxDoc(0)); writer.close(); @@ -399,7 +399,7 @@ public void testNoMergeAfterCopy() throws IOException { ); writer.addIndexes(aux, new MockDirectoryWrapper(random(), TestUtil.ramCopyOf(aux))); - assertEquals(1060, writer.maxDoc()); + assertEquals(1060, writer.getDocStats().maxDoc); assertEquals(1000, writer.maxDoc(0)); writer.close(); @@ -441,7 +441,7 @@ public void testMergeAfterCopy() throws IOException { System.out.println("\nTEST: now addIndexes"); } writer.addIndexes(aux, new MockDirectoryWrapper(random(), TestUtil.ramCopyOf(aux))); - assertEquals(1020, writer.maxDoc()); + assertEquals(1020, writer.getDocStats().maxDoc); assertEquals(1000, writer.maxDoc(0)); writer.close(); dir.close(); @@ -466,7 +466,7 @@ public void testMoreMerges() throws IOException { setMergePolicy(newLogMergePolicy(10)) ); writer.addIndexes(aux); - assertEquals(30, writer.maxDoc()); + assertEquals(30, writer.getDocStats().maxDoc); assertEquals(3, writer.getSegmentCount()); writer.close(); @@ -501,7 +501,7 @@ public void testMoreMerges() throws IOException { ); writer.addIndexes(aux, aux2); - assertEquals(1040, writer.maxDoc()); + assertEquals(1040, writer.getDocStats().maxDoc); assertEquals(1000, writer.maxDoc(0)); writer.close(); dir.close(); @@ -570,7 +570,7 @@ private void setUpDirs(Directory dir, Directory aux, boolean withID) throws IOEx } else { addDocs(writer, 1000); } - assertEquals(1000, writer.maxDoc()); + assertEquals(1000, writer.getDocStats().maxDoc); assertEquals(1, writer.getSegmentCount()); writer.close(); @@ -597,7 +597,7 @@ private void setUpDirs(Directory dir, Directory aux, boolean withID) throws IOEx setMergePolicy(newLogMergePolicy(false, 10)) ); } - assertEquals(30, writer.maxDoc()); + assertEquals(30, writer.getDocStats().maxDoc); assertEquals(3, writer.getSegmentCount()); writer.close(); } @@ -815,7 +815,7 @@ public void testAddIndexesWithThreads() 
throws Throwable { c.joinThreads(); int expectedNumDocs = 100+NUM_COPY*(4*NUM_ITER/5)*RunAddIndexesThreads.NUM_THREADS*RunAddIndexesThreads.NUM_INIT_DOCS; - assertEquals("expected num docs don't match - failures: " + c.failures, expectedNumDocs, c.writer2.numDocs()); + assertEquals("expected num docs don't match - failures: " + c.failures, expectedNumDocs, c.writer2.getDocStats().numDocs); c.close(true); @@ -1002,7 +1002,7 @@ public void testExistingDeletes() throws Exception { TestUtil.addIndexesSlowly(writer, r); } writer.commit(); - assertEquals("Documents from the incoming index should not have been deleted", 1, writer.numDocs()); + assertEquals("Documents from the incoming index should not have been deleted", 1, writer.getDocStats().numDocs); writer.close(); for (Directory dir : dirs) { @@ -1037,7 +1037,7 @@ public void testSimpleCaseCustomCodec() throws IOException { .setOpenMode(OpenMode.CREATE).setCodec(codec)); // add 100 documents addDocsWithID(writer, 100, 0); - assertEquals(100, writer.maxDoc()); + assertEquals(100, writer.getDocStats().maxDoc); writer.commit(); writer.close(); TestUtil.checkIndex(dir); @@ -1052,7 +1052,7 @@ public void testSimpleCaseCustomCodec() throws IOException { ); // add 40 documents in separate files addDocs(writer, 40); - assertEquals(40, writer.maxDoc()); + assertEquals(40, writer.getDocStats().maxDoc); writer.commit(); writer.close(); @@ -1064,7 +1064,7 @@ public void testSimpleCaseCustomCodec() throws IOException { ); // add 40 documents in compound files addDocs2(writer, 50); - assertEquals(50, writer.maxDoc()); + assertEquals(50, writer.getDocStats().maxDoc); writer.commit(); writer.close(); @@ -1075,9 +1075,9 @@ public void testSimpleCaseCustomCodec() throws IOException { setOpenMode(OpenMode.APPEND). 
setCodec(codec) ); - assertEquals(100, writer.maxDoc()); + assertEquals(100, writer.getDocStats().maxDoc); writer.addIndexes(aux, aux2); - assertEquals(190, writer.maxDoc()); + assertEquals(190, writer.getDocStats().maxDoc); writer.close(); dir.close(); @@ -1439,8 +1439,8 @@ public void testAddIndicesWithSoftDeletes() throws IOException { readers[i] = (CodecReader)reader.leaves().get(i).reader(); } writer.addIndexes(readers); - assertEquals(wrappedReader.numDocs(), writer.numDocs()); - assertEquals(maxDoc, writer.maxDoc()); + assertEquals(wrappedReader.numDocs(), writer.getDocStats().numDocs); + assertEquals(maxDoc, writer.getDocStats().maxDoc); writer.commit(); SegmentCommitInfo commitInfo = writer.listOfSegmentCommitInfos().get(0); assertEquals(maxDoc-wrappedReader.numDocs(), commitInfo.getSoftDelCount()); @@ -1454,8 +1454,8 @@ public void testAddIndicesWithSoftDeletes() throws IOException { readers[i] = (CodecReader)wrappedReader.leaves().get(i).reader(); } writer.addIndexes(readers); - assertEquals(wrappedReader.numDocs(), writer.numDocs()); - assertEquals(wrappedReader.numDocs(), writer.maxDoc()); + assertEquals(wrappedReader.numDocs(), writer.getDocStats().numDocs); + assertEquals(wrappedReader.numDocs(), writer.getDocStats().maxDoc); IOUtils.close(reader, writer, dir3, dir2, dir1); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java index 9d389e26914d..fad831fb74b3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java @@ -552,7 +552,7 @@ public void run() { } }.start(); - while (w.numDocs() != 8) { + while (w.getDocStats().numDocs != 8) { Thread.sleep(10); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCrash.java b/lucene/core/src/test/org/apache/lucene/index/TestCrash.java index 
227b8081a319..f6d15c93a591 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCrash.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCrash.java @@ -135,7 +135,7 @@ public void testCrashAfterReopen() throws IOException { writer.close(); writer = initIndex(random(), dir, false, true); - assertEquals(314, writer.maxDoc()); + assertEquals(314, writer.getDocStats().maxDoc); crash(writer); /* diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java index 869ef8c24f77..5999624e4e17 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -427,7 +427,7 @@ public void testOpenPriorSnapshot() throws IOException { writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) .setIndexDeletionPolicy(policy)); addDoc(writer); - assertEquals(11, writer.numDocs()); + assertEquals(11, writer.getDocStats().numDocs); writer.forceMerge(1); writer.close(); @@ -437,7 +437,7 @@ public void testOpenPriorSnapshot() throws IOException { writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) .setIndexDeletionPolicy(policy) .setIndexCommit(lastCommit)); - assertEquals(10, writer.numDocs()); + assertEquals(10, writer.getDocStats().numDocs); // Should undo our rollback: writer.rollback(); @@ -451,7 +451,7 @@ public void testOpenPriorSnapshot() throws IOException { writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) .setIndexDeletionPolicy(policy) .setIndexCommit(lastCommit)); - assertEquals(10, writer.numDocs()); + assertEquals(10, writer.getDocStats().numDocs); // Commits the rollback: writer.close(); @@ -480,7 +480,7 @@ public void testOpenPriorSnapshot() throws IOException { // but this time keeping only the last commit: writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) 
.setIndexCommit(lastCommit)); - assertEquals(10, writer.numDocs()); + assertEquals(10, writer.getDocStats().numDocs); // Reader still sees fully merged index, because writer // opened on the prior commit has not yet committed: diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFlushByRamOrCountsPolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestFlushByRamOrCountsPolicy.java index fa6cfa947a47..1184a887023f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFlushByRamOrCountsPolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFlushByRamOrCountsPolicy.java @@ -96,8 +96,8 @@ protected void runFlushByRam(int numThreads, double maxRamMB, final long maxRAMBytes = (long) (iwc.getRAMBufferSizeMB() * 1024. * 1024.); assertEquals(" all flushes must be due numThreads=" + numThreads, 0, writer.getFlushingBytes()); - assertEquals(numDocumentsToIndex, writer.numDocs()); - assertEquals(numDocumentsToIndex, writer.maxDoc()); + assertEquals(numDocumentsToIndex, writer.getDocStats().numDocs); + assertEquals(numDocumentsToIndex, writer.getDocStats().maxDoc); assertTrue("peak bytes without flush exceeded watermark", flushPolicy.peakBytesWithoutFlush <= maxRAMBytes); assertActiveBytesAfter(flushControl); @@ -151,8 +151,8 @@ public void testFlushDocCount() throws IOException, InterruptedException { assertEquals(" all flushes must be due numThreads=" + numThreads[i], 0, writer.getFlushingBytes()); - assertEquals(numDocumentsToIndex, writer.numDocs()); - assertEquals(numDocumentsToIndex, writer.maxDoc()); + assertEquals(numDocumentsToIndex, writer.getDocStats().numDocs); + assertEquals(numDocumentsToIndex, writer.getDocStats().maxDoc); assertTrue("peak bytes without flush exceeded watermark", flushPolicy.peakDocCountWithoutFlush <= iwc.getMaxBufferedDocs()); assertActiveBytesAfter(flushControl); @@ -195,8 +195,8 @@ public void testRandom() throws IOException, InterruptedException { threads[x].join(); } assertEquals(" all flushes must be 
due", 0, writer.getFlushingBytes()); - assertEquals(numDocumentsToIndex, writer.numDocs()); - assertEquals(numDocumentsToIndex, writer.maxDoc()); + assertEquals(numDocumentsToIndex, writer.getDocStats().numDocs); + assertEquals(numDocumentsToIndex, writer.getDocStats().maxDoc); if (flushPolicy.flushOnRAM() && !flushPolicy.flushOnDocCount()) { final long maxRAMBytes = (long) (iwc.getRAMBufferSizeMB() * 1024. * 1024.); assertTrue("peak bytes without flush exceeded watermark", @@ -256,8 +256,8 @@ public void testStallControl() throws InterruptedException, IOException { assertNotNull(docsWriter); DocumentsWriterFlushControl flushControl = docsWriter.flushControl; assertEquals(" all flushes must be due", 0, writer.getFlushingBytes()); - assertEquals(numDocumentsToIndex, writer.numDocs()); - assertEquals(numDocumentsToIndex, writer.maxDoc()); + assertEquals(numDocumentsToIndex, writer.getDocStats().numDocs); + assertEquals(numDocumentsToIndex, writer.getDocStats().maxDoc); if (numThreads[i] == 1) { assertFalse( "single thread must not block numThreads: " + numThreads[i], diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexManyDocuments.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexManyDocuments.java index eb31e738856e..889dc4ecede7 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexManyDocuments.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexManyDocuments.java @@ -61,7 +61,7 @@ public void run() { thread.join(); } - assertEquals("lost " + (numDocs - w.maxDoc()) + " documents; maxBufferedDocs=" + iwc.getMaxBufferedDocs(), numDocs, w.maxDoc()); + assertEquals("lost " + (numDocs - w.getDocStats().maxDoc) + " documents; maxBufferedDocs=" + iwc.getMaxBufferedDocs(), numDocs, w.getDocStats().maxDoc); w.close(); IndexReader r = DirectoryReader.open(dir); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 
a5b45f70cb33..66a94604e07e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -127,16 +127,35 @@ public void testDocCount() throws IOException { // add 100 documents for (i = 0; i < 100; i++) { addDocWithIndex(writer,i); + if (random().nextBoolean()) { + writer.commit(); + } } - assertEquals(100, writer.maxDoc()); + IndexWriter.DocStats docStats = writer.getDocStats(); + assertEquals(100, docStats.maxDoc); + assertEquals(100, docStats.numDocs); writer.close(); // delete 40 documents writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) - .setMergePolicy(NoMergePolicy.INSTANCE)); + .setMergePolicy(new FilterMergePolicy(NoMergePolicy.INSTANCE) { + @Override + public boolean keepFullyDeletedSegment(IOSupplier + readerIOSupplier) { + return true; + } + })); + for (i = 0; i < 40; i++) { writer.deleteDocuments(new Term("id", ""+i)); + if (random().nextBoolean()) { + writer.commit(); + } } + writer.flush(); + docStats = writer.getDocStats(); + assertEquals(100, docStats.maxDoc); + assertEquals(60, docStats.numDocs); writer.close(); reader = DirectoryReader.open(dir); @@ -145,10 +164,11 @@ public void testDocCount() throws IOException { // merge the index down and check that the new doc count is correct writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); - assertEquals(60, writer.numDocs()); + assertEquals(60, writer.getDocStats().numDocs); writer.forceMerge(1); - assertEquals(60, writer.maxDoc()); - assertEquals(60, writer.numDocs()); + docStats = writer.getDocStats(); + assertEquals(60, docStats.maxDoc); + assertEquals(60, docStats.numDocs); writer.close(); // check that the index reader gives the same numbers. 
@@ -161,8 +181,9 @@ public void testDocCount() throws IOException { // this existing one works correctly: writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) .setOpenMode(OpenMode.CREATE)); - assertEquals(0, writer.maxDoc()); - assertEquals(0, writer.numDocs()); + docStats = writer.getDocStats(); + assertEquals(0, docStats.maxDoc); + assertEquals(0, docStats.numDocs); writer.close(); dir.close(); } @@ -226,7 +247,7 @@ public void testCreateWithReader() throws IOException { // now open index for create: writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) .setOpenMode(OpenMode.CREATE)); - assertEquals("should be zero documents", writer.maxDoc(), 0); + assertEquals("should be zero documents", writer.getDocStats().maxDoc, 0); addDoc(writer); writer.close(); @@ -2751,7 +2772,7 @@ public void testWithPendingDeletions() throws Exception { try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())).setIndexCommit(indexCommit))) { writer.addDocument(new Document()); writer.commit(); - assertEquals(1, writer.maxDoc()); + assertEquals(1, writer.getDocStats().maxDoc); // now check that we moved to 3 dir.openInput("segments_3", IOContext.READ).close();; } @@ -3147,7 +3168,8 @@ public void testSoftUpdateDocuments() throws IOException { for (SegmentCommitInfo info : writer.cloneSegmentInfos()) { numSoftDeleted += info.getSoftDelCount(); } - assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); + IndexWriter.DocStats docStats = writer.getDocStats(); + assertEquals(docStats.maxDoc - docStats.numDocs, numSoftDeleted); for (LeafReaderContext context : reader.leaves()) { LeafReader leaf = context.reader(); assertNull(((SegmentReader) leaf).getHardLiveDocs()); @@ -3300,7 +3322,8 @@ public int numDeletesToMerge(SegmentCommitInfo info, int delCount, IOSupplier 1_000); + assertTrue("Our single segment should have quite a few docs", w.getDocStats().numDocs > 1_000); // Delete 60% of the 
documents and then add a few more docs and commit. This should "singleton merge" the large segment // created above. 60% leaves some wriggle room, LUCENE-8263 will change this assumption and should be tested // when we deal with that JIRA. - deletedThisPass = deletePctDocsFromEachSeg(w, (w.numDocs() * 60) / 100, true); + deletedThisPass = deletePctDocsFromEachSeg(w, (w.getDocStats().numDocs * 60) / 100, true); remainingDocs -= deletedThisPass; for (int i = 0; i < 50; i++) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java index 19fa917371b6..13323a49ad72 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java @@ -171,7 +171,7 @@ public static void beforeClass() throws Exception { RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc); w.addIndexes(copy); copy.close(); - docCount = w.maxDoc(); + docCount = w.getDocStats().maxDoc; w.close(); mulFactor *= 2; } while(docCount < 3000 * NUM_FILLER_DOCS); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java b/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java index b70784c0f63a..5f4134ced580 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java @@ -583,7 +583,7 @@ public void run() { } docs.close(); if (VERBOSE) { - System.out.println("TEST: index count=" + writerRef.get().maxDoc()); + System.out.println("TEST: index count=" + writerRef.get().getDocStats().maxDoc); } } catch (IOException ioe) { throw new RuntimeException(ioe); diff --git a/lucene/core/src/test/org/apache/lucene/store/TestByteBuffersDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestByteBuffersDirectory.java index 5f2d447c2407..11647c6fbb8b 100644 --- 
a/lucene/core/src/test/org/apache/lucene/store/TestByteBuffersDirectory.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestByteBuffersDirectory.java @@ -58,7 +58,7 @@ public void testBuildIndex() throws IOException { writer.addDocument(doc); } writer.commit(); - assertEquals(docs, writer.numDocs()); + assertEquals(docs, writer.getDocStats().numDocs); } } diff --git a/lucene/core/src/test/org/apache/lucene/store/TestRAMDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestRAMDirectory.java index f0f2d4607370..02ea9d777aeb 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestRAMDirectory.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestRAMDirectory.java @@ -64,7 +64,7 @@ private Path buildIndex() throws IOException { doc.add(newStringField("content", English.intToEnglish(i).trim(), Field.Store.YES)); writer.addDocument(doc); } - assertEquals(DOCS_TO_ADD, writer.maxDoc()); + assertEquals(DOCS_TO_ADD, writer.getDocStats().maxDoc); writer.close(); dir.close(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java index 5efeb3e981e7..b36bf398f6ee 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java @@ -195,7 +195,7 @@ public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode, parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft); fullPathField = new StringField(Consts.FULL, "", Field.Store.YES); - nextID = indexWriter.maxDoc(); + nextID = indexWriter.getDocStats().maxDoc; if (cache == null) { cache = defaultTaxonomyWriterCache(); @@ -968,7 +968,7 @@ public synchronized void replaceTaxonomy(Directory taxoDir) throws IOException { shouldRefreshReaderManager = true; initReaderManager(); // ensure that it's 
initialized refreshReaderManager(); - nextID = indexWriter.maxDoc(); + nextID = indexWriter.getDocStats().maxDoc; taxoArrays = null; // must nullify so that it's re-computed next time it's needed // need to clear the cache, so that addCategory won't accidentally return diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 32a610bf8a93..5f1884092f8e 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -262,7 +262,7 @@ private int addShopDoc(RandomIndexWriter writer, String type, String[] weSell, S doc.add(newTextField(NOT_FOR_SALE, item, Field.Store.YES)); } writer.addDocument(doc); - return writer.numDocs() - 1; + return writer.getDocStats().numDocs - 1; } @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-7161") diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java index 33c4421909f6..eb208c6f86c6 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java @@ -315,12 +315,8 @@ public long commit() throws IOException { return w.commit(); } - public int numDocs() { - return w.numDocs(); - } - - public int maxDoc() { - return w.maxDoc(); + public IndexWriter.DocStats getDocStats() { + return w.getDocStats(); } public long deleteAll() throws IOException { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/ThreadedIndexingAndSearchingTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/ThreadedIndexingAndSearchingTestCase.java index 575c6c95bf1a..b6b15a1f13f2 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/index/ThreadedIndexingAndSearchingTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/ThreadedIndexingAndSearchingTestCase.java @@ -635,7 +635,7 @@ public void message(String component, String message) { writer.commit(); - assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), writer.numDocs()); + assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), writer.getDocStats().numDocs); doClose(); diff --git a/solr/core/src/test/org/apache/solr/update/processor/ClassificationUpdateProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ClassificationUpdateProcessorTest.java index 2a9055a9e203..2c5dddcfa62b 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ClassificationUpdateProcessorTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ClassificationUpdateProcessorTest.java @@ -502,6 +502,6 @@ public static Document buildLuceneDocument(Object... fieldsAndValues) { private int addDoc(RandomIndexWriter writer, Document doc) throws IOException { writer.addDocument(doc); - return writer.numDocs() - 1; + return writer.getDocStats().numDocs - 1; } } From 5c5c42cc37887236bf76b9e5f1cceaa25ee66886 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 14 Dec 2018 17:33:20 +0100 Subject: [PATCH 2/2] LUCENE-8609: Remove deprecated IW#numDocs() and IW#maxDoc() methods --- lucene/CHANGES.txt | 3 ++ lucene/MIGRATE.txt | 6 ++++ .../org/apache/lucene/index/IndexWriter.java | 35 ++----------------- 3 files changed, 11 insertions(+), 33 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b4dd696b4e43..bed537b9dc52 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -102,6 +102,9 @@ API Changes number of gaps between its component sub-intervals. 
This can be used in a new filter available via Intervals.maxgaps(). (Alan Woodward) +* LUCENE-8609: Remove IndexWriter#numDocs() and IndexWriter#maxDoc() in favor + of IndexWriter#getDocStats(). (Simon Willnauer) + Changes in Runtime Behavior * LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 0b78d3c345a3..045f00db6270 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -158,3 +158,9 @@ constant factor was removed from the numerator of the scoring formula. Ordering of results is preserved unless scores are computed from multiple fields using different similarities. The previous behaviour is now exposed by the LegacyBM25Similarity class which can be found in the lucene-misc jar. + +## IndexWriter#maxDoc()/#numDocs() removed in favor of IndexWriter#getDocStats() ## + +IndexWriter#getDocStats() should be used instead of #maxDoc() / #numDocs() which offers a consistent +view on document stats. Previously calling two methods in order to get point in time stats was subject +to concurrent changes. diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 4771f3df681a..f9aaf3468536 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -1131,18 +1131,6 @@ public Analyzer getAnalyzer() { return analyzer; } - /** Returns total number of docs in this index, including - * docs not yet flushed (still in the RAM buffer), - * not counting deletions. - * @see #numDocs - * @deprecated use {@link #getDocStats()} instead - * */ - @Deprecated - public synchronized int maxDoc() { - ensureOpen(); - return docWriter.getNumDocs() + segmentInfos.totalMaxDoc(); - } - /** If {@link SegmentInfos#getVersion} is below {@code newVersion} then update it to this value. 
* * @lucene.internal */ @@ -1154,24 +1142,6 @@ public synchronized void advanceSegmentInfosVersion(long newVersion) { changed(); } - /** Returns total number of docs in this index, including - * docs not yet flushed (still in the RAM buffer), and - * including deletions. NOTE: buffered deletions - * are not counted. If you really need these to be - * counted you should call {@link #commit()} first. - * @see #maxDoc - * @deprecated use {@link #getDocStats()} instead - * */ - @Deprecated - public synchronized int numDocs() { - ensureOpen(); - int count = docWriter.getNumDocs(); - for (final SegmentCommitInfo info : segmentInfos) { - count += info.info.maxDoc() - numDeletedDocs(info); - } - return count; - } - /** * Returns true if this index has deletions (including * buffered deletions). Note that this will return true @@ -5297,9 +5267,8 @@ final synchronized SegmentInfos cloneSegmentInfos() { } /** - * Returns accurate {@link DocStats} form this writer. This is equivalent to calling {@link #numDocs()} and {@link #maxDoc()} - * but is not subject to race-conditions. The numDoc for instance can change after maxDoc is fetched that causes numDocs to be - * greater than maxDoc which makes it hard to get accurate document stats from IndexWriter. + * Returns accurate {@link DocStats} from this writer. The numDocs for instance can change after maxDoc is fetched + * that causes numDocs to be greater than maxDoc which makes it hard to get accurate document stats from IndexWriter. */ public synchronized DocStats getDocStats() { ensureOpen();