LUCENE-7438: Renovate benchmark module's support for highlighting

apache · Oct 7, 2016 · 5ef60af · 5ef60af
1 parent 6aa28bd
commit 5ef60af
Show file tree

Hide file tree

Showing 20 changed files with 351 additions and 700 deletions.
diff --git a/build.xml b/build.xml
@@ -192,6 +192,8 @@
           // excludes:
           exclude(name: '**/build/**')
           exclude(name: '**/dist/**')
+          exclude(name: 'lucene/benchmark/work/**')
+          exclude(name: 'lucene/benchmark/temp/**')
           exclude(name: '**/CheckLoggingConfiguration.java')
           exclude(name: 'build.xml') // ourselves :-)
         }

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -76,6 +76,9 @@ Other
 * LUCENE-7452: Block join query exception suggests how to find a doc, which 
  violates orthogonality requirement. (Mikhail Khludnev)
 
+* LUCENE-7438: Renovate the Benchmark module's support for benchmarking highlighting. All
+  highlighters are supported via SearchTravRetHighlight. (David Smiley)
+
 Build
 
 * LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on

diff --git a/lucene/benchmark/.gitignore b/lucene/benchmark/.gitignore
@@ -1,2 +1,2 @@
-temp/
-work/
+/temp
+/work
diff --git a/lucene/benchmark/README.enwiki b/lucene/benchmark/README.enwiki
@@ -13,10 +13,13 @@ writing, there is a page file in
 http://download.wikimedia.org/enwiki/20070402/. You can download this
 file manually and put it in temp. Note that the file you download will
 probably have the date in the name, e.g.,
-http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
-you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.
+http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2.
+
+If you use the EnwikiContentSource, then the data will be decompressed on the fly
+during the benchmark.  If you want to benchmark indexing, you should probably decompress
+it beforehand using the "enwiki" Ant target, which will produce work/enwiki.txt, after
+which you can use LineDocSource in your benchmark.
 
 After that, ant enwiki should process the data set and run a load
-test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
-also be used to download, decompress, and extract (to individual files
+test. The Ant target enwiki will download, decompress, and extract the dataset (to individual files
 in work/enwiki).
diff --git a/lucene/benchmark/conf/highlight-vs-vector-highlight.alg b/lucene/benchmark/conf/highlight-vs-vector-highlight.alg
diff --git a/...nchmark/conf/vector-highlight-profile.alg → .../benchmark/conf/highlighters-postings.alg b/...nchmark/conf/vector-highlight-profile.alg → .../benchmark/conf/highlighters-postings.alg
@@ -14,55 +14,52 @@
 # * See the License for the specific language governing permissions and
 # * limitations under the License.
 # */
-# -------------------------------------------------------------------------------------
-# multi val params are iterated by NewRound's, added to reports, start with column name.
 
-ram.flush.mb=flush:32:32
-compound=cmpnd:true:false
+# For postings-offsets with light term-vectors
 
 analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
 directory=FSDirectory
+work.dir=work/enwikiPostings
+ram.flush.mb=64
+compound=false
 
 doc.stored=true
 doc.tokenized=true
+# offsets in postings:
+doc.body.offsets=true
+# term vector, but no positions/offsets with it
 doc.term.vector=true
-doc.term.vector.offsets=true
-doc.term.vector.positions=true
-log.step=2000
 
-docs.dir=reuters-out
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
+docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
 
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
+file.query.maker.file=conf/query-phrases.txt
+log.queries=false
+log.step.SearchTravRetHighlight=-1
 
-query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
 
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=true
-# -------------------------------------------------------------------------------------
 { "Populate"
         CreateIndex
-        { "MAddDocs" AddDoc } : 20000
-        ForceMerge(1)
+        [{ "MAddDocs" AddDoc > : 50000] : 4
         CloseIndex
-    }
-{ "Rounds"
+    } : 0
 
-    ResetSystemSoft
+{
+	"Rounds"
 
+        ResetSystemSoft
 
-    OpenReader
-      { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
+        OpenReader
 
-    CloseReader
+        { "Warm" SearchTravRetHighlight > : 1000
 
-    RepSumByPref MAddDocs
+        { "HL" SearchTravRetHighlight > : 500
 
-    NewRound
+        CloseReader
 
-} : 4
+        NewRound
+} : 6
 
-RepSumByNameRound
-RepSumByName
-RepSumByPrefRound MAddDocs
+RepSumByPrefRound HL
diff --git a/lucene/benchmark/conf/highlight-profile.alg → lucene/benchmark/conf/highlighters-tv.alg b/lucene/benchmark/conf/highlight-profile.alg → lucene/benchmark/conf/highlighters-tv.alg
@@ -14,55 +14,51 @@
 # * See the License for the specific language governing permissions and
 # * limitations under the License.
 # */
-# -------------------------------------------------------------------------------------
-# multi val params are iterated by NewRound's, added to reports, start with column name.
 
-ram.flush.mb=flush:32:32
-compound=cmpnd:true:false
+# This is a full-term vector configuration.
 
 analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
 directory=FSDirectory
+work.dir=work/enwikiTermVec
+ram.flush.mb=64
+compound=false
 
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=true
-doc.term.vector.offsets=true
 doc.term.vector.positions=true
-log.step=2000
-
-docs.dir=reuters-out
+doc.term.vector.offsets=true
 
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
+docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
 
-query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
+file.query.maker.file=conf/query-terms.txt
+log.queries=false
+log.step.SearchTravRetHighlight=-1
 
-# task at this depth or less would print when they start
-task.max.depth.log=2
+highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
 
-log.queries=true
-# -------------------------------------------------------------------------------------
 { "Populate"
         CreateIndex
-        { "MAddDocs" AddDoc } : 20000
-        ForceMerge(1)
+        [{ "MAddDocs" AddDoc > : 50000] : 4
         CloseIndex
-    }
-{ "Rounds"
+    } : 0
 
-    ResetSystemSoft
+{
+	"Rounds"
 
+        ResetSystemSoft
 
-    OpenReader
-      { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
+        OpenReader
 
-    CloseReader
+        { "Warm" SearchTravRetHighlight > : 1000
 
-    RepSumByPref MAddDocs
+        { "HL" SearchTravRetHighlight > : 500
 
-    NewRound
+        CloseReader
 
+        NewRound
 } : 4
 
-RepSumByNameRound
-RepSumByName
-RepSumByPrefRound MAddDocs
+RepSumByPrefRound HL
diff --git a/...nchmark/conf/standard-highlights-notv.alg → lucene/benchmark/conf/highlights.alg b/...nchmark/conf/standard-highlights-notv.alg → lucene/benchmark/conf/highlights.alg
@@ -54,7 +54,7 @@ log.queries=true
       { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
     CloseReader
     OpenReader
-      { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
+      { "SearchHlgtSameRdr" SearchTravRetHighlight(type[UH]) > : 1000
 
     CloseReader
 

diff --git a/lucene/benchmark/conf/query-phrases.txt b/lucene/benchmark/conf/query-phrases.txt
@@ -0,0 +1,10 @@
+"Abraham Lincoln"
+"Union Wisconsin"
+"court of law"
+"Field Theory" OR "Set Theory"
+"Top 100"
+"red hot chili"
+"greatest guitarists"
+"Planes, Trains & Automobiles" OR ships
+"international airport"
+"Xbox 360"
diff --git a/lucene/benchmark/conf/query-terms.txt b/lucene/benchmark/conf/query-terms.txt
@@ -0,0 +1,10 @@
+Abraham AND Lincoln
+Union AND Wisconsin
+court AND law
+top AND 100
+(field OR set) AND theory
+red AND hot AND chili
+greatest AND guitarists
+(planes AND trains AND automobiles) OR ships
+international AND airport
+xbox AND 360
diff --git a/lucene/benchmark/conf/query-wildcards.txt b/lucene/benchmark/conf/query-wildcards.txt
@@ -0,0 +1,7 @@
+abrah* AND linc*
+court* AND law*
+(field OR set) AND theor*
+red AND hot AND chili*
+great* AND guitar*
+(plan* AND train* AND automob*) OR ship*
+international AND airport*
diff --git a/lucene/benchmark/conf/standard-highlights-tv.alg b/lucene/benchmark/conf/standard-highlights-tv.alg