From 11031a1eb014c74440bee571e97cbb20765f1459 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Sun, 22 Mar 2020 18:03:22 +0100 Subject: [PATCH 01/13] no args constructor for FakeWordsEncoderAnalyzer --- .../java/io/anserini/ann/fw/FakeWordsEncoderAnalyzer.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/io/anserini/ann/fw/FakeWordsEncoderAnalyzer.java b/src/main/java/io/anserini/ann/fw/FakeWordsEncoderAnalyzer.java index a9e537f4c3..214fe1be72 100644 --- a/src/main/java/io/anserini/ann/fw/FakeWordsEncoderAnalyzer.java +++ b/src/main/java/io/anserini/ann/fw/FakeWordsEncoderAnalyzer.java @@ -30,10 +30,17 @@ public class FakeWordsEncoderAnalyzer extends Analyzer { static final String REMOVE_IT = "_"; + + private static final int DEFAULT_Q = 60; + private final int q; private final CharArraySet set = new CharArraySet(1, false); + public FakeWordsEncoderAnalyzer() { + this(DEFAULT_Q); + } + public FakeWordsEncoderAnalyzer(int q) { this.q = q; this.set.add(REMOVE_IT); From 8948fa70ace666d339f3e9d30c2101df5ade628c Mon Sep 17 00:00:00 2001 From: tteofili Date: Thu, 14 Jan 2021 17:04:39 +0100 Subject: [PATCH 02/13] merge --- docs/experiments-jdiq2018.md | 58 +++++++++---------- docs/regressions-backgroundlinking18.md | 4 +- docs/regressions-backgroundlinking19.md | 4 +- docs/regressions-car17v1.5.md | 4 +- docs/regressions-car17v2.0-doc2query.md | 4 +- docs/regressions-car17v2.0.md | 4 +- docs/regressions-clef06-fr.md | 6 +- docs/regressions-core17.md | 4 +- docs/regressions-core18.md | 4 +- docs/regressions-cw09b.md | 24 ++++---- docs/regressions-cw12.md | 16 ++--- docs/regressions-cw12b13.md | 16 ++--- docs/regressions-disk12.md | 12 ++-- docs/regressions-dl19-doc.md | 10 ++-- docs/regressions-dl19-passage.md | 10 ++-- docs/regressions-fire12-bn.md | 6 +- docs/regressions-fire12-en.md | 6 +- docs/regressions-fire12-hi.md | 6 +- docs/regressions-gov2.md | 12 ++-- docs/regressions-mb11.md | 8 +-- docs/regressions-mb13.md | 8 +-- ...sions-msmarco-doc-docTTTTTquery-per-doc.md | 4 +- ...s-msmarco-doc-docTTTTTquery-per-passage.md | 4 +- docs/regressions-msmarco-doc.md | 4 +- docs/regressions-msmarco-passage-doc2query.md | 4 +- ...gressions-msmarco-passage-docTTTTTquery.md | 4 +- docs/regressions-msmarco-passage.md | 4 +- docs/regressions-ntcir8-zh.md | 6 +- docs/regressions-robust04.md | 4 +- docs/regressions-robust05.md | 4 +- docs/regressions-trec02-ar.md | 6 +- docs/regressions-wt10g.md | 4 +- 32 files changed, 137 insertions(+), 137 deletions(-) diff --git a/docs/experiments-jdiq2018.md b/docs/experiments-jdiq2018.md index 1364c72fad..8e7a5dc7b8 100644 --- a/docs/experiments-jdiq2018.md +++ b/docs/experiments-jdiq2018.md @@ -46,95 +46,95 @@ The script assumes hard-coded index directories; modify as appropriate. 
#### disk12 MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.151-200.txt | 0.2614 | 0.2512 | 0.2544 | 0.2558 | 0.2571 | 0.2459 | -topics.51-100.txt | 0.2274 | 0.2245 | 0.2226 | 0.2226 | 0.2260 | 0.2201 | -topics.101-150.txt | 0.2071 | 0.2035 | 0.1967 | 0.2015 | 0.2031 | 0.1840 | +topics.151-200.txt | 0,2614 | 0,2512 | 0,2544 | 0,2558 | 0,2571 | 0,2459 | +topics.51-100.txt | 0,2274 | 0,2245 | 0,2226 | 0,2226 | 0,2260 | 0,2201 | +topics.101-150.txt | 0,2071 | 0,2035 | 0,1967 | 0,2015 | 0,2031 | 0,1840 | #### robust04 MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.robust04.301-450.601-700.txt | 0.2543 | 0.2516 | 0.2531 | 0.2514 | 0.2523 | 0.2509 | +topics.robust04.301-450.601-700.txt | 0,2543 | 0,2516 | 0,2531 | 0,2514 | 0,2523 | 0,2509 | #### robust05 MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.robust05.txt | 0.2097 | 0.1998 | 0.2021 | 0.2030 | 0.2023 | 0.1980 | +topics.robust05.txt | 0,2097 | 0,1998 | 0,2021 | 0,2030 | 0,2023 | 0,1980 | #### core17 MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.core17.txt | 0.2052 | 0.2005 | 0.2019 | 0.1943 | 0.2050 | 0.1999 | +topics.core17.txt | 0,2052 | 0,2005 | 0,2019 | 0,1943 | 0,2050 | 0,1999 | #### wt10g MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.451-550.txt | 0.2005 | 0.2002 | 0.1880 | 0.2021 | 0.1946 | 0.1704 | +topics.451-550.txt | 0,2005 | 0,2002 | 0,1880 | 0,2021 | 0,1946 | 0,1704 | #### gov2 MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.701-750.txt | 0.2702 | 0.2592 | 0.2726 | 0.2700 | 0.2689 | 0.2734 | -topics.751-800.txt | 0.3394 | 0.3195 | 0.3439 | 0.3303 | 0.3342 | 0.3393 | -topics.801-850.txt | 0.3085 | 0.2900 | 0.3088 | 0.3013 | 0.3026 | 0.3139 | +topics.701-750.txt | 0,2702 | 0,2592 | 0,2726 | 0,2700 | 0,2689 | 0,2734 | +topics.751-800.txt | 0,3394 | 0,3195 | 0,3439 | 0,3303 | 0,3342 | 0,3393 | +topics.801-850.txt | 0,3085 | 0,2900 | 0,3088 | 0,3013 | 0,3026 | 0,3139 | #### cw09b ERR20 | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.web.151-200.txt | 0.1524 | 0.1387 | 0.1439 | 0.1484 | 0.1524 | 0.1445 | -topics.web.101-150.txt | 0.0981 | 0.0935 | 0.0892 | 0.0868 | 0.0944 | 0.0893 | -topics.web.51-100.txt | 0.0774 | 0.0776 | 0.0635 | 0.0643 | 0.0725 | 0.0659 | +topics.web.151-200.txt | 0,1524 | 0,1387 | 0,1439 | 0,1484 | 0,1524 | 0,1445 | +topics.web.101-150.txt | 0,0981 | 0,0935 | 0,0892 | 0,0868 | 0,0944 | 0,0893 | +topics.web.51-100.txt | 0,0774 | 0,0776 | 0,0635 | 0,0643 | 0,0725 | 0,0659 | NDCG20 | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.web.151-200.txt | 0.1090 | 0.0933 | 0.0927 | 0.0978 | 0.0986 | 0.0933 | -topics.web.101-150.txt | 0.1927 | 0.1878 | 0.1765 | 0.1701 | 0.1917 | 
0.1758 | -topics.web.51-100.txt | 0.1487 | 0.1418 | 0.1217 | 0.1185 | 0.1376 | 0.1252 | +topics.web.151-200.txt | 0,1090 | 0,0933 | 0,0927 | 0,0978 | 0,0986 | 0,0933 | +topics.web.101-150.txt | 0,1927 | 0,1878 | 0,1765 | 0,1701 | 0,1917 | 0,1758 | +topics.web.51-100.txt | 0,1487 | 0,1418 | 0,1217 | 0,1185 | 0,1376 | 0,1252 | MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.web.151-200.txt | 0.1226 | 0.1089 | 0.1170 | 0.1113 | 0.1091 | 0.1163 | -topics.web.101-150.txt | 0.1104 | 0.1081 | 0.1067 | 0.1004 | 0.1104 | 0.1063 | -topics.web.51-100.txt | 0.1165 | 0.1111 | 0.1103 | 0.1060 | 0.1110 | 0.1099 | +topics.web.151-200.txt | 0,1226 | 0,1089 | 0,1170 | 0,1113 | 0,1091 | 0,1163 | +topics.web.101-150.txt | 0,1104 | 0,1081 | 0,1067 | 0,1004 | 0,1104 | 0,1063 | +topics.web.51-100.txt | 0,1165 | 0,1111 | 0,1103 | 0,1060 | 0,1110 | 0,1099 | #### cw12b13 ERR20 | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.web.251-300.txt | 0.1224 | 0.1203 | 0.1109 | 0.1108 | 0.1209 | 0.1135 | -topics.web.201-250.txt | 0.0993 | 0.0797 | 0.0933 | 0.0898 | 0.0821 | 0.0940 | +topics.web.251-300.txt | 0,1224 | 0,1203 | 0,1109 | 0,1108 | 0,1209 | 0,1135 | +topics.web.201-250.txt | 0,0993 | 0,0797 | 0,0933 | 0,0898 | 0,0821 | 0,0940 | NDCG20 | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.web.251-300.txt | 0.1247 | 0.1159 | 0.1213 | 0.1209 | 0.1189 | 0.1213 | -topics.web.201-250.txt | 0.1384 | 0.1222 | 0.1247 | 0.1168 | 0.1247 | 0.1258 | +topics.web.251-300.txt | 0,1247 | 0,1159 | 0,1213 | 0,1209 | 0,1189 | 0,1213 | +topics.web.201-250.txt | 0,1384 | 0,1222 | 0,1247 | 0,1168 | 0,1247 | 0,1258 | MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.web.251-300.txt | 0.0237 | 0.0205 | 0.0242 | 0.0246 | 0.0213 | 0.0240 | -topics.web.201-250.txt | 0.0481 | 0.0450 | 0.0419 | 0.0398 | 0.0454 | 0.0418 | +topics.web.251-300.txt | 0,0237 | 0,0205 | 0,0242 | 0,0246 | 0,0213 | 0,0240 | +topics.web.201-250.txt | 0,0481 | 0,0450 | 0,0419 | 0,0398 | 0,0454 | 0,0418 | #### mb11 MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.microblog2012.txt | 0.2083 | 0.2107 | 0.2046 | 0.2121 | 0.2033 | 0.2055 | -topics.microblog2011.txt | 0.3643 | 0.3769 | 0.3537 | 0.3607 | 0.3823 | 0.3567 | +topics.microblog2012.txt | 0,2083 | 0,2107 | 0,2046 | 0,2121 | 0,2033 | 0,2055 | +topics.microblog2011.txt | 0,3643 | 0,3769 | 0,3537 | 0,3607 | 0,3823 | 0,3567 | #### mb13 MAP | BM25 | F2EXP | PL2 | QL | F2LOG | SPL | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -topics.microblog2013.txt | 0.2600 | 0.2531 | 0.2524 | 0.2615 | 0.2622 | 0.2530 | -topics.microblog2014.txt | 0.4195 | 0.3854 | 0.4132 | 0.4200 | 0.4121 | 0.4147 | +topics.microblog2013.txt | 0,2600 | 0,2531 | 0,2524 | 0,2615 | 0,2622 | 0,2530 | +topics.microblog2014.txt | 0,4195 | 0,3854 | 0,4132 | 0,4200 | 0,4121 | 0,4147 | diff --git a/docs/regressions-backgroundlinking18.md b/docs/regressions-backgroundlinking18.md index 2a6d64e088..ad62257415 100644 --- 
a/docs/regressions-backgroundlinking18.md +++ b/docs/regressions-backgroundlinking18.md @@ -64,10 +64,10 @@ With the above commands, you should be able to replicate the following results: NCDG@5 | BM25 | +RM3 | +RM3+DF | :---------------------------------------|-----------|-----------|-----------| -[TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt)| 0.3293 | 0.3526 | 0.4171 | +[TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt)| 0,3293 | 0,3526 | 0,4171 | AP | BM25 | +RM3 | +RM3+DF | :---------------------------------------|-----------|-----------|-----------| -[TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt)| 0.2490 | 0.2642 | 0.2692 | +[TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt)| 0,2490 | 0,2642 | 0,2692 | diff --git a/docs/regressions-backgroundlinking19.md b/docs/regressions-backgroundlinking19.md index 60e834ae07..76655539ff 100644 --- a/docs/regressions-backgroundlinking19.md +++ b/docs/regressions-backgroundlinking19.md @@ -64,10 +64,10 @@ With the above commands, you should be able to replicate the following results: NCDG@5 | BM25 | +RM3 | +RM3+DF | :---------------------------------------|-----------|-----------|-----------| -[TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt)| 0.4785 | 0.5217 | 0.5051 | +[TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt)| 0,4785 | 0,5217 | 0,5051 | AP | BM25 | +RM3 | +RM3+DF | :---------------------------------------|-----------|-----------|-----------| -[TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt)| 0.3027 | 0.3790 | 0.3158 | +[TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt)| 0,3027 | 0,3790 | 0,3158 | diff --git a/docs/regressions-car17v1.5.md b/docs/regressions-car17v1.5.md index 634027010b..34b5b63952 100644 --- a/docs/regressions-car17v1.5.md +++ b/docs/regressions-car17v1.5.md @@ -86,9 +86,9 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v1.5)](../src/main/resources/topics-and-qrels/topics.car17v1.5.benchmarkY1test.txt/)| 0.1562 | 0.1295 | 0.1358 | 0.1386 | 0.1080 | 0.1048 | +[TREC 2017 CAR: benchmarkY1test (v1.5)](../src/main/resources/topics-and-qrels/topics.car17v1.5.benchmarkY1test.txt/)| 0,1562 | 0,1295 | 0,1358 | 0,1386 | 0,1080 | 0,1048 | RECIP_RANK | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v1.5)](../src/main/resources/topics-and-qrels/topics.car17v1.5.benchmarkY1test.txt/)| 0.2331 | 0.1923 | 0.1949 | 0.2037 | 0.1599 | 0.1524 | +[TREC 2017 CAR: benchmarkY1test (v1.5)](../src/main/resources/topics-and-qrels/topics.car17v1.5.benchmarkY1test.txt/)| 0,2331 | 0,1923 | 0,1949 | 0,2037 | 0,1599 | 0,1524 | diff --git a/docs/regressions-car17v2.0-doc2query.md b/docs/regressions-car17v2.0-doc2query.md index 0cf28028e1..0c9bf99307 100644 --- a/docs/regressions-car17v2.0-doc2query.md +++ b/docs/regressions-car17v2.0-doc2query.md @@ -92,9 +92,9 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | 
+RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0.1807 | 0.1521 | 0.1470 | 0.1752 | 0.1453 | 0.1339 | +[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0,1807 | 0,1521 | 0,1470 | 0,1752 | 0,1453 | 0,1339 | RECIP_RANK | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0.2750 | 0.2275 | 0.2186 | 0.2653 | 0.2156 | 0.1981 | +[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0,2750 | 0,2275 | 0,2186 | 0,2653 | 0,2156 | 0,1981 | diff --git a/docs/regressions-car17v2.0.md b/docs/regressions-car17v2.0.md index e7925e12e9..b3d41a1ac5 100644 --- a/docs/regressions-car17v2.0.md +++ b/docs/regressions-car17v2.0.md @@ -86,9 +86,9 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0.1545 | 0.1286 | 0.1364 | 0.1371 | 0.1080 | 0.1077 | +[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0,1545 | 0,1286 | 0,1364 | 0,1371 | 0,1080 | 0,1077 | RECIP_RANK | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0.2321 | 0.1927 | 0.1978 | 0.2013 | 0.1598 | 0.1588 | +[TREC 2017 CAR: benchmarkY1test (v2.0)](../src/main/resources/topics-and-qrels/topics.car17v2.0.benchmarkY1test.txt)| 0,2321 | 0,1927 | 0,1978 | 0,2013 | 0,1598 | 0,1588 | diff --git a/docs/regressions-clef06-fr.md b/docs/regressions-clef06-fr.md index f400cc222e..f99750d2ec 100644 --- a/docs/regressions-clef06-fr.md +++ b/docs/regressions-clef06-fr.md @@ -53,14 +53,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | :---------------------------------------|-----------| -[CLEF 2006 (Monolingual French)](../src/main/resources/topics-and-qrels/topics.clef06fr.mono.fr.txt)| 0.3111 | +[CLEF 2006 (Monolingual French)](../src/main/resources/topics-and-qrels/topics.clef06fr.mono.fr.txt)| 0,3111 | P20 | BM25 | :---------------------------------------|-----------| -[CLEF 2006 (Monolingual French)](../src/main/resources/topics-and-qrels/topics.clef06fr.mono.fr.txt)| 0.3184 | +[CLEF 2006 (Monolingual French)](../src/main/resources/topics-and-qrels/topics.clef06fr.mono.fr.txt)| 0,3184 | NDCG20 | BM25 | :---------------------------------------|-----------| -[CLEF 2006 (Monolingual French)](../src/main/resources/topics-and-qrels/topics.clef06fr.mono.fr.txt)| 0.4458 | +[CLEF 2006 (Monolingual French)](../src/main/resources/topics-and-qrels/topics.clef06fr.mono.fr.txt)| 0,4458 | diff --git a/docs/regressions-core17.md b/docs/regressions-core17.md index 0f4d7d9a84..2feb604eb0 100644 --- 
a/docs/regressions-core17.md +++ b/docs/regressions-core17.md @@ -85,12 +85,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core17.txt)| 0.2087 | 0.2823 | 0.2739 | 0.2032 | 0.2606 | 0.2579 | +[TREC 2017 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core17.txt)| 0,2087 | 0,2823 | 0,2739 | 0,2032 | 0,2606 | 0,2579 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2017 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core17.txt)| 0.4293 | 0.5093 | 0.4940 | 0.4467 | 0.4827 | 0.4893 | +[TREC 2017 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core17.txt)| 0,4293 | 0,5093 | 0,4940 | 0,4467 | 0,4827 | 0,4893 | ## Replication Log diff --git a/docs/regressions-core18.md b/docs/regressions-core18.md index c7a1b07548..81110eacca 100644 --- a/docs/regressions-core18.md +++ b/docs/regressions-core18.md @@ -85,12 +85,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.2495 | 0.3135 | 0.2841 | 0.2526 | 0.3073 | 0.2919 | +[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0,2495 | 0,3135 | 0,2841 | 0,2526 | 0,3073 | 0,2919 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.3567 | 0.4200 | 0.3947 | 0.3653 | 0.4000 | 0.4020 | +[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0,3567 | 0,4200 | 0,3947 | 0,3653 | 0,4000 | 0,4020 | ## Replication Log diff --git a/docs/regressions-cw09b.md b/docs/regressions-cw09b.md index b71b1deaf9..1971dd0c9f 100644 --- a/docs/regressions-cw09b.md +++ b/docs/regressions-cw09b.md @@ -168,27 +168,27 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.1126 | 0.0933 | 0.0929 | 0.1060 | 0.1019 | 0.1086 | -[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.1094 | 0.1085 | 0.0975 | 0.0958 | 0.0839 | 0.0879 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1105 | 0.1107 | 0.1315 | 0.1069 | 0.1058 | 0.1212 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0,1126 | 0,0933 | 0,0929 | 0,1060 | 0,1019 | 0,1086 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0,1094 | 0,1085 | 0,0975 | 0,0958 | 0,0839 | 0,0879 | +[TREC 2012 Web Track (Topics 
151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0,1105 | 0,1107 | 0,1315 | 0,1069 | 0,1058 | 0,1212 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.2694 | 0.2389 | 0.2354 | 0.2431 | 0.2312 | 0.2618 | -[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.2513 | 0.2480 | 0.2387 | 0.2147 | 0.2047 | 0.2173 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.2167 | 0.1920 | 0.2553 | 0.2080 | 0.1980 | 0.2147 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0,2694 | 0,2389 | 0,2354 | 0,2431 | 0,2312 | 0,2618 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0,2513 | 0,2480 | 0,2387 | 0,2147 | 0,2047 | 0,2173 | +[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0,2167 | 0,1920 | 0,2553 | 0,2080 | 0,1980 | 0,2147 | NDCG20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.1354 | 0.1369 | 0.1632 | 0.1143 | 0.1182 | 0.1454 | -[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.1890 | 0.1916 | 0.1835 | 0.1619 | 0.1449 | 0.1517 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1014 | 0.0918 | 0.1441 | 0.0868 | 0.0896 | 0.1037 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0,1354 | 0,1369 | 0,1632 | 0,1143 | 0,1182 | 0,1454 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0,1890 | 0,1916 | 0,1835 | 0,1619 | 0,1449 | 0,1517 | +[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0,1014 | 0,0918 | 0,1441 | 0,0868 | 0,0896 | 0,1037 | ERR20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0.0733 | 0.0747 | 0.0977 | 0.0599 | 0.0592 | 0.0742 | -[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0.0959 | 0.0960 | 0.1091 | 0.0849 | 0.0787 | 0.0821 | -[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0.1303 | 0.1494 | 0.2355 | 0.1305 | 0.1334 | 0.1558 | +[TREC 2010 Web Track (Topics 51-100)](../src/main/resources/topics-and-qrels/topics.web.51-100.txt)| 0,0733 | 0,0747 | 0,0977 | 0,0599 | 0,0592 | 0,0742 | +[TREC 2011 Web Track (Topics 101-150)](../src/main/resources/topics-and-qrels/topics.web.101-150.txt)| 0,0959 | 0,0960 | 0,1091 | 0,0849 | 0,0787 | 0,0821 | +[TREC 2012 Web Track (Topics 151-200)](../src/main/resources/topics-and-qrels/topics.web.151-200.txt)| 0,1303 | 0,1494 | 0,2355 | 0,1305 | 0,1334 | 0,1558 | diff --git a/docs/regressions-cw12.md b/docs/regressions-cw12.md index 7708f7cb72..bbb57a39fd 100644 --- 
a/docs/regressions-cw12.md +++ b/docs/regressions-cw12.md @@ -100,23 +100,23 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1694 | 0.1464 | 0.1494 | 0.1290 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.2469 | 0.2324 | 0.2466 | 0.2177 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,1694 | 0,1464 | 0,1494 | 0,1290 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,2469 | 0,2324 | 0,2466 | 0,2177 | P30 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2773 | 0.2393 | 0.2607 | 0.2347 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.4547 | 0.4080 | 0.4380 | 0.3800 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,2773 | 0,2393 | 0,2607 | 0,2347 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,4547 | 0,4080 | 0,4380 | 0,3800 | NDCG20 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2088 | 0.2033 | 0.1993 | 0.1725 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.2572 | 0.2530 | 0.2218 | 0.2083 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,2088 | 0,2033 | 0,1993 | 0,1725 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,2572 | 0,2530 | 0,2218 | 0,2083 | ERR20 | BM25 | +RM3 | QL | +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1284 | 0.1264 | 0.1233 | 0.1008 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1616 | 0.1655 | 0.1322 | 0.1245 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,1284 | 0,1264 | 0,1233 | 0,1008 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,1616 | 0,1655 | 0,1322 | 0,1245 | diff --git a/docs/regressions-cw12b13.md b/docs/regressions-cw12b13.md index 7c528d4d44..74ba8e596e 100644 --- a/docs/regressions-cw12b13.md +++ b/docs/regressions-cw12b13.md @@ -128,26 +128,26 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.0468 | 0.0408 | 0.0435 | 0.0397 | 0.0322 | 0.0358 | -[TREC 2014 Web Track (Topics 
251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.0224 | 0.0210 | 0.0180 | 0.0235 | 0.0203 | 0.0183 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,0468 | 0,0408 | 0,0435 | 0,0397 | 0,0322 | 0,0358 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,0224 | 0,0210 | 0,0180 | 0,0235 | 0,0203 | 0,0183 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.2113 | 0.1673 | 0.1833 | 0.1780 | 0.1513 | 0.1507 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1273 | 0.1207 | 0.1107 | 0.1373 | 0.1173 | 0.1147 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,2113 | 0,1673 | 0,1833 | 0,1780 | 0,1513 | 0,1507 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,1273 | 0,1207 | 0,1107 | 0,1373 | 0,1173 | 0,1147 | NDCG20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.1286 | 0.1119 | 0.1287 | 0.1106 | 0.0920 | 0.1141 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1183 | 0.1081 | 0.0963 | 0.1177 | 0.1004 | 0.0989 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,1286 | 0,1119 | 0,1287 | 0,1106 | 0,0920 | 0,1141 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,1183 | 0,1081 | 0,0963 | 0,1177 | 0,1004 | 0,0989 | ERR20 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0.0838 | 0.0753 | 0.0941 | 0.0768 | 0.0553 | 0.0780 | -[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0.1201 | 0.1066 | 0.0928 | 0.1092 | 0.0928 | 0.0900 | +[TREC 2013 Web Track (Topics 201-250)](../src/main/resources/topics-and-qrels/topics.web.201-250.txt)| 0,0838 | 0,0753 | 0,0941 | 0,0768 | 0,0553 | 0,0780 | +[TREC 2014 Web Track (Topics 251-300)](../src/main/resources/topics-and-qrels/topics.web.251-300.txt)| 0,1201 | 0,1066 | 0,0928 | 0,1092 | 0,0928 | 0,0900 | ## Replication Log diff --git a/docs/regressions-disk12.md b/docs/regressions-disk12.md index 419e9d01dc..d7ccb68cfa 100644 --- a/docs/regressions-disk12.md +++ b/docs/regressions-disk12.md @@ -148,13 +148,13 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.2273 | 0.2634 | 0.2640 | 0.2189 | 0.2435 | 0.2501 | -[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.2010 | 0.2587 | 0.2722 | 0.2015 | 
0.2442 | 0.2593 | -[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.2580 | 0.3390 | 0.3318 | 0.2518 | 0.3042 | 0.3103 | +[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0,2273 | 0,2634 | 0,2640 | 0,2189 | 0,2435 | 0,2501 | +[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0,2010 | 0,2587 | 0,2722 | 0,2015 | 0,2442 | 0,2593 | +[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0,2580 | 0,3390 | 0,3318 | 0,2518 | 0,3042 | 0,3103 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0.4533 | 0.4800 | 0.5067 | 0.4520 | 0.4627 | 0.4953 | -[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0.4280 | 0.4593 | 0.4753 | 0.4207 | 0.4420 | 0.4740 | -[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0.4740 | 0.5273 | 0.5100 | 0.4580 | 0.4913 | 0.5167 | +[TREC-1 Ad Hoc Topics 51-100](../src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt)| 0,4533 | 0,4800 | 0,5067 | 0,4520 | 0,4627 | 0,4953 | +[TREC-2 Ad Hoc Topics 101-150](../src/main/resources/topics-and-qrels/topics.adhoc.101-150.txt)| 0,4280 | 0,4593 | 0,4753 | 0,4207 | 0,4420 | 0,4740 | +[TREC-3 Ad Hoc Topics 151-200](../src/main/resources/topics-and-qrels/topics.adhoc.151-200.txt)| 0,4740 | 0,5273 | 0,5100 | 0,4580 | 0,4913 | 0,5167 | diff --git a/docs/regressions-dl19-doc.md b/docs/regressions-dl19-doc.md index 230fd8022b..a3cbd87cb2 100644 --- a/docs/regressions-dl19-doc.md +++ b/docs/regressions-dl19-doc.md @@ -100,27 +100,27 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0.3309 | 0.3870 | 0.3516 | 0.3624 | 0.3138 | 0.3697 | 0.3860 | 0.3858 | +[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0,3309 | 0,3870 | 0,3516 | 0,3624 | 0,3138 | 0,3697 | 0,3860 | 0,3858 | NDCG@10 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0.5190 | 0.5169 | 0.4730 | 0.5105 | 0.5140 | 0.5485 | 0.5245 | 0.5280 | +[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0,5190 | 0,5169 | 0,4730 | 0,5105 | 0,5140 | 0,5485 | 0,5245 | 0,5280 | RR | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0.8046 | 0.7718 | 0.7428 | 0.7775 | 0.8872 | 0.8074 | 0.7492 | 0.8007 | +[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0,8046 | 0,7718 | 0,7428 | 0,7775 | 0,8872 | 0,8074 | 0,7492 | 0,8007 | R@100 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | 
:---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0.3948 | 0.4189 | 0.3945 | 0.4004 | 0.3862 | 0.4193 | 0.4399 | 0.4287 | +[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0,3948 | 0,4189 | 0,3945 | 0,4004 | 0,3862 | 0,4193 | 0,4399 | 0,4287 | R@1000 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0.6966 | 0.7504 | 0.7323 | 0.7357 | 0.6810 | 0.7282 | 0.7545 | 0.7553 | +[DL19 (Doc)](https://trec.nist.gov/data/deep2019.html)| 0,6966 | 0,7504 | 0,7323 | 0,7357 | 0,6810 | 0,7282 | 0,7545 | 0,7553 | The setting "default" refers the default BM25 settings of `k1=0.9`, `b=0.4`, while "tuned" refers to the tuned setting of `k1=3.44`, `b=0.87` (see [this page](experiments-msmarco-doc.md) for more details about tuning). diff --git a/docs/regressions-dl19-passage.md b/docs/regressions-dl19-passage.md index 308347db74..4413c84368 100644 --- a/docs/regressions-dl19-passage.md +++ b/docs/regressions-dl19-passage.md @@ -101,27 +101,27 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0.3773 | 0.4270 | 0.4651 | 0.4533 | 0.3766 | 0.4249 | 0.4722 | 0.4522 | +[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0,3773 | 0,4270 | 0,4651 | 0,4533 | 0,3766 | 0,4249 | 0,4722 | 0,4522 | NDCG@10 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0.5058 | 0.5180 | 0.5511 | 0.5372 | 0.4973 | 0.5231 | 0.5461 | 0.5536 | +[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0,5058 | 0,5180 | 0,5511 | 0,5372 | 0,4973 | 0,5231 | 0,5461 | 0,5536 | RR | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0.8245 | 0.8167 | 0.7736 | 0.8170 | 0.8457 | 0.8229 | 0.8218 | 0.8178 | +[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0,8245 | 0,8167 | 0,7736 | 0,8170 | 0,8457 | 0,8229 | 0,8218 | 0,8178 | R@100 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0.4531 | 0.4761 | 0.4995 | 0.4974 | 0.4603 | 0.4747 | 0.5065 | 0.4969 | +[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0,4531 | 0,4761 | 0,4995 | 0,4974 | 0,4603 | 0,4747 | 0,5065 | 0,4969 | R@1000 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[DL19 
(Passage)](https://trec.nist.gov/data/deep2019.html)| 0.7389 | 0.7882 | 0.8129 | 0.7845 | 0.7384 | 0.7762 | 0.8094 | 0.7894 | +[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)| 0,7389 | 0,7882 | 0,8129 | 0,7845 | 0,7384 | 0,7762 | 0,8094 | 0,7894 | The setting "default" refers the default BM25 settings of `k1=0.9`, `b=0.4`, while "tuned" refers to the tuned setting of `k1=0.82`, `b=0.68` (see [this page](experiments-msmarco-passage.md) for more details about tuning). diff --git a/docs/regressions-fire12-bn.md b/docs/regressions-fire12-bn.md index af234ad82f..8b0d731c83 100644 --- a/docs/regressions-fire12-bn.md +++ b/docs/regressions-fire12-bn.md @@ -52,14 +52,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual Bengali)](../src/main/resources/topics-and-qrels/topics.fire12bn.176-225.txt)| 0.2881 | +[FIRE 2012 (Monolingual Bengali)](../src/main/resources/topics-and-qrels/topics.fire12bn.176-225.txt)| 0,2881 | P20 | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual Bengali)](../src/main/resources/topics-and-qrels/topics.fire12bn.176-225.txt)| 0.3740 | +[FIRE 2012 (Monolingual Bengali)](../src/main/resources/topics-and-qrels/topics.fire12bn.176-225.txt)| 0,3740 | NDCG20 | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual Bengali)](../src/main/resources/topics-and-qrels/topics.fire12bn.176-225.txt)| 0.4261 | +[FIRE 2012 (Monolingual Bengali)](../src/main/resources/topics-and-qrels/topics.fire12bn.176-225.txt)| 0,4261 | diff --git a/docs/regressions-fire12-en.md b/docs/regressions-fire12-en.md index 1e6c99bae3..ccaa12d0d7 100644 --- a/docs/regressions-fire12-en.md +++ b/docs/regressions-fire12-en.md @@ -52,14 +52,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual English)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0.3713 | +[FIRE 2012 (Monolingual English)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0,3713 | P20 | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual English)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0.4970 | +[FIRE 2012 (Monolingual English)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0,4970 | NDCG20 | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual English)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0.5420 | +[FIRE 2012 (Monolingual English)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0,5420 | diff --git a/docs/regressions-fire12-hi.md b/docs/regressions-fire12-hi.md index 149228ac72..4f13502e6a 100644 --- a/docs/regressions-fire12-hi.md +++ b/docs/regressions-fire12-hi.md @@ -52,14 +52,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual Hindi)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0.3867 | +[FIRE 2012 (Monolingual Hindi)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0,3867 | P20 | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual Hindi)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 
0.4470 | +[FIRE 2012 (Monolingual Hindi)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0,4470 | NDCG20 | BM25 | :---------------------------------------|-----------| -[FIRE 2012 (Monolingual Hindi)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0.5310 | +[FIRE 2012 (Monolingual Hindi)](../src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt)| 0,5310 | diff --git a/docs/regressions-gov2.md b/docs/regressions-gov2.md index c0e47f7b29..da9985ea5e 100644 --- a/docs/regressions-gov2.md +++ b/docs/regressions-gov2.md @@ -148,13 +148,13 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0.2689 | 0.2844 | 0.2669 | 0.2681 | 0.2708 | 0.2666 | -[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0.3390 | 0.3820 | 0.3666 | 0.3303 | 0.3559 | 0.3646 | -[TREC 2006 Terabyte Track (Topics 801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0.3080 | 0.3377 | 0.3069 | 0.2997 | 0.3154 | 0.3084 | +[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0,2689 | 0,2844 | 0,2669 | 0,2681 | 0,2708 | 0,2666 | +[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0,3390 | 0,3820 | 0,3666 | 0,3303 | 0,3559 | 0,3646 | +[TREC 2006 Terabyte Track (Topics 801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0,3080 | 0,3377 | 0,3069 | 0,2997 | 0,3154 | 0,3084 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0.4864 | 0.5190 | 0.4993 | 0.4755 | 0.4925 | 0.4932 | -[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0.5540 | 0.5920 | 0.5933 | 0.5347 | 0.5620 | 0.5840 | -[TREC 2006 Terabyte Track (Topics 801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0.4907 | 0.5160 | 0.5033 | 0.4720 | 0.4847 | 0.4920 | +[TREC 2004 Terabyte Track (Topics 701-750)](../src/main/resources/topics-and-qrels/topics.terabyte04.701-750.txt)| 0,4864 | 0,5190 | 0,4993 | 0,4755 | 0,4925 | 0,4932 | +[TREC 2005 Terabyte Track (Topics 751-800)](../src/main/resources/topics-and-qrels/topics.terabyte05.751-800.txt)| 0,5540 | 0,5920 | 0,5933 | 0,5347 | 0,5620 | 0,5840 | +[TREC 2006 Terabyte Track (Topics 801-850)](../src/main/resources/topics-and-qrels/topics.terabyte06.801-850.txt)| 0,4907 | 0,5160 | 0,5033 | 0,4720 | 0,4847 | 0,4920 | diff --git a/docs/regressions-mb11.md b/docs/regressions-mb11.md index 984ddfa09d..8ab4811db6 100644 --- a/docs/regressions-mb11.md +++ b/docs/regressions-mb11.md @@ -124,11 +124,11 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2011 Microblog Track 
Topics](../src/main/resources/topics-and-qrels/topics.microblog2011.txt)| 0.3384 | 0.3650 | 0.4008 | 0.3584 | 0.3923 | 0.4201 | -[TREC 2012 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2012.txt)| 0.1948 | 0.2193 | 0.2309 | 0.2102 | 0.2389 | 0.2474 | +[TREC 2011 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2011.txt)| 0,3384 | 0,3650 | 0,4008 | 0,3584 | 0,3923 | 0,4201 | +[TREC 2012 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2012.txt)| 0,1948 | 0,2193 | 0,2309 | 0,2102 | 0,2389 | 0,2474 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2011 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2011.txt)| 0.3959 | 0.4170 | 0.4612 | 0.4061 | 0.4435 | 0.4408 | -[TREC 2012 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2012.txt)| 0.3316 | 0.3463 | 0.3554 | 0.3333 | 0.3514 | 0.3842 | +[TREC 2011 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2011.txt)| 0,3959 | 0,4170 | 0,4612 | 0,4061 | 0,4435 | 0,4408 | +[TREC 2012 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2012.txt)| 0,3316 | 0,3463 | 0,3554 | 0,3333 | 0,3514 | 0,3842 | diff --git a/docs/regressions-mb13.md b/docs/regressions-mb13.md index 8e1803c361..c0ffa54249 100644 --- a/docs/regressions-mb13.md +++ b/docs/regressions-mb13.md @@ -124,11 +124,11 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2013.txt)| 0.2371 | 0.2513 | 0.2855 | 0.2602 | 0.2911 | 0.3152 | -[TREC 2014 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2014.txt)| 0.3931 | 0.4374 | 0.4796 | 0.4181 | 0.4676 | 0.4965 | +[TREC 2013 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2013.txt)| 0,2371 | 0,2513 | 0,2855 | 0,2602 | 0,2911 | 0,3152 | +[TREC 2014 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2014.txt)| 0,3931 | 0,4374 | 0,4796 | 0,4181 | 0,4676 | 0,4965 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2013 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2013.txt)| 0.4339 | 0.4411 | 0.4728 | 0.4561 | 0.4906 | 0.5078 | -[TREC 2014 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2014.txt)| 0.6212 | 0.6442 | 0.6648 | 0.6430 | 0.6533 | 0.6727 | +[TREC 2013 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2013.txt)| 0,4339 | 0,4411 | 0,4728 | 0,4561 | 0,4906 | 0,5078 | +[TREC 2014 Microblog Track Topics](../src/main/resources/topics-and-qrels/topics.microblog2014.txt)| 0,6212 | 0,6442 | 0,6648 | 0,6430 | 0,6533 | 0,6727 | diff --git a/docs/regressions-msmarco-doc-docTTTTTquery-per-doc.md b/docs/regressions-msmarco-doc-docTTTTTquery-per-doc.md index 0e472986f7..68dbb54528 100644 --- a/docs/regressions-msmarco-doc-docTTTTTquery-per-doc.md +++ b/docs/regressions-msmarco-doc-docTTTTTquery-per-doc.md @@ -49,12 +49,12 @@ With the above commands, you should be able to 
replicate the following results: MAP | BM25 (Default)| :---------------------------------------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0.2886 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0,2886 | R@1000 | BM25 (Default)| :---------------------------------------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0.9259 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0,9259 | See [this page](https://github.com/castorini/docTTTTTquery#Replicating-MS-MARCO-Document-Ranking-Results-with-Anserini) for more details. Note that here we are using `trec_eval` to evaluate the top 1000 hits for each query; beware, the runs provided by MS MARCO organizers for reranking have only 100 hits per query. diff --git a/docs/regressions-msmarco-doc-docTTTTTquery-per-passage.md b/docs/regressions-msmarco-doc-docTTTTTquery-per-passage.md index 34f155cafa..d344feae4f 100644 --- a/docs/regressions-msmarco-doc-docTTTTTquery-per-passage.md +++ b/docs/regressions-msmarco-doc-docTTTTTquery-per-passage.md @@ -49,12 +49,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| :---------------------------------------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0.3182 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0,3182 | R@1000 | BM25 (Default)| :---------------------------------------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0.9490 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0,9490 | See [this page](https://github.com/castorini/docTTTTTquery#Replicating-MS-MARCO-Document-Ranking-Results-with-Anserini) for more details. Note that here we are using `trec_eval` to evaluate the top 1000 hits for each query; beware, the runs provided by MS MARCO organizers for reranking have only 100 hits per query. 
diff --git a/docs/regressions-msmarco-doc.md b/docs/regressions-msmarco-doc.md index 209ab4bf9e..cc608dd03b 100644 --- a/docs/regressions-msmarco-doc.md +++ b/docs/regressions-msmarco-doc.md @@ -98,12 +98,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0.2310 | 0.1632 | 0.1147 | 0.1357 | 0.2788 | 0.2289 | 0.1895 | 0.1559 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0,2310 | 0,1632 | 0,1147 | 0,1357 | 0,2788 | 0,2289 | 0,1895 | 0,1559 | R@1000 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0.8856 | 0.8785 | 0.8369 | 0.8471 | 0.9326 | 0.9320 | 0.9264 | 0.8758 | +[MS MARCO Document Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Document-Ranking)| 0,8856 | 0,8785 | 0,8369 | 0,8471 | 0,9326 | 0,9320 | 0,9264 | 0,8758 | The setting "default" refers the default BM25 settings of `k1=0.9`, `b=0.4`, while "tuned" refers to the tuned setting of `k1=3.44`, `b=0.87`. See [this page](experiments-msmarco-doc.md) for more details. diff --git a/docs/regressions-msmarco-passage-doc2query.md b/docs/regressions-msmarco-passage-doc2query.md index a56436eb58..c9253d7c73 100644 --- a/docs/regressions-msmarco-passage-doc2query.md +++ b/docs/regressions-msmarco-passage-doc2query.md @@ -74,12 +74,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.2270 | 0.2028 | 0.2293 | 0.2077 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0,2270 | 0,2028 | 0,2293 | 0,2077 | R@1000 | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.8900 | 0.8916 | 0.8911 | 0.8957 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0,8900 | 0,8916 | 0,8911 | 0,8957 | The setting "default" refers the default BM25 settings of `k1=0.9`, `b=0.4`, while "tuned" refers to the tuned setting of `k1=0.82`, `b=0.72` _on the original passages_. See [this page](experiments-msmarco-passage.md) for more details. 
diff --git a/docs/regressions-msmarco-passage-docTTTTTquery.md b/docs/regressions-msmarco-passage-docTTTTTquery.md index 74a90e05e0..2770c0ce8e 100644 --- a/docs/regressions-msmarco-passage-docTTTTTquery.md +++ b/docs/regressions-msmarco-passage-docTTTTTquery.md @@ -70,12 +70,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.2805 | 0.2243 | 0.2850 | 0.2266 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0,2805 | 0,2243 | 0,2850 | 0,2266 | R@1000 | BM25 (Default)| +RM3 | BM25 (Tuned)| +RM3 | :---------------------------------------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.9470 | 0.9463 | 0.9471 | 0.9479 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0,9470 | 0,9463 | 0,9471 | 0,9479 | The setting "default" refers the default BM25 settings of `k1=0.9`, `b=0.4`, while "tuned" refers to the tuned setting of `k1=0.82`, `b=0.72` _on the original passages_. See [this page](experiments-msmarco-passage.md) for more details. diff --git a/docs/regressions-msmarco-passage.md b/docs/regressions-msmarco-passage.md index d9b50fe960..ef04d5d4c8 100644 --- a/docs/regressions-msmarco-passage.md +++ b/docs/regressions-msmarco-passage.md @@ -99,12 +99,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.1926 | 0.1661 | 0.1625 | 0.1520 | 0.1958 | 0.1762 | 0.1699 | 0.1582 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0,1926 | 0,1661 | 0,1625 | 0,1520 | 0,1958 | 0,1762 | 0,1699 | 0,1582 | R@1000 | BM25 (Default)| +RM3 | +Ax | +PRF | BM25 (Tuned)| +RM3 | +Ax | +PRF | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------| -[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0.8526 | 0.8606 | 0.8747 | 0.8537 | 0.8573 | 0.8687 | 0.8809 | 0.8561 | +[MS MARCO Passage Ranking: Dev Queries](https://github.com/microsoft/MSMARCO-Passage-Ranking)| 0,8526 | 0,8606 | 0,8747 | 0,8537 | 0,8573 | 0,8687 | 0,8809 | 0,8561 | The setting "default" refers the default BM25 settings of `k1=0.9`, `b=0.4`, while "tuned" refers to the tuned setting of `k1=0.82`, `b=0.68`. See [this page](experiments-msmarco-passage.md) for more details. 
diff --git a/docs/regressions-ntcir8-zh.md b/docs/regressions-ntcir8-zh.md index 33fd3b0fc1..0984b4eb60 100644 --- a/docs/regressions-ntcir8-zh.md +++ b/docs/regressions-ntcir8-zh.md @@ -54,14 +54,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | :---------------------------------------|-----------| -[NTCIR-8 ACLIA (IR4QA subtask, Monolingual Chinese)](../src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt)| 0.4014 | +[NTCIR-8 ACLIA (IR4QA subtask, Monolingual Chinese)](../src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt)| 0.4014 | P20 | BM25 | :---------------------------------------|-----------| -[NTCIR-8 ACLIA (IR4QA subtask, Monolingual Chinese)](../src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt)| 0.3849 | +[NTCIR-8 ACLIA (IR4QA subtask, Monolingual Chinese)](../src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt)| 0.3849 | NDCG20 | BM25 | :---------------------------------------|-----------| -[NTCIR-8 ACLIA (IR4QA subtask, Monolingual Chinese)](../src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt)| 0.4757 | +[NTCIR-8 ACLIA (IR4QA subtask, Monolingual Chinese)](../src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt)| 0.4757 | diff --git a/docs/regressions-robust04.md b/docs/regressions-robust04.md index 812082c142..b98a42a07e 100644 --- a/docs/regressions-robust04.md +++ b/docs/regressions-robust04.md @@ -85,12 +85,12 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Robust Track Topics](../src/main/resources/topics-and-qrels/topics.robust04.txt)| 0.2531 | 0.2903 | 0.2896 | 0.2467 | 0.2747 | 0.2774 | +[TREC 2004 Robust Track Topics](../src/main/resources/topics-and-qrels/topics.robust04.txt)| 0.2531 | 0.2903 | 0.2896 | 0.2467 | 0.2747 | 0.2774 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2004 Robust Track Topics](../src/main/resources/topics-and-qrels/topics.robust04.txt)| 0.3102 | 0.3365 | 0.3333 | 0.3079 | 0.3232 | 0.3229 | +[TREC 2004 Robust Track Topics](../src/main/resources/topics-and-qrels/topics.robust04.txt)| 0.3102 | 0.3365 | 0.3333 | 0.3079 | 0.3232 | 0.3229 | ## Replication Log diff --git a/docs/regressions-robust05.md b/docs/regressions-robust05.md index 35db301446..8ec389abd5 100644 --- a/docs/regressions-robust05.md +++ b/docs/regressions-robust05.md @@ -84,9 +84,9 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2005 Robust Track Topics](../src/main/resources/topics-and-qrels/topics.robust05.txt)| 0.2032 | 0.2602 | 0.2587 | 0.2028 | 0.2491 | 0.2476 | +[TREC 2005 Robust Track Topics](../src/main/resources/topics-and-qrels/topics.robust05.txt)| 0.2032 | 0.2602 | 0.2587 | 0.2028 | 0.2491 | 0.2476 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[TREC 2005 Robust Track Topics](../src/main/resources/topics-and-qrels/topics.robust05.txt)| 0.3693 | 0.4187 | 0.4120 | 0.3653 | 0.4067 | 0.4113 | +[TREC 2005 Robust Track 
Topics](../src/main/resources/topics-and-qrels/topics.robust05.txt)| 0.3693 | 0.4187 | 0.4120 | 0.3653 | 0.4067 | 0.4113 | diff --git a/docs/regressions-trec02-ar.md b/docs/regressions-trec02-ar.md index 8cfb84b0b6..9786e6520f 100644 --- a/docs/regressions-trec02-ar.md +++ b/docs/regressions-trec02-ar.md @@ -54,14 +54,14 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | :---------------------------------------|-----------| -[TREC 2002 (Monolingual Arabic)](../src/main/resources/topics-and-qrels/topics.trec02ar-ar.txt)| 0.2932 | +[TREC 2002 (Monolingual Arabic)](../src/main/resources/topics-and-qrels/topics.trec02ar-ar.txt)| 0.2932 | P20 | BM25 | :---------------------------------------|-----------| -[TREC 2002 (Monolingual Arabic)](../src/main/resources/topics-and-qrels/topics.trec02ar-ar.txt)| 0.3610 | +[TREC 2002 (Monolingual Arabic)](../src/main/resources/topics-and-qrels/topics.trec02ar-ar.txt)| 0.3610 | NDCG20 | BM25 | :---------------------------------------|-----------| -[TREC 2002 (Monolingual Arabic)](../src/main/resources/topics-and-qrels/topics.trec02ar-ar.txt)| 0.4056 | +[TREC 2002 (Monolingual Arabic)](../src/main/resources/topics-and-qrels/topics.trec02ar-ar.txt)| 0.4056 | diff --git a/docs/regressions-wt10g.md b/docs/regressions-wt10g.md index 2c17cf18cc..3f86d62d49 100644 --- a/docs/regressions-wt10g.md +++ b/docs/regressions-wt10g.md @@ -84,9 +84,9 @@ With the above commands, you should be able to replicate the following results: MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.1992 | 0.2276 | 0.2200 | 0.2021 | 0.2188 | 0.2275 | +[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.1992 | 0.2276 | 0.2200 | 0.2021 | 0.2188 | 0.2275 | P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax | :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------| -[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.2214 | 0.2398 | 0.2483 | 0.2180 | 0.2310 | 0.2514 | +[Wt10g (Topics 451-550)](../src/main/resources/topics-and-qrels/topics.adhoc.451-550.txt)| 0.2214 | 0.2398 | 0.2483 | 0.2180 | 0.2310 | 0.2514 | From 6e09bfba6bcbe17ce4689e9e891da470c213e075 Mon Sep 17 00:00:00 2001 From: tteofili Date: Mon, 25 Jan 2021 08:40:08 +0100 Subject: [PATCH 03/13] avoid loading vectors in a map --- .../java/io/anserini/ann/IndexVectors.java | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/io/anserini/ann/IndexVectors.java b/src/main/java/io/anserini/ann/IndexVectors.java index e46b6a4177..449910aac9 100644 --- a/src/main/java/io/anserini/ann/IndexVectors.java +++ b/src/main/java/io/anserini/ann/IndexVectors.java @@ -118,8 +118,6 @@ public static void main(String[] args) throws Exception { final long start = System.nanoTime(); System.out.println(String.format("Loading model %s", indexArgs.input)); - Map<String, List<float[]>> vectors = readGloVe(indexArgs.input); - Path indexDir = indexArgs.path; if (!Files.exists(indexDir)) { Files.createDirectories(indexDir); @@ -136,10 +134,23 @@ public static void main(String[] args) throws Exception { IndexWriter indexWriter = new IndexWriter(d, conf); final AtomicInteger cnt = new AtomicInteger(); - for (Map.Entry<String, List<float[]>> entry : vectors.entrySet()) { - for (float[] vector: 
entry.getValue()) { + for (String line : IOUtils.readLines(new FileReader(indexArgs.input))) { + String[] s = line.split("\\s+"); + if (s.length > 2) { + String key = s[0]; + float[] vector = new float[s.length - 1]; + float norm = 0f; + for (int i = 1; i < s.length; i++) { + float f = Float.parseFloat(s[i]); + vector[i - 1] = f; + norm += Math.pow(f, 2); + } + norm = (float) Math.sqrt(norm); + for (int i = 0; i < vector.length; i++) { + vector[i] = vector[i] / norm; + } Document doc = new Document(); - doc.add(new StringField(FIELD_ID, entry.getKey(), Field.Store.YES)); + doc.add(new StringField(FIELD_ID, key, Field.Store.YES)); StringBuilder sb = new StringBuilder(); for (double fv : vector) { if (sb.length() > 0) { @@ -151,8 +162,9 @@ public static void main(String[] args) throws Exception { try { indexWriter.addDocument(doc); int cur = cnt.incrementAndGet(); - if (cur % 100000 == 0) { + if (cur % 50000 == 0) { System.out.println(String.format("%s docs added", cnt)); + indexWriter.commit(); } } catch (IOException e) { System.err.println("Error while indexing: " + e.getLocalizedMessage()); From a914ee362180d19cd269b686adbfcd70e3ae5515 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Thu, 19 May 2022 12:11:59 +0200 Subject: [PATCH 04/13] upgrade lucene and solr to 9.0 versions --- pom.xml | 63 +++++++++++++++++-- .../io/anserini/index/IndexCollection.java | 4 +- .../generator/AclAnthologyGenerator.java | 4 +- .../index/generator/BibtexGenerator.java | 4 +- .../index/generator/Cord19Generator.java | 4 +- .../index/generator/CoreGenerator.java | 6 +- .../DefaultLuceneDocumentGenerator.java | 4 +- .../index/generator/EpidemicQAGenerator.java | 4 +- .../io/anserini/rerank/ScoredDocuments.java | 6 +- .../java/io/anserini/search/SearchSolr.java | 4 +- .../search/query/SdmQueryGenerator.java | 6 +- .../solr/anserini/conf/solrconfig.xml | 30 ++------- .../java/io/anserini/IndexerTestBase.java | 10 +-- .../IndexerWithEmptyDocumentTestBase.java | 12 ++-- .../io/anserini/index/CloneIndexTest.java | 10 --- .../integration/CoreEndToEndTest.java | 2 +- .../io/anserini/integration/EndToEndTest.java | 2 +- .../integration/solr/SolrEndToEndTest.java | 6 +- .../DisjunctionMaxQueryGeneratorTest.java | 4 +- .../anserini/search/query/SdmQueryTest.java | 6 +- 20 files changed, 108 insertions(+), 83 deletions(-) diff --git a/pom.xml b/pom.xml index 3709a6dc1b..98ec3c12eb 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,8 @@ - <lucene.version>8.11.0</lucene.version> + <lucene.version>9.0.0</lucene.version> + <solr.version>9.0.0</solr.version> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> @@ -297,7 +298,12 @@ <groupId>org.apache.lucene</groupId> - <artifactId>lucene-analyzers-kuromoji</artifactId> + <artifactId>lucene-queries</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analysis-kuromoji</artifactId> <version>${lucene.version}</version> @@ -315,13 +321,62 @@ <groupId>org.apache.solr</groupId> <artifactId>solr-solrj</artifactId> - <version>${lucene.version}</version> + <version>${solr.version}</version> + <exclusions> + <exclusion> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analysis-common</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-queries</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j-impl</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> <groupId>org.apache.solr</groupId> <artifactId>solr-test-framework</artifactId> - <version>${lucene.version}</version> + <version>${solr.version}</version> <scope>test</scope> + <exclusions> + <exclusion> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analysis-common</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-queries</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j-impl</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-codecs</artifactId> + <version>${lucene.version}</version> + </dependency> <dependency> <groupId>org.elasticsearch.client</groupId> diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index ed13692d7f..76ae835577 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ 
b/src/main/java/io/anserini/index/IndexCollection.java @@ -408,8 +408,8 @@ private class SolrClientFactory extends BasePooledObjectFactory { @Override public SolrClient create() { return new CloudSolrClient.Builder(Splitter.on(',').splitToList(args.zkUrl), Optional.of(args.zkChroot)) - .withConnectionTimeout(TIMEOUT) - .withSocketTimeout(TIMEOUT) +// .withConnectionTimeout(TIMEOUT) +// .withSocketTimeout(TIMEOUT) .build(); } diff --git a/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java b/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java index bdf6210ecf..c854ec00cb 100644 --- a/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java +++ b/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java @@ -23,11 +23,11 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntPoint; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexOptions; @@ -102,7 +102,7 @@ public Document createDocument(AclAnthology.Document aclDoc) throws GeneratorExc // Store the collection docid. doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. - doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); if (args.storeRaw) { doc.add(new StoredField(IndexArgs.RAW, aclDoc.raw())); diff --git a/src/main/java/io/anserini/index/generator/BibtexGenerator.java b/src/main/java/io/anserini/index/generator/BibtexGenerator.java index 23371b5834..14b2e1a64b 100644 --- a/src/main/java/io/anserini/index/generator/BibtexGenerator.java +++ b/src/main/java/io/anserini/index/generator/BibtexGenerator.java @@ -22,11 +22,11 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntPoint; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexOptions; @@ -101,7 +101,7 @@ public Document createDocument(BibtexCollection.Document bibtexDoc) throws Gener // Store the collection docid. doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. 
- doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); // Store the collection's bibtex type doc.add(new StringField(TYPE, type, Field.Store.YES)); diff --git a/src/main/java/io/anserini/index/generator/Cord19Generator.java b/src/main/java/io/anserini/index/generator/Cord19Generator.java index 7403425f3a..f9abccc20e 100644 --- a/src/main/java/io/anserini/index/generator/Cord19Generator.java +++ b/src/main/java/io/anserini/index/generator/Cord19Generator.java @@ -24,11 +24,11 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntPoint; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexOptions; @@ -115,7 +115,7 @@ public Document createDocument(Cord19BaseDocument covidDoc) throws GeneratorExce // Store the collection docid. doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. - doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); if (args.storeRaw) { doc.add(new StoredField(IndexArgs.RAW, raw)); diff --git a/src/main/java/io/anserini/index/generator/CoreGenerator.java b/src/main/java/io/anserini/index/generator/CoreGenerator.java index 1f9bf3316c..8ea1710d1f 100644 --- a/src/main/java/io/anserini/index/generator/CoreGenerator.java +++ b/src/main/java/io/anserini/index/generator/CoreGenerator.java @@ -23,11 +23,11 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntPoint; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexOptions; @@ -106,7 +106,7 @@ public Document createDocument(CoreCollection.Document coreDoc) throws Generator // Store the collection docid. doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. 
- doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); if (args.storeRaw) { doc.add(new StoredField(IndexArgs.RAW, coreDoc.raw())); @@ -165,10 +165,10 @@ private void addDocumentField(Document doc, String key, JsonNode value, FieldTyp // index as numeric value to allow range queries try { doc.add(new IntPoint(key, Integer.parseInt(valueText))); + doc.add(new StoredField(key, valueText)); } catch(Exception e) { // year is not numeric value } - doc.add(new StoredField(key, valueText)); } else { doc.add(new Field(key, valueText, fieldType)); } diff --git a/src/main/java/io/anserini/index/generator/DefaultLuceneDocumentGenerator.java b/src/main/java/io/anserini/index/generator/DefaultLuceneDocumentGenerator.java index bc226baa6b..b78962e5ae 100644 --- a/src/main/java/io/anserini/index/generator/DefaultLuceneDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/DefaultLuceneDocumentGenerator.java @@ -21,10 +21,10 @@ import io.anserini.collection.SourceDocument; import io.anserini.index.IndexArgs; import org.apache.commons.lang3.ArrayUtils; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexOptions; @@ -74,7 +74,7 @@ public Document createDocument(T src) throws GeneratorException { // Store the collection docid. document.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. - document.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); if (args.storeRaw) { document.add(new StoredField(IndexArgs.RAW, src.raw())); diff --git a/src/main/java/io/anserini/index/generator/EpidemicQAGenerator.java b/src/main/java/io/anserini/index/generator/EpidemicQAGenerator.java index eb870adbdb..9a74406087 100644 --- a/src/main/java/io/anserini/index/generator/EpidemicQAGenerator.java +++ b/src/main/java/io/anserini/index/generator/EpidemicQAGenerator.java @@ -22,10 +22,10 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexOptions; @@ -76,7 +76,7 @@ public Document createDocument(EpidemicQACollection.Document covidDoc) throws Ge // Store the collection docid. doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. 
- doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); if (args.storeRaw) { doc.add(new StoredField(IndexArgs.RAW, raw)); diff --git a/src/main/java/io/anserini/rerank/ScoredDocuments.java b/src/main/java/io/anserini/rerank/ScoredDocuments.java index cc41381254..c215927a31 100644 --- a/src/main/java/io/anserini/rerank/ScoredDocuments.java +++ b/src/main/java/io/anserini/rerank/ScoredDocuments.java @@ -17,12 +17,12 @@ package io.anserini.rerank; import io.anserini.index.IndexArgs; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; @@ -97,7 +97,7 @@ public static ScoredDocuments fromSolrDocs(SolrDocumentList rs) { // Store the collection docid. document.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. - document.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); scoredDocs.documents[i] = document; scoredDocs.scores[i] = score; scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder @@ -130,7 +130,7 @@ public static ScoredDocuments fromESDocs(SearchHits rs) { // Store the collection docid. document.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. - document.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); + document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); scoredDocs.documents[i] = document; scoredDocs.scores[i] = score; scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder diff --git a/src/main/java/io/anserini/search/SearchSolr.java b/src/main/java/io/anserini/search/SearchSolr.java index cc83c92978..86b1a42f6b 100644 --- a/src/main/java/io/anserini/search/SearchSolr.java +++ b/src/main/java/io/anserini/search/SearchSolr.java @@ -165,8 +165,8 @@ public SearchSolr(Args args) throws IOException { LOG.info("Solr ZooKeeper URL: " + args.zkUrl); this.client = new CloudSolrClient.Builder(Splitter.on(',') .splitToList(args.zkUrl), Optional.of(args.zkChroot)) - .withConnectionTimeout(TIMEOUT) - .withSocketTimeout(TIMEOUT) +// .withConnectionTimeout(TIMEOUT) +// .withSocketTimeout(TIMEOUT) .build(); } diff --git a/src/main/java/io/anserini/search/query/SdmQueryGenerator.java b/src/main/java/io/anserini/search/query/SdmQueryGenerator.java index c99292594e..18dac1019d 100644 --- a/src/main/java/io/anserini/search/query/SdmQueryGenerator.java +++ b/src/main/java/io/anserini/search/query/SdmQueryGenerator.java @@ -19,14 +19,14 @@ import io.anserini.analysis.AnalyzerUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.spans.SpanNearQuery; +import org.apache.lucene.queries.spans.SpanQuery; +import org.apache.lucene.queries.spans.SpanTermQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; -import 
org.apache.lucene.search.spans.SpanNearQuery; -import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.search.spans.SpanTermQuery; import java.util.List; diff --git a/src/main/resources/solr/anserini/conf/solrconfig.xml b/src/main/resources/solr/anserini/conf/solrconfig.xml index 424d806e88..b00368515b 100644 --- a/src/main/resources/solr/anserini/conf/solrconfig.xml +++ b/src/main/resources/solr/anserini/conf/solrconfig.xml @@ -35,7 +35,7 @@ that you fully re-index after changing this setting as it can affect both how text is indexed and queried. --> - 8.0.0 + 9.0.0 - - - - - - - - - - - - @@ -422,8 +410,7 @@ maxRamMB - the maximum amount of RAM (in MB) that this cache is allowed to occupy --> - @@ -433,14 +420,12 @@ document). Since Lucene internal document ids are transient, this cache will not be autowarmed. --> - ${velocity.params.resource.loader.enabled:false} - - - 5 - - 8.0.0 + 9.0.0 - - - - - - - - - - - - - @@ -422,8 +409,7 @@ maxRamMB - the maximum amount of RAM (in MB) that this cache is allowed to occupy --> - @@ -433,14 +419,12 @@ document). Since Lucene internal document ids are transient, this cache will not be autowarmed. --> - ${velocity.solr.resource.loader.enabled:true} ${velocity.params.resource.loader.enabled:false} - - - - 5 - - + org.apache.lucene - lucene-analyzers-morfologik + lucene-analysis-morfologik ${lucene.version} diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index d4b9a9107d..e768f587fe 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -85,6 +85,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.Http2SolrClient; import org.apache.solr.common.SolrInputDocument; import org.elasticsearch.action.DocWriteRequest; import org.elasticsearch.action.bulk.BulkRequest; @@ -411,8 +412,7 @@ private class SolrClientFactory extends BasePooledObjectFactory { @Override public SolrClient create() { return new CloudSolrClient.Builder(Splitter.on(',').splitToList(args.zkUrl), Optional.of(args.zkChroot)) -// .withConnectionTimeout(TIMEOUT) -// .withSocketTimeout(TIMEOUT) + .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT)) .build(); } diff --git a/src/main/java/io/anserini/search/SearchSolr.java b/src/main/java/io/anserini/search/SearchSolr.java index 86b1a42f6b..9b01661aa4 100644 --- a/src/main/java/io/anserini/search/SearchSolr.java +++ b/src/main/java/io/anserini/search/SearchSolr.java @@ -31,6 +31,7 @@ import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.Http2SolrClient; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocumentList; import org.kohsuke.args4j.CmdLineException; @@ -165,8 +166,7 @@ public SearchSolr(Args args) throws IOException { LOG.info("Solr ZooKeeper URL: " + args.zkUrl); this.client = new CloudSolrClient.Builder(Splitter.on(',') .splitToList(args.zkUrl), Optional.of(args.zkChroot)) -// .withConnectionTimeout(TIMEOUT) -// .withSocketTimeout(TIMEOUT) + .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT)) .build(); } diff --git 
a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml b/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml index 092d84405e..5f3e4208ef 100644 --- a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml +++ b/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml @@ -1287,7 +1287,7 @@ ${velocity.solr.resource.loader.enabled:true} ${velocity.params.resource.loader.enabled:false} - + org.apache.lucene lucene-analysis-morfologik @@ -328,77 +329,6 @@ 4.13.2 test - - org.apache.solr - solr-solrj - ${solr.version} - - - org.apache.lucene - lucene-core - - - org.apache.lucene - lucene-analysis-common - - - org.apache.lucene - lucene-queries - - - org.slf4j - slf4j-api - - - org.apache.logging.log4j - log4j-slf4j-impl - - - - - org.apache.solr - solr-test-framework - ${solr.version} - test - - - org.apache.lucene - lucene-core - - - org.apache.lucene - lucene-analysis-common - - - org.apache.lucene - lucene-queries - - - org.slf4j - slf4j-api - - - org.apache.logging.log4j - log4j-slf4j-impl - - - - - org.apache.lucene - lucene-codecs - ${lucene.version} - - - org.elasticsearch.client - elasticsearch-rest-high-level-client - 7.0.0 - - - org.apache.lucene - lucene-analyzers-common - - - org.tukaani xz @@ -502,6 +432,11 @@ commons-csv 1.8 + + org.apache.commons + commons-text + 1.9 + org.mockito mockito-all diff --git a/src/main/java/io/anserini/index/IndexArgs.java b/src/main/java/io/anserini/index/IndexArgs.java index 6997f95886..70957978da 100644 --- a/src/main/java/io/anserini/index/IndexArgs.java +++ b/src/main/java/io/anserini/index/IndexArgs.java @@ -69,8 +69,7 @@ public class IndexArgs { // optional arguments - @Option(name = "-index", metaVar = "[path]", forbids = {"-solr", "-es"}, - usage = "Index path.") + @Option(name = "-index", metaVar = "[path]", usage = "Index path.") public String index; @Option(name = "-fields", handler = StringArrayOptionHandler.class, @@ -160,82 +159,6 @@ public class IndexArgs { usage = "File that contains deleted tweet ids (longs), one per line; these tweets will be skipped during indexing.") public String tweetDeletedIdsFile = ""; - // Solr options - - @Option(name = "-solr", forbids = {"-index", "-es"}, - usage = "Indexes into Solr.") - public boolean solr = false; - - @Option(name = "-solr.batch", metaVar = "[n]", - usage = "Solr indexing batch size.") - public int solrBatch = 1000; - - @Option(name = "-solr.commitWithin", metaVar = "[s]", - usage = "Solr commitWithin setting (in seconds).") - public int solrCommitWithin = 60; - - @Option(name = "-solr.index", metaVar = "[name]", - usage = "Solr index name.") - public String solrIndex = null; - - @Option(name = "-solr.zkUrl", metaVar = "[urls]", - usage = "Solr ZooKeeper URLs (comma separated list).") - public String zkUrl = null; - - @Option(name = "-solr.zkChroot", metaVar = "[path]", - usage = "Solr ZooKeeper chroot") - public String zkChroot = "/"; - - @Option(name = "-solr.poolSize", metaVar = "[n]", - usage = "Solr client pool size.") - public int solrPoolSize = 16; - - // Elasticsearch options - - @Option(name = "-es", forbids = {"-index", "-solr"}, - usage = "Indexes into Elasticsearch.") - public boolean es = false; - - @Option(name = "-es.index", metaVar = "[name]", - usage = "Elasticsearch index name.") - public String esIndex = null; - - @Option(name = "-es.batch", metaVar = "[n]", - usage = "Elasticsearch batch index requests size.") - public int esBatch = 1000; - - @Option(name = "-es.bulk", metaVar = "[n]", - usage = "Elasticsearch max bulk requests size in 
bytes.") - public int esBulk = 80000000; - - @Option(name = "-es.hostname", metaVar = "[host]", - usage = "Elasticsearch host.") - public String esHostname = "localhost"; - - @Option(name = "-es.port", metaVar = "[port]", - usage = "Elasticsearch port number.") - public int esPort = 9200; - - @Option(name = "-es.user", metaVar = "[username]", - usage = "Elasticsearch user name.") - public String esUser = "elastic"; - - @Option(name = "-es.password", metaVar = "[password]", - usage = "Elasticsearch password.") - public String esPassword = "changeme"; - - @Option(name = "-es.poolSize", metaVar = "[num]", - usage = "Elasticsearch client pool size.") - public int esPoolSize = 10; - - @Option(name = "-es.connectTimeout", metaVar = "[ms]", - usage = "Elasticsearch (low level) REST client connect timeout (in ms).") - public int esConnectTimeout = TIMEOUT; - - @Option(name = "-es.socketTimeout", metaVar = "[ms]", - usage = "Elasticsearch (low level) REST client socket timeout (in ms).") - public int esSocketTimeout = TIMEOUT; - // Sharding options @Option(name = "-shard.count", metaVar = "[n]", diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index e768f587fe..b83403a3ae 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -16,8 +16,6 @@ package io.anserini.index; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; import io.anserini.analysis.DefaultEnglishAnalyzer; import io.anserini.analysis.TweetAnalyzer; import io.anserini.collection.DocumentCollection; @@ -27,22 +25,10 @@ import io.anserini.index.generator.InvalidDocumentException; import io.anserini.index.generator.LuceneDocumentGenerator; import io.anserini.index.generator.SkippedDocumentException; -import io.anserini.index.generator.WashingtonPostGenerator; import io.anserini.search.similarity.AccurateBM25Similarity; import io.anserini.search.similarity.ImpactSimilarity; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.commons.pool2.BasePooledObjectFactory; -import org.apache.commons.pool2.ObjectPool; -import org.apache.commons.pool2.PooledObject; -import org.apache.commons.pool2.impl.DefaultPooledObject; -import org.apache.commons.pool2.impl.GenericObjectPool; -import org.apache.commons.pool2.impl.GenericObjectPoolConfig; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.logging.log4j.Level; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -72,29 +58,14 @@ import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer; - import org.apache.lucene.document.Document; import org.apache.lucene.index.ConcurrentMergeScheduler; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.solr.client.solrj.SolrClient; -import 
org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.impl.Http2SolrClient; -import org.apache.solr.common.SolrInputDocument; -import org.elasticsearch.action.DocWriteRequest; -import org.elasticsearch.action.bulk.BulkRequest; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.XContentFactory; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.OptionHandlerFilter; @@ -105,32 +76,21 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Optional; import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import java.util.stream.Stream; public final class IndexCollection { private static final Logger LOG = LogManager.getLogger(IndexCollection.class); - private static final int TIMEOUT = 600 * 1000; // This is the default analyzer used, unless another stemming algorithm or language is specified. public static final Analyzer DEFAULT_ANALYZER = DefaultEnglishAnalyzer.newDefaultInstance(); - // When duplicates of these fields are attempted to be indexed in Solr, they are ignored. This allows some fields to be multi-valued, but not others. - // Stored vs. indexed vs. doc values vs. multi-valued vs. ... are controlled via config, rather than code, in Solr. - private static final List IGNORED_DUPLICATE_FIELDS = - Lists.newArrayList(WashingtonPostGenerator.WashingtonPostField.PUBLISHED_DATE.name); - public final class Counters { /** * Counter for successfully indexed documents. @@ -262,361 +222,6 @@ public void run() { } } - private final class SolrIndexerThread implements Runnable { - private final Path input; - private final DocumentCollection collection; - private final List buffer = new ArrayList<>(args.solrBatch); - private FileSegment fileSegment; - - private SolrIndexerThread(DocumentCollection collection, Path input) { - this.input = input; - this.collection = collection; - } - - @Override - @SuppressWarnings("unchecked") - public void run() { - try { - LuceneDocumentGenerator generator = (LuceneDocumentGenerator) - generatorClass.getDeclaredConstructor(IndexArgs.class).newInstance(args); - - // We keep track of two separate counts: the total count of documents in this file segment (cnt), - // and the number of documents in this current "batch" (batch). 
We update the global counter every - // 10k documents: this is so that we get intermediate updates, which is informative if a collection - // has only one file segment; see https://github.com/castorini/anserini/issues/683 - int cnt = 0; - int batch = 0; - - @SuppressWarnings("unchecked") - FileSegment segment = (FileSegment) collection.createFileSegment(input); - // in order to call close() and clean up resources in case of exception - this.fileSegment = segment; - - for (SourceDocument sourceDocument : segment) { - if (!sourceDocument.indexable()) { - counters.unindexable.incrementAndGet(); - continue; - } - - Document document; - try { - document = generator.createDocument(sourceDocument); - } catch (EmptyDocumentException e1) { - counters.empty.incrementAndGet(); - continue; - } catch (SkippedDocumentException e2) { - counters.skipped.incrementAndGet(); - continue; - } catch (InvalidDocumentException e3) { - counters.errors.incrementAndGet(); - continue; - } - - if (whitelistDocids != null && !whitelistDocids.contains(sourceDocument.id())) { - counters.skipped.incrementAndGet(); - continue; - } - - SolrInputDocument solrDocument = new SolrInputDocument(); - - // Copy all Lucene Document fields to Solr document - for (IndexableField field : document.getFields()) { - // Skip docValues fields - this is done via Solr config. - if (field.fieldType().docValuesType() != DocValuesType.NONE) { - continue; - } - // If the field is already in the doc, skip it. - // This fixes an issue with WaPo where published_date is in the Lucene doc as LongPoint and StoredField. Solr needs one copy, more fine-grained control in config. - if (solrDocument.containsKey(field.name()) && IGNORED_DUPLICATE_FIELDS.contains(field.name())) { - continue; - } - if (field.numericValue() != null) { - solrDocument.addField(field.name(), field.numericValue()); - } else if (field.stringValue() != null) { // For some reason, id is multi-valued with null as one of the values - solrDocument.addField(field.name(), field.stringValue()); - } - } - - buffer.add(solrDocument); - if (buffer.size() == args.solrBatch) { - flush(); - } - - cnt++; - batch++; - - // And the counts from this batch, reset batch counter. - if (batch % 10000 == 0) { - counters.indexed.addAndGet(batch); - batch = 0; - } - } - - // If we have docs in the buffer, flush them. - if (!buffer.isEmpty()) { - flush(); - } - - // Add the remaining documents. - counters.indexed.addAndGet(batch); - - int skipped = segment.getSkippedCount(); - if (skipped > 0) { - // When indexing tweets, this is normal, because there are delete messages that are skipped over. - counters.skipped.addAndGet(skipped); - LOG.warn(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + skipped + " docs skipped."); - } - - if (segment.getErrorStatus()) { - counters.errors.incrementAndGet(); - LOG.error(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": error iterating through segment."); - } - - // Log at the debug level because this can be quite noisy if there are lots of file segments. 
- LOG.debug(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + cnt + " docs added."); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } finally { - if (fileSegment != null) { - fileSegment.close(); - } - } - } - - private void flush() { - if (!buffer.isEmpty()) { - SolrClient solrClient = null; - try { - solrClient = solrPool.borrowObject(); - solrClient.add(args.solrIndex, buffer, args.solrCommitWithin * 1000); - buffer.clear(); - } catch (Exception e) { - LOG.error("Error flushing documents to Solr", e); - } finally { - if (solrClient != null) { - try { - solrPool.returnObject(solrClient); - } catch (Exception e) { - LOG.error("Error returning SolrClient to pool", e); - } - } - } - } - } - } - - private class SolrClientFactory extends BasePooledObjectFactory { - @Override - public SolrClient create() { - return new CloudSolrClient.Builder(Splitter.on(',').splitToList(args.zkUrl), Optional.of(args.zkChroot)) - .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT)) - .build(); - } - - @Override - public PooledObject wrap(SolrClient solrClient) { - return new DefaultPooledObject<>(solrClient); - } - - @Override - public void destroyObject(PooledObject pooled) throws Exception { - pooled.getObject().close(); - } - } - - private final class ESIndexerThread implements Runnable { - private final Path input; - private final DocumentCollection collection; - private BulkRequest bulkRequest; - private FileSegment fileSegment; - - private ESIndexerThread(DocumentCollection collection, Path input) { - this.input = input; - this.collection = collection; - this.bulkRequest = new BulkRequest(); - } - - @Override - @SuppressWarnings("unchecked") - public void run() { - try { - LuceneDocumentGenerator generator = (LuceneDocumentGenerator) - generatorClass.getDeclaredConstructor(IndexArgs.class).newInstance(args); - - // We keep track of two separate counts: the total count of documents in this file segment (cnt), - // and the number of documents in this current "batch" (batch). 
We update the global counter every - // 10k documents: this is so that we get intermediate updates, which is informative if a collection - // has only one file segment; see https://github.com/castorini/anserini/issues/683 - int cnt = 0; - int batch = 0; - - FileSegment segment = collection.createFileSegment(input); - // in order to call close() and clean up resources in case of exception - this.fileSegment = segment; - - for (SourceDocument sourceDocument : segment) { - if (!sourceDocument.indexable()) { - counters.unindexable.incrementAndGet(); - continue; - } - - Document document; - try { - document = generator.createDocument(sourceDocument); - } catch (EmptyDocumentException e1) { - counters.empty.incrementAndGet(); - continue; - } catch (SkippedDocumentException e2) { - counters.skipped.incrementAndGet(); - continue; - } catch (InvalidDocumentException e3) { - counters.errors.incrementAndGet(); - continue; - } - - if (whitelistDocids != null && !whitelistDocids.contains(sourceDocument.id())) { - counters.skipped.incrementAndGet(); - continue; - } - - // Get distinct field names - List fields = document.getFields().stream().map(field -> field.name()).distinct().collect(Collectors.toList()); - - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - - for (String field : fields) { - - // Skip docValues fields - if (document.getField(field).fieldType().docValuesType() != DocValuesType.NONE) continue; - - // Get field objects for current field name (could be multiple, such as WaPo's fullCaption) - IndexableField[] indexableFields = document.getFields(field); - - if (field.equalsIgnoreCase("id") || indexableFields.length == 1) { - // Single value fields or "id" field - Object value = document.getField(field).stringValue() != null ? document.getField(field).stringValue() : document.getField(field).numericValue(); - builder.field(field, value); - } else { - // Multi-valued fields - Object[] values = Stream.of(indexableFields).map(f -> f.stringValue()).toArray(); - builder.array(field, values); - } - } - - builder.endObject(); - - String indexName = (args.esIndex != null) ? args.esIndex : input.getFileName().toString(); - bulkRequest.add(new IndexRequest(indexName).id(sourceDocument.id()).source(builder)); - - // sendBulkRequest when the batch size is reached OR the bulk size is reached - if (bulkRequest.numberOfActions() == args.esBatch || - bulkRequest.estimatedSizeInBytes() >= args.esBulk) { - sendBulkRequest(); - } - - cnt++; - batch++; - - // And the counts from this batch, reset batch counter. - if (batch % 10000 == 0) { - counters.indexed.addAndGet(batch); - batch = 0; - } - } - - if (bulkRequest.numberOfActions() != 0) { - sendBulkRequest(); - } - - // Add the remaining documents. - counters.indexed.addAndGet(batch); - - int skipped = segment.getSkippedCount(); - if (skipped > 0) { - // When indexing tweets, this is normal, because there are delete messages that are skipped over. - counters.skipped.addAndGet(skipped); - LOG.warn(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + skipped + " docs skipped."); - } - - if (segment.getErrorStatus()) { - counters.errors.incrementAndGet(); - LOG.error(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": error iterating through segment."); - } - - // Log at the debug level because this can be quite noisy if there are lots of file segments. 
- LOG.debug(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + cnt + " docs added."); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } finally { - if (fileSegment != null){ - fileSegment.close(); - } - } - } - - private void sendBulkRequest() { - if (bulkRequest.numberOfActions() == 0) { - return; - } - - RestHighLevelClient esClient = null; - try { - esClient = esPool.borrowObject(); - esClient.bulk(bulkRequest, RequestOptions.DEFAULT); - bulkRequest = new BulkRequest(); - } catch (Exception e) { - LOG.error("Error sending bulk requests to Elasticsearch", e); - - // Log the 10 docs that have the largest sizes in this request - List> docs = bulkRequest.requests(); - Collections.sort(docs, (d1, d2) -> ((IndexRequest) d2).source().length() - ((IndexRequest) d1).source().length()); - - LOG.info("Error sending bulkRequest. The 10 largest docs in this request are the following cord_uid: "); - for (int i = 0; i < 10; i++) { - IndexRequest doc = (IndexRequest) docs.get(i); - LOG.info(doc.id()); - } - } finally { - if (esClient != null) { - try { - esPool.returnObject(esClient); - } catch (Exception e) { - LOG.error("Error returning ES client to pool", e); - } - } - } - } - } - - private class ESClientFactory extends BasePooledObjectFactory { - @Override - public RestHighLevelClient create() { - final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); - credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(args.esUser, args.esPassword)); - return new RestHighLevelClient( - RestClient.builder(new HttpHost(args.esHostname, args.esPort, "http")) - .setHttpClientConfigCallback(builder -> builder.setDefaultCredentialsProvider(credentialsProvider)) - .setRequestConfigCallback(builder -> builder.setConnectTimeout(args.esConnectTimeout).setSocketTimeout(args.esSocketTimeout)) - ); - } - - @Override - public PooledObject wrap(RestHighLevelClient esClient) { - return new DefaultPooledObject<>(esClient); - } - - @Override - public void destroyObject(PooledObject pooled) throws Exception { - pooled.getObject().close(); - } - } - private final IndexArgs args; private final Path collectionPath; private final Set whitelistDocids; @@ -625,10 +230,6 @@ public void destroyObject(PooledObject pooled) throws Excep private final DocumentCollection collection; private final Counters counters; private Path indexPath; - private ObjectPool solrPool; - private ObjectPool esPool; - - @SuppressWarnings("unchecked") public IndexCollection(IndexArgs args) throws Exception { @@ -665,32 +266,7 @@ public IndexCollection(IndexArgs args) throws Exception { LOG.info("Optimize (merge segments)? 
" + args.optimize); LOG.info("Whitelist: " + args.whitelist); LOG.info("Pretokenized?: " + args.pretokenized); - - if (args.solr) { - LOG.info("Indexing into Solr..."); - LOG.info("Solr batch size: " + args.solrBatch); - LOG.info("Solr commitWithin: " + args.solrCommitWithin); - LOG.info("Solr index: " + args.solrIndex); - LOG.info("Solr ZooKeeper URL: " + args.zkUrl); - LOG.info("SolrClient pool size: " + args.solrPoolSize); - } else if (args.es) { - LOG.info("Indexing into Elasticsearch..."); - LOG.info("Elasticsearch batch size: " + args.esBatch); - LOG.info("Elasticsearch index: " + args.esIndex); - LOG.info("Elasticsearch hostname: " + args.esHostname); - LOG.info("Elasticsearch host port: " + args.esPort); - LOG.info("Elasticsearch client connect timeout (in ms): " + args.esConnectTimeout); - LOG.info("Elasticsearch client socket timeout (in ms): " + args.esSocketTimeout); - LOG.info("Elasticsearch pool size: " + args.esPoolSize); - LOG.info("Elasticsearch user: " + args.esUser); - } else { - LOG.info("Directly building Lucene indexes..."); - LOG.info("Index path: " + args.index); - } - - if (args.index == null && !args.solr && !args.es) { - throw new IllegalArgumentException("Must specify one of -index, -solr, or -es"); - } + LOG.info("Index path: " + args.index); if (args.index != null) { this.indexPath = Paths.get(args.index); @@ -723,18 +299,6 @@ public IndexCollection(IndexArgs args) throws Exception { this.whitelistDocids = null; } - if (args.solr) { - GenericObjectPoolConfig config = new GenericObjectPoolConfig<>(); - config.setMaxTotal(args.solrPoolSize); - config.setMinIdle(args.solrPoolSize); // To guard against premature discarding of solrClients - this.solrPool = new GenericObjectPool<>(new SolrClientFactory(), config); - } else if (args.es) { - GenericObjectPoolConfig config = new GenericObjectPoolConfig<>(); - config.setMaxTotal(args.esPoolSize); - config.setMinIdle(args.esPoolSize); - this.esPool = new GenericObjectPool<>(new ESClientFactory(), config); - } - this.counters = new Counters(); } @@ -865,13 +429,7 @@ public Counters run() throws IOException { LOG.info("Starting to index..."); for (int i = 0; i < segmentCnt; i++) { - if (args.solr) { - executor.execute(new SolrIndexerThread(collection, (Path) segmentPaths.get(i))); - } else if (args.es) { - executor.execute(new ESIndexerThread(collection, (Path) segmentPaths.get(i))); - } else { - executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i))); - } + executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i))); } executor.shutdown(); @@ -898,31 +456,9 @@ public Counters run() throws IOException { " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); } - long numIndexed; - - if (args.solr || args.es) { - numIndexed = counters.indexed.get(); - } else { - numIndexed = writer.getDocStats().maxDoc; - } + long numIndexed = writer.getDocStats().maxDoc; // Do a final commit - if (args.solr) { - try { - SolrClient client = solrPool.borrowObject(); - client.commit(args.solrIndex); - // Needed for orderly shutdown so the SolrClient executor does not delay main thread exit - solrPool.returnObject(client); - solrPool.close(); - } catch (Exception e) { - LOG.error("Exception during final Solr commit: ", e); - } - } - - if (args.es) { - esPool.close(); - } - try { if (writer != null) { writer.commit(); diff --git a/src/main/java/io/anserini/rerank/ScoredDocuments.java b/src/main/java/io/anserini/rerank/ScoredDocuments.java index 
c215927a31..e4eb692873 100644 --- a/src/main/java/io/anserini/rerank/ScoredDocuments.java +++ b/src/main/java/io/anserini/rerank/ScoredDocuments.java @@ -17,30 +17,21 @@ package io.anserini.rerank; import io.anserini.index.IndexArgs; -import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.StringField; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.BytesRef; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; -import org.apache.commons.lang3.ArrayUtils; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.SearchHits; - -import java.util.List; +import java.io.IOException; import java.util.ArrayList; +import java.util.List; import java.util.Map; -import java.io.IOException; /** * ScoredDocuments object that converts TopDocs from the searcher into an Anserini format @@ -74,71 +65,6 @@ public static ScoredDocuments fromTopDocs(TopDocs rs, IndexSearcher searcher) { return scoredDocs; } - public static ScoredDocuments fromSolrDocs(SolrDocumentList rs) { - - ScoredDocuments scoredDocs = new ScoredDocuments(); - - int length = rs.size(); - scoredDocs.documents = new Document[length]; - scoredDocs.ids = new int[length]; - scoredDocs.scores = new float[length]; - - for (int i = 0; i < length; i++) { - - SolrDocument d = rs.get(i); - - // Create placeholder copies of Lucene Documents - // Intention is for compatibility with ScoreTiesAdjusterReranker without disturbing other aspects of reranker code - - Document document = new Document(); - String id = d.getFieldValue("id").toString(); - float score = (float) d.getFieldValue("score"); - - // Store the collection docid. - document.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); - // This is needed to break score ties by docid. - document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); - scoredDocs.documents[i] = document; - scoredDocs.scores[i] = score; - scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder - } - - return scoredDocs; - } - - public static ScoredDocuments fromESDocs(SearchHits rs) { - - ScoredDocuments scoredDocs = new ScoredDocuments(); - SearchHit[] searchHits = rs.getHits(); - - int length = searchHits.length; - scoredDocs.documents = new Document[length]; - scoredDocs.ids = new int[length]; - scoredDocs.scores = new float[length]; - - for (int i = 0; i < length; i++) { - - SearchHit hit = searchHits[i]; - - // Create placeholder copies of Lucene Documents - // Intention is for compatibility with ScoreTiesAdjusterReranker without disturbing other aspects of reranker code - - Document document = new Document(); - String id = hit.getId(); - float score = hit.getScore(); - - // Store the collection docid. - document.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); - // This is needed to break score ties by docid. 
- document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); - scoredDocs.documents[i] = document; - scoredDocs.scores[i] = score; - scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder - } - - return scoredDocs; - } - public static ScoredDocuments fromQrels(Map qrels, IndexReader reader) throws IOException { ScoredDocuments scoredDocs = new ScoredDocuments(); diff --git a/src/main/java/io/anserini/search/SearchElastic.java b/src/main/java/io/anserini/search/SearchElastic.java deleted file mode 100644 index fdc01e387f..0000000000 --- a/src/main/java/io/anserini/search/SearchElastic.java +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search; - -import io.anserini.index.IndexArgs; -import io.anserini.index.generator.TweetGenerator; -import io.anserini.rerank.ScoredDocuments; -import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; -import io.anserini.search.topicreader.TopicReader; -import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.HttpAsyncResponseConsumerFactory; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.query.QueryStringQueryBuilder; -import org.elasticsearch.index.query.RangeQueryBuilder; -import org.elasticsearch.search.SearchHits; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.FieldSortBuilder; -import org.elasticsearch.search.sort.ScoreSortBuilder; -import org.elasticsearch.search.sort.SortOrder; -import org.kohsuke.args4j.CmdLineException; -import org.kohsuke.args4j.CmdLineParser; -import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; -import org.kohsuke.args4j.ParserProperties; -import org.kohsuke.args4j.spi.StringArrayOptionHandler; - -import java.io.Closeable; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Locale; -import java.util.Map; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.TimeUnit; - -/* -* Entry point of the Retrieval. 
- */ -public final class SearchElastic implements Closeable { - - private static final Logger LOG = LogManager.getLogger(SearchCollection.class); - private static final int TIMEOUT = 600 * 1000; - private final Args args; - private RestHighLevelClient client; - - private static final RequestOptions COMMON_OPTIONS; - static { - RequestOptions.Builder builder = RequestOptions.DEFAULT.toBuilder(); - builder.setHttpAsyncResponseConsumerFactory( - new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory(1024 * 1024 * 1024)); - COMMON_OPTIONS = builder.build(); - } - - public static final class Args { - - // required arguments - - @Option(name = "-topics", metaVar = "[file]", handler = StringArrayOptionHandler.class, required = true, usage = "topics file") - public String[] topics; - - @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file") - public String output; - - @Option(name = "-topicreader", required = true, usage = "define how to read the topic(query) file: one of [Trec|Webxml]") - public String topicReader; - - @Option(name = "-es.index", usage = "the name of the index in Elasticsearch") - public String esIndex = null; - - @Option(name = "-es.hostname", usage = "the name of Elasticsearch HTTP host") - public String esHostname = "localhost"; - - @Option(name = "-es.port", usage = "the port for Elasticsearch HTTP host") - public int esPort = 9200; - - /** - * The user and password are defaulted to those pre-configured for docker-elk - */ - @Option(name = "-es.user", usage = "the user of the ELK stack") - public String esUser = "elastic"; - - @Option(name = "-es.password", usage = "the password for the ELK stack") - public String esPassword = "changeme"; - - // optional arguments - @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." + - " For TREC ad hoc topics, description or narrative can be used.") - public String topicfield = "title"; - - @Option(name = "-searchtweets", usage = "Whether the search is against a tweet " + - "index created by IndexCollection -collection TweetCollection") - public Boolean searchtweets = false; - - @Option(name = "-hits", metaVar = "[number]", required = false, usage = "max number of hits to return") - public int hits = 1000; - - @Option(name = "-runtag", metaVar = "[tag]", required = false, usage = "runtag") - public String runtag = null; - - } - - private final class ESSearcherThread extends Thread { - - final private SortedMap> topics; - final private String outputPath; - final private String runTag; - - private ESSearcherThread(SortedMap> topics, String outputPath, String runTag){ - - this.topics = topics; - this.runTag = runTag; - this.outputPath = outputPath; - setName(outputPath); - } - - @Override - public void run() { - try { - LOG.info("[Start] Retrieval with Elasticsearch collection: " + args.esIndex); - final long start = System.nanoTime(); - PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(outputPath), StandardCharsets.US_ASCII)); - - for (Map.Entry> entry : topics.entrySet()) { - K qid = entry.getKey(); - String queryString = entry.getValue().get(args.topicfield); - ScoredDocuments docs; - if (args.searchtweets) { - docs = searchTweets(queryString, Long.parseLong(entry.getValue().get("time"))); - } else { - docs = search(queryString); - } - - /** - * the first column is the topic number. - * the second column is currently unused and should always be "Q0". 
- * the third column is the official document identifier of the retrieved document. - * the fourth column is the rank the document is retrieved. - * the fifth column shows the score (integer or floating point) that generated the ranking. - * the sixth column is called the "run tag" and should be a unique identifier for your - */ - for (int i = 0; i < docs.documents.length; i++) { - out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid, - docs.documents[i].getField(IndexArgs.ID).stringValue(), (i + 1), docs.scores[i], runTag)); - } - } - out.flush(); - out.close(); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("[Finished] Run " + topics.size() + " topics searched in " - + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } - } - } - - public SearchElastic(Args args) { - this.args = args; - LOG.info("Elasticsearch index: " + args.esIndex); - LOG.info("Elasticsearch hostname: " + args.esHostname); - LOG.info("Elasticsearch host port: " + args.esPort); - - final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); - credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(args.esUser, args.esPassword)); - - this.client = new RestHighLevelClient( - RestClient.builder(new HttpHost(args.esHostname, args.esPort, "http")) - .setHttpClientConfigCallback(builder -> builder.setDefaultCredentialsProvider(credentialsProvider)) - .setRequestConfigCallback(builder -> builder.setConnectTimeout(TIMEOUT).setSocketTimeout(TIMEOUT))); - } - - @SuppressWarnings("unchecked") - public void runTopics() throws IOException { - TopicReader tr; - SortedMap> topics = new TreeMap<>(); - for (String singleTopicsFile : args.topics) { - Path topicsFilePath = Paths.get(singleTopicsFile); - if (!Files.exists(topicsFilePath) || !Files.isRegularFile(topicsFilePath) || !Files.isReadable(topicsFilePath)) { - throw new IllegalArgumentException("Topics file : " + topicsFilePath + " does not exist or is not a (readable) file."); - } - try { - tr = (TopicReader) Class.forName("io.anserini.search.topicreader." + args.topicReader + "TopicReader") - .getConstructor(Path.class).newInstance(topicsFilePath); - topics.putAll(tr.read()); - } catch (Exception e) { - throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader); - } - } - - final String runTag = args.runtag == null ? 
"Elastirini" : args.runtag; - ESSearcherThread esThread = new ESSearcherThread(topics, args.output, runTag); - esThread.run(); - } - - public ScoredDocuments search(String queryString){ - - SearchHits results = null; - - String specials = "+-=&|> ScoredDocuments searchTweets(String queryString, long t){ - - SearchHits results = null; - - String specials = "+-=&|> tag contains the timestamp of the query in terms of the - // chronologically nearest tweet id within the corpus - RangeQueryBuilder queryTweetTime = QueryBuilders - .rangeQuery(TweetGenerator.TweetField.ID_LONG.name) - .from(0L) - .to(t); - - QueryStringQueryBuilder queryTerms = QueryBuilders - .queryStringQuery(queryString) - .defaultField("contents") - .analyzer("english"); - - BoolQueryBuilder query = QueryBuilders.boolQuery() - .filter(queryTweetTime) - .should(queryTerms); - - SearchRequest searchRequest = new SearchRequest(args.esIndex); - SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); - sourceBuilder.query(query); - sourceBuilder.size(args.hits); - sourceBuilder.sort(new ScoreSortBuilder().order(SortOrder.DESC)); - sourceBuilder.sort(new FieldSortBuilder(TweetGenerator.TweetField.ID_LONG.name).order(SortOrder.DESC)); - searchRequest.source(sourceBuilder); - - try { - SearchResponse searchResponse = client.search(searchRequest, COMMON_OPTIONS); - results = searchResponse.getHits(); - } catch (Exception e) { - LOG.error("Exception during ES query: ", e); - } - - ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker(); - return reranker.rerank(ScoredDocuments.fromESDocs(results), null); - } - - @Override - public void close() throws IOException { - client.close(); - } - - public static void main(String[] args) throws Exception { - Args searchElasticArgs = new Args(); - CmdLineParser parser = new CmdLineParser(searchElasticArgs, ParserProperties.defaults().withUsageWidth(90)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: SearchElastic" + parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } - - final long start = System.nanoTime(); - SearchElastic searcher = new SearchElastic(searchElasticArgs); - searcher.runTopics(); - searcher.close(); - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } -} \ No newline at end of file diff --git a/src/main/java/io/anserini/search/SearchSolr.java b/src/main/java/io/anserini/search/SearchSolr.java deleted file mode 100644 index 9b01661aa4..0000000000 --- a/src/main/java/io/anserini/search/SearchSolr.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.anserini.search; - -import com.google.common.base.Splitter; -import io.anserini.index.IndexArgs; -import io.anserini.index.generator.TweetGenerator; -import io.anserini.rerank.ScoredDocuments; -import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; -import io.anserini.search.topicreader.TopicReader; -import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.apache.lucene.document.LongPoint; -import org.apache.lucene.search.Query; -import org.apache.solr.client.solrj.SolrClient; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.SolrQuery.SortClause; -import org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.impl.Http2SolrClient; -import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.common.SolrDocumentList; -import org.kohsuke.args4j.CmdLineException; -import org.kohsuke.args4j.CmdLineParser; -import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; -import org.kohsuke.args4j.ParserProperties; -import org.kohsuke.args4j.spi.StringArrayOptionHandler; - -import java.io.Closeable; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Locale; -import java.util.Map; -import java.util.Optional; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.TimeUnit; - -/* -* Entry point of the Retrieval. - */ -public final class SearchSolr implements Closeable { - - private static final Logger LOG = LogManager.getLogger(SearchCollection.class); - private static final int TIMEOUT = 600 * 1000; - private final Args args; - private SolrClient client; - - public static final class Args { - - // required arguments - - @Option(name = "-topics", metaVar = "[file]", handler = StringArrayOptionHandler.class, required = true, usage = "topics file") - public String[] topics; - - @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file") - public String output; - - @Option(name = "-topicreader", required = true, usage = "define how to read the topic(query) file: one of [Trec|Webxml]") - public String topicReader; - - @Option(name = "-solr.index", usage = "the name of the index in Solr") - public String solrIndex = null; - - @Option(name = "-solr.zkUrl", usage = "the URL of Solr's ZooKeeper (comma separated list of using ensemble)") - public String zkUrl = null; - - @Option(name = "-solr.zkChroot", usage = "the ZooKeeper chroot") - public String zkChroot = "/"; - - // optional arguments - @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." 
+ - " For TREC ad hoc topics, description or narrative can be used.") - public String topicfield = "title"; - - @Option(name = "-searchtweets", usage = "Whether the search is against a tweet " + - "index created by IndexCollection -collection TweetCollection") - public Boolean searchtweets = false; - - @Option(name = "-hits", metaVar = "[number]", required = false, usage = "max number of hits to return") - public int hits = 1000; - - @Option(name = "-runtag", metaVar = "[tag]", required = false, usage = "runtag") - public String runtag = null; - - } - - private final class SolrSearcherThread extends Thread { - - final private SortedMap> topics; - final private String outputPath; - final private String runTag; - - private SolrSearcherThread(SortedMap> topics, String outputPath, String runTag){ - - this.topics = topics; - this.runTag = runTag; - this.outputPath = outputPath; - setName(outputPath); - } - - @Override - public void run() { - try { - LOG.info("[Start] Retrieval with Solr collection: " + args.solrIndex); - final long start = System.nanoTime(); - PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(outputPath), StandardCharsets.US_ASCII)); - - for (Map.Entry> entry : topics.entrySet()) { - K qid = entry.getKey(); - String queryString = entry.getValue().get(args.topicfield); - ScoredDocuments docs; - if (args.searchtweets) { - docs = searchTweets(queryString, Long.parseLong(entry.getValue().get("time"))); - } else { - docs = search(queryString); - } - - /** - * the first column is the topic number. - * the second column is currently unused and should always be "Q0". - * the third column is the official document identifier of the retrieved document. - * the fourth column is the rank the document is retrieved. - * the fifth column shows the score (integer or floating point) that generated the ranking. - * the sixth column is called the "run tag" and should be a unique identifier for your - */ - for (int i = 0; i < docs.documents.length; i++) { - out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid, - docs.documents[i].getField(IndexArgs.ID).stringValue(), (i + 1), docs.scores[i], runTag)); - } - } - out.flush(); - out.close(); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("[Finished] Run " + topics.size() + " topics searched in " - + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } - } - } - - public SearchSolr(Args args) throws IOException { - this.args = args; - LOG.info("Solr index: " + args.solrIndex); - LOG.info("Solr ZooKeeper URL: " + args.zkUrl); - this.client = new CloudSolrClient.Builder(Splitter.on(',') - .splitToList(args.zkUrl), Optional.of(args.zkChroot)) - .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT)) - .build(); - } - - @SuppressWarnings("unchecked") - public void runTopics() throws IOException { - TopicReader tr; - SortedMap> topics = new TreeMap<>(); - for (String singleTopicsFile : args.topics) { - Path topicsFilePath = Paths.get(singleTopicsFile); - if (!Files.exists(topicsFilePath) || !Files.isRegularFile(topicsFilePath) || !Files.isReadable(topicsFilePath)) { - throw new IllegalArgumentException("Topics file : " + topicsFilePath + " does not exist or is not a (readable) file."); - } - try { - tr = (TopicReader) Class.forName("io.anserini.search.topicreader." 
+ args.topicReader + "TopicReader") - .getConstructor(Path.class).newInstance(topicsFilePath); - topics.putAll(tr.read()); - } catch (Exception e) { - throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader); - } - } - - final String runTag = args.runtag == null ? "Solrini" : args.runtag; - SolrSearcherThread solrThread = new SolrSearcherThread(topics, args.output, runTag); - solrThread.run(); - } - - public ScoredDocuments search(String queryString){ - - SolrDocumentList results = null; - - SolrQuery solrq = new SolrQuery(); - solrq.set("df", "contents"); - solrq.set("fl", "* score"); - // Remove some characters in query which are special syntax in Solr query parser - solrq.setQuery(queryString.replaceAll("[+=&|<>!(){}~*?:/\"\\^\\-\\[\\]\\\\]", " ")); - solrq.setRows(args.hits); - solrq.setSort(SortClause.desc("score")); - solrq.addSort(SortClause.asc(IndexArgs.ID)); - - try { - QueryResponse response = client.query(args.solrIndex, solrq); - results = response.getResults(); - } catch (Exception e) { - LOG.error("Exception during Solr query: ", e); - } - - ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker(); - return reranker.rerank(ScoredDocuments.fromSolrDocs(results), null); - } - - public ScoredDocuments searchTweets(String queryString, long t){ - - SolrDocumentList results = null; - - SolrQuery solrq = new SolrQuery(); - solrq.set("df", "contents"); - solrq.set("fl", "* score"); - // Remove double quotes in query since they are special syntax in Solr query parser - solrq.setQuery(queryString.replace("\"", "")); - solrq.setRows(args.hits); - solrq.setSort(SortClause.desc("score")); - solrq.addSort(SortClause.desc(TweetGenerator.TweetField.ID_LONG.name)); - - // Do not consider the tweets with tweet ids that are beyond the queryTweetTime - // tag contains the timestamp of the query in terms of the - // chronologically nearest tweet id within the corpus - Query filter = LongPoint.newRangeQuery(TweetGenerator.TweetField.ID_LONG.name, 0L, t); - solrq.set("fq", filter.toString()); - - try { - QueryResponse response = client.query(args.solrIndex, solrq); - results = response.getResults(); - } catch (Exception e) { - LOG.error("Exception during Solr query: ", e); - } - - ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker(); - return reranker.rerank(ScoredDocuments.fromSolrDocs(results), null); - } - - @Override - public void close() throws IOException { - client.close(); - } - - public static void main(String[] args) throws Exception { - Args searchSolrArgs = new Args(); - CmdLineParser parser = new CmdLineParser(searchSolrArgs, ParserProperties.defaults().withUsageWidth(90)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: SearchSolr" + parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } - - final long start = System.nanoTime(); - SearchSolr searcher = new SearchSolr(searchSolrArgs); - searcher.runTopics(); - searcher.close(); - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } -} diff --git a/src/main/python/run_es_regression.py b/src/main/python/run_es_regression.py deleted file mode 100644 index b2084de21d..0000000000 --- a/src/main/python/run_es_regression.py +++ /dev/null @@ -1,256 +0,0 @@ -# -# Pyserini: Python interface to the 
Anserini IR toolkit built on Lucene -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import logging -import math -import os -import requests -import time - -import regression_utils - -# Note that this class is specifically written with REST API requests instead of the -# Elasticsearch client eliminate an additional dependency - -logger = logging.getLogger('run_es_regression') -ch = logging.StreamHandler() -ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s - %(message)s')) -logger.addHandler(ch) -logger.setLevel(logging.INFO) - - -class ElasticsearchClient: - def __init__(self): - pass - - @staticmethod - def is_alive(): - try: - response = requests.get('http://localhost:9200/') - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - - def does_index_exist(self, collection): - # Make sure ES is alive: - if self.is_alive(): - try: - response = requests.get('http://localhost:9200/{}'.format(collection)) - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - else: - raise Exception('ES does not appear to be alive!') - - def delete_index(self, collection): - logger.info('Deleting index {}...'.format(collection)) - # Make sure the index exists: - if self.does_index_exist(collection): - try: - response = requests.request('DELETE', url='http://localhost:9200/{}'.format(collection)) - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - else: - raise Exception('The index {} does not exist!'.format(collection)) - - def create_index(self, collection): - logger.info('Creating index {}...'.format(collection)) - # Make sure the index does not exist: - if not self.does_index_exist(collection): - filename = 'src/main/resources/elasticsearch/index-config.{}.json'.format(collection) - if not os.path.exists(filename): - raise Exception('No config found in src/main/resources/elasticsearch/ for {}!'.format(collection)) - logger.info('Using index config for {} at {}'.format(collection, filename)) - with open(filename, mode='r') as file: - json = file.read() - response = '' - try: - response = requests.request('PUT', url='http://localhost:9200/{}'.format(collection), - data=json, headers={'Content-type': 'application/json'}) - response.raise_for_status() - except requests.exceptions.RequestException: - logger.info(response) - return False - else: - return True - else: - raise Exception('The index {} already exists!'.format(collection)) - - def insert_docs(self, collection, path): - logger.info('Inserting documents from {} into {}... '.format(path, collection)) - if not os.path.exists(args.input): - raise Exception('{} does not exist!'.format(args.input)) - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - # TODO: abstract this into an external config instead of hard-coded. 
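# An illustrative sketch of the TODO above (not part of the patch; all names here are
# hypothetical): the branches below differ only in the collection class, the document
# generator, and one storage flag, so a small lookup table could replace the if/elif chain.
ES_INDEX_CONFIGS = {
    'robust04': ('TrecCollection', 'DefaultLuceneDocumentGenerator', '-storeRaw'),
    'msmarco-passage': ('JsonCollection', 'DefaultLuceneDocumentGenerator', '-storeRaw'),
    'core18': ('WashingtonPostCollection', 'WashingtonPostGenerator', '-storeContents'),
    'msmarco-doc': ('JsonCollection', 'DefaultLuceneDocumentGenerator', '-storeRaw'),
}

def build_es_index_command(collection, path):
    # Assemble the same IndexCollection invocation that the if/elif chain below spells out.
    if collection not in ES_INDEX_CONFIGS:
        raise Exception('Unknown collection: {}'.format(collection))
    coll_class, generator, store_flag = ES_INDEX_CONFIGS[collection]
    return ('sh target/appassembler/bin/IndexCollection -collection {} '
            '-generator {} -es -es.index {} -threads 8 -input {} '
            '-storePositions -storeDocvectors {}').format(
        coll_class, generator, collection, path, store_flag)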
- if collection == 'robust04': - command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \ - '-generator DefaultLuceneDocumentGenerator -es -es.index robust04 -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-passage -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'core18': - command = 'sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection ' + \ - '-generator WashingtonPostGenerator -es -es.index core18 -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeContents' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-doc -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeRaw' - else: - raise Exception('Unknown collection: {}'.format(collection)) - logger.info('Running indexing command: ' + command) - return regression_utils.run_shell_command(command, logger, echo=True) - - def evaluate(self, collection): - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - # TODO: abstract this into an external config instead of hard-coded. - if collection == 'robust04': - command = 'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index robust04 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.robust04.txt ' + \ - '-output runs/run.es.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvString -es.index msmarco-passage ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \ - '-output runs/run.es.msmarco-passage.txt' - elif collection == 'core18': - command = 'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index core18 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \ - '-output runs/run.es.core18.bm25.topics.core18.txt' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvInt -es.index msmarco-doc ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \ - '-output runs/run.es.msmarco-doc.txt' - else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Retrieval command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - logger.info('Retrieval complete!') - - if collection == 'robust04': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \ - 'runs/run.es.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \ - 'runs/run.es.msmarco-passage.txt' - elif collection == 'core18': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.es.core18.bm25.topics.core18.txt' - elif collection == 'msmarco-doc': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap 
' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.es.msmarco-doc.txt' - else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Evaluation command: ' + command) - output = regression_utils.run_shell_command(command, logger, capture=True) - ap = float(output[0].split('\t')[2]) - - if collection == 'robust04': - expected = 0.2531 - elif collection == 'msmarco-passage': - expected = 0.1956 - elif collection == 'core18': - expected = 0.2496 - elif collection == 'msmarco-doc': - expected = 0.2307 - else: - raise Exception('Unknown collection: {}'.format(collection)) - - if math.isclose(ap, expected): - logger.info('[SUCESS] {} MAP verified as expected!'.format(ap)) - else: - logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Program for running Elasticsearch regressions.') - parser.add_argument('--ping', action='store_true', default=False, help='Ping ES and exit.') - parser.add_argument('--check-index-exists', default='', type=str, metavar='collection', - help='Check if index exists.') - parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.') - parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.') - parser.add_argument('--insert-docs', default='', type=str, metavar='collection', - help='Insert documents into index.') - parser.add_argument('--input', default='', type=str, metavar='directory', - help='Location of documents to insert into index.') - parser.add_argument('--evaluate', default='', type=str, metavar='collection', - help='Search and evaluate on collection.') - parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.') - - args = parser.parse_args() - es = ElasticsearchClient() - - if args.ping: - logger.info('Pinging Elasticsearch instance...') - if es.is_alive(): - logger.info('... appears to alive! :)') - else: - logger.info('... appears to dead! :(') - elif args.check_index_exists: - logger.info('Checking if index {} exists...'.format(args.check_index_exists)) - if es.does_index_exist(args.check_index_exists): - logger.info('... yes indeed!') - else: - logger.info('... appears not.') - elif args.delete_index: - if es.delete_index(args.delete_index): - logger.info('... successful!') - else: - logger.info('... failed!') - elif args.create_index: - if es.create_index(args.create_index): - logger.info('... successful!') - else: - logger.info('... failed!') - elif args.insert_docs: - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - else: - es.insert_docs(args.insert_docs, args.input) - elif args.evaluate: - es.evaluate(args.evaluate) - elif args.regression: - logger.info('Running BM25 regression on {}...'.format(args.regression)) - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - if not es.is_alive(): - raise Exception('Elasticsearch does not appear to be alive!') - if es.does_index_exist(args.regression): - logger.info('Index {} already exists: deleting and recreating.'.format(args.regression)) - es.delete_index(args.regression) - es.create_index(args.regression) - es.insert_docs(args.regression, args.input) - # Documents ingested into ES are not immediately searchable. There are lots of 'refresh' options - # to control the visibility behavior, but the simplest solution is just to wait for a bit... 
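# An illustrative alternative (not part of the patch): Elasticsearch also exposes an
# explicit refresh endpoint, POST /<index>/_refresh, which makes newly ingested documents
# searchable on demand. A sketch in this script's REST-only style:
def refresh_index(collection):
    # Force a refresh so that everything indexed so far becomes visible to search.
    response = requests.post('http://localhost:9200/{}/_refresh'.format(collection))
    response.raise_for_status()
# The fixed sleep below is the simpler choice and also gives the node time to settle
# after bulk ingestion, at the cost of always waiting the full interval.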
- logger.info('Document ingestion complete. Sleeping now for 120s...') - time.sleep(120) - logger.info('Waking up!') - es.evaluate(args.regression) diff --git a/src/main/python/run_solr_regression.py b/src/main/python/run_solr_regression.py deleted file mode 100644 index 3fa8486a4b..0000000000 --- a/src/main/python/run_solr_regression.py +++ /dev/null @@ -1,247 +0,0 @@ -# -# Pyserini: Python interface to the Anserini IR toolkit built on Lucene -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import logging -import math -import os -import requests - -import regression_utils - -logger = logging.getLogger('run_solr_regression') -ch = logging.StreamHandler() -ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s - %(message)s')) -logger.addHandler(ch) -logger.setLevel(logging.INFO) - - -class SolrClient: - def __init__(self): - pass - - @staticmethod - def is_alive(): - try: - response = requests.get('http://localhost:8983/') - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - - def does_index_exist(self, collection): - # Make sure Solr is alive: - if self.is_alive(): - try: - response = requests.get('http://localhost:8983/solr/admin/collections?action=LIST') - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return collection in response.json()['collections'] - else: - raise Exception('Solr does not appear to be alive!') - - def delete_index(self, collection): - # Make sure the index exists: - if self.does_index_exist(collection): - command = 'solrini/bin/solr delete -c {}'.format(collection) - logger.info('Deleting index {} command: {}'.format(collection, command)) - regression_utils.run_shell_command(command, logger, echo=True) - return not self.does_index_exist(collection) - else: - raise Exception('The index {} does not exist!'.format(collection)) - - def create_index(self, collection): - # Make sure the index does not exist: - if not self.does_index_exist(collection): - # Re-upload configsets to Solr's internal Zookeeper - self.upload_configs() - command = 'solrini/bin/solr create -n anserini -c {}'.format(collection) - logger.info('Creating index {} command: {}'.format(collection, command)) - regression_utils.run_shell_command(command, logger, echo=True) - return self.does_index_exist(collection) - else: - raise Exception('The index {} already exists!'.format(collection)) - - def insert_docs(self, collection, path): - logger.info('Inserting documents from {} into {}... 
'.format(path, collection)) - if not os.path.exists(args.input): - raise Exception('{} does not exist!'.format(args.input)) - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - if collection == 'core18': - command = 'sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection ' + \ - '-generator WashingtonPostGenerator -solr -solr.index core18 -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeContents' - elif collection == 'robust04': - command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \ - '-generator DefaultLuceneDocumentGenerator ' + \ - '-solr -solr.index robust04 -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator ' + \ - '-solr -solr.index msmarco-passage -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator ' + \ - '-solr -solr.index msmarco-doc -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw' - else: - raise Exception('Unknown collection: {}'.format(collection)) - logger.info('Running indexing command: ' + command) - return regression_utils.run_shell_command(command, logger, echo=True) - - @staticmethod - def upload_configs(): - os.chdir('src/main/resources/solr') - command = 'rm -rf anserini/conf/lang anserini-twitter/conf/lang' - logger.info('Deleting existed configs command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - command = './solr.sh ../../../../solrini localhost:9983' - logger.info('Uploading configs command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - os.chdir('../../../..') - logger.info('Uploading complete!') - - def evaluate(self, collection): - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - if collection == 'core18': - command = 'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index core18 ' + \ - '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \ - '-output runs/run.solr.core18.bm25.topics.core18.txt' - elif collection == 'robust04': - command = 'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index robust04 ' + \ - '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.robust04.txt ' + \ - '-output runs/run.solr.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvString -solr.index msmarco-passage ' + \ - '-solr.zkUrl localhost:9983 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \ - '-output runs/run.solr.msmarco-passage.txt' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvInt -solr.index msmarco-doc ' + \ - '-solr.zkUrl localhost:9983 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \ - '-output runs/run.solr.msmarco-doc.txt ' - 
else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Retrieval command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - logger.info('Retrieval complete!') - - if collection == 'core18': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.solr.core18.bm25.topics.core18.txt' - elif collection == 'robust04': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \ - 'runs/run.solr.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \ - 'runs/run.solr.msmarco-passage.txt' - elif collection == 'msmarco-doc': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.solr.msmarco-doc.txt' - else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Evaluation command: ' + command) - output = regression_utils.run_shell_command(command, logger, capture=True) - ap = float(output[0].split('\t')[2]) - - if collection == 'core18': - expected = 0.2496 - elif collection == 'robust04': - expected = 0.2531 - elif collection == 'msmarco-passage': - expected = 0.1926 - elif collection == 'msmarco-doc': - expected = 0.2305 - else: - raise Exception('Unknown collection: {}'.format(collection)) - - if math.isclose(ap, expected): - logger.info('[SUCESS] {} MAP verified as expected!'.format(ap)) - else: - logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Program for running Solr regressions.') - parser.add_argument('--ping', action='store_true', default=False, help='ping Solr and exit') - parser.add_argument('--check-index-exists', default='', type=str, metavar='collection', - help='Check if index exists.') - parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.') - parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.') - parser.add_argument('--insert-docs', default='', type=str, metavar='collection', - help='Insert documents into index.') - parser.add_argument('--input', default='', type=str, metavar='directory', - help='Location of documents to insert into index.') - parser.add_argument('--evaluate', default='', type=str, metavar='collection', - help='Search and evaluate on collection.') - parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.') - - args = parser.parse_args() - solr = SolrClient() - - if args.ping: - logger.info('Pinging Solr instance...') - if solr.is_alive(): - logger.info('... appears to alive! :)') - else: - logger.info('... appears to dead! :(') - elif args.check_index_exists: - logger.info('Checking if index {} exists...'.format(args.check_index_exists)) - if solr.does_index_exist(args.check_index_exists): - logger.info('... yes indeed!') - else: - logger.info('... appears not.') - elif args.delete_index: - if solr.delete_index(args.delete_index): - logger.info('... successful!') - else: - logger.info('... failed!') - elif args.create_index: - if solr.create_index(args.create_index): - logger.info('... 
successful!') - else: - logger.info('... failed!') - elif args.insert_docs: - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - else: - solr.insert_docs(args.insert_docs, args.input) - elif args.evaluate: - solr.evaluate(args.evaluate) - elif args.regression: - logger.info('Running BM25 regression on {}...'.format(args.regression)) - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - if not solr.is_alive(): - raise Exception('Solr does not appear to be alive!') - if solr.does_index_exist(args.regression): - logger.info('Index {} already exists: deleting and recreating.'.format(args.regression)) - solr.delete_index(args.regression) - solr.create_index(args.regression) - solr.insert_docs(args.regression, args.input) - solr.evaluate(args.regression) diff --git a/src/main/resources/elasticsearch/index-config.cord19.json b/src/main/resources/elasticsearch/index-config.cord19.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.cord19.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.core18.json b/src/main/resources/elasticsearch/index-config.core18.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.core18.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.msmarco-doc.json b/src/main/resources/elasticsearch/index-config.msmarco-doc.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.msmarco-doc.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.msmarco-passage.json b/src/main/resources/elasticsearch/index-config.msmarco-passage.json deleted file mode 100644 index ad33344097..0000000000 --- a/src/main/resources/elasticsearch/index-config.msmarco-passage.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": 
"BM25", - "k1": "0.82", - "b": "0.68" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.robust04.json b/src/main/resources/elasticsearch/index-config.robust04.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.robust04.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/solr/anserini-twitter/conf/managed-schema b/src/main/resources/solr/anserini-twitter/conf/managed-schema deleted file mode 100644 index 08e1f08be5..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/managed-schema +++ /dev/null @@ -1,216 +0,0 @@ - - - - id - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0.9 - 0.4 - - - diff --git a/src/main/resources/solr/anserini-twitter/conf/params.json b/src/main/resources/solr/anserini-twitter/conf/params.json deleted file mode 100644 index 06114ef257..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/params.json +++ /dev/null @@ -1,20 +0,0 @@ -{"params":{ - "query":{ - "defType":"edismax", - "q.alt":"*:*", - "rows":"10", - "fl":"*,score", - "":{"v":0} - }, - "facets":{ - "facet":"on", - "facet.mincount": "1", - "":{"v":0} - }, - "velocity":{ - "wt": "velocity", - "v.template":"browse", - "v.layout": "layout", - "":{"v":0} - } -}} \ No newline at end of file diff --git a/src/main/resources/solr/anserini-twitter/conf/protwords.txt b/src/main/resources/solr/anserini-twitter/conf/protwords.txt deleted file mode 100644 index 1dfc0abecb..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/protwords.txt +++ /dev/null @@ -1,21 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -# Use a protected word file to protect against the stemmer reducing two -# unrelated words to the same base word. - -# Some non-words that normally won't be encountered, -# just to test that they won't be stemmed. 
-dontstems -zwhacky - diff --git a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml b/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml deleted file mode 100644 index 5f3e4208ef..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml +++ /dev/null @@ -1,1341 +0,0 @@ - - - - - - - - - 9.0.0 - - - - - - - - - ${solr.data.dir:} - - - - - - - - - - - - - - - - - - - 2048 - - - - - - - - - - ${solr.lock.type:native} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${solr.ulog.dir:} - ${solr.ulog.numVersionBuckets:65536} - - - - - ${solr.autoCommit.maxTime:15000} - false - - - - - - ${solr.autoSoftCommit.maxTime:-1} - - - - - - - - - - - - - - 1024 - - - - - - - - - - - - - - - - - - - - - - - - true - - - - - - 20 - - - 200 - - - - - - - - - - - - - - - - false - - - - - - - - - - - - - - - - - - - - - - explicit - 10 - - - - - - - - - - - - - - - - explicit - json - true - - - - - - - - explicit - - - - - - _text_ - - - - - - - true - ignored_ - _text_ - - - - - - - - - text_general - - - - - - default - _text_ - solr.DirectSolrSpellChecker - - internal - - 0.5 - - 2 - - 1 - - 5 - - 4 - - 0.01 - - - - - - - - - - - - default - on - true - 10 - 5 - 5 - true - true - 10 - 5 - - - spellcheck - - - - - - - - - - true - - - tvComponent - - - - - - - - - - - - true - false - - - terms - - - - - - - - string - - - - - - explicit - - - elevator - - - - - - - - - - - 100 - - - - - - - - 70 - - 0.5 - - [-\w ,/\n\"']{20,200} - - - - - - - ]]> - ]]> - - - - - - - - - - - - - - - - - - - - - - - - ,, - ,, - ,, - ,, - ,]]> - ]]> - - - - - - 10 - .,!? - - - - - - - WORD - - - en - US - - - - - - - - - - - - [^\w-\.] - _ - - - - - - - yyyy-MM-dd'T'HH:mm:ss.SSSZ - yyyy-MM-dd'T'HH:mm:ss,SSSZ - yyyy-MM-dd'T'HH:mm:ss.SSS - yyyy-MM-dd'T'HH:mm:ss,SSS - yyyy-MM-dd'T'HH:mm:ssZ - yyyy-MM-dd'T'HH:mm:ss - yyyy-MM-dd'T'HH:mmZ - yyyy-MM-dd'T'HH:mm - yyyy-MM-dd HH:mm:ss.SSSZ - yyyy-MM-dd HH:mm:ss,SSSZ - yyyy-MM-dd HH:mm:ss.SSS - yyyy-MM-dd HH:mm:ss,SSS - yyyy-MM-dd HH:mm:ssZ - yyyy-MM-dd HH:mm:ss - yyyy-MM-dd HH:mmZ - yyyy-MM-dd HH:mm - yyyy-MM-dd - - - - - java.lang.String - text_general - - *_str - 256 - - - true - - - java.lang.Boolean - booleans - - - java.util.Date - pdates - - - java.lang.Long - java.lang.Integer - plongs - - - java.lang.Number - pdoubles - - - - - - - - - - - - - - - - - - - - - - - - - - text/plain; charset=UTF-8 - - - - - ${velocity.template.base.dir:} - ${velocity.solr.resource.loader.enabled:true} - ${velocity.params.resource.loader.enabled:false} - - - - - - - - - - - - - - diff --git a/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt b/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt deleted file mode 100644 index e11bbd5670..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt +++ /dev/null @@ -1,49 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -such -that -the -their -then -there -these -they -this -to -was -will -with diff --git a/src/main/resources/solr/anserini-twitter/conf/synonyms.txt b/src/main/resources/solr/anserini-twitter/conf/synonyms.txt deleted file mode 100644 index eab4ee8753..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/synonyms.txt +++ /dev/null @@ -1,29 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -#some test synonym mappings unlikely to appear in real input text -aaafoo => aaabar -bbbfoo => bbbfoo bbbbar -cccfoo => cccbar cccbaz -fooaaa,baraaa,bazaaa - -# Some synonym groups specific to this example -GB,gib,gigabyte,gigabytes -MB,mib,megabyte,megabytes -Television, Televisions, TV, TVs -#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming -#after us won't split it into two words. - -# Synonym mappings can be used for spelling correction too -pixima => pixma - diff --git a/src/main/resources/solr/anserini/conf/managed-schema b/src/main/resources/solr/anserini/conf/managed-schema deleted file mode 100644 index 08e1f08be5..0000000000 --- a/src/main/resources/solr/anserini/conf/managed-schema +++ /dev/null @@ -1,216 +0,0 @@ - - - - id - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0.9 - 0.4 - - - diff --git a/src/main/resources/solr/anserini/conf/params.json b/src/main/resources/solr/anserini/conf/params.json deleted file mode 100644 index 06114ef257..0000000000 --- a/src/main/resources/solr/anserini/conf/params.json +++ /dev/null @@ -1,20 +0,0 @@ -{"params":{ - "query":{ - "defType":"edismax", - "q.alt":"*:*", - "rows":"10", - "fl":"*,score", - "":{"v":0} - }, - "facets":{ - "facet":"on", - "facet.mincount": "1", - "":{"v":0} - }, - "velocity":{ - "wt": "velocity", - "v.template":"browse", - "v.layout": "layout", - "":{"v":0} - } -}} \ No newline at end of file diff --git a/src/main/resources/solr/anserini/conf/protwords.txt b/src/main/resources/solr/anserini/conf/protwords.txt deleted file mode 100644 index 1dfc0abecb..0000000000 --- a/src/main/resources/solr/anserini/conf/protwords.txt +++ /dev/null @@ -1,21 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -# Use a protected word file to protect against the stemmer reducing two -# unrelated words to the same base word. - -# Some non-words that normally won't be encountered, -# just to test that they won't be stemmed. -dontstems -zwhacky - diff --git a/src/main/resources/solr/anserini/conf/solrconfig.xml b/src/main/resources/solr/anserini/conf/solrconfig.xml deleted file mode 100644 index b00368515b..0000000000 --- a/src/main/resources/solr/anserini/conf/solrconfig.xml +++ /dev/null @@ -1,1343 +0,0 @@ - - - - - - - - - 9.0.0 - - - - - - - - - - ${solr.data.dir:} - - - - - - - - - - - - - - - - - - - 2048 - - - - - - - - - - ${solr.lock.type:native} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${solr.ulog.dir:} - ${solr.ulog.numVersionBuckets:65536} - - - - - ${solr.autoCommit.maxTime:15000} - false - - - - - - ${solr.autoSoftCommit.maxTime:-1} - - - - - - - - - - - - - - 1024 - - - - - - - - - - - - - - - - - - - - - - - - true - - - - - - 20 - - - 200 - - - - - - - - - - - - - - - - false - - - - - - - - - - - - - - - - - - - - - - explicit - 10 - - - - - - - - - - - - - - - - explicit - json - true - - - - - - - - explicit - - - - - - _text_ - - - - - - - true - ignored_ - _text_ - - - - - - - - - text_general - - - - - - default - _text_ - solr.DirectSolrSpellChecker - - internal - - 0.5 - - 2 - - 1 - - 5 - - 4 - - 0.01 - - - - - - - - - - - - default - on - true - 10 - 5 - 5 - true - true - 10 - 5 - - - spellcheck - - - - - - - - - - true - - - tvComponent - - - - - - - - - - - - true - false - - - terms - - - - - - - - string - - - - - - explicit - - - elevator - - - - - - - - - - - 100 - - - - - - - - 70 - - 0.5 - - [-\w ,/\n\"']{20,200} - - - - - - - ]]> - ]]> - - - - - - - - - - - - - - - - - - - - - - - - ,, - ,, - ,, - ,, - ,]]> - ]]> - - - - - - 10 - .,!? - - - - - - - WORD - - - en - US - - - - - - - - - - - - [^\w-\.] 
- _ - - - - - - - yyyy-MM-dd'T'HH:mm:ss.SSSZ - yyyy-MM-dd'T'HH:mm:ss,SSSZ - yyyy-MM-dd'T'HH:mm:ss.SSS - yyyy-MM-dd'T'HH:mm:ss,SSS - yyyy-MM-dd'T'HH:mm:ssZ - yyyy-MM-dd'T'HH:mm:ss - yyyy-MM-dd'T'HH:mmZ - yyyy-MM-dd'T'HH:mm - yyyy-MM-dd HH:mm:ss.SSSZ - yyyy-MM-dd HH:mm:ss,SSSZ - yyyy-MM-dd HH:mm:ss.SSS - yyyy-MM-dd HH:mm:ss,SSS - yyyy-MM-dd HH:mm:ssZ - yyyy-MM-dd HH:mm:ss - yyyy-MM-dd HH:mmZ - yyyy-MM-dd HH:mm - yyyy-MM-dd - - - - - java.lang.String - text_general - - *_str - 256 - - - true - - - java.lang.Boolean - booleans - - - java.util.Date - pdates - - - java.lang.Long - java.lang.Integer - plongs - - - java.lang.Number - pdoubles - - - - - - - - - - - - - - - - - - - - - - - - - - text/plain; charset=UTF-8 - - - - - ${velocity.template.base.dir:} - ${velocity.solr.resource.loader.enabled:true} - ${velocity.params.resource.loader.enabled:false} - - - - - - - - - - - - - - - diff --git a/src/main/resources/solr/anserini/conf/stopwords_en.txt b/src/main/resources/solr/anserini/conf/stopwords_en.txt deleted file mode 100644 index e11bbd5670..0000000000 --- a/src/main/resources/solr/anserini/conf/stopwords_en.txt +++ /dev/null @@ -1,49 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -such -that -the -their -then -there -these -they -this -to -was -will -with diff --git a/src/main/resources/solr/anserini/conf/synonyms.txt b/src/main/resources/solr/anserini/conf/synonyms.txt deleted file mode 100644 index eab4ee8753..0000000000 --- a/src/main/resources/solr/anserini/conf/synonyms.txt +++ /dev/null @@ -1,29 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -#some test synonym mappings unlikely to appear in real input text -aaafoo => aaabar -bbbfoo => bbbfoo bbbbar -cccfoo => cccbar cccbaz -fooaaa,baraaa,bazaaa - -# Some synonym groups specific to this example -GB,gib,gigabyte,gigabytes -MB,mib,megabyte,megabytes -Television, Televisions, TV, TVs -#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming -#after us won't split it into two words. 
- -# Synonym mappings can be used for spelling correction too -pixima => pixma - diff --git a/src/main/resources/solr/schemas/acl-anthology.json b/src/main/resources/solr/schemas/acl-anthology.json deleted file mode 100644 index e358861e83..0000000000 --- a/src/main/resources/solr/schemas/acl-anthology.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"sigs", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"venues", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"pages", - "type":"string", - "stored":true, - "docValues": false - } -} \ No newline at end of file diff --git a/src/main/resources/solr/schemas/cord19.json b/src/main/resources/solr/schemas/cord19.json deleted file mode 100644 index 8a9d305b9b..0000000000 --- a/src/main/resources/solr/schemas/cord19.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"source_x", - "type":"string", - "stored":true, - "multiValued": true - }, - "add-field": { - "name":"pmcid", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"pubmed_id", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"publish_time", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"doi", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"journal", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"license", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"sha", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"url", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"year", - "type":"pint", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"outcomes_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"population_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"interventions_vocab", - "type":"string", - "stored":true, - "multiValued":true - } -} diff --git a/src/main/resources/solr/schemas/core.json b/src/main/resources/solr/schemas/core.json deleted file mode 100644 index f6c205539b..0000000000 --- a/src/main/resources/solr/schemas/core.json +++ /dev/null @@ -1,56 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"contributors", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"identifiers", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"journals", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":false - }, - "add-field": { - "name":"relations", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"subjects", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"topics", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"datePublished", - "type":"string", - "stored":true - 
} -} \ No newline at end of file diff --git a/src/main/resources/solr/schemas/covid.json b/src/main/resources/solr/schemas/covid.json deleted file mode 100644 index f6a1f237f3..0000000000 --- a/src/main/resources/solr/schemas/covid.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"source_x", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"pmcid", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"pubmed_id", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"publish_time", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"doi", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"journal", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"license", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"sha", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"url", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"year", - "type":"pint", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"outcomes_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"population_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"interventions_vocab", - "type":"string", - "stored":true, - "multiValued":true - } -} diff --git a/src/main/resources/solr/solr.sh b/src/main/resources/solr/solr.sh deleted file mode 100755 index 194ea446d8..0000000000 --- a/src/main/resources/solr/solr.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env sh - -### -# This script assumes a single-node SolrCloud instance is running locally. -### - -if [[ -z "$1" ]]; then - echo "Usage: ./solr.sh " - exit 1 -fi - -# Solr install directory -SOLR_DIR=$1 - -# Solr's ZooKeeper URL -ZOOKEEPER_URL=$2 - -# Copy anserini into lib dir -mkdir ${SOLR_DIR}/lib && cp ../../../../target/anserini-*-fatjar.jar ${SOLR_DIR}/lib - -# Upload configset to Solr -${SOLR_DIR}/bin/solr zk -z ${ZOOKEEPER_URL:-localhost:9983} upconfig -n anserini -d anserini -${SOLR_DIR}/bin/solr zk -z ${ZOOKEEPER_URL:-localhost:9983} upconfig -n anserini-twitter -d anserini-twitter diff --git a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java b/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java deleted file mode 100644 index a46383b484..0000000000 --- a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
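The deleted helper above took a Solr install directory and, optionally, a ZooKeeper address, falling back to localhost:9983 via the ${ZOOKEEPER_URL:-...} expansion. A usage sketch with a hypothetical install path, run from src/main/resources/solr/ so the relative configset paths resolve:

    # /opt/solr is an assumed install location; the second argument may be omitted.
    cd src/main/resources/solr
    ./solr.sh /opt/solr localhost:9983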
diff --git a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java b/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java
deleted file mode 100644
index a46383b484..0000000000
--- a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.collection.AclAnthology;
-import io.anserini.index.IndexArgs;
-import io.anserini.index.generator.AclAnthologyGenerator;
-import io.anserini.search.SearchSolr;
-
-public class AclAnthologyEndToEndTest extends SolrEndToEndTest {
-  @Override
-  protected String getCollectionName() {
-    return "AclAnthology";
-  }
-
-  @Override
-  protected String getSchemaAdjustmentFile() {
-    return "solr/schemas/acl-anthology.json";
-  }
-
-  @Override
-  public IndexArgs getIndexArgs() {
-    IndexArgs indexArgs = createDefaultIndexArgs();
-    indexArgs.input = "src/test/resources/sample_docs/acl";
-    indexArgs.collectionClass = AclAnthology.class.getSimpleName();
-    indexArgs.generatorClass = AclAnthologyGenerator.class.getSimpleName();
-    return indexArgs;
-  }
-
-  @Override
-  protected SearchSolr.Args getSearchArgs() {
-    return createSearchArgs("TsvInt", "src/test/resources/sample_topics/acl_topics.tsv");
-  }
-
-  @Override
-  protected String[] getRefRankingResult() {
-    return new String[]{ // bm25
-        "1 Q0 C00-1007 1 0.294000 Solrini",
-        "1 Q0 E17-1003 2 0.186100 Solrini",
-        "2 Q0 C00-1003 1 0.622700 Solrini"
-    };
-  }
-}
diff --git a/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java b/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java
deleted file mode 100644
index 761e12e537..0000000000
--- a/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.collection.CoreCollection;
-import io.anserini.index.IndexArgs;
-import io.anserini.index.generator.CoreGenerator;
-import io.anserini.search.SearchSolr;
-
-public class CoreEndToEndTest extends SolrEndToEndTest {
-  @Override
-  protected String getCollectionName() {
-    return "Core";
-  }
-
-  @Override
-  protected String getSchemaAdjustmentFile() {
-    return "solr/schemas/core.json";
-  }
-
-  @Override
-  protected IndexArgs getIndexArgs() {
-    IndexArgs indexArgs = createDefaultIndexArgs();
-    indexArgs.input = "src/test/resources/sample_docs/core";
-    indexArgs.collectionClass = CoreCollection.class.getSimpleName();
-    indexArgs.generatorClass = CoreGenerator.class.getSimpleName();
-    return indexArgs;
-  }
-
-  @Override
-  protected SearchSolr.Args getSearchArgs() {
-    return createSearchArgs("TsvInt", "src/test/resources/sample_topics/core_topics.tsv");
-  }
-
-  @Override
-  protected String[] getRefRankingResult() {
-    return new String[]{ // bm25
-        "1 Q0 coreDoc1 1 0.243200 Solrini",
-        "1 Q0 doi2 2 0.243199 Solrini",
-        "2 Q0 coreDoc1 1 0.243200 Solrini",
-        "2 Q0 doi2 2 0.243199 Solrini",
-        "3 Q0 fullCoreDoc 1 0.534600 Solrini"
-    };
-  }
-}
diff --git a/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java b/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java
deleted file mode 100644
index d2529d7c6d..0000000000
--- a/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.index.IndexArgs;
-import io.anserini.index.IndexCollection;
-import io.anserini.search.SearchSolr;
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.pool2.BasePooledObjectFactory;
-import org.apache.commons.pool2.ObjectPool;
-import org.apache.commons.pool2.PooledObject;
-import org.apache.commons.pool2.impl.DefaultPooledObject;
-import org.apache.commons.pool2.impl.GenericObjectPool;
-import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.solr.client.solrj.SolrClient;
-import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
-import org.apache.solr.client.solrj.request.CoreAdminRequest;
-import org.apache.solr.client.solrj.request.json.DirectJsonQueryRequest;
-import org.apache.solr.client.solrj.response.QueryResponse;
-import org.apache.solr.common.params.CommonParams;
-import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.core.NodeConfig;
-import org.apache.solr.core.SolrResourceLoader;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.lang.reflect.Field;
-import java.net.URL;
-import java.nio.file.Files;
-
-
-@LuceneTestCase.SuppressSysoutChecks(bugUrl = "None")
-public abstract class SolrEndToEndTest extends LuceneTestCase {
-  private static final Logger LOG = LogManager.getLogger(SolrEndToEndTest.class);
-
-  protected ObjectPool<SolrClient> stubSolrPool;
-  protected final String searchOutputPrefix = "e2eTestSearch";
-
-  protected EmbeddedSolrServer client;
-
-  protected static File getFile(String path) {
-    final URL url = SolrEndToEndTest.class.getClassLoader().getResource(path);
-    if (url != null) {
-      try {
-        return new File(url.toURI());
-      } catch (Exception e) {
-        throw new RuntimeException("Resource was found on classpath, but cannot be resolved to a normal file: " + path);
-      }
-    }
-    final File file = new File(path);
-    if (file.exists()) {
-      return file;
-    }
-    throw new RuntimeException("Cannot find resource in classpath or in file-system (relative to CWD): " + path);
-  }
-
-  @Before
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-
-    final File solrHome = createTempDir().toFile();
-    final File configSetBaseDir = new File(solrHome.toPath() + File.separator + "configsets");
-    FileUtils.copyDirectory(getFile("solr/anserini"), new File(configSetBaseDir + File.separator + "anserini"));
-
-    SolrResourceLoader loader = new SolrResourceLoader(solrHome.toPath());
-    NodeConfig config = new NodeConfig.NodeConfigBuilder("embeddedSolrServerNode", loader.getInstancePath())
-        .setConfigSetBaseDirectory(configSetBaseDir.getAbsolutePath()).build();
-    client = new EmbeddedSolrServer(config, getCollectionName());
-    LOG.info("Created Embedded Solr Server");
-
-    CoreAdminRequest.Create createRequest = new CoreAdminRequest.Create();
-    createRequest.setCoreName(getCollectionName());
-    createRequest.setConfigSet("anserini");
-    createRequest.process(client);
-    client.commit();
-    LOG.info("Created Solr Core: " + getCollectionName());
-
-    GenericObjectPoolConfig<SolrClient> poolConfig = new GenericObjectPoolConfig<>();
-    poolConfig.setMaxTotal(1); // only 1 EmbeddedSolrServer instance will be created by getSolrClient
-    poolConfig.setMinIdle(1);
-    stubSolrPool = new GenericObjectPool<>(new StubSolrClientFactory(client), poolConfig);
-  }
-
-  @After
-  @Override
-  public void tearDown() throws Exception {
-    super.tearDown();
-
-    client.deleteByQuery("*:*");
-    client.commit();
-    client.close();
-    stubSolrPool.close();
-  }
-
-  protected IndexArgs createDefaultIndexArgs() {
-    IndexArgs args = new IndexArgs();
-
-    args.solrIndex = getCollectionName();
-    args.threads = 1;
-    args.storePositions = true;
-    args.storeDocvectors = true;
-    args.storeContents = true;
-    args.storeRaw = true;
-    args.optimize = true;
-    args.quiet = true;
-    args.solr = true;
-
-    return args;
-  }
-
-  protected SearchSolr.Args createSearchArgs(String topicReader, String topicFile) {
-    SearchSolr.Args args = new SearchSolr.Args();
-
-    args.solrIndex = getCollectionName();
-    args.output = searchOutputPrefix + topicReader;
-    args.topicReader = topicReader;
-    args.topics = new String[]{topicFile};
-    args.zkUrl = "localhost"; // SearchSolr initialization workaround
-
-    return args;
-  }
-
-  protected static class StubSolrClientFactory extends BasePooledObjectFactory<SolrClient> {
-    final SolrClient client;
-
-    public StubSolrClientFactory(SolrClient client) {
-      this.client = client;
-    }
-
-    @Override
-    public SolrClient create() {
-      return this.client;
-    }
-
-    @Override
-    public PooledObject<SolrClient> wrap(SolrClient solrClient) {
-      return new DefaultPooledObject<>(solrClient);
-    }
-  }
-
-  protected IndexCollection getIndexRunner(IndexArgs args) throws Exception {
-    IndexCollection runner = new IndexCollection(args);
-    Field f = runner.getClass().getDeclaredField("solrPool");
-    f.setAccessible(true);
-    f.set(runner, stubSolrPool);
-    return runner;
-  }
-
-  protected SearchSolr getSearchRunner(SearchSolr.Args args) throws Exception {
-    SearchSolr runner = new SearchSolr(args);
-    Field f = runner.getClass().getDeclaredField("client");
-    f.setAccessible(true);
-    ((SolrClient) f.get(runner)).close(); // close the old client
-    f.set(runner, client);
-    return runner;
-  }
-
-  protected abstract String getCollectionName();
-
-  protected abstract String getSchemaAdjustmentFile();
-
-  protected abstract IndexArgs getIndexArgs();
-
-  protected abstract SearchSolr.Args getSearchArgs();
-
-  protected abstract String[] getRefRankingResult();
-
-  @Test
-  public void testIndexAndSearch() throws Exception {
-    String schemaAdjustmentFile = getSchemaAdjustmentFile();
-    if (schemaAdjustmentFile != null) {
-      // update schema, much like curl -X POST -H 'Content-type:application/json' --data-binary SCHEMA_NAME.json http://localhost:8983/solr/COLLECTION_NAME/schema
-      String schemaJson = Files.readString(getFile(schemaAdjustmentFile).toPath());
-      ModifiableSolrParams params = new ModifiableSolrParams();
-      params.add(CommonParams.QT, "/schema");
-      DirectJsonQueryRequest schemaRequest = new DirectJsonQueryRequest(schemaJson, params);
-      QueryResponse response = schemaRequest.process(client, getCollectionName());
-      assertEquals(0, response.getStatus());
-    }
-
-    IndexArgs indexArgs = getIndexArgs();
-    IndexCollection indexRunner = getIndexRunner(indexArgs);
-    indexRunner.run();
-
-    SearchSolr.Args searchArgs = getSearchArgs();
-    SearchSolr searchRunner = getSearchRunner(searchArgs);
-    searchRunner.runTopics();
-
-    BufferedReader br = new BufferedReader(new FileReader(searchArgs.output));
-    String[] ref = getRefRankingResult();
-    String s;
-    int cnt = 0;
-    while ((s = br.readLine()) != null) {
-      assertEquals(ref[cnt], s);
-      cnt++;
-    }
-    assertEquals(cnt, ref.length);
-    FileUtils.deleteQuietly(new File(searchArgs.output));
-  }
-}
diff --git a/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java
deleted file mode 100644
index f9d95a9d29..0000000000
--- a/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.collection.TrecCollection;
-import io.anserini.index.IndexArgs;
-import io.anserini.search.SearchSolr;
-
-public class TrecEndToEndTest extends SolrEndToEndTest {
-  @Override
-  protected String getCollectionName() {
-    return "Trec";
-  }
-
-  @Override
-  protected String getSchemaAdjustmentFile() {
-    return null; // no need to adjust schema
-  }
-
-  @Override
-  protected IndexArgs getIndexArgs() {
-    IndexArgs indexArgs = createDefaultIndexArgs();
-    indexArgs.input = "src/test/resources/sample_docs/trec/collection2";
-    indexArgs.collectionClass = TrecCollection.class.getSimpleName();
-    return indexArgs;
-  }
-
-  @Override
-  protected SearchSolr.Args getSearchArgs() {
-    return createSearchArgs("Trec", "src/test/resources/sample_topics/Trec");
-  }
-
-  @Override
-  protected String[] getRefRankingResult() {
-    return new String[]{ // bm25
-        "1 Q0 DOC222 1 0.343200 Solrini",
-        "1 Q0 TREC_DOC_1 2 0.333400 Solrini",
-        "1 Q0 WSJ_1 3 0.068700 Solrini"
-    };
-  }
-}

From dedfbbbaa8c40d284fb02cec7319d833730b2ad1 Mon Sep 17 00:00:00 2001
From: lintool
Date: Mon, 1 Aug 2022 07:44:14 -0400
Subject: [PATCH 10/13] Reordered.

---
 pom.xml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pom.xml b/pom.xml
index f0eae16e99..50cb6a995c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2,7 +2,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>io.anserini</groupId>
   <artifactId>anserini</artifactId>
-  <version>0.14.5-SNAPSHOT</version>
+  <version>0.15.0-SNAPSHOT</version>
   <name>Anserini</name>
   <description>An information retrieval toolkit built on Lucene</description>
   <url>http://anserini.io/</url>
@@ -289,27 +289,27 @@
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-queries</artifactId>
+      <artifactId>lucene-codecs</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-queryparser</artifactId>
+      <artifactId>lucene-queries</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analysis-common</artifactId>
+      <artifactId>lucene-queryparser</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analysis-kuromoji</artifactId>
+      <artifactId>lucene-analysis-common</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-codecs</artifactId>
+      <artifactId>lucene-analysis-kuromoji</artifactId>
       <version>${lucene.version}</version>
     </dependency>

From 3dd6c3498d0e8b73107acd0f990ff636a9da4e91 Mon Sep 17 00:00:00 2001
From: lintool
Date: Mon, 1 Aug 2022 08:23:07 -0400
Subject: [PATCH 11/13] Bumped up to Lucene 9.
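Lucene 9 relocated its test framework into the org.apache.lucene.tests namespace (shipped as the lucene-test-framework artifact), so the changes below are almost entirely import rewrites. The mechanical part of such a migration can be sketched as follows, assuming GNU sed is available; the per-file hunks below remain the authoritative change:

    # Point test-framework imports at the Lucene 9 package layout.
    grep -rl 'org\.apache\.lucene\.util\.LuceneTestCase' src/test \
      | xargs sed -i 's/org\.apache\.lucene\.util\.LuceneTestCase/org.apache.lucene.tests.util.LuceneTestCase/g'
    grep -rl 'org\.apache\.lucene\.util\.TestRuleLimitSysouts' src/test \
      | xargs sed -i 's/org\.apache\.lucene\.util\.TestRuleLimitSysouts/org.apache.lucene.tests.util.TestRuleLimitSysouts/g'

Note that ordinary utility classes such as BytesRef and IOUtils stay in org.apache.lucene.util, which is why the rewrite targets only the test-framework classes by name.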
---
 pom.xml                                               |  2 +-
 src/test/java/io/anserini/GeoIndexerTestBase.java     | 11 ++++++++---
 src/test/java/io/anserini/IndexerTestBase.java        |  2 +-
 .../io/anserini/IndexerWithEmptyDocumentTestBase.java |  2 +-
 .../anserini/collection/DocumentCollectionTest.java   |  2 +-
 .../java/io/anserini/integration/EndToEndTest.java    |  4 ++--
 .../io/anserini/ltr/BaseFeatureExtractorTest.java     |  2 +-
 .../query/DisjunctionMaxQueryGeneratorTest.java       |  2 +-
 .../java/io/anserini/search/query/SdmQueryTest.java   |  2 +-
 src/test/java/io/anserini/util/FeatureVectorTest.java |  2 +-
 10 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/pom.xml b/pom.xml
index 50cb6a995c..e174478da3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -26,7 +26,7 @@
   <properties>
-    <lucene.version>9.0.0</lucene.version>
+    <lucene.version>9.3.0</lucene.version>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
diff --git a/src/test/java/io/anserini/GeoIndexerTestBase.java b/src/test/java/io/anserini/GeoIndexerTestBase.java
index 8c3c94e88d..e3ecc13edf 100644
--- a/src/test/java/io/anserini/GeoIndexerTestBase.java
+++ b/src/test/java/io/anserini/GeoIndexerTestBase.java
@@ -17,7 +17,11 @@
 package io.anserini;
 
 import io.anserini.index.IndexArgs;
-import org.apache.lucene.document.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.LatLonDocValuesField;
+import org.apache.lucene.document.LatLonShape;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.geo.Line;
 import org.apache.lucene.geo.Polygon;
 import org.apache.lucene.geo.SimpleWKTShapeParser;
@@ -25,9 +29,10 @@
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Before;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.After;
+import org.junit.Before;
+
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
diff --git a/src/test/java/io/anserini/IndexerTestBase.java b/src/test/java/io/anserini/IndexerTestBase.java
index ffd16c0b0c..8a1410bdc6 100644
--- a/src/test/java/io/anserini/IndexerTestBase.java
+++ b/src/test/java/io/anserini/IndexerTestBase.java
@@ -30,7 +30,7 @@
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.util.BytesRef;
 import org.junit.After;
 import org.junit.Before;
diff --git a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
index 417a0fb0ea..e4a854d2ca 100644
--- a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
+++ b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
@@ -30,7 +30,7 @@
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.util.BytesRef;
 import org.junit.After;
 import org.junit.Before;
diff --git a/src/test/java/io/anserini/collection/DocumentCollectionTest.java b/src/test/java/io/anserini/collection/DocumentCollectionTest.java
index 64b2faee63..ce06003621 100644
--- a/src/test/java/io/anserini/collection/DocumentCollectionTest.java
+++ b/src/test/java/io/anserini/collection/DocumentCollectionTest.java
@@ -16,7 +16,7 @@
 
 package io.anserini.collection;
 
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java b/src/test/java/io/anserini/integration/EndToEndTest.java
index 3b509702e3..d0a55efe03 100644
--- a/src/test/java/io/anserini/integration/EndToEndTest.java
+++ b/src/test/java/io/anserini/integration/EndToEndTest.java
@@ -28,8 +28,8 @@
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestRuleLimitSysouts;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestRuleLimitSysouts;
 import org.apache.lucene.util.IOUtils;
 import org.junit.After;
 import org.junit.Before;
diff --git a/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java b/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
index b93cd5b42b..bfbe194a68 100644
--- a/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
+++ b/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
@@ -31,7 +31,7 @@
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;
diff --git a/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java b/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
index 6b82cdcc29..00ce6a20a0 100644
--- a/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
+++ b/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
@@ -21,7 +21,7 @@
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.Test;
 
 import java.util.Map;
diff --git a/src/test/java/io/anserini/search/query/SdmQueryTest.java b/src/test/java/io/anserini/search/query/SdmQueryTest.java
index 89663ffdca..c032882834 100644
--- a/src/test/java/io/anserini/search/query/SdmQueryTest.java
+++ b/src/test/java/io/anserini/search/query/SdmQueryTest.java
@@ -36,7 +36,7 @@
 import org.apache.lucene.search.similarities.BM25Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
diff --git a/src/test/java/io/anserini/util/FeatureVectorTest.java b/src/test/java/io/anserini/util/FeatureVectorTest.java
index c747194004..12694bd90b 100644
--- a/src/test/java/io/anserini/util/FeatureVectorTest.java
+++ b/src/test/java/io/anserini/util/FeatureVectorTest.java
@@ -16,7 +16,7 @@
 
 package io.anserini.util;
 
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.Test;
 
 import java.util.Arrays;

From c3f0ba292dcbb33e64314dc399fb49339c33fb71 Mon Sep 17 00:00:00 2001
From: lintool
Date: Mon, 1 Aug 2022 09:03:51 -0400
Subject: [PATCH 12/13] Added backward codecs.
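For context: lucene-backward-codecs is the module that lets a Lucene 9 runtime open indexes written by the previous major version, which is what the Lucene 8 compatibility fix in the next patch relies on. One way to confirm the dependency actually resolves after this change, using standard Maven from the repo root:

    mvn dependency:tree -Dincludes=org.apache.lucene:lucene-backward-codecs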
---
 pom.xml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pom.xml b/pom.xml
index e174478da3..ecb15d4b84 100644
--- a/pom.xml
+++ b/pom.xml
@@ -292,6 +292,11 @@
       <artifactId>lucene-codecs</artifactId>
       <version>${lucene.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-backward-codecs</artifactId>
+      <version>${lucene.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-queries</artifactId>

From be33a261be61b9f29259d4105a133dfd10492dd3 Mon Sep 17 00:00:00 2001
From: lintool
Date: Mon, 1 Aug 2022 09:39:56 -0400
Subject: [PATCH 13/13] Fix Lucene 8/9 index compat issue

---
 src/main/java/io/anserini/search/SearchArgs.java |  3 +++
 .../io/anserini/search/SearchCollection.java     |  6 ++++++
 src/main/python/run_regression.py                | 16 ++++++++++++++--
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/main/java/io/anserini/search/SearchArgs.java b/src/main/java/io/anserini/search/SearchArgs.java
index b90dda1a06..d4d3f50d58 100644
--- a/src/main/java/io/anserini/search/SearchArgs.java
+++ b/src/main/java/io/anserini/search/SearchArgs.java
@@ -36,6 +36,9 @@ public class SearchArgs {
   @Option(name = "-topicreader", required = true, usage = "TopicReader to use.")
   public String topicReader;
 
+  @Option(name = "-lucene8", usage = "Enable Lucene 8 index compatibility.")
+  public Boolean lucene8 = false;
+
   // optional arguments
   @Option(name = "-querygenerator", usage = "QueryGenerator to use.")
   public String queryGenerator = "BagOfWordsQueryGenerator";
diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java
index 1a7d1776ff..ed40f11217 100644
--- a/src/main/java/io/anserini/search/SearchCollection.java
+++ b/src/main/java/io/anserini/search/SearchCollection.java
@@ -494,6 +494,12 @@ public SearchCollection(SearchArgs args) throws IOException {
       loadQrels(args.rf_qrels);
     }
 
+    // See https://github.com/castorini/anserini/issues/1952
+    // The solution to the issue described above is to turn off deterministic tie-breaking.
+    if (args.lucene8) {
+      args.arbitraryScoreTieBreak = true;
+      args.axiom_deterministic = false;
+    }
   }
 
   @Override
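With the flag wired through SearchArgs and the constructor above, retrieval over a Lucene 8 index can relax deterministic tie-breaking from the command line. A hedged invocation sketch (index path, topics file, and output name are illustrative; -bm25 stands in for whichever ranking flags a run actually uses):

    target/appassembler/bin/SearchCollection \
      -index indexes/lucene8-index.robust04 \
      -topicreader Trec -topics src/main/resources/topics-and-qrels/topics.robust04.txt \
      -output run.robust04.bm25.txt -bm25 -lucene8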
diff --git a/src/main/python/run_regression.py b/src/main/python/run_regression.py
index 9afc28d1b9..08884abeeb 100644
--- a/src/main/python/run_regression.py
+++ b/src/main/python/run_regression.py
@@ -61,6 +61,10 @@ def is_close(a, b, rel_tol=1e-09, abs_tol=0.0):
     return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
 
 
+def is_close_lucene8(a, b):
+    return abs(a-b) <= 0.001
+
+
 def check_output(command):
     # Python 2.6 compatible subprocess.check_output
     process = Popen(command, shell=True, stdout=PIPE)
@@ -131,6 +135,7 @@ def construct_search_commands(yaml_data):
                '-topicreader', topic_set['topic_reader'] if 'topic_reader' in topic_set and topic_set['topic_reader'] else yaml_data['topic_reader'],
                '-output', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']),
                model['params'],
+               '-lucene8' if args.lucene8 else ''
            ]
        for (model, topic_set) in list(itertools.product(yaml_data['models'], yaml_data['topics']))
     ]
@@ -154,6 +159,7 @@ def construct_convert_commands(yaml_data):
 def evaluate_and_verify(yaml_data, dry_run):
     fail_str = '\033[91m[FAIL]\033[0m '
     ok_str = '   [OK] '
+    okish_str = '  \033[94m[OK*]\033[0m '
     failures = False
 
     logger.info('='*10 + ' Verifying Results: ' + yaml_data['corpus'] + ' ' + '='*10)
@@ -181,8 +187,11 @@ def evaluate_and_verify(yaml_data, dry_run):
                     if is_close(expected, actual):
                         logger.info(ok_str + result_str)
                     else:
-                        logger.error(fail_str + result_str)
-                        failures = True
+                        if args.lucene8 and is_close_lucene8(expected, actual):
+                            logger.info(okish_str + result_str)
+                        else:
+                            logger.error(fail_str + result_str)
+                            failures = True
 
     if not dry_run:
         if failures:
@@ -280,6 +289,7 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
                         help='Number of converting runs to execute in parallel.')
     parser.add_argument('--dry-run', dest='dry_run', action='store_true',
                         help='Output commands without actual execution.')
+    parser.add_argument('--lucene8', dest='lucene8', action='store_true', help='Enable Lucene 8 index compatibility.')
 
     args = parser.parse_args()
 
     with open('src/main/resources/regression/{}.yaml'.format(args.regression)) as f:
@@ -340,6 +350,8 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
     # Search and verify results.
     if args.search:
         logger.info('='*10 + ' Ranking ' + '='*10)
+        if args.lucene8:
+            logger.info('Enabling Lucene 8 index compatibility.')
         search_cmds = construct_search_commands(yaml_data)
         if args.dry_run:
             for cmd in search_cmds: