Skip to content

Commit

Permalink
Add uniCOIL pre-encoded query bindings for MS MARCO V2 (doc, passage)…
Browse files Browse the repository at this point in the history
… + V1 doc (#1769)
  • Loading branch information
lintool committed Feb 19, 2022
1 parent 2fd22ba commit 6a70804
Show file tree
Hide file tree
Showing 2 changed files with 181 additions and 100 deletions.
11 changes: 10 additions & 1 deletion src/main/java/io/anserini/search/topicreader/Topics.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,26 @@ public enum Topics {
TREC2020_DL(TsvIntTopicReader.class,"topics-and-qrels/topics.dl20.txt"),
TREC2021_DL(TsvIntTopicReader.class,"topics-and-qrels/topics.dl21.txt"),
MSMARCO_DOC_DEV(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-doc.dev.txt"),
MSMARCO_DOC_DEV_UNICOIL(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-doc.dev.unicoil.tsv.gz"),
MSMARCO_DOC_TEST(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-doc.test.txt"),
MSMARCO_PASSAGE_DEV_SUBSET(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.txt"),
MSMARCO_PASSAGE_TEST_SUBSET(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.test-subset.txt"),
MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.deepimpact.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_D2Q(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.unicoil.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.unicoil-tilde-expansion.tsv.gz"),
MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.distill-splade-max.tsv.gz"),
MSMARCO_PASSAGE_TEST_SUBSET(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.test-subset.txt"),
MSMARCO_V2_DOC_DEV(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev.txt"),
MSMARCO_V2_DOC_DEV_UNICOIL(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV2(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev2.txt"),
MSMARCO_V2_DOC_DEV2_UNICOIL(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev2.unicoil.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev2.unicoil-noexp.0shot.tsv.gz"),
MSMARCO_V2_PASSAGE_DEV(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev.txt"),
MSMARCO_V2_PASSAGE_DEV_UNICOIL(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev.unicoil.0shot.tsv.gz"),
MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.tsv.gz"),
MSMARCO_V2_PASSAGE_DEV2(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev2.txt"),
MSMARCO_V2_PASSAGE_DEV2_UNICOIL(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev2.unicoil.0shot.tsv.gz"),
MSMARCO_V2_PASSAGE_DEV2_UNICOIL_NOEXP(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev2.unicoil-noexp.0shot.tsv.gz"),
NTCIR8_ZH(TsvStringTopicReader.class, "topics-and-qrels/topics.ntcir8zh.eval.txt"),
CLEF2006_FR(TsvStringTopicReader.class, "topics-and-qrels/topics.clef06fr.mono.fr.txt"),
TREC2002_AR(TrecTopicReader.class, "topics-and-qrels/topics.trec02ar-ar.txt"),
Expand Down
270 changes: 171 additions & 99 deletions src/test/java/io/anserini/search/topicreader/TopicReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public void testIterateThroughAllEnums() {
String[] pathParts = topic.path.split("/");
assertEquals(topic.readerClass, TopicReader.getTopicReaderClassByFile(pathParts[1]));
}
assertEquals(104, cnt);
assertEquals(113, cnt);
}

@Test
Expand Down Expand Up @@ -493,104 +493,6 @@ public void testCAR_TopicIdsAsStrings() {
assertEquals("Yellowstone National Park/Recreation",
topics.get("enwiki:Yellowstone%20National%20Park/Recreation").get("title")); }

/**
 * Loads each MS MARCO topic set registered in {@link Topics} and checks the number of
 * topics, the smallest and largest topic ids, and the "title" field of those two topics.
 * Where the title is very long, only its space-separated token count is checked.
 */
@Test
public void testMSMARCO() {
  SortedMap<Integer, Map<String, String>> queries;
  String[] firstTokens;
  String[] lastTokens;

  // MS MARCO (V1) document ranking: dev queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_DOC_DEV);
  assertNotNull(queries);
  assertEquals(5193, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102400, queries.lastKey().intValue());
  assertEquals("why do bears hibernate", queries.get(queries.lastKey()).get("title"));

  // MS MARCO (V1) document ranking: test queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_DOC_TEST);
  assertNotNull(queries);
  assertEquals(5793, queries.size());
  assertEquals(57, queries.firstKey().intValue());
  assertEquals("term service agreement definition", queries.get(queries.firstKey()).get("title"));
  assertEquals(1136966, queries.lastKey().intValue());
  assertEquals("#ffffff color code", queries.get(queries.lastKey()).get("title"));

  // MS MARCO (V1) passage ranking: dev subset.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("Androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102400, queries.lastKey().intValue());
  assertEquals("why do bears hibernate", queries.get(queries.lastKey()).get("title"));

  // Passage dev subset, DeepImpact variant.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("receptor androgen define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102400, queries.lastKey().intValue());
  assertEquals("why hibernate bears", queries.get(queries.lastKey()).get("title"));

  // Passage dev subset, uniCOIL (d2q) variant: titles are compared by token count only.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_D2Q);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(619, firstTokens.length);
  assertEquals(1102400, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(686, lastTokens.length);

  // Passage dev subset, uniCOIL (TILDE expansion) variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(584, firstTokens.length);
  assertEquals(1102400, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(610, lastTokens.length);

  // Passage dev subset, distill-splade-max variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(1991, firstTokens.length);
  assertEquals(1102400, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(2409, lastTokens.length);

  // MS MARCO (V1) passage ranking: test subset.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_TEST_SUBSET);
  assertNotNull(queries);
  assertEquals(6837, queries.size());
  assertEquals(57, queries.firstKey().intValue());
  assertEquals("term service agreement definition", queries.get(queries.firstKey()).get("title"));
  assertEquals(1136966, queries.lastKey().intValue());
  assertEquals("#ffffff color code", queries.get(queries.lastKey()).get("title"));

  // MS MARCO V2 document ranking: dev queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV);
  assertNotNull(queries);
  assertEquals(4552, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("Androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102390, queries.lastKey().intValue());
  assertEquals("why do children get aggressive", queries.get(queries.lastKey()).get("title"));

  // MS MARCO V2 document ranking: dev2 queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2);
  assertNotNull(queries);
  assertEquals(5000, queries.size());
  assertEquals(361, queries.firstKey().intValue());
  assertEquals(". irritability medical definition", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102413, queries.lastKey().intValue());
  assertEquals("why do a ferritin level", queries.get(queries.lastKey()).get("title"));

  // MS MARCO V2 passage ranking: dev queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV);
  assertNotNull(queries);
  assertEquals(3903, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("Androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102390, queries.lastKey().intValue());
  assertEquals("why do children get aggressive", queries.get(queries.lastKey()).get("title"));

  // MS MARCO V2 passage ranking: dev2 queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2);
  assertNotNull(queries);
  assertEquals(4281, queries.size());
  assertEquals(1325, queries.firstKey().intValue());
  assertEquals("323 area code zip code", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102413, queries.lastKey().intValue());
  assertEquals("why do a ferritin level", queries.get(queries.lastKey()).get("title"));
}

@Test
public void testDprNq() {
SortedMap<Integer, Map<String, String>> topics;
Expand Down Expand Up @@ -766,6 +668,176 @@ public void testTREC21DL() {
assertEquals("who killed nicholas ii of russia", topics.get(1043135).get("title"));
}

/**
 * Loads each MS MARCO topic set registered in {@link Topics} (V1 and V2, document and
 * passage, including the uniCOIL variants) and checks the number of topics, the smallest
 * and largest topic ids, and the "title" field of those two topics. Where the title is
 * very long, only its space-separated token count is checked.
 */
@Test
public void testMSMARCO() {
  SortedMap<Integer, Map<String, String>> queries;
  String[] firstTokens;
  String[] lastTokens;

  // MS MARCO (V1) document ranking: dev queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_DOC_DEV);
  assertNotNull(queries);
  assertEquals(5193, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102400, queries.lastKey().intValue());
  assertEquals("why do bears hibernate", queries.get(queries.lastKey()).get("title"));

  // V1 document dev queries, uniCOIL variant: titles are compared by token count only.
  queries = TopicReader.getTopics(Topics.MSMARCO_DOC_DEV_UNICOIL);
  assertNotNull(queries);
  assertEquals(5193, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(617, firstTokens.length);
  assertEquals(1102400, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(682, lastTokens.length);

  // MS MARCO (V1) document ranking: test queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_DOC_TEST);
  assertNotNull(queries);
  assertEquals(5793, queries.size());
  assertEquals(57, queries.firstKey().intValue());
  assertEquals("term service agreement definition", queries.get(queries.firstKey()).get("title"));
  assertEquals(1136966, queries.lastKey().intValue());
  assertEquals("#ffffff color code", queries.get(queries.lastKey()).get("title"));

  // MS MARCO (V1) passage ranking: dev subset.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("Androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102400, queries.lastKey().intValue());
  assertEquals("why do bears hibernate", queries.get(queries.lastKey()).get("title"));

  // Passage dev subset, DeepImpact variant.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("receptor androgen define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102400, queries.lastKey().intValue());
  assertEquals("why hibernate bears", queries.get(queries.lastKey()).get("title"));

  // Passage dev subset, uniCOIL (d2q) variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_D2Q);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(619, firstTokens.length);
  assertEquals(1102400, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(686, lastTokens.length);

  // Passage dev subset, uniCOIL (TILDE expansion) variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(584, firstTokens.length);
  assertEquals(1102400, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(610, lastTokens.length);

  // Passage dev subset, distill-splade-max variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX);
  assertNotNull(queries);
  assertEquals(6980, queries.size());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(1991, firstTokens.length);
  assertEquals(1102400, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(2409, lastTokens.length);

  // MS MARCO (V1) passage ranking: test subset.
  queries = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_TEST_SUBSET);
  assertNotNull(queries);
  assertEquals(6837, queries.size());
  assertEquals(57, queries.firstKey().intValue());
  assertEquals("term service agreement definition", queries.get(queries.firstKey()).get("title"));
  assertEquals(1136966, queries.lastKey().intValue());
  assertEquals("#ffffff color code", queries.get(queries.lastKey()).get("title"));

  // MS MARCO V2 document ranking: dev queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV);
  assertNotNull(queries);
  assertEquals(4552, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("Androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102390, queries.lastKey().intValue());
  assertEquals("why do children get aggressive", queries.get(queries.lastKey()).get("title"));

  // V2 document dev queries, uniCOIL variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV_UNICOIL);
  assertNotNull(queries);
  assertEquals(4552, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(617, firstTokens.length);
  assertEquals(1102390, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(608, lastTokens.length);

  // V2 document dev queries, uniCOIL-noexp variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP);
  assertNotNull(queries);
  assertEquals(4552, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(609, firstTokens.length);
  assertEquals(1102390, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(533, lastTokens.length);

  // MS MARCO V2 document ranking: dev2 queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2);
  assertNotNull(queries);
  assertEquals(5000, queries.size());
  assertEquals(361, queries.firstKey().intValue());
  assertEquals(". irritability medical definition", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102413, queries.lastKey().intValue());
  assertEquals("why do a ferritin level", queries.get(queries.lastKey()).get("title"));

  // V2 document dev2 queries, uniCOIL variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2_UNICOIL);
  assertNotNull(queries);
  assertEquals(5000, queries.size());
  assertEquals(361, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(714, firstTokens.length);
  assertEquals(1102413, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(664, lastTokens.length);

  // V2 document dev2 queries, uniCOIL-noexp variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP);
  assertNotNull(queries);
  assertEquals(5000, queries.size());
  assertEquals(361, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(690, firstTokens.length);
  assertEquals(1102413, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(537, lastTokens.length);

  // MS MARCO V2 passage ranking: dev queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV);
  assertNotNull(queries);
  assertEquals(3903, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  assertEquals("Androgen receptor define", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102390, queries.lastKey().intValue());
  assertEquals("why do children get aggressive", queries.get(queries.lastKey()).get("title"));

  // V2 passage dev queries, uniCOIL variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV_UNICOIL);
  assertNotNull(queries);
  assertEquals(3903, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(617, firstTokens.length);
  assertEquals(1102390, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(608, lastTokens.length);

  // V2 passage dev queries, uniCOIL-noexp variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP);
  assertNotNull(queries);
  assertEquals(3903, queries.size());
  assertEquals(2, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(609, firstTokens.length);
  assertEquals(1102390, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(533, lastTokens.length);

  // MS MARCO V2 passage ranking: dev2 queries.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2);
  assertNotNull(queries);
  assertEquals(4281, queries.size());
  assertEquals(1325, queries.firstKey().intValue());
  assertEquals("323 area code zip code", queries.get(queries.firstKey()).get("title"));
  assertEquals(1102413, queries.lastKey().intValue());
  assertEquals("why do a ferritin level", queries.get(queries.lastKey()).get("title"));

  // V2 passage dev2 queries, uniCOIL variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL);
  assertNotNull(queries);
  assertEquals(4281, queries.size());
  assertEquals(1325, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(671, firstTokens.length);
  assertEquals(1102413, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(664, lastTokens.length);

  // V2 passage dev2 queries, uniCOIL-noexp variant: token counts only.
  queries = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL_NOEXP);
  assertNotNull(queries);
  assertEquals(4281, queries.size());
  assertEquals(1325, queries.firstKey().intValue());
  firstTokens = queries.get(queries.firstKey()).get("title").split(" ");
  assertEquals(649, firstTokens.length);
  assertEquals(1102413, queries.lastKey().intValue());
  lastTokens = queries.get(queries.lastKey()).get("title").split(" ");
  assertEquals(537, lastTokens.length);
}

@Test
public void testMSMARO_TopicIdsAsStrings() {
Map<String, Map<String, String>> topics;
Expand Down

0 comments on commit 6a70804

Please sign in to comment.