SimpleSearcher refactoring, clarifies 'contents' and 'raw' field (#1047)

castorini · Mar 22, 2020 · ff96bad · ff96bad
1 parent cb58277
commit ff96bad
Show file tree

Hide file tree

Showing 5 changed files with 180 additions and 60 deletions.
diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -332,6 +332,7 @@ public static List<Posting> getPostingsListWithAnalyzer(IndexReader reader, Stri
 
   /**
    * Returns the document vector for a particular document as a map of terms to term frequencies.
+   *
    * @param reader index reader
    * @param docid collection docid
    * @return the document vector for a particular document as a map of terms to term frequencies
@@ -357,12 +358,13 @@ public static Map<String, Long> getDocumentVector(IndexReader reader, String doc
   }
 
   /**
-   * Returns the raw document given its collection docid.
+   * Returns the raw contents of a document based on a collection docid.
+   *
    * @param reader index reader
    * @param docid collection docid
-   * @return the raw document given its collection docid, or <code>null</code> if not found.
+   * @return raw contents of a document
    */
-  public static String getRawDocument(IndexReader reader, String docid) {
+  public static String getRawContents(IndexReader reader, String docid) {
     try {
       Document rawDoc = reader.document(convertDocidToLuceneDocid(reader, docid));
 
@@ -375,8 +377,29 @@ public static String getRawDocument(IndexReader reader, String docid) {
     }
   }
 
+  /**
+   * Returns the indexed contents of a document based on a collection docid.
+   *
+   * @param reader index reader
+   * @param docid collection docid
+   * @return indexed contents of a document, or <code>null</code> if the document cannot be read
+   */
+  public static String getIndexedContents(IndexReader reader, String docid) {
+    try {
+      Document rawDoc = reader.document(convertDocidToLuceneDocid(reader, docid));
+
+      if (rawDoc == null) {
+        return null;
+      }
+      return rawDoc.get(IndexArgs.CONTENTS);
+    } catch (IOException e) {
+      return null;
+    }
+  }
+
   /**
    * Computes the BM25 weight of a term (prior to analysis) in a particular document.
+   *
    * @param reader index reader
    * @param docid collection docid
    * @param term term (prior to analysis)
@@ -498,6 +521,7 @@ public static void dumpDocumentVectors(IndexReader reader, String reqDocidsPath,
 
   /**
    * Converts a collection docid to a Lucene internal docid
+   *
    * @param reader index reader
    * @param docid collection docid
    * @return corresponding Lucene internal docid, or -1 if docid not found
@@ -523,6 +547,7 @@ public static int convertDocidToLuceneDocid(IndexReader reader, String docid) {
 
   /**
    * Converts a Lucene internal docid to a collection docid
+   *
    * @param reader index reader
    * @param docid Lucene internal docid
    * @return corresponding collection docid, or <code>null</code> if not found.

diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java
@@ -93,17 +93,25 @@ public class SimpleSearcher implements Closeable {
 
   private IndexSearcher searcher = null;
 
+  /**
+   * This class is meant to serve as the bridge between Anserini and Pyserini.
+   * Note that we are adopting Python naming conventions here on purpose.
+   */
   public class Result {
     public String docid;
-    public int ldocid;
+    public int lucene_docid;
     public float score;
-    public String content;
+    public String contents;
+    public String raw;
+    public Document lucene_document;
 
-    public Result(String docid, int ldocid, float score, String content) {
+    public Result(String docid, int lucene_docid, float score, String contents, String raw, Document lucene_document) {
       this.docid = docid;
-      this.ldocid = ldocid;
+      this.lucene_docid = lucene_docid;
       this.score = score;
-      this.content = content;
+      this.contents = contents;
+      this.raw = raw;
+      this.lucene_document = lucene_document;
     }
   }
 
@@ -320,10 +328,15 @@ protected Result[] search(Query query, List<String> queryTokens, String queryStr
     for (int i = 0; i < hits.ids.length; i++) {
       Document doc = hits.documents[i];
       String docid = doc.getField(IndexArgs.ID).stringValue();
-      IndexableField field = doc.getField(IndexArgs.RAW);
-      String content = field == null ? null : field.stringValue();
 
-      results[i] = new Result(docid, hits.ids[i], hits.scores[i], content);
+      IndexableField field;
+      field = doc.getField(IndexArgs.CONTENTS);
+      String contents = field == null ? null : field.stringValue();
+
+      field = doc.getField(IndexArgs.RAW);
+      String raw = field == null ? null : field.stringValue();
+
+      results[i] = new Result(docid, hits.ids[i], hits.scores[i], contents, raw, doc);
     }
 
     return results;
@@ -356,6 +369,7 @@ public Result[] searchFields(String q, Map<String, Float> fields, int k, long t)
 
   /**
    * Fetches the Lucene {@link Document} based on an internal Lucene docid.
+   *
    * @param ldocid internal Lucene docid
    * @return corresponding Lucene {@link Document}
    */
@@ -372,6 +386,7 @@ public Document doc(int ldocid) {
 
   /**
    * Fetches the Lucene {@link Document} based on a collection docid.
+   *
    * @param docid collection docid
    * @return corresponding Lucene {@link Document}
    */
@@ -387,12 +402,45 @@ public Document doc(String docid) {
     }
   }
 
+  /**
+   * Returns the indexed contents of a document based on an internal Lucene docid.
+   *
+   * @param ldocid internal Lucene docid
+   * @return indexed contents of the document, or <code>null</code> if the document or its contents field is missing
+   */
+  public String getIndexedContents(int ldocid) {
+    Document doc = doc(ldocid);
+    if (doc == null) {
+      return null;
+    }
+
+    IndexableField field = doc.getField(IndexArgs.CONTENTS);
+    return field == null ? null : field.stringValue();
+  }
+
+  /**
+   * Returns the indexed contents of a document based on a collection docid.
+   *
+   * @param docid collection docid
+   * @return indexed contents of the document, or <code>null</code> if the document or its contents field is missing
+   */
+  public String getIndexedContents(String docid) {
+    Document doc = doc(docid);
+    if (doc == null) {
+      return null;
+    }
+
+    IndexableField field = doc.getField(IndexArgs.CONTENTS);
+    return field == null ? null : field.stringValue();
+  }
+
   /**
    * Returns the raw contents of a document based on an internal Lucene docid.
+   *
    * @param ldocid internal Lucene docid
    * @return raw contents of the document
    */
-  public String getContents(int ldocid) {
+  public String getRawContents(int ldocid) {
     Document doc = doc(ldocid);
     if (doc == null) {
       return null;
@@ -404,10 +452,11 @@ public String getContents(int ldocid) {
 
   /**
    * Returns the raw contents of a document based on a collection docid.
+   *
    * @param docid collection docid
    * @return raw contents of the document
    */
-  public String getContents(String docid) {
+  public String getRawContents(String docid) {
     Document doc = doc(docid);
     if (doc == null) {
       return null;

diff --git a/src/test/java/io/anserini/IndexerTestBase.java b/src/test/java/io/anserini/IndexerTestBase.java
@@ -63,23 +63,26 @@ private void buildTestIndex() throws IOException {
     doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
     doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
     doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions));
-    doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
+    // specifically demonstrate how "contents" and "raw" might diverge:
+    doc1.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc1Text)));
     writer.addDocument(doc1);
 
     Document doc2 = new Document();
     String doc2Text = "more texts";
     doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
     doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
     doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions));  // Note plural, to test stemming
-    doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
+    // specifically demonstrate how "contents" and "raw" might diverge:
+    doc2.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc2Text)));
     writer.addDocument(doc2);
 
     Document doc3 = new Document();
     String doc3Text = "here is a test";
     doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
     doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
     doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
-    doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
+    // specifically demonstrate how "contents" and "raw" might diverge:
+    doc3.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc3Text)));
     writer.addDocument(doc3);
 
     writer.commit();

diff --git a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
@@ -320,13 +320,23 @@ public void testDocumentVector() throws Exception {
   }
 
   @Test
-  public void testRawDoc() throws Exception {
+  public void testRawContents() throws Exception {
     Directory dir = FSDirectory.open(tempDir1);
     IndexReader reader = DirectoryReader.open(dir);
 
-    assertEquals("here is some text here is some more text. city.", IndexReaderUtils.getRawDocument(reader, "doc1"));
-    assertEquals("more texts", IndexReaderUtils.getRawDocument(reader, "doc2"));
-    assertEquals("here is a test", IndexReaderUtils.getRawDocument(reader, "doc3"));
+    assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}", IndexReaderUtils.getRawContents(reader, "doc1"));
+    assertEquals("{\"contents\": \"more texts\"}", IndexReaderUtils.getRawContents(reader, "doc2"));
+    assertEquals("{\"contents\": \"here is a test\"}", IndexReaderUtils.getRawContents(reader, "doc3"));
+  }
+
+  @Test
+  public void testIndexedContents() throws Exception {
+    Directory dir = FSDirectory.open(tempDir1);
+    IndexReader reader = DirectoryReader.open(dir);
+
+    assertEquals("here is some text here is some more text. city.", IndexReaderUtils.getIndexedContents(reader, "doc1"));
+    assertEquals("more texts", IndexReaderUtils.getIndexedContents(reader, "doc2"));
+    assertEquals("here is a test", IndexReaderUtils.getIndexedContents(reader, "doc3"));
   }
 
   @Test