Skip to content

Commit

Permalink
SimpleSearcher refactoring, clarifies 'contents' and 'raw' field (#1047)
Browse files Browse the repository at this point in the history
  • Loading branch information
lintool committed Mar 22, 2020
1 parent cb58277 commit ff96bad
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 60 deletions.
31 changes: 28 additions & 3 deletions src/main/java/io/anserini/index/IndexReaderUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ public static List<Posting> getPostingsListWithAnalyzer(IndexReader reader, Stri

/**
* Returns the document vector for a particular document as a map of terms to term frequencies.
*
* @param reader index reader
* @param docid collection docid
* @return the document vector for a particular document as a map of terms to term frequencies
Expand All @@ -357,12 +358,13 @@ public static Map<String, Long> getDocumentVector(IndexReader reader, String doc
}

/**
* Returns the raw document given its collection docid.
* Returns the raw contents of a document based on a collection docid.
*
* @param reader index reader
* @param docid collection docid
* @return the raw document given its collection docid, or <code>null</code> if not found.
* @return raw contents of a document
*/
public static String getRawDocument(IndexReader reader, String docid) {
public static String getRawContents(IndexReader reader, String docid) {
try {
Document rawDoc = reader.document(convertDocidToLuceneDocid(reader, docid));

Expand All @@ -375,8 +377,29 @@ public static String getRawDocument(IndexReader reader, String docid) {
}
}

/**
 * Returns the indexed contents of a document based on a collection docid.
 *
 * @param reader index reader
 * @param docid collection docid
 * @return indexed contents of a document, or <code>null</code> if the docid is not found, the
 *         document has no value for the contents field, or the index cannot be read
 */
public static String getIndexedContents(IndexReader reader, String docid) {
  try {
    // convertDocidToLuceneDocid reports an unknown collection docid as -1; that value must not
    // be handed to reader.document(), so treat it as "not found" here.
    int ldocid = convertDocidToLuceneDocid(reader, docid);
    if (ldocid < 0) {
      return null;
    }

    Document doc = reader.document(ldocid);
    if (doc == null) {
      return null;
    }
    return doc.get(IndexArgs.CONTENTS);
  } catch (IOException e) {
    // Best-effort lookup: a read failure is reported to the caller as "not found".
    return null;
  }
}

/**
* Computes the BM25 weight of a term (prior to analysis) in a particular document.
*
* @param reader index reader
* @param docid collection docid
* @param term term (prior to analysis)
Expand Down Expand Up @@ -498,6 +521,7 @@ public static void dumpDocumentVectors(IndexReader reader, String reqDocidsPath,

/**
* Converts a collection docid to a Lucene internal docid
*
* @param reader index reader
* @param docid collection docid
* @return corresponding Lucene internal docid, or -1 if docid not found
Expand All @@ -523,6 +547,7 @@ public static int convertDocidToLuceneDocid(IndexReader reader, String docid) {

/**
* Converts a Lucene internal docid to a collection docid
*
* @param reader index reader
* @param docid Lucene internal docid
* @return corresponding collection docid, or <code>null</code> if not found.
Expand Down
69 changes: 59 additions & 10 deletions src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,17 +93,25 @@ public class SimpleSearcher implements Closeable {

private IndexSearcher searcher = null;

/**
* This class is meant to serve as the bridge between Anserini and Pyserini.
* Note that we are adopting Python naming conventions here on purpose.
*/
public class Result {
public String docid;
public int ldocid;
public int lucene_docid;
public float score;
public String content;
public String contents;
public String raw;
public Document lucene_document;

public Result(String docid, int ldocid, float score, String content) {
public Result(String docid, int lucene_docid, float score, String contents, String raw, Document lucene_document) {
this.docid = docid;
this.ldocid = ldocid;
this.lucene_docid = lucene_docid;
this.score = score;
this.content = content;
this.contents = contents;
this.raw = raw;
this.lucene_document = lucene_document;
}
}

Expand Down Expand Up @@ -320,10 +328,15 @@ protected Result[] search(Query query, List<String> queryTokens, String queryStr
for (int i = 0; i < hits.ids.length; i++) {
Document doc = hits.documents[i];
String docid = doc.getField(IndexArgs.ID).stringValue();
IndexableField field = doc.getField(IndexArgs.RAW);
String content = field == null ? null : field.stringValue();

results[i] = new Result(docid, hits.ids[i], hits.scores[i], content);
IndexableField field;
field = doc.getField(IndexArgs.CONTENTS);
String contents = field == null ? null : field.stringValue();

field = doc.getField(IndexArgs.RAW);
String raw = field == null ? null : field.stringValue();

results[i] = new Result(docid, hits.ids[i], hits.scores[i], contents, raw, doc);
}

return results;
Expand Down Expand Up @@ -356,6 +369,7 @@ public Result[] searchFields(String q, Map<String, Float> fields, int k, long t)

/**
* Fetches the Lucene {@link Document} based on an internal Lucene docid.
*
* @param ldocid internal Lucene docid
* @return corresponding Lucene {@link Document}
*/
Expand All @@ -372,6 +386,7 @@ public Document doc(int ldocid) {

/**
* Fetches the Lucene {@link Document} based on a collection docid.
*
* @param docid collection docid
* @return corresponding Lucene {@link Document}
*/
Expand All @@ -387,12 +402,45 @@ public Document doc(String docid) {
}
}

/**
 * Looks up the indexed contents of the document identified by an internal Lucene docid.
 *
 * @param ldocid internal Lucene docid
 * @return string value of the contents field, or <code>null</code> if no document is returned
 *         for the docid or the document has no contents field
 */
public String getIndexedContents(int ldocid) {
  Document document = doc(ldocid);
  if (document != null) {
    IndexableField contentsField = document.getField(IndexArgs.CONTENTS);
    if (contentsField != null) {
      return contentsField.stringValue();
    }
  }
  return null;
}

/**
 * Looks up the indexed contents of the document identified by a collection docid.
 *
 * @param docid collection docid
 * @return string value of the contents field, or <code>null</code> if no document is returned
 *         for the docid or the document has no contents field
 */
public String getIndexedContents(String docid) {
  Document document = doc(docid);
  if (document == null) {
    return null;
  }

  IndexableField contentsField = document.getField(IndexArgs.CONTENTS);
  if (contentsField == null) {
    return null;
  }
  return contentsField.stringValue();
}

/**
* Returns the raw contents of a document based on an internal Lucene docid.
*
* @param ldocid internal Lucene docid
* @return raw contents of the document
*/
public String getContents(int ldocid) {
public String getRawContents(int ldocid) {
Document doc = doc(ldocid);
if (doc == null) {
return null;
Expand All @@ -404,10 +452,11 @@ public String getContents(int ldocid) {

/**
* Returns the raw contents of a document based on a collection docid.
*
* @param docid collection docid
* @return raw contents of the document
*/
public String getContents(String docid) {
public String getRawContents(String docid) {
Document doc = doc(docid);
if (doc == null) {
return null;
Expand Down
9 changes: 6 additions & 3 deletions src/test/java/io/anserini/IndexerTestBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,23 +63,26 @@ private void buildTestIndex() throws IOException {
doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions));
doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
// specifically demonstrate how "contents" and "raw" might diverge:
doc1.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc1Text)));
writer.addDocument(doc1);

Document doc2 = new Document();
String doc2Text = "more texts";
doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions)); // Note plural, to test stemming
doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
// specifically demonstrate how "contents" and "raw" might diverge:
doc2.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc2Text)));
writer.addDocument(doc2);

Document doc3 = new Document();
String doc3Text = "here is a test";
doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
// specifically demonstrate how "contents" and "raw" might diverge:
doc3.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc3Text)));
writer.addDocument(doc3);

writer.commit();
Expand Down
18 changes: 14 additions & 4 deletions src/test/java/io/anserini/index/IndexReaderUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,23 @@ public void testDocumentVector() throws Exception {
}

@Test
public void testRawDoc() throws Exception {
public void testRawContents() throws Exception {
Directory dir = FSDirectory.open(tempDir1);
IndexReader reader = DirectoryReader.open(dir);

assertEquals("here is some text here is some more text. city.", IndexReaderUtils.getRawDocument(reader, "doc1"));
assertEquals("more texts", IndexReaderUtils.getRawDocument(reader, "doc2"));
assertEquals("here is a test", IndexReaderUtils.getRawDocument(reader, "doc3"));
assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}", IndexReaderUtils.getRawContents(reader, "doc1"));
assertEquals("{\"contents\": \"more texts\"}", IndexReaderUtils.getRawContents(reader, "doc2"));
assertEquals("{\"contents\": \"here is a test\"}", IndexReaderUtils.getRawContents(reader, "doc3"));
}

// Checks that getIndexedContents returns the expected indexed text for each of the three test documents.
@Test
public void testIndexedContents() throws Exception {
  // try-with-resources closes the reader and then the directory, even if an assertion fails.
  try (Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    assertEquals("here is some text here is some more text. city.", IndexReaderUtils.getIndexedContents(reader, "doc1"));
    assertEquals("more texts", IndexReaderUtils.getIndexedContents(reader, "doc2"));
    assertEquals("here is a test", IndexReaderUtils.getIndexedContents(reader, "doc3"));
  }
}

@Test
Expand Down
Loading

0 comments on commit ff96bad

Please sign in to comment.