Skip to content
Permalink
Browse files

Bugfix outputting doc lengths (#800)

The lossy count was computed from the number of unique terms (`terms.size()`) instead of the document's actual length, i.e. the sum of all term frequencies (`terms.getSumTotalTermFreq()`).
  • Loading branch information...
Chriskamphuis authored and lintool committed Sep 10, 2019
1 parent f60961b commit 085c6d1d47a7c965a2be0efe9908b6c5ef82cd44
Showing with 2 additions and 7 deletions.
  1. +2 −7 src/main/java/io/anserini/util/ExtractDocumentLengths.java
@@ -19,7 +19,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.SmallFloat;
@@ -60,17 +59,13 @@ public static void main(String[] args) throws Exception {
int numDocs = reader.numDocs();
out.println("luceneID\tcount\tuniquecount\tlossycount");
for (int i = 0; i < numDocs; i++) {
int total = 0;
Terms terms = reader.getTermVector(i, "contents");
if(terms == null) {
out.println(i + "\t" + 0 + "\t" + 0 + "\t" + 0);
continue;
}
TermsEnum termsEnum = terms.iterator();
while ((termsEnum.next()) != null) {
total += termsEnum.totalTermFreq();
}
long length = SmallFloat.longToInt4(terms.size());
long total = terms.getSumTotalTermFreq();
long length = SmallFloat.longToInt4(total);
out.println(i + "\t" + total + "\t" + terms.size() + "\t" + length) ;
}
out.close();

0 comments on commit 085c6d1

Please sign in to comment.
You can’t perform that action at this time.