Skip to content
Permalink
Browse files

Script to extract doc lengths (#791)

Script that produces a TSV file containing, for every document in the index, its length (total term count), its unique term count, and the lossy unique term count as encoded by Lucene's BM25Similarity class.
  • Loading branch information...
Chriskamphuis authored and lintool committed Sep 6, 2019
1 parent 8638830 commit 61f6f20ff6872484966ea1badcdcdcebf1eea852
@@ -117,6 +117,10 @@
<mainClass>io.anserini.search.SearchMsmarco</mainClass>
<id>SearchMsmarco</id>
</program>
<program>
<mainClass>io.anserini.util.ExtractDocumentLengths</mainClass>
<id>DocLen</id>
</program>
<program>
<mainClass>io.anserini.eval.Eval</mainClass>
<id>Eval</id>
@@ -237,7 +237,7 @@ public void close() throws IOException {
reader.close();
}

public List<TaggedSimilarity> constructSimiliries() {
public List<TaggedSimilarity> constructSimilarities() {
// Figure out which scoring model to use.
List<TaggedSimilarity> similarities = new ArrayList<>();
if (args.ql || args.qld) {
@@ -361,7 +361,7 @@ public void close() throws IOException {

final String runTag = args.runtag == null ? "Anserini" : args.runtag;
final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(args.threads);
this.similarities = constructSimiliries();
this.similarities = constructSimilarities();
Map<String, RerankerCascade> cascades = constructRerankerCascades();
for (TaggedSimilarity taggedSimilarity : this.similarities) {
for (Map.Entry<String, RerankerCascade> cascade : cascades.entrySet()) {
@@ -0,0 +1,78 @@
/**
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.util;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.SmallFloat;
import org.kohsuke.args4j.*;

import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.nio.file.Paths;

/**
 * Command-line utility that writes per-document length statistics of a Lucene index to a
 * tab-separated file.
 *
 * <p>For every Lucene document id the output contains one row with four columns:
 * <ol>
 *   <li>{@code luceneID} - the internal Lucene document id,</li>
 *   <li>{@code count} - the sum of term frequencies in the term vector of the
 *       {@code contents} field,</li>
 *   <li>{@code uniquecount} - the number of distinct terms in that term vector,</li>
 *   <li>{@code lossycount} - the unique term count after a round trip through the lossy
 *       one-byte {@link SmallFloat} encoding.</li>
 * </ol>
 * Documents without a term vector for {@code contents} are written with all counts set to 0.
 */
public class ExtractDocumentLengths {

  /** Command-line arguments, populated by args4j. */
  public static class Args {
    @Option(name = "-index", metaVar = "[path]", required = true, usage = "Lucene index")
    String index;

    @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file")
    String output;
  }

  /**
   * Parses the command line, opens the index and writes the statistics file.
   *
   * @param args command-line arguments; {@code -index} and {@code -output} are required
   * @throws Exception if the index cannot be read or the output file cannot be written
   */
  public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
      parser.parseArgument(args);
    } catch (CmdLineException e) {
      System.err.println(e.getMessage());
      parser.printUsage(System.err);
      return;
    }

    // try-with-resources guarantees that the directory, the reader and the output stream are
    // released even when reading the index fails half-way through.
    try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
         IndexReader reader = DirectoryReader.open(dir);
         PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)))) {

      // NOTE(review): numDocs() excludes deleted documents while doc ids range up to maxDoc();
      // on an index with deletions the trailing ids would be skipped - confirm that this tool is
      // only run on indexes without deletions.
      int numDocs = reader.numDocs();
      out.println("luceneID\tcount\tuniquecount\tlossycount");
      for (int i = 0; i < numDocs; i++) {
        Terms terms = reader.getTermVector(i, "contents");
        if (terms == null) {
          out.println(i + "\t" + 0 + "\t" + 0 + "\t" + 0);
          continue;
        }

        // totalTermFreq() returns a long, so accumulate in a long to avoid silent narrowing.
        long total = 0;
        // Count distinct terms while iterating; Terms.size() may report -1 when unknown.
        int uniqueTerms = 0;
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
          total += termsEnum.totalTermFreq();
          uniqueTerms++;
        }

        // Encode to the single norm byte and decode again to obtain the value that survives the
        // lossy compression; longToInt4() alone yields the raw encoding, not a count.
        int lossy = SmallFloat.byte4ToInt(SmallFloat.intToByte4(uniqueTerms));
        out.println(i + "\t" + total + "\t" + uniqueTerms + "\t" + lossy);
      }

      // PrintStream never throws on write failures; surface them instead of leaving a
      // silently truncated file behind.
      if (out.checkError()) {
        throw new java.io.IOException("Failed to write document lengths to " + myArgs.output);
      }
    }
  }
}

0 comments on commit 61f6f20

Please sign in to comment.
You can’t perform that action at this time.