Skip to content

Commit

Permalink
Add -pretokenized option to SearchMsmarco (#1527)
Browse files Browse the repository at this point in the history
Add an option to use the whitespace analyzer so that SearchMsmarco accepts pretokenized topics and collections.
  • Loading branch information
stephaniewhoo authored Apr 28, 2021
1 parent a1e7177 commit 14b315d
Showing 1 changed file with 14 additions and 5 deletions.
19 changes: 14 additions & 5 deletions src/main/java/io/anserini/search/SearchMsmarco.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import io.anserini.search.query.QueryGenerator;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
Expand Down Expand Up @@ -101,6 +101,9 @@ public static class Args {
usage = "Path to file with stopwords.")
public String stopwords = null;

@Option(name = "-pretokenized", usage = "Boolean switch to accept pre tokenized jsonl.")
public boolean pretokenized = false;

}

public static void main(String[] args) throws Exception {
Expand All @@ -118,10 +121,16 @@ public static void main(String[] args) throws Exception {

long totalStartTime = System.nanoTime();

Analyzer analyzer = DefaultEnglishAnalyzer.fromArguments(
retrieveArgs.stemmer, retrieveArgs.keepstop, retrieveArgs.stopwords);
System.out.println("Initializing analyzer with stemmer=" + retrieveArgs.stemmer + ", keepstop=" +
retrieveArgs.keepstop + ", stopwords=" + retrieveArgs.stopwords);
Analyzer analyzer;
if (retrieveArgs.pretokenized){
analyzer = new WhitespaceAnalyzer();
System.out.println("Initializing whilte space analyzer");
} else {
analyzer = DefaultEnglishAnalyzer.fromArguments(
retrieveArgs.stemmer, retrieveArgs.keepstop, retrieveArgs.stopwords);
System.out.println("Initializing analyzer with stemmer=" + retrieveArgs.stemmer + ", keepstop=" +
retrieveArgs.keepstop + ", stopwords=" + retrieveArgs.stopwords);
}

SimpleSearcher searcher = new SimpleSearcher(retrieveArgs.index, analyzer);
searcher.setBM25(retrieveArgs.k1, retrieveArgs.b);
Expand Down

0 comments on commit 14b315d

Please sign in to comment.