Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Sequential Dependency Model query constructor #359

Merged
merged 18 commits into from
Aug 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/main/java/io/anserini/ltr/DumpTweetsLtrData.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import io.anserini.rerank.RerankerContext;
import io.anserini.rerank.ScoredDocuments;
import io.anserini.search.SearchArgs;
import io.anserini.search.query.MicroblogTopicReader;
import io.anserini.search.query.TopicReader;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import io.anserini.search.topicreader.MicroblogTopicReader;
import io.anserini.search.topicreader.TopicReader;
import io.anserini.util.AnalyzerUtils;
import io.anserini.util.Qrels;
import org.apache.logging.log4j.LogManager;
Expand Down Expand Up @@ -114,7 +115,7 @@ public static<K> void main(String[] argv) throws Exception {
String queryString = entry.getValue().get("title");
Long queryTime = Long.parseLong(entry.getValue().get("time"));
Query filter = LongPoint.newRangeQuery(TweetGenerator.FIELD_ID, 0L, queryTime);
Query query = AnalyzerUtils.buildBagOfWordsQuery(TweetGenerator.FIELD_ID,
Query query = new BagOfWordsQueryGenerator().buildQuery(TweetGenerator.FIELD_ID,
new TweetAnalyzer(), queryString);
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(filter, BooleanClause.Occur.FILTER);
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/io/anserini/ltr/FeatureExtractorCli.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package io.anserini.ltr;

import io.anserini.ltr.feature.FeatureExtractors;
import io.anserini.search.query.TopicReader;
import io.anserini.search.topicreader.TopicReader;
import io.anserini.util.Qrels;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand Down Expand Up @@ -91,7 +91,7 @@ public static<K> void main(String args[]) throws Exception {
WebFeatureExtractor extractor = new WebFeatureExtractor(reader, qrels, topics, extractors);
extractor.printFeatures(out);
} else if (parsedArgs.collection.equals("twitter")) {
TopicReader<Integer> tr = (TopicReader<Integer>)Class.forName("io.anserini.search.query.MicroblogTopicReader")
TopicReader<Integer> tr = (TopicReader<Integer>)Class.forName("io.anserini.search.topicreader.MicroblogTopicReader")
.getConstructor(Path.class).newInstance(Paths.get(parsedArgs.topicsFile));
SortedMap<Integer, Map<String, String>> topics = tr.read();
LOG.debug(String.format("%d topics found", topics.size()));
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/io/anserini/ltr/TwitterFeatureExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import io.anserini.ltr.feature.UnorderedSequentialPairsFeatureExtractor;
import io.anserini.ltr.feature.base.*;
import io.anserini.ltr.feature.twitter.*;
import io.anserini.util.AnalyzerUtils;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import io.anserini.util.Qrels;

import java.util.Arrays;
Expand Down Expand Up @@ -117,7 +117,7 @@ protected Set<String> getFieldsToLoad() {
@Override
protected Query parseQuery(String queryText) {
LOG.debug(String.format("Parsing query: %s", queryText) );
return AnalyzerUtils.buildBagOfWordsQuery(TweetGenerator.FIELD_BODY, new TweetAnalyzer(), queryText);
return new BagOfWordsQueryGenerator().buildQuery(TweetGenerator.FIELD_BODY, new TweetAnalyzer(), queryText);
}

@Override
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/io/anserini/rerank/lib/Rm3Reranker.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
IndexSearcher searcher = context.getIndexSearcher();
IndexReader reader = searcher.getIndexReader();

FeatureVector qfv = FeatureVector.fromTerms(
AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();
FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();

FeatureVector rm = estimateRelevanceModel(docs, reader, context.getSearchArgs().searchtweets);

Expand Down
12 changes: 12 additions & 0 deletions src/main/java/io/anserini/search/SearchArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ public class SearchArgs {
@Option(name = "-f2log.s", metaVar = "[value]", required = false, usage = "F2Log s parameter")
public float f2log_s = 0.5f;

@Option(name = "-sdm", usage = "boolean switch to use Sequential Dependence Model query")
public boolean sdm = false;

@Option(name = "-sdm.tw", metaVar = "[value]", usage = "SDM term weight")
public float sdm_tw = 0.85f;

@Option(name = "-sdm.ow", metaVar = "[value]", usage = "ordered window weight in sdm")
public float sdm_ow = 0.1f;

@Option(name = "-sdm.uw", metaVar = "[value]", usage = "unordered window weight in sdm")
public float sdm_uw = 0.05f;

@Option(name = "-rm3", usage = "use RM3 query expansion model (implies using query likelihood)")
public boolean rm3 = false;

Expand Down
40 changes: 35 additions & 5 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@
import io.anserini.rerank.lib.AxiomReranker;
import io.anserini.rerank.lib.Rm3Reranker;
import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;
import io.anserini.search.query.TopicReader;
import io.anserini.search.similarity.F2ExpSimilarity;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import io.anserini.search.query.SdmQueryGenerator;
import io.anserini.search.similarity.AxiomaticSimilarity;
import io.anserini.search.similarity.F2ExpSimilarity;
import io.anserini.search.similarity.F2LogSimilarity;
import io.anserini.search.topicreader.TopicReader;
import io.anserini.util.AnalyzerUtils;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.logging.log4j.LogManager;
Expand Down Expand Up @@ -86,6 +88,13 @@ public final class SearchCollection implements Closeable {
private final boolean isRerank;
private final RerankerCascade cascade;

enum QueryConstructor {
BagOfTerms,
SequentialDependenceModel
}

private final QueryConstructor qc;

public SearchCollection(SearchArgs args) throws IOException {
this.args = args;
Path indexPath = Paths.get(args.index);
Expand Down Expand Up @@ -122,18 +131,29 @@ public SearchCollection(SearchArgs args) throws IOException {

// Are we searching tweets?
if (args.searchtweets) {
LOG.info("Search Tweets");
analyzer = new TweetAnalyzer();
} else {
analyzer = args.keepstop ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
}

if (args.sdm) {
LOG.info("Use Sequential Dependence Model query");
qc = QueryConstructor.SequentialDependenceModel;
} else {
LOG.info("Use Bag of Terms query");
qc = QueryConstructor.BagOfTerms;
}

isRerank = args.rm3 || args.axiom;

// Set up the ranking cascade.
cascade = new RerankerCascade();
if (args.rm3) {
LOG.info("Rerank with RM3");
cascade.add(new Rm3Reranker(analyzer, FIELD_BODY, args));
} else if (args.axiom) {
LOG.info("Rerank with Axiomatic Reranking");
cascade.add(new AxiomReranker(FIELD_BODY, args));
}

Expand All @@ -159,7 +179,7 @@ public<K> int runTopics() throws IOException {
TopicReader<K> tr;
SortedMap<K, Map<String, String>> topics;
try {
tr = (TopicReader<K>) Class.forName("io.anserini.search.query." + args.topicReader + "TopicReader")
tr = (TopicReader<K>) Class.forName("io.anserini.search.topicreader." + args.topicReader + "TopicReader")
.getConstructor(Path.class).newInstance(topicsFile);
topics = tr.read();
} catch (Exception e) {
Expand Down Expand Up @@ -202,7 +222,12 @@ public<K> int runTopics() throws IOException {
}

public<K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryString) throws IOException {
Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, analyzer, queryString);
Query query;
if (qc == QueryConstructor.SequentialDependenceModel) {
query = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(FIELD_BODY, analyzer, queryString);
} else {
query = new BagOfWordsQueryGenerator().buildQuery(FIELD_BODY, analyzer, queryString);
}

TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN);
if (!(isRerank && args.rerankcutoff <= 0)) {
Expand All @@ -220,7 +245,12 @@ public<K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryStri
}

public<K> ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String queryString, long t) throws IOException {
Query keywordQuery = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, analyzer, queryString);
Query keywordQuery;
if (qc == QueryConstructor.SequentialDependenceModel) {
keywordQuery = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(FIELD_BODY, analyzer, queryString);
} else {
keywordQuery = new BagOfWordsQueryGenerator().buildQuery(FIELD_BODY, analyzer, queryString);
}
List<String> queryTokens = AnalyzerUtils.tokenize(analyzer, queryString);

// Do not consider the tweets with tweet ids that are beyond the queryTweetTime
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package io.anserini.search;

import io.anserini.index.generator.LuceneDocumentGenerator;
import io.anserini.util.AnalyzerUtils;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
Expand Down Expand Up @@ -84,7 +84,7 @@ public Result[] search(String q) throws IOException {
public Result[] search(String q, int k) throws IOException {
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(similarity);
Query query = AnalyzerUtils.buildBagOfWordsQuery(LuceneDocumentGenerator.FIELD_BODY, analyzer, q);
Query query = new BagOfWordsQueryGenerator().buildQuery(LuceneDocumentGenerator.FIELD_BODY, analyzer, q);

TopDocs rs = searcher.search(query, k);
ScoreDoc[] hits = rs.scoreDocs;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/**
* Anserini: An information retrieval toolkit built on Lucene
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search.query;

import io.anserini.util.AnalyzerUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

import java.util.List;

/*
* Bag of Terms query builder
*/
/*
 * Bag of Terms query builder: analyzes the query text and ORs together one
 * TermQuery per emitted token (every clause is Occur.SHOULD).
 */
public class BagOfWordsQueryGenerator extends QueryGenerator {
  @Override
  public Query buildQuery(String field, Analyzer analyzer, String queryText) {
    // Run the analyzer over the raw query string, then build a flat
    // disjunction of the resulting terms in the target field.
    BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
    for (String token : AnalyzerUtils.tokenize(analyzer, queryText)) {
      Term term = new Term(field, token);
      disjunction.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
    }
    return disjunction.build();
  }
}
55 changes: 55 additions & 0 deletions src/main/java/io/anserini/search/query/QueryGenerator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
* Anserini: An information retrieval toolkit built on Lucene
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search.query;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.Query;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/*
 * Base class for query constructors. Provides a shared analyzer-driven
 * tokenize() helper; subclasses override buildQuery() to assemble a
 * Lucene Query from the analyzed tokens.
 */
public abstract class QueryGenerator {
  /**
   * Analyzes {@code s} with the given analyzer and returns the non-empty
   * tokens in order. On IOException, returns whatever tokens were collected
   * so far (best-effort, matching prior behavior).
   *
   * @param analyzer analyzer used to produce the token stream
   * @param s raw text to tokenize
   * @return list of analyzed, non-empty terms
   */
  static public List<String> tokenize(Analyzer analyzer, String s) {
    List<String> list = new ArrayList<>();

    // try-with-resources guarantees the TokenStream is closed even if
    // reset()/incrementToken() throws; the previous version leaked the
    // stream on any exception because close() was not in a finally block.
    try (TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s))) {
      CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        String term = cattr.toString();
        if (term.length() == 0) {
          // Analyzers may emit empty terms; skip them.
          continue;
        }
        list.add(term);
      }
      tokenStream.end();
    } catch (IOException e) {
      // Preserve original best-effort semantics: report and fall through
      // with the tokens gathered so far.
      e.printStackTrace();
    }

    return list;
  }

  /**
   * Builds a Query over {@code field} from the analyzed {@code queryText}.
   * Subclasses must override; the base implementation is unsupported.
   *
   * @throws UnsupportedOperationException if not overridden
   */
  public Query buildQuery(String field, Analyzer analyzer, String queryText) {
    throw new UnsupportedOperationException();
  }
}
84 changes: 84 additions & 0 deletions src/main/java/io/anserini/search/query/SdmQueryGenerator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/**
* Anserini: An information retrieval toolkit built on Lucene
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search.query;

import io.anserini.util.AnalyzerUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

import java.util.List;

/* Build the Term Dependency query. See:
* D. Metzler and W. B. Croft. A markov random field model for term dependencies. In SIGIR ’05.
*/
/* Build the Term Dependency query. See:
 * D. Metzler and W. B. Croft. A markov random field model for term dependencies. In SIGIR ’05.
 *
 * Combines three boosted sub-queries: unigram terms, ordered windows
 * (adjacent pairs, slop 1, in order), and unordered windows (adjacent
 * pairs, slop 8, either order).
 */
public class SdmQueryGenerator extends QueryGenerator {
  private final float termWeight;
  private final float orderWindowWeight;
  private final float unorderWindowWeight;

  // Default weights 0.85 / 0.10 / 0.05 follow Metzler & Croft.
  public SdmQueryGenerator() {
    this(0.85f, 0.1f, 0.05f);
  }

  public SdmQueryGenerator(float termWeight, float orderWindowWeight, float unorderWindowWeight) {
    this.termWeight = termWeight;
    this.orderWindowWeight = orderWindowWeight;
    this.unorderWindowWeight = unorderWindowWeight;
  }

  /*
   * Sequential Dependency Model
   */
  @Override
  public Query buildQuery(String field, Analyzer analyzer, String queryText) {
    List<String> tokens = AnalyzerUtils.tokenize(analyzer, queryText);

    BooleanQuery.Builder termsBuilder = new BooleanQuery.Builder();
    // Guard against an empty analysis result (e.g., a query consisting only
    // of stopwords): the previous version fell through to
    // tokens.get(tokens.size()-1) and threw IndexOutOfBoundsException.
    if (tokens.isEmpty()) {
      return termsBuilder.build();
    }
    // A single term has no pairs; the SDM degenerates to a plain term query.
    if (tokens.size() == 1) {
      termsBuilder.add(new TermQuery(new Term(field, tokens.get(0))), BooleanClause.Occur.SHOULD);
      return termsBuilder.build();
    }

    BooleanQuery.Builder orderedWindowBuilder = new BooleanQuery.Builder();
    BooleanQuery.Builder unorderedWindowBuilder = new BooleanQuery.Builder();
    for (int i = 0; i < tokens.size()-1; i++) {
      termsBuilder.add(new TermQuery(new Term(field, tokens.get(i))), BooleanClause.Occur.SHOULD);

      // Bigram (t_i, t_{i+1}): ordered window with slop 1, unordered with slop 8.
      SpanTermQuery t1 = new SpanTermQuery(new Term(field, tokens.get(i)));
      SpanTermQuery t2 = new SpanTermQuery(new Term(field, tokens.get(i+1)));
      SpanNearQuery orderedQ = new SpanNearQuery(new SpanQuery[] {t1, t2}, 1, true);
      SpanNearQuery unorderedQ = new SpanNearQuery(new SpanQuery[] {t1, t2}, 8, false);

      orderedWindowBuilder.add(orderedQ, BooleanClause.Occur.SHOULD);
      unorderedWindowBuilder.add(unorderedQ, BooleanClause.Occur.SHOULD);
    }
    // The loop stops at size-1, so add the final unigram here.
    termsBuilder.add(new TermQuery(new Term(field, tokens.get(tokens.size()-1))), BooleanClause.Occur.SHOULD);

    // Weighted combination of the three feature groups.
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new BoostQuery(termsBuilder.build(), termWeight), BooleanClause.Occur.SHOULD);
    builder.add(new BoostQuery(orderedWindowBuilder.build(), orderWindowWeight), BooleanClause.Occur.SHOULD);
    builder.add(new BoostQuery(unorderedWindowBuilder.build(), unorderWindowWeight), BooleanClause.Occur.SHOULD);

    return builder.build();
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.search.query;
package io.anserini.search.topicreader;

import java.io.BufferedReader;
import java.io.IOException;
Expand Down
Loading