-
Notifications
You must be signed in to change notification settings - Fork 433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement Sequential Dependency Model query constructor #359
Changes from 14 commits
92f3e81
7ca123f
3a1f99b
43e156a
b29d38f
231c07e
0a69229
49a1c59
cda8a84
3513bbd
ee17955
e3facdc
549a644
a58a982
6252abf
f72c877
14a1da1
6550c53
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,7 +25,9 @@ | |
import io.anserini.rerank.lib.AxiomReranker; | ||
import io.anserini.rerank.lib.Rm3Reranker; | ||
import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; | ||
import io.anserini.search.query.TopicReader; | ||
import io.anserini.search.query.BagOfTermsQueryGenerator; | ||
import io.anserini.search.query.TermDependencyQueryGenerator; | ||
import io.anserini.search.topicreader.TopicReader; | ||
import io.anserini.search.similarity.F2LogSimilarity; | ||
import io.anserini.util.AnalyzerUtils; | ||
import org.apache.commons.lang3.time.DurationFormatUtils; | ||
|
@@ -84,6 +86,12 @@ public final class SearchCollection implements Closeable { | |
private final boolean isRerank; | ||
private final RerankerCascade cascade; | ||
|
||
/** Selects which query generator turns a topic's text into a Lucene query. */
enum QueryConstructor { | ||
// Used when args.sdm is not set: query built by BagOfTermsQueryGenerator.
BagOfTerms, | ||
// Used when args.sdm is set: query built by TermDependencyQueryGenerator.
SequentialDependenceModel | ||
} | ||
private final QueryConstructor qc; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add empty line? |
||
|
||
public SearchCollection(SearchArgs args) throws IOException { | ||
this.args = args; | ||
Path indexPath = Paths.get(args.index); | ||
|
@@ -111,18 +119,29 @@ public SearchCollection(SearchArgs args) throws IOException { | |
|
||
// Are we searching tweets? | ||
if (args.searchtweets) { | ||
LOG.info("Search Tweets"); | ||
analyzer = new TweetAnalyzer(); | ||
} else { | ||
analyzer = args.keepstop ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer(); | ||
} | ||
|
||
if (args.sdm) { | ||
LOG.info("Use Sequential Dependence Model query"); | ||
qc = QueryConstructor.SequentialDependenceModel; | ||
} else { | ||
LOG.info("Use Bag of Terms query"); | ||
qc = QueryConstructor.BagOfTerms; | ||
} | ||
|
||
isRerank = args.rm3 || args.axiom; | ||
|
||
// Set up the ranking cascade. | ||
cascade = new RerankerCascade(); | ||
if (args.rm3) { | ||
LOG.info("Rerank with RM3"); | ||
cascade.add(new Rm3Reranker(analyzer, FIELD_BODY, args)); | ||
} else if (args.axiom) { | ||
LOG.info("Rerank with Axiomatic Reranking"); | ||
cascade.add(new AxiomReranker(FIELD_BODY, args)); | ||
} | ||
|
||
|
@@ -148,7 +167,7 @@ public<K> int runTopics() throws IOException { | |
TopicReader<K> tr; | ||
SortedMap<K, Map<String, String>> topics; | ||
try { | ||
tr = (TopicReader<K>) Class.forName("io.anserini.search.query." + args.topicReader + "TopicReader") | ||
tr = (TopicReader<K>) Class.forName("io.anserini.search.topicreader." + args.topicReader + "TopicReader") | ||
.getConstructor(Path.class).newInstance(topicsFile); | ||
topics = tr.read(); | ||
} catch (Exception e) { | ||
|
@@ -192,7 +211,12 @@ public<K> int runTopics() throws IOException { | |
} | ||
|
||
public<K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryString) throws IOException { | ||
Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, analyzer, queryString); | ||
Query query; | ||
if (qc == QueryConstructor.SequentialDependenceModel) { | ||
query = new TermDependencyQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(FIELD_BODY, analyzer, queryString); | ||
} else { | ||
query = new BagOfTermsQueryGenerator().buildQuery(FIELD_BODY, analyzer, queryString); | ||
} | ||
|
||
TopDocs rs = new TopDocs(0, new ScoreDoc[]{}, Float.NaN); | ||
if (!(isRerank && args.rerankcutoff <= 0)) { | ||
|
@@ -210,7 +234,12 @@ public<K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryStri | |
} | ||
|
||
public<K> ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String queryString, long t) throws IOException { | ||
Query keywordQuery = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, analyzer, queryString); | ||
Query keywordQuery; | ||
if (qc == QueryConstructor.SequentialDependenceModel) { | ||
keywordQuery = new TermDependencyQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(FIELD_BODY, analyzer, queryString); | ||
} else { | ||
keywordQuery = new BagOfTermsQueryGenerator().buildQuery(FIELD_BODY, analyzer, queryString); | ||
} | ||
List<String> queryTokens = AnalyzerUtils.tokenize(analyzer, queryString); | ||
|
||
// Do not consider the tweets with tweet ids that are beyond the queryTweetTime | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
/** | ||
* Anserini: An information retrieval toolkit built on Lucene | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.search.query; | ||
|
||
import io.anserini.util.AnalyzerUtils; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.search.BooleanClause; | ||
import org.apache.lucene.search.BooleanQuery; | ||
import org.apache.lucene.search.Query; | ||
import org.apache.lucene.search.TermQuery; | ||
|
||
import java.util.List; | ||
|
||
/* | ||
* Bag of Terms query builder | ||
*/ | ||
public class BagOfTermsQueryGenerator extends QueryGenerator { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we rename it |
||
@Override | ||
public Query buildQuery(String field, Analyzer analyzer, String queryText) { | ||
List<String> tokens = AnalyzerUtils.tokenize(analyzer, queryText); | ||
|
||
BooleanQuery.Builder builder = new BooleanQuery.Builder(); | ||
for (String t : tokens) { | ||
builder.add(new TermQuery(new Term(field, t)), BooleanClause.Occur.SHOULD); | ||
} | ||
|
||
return builder.build(); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/** | ||
* Anserini: An information retrieval toolkit built on Lucene | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.search.query; | ||
|
||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.search.Query; | ||
|
||
import java.io.IOException; | ||
import java.io.StringReader; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public abstract class QueryGenerator { | ||
static public List<String> tokenize(Analyzer analyzer, String s) { | ||
List<String> list = new ArrayList<>(); | ||
|
||
try { | ||
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s)); | ||
CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class); | ||
tokenStream.reset(); | ||
while (tokenStream.incrementToken()) { | ||
if (cattr.toString().length() == 0) { | ||
continue; | ||
} | ||
list.add(cattr.toString()); | ||
} | ||
tokenStream.end(); | ||
tokenStream.close(); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
|
||
return list; | ||
} | ||
|
||
public Query buildQuery(String field, Analyzer analyzer, String queryText) { | ||
throw new UnsupportedOperationException(); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
/** | ||
* Anserini: An information retrieval toolkit built on Lucene | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.search.query; | ||
|
||
import io.anserini.util.AnalyzerUtils; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.search.*; | ||
import org.apache.lucene.search.spans.SpanNearQuery; | ||
import org.apache.lucene.search.spans.SpanQuery; | ||
import org.apache.lucene.search.spans.SpanTermQuery; | ||
|
||
import java.util.List; | ||
|
||
/* Build the Term Dependency query. See: | ||
* D. Metzler and W. B. Croft. A Markov random field model for term dependencies. In SIGIR ’05. | ||
*/ | ||
public class TermDependencyQueryGenerator extends QueryGenerator { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. "TermDependency" is vague. How about SdmQueryGenerator to make consistent with test case below? |
||
private final float termWeight; | ||
private final float orderWindowWeight; | ||
private final float unorderWindowWeight; | ||
|
||
/**
 * Creates a generator with the default weights: 0.85 for single terms,
 * 0.1 for ordered windows and 0.05 for unordered windows.
 */
public TermDependencyQueryGenerator() {
  this(0.85f, 0.1f, 0.05f);
}

/**
 * Creates a generator with explicit weights for the three query components.
 *
 * @param termWeight boost applied to the single-term sub-query
 * @param orderWindowWeight boost applied to the ordered-window sub-query
 * @param unorderWindowWeight boost applied to the unordered-window sub-query
 */
public TermDependencyQueryGenerator(float termWeight, float orderWindowWeight, float unorderWindowWeight) {
  this.unorderWindowWeight = unorderWindowWeight;
  this.orderWindowWeight = orderWindowWeight;
  this.termWeight = termWeight;
}
|
||
/* | ||
* Sequential Dependency Model | ||
*/ | ||
@Override | ||
public Query buildQuery(String field, Analyzer analyzer, String queryText) { | ||
List<String> tokens = AnalyzerUtils.tokenize(analyzer, queryText); | ||
|
||
BooleanQuery.Builder termsBuilder = new BooleanQuery.Builder(); | ||
if (tokens.size() == 1) { | ||
termsBuilder.add(new TermQuery(new Term(field, tokens.get(0))), BooleanClause.Occur.SHOULD); | ||
return termsBuilder.build(); | ||
} | ||
|
||
BooleanQuery.Builder orderedWindowBuilder = new BooleanQuery.Builder(); | ||
BooleanQuery.Builder unorderedWindowBuilder = new BooleanQuery.Builder(); | ||
for (int i = 0; i < tokens.size()-1; i++) { | ||
termsBuilder.add(new TermQuery(new Term(field, tokens.get(i))), BooleanClause.Occur.SHOULD); | ||
|
||
SpanTermQuery t1 = new SpanTermQuery(new Term(field, tokens.get(i))); | ||
SpanTermQuery t2 = new SpanTermQuery(new Term(field, tokens.get(i+1))); | ||
SpanNearQuery orderedQ = new SpanNearQuery(new SpanQuery[] {t1, t2}, 1, true); | ||
SpanNearQuery unorderedQ = new SpanNearQuery(new SpanQuery[] {t1, t2}, 8, false); | ||
|
||
orderedWindowBuilder.add(orderedQ, BooleanClause.Occur.SHOULD); | ||
unorderedWindowBuilder.add(unorderedQ, BooleanClause.Occur.SHOULD); | ||
} | ||
termsBuilder.add(new TermQuery(new Term(field, tokens.get(tokens.size()-1))), BooleanClause.Occur.SHOULD); | ||
|
||
BooleanQuery.Builder builder = new BooleanQuery.Builder(); | ||
builder.add(new BoostQuery(termsBuilder.build(), termWeight), BooleanClause.Occur.SHOULD); | ||
builder.add(new BoostQuery(orderedWindowBuilder.build(), orderWindowWeight), BooleanClause.Occur.SHOULD); | ||
builder.add(new BoostQuery(unorderedWindowBuilder.build(), unorderWindowWeight), BooleanClause.Occur.SHOULD); | ||
|
||
return builder.build(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"term weight in sdm" -> "SRM term weight"?
And below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SDM?