Skip to content

Commit

Permalink
Add option to enable Lucene 8 index compatibility (#1953)
Browse files Browse the repository at this point in the history
Addresses #1952 - adds a flag -lucene8 that abandons consistent tie-breaking,
so retrieval doesn't need to touch the docvalues. In the regression script, a
similar option --lucene8 relaxes score matching: a score within 0.001 of the expected value is accepted (reported as [OK*]).
  • Loading branch information
lintool committed Aug 3, 2022
1 parent 2725655 commit cb98607
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 2 deletions.
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/search/SearchArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ public class SearchArgs {
@Option(name = "-topicreader", required = true, usage = "TopicReader to use.")
public String topicReader;

// Lucene 8 index compatibility switch (off by default). When set, the SearchCollection
// constructor forces arbitraryScoreTieBreak = true and axiom_deterministic = false,
// i.e., deterministic tie-breaking is turned off.
// See https://github.com/castorini/anserini/issues/1952
@Option(name = "-lucene8", usage = "Enable Lucene 8 index compatibility.")
public Boolean lucene8 = false;

// optional arguments
@Option(name = "-querygenerator", usage = "QueryGenerator to use.")
public String queryGenerator = "BagOfWordsQueryGenerator";
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,12 @@ public SearchCollection(SearchArgs args) throws IOException {
loadQrels(args.rf_qrels);
}

// See https://github.com/castorini/anserini/issues/1952
// Lucene 8 index compatibility: turn off deterministic tie-breaking so that retrieval
// does not need to touch the docvalues.
if (args.lucene8) {
args.arbitraryScoreTieBreak = true;
args.axiom_deterministic = false;
}
}

@Override
Expand Down
16 changes: 14 additions & 2 deletions src/main/python/run_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ def is_close(a, b, rel_tol=1e-09, abs_tol=0.0):
return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)


def is_close_lucene8(a, b, abs_tol=0.001):
    """Lenient score comparison used as a fallback when --lucene8 is enabled.

    Unlike a relative-tolerance check, this only looks at the absolute
    difference between the two values.

    Args:
        a: First score.
        b: Second score.
        abs_tol: Largest absolute difference still treated as a match.
            Defaults to 0.001, the tolerance that was previously hard-coded.

    Returns:
        True if a and b differ by at most abs_tol, False otherwise.
    """
    return abs(a - b) <= abs_tol


def check_output(command):
# Python 2.6 compatible subprocess.check_output
process = Popen(command, shell=True, stdout=PIPE)
Expand Down Expand Up @@ -131,6 +135,7 @@ def construct_search_commands(yaml_data):
'-topicreader', topic_set['topic_reader'] if 'topic_reader' in topic_set and topic_set['topic_reader'] else yaml_data['topic_reader'],
'-output', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']),
model['params'],
'-lucene8' if args.lucene8 else ''
]
for (model, topic_set) in list(itertools.product(yaml_data['models'], yaml_data['topics']))
]
Expand All @@ -154,6 +159,7 @@ def construct_convert_commands(yaml_data):
def evaluate_and_verify(yaml_data, dry_run):
fail_str = '\033[91m[FAIL]\033[0m '
ok_str = ' [OK] '
okish_str = ' \033[94m[OK*]\033[0m '
failures = False

logger.info('='*10 + ' Verifying Results: ' + yaml_data['corpus'] + ' ' + '='*10)
Expand Down Expand Up @@ -181,8 +187,11 @@ def evaluate_and_verify(yaml_data, dry_run):
if is_close(expected, actual):
logger.info(ok_str + result_str)
else:
logger.error(fail_str + result_str)
failures = True
if args.lucene8 and is_close_lucene8(expected, actual):
logger.info(okish_str + result_str)
else:
logger.error(fail_str + result_str)
failures = True

if not dry_run:
if failures:
Expand Down Expand Up @@ -280,6 +289,7 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
help='Number of converting runs to execute in parallel.')
parser.add_argument('--dry-run', dest='dry_run', action='store_true',
help='Output commands without actual execution.')
parser.add_argument('--lucene8', dest='lucene8', action='store_true', help='Enable Lucene 8 index compatibility.')
args = parser.parse_args()

with open('src/main/resources/regression/{}.yaml'.format(args.regression)) as f:
Expand Down Expand Up @@ -340,6 +350,8 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
# Search and verify results.
if args.search:
logger.info('='*10 + ' Ranking ' + '='*10)
if args.lucene8:
logger.info('Enabling Lucene 8 index compatibility.')
search_cmds = construct_search_commands(yaml_data)
if args.dry_run:
for cmd in search_cmds:
Expand Down

0 comments on commit cb98607

Please sign in to comment.