From e944e83b137527d5128a56be0253d25f4db9395f Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Wed, 2 May 2018 17:39:53 +0100 Subject: [PATCH 1/2] [SOLR-12304] More Like This component interesting term fix +tests --- .../solr/handler/MoreLikeThisHandler.java | 2 +- .../component/MoreLikeThisComponent.java | 44 ++- .../component/MoreLikeThisComponentTest.java | 286 ++++++++++++++++++ 3 files changed, 321 insertions(+), 11 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/handler/component/MoreLikeThisComponentTest.java diff --git a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java index 62f1016bbaf1..ace4ff3c90e7 100644 --- a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java @@ -389,7 +389,7 @@ public DocListAndSet getMoreLikeThis( int id, int start, int rows, List f rawMLTQuery = mlt.like(id); boostedMLTQuery = getBoostedQuery( rawMLTQuery ); if( terms != null ) { - fillInterestingTermsFromMLTQuery( rawMLTQuery, terms ); + fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms ); } // exclude current document from results diff --git a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java index fd9d37d4aad7..c4afc1d9e22f 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java @@ -191,11 +191,6 @@ public void finishStage(ResponseBuilder rb) { log.info("MLT: results added for key: " + key + " documents: " + shardDocList.toString()); -// if (log.isDebugEnabled()) { -// for (SolrDocument doc : shardDocList) { -// doc.addField("shard", "=" + r.getShard()); -// } -// } SolrDocumentList mergedDocList = tempResults.get(key); if (mergedDocList == null) { @@ -370,21 +365,31 @@ NamedList getMoreLikeThese(ResponseBuilder rb, IndexSchema schema = searcher.getSchema(); MoreLikeThisHandler.MoreLikeThisHelper mltHelper = new MoreLikeThisHandler.MoreLikeThisHelper( p, searcher); - NamedList mlt = new SimpleOrderedMap<>(); + NamedList mltResponse = new SimpleOrderedMap<>(); DocIterator iterator = docs.iterator(); SimpleOrderedMap dbg = null; if (rb.isDebug()) { dbg = new SimpleOrderedMap<>(); } + + SimpleOrderedMap interestingTermsResponse = null; + MoreLikeThisParams.TermStyle termStyle = MoreLikeThisParams.TermStyle.get(p.get(MoreLikeThisParams.INTERESTING_TERMS)); + List interestingTerms = (termStyle == MoreLikeThisParams.TermStyle.NONE) + ? null : new ArrayList<>(mltHelper.getMoreLikeThis().getMaxQueryTerms()); + + if (interestingTerms!=null) { + interestingTermsResponse = new SimpleOrderedMap<>(); + } while (iterator.hasNext()) { int id = iterator.nextDoc(); int rows = p.getInt(MoreLikeThisParams.DOC_COUNT, 5); - DocListAndSet sim = mltHelper.getMoreLikeThis(id, 0, rows, null, null, + + DocListAndSet sim = mltHelper.getMoreLikeThis(id, 0, rows, null, interestingTerms, flags); String name = schema.printableUniqueKey(searcher.doc(id)); - mlt.add(name, sim.docList); + mltResponse.add(name, sim.docList); if (dbg != null) { SimpleOrderedMap docDbg = new SimpleOrderedMap<>(); @@ -403,13 +408,32 @@ NamedList getMoreLikeThese(ResponseBuilder rb, docDbg.add("explain", explains); dbg.add(name, docDbg); } + + if (interestingTermsResponse != null) { + if (termStyle == MoreLikeThisParams.TermStyle.DETAILS) { + NamedList interestingTermsWithScore = new NamedList<>(); + for (MoreLikeThisHandler.InterestingTerm interestingTerm : interestingTerms) { + interestingTermsWithScore.add(interestingTerm.term.toString(), interestingTerm.boost); + } + interestingTermsResponse.add(name, interestingTermsWithScore); + } else { + List interestingTermsString = new ArrayList<>(interestingTerms.size()); + for(MoreLikeThisHandler.InterestingTerm interestingTerm : interestingTerms){ + interestingTermsString.add(interestingTerm.term.toString()); + } + interestingTermsResponse.add(name, interestingTermsString); + } + } } - // add debug information if (dbg != null) { rb.addDebugInfo("moreLikeThis", dbg); } - return mlt; + // add Interesting Terms + if (interestingTermsResponse != null) { + rb.rsp.add("interestingTerms", interestingTermsResponse); + } + return mltResponse; } // /////////////////////////////////////////// diff --git a/solr/core/src/test/org/apache/solr/handler/component/MoreLikeThisComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/MoreLikeThisComponentTest.java new file mode 100644 index 000000000000..764d26078362 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/component/MoreLikeThisComponentTest.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.component; + +import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.MoreLikeThisParams; +import org.apache.solr.core.SolrCore; +import org.apache.solr.request.LocalSolrQueryRequest; +import org.apache.solr.request.SolrQueryRequest; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Test for MoreLikeThisComponent + * + * + * @see MoreLikeThisComponent + */ +@Slow +public class MoreLikeThisComponentTest extends SolrTestCaseJ4 { + + @BeforeClass + public static void moreLikeThisBeforeClass() throws Exception { + initCore("solrconfig.xml", "schema.xml"); + assertU(adoc("id","42","name","Tom Cruise","subword","Top Gun","subword","Risky Business","subword","The Color of Money","subword","Minority Report","subword", "Days of Thunder","subword", "Eyes Wide Shut","subword", "Far and Away", "foo_ti","10")); + assertU(adoc("id","43","name","Tom Hanks","subword","The Green Mile","subword","Forest Gump","subword","Philadelphia Story","subword","Big","subword","Cast Away", "foo_ti","10")); + assertU(adoc("id","44","name","Harrison Ford","subword","Star Wars","subword","Indiana Jones","subword","Patriot Games","subword","Regarding Henry")); + assertU(adoc("id","45","name","George Harrison","subword","Yellow Submarine","subword","Help","subword","Magical Mystery Tour","subword","Sgt. Peppers Lonley Hearts Club Band")); + assertU(adoc("id","46","name","Nicole Kidman","subword","Batman","subword","Days of Thunder","subword","Eyes Wide Shut","subword","Far and Away")); + assertU(commit()); + } + + private void initCommonMoreLikeThisParams(ModifiableSolrParams params) { + params.set(MoreLikeThisParams.MLT, "true"); + params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword"); + params.set(MoreLikeThisParams.MIN_TERM_FREQ,"1"); + params.set(MoreLikeThisParams.MIN_DOC_FREQ,"1"); + params.set("indent","true"); + } + + @Test + public void testMLT_baseParams_shouldReturnSimilarDocuments() + { + SolrCore core = h.getCore(); + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + + params.set(CommonParams.Q, "id:42"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params); + assertQ("morelikethis - tom cruise",mltreq + ,"//result/doc[1]/str[@name='id'][.='46']" + ,"//result/doc[2]/str[@name='id'][.='43']"); + + params.set(CommonParams.Q, "id:44"); + mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ("morelike this - harrison ford",mltreq + ,"//result/doc[1]/str[@name='id'][.='45']"); + mltreq.close(); + } + + @Test + public void testMLT_baseParamsInterestingTermsDetails_shouldReturnSimilarDocumentsAndInterestingTermsDetails() + { + SolrCore core = h.getCore(); + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.INTERESTING_TERMS, "details"); + + params.set(CommonParams.Q, "id:42"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params); + assertQ("morelikethis - tom cruise",mltreq + ,"//result/doc[1]/str[@name='id'][.='46']" + ,"//result/doc[2]/str[@name='id'][.='43']", + "//lst[@name='interestingTerms']/lst[1][count(*)>0]", + "//lst[@name='interestingTerms']/lst[1]/float[.=1.0]"); + mltreq.close(); + } + + @Test + public void testMLT_baseParamsInterestingTermsList_shouldReturnSimilarDocumentsAndInterestingTermsList() + { + SolrCore core = h.getCore(); + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.INTERESTING_TERMS, "list"); + + params.set(CommonParams.Q, "id:42"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params); + assertQ("morelikethis - tom cruise",mltreq + ,"//result/doc[1]/str[@name='id'][.='46']" + ,"//result/doc[2]/str[@name='id'][.='43']", + "//lst[@name='interestingTerms']/arr[@name='42'][count(*)>0]", + "//lst[@name='interestingTerms']/arr[@name='42']/str[.='name:Cruise']"); + mltreq.close(); + } + + @Test + public void testMLT_boostEnabled_shouldReturnSimilarDocumentsConsideringBoost() + { + SolrCore core = h.getCore(); + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.BOOST, "true"); + + params.set(CommonParams.Q, "id:42"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params); + assertQ("morelikethis - tom cruise",mltreq + ,"//result/doc[1]/str[@name='id'][.='46']" + ,"//result/doc[2]/str[@name='id'][.='43']"); + + params.set(CommonParams.Q, "id:42"); + params.set(MoreLikeThisParams.QF,"name^5.0 subword^0.1"); + mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ("morelikethis with weights",mltreq + ,"//result/doc[1]/str[@name='id'][.='43']" + ,"//result/doc[2]/str[@name='id'][.='46']"); + + mltreq.close(); + } + + @Test + public void testMLT_boostEnabledInterestingTermsDetails_shouldReturnSimilarDocumentsConsideringBoostAndInterestingTermsDetails() + { + SolrCore core = h.getCore(); + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.BOOST, "true"); + params.set(MoreLikeThisParams.INTERESTING_TERMS, "details"); + + params.set(CommonParams.Q, "id:42"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params); + assertQ("morelikethis - tom cruise",mltreq + ,"//result/doc[1]/str[@name='id'][.='46']" + ,"//result/doc[2]/str[@name='id'][.='43']", + "//lst[@name='interestingTerms']/lst[1][count(*)>0]", + "//lst[@name='interestingTerms']/lst[1]/float[.>1.0]"); + + params.set(MoreLikeThisParams.QF,"name^5.0 subword^0.1"); + mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ("morelikethis with weights",mltreq + ,"//result/doc[1]/str[@name='id'][.='43']" + ,"//result/doc[2]/str[@name='id'][.='46']", + "//lst[@name='interestingTerms']/lst[1][count(*)>0]", + "//lst[@name='interestingTerms']/lst[1]/float[.>5.0]"); + + mltreq.close(); + } + + @Test + public void testMLT_boostEnabledInterestingTermsList_shouldReturnSimilarDocumentsConsideringBoostAndInterestingTermsList() + { + SolrCore core = h.getCore(); + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.BOOST, "true"); + params.set(MoreLikeThisParams.INTERESTING_TERMS, "list"); + + params.set(CommonParams.Q, "id:42"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params); + assertQ("morelikethis - tom cruise",mltreq + ,"//result/doc[1]/str[@name='id'][.='46']" + ,"//result/doc[2]/str[@name='id'][.='43']", + "//lst[@name='interestingTerms']/arr[@name='42'][count(*)>0]", + "//lst[@name='interestingTerms']/arr[@name='42']/str[.='name:Cruise']"); + + params.set(MoreLikeThisParams.QF,"name^5.0 subword^0.1"); + mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ("morelikethis with weights",mltreq + ,"//result/doc[1]/str[@name='id'][.='43']" + ,"//result/doc[2]/str[@name='id'][.='46']", + "//lst[@name='interestingTerms']/arr[@name='42'][count(*)>0]", + "//lst[@name='interestingTerms']/arr[@name='42']/str[.='name:Cruise']"); + + mltreq.close(); + } + + @Test + public void testMLT_debugEnabled_shouldReturnSimilarDocumentsWithDebug() + { + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.BOOST, "true"); + + params.set(CommonParams.Q, "id:44"); + params.set(CommonParams.DEBUG_QUERY, "true"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ("morelike this - harrison ford",mltreq + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']" + ); + + params.remove(CommonParams.DEBUG_QUERY); + params.set(CommonParams.Q, "{!field f=id}44"); + mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ(mltreq + ,"//result/doc[1]/str[@name='id'][.='45']"); + mltreq.close(); + } + + @Test + public void testMLT_debugEnabledInterestingTermsDetails_shouldReturnSimilarDocumentsWithDebugAndInterestingTermsDetails() + { + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.BOOST, "true"); + params.set(MoreLikeThisParams.INTERESTING_TERMS, "details"); + + params.set(CommonParams.Q, "id:44"); + params.set(CommonParams.DEBUG_QUERY, "true"); + SolrQueryRequest mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ("morelike this - harrison ford",mltreq + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']", + "//lst[@name='interestingTerms']/lst[1][count(*)>0]", + "//lst[@name='interestingTerms']/lst[1]/float[.>1.0]"); + + params.remove(CommonParams.DEBUG_QUERY); + params.set(CommonParams.Q, "{!field f=id}44"); + mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ(mltreq + ,"//result/doc[1]/str[@name='id'][.='45']", + "//lst[@name='interestingTerms']/lst[1][count(*)>0]", + "//lst[@name='interestingTerms']/lst[1]/float[.>1.0]"); + mltreq.close(); + } + + @Test + public void testMLT_debugEnabledInterestingTermsList_shouldReturnSimilarDocumentsWithDebugAndInterestingTermsList() + { + ModifiableSolrParams params = new ModifiableSolrParams(); + + initCommonMoreLikeThisParams(params); + params.set(MoreLikeThisParams.BOOST, "true"); + params.set(MoreLikeThisParams.INTERESTING_TERMS, "list"); + + params.set(CommonParams.Q, "id:44"); + params.set(CommonParams.DEBUG_QUERY, "true"); + + SolrQueryRequest mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ("morelike this - harrison ford",mltreq + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']" + ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']", + "//lst[@name='interestingTerms']/arr[@name='44'][count(*)>0]", + "//lst[@name='interestingTerms']/arr[@name='44']/str[.='name:Harrison']"); + + params.remove(CommonParams.DEBUG_QUERY); + params.set(CommonParams.Q, "{!field f=id}44"); + mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); + assertQ(mltreq + ,"//result/doc[1]/str[@name='id'][.='45']", + "//lst[@name='interestingTerms']/arr[@name='44'][count(*)>0]", + "//lst[@name='interestingTerms']/arr[@name='44']/str[.='name:Harrison']"); + mltreq.close(); + } +} From be859d3f68d67f2401c9d334a2e85cf081c7b08e Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Sun, 27 Jan 2019 13:06:07 +0000 Subject: [PATCH 2/2] [SOLR-12304] minor refactor in data structures --- .../component/MoreLikeThisComponent.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java index c4afc1d9e22f..cab3a8077f28 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java @@ -374,22 +374,22 @@ NamedList getMoreLikeThese(ResponseBuilder rb, } SimpleOrderedMap interestingTermsResponse = null; - MoreLikeThisParams.TermStyle termStyle = MoreLikeThisParams.TermStyle.get(p.get(MoreLikeThisParams.INTERESTING_TERMS)); - List interestingTerms = (termStyle == MoreLikeThisParams.TermStyle.NONE) + MoreLikeThisParams.TermStyle interestingTermsConfig = MoreLikeThisParams.TermStyle.get(p.get(MoreLikeThisParams.INTERESTING_TERMS)); + List interestingTerms = (interestingTermsConfig == MoreLikeThisParams.TermStyle.NONE) ? null : new ArrayList<>(mltHelper.getMoreLikeThis().getMaxQueryTerms()); - - if (interestingTerms!=null) { + + if (interestingTerms != null) { interestingTermsResponse = new SimpleOrderedMap<>(); } while (iterator.hasNext()) { int id = iterator.nextDoc(); int rows = p.getInt(MoreLikeThisParams.DOC_COUNT, 5); - - DocListAndSet sim = mltHelper.getMoreLikeThis(id, 0, rows, null, interestingTerms, + + DocListAndSet similarDocuments = mltHelper.getMoreLikeThis(id, 0, rows, null, interestingTerms, flags); String name = schema.printableUniqueKey(searcher.doc(id)); - mltResponse.add(name, sim.docList); + mltResponse.add(name, similarDocuments.docList); if (dbg != null) { SimpleOrderedMap docDbg = new SimpleOrderedMap<>(); @@ -398,9 +398,9 @@ NamedList getMoreLikeThese(ResponseBuilder rb, .add("boostedMLTQuery", mltHelper.getBoostedMLTQuery().toString()); docDbg.add("realMLTQuery", mltHelper.getRealMLTQuery().toString()); SimpleOrderedMap explains = new SimpleOrderedMap<>(); - DocIterator mltIte = sim.docList.iterator(); - while (mltIte.hasNext()) { - int mltid = mltIte.nextDoc(); + DocIterator similarDocumentsIterator = similarDocuments.docList.iterator(); + while (similarDocumentsIterator.hasNext()) { + int mltid = similarDocumentsIterator.nextDoc(); String key = schema.printableUniqueKey(searcher.doc(mltid)); explains.add(key, searcher.explain(mltHelper.getRealMLTQuery(), mltid)); @@ -410,15 +410,15 @@ NamedList getMoreLikeThese(ResponseBuilder rb, } if (interestingTermsResponse != null) { - if (termStyle == MoreLikeThisParams.TermStyle.DETAILS) { - NamedList interestingTermsWithScore = new NamedList<>(); + if (interestingTermsConfig == MoreLikeThisParams.TermStyle.DETAILS) { + SimpleOrderedMap interestingTermsWithScore = new SimpleOrderedMap<>(); for (MoreLikeThisHandler.InterestingTerm interestingTerm : interestingTerms) { interestingTermsWithScore.add(interestingTerm.term.toString(), interestingTerm.boost); } interestingTermsResponse.add(name, interestingTermsWithScore); } else { List interestingTermsString = new ArrayList<>(interestingTerms.size()); - for(MoreLikeThisHandler.InterestingTerm interestingTerm : interestingTerms){ + for (MoreLikeThisHandler.InterestingTerm interestingTerm : interestingTerms) { interestingTermsString.add(interestingTerm.term.toString()); } interestingTermsResponse.add(name, interestingTermsString);