From 4cc6731adfd1d697d528c0963765fa27a5ca0e6a Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 31 May 2018 16:39:15 +0100 Subject: [PATCH 1/7] [LUCENE-6687] not necessary nested for loop removed for terms retrieval in More Like This --- .../lucene/queries/mlt/MoreLikeThis.java | 16 +++++------ .../lucene/queries/mlt/TestMoreLikeThis.java | 28 +++++++++++++++++++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java index 8ea3933eec8a..89affb1b0f14 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java @@ -763,15 +763,13 @@ private PriorityQueue retrieveTerms(Map> f IOException { Map> field2termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { - for (String field : field2fieldValues.keySet()) { - Collection fieldValues = field2fieldValues.get(field); - if(fieldValues == null) - continue; - for(Object fieldValue:fieldValues) { - if (fieldValue != null) { - addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap, - fieldName); - } + Collection fieldValues = field2fieldValues.get(fieldName); + if (fieldValues == null) + continue; + for (Object fieldValue : fieldValues) { + if (fieldValue != null) { + addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap, + fieldName); } } } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 32a610bf8a93..eadfc843e2fb 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -42,6 +42,8 @@ import org.apache.lucene.store.Directory; import 
org.apache.lucene.util.LuceneTestCase; +import static org.hamcrest.core.Is.is; + public class TestMoreLikeThis extends LuceneTestCase { private static final String SHOP_TYPE = "type"; @@ -186,6 +188,32 @@ public void testMultiValues() throws Exception { analyzer.close(); } + public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryAccordingToCorrectTermFrequencies() throws Exception { + MoreLikeThis mlt = new MoreLikeThis(reader); + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + mlt.setAnalyzer(analyzer); + mlt.setMinDocFreq(0); + mlt.setMinTermFreq(3); + mlt.setMinWordLen(1); + String sampleField1 = "text"; + String sampleField2 = "text2"; + mlt.setFieldNames(new String[]{sampleField1, sampleField2}); + + Map> filteredDocument = new HashMap<>(); + String textValue = "apache apache lucene lucene lucene"; + filteredDocument.put(sampleField1, Arrays.asList(textValue)); + filteredDocument.put(sampleField2, Arrays.asList(textValue)); + + BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); + Collection clauses = query.clauses(); + assertEquals("Expected 1 clauses only!", 1, clauses.size()); + for (BooleanClause clause : clauses) { + Term term = ((TermQuery) clause.getQuery()).getTerm(); + assertThat(term, is(new Term(sampleField1, "lucene"))); + } + analyzer.close(); + } + // just basic equals/hashcode etc public void testMoreLikeThisQuery() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); From 47d825be84f1292d521b4fe078853811227ec292 Mon Sep 17 00:00:00 2001 From: Florian Buetow Date: Tue, 9 Oct 2018 15:07:44 +0100 Subject: [PATCH 2/7] [LUCENE-6687] Adding an additional test to verify that term frequencies are ignored from field names that have not been set. 
--- .../lucene/queries/mlt/TestMoreLikeThis.java | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index eadfc843e2fb..b42f58235b09 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -214,6 +214,29 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryAccordingToC analyzer.close(); } + public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithCorrectTerms() throws Exception { + MoreLikeThis mlt = new MoreLikeThis(reader); + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + mlt.setAnalyzer(analyzer); + mlt.setMinDocFreq(0); + mlt.setMinTermFreq(3); + mlt.setMinWordLen(1); + String sampleField1 = "text"; + String sampleField2 = "text2"; + mlt.setFieldNames(new String[]{sampleField1}); + + Map> filteredDocument = new HashMap<>(); + String textValue1 = "apache apache lucene lucene"; + String textValue2 = "apache2 apache2 lucene2 lucene2 lucene2"; + filteredDocument.put(sampleField1, Arrays.asList(textValue1)); + filteredDocument.put(sampleField2, Arrays.asList(textValue2)); + + BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); + Collection clauses = query.clauses(); + assertEquals("Expected 0 clauses only!", 0, clauses.size()); + analyzer.close(); + } + // just basic equals/hashcode etc public void testMoreLikeThisQuery() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); From 15485ea40ab6c2e7931c68d35cc94ca437ee9365 Mon Sep 17 00:00:00 2001 From: Florian Buetow Date: Tue, 9 Oct 2018 15:14:35 +0100 Subject: [PATCH 3/7] [LUCENE-6687] Refactoring addDoc method and increasing document frequencies in the test index --- .../lucene/queries/mlt/TestMoreLikeThis.java | 29 +++++++++++++------ 
1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index b42f58235b09..8d385330332f 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -61,10 +61,21 @@ public void setUp() throws Exception { RandomIndexWriter writer = new RandomIndexWriter(random(), directory); // Add series of docs with specific information for MoreLikeThis - addDoc(writer, "lucene"); - addDoc(writer, "lucene release"); - addDoc(writer, "apache"); - addDoc(writer, "apache lucene"); + addDoc(writer, "text", "lucene"); + addDoc(writer, "text", "lucene release"); + addDoc(writer, "text", "apache"); + addDoc(writer, "text", "apache lucene"); + + // one more time to increase the doc frequencies + addDoc(writer, "text","lucene2"); + addDoc(writer, "text", "lucene2 release2"); + addDoc(writer, "text", "apache2"); + addDoc(writer, "text", "apache2 lucene2"); + + addDoc(writer, "text2","lucene2"); + addDoc(writer, "text2", "lucene2 release2"); + addDoc(writer, "text2", "apache2"); + addDoc(writer, "text2", "apache2 lucene2"); reader = writer.getReader(); writer.close(); @@ -78,16 +89,16 @@ public void tearDown() throws Exception { super.tearDown(); } - private void addDoc(RandomIndexWriter writer, String text) throws IOException { + private void addDoc(RandomIndexWriter writer, String fieldName, String text) throws IOException { Document doc = new Document(); - doc.add(newTextField("text", text, Field.Store.YES)); + doc.add(newTextField(fieldName, text, Field.Store.YES)); writer.addDocument(doc); } - private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException { + private void addDoc(RandomIndexWriter writer, String fieldName, String[] texts) throws IOException { Document doc = new Document(); for 
(String text : texts) { - doc.add(newTextField("text", text, Field.Store.YES)); + doc.add(newTextField(fieldName, text, Field.Store.YES)); } writer.addDocument(doc); } @@ -253,7 +264,7 @@ public void testTopN() throws Exception { Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir); for (int i = 0; i < numDocs; i++) { - addDoc(writer, generateStrSeq(0, i + 1)); + addDoc(writer, "text", generateStrSeq(0, i + 1)); } IndexReader reader = writer.getReader(); writer.close(); From a2677e5b77ea045239e6b838e5822d96e8e37da7 Mon Sep 17 00:00:00 2001 From: Florian Buetow Date: Tue, 9 Oct 2018 15:31:31 +0100 Subject: [PATCH 4/7] [LUCENE-6687] Adding a comment. --- .../src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 8d385330332f..edb2ed500b02 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -244,6 +244,7 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithCorrectT BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); Collection clauses = query.clauses(); + // Because "text2" was not added to the list of field names, we don't expect any results even though the min tf for field "text2" is above the threshold assertEquals("Expected 0 clauses only!", 0, clauses.size()); analyzer.close(); } From c27b398ebd2da43381e9eceb7f7d036951f690ec Mon Sep 17 00:00:00 2001 From: Florian Buetow Date: Tue, 9 Oct 2018 16:08:15 +0100 Subject: [PATCH 5/7] [LUCENE-6687] Adding a test to verify that MLT queries are built with specified field names only. 
--- .../lucene/queries/mlt/TestMoreLikeThis.java | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index edb2ed500b02..8de5b3d459f6 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; import org.apache.lucene.analysis.Analyzer; @@ -244,8 +245,57 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithCorrectT BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); Collection clauses = query.clauses(); - // Because "text2" was not added to the list of field names, we don't expect any results even though the min tf for field "text2" is above the threshold + + HashSet unexpectedTerms = new HashSet<>(); + unexpectedTerms.add(new Term("text", "apache")); + unexpectedTerms.add(new Term("text", "lucene")); + unexpectedTerms.add(new Term("text", "apache2")); + unexpectedTerms.add(new Term("text", "lucene2")); + + for (BooleanClause clause : clauses) { + Term term = ((TermQuery) clause.getQuery()).getTerm(); + assertFalse("Unexpected term '" + term + "' found in query terms", unexpectedTerms.contains(term)); + } + assertEquals("Expected 0 clauses only!", 0, clauses.size()); + + analyzer.close(); + } + + public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithSpecifiedFieldnamesOnly() throws Exception { + MoreLikeThis mlt = new MoreLikeThis(reader); + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + mlt.setAnalyzer(analyzer); + mlt.setMinDocFreq(1); + mlt.setMinTermFreq(2); + mlt.setMinWordLen(1); + String sampleField1 = "text"; + String sampleField2 = 
"text2"; + mlt.setFieldNames(new String[]{sampleField1}); + + Map> filteredDocument = new HashMap<>(); + String textValue1 = "apache apache lucene lucene"; + String textValue2 = "apache2 apache2 lucene2 lucene2 lucene2"; + filteredDocument.put(sampleField1, Arrays.asList(textValue1)); + filteredDocument.put(sampleField2, Arrays.asList(textValue2)); + + BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); + Collection clauses = query.clauses(); + assertEquals("Expected 2 clauses only!", 2, clauses.size()); + + HashSet expectedTerms = new HashSet<>(); + expectedTerms.add(new Term("text", "apache")); + expectedTerms.add(new Term("text", "lucene")); + + HashSet unexpectedTerms = new HashSet<>(); + unexpectedTerms.add(new Term("text", "apache2")); + unexpectedTerms.add(new Term("text", "lucene2")); + + for (BooleanClause clause : clauses) { + Term term = ((TermQuery) clause.getQuery()).getTerm(); + assertTrue("Unexpected term '" + term + "' found in query terms", expectedTerms.contains(term)); + assertFalse("Expected term '" + term + "' not found in query terms", unexpectedTerms.contains(term)); + } analyzer.close(); } From 1104331066f9eec07c4224015e15f8c47f3455f9 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Tue, 9 Oct 2018 16:35:18 +0100 Subject: [PATCH 6/7] [LUCENE-6687]Tests assertions fix --- .../lucene/queries/mlt/TestMoreLikeThis.java | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 8de5b3d459f6..232889c88c18 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -247,11 +247,12 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithCorrectT Collection clauses = query.clauses(); HashSet unexpectedTerms = new 
HashSet<>(); - unexpectedTerms.add(new Term("text", "apache")); - unexpectedTerms.add(new Term("text", "lucene")); - unexpectedTerms.add(new Term("text", "apache2")); - unexpectedTerms.add(new Term("text", "lucene2")); + unexpectedTerms.add(new Term("text", "apache"));//Term Frequency < Minimum Accepted Term Frequency + unexpectedTerms.add(new Term("text", "lucene"));//Term Frequency < Minimum Accepted Term Frequency + unexpectedTerms.add(new Term("text", "apache2"));//Term Frequency < Minimum Accepted Term Frequency + unexpectedTerms.add(new Term("text", "lucene2"));//Wrong Field + //None of the Not Expected terms is in the query for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); assertFalse("Unexpected term '" + term + "' found in query terms", unexpectedTerms.contains(term)); @@ -262,7 +263,7 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithCorrectT analyzer.close(); } - public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithSpecifiedFieldnamesOnly() throws Exception { + public void testLiveMapDocument_queryFieldsSet_shouldBuildQueryFromSpecifiedFieldnamesOnly() throws Exception { MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); mlt.setAnalyzer(analyzer); @@ -281,6 +282,11 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithSpecifie BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); Collection clauses = query.clauses(); + HashSet clausesTerms = new HashSet<>(); + for (BooleanClause clause : clauses) { + Term term = ((TermQuery) clause.getQuery()).getTerm(); + clausesTerms.add(term); + } assertEquals("Expected 2 clauses only!", 2, clauses.size()); HashSet expectedTerms = new HashSet<>(); @@ -291,11 +297,17 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithSpecifie unexpectedTerms.add(new Term("text", "apache2")); unexpectedTerms.add(new 
Term("text", "lucene2")); + //None of the Not Expected terms is in the query for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); - assertTrue("Unexpected term '" + term + "' found in query terms", expectedTerms.contains(term)); - assertFalse("Expected term '" + term + "' not found in query terms", unexpectedTerms.contains(term)); + assertFalse("Unexpected term '" + term + "' found in query terms", unexpectedTerms.contains(term)); + } + + //All of the Expected terms are in the query + for (Term expectedTerm : expectedTerms) { + assertTrue("Expected term '" + expectedTerm + "' is not found in query terms", clausesTerms.contains(expectedTerm)); } + analyzer.close(); } From e518a8089333a1d51380e6d90cd3d39601d1ff63 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Sun, 27 Jan 2019 16:14:37 +0000 Subject: [PATCH 7/7] [LUCENE-6687] test refactor --- .../lucene/queries/mlt/TestMoreLikeThis.java | 158 ++++++++---------- 1 file changed, 66 insertions(+), 92 deletions(-) diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 9b68a71146f9..4a60015c4850 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -50,10 +50,15 @@ public class TestMoreLikeThis extends LuceneTestCase { private static final String SHOP_TYPE = "type"; private static final String FOR_SALE = "weSell"; private static final String NOT_FOR_SALE = "weDontSell"; + private static final int MIN_DOC_FREQ = 0; + private static final int MIN_WORD_LEN = 1; + private static final int MIN_TERM_FREQ = 1; private Directory directory; private IndexReader reader; private IndexSearcher searcher; + private MoreLikeThis mlt; + private Analyzer analyzer; @Override public void setUp() throws Exception { @@ -81,10 +86,24 @@ public void setUp() 
throws Exception { reader = writer.getReader(); writer.close(); searcher = newSearcher(reader); + analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + mlt = this.getDefaultMoreLikeThis(reader); } + + private MoreLikeThis getDefaultMoreLikeThis(IndexReader reader) { + MoreLikeThis mlt = new MoreLikeThis(reader); + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + mlt.setAnalyzer(analyzer); + mlt.setMinDocFreq(MIN_DOC_FREQ); + mlt.setMinTermFreq(MIN_TERM_FREQ); + mlt.setMinWordLen(MIN_WORD_LEN); + return mlt; + } + @Override public void tearDown() throws Exception { + analyzer.close(); reader.close(); directory.close(); super.tearDown(); @@ -106,13 +125,6 @@ private void addDoc(RandomIndexWriter writer, String fieldName, String[] texts) public void testBoostFactor() throws Throwable { Map originalValues = getOriginalValues(); - - MoreLikeThis mlt = new MoreLikeThis(reader); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); mlt.setBoost(true); @@ -139,17 +151,10 @@ public void testBoostFactor() throws Throwable { + tq.getTerm().text() + "' got " + bq.getBoost(), totalBoost, bq .getBoost(), 0.0001); } - analyzer.close(); } private Map getOriginalValues() throws IOException { Map originalValues = new HashMap<>(); - MoreLikeThis mlt = new MoreLikeThis(reader); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); mlt.setBoost(true); BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader( @@ -161,33 +166,21 @@ private Map getOriginalValues() throws IOException { TermQuery tq = (TermQuery) bq.getQuery(); originalValues.put(tq.getTerm().text(), bq.getBoost()); } 
- analyzer.close(); return originalValues; } // LUCENE-3326 public void testMultiFields() throws Exception { - MoreLikeThis mlt = new MoreLikeThis(reader); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text", "foobar"}); mlt.like("foobar", new StringReader("this is a test")); - analyzer.close(); } // LUCENE-5725 public void testMultiValues() throws Exception { - MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); - + BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene"), new StringReader("lucene release"), new StringReader("apache"), new StringReader("apache lucene")); @@ -197,60 +190,51 @@ public void testMultiValues() throws Exception { Term term = ((TermQuery) clause.getQuery()).getTerm(); assertTrue(Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term)); } - analyzer.close(); } - public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryAccordingToCorrectTermFrequencies() throws Exception { - MoreLikeThis mlt = new MoreLikeThis(reader); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(0); + public void testSeedDocumentMap_minTermFrequencySet_shouldBuildQueryAccordingToCorrectTermFrequencies() throws Exception { mlt.setMinTermFreq(3); - mlt.setMinWordLen(1); - String sampleField1 = "text"; - String sampleField2 = "text2"; - mlt.setFieldNames(new String[]{sampleField1, sampleField2}); - Map> filteredDocument = new HashMap<>(); + String mltField1 = "text"; + String mltField2 = "text2"; + mlt.setFieldNames(new String[]{mltField1, 
mltField2}); + + Map> seedDocument = new HashMap<>(); String textValue = "apache apache lucene lucene lucene"; - filteredDocument.put(sampleField1, Arrays.asList(textValue)); - filteredDocument.put(sampleField2, Arrays.asList(textValue)); + seedDocument.put(mltField1, Arrays.asList(textValue)); + seedDocument.put(mltField2, Arrays.asList(textValue)); - BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); + BooleanQuery query = (BooleanQuery) mlt.like(seedDocument); Collection clauses = query.clauses(); assertEquals("Expected 1 clauses only!", 1, clauses.size()); for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); - assertThat(term, is(new Term(sampleField1, "lucene"))); + assertThat(term, is(new Term(mltField1, "lucene"))); } analyzer.close(); } - public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithCorrectTerms() throws Exception { - MoreLikeThis mlt = new MoreLikeThis(reader); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(0); + public void testSeedDocumentMap_minTermFrequencySetMltFieldSet_shouldBuildQueryAccordingToCorrectTermFrequenciesAndField() throws Exception { mlt.setMinTermFreq(3); - mlt.setMinWordLen(1); - String sampleField1 = "text"; - String sampleField2 = "text2"; - mlt.setFieldNames(new String[]{sampleField1}); - Map> filteredDocument = new HashMap<>(); + String mltField = "text"; + mlt.setFieldNames(new String[]{mltField}); + + Map> seedDocument = new HashMap<>(); + String sampleField2 = "text2"; String textValue1 = "apache apache lucene lucene"; String textValue2 = "apache2 apache2 lucene2 lucene2 lucene2"; - filteredDocument.put(sampleField1, Arrays.asList(textValue1)); - filteredDocument.put(sampleField2, Arrays.asList(textValue2)); + seedDocument.put(mltField, Arrays.asList(textValue1)); + seedDocument.put(sampleField2, Arrays.asList(textValue2)); - BooleanQuery query = (BooleanQuery) 
mlt.like(filteredDocument); + BooleanQuery query = (BooleanQuery) mlt.like(seedDocument); Collection clauses = query.clauses(); HashSet unexpectedTerms = new HashSet<>(); - unexpectedTerms.add(new Term("text", "apache"));//Term Frequency < Minimum Accepted Term Frequency - unexpectedTerms.add(new Term("text", "lucene"));//Term Frequency < Minimum Accepted Term Frequency - unexpectedTerms.add(new Term("text", "apache2"));//Term Frequency < Minimum Accepted Term Frequency - unexpectedTerms.add(new Term("text", "lucene2"));//Wrong Field + unexpectedTerms.add(new Term(mltField, "apache"));//Term Frequency < Minimum Accepted Term Frequency + unexpectedTerms.add(new Term(mltField, "lucene"));//Term Frequency < Minimum Accepted Term Frequency + unexpectedTerms.add(new Term(mltField, "apache2"));//Term Frequency < Minimum Accepted Term Frequency + unexpectedTerms.add(new Term(mltField, "lucene2"));//Wrong Field //None of the Not Expected terms is in the query for (BooleanClause clause : clauses) { @@ -263,39 +247,39 @@ public void testLiveMapDocument_minTermFrequencySet_shouldBuildQueryWithCorrectT analyzer.close(); } - public void testLiveMapDocument_queryFieldsSet_shouldBuildQueryFromSpecifiedFieldnamesOnly() throws Exception { - MoreLikeThis mlt = new MoreLikeThis(reader); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); - mlt.setMinDocFreq(1); + public void testSeedDocumentMap_queryFieldsSet_shouldBuildQueryFromSpecifiedFieldnamesOnly() throws Exception { mlt.setMinTermFreq(2); - mlt.setMinWordLen(1); - String sampleField1 = "text"; - String sampleField2 = "text2"; - mlt.setFieldNames(new String[]{sampleField1}); - Map> filteredDocument = new HashMap<>(); + String mltField = "text"; + + mlt.setFieldNames(new String[]{mltField}); + + Map> seedDocument = new HashMap<>(); + String notMltField = "text2"; String textValue1 = "apache apache lucene lucene"; String textValue2 = "apache2 apache2 lucene2 lucene2 
lucene2"; - filteredDocument.put(sampleField1, Arrays.asList(textValue1)); - filteredDocument.put(sampleField2, Arrays.asList(textValue2)); + seedDocument.put(mltField, Arrays.asList(textValue1)); + seedDocument.put(notMltField, Arrays.asList(textValue2)); - BooleanQuery query = (BooleanQuery) mlt.like(filteredDocument); + HashSet expectedTerms = new HashSet<>(); + expectedTerms.add(new Term(mltField, "apache")); + expectedTerms.add(new Term(mltField, "lucene")); + + HashSet unexpectedTerms = new HashSet<>(); + unexpectedTerms.add(new Term(mltField, "apache2")); + unexpectedTerms.add(new Term(mltField, "lucene2")); + unexpectedTerms.add(new Term(notMltField, "apache2")); + unexpectedTerms.add(new Term(notMltField, "lucene2")); + + BooleanQuery query = (BooleanQuery) mlt.like(seedDocument); Collection clauses = query.clauses(); HashSet clausesTerms = new HashSet<>(); for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); clausesTerms.add(term); } - assertEquals("Expected 2 clauses only!", 2, clauses.size()); - - HashSet expectedTerms = new HashSet<>(); - expectedTerms.add(new Term("text", "apache")); - expectedTerms.add(new Term("text", "lucene")); - HashSet unexpectedTerms = new HashSet<>(); - unexpectedTerms.add(new Term("text", "apache2")); - unexpectedTerms.add(new Term("text", "lucene2")); + assertEquals("Expected 2 clauses only!", 2, clauses.size()); //None of the Not Expected terms is in the query for (BooleanClause clause : clauses) { @@ -308,7 +292,6 @@ public void testLiveMapDocument_queryFieldsSet_shouldBuildQueryFromSpecifiedFiel assertTrue("Expected term '" + expectedTerm + "' is not found in query terms", clausesTerms.contains(expectedTerm)); } - analyzer.close(); } // just basic equals/hashcode etc @@ -333,13 +316,8 @@ public void testTopN() throws Exception { writer.close(); // setup MLT query - MoreLikeThis mlt = new MoreLikeThis(reader); - Analyzer analyzer = new MockAnalyzer(random(), 
MockTokenizer.WHITESPACE, false); - mlt.setAnalyzer(analyzer); + mlt = this.getDefaultMoreLikeThis(reader); mlt.setMaxQueryTerms(topN); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); mlt.setFieldNames(new String[]{"text"}); // perform MLT query @@ -418,13 +396,9 @@ public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception { writer.close(); // setup MLT query - MoreLikeThis mlt = new MoreLikeThis(reader); + MoreLikeThis mlt = this.getDefaultMoreLikeThis(reader); - mlt.setAnalyzer(analyzer); mlt.setMaxQueryTerms(maxQueryTerms); - mlt.setMinDocFreq(1); - mlt.setMinTermFreq(1); - mlt.setMinWordLen(1); mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE}); // perform MLT query