From 841212121194f126e5b19ca79a0d1ce41a4d1d45 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Mon, 16 May 2016 19:26:33 -0700 Subject: [PATCH 1/8] Enabling Term Vectors --- .../uci/ics/textdb/common/utils/Utils.java | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java index f175e7e00e1..8fb8dc5b43a 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java @@ -14,6 +14,7 @@ import org.apache.lucene.document.DateTools.Resolution; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.IndexOptions; import edu.uci.ics.textdb.api.common.Attribute; import edu.uci.ics.textdb.api.common.FieldType; @@ -49,7 +50,7 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par case TEXT: field = new TextField(fieldValue); break; - + default: break; } @@ -57,10 +58,10 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par } public static IndexableField getLuceneField(FieldType fieldType, - String fieldName, Object fieldValue) { + String fieldName, Object fieldValue) { IndexableField luceneField = null; switch(fieldType){ - case STRING: + case STRING: luceneField = new org.apache.lucene.document.StringField( fieldName, (String) fieldValue, Store.YES); break; @@ -78,10 +79,20 @@ public static IndexableField getLuceneField(FieldType fieldType, luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES); break; case TEXT: - luceneField = new org.apache.lucene.document.TextField( - fieldName, (String) fieldValue, Store.YES); - break; - + org.apache.lucene.document.FieldType luceneFieldType = new 
org.apache.lucene.document.FieldType(); + luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS ); + luceneFieldType.setStored(true); + luceneFieldType.setStoreTermVectors( true ); + luceneFieldType.setStoreTermVectorOffsets( true ); + luceneFieldType.setStoreTermVectorPayloads( true ); + luceneFieldType.setStoreTermVectorPositions( true ); + luceneFieldType.setTokenized( true ); + + luceneField = new org.apache.lucene.document.Field( + fieldName,(String) fieldValue,luceneFieldType); + + break; + } return luceneField; } @@ -96,10 +107,10 @@ public static ITuple getSpanTuple( List fieldList, List spanList, IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]); return new DataTuple(spanSchema, fieldsDuplicate); } - + /** - * - * @param schema + * + * @param schema * @about Creating a new schema object, and adding SPAN_LIST_ATTRIBUTE to * the schema. SPAN_LIST_ATTRIBUTE is of type List */ From 9d6ced4daff332e11bdfcb87e285adc4334da8c7 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Tue, 17 May 2016 15:04:14 -0700 Subject: [PATCH 2/8] Adding support for Position information in Data Reader --- .../ics/textdb/common/field/ListField.java | 2 +- .../edu/uci/ics/textdb/common/field/Span.java | 21 ++- .../uci/ics/textdb/common/utils/Utils.java | 6 +- .../dataflow/common/KeywordPredicate.java | 2 +- .../dataflow/keywordmatch/KeywordMatcher.java | 126 +++++++----------- .../DictionaryMatcherTest.java | 3 +- .../keywordmatch/KeywordMatcherTest.java | 10 +- .../neextractor/NamedEntityExtractorTest.java | 4 +- .../regexmatch/RegexMatcherTestHelper.java | 5 +- .../source/IndexBasedSourceOperatorTest.java | 5 +- .../source/ScanBasedSourceOperatorTest.java | 3 +- .../textdb/storage/DataReaderPredicate.java | 25 +++- .../ics/textdb/storage/reader/DataReader.java | 89 ++++++++++++- .../storage/DataReaderPredicateTest.java | 4 +- .../textdb/storage/DataWriterReaderTest.java | 6 +- 15 files changed, 205 
insertions(+), 106 deletions(-) diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java index 5c538ca6df3..beb54f805be 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java @@ -38,7 +38,7 @@ public boolean equals(Object obj) { if (list == null) { if (other.list != null) return false; - } else if (!list.equals(other.list)) + } else if (!list.containsAll(other.list)) return false; return true; } diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java index 173fea94fbe..455d9284a3e 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java @@ -11,14 +11,21 @@ public class Span { private String key; //The value matching the key private String value; - - - public Span(String fieldName, int start, int end, String key, String value) { + // The token position of the sapn + private int tokenOffset; + + public Span(String fieldName, int start, int end, String key, String value){ this.fieldName = fieldName; this.start = start; this.end = end; this.key = key; this.value = value; + this.tokenOffset = -1; + } + + public Span(String fieldName, int start, int end, String key, String value, int tokenOffset) { + this(fieldName, start, end, key, value); + this.tokenOffset = tokenOffset; } public String getFieldName() { @@ -41,6 +48,8 @@ public int getEnd() { return end; } + public int getTokenOffset(){return tokenOffset;} + @Override public int hashCode() { final int prime = 31; @@ -51,6 +60,7 @@ public int hashCode() { result = prime * result + ((key == null) ? 
0 : key.hashCode()); result = prime * result + start; result = prime * result + ((value == null) ? 0 : value.hashCode()); + result = prime * result + tokenOffset; return result; } @@ -87,7 +97,10 @@ public boolean equals(Object obj) { return false; } else if (!value.equals(other.value)) return false; - + + if(tokenOffset!= other.tokenOffset) + return false; + return true; } } diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java index 8fb8dc5b43a..9bb1fb93d51 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java @@ -141,8 +141,10 @@ public static ArrayList tokenizeQuery(Analyzer analyzer, String query) { try{ tokenStream.reset(); while (tokenStream.incrementToken()) { - String term = charTermAttribute.toString(); - resultSet.add(term); + String token = charTermAttribute.toString(); + int tokenIndex = query.toLowerCase().indexOf(token); + String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length()); + resultSet.add(actualQueryToken); } tokenStream.close(); } catch (Exception e) { diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java index fc664e7aa5d..590d23fcc88 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java @@ -135,7 +135,7 @@ public Analyzer getAnalyzer(){ } public DataReaderPredicate getDataReaderPredicate() { - DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery); + DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, 
this.luceneQuery, this.query, this.analyzer,this.attributeList); return dataReaderPredicate; } diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java index e384a19a5ca..2366d047945 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java @@ -13,7 +13,9 @@ import edu.uci.ics.textdb.api.common.Schema; import edu.uci.ics.textdb.api.dataflow.IOperator; import edu.uci.ics.textdb.api.dataflow.ISourceOperator; +import edu.uci.ics.textdb.common.constants.SchemaConstants; import edu.uci.ics.textdb.common.exception.DataFlowException; +import edu.uci.ics.textdb.common.field.ListField; import edu.uci.ics.textdb.common.field.Span; import edu.uci.ics.textdb.common.field.StringField; import edu.uci.ics.textdb.common.field.TextField; @@ -30,19 +32,14 @@ public class KeywordMatcher implements IOperator { private final KeywordPredicate predicate; private ISourceOperator sourceOperator; - private List tokenPatternList; - private List spanList; - private List tempSpanList; private String query; private List attributeList; private List queryTokens; - private Set setOfQueryTokens; - private boolean spanSchemaDefined = false; - private Schema spanSchema; public KeywordMatcher(IPredicate predicate) { this.predicate = (KeywordPredicate)predicate; DataReaderPredicate dataReaderPredicate = this.predicate.getDataReaderPredicate(); + dataReaderPredicate.setIsSpanInformationAdded(true); this.sourceOperator = new IndexBasedSourceOperator(dataReaderPredicate); } @@ -53,17 +50,6 @@ public void open() throws DataFlowException { query = predicate.getQuery(); attributeList = predicate.getAttributeList(); queryTokens = predicate.getTokens(); - setOfQueryTokens = new HashSet<>(queryTokens); - 
tokenPatternList = new ArrayList(); - Pattern pattern; - String regex; - for(String token : queryTokens){ - regex = "\\b" + token.toLowerCase() + "\\b"; - pattern = Pattern.compile(regex); - tokenPatternList.add(pattern); - } - spanList = new ArrayList<>(); - tempSpanList = new ArrayList<>(); } catch (Exception e) { e.printStackTrace(); @@ -106,70 +92,55 @@ public void open() throws DataFlowException { @Override public ITuple getNextTuple() throws DataFlowException { - List fieldList; - Set setOfFoundTokens = new HashSet<>(); try { ITuple sourceTuple = sourceOperator.getNextTuple(); if(sourceTuple == null){ return null; } - fieldList = sourceTuple.getFields(); - spanList.clear(); - if(!spanSchemaDefined){ - Schema schema = sourceTuple.getSchema(); - spanSchema = Utils.createSpanSchema(schema); - spanSchemaDefined = true; - } - for(int attributeIndex = 0; attributeIndex < attributeList.size(); attributeIndex++){ - IField field = sourceTuple.getField(attributeList.get(attributeIndex).getFieldName()); - String fieldValue = (String) (field).getValue(); - String fieldName; - int positionIndex = 0; // Next position in the field to be checked. - int spanStartPosition; // Starting position of the matched query - if(field instanceof StringField){ + +// ITuple DataTuple = sourceTuple. 
+ + int schemaIndex = sourceTuple.getSchema().getIndex(SchemaConstants.SPAN_LIST_ATTRIBUTE.getFieldName()); + List spanList = + (List)sourceTuple.getField(schemaIndex).getValue(); + + for(int attributeIndex = 0; attributeIndex < attributeList.size(); attributeIndex++) { + String fieldName = attributeList.get(attributeIndex).getFieldName(); + IField field = sourceTuple.getField(fieldName); + if (!(field instanceof TextField)) { + + String fieldValue = (String) (field).getValue(); + //Keyword should match fieldValue entirely - if(fieldValue.equalsIgnoreCase(query)){ - spanStartPosition = 0; - positionIndex = query.length(); - fieldName = attributeList.get(attributeIndex).getFieldName(); - addSpanToSpanList(fieldName, spanStartPosition, positionIndex, query, fieldValue); + if (fieldValue.equals(query)) { + Span span = new Span(fieldName, 0, query.length(), query, fieldValue); + spanList.add(span); } - } - else if(field instanceof TextField) { - //Each element of Array of keywords is matched in tokenized TextField Value - for(int iter = 0; iter < queryTokens.size(); iter++) { - positionIndex = 0; - String queryToken = queryTokens.get(iter); - //Ex: For keyword lin it obtains pattern like /blin/b which matches keywords at boundary - Pattern tokenPattern = tokenPatternList.get(iter); - Matcher matcher = tokenPattern.matcher(fieldValue.toLowerCase()); - while (matcher.find(positionIndex) != false) { - spanStartPosition = matcher.start(); - positionIndex = spanStartPosition + queryToken.length(); - String documentValue = fieldValue.substring(spanStartPosition, positionIndex); - fieldName = attributeList.get(attributeIndex).getFieldName(); - String actualQueryToken = query.substring(query.toLowerCase().indexOf(queryToken), query.toLowerCase().indexOf(queryToken)+queryToken.length()); - addSpanToTempSpanList(fieldName, spanStartPosition, positionIndex, actualQueryToken, documentValue); - setOfFoundTokens.add(queryToken); + } else { + // Check if all the tokens are present in 
that field, + // if any of the token is missing, remove all the span information for that field. + + boolean[] tokensPresent = new boolean[queryTokens.size()]; + + List spanForThisField = new ArrayList<>(); + + for (Span span : spanList) { + if (span.getFieldName().equals(fieldName)) { + spanForThisField.add(span); + if (queryTokens.contains(span.getKey())) + tokensPresent[queryTokens.indexOf(span.getKey())] = true; } } + + boolean allTokenPresent = areAllTrue(tokensPresent); + + if (!allTokenPresent) { + spanList.removeAll(spanForThisField); + } } - if (setOfFoundTokens.equals(setOfQueryTokens)){ - spanList.addAll(tempSpanList); - } - tempSpanList.clear(); } - //If all the 'attributes to be searched' have been processed return the result tuple with span info - //if (foundFlag || setOfFoundTokens.equals(setOfQueryTokens)){ - if(spanList.size()>0){ - return Utils.getSpanTuple(fieldList, spanList, spanSchema); - } - //Search next document if the required predicate did not match previous document - else{ - spanList.clear(); - return getNextTuple(); - } + return sourceTuple; } catch (Exception e) { e.printStackTrace(); @@ -178,17 +149,6 @@ else if(field instanceof TextField) { } - private void addSpanToSpanList(String fieldName, int start, int end, String key, String value) { - Span span = new Span(fieldName, start, end, key, value); - spanList.add(span); - } - - private void addSpanToTempSpanList(String fieldName, int start, int end, String key, String value) { - Span span = new Span(fieldName, start, end, key, value); - tempSpanList.add(span); - } - - @Override public void close() throws DataFlowException { try { @@ -198,4 +158,10 @@ public void close() throws DataFlowException { throw new DataFlowException(e.getMessage(), e); } } + + public static boolean areAllTrue(boolean[] array) + { + for(boolean b : array) if(!b) return false; + return true; + } } \ No newline at end of file diff --git 
a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java index e2f2db61d88..4da4203ba36 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java @@ -64,7 +64,8 @@ public void setUp() throws Exception { dataWriter = new DataWriter(dataStore, analyzer); QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery); + dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY, + analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); dataReader = new DataReader(dataReaderPredicate); dataWriter.clearData(); dataWriter.writeData(TestConstants.getSamplePeopleTuples()); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java index 3ed8043657b..896f15962f2 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java @@ -179,7 +179,7 @@ public void testSingleWordQueryInTextField() throws Exception { //Prepare expected result list List list = new ArrayList<>(); - Span span = new Span("description", 0, 4, "TaLL", "Tall"); + Span span = new Span("description", 0, 4, "TaLL", "Tall",0); list.add(span); Attribute[] schemaAttributes = new Attribute[TestConstants.ATTRIBUTES_PEOPLE.length + 1]; @@ 
-274,10 +274,10 @@ public void testWordInMultipleFieldsQuery() throws Exception { //Prepare expected result list List list = new ArrayList<>(); Span span1 = new Span("lastName", 0, 11, "lin clooney", "lin clooney"); - Span span2 = new Span("description", 0, 3, "lin", "Lin"); - Span span3 = new Span("description", 25, 28, "lin", "lin"); - Span span4 = new Span("description", 4, 11, "clooney", "Clooney"); - Span span5 = new Span("description", 29, 36, "clooney", "clooney"); + Span span2 = new Span("description", 0, 3, "lin", "Lin",0); + Span span3 = new Span("description", 25, 28, "lin", "lin",5); + Span span4 = new Span("description", 4, 11, "clooney", "Clooney",1); + Span span5 = new Span("description", 29, 36, "clooney", "clooney",6); list.add(span1); list.add(span2); list.add(span3); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java index 829029e3eeb..26bebcf2b5f 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java @@ -1,6 +1,7 @@ package edu.uci.ics.textdb.dataflow.neextractor; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.Analyzer; @@ -167,7 +168,8 @@ public ISourceOperator getSourceOperator(Schema schema, List data) throw QueryParser queryParser = new QueryParser(NEExtractorTestConstants.ATTRIBUTES_ONE_SENTENCE.get(0).getFieldName(), analyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, + analyzer, Arrays.asList(NEExtractorTestConstants.ATTRIBUTES_ONE_SENTENCE.get(0))); 
dataReader = new DataReader(dataReaderPredicate); ISourceOperator sourceOperator = new ScanBasedSourceOperator(dataReader); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java index d7c33ae495c..59cb074f19b 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java @@ -1,6 +1,8 @@ package edu.uci.ics.textdb.dataflow.regexmatch; +import java.lang.reflect.Array; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.Analyzer; @@ -64,7 +66,8 @@ public void runTest(String regex, String fieldName) throws Exception { QueryParser queryParser = new QueryParser( TestConstants.FIRST_NAME, analyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, + analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); dataReader = new DataReader(dataReaderPredicate); IPredicate predicate = new RegexPredicate(regex, fieldName); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java index f70641a6c4a..56d88b16c35 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java @@ -4,8 +4,10 @@ package edu.uci.ics.textdb.dataflow.source; import java.util.ArrayList; +import java.util.Arrays; import 
java.util.List; +import edu.uci.ics.textdb.api.common.Attribute; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.queryparser.classic.ParseException; @@ -60,7 +62,8 @@ public void constructIndexBasedSourceOperator(String query) throws ParseExceptio String defaultField = TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(); QueryParser queryParser = new QueryParser(defaultField, analyzer); Query queryObject = queryParser.parse(query); - dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject); + dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject, query, analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); + indexBasedSourceOperator = new IndexBasedSourceOperator(dataReaderPredicate); } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java index 6366ce9673d..29532f5aeaa 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java @@ -5,6 +5,7 @@ import java.text.ParseException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.Analyzer; @@ -52,7 +53,7 @@ public void setUp() throws Exception{ QueryParser queryParser = new QueryParser( TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); dataReader = new DataReader(dataReaderPredicate); dataWriter.clearData(); diff --git 
a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java index 94888778eb3..e1bffb6d313 100644 --- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java @@ -1,20 +1,35 @@ package edu.uci.ics.textdb.storage; +import edu.uci.ics.textdb.api.common.Attribute; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.Query; import edu.uci.ics.textdb.api.common.IPredicate; import edu.uci.ics.textdb.api.storage.IDataStore; +import java.util.List; + /** * Created by sandeepreddy602 on 05-06-2016. */ public class DataReaderPredicate implements IPredicate { private IDataStore dataStore; private Query luceneQuery; + private String queryString; + private Analyzer analyzer; + private List attributeList; + private boolean isSpanInformationAdded = false; - public DataReaderPredicate(IDataStore dataStore, Query luceneQuery){ + public DataReaderPredicate(IDataStore dataStore, Query luceneQuery, String queryString, Analyzer analyzer, List attributeList){ this.dataStore = dataStore; this.luceneQuery = luceneQuery; + this.analyzer = analyzer; + this.queryString = queryString; + this.attributeList = attributeList; + } + + public void setIsSpanInformationAdded(boolean flag){ + isSpanInformationAdded = flag; } public IDataStore getDataStore() { @@ -24,4 +39,12 @@ public IDataStore getDataStore() { public Query getLuceneQuery() { return luceneQuery; } + + public String getQueryString(){return queryString;} + + public Analyzer getAnalyzer(){return analyzer;} + + public List getAttributeList(){return attributeList;} + + public boolean getIsSpanInformationAdded(){return isSpanInformationAdded;} } diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java 
b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java index 6f7266e43f6..c28d3c99f18 100644 --- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java @@ -8,6 +8,7 @@ import java.util.ArrayList; import java.util.List; +import edu.uci.ics.textdb.common.field.Span; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -16,6 +17,11 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.search.DocIdSetIterator; import edu.uci.ics.textdb.api.common.Attribute; import edu.uci.ics.textdb.api.common.FieldType; @@ -29,6 +35,7 @@ import edu.uci.ics.textdb.common.field.DataTuple; import edu.uci.ics.textdb.common.utils.Utils; import edu.uci.ics.textdb.storage.DataReaderPredicate; +import org.apache.lucene.util.packed.PackedLongValues; /** * @author sandeepreddy602 @@ -41,6 +48,11 @@ public class DataReader implements IDataReader{ private ScoreDoc[] scoreDocs; private IndexReader luceneIndexReader; private DataReaderPredicate dataReaderPredicate; + private ArrayList queryTokens; + private List attributeList; + private List queryTokensInBytesRef; + private Schema schema; + private Schema spanSchema; public DataReader(IPredicate dataReaderPredicate) { this.dataReaderPredicate = (DataReaderPredicate)dataReaderPredicate; @@ -59,6 +71,22 @@ public void open() throws DataFlowException { TopDocs topDocs = luceneIndexSearcher.search(dataReaderPredicate.getLuceneQuery(), Integer.MAX_VALUE); scoreDocs = topDocs.scoreDocs; cursor = OPENED; + + this.queryTokens = 
Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString()); + // sort the query token, as the term vector are also sorted. + // This makes the seek faster. + this.queryTokens.sort(String.CASE_INSENSITIVE_ORDER); + + this.queryTokensInBytesRef = new ArrayList<>(); + for(String token: queryTokens) { + BytesRef byteRef = new BytesRef(token.toLowerCase().getBytes()); + this.queryTokensInBytesRef.add(byteRef); + } + + this.attributeList = dataReaderPredicate.getAttributeList(); + this.schema = dataReaderPredicate.getDataStore().getSchema(); + this.spanSchema = Utils.createSpanSchema(schema); + } catch (IOException e) { e.printStackTrace(); throw new DataFlowException(e.getMessage(), e); @@ -74,18 +102,69 @@ public ITuple getNextTuple() throws DataFlowException { if(cursor >= scoreDocs.length){ return null; } - Document document = luceneIndexSearcher.doc(scoreDocs[cursor++].doc); - + Document document = luceneIndexSearcher.doc(scoreDocs[cursor].doc); + List spanList = new ArrayList<>(); List fields = new ArrayList(); - Schema schema = dataReaderPredicate.getDataStore().getSchema(); + for (Attribute attr : schema.getAttributes()) { FieldType fieldType = attr.getFieldType(); String fieldValue = document.get(attr.getFieldName()); fields.add(Utils.getField(fieldType, fieldValue)); } - - DataTuple dataTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()])); + + if(!dataReaderPredicate.getIsSpanInformationAdded()){ + cursor++; + DataTuple dTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()])); + return dTuple; + } + + for(Attribute attr: attributeList){ + + String fieldName = attr.getFieldName(); + Terms vector = luceneIndexReader.getTermVector(scoreDocs[cursor].doc,fieldName); + + if (vector != null) { + TermsEnum vectorEnum = vector.iterator(); + int queryTokenIndex = 0; + for(BytesRef term: queryTokensInBytesRef){ + + if(vectorEnum.seekExact(term)){ + System.out.println(term.utf8ToString()); + PostingsEnum 
postings = vectorEnum.postings(null, PostingsEnum.POSITIONS); + + while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int freq = postings.freq(); + while (freq-- > 0) { + int tokenOffset = postings.nextPosition(); + int start = postings.startOffset(); + int end = start+term.length; + String key = queryTokens.get(queryTokenIndex); + String value = document.get(fieldName).substring(start,end); + Span span = new Span(fieldName, start, end, key, value, tokenOffset); + spanList.add(span); + } + + } + + } + + queryTokenIndex++; + } + + + } + + + + } + + cursor++; + + + + ITuple dataTuple = Utils.getSpanTuple(fields, spanList, spanSchema); return dataTuple; + } catch (IOException e) { e.printStackTrace(); throw new DataFlowException(e.getMessage(), e); diff --git a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java index 73703c5f2af..9f128368ab0 100644 --- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java +++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java @@ -13,6 +13,8 @@ import edu.uci.ics.textdb.common.constants.DataConstants; import edu.uci.ics.textdb.common.constants.TestConstants; +import java.util.Arrays; + public class DataReaderPredicateTest { private DataReaderPredicate dataReaderPredicate; private IDataStore dataStore; @@ -24,7 +26,7 @@ public void setUp() throws ParseException{ QueryParser luceneQueryParser = new QueryParser( TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), new StandardAnalyzer()); luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery); + dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,new StandardAnalyzer(), Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE)); } @Test diff --git 
a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java index 67ca872d986..f2e0b989674 100644 --- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java +++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java @@ -1,6 +1,8 @@ package edu.uci.ics.textdb.storage; +import java.lang.reflect.Array; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import junit.framework.Assert; @@ -23,6 +25,8 @@ import edu.uci.ics.textdb.storage.reader.DataReader; import edu.uci.ics.textdb.storage.writer.DataWriter; +import javax.xml.crypto.Data; + public class DataWriterReaderTest { private IDataWriter dataWriter; private IDataReader dataReader; @@ -39,7 +43,7 @@ public void setUp() throws ParseException{ QueryParser queryParser = new QueryParser( TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY,analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE)); dataReader = new DataReader(dataReaderPredicate); } From 58b4062f4a083319615d27e4e06c9d2e0021d1ca Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Tue, 17 May 2016 15:07:12 -0700 Subject: [PATCH 3/8] Adding comment --- .../src/main/java/edu/uci/ics/textdb/common/utils/Utils.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java index 9bb1fb93d51..6c24b5c4272 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java @@ 
-79,6 +79,8 @@ public static IndexableField getLuceneField(FieldType fieldType, luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES); break; case TEXT: + //By default we enable positional indexing in Lucene so that we can return + // information about character offsets and token offsets org.apache.lucene.document.FieldType luceneFieldType = new org.apache.lucene.document.FieldType(); luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS ); luceneFieldType.setStored(true); From 219d0fabeeb41155901ba92de9822043fb4b9bb0 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Tue, 17 May 2016 15:25:50 -0700 Subject: [PATCH 4/8] Merge from master --- .../edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java index ecbaa6d847a..6a2cb681825 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java @@ -72,7 +72,7 @@ public Analyzer getAnalyzer() { public IOperator getScanSourceOperator() throws ParseException, DataFlowException { QueryParser luceneQueryParser = new QueryParser(attributeList.get(0).getFieldName(), luceneAnalyzer); Query luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY); - IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery); + IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,luceneAnalyzer,attributeList); IDataReader dataReader = new DataReader(dataReaderPredicate); IOperator operator = new ScanBasedSourceOperator(dataReader); From 1b0c1e497836373e5128ef8f831b5a0a8de053fa Mon Sep 17 
00:00:00 2001 From: Akshay Jain Date: Tue, 17 May 2016 17:15:17 -0700 Subject: [PATCH 5/8] Adding comments and minor refactoring --- .../ics/textdb/common/field/ListField.java | 2 +- .../edu/uci/ics/textdb/common/field/Span.java | 19 +++++++++++++++---- .../uci/ics/textdb/common/utils/Utils.java | 8 +++++--- .../dataflow/common/DictionaryPredicate.java | 4 ++-- .../dataflow/common/KeywordPredicate.java | 17 +++++++++-------- .../dataflow/keywordmatch/KeywordMatcher.java | 14 ++------------ .../main/resources/queryrewriter/wordsEn.txt | 2 +- .../DictionaryMatcherTest.java | 12 ++++++------ .../keywordmatch/KeywordMatcherTest.java | 14 ++++---------- .../source/IndexBasedSourceOperatorTest.java | 16 ++++++++-------- .../source/ScanBasedSourceOperatorTest.java | 11 ++++++----- .../ics/textdb/storage/reader/DataReader.java | 19 ++++++++++++++----- 12 files changed, 73 insertions(+), 65 deletions(-) diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java index beb54f805be..56946ad3996 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java @@ -38,7 +38,7 @@ public boolean equals(Object obj) { if (list == null) { if (other.list != null) return false; - } else if (!list.containsAll(other.list)) + } else if ( !(list.containsAll(other.list) & other.list.containsAll(list))) return false; return true; } diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java index 455d9284a3e..48a84ab476f 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java @@ -3,24 +3,35 @@ public class Span { //The name of the field 
(in the tuple) where this span is present private String fieldName; - //The start of the span + //The start of the span. It is the position of the first character of span in the document. private int start; - //The end of the span + //The end of the span.It is the position of the first character of span in the document private int end; //The key we are searching for eg: regex private String key; //The value matching the key private String value; - // The token position of the sapn + // The token position of the span private int tokenOffset; + /* + Example: + Value = "The quick brown fox jumps over the lazy dog" + Now the Span for brown should be + start = 10 : position of character 'b' + end = 15 : position of character 'n' + tokenOffset = 2 position of word 'brown' + */ + + public static int INVALID_TOKEN_OFFSET = -1; + public Span(String fieldName, int start, int end, String key, String value){ this.fieldName = fieldName; this.start = start; this.end = end; this.key = key; this.value = value; - this.tokenOffset = -1; + this.tokenOffset = INVALID_TOKEN_OFFSET; } public Span(String fieldName, int start, int end, String key, String value, int tokenOffset) { diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java index 6c24b5c4272..fde4b6bed2b 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java @@ -130,14 +130,14 @@ public static Schema createSpanSchema(Schema schema) { /** * Tokenizes the query string using the given analyser - * @param analyzer + * @param luceneAnalyzer * @param query * @return ArrayList list of results */ - public static ArrayList tokenizeQuery(Analyzer analyzer, String query) { + public static ArrayList tokenizeQuery(Analyzer luceneAnalyzer, String query) { HashSet resultSet = new HashSet<>(); ArrayList result = 
new ArrayList(); - TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query)); + TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query)); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); try{ @@ -145,6 +145,8 @@ public static ArrayList tokenizeQuery(Analyzer analyzer, String query) { while (tokenStream.incrementToken()) { String token = charTermAttribute.toString(); int tokenIndex = query.toLowerCase().indexOf(token); + // Since tokens are converted to lower case, + // get the exact token from the query string. String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length()); resultSet.add(actualQueryToken); } diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java index 6a2cb681825..25cf76fa4e0 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java @@ -34,11 +34,11 @@ public class DictionaryPredicate implements IPredicate { New and York; if searched in String field we search for Exact string. 
*/ - public DictionaryPredicate(IDictionary dictionary, Analyzer analyzer, List attributeList, + public DictionaryPredicate(IDictionary dictionary, Analyzer luceneAnalyzer, List attributeList, SourceOperatorType srcOpType, IDataStore dataStore) { this.dictionary = dictionary; - this.luceneAnalyzer = analyzer; + this.luceneAnalyzer = luceneAnalyzer; this.attributeList = attributeList; this.srcOpType = srcOpType; this.dataStore = dataStore; diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java index 590d23fcc88..10e11ea888c 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java @@ -34,7 +34,7 @@ public class KeywordPredicate implements IPredicate{ private final String query; private final Query luceneQuery; private ArrayList tokens; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private IDataStore dataStore; /* @@ -42,10 +42,10 @@ public class KeywordPredicate implements IPredicate{ For Ex. New york if searched in TextField, we would consider both tokens New and York; if searched in String field we search for Exact string. 
*/ - public KeywordPredicate(String query, List attributeList, Analyzer analyzer,IDataStore dataStore ) throws DataFlowException{ + public KeywordPredicate(String query, List attributeList, Analyzer luceneAnalyzer, IDataStore dataStore ) throws DataFlowException{ try { this.query = query; - this.tokens = Utils.tokenizeQuery(analyzer, query); + this.tokens = Utils.tokenizeQuery(luceneAnalyzer, query); this.attributeList = attributeList; this.dataStore = dataStore; String[] temp = new String[attributeList.size()]; @@ -54,7 +54,7 @@ public KeywordPredicate(String query, List attributeList, Analyzer an temp[i] = attributeList.get(i).getFieldName(); } this.fields = temp; - this.analyzer = analyzer; + this.luceneAnalyzer = luceneAnalyzer; this.luceneQuery = createLuceneQueryObject(); } catch (Exception e) { e.printStackTrace(); @@ -105,7 +105,7 @@ and generate boolean query (Textfield is Case Insensitive) */ String[] remainingTextFields = (String[]) textFieldList.toArray(new String[0]); BooleanQuery queryOnTextFields = new BooleanQuery(); - MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, analyzer); + MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, luceneAnalyzer); for(String searchToken : this.tokens){ Query termQuery = parser.parse(searchToken); @@ -130,12 +130,13 @@ public List getAttributeList() { public ArrayList getTokens(){return this.tokens;} - public Analyzer getAnalyzer(){ - return analyzer; + public Analyzer getLuceneAnalyzer(){ + return luceneAnalyzer; } public DataReaderPredicate getDataReaderPredicate() { - DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery, this.query, this.analyzer,this.attributeList); + DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery, + this.query, this.luceneAnalyzer, this.attributeList); return dataReaderPredicate; } diff --git 
a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java index 2366d047945..6325be992a4 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java @@ -1,32 +1,24 @@ package edu.uci.ics.textdb.dataflow.keywordmatch; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import edu.uci.ics.textdb.api.common.Attribute; import edu.uci.ics.textdb.api.common.IField; import edu.uci.ics.textdb.api.common.IPredicate; import edu.uci.ics.textdb.api.common.ITuple; -import edu.uci.ics.textdb.api.common.Schema; import edu.uci.ics.textdb.api.dataflow.IOperator; import edu.uci.ics.textdb.api.dataflow.ISourceOperator; import edu.uci.ics.textdb.common.constants.SchemaConstants; import edu.uci.ics.textdb.common.exception.DataFlowException; -import edu.uci.ics.textdb.common.field.ListField; import edu.uci.ics.textdb.common.field.Span; -import edu.uci.ics.textdb.common.field.StringField; import edu.uci.ics.textdb.common.field.TextField; -import edu.uci.ics.textdb.common.utils.Utils; import edu.uci.ics.textdb.dataflow.common.KeywordPredicate; import edu.uci.ics.textdb.dataflow.source.IndexBasedSourceOperator; import edu.uci.ics.textdb.storage.DataReaderPredicate; -import edu.uci.ics.textdb.storage.reader.DataReader; /** * @author prakul + * @author Akshay * */ public class KeywordMatcher implements IOperator { @@ -98,8 +90,6 @@ public ITuple getNextTuple() throws DataFlowException { return null; } -// ITuple DataTuple = sourceTuple. 
- int schemaIndex = sourceTuple.getSchema().getIndex(SchemaConstants.SPAN_LIST_ATTRIBUTE.getFieldName()); List spanList = (List)sourceTuple.getField(schemaIndex).getValue(); @@ -118,7 +108,7 @@ public ITuple getNextTuple() throws DataFlowException { } } else { // Check if all the tokens are present in that field, - // if any of the token is missing, remove all the span information for that field. + // if any of the tokens is missing, remove all the span information for that field. boolean[] tokensPresent = new boolean[queryTokens.size()]; diff --git a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt index 05a3d743db3..1c0c6821b89 100644 --- a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt +++ b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt @@ -3240,7 +3240,7 @@ analytically analyzable analyze analyzed -analyzer +luceneAnalyzer analyzers analyzes analyzing diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java index 4da4203ba36..56d9d92ea87 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java @@ -52,7 +52,7 @@ public class DictionaryMatcherTest { private DataStore dataStore; private IDataWriter dataWriter; private IDataReader dataReader; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private Query luceneQuery; private IPredicate dataReaderPredicate; @@ -60,12 +60,12 @@ public class DictionaryMatcherTest { public void setUp() throws Exception { dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - 
dataWriter = new DataWriter(dataStore, analyzer); - QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); + luceneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, luceneAnalyzer); + QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), luceneAnalyzer); luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY, - analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); + dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery, DataConstants.SCAN_QUERY, + luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); dataReader = new DataReader(dataReaderPredicate); dataWriter.clearData(); dataWriter.writeData(TestConstants.getSamplePeopleTuples()); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java index 896f15962f2..f86ef37c134 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java @@ -35,11 +35,8 @@ import edu.uci.ics.textdb.common.field.Span; import edu.uci.ics.textdb.common.field.StringField; import edu.uci.ics.textdb.common.field.TextField; -import edu.uci.ics.textdb.common.utils.Utils; import edu.uci.ics.textdb.dataflow.common.KeywordPredicate; -import edu.uci.ics.textdb.dataflow.source.IndexBasedSourceOperator; import edu.uci.ics.textdb.dataflow.utils.TestUtils; -import edu.uci.ics.textdb.storage.DataReaderPredicate; import edu.uci.ics.textdb.storage.DataStore; import edu.uci.ics.textdb.storage.writer.DataWriter; @@ -54,8 +51,6 @@ public class KeywordMatcherTest { private 
IDataWriter dataWriter; private DataStore dataStore; private Analyzer analyzer; - private Schema schema; - private IPredicate keywordPredicate; @Before public void setUp() throws Exception { @@ -64,7 +59,6 @@ public void setUp() throws Exception { dataWriter = new DataWriter(dataStore, analyzer); dataWriter.clearData(); dataWriter.writeData(TestConstants.getSamplePeopleTuples()); - schema = dataStore.getSchema(); } @After @@ -274,10 +268,10 @@ public void testWordInMultipleFieldsQuery() throws Exception { //Prepare expected result list List list = new ArrayList<>(); Span span1 = new Span("lastName", 0, 11, "lin clooney", "lin clooney"); - Span span2 = new Span("description", 0, 3, "lin", "Lin",0); - Span span3 = new Span("description", 25, 28, "lin", "lin",5); - Span span4 = new Span("description", 4, 11, "clooney", "Clooney",1); - Span span5 = new Span("description", 29, 36, "clooney", "clooney",6); + Span span2 = new Span("description", 0, 3, "lin", "Lin", 0); + Span span3 = new Span("description", 25, 28, "lin", "lin", 5); + Span span4 = new Span("description", 4, 11, "clooney", "Clooney", 1); + Span span5 = new Span("description", 29, 36, "clooney", "clooney", 6); list.add(span1); list.add(span2); list.add(span3); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java index 56d88b16c35..3816d820f64 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java @@ -7,7 +7,6 @@ import java.util.Arrays; import java.util.List; -import edu.uci.ics.textdb.api.common.Attribute; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import 
org.apache.lucene.queryparser.classic.ParseException; @@ -39,15 +38,15 @@ public class IndexBasedSourceOperatorTest { private IDataWriter dataWriter; private IndexBasedSourceOperator indexBasedSourceOperator; private IDataStore dataStore; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private IPredicate dataReaderPredicate; @Before public void setUp() throws Exception { dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - dataWriter = new DataWriter(dataStore, analyzer); + luceneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, luceneAnalyzer); dataWriter.clearData(); dataWriter.writeData(TestConstants.getSamplePeopleTuples()); @@ -60,9 +59,10 @@ public void cleanUp() throws Exception { public void constructIndexBasedSourceOperator(String query) throws ParseException{ String defaultField = TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(); - QueryParser queryParser = new QueryParser(defaultField, analyzer); + QueryParser queryParser = new QueryParser(defaultField, luceneAnalyzer); Query queryObject = queryParser.parse(query); - dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject, query, analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); + dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject, + query, luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); indexBasedSourceOperator = new IndexBasedSourceOperator(dataReaderPredicate); } @@ -91,7 +91,7 @@ public void testTextSearchWithMultipleTokens() throws DataFlowException, ParseEx int numTuples = results.size(); Assert.assertEquals(3, numTuples); - boolean check = TestUtils.checkResults(results,"Tall,Brown" , this.analyzer,TestConstants.DESCRIPTION); + boolean check = TestUtils.checkResults(results,"Tall,Brown" , this.luceneAnalyzer,TestConstants.DESCRIPTION); Assert.assertTrue(check); } @@ -105,7 +105,7 @@ public void 
testTextSearchWithMultipleTokens() throws DataFlowException, ParseEx public void testTextSearchWithSingleToken() throws DataFlowException, ParseException { List results = getQueryResults(TestConstants.DESCRIPTION + ":angry"); int numTuples = results.size(); - boolean check = TestUtils.checkResults(results,"angry" , this.analyzer,TestConstants.DESCRIPTION); + boolean check = TestUtils.checkResults(results,"angry" , this.luceneAnalyzer,TestConstants.DESCRIPTION); Assert.assertTrue(check); Assert.assertEquals(3, numTuples); } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java index 29532f5aeaa..66c042dd5f0 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java @@ -41,19 +41,20 @@ public class ScanBasedSourceOperatorTest { private ScanBasedSourceOperator scanBasedSourceOperator; private IDataReader dataReader; private IDataStore dataStore; - private Analyzer analyzer; + private Analyzer lucneAnalyzer; private Query query; private IPredicate dataReaderPredicate; @Before public void setUp() throws Exception{ dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - dataWriter = new DataWriter(dataStore, analyzer ); + lucneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, lucneAnalyzer); QueryParser queryParser = new QueryParser( - TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); + TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), lucneAnalyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, analyzer, 
Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, + DataConstants.SCAN_QUERY, lucneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); dataReader = new DataReader(dataReaderPredicate); dataWriter.clearData(); diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java index c28d3c99f18..fa606290a43 100644 --- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java @@ -35,7 +35,6 @@ import edu.uci.ics.textdb.common.field.DataTuple; import edu.uci.ics.textdb.common.utils.Utils; import edu.uci.ics.textdb.storage.DataReaderPredicate; -import org.apache.lucene.util.packed.PackedLongValues; /** * @author sandeepreddy602 @@ -51,7 +50,9 @@ public class DataReader implements IDataReader{ private ArrayList queryTokens; private List attributeList; private List queryTokensInBytesRef; + // The schema of the data tuple private Schema schema; + //The schema o the data tuple along with the span information. private Schema spanSchema; public DataReader(IPredicate dataReaderPredicate) { @@ -73,7 +74,7 @@ public void open() throws DataFlowException { cursor = OPENED; this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString()); - // sort the query token, as the term vector are also sorted. + // sort the query tokens, as the term vector are also sorted. // This makes the seek faster. this.queryTokens.sort(String.CASE_INSENSITIVE_ORDER); @@ -112,28 +113,36 @@ public ITuple getNextTuple() throws DataFlowException { fields.add(Utils.getField(fieldType, fieldValue)); } + // If the span Information is not requested, + // just return the dataTuple without span information. 
+ if(!dataReaderPredicate.getIsSpanInformationAdded()){ cursor++; - DataTuple dTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()])); - return dTuple; + DataTuple dataTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()])); + return dataTuple; } + // Create span information. + for(Attribute attr: attributeList){ String fieldName = attr.getFieldName(); + // Get the term vector fot the current field. Terms vector = luceneIndexReader.getTermVector(scoreDocs[cursor].doc,fieldName); if (vector != null) { TermsEnum vectorEnum = vector.iterator(); int queryTokenIndex = 0; + // Search for all the query tokens in the term vector one by one. for(BytesRef term: queryTokensInBytesRef){ + //If Term is found, calculate the position info and add to the Spans if(vectorEnum.seekExact(term)){ - System.out.println(term.utf8ToString()); PostingsEnum postings = vectorEnum.postings(null, PostingsEnum.POSITIONS); while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int freq = postings.freq(); + // Create a new span for every occurrence. 
while (freq-- > 0) { int tokenOffset = postings.nextPosition(); int start = postings.startOffset(); From 0e6ec5ae64fb2f4edae0b88ce0df86ed0e3db32a Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Tue, 17 May 2016 17:55:22 -0700 Subject: [PATCH 6/8] Adding comments --- .../uci/ics/textdb/dataflow/common/KeywordPredicate.java | 1 + .../ics/textdb/dataflow/keywordmatch/KeywordMatcher.java | 1 + .../java/edu/uci/ics/textdb/storage/reader/DataReader.java | 6 +++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java index 10e11ea888c..3541c681b3c 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java @@ -126,6 +126,7 @@ public String getQuery(){ public List getAttributeList() { return attributeList; } + public Query getQueryObject(){return this.luceneQuery;} public ArrayList getTokens(){return this.tokens;} diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java index 6325be992a4..c4c02bf385a 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java @@ -110,6 +110,7 @@ public ITuple getNextTuple() throws DataFlowException { // Check if all the tokens are present in that field, // if any of the tokens is missing, remove all the span information for that field. + //By default, initialized to false. 
boolean[] tokensPresent = new boolean[queryTokens.size()]; List spanForThisField = new ArrayList<>(); diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java index fa606290a43..2af82b0f82c 100644 --- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java @@ -52,7 +52,7 @@ public class DataReader implements IDataReader{ private List queryTokensInBytesRef; // The schema of the data tuple private Schema schema; - //The schema o the data tuple along with the span information. + //The schema of the data tuple along with the span information. private Schema spanSchema; public DataReader(IPredicate dataReaderPredicate) { @@ -74,10 +74,14 @@ public void open() throws DataFlowException { cursor = OPENED; this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString()); + // sort the query tokens, as the term vector are also sorted. // This makes the seek faster. this.queryTokens.sort(String.CASE_INSENSITIVE_ORDER); + // The terms in the term vector are stored as ByteRef, + // hence convert token from String format to ByteRef and then search. 
+ this.queryTokensInBytesRef = new ArrayList<>(); for(String token: queryTokens) { BytesRef byteRef = new BytesRef(token.toLowerCase().getBytes()); From e60a943fc0a0be7f73a653795c9d0bbb7667c51c Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Wed, 18 May 2016 13:42:28 -0700 Subject: [PATCH 7/8] Adding comments --- .../java/edu/uci/ics/textdb/common/field/Span.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java index 48a84ab476f..dd03540e505 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java @@ -3,23 +3,24 @@ public class Span { //The name of the field (in the tuple) where this span is present private String fieldName; - //The start of the span. It is the position of the first character of span in the document. + //The start position of the span, which is the offset of the gap before the first character of the span. private int start; - //The end of the span.It is the position of the first character of span in the document + //The end position of the span, which is the offset of the gap after the last character of the span. private int end; //The key we are searching for eg: regex private String key; //The value matching the key private String value; - // The token position of the span + // The token position of the span, starting from 0. private int tokenOffset; /* Example: Value = "The quick brown fox jumps over the lazy dog" Now the Span for brown should be - start = 10 : position of character 'b' - end = 15 : position of character 'n' + start = 10 : index of character 'b' + end = 15 : index of character 'n'+ 1 OR start+length + Both of them result in the same value.
tokenOffset = 2 position of word 'brown' */ From 6c3ce95cc00201fc3387f31ddec13606add02256 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Wed, 18 May 2016 16:47:21 -0700 Subject: [PATCH 8/8] Minor changes and comments --- .../ics/textdb/dataflow/common/RegexPredicate.java | 7 ++++++- .../textdb/dataflow/regexmatch/RegexMatcher.java | 5 ++++- .../src/main/resources/queryrewriter/wordsEn.txt | 2 +- .../uci/ics/textdb/storage/DataReaderPredicate.java | 6 +++--- .../uci/ics/textdb/storage/reader/DataReader.java | 4 ++-- .../ics/textdb/storage/DataWriterReaderTest.java | 13 +++++-------- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java index c0059ec03e8..1d53e75d202 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java @@ -23,7 +23,7 @@ public class RegexPredicate implements IPredicate { private String regex; private List fieldNameList; - + private List attributeList; private Analyzer luceneAnalyzer; private IDataStore dataStore; @@ -31,6 +31,7 @@ public RegexPredicate(String regex, List attributeList, Analyzer anal this.regex = regex; this.luceneAnalyzer = analyzer; this.dataStore = dataStore; + this.attributeList = attributeList; this.fieldNameList = attributeList.stream() .filter(attr -> (attr.getFieldType() == FieldType.TEXT || attr.getFieldType() == FieldType.STRING)) .map(attr -> attr.getFieldName()).collect(Collectors.toList()); @@ -52,4 +53,8 @@ public List getFieldNameList() { return this.fieldNameList; } + public List getAttributeList() { + return attributeList; + } + } diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java 
b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java index 152388db0a1..ec75fe304b3 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.List; +import edu.stanford.nlp.patterns.Data; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.ParseException; @@ -74,7 +75,9 @@ public RegexMatcher(IPredicate predicate) throws DataFlowException{ this.luceneQuery = generateLuceneQuery(regex, fieldNameList, DataConstants.SCAN_QUERY); } - this.sourceOperator = new IndexBasedSourceOperator(new DataReaderPredicate(dataStore, luceneQuery)); + DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery, + DataConstants.SCAN_QUERY, luceneAnalyzer, regexPredicate.getAttributeList()); + this.sourceOperator = new IndexBasedSourceOperator(dataReaderPredicate); } catch (ParseException | java.util.regex.PatternSyntaxException e) { throw new DataFlowException(e.getMessage(), e); } diff --git a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt index 1c0c6821b89..05a3d743db3 100644 --- a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt +++ b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt @@ -3240,7 +3240,7 @@ analytically analyzable analyze analyzed -luceneAnalyzer +analyzer analyzers analyzes analyzing diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java index e1bffb6d313..22a96c5bc4b 100644 --- 
a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java @@ -16,14 +16,14 @@ public class DataReaderPredicate implements IPredicate { private IDataStore dataStore; private Query luceneQuery; private String queryString; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private List attributeList; private boolean isSpanInformationAdded = false; public DataReaderPredicate(IDataStore dataStore, Query luceneQuery, String queryString, Analyzer analyzer, List attributeList){ this.dataStore = dataStore; this.luceneQuery = luceneQuery; - this.analyzer = analyzer; + this.luceneAnalyzer = analyzer; this.queryString = queryString; this.attributeList = attributeList; } @@ -42,7 +42,7 @@ public Query getLuceneQuery() { public String getQueryString(){return queryString;} - public Analyzer getAnalyzer(){return analyzer;} + public Analyzer getLuceneAnalyzer(){return luceneAnalyzer;} public List getAttributeList(){return attributeList;} diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java index 2af82b0f82c..a9827eddd06 100644 --- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java @@ -73,7 +73,7 @@ public void open() throws DataFlowException { scoreDocs = topDocs.scoreDocs; cursor = OPENED; - this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString()); + this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getLuceneAnalyzer(),dataReaderPredicate.getQueryString()); // sort the query tokens, as the term vector are also sorted. // This makes the seek faster. 
@@ -131,7 +131,7 @@ public ITuple getNextTuple() throws DataFlowException { for(Attribute attr: attributeList){ String fieldName = attr.getFieldName(); - // Get the term vector fot the current field. + // Get the term vector for the current field. Terms vector = luceneIndexReader.getTermVector(scoreDocs[cursor].doc,fieldName); if (vector != null) { diff --git a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java index f2e0b989674..3f55c5c61c3 100644 --- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java +++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java @@ -1,6 +1,5 @@ package edu.uci.ics.textdb.storage; -import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -25,25 +24,23 @@ import edu.uci.ics.textdb.storage.reader.DataReader; import edu.uci.ics.textdb.storage.writer.DataWriter; -import javax.xml.crypto.Data; - public class DataWriterReaderTest { private IDataWriter dataWriter; private IDataReader dataReader; private IDataStore dataStore; private IPredicate dataReaderPredicate; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private Query query; @Before public void setUp() throws ParseException{ dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - dataWriter = new DataWriter(dataStore, analyzer ); + luceneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, luceneAnalyzer); QueryParser queryParser = new QueryParser( - TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); + TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), luceneAnalyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query, 
DataConstants.SCAN_QUERY,analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE)); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE)); dataReader = new DataReader(dataReaderPredicate); }