diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java index 5c538ca6df3..56946ad3996 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java @@ -38,7 +38,7 @@ public boolean equals(Object obj) { if (list == null) { if (other.list != null) return false; - } else if (!list.equals(other.list)) + } else if ( !(list.containsAll(other.list) & other.list.containsAll(list))) return false; return true; } diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java index 173fea94fbe..dd03540e505 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java @@ -3,22 +3,41 @@ public class Span { //The name of the field (in the tuple) where this span is present private String fieldName; - //The start of the span + //The start position of the span, which is the offset of the gap before the first character of the span. private int start; - //The end of the span + //The end position of the span, which is the offset of the gap after the last character of the span. private int end; //The key we are searching for eg: regex private String key; //The value matching the key private String value; - - - public Span(String fieldName, int start, int end, String key, String value) { + // The token position of the span, starting from 0. + private int tokenOffset; + + /* + Example: + Value = "The quick brown fox jumps over the lazy dog" + Now the Span for brown should be + start = 10 : index of character 'b' + end = 15 : index of character 'n'+ 1 OR start+length + Both of them result in the same values. 
+ tokenOffset = 2 position of word 'brown' + */ + + public static int INVALID_TOKEN_OFFSET = -1; + + public Span(String fieldName, int start, int end, String key, String value){ this.fieldName = fieldName; this.start = start; this.end = end; this.key = key; this.value = value; + this.tokenOffset = INVALID_TOKEN_OFFSET; + } + + public Span(String fieldName, int start, int end, String key, String value, int tokenOffset) { + this(fieldName, start, end, key, value); + this.tokenOffset = tokenOffset; } public String getFieldName() { @@ -41,6 +60,8 @@ public int getEnd() { return end; } + public int getTokenOffset(){return tokenOffset;} + @Override public int hashCode() { final int prime = 31; @@ -51,6 +72,7 @@ public int hashCode() { result = prime * result + ((key == null) ? 0 : key.hashCode()); result = prime * result + start; result = prime * result + ((value == null) ? 0 : value.hashCode()); + result = prime * result + tokenOffset; return result; } @@ -87,7 +109,10 @@ public boolean equals(Object obj) { return false; } else if (!value.equals(other.value)) return false; - + + if(tokenOffset!= other.tokenOffset) + return false; + return true; } } diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java index f175e7e00e1..fde4b6bed2b 100644 --- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java +++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java @@ -14,6 +14,7 @@ import org.apache.lucene.document.DateTools.Resolution; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.IndexOptions; import edu.uci.ics.textdb.api.common.Attribute; import edu.uci.ics.textdb.api.common.FieldType; @@ -49,7 +50,7 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par case TEXT: field = new TextField(fieldValue); 
break; - + default: break; } @@ -57,10 +58,10 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par } public static IndexableField getLuceneField(FieldType fieldType, - String fieldName, Object fieldValue) { + String fieldName, Object fieldValue) { IndexableField luceneField = null; switch(fieldType){ - case STRING: + case STRING: luceneField = new org.apache.lucene.document.StringField( fieldName, (String) fieldValue, Store.YES); break; @@ -78,10 +79,22 @@ public static IndexableField getLuceneField(FieldType fieldType, luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES); break; case TEXT: - luceneField = new org.apache.lucene.document.TextField( - fieldName, (String) fieldValue, Store.YES); - break; - + //By default we enable positional indexing in Lucene so that we can return + // information about character offsets and token offsets + org.apache.lucene.document.FieldType luceneFieldType = new org.apache.lucene.document.FieldType(); + luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS ); + luceneFieldType.setStored(true); + luceneFieldType.setStoreTermVectors( true ); + luceneFieldType.setStoreTermVectorOffsets( true ); + luceneFieldType.setStoreTermVectorPayloads( true ); + luceneFieldType.setStoreTermVectorPositions( true ); + luceneFieldType.setTokenized( true ); + + luceneField = new org.apache.lucene.document.Field( + fieldName,(String) fieldValue,luceneFieldType); + + break; + } return luceneField; } @@ -96,10 +109,10 @@ public static ITuple getSpanTuple( List fieldList, List spanList, IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]); return new DataTuple(spanSchema, fieldsDuplicate); } - + /** - * - * @param schema + * + * @param schema * @about Creating a new schema object, and adding SPAN_LIST_ATTRIBUTE to * the schema. 
SPAN_LIST_ATTRIBUTE is of type List */ @@ -117,21 +130,25 @@ public static Schema createSpanSchema(Schema schema) { /** * Tokenizes the query string using the given analyser - * @param analyzer + * @param luceneAnalyzer * @param query * @return ArrayList list of results */ - public static ArrayList tokenizeQuery(Analyzer analyzer, String query) { + public static ArrayList tokenizeQuery(Analyzer luceneAnalyzer, String query) { HashSet resultSet = new HashSet<>(); ArrayList result = new ArrayList(); - TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query)); + TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query)); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); try{ tokenStream.reset(); while (tokenStream.incrementToken()) { - String term = charTermAttribute.toString(); - resultSet.add(term); + String token = charTermAttribute.toString(); + int tokenIndex = query.toLowerCase().indexOf(token); + // Since tokens are converted to lower case, + // get the exact token from the query string. + String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length()); + resultSet.add(actualQueryToken); } tokenStream.close(); } catch (Exception e) { diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java index ecbaa6d847a..25cf76fa4e0 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java @@ -34,11 +34,11 @@ public class DictionaryPredicate implements IPredicate { New and York; if searched in String field we search for Exact string. 
*/ - public DictionaryPredicate(IDictionary dictionary, Analyzer analyzer, List attributeList, + public DictionaryPredicate(IDictionary dictionary, Analyzer luceneAnalyzer, List attributeList, SourceOperatorType srcOpType, IDataStore dataStore) { this.dictionary = dictionary; - this.luceneAnalyzer = analyzer; + this.luceneAnalyzer = luceneAnalyzer; this.attributeList = attributeList; this.srcOpType = srcOpType; this.dataStore = dataStore; @@ -72,7 +72,7 @@ public Analyzer getAnalyzer() { public IOperator getScanSourceOperator() throws ParseException, DataFlowException { QueryParser luceneQueryParser = new QueryParser(attributeList.get(0).getFieldName(), luceneAnalyzer); Query luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY); - IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery); + IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,luceneAnalyzer,attributeList); IDataReader dataReader = new DataReader(dataReaderPredicate); IOperator operator = new ScanBasedSourceOperator(dataReader); diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java index fc664e7aa5d..3541c681b3c 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java @@ -34,7 +34,7 @@ public class KeywordPredicate implements IPredicate{ private final String query; private final Query luceneQuery; private ArrayList tokens; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private IDataStore dataStore; /* @@ -42,10 +42,10 @@ public class KeywordPredicate implements IPredicate{ For Ex. 
New york if searched in TextField, we would consider both tokens New and York; if searched in String field we search for Exact string. */ - public KeywordPredicate(String query, List attributeList, Analyzer analyzer,IDataStore dataStore ) throws DataFlowException{ + public KeywordPredicate(String query, List attributeList, Analyzer luceneAnalyzer, IDataStore dataStore ) throws DataFlowException{ try { this.query = query; - this.tokens = Utils.tokenizeQuery(analyzer, query); + this.tokens = Utils.tokenizeQuery(luceneAnalyzer, query); this.attributeList = attributeList; this.dataStore = dataStore; String[] temp = new String[attributeList.size()]; @@ -54,7 +54,7 @@ public KeywordPredicate(String query, List attributeList, Analyzer an temp[i] = attributeList.get(i).getFieldName(); } this.fields = temp; - this.analyzer = analyzer; + this.luceneAnalyzer = luceneAnalyzer; this.luceneQuery = createLuceneQueryObject(); } catch (Exception e) { e.printStackTrace(); @@ -105,7 +105,7 @@ and generate boolean query (Textfield is Case Insensitive) */ String[] remainingTextFields = (String[]) textFieldList.toArray(new String[0]); BooleanQuery queryOnTextFields = new BooleanQuery(); - MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, analyzer); + MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, luceneAnalyzer); for(String searchToken : this.tokens){ Query termQuery = parser.parse(searchToken); @@ -126,16 +126,18 @@ public String getQuery(){ public List getAttributeList() { return attributeList; } + public Query getQueryObject(){return this.luceneQuery;} public ArrayList getTokens(){return this.tokens;} - public Analyzer getAnalyzer(){ - return analyzer; + public Analyzer getLuceneAnalyzer(){ + return luceneAnalyzer; } public DataReaderPredicate getDataReaderPredicate() { - DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery); + DataReaderPredicate dataReaderPredicate = new 
DataReaderPredicate(this.dataStore, this.luceneQuery, + this.query, this.luceneAnalyzer, this.attributeList); return dataReaderPredicate; } diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java index c0059ec03e8..1d53e75d202 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java @@ -23,7 +23,7 @@ public class RegexPredicate implements IPredicate { private String regex; private List fieldNameList; - + private List attributeList; private Analyzer luceneAnalyzer; private IDataStore dataStore; @@ -31,6 +31,7 @@ public RegexPredicate(String regex, List attributeList, Analyzer anal this.regex = regex; this.luceneAnalyzer = analyzer; this.dataStore = dataStore; + this.attributeList = attributeList; this.fieldNameList = attributeList.stream() .filter(attr -> (attr.getFieldType() == FieldType.TEXT || attr.getFieldType() == FieldType.STRING)) .map(attr -> attr.getFieldName()).collect(Collectors.toList()); @@ -52,4 +53,8 @@ public List getFieldNameList() { return this.fieldNameList; } + public List getAttributeList() { + return attributeList; + } + } diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java index e384a19a5ca..c4c02bf385a 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java @@ -1,48 +1,37 @@ package edu.uci.ics.textdb.dataflow.keywordmatch; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; -import 
java.util.regex.Matcher; -import java.util.regex.Pattern; import edu.uci.ics.textdb.api.common.Attribute; import edu.uci.ics.textdb.api.common.IField; import edu.uci.ics.textdb.api.common.IPredicate; import edu.uci.ics.textdb.api.common.ITuple; -import edu.uci.ics.textdb.api.common.Schema; import edu.uci.ics.textdb.api.dataflow.IOperator; import edu.uci.ics.textdb.api.dataflow.ISourceOperator; +import edu.uci.ics.textdb.common.constants.SchemaConstants; import edu.uci.ics.textdb.common.exception.DataFlowException; import edu.uci.ics.textdb.common.field.Span; -import edu.uci.ics.textdb.common.field.StringField; import edu.uci.ics.textdb.common.field.TextField; -import edu.uci.ics.textdb.common.utils.Utils; import edu.uci.ics.textdb.dataflow.common.KeywordPredicate; import edu.uci.ics.textdb.dataflow.source.IndexBasedSourceOperator; import edu.uci.ics.textdb.storage.DataReaderPredicate; -import edu.uci.ics.textdb.storage.reader.DataReader; /** * @author prakul + * @author Akshay * */ public class KeywordMatcher implements IOperator { private final KeywordPredicate predicate; private ISourceOperator sourceOperator; - private List tokenPatternList; - private List spanList; - private List tempSpanList; private String query; private List attributeList; private List queryTokens; - private Set setOfQueryTokens; - private boolean spanSchemaDefined = false; - private Schema spanSchema; public KeywordMatcher(IPredicate predicate) { this.predicate = (KeywordPredicate)predicate; DataReaderPredicate dataReaderPredicate = this.predicate.getDataReaderPredicate(); + dataReaderPredicate.setIsSpanInformationAdded(true); this.sourceOperator = new IndexBasedSourceOperator(dataReaderPredicate); } @@ -53,17 +42,6 @@ public void open() throws DataFlowException { query = predicate.getQuery(); attributeList = predicate.getAttributeList(); queryTokens = predicate.getTokens(); - setOfQueryTokens = new HashSet<>(queryTokens); - tokenPatternList = new ArrayList(); - Pattern pattern; - String 
regex; - for(String token : queryTokens){ - regex = "\\b" + token.toLowerCase() + "\\b"; - pattern = Pattern.compile(regex); - tokenPatternList.add(pattern); - } - spanList = new ArrayList<>(); - tempSpanList = new ArrayList<>(); } catch (Exception e) { e.printStackTrace(); @@ -106,70 +84,54 @@ public void open() throws DataFlowException { @Override public ITuple getNextTuple() throws DataFlowException { - List fieldList; - Set setOfFoundTokens = new HashSet<>(); try { ITuple sourceTuple = sourceOperator.getNextTuple(); if(sourceTuple == null){ return null; } - fieldList = sourceTuple.getFields(); - spanList.clear(); - if(!spanSchemaDefined){ - Schema schema = sourceTuple.getSchema(); - spanSchema = Utils.createSpanSchema(schema); - spanSchemaDefined = true; - } - for(int attributeIndex = 0; attributeIndex < attributeList.size(); attributeIndex++){ - IField field = sourceTuple.getField(attributeList.get(attributeIndex).getFieldName()); - String fieldValue = (String) (field).getValue(); - String fieldName; - int positionIndex = 0; // Next position in the field to be checked. 
- int spanStartPosition; // Starting position of the matched query - if(field instanceof StringField){ + + int schemaIndex = sourceTuple.getSchema().getIndex(SchemaConstants.SPAN_LIST_ATTRIBUTE.getFieldName()); + List spanList = + (List)sourceTuple.getField(schemaIndex).getValue(); + + for(int attributeIndex = 0; attributeIndex < attributeList.size(); attributeIndex++) { + String fieldName = attributeList.get(attributeIndex).getFieldName(); + IField field = sourceTuple.getField(fieldName); + if (!(field instanceof TextField)) { + + String fieldValue = (String) (field).getValue(); + //Keyword should match fieldValue entirely - if(fieldValue.equalsIgnoreCase(query)){ - spanStartPosition = 0; - positionIndex = query.length(); - fieldName = attributeList.get(attributeIndex).getFieldName(); - addSpanToSpanList(fieldName, spanStartPosition, positionIndex, query, fieldValue); + if (fieldValue.equals(query)) { + Span span = new Span(fieldName, 0, query.length(), query, fieldValue); + spanList.add(span); } - } - else if(field instanceof TextField) { - //Each element of Array of keywords is matched in tokenized TextField Value - for(int iter = 0; iter < queryTokens.size(); iter++) { - positionIndex = 0; - String queryToken = queryTokens.get(iter); - //Ex: For keyword lin it obtains pattern like /blin/b which matches keywords at boundary - Pattern tokenPattern = tokenPatternList.get(iter); - Matcher matcher = tokenPattern.matcher(fieldValue.toLowerCase()); - while (matcher.find(positionIndex) != false) { - spanStartPosition = matcher.start(); - positionIndex = spanStartPosition + queryToken.length(); - String documentValue = fieldValue.substring(spanStartPosition, positionIndex); - fieldName = attributeList.get(attributeIndex).getFieldName(); - String actualQueryToken = query.substring(query.toLowerCase().indexOf(queryToken), query.toLowerCase().indexOf(queryToken)+queryToken.length()); - addSpanToTempSpanList(fieldName, spanStartPosition, positionIndex, actualQueryToken, 
documentValue); - setOfFoundTokens.add(queryToken); + } else { + // Check if all the tokens are present in that field, + // if any of the tokens is missing, remove all the span information for that field. + + //By default, initialized to false. + boolean[] tokensPresent = new boolean[queryTokens.size()]; + + List spanForThisField = new ArrayList<>(); + + for (Span span : spanList) { + if (span.getFieldName().equals(fieldName)) { + spanForThisField.add(span); + if (queryTokens.contains(span.getKey())) + tokensPresent[queryTokens.indexOf(span.getKey())] = true; } } + + boolean allTokenPresent = areAllTrue(tokensPresent); + + if (!allTokenPresent) { + spanList.removeAll(spanForThisField); + } } - if (setOfFoundTokens.equals(setOfQueryTokens)){ - spanList.addAll(tempSpanList); - } - tempSpanList.clear(); } - //If all the 'attributes to be searched' have been processed return the result tuple with span info - //if (foundFlag || setOfFoundTokens.equals(setOfQueryTokens)){ - if(spanList.size()>0){ - return Utils.getSpanTuple(fieldList, spanList, spanSchema); - } - //Search next document if the required predicate did not match previous document - else{ - spanList.clear(); - return getNextTuple(); - } + return sourceTuple; } catch (Exception e) { e.printStackTrace(); @@ -178,17 +140,6 @@ else if(field instanceof TextField) { } - private void addSpanToSpanList(String fieldName, int start, int end, String key, String value) { - Span span = new Span(fieldName, start, end, key, value); - spanList.add(span); - } - - private void addSpanToTempSpanList(String fieldName, int start, int end, String key, String value) { - Span span = new Span(fieldName, start, end, key, value); - tempSpanList.add(span); - } - - @Override public void close() throws DataFlowException { try { @@ -198,4 +149,10 @@ public void close() throws DataFlowException { throw new DataFlowException(e.getMessage(), e); } } + + public static boolean areAllTrue(boolean[] array) + { + for(boolean b : array) if(!b) 
return false; + return true; + } } \ No newline at end of file diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java index 152388db0a1..ec75fe304b3 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.List; +import edu.stanford.nlp.patterns.Data; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.ParseException; @@ -74,7 +75,9 @@ public RegexMatcher(IPredicate predicate) throws DataFlowException{ this.luceneQuery = generateLuceneQuery(regex, fieldNameList, DataConstants.SCAN_QUERY); } - this.sourceOperator = new IndexBasedSourceOperator(new DataReaderPredicate(dataStore, luceneQuery)); + DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery, + DataConstants.SCAN_QUERY, luceneAnalyzer, regexPredicate.getAttributeList()); + this.sourceOperator = new IndexBasedSourceOperator(dataReaderPredicate); } catch (ParseException | java.util.regex.PatternSyntaxException e) { throw new DataFlowException(e.getMessage(), e); } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java index e2f2db61d88..56d9d92ea87 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java @@ -52,7 +52,7 @@ public class DictionaryMatcherTest { private 
DataStore dataStore; private IDataWriter dataWriter; private IDataReader dataReader; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private Query luceneQuery; private IPredicate dataReaderPredicate; @@ -60,11 +60,12 @@ public class DictionaryMatcherTest { public void setUp() throws Exception { dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - dataWriter = new DataWriter(dataStore, analyzer); - QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); + luceneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, luceneAnalyzer); + QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), luceneAnalyzer); luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery); + dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery, DataConstants.SCAN_QUERY, + luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); dataReader = new DataReader(dataReaderPredicate); dataWriter.clearData(); dataWriter.writeData(TestConstants.getSamplePeopleTuples()); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java index 3ed8043657b..f86ef37c134 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java @@ -35,11 +35,8 @@ import edu.uci.ics.textdb.common.field.Span; import edu.uci.ics.textdb.common.field.StringField; import edu.uci.ics.textdb.common.field.TextField; -import edu.uci.ics.textdb.common.utils.Utils; import edu.uci.ics.textdb.dataflow.common.KeywordPredicate; 
-import edu.uci.ics.textdb.dataflow.source.IndexBasedSourceOperator; import edu.uci.ics.textdb.dataflow.utils.TestUtils; -import edu.uci.ics.textdb.storage.DataReaderPredicate; import edu.uci.ics.textdb.storage.DataStore; import edu.uci.ics.textdb.storage.writer.DataWriter; @@ -54,8 +51,6 @@ public class KeywordMatcherTest { private IDataWriter dataWriter; private DataStore dataStore; private Analyzer analyzer; - private Schema schema; - private IPredicate keywordPredicate; @Before public void setUp() throws Exception { @@ -64,7 +59,6 @@ public void setUp() throws Exception { dataWriter = new DataWriter(dataStore, analyzer); dataWriter.clearData(); dataWriter.writeData(TestConstants.getSamplePeopleTuples()); - schema = dataStore.getSchema(); } @After @@ -179,7 +173,7 @@ public void testSingleWordQueryInTextField() throws Exception { //Prepare expected result list List list = new ArrayList<>(); - Span span = new Span("description", 0, 4, "TaLL", "Tall"); + Span span = new Span("description", 0, 4, "TaLL", "Tall",0); list.add(span); Attribute[] schemaAttributes = new Attribute[TestConstants.ATTRIBUTES_PEOPLE.length + 1]; @@ -274,10 +268,10 @@ public void testWordInMultipleFieldsQuery() throws Exception { //Prepare expected result list List list = new ArrayList<>(); Span span1 = new Span("lastName", 0, 11, "lin clooney", "lin clooney"); - Span span2 = new Span("description", 0, 3, "lin", "Lin"); - Span span3 = new Span("description", 25, 28, "lin", "lin"); - Span span4 = new Span("description", 4, 11, "clooney", "Clooney"); - Span span5 = new Span("description", 29, 36, "clooney", "clooney"); + Span span2 = new Span("description", 0, 3, "lin", "Lin", 0); + Span span3 = new Span("description", 25, 28, "lin", "lin", 5); + Span span4 = new Span("description", 4, 11, "clooney", "Clooney", 1); + Span span5 = new Span("description", 29, 36, "clooney", "clooney", 6); list.add(span1); list.add(span2); list.add(span3); diff --git 
a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java index 043c192b41c..0b2b1a6193a 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java @@ -1,6 +1,7 @@ package edu.uci.ics.textdb.dataflow.neextractor; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import edu.uci.ics.textdb.api.common.Attribute; @@ -212,7 +213,8 @@ public ISourceOperator getSourceOperator(Schema schema, List data) throw QueryParser queryParser = new QueryParser(NEExtractorTestConstants.ATTRIBUTES_ONE_SENTENCE.get(0).getFieldName(), analyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, + analyzer, Arrays.asList(NEExtractorTestConstants.ATTRIBUTES_ONE_SENTENCE.get(0))); dataReader = new DataReader(dataReaderPredicate); ISourceOperator sourceOperator = new ScanBasedSourceOperator(dataReader); diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java index cf4a5b56567..2762b4cba9a 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java @@ -1,5 +1,6 @@ package edu.uci.ics.textdb.dataflow.regexmatch; +import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.List; diff --git 
a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java index f70641a6c4a..3816d820f64 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java @@ -4,6 +4,7 @@ package edu.uci.ics.textdb.dataflow.source; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.Analyzer; @@ -37,15 +38,15 @@ public class IndexBasedSourceOperatorTest { private IDataWriter dataWriter; private IndexBasedSourceOperator indexBasedSourceOperator; private IDataStore dataStore; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private IPredicate dataReaderPredicate; @Before public void setUp() throws Exception { dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - dataWriter = new DataWriter(dataStore, analyzer); + luceneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, luceneAnalyzer); dataWriter.clearData(); dataWriter.writeData(TestConstants.getSamplePeopleTuples()); @@ -58,9 +59,11 @@ public void cleanUp() throws Exception { public void constructIndexBasedSourceOperator(String query) throws ParseException{ String defaultField = TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(); - QueryParser queryParser = new QueryParser(defaultField, analyzer); + QueryParser queryParser = new QueryParser(defaultField, luceneAnalyzer); Query queryObject = queryParser.parse(query); - dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject); + dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject, + query, luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); + 
indexBasedSourceOperator = new IndexBasedSourceOperator(dataReaderPredicate); } @@ -88,7 +91,7 @@ public void testTextSearchWithMultipleTokens() throws DataFlowException, ParseEx int numTuples = results.size(); Assert.assertEquals(3, numTuples); - boolean check = TestUtils.checkResults(results,"Tall,Brown" , this.analyzer,TestConstants.DESCRIPTION); + boolean check = TestUtils.checkResults(results,"Tall,Brown" , this.luceneAnalyzer,TestConstants.DESCRIPTION); Assert.assertTrue(check); } @@ -102,7 +105,7 @@ public void testTextSearchWithMultipleTokens() throws DataFlowException, ParseEx public void testTextSearchWithSingleToken() throws DataFlowException, ParseException { List results = getQueryResults(TestConstants.DESCRIPTION + ":angry"); int numTuples = results.size(); - boolean check = TestUtils.checkResults(results,"angry" , this.analyzer,TestConstants.DESCRIPTION); + boolean check = TestUtils.checkResults(results,"angry" , this.luceneAnalyzer,TestConstants.DESCRIPTION); Assert.assertTrue(check); Assert.assertEquals(3, numTuples); } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java index 6366ce9673d..66c042dd5f0 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java @@ -5,6 +5,7 @@ import java.text.ParseException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.Analyzer; @@ -40,19 +41,20 @@ public class ScanBasedSourceOperatorTest { private ScanBasedSourceOperator scanBasedSourceOperator; private IDataReader dataReader; private IDataStore dataStore; - private Analyzer analyzer; + private Analyzer lucneAnalyzer; private Query query; private IPredicate 
dataReaderPredicate; @Before public void setUp() throws Exception{ dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - dataWriter = new DataWriter(dataStore, analyzer ); + lucneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, lucneAnalyzer); QueryParser queryParser = new QueryParser( - TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); + TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), lucneAnalyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, + DataConstants.SCAN_QUERY, lucneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0])); dataReader = new DataReader(dataReaderPredicate); dataWriter.clearData(); diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java index 94888778eb3..22a96c5bc4b 100644 --- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java @@ -1,20 +1,35 @@ package edu.uci.ics.textdb.storage; +import edu.uci.ics.textdb.api.common.Attribute; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.Query; import edu.uci.ics.textdb.api.common.IPredicate; import edu.uci.ics.textdb.api.storage.IDataStore; +import java.util.List; + /** * Created by sandeepreddy602 on 05-06-2016. 
*/ public class DataReaderPredicate implements IPredicate { private IDataStore dataStore; private Query luceneQuery; + private String queryString; + private Analyzer luceneAnalyzer; + private List attributeList; + private boolean isSpanInformationAdded = false; - public DataReaderPredicate(IDataStore dataStore, Query luceneQuery){ + public DataReaderPredicate(IDataStore dataStore, Query luceneQuery, String queryString, Analyzer analyzer, List attributeList){ this.dataStore = dataStore; this.luceneQuery = luceneQuery; + this.luceneAnalyzer = analyzer; + this.queryString = queryString; + this.attributeList = attributeList; + } + + public void setIsSpanInformationAdded(boolean flag){ + isSpanInformationAdded = flag; } public IDataStore getDataStore() { @@ -24,4 +39,12 @@ public IDataStore getDataStore() { public Query getLuceneQuery() { return luceneQuery; } + + public String getQueryString(){return queryString;} + + public Analyzer getLuceneAnalyzer(){return luceneAnalyzer;} + + public List getAttributeList(){return attributeList;} + + public boolean getIsSpanInformationAdded(){return isSpanInformationAdded;} } diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java index 6f7266e43f6..a9827eddd06 100644 --- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java +++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java @@ -8,6 +8,7 @@ import java.util.ArrayList; import java.util.List; +import edu.uci.ics.textdb.common.field.Span; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -16,6 +17,11 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.index.Terms; +import 
org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.search.DocIdSetIterator; import edu.uci.ics.textdb.api.common.Attribute; import edu.uci.ics.textdb.api.common.FieldType; @@ -41,6 +47,13 @@ public class DataReader implements IDataReader{ private ScoreDoc[] scoreDocs; private IndexReader luceneIndexReader; private DataReaderPredicate dataReaderPredicate; + private ArrayList queryTokens; + private List attributeList; + private List queryTokensInBytesRef; + // The schema of the data tuple + private Schema schema; + //The schema of the data tuple along with the span information. + private Schema spanSchema; public DataReader(IPredicate dataReaderPredicate) { this.dataReaderPredicate = (DataReaderPredicate)dataReaderPredicate; @@ -59,6 +72,26 @@ public void open() throws DataFlowException { TopDocs topDocs = luceneIndexSearcher.search(dataReaderPredicate.getLuceneQuery(), Integer.MAX_VALUE); scoreDocs = topDocs.scoreDocs; cursor = OPENED; + + this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getLuceneAnalyzer(),dataReaderPredicate.getQueryString()); + + // Sort the query tokens, because the terms in a term vector are also sorted. + // This makes the seek faster. + this.queryTokens.sort(String.CASE_INSENSITIVE_ORDER); + + // The terms in the term vector are stored as BytesRef, + // hence convert token from String format to BytesRef and then search. 
+ + this.queryTokensInBytesRef = new ArrayList<>(); + for(String token: queryTokens) { + BytesRef byteRef = new BytesRef(token.toLowerCase().getBytes()); + this.queryTokensInBytesRef.add(byteRef); + } + + this.attributeList = dataReaderPredicate.getAttributeList(); + this.schema = dataReaderPredicate.getDataStore().getSchema(); + this.spanSchema = Utils.createSpanSchema(schema); + } catch (IOException e) { e.printStackTrace(); throw new DataFlowException(e.getMessage(), e); @@ -74,18 +107,77 @@ public ITuple getNextTuple() throws DataFlowException { if(cursor >= scoreDocs.length){ return null; } - Document document = luceneIndexSearcher.doc(scoreDocs[cursor++].doc); - + Document document = luceneIndexSearcher.doc(scoreDocs[cursor].doc); + List spanList = new ArrayList<>(); List fields = new ArrayList(); - Schema schema = dataReaderPredicate.getDataStore().getSchema(); + for (Attribute attr : schema.getAttributes()) { FieldType fieldType = attr.getFieldType(); String fieldValue = document.get(attr.getFieldName()); fields.add(Utils.getField(fieldType, fieldValue)); } - - DataTuple dataTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()])); + + // If the span Information is not requested, + // just return the dataTuple without span information. + + if(!dataReaderPredicate.getIsSpanInformationAdded()){ + cursor++; + DataTuple dataTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()])); + return dataTuple; + } + + // Create span information. + + for(Attribute attr: attributeList){ + + String fieldName = attr.getFieldName(); + // Get the term vector for the current field. + Terms vector = luceneIndexReader.getTermVector(scoreDocs[cursor].doc,fieldName); + + if (vector != null) { + TermsEnum vectorEnum = vector.iterator(); + int queryTokenIndex = 0; + // Search for all the query tokens in the term vector one by one. 
+ for(BytesRef term: queryTokensInBytesRef){ + + //If Term is found, calculate the position info and add to the Spans + if(vectorEnum.seekExact(term)){ + PostingsEnum postings = vectorEnum.postings(null, PostingsEnum.POSITIONS); + + while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int freq = postings.freq(); + // Create a new span for every occurrence. + while (freq-- > 0) { + int tokenOffset = postings.nextPosition(); + int start = postings.startOffset(); + int end = start+term.length; + String key = queryTokens.get(queryTokenIndex); + String value = document.get(fieldName).substring(start,end); + Span span = new Span(fieldName, start, end, key, value, tokenOffset); + spanList.add(span); + } + + } + + } + + queryTokenIndex++; + } + + + } + + + + } + + cursor++; + + + + ITuple dataTuple = Utils.getSpanTuple(fields, spanList, spanSchema); return dataTuple; + } catch (IOException e) { e.printStackTrace(); throw new DataFlowException(e.getMessage(), e); diff --git a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java index 73703c5f2af..9f128368ab0 100644 --- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java +++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java @@ -13,6 +13,8 @@ import edu.uci.ics.textdb.common.constants.DataConstants; import edu.uci.ics.textdb.common.constants.TestConstants; +import java.util.Arrays; + public class DataReaderPredicateTest { private DataReaderPredicate dataReaderPredicate; private IDataStore dataStore; @@ -24,7 +26,7 @@ public void setUp() throws ParseException{ QueryParser luceneQueryParser = new QueryParser( TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), new StandardAnalyzer()); luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new 
DataReaderPredicate(dataStore, luceneQuery); + dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,new StandardAnalyzer(), Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE)); } @Test diff --git a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java index 67ca872d986..3f55c5c61c3 100644 --- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java +++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java @@ -1,6 +1,7 @@ package edu.uci.ics.textdb.storage; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import junit.framework.Assert; @@ -28,18 +29,18 @@ public class DataWriterReaderTest { private IDataReader dataReader; private IDataStore dataStore; private IPredicate dataReaderPredicate; - private Analyzer analyzer; + private Analyzer luceneAnalyzer; private Query query; @Before public void setUp() throws ParseException{ dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE); - analyzer = new StandardAnalyzer(); - dataWriter = new DataWriter(dataStore, analyzer ); + luceneAnalyzer = new StandardAnalyzer(); + dataWriter = new DataWriter(dataStore, luceneAnalyzer); QueryParser queryParser = new QueryParser( - TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer); + TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), luceneAnalyzer); query = queryParser.parse(DataConstants.SCAN_QUERY); - dataReaderPredicate = new DataReaderPredicate(dataStore, query); + dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE)); dataReader = new DataReader(dataReaderPredicate); }