From 841212121194f126e5b19ca79a0d1ce41a4d1d45 Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Mon, 16 May 2016 19:26:33 -0700
Subject: [PATCH 1/8] Enabling Term Vectors

---
 .../uci/ics/textdb/common/utils/Utils.java    | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)
diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
index f175e7e00e1..8fb8dc5b43a 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
@@ -14,6 +14,7 @@
 import org.apache.lucene.document.DateTools.Resolution;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.IndexOptions;
 
 import edu.uci.ics.textdb.api.common.Attribute;
 import edu.uci.ics.textdb.api.common.FieldType;
@@ -49,7 +50,7 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par
             case TEXT:
                 field = new TextField(fieldValue);
                 break;
-            
+
             default:
                 break;
         }
@@ -57,10 +58,10 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par
     }
 
     public static IndexableField getLuceneField(FieldType fieldType,
-            String fieldName, Object fieldValue) {
+             String fieldName, Object fieldValue) {
         IndexableField luceneField = null;
         switch(fieldType){
-	        case STRING:
+            case STRING:
                 luceneField = new org.apache.lucene.document.StringField(
                         fieldName, (String) fieldValue, Store.YES);
                 break;
@@ -78,10 +79,20 @@ public static IndexableField getLuceneField(FieldType fieldType,
                 luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES);
                 break;
             case TEXT:
-	            luceneField = new org.apache.lucene.document.TextField(
-	                    fieldName, (String) fieldValue, Store.YES);
-	            break;
-            
+                org.apache.lucene.document.FieldType luceneFieldType = new org.apache.lucene.document.FieldType();
+                luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS );
+                luceneFieldType.setStored(true);
+                luceneFieldType.setStoreTermVectors( true );
+                luceneFieldType.setStoreTermVectorOffsets( true );
+                luceneFieldType.setStoreTermVectorPayloads( true );
+                luceneFieldType.setStoreTermVectorPositions( true );
+                luceneFieldType.setTokenized( true );
+
+                luceneField = new org.apache.lucene.document.Field(
+                        fieldName,(String) fieldValue,luceneFieldType);
+
+                break;
+
         }
         return luceneField;
     }
@@ -96,10 +107,10 @@ public static ITuple getSpanTuple( List<IField> fieldList, List<Span> spanList,
         IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]);
         return new DataTuple(spanSchema, fieldsDuplicate);
     }
-    
+
     /**
-     * 
-     * @param schema 
+     *
+     * @param schema
      * @about Creating a new schema object, and adding SPAN_LIST_ATTRIBUTE to
      *        the schema. SPAN_LIST_ATTRIBUTE is of type List
      */

From 9d6ced4daff332e11bdfcb87e285adc4334da8c7 Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Tue, 17 May 2016 15:04:14 -0700
Subject: [PATCH 2/8] Adding support for Position information in Data Reader

---
 .../ics/textdb/common/field/ListField.java    |   2 +-
 .../edu/uci/ics/textdb/common/field/Span.java |  21 ++-
 .../uci/ics/textdb/common/utils/Utils.java    |   6 +-
 .../dataflow/common/KeywordPredicate.java     |   2 +-
 .../dataflow/keywordmatch/KeywordMatcher.java | 126 +++++++-----------
 .../DictionaryMatcherTest.java                |   3 +-
 .../keywordmatch/KeywordMatcherTest.java      |  10 +-
 .../neextractor/NamedEntityExtractorTest.java |   4 +-
 .../regexmatch/RegexMatcherTestHelper.java    |   5 +-
 .../source/IndexBasedSourceOperatorTest.java  |   5 +-
 .../source/ScanBasedSourceOperatorTest.java   |   3 +-
 .../textdb/storage/DataReaderPredicate.java   |  25 +++-
 .../ics/textdb/storage/reader/DataReader.java |  89 ++++++++++++-
 .../storage/DataReaderPredicateTest.java      |   4 +-
 .../textdb/storage/DataWriterReaderTest.java  |   6 +-
 15 files changed, 205 insertions(+), 106 deletions(-)

diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java
index 5c538ca6df3..beb54f805be 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java
@@ -38,7 +38,7 @@ public boolean equals(Object obj) {
         if (list == null) {
             if (other.list != null)
                 return false;
-        } else if (!list.equals(other.list))
+        } else if (!list.containsAll(other.list))
             return false;
         return true;
     }
diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
index 173fea94fbe..455d9284a3e 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
@@ -11,14 +11,21 @@ public class Span {
     private String key;
     //The value matching the key
     private String value;
-    
-    
-    public Span(String fieldName, int start, int end, String key, String value) {
+    // The token position of the span
+    private int tokenOffset;
+
+    public Span(String fieldName, int start, int end, String key, String value){
         this.fieldName = fieldName;
         this.start = start;
         this.end = end;
         this.key = key;
         this.value = value;
+        this.tokenOffset = -1;
+    }
+
+    public Span(String fieldName, int start, int end, String key, String value, int tokenOffset) {
+        this(fieldName, start, end, key, value);
+        this.tokenOffset = tokenOffset;
     }
 
     public String getFieldName() {
@@ -41,6 +48,8 @@ public int getEnd() {
         return end;
     }
 
+    public  int getTokenOffset(){return tokenOffset;}
+
     @Override
     public int hashCode() {
         final int prime = 31;
@@ -51,6 +60,7 @@ public int hashCode() {
         result = prime * result + ((key == null) ? 0 : key.hashCode());
         result = prime * result + start;
         result = prime * result + ((value == null) ? 0 : value.hashCode());
+        result = prime * result + tokenOffset;
         return result;
     }
 
@@ -87,7 +97,10 @@ public boolean equals(Object obj) {
                 return false;
         } else if (!value.equals(other.value))
             return false;
-        
+
+        if(tokenOffset!= other.tokenOffset)
+            return false;
+
         return true;
     }
 }
diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
index 8fb8dc5b43a..9bb1fb93d51 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
@@ -141,8 +141,10 @@ public static ArrayList<String> tokenizeQuery(Analyzer analyzer, String query) {
         try{
             tokenStream.reset();
             while (tokenStream.incrementToken()) {
-                String term = charTermAttribute.toString();
-                resultSet.add(term);
+                String token = charTermAttribute.toString();
+                int tokenIndex = query.toLowerCase().indexOf(token);
+                String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length());
+                resultSet.add(actualQueryToken);
             }
             tokenStream.close();
         } catch (Exception e) {
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
index fc664e7aa5d..590d23fcc88 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
@@ -135,7 +135,7 @@ public Analyzer getAnalyzer(){
     }
 
     public DataReaderPredicate getDataReaderPredicate() {
-        DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery);
+        DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery, this.query, this.analyzer,this.attributeList);
         return dataReaderPredicate;
     }
 
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
index e384a19a5ca..2366d047945 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
@@ -13,7 +13,9 @@
 import edu.uci.ics.textdb.api.common.Schema;
 import edu.uci.ics.textdb.api.dataflow.IOperator;
 import edu.uci.ics.textdb.api.dataflow.ISourceOperator;
+import edu.uci.ics.textdb.common.constants.SchemaConstants;
 import edu.uci.ics.textdb.common.exception.DataFlowException;
+import edu.uci.ics.textdb.common.field.ListField;
 import edu.uci.ics.textdb.common.field.Span;
 import edu.uci.ics.textdb.common.field.StringField;
 import edu.uci.ics.textdb.common.field.TextField;
@@ -30,19 +32,14 @@
 public class KeywordMatcher implements IOperator {
     private final KeywordPredicate predicate;
     private ISourceOperator sourceOperator;
-    private List<Pattern> tokenPatternList;
-    private List<Span> spanList;
-    private List<Span> tempSpanList;
     private String query;
     private List<Attribute> attributeList;
     private List<String> queryTokens;
-    private Set<String> setOfQueryTokens;
-    private boolean spanSchemaDefined = false;
-    private Schema spanSchema;
 
     public KeywordMatcher(IPredicate predicate) {
         this.predicate = (KeywordPredicate)predicate;
         DataReaderPredicate dataReaderPredicate = this.predicate.getDataReaderPredicate();
+        dataReaderPredicate.setIsSpanInformationAdded(true);
         this.sourceOperator = new IndexBasedSourceOperator(dataReaderPredicate);
     }
 
@@ -53,17 +50,6 @@ public void open() throws DataFlowException {
             query = predicate.getQuery();
             attributeList = predicate.getAttributeList();
             queryTokens = predicate.getTokens();
-            setOfQueryTokens = new HashSet<>(queryTokens);
-            tokenPatternList = new ArrayList<Pattern>();
-            Pattern pattern;
-            String regex;
-            for(String token : queryTokens){
-                regex = "\\b" + token.toLowerCase() + "\\b";
-                pattern = Pattern.compile(regex);
-                tokenPatternList.add(pattern);
-            }
-            spanList = new ArrayList<>();
-            tempSpanList = new ArrayList<>();
 
         } catch (Exception e) {
             e.printStackTrace();
@@ -106,70 +92,55 @@ public void open() throws DataFlowException {
     @Override
     public ITuple getNextTuple() throws DataFlowException {
 
-        List<IField> fieldList;
-        Set<String> setOfFoundTokens = new HashSet<>();
         try {
             ITuple sourceTuple = sourceOperator.getNextTuple();
             if(sourceTuple == null){
                 return null;
             }
-            fieldList = sourceTuple.getFields();
-            spanList.clear();
-            if(!spanSchemaDefined){
-                Schema schema = sourceTuple.getSchema();
-                spanSchema = Utils.createSpanSchema(schema);
-                spanSchemaDefined = true;
-            }
-            for(int attributeIndex = 0; attributeIndex < attributeList.size(); attributeIndex++){
-                IField field = sourceTuple.getField(attributeList.get(attributeIndex).getFieldName());
-                String fieldValue = (String) (field).getValue();
-                String fieldName;
-                int positionIndex = 0; // Next position in the field to be checked.
-                int spanStartPosition; // Starting position of the matched query
-                if(field instanceof StringField){
+
+//            ITuple DataTuple = sourceTuple.
+
+            int schemaIndex = sourceTuple.getSchema().getIndex(SchemaConstants.SPAN_LIST_ATTRIBUTE.getFieldName());
+            List<Span> spanList =
+                    (List<Span>)sourceTuple.getField(schemaIndex).getValue();
+
+            for(int attributeIndex = 0; attributeIndex < attributeList.size(); attributeIndex++) {
+                String fieldName = attributeList.get(attributeIndex).getFieldName();
+                IField field = sourceTuple.getField(fieldName);
+                if (!(field instanceof TextField)) {
+
+                    String fieldValue = (String) (field).getValue();
+
                     //Keyword should match fieldValue entirely
-                    if(fieldValue.equalsIgnoreCase(query)){
-                        spanStartPosition = 0;
-                        positionIndex = query.length();
-                        fieldName = attributeList.get(attributeIndex).getFieldName();
-                        addSpanToSpanList(fieldName, spanStartPosition, positionIndex, query, fieldValue);
+                    if (fieldValue.equals(query)) {
+                        Span span = new Span(fieldName, 0, query.length(), query, fieldValue);
+                        spanList.add(span);
                     }
-                }
-                else if(field instanceof TextField) {
-                    //Each element of Array of keywords is matched in tokenized TextField Value
-                    for(int iter = 0; iter < queryTokens.size(); iter++) {
-                        positionIndex = 0;
-                        String queryToken = queryTokens.get(iter);
-                        //Ex: For keyword lin it obtains pattern like /blin/b which matches keywords at boundary
-                        Pattern tokenPattern = tokenPatternList.get(iter);
-                        Matcher matcher = tokenPattern.matcher(fieldValue.toLowerCase());
-                        while (matcher.find(positionIndex) != false) {
-                            spanStartPosition = matcher.start();
-                            positionIndex = spanStartPosition + queryToken.length();
-                            String documentValue = fieldValue.substring(spanStartPosition, positionIndex);
-                            fieldName = attributeList.get(attributeIndex).getFieldName();
-                            String actualQueryToken = query.substring(query.toLowerCase().indexOf(queryToken), query.toLowerCase().indexOf(queryToken)+queryToken.length());
-                            addSpanToTempSpanList(fieldName, spanStartPosition, positionIndex, actualQueryToken, documentValue);
-                            setOfFoundTokens.add(queryToken);
+                } else {
+                    // Check whether all the query tokens are present in that field;
+                    // if any token is missing, remove all the span information for that field.
+
+                    boolean[] tokensPresent = new boolean[queryTokens.size()];
+
+                    List<Span> spanForThisField = new ArrayList<>();
+
+                    for (Span span : spanList) {
+                        if (span.getFieldName().equals(fieldName)) {
+                            spanForThisField.add(span);
+                            if (queryTokens.contains(span.getKey()))
+                                tokensPresent[queryTokens.indexOf(span.getKey())] = true;
                         }
                     }
+
+                    boolean allTokenPresent = areAllTrue(tokensPresent);
+
+                    if (!allTokenPresent) {
+                        spanList.removeAll(spanForThisField);
+                    }
                 }
-                if (setOfFoundTokens.equals(setOfQueryTokens)){
-                    spanList.addAll(tempSpanList);
-                }
-                tempSpanList.clear();
             }
 
-            //If all the 'attributes to be searched' have been processed return the result tuple with span info
-            //if (foundFlag || setOfFoundTokens.equals(setOfQueryTokens)){
-            if(spanList.size()>0){
-                return Utils.getSpanTuple(fieldList, spanList, spanSchema);
-            }
-            //Search next document if the required predicate did not match previous document
-            else{
-                spanList.clear();
-                return getNextTuple();
-            }
+            return sourceTuple;
 
         } catch (Exception e) {
             e.printStackTrace();
@@ -178,17 +149,6 @@ else if(field instanceof TextField) {
 
     }
 
-    private void addSpanToSpanList(String fieldName, int start, int end, String key, String value) {
-        Span span = new Span(fieldName, start, end, key, value);
-        spanList.add(span);
-    }
-
-    private void addSpanToTempSpanList(String fieldName, int start, int end, String key, String value) {
-        Span span = new Span(fieldName, start, end, key, value);
-        tempSpanList.add(span);
-    }
-
-
     @Override
     public void close() throws DataFlowException {
         try {
@@ -198,4 +158,10 @@ public void close() throws DataFlowException {
             throw new DataFlowException(e.getMessage(), e);
         }
     }
+
+    public static boolean areAllTrue(boolean[] array)
+    {
+        for(boolean b : array) if(!b) return false;
+        return true;
+    }
 }
\ No newline at end of file
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java
index e2f2db61d88..4da4203ba36 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java
@@ -64,7 +64,8 @@ public void setUp() throws Exception {
         dataWriter = new DataWriter(dataStore, analyzer);
         QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer);
         luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery);
+        dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,
+                analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
         dataReader = new DataReader(dataReaderPredicate);
         dataWriter.clearData();
         dataWriter.writeData(TestConstants.getSamplePeopleTuples());
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java
index 3ed8043657b..896f15962f2 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java
@@ -179,7 +179,7 @@ public void testSingleWordQueryInTextField() throws Exception {
 
         //Prepare expected result list
         List<Span> list = new ArrayList<>();
-        Span span = new Span("description", 0, 4, "TaLL", "Tall");
+        Span span = new Span("description", 0, 4, "TaLL", "Tall",0);
         list.add(span);
         Attribute[] schemaAttributes = new Attribute[TestConstants.ATTRIBUTES_PEOPLE.length + 1];
 
@@ -274,10 +274,10 @@ public void testWordInMultipleFieldsQuery() throws Exception {
         //Prepare expected result list
         List<Span> list = new ArrayList<>();
         Span span1 = new Span("lastName", 0, 11, "lin clooney", "lin clooney");
-        Span span2 = new Span("description", 0, 3, "lin", "Lin");
-        Span span3 = new Span("description", 25, 28, "lin", "lin");
-        Span span4 = new Span("description", 4, 11, "clooney", "Clooney");
-        Span span5 = new Span("description", 29, 36, "clooney", "clooney");
+        Span span2 = new Span("description", 0, 3, "lin", "Lin",0);
+        Span span3 = new Span("description", 25, 28, "lin", "lin",5);
+        Span span4 = new Span("description", 4, 11, "clooney", "Clooney",1);
+        Span span5 = new Span("description", 29, 36, "clooney", "clooney",6);
         list.add(span1);
         list.add(span2);
         list.add(span3);
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java
index 829029e3eeb..26bebcf2b5f 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/neextractor/NamedEntityExtractorTest.java
@@ -1,6 +1,7 @@
 package edu.uci.ics.textdb.dataflow.neextractor;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -167,7 +168,8 @@ public ISourceOperator getSourceOperator(Schema schema, List<ITuple> data) throw
 
         QueryParser queryParser = new QueryParser(NEExtractorTestConstants.ATTRIBUTES_ONE_SENTENCE.get(0).getFieldName(), analyzer);
         query = queryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, query);
+        dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY,
+                analyzer, Arrays.asList(NEExtractorTestConstants.ATTRIBUTES_ONE_SENTENCE.get(0)));
         dataReader = new DataReader(dataReaderPredicate);
 
         ISourceOperator sourceOperator = new ScanBasedSourceOperator(dataReader);
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java
index d7c33ae495c..59cb074f19b 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcherTestHelper.java
@@ -1,6 +1,8 @@
 package edu.uci.ics.textdb.dataflow.regexmatch;
 
+import java.lang.reflect.Array;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -64,7 +66,8 @@ public void runTest(String regex, String fieldName) throws Exception {
 		QueryParser queryParser = new QueryParser(
                 TestConstants.FIRST_NAME, analyzer);
         query = queryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, query);
+        dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY,
+				analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
         dataReader = new DataReader(dataReaderPredicate);
 
 		IPredicate predicate = new RegexPredicate(regex, fieldName);
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java
index f70641a6c4a..56d88b16c35 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java
@@ -4,8 +4,10 @@
 package edu.uci.ics.textdb.dataflow.source;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
+import edu.uci.ics.textdb.api.common.Attribute;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.queryparser.classic.ParseException;
@@ -60,7 +62,8 @@ public void constructIndexBasedSourceOperator(String query) throws ParseExceptio
 	    String defaultField = TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName();
         QueryParser queryParser = new QueryParser(defaultField, analyzer);
         Query queryObject = queryParser.parse(query);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject);
+        dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject, query, analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
+
         indexBasedSourceOperator = new IndexBasedSourceOperator(dataReaderPredicate);
 	}
 
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java
index 6366ce9673d..29532f5aeaa 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java
@@ -5,6 +5,7 @@
 
 import java.text.ParseException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -52,7 +53,7 @@ public void setUp() throws Exception{
         QueryParser queryParser = new QueryParser(
                 TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer);
         query = queryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, query);
+        dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
         dataReader = new DataReader(dataReaderPredicate);
         
         dataWriter.clearData();
diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java
index 94888778eb3..e1bffb6d313 100644
--- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java
+++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java
@@ -1,20 +1,35 @@
 package edu.uci.ics.textdb.storage;
 
+import edu.uci.ics.textdb.api.common.Attribute;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.Query;
 
 import edu.uci.ics.textdb.api.common.IPredicate;
 import edu.uci.ics.textdb.api.storage.IDataStore;
 
+import java.util.List;
+
 /**
  * Created by sandeepreddy602 on 05-06-2016.
  */
 public class DataReaderPredicate implements IPredicate {
     private IDataStore dataStore;
     private Query luceneQuery;
+    private String queryString;
+    private Analyzer analyzer;
+    private List<Attribute> attributeList;
+    private boolean isSpanInformationAdded = false;
 
-    public DataReaderPredicate(IDataStore dataStore, Query luceneQuery){
+    public DataReaderPredicate(IDataStore dataStore, Query luceneQuery, String queryString, Analyzer analyzer, List<Attribute> attributeList){
         this.dataStore = dataStore;
         this.luceneQuery = luceneQuery;
+        this.analyzer = analyzer;
+        this.queryString = queryString;
+        this.attributeList  = attributeList;
+    }
+
+    public void setIsSpanInformationAdded(boolean flag){
+        isSpanInformationAdded = flag;
     }
 
     public IDataStore getDataStore() {
@@ -24,4 +39,12 @@ public IDataStore getDataStore() {
     public Query getLuceneQuery() {
         return luceneQuery;
     }
+
+    public String getQueryString(){return queryString;}
+
+    public Analyzer getAnalyzer(){return analyzer;}
+
+    public List<Attribute> getAttributeList(){return attributeList;}
+
+    public boolean getIsSpanInformationAdded(){return isSpanInformationAdded;}
 }
diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
index 6f7266e43f6..c28d3c99f18 100644
--- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
+++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
@@ -8,6 +8,7 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import edu.uci.ics.textdb.common.field.Span;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -16,6 +17,11 @@
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
 
 import edu.uci.ics.textdb.api.common.Attribute;
 import edu.uci.ics.textdb.api.common.FieldType;
@@ -29,6 +35,7 @@
 import edu.uci.ics.textdb.common.field.DataTuple;
 import edu.uci.ics.textdb.common.utils.Utils;
 import edu.uci.ics.textdb.storage.DataReaderPredicate;
+import org.apache.lucene.util.packed.PackedLongValues;
 
 /**
  * @author sandeepreddy602
@@ -41,6 +48,11 @@ public class DataReader implements IDataReader{
     private ScoreDoc[] scoreDocs;
     private IndexReader luceneIndexReader;
     private DataReaderPredicate dataReaderPredicate;
+    private ArrayList<String> queryTokens;
+    private List<Attribute> attributeList;
+    private List<BytesRef> queryTokensInBytesRef;
+    private Schema schema;
+    private Schema spanSchema;
 
     public DataReader(IPredicate dataReaderPredicate) {
         this.dataReaderPredicate = (DataReaderPredicate)dataReaderPredicate;
@@ -59,6 +71,22 @@ public void open() throws DataFlowException {
             TopDocs topDocs = luceneIndexSearcher.search(dataReaderPredicate.getLuceneQuery(), Integer.MAX_VALUE);
             scoreDocs = topDocs.scoreDocs;
             cursor = OPENED;
+
+            this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString());
+            // Sort the query tokens, as the term vectors are also sorted.
+            // This makes the seek faster.
+            this.queryTokens.sort(String.CASE_INSENSITIVE_ORDER);
+
+            this.queryTokensInBytesRef = new ArrayList<>();
+            for(String token: queryTokens) {
+                BytesRef byteRef = new BytesRef(token.toLowerCase().getBytes());
+                this.queryTokensInBytesRef.add(byteRef);
+            }
+
+            this.attributeList = dataReaderPredicate.getAttributeList();
+            this.schema = dataReaderPredicate.getDataStore().getSchema();
+            this.spanSchema = Utils.createSpanSchema(schema);
+
         } catch (IOException e) {
             e.printStackTrace();
             throw new DataFlowException(e.getMessage(), e);
@@ -74,18 +102,69 @@ public ITuple getNextTuple() throws DataFlowException {
             if(cursor >= scoreDocs.length){
                 return null;
             }
-            Document document = luceneIndexSearcher.doc(scoreDocs[cursor++].doc);
-            
+            Document document = luceneIndexSearcher.doc(scoreDocs[cursor].doc);
+            List<Span> spanList = new ArrayList<>();
             List<IField> fields = new ArrayList<IField>();
-            Schema schema = dataReaderPredicate.getDataStore().getSchema();
+
             for (Attribute  attr : schema.getAttributes()) {
                 FieldType fieldType = attr.getFieldType();
                 String fieldValue = document.get(attr.getFieldName());
                 fields.add(Utils.getField(fieldType, fieldValue));
             }
-            
-            DataTuple dataTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()]));
+
+            if(!dataReaderPredicate.getIsSpanInformationAdded()){
+                cursor++;
+                DataTuple dTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()]));
+                return  dTuple;
+            }
+
+            for(Attribute attr: attributeList){
+
+                String fieldName  = attr.getFieldName();
+                Terms vector = luceneIndexReader.getTermVector(scoreDocs[cursor].doc,fieldName);
+
+                if (vector != null) {
+                    TermsEnum vectorEnum = vector.iterator();
+                    int queryTokenIndex = 0;
+                    for(BytesRef term: queryTokensInBytesRef){
+
+                        if(vectorEnum.seekExact(term)){
+                            System.out.println(term.utf8ToString());
+                            PostingsEnum postings = vectorEnum.postings(null, PostingsEnum.POSITIONS);
+
+                            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                                int freq = postings.freq();
+                                while (freq-- > 0) {
+                                    int tokenOffset = postings.nextPosition();
+                                    int start = postings.startOffset();
+                                    int end = start+term.length;
+                                    String key = queryTokens.get(queryTokenIndex);
+                                    String value = document.get(fieldName).substring(start,end);
+                                    Span span = new Span(fieldName, start, end, key, value, tokenOffset);
+                                    spanList.add(span);
+                                }
+
+                            }
+
+                        }
+
+                        queryTokenIndex++;
+                    }
+
+
+                }
+
+
+
+            }
+
+            cursor++;
+
+
+
+            ITuple dataTuple  = Utils.getSpanTuple(fields, spanList, spanSchema);
             return dataTuple;
+
         } catch (IOException e) {
             e.printStackTrace();
             throw new DataFlowException(e.getMessage(), e);
diff --git a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java
index 73703c5f2af..9f128368ab0 100644
--- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java
+++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataReaderPredicateTest.java
@@ -13,6 +13,8 @@
 import edu.uci.ics.textdb.common.constants.DataConstants;
 import edu.uci.ics.textdb.common.constants.TestConstants;
 
+import java.util.Arrays;
+
 public class DataReaderPredicateTest {
     private DataReaderPredicate dataReaderPredicate;
     private IDataStore dataStore;
@@ -24,7 +26,7 @@ public void setUp() throws ParseException{
         QueryParser luceneQueryParser = new QueryParser(
                 TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), new  StandardAnalyzer());
         luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery);
+        dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,new StandardAnalyzer(), Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE));
     }
     
     @Test
diff --git a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java
index 67ca872d986..f2e0b989674 100644
--- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java
+++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java
@@ -1,6 +1,8 @@
 package edu.uci.ics.textdb.storage;
 
+import java.lang.reflect.Array;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import junit.framework.Assert;
@@ -23,6 +25,8 @@
 import edu.uci.ics.textdb.storage.reader.DataReader;
 import edu.uci.ics.textdb.storage.writer.DataWriter;
 
+import javax.xml.crypto.Data;
+
 public class DataWriterReaderTest {
     private IDataWriter dataWriter;
     private IDataReader dataReader;
@@ -39,7 +43,7 @@ public void setUp() throws ParseException{
         QueryParser queryParser = new QueryParser(
                 TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer);
         query = queryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, query);
+        dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY,analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE));
         dataReader = new DataReader(dataReaderPredicate);
     }
     

From 58b4062f4a083319615d27e4e06c9d2e0021d1ca Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Tue, 17 May 2016 15:07:12 -0700
Subject: [PATCH 3/8] Adding comment

---
 .../src/main/java/edu/uci/ics/textdb/common/utils/Utils.java    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
index 9bb1fb93d51..6c24b5c4272 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
@@ -79,6 +79,8 @@ public static IndexableField getLuceneField(FieldType fieldType,
                 luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES);
                 break;
             case TEXT:
+                //By default we enable positional indexing in Lucene so that we can return
+                // information about character offsets and token offsets
                 org.apache.lucene.document.FieldType luceneFieldType = new org.apache.lucene.document.FieldType();
                 luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS );
                 luceneFieldType.setStored(true);

From 219d0fabeeb41155901ba92de9822043fb4b9bb0 Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Tue, 17 May 2016 15:25:50 -0700
Subject: [PATCH 4/8] Merge from master

---
 .../edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java
index ecbaa6d847a..6a2cb681825 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java
@@ -72,7 +72,7 @@ public Analyzer getAnalyzer() {
     public IOperator getScanSourceOperator() throws ParseException, DataFlowException {
         QueryParser luceneQueryParser = new QueryParser(attributeList.get(0).getFieldName(), luceneAnalyzer);
         Query luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY);
-        IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery);
+        IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,luceneAnalyzer,attributeList);
         IDataReader dataReader = new DataReader(dataReaderPredicate);
 
         IOperator operator = new ScanBasedSourceOperator(dataReader);

From 1b0c1e497836373e5128ef8f831b5a0a8de053fa Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Tue, 17 May 2016 17:15:17 -0700
Subject: [PATCH 5/8] Adding comments and minor refactoring

---
 .../ics/textdb/common/field/ListField.java    |  2 +-
 .../edu/uci/ics/textdb/common/field/Span.java | 19 +++++++++++++++----
 .../uci/ics/textdb/common/utils/Utils.java    |  8 +++++---
 .../dataflow/common/DictionaryPredicate.java  |  4 ++--
 .../dataflow/common/KeywordPredicate.java     | 17 +++++++++--------
 .../dataflow/keywordmatch/KeywordMatcher.java | 14 ++------------
 .../main/resources/queryrewriter/wordsEn.txt  |  2 +-
 .../DictionaryMatcherTest.java                | 12 ++++++------
 .../keywordmatch/KeywordMatcherTest.java      | 14 ++++----------
 .../source/IndexBasedSourceOperatorTest.java  | 16 ++++++++--------
 .../source/ScanBasedSourceOperatorTest.java   | 11 ++++++-----
 .../ics/textdb/storage/reader/DataReader.java | 19 ++++++++++++++-----
 12 files changed, 73 insertions(+), 65 deletions(-)

diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java
index beb54f805be..56946ad3996 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/ListField.java
@@ -38,7 +38,7 @@ public boolean equals(Object obj) {
         if (list == null) {
             if (other.list != null)
                 return false;
-        } else if (!list.containsAll(other.list))
+        } else if ( !(list.containsAll(other.list) & other.list.containsAll(list)))
             return false;
         return true;
     }
diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
index 455d9284a3e..48a84ab476f 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
@@ -3,24 +3,35 @@
 public class Span {
     //The name of the field (in the tuple) where this span is present
     private String fieldName;
-    //The start of the span
+    //The start of the span. It is the position of the first character of span in the document.
     private int start;
-    //The end of the span
+    //The end of the span. It is the position just past the last character of span in the document (exclusive).
     private int end;
     //The key we are searching for eg: regex
     private String key;
     //The value matching the key
     private String value;
-    // The token position of the sapn
+    // The token position of the span
     private int tokenOffset;
 
+    /*
+    Example:
+        Value = "The quick brown fox jumps over the lazy dog"
+        Now the Span for brown should be
+        start = 10 : position of character 'b'
+        end = 15 : position just after character 'n' (the end is exclusive)
+        tokenOffset = 2 position of word 'brown'
+     */
+
+    public static int INVALID_TOKEN_OFFSET = -1;
+
     public Span(String fieldName, int start, int end, String key, String value){
         this.fieldName = fieldName;
         this.start = start;
         this.end = end;
         this.key = key;
         this.value = value;
-        this.tokenOffset = -1;
+        this.tokenOffset = INVALID_TOKEN_OFFSET;
     }
 
     public Span(String fieldName, int start, int end, String key, String value, int tokenOffset) {
diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
index 6c24b5c4272..fde4b6bed2b 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
@@ -130,14 +130,14 @@ public static Schema createSpanSchema(Schema schema) {
 
     /**
      * Tokenizes the query string using the given analyser
-     * @param analyzer
+     * @param luceneAnalyzer
      * @param query
      * @return ArrayList<String> list of results
      */
-    public static ArrayList<String> tokenizeQuery(Analyzer analyzer, String query) {
+    public static ArrayList<String> tokenizeQuery(Analyzer luceneAnalyzer, String query) {
         HashSet<String> resultSet = new HashSet<>();
         ArrayList<String> result = new ArrayList<String>();
-        TokenStream tokenStream  = analyzer.tokenStream(null, new StringReader(query));
+        TokenStream tokenStream  = luceneAnalyzer.tokenStream(null, new StringReader(query));
         CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
 
         try{
@@ -145,6 +145,8 @@ public static ArrayList<String> tokenizeQuery(Analyzer analyzer, String query) {
             while (tokenStream.incrementToken()) {
                 String token = charTermAttribute.toString();
                 int tokenIndex = query.toLowerCase().indexOf(token);
+                // Since tokens are converted to lower case,
+                // get the exact token from the query string.
                 String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length());
                 resultSet.add(actualQueryToken);
             }
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java
index 6a2cb681825..25cf76fa4e0 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/DictionaryPredicate.java
@@ -34,11 +34,11 @@ public class DictionaryPredicate implements IPredicate {
     New and York; if searched in String field we search for Exact string.
      */
 
-    public DictionaryPredicate(IDictionary dictionary, Analyzer analyzer, List<Attribute> attributeList,
+    public DictionaryPredicate(IDictionary dictionary, Analyzer luceneAnalyzer, List<Attribute> attributeList,
             SourceOperatorType srcOpType, IDataStore dataStore) {
 
         this.dictionary = dictionary;
-        this.luceneAnalyzer = analyzer;
+        this.luceneAnalyzer = luceneAnalyzer;
         this.attributeList = attributeList;
         this.srcOpType = srcOpType;
         this.dataStore = dataStore;
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
index 590d23fcc88..10e11ea888c 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
@@ -34,7 +34,7 @@ public class KeywordPredicate implements IPredicate{
     private final String query;
     private final Query luceneQuery;
     private ArrayList<String> tokens;
-    private Analyzer analyzer;
+    private Analyzer luceneAnalyzer;
     private IDataStore dataStore;
 
     /*
@@ -42,10 +42,10 @@ public class KeywordPredicate implements IPredicate{
     For Ex. New york if searched in TextField, we would consider both tokens
     New and York; if searched in String field we search for Exact string.
      */
-    public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer analyzer,IDataStore dataStore ) throws DataFlowException{
+    public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer luceneAnalyzer, IDataStore dataStore ) throws DataFlowException{
         try {
             this.query = query;
-            this.tokens = Utils.tokenizeQuery(analyzer, query);
+            this.tokens = Utils.tokenizeQuery(luceneAnalyzer, query);
             this.attributeList = attributeList;
             this.dataStore = dataStore;
             String[] temp = new String[attributeList.size()];
@@ -54,7 +54,7 @@ public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer an
                 temp[i] = attributeList.get(i).getFieldName();
             }
             this.fields = temp;
-            this.analyzer = analyzer;
+            this.luceneAnalyzer = luceneAnalyzer;
             this.luceneQuery = createLuceneQueryObject();
         } catch (Exception e) {
             e.printStackTrace();
@@ -105,7 +105,7 @@ and generate  boolean query (Textfield is Case Insensitive)
          */
         String[] remainingTextFields = (String[]) textFieldList.toArray(new String[0]);
         BooleanQuery queryOnTextFields = new BooleanQuery();
-        MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, analyzer);
+        MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, luceneAnalyzer);
 
         for(String searchToken : this.tokens){
             Query termQuery = parser.parse(searchToken);
@@ -130,12 +130,13 @@ public List<Attribute> getAttributeList() {
 
     public ArrayList<String> getTokens(){return this.tokens;}
 
-    public Analyzer getAnalyzer(){
-        return analyzer;
+    public Analyzer getLuceneAnalyzer(){
+        return luceneAnalyzer;
     }
 
     public DataReaderPredicate getDataReaderPredicate() {
-        DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery, this.query, this.analyzer,this.attributeList);
+        DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery,
+                this.query, this.luceneAnalyzer, this.attributeList);
         return dataReaderPredicate;
     }
 
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
index 2366d047945..6325be992a4 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
@@ -1,32 +1,24 @@
 package edu.uci.ics.textdb.dataflow.keywordmatch;
 
 import java.util.ArrayList;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import edu.uci.ics.textdb.api.common.Attribute;
 import edu.uci.ics.textdb.api.common.IField;
 import edu.uci.ics.textdb.api.common.IPredicate;
 import edu.uci.ics.textdb.api.common.ITuple;
-import edu.uci.ics.textdb.api.common.Schema;
 import edu.uci.ics.textdb.api.dataflow.IOperator;
 import edu.uci.ics.textdb.api.dataflow.ISourceOperator;
 import edu.uci.ics.textdb.common.constants.SchemaConstants;
 import edu.uci.ics.textdb.common.exception.DataFlowException;
-import edu.uci.ics.textdb.common.field.ListField;
 import edu.uci.ics.textdb.common.field.Span;
-import edu.uci.ics.textdb.common.field.StringField;
 import edu.uci.ics.textdb.common.field.TextField;
-import edu.uci.ics.textdb.common.utils.Utils;
 import edu.uci.ics.textdb.dataflow.common.KeywordPredicate;
 import edu.uci.ics.textdb.dataflow.source.IndexBasedSourceOperator;
 import edu.uci.ics.textdb.storage.DataReaderPredicate;
-import edu.uci.ics.textdb.storage.reader.DataReader;
 
 /**
  *  @author prakul
+ *  @author Akshay
  *
  */
 public class KeywordMatcher implements IOperator {
@@ -98,8 +90,6 @@ public ITuple getNextTuple() throws DataFlowException {
                 return null;
             }
 
-//            ITuple DataTuple = sourceTuple.
-
             int schemaIndex = sourceTuple.getSchema().getIndex(SchemaConstants.SPAN_LIST_ATTRIBUTE.getFieldName());
             List<Span> spanList =
                     (List<Span>)sourceTuple.getField(schemaIndex).getValue();
@@ -118,7 +108,7 @@ public ITuple getNextTuple() throws DataFlowException {
                     }
                 } else {
                     // Check if all the tokens are present in that field,
-                    // if any of the token is missing, remove all the span information for that field.
+                    // if any of the tokens is missing, remove all the span information for that field.
 
                     boolean[] tokensPresent = new boolean[queryTokens.size()];
 
diff --git a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt
index 05a3d743db3..1c0c6821b89 100644
--- a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt
+++ b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt
@@ -3240,7 +3240,7 @@ analytically
 analyzable
 analyze
 analyzed
-analyzer
+analyzer
 analyzers
 analyzes
 analyzing
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java
index 4da4203ba36..56d9d92ea87 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcherTest.java
@@ -52,7 +52,7 @@ public class DictionaryMatcherTest {
     private DataStore dataStore;
     private IDataWriter dataWriter;
     private IDataReader dataReader;
-    private Analyzer analyzer;
+    private Analyzer luceneAnalyzer;
     private Query luceneQuery;
     private IPredicate dataReaderPredicate;
 
@@ -60,12 +60,12 @@ public class DictionaryMatcherTest {
     public void setUp() throws Exception {
 
         dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE);
-        analyzer = new StandardAnalyzer();
-        dataWriter = new DataWriter(dataStore, analyzer);
-        QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer);
+        luceneAnalyzer = new StandardAnalyzer();
+        dataWriter = new DataWriter(dataStore, luceneAnalyzer);
+        QueryParser luceneQueryParser = new QueryParser(TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), luceneAnalyzer);
         luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,
-                analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
+        dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery, DataConstants.SCAN_QUERY,
+                luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
         dataReader = new DataReader(dataReaderPredicate);
         dataWriter.clearData();
         dataWriter.writeData(TestConstants.getSamplePeopleTuples());
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java
index 896f15962f2..f86ef37c134 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcherTest.java
@@ -35,11 +35,8 @@
 import edu.uci.ics.textdb.common.field.Span;
 import edu.uci.ics.textdb.common.field.StringField;
 import edu.uci.ics.textdb.common.field.TextField;
-import edu.uci.ics.textdb.common.utils.Utils;
 import edu.uci.ics.textdb.dataflow.common.KeywordPredicate;
-import edu.uci.ics.textdb.dataflow.source.IndexBasedSourceOperator;
 import edu.uci.ics.textdb.dataflow.utils.TestUtils;
-import edu.uci.ics.textdb.storage.DataReaderPredicate;
 import edu.uci.ics.textdb.storage.DataStore;
 import edu.uci.ics.textdb.storage.writer.DataWriter;
 
@@ -54,8 +51,6 @@ public class KeywordMatcherTest {
     private IDataWriter dataWriter;
     private DataStore dataStore;
     private Analyzer analyzer;
-    private Schema schema;
-    private IPredicate keywordPredicate;
 
     @Before
     public void setUp() throws Exception {
@@ -64,7 +59,6 @@ public void setUp() throws Exception {
         dataWriter = new DataWriter(dataStore, analyzer);
         dataWriter.clearData();
         dataWriter.writeData(TestConstants.getSamplePeopleTuples());
-        schema = dataStore.getSchema();
     }
 
     @After
@@ -274,10 +268,10 @@ public void testWordInMultipleFieldsQuery() throws Exception {
         //Prepare expected result list
         List<Span> list = new ArrayList<>();
         Span span1 = new Span("lastName", 0, 11, "lin clooney", "lin clooney");
-        Span span2 = new Span("description", 0, 3, "lin", "Lin",0);
-        Span span3 = new Span("description", 25, 28, "lin", "lin",5);
-        Span span4 = new Span("description", 4, 11, "clooney", "Clooney",1);
-        Span span5 = new Span("description", 29, 36, "clooney", "clooney",6);
+        Span span2 = new Span("description", 0, 3, "lin", "Lin", 0);
+        Span span3 = new Span("description", 25, 28, "lin", "lin", 5);
+        Span span4 = new Span("description", 4, 11, "clooney", "Clooney", 1);
+        Span span5 = new Span("description", 29, 36, "clooney", "clooney", 6);
         list.add(span1);
         list.add(span2);
         list.add(span3);
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java
index 56d88b16c35..3816d820f64 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/IndexBasedSourceOperatorTest.java
@@ -7,7 +7,6 @@
 import java.util.Arrays;
 import java.util.List;
 
-import edu.uci.ics.textdb.api.common.Attribute;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.queryparser.classic.ParseException;
@@ -39,15 +38,15 @@ public class IndexBasedSourceOperatorTest {
 	private IDataWriter dataWriter;
 	private IndexBasedSourceOperator indexBasedSourceOperator;
 	private IDataStore dataStore;
-	private Analyzer analyzer;
+	private Analyzer luceneAnalyzer;
     private IPredicate dataReaderPredicate;
 
 
 	@Before
 	public void setUp() throws Exception {
 		dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE);
-		analyzer = new StandardAnalyzer();
-		dataWriter = new DataWriter(dataStore, analyzer);
+		luceneAnalyzer = new StandardAnalyzer();
+		dataWriter = new DataWriter(dataStore, luceneAnalyzer);
 		dataWriter.clearData();
 		dataWriter.writeData(TestConstants.getSamplePeopleTuples());
 
@@ -60,9 +59,10 @@ public void cleanUp() throws Exception {
 	
 	public void constructIndexBasedSourceOperator(String query) throws ParseException{
 	    String defaultField = TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName();
-        QueryParser queryParser = new QueryParser(defaultField, analyzer);
+        QueryParser queryParser = new QueryParser(defaultField, luceneAnalyzer);
         Query queryObject = queryParser.parse(query);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject, query, analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
+        dataReaderPredicate = new DataReaderPredicate(dataStore, queryObject,
+				query, luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
 
         indexBasedSourceOperator = new IndexBasedSourceOperator(dataReaderPredicate);
 	}
@@ -91,7 +91,7 @@ public void testTextSearchWithMultipleTokens() throws DataFlowException, ParseEx
 		int numTuples = results.size();
 		Assert.assertEquals(3, numTuples);
 
-		boolean check = TestUtils.checkResults(results,"Tall,Brown" , this.analyzer,TestConstants.DESCRIPTION);
+		boolean check = TestUtils.checkResults(results,"Tall,Brown" , this.luceneAnalyzer,TestConstants.DESCRIPTION);
 		Assert.assertTrue(check);
 	}
 
@@ -105,7 +105,7 @@ public void testTextSearchWithMultipleTokens() throws DataFlowException, ParseEx
 	public void testTextSearchWithSingleToken() throws DataFlowException, ParseException {
 		List<ITuple> results = getQueryResults(TestConstants.DESCRIPTION + ":angry");
 		int numTuples = results.size();
-		boolean check = TestUtils.checkResults(results,"angry" , this.analyzer,TestConstants.DESCRIPTION);
+		boolean check = TestUtils.checkResults(results,"angry" , this.luceneAnalyzer,TestConstants.DESCRIPTION);
 		Assert.assertTrue(check);
 		Assert.assertEquals(3, numTuples);
 	}
diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java
index 29532f5aeaa..66c042dd5f0 100644
--- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java
+++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/source/ScanBasedSourceOperatorTest.java
@@ -41,19 +41,20 @@ public class ScanBasedSourceOperatorTest {
     private ScanBasedSourceOperator scanBasedSourceOperator;
     private IDataReader dataReader;
     private IDataStore dataStore;
-    private Analyzer analyzer;
+    private Analyzer lucneAnalyzer;
     private Query query;
     private IPredicate dataReaderPredicate;
     
     @Before
     public void setUp() throws Exception{
         dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE);
-        analyzer = new  StandardAnalyzer();
-        dataWriter = new DataWriter(dataStore, analyzer );
+        lucneAnalyzer = new  StandardAnalyzer();
+        dataWriter = new DataWriter(dataStore, lucneAnalyzer);
         QueryParser queryParser = new QueryParser(
-                TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer);
+                TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), lucneAnalyzer);
         query = queryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
+        dataReaderPredicate = new DataReaderPredicate(dataStore, query,
+                DataConstants.SCAN_QUERY, lucneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE[0]));
         dataReader = new DataReader(dataReaderPredicate);
         
         dataWriter.clearData();
diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
index c28d3c99f18..fa606290a43 100644
--- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
+++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
@@ -35,7 +35,6 @@
 import edu.uci.ics.textdb.common.field.DataTuple;
 import edu.uci.ics.textdb.common.utils.Utils;
 import edu.uci.ics.textdb.storage.DataReaderPredicate;
-import org.apache.lucene.util.packed.PackedLongValues;
 
 /**
  * @author sandeepreddy602
@@ -51,7 +50,9 @@ public class DataReader implements IDataReader{
     private ArrayList<String> queryTokens;
     private List<Attribute> attributeList;
     private List<BytesRef> queryTokensInBytesRef;
+    // The schema of the data tuple
     private Schema schema;
+    //The schema o the data tuple along with the span information.
     private Schema spanSchema;
 
     public DataReader(IPredicate dataReaderPredicate) {
@@ -73,7 +74,7 @@ public void open() throws DataFlowException {
             cursor = OPENED;
 
             this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString());
-            // sort the query token, as the term vector are also sorted.
+            // sort the query tokens, as the term vector are also sorted.
             // This makes the seek faster.
             this.queryTokens.sort(String.CASE_INSENSITIVE_ORDER);
 
@@ -112,28 +113,36 @@ public ITuple getNextTuple() throws DataFlowException {
                 fields.add(Utils.getField(fieldType, fieldValue));
             }
 
+            // If the span Information is not requested,
+            // just return the dataTuple without span information.
+
             if(!dataReaderPredicate.getIsSpanInformationAdded()){
                 cursor++;
-                DataTuple dTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()]));
-                return  dTuple;
+                DataTuple dataTuple = new DataTuple(schema, fields.toArray(new IField[fields.size()]));
+                return  dataTuple;
             }
 
+            // Create span information.
+
             for(Attribute attr: attributeList){
 
                 String fieldName  = attr.getFieldName();
+                // Get the term vector fot the current field.
                 Terms vector = luceneIndexReader.getTermVector(scoreDocs[cursor].doc,fieldName);
 
                 if (vector != null) {
                     TermsEnum vectorEnum = vector.iterator();
                     int queryTokenIndex = 0;
+                    // Search for all the query tokens in the term vector one by one.
                     for(BytesRef term: queryTokensInBytesRef){
 
+                        //If Term is found, calculate the position info and add to the Spans
                         if(vectorEnum.seekExact(term)){
-                            System.out.println(term.utf8ToString());
                             PostingsEnum postings = vectorEnum.postings(null, PostingsEnum.POSITIONS);
 
                             while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                                 int freq = postings.freq();
+                                // Create a new span for every occurrence.
                                 while (freq-- > 0) {
                                     int tokenOffset = postings.nextPosition();
                                     int start = postings.startOffset();

From 0e6ec5ae64fb2f4edae0b88ce0df86ed0e3db32a Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Tue, 17 May 2016 17:55:22 -0700
Subject: [PATCH 6/8] Adding comments

---
 .../uci/ics/textdb/dataflow/common/KeywordPredicate.java    | 1 +
 .../ics/textdb/dataflow/keywordmatch/KeywordMatcher.java    | 1 +
 .../java/edu/uci/ics/textdb/storage/reader/DataReader.java  | 6 +++++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
index 10e11ea888c..3541c681b3c 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
@@ -126,6 +126,7 @@ public String getQuery(){
     public List<Attribute> getAttributeList() {
         return attributeList;
     }
+
     public Query getQueryObject(){return this.luceneQuery;}
 
     public ArrayList<String> getTokens(){return this.tokens;}
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
index 6325be992a4..c4c02bf385a 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
@@ -110,6 +110,7 @@ public ITuple getNextTuple() throws DataFlowException {
                     // Check if all the tokens are present in that field,
                     // if any of the tokens is missing, remove all the span information for that field.
 
+                    //By default, initialized to false.
                     boolean[] tokensPresent = new boolean[queryTokens.size()];
 
                     List<Span> spanForThisField = new ArrayList<>();
diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
index fa606290a43..2af82b0f82c 100644
--- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
+++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
@@ -52,7 +52,7 @@ public class DataReader implements IDataReader{
     private List<BytesRef> queryTokensInBytesRef;
     // The schema of the data tuple
     private Schema schema;
-    //The schema o the data tuple along with the span information.
+    //The schema of the data tuple along with the span information.
     private Schema spanSchema;
 
     public DataReader(IPredicate dataReaderPredicate) {
@@ -74,10 +74,14 @@ public void open() throws DataFlowException {
             cursor = OPENED;
 
             this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString());
+
             // sort the query tokens, as the term vector are also sorted.
             // This makes the seek faster.
             this.queryTokens.sort(String.CASE_INSENSITIVE_ORDER);
 
+            // The terms in the term vector are stored as BytesRef,
+            // hence convert each token from String to BytesRef before searching.
+
             this.queryTokensInBytesRef = new ArrayList<>();
             for(String token: queryTokens) {
                 BytesRef byteRef = new BytesRef(token.toLowerCase().getBytes());

From e60a943fc0a0be7f73a653795c9d0bbb7667c51c Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Wed, 18 May 2016 13:42:28 -0700
Subject: [PATCH 7/8] Adding comments

---
 .../java/edu/uci/ics/textdb/common/field/Span.java    | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
index 48a84ab476f..dd03540e505 100644
--- a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
+++ b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/field/Span.java
@@ -3,23 +3,24 @@
 public class Span {
     //The name of the field (in the tuple) where this span is present
     private String fieldName;
-    //The start of the span. It is the position of the first character of span in the document.
+    //The start position of the span, which is the offset of the gap before the first character of the span.
     private int start;
-    //The end of the span.It is the position of the first character of span in the document
+    //The end position of the span, which is the offset of the gap after the last character of the span.
     private int end;
     //The key we are searching for eg: regex
     private String key;
     //The value matching the key
     private String value;
-    // The token position of the span
+    // The token position of the span, starting from 0.
     private int tokenOffset;
 
     /*
     Example:
         Value = "The quick brown fox jumps over the lazy dog"
         Now the Span for brown should be
-        start = 10 : position of character 'b'
-        end = 15 : position of character 'n'
+        start = 10 : index of character 'b'
+        end = 15 : index of character 'n' + 1, or equivalently start + length
+                Both of them result in the same value.
         tokenOffset = 2 position of word 'brown'
      */
 

From 6c3ce95cc00201fc3387f31ddec13606add02256 Mon Sep 17 00:00:00 2001
From: Akshay Jain <akshaybetala@gmail.com>
Date: Wed, 18 May 2016 16:47:21 -0700
Subject: [PATCH 8/8] Minor changes and comments

---
 .../ics/textdb/dataflow/common/RegexPredicate.java  |  7 ++++++-
 .../textdb/dataflow/regexmatch/RegexMatcher.java    |  5 ++++-
 .../src/main/resources/queryrewriter/wordsEn.txt    |  2 +-
 .../uci/ics/textdb/storage/DataReaderPredicate.java |  6 +++---
 .../uci/ics/textdb/storage/reader/DataReader.java   |  4 ++--
 .../ics/textdb/storage/DataWriterReaderTest.java    | 13 +++++--------
 6 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java
index c0059ec03e8..1d53e75d202 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/RegexPredicate.java
@@ -23,7 +23,7 @@ public class RegexPredicate implements IPredicate {
 
 	private String regex;
 	private List<String> fieldNameList;
-
+	private List<Attribute> attributeList;
 	private Analyzer luceneAnalyzer;
 	private IDataStore dataStore;
 
@@ -31,6 +31,7 @@ public RegexPredicate(String regex, List<Attribute> attributeList, Analyzer anal
 		this.regex = regex;
 		this.luceneAnalyzer = analyzer;
 		this.dataStore = dataStore;
+		this.attributeList = attributeList;
 		this.fieldNameList = attributeList.stream()
 				.filter(attr -> (attr.getFieldType() == FieldType.TEXT || attr.getFieldType() == FieldType.STRING))
 				.map(attr -> attr.getFieldName()).collect(Collectors.toList());
@@ -52,4 +53,8 @@ public List<String> getFieldNameList() {
 		return this.fieldNameList;
 	}
 
+	public List<Attribute> getAttributeList() {
+		return attributeList;
+	}
+
 }
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java
index 152388db0a1..ec75fe304b3 100644
--- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java
+++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexMatcher.java
@@ -3,6 +3,7 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import edu.stanford.nlp.patterns.Data;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
 import org.apache.lucene.queryparser.classic.ParseException;
@@ -74,7 +75,9 @@ public RegexMatcher(IPredicate predicate) throws DataFlowException{
 				this.luceneQuery = generateLuceneQuery(regex, fieldNameList,
 						DataConstants.SCAN_QUERY);
 			}
-			this.sourceOperator = new IndexBasedSourceOperator(new DataReaderPredicate(dataStore, luceneQuery));
+			DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,
+					DataConstants.SCAN_QUERY, luceneAnalyzer, regexPredicate.getAttributeList());
+			this.sourceOperator = new IndexBasedSourceOperator(dataReaderPredicate);
 		} catch (ParseException | java.util.regex.PatternSyntaxException e) {
 			throw new DataFlowException(e.getMessage(), e);
 		}
diff --git a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt
index 1c0c6821b89..05a3d743db3 100644
--- a/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt
+++ b/textdb/textdb-dataflow/src/main/resources/queryrewriter/wordsEn.txt
@@ -3240,7 +3240,7 @@ analytically
 analyzable
 analyze
 analyzed
-luceneAnalyzer
+analyzer
 analyzers
 analyzes
 analyzing
diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java
index e1bffb6d313..22a96c5bc4b 100644
--- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java
+++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/DataReaderPredicate.java
@@ -16,14 +16,14 @@ public class DataReaderPredicate implements IPredicate {
     private IDataStore dataStore;
     private Query luceneQuery;
     private String queryString;
-    private Analyzer analyzer;
+    private Analyzer luceneAnalyzer;
     private List<Attribute> attributeList;
     private boolean isSpanInformationAdded = false;
 
     public DataReaderPredicate(IDataStore dataStore, Query luceneQuery, String queryString, Analyzer analyzer, List<Attribute> attributeList){
         this.dataStore = dataStore;
         this.luceneQuery = luceneQuery;
-        this.analyzer = analyzer;
+        this.luceneAnalyzer = analyzer;
         this.queryString = queryString;
         this.attributeList  = attributeList;
     }
@@ -42,7 +42,7 @@ public Query getLuceneQuery() {
 
     public String getQueryString(){return queryString;}
 
-    public Analyzer getAnalyzer(){return analyzer;}
+    public Analyzer getLuceneAnalyzer(){return luceneAnalyzer;}
 
     public List<Attribute> getAttributeList(){return attributeList;}
 
diff --git a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
index 2af82b0f82c..a9827eddd06 100644
--- a/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
+++ b/textdb/textdb-storage/src/main/java/edu/uci/ics/textdb/storage/reader/DataReader.java
@@ -73,7 +73,7 @@ public void open() throws DataFlowException {
             scoreDocs = topDocs.scoreDocs;
             cursor = OPENED;
 
-            this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getAnalyzer(),dataReaderPredicate.getQueryString());
+            this.queryTokens = Utils.tokenizeQuery(dataReaderPredicate.getLuceneAnalyzer(),dataReaderPredicate.getQueryString());
 
             // sort the query tokens, as the term vector are also sorted.
             // This makes the seek faster.
@@ -131,7 +131,7 @@ public ITuple getNextTuple() throws DataFlowException {
             for(Attribute attr: attributeList){
 
                 String fieldName  = attr.getFieldName();
-                // Get the term vector fot the current field.
+                // Get the term vector for the current field.
                 Terms vector = luceneIndexReader.getTermVector(scoreDocs[cursor].doc,fieldName);
 
                 if (vector != null) {
diff --git a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java
index f2e0b989674..3f55c5c61c3 100644
--- a/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java
+++ b/textdb/textdb-storage/src/test/java/edu/uci/ics/textdb/storage/DataWriterReaderTest.java
@@ -1,6 +1,5 @@
 package edu.uci.ics.textdb.storage;
 
-import java.lang.reflect.Array;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -25,25 +24,23 @@
 import edu.uci.ics.textdb.storage.reader.DataReader;
 import edu.uci.ics.textdb.storage.writer.DataWriter;
 
-import javax.xml.crypto.Data;
-
 public class DataWriterReaderTest {
     private IDataWriter dataWriter;
     private IDataReader dataReader;
     private IDataStore dataStore;
     private IPredicate dataReaderPredicate;
-    private Analyzer analyzer;
+    private Analyzer luceneAnalyzer;
     private Query query;
     
     @Before
     public void setUp() throws ParseException{
         dataStore = new DataStore(DataConstants.INDEX_DIR, TestConstants.SCHEMA_PEOPLE);
-        analyzer = new  StandardAnalyzer();
-        dataWriter = new DataWriter(dataStore, analyzer );
+        luceneAnalyzer = new  StandardAnalyzer();
+        dataWriter = new DataWriter(dataStore, luceneAnalyzer);
         QueryParser queryParser = new QueryParser(
-                TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), analyzer);
+                TestConstants.ATTRIBUTES_PEOPLE[0].getFieldName(), luceneAnalyzer);
         query = queryParser.parse(DataConstants.SCAN_QUERY);
-        dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY,analyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE));
+        dataReaderPredicate = new DataReaderPredicate(dataStore, query, DataConstants.SCAN_QUERY, luceneAnalyzer, Arrays.asList(TestConstants.ATTRIBUTES_PEOPLE));
         dataReader = new DataReader(dataReaderPredicate);
     }