Merge branch 'develop'

amaiya · Jul 17, 2020 · 9cdbc22 · 9cdbc22
2 parents 457c3b9 + 6338d94
commit 9cdbc22
Show file tree

Hide file tree

Showing 9 changed files with 77 additions and 32 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,20 @@ Most recent releases are shown at the top. Each release shows:
 - **Changed**: Additional parameters, changes to inputs or outputs, etc
 - **Fixed**: Bug fixes that don't change documented behaviour
 
+## 0.18.4 (2020-07-17)
+
+### New:
+- N/A
+
+### Changed
+- N/A
+
+### Fixed:
+- return gracefully if no documents match question in `qa` module
+- tokenize question in `qa` module to ensure all candidate documents are returned
+- Added error in `text.preprocessor` when training set has incomplete integer labels
+
+
 ## 0.18.3 (2020-07-12)
 
 ### New:

diff --git a/FAQ.md b/FAQ.md
@@ -35,6 +35,8 @@
 
 - [Can I use `tf.data.Dataset` instances with *ktrain*?](#can-i-use-tfdatadataset-instances-with-ktrain)
 
+- [Why am I seeing a "list index out of range" error when calling predict?](#why-am-i-seeing-a-list-index-out-of-range-error-when-calling-predict)
+
 
 
 ## Evaluation, Inspection, and Prediction
@@ -480,6 +482,15 @@ See [this tutorial](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/maste
 [[Back to Top](#frequently-asked-questions-about-ktrain)]
 
 
+### Why am I seeing a "list index out of range" error when calling predict?
+
+The set of integer labels in your training set needs to be complete and consecutive (e.g., `[0,1]` or `[0,1,2,3,4]`, but not `[0, 3]`). See [this post](https://github.com/amaiya/ktrain/issues/116#issuecomment-614864565).
+
+
+[[Back to Top](#frequently-asked-questions-about-ktrain)]
+
+
+
 ### Why am I seeing an ERROR when installing *ktrain* on Google Colab?
 
 These errors (e.g., `has requirement gast>=0.3.2, but you'll have gast 0.2.2 which is incompatible`) are related to TensorFlow, can usually be safely ignored, and shouldn't affect operation of *ktrain*.

diff --git a/examples/text/question_answering_with_bert.ipynb b/examples/text/question_answering_with_bert.ipynb
@@ -96,7 +96,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For documents sets that are too large to be loaded into a Python list, you can use `SimpleQA.index_from_folder`, which will crawl a folder and index all plain text documents.\n",
+    "For document sets that are too large to be loaded into a Python list, you can use `SimpleQA.index_from_folder`, which will crawl a folder and index all plain text documents (e.g., `.txt` files).\n",
     "\n",
     "By default, `index_from_list` and `index_from_folder` use a single processor (`procs=1`) with each processor using a maximum of 256MB of memory (`limitmb=256`) and merging results into a single segment (`multisegment=False`).  These values can be changed to speedup indexing as arguments to `index_from_list` or `index_from_folder`.  See the [whoosh documentation](https://whoosh.readthedocs.io/en/latest/batch.html) for more information on these parameters and how to use them to speedup indexing.\n",
     "\n",
@@ -179,14 +179,14 @@
        "      <th>0</th>\n",
        "      <td>in october of 1997</td>\n",
        "      <td><div> cassini is scheduled for launch aboard a titan iv / centaur  <font color='red'>in october of 1997</font> .</div></td>\n",
-       "      <td>0.348673</td>\n",
+       "      <td>0.348675</td>\n",
        "      <td>59</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>on january 26,1962</td>\n",
        "      <td><div>ranger 3, launched  <font color='red'>on january 26,1962</font> , was intended to land an instrument capsule on the surface of the moon, but problems during the launch caused the probe to miss the moon and head into solar orbit.</div></td>\n",
-       "      <td>0.195162</td>\n",
+       "      <td>0.195161</td>\n",
        "      <td>8525</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -207,7 +207,7 @@
        "      <th>4</th>\n",
        "      <td>2001</td>\n",
        "      <td><div> possible launch dates : 1996 for imaging orbiter,  <font color='red'>2001</font>  for rover.</div></td>\n",
-       "      <td>0.069741</td>\n",
+       "      <td>0.069740</td>\n",
        "      <td>59</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -534,36 +534,36 @@
        "      <th>0</th>\n",
        "      <td>is god incarnate</td>\n",
        "      <td><div>jesus isn ' t god ? when jesus returns some people may miss him ? what version of the bible do you read mike ? jesus  <font color='red'>is god incarnate</font>  (in flesh).</div></td>\n",
-       "      <td>0.569719</td>\n",
+       "      <td>0.482224</td>\n",
        "      <td>6356</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>the incarnation of the son</td>\n",
-       "      <td><div> jesus is  <font color='red'>the incarnation of the son</font> .</div></td>\n",
-       "      <td>0.328918</td>\n",
-       "      <td>11661</td>\n",
+       "      <td>jesus god only of the jews</td>\n",
+       "      <td><div>which is more important : 1) the recorded word of jesus or 2) indications that you can deduce from the bible ? was  <font color='red'>jesus god only of the jews</font> , or god of all humankind of all race and sex ?</div></td>\n",
+       "      <td>0.164358</td>\n",
+       "      <td>7842</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>is god ' s son</td>\n",
-       "      <td><div>) you seem to be suggesting the jesus  <font color='red'>is god ' s son</font>  in a physical sense, with the holy spirit as father and mary as mother.</div></td>\n",
-       "      <td>0.069266</td>\n",
+       "      <td>was god in human form</td>\n",
+       "      <td><div> first question is, if jesus  <font color='red'>was god in human form</font> , how could he really be god ' s son ? if the holy ghost \" planted the seed \" in mary, so to speak, then it seems that jesus ' relationship to god would be the equivalent to the human father / son relationship.</div></td>\n",
+       "      <td>0.109961</td>\n",
        "      <td>11661</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>was god ' s only begotten son</td>\n",
-       "      <td><div> the fact that jesus  <font color='red'>was god ' s only begotten son</font>  does not seem to me to have much meaning since god can beget as many sons as he wants to.</div></td>\n",
-       "      <td>0.016456</td>\n",
-       "      <td>11661</td>\n",
+       "      <td>was magus from the east</td>\n",
+       "      <td><div>who acknowledged this fact ? on what basis ? are we extra biblical at this point ? why not also acknowledge that the bhagavad gita is the only relevant text for gentiles, after all we see in the bible that it  <font color='red'>was magus from the east</font>  who observed the star signs of jesus ? why bother with any texts at all ? why not just follow whatever the church has to say ?</div></td>\n",
+       "      <td>0.082453</td>\n",
+       "      <td>7842</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>jesus god only of the jews</td>\n",
-       "      <td><div>which is more important : 1) the recorded word of jesus or 2) indications that you can deduce from the bible ? was  <font color='red'>jesus god only of the jews</font> , or god of all humankind of all race and sex ?</div></td>\n",
-       "      <td>0.005702</td>\n",
-       "      <td>7842</td>\n",
+       "      <td>the incarnation of the son</td>\n",
+       "      <td><div> jesus is  <font color='red'>the incarnation of the son</font> .</div></td>\n",
+       "      <td>0.065281</td>\n",
+       "      <td>11661</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>"
@@ -577,7 +577,7 @@
     }
    ],
    "source": [
-    "answers = qa.ask('Who was Jesus Christ?')\n",
+    "answers = qa.ask('Who was Jesus?')\n",
     "qa.display_answers(answers[:5])"
    ]
   },
@@ -625,35 +625,35 @@
        "      <th>0</th>\n",
        "      <td>that not all display programs do gamma correction</td>\n",
        "      <td><div>the problem is  <font color='red'>that not all display programs do gamma correction</font> .</div></td>\n",
-       "      <td>0.848456</td>\n",
+       "      <td>0.848914</td>\n",
        "      <td>13873</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>if your viewer does not do gamma correction</td>\n",
        "      <td><div> <font color='red'>if your viewer does not do gamma correction</font> , then linear images will look too dark, and gamma corrected images will ok.</div></td>\n",
-       "      <td>0.042678</td>\n",
+       "      <td>0.042701</td>\n",
        "      <td>13873</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>altering the intensity in the hsv controls</td>\n",
        "      <td><div>  <font color='red'>altering the intensity in the hsv controls</font>  does not do the right thing, as it fails to take account of the effect gamma has on h and s.</div></td>\n",
-       "      <td>0.040854</td>\n",
+       "      <td>0.040876</td>\n",
        "      <td>13873</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>is gamma correction</td>\n",
        "      <td><div> this,  <font color='red'>is gamma correction</font>  (or the lack of it).</div></td>\n",
-       "      <td>0.019406</td>\n",
+       "      <td>0.019417</td>\n",
        "      <td>13873</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>if your viewer does not do gamma correction</td>\n",
        "      <td><div> <font color='red'>if your viewer does not do gamma correction</font> , then left hand ramp will have a long dark part and a short white part, and the point of equal brightness will be above the center.</div></td>\n",
-       "      <td>0.013617</td>\n",
+       "      <td>0.013624</td>\n",
        "      <td>13873</td>\n",
        "    </tr>\n",
        "  </tbody>\n",

diff --git a/ktrain/text/ner/predictor.py b/ktrain/text/ner/predictor.py
@@ -1,6 +1,6 @@
 from ...imports import *
 from ...predictor import Predictor
-from .preprocessor import NERPreprocessor, tokenize
+from .preprocessor import NERPreprocessor
 from ... import utils as U
 from .. import textutils as TU
 

diff --git a/ktrain/text/ner/preprocessor.py b/ktrain/text/ner/preprocessor.py
@@ -16,8 +16,8 @@
 
 
 #tokenizer_filter = rs='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
-re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
-def tokenize(s): return re_tok.sub(r' \1 ', s).split()
+#re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
+#def tokenize(s): return re_tok.sub(r' \1 ', s).split()
 
 
 class NERPreprocessor(Preprocessor):
@@ -83,7 +83,7 @@ def preprocess(self, sentences, lang=None):
                 tokenize_chinese = lambda text:[c for c in text]
                 tokens = tokenize_chinese(s)
             else:
-                tokens = tokenize(s)
+                tokens = TU.tokenize(s)
             X.append(tokens)
             y.append([OTHER] * len(tokens))
         nerseq = NERSequence(X, y, p=self.p)

diff --git a/ktrain/text/preprocessor.py b/ktrain/text/preprocessor.py
@@ -514,8 +514,10 @@ def _transform_y(self, y_data, verbose=1):
                              'The classes argument should have been supplied.')
 
         # convert string labels to integers, if necessary
+        train = False
         if isinstance(y_data[0], str):
             if self.label_encoder is None:
+                train = True
                 self.label_encoder = LabelEncoder()
                 self.label_encoder.fit(y_data)
                 if self.get_classes(): warnings.warn('class_names argument was ignored, as they were extracted from string labels in dataset')
@@ -526,6 +528,11 @@ def _transform_y(self, y_data, verbose=1):
         # if shape is 1, this is either a classification or regression task 
         # depending on class_names existing
         y_data = to_categorical(y_data) if len(y_data.shape) == 1 and self.get_classes() else y_data
+        if self.get_classes():
+            x = y_data.shape[1]
+            y = len(self.get_classes())
+            if train and y_data.shape[1] != len(self.get_classes()):
+                raise Exception('Class labels in training set are %s, but y_data has %s classes' % (self.get_classes(), y_data.shape[1]))
         return y_data
 
 

diff --git a/ktrain/text/qa/core.py b/ktrain/text/qa/core.py
@@ -279,9 +279,13 @@ def ask(self, question, n_docs_considered=10, n_answers=50, rerank_threshold=0.0
           list
         """
         # locate candidate document contexts
-        doc_results = self.search(question, limit=n_docs_considered)
         paragraphs = []
         refs = []
+        #doc_results = self.search(question, limit=n_docs_considered)
+        doc_results = self.search(TU.tokenize(question, join_tokens=True), limit=n_docs_considered)
+        if not doc_results: 
+            warnings.warn('No documents matched words in question')
+            return []
         for doc_result in doc_results:
             rawtext = doc_result.get('rawtext', '')
             reference = doc_result.get('reference', '')
@@ -356,6 +360,7 @@ def answers2df(self, answers):
 
 
     def display_answers(self, answers):
+        if not answers: return
         df = self.answers2df(answers)
         from IPython.core.display import display, HTML
         display(HTML(df.to_html(render_links=True, escape=False)))

diff --git a/ktrain/text/textutils.py b/ktrain/text/textutils.py
@@ -301,6 +301,14 @@ def read_text(filename):
     return decoded_text.strip()
 
 
+#tokenizer_filter = rs='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
+re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
+def tokenize(s, join_tokens=False, join_char=' '): 
+    tokens = re_tok.sub(r' \1 ', s).split()
+    if join_tokens: tokens = join_char.join(tokens)
+    return tokens
+
+
 
 def sent_tokenize(text):
     """

diff --git a/ktrain/version.py b/ktrain/version.py
@@ -1,2 +1,2 @@
 __all__ = ['__version__']
-__version__ = '0.18.3'
+__version__ = '0.18.4'