Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
amaiya committed Jun 3, 2020
2 parents 7671f36 + 1aa5091 commit ed4cb3f
Show file tree
Hide file tree
Showing 9 changed files with 79 additions and 38 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour


## 0.15.4 (2020-06-03)

### New:
- N/A

### Changed:
- N/A

### Fixed:
- Added the `procs`, `limitmb`, and `multisegment` arguments to the `index_from_list` and `index_from_folder` methods in `text.SimpleQA`
to speed up indexing when necessary. Supplying `multisegment=True` speeds things up significantly, for example. Defaults, however, are
the same as before. Users must explicitly change values if desiring a speedup.
- Load `xlm-roberta*` as `jplu/tf-xlm-roberta*` to bypass error from `transformers`


## 0.15.3 (2020-05-28)

### New:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
### [Overview](#overview) | [Tutorials](#tutorials) | [Examples](#examples) | [Installation](#installation) | [How to Cite](#how-to-cite)
[![PyPI Status](https://badge.fury.io/py/ktrain.svg)](https://badge.fury.io/py/ktrain) [![ktrain python compatibility](https://img.shields.io/pypi/pyversions/ktrain.svg)](https://pypi.python.org/pypi/ktrain) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/amaiya/ktrain/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/ktrain)](https://pepy.tech/project/ktrain) [![Downloads](https://pepy.tech/badge/ktrain/month)](https://pepy.tech/project/ktrain/month)
[![PyPI Status](https://badge.fury.io/py/ktrain.svg)](https://badge.fury.io/py/ktrain) [![ktrain python compatibility](https://img.shields.io/pypi/pyversions/ktrain.svg)](https://pypi.python.org/pypi/ktrain) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/amaiya/ktrain/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/ktrain)](https://pepy.tech/project/ktrain)


# ktrain
Expand Down
52 changes: 27 additions & 25 deletions examples/text/IMDb-BERT.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,7 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"outputs": [],
"source": [
"import ktrain\n",
"from ktrain import text"
Expand All @@ -41,7 +33,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"preprocessing train...\n"
"detected encoding: utf-8\n",
"preprocessing train...\n",
"language: en\n"
]
},
{
Expand All @@ -60,7 +54,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"preprocessing test...\n"
"Is Multi-Label? False\n",
"preprocessing test...\n",
"language: en\n"
]
},
{
Expand All @@ -77,12 +73,12 @@
}
],
"source": [
"(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder('data/aclImdb', \n",
" maxlen=500, \n",
" preprocess_mode='bert',\n",
" train_test_names=['train', \n",
" 'test'],\n",
" classes=['pos', 'neg'])"
"trn, val, preproc = text.texts_from_folder('data/aclImdb', \n",
" maxlen=500, \n",
" preprocess_mode='bert',\n",
" train_test_names=['train', \n",
" 'test'],\n",
" classes=['pos', 'neg'])"
]
},
{
Expand All @@ -101,7 +97,7 @@
}
],
"source": [
"model = text.text_classifier('bert', (x_train, y_train) , preproc=preproc)"
"model = text.text_classifier('bert', trn , preproc=preproc)"
]
},
{
Expand All @@ -111,8 +107,8 @@
"outputs": [],
"source": [
"learner = ktrain.get_learner(model, \n",
" train_data=(x_train, y_train), \n",
" val_data=(x_test, y_test), \n",
" train_data=trn, \n",
" val_data=val, \n",
" batch_size=6)"
]
},
Expand Down Expand Up @@ -173,14 +169,13 @@
"\n",
"begin training using onecycle policy with max lr of 2e-05...\n",
"Train on 25000 samples, validate on 25000 samples\n",
"Epoch 1/1\n",
"25000/25000 [==============================] - 1966s 79ms/step - loss: 0.2575 - acc: 0.8886 - val_loss: 0.1649 - val_acc: 0.9384\n"
"25000/25000 [==============================] - 2304s 92ms/sample - loss: 0.2442 - accuracy: 0.9008 - val_loss: 0.1596 - val_accuracy: 0.9394\n"
]
},
{
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f1fa5bf9320>"
"<tensorflow.python.keras.callbacks.History at 0x7f6b102fe780>"
]
},
"execution_count": 8,
Expand All @@ -197,7 +192,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### **93.84%** accuracy in a single epoch."
"### **93.94%** accuracy in a single epoch."
]
},
{
Expand Down Expand Up @@ -258,6 +253,13 @@
"source": [
"predictor.predict(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -276,7 +278,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.9"
}
},
"nbformat": 4,
Expand Down
4 changes: 3 additions & 1 deletion examples/text/question_answering_with_bert.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@
"source": [
"For documents sets that are too large to be loaded into a Python list, you can use `SimpleQA.index_from_folder`, which will crawl a folder and index all plain text documents.\n",
"\n",
"By default, `index_from_list` and `index_from_folder` use a single processor (`procs=1`) with each processor using a maximum of 256MB of memory (`limitmb=256`) and merging results into a single segment (`multisegment=False`). These values can be changed to speed up indexing as arguments to `index_from_list` or `index_from_folder`. See the [whoosh documentation](https://whoosh.readthedocs.io/en/latest/batch.html) for more information on these parameters and how to use them to speed up indexing.\n",
"\n",
"The above steps need to only be performed once. Once an index is already created, you can skip this step and proceed directly to **STEP 2** to begin using your system."
]
},
Expand Down Expand Up @@ -580,7 +582,7 @@
"source": [
"Here, we see different views on who Jesus was as debated and discussed in this document set.\n",
"\n",
"Finally, the 20 Newsgroup dataset also contains many groups about computing hardward and software. Let's ask a technical support question.\n",
"Finally, the 20 Newsgroup dataset also contains many groups about computing hardware and software. Let's ask a technical support question.\n",
"\n",
"#### Technical Question"
]
Expand Down
3 changes: 2 additions & 1 deletion examples/vision/mnist-tf_workflow.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@
" model = make_model()\n",
"\n",
"\n",
"# set up learning rate decay\n",
"# set up learning rate decay [FROM ORIGINAL EXAMPLE BUT NOT USED]\n",
"# NOT NEEDED: we will use ktrain to find LR and decay learning rate during training\n",
"LEARNING_RATE = 0.01\n",
"LEARNING_RATE_EXP_DECAY = 0.6 if strategy.num_replicas_in_sync == 1 else 0.7\n",
"lr_decay = tf.keras.callbacks.LearningRateScheduler(\n",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
"""
Tests of ktrain text classification flows
Tests of ktrain shallownlp module:
2020-05-26: renamed to test_zzz_shallownlp.py because it
causes issues for tests following it when run in conjunction with test_regression.py.
"""
import testenv
from unittest import TestCase, main, skip
Expand Down
1 change: 1 addition & 0 deletions ktrain/text/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,7 @@ def __init__(self, model_name,
self.name = model_name.split('-')[0]
if model_name.startswith('xlm-roberta'):
self.name = 'xlm_roberta'
self.model_name = 'jplu/tf-' + self.model_name
else:
self.name = model_name.split('-')[0]
if self.name not in TRANSFORMER_MODELS:
Expand Down
34 changes: 26 additions & 8 deletions ktrain/text/qa/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,16 +145,23 @@ def initialize_index(cls, index_dir):
return ix

@classmethod
def index_from_list(cls, docs, index_dir, commit_every=1024):
def index_from_list(cls, docs, index_dir, commit_every=1024,
procs=1, limitmb=256, multisegment=False):
"""
index documents from list
index documents from list.
The procs, limitmb, and especially multisegment arguments can be used to
speed up indexing, if it is too slow. Please see the whoosh documentation
for more information on these parameters: https://whoosh.readthedocs.io/en/latest/batch.html
Args:
docs(list): list of strings representing documents
commit_every(int): commit after adding this many documents
procs(int): number of processors
limitmb(int): memory limit in MB for each process
multisegment(bool): new segments written instead of merging
"""
if not isinstance(docs, (np.ndarray, list)): raise ValueError('docs must be a list of strings')
ix = index.open_dir(index_dir)
writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
mb = master_bar(range(1))
for i in mb:
for idx, doc in enumerate(progress_bar(docs, parent=mb)):
Expand All @@ -164,23 +171,33 @@ def index_from_list(cls, docs, index_dir, commit_every=1024):
idx +=1
if idx % commit_every == 0:
writer.commit()
writer = ix.writer()
#writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
writer.commit()
return


@classmethod
def index_from_folder(cls, folder_path, index_dir, commit_every=1024, verbose=1, encoding='utf-8'):
def index_from_folder(cls, folder_path, index_dir, commit_every=1024, verbose=1, encoding='utf-8',
procs=1, limitmb=256, multisegment=False):
"""
index all plain text documents within a folder
index all plain text documents within a folder.
The procs, limitmb, and especially multisegment arguments can be used to
speed up indexing, if it is too slow. Please see the whoosh documentation
for more information on these parameters: https://whoosh.readthedocs.io/en/latest/batch.html
Args:
folder_path(str): path to folder containing plain text documents
commit_every(int): commit after adding this many documents
procs(int): number of processors
limitmb(int): memory limit in MB for each process
multisegment(bool): new segments written instead of merging
"""
if not os.path.isdir(folder_path): raise ValueError('folder_path is not a valid folder')
if folder_path[-1] != os.sep: folder_path += os.sep
ix = index.open_dir(index_dir)
writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
for idx, fpath in enumerate(TU.extract_filenames(folder_path)):
if not TU.is_txt(fpath): continue
reference = "%s" % (fpath.join(fpath.split(folder_path)[1:]))
Expand All @@ -191,7 +208,8 @@ def index_from_folder(cls, folder_path, index_dir, commit_every=1024, verbose=1
idx +=1
if idx % commit_every == 0:
writer.commit()
writer = ix.writer()
#writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
if verbose: print("%s docs indexed" % (idx))
writer.commit()
return
Expand Down
2 changes: 1 addition & 1 deletion ktrain/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.15.3'
__version__ = '0.15.4'

0 comments on commit ed4cb3f

Please sign in to comment.