Merge branch 'develop'

amaiya · Mar 18, 2020 · 3d33803 · 3d33803
2 parents 62a51f5 + b8f13f0
commit 3d33803
Show file tree

Hide file tree

Showing 31 changed files with 1,324 additions and 353 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,35 @@ Most recent releases are shown at the top. Each release shows:
 - **Fixed**: Bug fixes that don't change documented behaviour
 
 
+## 0.11.0 (2020-03-18)
+
+### New:
+- sequence-tagging (e.g., NER) now supports ELMo embeddings with `use_elmo=True` argument to data-loading
+  functions like `entities_from_array` and `entities_from_txt`
+- pretrained word embeddings (i.e., fasttext word2vec embeddings) can be specified by providing the URL to
+  a `.vec.gz` file from [here](https://fasttext.cc/docs/en/crawl-vectors.html). The URL (or path) is
+  supplied as `wv_path_or_url` argument to data-loading functions like `entities_from_array` and `entities_from_txt`
+- `show_random_images`: show random images from folder in Jupyter notebook
+- `NERPreprocessor` now includes a `preprocess_test` method for easier evaluation of test sets in datasets
+   that contain a training, validation, and test set
+
+### Changed:
+- ensure `DISABLE_V2_BEHAVIOR=True` when `ImagePredictor.explain` is invoked
+- added `SUPPRESS_TF_WARNINGS` environment variable.  Default is '1'. If set to '0', TF warnings will be displayed.
+- `merge_entities` method of `ktrain.text.shallownlp.ner.NER` changed to `merge_tokens` 
+- moved `load_predictor` to constructor in `ktrain.text.shallownlp.ner.NER`
+- `ktrain.text.shallownlp.ner.NER` now supports `predictor_path` argument
+
+### Fixed:
+- convert `class_names` to strings in `core.validate` to prevent error from scikit-learn
+- fixed error arising when no data augmentation scheme is provided to the `images_from*` functions
+- fixed bug in `images_from_fname` to ensure supplied `pattern` is used
+- added `val_folder` argument to `images_from_fname`
+- raise Exception when `preproc` is not found in `load_predictor`
+- check for existence of `preproc` in `text_classifier` and `text_regression_model`
+- fixed `text.eda` so that `detect_lang` is called correctly after being moved to `textutils`
+
+
 ## 0.10.1 (2020-03-04)
 
 ### New:

diff --git a/README.md b/README.md
@@ -7,6 +7,8 @@
 
 
 ### News and Announcements
+- **2020-03-18:**  
+  - ***ktrain*** **v0.11.x is released** and includes various fixes and enhancements to sequence-tagging, including the ability to easily use non-English pretrained word embeddings covering 157 languages
 - **2020-03-03:**  
   - ***ktrain*** **v0.10.x is released** and now includes [ready-to-use NER for English, Chinese, and Russian](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/develop/examples/text/shallownlp-examples.ipynb) with no training required. 
   - **Also in v0.10.x:**  Ability to train [community-uploaded Hugging Face transformer models](https://huggingface.co/models) like [SciBERT](https://arxiv.org/abs/1903.10676) and  [BioBERT](https://arxiv.org/abs/1901.08746):
@@ -283,7 +285,7 @@ The following software/libraries should be installed:
 -->
 
 
-This code was tested on Ubuntu 18.04 LTS using TensorFlow 2.0 (Keras version 2.2.4-tf).
+This code was tested on Ubuntu 18.04 LTS using TensorFlow 2.1.0.
 
 ----
 **Creator:  [Arun S. Maiya](http://arun.maiya.net)**

diff --git a/examples/README.md b/examples/README.md
@@ -83,6 +83,11 @@ The objective of the CoNLL2003 task is to classify sequences of words as belongi
 - [CoNLL2003-BiLSTM_CRF.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/text):  A simple and fast Bidirectional LSTM-CRF model with randomly initialized word embeddings.
 
 
+#### [CoNLL2002 NER Task (Dutch)](https://www.clips.uantwerpen.be/conll2002/ner/):  Named Entity Recognition for Dutch
+
+- [CoNLL2002_Dutch-BiLSTM.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/text):  A simple and fast Bidirectional LSTM model with pretrained fasttext word embeddings for Dutch.
+
+
 ### <a name="lda"></a> Topic Modeling
 
 #### [20 News Groups](http://qwone.com/~jason/20Newsgroups/): unsupervised learning on 20newsgroups corpus

diff --git a/examples/text/CoNLL2002_Dutch-BiLSTM.ipynb b/examples/text/CoNLL2002_Dutch-BiLSTM.ipynb
diff --git a/examples/vision/dogs_vs_cats-ResNet50.ipynb b/examples/vision/dogs_vs_cats-ResNet50.ipynb
@@ -235,7 +235,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.6.9"
   }
  },
  "nbformat": 4,

diff --git a/ktrain/core.py b/ktrain/core.py
@@ -122,6 +122,10 @@ def validate(self, val_data=None, print_report=True, class_names=[]):
         y_true = np.argmax(y_true, axis=1)
         if print_report:
             if class_names:
+                try:
+                    class_names = [str(s) for s in class_names]
+                except:
+                    pass
                 report = classification_report(y_true, y_pred, target_names=class_names)
             else:
                 report = classification_report(y_true, y_pred)
@@ -1172,6 +1176,8 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
                     fit_fn = self.model.fit_generator
                 else:
                     fit_fn = self.model.fit
+                # fixed in 2.1.0
+                #fit_fn = self.model.fit
             hist = fit_fn(self._prepare(self.train_data),
                                         steps_per_epoch = steps_per_epoch,
                                         validation_steps = validation_steps,
@@ -1308,15 +1314,9 @@ def load_predictor(fname):
     """
 
     # load the preprocessor
-    try:
-        preproc = None
-        with open(fname +'.preproc', 'rb') as f:
-            preproc = pickle.load(f)
-    except FileNotFoundError:
-        print('load_predictor failed.\n'+\
-              'Could not find the saved preprocessor (%s) for this model.' % (fname+'.preproc') +\
-               ' Are you sure predictor.save method was called?')
-        return
+    preproc = None
+    with open(fname +'.preproc', 'rb') as f:
+        preproc = pickle.load(f)
 
     # load the model
     model = _load_model(fname, preproc=preproc)

diff --git a/ktrain/imports.py b/ktrain/imports.py
@@ -9,13 +9,15 @@
 from distutils.util import strtobool
 from packaging import version
 
-# suppress warnings
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-logging.getLogger('tensorflow').setLevel(logging.ERROR)
-warnings.simplefilter(action='ignore', category=FutureWarning)
-# elevate warnings to errors for debugging dependencies
-#warnings.simplefilter('error', FutureWarning)
 
+# suppress warnings
+SUPPRESS_TF_WARNINGS = strtobool(os.environ.get('SUPPRESS_TF_WARNINGS', '1'))
+if SUPPRESS_TF_WARNINGS:
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+    logging.getLogger('tensorflow').setLevel(logging.ERROR)
+    warnings.simplefilter(action='ignore', category=FutureWarning)
+    # elevate warnings to errors for debugging dependencies
+    #warnings.simplefilter('error', FutureWarning)
 
 
 
@@ -42,6 +44,8 @@
 
 # suppress autograph warnings
 tf.autograph.set_verbosity(1)
+#if SUPPRESS_TF_WARNINGS:
+    #tf.autograph.set_verbosity(1)
 
 if version.parse(tf.__version__) < version.parse('2.0'):
     raise Exception('As of v0.8.x, ktrain needs TensorFlow 2. Please upgrade TensorFlow.')
@@ -53,7 +57,7 @@
 
 
 # output Keras version
-print("using Keras version: %s" % (keras.__version__))
+#print("using Keras version: %s" % (keras.__version__))
 
 K = keras.backend
 Layer = keras.layers.Layer
@@ -146,6 +150,8 @@
 import codecs
 import urllib.request
 import zipfile
+import gzip
+import shutil
 import string
 import random
 import json
@@ -228,5 +234,8 @@
             'Please install with: '+\
             'pip3 install git+https://github.com/amaiya/stellargraph@no_tf_dep_082'
 
+ALLENNLP_ERRMSG  = 'To use ELMo embeddings, please install allennlp:\n' +\
+                   'pip3 install allennlp'
+
 
 
diff --git a/ktrain/tests/image_data/train-vision.csv b/ktrain/tests/image_data/train-vision.csv
@@ -1,6 +1,6 @@
 filename,cat,dog
 cat.11737.jpg,1,0
-cat.11737.jpg,1,0
+cat.2266.jpg,1,0
 cat.2921.jpg,1,0
 cat.3570.jpg,1,0
 cat.394.jpg,1,0

diff --git a/ktrain/tests/image_data/valid-vision.csv b/ktrain/tests/image_data/valid-vision.csv
@@ -3,7 +3,7 @@ cat.5202.jpg,0,1
 cat.4785.jpg,0,1
 cat.4319.jpg,0,1
 cat.10435.jpg,0,1
-dog.4090.jpg,1,0
-dog.11314.jpg,1,0
 dog.10459.jpg,1,0
+dog.11314.jpg,1,0
+dog.4090.jpg,1,0
 dog.5697.jpg,1,0
diff --git a/ktrain/tests/test_chinese_text.py b/ktrain/tests/test_chinese_text.py
@@ -48,7 +48,7 @@ def test_fasttext_chinese(self):
         learner.load_model('/tmp/test_model')
 
         # test validate
-        cm = learner.validate()
+        cm = learner.validate(class_names=preproc.get_classes())
         print(cm)
         for i, row in enumerate(cm):
             self.assertEqual(np.argmax(row), i)

diff --git a/ktrain/tests/test_imageclassification.py b/ktrain/tests/test_imageclassification.py
@@ -33,6 +33,7 @@
 
 
 class TestImageClassification(TestCase):
+    #@skip('temporarily disabled')
     def test_folder(self):
         (trn, val, preproc) = vis.images_from_folder(
                                                       datadir='image_data/image_folder',
@@ -92,6 +93,73 @@ def test_folder(self):
         self.assertEqual(r, ['cat'])
 
 
+    @skip('temporarily disabled')
+    def test_csv(self):
+        train_fpath = './image_data/train-vision.csv'
+        val_fpath = './image_data/valid-vision.csv'
+        trn, val, preproc = vis.images_from_csv(
+                              train_fpath,
+                              'filename',
+                              directory='./image_data/image_folder/all',
+                              val_filepath = val_fpath,
+                              label_columns = ['cat', 'dog'], 
+                              data_aug=vis.get_data_aug(horizontal_flip=True))
+
+        lr = 1e-4
+        model = vis.image_classifier('pretrained_resnet50', trn, val)
+        learner = ktrain.get_learner(model=model, train_data=trn, val_data=val, batch_size=4)
+        learner.freeze()
+        hist = learner.fit_onecycle(lr, 3)
+
+        # test train
+        self.assertAlmostEqual(max(hist.history['lr']), lr)
+        if max(hist.history[ACC_NAME]) == 0.5:
+            raise Exception('unlucky initialization: please run test again')
+        self.assertGreater(max(hist.history[ACC_NAME]), 0.8)
+
+        # test top_losses
+        obs = learner.top_losses(n=1, val_data=val)
+        print(obs)
+        if obs:
+            self.assertIn(obs[0][0], list(range(U.nsamples_from_data(val))))
+        else:
+            self.assertEqual(max(hist.history[VAL_ACC_NAME]), 1)
+
+        # test weight decay
+        self.assertEqual(len(learner.get_weight_decay()), 54)
+        self.assertEqual(learner.get_weight_decay()[0], None)
+        learner.set_weight_decay(1e-4)
+        self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)
+
+        # test load and save model
+        learner.save_model('/tmp/test_model')
+        learner.load_model('/tmp/test_model')
+
+        # test validate
+        cm = learner.validate(val_data=val)
+        print(cm)
+        for i, row in enumerate(cm):
+            self.assertEqual(np.argmax(row), i)
+
+        # test predictor
+        p = ktrain.get_predictor(learner.model, preproc)
+        r = p.predict_folder('image_data/image_folder/train/')
+        print(r)
+        self.assertEqual(r[0][1], 'cat')
+        r = p.predict_proba_folder('image_data/image_folder/train/')
+        self.assertEqual(np.argmax(r[0][1]), 0)
+        r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
+        self.assertEqual(r, ['cat'])
+        r = p.predict_proba_filename('image_data/image_folder/train/cat/cat.11737.jpg')
+        self.assertEqual(np.argmax(r), 0)
+
+        p.save('/tmp/test_predictor')
+        p = ktrain.load_predictor('/tmp/test_predictor')
+        r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
+        self.assertEqual(r, ['cat'])
+
+
+
 if __name__ == "__main__":
     main()
 
diff --git a/ktrain/text/__init__.py b/ktrain/text/__init__.py
@@ -4,7 +4,7 @@
 from .ner.models import sequence_tagger, print_sequence_taggers
 from .eda import get_topic_model
 from .textutils import extract_filenames, load_text_files, filter_by_id
-from .preprocessor import Transformer
+from .preprocessor import Transformer, TransformerEmbedding
 from . import shallownlp
 
 __all__ = [
@@ -22,6 +22,7 @@
            'extract_filenames', 
            'load_text_files',
            'Transformer',
+           'TransformerEmbedding',
            'shallownlp'
            ]
 

diff --git a/ktrain/text/eda.py b/ktrain/text/eda.py
@@ -121,7 +121,7 @@ def train(self,texts, model_type='lda', n_topics=None, n_features=10000,
 
         # adjust defaults based on language detected
         if texts is not None:
-            lang = pp.detect_lang(texts)
+            lang = TU.detect_lang(texts)
             if lang != 'en':
                 stopwords = None if stop_words=='english' else stop_words
                 token_pattern = r'(?u)\b\w+\b' if token_pattern is None else token_pattern