Commit c12fd8c: Merge branch 'develop'
amaiya committed May 10, 2020 · 2 parents 075b691 + cf4e2d6
Showing 24 changed files with 1,015 additions and 142 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,21 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour


## 0.14.7 (2020-05-10)

### New:
- Added `TFDataset` class as a wrapper around arbitrary `tf.data.Dataset` objects for use in *ktrain*

### Changed:
- Added `NERPreprocessor.preprocess_train_from_conll2003`
- Removed extraneous imports from `text.__init__.py` and `vision.__init__.py`
- `classes` argument in `images_from_array` changed to `class_names`

### Fixed:
- Ensured NER data is properly prepared in `text.ner.learner.validate`
- Fixed typo in the `df` reference in `images_from_fname`
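
A hedged sketch of how the new `TFDataset` wrapper might be used (toy data and model; names and shapes are illustrative):
```python
# Sketch: wrapping a pre-batched tf.data.Dataset for ktrain (toy data, illustrative only)
import numpy as np
import tensorflow as tf
import ktrain

x_train = np.random.rand(1000, 28, 28).astype('float32')                     # stand-in for MNIST images
y_train = tf.keras.utils.to_categorical(np.random.randint(0, 10, 1000), 10)  # one-hot labels

# the wrapper expects a fully-configured dataset, so batch it yourself
tfds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(1024).batch(32)
train_data = ktrain.TFDataset(tfds, n=len(x_train), y=y_train)

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation='softmax')])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

learner = ktrain.get_learner(model, train_data=train_data)
learner.fit_onecycle(1e-3, 1)
```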


## 0.14.6 (2020-05-06)

### New:
13 changes: 6 additions & 7 deletions README.md
@@ -287,13 +287,12 @@ This code was tested on Ubuntu 18.04 LTS using TensorFlow 2.1.0

Please cite the [following paper](https://arxiv.org/abs/2004.10703) when using **ktrain**:
```
@misc{maiya2020ktrain,
title={ktrain: A Low-Code Library for Augmented Machine Learning},
author={Arun S. Maiya},
year={2020},
eprint={2004.10703},
archivePrefix={arXiv},
primaryClass={cs.LG}
@article{maiya2020ktrain,
title={ktrain: A Low-Code Library for Augmented Machine Learning},
author={Arun S. Maiya},
journal={ArXiv},
year={2020},
volume={arXiv:2004.10703 [cs.LG]}
}
```

3 changes: 3 additions & 0 deletions examples/README.md
@@ -142,6 +142,9 @@ The objective of the CoNLL2003 task is to classify sequences of words as belonging
#### [MNIST](http://yann.lecun.com/exdb/mnist/): Multiclass Classification
- [mnist-image_from_array_example.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/vision): Build an MNIST model using `images_from_array`

#### [MNIST](http://yann.lecun.com/exdb/mnist/): Multiclass Classification
- [mnist-tf_workflow.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/vision): Illustrates how *ktrain* can be used in a minimally-invasive way within a normal TensorFlow workflow

#### [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html): Multiclass Classification
- [cifar10-WRN22.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/vision): A randomly-initialized Wide Residual Network applied to CIFAR10

2 changes: 1 addition & 1 deletion examples/vision/mnist-images_from_array_example.ipynb
@@ -77,7 +77,7 @@
" val_pct=0.1,\n",
" random_state=42,\n",
" data_aug=data_aug,\n",
" classes=classes)"
" class_names=classes)"
]
},
{
734 changes: 734 additions & 0 deletions examples/vision/mnist-tf_workflow.ipynb

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions examples/vision/pets-ResNet50.ipynb
@@ -337,7 +337,7 @@
],
"source": [
"# find a good learning rate\n",
"learner.lr_find()"
"learner.lr_find(max_epochs=5)"
]
},
{
@@ -362,6 +362,13 @@
"learner.lr_plot()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For demonstration purposes, we use `autofit` to train, which employs a triangular learning rate policy with `epochs=20` and `reduce_on_plateau=2`. You may choose to try something different."
]
},
{
"cell_type": "code",
"execution_count": 12,
@@ -681,7 +688,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.9"
}
},
"nbformat": 4,
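For reference, the training sequence these updated cells describe might look like the following sketch (assumes `learner` was built with `get_learner`; the learning rate is illustrative):
```python
learner.lr_find(max_epochs=5)                    # cap the LR-range test at 5 epochs
learner.lr_plot()                                # inspect the loss-vs-LR curve
learner.autofit(2e-4, 20, reduce_on_plateau=2)   # triangular LR policy, as the markdown cell describes
```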
4 changes: 2 additions & 2 deletions ktrain/__init__.py
@@ -5,12 +5,12 @@
from .text.learner import BERTTextClassLearner, TransformerTextClassLearner
from .text.ner.learner import NERLearner
from .graph.learner import NodeClassLearner, LinkPredLearner
from .data import Dataset
from .data import Dataset, TFDataset, SequenceDataset

from . import utils as U

__all__ = ['get_learner', 'get_predictor', 'load_predictor', 'release_gpu_memory',
'Dataset']
'Dataset', 'TFDataset', 'SequenceDataset']



24 changes: 9 additions & 15 deletions ktrain/core.py
@@ -233,11 +233,11 @@ def save_model(self, fpath):
return


def load_model(self, fpath):
def load_model(self, fpath, custom_objects=None):
"""
a wrapper to load_model
"""
self.model = _load_model(fpath, train_data=self.train_data)
self.model = _load_model(fpath, train_data=self.train_data, custom_objects=custom_objects)
return

def _is_adamlike(self):
@@ -640,23 +640,18 @@ def _cb_earlystopping(self, early_stopping, callbacks=[]):
return callbacks


def _prepare(self, data, mode='train'):
def _prepare(self, data, train=True):
"""
Subclasses can override this method if data
needs to be specially-prepared prior to invoking fit methods
Args:
data: dataset
mode: either 'train' or 'valid'
train(bool): If True, prepare for training. Otherwise, prepare for evaluation.
"""
if data is None: return None

if hasattr(data, 'to_tfdataset'):
shuffle=True
repeat = True
if mode != 'train':
shuffle = False
repeat = False
return data.to_tfdataset(shuffle=shuffle, repeat=repeat)
return data.to_tfdataset(train=train)
else:
return data

@@ -894,7 +889,7 @@ def predict(self, val_data=None):
if U.is_iter(val):
if hasattr(val, 'reset'): val.reset()
steps = np.ceil(U.nsamples_from_data(val)/val.batch_size)
result = self.model.predict_generator(self._prepare(val, mode='valid'),
result = self.model.predict_generator(self._prepare(val, train=False),
steps=steps)
return result
else:
@@ -987,7 +982,7 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
with warnings.catch_warnings():
warnings.filterwarnings('ignore', message='.*Check your callbacks.*')
hist = self.model.fit(self._prepare(x_train),
self._prepare(y_train, mode='valid'),
self._prepare(y_train, train=False),
batch_size=self.batch_size,
epochs=epochs,
validation_data=validation, verbose=verbose,
@@ -1189,7 +1184,7 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
steps_per_epoch = steps_per_epoch,
validation_steps = validation_steps,
epochs=epochs,
validation_data=self._prepare(self.val_data, mode='valid'),
validation_data=self._prepare(self.val_data, train=False),
workers=self.workers,
use_multiprocessing=self.use_multiprocessing,
verbose=verbose,
@@ -1390,10 +1385,9 @@ def release_gpu_memory(device=0):
return


def _load_model(fname, preproc=None, train_data=None):
def _load_model(fname, preproc=None, train_data=None, custom_objects=None):
if not preproc and not train_data:
raise ValueError('Either preproc or train_data is required.')
custom_objects=None
if preproc and isinstance(preproc, TransformersPreprocessor):
# note: with transformer models, fname is actually a directory
model = preproc.get_model(fpath=fname)
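With the new `custom_objects` pass-through, a model containing a custom Keras layer can now be reloaded through the learner; a minimal sketch (the `MyAttention` layer name is hypothetical):
```python
# Sketch: reloading a saved model that contains a hypothetical custom layer
learner.save_model('/tmp/mymodel')
learner.load_model('/tmp/mymodel', custom_objects={'MyAttention': MyAttention})
```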
86 changes: 73 additions & 13 deletions ktrain/data.py
@@ -1,7 +1,7 @@
from .imports import *


class Dataset(Sequence):
class Dataset:
"""
Base class for custom datasets in ktrain.
@@ -14,20 +14,10 @@ class Dataset(Sequence):
The signature of to_tfdataset is as follows:
def to_tfdataset(self, shuffle=True, repeat=True)
def to_tfdataset(self, train=True)
See ktrain.text.preprocess.TransformerDataset as an example.
"""
def __init__(self, batch_size=32):
self.batch_size = batch_size

# required by keras.utils.Sequence instances
def __len__(self):
raise NotImplementedError

# required by keras.utils.Sequence instances
def __getitem__(self, idx):
raise NotImplementedError

# required: used by ktrain.core.Learner instances
def nsamples(self):
@@ -68,8 +58,78 @@ def nclasses(self):
raise NotImplementedError


class TFDataset(Dataset):
"""
Wrapper for tf.data.Datasets
"""
def __init__(self, tfdataset, n, y):
"""
Args:
tfdataset(tf.data.Dataset): a tf.data.Dataset instance
n(int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
y(np.ndarray): y values for each example - should be in the format expected by your model (e.g., one-hot encoded)
"""
if not isinstance(tfdataset, tf.data.Dataset):
raise ValueError('tfdataset must be a fully-configured tf.data.Dataset with batch_size, etc. set appropriately')
self.tfdataset = tfdataset
self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0] # extract batch_size from tfdataset
self.n = n
self.y = y

@property
def batch_size(self):
return self.bs

@batch_size.setter
def batch_size(self, value):
if value != self.bs:
warnings.warn('batch_size parameter is ignored, as pre-configured batch_size of tf.data.Dataset is used')


def nsamples(self):
return self.n

def get_y(self):
return self.y

def to_tfdataset(self, train=True):
return self.tfdataset


class SequenceDataset(Dataset, Sequence):
"""
Base class for custom datasets in ktrain.
If a subclass of Dataset implements a to_tfdataset method
that converts the data to a tf.data.Dataset, then it will be
invoked by Learner instances just prior to training so
fit() will train using a tf.Dataset representation of your data.
Sequence methods such as __getitem__ and __len__
must still be implemented.
The signature of to_tfdataset is as follows:
def to_tfdataset(self, train=True)
See ktrain.text.preprocess.TransformerDataset as an example.
"""
def __init__(self, batch_size=32):
self.batch_size = batch_size

# required by keras.utils.Sequence instances
def __len__(self):
raise NotImplementedError

# required by keras.utils.Sequence instances
def __getitem__(self, idx):
raise NotImplementedError

return False




class MultiArrayDataset(Dataset):
class MultiArrayDataset(SequenceDataset):
def __init__(self, x, y, batch_size=32, shuffle=True):
# error checks
err = False
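A minimal sketch of a custom `SequenceDataset` subclass, with the method set inferred from the base classes above (`MyDataset` and the one-hot `y` assumption are illustrative):
```python
import math
from ktrain.data import SequenceDataset

class MyDataset(SequenceDataset):
    def __init__(self, x, y, batch_size=32):
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y

    def __len__(self):            # number of batches per epoch (keras.utils.Sequence)
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):   # return one batch of (x, y)
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[sl], self.y[sl]

    def nsamples(self):           # used by ktrain.core.Learner instances
        return len(self.x)

    def get_y(self):
        return self.y

    def nclasses(self):           # assumes one-hot-encoded labels
        return self.y.shape[1]
```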
6 changes: 3 additions & 3 deletions ktrain/graph/sg_wrappers.py
@@ -1,5 +1,5 @@
from ..imports import *
from ..data import Dataset
from ..data import SequenceDataset


# import stellargraph
@@ -12,7 +12,7 @@
raise Exception(SG_ERRMSG)


class NodeSequenceWrapper(node_mappers.NodeSequence, Dataset):
class NodeSequenceWrapper(node_mappers.NodeSequence, SequenceDataset):
def __init__(self, node_seq):
if not isinstance(node_seq, node_mappers.NodeSequence):
raise ValueError('node_seq must be a stellargraph NodeSequence object')
@@ -88,7 +88,7 @@ def nclasses(self):



class LinkSequenceWrapper(link_mappers.LinkSequence, Dataset):
class LinkSequenceWrapper(link_mappers.LinkSequence, SequenceDataset):
def __init__(self, link_seq):
if not isinstance(link_seq, link_mappers.LinkSequence):
raise ValueError('link_seq must by a stellargraph LinkSequence object')
1 change: 1 addition & 0 deletions ktrain/imports.py
@@ -216,6 +216,7 @@
# ner
from seqeval.metrics import classification_report as ner_classification_report
from seqeval.metrics import f1_score as ner_f1_score
from seqeval.metrics import accuracy_score as ner_accuracy_score
from seqeval.metrics.sequence_labeling import get_entities
import syntok.segmenter as segmenter

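The newly imported seqeval `accuracy_score` operates on lists of IOB tag sequences, e.g. (toy tags):
```python
from seqeval.metrics import accuracy_score

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'O',     'O'], ['B-LOC', 'O']]
print(accuracy_score(y_true, y_pred))  # token-level accuracy: 4/5 = 0.8
```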
6 changes: 3 additions & 3 deletions ktrain/text/__init__.py
@@ -1,5 +1,5 @@
from .models import *
from .data import *
from .models import print_text_classifiers, print_text_regression_models, text_classifier, text_regression_model
from .data import texts_from_folder, texts_from_csv, texts_from_df, texts_from_array
from .ner.data import entities_from_gmb, entities_from_conll2003, entities_from_txt, entities_from_df, entities_from_array
from .ner.models import sequence_tagger, print_sequence_taggers
from .eda import get_topic_model
@@ -13,7 +13,7 @@
__all__ = [
'text_classifier', 'text_regression_model',
'print_text_classifiers', 'print_text_regression_models',
'texts_from_folder', 'texts_from_csv',
'texts_from_folder', 'texts_from_csv', 'texts_from_df', 'texts_from_array',
'entities_from_gmb',
'entities_from_conll2003',
'entities_from_txt',
11 changes: 3 additions & 8 deletions ktrain/text/learner.py
@@ -129,20 +129,15 @@ def view_top_losses(self, n=4, preproc=None, val_data=None):
return


def _prepare(self, data, mode='train'):
def _prepare(self, data, train=True):
"""
prepare data as tf.Dataset
"""
# HF_EXCEPTION
# convert arrays to TF dataset (iterator) on-the-fly
# to work around issues with transformers and tf.Datasets
if data is None: return None
shuffle=True
repeat = True
if mode != 'train':
shuffle = False
repeat = False
return data.to_tfdataset(shuffle=shuffle, repeat=repeat)
return data.to_tfdataset(train=train)


def predict(self, val_data=None):
@@ -156,7 +151,7 @@ def predict(self, val_data=None):
if val is None: raise Exception('val_data must be supplied to get_learner or predict')
if hasattr(val, 'reset'): val.reset()
classification, multilabel = U.is_classifier(self.model)
preds = self.model.predict(self._prepare(val, mode='valid'))
preds = self.model.predict(self._prepare(val, train=False))
if classification:
if multilabel:
return activations.sigmoid(tf.convert_to_tensor(preds)).numpy()
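Under the new API, a dataset's `to_tfdataset` hook folds the old shuffle/repeat switches into a single `train` flag. Based on the removed logic above, an implementation might look like this sketch (assumes the dataset stores numpy arrays in `self.x`/`self.y`):
```python
# Sketch only: the train flag subsumes the old shuffle/repeat arguments
def to_tfdataset(self, train=True):
    tfds = tf.data.Dataset.from_tensor_slices((self.x, self.y))
    if train:
        tfds = tfds.shuffle(1024).repeat()  # shuffle and repeat only when training
    return tfds.batch(self.batch_size)
```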
