
Merge branch 'develop'
amaiya committed Dec 23, 2020
2 parents 8a7a0be + 863fdea commit d50949c
Showing 5 changed files with 62 additions and 14 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
@@ -6,16 +6,18 @@ Most recent releases are shown at the top. Each release shows:
- **Changed**: Additional parameters, changes to inputs or outputs, etc
- **Fixed**: Bug fixes that don't change documented behaviour

## 0.25.3 (TBD)
## 0.25.3 (2020-12-23)

### New:
- N/A

### Changed:
- N/A
- A `steps_per_epoch` argument has been added to all `*fit*` methods that operate on generators
- Added `get_tokenizer` methods to all instances of `TextPreprocessor`

### Fixed:
- propagate custom metrics to model when `distilbert` is chosen in `text_classifier` and `text_regression_model` functions
- pin `scikit-learn` to 0.23.2 due to a breaking change in 0.24.x


## 0.25.2 (2020-12-05)
27 changes: 26 additions & 1 deletion FAQ.md
@@ -48,6 +48,9 @@

- [Why am I seeing a "list index out of range" error when calling predict?](#why-am-i-seeing-a-list-index-out-of-range-error-when-calling-predict)

- [How do I train a transformers model from a saved checkpoint folder?](#how-do-i-train-a-transformers-model-from-a-saved-checkpoint-folder)




## Evaluation, Inspection, and Prediction
@@ -469,6 +472,10 @@ http://0.0.0.0:8888/predict?text=text%20you%20want%20to%20classify

In this toy example, we are supplying the text data to classify in the URL as a GET request.
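The same request can also be issued programmatically; here is a small sketch using the `requests` library (the host, port, and the assumption that the service returns JSON are taken from the toy example above and are placeholders):

```python
import requests

# Query the toy prediction service shown above via a GET request.
resp = requests.get('http://0.0.0.0:8888/predict',
                    params={'text': 'text you want to classify'})
print(resp.json())  # assumes the toy service returns its prediction as JSON
```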

Note that the above example requires both **ktrain** and TensorFlow to be installed on the deployment machine. If this footprint is too large,
you can [convert the model to ONNX](#how-do-i-make-quantized-predictions-with-transformers-models). This allows you to deploy the model
and make predictions **without** having **TensorFlow**, **ktrain**, and their many dependencies installed. This is particularly well-suited to Heroku deployments, which restrict slug sizes to 500MB.
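As a rough sketch of what such a slimmed-down deployment can look like, the exported model can be loaded with only `onnxruntime` and `transformers` installed (the file and folder names below are placeholders, and the tokenizer is assumed to have been saved next to the ONNX model):

```python
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Load the quantized ONNX model and the tokenizer saved alongside it (placeholder paths).
sess = ort.InferenceSession('model-quantized.onnx')
tokenizer = AutoTokenizer.from_pretrained('tokenizer/')

# Tokenize the input and keep only the tensors the ONNX graph actually expects.
tokens = tokenizer('My computer monitor is blurry.', truncation=True, max_length=512)
onnx_input_names = {i.name for i in sess.get_inputs()}
inputs = {k: np.expand_dims(np.array(v, dtype=np.int64), axis=0)
          for k, v in tokens.items() if k in onnx_input_names}

logits = sess.run(None, inputs)[0]
print(int(np.argmax(logits, axis=1)[0]))  # predicted class index
```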



[[Back to Top](#frequently-asked-questions-about-ktrain)]
@@ -844,7 +851,7 @@ convert(framework='pt', model=pt_path,output=Path(pt_onnx_path), opset=11,
pt_onnx_quantized_path = quantize(optimize(Path(pt_onnx_path)))

# create ONNX session (or create session manually if wanting to avoid ktrain/TensorFlow dependencies)
sess = p.create_onnx_session(pt_onnx_quant_name.as_posix())
sess = p.create_onnx_session(pt_onnx_quantized_path.as_posix())
# create tokenizer (or create tokenizer manually if wanting to avoid ktrain/TensorFlow dependencies)
tokenizer = p.preproc.get_tokenizer()
tokens = tokenizer.encode_plus('My computer monitor is blurry.', max_length=p.preproc.maxlen, truncation=True)
@@ -860,6 +867,24 @@ The example above assumes the model saved at `predictor_path` was trained on a s
You can also use **ktrain** to [create ONNX models directly from TensorFlow](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/ktrain-ONNX-TFLite-examples.ipynb) with optional quantization. Note, that conversions to ONNX from TensorFlow models appear to [require a hard-coded input size](https://github.com/huggingface/transformers/issues/8227) (i.e., padding is used), whereas conversions to ONNX from PyTorch models do not appear to have this requirement.
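A hedged sketch of that TensorFlow route, assuming a previously saved predictor and the `export_model_to_onnx` helper used in the linked notebook (its exact keyword arguments may differ across versions; paths are placeholders):

```python
import ktrain

# Load a previously saved predictor and export it directly to ONNX (placeholder paths).
predictor = ktrain.load_predictor('/tmp/my_predictor')
predictor.export_model_to_onnx('/tmp/model.onnx')  # the linked notebook also demonstrates quantization options
```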


[[Back to Top](#frequently-asked-questions-about-ktrain)]


### How do I train a transformers model from a saved checkpoint folder?

In the **ktrain** `Transformer` API, you can train/fine-tune a text classification model from a local path:
```python
t = text.Transformer(MODEL_LOCAL_PATH, maxlen=50, class_names=class_names)
```

This is useful, for example, if you first [fine-tune a language model](https://github.com/huggingface/transformers/tree/master/examples/language-modeling) using Hugging-Face **Trainer** **prior** to fine-tuning your text classifier.

However, when supplying a local path to `Transformer`, **ktrain** will also look for the tokenizer files in that directory. So, you just need to ensure that tokenizer files such as the `vocab` file (which are quite small) exist in the local folder (in addition to the folder created by `predictor.save`).
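A minimal end-to-end sketch of this workflow (`MODEL_LOCAL_PATH`, the class names, and the training/validation arrays are placeholders):

```python
import ktrain
from ktrain import text

# MODEL_LOCAL_PATH is a checkpoint folder that also contains the tokenizer files (e.g., the vocab file).
t = text.Transformer(MODEL_LOCAL_PATH, maxlen=50, class_names=class_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.fit_onecycle(5e-5, 2)
```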

See [this post](https://github.com/amaiya/ktrain/issues/295#issuecomment-744509996) for more details.



[[Back to Top](#frequently-asked-questions-about-ktrain)]


32 changes: 22 additions & 10 deletions ktrain/core.py
@@ -793,7 +793,7 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1, batch_size=U.DEFAULT_B

def fit_onecycle(self, lr, epochs, checkpoint_folder=None,
cycle_momentum=True, max_momentum=0.95, min_momentum=0.85,
verbose=1, class_weight=None, callbacks=[]):
class_weight=None, callbacks=[], steps_per_epoch=None, verbose=1):
"""
Train model using a version of Leslie Smith's 1cycle policy.
This method can be used with any optimizer. Thus,
@@ -817,6 +817,8 @@ def fit_onecycle(self, lr, epochs, checkpoint_folder=None,
min_momentum(float): minimum momentum to use if cycle_momentum=True
class_weight (dict): Optional dictionary mapping class indices (integers) to a weight (float)
callbacks (list): list of Callback instances to employ during training
steps_per_epoch(int): Steps per epoch. If None, then math.ceil(num_samples/batch_size) is used.
Ignored unless training dataset is a generator.
verbose (bool): verbose mode
"""
if not self._is_adamlike() and cycle_momentum:
@@ -826,7 +828,8 @@


num_samples = U.nsamples_from_data(self.train_data)
steps_per_epoch = math.ceil(num_samples/self.batch_size)
if steps_per_epoch is None:
steps_per_epoch = math.ceil(num_samples/self.batch_size)

# setup callbacks for learning rates and early stopping
if not callbacks: kcallbacks = []
@@ -853,7 +856,8 @@ def fit_onecycle(self, lr, epochs, checkpoint_folder=None,
verbose=verbose)
hist = self.fit(lr, epochs, early_stopping=None,
checkpoint_folder=checkpoint_folder,
verbose=verbose, class_weight=class_weight, callbacks=kcallbacks)
verbose=verbose, class_weight=class_weight, callbacks=kcallbacks,
steps_per_epoch=steps_per_epoch)
hist.history['lr'] = clr.history['lr']
hist.history['iterations'] = clr.history['iterations']
if cycle_momentum:
Expand All @@ -866,8 +870,8 @@ def fit_onecycle(self, lr, epochs, checkpoint_folder=None,
def autofit(self, lr, epochs=None,
early_stopping=None, reduce_on_plateau=None, reduce_factor=2,
cycle_momentum=True, max_momentum=0.95, min_momentum=0.85,
monitor='val_loss', checkpoint_folder=None, verbose=1,
class_weight=None, callbacks=[]):
monitor='val_loss', checkpoint_folder=None,
class_weight=None, callbacks=[], steps_per_epoch=None, verbose=1):
"""
Automatically train model using a default learning rate schedule shown to work well
in practice. By default, this method currently employs a triangular learning
@@ -916,6 +920,8 @@ def autofit(self, lr, epochs=None,
is enabled.
class_weight (dict): Optional dictionary mapping class indices (integers) to a weight (float)
callbacks (list): list of Callback instances to employ during training
steps_per_epoch(int): Steps per epoch. If None, then math.ceil(num_samples/batch_size) is used.
Ignored unless training dataset is a generator.
verbose (bool): verbose mode
"""
# check optimizer
@@ -927,7 +933,8 @@

# setup learning rate policy
num_samples = U.nsamples_from_data(self.train_data)
steps_per_epoch = math.ceil(num_samples/self.batch_size)
if steps_per_epoch is None:
steps_per_epoch = math.ceil(num_samples/self.batch_size)
step_size = math.ceil(steps_per_epoch/2)

# handle missing epochs
@@ -987,7 +994,8 @@ def autofit(self, lr, epochs=None,
verbose=verbose)
hist = self.fit(lr, epochs, early_stopping=early_stopping,
checkpoint_folder=checkpoint_folder,
verbose=verbose, class_weight=class_weight, callbacks=kcallbacks)
verbose=verbose, class_weight=class_weight, callbacks=kcallbacks,
steps_per_epoch=steps_per_epoch)
hist.history['lr'] = clr.history['lr']
hist.history['iterations'] = clr.history['iterations']
if cycle_momentum:
@@ -1056,7 +1064,7 @@ def __init__(self, model, train_data=None, val_data=None,

def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
lr_decay=1, checkpoint_folder = None, early_stopping=None,
verbose=1, class_weight=None, callbacks=[]):
verbose=1, class_weight=None, callbacks=[], steps_per_epoch=None):
"""
Trains the model. By default, fit is simply a wrapper for model.fit.
When cycle_len parameter is supplied, an SGDR learning rate schedule is used.
@@ -1081,6 +1089,8 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
with lowest validation loss.
callbacks (list): list of Callback instances to employ during training
class_weight (dict): Optional dictionary mapping class indices (integers) to a weight (float)
steps_per_epoch(int): Steps per epoch. If None, then math.ceil(num_samples/batch_size) is used.
Ignored unless training dataset is a generator (and thus always ignored in ArrayLearner instances).
verbose (bool): whether or not to show progress bar
"""

@@ -1233,7 +1243,7 @@ def __init__(self, model, train_data=None, val_data=None,

def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
lr_decay=1.0, checkpoint_folder=None, early_stopping=None,
class_weight=None, callbacks=[], verbose=1):
class_weight=None, callbacks=[], steps_per_epoch=None, verbose=1):
"""
Trains the model. By default, fit is simply a wrapper for model.fit (for generators/sequences).
When cycle_len parameter is supplied, an SGDR learning rate schedule is used.
@@ -1258,6 +1268,7 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
with lowest validation loss.
class_weight (dict): Optional dictionary mapping class indices (integers) to a weight (float)
callbacks (list): list of Callback instances to employ during training
steps_per_epoch(int): Steps per epoch. If None, then math.ceil(num_samples/batch_size) is used.
verbose (boolean): whether or not to print progress bar
"""
# check early_stopping
@@ -1268,7 +1279,8 @@
# handle callbacks
num_samples = U.nsamples_from_data(self.train_data)
train_bs = self.train_data.batch_size if hasattr(self.train_data, 'batch_size') else self.batch_size
steps_per_epoch = math.ceil(num_samples/train_bs)
if steps_per_epoch is None:
steps_per_epoch = math.ceil(num_samples/train_bs)
validation_steps = None
if self.val_data is not None:
val_bs = self.val_data.batch_size if hasattr(self.val_data, 'batch_size') else self.batch_size
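From user code, the new `steps_per_epoch` argument threaded through these training methods can be supplied like this (a minimal sketch; the model, generators, and values are placeholders):

```python
import ktrain

# Limit each "epoch" to a fixed number of generator batches, e.g. for very large datasets.
learner = ktrain.get_learner(model, train_data=train_gen, val_data=val_gen, batch_size=32)
learner.fit_onecycle(5e-5, 1, steps_per_epoch=1000)  # 1000 batches per epoch
learner.autofit(5e-5, 2, steps_per_epoch=1000)       # also accepted by autofit and fit
```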
9 changes: 9 additions & 0 deletions ktrain/text/preprocessor.py
@@ -415,6 +415,8 @@ def migrate_classes(self, class_names, classes):
return class_names


def get_tokenizer(self):
raise NotImplementedError('This method was not overridden in subclass')

def check_trained(self):
if not self.preprocess_train_called:
@@ -536,6 +538,9 @@ def __init__(self, maxlen, max_features, class_names=[], classes=[],
self.max_features = max_features
self.ngram_range = ngram_range

def get_tokenizer(self):
return self.tok


def __getstate__(self):
return {k: v for k, v in self.__dict__.items()}
@@ -742,6 +747,10 @@ def __init__(self, maxlen, max_features, class_names=[], classes=[],
self.ngram_range = 1 # ignored


def get_tokenizer(self):
return self.tok


def __getstate__(self):
return {k: v for k, v in self.__dict__.items()}

2 changes: 1 addition & 1 deletion setup.py
@@ -25,7 +25,7 @@
url = 'https://github.com/amaiya/ktrain',
keywords = ['tensorflow', 'keras', 'deep learning', 'machine learning'],
install_requires=[
'scikit-learn>=0.21.3', # previously pinned to 0.21.3 due to TextPredictor.explain, but no longer needed as of 0.19.7
'scikit-learn==0.23.2', # pinned to 0.23.2 due to breaking change in 0.24.x
'matplotlib >= 3.0.0',
'pandas >= 1.0.1',
'fastprogress >= 0.1.21',
