Merge branch 'develop'
amaiya committed Jan 15, 2020
2 parents 54c7735 + 29a9cb7 commit 24ec38b
Showing 7 changed files with 51 additions and 18 deletions.
15 changes: 14 additions & 1 deletion CHANGELOG.md
@@ -6,7 +6,20 @@ Most recent releases are shown at the top. Each release shows:
- **Changed**: Additional parameters, changes to inputs or outputs, etc
- **Fixed**: Bug fixes that don't change documented behaviour

## 0.8.0 (2020-01-13)
## 0.8.1 (2020-01-15)

### New:
- N/A

### Changed:
- N/A

### Fixed:
- fix to support multilabel text classification
- `_prepare_dataset` no longer breaks when validation dataset has not been supplied


## 0.8.0 (2020-01-14)

### New:
- availability of a new, simplified interface to Hugging Face transformer models
6 changes: 3 additions & 3 deletions README.md
@@ -6,7 +6,7 @@

### News and Announcements
- **2020-01-14:**
- ***ktrain*** **v0.8.x is released** and now includes a simplified interface for text classification with [HuggingFace Transformers](https://github.com/huggingface/transformers). See [this tutorial notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/develop/tutorials/tutorial-A3-hugging_face_transformers.ipynb) for more details.
- ***ktrain*** **v0.8.x is released** and now includes a thin and easy-to-use wrapper to [HuggingFace Transformers](https://github.com/huggingface/transformers) for text classification. See [this tutorial notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/develop/tutorials/tutorial-A3-hugging_face_transformers.ipynb) for more details, and the usage sketch after this list.
- As of v0.8.x, *ktrain* now uses **TensorFlow 2**. TensorFlow 1.x is no longer supported. If you're using Google Colab and `import tensorflow as tf; print(tf.__version__)` shows v1.15 is installed, you must install TensorFlow 2: `!pip3 install -q tensorflow_gpu==2.0`. Remember to import Keras modules like this: `from tensorflow.keras.layers import Dense`. (That is, don't do this: `from keras.layers import Dense`.)
- **Coming Soon**:
- better support for custom data formats and models
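A minimal usage sketch of the new Transformers wrapper announced above, adapted from the linked tutorial notebook (the model name, toy data, and hyperparameters here are illustrative):

```python
import ktrain
from ktrain import text

x_train = ['I loved this movie', 'utterly boring']  # toy data for illustration
y_train = [1, 0]                                    # class indices into `classes`

t = text.Transformer('distilbert-base-uncased', maxlen=128, classes=['neg', 'pos'])
trn = t.preprocess_train(x_train, y_train)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, batch_size=2)
learner.fit_onecycle(5e-5, 1)
```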
@@ -22,7 +22,7 @@
- utilize learning rate schedules such as the [triangular policy](https://arxiv.org/abs/1506.01186), the [1cycle policy](https://arxiv.org/abs/1803.09820), and [SGDR](https://arxiv.org/abs/1608.03983) to effectively minimize loss and improve generalization (see the sketch after this list)
- employ fast and easy-to-use pre-canned models for `text`, `vision`, and `graph` data:
- `text` data:
- **Text Classification**: [BERT](https://arxiv.org/abs/1810.04805), [NBSVM](https://www.aclweb.org/anthology/P12-2018), [fastText](https://arxiv.org/abs/1607.01759), GRUs with [pretrained word vectors](https://fasttext.cc/docs/en/english-vectors.html), and other models <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/IMDb-BERT.ipynb)]</sup></sub>
- **Text Classification**: [BERT](https://arxiv.org/abs/1810.04805), [DistilBERT](https://arxiv.org/abs/1910.01108), [NBSVM](https://www.aclweb.org/anthology/P12-2018), [fastText](https://arxiv.org/abs/1607.01759), and other models <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/IMDb-BERT.ipynb)]</sup></sub>
- **Sequence Labeling**: [Bidirectional LSTM-CRF](https://arxiv.org/abs/1603.01360) with optional pretrained word embeddings <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/tutorial-06-sequence-tagging.ipynb)]</sup></sub>
- **Unsupervised Topic Modeling** with [LDA](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf) <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/20newsgroups-topic_modeling.ipynb)]</sup></sub>
- **Document Similarity with One-Class Learning**: given some documents of interest, find and score new documents that are semantically similar to them using [One-Class Text Classification](https://en.wikipedia.org/wiki/One-class_classification) <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/20newsgroups-document_similarity_scorer.ipynb)]</sup></sub>
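Continuing from a `learner` built as in the sketch above, the learning-rate tools mentioned in this list are driven as follows (the rates are illustrative):

```python
learner.lr_find()              # simulate training over a range of learning rates
learner.lr_plot()              # plot loss vs. learning rate to pick a maximum
learner.fit_onecycle(2e-5, 4)  # 1cycle policy for 4 epochs
learner.autofit(2e-5)          # triangular policy with early stopping
```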
@@ -58,7 +58,7 @@ Some blog tutorials about *ktrain* are shown below:

> [**BERT Text Classification in 3 Lines of Code**](https://towardsdatascience.com/bert-text-classification-in-3-lines-of-code-using-keras-264db7e7a358)
> [**Explainable AI in Practice**](https://medium.com/@asmaiya/explainable-ai-in-practice-2e5ae2d16dc7)
> [**Text Classification with Hugging Face Transformers in TensorFlow 2 (Without Tears)**](https://medium.com/@asmaiya/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed)


1 change: 1 addition & 0 deletions ktrain/text/learner.py
@@ -134,6 +134,7 @@ def _prepare(self, data, mode='train'):
        # HF_EXCEPTION
        # convert arrays to TF dataset (iterator) on-the-fly
        # to work around issues with transformers and tf.Datasets
        if data is None: return None
        shuffle = True
        repeat = True
        if mode != 'train':
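The `if data is None` guard added above is the fix noted in the 0.8.1 changelog; a sketch of the call pattern it repairs, assuming `model` and `trn` were built as in the README sketch earlier:

```python
# Before 0.8.1, omitting the validation set tripped over the None dataset;
# with the guard, a learner can be built from training data alone.
learner = ktrain.get_learner(model, train_data=trn, val_data=None, batch_size=6)
learner.fit_onecycle(5e-5, 1)
```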
30 changes: 23 additions & 7 deletions ktrain/text/preprocessor.py
@@ -354,6 +354,7 @@ def __init__(self, maxlen, classes, lang='en', multilabel=False):
        self.c = classes
        self.maxlen = maxlen
        self.lang = lang
        self.multilabel = multilabel


    def get_preprocessor(self):
@@ -754,7 +755,7 @@ class Transformer(TransformersPreprocessor):
"""

def __init__(self, model_name, maxlen=128, classes=[],
batch_size=None, is_multilabel=False,
batch_size=None, multilabel=False,
use_with_learner=True):
"""
Args:
@@ -768,7 +769,7 @@ def __init__(self, model_name, maxlen=128, classes=[],
                return a ktrain TransformerSequence object for use with
                ktrain.get_learner.
            batch_size (int): batch_size - only required if use_with_learner=False
            is_multilabel (int): if True, classifier will be configured for
            multilabel (bool): if True, classifier will be configured for
                multilabel classification.
        """
@@ -777,15 +778,20 @@ def __init__(self, model_name, maxlen=128, classes=[],
        if classes is None or not classes:
            raise ValueError('classes argument is required - provide list of class names as strings')
        super().__init__(model_name,
                         maxlen, max_features=10000, classes=classes)
                         maxlen, max_features=10000, classes=classes, multilabel=multilabel)
        self.batch_size = batch_size
        self.is_multilabel = is_multilabel
        self.use_with_learner = use_with_learner


    def preprocess_train(self, texts, y=None, mode='train', verbose=1):
        """
        preprocess training set
        Preprocess training set for a Transformer model
        Each label can be in the form of either:
        1) integer representing the class (index into array returned by get_classes)
           for binary and multiclass text classification
        2) multi-hot-encoded vector for multilabel text classification problems
        Args:
            texts (list of strings): text of documents
            y: labels
@@ -804,7 +810,17 @@ def preprocess_train(self, texts, y=None, mode='train', verbose=1):

    def preprocess_test(self, texts, y=None, verbose=1):
        """
        preprocess validation or test datasets
        Preprocess the validation or test set for a Transformer model
        Each label can be in the form of either:
        1) integer representing the class (index into array returned by get_classes)
           for binary and multiclass text classification
        2) multi-hot-encoded vector for multilabel text classification problems
        Args:
            texts (list of strings): text of documents
            y: labels
            verbose (bool): verbosity
        """
        return self.preprocess_train(texts, y=y, mode='test', verbose=verbose)
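To make the two label formats described above concrete, a sketch of a multilabel setup (the tag names and documents are made up):

```python
from ktrain import text

classes = ['python', 'keras', 'nlp']
t = text.Transformer('distilbert-base-uncased', maxlen=128,
                     classes=classes, multilabel=True)

x_train = ['keras tokenizer question', 'parsing text in python']
y_train = [[0, 1, 0], [1, 0, 1]]   # format 2: one multi-hot row per document
trn = t.preprocess_train(x_train, y_train)

x_test = ['building an nlp model in keras']
y_test = [[0, 1, 1]]
val = t.preprocess_test(x_test, y_test)
```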

@@ -813,7 +829,7 @@ def preprocess_test(self, texts, y=None, verbose=1):
    def get_classifier(self):
        num_labels = len(self.get_classes())
        model = self.model_type.from_pretrained(self.model_name, num_labels=num_labels)
        if self.is_multilabel:
        if self.multilabel:
            loss_fn = keras.losses.BinaryCrossentropy(from_logits=True)
        else:
            loss_fn = keras.losses.CategoricalCrossentropy(from_logits=True)
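For context on the branch above: the TF Hugging Face models return raw logits (no final activation), hence `from_logits=True` on both losses, and the multilabel case gets binary crossentropy so each class is an independent yes/no decision. A sketch of how such a loss is typically attached to the returned model (the optimizer settings are illustrative, not taken from this diff):

```python
from tensorflow import keras

# assuming `model` is the classifier returned by get_classifier() above
loss_fn = keras.losses.BinaryCrossentropy(from_logits=True)
model.compile(loss=loss_fn,
              optimizer=keras.optimizers.Adam(learning_rate=5e-5),
              metrics=['accuracy'])
```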
13 changes: 8 additions & 5 deletions ktrain/utils.py
@@ -40,12 +40,15 @@ def is_classifier(model):

    # check for multilabel
    if loss == 'binary_crossentropy':
        last = model.layers[-1]
        output_shape = last.output_shape
        mult_output = True if len(output_shape) == 2 and output_shape[1] > 1 else False
        if ( (hasattr(last, 'activation') and isinstance(last.activation, type(sigmoid))) or\
             isinstance(last, type(sigmoid)) ) and mult_output:
            is_multilabel = True
        if is_huggingface(model=model):
            is_multilabel = True
        else:
            last = model.layers[-1]
            output_shape = last.output_shape
            mult_output = True if len(output_shape) == 2 and output_shape[1] > 1 else False
            if ( (hasattr(last, 'activation') and isinstance(last.activation, type(sigmoid))) or\
                 isinstance(last, type(sigmoid)) ) and mult_output:
                is_multilabel = True
    return (is_classifier, is_multilabel)
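A small sketch of how this heuristic behaves on a plain (non-Hugging-Face) Keras model; the toy model below is illustrative, not from this repo:

```python
from tensorflow import keras

# toy multilabel classifier: >1 output unit, sigmoid activation, binary crossentropy
model = keras.Sequential([
    keras.layers.Dense(16, activation='relu', input_shape=(8,)),
    keras.layers.Dense(3, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam')

# under the rules above, is_classifier(model) would be expected to
# return (True, True): a classifier, configured for multilabel
```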


2 changes: 1 addition & 1 deletion ktrain/version.py
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.8.0'
__version__ = '0.8.1'
2 changes: 1 addition & 1 deletion tutorials/tutorial-A3-hugging_face_transformers.ipynb
@@ -20,7 +20,7 @@
"source": [
"# Text Classification with Hugging Face Transformers in *ktrain*\n",
"\n",
"As of v0.8.x, *ktrain* now includes a simplifed interface for text classification with the Hugging Face transformers library."
"As of v0.8.x, *ktrain* now includes an easy-to-use, thin wrapper to the Hugging Face transformers library for text classification."
]
},
{
