Merge branch 'develop'

amaiya · Feb 4, 2020 · f25d22c · f25d22c
2 parents ed0668a + 2802d14
commit f25d22c
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 21 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,20 @@ Most recent releases are shown at the top. Each release shows:
 - **Fixed**: Bug fixes that don't change documented behaviour
 
 
+## 0.9.2 (2020-02-04)
+
+### New:
+- N/A
+
+### Changed:
+- Removed Exception when `distilbert` is selected in `text_classifier` for non-English languages after 
+  [Hugging Face fixed the reported bug](https://github.com/huggingface/transformers/issues/2462). 
+
+### Fixed:
+- XLNet models like `xlnet-base-cased` now work after casting input arrays to `int32`
+- modified `TextPredictor.explain` to propagate the correct error message from `eli5` for multilabel text classification.
+
+
 ## 0.9.1 (2020-02-01)
 
 ### New:

diff --git a/examples/text/ChineseHotelReviews-BERT.ipynb b/examples/text/ChineseHotelReviews-BERT.ipynb
@@ -124,13 +124,13 @@
     }
    ],
    "source": [
-    "(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder('/home/amaiya/data/ChnSentiCorp_htl_ba_6000', \n",
-    "                                                                       maxlen=75, \n",
-    "                                                                       max_features=30000,\n",
-    "                                                                       preprocess_mode='bert',\n",
-    "                                                                       train_test_names=['train'],\n",
-    "                                                                       val_pct=0.1,\n",
-    "                                                                       classes=['pos', 'neg'])"
+    "trn, val, preproc = text.texts_from_folder('/home/amaiya/data/ChnSentiCorp_htl_ba_6000', \n",
+    "                                            maxlen=75, \n",
+    "                                            max_features=30000,\n",
+    "                                            preprocess_mode='bert',\n",
+    "                                            train_test_names=['train'],\n",
+    "                                            val_pct=0.1,\n",
+    "                                            classes=['pos', 'neg'])"
    ]
   },
   {
@@ -156,10 +156,10 @@
     }
    ],
    "source": [
-    "model = text.text_classifier('bert', (x_train, y_train) , preproc=preproc)\n",
+    "model = text.text_classifier('bert', trn, preproc=preproc)\n",
     "learner = ktrain.get_learner(model, \n",
-    "                             train_data=(x_train, y_train), \n",
-    "                             val_data=(x_test, y_test), \n",
+    "                             train_data=trn, \n",
+    "                             val_data=val, \n",
     "                             batch_size=32)"
    ]
   },
@@ -423,7 +423,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.6.9"
   }
  },
  "nbformat": 4,

diff --git a/ktrain/text/predictor.py b/ktrain/text/predictor.py
@@ -38,7 +38,7 @@ def predict(self, texts, return_proba=False):
         elif not isinstance(texts, np.ndarray) and not isinstance(texts, list):
             raise ValueError('data must be numpy.ndarray or list (of texts)')
         classification, multilabel = U.is_classifier(self.model)
-        if multilabel: return_proba = True
+        #if multilabel: return_proba = True
         #treat_multilabel = False
         #loss = self.model.loss
         #if loss != 'categorical_crossentropy' and not return_proba:
@@ -55,8 +55,8 @@ def predict(self, texts, return_proba=False):
             else:
                 preds = np.squeeze(preds)
                 if len(preds.shape) == 0: preds = np.expand_dims(preds, -1)
-        result =  preds if return_proba or not self.c else [self.c[np.argmax(pred)] for pred in preds] 
-        if multilabel:
+        result =  preds if return_proba or multilabel or not self.c else [self.c[np.argmax(pred)] for pred in preds] 
+        if multilabel and not return_proba:
             result =  [list(zip(self.c, r)) for r in result]
         if is_str: return result[0]
         else:      return result

diff --git a/ktrain/text/preprocessor.py b/ktrain/text/preprocessor.py
@@ -793,8 +793,6 @@ def __init__(self, maxlen, max_features, classes=[],
             model_name = 'distilbert-base-uncased'
         else:
             model_name = 'distilbert-base-multilingual-cased'
-            raise Exception('currently_unsupported: non-English languages are not currently supported for '+\
-                            'distilbert due to issues with TF2 version of transformers library. ')
 
         super().__init__(model_name,
                          maxlen, max_features, classes=classes, 
@@ -948,10 +946,28 @@ def to_tfdataset(self, shuffle=True, repeat=True):
         """
         convert transformer features to tf.Dataset
         """
-        tfdataset = tf.data.Dataset.from_tensor_slices((self.x, self.y))
-        tfdataset = tfdataset.map(lambda x,y: ({'input_ids': x[0], 
-                                                'attention_mask': x[1], 
-                                                 'token_type_ids': x[2]}, y))
+        if len(self.y.shape) == 1:
+            yshape = []
+        else:
+            yshape = [None]
+
+        def gen():
+            for idx, data in enumerate(self.x):
+                yield ({'input_ids': data[0],
+                         'attention_mask': data[1],
+                         'token_type_ids': data[2]},
+                        self.y[idx])
+
+        tfdataset= tf.data.Dataset.from_generator(gen,
+            ({'input_ids': tf.int32,
+              'attention_mask': tf.int32,
+              'token_type_ids': tf.int32},
+             tf.int64),
+            ({'input_ids': tf.TensorShape([None]),
+              'attention_mask': tf.TensorShape([None]),
+              'token_type_ids': tf.TensorShape([None])},
+             tf.TensorShape(yshape)))
+
         if shuffle:
             tfdataset = tfdataset.shuffle(self.x.shape[0])
         tfdataset = tfdataset.batch(self.batch_size)

diff --git a/ktrain/version.py b/ktrain/version.py
@@ -1,2 +1,2 @@
 __all__ = ['__version__']
-__version__ = '0.9.1'
+__version__ = '0.9.2'