Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
amaiya committed Apr 30, 2020
2 parents f619e52 + 262f43f commit aa2c1a4
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 59 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour



## 0.14.4 (2020-04-30)

### New:
- N/A

### Changed:
- added `merge_tokens` and `return_proba` options to `NERPredictor.predict`

### Fixed:
- N/A


## 0.14.3 (2020-04-27)

### New:
Expand Down
16 changes: 5 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,8 @@ learner.validate(class_names=t.get_classes()) # class_names must be string value


Using *ktrain* on **Google Colab**? See these Colab examples:
- [a simple demo of Multiclass Text Classification with BERT](https://colab.research.google.com/drive/1AH3fkKiEqBpVpO5ua00scp7zcHs5IDLK).
- [a simple demo of Multiclass Text Classification with Hugging Face Transformers](https://colab.research.google.com/drive/1YxcceZxsNlvK35pRURgbwvkgejXwFxUt).
- [a simple demo of Multiclass Text Classification with BERT](https://colab.research.google.com/drive/1AH3fkKiEqBpVpO5ua00scp7zcHs5IDLK)
- [a simple demo of Multiclass Text Classification with Hugging Face Transformers](https://colab.research.google.com/drive/1YxcceZxsNlvK35pRURgbwvkgejXwFxUt)
- [image classification with Cats vs. Dogs](https://colab.research.google.com/drive/1WipQJUPL7zqyvLT10yekxf_HNMXDDtyR)

**Additional examples can be found [here](https://github.com/amaiya/ktrain/tree/master/examples).**
Expand All @@ -263,20 +263,14 @@ Using *ktrain* on **Google Colab**? See these Colab examples:

### Installation

Make sure pip is up-to-date with: `pip3 install -U pip`.

1. Ensure TensorFlow 2.1.0 [is installed](https://www.tensorflow.org/install/pip?lang=python3) if it is not already.
(While *ktrain* will probably work with other versions of TensorFlow 2.x, v2.1.0 is the current recommended and tested version.)

> For GPU: `pip3 install "tensorflow_gpu==2.1.0"`
> For CPU: `pip3 install "tensorflow==2.1.0"`
*ktrain* currently uses [TensorFlow 2.1.0](https://www.tensorflow.org/install/pip?lang=python3), which will be installed automatically when installing *ktrain*.
While *ktrain* will probably work with other versions of TensorFlow 2.x, v2.1.0 is the current recommended and tested version.

1. Make sure pip is up-to-date with: `pip3 install -U pip`

2. Install *ktrain*: `pip3 install ktrain`

**Some things to note:**
- As of v0.8.x, *ktrain* requires TensorFlow 2. TensorFlow 1.x (1.14, 1.15) is no longer supported.
- Since some *ktrain* dependencies have not yet been migrated to `tf.keras` in TensorFlow 2 (or may have other issues),
*ktrain* is temporarily using forked versions of some libraries. Specifically, *ktrain* uses forked versions of the `eli5` and `stellargraph` libraries. If not installed, *ktrain* will complain when a method or function needing
either of these libraries is invoked.
Expand Down
Binary file added ktrain/tests/text_data/wines.csv.zip
Binary file not shown.
74 changes: 68 additions & 6 deletions ktrain/text/ner/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ...predictor import Predictor
from .preprocessor import NERPreprocessor, tokenize
from ... import utils as U
from .. import textutils as TU

class NERPredictor(Predictor):
"""
Expand All @@ -25,20 +26,81 @@ def get_classes(self):
return self.c


def predict(self, sentence):
def predict(self, sentence, return_proba=False, merge_tokens=False):
"""
Makes predictions for a string-representation of a sentence
If return_proba is True, returns probabilities of each class.
Args:
sentence(str): sentence of text
return_proba(bool): If return_proba is True, returns probability distribution for each token
merge_tokens(bool): If True, tokens will be merged together by the entity
to which they are associated:
('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER')
Returns:
list: list of tuples representing each token.
"""
if not isinstance(sentence, str):
raise ValueError('Param sentence must be a string-representation of a sentence')
nerseq = self.preproc.preprocess([sentence])
if return_proba and merge_tokens:
raise ValueError('return_proba and merge_tokens are mutually exclusive with one another.')
lang = TU.detect_lang([sentence])
nerseq = self.preproc.preprocess([sentence], lang=lang)
if not nerseq.prepare_called:
nerseq.prepare()
nerseq.batch_size = self.batch_size
x_true, _ = nerseq[0]
lengths = nerseq.get_lengths(0)
y_pred = self.model.predict_on_batch(x_true)
y_pred = self.preproc.p.inverse_transform(y_pred, lengths)
y_pred = y_pred[0]
return list(zip(nerseq.x[0], y_pred))
y_labels = self.preproc.p.inverse_transform(y_pred, lengths)
y_labels = y_labels[0]
if return_proba:
#probs = np.max(y_pred, axis=2)[0]
y_pred = y_pred[0].numpy().tolist()
return list(zip(nerseq.x[0], y_labels, y_pred))
else:
result = list(zip(nerseq.x[0], y_labels))
if merge_tokens:
result = self.merge_tokens(result, lang)
return result


def merge_tokens(self, annotated_sentence, lang):

if TU.is_chinese(lang, strict=False): # strict=False: workaround for langdetect bug on short chinese texts
sep = ''
else:
sep = ' '

current_token = ""
current_tag = ""
entities = []

for tup in annotated_sentence:
token = tup[0]
entity = tup[1]
tag = entity.split('-')[1] if '-' in entity else None
prefix = entity.split('-')[0] if '-' in entity else None
# not within entity
if tag is None and not current_token:
continue
# beginning of entity
#elif tag and prefix=='B':
elif tag and (prefix=='B' or prefix=='I' and not current_token):
if current_token: # consecutive entities
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
current_token = token
current_tag = tag
# end of entity
elif tag is None and current_token:
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
continue
# within entity
elif tag and current_token: # prefix I
current_token = current_token + sep + token
current_tag = tag
return entities

4 changes: 2 additions & 2 deletions ktrain/text/ner/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ def get_wv_model(self, wv_path_or_url, verbose=1):



def preprocess(self, sentences):
def preprocess(self, sentences, lang=None):
if type(sentences) != list:
raise ValueError('Param sentences must be a list of strings')

# language detection
lang = TU.detect_lang(sentences)
if lang is None: lang = TU.detect_lang(sentences)
X = []
y = []
for s in sentences:
Expand Down
79 changes: 40 additions & 39 deletions ktrain/text/shallownlp/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,49 +71,50 @@ def predict(self, texts, merge_tokens=True):
results = []
for text in texts:
text = text.strip()
result = self.predictor.predict(text)
if merge_tokens:
result = self.merge_tokens(result)
result = self.predictor.predict(text, merge_tokens=merge_tokens)
#if merge_tokens:
#result = self.merge_tokens(result)
results.append(result)
if len(result) == 1: result = result[0]
return result


def merge_tokens(self, annotated_sentence):
if self.lang.startswith('zh'):
sep = ''
else:
sep = ' '
current_token = ""
current_tag = ""
entities = []
# 2020-04-30: moved to text.ner.predictor
#def merge_tokens(self, annotated_sentence):
# if self.lang.startswith('zh'):
# sep = ''
# else:
# sep = ' '
# current_token = ""
# current_tag = ""
# entities = []

for tup in annotated_sentence:
token = tup[0]
entity = tup[1]
tag = entity.split('-')[1] if '-' in entity else None
prefix = entity.split('-')[0] if '-' in entity else None
# not within entity
if tag is None and not current_token:
continue
# beginning of entity
#elif tag and prefix=='B':
elif tag and (prefix=='B' or prefix=='I' and not current_token):
if current_token: # consecutive entities
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
current_token = token
current_tag = tag
# end of entity
elif tag is None and current_token:
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
continue
# within entity
elif tag and current_token: # prefix I
current_token = current_token + sep + token
current_tag = tag
return entities
# for tup in annotated_sentence:
# token = tup[0]
# entity = tup[1]
# tag = entity.split('-')[1] if '-' in entity else None
# prefix = entity.split('-')[0] if '-' in entity else None
# # not within entity
# if tag is None and not current_token:
# continue
# # beginning of entity
# #elif tag and prefix=='B':
# elif tag and (prefix=='B' or prefix=='I' and not current_token):
# if current_token: # consecutive entities
# entities.append((current_token, current_tag))
# current_token = ""
# current_tag = None
# current_token = token
# current_tag = tag
# # end of entity
# elif tag is None and current_token:
# entities.append((current_token, current_tag))
# current_token = ""
# current_tag = None
# continue
# # within entity
# elif tag and current_token: # prefix I
# current_token = current_token + sep + token
# current_tag = tag
# return entities

2 changes: 1 addition & 1 deletion ktrain/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.14.3'
__version__ = '0.14.4'

0 comments on commit aa2c1a4

Please sign in to comment.