Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
amaiya committed Apr 30, 2020
2 parents f619e52 + 262f43f commit aa2c1a4
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 59 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour



## 0.14.4 (2020-04-30)

### New:
- N/A

### Changed:
- added `merge_tokens` and `return_proba` options to `NERPredictor.predict`

### Fixed:
- N/A


## 0.14.3 (2020-04-27)

### New:
Expand Down
16 changes: 5 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,8 @@ learner.validate(class_names=t.get_classes()) # class_names must be string value


Using *ktrain* on **Google Colab**? See these Colab examples:
- [a simple demo of Multiclass Text Classification with BERT](https://colab.research.google.com/drive/1AH3fkKiEqBpVpO5ua00scp7zcHs5IDLK).
- [a simple demo of Multiclass Text Classification with Hugging Face Transformers](https://colab.research.google.com/drive/1YxcceZxsNlvK35pRURgbwvkgejXwFxUt).
- [a simple demo of Multiclass Text Classification with BERT](https://colab.research.google.com/drive/1AH3fkKiEqBpVpO5ua00scp7zcHs5IDLK)
- [a simple demo of Multiclass Text Classification with Hugging Face Transformers](https://colab.research.google.com/drive/1YxcceZxsNlvK35pRURgbwvkgejXwFxUt)
- [image classification with Cats vs. Dogs](https://colab.research.google.com/drive/1WipQJUPL7zqyvLT10yekxf_HNMXDDtyR)

**Additional examples can be found [here](https://github.com/amaiya/ktrain/tree/master/examples).**
Expand All @@ -263,20 +263,14 @@ Using *ktrain* on **Google Colab**? See these Colab examples:

### Installation

Make sure pip is up-to-date with: `pip3 install -U pip`.

1. Ensure TensorFlow 2.1.0 [is installed](https://www.tensorflow.org/install/pip?lang=python3) if it is not already.
(While *ktrain* will probably work with other versions of TensorFlow 2.x, v2.1.0 is the current recommended and tested version.)

> For GPU: `pip3 install "tensorflow_gpu==2.1.0"`
> For CPU: `pip3 install "tensorflow==2.1.0"`
*ktrain* currently uses [TensorFlow 2.1.0](https://www.tensorflow.org/install/pip?lang=python3), which will be installed automatically when installing *ktrain*.
While *ktrain* will probably work with other versions of TensorFlow 2.x, v2.1.0 is the current recommended and tested version.

1. Make sure pip is up-to-date with: `pip3 install -U pip`

2. Install *ktrain*: `pip3 install ktrain`

**Some things to note:**
- As of v0.8.x, *ktrain* requires TensorFlow 2. TensorFlow 1.x (1.14, 1.15) is no longer supported.
- Since some *ktrain* dependencies have not yet been migrated to `tf.keras` in TensorFlow 2 (or may have other issues),
*ktrain* is temporarily using forked versions of some libraries. Specifically, *ktrain* uses forked versions of the `eli5` and `stellargraph` libraries. If not installed, *ktrain* will complain when a method or function needing
either of these libraries is invoked.
Expand Down
Binary file added ktrain/tests/text_data/wines.csv.zip
Binary file not shown.
74 changes: 68 additions & 6 deletions ktrain/text/ner/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ...predictor import Predictor
from .preprocessor import NERPreprocessor, tokenize
from ... import utils as U
from .. import textutils as TU

class NERPredictor(Predictor):
"""
Expand All @@ -25,20 +26,81 @@ def get_classes(self):
return self.c


def predict(self, sentence):
def predict(self, sentence, return_proba=False, merge_tokens=False):
"""
Makes predictions for a string-representation of a sentence
If return_proba is True, returns probabilities of each class.
Args:
sentence(str): sentence of text
return_proba(bool): If return_proba is True, returns probability distribution for each token
merge_tokens(bool): If True, tokens will be merged together by the entity
to which they are associated:
('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER')
Returns:
list: list of tuples representing each token.
"""
if not isinstance(sentence, str):
raise ValueError('Param sentence must be a string-representation of a sentence')
nerseq = self.preproc.preprocess([sentence])
if return_proba and merge_tokens:
raise ValueError('return_proba and merge_tokens are mutually exclusive with one another.')
lang = TU.detect_lang([sentence])
nerseq = self.preproc.preprocess([sentence], lang=lang)
if not nerseq.prepare_called:
nerseq.prepare()
nerseq.batch_size = self.batch_size
x_true, _ = nerseq[0]
lengths = nerseq.get_lengths(0)
y_pred = self.model.predict_on_batch(x_true)
y_pred = self.preproc.p.inverse_transform(y_pred, lengths)
y_pred = y_pred[0]
return list(zip(nerseq.x[0], y_pred))
y_labels = self.preproc.p.inverse_transform(y_pred, lengths)
y_labels = y_labels[0]
if return_proba:
#probs = np.max(y_pred, axis=2)[0]
y_pred = y_pred[0].numpy().tolist()
return list(zip(nerseq.x[0], y_labels, y_pred))
else:
result = list(zip(nerseq.x[0], y_labels))
if merge_tokens:
result = self.merge_tokens(result, lang)
return result


def merge_tokens(self, annotated_sentence, lang):

if TU.is_chinese(lang, strict=False): # strict=False: workaround for langdetect bug on short chinese texts
sep = ''
else:
sep = ' '

current_token = ""
current_tag = ""
entities = []

for tup in annotated_sentence:
token = tup[0]
entity = tup[1]
tag = entity.split('-')[1] if '-' in entity else None
prefix = entity.split('-')[0] if '-' in entity else None
# not within entity
if tag is None and not current_token:
continue
# beginning of entity
#elif tag and prefix=='B':
elif tag and (prefix=='B' or prefix=='I' and not current_token):
if current_token: # consecutive entities
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
current_token = token
current_tag = tag
# end of entity
elif tag is None and current_token:
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
continue
# within entity
elif tag and current_token: # prefix I
current_token = current_token + sep + token
current_tag = tag
return entities

4 changes: 2 additions & 2 deletions ktrain/text/ner/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ def get_wv_model(self, wv_path_or_url, verbose=1):



def preprocess(self, sentences):
def preprocess(self, sentences, lang=None):
if type(sentences) != list:
raise ValueError('Param sentences must be a list of strings')

# language detection
lang = TU.detect_lang(sentences)
if lang is None: lang = TU.detect_lang(sentences)
X = []
y = []
for s in sentences:
Expand Down
79 changes: 40 additions & 39 deletions ktrain/text/shallownlp/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,49 +71,50 @@ def predict(self, texts, merge_tokens=True):
results = []
for text in texts:
text = text.strip()
result = self.predictor.predict(text)
if merge_tokens:
result = self.merge_tokens(result)
result = self.predictor.predict(text, merge_tokens=merge_tokens)
#if merge_tokens:
#result = self.merge_tokens(result)
results.append(result)
if len(result) == 1: result = result[0]
return result


def merge_tokens(self, annotated_sentence):
if self.lang.startswith('zh'):
sep = ''
else:
sep = ' '
current_token = ""
current_tag = ""
entities = []
# 2020-04-30: moved to text.ner.predictor
#def merge_tokens(self, annotated_sentence):
# if self.lang.startswith('zh'):
# sep = ''
# else:
# sep = ' '
# current_token = ""
# current_tag = ""
# entities = []

for tup in annotated_sentence:
token = tup[0]
entity = tup[1]
tag = entity.split('-')[1] if '-' in entity else None
prefix = entity.split('-')[0] if '-' in entity else None
# not within entity
if tag is None and not current_token:
continue
# beginning of entity
#elif tag and prefix=='B':
elif tag and (prefix=='B' or prefix=='I' and not current_token):
if current_token: # consecutive entities
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
current_token = token
current_tag = tag
# end of entity
elif tag is None and current_token:
entities.append((current_token, current_tag))
current_token = ""
current_tag = None
continue
# within entity
elif tag and current_token: # prefix I
current_token = current_token + sep + token
current_tag = tag
return entities
# for tup in annotated_sentence:
# token = tup[0]
# entity = tup[1]
# tag = entity.split('-')[1] if '-' in entity else None
# prefix = entity.split('-')[0] if '-' in entity else None
# # not within entity
# if tag is None and not current_token:
# continue
# # beginning of entity
# #elif tag and prefix=='B':
# elif tag and (prefix=='B' or prefix=='I' and not current_token):
# if current_token: # consecutive entities
# entities.append((current_token, current_tag))
# current_token = ""
# current_tag = None
# current_token = token
# current_tag = tag
# # end of entity
# elif tag is None and current_token:
# entities.append((current_token, current_tag))
# current_token = ""
# current_tag = None
# continue
# # within entity
# elif tag and current_token: # prefix I
# current_token = current_token + sep + token
# current_tag = tag
# return entities

2 changes: 1 addition & 1 deletion ktrain/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.14.3'
__version__ = '0.14.4'

0 comments on commit aa2c1a4

Please sign in to comment.