Merge branch 'develop'

amaiya · Apr 27, 2020 · 4b24963 · 4b24963
2 parents 87ac4dd + 8aed2c7
commit 4b24963
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,19 @@ Most recent releases are shown at the top. Each release shows:
 - **Fixed**: Bug fixes that don't change documented behaviour
 
 
+## 0.14.3 (2020-04-27)
+
+### New:
+- N/A
+
+### Changed
+- added `textutils` to `text` namespace and added note about `sent_tokenize` to sequence-tagging tutorial
+
+### Fixed:
+- cast dependent variable to `tf.float32` instead of `tf.int64` for text regression problems using `transformers` library
+
+
+
 ## 0.14.2 (2020-04-21)
 
 ### New:

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-### [Overview](#overview) | [Tutorials](#tutorials) | [Examples](#examples) |  [Installation](#installation)
+### [Overview](#overview) | [Tutorials](#tutorials) | [Examples](#examples) |  [Installation](#installation) | [How to Cite](#how-to-cite)
 [![PyPI Status](https://badge.fury.io/py/ktrain.svg)](https://badge.fury.io/py/ktrain) [![ktrain python compatibility](https://img.shields.io/pypi/pyversions/ktrain.svg)](https://pypi.python.org/pypi/ktrain) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/amaiya/ktrain/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/ktrain)](https://pepy.tech/project/ktrain) [![Downloads](https://pepy.tech/badge/ktrain/month)](https://pepy.tech/project/ktrain/month)
 
 
@@ -53,7 +53,7 @@ learner.fit(0.01, 1, cycle_len=5)
      - **Text Summarization**:  summarize long documents with a pretrained BART model - no training required <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/text_summarization_with_bart.ipynb)]</sup></sub>
      - **Open-Domain Question-Answering**:  ask a large text corpus questions and receive exact answers <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/question_answering_with_bert.ipynb)]</sup></sub>
   - `vision` data:
-    - **image classification** (e.g., [ResNet](https://arxiv.org/abs/1512.03385), [Wide ResNet](https://arxiv.org/abs/1605.07146), [Inception](https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf)) <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/vision/dogs_vs_cats-ResNet50.ipynb)]</sup></sub>
+    - **image classification** (e.g., [ResNet](https://arxiv.org/abs/1512.03385), [Wide ResNet](https://arxiv.org/abs/1605.07146), [Inception](https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf)) <sub><sup>[[example notebook](https://colab.research.google.com/drive/1WipQJUPL7zqyvLT10yekxf_HNMXDDtyR)]</sup></sub>
   - `graph` data:
     - **node classification** with graph neural networks ([GraphSAGE](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf)) <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/graphs/pubmed_node_classification-GraphSAGE.ipynb)]</sup></sub>
     - **link prediction** with graph neural networks ([GraphSAGE](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf)) <sub><sup>[[example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/graphs/cora_link_prediction-GraphSAGE.ipynb)]</sup></sub>
@@ -252,7 +252,10 @@ learner.validate(class_names=t.get_classes()) # class_names must be string value
 ```
 
 
-Using *ktrain* on **Google Colab**?  See [this simple demo of Multiclass Text Classification with BERT](https://colab.research.google.com/drive/1AH3fkKiEqBpVpO5ua00scp7zcHs5IDLK).
+Using *ktrain* on **Google Colab**?  See these Colab examples:
+-  [a simple demo of Multiclass Text Classification with BERT](https://colab.research.google.com/drive/1AH3fkKiEqBpVpO5ua00scp7zcHs5IDLK).
+-  [a simple demo of Multiclass Text Classification with Hugging Face Transformers](https://colab.research.google.com/drive/1YxcceZxsNlvK35pRURgbwvkgejXwFxUt).
+-  [image classification with Cats vs. Dogs](https://colab.research.google.com/drive/1WipQJUPL7zqyvLT10yekxf_HNMXDDtyR).
 
 **Additional examples can be found [here](https://github.com/amaiya/ktrain/tree/master/examples).**
 
@@ -283,7 +286,22 @@ pip3 install git+https://github.com/amaiya/eli5@tfkeras_0_10_1
 pip3 install git+https://github.com/amaiya/stellargraph@no_tf_dep_082
 ```
 
+This code was tested on Ubuntu 18.04 LTS using TensorFlow 2.1.0
+
+
+### How to Cite
 
+Please cite the following paper when using **ktrain**:
+```
+@misc{maiya2020ktrain,
+    title={ktrain: A Low-Code Library for Augmented Machine Learning},
+    author={Arun S. Maiya},
+    year={2020},
+    eprint={2004.10703},
+    archivePrefix={arXiv},
+    primaryClass={cs.LG}
+}
+```
 
 
 <!--
@@ -302,7 +320,6 @@ The following software/libraries should be installed:
 -->
 
 
-This code was tested on Ubuntu 18.04 LTS using TensorFlow 2.1.0
 
 ----
 **Creator:  [Arun S. Maiya](http://arun.maiya.net)**

diff --git a/ktrain/core.py b/ktrain/core.py
@@ -465,7 +465,7 @@ def lr_find(self, start_lr=1e-7, lr_mult=1.01, max_epochs=None,
         U.vprint('\n', verbose=verbose)
         U.vprint('done.', verbose=verbose)
         if show_plot:
-            U.vprint('Visually inspect the loss plot to help identify the maximal learning rate', verbose=verbose)
+            U.vprint('Visually inspect loss plot and select learning rate associated with falling loss', verbose=verbose)
             self.lr_plot()
         else:
             U.vprint('Please invoke the Learner.lr_plot() method to visually inspect '

diff --git a/ktrain/text/__init__.py b/ktrain/text/__init__.py
@@ -8,6 +8,7 @@
 from .summarization import TransformerSummarizer
 from . import shallownlp
 from .qa import SimpleQA
+from . import textutils
 
 __all__ = [
            'text_classifier', 'text_regression_model',
@@ -21,13 +22,13 @@
            'sequence_tagger',
            'print_sequence_taggers',
            'get_topic_model',
-           'extract_filenames', 
-           'load_text_files',
            'Transformer',
            'TranformerEmbedding',
            'shallownlp',
            'TransformerSummarizer',
-           'SimpleQA'
+           'SimpleQA',
+           'extract_filenames', 
+           'load_text_files',
            ]
 
 

diff --git a/ktrain/text/preprocessor.py b/ktrain/text/preprocessor.py
@@ -1190,8 +1190,10 @@ def to_tfdataset(self, shuffle=True, repeat=True):
         """
         if len(self.y.shape) == 1:
             yshape = []
+            ytype = tf.float32
         else:
             yshape = [None]
+            ytype = tf.int64
 
         def gen():
             for idx, data in enumerate(self.x):
@@ -1204,7 +1206,7 @@ def gen():
             ({'input_ids': tf.int32,
               'attention_mask': tf.int32,
               'token_type_ids': tf.int32},
-             tf.int64),
+             ytype),
             ({'input_ids': tf.TensorShape([None]),
               'attention_mask': tf.TensorShape([None]),
               'token_type_ids': tf.TensorShape([None])},

diff --git a/ktrain/version.py b/ktrain/version.py
@@ -1,2 +1,2 @@
 __all__ = ['__version__']
-__version__ = '0.14.2'
+__version__ = '0.14.3'
diff --git a/tutorials/tutorial-06-sequence-tagging.ipynb b/tutorials/tutorial-06-sequence-tagging.ipynb
@@ -530,6 +530,26 @@
     "reloaded_predictor.predict('Paul Newman is my favorite American actor.')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### A Note on Sentence Tokenization\n",
+    "\n",
+    "The `predict` method typically operates on individual sentences rather than entire paragraphs or documents, since the model was trained on individual sentences.  In production, you can use the `sent_tokenize` function to split text into individual sentences before passing each one to `predict`.\n",
+    "\n",
+    "```python\n",
+    "from ktrain import text\n",
+    "text.textutils.sent_tokenize('This is the first sentence about Dr. Smith.  This is the second sentence.')\n",
+    "```\n",
+    "\n",
+    "The above will output:\n",
+    "```\n",
+    "['This is the first sentence about Dr . Smith .',\n",
+    " 'This is the second sentence .']\n",
+    "```"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,