Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
amaiya committed Jun 3, 2020
2 parents 7671f36 + 1aa5091 commit ed4cb3f
Show file tree
Hide file tree
Showing 9 changed files with 79 additions and 38 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour


## 0.15.4 (2020-06-03)

### New:
- N/A

### Changed:
- N/A

### Fixed:
- Added the `procs`, `limitmb`, and `multisegment` arguments to the `index_from_list` and `index_from_folder` methods in `text.SimpleQA`
to speed up indexing when necessary. Supplying `multisegment=True` speeds things up significantly, for example. Defaults, however, are
the same as before. Users must explicitly change values if desiring a speedup.
- Load `xlm-roberta*` as `jplu/tf-xlm-roberta*` to bypass error from `transformers`


## 0.15.3 (2020-05-28)

### New:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
### [Overview](#overview) | [Tutorials](#tutorials) | [Examples](#examples) | [Installation](#installation) | [How to Cite](#how-to-cite)
[![PyPI Status](https://badge.fury.io/py/ktrain.svg)](https://badge.fury.io/py/ktrain) [![ktrain python compatibility](https://img.shields.io/pypi/pyversions/ktrain.svg)](https://pypi.python.org/pypi/ktrain) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/amaiya/ktrain/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/ktrain)](https://pepy.tech/project/ktrain) [![Downloads](https://pepy.tech/badge/ktrain/month)](https://pepy.tech/project/ktrain/month)
[![PyPI Status](https://badge.fury.io/py/ktrain.svg)](https://badge.fury.io/py/ktrain) [![ktrain python compatibility](https://img.shields.io/pypi/pyversions/ktrain.svg)](https://pypi.python.org/pypi/ktrain) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/amaiya/ktrain/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/ktrain)](https://pepy.tech/project/ktrain)


# ktrain
Expand Down
52 changes: 27 additions & 25 deletions examples/text/IMDb-BERT.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,7 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"outputs": [],
"source": [
"import ktrain\n",
"from ktrain import text"
Expand All @@ -41,7 +33,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"preprocessing train...\n"
"detected encoding: utf-8\n",
"preprocessing train...\n",
"language: en\n"
]
},
{
Expand All @@ -60,7 +54,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"preprocessing test...\n"
"Is Multi-Label? False\n",
"preprocessing test...\n",
"language: en\n"
]
},
{
Expand All @@ -77,12 +73,12 @@
}
],
"source": [
"(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder('data/aclImdb', \n",
" maxlen=500, \n",
" preprocess_mode='bert',\n",
" train_test_names=['train', \n",
" 'test'],\n",
" classes=['pos', 'neg'])"
"trn, val, preproc = text.texts_from_folder('data/aclImdb', \n",
" maxlen=500, \n",
" preprocess_mode='bert',\n",
" train_test_names=['train', \n",
" 'test'],\n",
" classes=['pos', 'neg'])"
]
},
{
Expand All @@ -101,7 +97,7 @@
}
],
"source": [
"model = text.text_classifier('bert', (x_train, y_train) , preproc=preproc)"
"model = text.text_classifier('bert', trn , preproc=preproc)"
]
},
{
Expand All @@ -111,8 +107,8 @@
"outputs": [],
"source": [
"learner = ktrain.get_learner(model, \n",
" train_data=(x_train, y_train), \n",
" val_data=(x_test, y_test), \n",
" train_data=trn, \n",
" val_data=val, \n",
" batch_size=6)"
]
},
Expand Down Expand Up @@ -173,14 +169,13 @@
"\n",
"begin training using onecycle policy with max lr of 2e-05...\n",
"Train on 25000 samples, validate on 25000 samples\n",
"Epoch 1/1\n",
"25000/25000 [==============================] - 1966s 79ms/step - loss: 0.2575 - acc: 0.8886 - val_loss: 0.1649 - val_acc: 0.9384\n"
"25000/25000 [==============================] - 2304s 92ms/sample - loss: 0.2442 - accuracy: 0.9008 - val_loss: 0.1596 - val_accuracy: 0.9394\n"
]
},
{
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7f1fa5bf9320>"
"<tensorflow.python.keras.callbacks.History at 0x7f6b102fe780>"
]
},
"execution_count": 8,
Expand All @@ -197,7 +192,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### **93.84%** accuracy in a single epoch."
"### **93.94%** accuracy in a single epoch."
]
},
{
Expand Down Expand Up @@ -258,6 +253,13 @@
"source": [
"predictor.predict(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -276,7 +278,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.9"
}
},
"nbformat": 4,
Expand Down
4 changes: 3 additions & 1 deletion examples/text/question_answering_with_bert.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@
"source": [
"For documents sets that are too large to be loaded into a Python list, you can use `SimpleQA.index_from_folder`, which will crawl a folder and index all plain text documents.\n",
"\n",
"By default, `index_from_list` and `index_from_folder` use a single processor (`procs=1`) with each processor using a maximum of 256MB of memory (`limitmb=256`) and merging results into a single segment (`multisegment=False`). These values can be changed to speed up indexing as arguments to `index_from_list` or `index_from_folder`. See the [whoosh documentation](https://whoosh.readthedocs.io/en/latest/batch.html) for more information on these parameters and how to use them to speed up indexing.\n",
"\n",
"The above steps need to only be performed once. Once an index is already created, you can skip this step and proceed directly to **STEP 2** to begin using your system."
]
},
Expand Down Expand Up @@ -580,7 +582,7 @@
"source": [
"Here, we see different views on who Jesus was as debated and discussed in this document set.\n",
"\n",
"Finally, the 20 Newsgroup dataset also contains many groups about computing hardward and software. Let's ask a technical support question.\n",
"Finally, the 20 Newsgroup dataset also contains many groups about computing hardware and software. Let's ask a technical support question.\n",
"\n",
"#### Technical Question"
]
Expand Down
3 changes: 2 additions & 1 deletion examples/vision/mnist-tf_workflow.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@
" model = make_model()\n",
"\n",
"\n",
"# set up learning rate decay\n",
"# set up learning rate decay [FROM ORIGINAL EXAMPLE BUT NOT USED]\n",
"# NOT NEEDED: we will use ktrain to find LR and decay learning rate during training\n",
"LEARNING_RATE = 0.01\n",
"LEARNING_RATE_EXP_DECAY = 0.6 if strategy.num_replicas_in_sync == 1 else 0.7\n",
"lr_decay = tf.keras.callbacks.LearningRateScheduler(\n",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
"""
Tests of ktrain text classification flows
Tests of ktrain shallownlp module:
2020-05-26: renamed to test_zzz_shallownlp.py because it
causes issues for tests following it when run in conjunction with test_regression.py.
"""
import testenv
from unittest import TestCase, main, skip
Expand Down
1 change: 1 addition & 0 deletions ktrain/text/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,7 @@ def __init__(self, model_name,
self.name = model_name.split('-')[0]
if model_name.startswith('xlm-roberta'):
self.name = 'xlm_roberta'
self.model_name = 'jplu/tf-' + self.model_name
else:
self.name = model_name.split('-')[0]
if self.name not in TRANSFORMER_MODELS:
Expand Down
34 changes: 26 additions & 8 deletions ktrain/text/qa/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,16 +145,23 @@ def initialize_index(cls, index_dir):
return ix

@classmethod
def index_from_list(cls, docs, index_dir, commit_every=1024):
def index_from_list(cls, docs, index_dir, commit_every=1024,
procs=1, limitmb=256, multisegment=False):
"""
index documents from list
index documents from list.
The procs, limitmb, and especially multisegment arguments can be used to
speed up indexing, if it is too slow. Please see the whoosh documentation
for more information on these parameters: https://whoosh.readthedocs.io/en/latest/batch.html
Args:
docs(list): list of strings representing documents
commit_every(int): commit after adding this many documents
procs(int): number of processors
limitmb(int): memory limit in MB for each process
multisegment(bool): new segments written instead of merging
"""
if not isinstance(docs, (np.ndarray, list)): raise ValueError('docs must be a list of strings')
ix = index.open_dir(index_dir)
writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
mb = master_bar(range(1))
for i in mb:
for idx, doc in enumerate(progress_bar(docs, parent=mb)):
Expand All @@ -164,23 +171,33 @@ def index_from_list(cls, docs, index_dir, commit_every=1024):
idx +=1
if idx % commit_every == 0:
writer.commit()
writer = ix.writer()
#writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
writer.commit()
return


@classmethod
def index_from_folder(cls, folder_path, index_dir, commit_every=1024, verbose=1, encoding='utf-8'):
def index_from_folder(cls, folder_path, index_dir, commit_every=1024, verbose=1, encoding='utf-8',
procs=1, limitmb=256, multisegment=False):
"""
index all plain text documents within a folder
index all plain text documents within a folder.
The procs, limitmb, and especially multisegment arguments can be used to
speed up indexing, if it is too slow. Please see the whoosh documentation
for more information on these parameters: https://whoosh.readthedocs.io/en/latest/batch.html
Args:
folder_path(str): path to folder containing plain text documents
commit_every(int): commit after adding this many documents
procs(int): number of processors
limitmb(int): memory limit in MB for each process
multisegment(bool): new segments written instead of merging
"""
if not os.path.isdir(folder_path): raise ValueError('folder_path is not a valid folder')
if folder_path[-1] != os.sep: folder_path += os.sep
ix = index.open_dir(index_dir)
writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
for idx, fpath in enumerate(TU.extract_filenames(folder_path)):
if not TU.is_txt(fpath): continue
reference = "%s" % (fpath.join(fpath.split(folder_path)[1:]))
Expand All @@ -191,7 +208,8 @@ def index_from_folder(cls, folder_path, index_dir, commit_every=1024, verbose=1
idx +=1
if idx % commit_every == 0:
writer.commit()
writer = ix.writer()
#writer = ix.writer()
writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
if verbose: print("%s docs indexed" % (idx))
writer.commit()
return
Expand Down
2 changes: 1 addition & 1 deletion ktrain/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.15.3'
__version__ = '0.15.4'

0 comments on commit ed4cb3f

Please sign in to comment.