Commit c12fd8c: Merge branch 'develop'
amaiya committed May 10, 2020 · 2 parents 075b691 + cf4e2d6
Showing 24 changed files with 1,015 additions and 142 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,21 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour


## 0.14.7 (2020-05-10)

### New:
- Added `TFDataset` class as a wrapper around arbitrary `tf.data.Dataset` objects for use in *ktrain*

### Changed:
- Added `NERPreprocessor.preprocess_train_from_conll2003`
- Removed extraneous imports from `text.__init__.py` and `vision.__init__.py`
- `classes` argument in `images_from_array` changed to `class_names`

### Fixed:
- Ensured NER data is properly prepared in `text.ner.learner.validate`
- Fixed typo in the `df` reference in `images_from_fname`
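
A hedged sketch of how the new `TFDataset` wrapper might be used (toy data and model; names and shapes are illustrative):
```python
# Sketch: wrapping a pre-batched tf.data.Dataset for ktrain (toy data, illustrative only)
import numpy as np
import tensorflow as tf
import ktrain

x_train = np.random.rand(1000, 28, 28).astype('float32')                     # stand-in for MNIST images
y_train = tf.keras.utils.to_categorical(np.random.randint(0, 10, 1000), 10)  # one-hot labels

# the wrapper expects a fully-configured dataset, so batch it yourself
tfds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(1024).batch(32)
train_data = ktrain.TFDataset(tfds, n=len(x_train), y=y_train)

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation='softmax')])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

learner = ktrain.get_learner(model, train_data=train_data)
learner.fit_onecycle(1e-3, 1)
```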


## 0.14.6 (2020-05-06)

### New:
13 changes: 6 additions & 7 deletions README.md
@@ -287,13 +287,12 @@ This code was tested on Ubuntu 18.04 LTS using TensorFlow 2.1.0

Please cite the [following paper](https://arxiv.org/abs/2004.10703) when using **ktrain**:
```
@misc{maiya2020ktrain,
title={ktrain: A Low-Code Library for Augmented Machine Learning},
author={Arun S. Maiya},
year={2020},
eprint={2004.10703},
archivePrefix={arXiv},
primaryClass={cs.LG}
@article{maiya2020ktrain,
title={ktrain: A Low-Code Library for Augmented Machine Learning},
author={Arun S. Maiya},
journal={ArXiv},
year={2020},
volume={arXiv:2004.10703 [cs.LG]}
}
```

3 changes: 3 additions & 0 deletions examples/README.md
@@ -142,6 +142,9 @@ The objective of the CoNLL2003 task is to classify sequences of words as belonging
#### [MNIST](http://yann.lecun.com/exdb/mnist/): Multiclass Classification
- [mnist-image_from_array_example.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/vision): Build an MNIST model using `images_from_array`

#### [MNIST](http://yann.lecun.com/exdb/mnist/): Multiclass Classification
- [mnist-tf_workflow.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/vision): Illustrates how *ktrain* can be used in a minimally-invasive way within a normal TensorFlow workflow

#### [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html): Multiclass Classification
- [cifar10-WRN22.ipynb](https://github.com/amaiya/ktrain/tree/master/examples/vision): A randomly-initialized Wide Residual Network applied to CIFAR10

2 changes: 1 addition & 1 deletion examples/vision/mnist-images_from_array_example.ipynb
@@ -77,7 +77,7 @@
" val_pct=0.1,\n",
" random_state=42,\n",
" data_aug=data_aug,\n",
" classes=classes)"
" class_names=classes)"
]
},
{
734 changes: 734 additions & 0 deletions examples/vision/mnist-tf_workflow.ipynb

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions examples/vision/pets-ResNet50.ipynb
@@ -337,7 +337,7 @@
],
"source": [
"# find a good learning rate\n",
"learner.lr_find()"
"learner.lr_find(max_epochs=5)"
]
},
{
@@ -362,6 +362,13 @@
"learner.lr_plot()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For demonstration purposes, we use `autofit` to train, which employs a triangular learning rate policy with `epochs=20` and `reduce_on_plateau=2`. You may choose to try something different."
]
},
{
"cell_type": "code",
"execution_count": 12,
@@ -681,7 +688,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.9"
}
},
"nbformat": 4,
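For reference, the training sequence these updated cells describe might look like the following sketch (assumes `learner` was built with `get_learner`; the learning rate is illustrative):
```python
learner.lr_find(max_epochs=5)                    # cap the LR-range test at 5 epochs
learner.lr_plot()                                # inspect the loss-vs-LR curve
learner.autofit(2e-4, 20, reduce_on_plateau=2)   # triangular LR policy, as the markdown cell describes
```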
4 changes: 2 additions & 2 deletions ktrain/__init__.py
@@ -5,12 +5,12 @@
from .text.learner import BERTTextClassLearner, TransformerTextClassLearner
from .text.ner.learner import NERLearner
from .graph.learner import NodeClassLearner, LinkPredLearner
from .data import Dataset
from .data import Dataset, TFDataset, SequenceDataset

from . import utils as U

__all__ = ['get_learner', 'get_predictor', 'load_predictor', 'release_gpu_memory',
'Dataset']
'Dataset', 'TFDataset', 'SequenceDataset']



24 changes: 9 additions & 15 deletions ktrain/core.py
@@ -233,11 +233,11 @@ def save_model(self, fpath):
return


def load_model(self, fpath):
def load_model(self, fpath, custom_objects=None):
"""
a wrapper to load_model
"""
self.model = _load_model(fpath, train_data=self.train_data)
self.model = _load_model(fpath, train_data=self.train_data, custom_objects=custom_objects)
return

def _is_adamlike(self):
@@ -640,23 +640,18 @@ def _cb_earlystopping(self, early_stopping, callbacks=[]):
return callbacks


def _prepare(self, data, mode='train'):
def _prepare(self, data, train=True):
"""
Subclasses can override this method if data
needs to be specially-prepared prior to invoking fit methods
Args:
data: dataset
mode: either 'train' or 'valid'
train(bool): If True, prepare for training. Otherwise, prepare for evaluation.
"""
if data is None: return None

if hasattr(data, 'to_tfdataset'):
shuffle=True
repeat = True
if mode != 'train':
shuffle = False
repeat = False
return data.to_tfdataset(shuffle=shuffle, repeat=repeat)
return data.to_tfdataset(train=train)
else:
return data

@@ -894,7 +889,7 @@ def predict(self, val_data=None):
if U.is_iter(val):
if hasattr(val, 'reset'): val.reset()
steps = np.ceil(U.nsamples_from_data(val)/val.batch_size)
result = self.model.predict_generator(self._prepare(val, mode='valid'),
result = self.model.predict_generator(self._prepare(val, train=False),
steps=steps)
return result
else:
@@ -987,7 +982,7 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
with warnings.catch_warnings():
warnings.filterwarnings('ignore', message='.*Check your callbacks.*')
hist = self.model.fit(self._prepare(x_train),
self._prepare(y_train, mode='valid'),
self._prepare(y_train, train=False),
batch_size=self.batch_size,
epochs=epochs,
validation_data=validation, verbose=verbose,
@@ -1189,7 +1184,7 @@ def fit(self, lr, n_cycles, cycle_len=None, cycle_mult=1,
steps_per_epoch = steps_per_epoch,
validation_steps = validation_steps,
epochs=epochs,
validation_data=self._prepare(self.val_data, mode='valid'),
validation_data=self._prepare(self.val_data, train=False),
workers=self.workers,
use_multiprocessing=self.use_multiprocessing,
verbose=verbose,
@@ -1390,10 +1385,9 @@ def release_gpu_memory(device=0):
return


def _load_model(fname, preproc=None, train_data=None):
def _load_model(fname, preproc=None, train_data=None, custom_objects=None):
if not preproc and not train_data:
raise ValueError('Either preproc or train_data is required.')
custom_objects=None
if preproc and isinstance(preproc, TransformersPreprocessor):
# note: with transformer models, fname is actually a directory
model = preproc.get_model(fpath=fname)
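With the new `custom_objects` pass-through, a model containing a custom Keras layer can now be reloaded through the learner; a minimal sketch (the `MyAttention` layer name is hypothetical):
```python
# Sketch: reloading a saved model that contains a hypothetical custom layer
learner.save_model('/tmp/mymodel')
learner.load_model('/tmp/mymodel', custom_objects={'MyAttention': MyAttention})
```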
86 changes: 73 additions & 13 deletions ktrain/data.py
@@ -1,7 +1,7 @@
from .imports import *


class Dataset(Sequence):
class Dataset:
"""
Base class for custom datasets in ktrain.
@@ -14,20 +14,10 @@ class Dataset(Sequence):
The signature of to_tfdataset is as follows:
def to_tfdataset(self, shuffle=True, repeat=True)
def to_tfdataset(self, train=True)
See ktrain.text.preprocess.TransformerDataset as an example.
"""
def __init__(self, batch_size=32):
self.batch_size = batch_size

# required by keras.utils.Sequence instances
def __len__(self):
raise NotImplementedError

# required by keras.utils.Sequence instances
def __getitem__(self, idx):
raise NotImplementedError

# required: used by ktrain.core.Learner instances
def nsamples(self):
@@ -68,8 +58,78 @@ def nclasses(self):
raise NotImplementedError


class TFDataset(Dataset):
"""
Wrapper for tf.data.Datasets
"""
def __init__(self, tfdataset, n, y):
"""
Args:
tfdataset(tf.data.Dataset): a tf.data.Dataset instance
n(int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
y(np.ndarray): y values for each example - should be in the format expected by your model (e.g., one-hot encoded)
"""
if not isinstance(tfdataset, tf.data.Dataset):
raise ValueError('tfdataset must be a fully-configured tf.data.Dataset with batch_size, etc. set appropriately')
self.tfdataset = tfdataset
self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0] # extract batch_size from tfdataset
self.n = n
self.y = y

@property
def batch_size(self):
return self.bs

@batch_size.setter
def batch_size(self, value):
if value != self.bs:
warnings.warn('batch_size parameter is ignored, as pre-configured batch_size of tf.data.Dataset is used')


def nsamples(self):
return self.n

def get_y(self):
return self.y

def to_tfdataset(self, train=True):
return self.tfdataset


class SequenceDataset(Dataset, Sequence):
"""
Base class for custom datasets in ktrain.
If a subclass of Dataset implements a to_tfdataset method
that converts the data to a tf.data.Dataset, then it will be
invoked by Learner instances just prior to training so
fit() will train using a tf.Dataset representation of your data.
Sequence methods such as __getitem__ and __len__
must still be implemented.
The signature of to_tfdataset is as follows:
def to_tfdataset(self, train=True)
See ktrain.text.preprocess.TransformerDataset as an example.
"""
def __init__(self, batch_size=32):
self.batch_size = batch_size

# required by keras.utils.Sequence instances
def __len__(self):
raise NotImplementedError

# required by keras.utils.Sequence instances
def __getitem__(self, idx):
raise NotImplementedError

return False




class MultiArrayDataset(Dataset):
class MultiArrayDataset(SequenceDataset):
def __init__(self, x, y, batch_size=32, shuffle=True):
# error checks
err = False
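A minimal sketch of a custom `SequenceDataset` subclass, with the method set inferred from the base classes above (`MyDataset` and the one-hot `y` assumption are illustrative):
```python
import math
from ktrain.data import SequenceDataset

class MyDataset(SequenceDataset):
    def __init__(self, x, y, batch_size=32):
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y

    def __len__(self):            # number of batches per epoch (keras.utils.Sequence)
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):   # return one batch of (x, y)
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[sl], self.y[sl]

    def nsamples(self):           # used by ktrain.core.Learner instances
        return len(self.x)

    def get_y(self):
        return self.y

    def nclasses(self):           # assumes one-hot-encoded labels
        return self.y.shape[1]
```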
6 changes: 3 additions & 3 deletions ktrain/graph/sg_wrappers.py
@@ -1,5 +1,5 @@
from ..imports import *
from ..data import Dataset
from ..data import SequenceDataset


# import stellargraph
@@ -12,7 +12,7 @@
raise Exception(SG_ERRMSG)


class NodeSequenceWrapper(node_mappers.NodeSequence, Dataset):
class NodeSequenceWrapper(node_mappers.NodeSequence, SequenceDataset):
def __init__(self, node_seq):
if not isinstance(node_seq, node_mappers.NodeSequence):
raise ValueError('node_seq must be a stellargraph NodeSequence object')
@@ -88,7 +88,7 @@ def nclasses(self):



class LinkSequenceWrapper(link_mappers.LinkSequence, Dataset):
class LinkSequenceWrapper(link_mappers.LinkSequence, SequenceDataset):
def __init__(self, link_seq):
if not isinstance(link_seq, link_mappers.LinkSequence):
raise ValueError('link_seq must by a stellargraph LinkSequence object')
1 change: 1 addition & 0 deletions ktrain/imports.py
@@ -216,6 +216,7 @@
# ner
from seqeval.metrics import classification_report as ner_classification_report
from seqeval.metrics import f1_score as ner_f1_score
from seqeval.metrics import accuracy_score as ner_accuracy_score
from seqeval.metrics.sequence_labeling import get_entities
import syntok.segmenter as segmenter

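The newly imported seqeval `accuracy_score` operates on lists of IOB tag sequences, e.g. (toy tags):
```python
from seqeval.metrics import accuracy_score

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'O',     'O'], ['B-LOC', 'O']]
print(accuracy_score(y_true, y_pred))  # token-level accuracy: 4/5 = 0.8
```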
6 changes: 3 additions & 3 deletions ktrain/text/__init__.py
@@ -1,5 +1,5 @@
from .models import *
from .data import *
from .models import print_text_classifiers, print_text_regression_models, text_classifier, text_regression_model
from .data import texts_from_folder, texts_from_csv, texts_from_df, texts_from_array
from .ner.data import entities_from_gmb, entities_from_conll2003, entities_from_txt, entities_from_df, entities_from_array
from .ner.models import sequence_tagger, print_sequence_taggers
from .eda import get_topic_model
@@ -13,7 +13,7 @@
__all__ = [
'text_classifier', 'text_regression_model',
'print_text_classifiers', 'print_text_regression_models',
'texts_from_folder', 'texts_from_csv',
'texts_from_folder', 'texts_from_csv', 'texts_from_df', 'texts_from_array',
'entities_from_gmb',
'entities_from_conll2003',
'entities_from_txt',
11 changes: 3 additions & 8 deletions ktrain/text/learner.py
@@ -129,20 +129,15 @@ def view_top_losses(self, n=4, preproc=None, val_data=None):
return


def _prepare(self, data, mode='train'):
def _prepare(self, data, train=True):
"""
prepare data as tf.Dataset
"""
# HF_EXCEPTION
# convert arrays to TF dataset (iterator) on-the-fly
# to work around issues with transformers and tf.Datasets
if data is None: return None
shuffle=True
repeat = True
if mode != 'train':
shuffle = False
repeat = False
return data.to_tfdataset(shuffle=shuffle, repeat=repeat)
return data.to_tfdataset(train=train)


def predict(self, val_data=None):
@@ -156,7 +151,7 @@ def predict(self, val_data=None):
if val is None: raise Exception('val_data must be supplied to get_learner or predict')
if hasattr(val, 'reset'): val.reset()
classification, multilabel = U.is_classifier(self.model)
preds = self.model.predict(self._prepare(val, mode='valid'))
preds = self.model.predict(self._prepare(val, train=False))
if classification:
if multilabel:
return activations.sigmoid(tf.convert_to_tensor(preds)).numpy()
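Under the new API, a dataset's `to_tfdataset` hook folds the old shuffle/repeat switches into a single `train` flag. Based on the removed logic above, an implementation might look like this sketch (assumes the dataset stores numpy arrays in `self.x`/`self.y`):
```python
# Sketch only: the train flag subsumes the old shuffle/repeat arguments
def to_tfdataset(self, train=True):
    tfds = tf.data.Dataset.from_tensor_slices((self.x, self.y))
    if train:
        tfds = tfds.shuffle(1024).repeat()  # shuffle and repeat only when training
    return tfds.batch(self.batch_size)
```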
