Merge branch 'develop'
amaiya committed Feb 11, 2020
2 parents f25d22c + d33d35e commit e2f0019
Showing 13 changed files with 151 additions and 50 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,20 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour


## 0.9.3 (2020-02-11)

### New:
- N/A

### Changed:
- Transformed data containers for transformers, NER, and graph node classification into
  instances of `ktrain.data.Dataset`.

### Fixed:
- Fixed `images_from_array` so that y labels are correctly one-hot-encoded when necessary
- Corrected tokenization for `bert-base-japanese` Transformer models from PR 57


## 0.9.2 (2020-02-04)

### New:
11 changes: 6 additions & 5 deletions examples/vision/cifar10-WRN22.ipynb
@@ -31,7 +31,7 @@
"source": [
"import ktrain\n",
"from ktrain import vision\n",
"import keras.backend as K"
"import tensorflow.keras.backend as K"
]
},
{
@@ -41,8 +41,8 @@
"outputs": [],
"source": [
"# load cifar10 and manually standaridize\n",
"from keras.datasets import cifar10\n",
"from keras.utils import to_categorical\n",
"from tensorflow.keras.datasets import cifar10\n",
"from tensorflow.keras.utils import to_categorical\n",
"(x_train, y_train), (x_test, y_test) = cifar10.load_data()\n",
"x_train = x_train.astype('float32')\n",
"x_train = (x_train - x_train.mean(axis=0)) / (x_train.std(axis=0))\n",
@@ -78,7 +78,8 @@
"metadata": {},
"outputs": [],
"source": [
"input_shape = (3, 32, 32) if K.image_dim_ordering() == 'th' else (32, 32, 3)"
"#input_shape = (3, 32, 32) if K.image_dim_ordering() == 'th' else (32, 32, 3)\n",
"input_shape = (32, 32, 3)"
]
},
{
@@ -326,7 +327,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.9"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion examples/vision/mnist-WRN22.ipynb
@@ -366,7 +366,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.9"
}
},
"nbformat": 4,
12 changes: 11 additions & 1 deletion ktrain/core.py
@@ -646,7 +646,17 @@ def _prepare(self, data, mode='train'):
            data: dataset
            mode: either 'train' or 'valid'
        """
        return data
        if data is None: return None

        if hasattr(data, 'to_tfdataset'):
            # shuffle and repeat the dataset during training only
            shuffle = True
            repeat = True
            if mode != 'train':
                shuffle = False
                repeat = False
            return data.to_tfdataset(shuffle=shuffle, repeat=repeat)
        else:
            return data


@abstractmethod
16 changes: 16 additions & 0 deletions ktrain/data.py
@@ -2,6 +2,22 @@


class Dataset(Sequence):
    """
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so that
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is:

        def to_tfdataset(self, shuffle=True, repeat=True)

    See ktrain.text.preprocessor.TransformerDataset as an example.
    """
    def __init__(self, batch_size=32):
        self.batch_size = batch_size
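
A minimal sketch of a conforming subclass, assuming in-memory numpy arrays (the class name and the data are illustrative, not part of ktrain):

```python
import numpy as np
import tensorflow as tf
from ktrain.data import Dataset

class MyArrayDataset(Dataset):
    """Hypothetical Dataset serving (x, y) numpy arrays in batches."""
    def __init__(self, x, y, batch_size=32):
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y

    def __len__(self):
        # number of batches per epoch (required Sequence method)
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, idx):
        # return one batch (required Sequence method)
        b = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[b], self.y[b]

    def to_tfdataset(self, shuffle=True, repeat=True):
        # invoked by Learner just prior to training (see _prepare in ktrain/core.py)
        ds = tf.data.Dataset.from_tensor_slices((self.x, self.y))
        if shuffle: ds = ds.shuffle(len(self.x))
        if repeat:  ds = ds.repeat()
        return ds.batch(self.batch_size)
        # (the nsamples/get_y/xshape/nclasses hooks used by ktrain.utils
        #  could be added similarly)
```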

32 changes: 31 additions & 1 deletion ktrain/graph/node_generator.py
@@ -1,4 +1,5 @@
from ..imports import *
from ..data import Dataset


# import stellargraph
@@ -11,7 +12,7 @@
    raise Exception(SG_ERRMSG)


class NodeSequenceWrapper(node_mappers.NodeSequence):
class NodeSequenceWrapper(node_mappers.NodeSequence, Dataset):
    def __init__(self, node_seq):
        if not isinstance(node_seq, node_mappers.NodeSequence):
            raise ValueError('node_seq must be a stellargraph NodeSequence object')
@@ -66,3 +67,32 @@ def __getattr__(self, name):
raise AttributeError
return


    def nsamples(self):
        return self.targets.shape[0]


    def get_y(self):
        return self.targets


    def xshape(self):
        return self[0][0][0].shape[1:]  # returns 1st neighborhood only


    def nclasses(self):
        return self[0][1].shape[1]
1 change: 1 addition & 0 deletions ktrain/tests/test_transformers.py
@@ -88,6 +88,7 @@ def test_transformers_api_1(self):
self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)


#@skip('temporarily disabled')
def test_transformers_api_2(self):
MODEL_NAME = 'distilbert-base-uncased'
preproc = txt.Transformer(MODEL_NAME, maxlen=500, classes=self.classes)
31 changes: 30 additions & 1 deletion ktrain/text/ner/preprocessor.py
@@ -1,6 +1,7 @@
from ...imports import *
from ... import utils as U
from ...preprocessor import Preprocessor
from ...data import Dataset

OTHER = 'O'
W2V = 'word2vec'
@@ -137,7 +138,7 @@ def get_next(self):



class NERSequence(Sequence):
class NERSequence(Dataset):

    def __init__(self, x, y, batch_size=1, p=None):
        self.x = x
@@ -167,3 +168,31 @@ def get_lengths(self, idx):

        return lengths

    def nsamples(self):
        return len(self.x)


    def get_y(self):
        return self.y


    def xshape(self):
        return (len(self.x), self[0][0][0].shape[1])


    def nclasses(self):
        return len(self.p._label_vocab._id2token)
25 changes: 20 additions & 5 deletions ktrain/text/preprocessor.py
@@ -1,6 +1,7 @@
from ..imports import *
from .. import utils as U
from ..preprocessor import Preprocessor
from ..data import Dataset

DistilBertTokenizer = transformers.DistilBertTokenizer
DISTILBERT= 'distilbert'
@@ -288,7 +289,8 @@ def hf_convert_examples(texts, y=None, tokenizer=None,
    # HF_EXCEPTION
    # due to issues in transformers library and TF2 tf.Datasets, arrays are converted
    # to iterators on-the-fly
    return TransformerSequence(np.array(features_list), np.array(labels))
    #return TransformerSequence(np.array(features_list), np.array(labels))
    return TransformerDataset(np.array(features_list), np.array(labels))


#------------------------------------------------------------------------------
@@ -719,6 +721,8 @@ def __init__(self, model_name,
            raise ValueError('unknown model name %s' % (model_name))
        self.model_type = TRANSFORMER_MODELS[self.name][1]
        self.tokenizer_type = TRANSFORMER_MODELS[self.name][2]
        if "bert-base-japanese" in model_name:
            self.tokenizer_type = transformers.BertJapaneseTokenizer

        tokenizer = self.tokenizer_type.from_pretrained(model_name)
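
A hedged illustration of the new branch (only the model-name check comes from this diff; the label names below are made up):

```python
from ktrain import text as txt

# Model names containing 'bert-base-japanese' are now routed to
# transformers.BertJapaneseTokenizer instead of the default BERT tokenizer.
preproc = txt.Transformer('bert-base-japanese', maxlen=128, classes=['neg', 'pos'])
```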

@@ -918,9 +922,10 @@ def get_model(self):




# Transformer Sequence
class TransformerSequence(Sequence):
class TransformerDataset(Dataset):
    """
    Wrapper for Transformer datasets.
    """

    def __init__(self, x, y, batch_size=1):
        if type(x) not in [list, np.ndarray]: raise ValueError('x must be list or np.ndarray')
@@ -976,10 +981,20 @@ def gen():
        return tfdataset


    def get_y(self):
        return self.y

    def nsamples(self):
        return len(self.x)

    def nclasses(self):
        return self.y.shape[1]

    def xshape(self):
        return (len(self.x), self.x[0].shape[1])
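
A sketch of where TransformerDataset now appears in the workflow, assuming ktrain's Transformer.preprocess_train API (not shown in this diff; the toy data is illustrative):

```python
from ktrain import text as txt

x_train = ['great movie', 'terrible movie']
y_train = ['pos', 'neg']

preproc = txt.Transformer('distilbert-base-uncased', maxlen=500, classes=['neg', 'pos'])
trn = preproc.preprocess_train(x_train, y_train)   # now a TransformerDataset

# the Dataset protocol methods defined above
print(trn.nsamples(), trn.nclasses(), trn.xshape())

# Because TransformerDataset implements to_tfdataset, Learner converts it to
# a tf.data.Dataset just before fitting (see _prepare in ktrain/core.py).
```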


# preprocessors
TEXT_PREPROCESSORS = {'standard': StandardTextPreprocessor,
'bert': BERTPreprocessor,
'distilbert': DistilBertPreprocessor}

36 changes: 7 additions & 29 deletions ktrain/utils.py
@@ -71,7 +71,7 @@ def is_huggingface_from_model(model):


def is_huggingface_from_data(data):
return type(data).__name__ == 'TransformerSequence'
return type(data).__name__ in ['TransformerDataset']



@@ -158,11 +158,6 @@ def shape_from_data(data):
err_msg = 'could not determine shape from %s' % (type(data))
if is_iter(data):
if isinstance(data, Dataset): return data.xshape()
elif is_ner(data=data): return (len(data.x), data[0][0][0].shape[1]) # NERSequence
elif is_huggingface(data=data): # HF Transformer
return (len(data.x), data[0][0][0].shape[1])
elif is_nodeclass(data=data): # NodeSequence
return data[0][0][0].shape[1:] # returns 1st neighborhood only
elif hasattr(data, 'image_shape'): return data.image_shape # DirectoryIterator/DataFrameIterator
elif hasattr(data, 'x'): # NumpyIterator
return data.x.shape[1:]
@@ -185,23 +180,17 @@ def ondisk(data):
if hasattr(data, 'ondisk'): return data.ondisk()

ondisk = is_iter(data) and \
(type(data).__name__ not in ['ArrayDataset', 'NumpyArrayIterator', 'NERSequence',
'NodeSequenceWrapper', 'TransformerSequence'])
(type(data).__name__ not in ['NumpyArrayIterator'])
return ondisk


def nsamples_from_data(data):
err_msg = 'could not determine number of samples from %s' % (type(data))
if is_iter(data):
if isinstance(data, Dataset): return data.nsamples()
elif is_ner(data=data): return len(data.x) # NERSequence
elif is_huggingface(data=data): # HuggingFace Transformer
return len(data.x)
elif is_nodeclass(data=data): # NodeSequenceWrapper
return data.targets.shape[0]
elif hasattr(data, 'samples'): # DirectoryIterator/DataFrameIterator
elif hasattr(data, 'samples'): # DirectoryIterator/DataFrameIterator
return data.samples
elif hasattr(data, 'n'): # DirectoryIterator/DataFrameIterator/NumpyIterator
elif hasattr(data, 'n'): # DirectoryIterator/DataFrameIterator/NumpyIterator
return data.n
else:
raise Exception(err_msg)
@@ -218,16 +207,11 @@ def nsamples_from_data(data):
def nclasses_from_data(data):
if is_iter(data):
if isinstance(data, Dataset): return data.nclasses()
elif is_ner(data=data): return len(data.p._label_vocab._id2token) # NERSequence
elif is_huggingface(data=data): # Hugging Face Transformer
return data.y.shape[1]
elif is_nodeclass(data=data): # NodeSequenceWrapper
return data[0][1].shape[1]
elif hasattr(data, 'classes'): # DirectoryIterator
elif hasattr(data, 'classes'): # DirectoryIterator
return len(set(data.classes))
else:
try:
return data[0][1].shape[1] # DataFrameIterator/NumpyIterator
return data[0][1].shape[1] # DataFrameIterator/NumpyIterator
except:
raise Exception('could not determine number of classes from %s' % (type(data)))
else:
@@ -240,11 +224,6 @@ def nclasses_from_data(data):
def y_from_data(data):
if is_iter(data):
if isinstance(data, Dataset): return data.get_y()
elif is_ner(data=data): return data.y # NERSequence
if is_huggingface(data=data): # Hugging Face Transformer
return data.y
elif is_nodeclass(data=data): # NodeSequenceWrapper
return data.targets
elif hasattr(data, 'classes'): # DirectoryIterator
return to_categorical(data.classes)
elif hasattr(data, 'labels'): # DataFrameIterator
@@ -263,8 +242,7 @@ def y_from_data(data):
def is_iter(data, ignore=False):
if ignore: return True
iter_classes = ["NumpyArrayIterator", "DirectoryIterator",
"DataFrameIterator", "Iterator", "Sequence",
"NERSequence", "NodeSequenceWrapper", "TransformerSequence"]
"DataFrameIterator", "Iterator", "Sequence"]
return data.__class__.__name__ in iter_classes or isinstance(data, Dataset)
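
The practical effect, sketched with a hypothetical `ds` standing for any `ktrain.data.Dataset` subclass (e.g. the TransformerDataset above):

```python
from ktrain import utils as U

# Any Dataset now answers these queries itself, so the helpers no longer
# enumerate wrapper class names like NERSequence or TransformerSequence:
n     = U.nsamples_from_data(ds)   # -> ds.nsamples()
k     = U.nclasses_from_data(ds)   # -> ds.nclasses()
y     = U.y_from_data(ds)          # -> ds.get_y()
shape = U.shape_from_data(ds)      # -> ds.xshape()
```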


Expand Down
2 changes: 1 addition & 1 deletion ktrain/version.py
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.9.2'
__version__ = '0.9.3'
13 changes: 9 additions & 4 deletions ktrain/vision/data.py
@@ -614,10 +614,15 @@ def images_from_array(x_train, y_train,
"""

# one-hot-encode if necessary
if isinstance(y_train[0], int):
if np.issubdtype(type(y_train[0]), np.integer) or\
(isinstance(y_train[0], (list, np.ndarray)) and len(y_train[0]) == 1):
y_train = to_categorical(y_train)
if validation_data and isinstance(validation_data[1][0], int):
validation_data[1] = to_categorical(validation_data[1])
if validation_data:
x_test = validation_data[0]
y_test = validation_data[1]
if np.issubdtype(type(y_test[0]), np.integer) or\
(isinstance(y_test[0], (list, np.ndarray)) and len(y_test[0]) == 1):
y_test = to_categorical(y_test)


(train_datagen, test_datagen) = process_datagen(data_aug, train_array=x_train)
@@ -627,7 +632,7 @@
batches_te = None
preproc = None
if validation_data:
batches_te = test_datagen.flow(validation_data[0], validation_data[1],
batches_te = test_datagen.flow(x_test, y_test,
shuffle=False)
classes = map(str, list(range(len(y_train[0]))))
preproc = ImagePreprocessor(test_datagen, classes, target_size=None, color_mode=None)
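
A hedged sketch of what the fix enables, using random MNIST-shaped arrays for illustration (and assuming the usual `(train, val, preproc)` return):

```python
import numpy as np
from ktrain import vision as vis

x_train = np.random.rand(100, 28, 28, 1)
y_train = np.random.randint(0, 10, size=100)        # plain integer labels...
x_test  = np.random.rand(20, 28, 28, 1)
y_test  = np.random.randint(0, 10, size=(20, 1))    # ...or shape-(n, 1) arrays

# Both label formats are now detected and one-hot-encoded internally,
# for the validation set as well as the training set.
(trn, val, preproc) = vis.images_from_array(x_train, y_train,
                                            validation_data=(x_test, y_test))
```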
