data.py · 186 lines (142 loc) · 5.24 KB
from .imports import *
class Dataset:
    """
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so that
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    """

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        raise NotImplementedError

    # required: used by ktrain.core.Learner instances
    def get_y(self):
        raise NotImplementedError

    # optional: to modify dataset between epochs (e.g., shuffle)
    def on_epoch_end(self):
        pass

    # optional
    def ondisk(self):
        """
        Is data being read from disk like with DirectoryIterators?
        """
        return False

    # optional: used only if invoking *_classifier functions
    def xshape(self):
        """
        shape of X
        Examples:
            for images: input_shape
            for text: (n_example, sequence_length)
        """
        raise NotImplementedError

    # optional: used only if invoking *_classifier functions
    def nclasses(self):
        """
        Number of classes.
        For classification problems, this is the number of labels.
        Not used for regression problems.
        """
        raise NotImplementedError
class TFDataset(Dataset):
    """
    Wrapper for tf.data.Datasets
    """

    def __init__(self, tfdataset, n, y):
        """
        Args:
          tfdataset(tf.data.Dataset): a tf.data.Dataset instance
          n(int): number of examples in dataset
                  (cardinality, which cannot reliably be extracted from tf.data.Datasets)
          y(np.ndarray): y values for each example
                         - should be in the format expected by your model (e.g., 1-hot-encoded)
        """
        if not isinstance(tfdataset, tf.data.Dataset):
            raise ValueError('tfdataset must be a fully-configured tf.data.Dataset with batch_size, etc. set appropriately')
        self.tfdataset = tfdataset
        self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0]  # extract batch_size from tfdataset
        self.n = n
        self.y = y

    @property
    def batch_size(self):
        return self.bs

    @batch_size.setter
    def batch_size(self, value):
        if value != self.bs:
            warnings.warn('batch_size parameter is ignored, as pre-configured batch_size of tf.data.Dataset is used')

    def nsamples(self):
        return self.n

    def get_y(self):
        return self.y

    def to_tfdataset(self, train=True):
        return self.tfdataset
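

# Example usage (illustrative sketch only, kept as a comment so it is not executed
# when this module is imported): wrapping a pre-batched tf.data.Dataset so a Learner
# can query its size and labels. The `features` and `labels` arrays below are
# hypothetical placeholders, not part of the library.
#
#   features = np.random.rand(1000, 16).astype('float32')
#   labels = tf.keras.utils.to_categorical(np.random.randint(0, 2, size=1000))
#   ds = tf.data.Dataset.from_tensor_slices((features, labels)).batch(32)
#   wrapped = TFDataset(ds, n=features.shape[0], y=labels)
#   wrapped.nsamples(), wrapped.batch_size   # -> (1000, 32)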
class SequenceDataset(Dataset, Sequence):
    """
    Base class for custom Sequence-style datasets in ktrain.

    If a subclass implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so that
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    """

    def __init__(self, batch_size=32):
        self.batch_size = batch_size

    # required by keras.utils.Sequence instances
    def __len__(self):
        raise NotImplementedError

    # required by keras.utils.Sequence instances
    def __getitem__(self, idx):
        raise NotImplementedError
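

# A minimal sketch (illustrative only, commented out) of a custom SequenceDataset
# that streams batches from disk. The file layout (one .npy file per example, all
# with the same shape) and the class name NpyFileDataset are hypothetical; the
# point is which methods a subclass is expected to implement.
#
#   class NpyFileDataset(SequenceDataset):
#       def __init__(self, x_paths, y, batch_size=32):
#           super().__init__(batch_size=batch_size)
#           self.x_paths = x_paths   # list of .npy file paths, one per example (hypothetical layout)
#           self.y = y               # labels for all examples, e.g., 1-hot-encoded np.ndarray
#       def __len__(self):
#           return math.ceil(len(self.x_paths) / self.batch_size)
#       def __getitem__(self, idx):
#           sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
#           batch_x = np.array([np.load(p) for p in self.x_paths[sl]])  # assumes uniform shapes
#           return batch_x, self.y[sl]
#       def nsamples(self):
#           return len(self.x_paths)
#       def get_y(self):
#           return self.y
#       def ondisk(self):
#           return True              # data is read from disk, like with DirectoryIterators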
class MultiArrayDataset(SequenceDataset):
    """
    SequenceDataset that wraps one or more 2d numpy arrays as input (e.g., for multi-input models).
    """

    def __init__(self, x, y, batch_size=32, shuffle=True):
        # error checks
        err = False
        if type(x) == np.ndarray:
            if len(x.shape) != 2:
                err = True
        elif type(x) == list:
            for d in x:
                if type(d) != np.ndarray or len(d.shape) != 2:
                    err = True
                    break
        else:
            err = True
        if err:
            raise ValueError('x must be a 2d numpy array or a list of 2d numpy arrays')
        if type(y) != np.ndarray:
            raise ValueError('y must be a numpy array')
        if type(x) == np.ndarray:
            x = [x]

        # set variables
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y
        self.indices = np.arange(self.x[0].shape[0])
        self.n_inputs = len(x)
        self.shuffle = shuffle

    def __len__(self):
        return math.ceil(self.x[0].shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = []
        for i in range(self.n_inputs):
            batch_x.append(self.x[i][inds])
        batch_y = self.y[inds]
        return tuple(batch_x), batch_y

    def nsamples(self):
        return self.x[0].shape[0]

    def get_y(self):
        return self.y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def xshape(self):
        return self.x[0].shape

    def nclasses(self):
        return self.y.shape[1]

    def ondisk(self):
        return False
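

# Example usage (illustrative sketch only, commented out so it does not run on import):
# feeding two 2d input arrays to a multi-input model via MultiArrayDataset. The arrays
# x1, x2, and y below are hypothetical placeholders.
#
#   x1 = np.random.rand(100, 8)
#   x2 = np.random.rand(100, 4)
#   y = tf.keras.utils.to_categorical(np.random.randint(0, 3, size=100))  # shape (100, 3)
#   train_data = MultiArrayDataset([x1, x2], y, batch_size=16, shuffle=True)
#   len(train_data)          # -> 7 batches (math.ceil(100 / 16))
#   xb, yb = train_data[0]   # xb is a tuple of two arrays; yb has shape (16, 3)
#   train_data.nclasses()    # -> 3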