Commit

fixed generation dtype compat

alvinwan committed Jan 9, 2017
1 parent 96219d5 commit 6e5e29f
Showing 7 changed files with 38 additions and 34 deletions.
14 changes: 6 additions & 8 deletions README.md
@@ -77,19 +77,17 @@ our kernel matrix or a function of our kernel matrix.
 
     python mlml.py ssgd (mnist|spam|cifar-10) --memId=<memId> [options]
 
-For example, the following runs kernelized sgd on a subset of 35000
-samples from MNIST, using the radial basis function (RBF). Note that
+For example, the following runs kernelized sgd on all samples from
+cifar-10, using the radial basis function (RBF). Note that
 the first command will output the `<memId>` needed for the second
 command.
 
-    python mlml.py generate mnist --subset=35000 --kernel=RBF
-    python mlml.py ssgd mnist --memId=<memId> --subset=35000
+    python mlml.py generate cifar-10 --kernel=RBF
+    python mlml.py ssgd cifar-10 --memId=<memId>
 
-To use a more computationally efficient but memory-consuming algorithm,
-use the `--simulated` flag. For example, the following runs the
-alternative generation scheme for `CIFAR-10`.
+To run on a subset of your data, use the `--subset` flag.
 
-    python mlml.py generate cifar-10 --subset=30000 --kernel=RBF --simulated
+    python mlml.py generate cifar-10 --kernel=RBF --subset=35000
 
 ## Command-Line Utility
 
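For anyone scripting this two-step flow, a minimal Python driver might look like the sketch below. The `memId` extraction is an assumption: it greps stdout for a `memId`-like token, and the actual log format of `generate` may differ.

```python
import re
import subprocess

def run_generate_then_ssgd(dataset: str = 'cifar-10', kernel: str = 'RBF') -> None:
    # Step 1: generate the kernel matrix, capturing stdout for the memId.
    out = subprocess.run(
        ['python', 'mlml.py', 'generate', dataset, '--kernel=%s' % kernel],
        capture_output=True, text=True, check=True).stdout
    # Step 2: extract the reported memId (the output format is an assumption).
    match = re.search(r'memId[:=]?\s*(\S+)', out)
    if match is None:
        raise RuntimeError('could not find memId in output:\n' + out)
    subprocess.run(
        ['python', 'mlml.py', 'ssgd', dataset, '--memId=%s' % match.group(1)],
        check=True)
```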
23 changes: 12 additions & 11 deletions demos/ksgd.ipynb

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions mlml.py
@@ -37,6 +37,7 @@
 --test=<test>        Path to test data [default: data/test]
 --trn-dtype=<dtype>  The numeric type of each training sample [default: uint8]
 --tst-dtype=<dtype>  The numeric type of each test sample [default: uint8]
+--shuffle-on-disk    Flag for ssgd to shuffle on disk.
 --simulated          Mark memory constraints as simulated. Allows full accuracy tests.
 --subset=<num>       Specify subset of data to pick. Ignored if <= 0. [default: 0]
 """
@@ -134,7 +135,8 @@ def preprocess_arguments(arguments) -> dict:
     """
 
     if arguments['mnist']:
-        arguments['--dtype'] = 'uint8'
+        arguments['--trn-dtype'] = 'uint8'
+        arguments['--tst-dtype'] = 'uint8'
         arguments['--train'] = 'data/mnist-%s-60000-train' % arguments['--dtype']
         arguments['--test'] = 'data/mnist-%s-10000-test' % arguments['--dtype']
         arguments['--n'] = 60000
@@ -151,7 +153,6 @@ def preprocess_arguments(arguments) -> dict:
         arguments['--k'] = 1
         arguments['--d'] = 55
     if arguments['cifar-10']:
-        arguments['--dtype'] = 'uint8'
         arguments['--trn-dtype'] = 'uint8'
         arguments['--tst-dtype'] = 'uint8'
         arguments['--train'] = 'data/cifar-10-uint8-50000-train'
@@ -161,6 +162,7 @@ def preprocess_arguments(arguments) -> dict:
         arguments['--k'] = 10
         arguments['--d'] = 3072
         arguments['--one-hot'] = 'true'
+        arguments['--data-hook'] = lambda X, labels: (X / 255.0, labels)
 
     arguments['--damp'] = float(arguments['--damp'])
     arguments['--data-hook'] = arguments.get('--data-hook', lambda *args: args)
@@ -176,7 +178,7 @@ def preprocess_arguments(arguments) -> dict:
     arguments['--one-hot'] = arguments['--one-hot'].lower() == 'true'
     arguments['--reg'] = float(arguments['--reg'])
     arguments['--step'] = int(arguments['--step'])
-    arguments['--subset'] = int(arguments['--subset'])
+    arguments['--subset'] = int(arguments['--subset']) or arguments['--n']
 
     if arguments['--memId']:
         arguments['--data-hook'] = lambda *args: args
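Two small idioms in the hunks above are worth spelling out: the `or`-based default for `--subset`, and the `dict.get` fallback that preserves a dataset-specific `--data-hook` when one was installed. A standalone sketch:

```python
# Two idioms from the hunks above, shown standalone.

# 1. `int(s) or n`: int('0') is falsy, so a subset of 0 falls back to the
#    full dataset size n. Note that a negative value is truthy and would
#    pass through unchanged, despite the "Ignored if <= 0" help text.
def resolve_subset(raw_subset: str, n: int) -> int:
    return int(raw_subset) or n

assert resolve_subset('0', 50000) == 50000      # 0 -> use the full set
assert resolve_subset('35000', 50000) == 35000  # explicit subset wins

# 2. dict.get with an identity default: datasets that installed a hook
#    (like cifar-10's X / 255.0 normalizer) keep it; all others get a no-op.
arguments = {'--data-hook': lambda X, labels: (X / 255.0, labels)}
data_hook = arguments.get('--data-hook', lambda *args: args)
```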
3 changes: 2 additions & 1 deletion mlml/kernels/generate.py
@@ -127,7 +127,8 @@ def __init__(
     def generate(self):
         """Generate kernel from matrix X and save to disk."""
         assert self.data is not None, 'Data required to generate kernel matrix.'
-        print(' * [MemKernel] Generating kernel matrix', self.memId)
+        print(' * [MemKernel] Generating kernel matrix', self.memId,
+              '(', self.dtype, ')')
         s, rows_written, cols_written = min(self.num_samples, self.n), 0, 0
         writer = BlockWriter(self.dtype, self.n, s, self.kernel_path)
         for i in range(ceil(self.n / s)):
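The `MemKernel`/`BlockWriter` internals are not part of this diff, but the underlying idea, materializing an n x n RBF kernel one row block at a time so that only `s` rows ever live in memory, can be sketched independently. This is an illustrative stand-in, not the actual MemKernel API:

```python
import numpy as np

def generate_rbf_kernel_blocks(X: np.ndarray, s: int, path: str,
                               gamma: float = 1.0,
                               dtype: str = 'float64') -> None:
    """Write the n x n RBF kernel of X to `path`, s rows at a time."""
    n = X.shape[0]
    sq_norms = (X ** 2).sum(axis=1)  # ||x_i||^2 for every row, computed once
    with open(path, 'wb') as f:
        for start in range(0, n, s):
            block = X[start:start + s]
            # ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>
            sq_dists = (sq_norms[start:start + s, None] + sq_norms[None, :]
                        - 2.0 * (block @ X.T))
            # Clamp tiny negatives from floating-point cancellation.
            f.write(np.exp(-gamma * np.maximum(sq_dists, 0.0))
                    .astype(dtype, copy=False).tobytes())

X = np.random.rand(1000, 32)
generate_rbf_kernel_blocks(X, s=200, path='kernel.bin')
K = np.memmap('kernel.bin', dtype='float64', mode='r', shape=(1000, 1000))
```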
2 changes: 1 addition & 1 deletion mlml/logging.py
@@ -59,7 +59,7 @@ def iteration(
             labels_train: np.ndarray,
             X_test: np.ndarray,
             labels_test: np.ndarray):
-        if log_frequency and iteration % log_frequency == 0:
+        if iteration % log_frequency == 0:
             train_accuracy = model.accuracy(X_train, labels_train)
             test_accuracy = model.accuracy(X_test, labels_test)
             print('Train:', train_accuracy, 'Test:', test_accuracy,
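A note on the changed condition: the `log_frequency and` guard doubles as an off switch, since `iteration % 0` raises `ZeroDivisionError`. Dropping it is only safe when `log_frequency` is guaranteed nonzero. A minimal illustration:

```python
def should_log(iteration: int, log_frequency: int) -> bool:
    # Guarded form: a log_frequency of 0 disables logging instead of
    # raising ZeroDivisionError on the modulo.
    return bool(log_frequency) and iteration % log_frequency == 0

assert should_log(100, 10)      # every 10th iteration
assert not should_log(101, 10)
assert not should_log(100, 0)   # 100 % 0 alone would raise ZeroDivisionError
```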
6 changes: 5 additions & 1 deletion mlml/ssgd/algorithm.py
@@ -50,6 +50,7 @@ def from_arguments(
         num_features=arguments['--d'],
         num_per_block=arguments['--num-per-block'],
         one_hot=arguments['--one-hot'],
+        shuffle_on_disk=arguments['--shuffle-on-disk'],
         simulated=arguments['--simulated'],
         step=arguments['--step'],
         subset=arguments['--subset'],
@@ -74,6 +75,7 @@ def train(
         num_features: int,
         num_per_block: int,
         one_hot: bool,
+        shuffle_on_disk: bool,
         simulated: bool,
         step: int,
         subset: int,
@@ -108,6 +110,7 @@ def train(
         num_per_block: Number of training samples to load into each block
         one_hot: Whether or not to use one hot encodings
             limited by the size of the buffer and size of each sample
+        shuffle_on_disk: Whether or not to shuffle on disk
         step: Number of iterations between each alpha decay
         train_path: Path to the training file (binary)
         X_test: Test input data
@@ -127,7 +130,8 @@ def train(
     w_delta = iteration = 0
     for p in range(epochs):
         shuffled_path = shuffle_train(
-            algorithm, dtype, n, num_per_block, num_features, train_path)
+            algorithm, dtype, n, num_per_block, num_features, train_path) \
+            if shuffle_on_disk else train_path
         blocks = BlockBuffer(
             dtype, n, num_features + 1, num_per_block, shuffled_path)
         for X, labels in map(block_x_labels, blocks):
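The new `shuffle_on_disk` flag makes the per-epoch disk shuffle opt-in: when it is off, each epoch streams the training file in its original order. The standalone sketch below mirrors that control flow; `shuffle_records_on_disk` is a hypothetical stand-in for `shuffle_train`, which this diff does not show.

```python
import numpy as np

def shuffle_records_on_disk(train_path: str, dtype: str, n: int,
                            record_len: int) -> str:
    """Hypothetical stand-in for shuffle_train: permute fixed-size records.

    Copies records into a new file in shuffled order via memmap, so the
    full dataset never has to fit in RAM (only the permutation index does).
    """
    src = np.memmap(train_path, dtype=dtype, mode='r', shape=(n, record_len))
    dst_path = train_path + '.shuffled'
    dst = np.memmap(dst_path, dtype=dtype, mode='w+', shape=(n, record_len))
    for dst_row, src_row in enumerate(np.random.permutation(n)):
        dst[dst_row] = src[src_row]
    dst.flush()
    return dst_path

def path_for_epoch(train_path: str, shuffle_on_disk: bool, **meta) -> str:
    # Mirrors the changed lines: shuffle to a new file only when the flag
    # is set; otherwise reuse the unshuffled training file.
    return shuffle_records_on_disk(train_path, **meta) \
        if shuffle_on_disk else train_path
```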
16 changes: 7 additions & 9 deletions mlml/utils/data.py
@@ -1,5 +1,6 @@
 """"""
 
+from sklearn import preprocessing
 from typing import Tuple
 from typing import Callable
 
@@ -37,13 +38,11 @@ def read_dataset(
     data = np.memmap(path, dtype=dtype, mode='r', shape=(shape[0], shape[1] + 1))
     if subset > 0:
         data = data[:subset]
-    X, labels = data_hook(*block_x_labels(data, dtype))
+    X, labels = data_hook(*block_x_labels(data))
     return Data(labels, num_classes, one_hot, X)
 
 
-def block_x_labels(
-        block: np.ndarray,
-        dtype: str='float16') -> Tuple[np.ndarray, np.ndarray]:
+def block_x_labels(block: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     """Converts a block of data into X and Y.
 
     Args:
@@ -54,17 +53,16 @@ def block_x_labels(
         X: the data inputs
         labels: the data outputs
     """
-    X = block[:, :-1].astype(dtype, copy=False)
+    X = block[:, :-1].astype('float64', copy=False)
     labels = block[:, -1].astype(int, copy=False)
     return X, labels
 
 
 def to_one_hot(num_classes: int, y: np.ndarray):
     """Convert vector into one hot form."""
-    one_hot = np.eye(num_classes)[y]
-    if len(one_hot.shape) > 2:
-        one_hot.shape = (one_hot.shape[0], one_hot.shape[-1])
-    return one_hot
+    lb = preprocessing.LabelBinarizer()
+    lb.fit(list(range(num_classes)))
+    return lb.transform(y)
 
 
 def de_one_hot(X: np.ndarray):
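For reference, the two one-hot strategies in the last hunk agree whenever there are three or more classes; the practical difference is the binary case, where `LabelBinarizer` emits a single column rather than two:

```python
import numpy as np
from sklearn import preprocessing

y = np.array([0, 2, 1, 2])

# Old approach: index rows of an identity matrix.
eye_hot = np.eye(3)[y]

# New approach: fit a LabelBinarizer on the known class range.
lb = preprocessing.LabelBinarizer()
lb.fit(list(range(3)))
lb_hot = lb.transform(y)

assert (eye_hot == lb_hot).all()  # identical for >= 3 classes

# Caveat: with exactly two classes, LabelBinarizer returns one column,
# whereas np.eye(2)[y] would return two.
assert preprocessing.LabelBinarizer().fit([0, 1]).transform([0, 1]).shape == (2, 1)
```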
