Commit

fixed generation dtype compat

alvinwan committed Jan 9, 2017
1 parent 96219d5 commit 6e5e29f
Showing 7 changed files with 38 additions and 34 deletions.
14 changes: 6 additions & 8 deletions README.md
@@ -77,19 +77,17 @@ our kernel matrix or a function of our kernel matrix.
 
     python mlml.py ssgd (mnist|spam|cifar-10) --memId=<memId> [options]
 
-For example, the following runs kernelized sgd on a subset of 35000
-samples from MNIST, using the radial basis function (RBF). Note that
+For example, the following runs kernelized sgd on all samples from
+cifar-10, using the radial basis function (RBF). Note that
 the first command will output the `<memId>` needed for the second
 command.
 
-    python mlml.py generate mnist --subset=35000 --kernel=RBF
-    python mlml.py ssgd mnist --memId=<memId> --subset=35000
+    python mlml.py generate cifar-10 --kernel=RBF
+    python mlml.py ssgd cifar-10 --memId=<memId>
 
-To use a more computationally efficient but memory-consuming algorithm,
-use the `--simulated` flag. For example, the following runs the
-alternative generation scheme for `CIFAR-10`.
+To run on a subset of your data, use the `--subset` flag.
 
-    python mlml.py generate cifar-10 --subset=30000 --kernel=RBF --simulated
+    python mlml.py generate cifar-10 --kernel=RBF --subset=35000
 
 ## Command-Line Utility
 
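For anyone scripting this two-step flow, a minimal Python driver might look like the sketch below. The `memId` extraction is an assumption: it greps stdout for a `memId`-like token, and the actual log format of `generate` may differ.

```python
import re
import subprocess

def run_generate_then_ssgd(dataset: str = 'cifar-10', kernel: str = 'RBF') -> None:
    # Step 1: generate the kernel matrix, capturing stdout for the memId.
    out = subprocess.run(
        ['python', 'mlml.py', 'generate', dataset, '--kernel=%s' % kernel],
        capture_output=True, text=True, check=True).stdout
    # Step 2: extract the reported memId (the output format is an assumption).
    match = re.search(r'memId[:=]?\s*(\S+)', out)
    if match is None:
        raise RuntimeError('could not find memId in output:\n' + out)
    subprocess.run(
        ['python', 'mlml.py', 'ssgd', dataset, '--memId=%s' % match.group(1)],
        check=True)
```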
23 changes: 12 additions & 11 deletions demos/ksgd.ipynb

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions mlml.py
@@ -37,6 +37,7 @@
 --test=<test>        Path to test data [default: data/test]
 --trn-dtype=<dtype>  The numeric type of each training sample [default: uint8]
 --tst-dtype=<dtype>  The numeric type of each test sample [default: uint8]
+--shuffle-on-disk    Flag for ssgd to shuffle on disk.
 --simulated          Mark memory constraints as simulated. Allows full accuracy tests.
 --subset=<num>       Specify subset of data to pick. Ignored if <= 0. [default: 0]
 """
@@ -134,7 +135,8 @@ def preprocess_arguments(arguments) -> dict:
     """
 
     if arguments['mnist']:
-        arguments['--dtype'] = 'uint8'
+        arguments['--trn-dtype'] = 'uint8'
+        arguments['--tst-dtype'] = 'uint8'
         arguments['--train'] = 'data/mnist-%s-60000-train' % arguments['--dtype']
         arguments['--test'] = 'data/mnist-%s-10000-test' % arguments['--dtype']
         arguments['--n'] = 60000
@@ -151,7 +153,6 @@ def preprocess_arguments(arguments) -> dict:
         arguments['--k'] = 1
         arguments['--d'] = 55
     if arguments['cifar-10']:
-        arguments['--dtype'] = 'uint8'
         arguments['--trn-dtype'] = 'uint8'
         arguments['--tst-dtype'] = 'uint8'
         arguments['--train'] = 'data/cifar-10-uint8-50000-train'
@@ -161,6 +162,7 @@ def preprocess_arguments(arguments) -> dict:
         arguments['--k'] = 10
         arguments['--d'] = 3072
         arguments['--one-hot'] = 'true'
+        arguments['--data-hook'] = lambda X, labels: (X / 255.0, labels)
 
     arguments['--damp'] = float(arguments['--damp'])
     arguments['--data-hook'] = arguments.get('--data-hook', lambda *args: args)
@@ -176,7 +178,7 @@ def preprocess_arguments(arguments) -> dict:
     arguments['--one-hot'] = arguments['--one-hot'].lower() == 'true'
     arguments['--reg'] = float(arguments['--reg'])
     arguments['--step'] = int(arguments['--step'])
-    arguments['--subset'] = int(arguments['--subset'])
+    arguments['--subset'] = int(arguments['--subset']) or arguments['--n']
 
     if arguments['--memId']:
         arguments['--data-hook'] = lambda *args: args
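Two small idioms in the hunks above are worth spelling out: the `or`-based default for `--subset`, and the `dict.get` fallback that preserves a dataset-specific `--data-hook` when one was installed. A standalone sketch:

```python
# Two idioms from the hunks above, shown standalone.

# 1. `int(s) or n`: int('0') is falsy, so a subset of 0 falls back to the
#    full dataset size n. Note that a negative value is truthy and would
#    pass through unchanged, despite the "Ignored if <= 0" help text.
def resolve_subset(raw_subset: str, n: int) -> int:
    return int(raw_subset) or n

assert resolve_subset('0', 50000) == 50000      # 0 -> use the full set
assert resolve_subset('35000', 50000) == 35000  # explicit subset wins

# 2. dict.get with an identity default: datasets that installed a hook
#    (like cifar-10's X / 255.0 normalizer) keep it; all others get a no-op.
arguments = {'--data-hook': lambda X, labels: (X / 255.0, labels)}
data_hook = arguments.get('--data-hook', lambda *args: args)
```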
3 changes: 2 additions & 1 deletion mlml/kernels/generate.py
@@ -127,7 +127,8 @@ def __init__(
     def generate(self):
         """Generate kernel from matrix X and save to disk."""
         assert self.data is not None, 'Data required to generate kernel matrix.'
-        print(' * [MemKernel] Generating kernel matrix', self.memId)
+        print(' * [MemKernel] Generating kernel matrix', self.memId,
+              '(', self.dtype, ')')
         s, rows_written, cols_written = min(self.num_samples, self.n), 0, 0
         writer = BlockWriter(self.dtype, self.n, s, self.kernel_path)
         for i in range(ceil(self.n / s)):
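The `MemKernel`/`BlockWriter` internals are not part of this diff, but the underlying idea, materializing an n x n RBF kernel one row block at a time so that only `s` rows ever live in memory, can be sketched independently. This is an illustrative stand-in, not the actual MemKernel API:

```python
import numpy as np

def generate_rbf_kernel_blocks(X: np.ndarray, s: int, path: str,
                               gamma: float = 1.0,
                               dtype: str = 'float64') -> None:
    """Write the n x n RBF kernel of X to `path`, s rows at a time."""
    n = X.shape[0]
    sq_norms = (X ** 2).sum(axis=1)  # ||x_i||^2 for every row, computed once
    with open(path, 'wb') as f:
        for start in range(0, n, s):
            block = X[start:start + s]
            # ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>
            sq_dists = (sq_norms[start:start + s, None] + sq_norms[None, :]
                        - 2.0 * (block @ X.T))
            # Clamp tiny negatives from floating-point cancellation.
            f.write(np.exp(-gamma * np.maximum(sq_dists, 0.0))
                    .astype(dtype, copy=False).tobytes())

X = np.random.rand(1000, 32)
generate_rbf_kernel_blocks(X, s=200, path='kernel.bin')
K = np.memmap('kernel.bin', dtype='float64', mode='r', shape=(1000, 1000))
```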
2 changes: 1 addition & 1 deletion mlml/logging.py
@@ -59,7 +59,7 @@ def iteration(
             labels_train: np.ndarray,
             X_test: np.ndarray,
             labels_test: np.ndarray):
-        if log_frequency and iteration % log_frequency == 0:
+        if iteration % log_frequency == 0:
             train_accuracy = model.accuracy(X_train, labels_train)
             test_accuracy = model.accuracy(X_test, labels_test)
             print('Train:', train_accuracy, 'Test:', test_accuracy,
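A note on the changed condition: the `log_frequency and` guard doubles as an off switch, since `iteration % 0` raises `ZeroDivisionError`. Dropping it is only safe when `log_frequency` is guaranteed nonzero. A minimal illustration:

```python
def should_log(iteration: int, log_frequency: int) -> bool:
    # Guarded form: a log_frequency of 0 disables logging instead of
    # raising ZeroDivisionError on the modulo.
    return bool(log_frequency) and iteration % log_frequency == 0

assert should_log(100, 10)      # every 10th iteration
assert not should_log(101, 10)
assert not should_log(100, 0)   # 100 % 0 alone would raise ZeroDivisionError
```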
6 changes: 5 additions & 1 deletion mlml/ssgd/algorithm.py
@@ -50,6 +50,7 @@ def from_arguments(
         num_features=arguments['--d'],
         num_per_block=arguments['--num-per-block'],
         one_hot=arguments['--one-hot'],
+        shuffle_on_disk=arguments['--shuffle-on-disk'],
         simulated=arguments['--simulated'],
         step=arguments['--step'],
         subset=arguments['--subset'],
@@ -74,6 +75,7 @@ def train(
         num_features: int,
         num_per_block: int,
         one_hot: bool,
+        shuffle_on_disk: bool,
         simulated: bool,
         step: int,
         subset: int,
@@ -108,6 +110,7 @@ def train(
         num_per_block: Number of training samples to load into each block
         one_hot: Whether or not to use one hot encodings
             limited by the size of the buffer and size of each sample
+        shuffle_on_disk: Whether or not to shuffle on disk
         step: Number of iterations between each alpha decay
         train_path: Path to the training file (binary)
         X_test: Test input data
@@ -127,7 +130,8 @@ def train(
     w_delta = iteration = 0
     for p in range(epochs):
         shuffled_path = shuffle_train(
-            algorithm, dtype, n, num_per_block, num_features, train_path)
+            algorithm, dtype, n, num_per_block, num_features, train_path) \
+            if shuffle_on_disk else train_path
         blocks = BlockBuffer(
             dtype, n, num_features + 1, num_per_block, shuffled_path)
         for X, labels in map(block_x_labels, blocks):
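The new `shuffle_on_disk` flag makes the per-epoch disk shuffle opt-in: when it is off, each epoch streams the training file in its original order. The standalone sketch below mirrors that control flow; `shuffle_records_on_disk` is a hypothetical stand-in for `shuffle_train`, which this diff does not show.

```python
import numpy as np

def shuffle_records_on_disk(train_path: str, dtype: str, n: int,
                            record_len: int) -> str:
    """Hypothetical stand-in for shuffle_train: permute fixed-size records.

    Copies records into a new file in shuffled order via memmap, so the
    full dataset never has to fit in RAM (only the permutation index does).
    """
    src = np.memmap(train_path, dtype=dtype, mode='r', shape=(n, record_len))
    dst_path = train_path + '.shuffled'
    dst = np.memmap(dst_path, dtype=dtype, mode='w+', shape=(n, record_len))
    for dst_row, src_row in enumerate(np.random.permutation(n)):
        dst[dst_row] = src[src_row]
    dst.flush()
    return dst_path

def path_for_epoch(train_path: str, shuffle_on_disk: bool, **meta) -> str:
    # Mirrors the changed lines: shuffle to a new file only when the flag
    # is set; otherwise reuse the unshuffled training file.
    return shuffle_records_on_disk(train_path, **meta) \
        if shuffle_on_disk else train_path
```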
16 changes: 7 additions & 9 deletions mlml/utils/data.py
@@ -1,5 +1,6 @@
 """"""
 
+from sklearn import preprocessing
 from typing import Tuple
 from typing import Callable
 
@@ -37,13 +38,11 @@ def read_dataset(
     data = np.memmap(path, dtype=dtype, mode='r', shape=(shape[0], shape[1] + 1))
     if subset > 0:
         data = data[:subset]
-    X, labels = data_hook(*block_x_labels(data, dtype))
+    X, labels = data_hook(*block_x_labels(data))
     return Data(labels, num_classes, one_hot, X)
 
 
-def block_x_labels(
-        block: np.ndarray,
-        dtype: str='float16') -> Tuple[np.ndarray, np.ndarray]:
+def block_x_labels(block: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     """Converts a block of data into X and Y.
 
     Args:
@@ -54,17 +53,16 @@ def block_x_labels(
         X: the data inputs
         labels: the data outputs
     """
-    X = block[:, :-1].astype(dtype, copy=False)
+    X = block[:, :-1].astype('float64', copy=False)
     labels = block[:, -1].astype(int, copy=False)
     return X, labels
 
 
 def to_one_hot(num_classes: int, y: np.ndarray):
     """Convert vector into one hot form."""
-    one_hot = np.eye(num_classes)[y]
-    if len(one_hot.shape) > 2:
-        one_hot.shape = (one_hot.shape[0], one_hot.shape[-1])
-    return one_hot
+    lb = preprocessing.LabelBinarizer()
+    lb.fit(list(range(num_classes)))
+    return lb.transform(y)
 
 
 def de_one_hot(X: np.ndarray):
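For reference, the two one-hot strategies in the last hunk agree whenever there are three or more classes; the practical difference is the binary case, where `LabelBinarizer` emits a single column rather than two:

```python
import numpy as np
from sklearn import preprocessing

y = np.array([0, 2, 1, 2])

# Old approach: index rows of an identity matrix.
eye_hot = np.eye(3)[y]

# New approach: fit a LabelBinarizer on the known class range.
lb = preprocessing.LabelBinarizer()
lb.fit(list(range(3)))
lb_hot = lb.transform(y)

assert (eye_hot == lb_hot).all()  # identical for >= 3 classes

# Caveat: with exactly two classes, LabelBinarizer returns one column,
# whereas np.eye(2)[y] would return two.
assert preprocessing.LabelBinarizer().fit([0, 1]).transform([0, 1]).shape == (2, 1)
```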
