Update unit test style and fix inaccurate hyper-parameters (#103)
* Convert the unit tests for kmeans, pca, linear learner, factorization machines, lda, and ntm to a parameterized test style

* Fix inaccurate hyper-parameters in linear learner, pca and kmeans
yangaws committed Mar 23, 2018
1 parent da14a6b commit 447197e
Showing 12 changed files with 846 additions and 711 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -6,6 +6,7 @@ CHANGELOG
========

* feature: Tests: create configurable ``sagemaker_session`` pytest fixture for all integration tests
* bug-fix: AmazonEstimators: fix inaccurate hyper-parameters in kmeans, pca and linear learner

1.1.2
=====
14 changes: 10 additions & 4 deletions src/sagemaker/amazon/kmeans.py
@@ -13,7 +13,7 @@
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
from sagemaker.amazon.validation import gt, isin, ge
from sagemaker.amazon.validation import gt, isin, ge, le
from sagemaker.predictor import RealTimePredictor
from sagemaker.model import Model
from sagemaker.session import Session
@@ -27,16 +27,18 @@ class KMeans(AmazonAlgorithmEstimatorBase):
k = hp('k', gt(1), 'An integer greater-than 1', int)
init_method = hp('init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
max_iterations = hp('local_lloyd_max_iterations', gt(0), 'An integer greater-than 0', int)
tol = hp('local_lloyd_tol', gt(0), 'An integer greater-than 0', int)
tol = hp('local_lloyd_tol', (ge(0), le(1)), 'A float in [0, 1]', float)
num_trials = hp('local_lloyd_num_trials', gt(0), 'An integer greater-than 0', int)
local_init_method = hp('local_lloyd_init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
half_life_time_size = hp('half_life_time_size', ge(0), 'An integer greater-than-or-equal-to 0', int)
epochs = hp('epochs', gt(0), 'An integer greater-than 0', int)
center_factor = hp('extra_center_factor', gt(0), 'An integer greater-than 0', int)
eval_metrics = hp(name='eval_metrics', validation_message='A comma separated list of "msd" or "ssd"',
data_type=list)

def __init__(self, role, train_instance_count, train_instance_type, k, init_method=None,
max_iterations=None, tol=None, num_trials=None, local_init_method=None,
half_life_time_size=None, epochs=None, center_factor=None, **kwargs):
half_life_time_size=None, epochs=None, center_factor=None, eval_metrics=None, **kwargs):
"""
A k-means clustering :class:`~sagemaker.amazon.AmazonAlgorithmEstimatorBase`. Finds k clusters of data in an
unlabeled dataset.
@@ -70,7 +72,7 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
k (int): The number of clusters to produce.
init_method (str): How to initialize cluster locations. One of 'random' or 'kmeans++'.
max_iterations (int): Maximum iterations for Lloyd's EM procedure in the local kmeans used in the finalize stage.
tol (int): Tolerance for change in ssd for early stopping in local kmeans.
tol (float): Tolerance for change in ssd for early stopping in local kmeans.
num_trials (int): Local version is run multiple times and the one with the best loss is chosen. This
determines how many times.
local_init_method (str): Initialization method for local version. One of 'random', 'kmeans++'
@@ -82,6 +84,9 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
epochs (int): Number of passes done over the training data.
center_factor(int): The algorithm will create ``num_clusters * extra_center_factor`` as it runs and
reduce the number of centers to ``k`` when finalizing
eval_metrics(list): JSON list of metric types used to report a score for the model. Allowed values
are "msd" (mean squared distance) and "ssd" (sum of squared distances). If test data is provided,
the score is reported for all requested metrics.
**kwargs: base class keyword argument values.
"""
super(KMeans, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
@@ -94,6 +99,7 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
self.half_life_time_size = half_life_time_size
self.epochs = epochs
self.center_factor = center_factor
self.eval_metrics = eval_metrics

def create_model(self):
"""Return a :class:`~sagemaker.amazon.kmeans.KMeansModel` referencing the latest
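For reference, a minimal usage sketch of the corrected KMeans interface (illustrative only and not part of the diff; the role name, instance settings, and values are placeholders):

from sagemaker.amazon.kmeans import KMeans

# Minimal sketch: `tol` must now be a float in [0, 1] (it was previously
# typed as an int), and `eval_metrics` is accepted by the constructor.
kmeans = KMeans(role='SageMakerRole',               # placeholder IAM role
                train_instance_count=1,
                train_instance_type='ml.c4.xlarge',
                k=10,
                tol=0.0001,                         # validated by (ge(0), le(1))
                eval_metrics=['msd', 'ssd'])        # newly exposed argument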
8 changes: 2 additions & 6 deletions src/sagemaker/amazon/linear_learner.py
@@ -32,7 +32,6 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
data_type=str)
target_recall = hp('target_recall', (gt(0), lt(1)), "A float in (0,1)", float)
target_precision = hp('target_precision', (gt(0), lt(1)), "A float in (0,1)", float)
positive_example_weight_mult = hp('positive_example_weight_mult', gt(0), "A float greater than 0", float)
epochs = hp('epochs', gt(0), "An integer greater-than 0", int)
predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'),
'One of "binary_classifier" or "regressor"', str)
@@ -64,9 +63,9 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
unbias_label = hp('unbias_label', (), 'A boolean', bool)
num_point_for_scaler = hp('num_point_for_scaler', gt(0), 'An integer greater-than 0', int)

def __init__(self, role, train_instance_count, train_instance_type, predictor_type='binary_classifier',
def __init__(self, role, train_instance_count, train_instance_type, predictor_type,
binary_classifier_model_selection_criteria=None, target_recall=None, target_precision=None,
positive_example_weight_mult=None, epochs=None, use_bias=None, num_models=None,
epochs=None, use_bias=None, num_models=None,
num_calibration_samples=None, init_method=None, init_scale=None, init_sigma=None, init_bias=None,
optimizer=None, loss=None, wd=None, l1=None, momentum=None, learning_rate=None, beta_1=None,
beta_2=None, bias_lr_mult=None, bias_wd_mult=None, use_lr_scheduler=None, lr_scheduler_step=None,
@@ -114,8 +113,6 @@ def __init__(self, role, train_instance_count, train_instance_type, predictor_ty
precision_at_target_recall.
target_precision (float): Target precision. Only applicable if binary_classifier_model_selection_criteria
is recall_at_target_precision.
positive_example_weight_mult (float): The importance weight of positive examples is multiplied by this
constant. Useful for skewed datasets. Only applies for classification tasks.
epochs (int): The maximum number of passes to make over the training data.
use_bias (bool): Whether to include a bias field
num_models (int): Number of models to train in parallel. If not set, the number of parallel models to
@@ -160,7 +157,6 @@ def __init__(self, role, train_instance_count, train_instance_type, predictor_ty
self.binary_classifier_model_selection_criteria = binary_classifier_model_selection_criteria
self.target_recall = target_recall
self.target_precision = target_precision
self.positive_example_weight_mult = positive_example_weight_mult
self.epochs = epochs
self.use_bias = use_bias
self.num_models = num_models
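A hedged sketch of the resulting call pattern (not from the diff; the role and instance values are placeholders). With the default removed, `predictor_type` must now be supplied explicitly, and `positive_example_weight_mult` is no longer a recognized hyper-parameter on this estimator:

from sagemaker.amazon.linear_learner import LinearLearner

ll = LinearLearner(role='SageMakerRole',                 # placeholder IAM role
                   train_instance_count=1,
                   train_instance_type='ml.c4.2xlarge',
                   predictor_type='binary_classifier')   # now required

# Omitting predictor_type, e.g. LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge'),
# now fails with a TypeError at construction time.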
20 changes: 11 additions & 9 deletions src/sagemaker/amazon/pca.py
@@ -13,6 +13,7 @@
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
from sagemaker.amazon.validation import gt, isin
from sagemaker.predictor import RealTimePredictor
from sagemaker.model import Model
from sagemaker.session import Session
@@ -25,13 +26,13 @@ class PCA(AmazonAlgorithmEstimatorBase):

DEFAULT_MINI_BATCH_SIZE = 500

num_components = hp(name='num_components', validate=lambda x: x > 0,
validation_message='Value must be an integer greater than zero', data_type=int)
algorithm_mode = hp(name='algorithm_mode', validate=lambda x: x in ['regular', 'stable', 'randomized'],
validation_message='Value must be one of "regular", "stable", "randomized"', data_type=str)
num_components = hp('num_components', gt(0), 'Value must be an integer greater than zero', int)
algorithm_mode = hp('algorithm_mode', isin('regular', 'randomized'),
'Value must be one of "regular" or "randomized"', str)
subtract_mean = hp(name='subtract_mean', validation_message='Value must be a boolean', data_type=bool)
extra_components = hp(name='extra_components', validate=lambda x: x >= 0,
validation_message="Value must be an integer greater than or equal to 0", data_type=int)
extra_components = hp(name='extra_components',
validation_message="Value must be an integer greater than or equal to 0, or -1.",
data_type=int)

def __init__(self, role, train_instance_count, train_instance_type, num_components,
algorithm_mode=None, subtract_mean=None, extra_components=None, **kwargs):
@@ -68,12 +69,13 @@ def __init__(self, role, train_instance_count, train_instance_type, num_componen
train_instance_count (int): Number of Amazon EC2 instances to use for training.
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
num_components(int): The number of principal components. Must be greater than zero.
algorithm_mode (str): Mode for computing the principal components. One of 'regular', 'stable' or
algorithm_mode (str): Mode for computing the principal components. One of 'regular' or
'randomized'.
subtract_mean (bool): Whether the data should be unbiased both during training and at inference.
extra_components (int): As the value grows larger, the solution becomes more accurate but the
runtime and memory consumption increase linearly. If this value is unset, then a default value equal
to the maximum of 10 and num_components will be used. Valid for randomized mode only.
runtime and memory consumption increase linearly. If this value is unset or set to -1,
then a default value equal to the maximum of 10 and num_components will be used.
Valid for randomized mode only.
**kwargs: base class keyword argument values.
"""
super(PCA, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
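A short sketch of the tightened PCA interface (assumed usage, not taken from the diff; the role and instance values are placeholders). 'stable' is no longer an accepted algorithm_mode, and extra_components may be -1 to request the default:

from sagemaker.amazon.pca import PCA

pca = PCA(role='SageMakerRole',               # placeholder IAM role
          train_instance_count=1,
          train_instance_type='ml.c4.xlarge',
          num_components=55,
          algorithm_mode='randomized',        # 'stable' now fails isin('regular', 'randomized')
          extra_components=-1)                # -1 selects the default max(10, num_components)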
8 changes: 2 additions & 6 deletions tests/integ/test_linear_learner.py
@@ -38,13 +38,11 @@ def test_linear_learner(sagemaker_session):
train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', base_job_name='test-linear-learner',
sagemaker_session=sagemaker_session)
predictor_type='binary_classifier', sagemaker_session=sagemaker_session)
ll.binary_classifier_model_selection_criteria = 'accuracy'
ll.target_recall = 0.5
ll.target_precision = 0.5
ll.positive_example_weight_mult = 0.1
ll.epochs = 1
ll.predictor_type = 'binary_classifier'
ll.use_bias = True
ll.num_models = 1
ll.num_calibration_samples = 1
@@ -100,13 +98,11 @@ def test_async_linear_learner(sagemaker_session):
train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', base_job_name='test-linear-learner',
sagemaker_session=sagemaker_session)
predictor_type='binary_classifier', sagemaker_session=sagemaker_session)
ll.binary_classifier_model_selection_criteria = 'accuracy'
ll.target_recall = 0.5
ll.target_precision = 0.5
ll.positive_example_weight_mult = 0.1
ll.epochs = 1
ll.predictor_type = 'binary_classifier'
ll.use_bias = True
ll.num_models = 1
ll.num_calibration_samples = 1
16 changes: 0 additions & 16 deletions tests/unit/test_amazon_estimator.py
@@ -95,22 +95,6 @@ def test_data_location_does_not_call_default_bucket(sagemaker_session):
assert not sagemaker_session.default_bucket.called


def test_pca_hyperparameters(sagemaker_session):
pca = PCA(num_components=55, algorithm_mode='randomized',
subtract_mean=True, extra_components=33, sagemaker_session=sagemaker_session,
**COMMON_ARGS)
assert pca.hyperparameters() == dict(
num_components='55',
extra_components='33',
subtract_mean='True',
algorithm_mode='randomized')


def test_image(sagemaker_session):
pca = PCA(num_components=55, sagemaker_session=sagemaker_session, **COMMON_ARGS)
assert pca.train_image() == registry('us-west-2') + '/pca:1'


@patch('time.strftime', return_value=TIMESTAMP)
def test_fit_ndarray(time, sagemaker_session):
mock_s3 = Mock()
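The per-case tests removed above were consolidated into the parameterized style the commit message describes. A hedged sketch of that style, reusing names already visible in this file (PCA, COMMON_ARGS, the sagemaker_session fixture); the actual cases in the new tests may differ:

import pytest

@pytest.mark.parametrize('algorithm_mode', ['regular', 'randomized'])
def test_pca_algorithm_mode(sagemaker_session, algorithm_mode):
    # Illustrative only: each mode becomes its own test case.
    pca = PCA(num_components=55, algorithm_mode=algorithm_mode,
              sagemaker_session=sagemaker_session, **COMMON_ARGS)
    assert pca.hyperparameters()['algorithm_mode'] == algorithm_mode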
