Commit

Add ntm algorithm with doc, unit tests, integ tests (#73)

yangaws authored and lukmis committed Feb 9, 2018
1 parent b400fa4 commit 795b030
Showing 15 changed files with 603 additions and 30 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
@@ -2,6 +2,15 @@
CHANGELOG
=========

1.0.4
=====

* feature: Estimators: add support for Amazon Neural Topic Model (NTM) algorithm
* feature: Documentation: Fix description of an argument of sagemaker.session.train
* feature: Documentation: Add FM and LDA to the documentation
* feature: Estimators: add support for async fit
* bug-fix: Estimators: fix estimator role expansion

1.0.3
=====

4 changes: 2 additions & 2 deletions README.rst
@@ -39,7 +39,7 @@ You can install from source by cloning this repository and issuing a pip install

git clone https://github.com/aws/sagemaker-python-sdk.git
python setup.py sdist
pip install dist/sagemaker-1.0.3.tar.gz
pip install dist/sagemaker-1.0.4.tar.gz

Supported Python versions
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1447,7 +1447,7 @@ Amazon SageMaker provides several built-in machine learning algorithms that you
The full list of algorithms is available on the AWS website: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis, Linear Learner, Factorization Machines and LDA algorithms.
SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis (PCA), Linear Learner, Factorization Machines, Latent Dirichlet Allocation (LDA) and Neural Topic Model (NTM) algorithms.
Definition and usage
~~~~~~~~~~~~~~~~~~~~
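For illustration, a minimal sketch (not part of this commit's README text): with this release installed, the new NTM classes are importable from the top-level package, mirroring the wrappers listed above.

# NTM is the Estimator; NTMModel wraps trained model artifacts for deployment;
# NTMPredictor runs inference against a deployed endpoint.
from sagemaker import NTM, NTMModel, NTMPredictor
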
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -18,7 +18,7 @@ def __getattr__(cls, name):
'tensorflow.python.framework', 'tensorflow_serving', 'tensorflow_serving.apis']
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)

version = '1.0.3'
version = '1.0.4'
project = u'sagemaker'

# Add any Sphinx extension module names here, as strings. They can be extensions
1 change: 1 addition & 0 deletions doc/index.rst
@@ -49,3 +49,4 @@ Amazon provides implementations of some common machine learning algorithms opti
sagemaker.amazon.amazon_estimator
factorization_machines
lda
ntm
23 changes: 23 additions & 0 deletions doc/ntm.rst
@@ -0,0 +1,23 @@
NTM
--------------------

The Amazon SageMaker NTM algorithm.

.. autoclass:: sagemaker.NTM
:members:
:undoc-members:
:show-inheritance:
:inherited-members:
:exclude-members: image, num_topics, encoder_layers, epochs, encoder_layers_activation, optimizer, tolerance,
num_patience_epochs, batch_norm, rescale_gradient, clip_gradient, weight_decay, learning_rate


.. autoclass:: sagemaker.NTMModel
:members:
:undoc-members:
:show-inheritance:

.. autoclass:: sagemaker.NTMPredictor
:members:
:undoc-members:
:show-inheritance:
2 changes: 1 addition & 1 deletion setup.py
@@ -11,7 +11,7 @@ def read(fname):


setup(name="sagemaker",
version="1.0.3",
version="1.0.4",
description="Open source library for training and deploying models on Amazon SageMaker.",
packages=find_packages('src'),
package_dir={'': 'src'},
3 changes: 2 additions & 1 deletion src/sagemaker/__init__.py
@@ -19,6 +19,7 @@
from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel, LinearLearnerPredictor
from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesModel
from sagemaker.amazon.factorization_machines import FactorizationMachinesPredictor
from sagemaker.amazon.ntm import NTM, NTMModel, NTMPredictor

from sagemaker.model import Model
from sagemaker.predictor import RealTimePredictor
@@ -33,5 +34,5 @@
LinearLearnerModel, LinearLearnerPredictor,
LDA, LDAModel, LDAPredictor,
FactorizationMachines, FactorizationMachinesModel, FactorizationMachinesPredictor,
Model, RealTimePredictor, Session,
Model, NTM, NTMModel, NTMPredictor, RealTimePredictor, Session,
container_def, s3_input, production_variant, get_execution_role]
2 changes: 1 addition & 1 deletion src/sagemaker/amazon/amazon_estimator.py
@@ -228,7 +228,7 @@ def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=

def registry(region_name, algorithm=None):
"""Return docker registry for the given AWS region"""
if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines"]:
if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines", "ntm"]:
account_id = {
"us-east-1": "382416733822",
"us-east-2": "404615174143",
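Adding "ntm" to this list lets registry() resolve the same per-region account ids for NTM as for the other first-party algorithms. A small sketch of how NTMModel (below) consumes it; the region is a placeholder, and the resulting URI assumes registry() returns the regional ECR hostname:

from sagemaker.amazon.amazon_estimator import registry

# Build the image URI the same way NTMModel does:
# '<account>.dkr.ecr.<region>.amazonaws.com/<repo_name>:<repo_version>'
image = '{}/{}:{}'.format(registry('us-east-1', 'ntm'), 'ntm', 1)
print(image)  # e.g. 382416733822.dkr.ecr.us-east-1.amazonaws.com/ntm:1
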
146 changes: 146 additions & 0 deletions src/sagemaker/amazon/ntm.py
@@ -0,0 +1,146 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
from sagemaker.amazon.validation import ge, le, isin
from sagemaker.predictor import RealTimePredictor
from sagemaker.model import Model
from sagemaker.session import Session


class NTM(AmazonAlgorithmEstimatorBase):

repo_name = 'ntm'
repo_version = 1

num_topics = hp('num_topics', (ge(2), le(1000)), 'An integer in [2, 1000]', int)
encoder_layers = hp(name='encoder_layers', validation_message='A comma separated list of '
'positive integers', data_type=list)
epochs = hp('epochs', (ge(1), le(100)), 'An integer in [1, 100]', int)
encoder_layers_activation = hp('encoder_layers_activation', isin('sigmoid', 'tanh', 'relu'),
'One of "sigmoid", "tanh" or "relu"', str)
optimizer = hp('optimizer', isin('adagrad', 'adam', 'rmsprop', 'sgd', 'adadelta'),
'One of "adagrad", "adam", "rmsprop", "sgd" and "adadelta"', str)
tolerance = hp('tolerance', (ge(1e-6), le(0.1)), 'A float in [1e-6, 0.1]', float)
num_patience_epochs = hp('num_patience_epochs', (ge(1), le(10)), 'An integer in [1, 10]', int)
batch_norm = hp(name='batch_norm', validation_message='Value must be a boolean', data_type=bool)
rescale_gradient = hp('rescale_gradient', (ge(1e-3), le(1.0)), 'A float in [1e-3, 1.0]', float)
clip_gradient = hp('clip_gradient', ge(1e-3), 'A float greater than or equal to 1e-3', float)
weight_decay = hp('weight_decay', (ge(0.0), le(1.0)), 'A float in [0.0, 1.0]', float)
learning_rate = hp('learning_rate', (ge(1e-6), le(1.0)), 'A float in [1e-6, 1.0]', float)

def __init__(self, role, train_instance_count, train_instance_type, num_topics,
encoder_layers=None, epochs=None, encoder_layers_activation=None, optimizer=None, tolerance=None,
num_patience_epochs=None, batch_norm=None, rescale_gradient=None, clip_gradient=None,
weight_decay=None, learning_rate=None, **kwargs):
"""Neural Topic Model (NTM) is :class:`Estimator` used for unsupervised learning.
This Estimator may be fit via calls to
:meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
:class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
to the `fit` call.
To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
deploy returns a :class:`~sagemaker.amazon.ntm.NTMPredictor` object that can be used
for inference calls using the trained model hosted in the SageMaker Endpoint.
NTM Estimators can be configured by setting hyperparameters. The available hyperparameters for
NTM are documented below.
For further information on the AWS NTM algorithm,
please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/ntm.html
Args:
role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
APIs that create Amazon SageMaker endpoints use this role to access
training data and model artifacts. After the endpoint is created,
the inference code might use the IAM role, if accessing AWS resources.
train_instance_count (int): Number of Amazon EC2 instances to use for training.
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
num_topics (int): Required. The number of topics for NTM to find within the data.
encoder_layers (list): Optional. Represents number of layers in the encoder and the output size of
each layer.
epochs (int): Optional. Maximum number of passes over the training data.
encoder_layers_activation (str): Optional. Activation function to use in the encoder layers.
optimizer (str): Optional. Optimizer to use for training.
tolerance (float): Optional. Maximum relative change in the loss function within the last
num_patience_epochs epochs, below which early stopping is triggered.
num_patience_epochs (int): Optional. Number of successive epochs over which early stopping criterion
is evaluated.
batch_norm (bool): Optional. Whether to use batch normalization during training.
rescale_gradient (float): Optional. Rescale factor for gradient.
clip_gradient (float): Optional. Maximum magnitude for each gradient component.
weight_decay (float): Optional. Weight decay coefficient. Adds L2 regularization.
learning_rate (float): Optional. Learning rate for the optimizer.
**kwargs: base class keyword argument values.
"""

super(NTM, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
self.num_topics = num_topics
self.encoder_layers = encoder_layers
self.epochs = epochs
self.encoder_layers_activation = encoder_layers_activation
self.optimizer = optimizer
self.tolerance = tolerance
self.num_patience_epochs = num_patience_epochs
self.batch_norm = batch_norm
self.rescale_gradient = rescale_gradient
self.clip_gradient = clip_gradient
self.weight_decay = weight_decay
self.learning_rate = learning_rate

def create_model(self):
"""Return a :class:`~sagemaker.amazon.NTMModel` referencing the latest
s3 model data produced by this Estimator."""

return NTMModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)

def fit(self, records, mini_batch_size=None, **kwargs):
if mini_batch_size is not None and (mini_batch_size < 1 or mini_batch_size > 10000):
raise ValueError("mini_batch_size must be in [1, 10000]")
super(NTM, self).fit(records, mini_batch_size, **kwargs)


class NTMPredictor(RealTimePredictor):
"""Transforms input vectors to lower-dimesional representations.
The implementation of :meth:`~sagemaker.predictor.RealTimePredictor.predict` in this
`RealTimePredictor` requires a numpy ``ndarray`` as input. The array should contain the
same number of columns as the feature-dimension of the data used to fit the model this
Predictor performs inference on.
:meth:`predict()` returns a list of :class:`~sagemaker.amazon.record_pb2.Record` objects, one
for each row in the input ``ndarray``. The topic-mixture vector result is stored in the ``topic_weights``
key of the ``Record.label`` field."""

def __init__(self, endpoint, sagemaker_session=None):
super(NTMPredictor, self).__init__(endpoint, sagemaker_session, serializer=numpy_to_record_serializer(),
deserializer=record_deserializer())


class NTMModel(Model):
"""Reference NTM s3 model data. Calling :meth:`~sagemaker.model.Model.deploy` creates an Endpoint and return
a Predictor that transforms vectors to a lower-dimensional representation."""

def __init__(self, model_data, role, sagemaker_session=None):
sagemaker_session = sagemaker_session or Session()
repo = '{}:{}'.format(NTM.repo_name, NTM.repo_version)
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name, NTM.repo_name), repo)
super(NTMModel, self).__init__(model_data, image, role, predictor_cls=NTMPredictor,
sagemaker_session=sagemaker_session)
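
For orientation, a minimal end-to-end sketch of the classes above; the role name and data are placeholders, and this is not part of the diff itself:

import numpy as np
import sagemaker
from sagemaker import NTM

# Placeholder session, role, and synthetic float32 data; real corpora would be
# bag-of-words vectors, one row per document.
session = sagemaker.Session()
train = np.random.rand(100, 50).astype('float32')

ntm = NTM(role='SageMakerRole', train_instance_count=1,
          train_instance_type='ml.c4.xlarge', num_topics=10,
          sagemaker_session=session)

# record_set() uploads the array to S3 as protobuf Records and returns a
# RecordSet that fit() accepts; mini_batch_size must lie in [1, 10000].
records = ntm.record_set(train)
ntm.fit(records, mini_batch_size=100)

# deploy() creates the Endpoint and returns an NTMPredictor.
predictor = ntm.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
result = predictor.predict(train[:1])
print(result[0].label['topic_weights'])
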
6 changes: 6 additions & 0 deletions src/sagemaker/amazon/validation.py
@@ -30,6 +30,12 @@ def validate(value):
return validate


def le(maximum):
def validate(value):
return value <= maximum
return validate


def isin(*expected):
def validate(value):
return value in expected
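As a quick illustration (a sketch, not part of the diff) of how these validator factories combine with the hp descriptor in ntm.py: each factory returns a predicate closure, and a tuple of predicates means all must hold.

from sagemaker.amazon.validation import ge, le, isin

# tolerance in ntm.py is declared as (ge(1e-6), le(0.1)): both must pass.
in_range = (ge(1e-6), le(0.1))
assert all(check(0.05) for check in in_range)   # 0.05 lies in [1e-6, 0.1]
assert not le(0.1)(0.5)                         # 0.5 exceeds the maximum
assert isin('sigmoid', 'tanh', 'relu')('tanh')  # membership check
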
Binary file added tests/data/ntm/nips-train_1.pbr
Binary file not shown.
23 changes: 23 additions & 0 deletions tests/integ/record_set.py
@@ -0,0 +1,23 @@
from six.moves.urllib.parse import urlparse

from sagemaker.amazon.amazon_estimator import RecordSet
from sagemaker.utils import sagemaker_timestamp


def prepare_record_set_from_local_files(dir_path, destination, num_records, feature_dim, sagemaker_session):
"""Build a :class:`~RecordSet` by pointing to local files.
Args:
dir_path (string): Path to local directory from where the files shall be uploaded.
destination (string): S3 path to upload the file to.
num_records (int): Number of records in all the files
feature_dim (int): Number of features in the data set
sagemaker_session (sagemaker.session.Session): Session object to manage interactions with Amazon SageMaker APIs.
Returns:
RecordSet: A RecordSet specified by S3Prefix to be used in training.
"""
key_prefix = urlparse(destination).path
key_prefix = key_prefix + '{}-{}'.format("testfiles", sagemaker_timestamp())
key_prefix = key_prefix.lstrip('/')
uploaded_location = sagemaker_session.upload_data(path=dir_path, key_prefix=key_prefix)
return RecordSet(uploaded_location, num_records, feature_dim, s3_data_type='S3Prefix')
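
A hypothetical call to the helper above, mirroring how the integ tests use it; the bucket, counts, and session are placeholders:

import sagemaker
from tests.integ.record_set import prepare_record_set_from_local_files

session = sagemaker.Session()
record_set = prepare_record_set_from_local_files(
    dir_path='tests/data/ntm',           # local directory holding .pbr files
    destination='s3://my-bucket/data/',  # S3 location to upload under
    num_records=5000,                    # total records across all files
    feature_dim=5000,                    # feature dimension of each record
    sagemaker_session=session)
# The returned RecordSet points at the uploaded S3Prefix, ready for fit().
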
28 changes: 4 additions & 24 deletions tests/integ/test_lda.py
@@ -13,16 +13,15 @@
import boto3
import numpy as np
import os
from six.moves.urllib.parse import urlparse

import sagemaker
from sagemaker import LDA, LDAModel
from sagemaker.amazon.amazon_estimator import RecordSet
from sagemaker.amazon.common import read_records
from sagemaker.utils import name_from_base, sagemaker_timestamp
from sagemaker.utils import name_from_base

from tests.integ import DATA_DIR, REGION
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
from tests.integ.record_set import prepare_record_set_from_local_files


def test_lda():
@@ -41,8 +40,8 @@ def test_lda():
lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
sagemaker_session=sagemaker_session, base_job_name='test-lda')

record_set = _prepare_record_set_from_local_files(data_path, lda.data_location,
len(all_records), feature_num, sagemaker_session)
record_set = prepare_record_set_from_local_files(data_path, lda.data_location,
len(all_records), feature_num, sagemaker_session)
lda.fit(record_set, 100)

endpoint_name = name_from_base('lda')
@@ -56,22 +55,3 @@ def test_lda():
assert len(result) == 1
for record in result:
assert record.label["topic_mixture"] is not None


def _prepare_record_set_from_local_files(dir_path, destination, num_records, feature_dim, sagemaker_session):
"""Build a :class:`~RecordSet` by pointing to local files.
Args:
dir_path (string): Path to local directory from where the files shall be uploaded.
destination (string): S3 path to upload the file to.
num_records (int): Number of records in all the files
feature_dim (int): Number of features in the data set
sagemaker_session (sagemaker.session.Session): Session object to manage interactions with Amazon SageMaker APIs.
Returns:
RecordSet: A RecordSet specified by S3Prefix to to be used in training.
"""
key_prefix = urlparse(destination).path
key_prefix = key_prefix + '{}-{}'.format("testfiles", sagemaker_timestamp())
key_prefix = key_prefix.lstrip('/')
uploaded_location = sagemaker_session.upload_data(path=dir_path, key_prefix=key_prefix)
return RecordSet(uploaded_location, num_records, feature_dim, s3_data_type='S3Prefix')
57 changes: 57 additions & 0 deletions tests/integ/test_ntm.py
@@ -0,0 +1,57 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import boto3
import numpy as np
import os

import sagemaker
from sagemaker import NTM, NTMModel
from sagemaker.amazon.common import read_records
from sagemaker.utils import name_from_base

from tests.integ import DATA_DIR, REGION
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
from tests.integ.record_set import prepare_record_set_from_local_files


def test_ntm():

with timeout(minutes=15):
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
data_path = os.path.join(DATA_DIR, 'ntm')
data_filename = 'nips-train_1.pbr'

with open(os.path.join(data_path, data_filename), 'rb') as f:
all_records = read_records(f)

# all records must have the same feature dimension
feature_num = int(all_records[0].features['values'].float32_tensor.shape[0])

ntm = NTM(role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge', num_topics=10,
sagemaker_session=sagemaker_session, base_job_name='test-ntm')

record_set = prepare_record_set_from_local_files(data_path, ntm.data_location,
len(all_records), feature_num, sagemaker_session)
ntm.fit(record_set, None)

endpoint_name = name_from_base('ntm')
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
model = NTMModel(ntm.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

predict_input = np.random.rand(1, feature_num)
result = predictor.predict(predict_input)

assert len(result) == 1
for record in result:
assert record.label["topic_weights"] is not None