diff --git a/.gitignore b/.gitignore index 3ee5780429..d67b44258a 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ examples/tensorflow/distributed_mnist/data doc/_build **/.DS_Store venv/ +*~ diff --git a/README.rst b/README.rst index 1bb4d00679..e06dea8b3c 100644 --- a/README.rst +++ b/README.rst @@ -21,6 +21,7 @@ Table of Contents 4. `TensorFlow SageMaker Estimators <#tensorflow-sagemaker-estimators>`__ 5. `AWS SageMaker Estimators <#aws-sagemaker-estimators>`__ 6. `BYO Docker Containers with SageMaker Estimators <#byo-docker-containers-with-sagemaker-estimators>`__ +7. `BYO Model <#byo-model>`__ Getting SageMaker Python SDK @@ -93,7 +94,7 @@ SageMaker Python SDK Overview SageMaker Python SDK provides several high-level abstractions for working with Amazon SageMaker. These are: -- **Estimators**: Encapsulate training on SageMaker. Can be ``fit()`` to run training, then the resulting model ``deploy()`` ed to a SageMaker Endpoint. +- **Estimators**: Encapsulate training on SageMaker. Can be ``fit()`` to run training, then the resulting model ``deploy()`` ed to a SageMaker Endpoint. - **Models**: Encapsulate built ML models. Can be ``deploy()`` ed to a SageMaker Endpoint. - **Predictors**: Provide real-time inference and transformation using Python data-types against a SageMaker Endpoint. - **Session**: Provides a collection of convenience methods for working with SageMaker resources. @@ -177,7 +178,7 @@ You don't have to use all the arguments, arguments you don't care about can be i pass **Note: Writing a training script that imports correctly** -When SageMaker runs your training script, it imports it as a Python module and then invokes ``train`` on the imported module. Consequently, you should not include any statements that won't execute successfully in SageMaker when your module is imported. For example, don't attempt to open any local files in top-level statements in your training script. +When SageMaker runs your training script, it imports it as a Python module and then invokes ``train`` on the imported module. Consequently, you should not include any statements that won't execute successfully in SageMaker when your module is imported. For example, don't attempt to open any local files in top-level statements in your training script. If you want to run your training script locally via the Python interpreter, look at using a ``__name__ == '__main__'`` guard, discussed in more detail here: https://stackoverflow.com/questions/419163/what-does-if-name-main-do . @@ -410,7 +411,7 @@ The MXNet Endpoint you create with ``deploy`` runs a SageMaker MXNet model serve You can configure two components of the SageMaker MXNet model server: Model loading and model serving. Model loading is the process of deserializing your saved model back into an MXNet model. Serving is the process of translating InvokeEndpoint requests to inference calls on the loaded model. -As with MXNet training, you configure the MXNet model server by defining functions in the Python source file you passed to the MXNet constructor. +As with MXNet training, you configure the MXNet model server by defining functions in the Python source file you passed to the MXNet constructor. Model loading ^^^^^^^^^^^^^ @@ -697,19 +698,19 @@ The Docker images extend Ubuntu 16.04.
TensorFlow SageMaker Estimators ------------------------------- -TensorFlow SageMaker Estimators allow you to run your own TensorFlow -training algorithms on SageMaker Learner, and to host your own TensorFlow +TensorFlow SageMaker Estimators allow you to run your own TensorFlow +training algorithms on SageMaker Learner, and to host your own TensorFlow models on SageMaker Hosting. Training with TensorFlow ~~~~~~~~~~~~~~~~~~~~~~~~ -Training TensorFlow models using a ``sagemaker.tensorflow.TensorFlow`` +Training TensorFlow models using a ``sagemaker.tensorflow.TensorFlow`` estimator is a two-step process. -First, you prepare your training script, then second, you run it on +First, you prepare your training script; second, you run it on SageMaker Learner via the ``sagemaker.tensorflow.TensorFlow`` estimator. -Suppose that you already have a TensorFlow training script called +Suppose that you already have a TensorFlow training script called ``tf-train.py``. You can train this script in SageMaker Learner as follows: @@ -717,7 +718,7 @@ follows: from sagemaker.tensorflow import TensorFlow - tf_estimator = TensorFlow('tf-train.py', role='SageMakerRole', + tf_estimator = TensorFlow(entry_point='tf-train.py', role='SageMakerRole', training_steps=10000, evaluation_steps=100, train_instance_count=1, train_instance_type='ml.p2.xlarge') tf_estimator.fit('s3://bucket/path/to/training/data') @@ -727,7 +728,7 @@ constructor keyword arguments define how SageMaker runs your training script and are discussed, in detail, in a later section. In the following sections, we'll discuss how to prepare a training script for execution on -SageMaker, then how to run that script on SageMaker using a ``sagemaker.tensorflow.TensorFlow`` +SageMaker, then how to run that script on SageMaker using a ``sagemaker.tensorflow.TensorFlow`` estimator. Preparing the TensorFlow training script ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -744,7 +745,7 @@ version is **1.4.0**. This training script **must contain** the following functi Creating a ``model_fn`` ^^^^^^^^^^^^^^^^^^^^^^^ -A ``model_fn`` is a function that contains all the logic to support training, evaluation, +A ``model_fn`` is a function that contains all the logic to support training, evaluation, and prediction. The basic skeleton for a ``model_fn`` looks like this: .. code:: python @@ -771,8 +772,8 @@ The ``model_fn`` must accept four positional arguments: - ``TRAIN``: the ``model_fn`` was invoked in **training** mode. - ``EVAL``: the ``model_fn`` was invoked in **evaluation** mode. - ``PREDICT``: the ``model_fn`` was invoked in **predict** mode. -- ``hyperparameters``: The hyperparameters passed to SageMaker TrainingJob that runs - your TensorFlow training script. You can use this to pass hyperparameters to your +- ``hyperparameters``: The hyperparameters passed to the SageMaker TrainingJob that runs + your TensorFlow training script. You can use this to pass hyperparameters to your training script. Example of a complete ``model_fn`` '''''''''''''''''''''''''''''''''' @@ -821,21 +822,21 @@ Example of a complete ``model_fn`` Distributed training '''''''''''''''''''' -When distributed training happens, a copy of the same neural network will be sent to -multiple training instances. Each instance will train with a batch of the dataset, +When distributed training happens, a copy of the same neural network will be sent to +multiple training instances. Each instance will train with a batch of the dataset, calculate the loss, and minimize it with the optimizer. One entire loop of this process is called a training step.
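+A minimal sketch of one such training step, where ``optimizer`` and ``loss`` are
+illustrative placeholders for whatever your ``model_fn`` defines (not SageMaker
+APIs), looks like this:
+
+.. code:: python
+
+    # one training step: compute gradients of the loss and update the weights once
+    train_op = optimizer.minimize(loss)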
A `global step `_ is a global counter shared between the instances. It is necessary for distributed training, so the optimizer -can keep track of the number of training steps across instances. The only change in the -previous complete ``model_fn`` to enable distributed training is to pass in the global +can keep track of the number of training steps across instances. The only change in the +previous complete ``model_fn`` to enable distributed training is to pass in the global step into the ``optimizer.minimize`` function: .. code:: python - + train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step()) -More information about distributed training can be find in talk from the TensorFlow Dev Summit 2017 +More information about distributed training can be found in the talk from the TensorFlow Dev Summit 2017 `Distributed TensorFlow `_. @@ -845,8 +846,8 @@ More details on how to create a ``model_fn`` can be find in `Constructing the mo Creating ``train_input_fn`` and ``eval_input_fn`` functions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``train_input_fn`` is used to pass ``features`` and ``labels`` to the ``model_fn`` -in **training** mode. The ``eval_input_fn`` is used to ``features`` and ``labels`` to the +The ``train_input_fn`` is used to pass ``features`` and ``labels`` to the ``model_fn`` +in **training** mode. The ``eval_input_fn`` is used to pass ``features`` and ``labels`` to the ``model_fn`` in **evaluation** mode. The basic skeleton for the ``train_input_fn`` looks like this: .. code:: python def train_input_fn(training_dir, hyperparameters): # Logic to do the following: # 1. Read the **training** dataset files located in training_dir # 2. Preprocess the dataset # 3. Return 1) a mapping of feature columns to Tensors with # the corresponding feature data, and 2) a Tensor containing labels - return feature_cols, labels + return feature_cols, labels An ``eval_input_fn`` follows the same format: .. code:: python def eval_input_fn(training_dir, hyperparameters): # Logic to do the following: # 1. Read the **evaluation** dataset files located in training_dir # 2. Preprocess the dataset # 3. Return 1) a mapping of feature columns to Tensors with # the corresponding feature data, and 2) a Tensor containing labels - return feature_cols, labels + return feature_cols, labels Example of a complete ``train_input_fn`` and ``eval_input_fn`` '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' @@ -904,12 +905,12 @@ More details on how to create input functions can be find in `Building Input Fun Creating a ``serving_input_fn`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -During training, ``train_input_fn`` ingests data and prepares it for use by the model. -At the end of training, similarly, ``serving_input_fn`` is used to create the model that +During training, ``train_input_fn`` ingests data and prepares it for use by the model. +Similarly, at the end of training, ``serving_input_fn`` is used to create the model that is exported for TensorFlow Serving. This function has the following purposes: - To add placeholders to the graph that the serving system will feed with inference requests. -- To add any additional ops needed to convert data from the input format into the feature Tensors +- To add any additional ops needed to convert data from the input format into the feature Tensors expected by the model. The basic skeleton for the ``serving_input_fn`` looks like this: .. code:: python def serving_input_fn(hyperparameters): # Logic to do the following: # 1. Defines placeholders that TensorFlow serving will feed with inference requests # 2. Preprocess input data # 3.
Returns a tf.estimator.export.ServingInputReceiver object, which packages the placeholders + # 3. Returns a tf.estimator.export.ServingInputReceiver object, which packages the placeholders and the resulting feature Tensors together. Example of a complete ``serving_input_fn`` '''''''''''''''''''''''''''''''''''''''''' @@ -943,7 +944,7 @@ More examples on how to create a TensorFlow training script can be find in the ` Support for pre-made ``tf.estimator`` and ``Keras`` models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In addition to ``model_fn``, ``sagemaker.tensorflow.TensorFlow`` supports pre-canned ``tf.estimator`` +In addition to ``model_fn``, ``sagemaker.tensorflow.TensorFlow`` supports pre-canned ``tf.estimator`` and ``Keras`` models. Using a pre-made ``tensorflow.estimator`` instead of a ``model_fn`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -952,20 +953,20 @@ Using a pre-made ``tensorflow.estimator`` instead of a ``model_fn`` Pre-canned estimators are machine learning estimators premade for general purpose problems. ``tf.estimator`` provides the following pre-canned estimators: -- `tf.estimator.LinearClassifier `_: Constructs +- `tf.estimator.LinearClassifier `_: Constructs a linear classification model. -- `tf.estimator.LinearRegressor `_: Constructs +- `tf.estimator.LinearRegressor `_: Constructs a linear regression model. -- `tf.estimator.DNNClassifier `_: Constructs +- `tf.estimator.DNNClassifier `_: Constructs a neural network classification model. -- `tf.estimator.DNNRegressor `_: Construct +- `tf.estimator.DNNRegressor `_: Constructs a neural network regression model. -- `tf.estimator.DNNLinearCombinedClassifier `_: Constructs +- `tf.estimator.DNNLinearCombinedClassifier `_: Constructs a neural network and linear combined classification model. -- `tf.estimator.DNNLinearCombinedRegressor `_: Constructs +- `tf.estimator.DNNLinearCombinedRegressor `_: Constructs a neural network and linear combined regression model. -To use a pre-canned ``tensorflow.estimator`` instead of creating a ``model_fn``, you need to write a ``estimator_fn``. +To use a pre-canned ``tensorflow.estimator`` instead of creating a ``model_fn``, you need to write an ``estimator_fn``. The base skeleton for the ``estimator_fn`` looks like this: .. code:: python @@ -998,7 +999,7 @@ An example on how to create a TensorFlow training script with an ``estimator_fn` Using a ``Keras`` model instead of a ``model_fn`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``tf.keras`` is an full implementation inside TensorFlow of the Keras API. To use a ``tf.keras`` +``tf.keras`` is a full implementation inside TensorFlow of the Keras API. To use a ``tf.keras`` model for training instead of ``model_fn``, you need to write a ``keras_model_fn``. The base skeleton of a ``keras_model_fn`` looks like this: .. code:: python @@ -1041,7 +1042,7 @@ The following code sample shows how to train a custom TensorFlow script 'tf-trai from sagemaker.tensorflow import TensorFlow - tf_estimator = TensorFlow('tf-train.py', role='SageMakerRole', + tf_estimator = TensorFlow(entry_point='tf-train.py', role='SageMakerRole', training_steps=10000, evaluation_steps=100, train_instance_count=1, train_instance_type='ml.p2.xlarge') tf_estimator.fit('s3://bucket/path/to/training/data') @@ -1054,7 +1055,7 @@ The ``TensorFlow`` constructor takes both required and optional arguments. Required arguments '''''''''''''''''' -The following are required arguments to the TensorFlow constructor.
- ``entry_point (str)`` Path (absolute or relative) to the Python file which should be executed as the entry point to training. @@ -1067,7 +1068,7 @@ The following are required arguments to the TensorFlow constructor. training. - ``train_instance_type (str)`` Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'. -- ``training_steps (int)`` Perform this many steps of training. ``None``, means train forever. +- ``training_steps (int)`` Perform this many steps of training. ``None`` means train forever. - ``evaluation_steps (int)`` Perform this many steps of evaluation. ``None`` means that evaluation runs until input from ``eval_input_fn`` is exhausted (or another exception is raised). @@ -1083,7 +1084,7 @@ you can specify these as keyword arguments. on SageMaker. - ``hyperparameters (dict[str,ANY])`` Hyperparameters that will be used for training. Will be made accessible as a dict[] to the training code on - SageMaker. Some hyperparameters will be interpreted by TensorFlow and can be use to + SageMaker. Some hyperparameters will be interpreted by TensorFlow and can be used to fine tune training. See `Optional Hyperparameters <#optional-hyperparameters>`_. - ``train_volume_size (int)`` Size in GB of the EBS volume to use for storing input data during training. Must be large enough to store the training @@ -1095,10 +1096,10 @@ you can specify these as keyword arguments. are stored to a default bucket. If the bucket with the specific name does not exist, the estimator creates the bucket during the ``fit`` method execution. -- ``checkpoint_path`` S3 location where checkpoint data will saved and restored. - The default location is *bucket_name/job_name/checkpoint*. If the location - already has checkpoints before the training starts, the model will restore - state from the last saved checkpoint. It is very useful to restart a training. +- ``checkpoint_path`` S3 location where checkpoint data will be saved and restored. + The default location is *bucket_name/job_name/checkpoint*. If the location + already has checkpoints before the training starts, the model will restore + state from the last saved checkpoint. This is very useful for restarting training. See `Restoring from checkpoints <#restoring-from-checkpoints>`_. - ``output_kms_key`` Optional KMS key ID to optionally encrypt training output with. @@ -1110,8 +1111,8 @@ you can specify these as keyword arguments. Optional Hyperparameters '''''''''''''''''''''''' -These hyperparameters are used by TensorFlow to fine tune the training. -You need to add them inside the hyperparameters dictionary in the +These hyperparameters are used by TensorFlow to fine tune the training. +You need to add them inside the hyperparameters dictionary in the ``TensorFlow`` estimator constructor. - ``save_summary_steps (int)`` Save summaries every this many steps. @@ -1138,10 +1139,10 @@ both required and optional arguments. Required argument ''''''''''''''''' -- ``inputs (str)``: A S3 URI, for example ``s3://my-bucket/my-training-data``, which contains +- ``inputs (str)``: An S3 URI, for example ``s3://my-bucket/my-training-data``, which contains the dataset that will be used for training. When the training job starts in SageMaker the container will download the dataset. Both ``train_input_fn`` and ``eval_input_fn`` functions - have a parameter called ``training_dir`` which contains the directory inside the container + have a parameter called ``training_dir`` which contains the directory inside the container where the dataset was saved.
See `Creating train_input_fn and eval_input_fn functions`_. Optional arguments '''''''''''''''''' - ``wait (bool)``: Defaults to True, whether to block until the training script completes before returning. - ``logs (bool)``: Defaults to True, whether to show logs produced by the training job in the Python session. Only meaningful when wait is True. -- ``run_tensorboard_locally (bool)``: Defaults to False. Executes TensorBoard in a different - process with downloaded checkpoint information. Requires modules TensorBoard and AWS CLI. +- ``run_tensorboard_locally (bool)``: Defaults to False. Executes TensorBoard in a different + process with downloaded checkpoint information. Requires the TensorBoard and AWS CLI modules to be installed. Terminates TensorBoard when the execution ends. See `Running TensorBoard`_. - ``job_name (str)``: Training job name. If not specified, the estimator generates a default job name, based on the training image name and current timestamp. @@ -1168,15 +1169,15 @@ Calling ``fit`` starts a SageMaker training job. The training job will execute t - starts a Docker container optimized for TensorFlow, see `SageMaker TensorFlow Docker containers`_. - downloads the dataset. - sets up distributed training. - - starts asynchronous training, executing the ``model_fn`` function defined in your script - in **training** mode; i.e., ``features`` and ``labels`` are fed by a batch of the + - starts asynchronous training, executing the ``model_fn`` function defined in your script + in **training** mode; i.e., ``features`` and ``labels`` are fed by a batch of the training dataset defined by ``train_input_fn``. See `Creating train_input_fn and eval_input_fn functions`_. The training job finishes when the number of training steps reaches the value defined by the ``TensorFlow`` estimator parameter ``training_steps`` or when the training job execution time reaches the ``TensorFlow`` estimator parameter ``train_max_run``. -When the training job finishes, a `TensorFlow serving `_ +When the training job finishes, a `TensorFlow serving `_ model with the result of the training is generated and saved to the S3 location defined by the ``TensorFlow`` estimator parameter ``output_path``. @@ -1188,7 +1189,7 @@ During the training job, the first EC2 instance that is executing the training i All instances execute the training loop, feeding the ``model_fn`` with ``train_input_fn``. Every ``min_eval_frequency`` steps (see `Optional Hyperparameters`_), the ``master`` instance will execute the ``model_fn`` in **evaluation** mode; i.e., ``features`` and ``labels`` are -fed with the evaluation dataset defined by ``eval_input_fn``. See `Creating train_input_fn and eval_input_fn functions`_. +fed with the evaluation dataset defined by ``eval_input_fn``. See `Creating train_input_fn and eval_input_fn functions`_. For more information on the training and evaluation process, see `tf.estimator.train_and_evaluate `_. @@ -1197,20 +1198,20 @@ For more information on fit, see `SageMaker Python SDK Overview <#sagemaker-pyth TensorFlow serving models ^^^^^^^^^^^^^^^^^^^^^^^^^ -After your training job is complete in SageMaker and the ``fit`` call ends, the training job -will generate a `TensorFlow serving `_ -model ready for deployment. Your TensorFlow serving model will be available in the S3 location -``output_path`` that you specified when you created your `sagemaker.tensorflow.TensorFlow` +After your training job is complete in SageMaker and the ``fit`` call ends, the training job +will generate a `TensorFlow serving `_ +model ready for deployment.
Your TensorFlow serving model will be available in the S3 location +``output_path`` that you specified when you created your ``sagemaker.tensorflow.TensorFlow`` +estimator. Restoring from checkpoints ^^^^^^^^^^^^^^^^^^^^^^^^^^ While your training job is executing, TensorFlow will generate checkpoints and save them in the S3 -location defined by ``checkpoint_path`` parameter in the ``TensorFlow`` constructor. +location defined by the ``checkpoint_path`` parameter in the ``TensorFlow`` constructor. These checkpoints can be used to restore a previous session or to evaluate the current training using ``TensorBoard``. -To restore a previous session, you just need to create a new ``sagemaker.tensorflow.TensorFlow`` +To restore a previous session, you just need to create a new ``sagemaker.tensorflow.TensorFlow`` estimator pointing to the previous checkpoint path: .. code:: python @@ -1227,20 +1228,20 @@ estimator pointing to the previous checkpoint path: Running TensorBoard ^^^^^^^^^^^^^^^^^^^ -When the ``fit`` parameter ``run_tensorboard_locally`` is set ``True``, all the checkpoint data -located in ``checkpoint_path`` will be downloaded to a local temporary folder and a local -``TensorBoard`` application will be watching that temporary folder. +When the ``fit`` parameter ``run_tensorboard_locally`` is set to ``True``, all the checkpoint data +located in ``checkpoint_path`` will be downloaded to a local temporary folder and a local +``TensorBoard`` application will be watching that temporary folder. Every time a new checkpoint is created by the training job in the S3 bucket, ``fit`` will download that checkpoint to the same temporary folder and update ``TensorBoard``. -When the ``fit`` method starts the training, it will log the port that ``TensorBoard`` is using -to display metrics. The default port is **6006**, but another port can be chosen depending on +When the ``fit`` method starts the training, it will log the port that ``TensorBoard`` is using +to display metrics. The default port is **6006**, but another port can be chosen depending on availability. The port number will increase until it finds an available port. After that, the port number will be printed in stdout. It takes a few minutes to provision containers and start the training job. TensorBoard will start to display metrics shortly after that. -You can access TensorBoard locally at http://localhost:6006 or using your SakeMaker workspace at -`https*workspace_base_url*proxy/6006/ `_ (TensorBoard will not work if you forget to put the slash, +You can access TensorBoard locally at http://localhost:6006 or using your SageMaker workspace at +`https*workspace_base_url*proxy/6006/ `_ (TensorBoard will not work if you forget to put the slash, '/', at the end of the URL). If TensorBoard started on a different port, adjust these URLs to match. @@ -1249,7 +1250,7 @@ Deploying TensorFlow Serving models After a ``TensorFlow`` Estimator has been fit, it saves a ``TensorFlow Serving`` model in the S3 location defined by ``output_path``. You can call ``deploy`` on a ``TensorFlow`` -estimator to create a SageMaker Endpoint. +estimator to create a SageMaker Endpoint. A common usage of the ``deploy`` method, after the ``TensorFlow`` estimator has been fit, looks like this: .. code:: python estimator = TensorFlow(entry_point='tf-train.py', ..., train_instance_count=1, train_instance_type='ml.c4.xlarge') estimator.fit(inputs) predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge') -The code block above deploys a SageMaker Endpoint with one instance of the type 'ml.c4.xlarge'.
+The code block above deploys a SageMaker Endpoint with one instance of the type 'ml.c4.xlarge'. What happens when deploy is called ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1299,18 +1300,18 @@ The following code adds a prediction request to the previous code example: result = predictor.predict([6.4, 3.2, 4.5, 1.5]) -The ``predictor.predict`` method call takes one parameter, the input ``data`` for which you want the ``SageMaker Endpoint`` +The ``predictor.predict`` method call takes one parameter, the input ``data`` for which you want the ``SageMaker Endpoint`` to provide inference. ``predict`` will serialize the input data, and send it as a request to the ``SageMaker Endpoint`` by an ``InvokeEndpoint`` SageMaker operation. ``InvokeEndpoint`` operation requests can be made by ``predictor.predict``, by -boto3 ``SageMaker.runtime`` client or by AWS CLI. +the boto3 ``SageMaker.runtime`` client, or by the AWS CLI. -The ``SageMaker Endpoint`` web server will process the request, make an inference using the deployed model, and return a response. +The ``SageMaker Endpoint`` web server will process the request, make an inference using the deployed model, and return a response. The ``result`` returned by ``predict`` is a Python dictionary with the model prediction. In the code example above, the prediction ``result`` looks like this: .. code:: python - {'result': + {'result': {'classifications': [ {'classes': [ {'label': '0', 'score': 0.0012890376383438706}, @@ -1323,8 +1324,8 @@ a Python dictionary with the model prediction. In the code example above, the pr Specifying the output of a prediction request ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The format of the prediction ``result`` is determined by the parameter ``export_outputs`` of the `tf.estimator.EstimatorSpec `_ that you returned when you created your ``model_fn``, see -`Example of a complete model_fn`_ for an example of ``export_outputs``. +The format of the prediction ``result`` is determined by the parameter ``export_outputs`` of the `tf.estimator.EstimatorSpec `_ that you returned when you created your ``model_fn``, see +`Example of a complete model_fn`_ for an example of ``export_outputs``. More information on how to create ``export_outputs`` can be found in `specifying the outputs of a custom model `_. @@ -1373,7 +1374,7 @@ An example of ``input_fn`` for the content-type "application/python-pickle" can else: # Handle other content-types here or raise an Exception # if the content type is not supported. - pass + pass Overriding output processing with an ``output_fn`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1391,7 +1392,7 @@ An example of ``output_fn`` for the accept type "application/python-pickle" can else: # Handle other content-types here or raise an Exception # if the content type is not supported. - pass + pass An example with ``input_fn`` and ``output_fn`` above can be found `here `_. @@ -1423,7 +1424,7 @@ The Docker images extend Ubuntu 16.04. AWS SageMaker Estimators ------------------------ -Amazon SageMaker provides several built-in machine learning algorithms that you can use for a variety of problem types.
The full list of algorithms is available on the AWS website: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html @@ -1524,6 +1525,36 @@ Example code using the TensorFlow predictor: :: from sagemaker.tensorflow import TensorFlowPredictor - + predictor = TensorFlowPredictor('myexistingendpoint') result = predictor.predict(['my request body']) + + +BYO Model +----------------------------------------------- +You can also create an endpoint from an existing model rather than training one - i.e. bring your own model. + +First, package the files for the trained model into a ``.tar.gz`` file, and upload the archive to S3. + +Next, create a ``Model`` object that corresponds to the framework that you are using: `MXNetModel `__ or `TensorFlowModel `__. + +Example code using ``MXNetModel``: + +.. code:: python + + from sagemaker.mxnet.model import MXNetModel + + sagemaker_model = MXNetModel(model_data='s3://path/to/model.tar.gz', + role='arn:aws:iam::accid:sagemaker-role', + entry_point='entry_point.py') + +After that, invoke the ``deploy()`` method on the ``Model``: + +.. code:: python + + predictor = sagemaker_model.deploy(initial_instance_count=1, + instance_type='ml.m4.xlarge') + +This returns a predictor the same way an ``Estimator`` does when ``deploy()`` is called. You can now get inferences just like with any other model deployed on Amazon SageMaker. + +A full example is available in the `Amazon SageMaker examples repository `__. diff --git a/examples/cli/host/data/model.json b/examples/cli/host/data/model.json new file mode 100644 index 0000000000..8d0d57a51e --- /dev/null +++ b/examples/cli/host/data/model.json @@ -0,0 +1,126 @@ +{ + "nodes": [ + { + "op": "null", + "name": "data", + "inputs": [] + }, + { + "op": "null", + "name": "sequential0_dense0_weight", + "attr": { + "__dtype__": "0", + "__lr_mult__": "1.0", + "__shape__": "(128, 0)", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "sequential0_dense0_bias", + "attr": { + "__dtype__": "0", + "__init__": "zeros", + "__lr_mult__": "1.0", + "__shape__": "(128,)", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "FullyConnected", + "name": "sequential0_dense0_fwd", + "attr": {"num_hidden": "128"}, + "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] + }, + { + "op": "Activation", + "name": "sequential0_dense0_relu_fwd", + "attr": {"act_type": "relu"}, + "inputs": [[3, 0, 0]] + }, + { + "op": "null", + "name": "sequential0_dense1_weight", + "attr": { + "__dtype__": "0", + "__lr_mult__": "1.0", + "__shape__": "(64, 0)", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "sequential0_dense1_bias", + "attr": { + "__dtype__": "0", + "__init__": "zeros", + "__lr_mult__": "1.0", + "__shape__": "(64,)", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "FullyConnected", + "name": "sequential0_dense1_fwd", + "attr": {"num_hidden": "64"}, + "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0]] + }, + { + "op": "Activation", + "name": "sequential0_dense1_relu_fwd", + "attr": {"act_type": "relu"}, + "inputs": [[7, 0, 0]] + }, + { + "op": "null", + "name": "sequential0_dense2_weight", + "attr": { + "__dtype__": "0", + "__lr_mult__": "1.0", + "__shape__": "(10, 0)", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "sequential0_dense2_bias", + "attr": { + "__dtype__": "0", + "__init__": "zeros", + "__lr_mult__": "1.0", + "__shape__": "(10,)", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "FullyConnected", + "name": "sequential0_dense2_fwd", + 
"attr": {"num_hidden": "10"}, + "inputs": [[8, 0, 0], [9, 0, 0], [10, 0, 0]] + } + ], + "arg_nodes": [0, 1, 2, 5, 6, 9, 10], + "node_row_ptr": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12 + ], + "heads": [[11, 0, 0]], + "attrs": {"mxnet_version": ["int", 1100]} +} \ No newline at end of file diff --git a/examples/cli/host/data/model.params b/examples/cli/host/data/model.params new file mode 100644 index 0000000000..3757d543c8 Binary files /dev/null and b/examples/cli/host/data/model.params differ diff --git a/examples/cli/host/run_hosting_example.sh b/examples/cli/host/run_hosting_example.sh new file mode 100644 index 0000000000..b6d7e92d4d --- /dev/null +++ b/examples/cli/host/run_hosting_example.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sagemaker mxnet host --role-name diff --git a/examples/cli/host/script.py b/examples/cli/host/script.py new file mode 100644 index 0000000000..f3775b4310 --- /dev/null +++ b/examples/cli/host/script.py @@ -0,0 +1,41 @@ +from __future__ import print_function + +import json +import mxnet as mx +from mxnet import gluon + + +def model_fn(model_dir): + """ + Load the gluon model. Called once when hosting service starts. + + :param: model_dir The directory where model files are stored. + :return: a model (in this case a Gluon network) + """ + symbol = mx.sym.load('%s/model.json' % model_dir) + outputs = mx.symbol.softmax(data=symbol, name='softmax_label') + inputs = mx.sym.var('data') + param_dict = gluon.ParameterDict('model_') + net = gluon.SymbolBlock(outputs, inputs, param_dict) + net.load_params('%s/model.params' % model_dir, ctx=mx.cpu()) + return net + + +def transform_fn(net, data, input_content_type, output_content_type): + """ + Transform a request using the Gluon model. Called once per request. + + :param net: The Gluon model. + :param data: The request payload. + :param input_content_type: The request content type. + :param output_content_type: The (desired) response content type. + :return: response payload and content type. 
+ """ + # we can use content types to vary input/output handling, but + # here we just assume json for both + parsed = json.loads(data) + nda = mx.nd.array(parsed) + output = net(nda) + prediction = mx.nd.argmax(output, axis=1) + response_body = json.dumps(prediction.asnumpy().tolist()) + return response_body, output_content_type diff --git a/examples/cli/train/data/training/t10k-images-idx3-ubyte.gz b/examples/cli/train/data/training/t10k-images-idx3-ubyte.gz new file mode 100644 index 0000000000..5ace8ea93f Binary files /dev/null and b/examples/cli/train/data/training/t10k-images-idx3-ubyte.gz differ diff --git a/examples/cli/train/data/training/t10k-labels-idx1-ubyte.gz b/examples/cli/train/data/training/t10k-labels-idx1-ubyte.gz new file mode 100644 index 0000000000..a7e141541c Binary files /dev/null and b/examples/cli/train/data/training/t10k-labels-idx1-ubyte.gz differ diff --git a/examples/cli/train/data/training/train-images-idx3-ubyte.gz b/examples/cli/train/data/training/train-images-idx3-ubyte.gz new file mode 100644 index 0000000000..b50e4b6bcc Binary files /dev/null and b/examples/cli/train/data/training/train-images-idx3-ubyte.gz differ diff --git a/examples/cli/train/data/training/train-labels-idx1-ubyte.gz b/examples/cli/train/data/training/train-labels-idx1-ubyte.gz new file mode 100644 index 0000000000..707a576bb5 Binary files /dev/null and b/examples/cli/train/data/training/train-labels-idx1-ubyte.gz differ diff --git a/examples/cli/train/download_training_data.py b/examples/cli/train/download_training_data.py new file mode 100644 index 0000000000..eb33996904 --- /dev/null +++ b/examples/cli/train/download_training_data.py @@ -0,0 +1,10 @@ +from mxnet import gluon + + +def download_training_data(): + gluon.data.vision.MNIST('./data/training', train=True) + gluon.data.vision.MNIST('./data/training', train=False) + + +if __name__ == "__main__": + download_training_data() diff --git a/examples/cli/train/hyperparameters.json b/examples/cli/train/hyperparameters.json new file mode 100644 index 0000000000..01c3269250 --- /dev/null +++ b/examples/cli/train/hyperparameters.json @@ -0,0 +1,7 @@ +{ + "batch_size": 100, + "epochs": 10, + "learning_rate": 0.1, + "momentum": 0.9, + "log_interval": 100 +} diff --git a/examples/cli/train/run_training_example.sh b/examples/cli/train/run_training_example.sh new file mode 100755 index 0000000000..10176920d4 --- /dev/null +++ b/examples/cli/train/run_training_example.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +python ./download_training_data.py +sagemaker mxnet train --role-name diff --git a/examples/cli/train/script.py b/examples/cli/train/script.py new file mode 100644 index 0000000000..a219548fcc --- /dev/null +++ b/examples/cli/train/script.py @@ -0,0 +1,118 @@ +import logging +import time + +import mxnet as mx +import numpy as np +from mxnet import gluon, autograd +from mxnet.gluon import nn + +logger = logging.getLogger(__name__) + + +def train(channel_input_dirs, hyperparameters, **kwargs): + # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to + # the current container environment, but here we just use simple cpu context. 
+ ctx = mx.cpu() + + # retrieve the hyperparameters we set in the notebook (with some defaults) + batch_size = hyperparameters.get('batch_size', 100) + epochs = hyperparameters.get('epochs', 10) + learning_rate = hyperparameters.get('learning_rate', 0.1) + momentum = hyperparameters.get('momentum', 0.9) + log_interval = hyperparameters.get('log_interval', 100) + + training_data = channel_input_dirs['training'] + + # load training and validation data + # we use the gluon.data.vision.MNIST class because of its built-in MNIST pre-processing logic, + # but point it at the location where SageMaker placed the data files, so it doesn't download them again. + train_data = get_train_data(training_data, batch_size) + val_data = get_val_data(training_data, batch_size) + + # define the network + net = define_network() + + # Collect all parameters from net and its children, then initialize them. + net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) + # Trainer is for updating parameters with gradient. + trainer = gluon.Trainer(net.collect_params(), 'sgd', + {'learning_rate': learning_rate, 'momentum': momentum}) + metric = mx.metric.Accuracy() + loss = gluon.loss.SoftmaxCrossEntropyLoss() + + for epoch in range(epochs): + # reset data iterator and metric at the beginning of each epoch. + metric.reset() + btic = time.time() + for i, (data, label) in enumerate(train_data): + # Copy data to ctx if necessary + data = data.as_in_context(ctx) + label = label.as_in_context(ctx) + # Start recording computation graph with record() section. + # Recorded graphs can then be differentiated with backward. + with autograd.record(): + output = net(data) + L = loss(output, label) + L.backward() + # take a gradient step with batch_size equal to data.shape[0] + trainer.step(data.shape[0]) + # finally, update the metric.
+ metric.update([label], [output]) + + if i % log_interval == 0 and i > 0: + name, acc = metric.get() + logger.info('[Epoch %d Batch %d] Training: %s=%f, %f samples/s' % + (epoch, i, name, acc, batch_size / (time.time() - btic))) + + btic = time.time() + + name, acc = metric.get() + logger.info('[Epoch %d] Training: %s=%f' % (epoch, name, acc)) + + name, val_acc = test(ctx, net, val_data) + logger.info('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc)) + + return net + + +def save(net, model_dir): + # save the model + y = net(mx.sym.var('data')) + y.save('%s/model.json' % model_dir) + net.collect_params().save('%s/model.params' % model_dir) + + +def define_network(): + net = nn.Sequential() + with net.name_scope(): + net.add(nn.Dense(128, activation='relu')) + net.add(nn.Dense(64, activation='relu')) + net.add(nn.Dense(10)) + return net + + +def input_transformer(data, label): + data = data.reshape((-1,)).astype(np.float32) / 255 + return data, label + + +def get_train_data(data_dir, batch_size): + return gluon.data.DataLoader( + gluon.data.vision.MNIST(data_dir, train=True, transform=input_transformer), + batch_size=batch_size, shuffle=True, last_batch='discard') + + +def get_val_data(data_dir, batch_size): + return gluon.data.DataLoader( + gluon.data.vision.MNIST(data_dir, train=False, transform=input_transformer), + batch_size=batch_size, shuffle=False) + + +def test(ctx, net, val_data): + metric = mx.metric.Accuracy() + for data, label in val_data: + data = data.as_in_context(ctx) + label = label.as_in_context(ctx) + output = net(data) + metric.update([label], [output]) + return metric.get() diff --git a/setup.py b/setup.py index fa81d4aaeb..6b048f83fc 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,10 @@ import os -from setuptools import setup, find_packages from glob import glob from os.path import basename from os.path import splitext +from setuptools import setup, find_packages + def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() @@ -36,4 +37,8 @@ def read(fname): extras_require={ 'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock', 'tensorflow>=1.3.0', 'contextlib2']}, + + entry_points={ + 'console_scripts': ['sagemaker=sagemaker.cli.main:main'], + } ) diff --git a/src/sagemaker/amazon/amazon_estimator.py b/src/sagemaker/amazon/amazon_estimator.py index 9ed28c1894..9fbdb8b631 100644 --- a/src/sagemaker/amazon/amazon_estimator.py +++ b/src/sagemaker/amazon/amazon_estimator.py @@ -28,8 +28,8 @@ class AmazonAlgorithmEstimatorBase(EstimatorBase): """Base class for Amazon first-party Estimator implementations. This class isn't intended to be instantiated directly.""" - feature_dim = hp('feature_dim', (validation.isint, validation.gt(0))) - mini_batch_size = hp('mini_batch_size', (validation.isint, validation.gt(0))) + feature_dim = hp('feature_dim', validation.gt(0), data_type=int) + mini_batch_size = hp('mini_batch_size', validation.gt(0), data_type=int) def __init__(self, role, train_instance_count, train_instance_type, data_location=None, **kwargs): """Initialize an AmazonAlgorithmEstimatorBase. 
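The practical effect of the ``data_type`` argument threaded through the hunks above and below: a value assigned to a hyperparameter descriptor is now coerced to the declared type before the validation functions run. A minimal sketch of that behavior (the ``Example`` holder class is hypothetical, written only to exercise the descriptor; real code declares these on estimators such as ``KMeans`` later in this diff):

.. code:: python

    from sagemaker.amazon.hyperparameter import Hyperparameter as hp
    from sagemaker.amazon.validation import gt

    class Example(object):
        # hypothetical class: declares `k` the same way KMeans does below
        k = hp('k', gt(1), 'An integer greater-than 1', int)

    obj = Example()
    obj.k = '10'  # '10' is coerced to int(10) by data_type, then validated by gt(1)
    assert obj.k == 10
    obj.k = 1     # raises ValueError: Invalid hyperparameter value 1 for k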
diff --git a/src/sagemaker/amazon/factorization_machines.py b/src/sagemaker/amazon/factorization_machines.py index 5340fa11f0..66972316ac 100644 --- a/src/sagemaker/amazon/factorization_machines.py +++ b/src/sagemaker/amazon/factorization_machines.py @@ -13,7 +13,7 @@ from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa -from sagemaker.amazon.validation import gt, isin, isint, ge, isnumber +from sagemaker.amazon.validation import gt, isin, ge from sagemaker.predictor import RealTimePredictor from sagemaker.model import Model from sagemaker.session import Session @@ -23,34 +23,34 @@ class FactorizationMachines(AmazonAlgorithmEstimatorBase): repo = 'factorization-machines:1' - num_factors = hp('num_factors', (gt(0), isint), 'An integer greater than zero') + num_factors = hp('num_factors', gt(0), 'An integer greater than zero', int) predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'), - 'Value "binary_classifier" or "regressor"') - epochs = hp('epochs', (gt(0), isint), "An integer greater than 0") - clip_gradient = hp('clip_gradient', isnumber, "A float value") - eps = hp('eps', isnumber, "A float value") - rescale_grad = hp('rescale_grad', isnumber, "A float value") - bias_lr = hp('bias_lr', (ge(0), isnumber), "A non-negative float") - linear_lr = hp('linear_lr', (ge(0), isnumber), "A non-negative float") - factors_lr = hp('factors_lr', (ge(0), isnumber), "A non-negative float") - bias_wd = hp('bias_wd', (ge(0), isnumber), "A non-negative float") - linear_wd = hp('linear_wd', (ge(0), isnumber), "A non-negative float") - factors_wd = hp('factors_wd', (ge(0), isnumber), "A non-negative float") + 'Value "binary_classifier" or "regressor"', str) + epochs = hp('epochs', gt(0), "An integer greater than 0", int) + clip_gradient = hp('clip_gradient', (), "A float value", float) + eps = hp('eps', (), "A float value", float) + rescale_grad = hp('rescale_grad', (), "A float value", float) + bias_lr = hp('bias_lr', ge(0), "A non-negative float", float) + linear_lr = hp('linear_lr', ge(0), "A non-negative float", float) + factors_lr = hp('factors_lr', ge(0), "A non-negative float", float) + bias_wd = hp('bias_wd', ge(0), "A non-negative float", float) + linear_wd = hp('linear_wd', ge(0), "A non-negative float", float) + factors_wd = hp('factors_wd', ge(0), "A non-negative float", float) bias_init_method = hp('bias_init_method', isin('normal', 'uniform', 'constant'), - 'Value "normal", "uniform" or "constant"') - bias_init_scale = hp('bias_init_scale', (ge(0), isnumber), "A non-negative float") - bias_init_sigma = hp('bias_init_sigma', (ge(0), isnumber), "A non-negative float") - bias_init_value = hp('bias_init_value', isnumber, "A float value") + 'Value "normal", "uniform" or "constant"', str) + bias_init_scale = hp('bias_init_scale', ge(0), "A non-negative float", float) + bias_init_sigma = hp('bias_init_sigma', ge(0), "A non-negative float", float) + bias_init_value = hp('bias_init_value', (), "A float value", float) linear_init_method = hp('linear_init_method', isin('normal', 'uniform', 'constant'), - 'Value "normal", "uniform" or "constant"') - linear_init_scale = hp('linear_init_scale', (ge(0), isnumber), "A non-negative float") - linear_init_sigma = hp('linear_init_sigma', (ge(0), isnumber), "A non-negative float") - linear_init_value = hp('linear_init_value', isnumber, "A float value") + 
'Value "normal", "uniform" or "constant"', str) + linear_init_scale = hp('linear_init_scale', ge(0), "A non-negative float", float) + linear_init_sigma = hp('linear_init_sigma', ge(0), "A non-negative float", float) + linear_init_value = hp('linear_init_value', (), "A float value", float) factors_init_method = hp('factors_init_method', isin('normal', 'uniform', 'constant'), - 'Value "normal", "uniform" or "constant"') - factors_init_scale = hp('factors_init_scale', (ge(0), isnumber), "A non-negative float") - factors_init_sigma = hp('factors_init_sigma', (ge(0), isnumber), "A non-negative float") - factors_init_value = hp('factors_init_value', isnumber, "A float value") + 'Value "normal", "uniform" or "constant"', str) + factors_init_scale = hp('factors_init_scale', ge(0), "A non-negative float", float) + factors_init_sigma = hp('factors_init_sigma', ge(0), "A non-negative float", float) + factors_init_value = hp('factors_init_value', (), "A float value", float) def __init__(self, role, train_instance_count, train_instance_type, num_factors, predictor_type, diff --git a/src/sagemaker/amazon/hyperparameter.py b/src/sagemaker/amazon/hyperparameter.py index 0d86191474..a3ac76367c 100644 --- a/src/sagemaker/amazon/hyperparameter.py +++ b/src/sagemaker/amazon/hyperparameter.py @@ -16,7 +16,7 @@ class Hyperparameter(object): """An algorithm hyperparameter with optional validation. Implemented as a python descriptor object.""" - def __init__(self, name, validate=lambda _: True, validation_message=""): + def __init__(self, name, validate=lambda _: True, validation_message="", data_type=str): """Args: name (str): The name of this hyperparameter validate (callable[object]->[bool]): A validation function or list of validation functions. @@ -27,6 +27,7 @@ def __init__(self, name, validate=lambda _: True, validation_message=""): self.validation = validate self.validation_message = validation_message self.name = name + self.data_type = data_type try: iter(self.validation) except TypeError: @@ -35,9 +36,10 @@ def __init__(self, name, validate=lambda _: True, validation_message=""): def validate(self, value): if value is None: # We allow assignment from None, but Nones are not sent to training. return + for valid in self.validation: if not valid(value): - error_message = "Invalid hyperparameter value {}".format(value) + error_message = "Invalid hyperparameter value {} for {}".format(value, self.name) if self.validation_message: error_message = error_message + ". 
Expecting: " + self.validation_message raise ValueError(error_message) @@ -50,6 +52,7 @@ def __get__(self, obj, objtype): def __set__(self, obj, value): """Validate the supplied value and set this hyperparameter to value""" + value = None if value is None else self.data_type(value) self.validate(value) if '_hyperparameters' not in dir(obj): obj._hyperparameters = dict() diff --git a/src/sagemaker/amazon/kmeans.py b/src/sagemaker/amazon/kmeans.py index d3fb5d670e..020f496d8f 100644 --- a/src/sagemaker/amazon/kmeans.py +++ b/src/sagemaker/amazon/kmeans.py @@ -13,7 +13,7 @@ from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa -from sagemaker.amazon.validation import gt, isin, isint, ge +from sagemaker.amazon.validation import gt, isin, ge from sagemaker.predictor import RealTimePredictor from sagemaker.model import Model from sagemaker.session import Session @@ -23,15 +23,15 @@ class KMeans(AmazonAlgorithmEstimatorBase): repo = 'kmeans:1' - k = hp('k', (gt(1), isint), 'An integer greater-than 1') - init_method = hp('init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"') - max_iterations = hp('local_lloyd_max_iterations', (gt(0), isint), 'An integer greater-than 0') - tol = hp('local_lloyd_tol', (gt(0), isint), 'An integer greater-than 0') - num_trials = hp('local_lloyd_num_trials', (gt(0), isint), 'An integer greater-than 0') - local_init_method = hp('local_lloyd_init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"') - half_life_time_size = hp('half_life_time_size', (ge(0), isint), 'An integer greater-than-or-equal-to 0') - epochs = hp('epochs', (gt(0), isint), 'An integer greater-than 0') - center_factor = hp('extra_center_factor', (gt(0), isint), 'An integer greater-than 0') + k = hp('k', gt(1), 'An integer greater-than 1', int) + init_method = hp('init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str) + max_iterations = hp('local_lloyd_max_iterations', gt(0), 'An integer greater-than 0', int) + tol = hp('local_lloyd_tol', gt(0), 'An integer greater-than 0', int) + num_trials = hp('local_lloyd_num_trials', gt(0), 'An integer greater-than 0', int) + local_init_method = hp('local_lloyd_init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str) + half_life_time_size = hp('half_life_time_size', ge(0), 'An integer greater-than-or-equal-to 0', int) + epochs = hp('epochs', gt(0), 'An integer greater-than 0', int) + center_factor = hp('extra_center_factor', gt(0), 'An integer greater-than 0', int) def __init__(self, role, train_instance_count, train_instance_type, k, init_method=None, max_iterations=None, tol=None, num_trials=None, local_init_method=None, diff --git a/src/sagemaker/amazon/linear_learner.py b/src/sagemaker/amazon/linear_learner.py index af336a8e6b..d1d9dd6cb8 100644 --- a/src/sagemaker/amazon/linear_learner.py +++ b/src/sagemaker/amazon/linear_learner.py @@ -13,7 +13,7 @@ from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa -from sagemaker.amazon.validation import isin, gt, lt, isint, isbool, isnumber +from sagemaker.amazon.validation import isin, gt, lt from sagemaker.predictor import RealTimePredictor from sagemaker.model import Model 
from sagemaker.session import Session @@ -27,40 +27,41 @@ class LinearLearner(AmazonAlgorithmEstimatorBase): binary_classifier_model_selection_criteria = hp('binary_classifier_model_selection_criteria', isin('accuracy', 'f1', 'precision_at_target_recall', - 'recall_at_target_precision', 'cross_entropy_loss')) - target_recall = hp('target_recall', (gt(0), lt(1)), "A float in (0,1)") - target_precision = hp('target_precision', (gt(0), lt(1)), "A float in (0,1)") - positive_example_weight_mult = hp('positive_example_weight_mult', gt(0), "A float greater than 0") - epochs = hp('epochs', (gt(0), isint), "An integer greater-than 0") + 'recall_at_target_precision', 'cross_entropy_loss'), + data_type=str) + target_recall = hp('target_recall', (gt(0), lt(1)), "A float in (0,1)", float) + target_precision = hp('target_precision', (gt(0), lt(1)), "A float in (0,1)", float) + positive_example_weight_mult = hp('positive_example_weight_mult', gt(0), "A float greater than 0", float) + epochs = hp('epochs', gt(0), "An integer greater-than 0", int) predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'), - 'One of "binary_classifier" or "regressor"') - use_bias = hp('use_bias', isbool, "Either True or False") - num_models = hp('num_models', (gt(0), isint), "An integer greater-than 0") - num_calibration_samples = hp('num_calibration_samples', (gt(0), isint), "An integer greater-than 0") - init_method = hp('init_method', isin('uniform', 'normal'), 'One of "uniform" or "normal"') - init_scale = hp('init_scale', (gt(-1), lt(1)), 'A float in (-1, 1)') - init_sigma = hp('init_sigma', (gt(0), lt(1)), 'A float in (0, 1)') - init_bias = hp('init_bias', isnumber, 'A number') - optimizer = hp('optimizer', isin('sgd', 'adam', 'auto'), 'One of "sgd", "adam" or "auto') + 'One of "binary_classifier" or "regressor"', str) + use_bias = hp('use_bias', (), "Either True or False", bool) + num_models = hp('num_models', gt(0), "An integer greater-than 0", int) + num_calibration_samples = hp('num_calibration_samples', gt(0), "An integer greater-than 0", int) + init_method = hp('init_method', isin('uniform', 'normal'), 'One of "uniform" or "normal"', str) + init_scale = hp('init_scale', (gt(-1), lt(1)), 'A float in (-1, 1)', float) + init_sigma = hp('init_sigma', (gt(0), lt(1)), 'A float in (0, 1)', float) + init_bias = hp('init_bias', (), 'A number', float) + optimizer = hp('optimizer', isin('sgd', 'adam', 'auto'), 'One of "sgd", "adam" or "auto', str) loss = hp('loss', isin('logistic', 'squared_loss', 'absolute_loss', 'auto'), - '"logistic", "squared_loss", "absolute_loss" or"auto"') - wd = hp('wd', (gt(0), lt(1)), 'A float in (0,1)') - l1 = hp('l1', (gt(0), lt(1)), 'A float in (0,1)') - momentum = hp('momentum', (gt(0), lt(1)), 'A float in (0,1)') - learning_rate = hp('learning_rate', (gt(0), lt(1)), 'A float in (0,1)') - beta_1 = hp('beta_1', (gt(0), lt(1)), 'A float in (0,1)') - beta_2 = hp('beta_1', (gt(0), lt(1)), 'A float in (0,1)') - bias_lr_mult = hp('bias_lr_mult', gt(0), 'A float greater-than 0') - bias_wd_mult = hp('bias_wd_mult', gt(0), 'A float greater-than 0') - use_lr_scheduler = hp('use_lr_scheduler', isbool, 'A boolean') - lr_scheduler_step = hp('lr_scheduler_step', (gt(0), isint), 'An integer greater-than 0') - lr_scheduler_factor = hp('lr_scheduler_factor', (gt(0), lt(1)), 'A float in (0,1)') - lr_scheduler_minimum_lr = hp('lr_scheduler_minimum_lr', gt(0), 'A float greater-than 0') - normalize_data = hp('normalize_data', isbool, 'A boolean') - normalize_label = hp('normalize_label', 
isbool, 'A boolean') - unbias_data = hp('unbias_data', isbool, 'A boolean') - unbias_label = hp('unbias_label', isbool, 'A boolean') - num_point_for_scalar = hp('num_point_for_scalar', (isint, gt(0)), 'An integer greater-than 0') + '"logistic", "squared_loss", "absolute_loss" or"auto"', str) + wd = hp('wd', (gt(0), lt(1)), 'A float in (0,1)', float) + l1 = hp('l1', (gt(0), lt(1)), 'A float in (0,1)', float) + momentum = hp('momentum', (gt(0), lt(1)), 'A float in (0,1)', float) + learning_rate = hp('learning_rate', (gt(0), lt(1)), 'A float in (0,1)', float) + beta_1 = hp('beta_1', (gt(0), lt(1)), 'A float in (0,1)', float) + beta_2 = hp('beta_1', (gt(0), lt(1)), 'A float in (0,1)', float) + bias_lr_mult = hp('bias_lr_mult', gt(0), 'A float greater-than 0', float) + bias_wd_mult = hp('bias_wd_mult', gt(0), 'A float greater-than 0', float) + use_lr_scheduler = hp('use_lr_scheduler', (), 'A boolean', bool) + lr_scheduler_step = hp('lr_scheduler_step', gt(0), 'An integer greater-than 0', int) + lr_scheduler_factor = hp('lr_scheduler_factor', (gt(0), lt(1)), 'A float in (0,1)', float) + lr_scheduler_minimum_lr = hp('lr_scheduler_minimum_lr', gt(0), 'A float greater-than 0', float) + normalize_data = hp('normalize_data', (), 'A boolean', bool) + normalize_label = hp('normalize_label', (), 'A boolean', bool) + unbias_data = hp('unbias_data', (), 'A boolean', bool) + unbias_label = hp('unbias_label', (), 'A boolean', bool) + num_point_for_scalar = hp('num_point_for_scalar', gt(0), 'An integer greater-than 0', int) def __init__(self, role, train_instance_count, train_instance_type, predictor_type='binary_classifier', binary_classifier_model_selection_criteria=None, target_recall=None, target_precision=None, diff --git a/src/sagemaker/amazon/pca.py b/src/sagemaker/amazon/pca.py index 7a23f60c7c..48f99c04d1 100644 --- a/src/sagemaker/amazon/pca.py +++ b/src/sagemaker/amazon/pca.py @@ -24,14 +24,13 @@ class PCA(AmazonAlgorithmEstimatorBase): DEFAULT_MINI_BATCH_SIZE = 500 - num_components = hp(name='num_components', validate=lambda x: x > 0 and isinstance(x, int), - validation_message='Value must be an integer greater than zero') + num_components = hp(name='num_components', validate=lambda x: x > 0, + validation_message='Value must be an integer greater than zero', data_type=int) algorithm_mode = hp(name='algorithm_mode', validate=lambda x: x in ['regular', 'stable', 'randomized'], - validation_message='Value must be one of "regular", "stable", "randomized"') - subtract_mean = hp(name='subtract_mean', validate=lambda x: isinstance(x, bool), - validation_message='Value must be a boolean') - extra_components = hp(name='extra_components', validate=lambda x: x >= 0 and isinstance(x, int), - validation_message="Value must be an integer greater than or equal to 0") + validation_message='Value must be one of "regular", "stable", "randomized"', data_type=str) + subtract_mean = hp(name='subtract_mean', validation_message='Value must be a boolean', data_type=bool) + extra_components = hp(name='extra_components', validate=lambda x: x >= 0, + validation_message="Value must be an integer greater than or equal to 0", data_type=int) def __init__(self, role, train_instance_count, train_instance_type, num_components, algorithm_mode=None, subtract_mean=None, extra_components=None, **kwargs): diff --git a/src/sagemaker/amazon/validation.py b/src/sagemaker/amazon/validation.py index ff3259be8f..ede48cc9b3 100644 --- a/src/sagemaker/amazon/validation.py +++ b/src/sagemaker/amazon/validation.py @@ -10,7 +10,6 @@ # 
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
-import numbers


 def gt(minimum):
@@ -41,8 +40,3 @@ def istype(expected):
     def validate(value):
         return isinstance(value, expected)
     return validate
-
-
-isint = istype(int)
-isbool = istype(bool)
-isnumber = istype(numbers.Number)  # noqa
diff --git a/src/sagemaker/cli/__init__.py b/src/sagemaker/cli/__init__.py
new file mode 100644
index 0000000000..4d78ed0e5c
--- /dev/null
+++ b/src/sagemaker/cli/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
diff --git a/src/sagemaker/cli/common.py b/src/sagemaker/cli/common.py
new file mode 100644
index 0000000000..80f6fe07ca
--- /dev/null
+++ b/src/sagemaker/cli/common.py
@@ -0,0 +1,112 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import json
+import logging
+import os
+import shutil
+import tarfile
+import tempfile
+
+import sagemaker
+
+logger = logging.getLogger(__name__)
+
+
+class HostCommand(object):
+    def __init__(self, args):
+        self.endpoint_name = args.job_name
+        self.bucket = args.bucket_name  # may be None
+        self.role_name = args.role_name
+        self.python = args.python
+        self.data = args.data
+        self.script = args.script
+        self.instance_type = args.instance_type
+        self.instance_count = args.instance_count
+        self.environment = {k: v for k, v in (kv.split('=') for kv in args.env)}
+
+        self.session = sagemaker.Session()
+
+    def upload_model(self):
+        prefix = '{}/model'.format(self.endpoint_name)
+
+        archive = self.create_model_archive(self.data)
+        model_uri = self.session.upload_data(path=archive, bucket=self.bucket, key_prefix=prefix)
+        shutil.rmtree(os.path.dirname(archive))
+
+        return model_uri
+
+    @staticmethod
+    def create_model_archive(src):
+        if os.path.isdir(src):
+            arcname = '.'
+        else:
+            arcname = os.path.basename(src)
+
+        tmp = tempfile.mkdtemp()
+        archive = os.path.join(tmp, 'model.tar.gz')
+
+        with tarfile.open(archive, mode='w:gz') as t:
+            t.add(src, arcname=arcname)
+        return archive
+
+    def create_model(self, model_url):
+        raise NotImplementedError  # subclasses must override
+
+    def start(self):
+        model_url = self.upload_model()
+        model = self.create_model(model_url)
+        predictor = model.deploy(initial_instance_count=self.instance_count,
+                                 instance_type=self.instance_type)
+
+        return predictor
+
+
+class TrainCommand(object):
+    def __init__(self, args):
+        self.job_name = args.job_name
+        self.bucket = args.bucket_name  # may be None
+        self.role_name = args.role_name
+        self.python = args.python
+        self.data = args.data
+        self.script = args.script
+        self.instance_type = args.instance_type
+        self.instance_count = args.instance_count
+        self.hyperparameters = self.load_hyperparameters(args.hyperparameters)
+
+        self.session = sagemaker.Session()
+
+    @staticmethod
+    def load_hyperparameters(src):
+        hp = {}
+        if src and os.path.exists(src):
+            with open(src, 'r') as f:
+                hp = json.load(f)
+        return hp
+
+    def upload_training_data(self):
+        prefix = '{}/data'.format(self.job_name)
+        data_url = self.session.upload_data(path=self.data, bucket=self.bucket, key_prefix=prefix)
+        return data_url
+
+    def create_estimator(self):
+        raise NotImplementedError  # subclasses must override
+
+    def start(self):
+        data_url = self.upload_training_data()
+        estimator = self.create_estimator()
+        estimator.fit(data_url)
+        logger.debug('code location: {}'.format(estimator.uploaded_code.s3_prefix))
+        logger.debug('model location: {}{}/output/model.tar.gz'.format(estimator.output_path,
+                                                                       estimator._current_job_name))
diff --git a/src/sagemaker/cli/main.py b/src/sagemaker/cli/main.py
new file mode 100644
index 0000000000..5306757a0f
--- /dev/null
+++ b/src/sagemaker/cli/main.py
@@ -0,0 +1,110 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
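+
+# Argument parsing and dispatch for the SageMaker CLI. Subcommands pick a
+# framework ('mxnet' or 'tensorflow') and an action ('train' or 'host');
+# options shared across frameworks live in parent parsers so both command
+# trees stay consistent.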
+from __future__ import absolute_import
+
+import argparse
+import logging
+import sys
+
+import sagemaker
+import sagemaker.cli.mxnet
+import sagemaker.cli.tensorflow
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_LOG_LEVEL = 'info'
+DEFAULT_BOTOCORE_LOG_LEVEL = 'warning'
+
+
+def parse_arguments(args):
+    parser = argparse.ArgumentParser(description='Launch SageMaker training jobs or hosting endpoints')
+    parser.set_defaults(func=lambda x: parser.print_usage())
+
+    # common args for training/hosting/all frameworks
+    common_parser = argparse.ArgumentParser(add_help=False)
+    common_parser.add_argument('--role-name', help='SageMaker execution role name', type=str, required=True)
+    common_parser.add_argument('--data', help='path to training data or model files', type=str, default='./data')
+    common_parser.add_argument('--script', help='path to script', type=str, default='./script.py')
+    common_parser.add_argument('--job-name', help='job or endpoint name', type=str, default=None)
+    common_parser.add_argument('--bucket-name', help='S3 bucket for training/model data and script files',
+                               type=str, default=None)
+    common_parser.add_argument('--python', help='python version', type=str, default='py2')
+
+    instance_group = common_parser.add_argument_group('instance settings')
+    instance_group.add_argument('--instance-type', type=str, help='instance type', default='ml.m4.xlarge')
+    instance_group.add_argument('--instance-count', type=int, help='instance count', default=1)
+
+    # common training args
+    common_train_parser = argparse.ArgumentParser(add_help=False)
+    common_train_parser.add_argument('--hyperparameters', help='path to training hyperparameters file',
+                                     type=str, default='./hyperparameters.json')
+
+    # common hosting args
+    common_host_parser = argparse.ArgumentParser(add_help=False)
+    common_host_parser.add_argument('--env', help='hosting environment variable(s)', type=str, nargs='*', default=[])
+
+    subparsers = parser.add_subparsers()
+
+    # framework/algo subcommands
+    mxnet_parser = subparsers.add_parser('mxnet', help='use MXNet', parents=[])
+    mxnet_subparsers = mxnet_parser.add_subparsers()
+    mxnet_train_parser = mxnet_subparsers.add_parser('train',
+                                                     help='start a training job',
+                                                     parents=[common_parser, common_train_parser])
+    mxnet_train_parser.set_defaults(func=sagemaker.cli.mxnet.train)
+
+    mxnet_host_parser = mxnet_subparsers.add_parser('host',
+                                                    help='start a hosting endpoint',
+                                                    parents=[common_parser, common_host_parser])
+    mxnet_host_parser.set_defaults(func=sagemaker.cli.mxnet.host)
+
+    tensorflow_parser = subparsers.add_parser('tensorflow', help='use TensorFlow', parents=[])
+    tensorflow_subparsers = tensorflow_parser.add_subparsers()
+    tensorflow_train_parser = tensorflow_subparsers.add_parser('train',
+                                                               help='start a training job',
+                                                               parents=[common_parser, common_train_parser])
+    tensorflow_train_parser.add_argument('--training-steps',
+                                         help='number of training steps (tensorflow only)', type=int, default=None)
+    tensorflow_train_parser.add_argument('--evaluation-steps',
+                                         help='number of evaluation steps (tensorflow only)', type=int, default=None)
+    tensorflow_train_parser.set_defaults(func=sagemaker.cli.tensorflow.train)
+
+    tensorflow_host_parser = tensorflow_subparsers.add_parser('host',
+                                                              help='start a hosting endpoint',
+                                                              parents=[common_parser, common_host_parser])
+    tensorflow_host_parser.set_defaults(func=sagemaker.cli.tensorflow.host)
+
+    log_group = parser.add_argument_group('optional log settings')
+    log_group.add_argument('--log-level', help='log level for this command', type=str, default=DEFAULT_LOG_LEVEL)
+    log_group.add_argument('--botocore-log-level', help='log level for botocore', type=str,
+                           default=DEFAULT_BOTOCORE_LOG_LEVEL)
+
+    return parser.parse_args(args)
+
+
+def configure_logging(args):
+    log_format = '%(asctime)s %(levelname)s %(name)s: %(message)s'
+    log_level = logging.getLevelName(args.log_level.upper())
+    logging.basicConfig(format=log_format, level=log_level)
+    logging.getLogger("botocore").setLevel(args.botocore_log_level.upper())
+
+
+def main():
+    args = parse_arguments(sys.argv[1:])
+    configure_logging(args)
+    logger.debug('args: {}'.format(args))
+    args.func(args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/sagemaker/cli/mxnet.py b/src/sagemaker/cli/mxnet.py
new file mode 100644
index 0000000000..fa7f2c2a7f
--- /dev/null
+++ b/src/sagemaker/cli/mxnet.py
@@ -0,0 +1,46 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from sagemaker.cli.common import HostCommand, TrainCommand
+
+
+def train(args):
+    MXNetTrainCommand(args).start()
+
+
+def host(args):
+    MXNetHostCommand(args).start()
+
+
+class MXNetTrainCommand(TrainCommand):
+    def __init__(self, args):
+        super(MXNetTrainCommand, self).__init__(args)
+
+    def create_estimator(self):
+        from sagemaker.mxnet.estimator import MXNet
+        return MXNet(self.script,
+                     role=self.role_name,
+                     base_job_name=self.job_name,
+                     train_instance_count=self.instance_count,
+                     train_instance_type=self.instance_type,
+                     hyperparameters=self.hyperparameters,
+                     py_version=self.python)
+
+
+class MXNetHostCommand(HostCommand):
+    def __init__(self, args):
+        super(MXNetHostCommand, self).__init__(args)
+
+    def create_model(self, model_url):
+        from sagemaker.mxnet.model import MXNetModel
+        return MXNetModel(model_data=model_url, role=self.role_name, entry_point=self.script,
+                          py_version=self.python, name=self.endpoint_name, env=self.environment)
diff --git a/src/sagemaker/cli/tensorflow.py b/src/sagemaker/cli/tensorflow.py
new file mode 100644
index 0000000000..231fa96e83
--- /dev/null
+++ b/src/sagemaker/cli/tensorflow.py
@@ -0,0 +1,50 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
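+
+# TensorFlow wiring for the CLI: 'train' builds a TensorFlow estimator
+# (adding --training-steps and --evaluation-steps on top of the shared
+# options) and 'host' builds a TensorFlowModel, reusing the upload and
+# deploy flow from the shared command classes.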
+from sagemaker.cli.common import HostCommand, TrainCommand
+
+
+def train(args):
+    TensorFlowTrainCommand(args).start()
+
+
+def host(args):
+    TensorFlowHostCommand(args).start()
+
+
+class TensorFlowTrainCommand(TrainCommand):
+    def __init__(self, args):
+        super(TensorFlowTrainCommand, self).__init__(args)
+        self.training_steps = args.training_steps
+        self.evaluation_steps = args.evaluation_steps
+
+    def create_estimator(self):
+        from sagemaker.tensorflow import TensorFlow
+        return TensorFlow(training_steps=self.training_steps,
+                          evaluation_steps=self.evaluation_steps,
+                          py_version=self.python,
+                          entry_point=self.script,
+                          role=self.role_name,
+                          base_job_name=self.job_name,
+                          train_instance_count=self.instance_count,
+                          train_instance_type=self.instance_type,
+                          hyperparameters=self.hyperparameters)
+
+
+class TensorFlowHostCommand(HostCommand):
+    def __init__(self, args):
+        super(TensorFlowHostCommand, self).__init__(args)
+
+    def create_model(self, model_url):
+        from sagemaker.tensorflow.model import TensorFlowModel
+        return TensorFlowModel(model_data=model_url, role=self.role_name, entry_point=self.script,
+                               py_version=self.python, name=self.endpoint_name, env=self.environment)
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
index 7b6639c935..0a175825b3 100644
--- a/src/sagemaker/session.py
+++ b/src/sagemaker/session.py
@@ -127,8 +127,7 @@ def upload_data(self, path, bucket=None, key_prefix='data'):
         s3 = self.boto_session.resource('s3')

         for local_path, s3_key in files:
-            with open(local_path, 'rb') as f:
-                s3.Object(bucket, s3_key).put(Body=f)
+            s3.Object(bucket, s3_key).upload_file(local_path)

         s3_uri = 's3://{}/{}'.format(bucket, key_prefix)
         # If a specific file was used as input (instead of a directory), we return the full S3 key
diff --git a/tests/integ/test_byo_estimator.py b/tests/integ/test_byo_estimator.py
new file mode 100644
index 0000000000..d0c1a18e07
--- /dev/null
+++ b/tests/integ/test_byo_estimator.py
@@ -0,0 +1,99 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+import gzip
+import io
+import json
+import numpy as np
+import os
+import pickle
+import sys
+
+import boto3
+
+import sagemaker
+from sagemaker.estimator import Estimator
+from sagemaker.amazon.amazon_estimator import registry
+from sagemaker.amazon.common import write_numpy_to_dense_tensor
+from sagemaker.utils import name_from_base
+from tests.integ import DATA_DIR, REGION
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
+
+
+def test_byo_estimator():
+    """Use the Factorization Machines algorithm as an example here.
+
+    First we prepare data for training: we take a standard dataset, convert it to the
+    format the algorithm can process, and upload it to S3.
+    Then we create the Estimator and set the hyperparameters required by the algorithm.
+    Next, we call fit() with the S3 path to the training data.
+    Finally, the trained model is deployed and a prediction is made against the endpoint.
+    The default predictor is updated with a JSON serializer and deserializer.
+ + """ + image_name = registry(REGION) + "/factorization-machines:1" + + with timeout(minutes=15): + sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION)) + data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz') + pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'} + + with gzip.open(data_path, 'rb') as f: + train_set, _, _ = pickle.load(f, **pickle_args) + + # take 100 examples for faster execution + vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32') + labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32') + + buf = io.BytesIO() + write_numpy_to_dense_tensor(buf, vectors, labels) + buf.seek(0) + + bucket = sagemaker_session.default_bucket() + prefix = 'test_byo_estimator' + key = 'recordio-pb-data' + boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf) + s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key) + + estimator = Estimator(image_name=image_name, + role='SageMakerRole', train_instance_count=1, + train_instance_type='ml.c4.xlarge', + sagemaker_session=sagemaker_session, base_job_name='test-byo') + + estimator.set_hyperparameters(num_factors=10, + feature_dim=784, + mini_batch_size=100, + predictor_type='binary_classifier') + + # training labels must be 'float32' + estimator.fit({'train': s3_train_data}) + + endpoint_name = name_from_base('byo') + + def fm_serializer(data): + js = {'instances': []} + for row in data: + js['instances'].append({'features': row.tolist()}) + return json.dumps(js) + + with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20): + model = estimator.create_model() + predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) + predictor.serializer = fm_serializer + predictor.content_type = 'application/json' + predictor.deserializer = sagemaker.predictor.json_deserializer + + result = predictor.predict(train_set[0][:10]) + + assert len(result['predictions']) == 10 + for prediction in result['predictions']: + assert prediction['score'] is not None diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py new file mode 100644 index 0000000000..1956e95ba5 --- /dev/null +++ b/tests/unit/test_cli.py @@ -0,0 +1,193 @@ +# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+import pytest
+import sagemaker.cli.main as cli
+from mock import patch
+
+COMMON_ARGS = '--role-name myrole --data mydata --script myscript --job-name myjob --bucket-name mybucket ' + \
+              '--python py3 --instance-type myinstance --instance-count 2'
+
+TRAIN_ARGS = '--hyperparameters myhyperparameters.json'
+
+LOG_ARGS = '--log-level debug --botocore-log-level debug'
+
+HOST_ARGS = '--env ENV1=env1 ENV2=env2'
+
+
+def assert_common_defaults(args):
+    assert args.data == './data'
+    assert args.script == './script.py'
+    assert args.job_name is None
+    assert args.bucket_name is None
+    assert args.python == 'py2'
+    assert args.instance_type == 'ml.m4.xlarge'
+    assert args.instance_count == 1
+    assert args.log_level == 'info'
+    assert args.botocore_log_level == 'warning'
+
+
+def assert_common_non_defaults(args):
+    assert args.data == 'mydata'
+    assert args.script == 'myscript'
+    assert args.job_name == 'myjob'
+    assert args.bucket_name == 'mybucket'
+    assert args.role_name == 'myrole'
+    assert args.python == 'py3'
+    assert args.instance_type == 'myinstance'
+    assert args.instance_count == 2
+    assert args.log_level == 'debug'
+    assert args.botocore_log_level == 'debug'
+
+
+def assert_train_defaults(args):
+    assert args.hyperparameters == './hyperparameters.json'
+
+
+def assert_train_non_defaults(args):
+    assert args.hyperparameters == 'myhyperparameters.json'
+
+
+def assert_host_defaults(args):
+    assert args.env == []
+
+
+def assert_host_non_defaults(args):
+    assert args.env == ['ENV1=env1', 'ENV2=env2']
+
+
+def test_args_mxnet_train_defaults():
+    args = cli.parse_arguments('mxnet train --role-name role'.split())
+    assert_common_defaults(args)
+    assert_train_defaults(args)
+    assert args.func.__module__ == 'sagemaker.cli.mxnet'
+    assert args.func.__name__ == 'train'
+
+
+def test_args_mxnet_train_non_defaults():
+    args = cli.parse_arguments('{} mxnet train --role-name role {} {}'
+                               .format(LOG_ARGS, COMMON_ARGS, TRAIN_ARGS)
+                               .split())
+    assert_common_non_defaults(args)
+    assert_train_non_defaults(args)
+    assert args.func.__module__ == 'sagemaker.cli.mxnet'
+    assert args.func.__name__ == 'train'
+
+
+def test_args_mxnet_host_defaults():
+    args = cli.parse_arguments('mxnet host --role-name role'.split())
+    assert_common_defaults(args)
+    assert_host_defaults(args)
+    assert args.func.__module__ == 'sagemaker.cli.mxnet'
+    assert args.func.__name__ == 'host'
+
+
+def test_args_mxnet_host_non_defaults():
+    args = cli.parse_arguments('{} mxnet host --role-name role {} {}'
+                               .format(LOG_ARGS, COMMON_ARGS, HOST_ARGS)
+                               .split())
+    assert_common_non_defaults(args)
+    assert_host_non_defaults(args)
+    assert args.func.__module__ == 'sagemaker.cli.mxnet'
+    assert args.func.__name__ == 'host'
+
+
+def test_args_tensorflow_train_defaults():
+    args = cli.parse_arguments('tensorflow train --role-name role'.split())
+    assert_common_defaults(args)
+    assert_train_defaults(args)
+    assert args.training_steps is None
+    assert args.evaluation_steps is None
+    assert args.func.__module__ == 'sagemaker.cli.tensorflow'
+    assert args.func.__name__ == 'train'
+
+
+def test_args_tensorflow_train_non_defaults():
+    args = cli.parse_arguments('{} tensorflow train --role-name role --training-steps 10 --evaluation-steps 5 {} {}'
+                               .format(LOG_ARGS, COMMON_ARGS, TRAIN_ARGS)
+                               .split())
+    assert_common_non_defaults(args)
+    assert_train_non_defaults(args)
+    assert args.training_steps == 10
+    assert args.evaluation_steps == 5
+    assert args.func.__module__ == 'sagemaker.cli.tensorflow'
+    assert args.func.__name__ == 'train'
+
+
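+# The remaining cases cover the tensorflow host subcommand, argparse error
+# handling for invalid input, and mocked end-to-end dispatch for mxnet.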
+def test_args_tensorflow_host_defaults():
+    args = cli.parse_arguments('tensorflow host --role-name role'.split())
+    assert_common_defaults(args)
+    assert_host_defaults(args)
+    assert args.func.__module__ == 'sagemaker.cli.tensorflow'
+    assert args.func.__name__ == 'host'
+
+
+def test_args_tensorflow_host_non_defaults():
+    args = cli.parse_arguments('{} tensorflow host --role-name role {} {}'
+                               .format(LOG_ARGS, COMMON_ARGS, HOST_ARGS)
+                               .split())
+    assert_common_non_defaults(args)
+    assert_host_non_defaults(args)
+    assert args.func.__module__ == 'sagemaker.cli.tensorflow'
+    assert args.func.__name__ == 'host'
+
+
+def test_args_invalid_framework():
+    with pytest.raises(SystemExit):
+        cli.parse_arguments('fakeframework train --role-name role'.split())
+
+
+def test_args_invalid_subcommand():
+    with pytest.raises(SystemExit):
+        cli.parse_arguments('mxnet drain'.split())
+
+
+def test_args_invalid_args():
+    with pytest.raises(SystemExit):
+        cli.parse_arguments('tensorflow train --role-name role --notdata foo'.split())
+
+
+def test_args_invalid_mxnet_python():
+    with pytest.raises(SystemExit):
+        cli.parse_arguments('mxnet train --role-name role nython py2'.split())
+
+
+def test_args_invalid_host_args_in_train():
+    with pytest.raises(SystemExit):
+        cli.parse_arguments('mxnet train --role-name role --env FOO=bar'.split())
+
+
+def test_args_invalid_train_args_in_host():
+    with pytest.raises(SystemExit):
+        cli.parse_arguments('tensorflow host --role-name role --hyperparameters foo.json'.split())
+
+
+@patch('sagemaker.mxnet.estimator.MXNet')
+@patch('sagemaker.Session')
+def test_mxnet_train(session, estimator):
+    args = cli.parse_arguments('mxnet train --role-name role'.split())
+    args.func(args)
+    session.return_value.upload_data.assert_called()
+    estimator.assert_called()
+    estimator.return_value.fit.assert_called()
+
+
+@patch('sagemaker.mxnet.model.MXNetModel')
+@patch('sagemaker.cli.common.HostCommand.upload_model')
+@patch('sagemaker.Session')
+def test_mxnet_host(session, upload_model, model):
+    args = cli.parse_arguments('mxnet host --role-name role'.split())
+    args.func(args)
+    session.assert_called()
+    upload_model.assert_called()
+    model.assert_called()
+    model.return_value.deploy.assert_called()
diff --git a/tests/unit/test_hyperparameter.py b/tests/unit/test_hyperparameter.py
index c168f3275e..db7ed3f64c 100644
--- a/tests/unit/test_hyperparameter.py
+++ b/tests/unit/test_hyperparameter.py
@@ -16,9 +16,9 @@

 class Test(object):
-    blank = Hyperparameter(name="some-name")
+    blank = Hyperparameter(name="some-name", data_type=int)
     elizabeth = Hyperparameter(name='elizabeth')
-    validated = Hyperparameter(name="validated", validate=lambda value: value > 55)
+    validated = Hyperparameter(name="validated", validate=lambda value: value > 55, data_type=int)


 def test_blank_access():
@@ -55,3 +55,20 @@ def test_validated():
     x.validated = 66
     with pytest.raises(ValueError):
         x.validated = 23
+
+
+def test_data_type():
+    x = Test()
+    x.validated = 66
+    assert type(x.validated) == Test.__dict__["validated"].data_type
+
+
+def test_from_string():
+    x = Test()
+    value = 65
+
+    x.validated = value
+    from_api = str(value)
+
+    x.validated = from_api
+    assert x.validated == value
diff --git a/tests/unit/test_upload_data.py b/tests/unit/test_upload_data.py
index 4d3a3efc08..43442c0c7f 100644
--- a/tests/unit/test_upload_data.py
+++ b/tests/unit/test_upload_data.py
@@ -34,8 +34,8 @@ def sagemaker_session():

 def test_upload_data_absolute_dir(sagemaker_session):
     result_s3_uri = sagemaker_session.upload_data(UPLOAD_DATA_TESTS_FILES_DIR)
-    uploaded_files = [kwargs['Body'].name for name, args, kwargs in sagemaker_session.boto_session.mock_calls
-                      if name == 'resource().Object().put']
+    uploaded_files = [args[0] for name, args, kwargs in sagemaker_session.boto_session.mock_calls
+                      if name == 'resource().Object().upload_file']
     assert result_s3_uri == 's3://{}/data'.format(BUCKET_NAME)
     assert len(uploaded_files) == 4
     for file in uploaded_files:
@@ -45,8 +45,8 @@ def test_upload_data_absolute_dir(sagemaker_session):

 def test_upload_data_absolute_file(sagemaker_session):
     result_s3_uri = sagemaker_session.upload_data(UPLOAD_DATA_TESTS_SINGLE_FILE)
-    uploaded_files = [kwargs['Body'].name for name, args, kwargs in sagemaker_session.boto_session.mock_calls
-                      if name == 'resource().Object().put']
+    uploaded_files = [args[0] for name, args, kwargs in sagemaker_session.boto_session.mock_calls
+                      if name == 'resource().Object().upload_file']
     assert result_s3_uri == 's3://{}/data/{}'.format(BUCKET_NAME, SINGLE_FILE_NAME)
     assert len(uploaded_files) == 1
     assert os.path.exists(uploaded_files[0])
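
Taken together, the hyperparameter changes above move type enforcement out of the validators (``isint``, ``isbool``, and ``isnumber`` are deleted) and into a ``data_type`` on the descriptor, which also lets values that come back from the SageMaker API as strings be coerced before validation. A minimal sketch of the resulting behavior, mirroring ``tests/unit/test_hyperparameter.py`` (the import path is an assumption based on the modules touched above; the ``Hyperparameter`` class and its positional argument order are taken from the diff):

.. code:: python

    from sagemaker.amazon.hyperparameter import Hyperparameter as hp

    class Example(object):
        # positional order per the diff: name, validate, validation_message, data_type
        epochs = hp('epochs', lambda x: x > 0, 'An integer greater than 0', int)

    e = Example()
    e.epochs = '10'        # a string from the API is coerced via int('10')
    assert e.epochs == 10  # ...then validated and readable as a native int

The new CLI can be exercised the way ``tests/unit/test_cli.py`` does, by passing an argv-style list to ``parse_arguments`` and invoking the dispatched handler; running the module directly (``python -m sagemaker.cli.main ...``) goes through the same path via ``main()``. Note that the handler immediately creates a ``sagemaker.Session`` and uploads data, so real AWS credentials and an execution role are required (the role and paths below are placeholders):

.. code:: python

    import sagemaker.cli.main as cli

    # equivalent to: python -m sagemaker.cli.main mxnet train --role-name myrole ...
    args = cli.parse_arguments('mxnet train --role-name myrole --data ./data '
                               '--script ./script.py'.split())
    args.func(args)  # dispatches to sagemaker.cli.mxnet.train, which calls fit()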