Skip to content

training a sequential keras model fails #279

@MyDuan

Description

@MyDuan

Please fill out the form below.

System Information

  • Framework (e.g. TensorFlow) / Algorithm (e.g. KMeans): TensorFlow (Keras)
  • Framework Version: tensorflow version is 1.8.0
  • Python Version: Python 3.6.5
  • CPU or GPU: CPU
  • Python SDK Version: 1.5.3
  • Are you using a custom image: No

Describe the problem

I'm trying to run the model training for a keras sequential model on sagemaker to solve a classification problem on the dataset iris. But an error was happening.

Minimal repro / logs

python code:

import numpy as np
import os
import tensorflow as tf


def estimator_fn(run_config, params):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(10, activation='relu', input_shape=(4, ), name="features"))
    model.add(tf.keras.layers.Dense(20, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid', name='output'))
    model.compile(
              optimizer='rmsprop', #tf.keras.optimizers.RMSprop(lr=2e-5),
              loss='categorical_crossentropy',
              metrics=['accuracy']
    )
    return tf.keras.estimator.model_to_estimator(keras_model = model)


def serving_input_fn(params):
    feature_spec = {'features_input': tf.FixedLenFeature(dtype=tf.float32, shape=[4])}
    return tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)()

def train_input_fn(training_dir, params):
    """Returns input function that would feed the model during training"""
    print("Returns input function that would feed the model during training")
    training_set = _generate_input_fn(training_dir, 'iris_training.csv')
    print("----------------------------------------train-------------------------------------------")
    return training_set

def eval_input_fn(training_dir, params):
    """Returns input function that would feed the model during evaluation"""
    print("Returns input function that would feed the model during evaluation")
    testing_set = _generate_input_fn(training_dir, 'iris_test.csv')
    print("----------------------------------------test-------------------------------------------")
    return testing_set

def _generate_input_fn(training_dir, training_filename):
    training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
        filename=os.path.join(training_dir, training_filename), 
        target_dtype=np.int, 
        features_dtype=np.float32)
    
    print(training_filename)
    #print(training_set)
    
    input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'features_input': np.array(training_set.data)},
        y=np.array(training_set.target).reshape((-1,1)),
        num_epochs = None,
        shuffle=True)()
    
    print("-----------------------------------", input_fn)
    return input_fn

notbook

from sagemaker import get_execution_role

custom_code_upload_location = 's3://XXX/tensorflow_iris'
model_artifacts_location = 's3://XXX/artifacts'
role = 'XXX'
from sagemaker.tensorflow import TensorFlow

iris_estimator = TensorFlow(entry_point='pCTR.py',
                            role=role,
                            output_path=model_artifacts_location,
                            code_location=custom_code_upload_location,
                            train_instance_count=1,
                            train_instance_type= 'ml.c4.xlarge', #'ml.c4.xlarge',
                            training_steps=1000,
                            evaluation_steps=100)
%%time
import boto3

# use the region-specific sample data bucket
region = boto3.Session().region_name
train_data_location = 's3://sagemaker-sample-data-{}/tensorflow/iris'.format(region)
print(train_data_location)
iris_estimator.fit(train_data_location)

The logs:

s3://sagemaker-sample-data-us-west-2/tensorflow/iris
.................
2018-07-06 12:24:05,914 INFO - root - running container entrypoint
2018-07-06 12:24:05,914 INFO - root - starting train task
2018-07-06 12:24:05,920 INFO - container_support.training - Training starting
2018-07-06 12:24:07,522 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTP connection (1): ...
2018-07-06 12:24:07,732 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): ...
2018-07-06 12:24:07,798 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (2): ...
2018-07-06 12:24:07,816 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): ...
2018-07-06 12:24:07,924 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (2): ...
2018-07-06 12:24:08,062 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): s3.us-west-2.amazonaws.com
2018-07-06 12:24:08,102 INFO - tf_container - ----------------------TF_CONFIG--------------------------
2018-07-06 12:24:08,102 INFO - tf_container - {"environment": "cloud", "cluster": {"master": ["algo-1:2222"]}, "task": {"index": 0, "type": "master"}}
2018-07-06 12:24:08,102 INFO - tf_container - ---------------------------------------------------------
2018-07-06 12:24:08,102 INFO - tf_container - creating RunConfig:
2018-07-06 12:24:08,102 INFO - tf_container - {'save_checkpoints_secs': 300}
2018-07-06 12:24:08,102 INFO - tensorflow - TF_CONFIG environment variable: {u'environment': u'cloud', u'cluster': {u'master': [u'algo-1:2222']}, u'task': {u'index': 0, u'type': u'master'}}
2018-07-06 12:24:08,102 INFO - tf_container - invoking the user-provided estimator_fn
2018-07-06 12:24:08,211 INFO - tensorflow - Using the Keras model provided.
2018-07-06 12:24:08,211 INFO - tensorflow - TF_CONFIG environment variable: {u'environment': u'cloud', u'cluster': {u'master': [u'algo-1:2222']}, u'task': {u'index': 0, u'type': u'master'}}
2018-07-06 12:24:08,212 INFO - tensorflow - Using default config.
2018-07-06 12:24:08,212 WARNING - tensorflow - Using temporary folder as model directory: /tmp/tmpR6YK7f
2018-07-06 12:24:08,213 INFO - tensorflow - Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': u'master', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa1f80fac10>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpR6YK7f', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
2018-07-06 12:24:08.214155: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2018-07-06 12:24:08,903 INFO - tensorflow - Skip starting Tensorflow server as there is only one node in the cluster.
2018-07-06 12:24:09,988 WARNING - tensorflow - From /opt/ml/code/pCTR.py:52: load_csv_with_header (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data instead.
2018-07-06 12:24:10,001 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:10,279 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:10,280 INFO - tensorflow - Create CheckpointSaverHook.
2018-07-06 12:24:10,447 INFO - tensorflow - Graph was finalized.
2018-07-06 12:24:10,447 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/keras_model.ckpt
2018-07-06 12:24:10,477 INFO - tensorflow - Running local_init_op.
2018-07-06 12:24:10,482 INFO - tensorflow - Done running local_init_op.
2018-07-06 12:24:10,821 INFO - tensorflow - Saving checkpoints for 1 into /tmp/tmpR6YK7f/model.ckpt.
2018-07-06 12:24:10,877 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:10,964 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:10,981 INFO - tensorflow - Starting evaluation at 2018-07-06-12:24:10
2018-07-06 12:24:11,064 INFO - tensorflow - Graph was finalized.
2018-07-06 12:24:11,064 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1
2018-07-06 12:24:11,081 INFO - tensorflow - Running local_init_op.
2018-07-06 12:24:11,086 INFO - tensorflow - Done running local_init_op.
2018-07-06 12:24:11,129 INFO - tensorflow - Evaluation [10/100]
2018-07-06 12:24:11,139 INFO - tensorflow - Evaluation [20/100]
2018-07-06 12:24:11,149 INFO - tensorflow - Evaluation [30/100]
2018-07-06 12:24:11,158 INFO - tensorflow - Evaluation [40/100]
2018-07-06 12:24:11,167 INFO - tensorflow - Evaluation [50/100]
2018-07-06 12:24:11,178 INFO - tensorflow - Evaluation [60/100]
2018-07-06 12:24:11,186 INFO - tensorflow - Evaluation [70/100]
2018-07-06 12:24:11,194 INFO - tensorflow - Evaluation [80/100]
2018-07-06 12:24:11,205 INFO - tensorflow - Evaluation [90/100]
2018-07-06 12:24:11,214 INFO - tensorflow - Evaluation [100/100]
2018-07-06 12:24:11,237 INFO - tensorflow - Finished evaluation at 2018-07-06-12:24:11
2018-07-06 12:24:11,237 INFO - tensorflow - Saving dict for global step 1: accuracy = 0.73328125, global_step = 1, loss = 1.1924654e-07
2018-07-06 12:24:11,285 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:11,417 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:11,417 INFO - tensorflow - Signatures INCLUDED in export for Classify: None
2018-07-06 12:24:11,417 INFO - tensorflow - Signatures INCLUDED in export for Regress: None
2018-07-06 12:24:11,417 INFO - tensorflow - Signatures INCLUDED in export for Predict: ['serving_default']
2018-07-06 12:24:11,431 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1
2018-07-06 12:24:11,437 INFO - tensorflow - Assets added to graph.
2018-07-06 12:24:11,437 INFO - tensorflow - No assets to write.
2018-07-06 12:24:11,480 INFO - tensorflow - SavedModel written to: /tmp/tmpR6YK7f/export/Servo/temp-1530879851/saved_model.pb
2018-07-06 12:24:11,482 INFO - tensorflow - loss = 1.0430813e-07, step = 1
2018-07-06 12:24:11,717 INFO - tensorflow - global_step/sec: 426.163
2018-07-06 12:24:11,718 INFO - tensorflow - loss = 1.09896064e-07, step = 101 (0.235 sec)
2018-07-06 12:24:11,998 INFO - tensorflow - global_step/sec: 355.469
2018-07-06 12:24:11,998 INFO - tensorflow - loss = 1.0430813e-07, step = 201 (0.281 sec)
2018-07-06 12:24:12,137 INFO - tensorflow - global_step/sec: 718.473
2018-07-06 12:24:12,137 INFO - tensorflow - loss = 1.2107193e-07, step = 301 (0.139 sec)
2018-07-06 12:24:12,273 INFO - tensorflow - global_step/sec: 734.884
2018-07-06 12:24:12,273 INFO - tensorflow - loss = 1.17346644e-07, step = 401 (0.136 sec)
2018-07-06 12:24:12,408 INFO - tensorflow - global_step/sec: 743.461
2018-07-06 12:24:12,408 INFO - tensorflow - loss = 1.238659e-07, step = 501 (0.134 sec)
2018-07-06 12:24:12,538 INFO - tensorflow - global_step/sec: 767.29
2018-07-06 12:24:12,538 INFO - tensorflow - loss = 1.1920929e-07, step = 601 (0.130 sec)
2018-07-06 12:24:12,674 INFO - tensorflow - global_step/sec: 736.052
2018-07-06 12:24:12,674 INFO - tensorflow - loss = 1.07102096e-07, step = 701 (0.136 sec)
2018-07-06 12:24:12,805 INFO - tensorflow - global_step/sec: 763.504
2018-07-06 12:24:12,805 INFO - tensorflow - loss = 1.2852252e-07, step = 801 (0.131 sec)
2018-07-06 12:24:12,938 INFO - tensorflow - global_step/sec: 750.705
2018-07-06 12:24:12,938 INFO - tensorflow - loss = 1.2014061e-07, step = 901 (0.133 sec)
2018-07-06 12:24:13,071 INFO - tensorflow - Saving checkpoints for 1000 into /tmp/tmpR6YK7f/model.ckpt.
2018-07-06 12:24:13,107 INFO - tensorflow - Skip the current checkpoint eval due to throttle secs (600 secs).
2018-07-06 12:24:13,128 INFO - tensorflow - Loss for final step: 1.1641532e-07.
2018-07-06 12:24:13,129 INFO - tensorflow - Training has already ended. But the last eval is skipped due to eval throttle_secs. Now evaluating the final checkpoint.
2018-07-06 12:24:13,144 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:13,230 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:13,246 INFO - tensorflow - Starting evaluation at 2018-07-06-12:24:13
2018-07-06 12:24:13,327 INFO - tensorflow - Graph was finalized.
2018-07-06 12:24:13,327 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1000
2018-07-06 12:24:13,344 INFO - tensorflow - Running local_init_op.
2018-07-06 12:24:13,349 INFO - tensorflow - Done running local_init_op.
2018-07-06 12:24:13,391 INFO - tensorflow - Evaluation [10/100]
2018-07-06 12:24:13,402 INFO - tensorflow - Evaluation [20/100]
2018-07-06 12:24:13,412 INFO - tensorflow - Evaluation [30/100]
2018-07-06 12:24:13,423 INFO - tensorflow - Evaluation [40/100]
2018-07-06 12:24:13,432 INFO - tensorflow - Evaluation [50/100]
2018-07-06 12:24:13,443 INFO - tensorflow - Evaluation [60/100]
2018-07-06 12:24:13,453 INFO - tensorflow - Evaluation [70/100]
2018-07-06 12:24:13,461 INFO - tensorflow - Evaluation [80/100]
2018-07-06 12:24:13,471 INFO - tensorflow - Evaluation [90/100]
2018-07-06 12:24:13,482 INFO - tensorflow - Evaluation [100/100]
2018-07-06 12:24:13,496 INFO - tensorflow - Finished evaluation at 2018-07-06-12:24:13
2018-07-06 12:24:13,496 INFO - tensorflow - Saving dict for global step 1000: accuracy = 0.73390627, global_step = 1000, loss = 1.1910684e-07
2018-07-06 12:24:13,502 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:13,552 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:13,553 INFO - tensorflow - Signatures INCLUDED in export for Classify: None
2018-07-06 12:24:13,553 INFO - tensorflow - Signatures INCLUDED in export for Regress: None
2018-07-06 12:24:13,553 INFO - tensorflow - Signatures INCLUDED in export for Predict: ['serving_default']
2018-07-06 12:24:13,566 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1000
2018-07-06 12:24:13,572 INFO - tensorflow - Assets added to graph.
2018-07-06 12:24:13,572 INFO - tensorflow - No assets to write.
2018-07-06 12:24:13,613 INFO - tensorflow - SavedModel written to: /tmp/tmpR6YK7f/export/Servo/temp-1530879853/saved_model.pb
2018-07-06 12:24:13,617 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): 
2018-07-06 12:24:13,632 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): ....
2018-07-06 12:24:13,679 ERROR - tf_container - Failed to download saved model. File does not exist in s3://XXX/artifacts/sagemaker-tensorflow-2018-07-06-12-21-23-923/checkpoints
2018-07-06 12:24:13,679 ERROR - container_support.training - uncaught exception during training: 'Contents'
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/container_support/training.py", line 36, in start
    fw.train()
  File "/usr/local/lib/python2.7/dist-packages/tf_container/train_entry_point.py", line 168, in train
    serve.export_saved_model(checkpoint_dir, env.model_dir)
  File "/usr/local/lib/python2.7/dist-packages/tf_container/serve.py", line 52, in export_saved_model
    raise e
KeyError: 'Contents'


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed exec> in <module>()

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/tensorflow/estimator.py in fit(self, inputs, wait, logs, job_name, run_tensorboard_locally)
    247                 tensorboard.join()
    248         else:
--> 249             fit_super()
    250 
    251     @classmethod

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/tensorflow/estimator.py in fit_super()
    229         """
    230         def fit_super():
--> 231             super(TensorFlow, self).fit(inputs, wait, logs, job_name)
    232 
    233         if run_tensorboard_locally and wait is False:

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name)
    178         self.latest_training_job = _TrainingJob.start_new(self, inputs)
    179         if wait:
--> 180             self.latest_training_job.wait(logs=logs)
    181 
    182     @classmethod

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in wait(self, logs)
    367     def wait(self, logs=True):
    368         if logs:
--> 369             self.sagemaker_session.logs_for_job(self.job_name, wait=True)
    370         else:
    371             self.sagemaker_session.wait_for_job(self.job_name)

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll)
    803 
    804         if wait:
--> 805             self._check_job_status(job_name, description, 'TrainingJobStatus')
    806             if dot:
    807                 print()

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
    539         if status != 'Completed' and status != 'Stopped':
    540             reason = desc.get('FailureReason', '(No reason provided)')
--> 541             raise ValueError('Error training {}: {} Reason: {}'.format(job, status, reason))
    542 
    543     def wait_for_endpoint(self, endpoint, poll=5):

ValueError: Error training sagemaker-tensorflow-2018-07-06-12-21-23-923: Failed Reason: AlgorithmError: uncaught exception during training: 'Contents'
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/container_support/training.py", line 36, in start
    fw.train()
  File "/usr/local/lib/python2.7/dist-packages/tf_container/train_entry_point.py", line 168, in train
    serve.export_saved_model(checkpoint_dir, env.model_dir)
  File "/usr/local/lib/python2.7/dist-packages/tf_container/serve.py", line 52, in export_saved_model
    raise e
KeyError: 'Contents'
  • Exact command to reproduce:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions