-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Closed
Description
Please fill out the form below.
System Information
- Framework (e.g. TensorFlow) / Algorithm (e.g. KMeans): TensorFlow (Keras)
- Framework Version: tensorflow version is 1.8.0
- Python Version: Python 3.6.5
- CPU or GPU: CPU
- Python SDK Version: 1.5.3
- Are you using a custom image: No
Describe the problem
I'm trying to run the model training for a keras sequential model on sagemaker to solve a classification problem on the dataset iris. But an error was happening.
Minimal repro / logs
python code:
import numpy as np
import os
import tensorflow as tf
def estimator_fn(run_config, params):
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(10, activation='relu', input_shape=(4, ), name="features"))
model.add(tf.keras.layers.Dense(20, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid', name='output'))
model.compile(
optimizer='rmsprop', #tf.keras.optimizers.RMSprop(lr=2e-5),
loss='categorical_crossentropy',
metrics=['accuracy']
)
return tf.keras.estimator.model_to_estimator(keras_model = model)
def serving_input_fn(params):
feature_spec = {'features_input': tf.FixedLenFeature(dtype=tf.float32, shape=[4])}
return tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)()
def train_input_fn(training_dir, params):
"""Returns input function that would feed the model during training"""
print("Returns input function that would feed the model during training")
training_set = _generate_input_fn(training_dir, 'iris_training.csv')
print("----------------------------------------train-------------------------------------------")
return training_set
def eval_input_fn(training_dir, params):
"""Returns input function that would feed the model during evaluation"""
print("Returns input function that would feed the model during evaluation")
testing_set = _generate_input_fn(training_dir, 'iris_test.csv')
print("----------------------------------------test-------------------------------------------")
return testing_set
def _generate_input_fn(training_dir, training_filename):
training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
filename=os.path.join(training_dir, training_filename),
target_dtype=np.int,
features_dtype=np.float32)
print(training_filename)
#print(training_set)
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'features_input': np.array(training_set.data)},
y=np.array(training_set.target).reshape((-1,1)),
num_epochs = None,
shuffle=True)()
print("-----------------------------------", input_fn)
return input_fn
notbook
from sagemaker import get_execution_role
custom_code_upload_location = 's3://XXX/tensorflow_iris'
model_artifacts_location = 's3://XXX/artifacts'
role = 'XXX'
from sagemaker.tensorflow import TensorFlow
iris_estimator = TensorFlow(entry_point='pCTR.py',
role=role,
output_path=model_artifacts_location,
code_location=custom_code_upload_location,
train_instance_count=1,
train_instance_type= 'ml.c4.xlarge', #'ml.c4.xlarge',
training_steps=1000,
evaluation_steps=100)
%%time
import boto3
# use the region-specific sample data bucket
region = boto3.Session().region_name
train_data_location = 's3://sagemaker-sample-data-{}/tensorflow/iris'.format(region)
print(train_data_location)
iris_estimator.fit(train_data_location)
The logs:
s3://sagemaker-sample-data-us-west-2/tensorflow/iris
.................
2018-07-06 12:24:05,914 INFO - root - running container entrypoint
2018-07-06 12:24:05,914 INFO - root - starting train task
2018-07-06 12:24:05,920 INFO - container_support.training - Training starting
2018-07-06 12:24:07,522 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTP connection (1): ...
2018-07-06 12:24:07,732 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): ...
2018-07-06 12:24:07,798 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (2): ...
2018-07-06 12:24:07,816 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): ...
2018-07-06 12:24:07,924 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (2): ...
2018-07-06 12:24:08,062 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): s3.us-west-2.amazonaws.com
2018-07-06 12:24:08,102 INFO - tf_container - ----------------------TF_CONFIG--------------------------
2018-07-06 12:24:08,102 INFO - tf_container - {"environment": "cloud", "cluster": {"master": ["algo-1:2222"]}, "task": {"index": 0, "type": "master"}}
2018-07-06 12:24:08,102 INFO - tf_container - ---------------------------------------------------------
2018-07-06 12:24:08,102 INFO - tf_container - creating RunConfig:
2018-07-06 12:24:08,102 INFO - tf_container - {'save_checkpoints_secs': 300}
2018-07-06 12:24:08,102 INFO - tensorflow - TF_CONFIG environment variable: {u'environment': u'cloud', u'cluster': {u'master': [u'algo-1:2222']}, u'task': {u'index': 0, u'type': u'master'}}
2018-07-06 12:24:08,102 INFO - tf_container - invoking the user-provided estimator_fn
2018-07-06 12:24:08,211 INFO - tensorflow - Using the Keras model provided.
2018-07-06 12:24:08,211 INFO - tensorflow - TF_CONFIG environment variable: {u'environment': u'cloud', u'cluster': {u'master': [u'algo-1:2222']}, u'task': {u'index': 0, u'type': u'master'}}
2018-07-06 12:24:08,212 INFO - tensorflow - Using default config.
2018-07-06 12:24:08,212 WARNING - tensorflow - Using temporary folder as model directory: /tmp/tmpR6YK7f
2018-07-06 12:24:08,213 INFO - tensorflow - Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': u'master', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa1f80fac10>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpR6YK7f', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
2018-07-06 12:24:08.214155: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2018-07-06 12:24:08,903 INFO - tensorflow - Skip starting Tensorflow server as there is only one node in the cluster.
2018-07-06 12:24:09,988 WARNING - tensorflow - From /opt/ml/code/pCTR.py:52: load_csv_with_header (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data instead.
2018-07-06 12:24:10,001 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:10,279 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:10,280 INFO - tensorflow - Create CheckpointSaverHook.
2018-07-06 12:24:10,447 INFO - tensorflow - Graph was finalized.
2018-07-06 12:24:10,447 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/keras_model.ckpt
2018-07-06 12:24:10,477 INFO - tensorflow - Running local_init_op.
2018-07-06 12:24:10,482 INFO - tensorflow - Done running local_init_op.
2018-07-06 12:24:10,821 INFO - tensorflow - Saving checkpoints for 1 into /tmp/tmpR6YK7f/model.ckpt.
2018-07-06 12:24:10,877 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:10,964 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:10,981 INFO - tensorflow - Starting evaluation at 2018-07-06-12:24:10
2018-07-06 12:24:11,064 INFO - tensorflow - Graph was finalized.
2018-07-06 12:24:11,064 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1
2018-07-06 12:24:11,081 INFO - tensorflow - Running local_init_op.
2018-07-06 12:24:11,086 INFO - tensorflow - Done running local_init_op.
2018-07-06 12:24:11,129 INFO - tensorflow - Evaluation [10/100]
2018-07-06 12:24:11,139 INFO - tensorflow - Evaluation [20/100]
2018-07-06 12:24:11,149 INFO - tensorflow - Evaluation [30/100]
2018-07-06 12:24:11,158 INFO - tensorflow - Evaluation [40/100]
2018-07-06 12:24:11,167 INFO - tensorflow - Evaluation [50/100]
2018-07-06 12:24:11,178 INFO - tensorflow - Evaluation [60/100]
2018-07-06 12:24:11,186 INFO - tensorflow - Evaluation [70/100]
2018-07-06 12:24:11,194 INFO - tensorflow - Evaluation [80/100]
2018-07-06 12:24:11,205 INFO - tensorflow - Evaluation [90/100]
2018-07-06 12:24:11,214 INFO - tensorflow - Evaluation [100/100]
2018-07-06 12:24:11,237 INFO - tensorflow - Finished evaluation at 2018-07-06-12:24:11
2018-07-06 12:24:11,237 INFO - tensorflow - Saving dict for global step 1: accuracy = 0.73328125, global_step = 1, loss = 1.1924654e-07
2018-07-06 12:24:11,285 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:11,417 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:11,417 INFO - tensorflow - Signatures INCLUDED in export for Classify: None
2018-07-06 12:24:11,417 INFO - tensorflow - Signatures INCLUDED in export for Regress: None
2018-07-06 12:24:11,417 INFO - tensorflow - Signatures INCLUDED in export for Predict: ['serving_default']
2018-07-06 12:24:11,431 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1
2018-07-06 12:24:11,437 INFO - tensorflow - Assets added to graph.
2018-07-06 12:24:11,437 INFO - tensorflow - No assets to write.
2018-07-06 12:24:11,480 INFO - tensorflow - SavedModel written to: /tmp/tmpR6YK7f/export/Servo/temp-1530879851/saved_model.pb
2018-07-06 12:24:11,482 INFO - tensorflow - loss = 1.0430813e-07, step = 1
2018-07-06 12:24:11,717 INFO - tensorflow - global_step/sec: 426.163
2018-07-06 12:24:11,718 INFO - tensorflow - loss = 1.09896064e-07, step = 101 (0.235 sec)
2018-07-06 12:24:11,998 INFO - tensorflow - global_step/sec: 355.469
2018-07-06 12:24:11,998 INFO - tensorflow - loss = 1.0430813e-07, step = 201 (0.281 sec)
2018-07-06 12:24:12,137 INFO - tensorflow - global_step/sec: 718.473
2018-07-06 12:24:12,137 INFO - tensorflow - loss = 1.2107193e-07, step = 301 (0.139 sec)
2018-07-06 12:24:12,273 INFO - tensorflow - global_step/sec: 734.884
2018-07-06 12:24:12,273 INFO - tensorflow - loss = 1.17346644e-07, step = 401 (0.136 sec)
2018-07-06 12:24:12,408 INFO - tensorflow - global_step/sec: 743.461
2018-07-06 12:24:12,408 INFO - tensorflow - loss = 1.238659e-07, step = 501 (0.134 sec)
2018-07-06 12:24:12,538 INFO - tensorflow - global_step/sec: 767.29
2018-07-06 12:24:12,538 INFO - tensorflow - loss = 1.1920929e-07, step = 601 (0.130 sec)
2018-07-06 12:24:12,674 INFO - tensorflow - global_step/sec: 736.052
2018-07-06 12:24:12,674 INFO - tensorflow - loss = 1.07102096e-07, step = 701 (0.136 sec)
2018-07-06 12:24:12,805 INFO - tensorflow - global_step/sec: 763.504
2018-07-06 12:24:12,805 INFO - tensorflow - loss = 1.2852252e-07, step = 801 (0.131 sec)
2018-07-06 12:24:12,938 INFO - tensorflow - global_step/sec: 750.705
2018-07-06 12:24:12,938 INFO - tensorflow - loss = 1.2014061e-07, step = 901 (0.133 sec)
2018-07-06 12:24:13,071 INFO - tensorflow - Saving checkpoints for 1000 into /tmp/tmpR6YK7f/model.ckpt.
2018-07-06 12:24:13,107 INFO - tensorflow - Skip the current checkpoint eval due to throttle secs (600 secs).
2018-07-06 12:24:13,128 INFO - tensorflow - Loss for final step: 1.1641532e-07.
2018-07-06 12:24:13,129 INFO - tensorflow - Training has already ended. But the last eval is skipped due to eval throttle_secs. Now evaluating the final checkpoint.
2018-07-06 12:24:13,144 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:13,230 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:13,246 INFO - tensorflow - Starting evaluation at 2018-07-06-12:24:13
2018-07-06 12:24:13,327 INFO - tensorflow - Graph was finalized.
2018-07-06 12:24:13,327 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1000
2018-07-06 12:24:13,344 INFO - tensorflow - Running local_init_op.
2018-07-06 12:24:13,349 INFO - tensorflow - Done running local_init_op.
2018-07-06 12:24:13,391 INFO - tensorflow - Evaluation [10/100]
2018-07-06 12:24:13,402 INFO - tensorflow - Evaluation [20/100]
2018-07-06 12:24:13,412 INFO - tensorflow - Evaluation [30/100]
2018-07-06 12:24:13,423 INFO - tensorflow - Evaluation [40/100]
2018-07-06 12:24:13,432 INFO - tensorflow - Evaluation [50/100]
2018-07-06 12:24:13,443 INFO - tensorflow - Evaluation [60/100]
2018-07-06 12:24:13,453 INFO - tensorflow - Evaluation [70/100]
2018-07-06 12:24:13,461 INFO - tensorflow - Evaluation [80/100]
2018-07-06 12:24:13,471 INFO - tensorflow - Evaluation [90/100]
2018-07-06 12:24:13,482 INFO - tensorflow - Evaluation [100/100]
2018-07-06 12:24:13,496 INFO - tensorflow - Finished evaluation at 2018-07-06-12:24:13
2018-07-06 12:24:13,496 INFO - tensorflow - Saving dict for global step 1000: accuracy = 0.73390627, global_step = 1000, loss = 1.1910684e-07
2018-07-06 12:24:13,502 INFO - tensorflow - Calling model_fn.
2018-07-06 12:24:13,552 INFO - tensorflow - Done calling model_fn.
2018-07-06 12:24:13,553 INFO - tensorflow - Signatures INCLUDED in export for Classify: None
2018-07-06 12:24:13,553 INFO - tensorflow - Signatures INCLUDED in export for Regress: None
2018-07-06 12:24:13,553 INFO - tensorflow - Signatures INCLUDED in export for Predict: ['serving_default']
2018-07-06 12:24:13,566 INFO - tensorflow - Restoring parameters from /tmp/tmpR6YK7f/model.ckpt-1000
2018-07-06 12:24:13,572 INFO - tensorflow - Assets added to graph.
2018-07-06 12:24:13,572 INFO - tensorflow - No assets to write.
2018-07-06 12:24:13,613 INFO - tensorflow - SavedModel written to: /tmp/tmpR6YK7f/export/Servo/temp-1530879853/saved_model.pb
2018-07-06 12:24:13,617 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1):
2018-07-06 12:24:13,632 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): ....
2018-07-06 12:24:13,679 ERROR - tf_container - Failed to download saved model. File does not exist in s3://XXX/artifacts/sagemaker-tensorflow-2018-07-06-12-21-23-923/checkpoints
2018-07-06 12:24:13,679 ERROR - container_support.training - uncaught exception during training: 'Contents'
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/container_support/training.py", line 36, in start
fw.train()
File "/usr/local/lib/python2.7/dist-packages/tf_container/train_entry_point.py", line 168, in train
serve.export_saved_model(checkpoint_dir, env.model_dir)
File "/usr/local/lib/python2.7/dist-packages/tf_container/serve.py", line 52, in export_saved_model
raise e
KeyError: 'Contents'
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<timed exec> in <module>()
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/tensorflow/estimator.py in fit(self, inputs, wait, logs, job_name, run_tensorboard_locally)
247 tensorboard.join()
248 else:
--> 249 fit_super()
250
251 @classmethod
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/tensorflow/estimator.py in fit_super()
229 """
230 def fit_super():
--> 231 super(TensorFlow, self).fit(inputs, wait, logs, job_name)
232
233 if run_tensorboard_locally and wait is False:
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name)
178 self.latest_training_job = _TrainingJob.start_new(self, inputs)
179 if wait:
--> 180 self.latest_training_job.wait(logs=logs)
181
182 @classmethod
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in wait(self, logs)
367 def wait(self, logs=True):
368 if logs:
--> 369 self.sagemaker_session.logs_for_job(self.job_name, wait=True)
370 else:
371 self.sagemaker_session.wait_for_job(self.job_name)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll)
803
804 if wait:
--> 805 self._check_job_status(job_name, description, 'TrainingJobStatus')
806 if dot:
807 print()
~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
539 if status != 'Completed' and status != 'Stopped':
540 reason = desc.get('FailureReason', '(No reason provided)')
--> 541 raise ValueError('Error training {}: {} Reason: {}'.format(job, status, reason))
542
543 def wait_for_endpoint(self, endpoint, poll=5):
ValueError: Error training sagemaker-tensorflow-2018-07-06-12-21-23-923: Failed Reason: AlgorithmError: uncaught exception during training: 'Contents'
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/container_support/training.py", line 36, in start
fw.train()
File "/usr/local/lib/python2.7/dist-packages/tf_container/train_entry_point.py", line 168, in train
serve.export_saved_model(checkpoint_dir, env.model_dir)
File "/usr/local/lib/python2.7/dist-packages/tf_container/serve.py", line 52, in export_saved_model
raise e
KeyError: 'Contents'
- Exact command to reproduce:
Metadata
Metadata
Assignees
Labels
No labels