-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Closed
Description
Describe the bug
When you create Pytorch's estimator and train the model and then use pytorch_estimator.model_data it outputs
s3://bucket-name/pytorch-training-2021-11-16-22-53-00-748/output/model.tar.gz
However, when you actually check inside the S3 bucket, the file is output.tar.gz, not model.tar.gz!
And I believe because of this reason when I try to deploy using the same estimator
pytorch_estimator.deploy(instance_type='ml.m5.xlarge',
initial_instance_count=1) I get
ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
Renaming output.tar.gz to model.tar.gz in the S3 bucket fixes the problem.
To reproduce
hyperparameters = {
"epochs": 10,
"batch-size": 64,
"embedding-dim": 125,
"hidden-dim": 2
}
estimator_config = {
"entry_point": "train_script.py",
"source_dir": "scripts", # we provide source_dir in order to install torchtext!!
"framework_version": "1.9",
"py_version": "py38",
"instance_type": "ml.m5.xlarge",
"instance_count": 1,
"role": sagemaker.get_execution_role(),
"output_path": f"s3://{bucket_name}", # if this is not specified then SM will create new bucket to store artifacts
"hyperparameters": hyperparameters,
}
pytorch_estimator = PyTorch(**estimator_config)
# https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html
data_channels = {"train": f"s3://{bucket_name}/data/"}
pytorch_estimator.fit(data_channels)
pytorch_estimator.model_data # outputs -> s3://bucket-name/pytorch-training-2021-11-16-22-53-00-748/output/model.tar.gz
pytorch_estimator.deploy(instance_type='ml.m5.xlarge',
initial_instance_count=1)
# ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
Expected behavior
The output of pytorch_estimator.model_data should contain the actual, correct name of the tar file, which is output.tar.gz, and the endpoint should be deployed without a ClientError.
Screenshots or logs
ClientError Traceback (most recent call last)
<ipython-input-57-5ceec0733f49> in <module>
1 pytorch_estimator.deploy(instance_type='ml.m5.xlarge',
----> 2 initial_instance_count=1)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, use_compiled_model, wait, model_name, kms_key, data_capture_config, tags, **kwargs)
963 wait=wait,
964 kms_key=kms_key,
--> 965 data_capture_config=data_capture_config,
966 )
967
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/model.py in deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, **kwargs)
709 self._base_name = "-".join((self._base_name, compiled_model_suffix))
710
--> 711 self._create_sagemaker_model(instance_type, accelerator_type, tags)
712 production_variant = sagemaker.production_variant(
713 self.name, instance_type, initial_instance_count, accelerator_type=accelerator_type
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/model.py in _create_sagemaker_model(self, instance_type, accelerator_type, tags)
263 /api/latest/reference/services/sagemaker.html#SageMaker.Client.add_tags
264 """
--> 265 container_def = self.prepare_container_def(instance_type, accelerator_type=accelerator_type)
266
267 self._ensure_base_name_if_needed(container_def["Image"])
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/pytorch/model.py in prepare_container_def(self, instance_type, accelerator_type)
237
238 deploy_key_prefix = model_code_key_prefix(self.key_prefix, self.name, deploy_image)
--> 239 self._upload_code(deploy_key_prefix, repack=self._is_mms_version())
240 deploy_env = dict(self.env)
241 deploy_env.update(self._framework_env_vars())
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/model.py in _upload_code(self, key_prefix, repack)
1088 repacked_model_uri=repacked_model_data,
1089 sagemaker_session=self.sagemaker_session,
-> 1090 kms_key=self.model_kms_key,
1091 )
1092
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in repack_model(inference_script, source_directory, dependencies, model_uri, repacked_model_uri, sagemaker_session, kms_key)
410
411 with _tmpdir() as tmp:
--> 412 model_dir = _extract_model(model_uri, sagemaker_session, tmp)
413
414 _create_or_update_code_dir(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in _extract_model(model_uri, sagemaker_session, tmp)
484 if model_uri.lower().startswith("s3://"):
485 local_model_path = os.path.join(tmp, "tar_file")
--> 486 download_file_from_url(model_uri, local_model_path, sagemaker_session)
487 else:
488 local_model_path = model_uri.replace("file://", "")
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in download_file_from_url(url, dst, sagemaker_session)
497 bucket, key = url.netloc, url.path.lstrip("/")
498
--> 499 download_file(bucket, key, dst, sagemaker_session)
500
501
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in download_file(bucket_name, path, target, sagemaker_session)
515 s3 = boto_session.resource("s3", region_name=sagemaker_session.boto_region_name)
516 bucket = s3.Bucket(bucket_name)
--> 517 bucket.download_file(path, target)
518
519
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/boto3/s3/inject.py in bucket_download_file(self, Key, Filename, ExtraArgs, Callback, Config)
245 return self.meta.client.download_file(
246 Bucket=self.name, Key=Key, Filename=Filename,
--> 247 ExtraArgs=ExtraArgs, Callback=Callback, Config=Config)
248
249
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/boto3/s3/inject.py in download_file(self, Bucket, Key, Filename, ExtraArgs, Callback, Config)
171 return transfer.download_file(
172 bucket=Bucket, key=Key, filename=Filename,
--> 173 extra_args=ExtraArgs, callback=Callback)
174
175
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/boto3/s3/transfer.py in download_file(self, bucket, key, filename, extra_args, callback)
305 bucket, key, filename, extra_args, subscribers)
306 try:
--> 307 future.result()
308 # This is for backwards compatibility where when retries are
309 # exceeded we need to throw the same error from boto3 instead of
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/futures.py in result(self)
104 # however if a KeyboardInterrupt is raised we want want to exit
105 # out of this and propogate the exception.
--> 106 return self._coordinator.result()
107 except KeyboardInterrupt as e:
108 self.cancel()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/futures.py in result(self)
263 # final result.
264 if self._exception:
--> 265 raise self._exception
266 return self._result
267
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/tasks.py in _main(self, transfer_future, **kwargs)
253 # Call the submit method to start submitting tasks to execute the
254 # transfer.
--> 255 self._submit(transfer_future=transfer_future, **kwargs)
256 except BaseException as e:
257 # If there was an exception raised during the submission of task
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/download.py in _submit(self, client, config, osutil, request_executor, io_executor, transfer_future, bandwidth_limiter)
341 Bucket=transfer_future.meta.call_args.bucket,
342 Key=transfer_future.meta.call_args.key,
--> 343 **transfer_future.meta.call_args.extra_args
344 )
345 transfer_future.meta.provide_transfer_size(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
386 "%s() only accepts keyword arguments." % py_operation_name)
387 # The "self" in this scope is referring to the BaseClient.
--> 388 return self._make_api_call(operation_name, kwargs)
389
390 _api_call.__name__ = str(py_operation_name)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
706 error_code = parsed_response.get("Error", {}).get("Code")
707 error_class = self.exceptions.from_code(error_code)
--> 708 raise error_class(parsed_response, operation_name)
709 else:
710 return parsed_response
ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
System information
A description of your system. Please provide:
- SageMaker Python SDK version: 2.68.0
- Framework name (eg. PyTorch) or algorithm (eg. KMeans): Pytorch
- Framework version: 1.9
- Python version: Py38
- CPU or GPU: CPU
- Custom Docker image (Y/N): N
I believe it might be related to this issue #1354
Thanks a ton
Best
CC @nadiaya