diff --git a/designs/md5_checksums_package_deploy.md b/designs/md5_checksums_package_deploy.md
new file mode 100644
index 0000000000..9f0e646be0
--- /dev/null
+++ b/designs/md5_checksums_package_deploy.md
@@ -0,0 +1,143 @@
+Checksum on artifacts for `sam package`
+=======================================
+
+
+What is the problem?
+--------------------
+
+Today, `sam package` goes through the list of packageable paths, looks up the corresponding objects in S3, and compares checksums between the local artifacts and what is already in S3 (if the objects exist). This comparison is prone to failure for `zip` files because the zip file does not preserve the permissions (and other metadata) of the underlying directory that was zipped. As a result, the checksums calculated locally and in S3 differ, and the artifacts are re-uploaded on every deploy even when the source has not changed.
+
+Let's consider the following cases:
+
+NOTE: `sam deploy` attempts to package on deploy.
+
+`sam build` -> `sam deploy` : Results in an upload to S3
+`sam build` -> `sam deploy` (S3 upload) -> `sam deploy` (S3 upload again for the same built artifacts)
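+
+To make the failure mode concrete, here is a minimal, illustrative sketch (not part of the change itself) showing that two zip archives of identical content can hash differently, because zip entries embed per-file metadata such as modification timestamps and permission bits:
+
+```python
+import hashlib
+import os
+import time
+import zipfile
+
+
+def zip_md5(src, dest):
+    # Zip a single file and return the md5 of the resulting archive bytes.
+    with zipfile.ZipFile(dest, "w") as zf:
+        zf.write(src, arcname="app.py")
+    with open(dest, "rb") as f:
+        return hashlib.md5(f.read()).hexdigest()
+
+
+with open("app.py", "w") as f:
+    f.write("print('hello')\n")
+first = zip_md5("app.py", "first.zip")
+
+time.sleep(2)       # zip timestamps have 2-second resolution
+os.utime("app.py")  # same content, new modification time -- similar to re-running `sam build`
+second = zip_md5("app.py", "second.zip")
+
+print(first == second)  # typically False, even though the source did not change
+```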
+
+What will be changed?
+---------------------
+
+Instead of calculating the checksum of the zip file, the checksum is calculated on the directory that is about to be zipped up.
+
+* Symlinks within the directory are followed.
+* Cyclic symlinks cause packaging to fail.
+* Both the names and the contents of the files within the directory are used to calculate the checksum.
+
+What algorithm is used for checksum calculation?
+------------------------------------------------
+
+* `md5`
+
+Caveat: Hash collisions are still possible with `md5`; `sha256` would be more collision-resistant, but the codebase has been using `md5` for a while and switching to `sha256` may cause regressions.
+
+Success criteria for the change
+-------------------------------
+
+* `sam build` -> `sam deploy` -> `sam deploy` (does not result in another deploy)
+* `sam build` -> `sam deploy` -> `sam build` (no changes to source) -> `sam deploy` (does not result in another deploy)
+
+Out-of-Scope
+------------
+
+* This change is a bug fix of the prior implementation; no new functionality is in scope.
+
+User Experience Walkthrough
+---------------------------
+
+Implementation
+==============
+
+CLI Changes
+-----------
+
+- No changes to the CLI parameters themselves.
+
+### Breaking Change
+
+- Users who relied on `sam deploy` always re-uploading and re-deploying, even with no changes to the source, may be affected by this change.
+
+Design
+------
+
+- A new method called `dir_checksum` takes a directory as input and returns an md5 checksum of all the contents within that directory (see the sketch below).
+  * Goes through all subdirectories and files.
+  * Checksums the name and the contents of each file.
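+
+A condensed sketch of the approach follows; the full implementation, including the streaming `file_checksum` helper, is added in `samcli/lib/utils/hash.py` later in this change:
+
+```python
+import hashlib
+import os
+
+
+def dir_checksum(directory, followlinks=True):
+    # Hash both the path of every file and its contents, so that renames
+    # and content edits are both reflected in the resulting digest.
+    md5_dir = hashlib.md5()
+    for dirpath, _, filenames in os.walk(directory, followlinks=followlinks):
+        for filename in filenames:
+            filepath = os.path.join(dirpath, filename)
+            md5_dir.update(filepath.encode("utf-8"))
+            with open(filepath, "rb") as f:
+                md5_dir.update(hashlib.md5(f.read()).hexdigest().encode("utf-8"))
+    return md5_dir.hexdigest()
+```
+
+Because the digest depends only on file names and contents (not on zip metadata), rebuilding unchanged source reproduces the same checksum, and `upload_with_dedup` can skip the upload.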
+
+`samconfig.toml` Changes
+------------------------
+
+None
+
+Security
+--------
+
+**What new dependencies (libraries/cli) does this change require?**
+
+N/A
+
+**What other Docker container images are you using?**
+
+N/A
+
+**Are you creating a new HTTP endpoint? If so explain how it will be
+created & used**
+
+N/A
+
+**Are you connecting to a remote API? If so explain how is this
+connection secured**
+
+N/A
+
+**Are you reading/writing to a temporary folder? If so, what is this
+used for and when do you clean up?**
+
+No temporary folders are read, but the contents of each file within the directory being packaged are read in order to determine the md5 checksum.
+
+**How do you validate new .samrc configuration?**
+
+N/A
+
+What is your Testing Plan (QA)?
+===============================
+
+Goal
+----
+
+* Integration and unit tests pass
+
+Pre-requisites
+--------------
+
+N/A
+
+Test Scenarios/Cases
+--------------------
+* Build and deploy an application, then rebuild and attempt to deploy it again. The second deploy should not trigger another upload or deployment (sketched below).
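+
+Roughly, this is the scenario the new integration test drives (the commands, stack name, and bucket name below are illustrative placeholders; the real test is added in `tests/integration/deploy/test_deploy_command.py`):
+
+```python
+import subprocess
+
+build = ["sam", "build", "--template-file", "template.yaml"]
+deploy = ["sam", "deploy", "--stack-name", "checksum-demo", "--s3-bucket", "my-bucket",
+          "--capabilities", "CAPABILITY_IAM", "--no-confirm-changeset"]
+
+subprocess.run(build, check=True)
+subprocess.run(deploy, check=True)  # first deploy: artifacts are uploaded to S3
+
+subprocess.run(build, check=True)   # rebuild with no source changes
+second = subprocess.run(deploy)     # second deploy: nothing new to upload
+
+# With stable directory checksums the second deploy produces an empty changeset
+# and exits non-zero instead of re-uploading the same artifacts.
+assert second.returncode != 0
+```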
+
+Expected Results
+----------------
+* Scenario tests are successful
+
+
+Documentation Changes
+=====================
+
+* None: this fixes an underlying bug, and the documentation does not describe the current behavior as an issue today.
+
+Open Issues
+============
+
+* https://github.com/awslabs/aws-sam-cli/issues/1779
+
+Task Breakdown
+==============
+
+- \[x\] Send a Pull Request with this design document
+- \[ \] Build the command line interface
+- \[ \] Build the underlying library
+- \[ \] Unit tests
+- \[ \] Functional Tests
+- \[ \] Integration tests
+- \[ \] Run all tests on Windows
+- \[ \] Update documentation
diff --git a/samcli/lib/package/artifact_exporter.py b/samcli/lib/package/artifact_exporter.py
index 023f87891a..6a60920f3d 100644
--- a/samcli/lib/package/artifact_exporter.py
+++ b/samcli/lib/package/artifact_exporter.py
@@ -47,6 +47,7 @@
 
 from samcli.commands._utils.template import METADATA_WITH_LOCAL_PATHS, RESOURCES_WITH_LOCAL_PATHS
 from samcli.commands.package import exceptions
+from samcli.lib.utils.hash import dir_checksum
 from samcli.yamlhelper import yaml_dump, yaml_parse
 
 
@@ -165,8 +166,8 @@ def resource_not_packageable(resource_dict):
 
 
 def zip_and_upload(local_path, uploader):
-    with zip_folder(local_path) as zip_file:
-        return uploader.upload_with_dedup(zip_file)
+    with zip_folder(local_path) as (zip_file, md5_hash):
+        return uploader.upload_with_dedup(zip_file, precomputed_md5=md5_hash)
 
 
 @contextmanager
@@ -178,11 +179,12 @@ def zip_folder(folder_path):
 
     :param folder_path:
     :return: Name of the zipfile
     """
-    filename = os.path.join(tempfile.gettempdir(), "data-" + uuid.uuid4().hex)
+    md5hash = dir_checksum(folder_path, followlinks=True)
+    filename = os.path.join(tempfile.gettempdir(), "data-" + md5hash)
     zipfile_name = make_zip(filename, folder_path)
     try:
-        yield zipfile_name
+        yield zipfile_name, md5hash
     finally:
         if os.path.exists(zipfile_name):
             os.remove(zipfile_name)
diff --git a/samcli/lib/package/s3_uploader.py b/samcli/lib/package/s3_uploader.py
index 3e9e167ced..82e85af719 100644
--- a/samcli/lib/package/s3_uploader.py
+++ b/samcli/lib/package/s3_uploader.py
@@ -28,6 +28,7 @@
 from boto3.s3 import transfer
 
 from samcli.commands.package.exceptions import NoSuchBucketError, BucketNotSpecifiedError
+from samcli.lib.utils.hash import file_checksum
 
 
 LOG = logging.getLogger(__name__)
@@ -106,12 +107,13 @@ def upload(self, file_name, remote_path):
                 raise NoSuchBucketError(bucket_name=self.bucket_name)
             raise ex
 
-    def upload_with_dedup(self, file_name, extension=None):
+    def upload_with_dedup(self, file_name, extension=None, precomputed_md5=None):
         """
         Makes and returns name of the S3 object based on the file's MD5 sum
 
         :param file_name: file to upload
         :param extension: String of file extension to append to the object
+        :param precomputed_md5: Specified md5 hash for the file to be uploaded.
         :return: S3 URL of the uploaded object
         """
 
@@ -119,7 +121,7 @@ def upload_with_dedup(self, file_name, extension=None):
         # uploads of same object. Uploader will check if the file exists in S3
         # and re-upload only if necessary. So the template points to same file
         # in multiple places, this will upload only once
-        filemd5 = self.file_checksum(file_name)
+        filemd5 = precomputed_md5 or file_checksum(file_name)
         remote_path = filemd5
         if extension:
             remote_path = remote_path + "." + extension
@@ -150,27 +152,6 @@ def make_url(self, obj_path):
             raise BucketNotSpecifiedError()
         return "s3://{0}/{1}".format(self.bucket_name, obj_path)
 
-    def file_checksum(self, file_name):
-
-        with open(file_name, "rb") as file_handle:
-            md5 = hashlib.md5()
-            # Read file in chunks of 4096 bytes
-            block_size = 4096
-
-            # Save current cursor position and reset cursor to start of file
-            curpos = file_handle.tell()
-            file_handle.seek(0)
-
-            buf = file_handle.read(block_size)
-            while buf:
-                md5.update(buf)
-                buf = file_handle.read(block_size)
-
-            # Restore file cursor's position
-            file_handle.seek(curpos)
-
-            return md5.hexdigest()
-
     def to_path_style_s3_url(self, key, version=None):
         """
         This link describes the format of Path Style URLs
diff --git a/samcli/lib/utils/hash.py b/samcli/lib/utils/hash.py
new file mode 100644
index 0000000000..d9a5f3a476
--- /dev/null
+++ b/samcli/lib/utils/hash.py
@@ -0,0 +1,63 @@
+"""
+Hash calculation utilities for files and directories.
+"""
+import os
+import hashlib
+
+BLOCK_SIZE = 4096
+
+
+def file_checksum(file_name):
+    """
+
+    Parameters
+    ----------
+    file_name: name of the file for which the md5 checksum is required.
+
+    Returns
+    -------
+    md5 checksum of the given file.
+
+    """
+    with open(file_name, "rb") as file_handle:
+        md5 = hashlib.md5()
+
+        # Save current cursor position and reset cursor to start of file
+        curpos = file_handle.tell()
+        file_handle.seek(0)
+
+        buf = file_handle.read(BLOCK_SIZE)
+        while buf:
+            md5.update(buf)
+            buf = file_handle.read(BLOCK_SIZE)
+
+        # Restore file cursor's position
+        file_handle.seek(curpos)
+
+        return md5.hexdigest()
+
+
+def dir_checksum(directory, followlinks=True):
+    """
+
+    Parameters
+    ----------
+    directory: absolute path of the directory to checksum.
+    followlinks: follow symbolic links while walking the given directory.
+
+    Returns
+    -------
+    md5 checksum of the directory.
+
+    """
+    md5_dir = hashlib.md5()
+    # Walk through given directory and find all directories and files.
+    for dirpath, _, filenames in os.walk(directory, followlinks=followlinks):
+        # Go through every file in the directory and sub-directory.
+        for filepath in [os.path.join(dirpath, filename) for filename in filenames]:
+            # Look at the file name and the contents.
+            # Encode the file's checksum as utf-8 bytes.
+            md5_dir.update(filepath.encode("utf-8"))
+            filepath_checksum = file_checksum(filepath)
+            md5_dir.update(filepath_checksum.encode("utf-8"))
+    return md5_dir.hexdigest()
diff --git a/tests/integration/deploy/deploy_integ_base.py b/tests/integration/deploy/deploy_integ_base.py
index c191ab7a27..e89493f59b 100644
--- a/tests/integration/deploy/deploy_integ_base.py
+++ b/tests/integration/deploy/deploy_integ_base.py
@@ -1,12 +1,6 @@
 import os
-import uuid
-import json
-import time
-from pathlib import Path
 from unittest import TestCase
-import boto3
-
 
 
 class DeployIntegBase(TestCase):
     @classmethod
@@ -91,3 +85,13 @@ def get_deploy_command_list(
             command_list = command_list + ["--profile", str(profile)]
 
         return command_list
+
+    def get_minimal_build_command_list(self, template_file=None, build_dir=None):
+        command_list = [self.base_command(), "build"]
+
+        if template_file:
+            command_list = command_list + ["--template-file", str(template_file)]
+        if build_dir:
+            command_list = command_list + ["--build-dir", str(build_dir)]
+
+        return command_list
diff --git a/tests/integration/deploy/test_deploy_command.py b/tests/integration/deploy/test_deploy_command.py
index ce0cce70b2..950a135075 100644
--- a/tests/integration/deploy/test_deploy_command.py
+++ b/tests/integration/deploy/test_deploy_command.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 import tempfile
 import uuid
 import time
@@ -31,6 +32,7 @@ def setUp(self):
         super(TestDeploy, self).setUp()
 
     def tearDown(self):
+        shutil.rmtree(os.path.join(os.getcwd(), ".aws-sam", "build"), ignore_errors=True)
         for stack_name in self.stack_names:
             self.cf_client.delete_stack(StackName=stack_name)
         super(TestDeploy, self).tearDown()
@@ -131,6 +133,66 @@ def test_no_package_and_deploy_with_s3_bucket_all_args(self, template_file):
             raise
         self.assertEqual(deploy_process_execute.returncode, 0)
 
+    @parameterized.expand(["aws-serverless-function.yaml"])
+    def test_deploy_no_redeploy_on_same_built_artifacts(self, template_file):
+        template_path = self.test_data_path.joinpath(template_file)
+        # Build project
+        build_command_list = self.get_minimal_build_command_list(template_file=template_path)
+
+        build_process = Popen(build_command_list, stdout=PIPE)
+        try:
+            build_process.communicate(timeout=TIMEOUT)
+        except TimeoutExpired:
+            build_process.kill()
+            raise
+
+        stack_name = "a" + str(uuid.uuid4()).replace("-", "")[:10]
+        self.stack_names.append(stack_name)
+
+        # Package and Deploy in one go without confirming change set on a built template.
+        # Should result in a zero exit code.
+        deploy_command_list = self.get_deploy_command_list(
+            stack_name=stack_name,
+            capabilities="CAPABILITY_IAM",
+            s3_prefix="integ_deploy",
+            s3_bucket=self.s3_bucket.name,
+            force_upload=True,
+            notification_arns=self.sns_arn,
+            parameter_overrides="Parameter=Clarity",
+            kms_key_id=self.kms_key,
+            no_execute_changeset=False,
+            tags="integ=true clarity=yes",
+            confirm_changeset=False,
+        )
+
+        deploy_process_execute = Popen(deploy_command_list, stdout=PIPE)
+        try:
+            deploy_process_execute.communicate(timeout=TIMEOUT)
+        except TimeoutExpired:
+            deploy_process_execute.kill()
+            raise
+        self.assertEqual(deploy_process_execute.returncode, 0)
+        # ReBuild project, absolutely nothing has changed, will result in same build artifacts.
+
+        build_process = Popen(build_command_list, stdout=PIPE)
+        try:
+            build_process.communicate(timeout=TIMEOUT)
+        except TimeoutExpired:
+            build_process.kill()
+            raise
+
+        # Re-deploy, this should cause an empty changeset error and not re-deploy.
+        # This will cause a non zero exit code.
+
+        deploy_process_execute = Popen(deploy_command_list, stdout=PIPE)
+        try:
+            deploy_process_execute.communicate(timeout=TIMEOUT)
+        except TimeoutExpired:
+            deploy_process_execute.kill()
+            raise
+        # Does not cause a re-deploy
+        self.assertEqual(deploy_process_execute.returncode, 1)
+
     @parameterized.expand(["aws-serverless-function.yaml"])
     def test_no_package_and_deploy_with_s3_bucket_all_args_confirm_changeset(self, template_file):
         template_path = self.test_data_path.joinpath(template_file)
diff --git a/tests/integration/testdata/package/main.py b/tests/integration/testdata/package/main.py
new file mode 100644
index 0000000000..2c3d5b3503
--- /dev/null
+++ b/tests/integration/testdata/package/main.py
@@ -0,0 +1,4 @@
+import json
+
+def handler(event, context):
+    return {"statusCode": 200, "body": json.dumps({"hello": "world"})}
diff --git a/tests/integration/testdata/package/requirements.txt b/tests/integration/testdata/package/requirements.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/unit/lib/package/test_artifact_exporter.py b/tests/unit/lib/package/test_artifact_exporter.py
index 76814630a4..b7b8f35f3f 100644
--- a/tests/unit/lib/package/test_artifact_exporter.py
+++ b/tests/unit/lib/package/test_artifact_exporter.py
@@ -342,7 +342,7 @@ def test_zip_folder(self, make_zip_mock):
 
         with self.make_temp_dir() as dirname:
             with zip_folder(dirname) as actual_zip_file_name:
-                self.assertEqual(actual_zip_file_name, zip_file_name)
+                self.assertEqual(actual_zip_file_name, (zip_file_name, mock.ANY))
 
         make_zip_mock.assert_called_once_with(mock.ANY, dirname)
diff --git a/tests/unit/lib/package/test_s3_uploader.py b/tests/unit/lib/package/test_s3_uploader.py
index 496d9db08c..7f9adddb4e 100644
--- a/tests/unit/lib/package/test_s3_uploader.py
+++ b/tests/unit/lib/package/test_s3_uploader.py
@@ -9,6 +9,7 @@
 
 from samcli.commands.package.exceptions import NoSuchBucketError, BucketNotSpecifiedError
 from samcli.lib.package.s3_uploader import S3Uploader
+from samcli.lib.utils.hash import file_checksum
 
 
 class TestS3Uploader(TestCase):
@@ -114,7 +115,7 @@ def test_file_checksum(self):
         with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
             f.write(b"Hello World!")
             f.seek(0)
-            self.assertEqual("ed076287532e86365e841e92bfc50d8c", s3_uploader.file_checksum(f.name))
+            self.assertEqual("ed076287532e86365e841e92bfc50d8c", file_checksum(f.name))
 
     def test_path_style_s3_url(self):
         s3_uploader = S3Uploader(
@@ -171,5 +172,5 @@ def test_s3_upload_with_dedup(self):
         with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
             s3_url = s3_uploader.upload_with_dedup(f.name, "zip")
             self.assertEqual(
-                s3_url, "s3://{0}/{1}/{2}.zip".format(self.bucket_name, self.prefix, s3_uploader.file_checksum(f.name))
+                s3_url, "s3://{0}/{1}/{2}.zip".format(self.bucket_name, self.prefix, file_checksum(f.name))
             )
diff --git a/tests/unit/lib/utils/test_hash.py b/tests/unit/lib/utils/test_hash.py
new file mode 100644
index 0000000000..2027a411b5
--- /dev/null
+++ b/tests/unit/lib/utils/test_hash.py
@@ -0,0 +1,37 @@
+import os
+import shutil
+import tempfile
+from unittest import TestCase
+
+from samcli.lib.utils.hash import dir_checksum
+
+
+class TestHash(TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_dir_hash_same_contents_diff_file_per_directory(self):
+        _file = tempfile.NamedTemporaryFile(delete=False, dir=self.temp_dir)
+        _file.write(b"Testfile")
+        _file.close()
+        checksum_before = dir_checksum(os.path.dirname(_file.name))
+        shutil.move(os.path.abspath(_file.name), os.path.join(os.path.dirname(_file.name), "different_name"))
+        checksum_after = dir_checksum(os.path.dirname(_file.name))
+        self.assertNotEqual(checksum_before, checksum_after)
+
+    def test_dir_cyclic_links(self):
+        _file = tempfile.NamedTemporaryFile(delete=False, dir=self.temp_dir)
+        _file.write(b"Testfile")
+        _file.close()
+        os.symlink(os.path.abspath(_file.name), os.path.join(os.path.dirname(_file.name), "symlink"))
+        os.symlink(
+            os.path.join(os.path.dirname(_file.name), "symlink"), os.path.join(os.path.dirname(_file.name), "symlink2")
+        )
+        os.unlink(os.path.abspath(_file.name))
+        os.symlink(os.path.join(os.path.dirname(_file.name), "symlink2"), os.path.abspath(_file.name))
+        with self.assertRaises(OSError) as ex:
+            dir_checksum(os.path.dirname(_file.name))
+        self.assertIn("Too many levels of symbolic links", str(ex.exception))