diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 22744f7..0000000 --- a/.coveragerc +++ /dev/null @@ -1,4 +0,0 @@ -[run] -source = collectfast -[report] -omit = */tests/* diff --git a/collectfast/boto.py b/collectfast/boto.py deleted file mode 100644 index d1f6552..0000000 --- a/collectfast/boto.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import Any - -from django.core.exceptions import ImproperlyConfigured - -from . import settings - -has_boto = True # type: bool -has_boto3 = True # type: bool - -try: - from storages.backends.s3boto import S3BotoStorage -except (ImportError, ImproperlyConfigured): - has_boto = False - -try: - from storages.backends.s3boto3 import S3Boto3Storage -except (ImportError, ImproperlyConfigured): - has_boto3 = False - - -def is_boto3(storage): - # type: (Any) -> bool - return has_boto3 and isinstance(storage, S3Boto3Storage) - - -def is_boto(storage): - # type: (Any) -> bool - return has_boto and isinstance(storage, S3BotoStorage) - - -def is_any_boto(storage): - # type: (Any) -> bool - return is_boto(storage) or is_boto3(storage) - - -def reset_connection(storage): - # type: (Any) -> None - """ - Reset connection if thread pooling is enabled and storage is boto3. - """ - if settings.threads and is_boto3(storage): - storage._connection = None diff --git a/collectfast/etag.py b/collectfast/etag.py deleted file mode 100644 index 80651e0..0000000 --- a/collectfast/etag.py +++ /dev/null @@ -1,117 +0,0 @@ -import gzip -import hashlib -import logging -import mimetypes -from functools import lru_cache -from io import BytesIO -from typing import Optional - -from django.core.cache import caches -from django.core.files.storage import Storage -from django.utils.encoding import force_bytes -from storages.utils import safe_join - -from collectfast import settings - -cache = caches[settings.cache] -logger = logging.getLogger(__name__) - - -@lru_cache() -def get_cache_key(path): - # type: (str) -> str - """ - Create a cache key by concatenating the prefix with a hash of the path. - """ - path_hash = hashlib.md5(path.encode()).hexdigest() - return settings.cache_key_prefix + path_hash - - -def get_remote_etag(storage, prefixed_path): - # type: (Storage, str) -> Optional[str] - """ - Get etag of path from S3 using boto or boto3. - """ - normalized_path = safe_join( - storage.location, # type: ignore - prefixed_path, - ).replace("\\", "/") - try: - return storage.bucket.get_key(normalized_path).etag # type: ignore - except AttributeError: - pass - try: - return storage.bucket.Object(normalized_path).e_tag # type: ignore - except: - pass - return None - - -def get_etag(storage, path, prefixed_path): - # type: (Storage, str, str) -> str - """ - Get etag of path from cache or S3 - in that order. - """ - cache_key = get_cache_key(path) - etag = cache.get(cache_key, False) - if etag is False: - etag = get_remote_etag(storage, prefixed_path) - cache.set(cache_key, etag) - return etag - - -def destroy_etag(path): - # type: (str) -> None - """ - Clear etag of path from cache. - """ - cache.delete(get_cache_key(path)) - - -def get_file_hash(storage, path): - # type: (Storage, str) -> str - """ - Create md5 hash from file contents. - """ - contents = storage.open(path).read() - file_hash = hashlib.md5(contents).hexdigest() - - # Check if content should be gzipped and hash gzipped content - content_type = mimetypes.guess_type(path)[0] or "application/octet-stream" - if settings.is_gzipped and content_type in settings.gzip_content_types: - cache_key = get_cache_key("gzip_hash_%s" % file_hash) - file_hash = cache.get(cache_key, False) - if file_hash is False: - buffer = BytesIO() - zf = gzip.GzipFile(mode="wb", compresslevel=6, fileobj=buffer, mtime=0.0) - zf.write(force_bytes(contents)) - zf.close() - file_hash = hashlib.md5(buffer.getvalue()).hexdigest() - cache.set(cache_key, file_hash) - - return '"%s"' % file_hash - - -def has_matching_etag(remote_storage, source_storage, path, prefixed_path): - # type: (Storage, Storage, str, str) -> bool - """ - Compare etag of path in source storage with remote. - """ - storage_etag = get_etag(remote_storage, path, prefixed_path) - local_etag = get_file_hash(source_storage, path) - return storage_etag == local_etag - - -def should_copy_file(remote_storage, path, prefixed_path, source_storage): - # type: (Storage, str, str, Storage) -> bool - """ - Returns True if the file should be copied, otherwise False. - """ - if has_matching_etag(remote_storage, source_storage, path, prefixed_path): - logger.info("%s: Skipping based on matching file hashes" % path) - return False - - # Invalidate cached versions of lookup before copy - destroy_etag(path) - logger.info("%s: Hashes did not match" % path) - return True diff --git a/collectfast/management/commands/collectstatic.py b/collectfast/management/commands/collectstatic.py index abb4ad1..98660b9 100644 --- a/collectfast/management/commands/collectstatic.py +++ b/collectfast/management/commands/collectstatic.py @@ -9,24 +9,30 @@ from django.utils.encoding import smart_str from collectfast import settings -from collectfast.boto import is_any_boto -from collectfast.boto import reset_connection -from collectfast.etag import should_copy_file +from collectfast.strategies import guess_strategy +from collectfast.strategies import load_strategy Task = Tuple[str, str, Storage] +if settings.strategy is not None: + strategy = load_strategy(settings.strategy) +elif settings.staticfiles_storage is not None: + warnings.warn( + "Falling back to guessing strategy for backwards compatibility. This " + "is deprecated and will be removed in a future release. Explicitly " + "set COLLECTFAST_STRATEGY to silence this warning." + ) + strategy = load_strategy(guess_strategy(settings.staticfiles_storage)) +else: + raise RuntimeError( + "No strategy configured, please make sure COLLECTFAST_STRATEGY is set." + ) + class Command(collectstatic.Command): def add_arguments(self, parser): # type: (CommandParser) -> None - super(Command, self).add_arguments(parser) - parser.add_argument( - "--ignore-etag", - action="store_true", - dest="ignore_etag", - default=False, - help="Deprecated since 0.5.0, use --disable-collectfast instead.", - ) + super().add_arguments(parser) parser.add_argument( "--disable-collectfast", action="store_true", @@ -36,65 +42,45 @@ def add_arguments(self, parser): ) def __init__(self, *args, **kwargs): - super(Command, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.num_copied_files = 0 self.tasks = [] # type: List[Task] self.collectfast_enabled = settings.enabled - - if is_any_boto(self.storage) and self.storage.preload_metadata is not True: - self.storage.preload_metadata = True - warnings.warn( - "Collectfast does not work properly without `preload_metadata` " - "set to `True` on the storage class. Overriding " - "`storage.preload_metadata` and continuing." - ) + self.strategy = strategy(self.storage) def set_options(self, **options): - """ - Set options and handle deprecation. - """ - ignore_etag = options.pop("ignore_etag", False) + """Set options and handle deprecation.""" disable = options.pop("disable_collectfast", False) - if ignore_etag: - warnings.warn( - "--ignore-etag is deprecated since 0.5.0, use " - "--disable-collectfast instead." - ) - if ignore_etag or disable: - self.collectfast_enabled = False - super(Command, self).set_options(**options) + self.collectfast_enabled = not disable + super().set_options(**options) def collect(self): """ Override collect to copy files concurrently. The tasks are populated by Command.copy_file() which is called by super().collect(). """ - ret = super(Command, self).collect() + ret = super().collect() if settings.threads: Pool(settings.threads).map(self.do_copy_file, self.tasks) return ret def handle(self, **options): - """ - Override handle to supress summary output - """ - super(Command, self).handle(**options) + """Override handle to suppress summary output.""" + super().handle(**options) return "{} static file{} copied.".format( self.num_copied_files, "" if self.num_copied_files == 1 else "s" ) def do_copy_file(self, args): - """ - Determine if file should be copied or not and handle exceptions. - """ + """Determine if file should be copied or not and handle exceptions.""" path, prefixed_path, source_storage = args - reset_connection(self.storage) + self.strategy.pre_should_copy_hook() if self.collectfast_enabled and not self.dry_run: try: - if not should_copy_file( - self.storage, path, prefixed_path, source_storage + if not self.strategy.should_copy_file( + path, prefixed_path, source_storage ): return False except Exception as e: @@ -109,12 +95,12 @@ def do_copy_file(self, args): ) self.num_copied_files += 1 - return super(Command, self).copy_file(path, prefixed_path, source_storage) + return super().copy_file(path, prefixed_path, source_storage) def copy_file(self, path, prefixed_path, source_storage): # type: (str, str, Storage) -> None """ - Appends path to task queue if threads are enabled, otherwise copies the + Append path to task queue if threads are enabled, otherwise copy the file with a blocking call. """ args = (path, prefixed_path, source_storage) @@ -125,15 +111,11 @@ def copy_file(self, path, prefixed_path, source_storage): def delete_file(self, path, prefixed_path, source_storage): # type: (str, str, Storage) -> bool - """ - Override delete_file to skip modified time and exists lookups. - """ + """Override delete_file to skip modified time and exists lookups.""" if not self.collectfast_enabled: # The delete_file method is incorrectly annotated in django-stubs, # see https://github.com/typeddjango/django-stubs/issues/130 - return super(Command, self).delete_file( # type: ignore - path, prefixed_path, source_storage - ) + return super().delete_file(path, prefixed_path, source_storage) if not self.dry_run: self.log("Deleting '%s'" % path) self.storage.delete(prefixed_path) diff --git a/collectfast/settings.py b/collectfast/settings.py index 64351f5..4298f6f 100644 --- a/collectfast/settings.py +++ b/collectfast/settings.py @@ -1,13 +1,22 @@ +from typing import Optional +from typing import Sequence +from typing import Type +from typing import Union + from django.conf import settings +from typing_extensions import Final + -debug = getattr(settings, "COLLECTFAST_DEBUG", getattr(settings, "DEBUG", False)) +debug = getattr( + settings, "COLLECTFAST_DEBUG", getattr(settings, "DEBUG", False) +) # type: Final[bool] cache_key_prefix = getattr( settings, "COLLECTFAST_CACHE_KEY_PREFIX", "collectfast05_asset_" -) -cache = getattr(settings, "COLLECTFAST_CACHE", "default") -threads = getattr(settings, "COLLECTFAST_THREADS", False) -enabled = getattr(settings, "COLLECTFAST_ENABLED", True) -is_gzipped = getattr(settings, "AWS_IS_GZIPPED", False) +) # type: Final[str] +cache = getattr(settings, "COLLECTFAST_CACHE", "default") # type: Final[str] +threads = getattr(settings, "COLLECTFAST_THREADS", False) # type: Final[bool] +enabled = getattr(settings, "COLLECTFAST_ENABLED", True) # type: Final[bool] +aws_is_gzipped = getattr(settings, "AWS_IS_GZIPPED", False) # type: Final[bool] gzip_content_types = getattr( settings, "GZIP_CONTENT_TYPES", @@ -18,4 +27,10 @@ "application/x-javascript", "image/svg+xml", ), -) +) # type: Final[Sequence[str]] +staticfiles_storage = getattr( + settings, "STATICFILES_STORAGE", None +) # type: Final[Optional[str]] +strategy = getattr( + settings, "COLLECTFAST_STRATEGY", None +) # type: Final[Optional[Union[str, Type]]] diff --git a/collectfast/strategies/__init__.py b/collectfast/strategies/__init__.py new file mode 100644 index 0000000..a5ed028 --- /dev/null +++ b/collectfast/strategies/__init__.py @@ -0,0 +1,4 @@ +from .base import guess_strategy +from .base import load_strategy + +__all__ = ("load_strategy", "guess_strategy") diff --git a/collectfast/strategies/base.py b/collectfast/strategies/base.py new file mode 100644 index 0000000..1c6e173 --- /dev/null +++ b/collectfast/strategies/base.py @@ -0,0 +1,144 @@ +import abc +import gzip +import hashlib +import logging +import mimetypes +from functools import lru_cache +from io import BytesIO +from pydoc import locate +from typing import Generic +from typing import Optional +from typing import Type +from typing import TypeVar +from typing import Union + +from django.core.cache import caches +from django.core.files.storage import Storage +from django.utils.encoding import force_bytes +from typing_extensions import ClassVar + +from collectfast import settings + + +_RemoteStorage = TypeVar("_RemoteStorage", bound=Storage) + + +cache = caches[settings.cache] +logger = logging.getLogger(__name__) + + +class Strategy(abc.ABC, Generic[_RemoteStorage]): + def __init__(self, remote_storage): + # type: (_RemoteStorage) -> None + self.remote_storage = remote_storage + + @abc.abstractmethod + def should_copy_file(self, path, prefixed_path, local_storage): + # type: (str, str, Storage) -> bool + ... + + def pre_should_copy_hook(self): + # type: () -> None + """Hook called before calling should_copy_file.""" + ... + + +class HashStrategy(Strategy[_RemoteStorage], abc.ABC): + use_gzip = False # type: ClassVar + + def should_copy_file(self, path, prefixed_path, local_storage): + # type: (str, str, Storage) -> bool + local_hash = self.get_local_file_hash(path, local_storage) + remote_hash = self.get_remote_file_hash(prefixed_path) + return local_hash == remote_hash + + def get_gzipped_local_file_hash(self, uncompressed_file_hash, path, contents): + # type: (str, str, str) -> str + buffer = BytesIO() + zf = gzip.GzipFile(mode="wb", compresslevel=6, fileobj=buffer, mtime=0.0) + zf.write(force_bytes(contents)) + zf.close() + return hashlib.md5(buffer.getvalue()).hexdigest() + + # todo: bring back gzip support + def get_local_file_hash(self, path, local_storage): + # type: (str, Storage) -> str + """Create md5 hash from file contents.""" + contents = local_storage.open(path).read() + file_hash = hashlib.md5(contents).hexdigest() + + # Check if content should be gzipped and hash gzipped content + content_type = mimetypes.guess_type(path)[0] or "application/octet-stream" + if self.use_gzip and content_type in settings.gzip_content_types: + file_hash = self.get_gzipped_local_file_hash(file_hash, path, contents) + + return '"%s"' % file_hash + + @abc.abstractmethod + def get_remote_file_hash(self, prefixed_path): + # type: (str) -> Optional[str] + ... + + +class CachingHashStrategy(HashStrategy[_RemoteStorage], abc.ABC): + @lru_cache() + def get_cache_key(self, path): + # type: (str) -> str + path_hash = hashlib.md5(path.encode()).hexdigest() + return settings.cache_key_prefix + path_hash + + def invalidate_cached_hash(self, path): + # type: (str) -> None + cache.delete(self.get_cache_key(path)) + + def should_copy_file(self, path, prefixed_path, local_storage): + # type: (str, str, Storage) -> bool + local_hash = self.get_local_file_hash(path, local_storage) + remote_hash = self.get_cached_remote_file_hash(path, prefixed_path) + matches = local_hash == remote_hash + if matches: + # invalidate cached hash, since we expect its corresponding file to + # be overwritten + self.invalidate_cached_hash(path) + return matches + + def get_cached_remote_file_hash(self, path, prefixed_path): + # type: (str, str) -> str + """Cache the hash of the remote storage file.""" + cache_key = self.get_cache_key(path) + remote_hash = cache.get(cache_key, False) + if remote_hash is False: + remote_hash = self.get_remote_file_hash(prefixed_path) + cache.set(cache_key, remote_hash) + return remote_hash + + def get_gzipped_local_file_hash(self, uncompressed_file_hash, path, contents): + # type: (str, str, str) -> str + """Cache the hash of the gzipped local file.""" + cache_key = self.get_cache_key("gzip_hash_%s" % uncompressed_file_hash) + file_hash = cache.get(cache_key, False) + if file_hash is False: + file_hash = super().get_gzipped_local_file_hash( + uncompressed_file_hash, path, contents + ) + cache.set(cache_key, file_hash) + return file_hash + + +def load_strategy(klass: Union[str, Type, object]) -> Type[Strategy]: + if isinstance(klass, str): + klass = locate(klass) + if not isinstance(klass, type) or not issubclass(klass, Strategy): + raise RuntimeError( + "Configured strategies must be subclasses of %s.%s" + % (Strategy.__module__, Strategy.__qualname__) + ) + return klass + + +def guess_strategy(storage: str) -> Optional[str]: + if storage == "storages.backends.s3boto.S3BotoStorage": + return "collectfast.strategies.boto.BotoStrategy" + if storage == "storages.backends.s3boto3.S3Boto3Storage": + return "collectfast.strategies.boto3.Boto3Strategy" + return None diff --git a/collectfast/strategies/boto.py b/collectfast/strategies/boto.py new file mode 100644 index 0000000..b7e966e --- /dev/null +++ b/collectfast/strategies/boto.py @@ -0,0 +1,23 @@ +from typing import Optional + +from storages.backends.s3boto import S3BotoStorage +from storages.utils import safe_join + +from .base import CachingHashStrategy +from collectfast import settings + + +class BotoStrategy(CachingHashStrategy[S3BotoStorage]): + use_gzip = settings.aws_is_gzipped + + def __init__(self, remote_storage): + # type: (S3BotoStorage) -> None + super().__init__(remote_storage) + self.remote_storage.preload_metadata = True + + def get_remote_file_hash(self, prefixed_path): + # type: (str) -> Optional[str] + normalized_path = safe_join( + self.remote_storage.location, prefixed_path + ).replace("\\", "/") + return self.remote_storage.bucket.get_key(normalized_path).etag diff --git a/collectfast/strategies/boto3.py b/collectfast/strategies/boto3.py new file mode 100644 index 0000000..f897971 --- /dev/null +++ b/collectfast/strategies/boto3.py @@ -0,0 +1,28 @@ +from typing import Optional + +from storages.backends.s3boto3 import S3Boto3Storage +from storages.utils import safe_join + +from .base import CachingHashStrategy +from collectfast import settings + + +class Boto3Strategy(CachingHashStrategy[S3Boto3Storage]): + use_gzip = settings.aws_is_gzipped + + def __init__(self, remote_storage): + # type: (S3Boto3Storage) -> None + super().__init__(remote_storage) + self.remote_storage.preload_metadata = True + + def get_remote_file_hash(self, prefixed_path): + # type: (str) -> Optional[str] + normalized_path = safe_join( + self.remote_storage.location, prefixed_path + ).replace("\\", "/") + return self.remote_storage.bucket.Object(normalized_path).e_tag + + def pre_should_copy_hook(self): + # type: () -> None + if settings.threads: + self.remote_storage._connection = None diff --git a/collectfast/tests/test_command.py b/collectfast/tests/test_command.py index fcf8430..864a94b 100644 --- a/collectfast/tests/test_command.py +++ b/collectfast/tests/test_command.py @@ -40,19 +40,6 @@ def test_threads(case): case.assertIn("0 static files copied.", result) -@test -@override_django_settings( - STATICFILES_STORAGE="collectfast.tests.no_preload_metadata.NPM" -) -def test_warn_preload_metadata(case): - clean_static_dir() - create_static_file() - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - call_collectstatic() - case.assertIn("preload_metadata", str(w[0].message)) - - @test def test_collectfast_disabled(case): clean_static_dir() @@ -80,19 +67,9 @@ def test_disable_collectfast(case): case.assertIn("1 static file copied.", result) -@test -def test_ignore_etag_deprecated(case): - clean_static_dir() - create_static_file() - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - call_collectstatic(ignore_etag=True) - case.assertIn("ignore-etag is deprecated", str(w[0].message)) - - @test @override_storage_attr("gzip", True) -@override_setting("is_gzipped", True) +@override_setting("aws_is_gzipped", True) def test_is_gzipped(case): clean_static_dir() create_static_file() diff --git a/collectfast/tests/utils.py b/collectfast/tests/utils.py index 70e1aa1..fb46b2e 100644 --- a/collectfast/tests/utils.py +++ b/collectfast/tests/utils.py @@ -58,7 +58,7 @@ def override_storage_attr(name, value): def decorator(fn): @functools.wraps(fn) def wrapper(*args, **kwargs): - storage = import_string(getattr(django_settings, "STATICFILES_STORAGE")) + storage = import_string(django_settings.STATICFILES_STORAGE) original = getattr(storage, name) setattr(storage, name, value) ret = fn(*args, **kwargs) diff --git a/setup.cfg b/setup.cfg index 8519c00..0d1e6a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,14 +1,11 @@ [flake8] -exclude = appveyor, .idea, .git, .venv, .tox, __pycache__, *.egg-info +exclude = appveyor, .idea, .git, .venv, .tox, __pycache__, *.egg-info, build max-complexity = 8 max-line-length = 88 -enable-extensions = - M511 # flake8-mutable -# see this discussion as to why we're ignoring E722: -# https://github.com/PyCQA/pycodestyle/issues/703 extend-ignore = F403,E265,E722 [mypy] +python_version = 3.4 ignore_missing_imports = True strict_optional = True no_implicit_optional = True @@ -16,9 +13,12 @@ check_untyped_defs = True disallow_incomplete_defs = True new_semantic_analyzer = True ignore_errors = False - [mypy-runtests] ignore_errors = True - [mypy-collectfast.tests.*] ignore_errors = True + +[coverage:run] +source = collectfast +[coverage:report] +omit = */tests/* diff --git a/test-requirements.txt b/test-requirements.txt index 69be697..e397f8d 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,5 +1,5 @@ flake8 -flake8-mutable +flake8-bugbear mypy typing typing-extensions