From 07fd949c98011767f173fce606fae11b45e1eca5 Mon Sep 17 00:00:00 2001 From: Nick Miles Date: Sun, 26 Jul 2020 18:12:44 -0700 Subject: [PATCH 1/7] supporting wildcard matching --- awswrangler/s3/_list.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index ed9b73e9b..4bc1aed72 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -2,6 +2,7 @@ import datetime import logging +import fnmatch from typing import Any, Dict, List, Optional import boto3 # type: ignore @@ -60,10 +61,12 @@ def _list_objects( last_modified_begin: Optional[datetime.datetime] = None, last_modified_end: Optional[datetime.datetime] = None, boto3_session: Optional[boto3.Session] = None, + wildcard: Optional[str] = '*', ) -> List[str]: + wildcard_prefix: Optional[List] = path.split(wildcard)[0] bucket: str prefix: str - bucket, prefix = _utils.parse_path(path=path) + bucket, prefix = _utils.parse_path(path=wildcard_prefix) client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) paginator = client_s3.get_paginator("list_objects_v2") args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} @@ -74,9 +77,10 @@ def _list_objects( _validate_datetimes(last_modified_begin=last_modified_begin, last_modified_end=last_modified_end) + for page in response_iterator: # pylint: disable=too-many-nested-blocks if delimiter is None: - contents: Optional[List] = page.get("Contents") + contents: Optional[List] = wildcard_prefix.get("Contents") if contents is not None: for content in contents: key: str = content["Key"] @@ -90,13 +94,16 @@ def _list_objects( continue paths.append(f"s3://{bucket}/{key}") else: - prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") + prefixes: Optional[List[Optional[Dict[str, str]]]] = wildcard_prefix.get("CommonPrefixes") if prefixes is not None: for pfx in prefixes: if (pfx is not None) and ("Prefix" in pfx): key = pfx["Prefix"] paths.append(f"s3://{bucket}/{key}") - return paths + + fnmatch_path = path + "*" + filtered_paths = fnmatch.filter(paths, fnmatch_path) + return filtered_paths def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) -> bool: From 76b028a11d620e56960c891c13b2081114034b3b Mon Sep 17 00:00:00 2001 From: Nick Miles Date: Mon, 27 Jul 2020 19:41:57 -0700 Subject: [PATCH 2/7] tweaking type hinting --- awswrangler/s3/_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index 4bc1aed72..d68557919 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -63,7 +63,7 @@ def _list_objects( boto3_session: Optional[boto3.Session] = None, wildcard: Optional[str] = '*', ) -> List[str]: - wildcard_prefix: Optional[List] = path.split(wildcard)[0] + wildcard_prefix: Optional[str] = path.split(wildcard)[0] bucket: str prefix: str bucket, prefix = _utils.parse_path(path=wildcard_prefix) From 8933eb4572e89ea58bdd14c86e6a8e6705e519e4 Mon Sep 17 00:00:00 2001 From: Nick Miles Date: Mon, 27 Jul 2020 19:42:25 -0700 Subject: [PATCH 3/7] removing extra line breaks --- awswrangler/s3/_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index d68557919..1a0e77a55 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -77,7 +77,6 @@ def _list_objects( _validate_datetimes(last_modified_begin=last_modified_begin, last_modified_end=last_modified_end) - for page in response_iterator: # pylint: disable=too-many-nested-blocks if delimiter is None: contents: Optional[List] = wildcard_prefix.get("Contents") From dfd60d408dc8ccd98e609034238bcf878a4b77fe Mon Sep 17 00:00:00 2001 From: Nick Miles Date: Mon, 27 Jul 2020 19:44:39 -0700 Subject: [PATCH 4/7] improving return for legibility --- awswrangler/s3/_list.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index 1a0e77a55..b84fe5060 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -61,9 +61,9 @@ def _list_objects( last_modified_begin: Optional[datetime.datetime] = None, last_modified_end: Optional[datetime.datetime] = None, boto3_session: Optional[boto3.Session] = None, - wildcard: Optional[str] = '*', + wildcard_character: Optional[str] = '*', ) -> List[str]: - wildcard_prefix: Optional[str] = path.split(wildcard)[0] + wildcard_prefix: Optional[str] = path.split(wildcard_character)[0] bucket: str prefix: str bucket, prefix = _utils.parse_path(path=wildcard_prefix) @@ -100,9 +100,11 @@ def _list_objects( key = pfx["Prefix"] paths.append(f"s3://{bucket}/{key}") - fnmatch_path = path + "*" - filtered_paths = fnmatch.filter(paths, fnmatch_path) - return filtered_paths + if wildcard_character in path: + filtered_paths = fnmatch.filter(paths, path) + return filtered_paths + + return paths def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) -> bool: From 733353d94369d10e108b7923f7b9f63515d3248f Mon Sep 17 00:00:00 2001 From: Nick Miles Date: Tue, 28 Jul 2020 10:35:13 -0700 Subject: [PATCH 5/7] validation shell script changes --- awswrangler/s3/_list.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index b84fe5060..a2112239e 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -1,8 +1,8 @@ """Amazon S3 List Module (PRIVATE).""" import datetime -import logging import fnmatch +import logging from typing import Any, Dict, List, Optional import boto3 # type: ignore @@ -61,9 +61,9 @@ def _list_objects( last_modified_begin: Optional[datetime.datetime] = None, last_modified_end: Optional[datetime.datetime] = None, boto3_session: Optional[boto3.Session] = None, - wildcard_character: Optional[str] = '*', + wildcard_character: Optional[str] = "*", ) -> List[str]: - wildcard_prefix: Optional[str] = path.split(wildcard_character)[0] + wildcard_prefix: str = path.split(wildcard_character)[0] bucket: str prefix: str bucket, prefix = _utils.parse_path(path=wildcard_prefix) From b3f0eff8abd77c08ad6ce849cdf4e7e1b1408e1e Mon Sep 17 00:00:00 2001 From: Nick Miles Date: Tue, 28 Jul 2020 10:40:05 -0700 Subject: [PATCH 6/7] mypy fixes --- awswrangler/s3/_list.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index a2112239e..733363ae0 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -79,7 +79,7 @@ def _list_objects( for page in response_iterator: # pylint: disable=too-many-nested-blocks if delimiter is None: - contents: Optional[List] = wildcard_prefix.get("Contents") + contents: Optional[List] = page.get("Contents") if contents is not None: for content in contents: key: str = content["Key"] @@ -93,7 +93,7 @@ def _list_objects( continue paths.append(f"s3://{bucket}/{key}") else: - prefixes: Optional[List[Optional[Dict[str, str]]]] = wildcard_prefix.get("CommonPrefixes") + prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") if prefixes is not None: for pfx in prefixes: if (pfx is not None) and ("Prefix" in pfx): From 45ea5cf65b89548310aa1a5c11047b1d9aed20a5 Mon Sep 17 00:00:00 2001 From: Nick Miles Date: Tue, 28 Jul 2020 10:45:24 -0700 Subject: [PATCH 7/7] further tweaks to type hinting --- awswrangler/s3/_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index 733363ae0..d9fbe3da1 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -61,7 +61,7 @@ def _list_objects( last_modified_begin: Optional[datetime.datetime] = None, last_modified_end: Optional[datetime.datetime] = None, boto3_session: Optional[boto3.Session] = None, - wildcard_character: Optional[str] = "*", + wildcard_character: str = "*", ) -> List[str]: wildcard_prefix: str = path.split(wildcard_character)[0] bucket: str