Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
8fd4966
initial draft
luigift Apr 18, 2020
863ba26
adding Pytorch as a development dependency
igorborgest Apr 19, 2020
2864dc0
Cleaning up initial draft
igorborgest Apr 19, 2020
4fed4c7
Add first test
igorborgest Apr 19, 2020
72c739c
add audio and image dataset
luigift Apr 19, 2020
f72810e
Add label_col to torch.SQLDataset
igorborgest Apr 20, 2020
bf1be07
Updating cartesian product of pytest parameters
igorborgest Apr 20, 2020
1a41d18
Pivoting SQLDataset parser strategy to avoid cast losses.
igorborgest Apr 20, 2020
36c15e4
tested lambda & image datasets
luigift Apr 20, 2020
d4dcfc5
add audio test
luigift Apr 20, 2020
30dc2fa
Add test for torch.AudioS3Dataset
igorborgest Apr 20, 2020
5a9a83f
s3 iterable dataset
luigift Apr 22, 2020
60232f4
add tutorial draft
luigift Apr 23, 2020
215fbd5
add torch extras_requirements to setuptools
luigift Apr 23, 2020
0ad9e4b
handle labels in S3IterableDataset
luigift Apr 23, 2020
5e72ddf
clear bucket in S3Iterable Dataset test
luigift Apr 23, 2020
5b399ac
update setuptools
luigift Apr 23, 2020
2db15b6
update pytorch tutorial
luigift Apr 23, 2020
5e647c6
Update tutorial
igorborgest Apr 23, 2020
b3d9fe2
parallel tests fix
luigift Apr 23, 2020
c091fa8
fix lint
luigift Apr 24, 2020
37b7f1e
update readme
luigift Apr 24, 2020
33d74c4
remove capitalized requirement from docstring
luigift Apr 24, 2020
4b05b36
add torch requirements
luigift Apr 24, 2020
86cdb30
fix init and docs
luigift Apr 26, 2020
b3c8c81
update tutorial
luigift Apr 26, 2020
f6927a4
rollback pytorch==1.5.0, due to torchaudio requirement
luigift Apr 27, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/static-checking.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@ jobs:
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements-dev.txt
pip install -r requirements-torch.txt
- name: CloudFormation Lint
run: cfn-lint -t testing/cloudformation.yaml
- name: Documentation Lint
run: pydocstyle awswrangler/ --add-ignore=D204
run: pydocstyle awswrangler/ --add-ignore=D204,D403
- name: mypy check
run: mypy awswrangler
- name: Flake8 Lint
Expand Down
3 changes: 2 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ disable=print-statement,
comprehension-escape,
C0330,
C0103,
W1202
W1202,
too-few-public-methods

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine)
- [11 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/11%20-%20CSV%20Datasets.ipynb)
- [12 - CSV Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/12%20-%20CSV%20Crawler.ipynb)
- [13 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/13%20-%20Merging%20Datasets%20on%20S3.ipynb)
- [14 - PyTorch](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/14%20-%20PyTorch.ipynb)
- [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html)
- [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3)
- [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog)
Expand Down
9 changes: 9 additions & 0 deletions awswrangler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,18 @@

"""

# NOTE: `import importlib` alone does not reliably expose the `importlib.util`
# submodule as an attribute; import it explicitly before calling find_spec().
import importlib.util
import logging

from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3  # noqa
from awswrangler.__metadata__ import __description__, __license__, __title__, __version__  # noqa

# Expose awswrangler.torch only when the optional PyTorch stack is installed.
# find_spec() returns None for a missing package, so the conjunction is truthy
# only when torch, torchvision, torchaudio, and Pillow are all importable.
if (
    importlib.util.find_spec("torch")
    and importlib.util.find_spec("torchvision")
    and importlib.util.find_spec("torchaudio")
    and importlib.util.find_spec("PIL")
):  # type: ignore
    from awswrangler import torch  # noqa

logging.getLogger("awswrangler").addHandler(logging.NullHandler())
45 changes: 27 additions & 18 deletions awswrangler/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,29 +155,15 @@ def read_sql_query(
... )

"""
if not isinstance(con, sqlalchemy.engine.Engine): # pragma: no cover
raise exceptions.InvalidConnection(
"Invalid 'con' argument, please pass a "
"SQLAlchemy Engine. Use wr.db.get_engine(), "
"wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()"
)
_validate_engine(con=con)
with con.connect() as _con:
args = _convert_params(sql, params)
cursor = _con.execute(*args)
if chunksize is None:
return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype)
return _iterate_cursor(cursor=cursor, chunksize=chunksize, index=index_col, dtype=dtype)


def _iterate_cursor(
cursor, chunksize: int, index: Optional[Union[str, List[str]]], dtype: Optional[Dict[str, pa.DataType]] = None
) -> Iterator[pd.DataFrame]:
while True:
records = cursor.fetchmany(chunksize)
if not records:
break
df: pd.DataFrame = _records2df(records=records, cols_names=cursor.keys(), index=index, dtype=dtype)
yield df
return _iterate_cursor(
cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype
)


def _records2df(
Expand Down Expand Up @@ -207,6 +193,20 @@ def _records2df(
return df


def _iterate_cursor(
    cursor: Any,
    chunksize: int,
    cols_names: List[str],
    index: Optional[Union[str, List[str]]],
    dtype: Optional[Dict[str, pa.DataType]] = None,
) -> Iterator[pd.DataFrame]:
    """Yield DataFrames of at most ``chunksize`` rows fetched from ``cursor``.

    Stops as soon as ``fetchmany`` returns an empty batch; ``cols_names``
    and ``index``/``dtype`` are forwarded unchanged to the record parser.
    """
    while True:
        batch = cursor.fetchmany(chunksize)
        if not batch:
            return  # cursor exhausted
        yield _records2df(records=batch, cols_names=cols_names, index=index, dtype=dtype)


def _convert_params(sql: str, params: Optional[Union[List, Tuple, Dict]]) -> List[Any]:
args: List[Any] = [sql]
if params is not None:
Expand Down Expand Up @@ -1087,3 +1087,12 @@ def unload_redshift_to_files(
paths = [x[0].replace(" ", "") for x in _con.execute(sql).fetchall()]
_logger.debug(f"paths: {paths}")
return paths


def _validate_engine(con: sqlalchemy.engine.Engine) -> None:  # pragma: no cover
    """Raise ``InvalidConnection`` unless ``con`` is a SQLAlchemy Engine."""
    if isinstance(con, sqlalchemy.engine.Engine):
        return
    raise exceptions.InvalidConnection(
        "Invalid 'con' argument, please pass a "
        "SQLAlchemy Engine. Use wr.db.get_engine(), "
        "wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()"
    )
11 changes: 7 additions & 4 deletions awswrangler/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None)
raise ex # pragma: no cover


def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]:
def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]:
"""List Amazon S3 objects from a prefix.

Parameters
Expand All @@ -120,6 +120,8 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li
S3 path (e.g. s3://bucket/prefix).
boto3_session : boto3.Session(), optional
Boto3 Session. The default boto3 session will be used if boto3_session receive None.
suffix: str, optional
Suffix for filtering S3 keys.

Returns
-------
Expand Down Expand Up @@ -155,15 +157,16 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li
for content in contents:
if (content is not None) and ("Key" in content):
key: str = content["Key"]
paths.append(f"s3://{bucket}/{key}")
if (suffix is None) or key.endswith(suffix):
paths.append(f"s3://{bucket}/{key}")
return paths


def _path2list(path: Union[str, List[str]], boto3_session: Optional[boto3.Session]) -> List[str]:
def _path2list(path: object, boto3_session: boto3.Session, suffix: str = None) -> List[str]:
if isinstance(path, str): # prefix
paths: List[str] = list_objects(path=path, boto3_session=boto3_session)
elif isinstance(path, list):
paths = path
paths = path if suffix is None else [x for x in path if x.endswith(suffix)]
else:
raise exceptions.InvalidArgumentType(f"{type(path)} is not a valid path type. Please, use str or List[str].")
return paths
Expand Down
Loading