From 8fd49660d875f60cb1682cd9fe6a43426564399b Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sat, 18 Apr 2020 20:09:14 -0300 Subject: [PATCH 01/59] initial draft --- awswrangler/db.py | 36 ++++++++++---- awswrangler/torch.py | 111 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 8 deletions(-) create mode 100644 awswrangler/torch.py diff --git a/awswrangler/db.py b/awswrangler/db.py index 491fe7784..42d22fe73 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -155,6 +155,18 @@ def read_sql_query( ... ) """ + return _read_sql_query(fn=_record2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype) + + +def _read_sql_query( + sql: str, + con: sqlalchemy.engine.Engine, + index_col: Optional[Union[str, List[str]]] = None, + params: Optional[Union[List, Tuple, Dict]] = None, + chunksize: Optional[int] = None, + dtype: Optional[Dict[str, pa.DataType]] = None, + fn: Callable, +): if not isinstance(con, sqlalchemy.engine.Engine): # pragma: no cover raise exceptions.InvalidConnection( "Invalid 'con' argument, please pass a " @@ -165,19 +177,27 @@ def read_sql_query( args = _convert_params(sql, params) cursor = _con.execute(*args) if chunksize is None: - return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) - return _iterate_cursor(cursor=cursor, chunksize=chunksize, index=index_col, dtype=dtype) + return fn(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) + return _iterate_cursor(fn=fn, cursor=cursor, chunksize=chunksize, index=index_col, dtype=dtype) def _iterate_cursor( cursor, chunksize: int, index: Optional[Union[str, List[str]]], dtype: Optional[Dict[str, pa.DataType]] = None -) -> Iterator[pd.DataFrame]: +) -> Iterator[Any]: while True: records = cursor.fetchmany(chunksize) - if not records: - break - df: pd.DataFrame = _records2df(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) - yield df + if not records: break + yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) + + +def _records2numpy( + records: List[Tuple[Any]], + cols_names: List[str], + index: Optional[Union[str, List[str]]], + dtype: Optional[Dict[str, pa.DataType]] = None, +) -> Iterator[np.ndarry]: + for record in records: + yield np.array(record, float) def _records2df( @@ -191,7 +211,7 @@ def _records2df( if (dtype is None) or (col_name not in dtype): array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array else: - array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype + array: pa.Array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype arrays.append(array) table = pa.Table.from_arrays(arrays=arrays, names=cols_names) # Creating arrow Table df: pd.DataFrame = table.to_pandas( # Creating Pandas DataFrame diff --git a/awswrangler/torch.py b/awswrangler/torch.py new file mode 100644 index 000000000..00cc273f0 --- /dev/null +++ b/awswrangler/torch.py @@ -0,0 +1,111 @@ +"""PyTorch Module.""" + +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union + +import torch +import boto3 # type: ignore +import botocore.exceptions # type: ignore +import pandas as pd # type: ignore +import pandas.io.parsers # type: ignore +import pyarrow as pa # type: ignore +import pyarrow.lib # type: ignore +import pyarrow.parquet # type: ignore +import s3fs # type: ignore +from boto3.s3.transfer import TransferConfig # type: 
ignore +from pandas.io.common import infer_compression # type: ignore +from torch.utils.data import Dataset, IterableDataset + +from awswrangler import _data_types, _utils, catalog, exceptions, s3 + +_logger: logging.Logger = logging.getLogger(__name__) + + +class S3Dataset(Dataset): + """PyTorch Map-Style S3 Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> label_fn = lambda path: path.split[0][-2] + >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) + + """ + def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): + super(S3IterableDataset).__init__() + self.label_fn = label_fn + self.paths: List[str] = s3._path2list( + path=path, + boto3_session=self.boto3_session + ) + self._s3 = boto3_session.resource('s3') + + def _fetch_obj(self, path): + obj = _s3.Object(bucket_name, key).get() + return obj['Body'].read() + + def __getitem__(self, index): + path = self.paths[index]) + return [self._fetch_obj(path), label_fn(path)] + + def __len__(self): + return len(self.paths) + + +class SQLDataset(torch.utils.data.IterableDataset): + """PyTorch Iterable SQL Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> con = wr.catalog.get_engine("aws-data-wrangler-postgresql") + >>> ds = wr.torch.SQLDataset('select * from public.tutorial', con=con) + + """ + def __init__(self, ): + super(SQLDataset).__init__( + sql: str, + con: sqlalchemy.engine.Engine, + index_col: Optional[Union[str, List[str]]] = None, + ): + self.sql = sql + self.con = con + self.index_col = index_col + + def __iter__(self): + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: # single-process data loading, return the full iterator + pass + else: # in a worker process + raise NotImplemented() + + for ds in wr.db._read_sql_query( + fn=wr.db._records2numpy, + sql=self.sql, + con=self.con, + index_col=self.index_col, + ): + for row in ds: + yield row From 863ba2698fd97411edb5a82a9f22c176852f5093 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 19 Apr 2020 09:16:54 -0300 Subject: [PATCH 02/59] adding Pytorch as a development dependency --- requirements-dev.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 3fdd3cdf3..137f57383 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,4 +17,6 @@ twine~=3.1.1 wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 \ No newline at end of file +moto~=1.3.14 +torch~=1.4.0 +torchvision~=0.5.0 \ No newline at end of file From 2864dc09c2851a44661f1a875d3b6e47ec1f0017 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 19 Apr 2020 09:52:41 -0300 Subject: [PATCH 03/59] Cleaning up initial draft --- awswrangler/db.py | 27 ++++--- awswrangler/torch.py | 173 ++++++++++++++++++++----------------------- 2 files changed, 95 insertions(+), 105 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index 42d22fe73..f4508d09c 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -2,10 +2,11 @@ import json import logging -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import quote_plus import boto3 # type: ignore +import numpy as np # type: ignore import pandas as pd # type: ignore import pyarrow as pa # type: ignore import sqlalchemy # type: ignore @@ -155,17 +156,19 @@ def read_sql_query( ... 
) """ - return _read_sql_query(fn=_record2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype) + return _read_sql_query( + fn=_records2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype + ) def _read_sql_query( + fn: Callable, sql: str, con: sqlalchemy.engine.Engine, index_col: Optional[Union[str, List[str]]] = None, params: Optional[Union[List, Tuple, Dict]] = None, chunksize: Optional[int] = None, dtype: Optional[Dict[str, pa.DataType]] = None, - fn: Callable, ): if not isinstance(con, sqlalchemy.engine.Engine): # pragma: no cover raise exceptions.InvalidConnection( @@ -182,20 +185,20 @@ def _read_sql_query( def _iterate_cursor( - cursor, chunksize: int, index: Optional[Union[str, List[str]]], dtype: Optional[Dict[str, pa.DataType]] = None + fn: Callable, + cursor, + chunksize: int, + index: Optional[Union[str, List[str]]], + dtype: Optional[Dict[str, pa.DataType]] = None, ) -> Iterator[Any]: while True: records = cursor.fetchmany(chunksize) - if not records: break + if not records: + break yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) -def _records2numpy( - records: List[Tuple[Any]], - cols_names: List[str], - index: Optional[Union[str, List[str]]], - dtype: Optional[Dict[str, pa.DataType]] = None, -) -> Iterator[np.ndarry]: +def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarry]: # pylint: disable=unused-argument for record in records: yield np.array(record, float) @@ -211,7 +214,7 @@ def _records2df( if (dtype is None) or (col_name not in dtype): array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array else: - array: pa.Array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype + array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype arrays.append(array) table = pa.Table.from_arrays(arrays=arrays, names=cols_names) # Creating arrow Table df: pd.DataFrame = table.to_pandas( # Creating Pandas DataFrame diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 00cc273f0..afe85f1de 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,111 +1,98 @@ """PyTorch Module.""" -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +import logging +import sqlalchemy # type: ignore import torch -import boto3 # type: ignore -import botocore.exceptions # type: ignore -import pandas as pd # type: ignore -import pandas.io.parsers # type: ignore -import pyarrow as pa # type: ignore -import pyarrow.lib # type: ignore -import pyarrow.parquet # type: ignore -import s3fs # type: ignore -from boto3.s3.transfer import TransferConfig # type: ignore -from pandas.io.common import infer_compression # type: ignore -from torch.utils.data import Dataset, IterableDataset - -from awswrangler import _data_types, _utils, catalog, exceptions, s3 +from torch.utils.data.dataset import IterableDataset + +from awswrangler import db _logger: logging.Logger = logging.getLogger(__name__) -class S3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
- - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> label_fn = lambda path: path.split[0][-2] - >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) - - """ - def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): - super(S3IterableDataset).__init__() - self.label_fn = label_fn - self.paths: List[str] = s3._path2list( - path=path, - boto3_session=self.boto3_session - ) - self._s3 = boto3_session.resource('s3') - - def _fetch_obj(self, path): - obj = _s3.Object(bucket_name, key).get() - return obj['Body'].read() - - def __getitem__(self, index): - path = self.paths[index]) - return [self._fetch_obj(path), label_fn(path)] - - def __len__(self): - return len(self.paths) - - -class SQLDataset(torch.utils.data.IterableDataset): - """PyTorch Iterable SQL Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> con = wr.catalog.get_engine("aws-data-wrangler-postgresql") - >>> ds = wr.torch.SQLDataset('select * from public.tutorial', con=con) - - """ - def __init__(self, ): - super(SQLDataset).__init__( - sql: str, - con: sqlalchemy.engine.Engine, - index_col: Optional[Union[str, List[str]]] = None, - ): +# class S3Dataset(Dataset): +# """PyTorch Map-Style S3 Dataset. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> label_fn = lambda path: path.split[0][-2] +# >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) +# +# """ +# def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): +# super(S3IterableDataset).__init__() +# self.label_fn = label_fn +# self.paths: List[str] = s3._path2list( +# path=path, +# boto3_session=self.boto3_session +# ) +# self._s3 = boto3_session.resource('s3') +# +# def _fetch_obj(self, path): +# obj = _s3.Object(bucket_name, key).get() +# return obj['Body'].read() +# +# def __getitem__(self, index): +# path = self.paths[index]) +# return [self._fetch_obj(path), label_fn(path)] +# +# def __len__(self): +# return len(self.paths) + + +class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method + """Pytorch Iterable SQL Dataset.""" + + def __init__(self, sql: str, con: sqlalchemy.engine.Engine): + """Pytorch Iterable SQL Dataset. + + Support for **Redshift**, **PostgreSQL** and **MySQL**. + + Parameters + ---------- + sql : str + Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + con : sqlalchemy.engine.Engine + SQLAlchemy Engine. 
Please use, + wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() + + Returns + ------- + torch.utils.data.dataset.IterableDataset + + Examples + -------- + >>> import awswrangler as wr + >>> con = wr.catalog.get_engine("aws-data-wrangler-postgresql") + >>> ds = wr.torch.SQLDataset('select * from public.tutorial', con=con) + + """ + super().__init__() self.sql = sql self.con = con - self.index_col = index_col def __iter__(self): + """Iterate over the Dataset.""" worker_info = torch.utils.data.get_worker_info() if worker_info is None: # single-process data loading, return the full iterator pass else: # in a worker process - raise NotImplemented() - - for ds in wr.db._read_sql_query( - fn=wr.db._records2numpy, - sql=self.sql, - con=self.con, - index_col=self.index_col, - ): + raise NotImplementedError() + + for ds in db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con): for row in ds: yield row From 4fed4c7f5f743e90dbb16b8678b5cd9a104ae3ed Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 19 Apr 2020 13:53:07 -0300 Subject: [PATCH 04/59] Add first test --- awswrangler/__init__.py | 2 +- awswrangler/db.py | 5 +- awswrangler/s3.py | 9 +- awswrangler/torch.py | 127 ++++++++++++++++--------- pytest.ini | 2 +- testing/test_awswrangler/test_torch.py | 99 +++++++++++++++++++ 6 files changed, 188 insertions(+), 56 deletions(-) create mode 100644 testing/test_awswrangler/test_torch.py diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ce11c7ad5..ff6a2bd71 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -7,7 +7,7 @@ import logging -from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa +from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3, torch # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/db.py b/awswrangler/db.py index f4508d09c..78979787c 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -198,9 +198,8 @@ def _iterate_cursor( yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) -def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarry]: # pylint: disable=unused-argument - for record in records: - yield np.array(record, float) +def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarray]: # pylint: disable=unused-argument + return np.array(records, dtype=float) def _records2df( diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..157607d8c 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -111,7 +111,7 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) raise ex # pragma: no cover -def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: +def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]: """List Amazon S3 objects from a prefix. 
Parameters @@ -155,15 +155,16 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li for content in contents: if (content is not None) and ("Key" in content): key: str = content["Key"] - paths.append(f"s3://{bucket}/{key}") + if (suffix is None) or key.endswith(suffix): + paths.append(f"s3://{bucket}/{key}") return paths -def _path2list(path: Union[str, List[str]], boto3_session: Optional[boto3.Session]) -> List[str]: +def _path2list(path: Union[str, List[str]], boto3_session: Optional[boto3.Session], suffix: Optional[str] = None) -> List[str]: if isinstance(path, str): # prefix paths: List[str] = list_objects(path=path, boto3_session=boto3_session) elif isinstance(path, list): - paths = path + paths = path if suffix is None else [x for x in path if x.endswith(suffix)] else: raise exceptions.InvalidArgumentType(f"{type(path)} is not a valid path type. Please, use str or List[str].") return paths diff --git a/awswrangler/torch.py b/awswrangler/torch.py index afe85f1de..7d84be981 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,63 +1,93 @@ """PyTorch Module.""" import logging +from io import BytesIO +from typing import Optional, Union, List import sqlalchemy # type: ignore +import numpy as np # type: ignore +import boto3 # type: ignore import torch -from torch.utils.data.dataset import IterableDataset +from torch.utils.data.dataset import Dataset, IterableDataset +from PIL import Image +from torchvision.transforms.functional import to_tensor -from awswrangler import db +from awswrangler import db, s3, _utils _logger: logging.Logger = logging.getLogger(__name__) -# class S3Dataset(Dataset): -# """PyTorch Map-Style S3 Dataset. -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> label_fn = lambda path: path.split[0][-2] -# >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) -# -# """ -# def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): -# super(S3IterableDataset).__init__() -# self.label_fn = label_fn -# self.paths: List[str] = s3._path2list( -# path=path, -# boto3_session=self.boto3_session -# ) -# self._s3 = boto3_session.resource('s3') -# -# def _fetch_obj(self, path): -# obj = _s3.Object(bucket_name, key).get() -# return obj['Body'].read() -# -# def __getitem__(self, index): -# path = self.paths[index]) -# return [self._fetch_obj(path), label_fn(path)] -# -# def __len__(self): -# return len(self.paths) +class _BaseS3Dataset(Dataset): + """PyTorch Map-Style S3 Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> label_fn = lambda path: path.split[0][-2] + >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) + + """ + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): + super().__init__() + self.session = _utils.ensure_session(session=boto3_session) + self.paths: List[str] = s3._path2list( + path=path, + suffix=suffix, + boto3_session=self.session + ) + + def __getitem__(self, index): + path = self.paths[index] + obj = self._fetch_obj(path) + return [self.parser_fn(obj), self.label_fn(path)] + + def __len__(self): + return len(self.paths) + + def _fetch_obj(self, path): + bucket, key = _utils.parse_path(path=path) + buff = BytesIO() + client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) + client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) + return buff.seek(0) + + def parser_fn(self, obj): + pass + + def label_fn(self, obj): + pass + + +class ImageS3Dataset(Dataset): + + @staticmethod + def parser_fn(obj): + image = Image.open('YOUR_PATH') + tensor = to_tensor(image) + tensor.unsqueeze_(0) + return tensor + + @staticmethod + def label_fn(obj): + pass class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" - def __init__(self, sql: str, con: sqlalchemy.engine.Engine): + def __init__(self, sql: str, con: sqlalchemy.engine.Engine, chunksize: Optional[int] = None,): """Pytorch Iterable SQL Dataset. Support for **Redshift**, **PostgreSQL** and **MySQL**. @@ -84,6 +114,7 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine): super().__init__() self.sql = sql self.con = con + self.chunksize = chunksize def __iter__(self): """Iterate over the Dataset.""" @@ -92,7 +123,9 @@ def __iter__(self): pass else: # in a worker process raise NotImplementedError() - - for ds in db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con): + ret = db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con, chunksize=self.chunksize) + if isinstance(ret, np.ndarray): + ret = [ret] + for ds in ret: for row in ds: - yield row + yield torch.as_tensor(row, dtype=torch.float) diff --git a/pytest.ini b/pytest.ini index 8e7a47ef1..d233cbf74 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] addopts = --verbose - --capture=fd + --capture=no filterwarnings = ignore::DeprecationWarning ignore::UserWarning \ No newline at end of file diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py new file mode 100644 index 000000000..4725f1ea4 --- /dev/null +++ b/testing/test_awswrangler/test_torch.py @@ -0,0 +1,99 @@ +import logging + +import boto3 +import pandas as pd +import pytest +import torch + +import awswrangler as wr + +logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") +logging.getLogger("awswrangler").setLevel(logging.DEBUG) +logging.getLogger("botocore.credentials").setLevel(logging.CRITICAL) + + +@pytest.fixture(scope="module") +def cloudformation_outputs(): + response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler-test") + outputs = {} + for output in response.get("Stacks")[0].get("Outputs"): + outputs[output.get("OutputKey")] = output.get("OutputValue") + yield outputs + + +@pytest.fixture(scope="module") +def bucket(cloudformation_outputs): + if "BucketName" in 
cloudformation_outputs: + bucket = cloudformation_outputs["BucketName"] + else: + raise Exception("You must deploy/update the test infrastructure (CloudFormation)") + yield bucket + + +@pytest.fixture(scope="module") +def parameters(cloudformation_outputs): + parameters = dict(postgresql={}, mysql={}, redshift={}) + parameters["postgresql"]["host"] = cloudformation_outputs["PostgresqlAddress"] + parameters["postgresql"]["port"] = 3306 + parameters["postgresql"]["schema"] = "public" + parameters["postgresql"]["database"] = "postgres" + parameters["mysql"]["host"] = cloudformation_outputs["MysqlAddress"] + parameters["mysql"]["port"] = 3306 + parameters["mysql"]["schema"] = "test" + parameters["mysql"]["database"] = "test" + parameters["redshift"]["host"] = cloudformation_outputs["RedshiftAddress"] + parameters["redshift"]["port"] = cloudformation_outputs["RedshiftPort"] + parameters["redshift"]["identifier"] = cloudformation_outputs["RedshiftIdentifier"] + parameters["redshift"]["schema"] = "public" + parameters["redshift"]["database"] = "test" + parameters["redshift"]["role"] = cloudformation_outputs["RedshiftRole"] + parameters["password"] = cloudformation_outputs["DatabasesPassword"] + parameters["user"] = "test" + yield parameters + + +@pytest.mark.parametrize("db_type, chunksize", [ + ("mysql", None), + ("redshift", None), + ("postgresql", None), + ("mysql", 1), + ("redshift", 1), + ("postgresql", 1), +]) +def test_torch_sql(parameters, db_type, chunksize): + schema = parameters[db_type]["schema"] + table = "test_torch_sql" + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None + ) + ds = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize)) + assert torch.all(ds[0].eq(torch.tensor([1.0, 4.0]))) + assert torch.all(ds[1].eq(torch.tensor([2.0, 5.0]))) + assert torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) + + +def test_torch_sql(parameters, db_type, chunksize): + schema = parameters[db_type]["schema"] + table = "test_torch_sql" + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None + ) + From 72c739c905f3a34545ffc71da7693ff4baf029c1 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 19 Apr 2020 18:58:13 -0300 Subject: [PATCH 05/59] add audio and image dataset --- awswrangler/s3.py | 4 +- awswrangler/torch.py | 169 ++++++++++++++++++++----- testing/test_awswrangler/test_torch.py | 32 +++-- 3 files changed, 159 insertions(+), 46 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 157607d8c..f2f869ac2 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -120,6 +120,8 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona S3 path (e.g. s3://bucket/prefix). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ suffix: str, optional + Suffix for filtering S3 keys Returns ------- @@ -160,7 +162,7 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona return paths -def _path2list(path: Union[str, List[str]], boto3_session: Optional[boto3.Session], suffix: Optional[str] = None) -> List[str]: +def _path2list(path: object, boto3_session: boto3.Session, suffix: str = None) -> List[str]: if isinstance(path, str): # prefix paths: List[str] = list_objects(path=path, boto3_session=boto3_session) elif isinstance(path, list): diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 7d84be981..a5b6497d8 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,5 +1,6 @@ """PyTorch Module.""" +import re import logging from io import BytesIO from typing import Optional, Union, List @@ -9,8 +10,7 @@ import boto3 # type: ignore import torch from torch.utils.data.dataset import Dataset, IterableDataset -from PIL import Image -from torchvision.transforms.functional import to_tensor + from awswrangler import db, s3, _utils @@ -18,34 +18,29 @@ class _BaseS3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset. + """PyTorch Map-Style S3 Dataset.""" - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): + """PyTorch Map-Style S3 Dataset. - Returns - ------- - torch.utils.data.Dataset + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> label_fn = lambda path: path.split[0][-2] - >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) + Returns + ------- + torch.utils.data.Dataset - """ - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): + """ super().__init__() self.session = _utils.ensure_session(session=boto3_session) self.paths: List[str] = s3._path2list( path=path, suffix=suffix, - boto3_session=self.session + boto3_session=self.session, ) def __getitem__(self, index): @@ -66,28 +61,139 @@ def _fetch_obj(self, path): def parser_fn(self, obj): pass - def label_fn(self, obj): + def label_fn(self, path): pass -class ImageS3Dataset(Dataset): +class _S3PartitionedDataset(_BaseS3Dataset): + + def label_fn(self, path): + return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) + + +class AudioS3Dataset(_S3PartitionedDataset): + + def __init__(self): + """PyTorch S3 Audio Dataset. + + Assumes audio files are stored with the following structure: + + bucket + ├── class=0 + │ ├── audio0.wav + │ └── audio1.wav + └── class=1 + ├── audio2.wav + └── audio3.wav + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(AudioS3Dataset, self).__init__() + import torchaudio + + def parser_fn(self, obj): + waveform, sample_rate = torchaudio.load(obj) + return waveform, sample_rate + + +class LambdaS3Dataset(_BaseS3Dataset): + """PyTorch S3 Audio Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> parse_fn = lambda x: torch.tensor(x) + >>> label_fn = lambda x: x.split('.')[-1] + >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) + + """ + def __init__(self, parse_fn, label_fn): + self._parse_fn = parse_fn + self._label_fn = label_fn + + def label_fn(self, path): + return self._label_fn(path) - @staticmethod - def parser_fn(obj): - image = Image.open('YOUR_PATH') + def parse_fn(self, obj): + return self._parse_fn(obj) + + +class ImageS3Dataset(_S3PartitionedDataset): + + def __init__(self): + """PyTorch Image S3 Dataset. + + Assumes Images are stored with the following structure: + + bucket + ├── class=0 + │ ├── img0.jpeg + │ └── img1.jpeg + └── class=1 + ├── img2.jpeg + └── img3.jpeg + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(ImageS3Dataset, self).__init__() + from PIL import Image + from torchvision.transforms.functional import to_tensor + + def parser_fn(self, obj): + image = Image.open(obj) tensor = to_tensor(image) tensor.unsqueeze_(0) return tensor - @staticmethod - def label_fn(obj): - pass - class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" - def __init__(self, sql: str, con: sqlalchemy.engine.Engine, chunksize: Optional[int] = None,): + def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[str], chunksize: Optional[int] = None,): """Pytorch Iterable SQL Dataset. Support for **Redshift**, **PostgreSQL** and **MySQL**. 
@@ -114,6 +220,7 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine, chunksize: Optional[ super().__init__() self.sql = sql self.con = con + self.label_col = label_col self.chunksize = chunksize def __iter__(self): diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 4725f1ea4..f30736c16 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -81,19 +81,23 @@ def test_torch_sql(parameters, db_type, chunksize): assert torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) -def test_torch_sql(parameters, db_type, chunksize): - schema = parameters[db_type]["schema"] - table = "test_torch_sql" - engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") - wr.db.to_sql( - df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), - con=engine, - name=table, - schema=schema, - if_exists="replace", - index=False, - index_label=None, - chunksize=None, - method=None +def test_torch_image_s3(bucket): + s3 = boto3.client('s3') + ref_label = 0 + s3.put_object( + Body=open("../../docs/source/_static/logo.png"), + Bucket=bucket, + Key=f'class={ref_label}/logo.png', ) + ds = wr.torch.ImageS3Dataset() + for image, label in ds: + assert image.shape == torch.Size([1, 28, 28]) + assert label == torch.int(ref_label) + break + +# def test_torch_audio_s3(bucket): +# ds = wr.torch.AudioS3Dataset() +# for image, label in ds: +# assert image.shape == torch.Size([1, 28, 28]) +# break \ No newline at end of file From f72810ec53fb33a60df0b5c97fb5ab8059317f81 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 01:07:55 -0300 Subject: [PATCH 06/59] Add label_col to torch.SQLDataset --- awswrangler/db.py | 63 ++--- awswrangler/s3.py | 2 +- awswrangler/torch.py | 313 ++++++++++++++----------- testing/test_awswrangler/test_torch.py | 52 ++-- 4 files changed, 235 insertions(+), 195 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index 78979787c..e69739433 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -6,7 +6,6 @@ from urllib.parse import quote_plus import boto3 # type: ignore -import numpy as np # type: ignore import pandas as pd # type: ignore import pyarrow as pa # type: ignore import sqlalchemy # type: ignore @@ -156,50 +155,15 @@ def read_sql_query( ... ) """ - return _read_sql_query( - fn=_records2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype - ) - - -def _read_sql_query( - fn: Callable, - sql: str, - con: sqlalchemy.engine.Engine, - index_col: Optional[Union[str, List[str]]] = None, - params: Optional[Union[List, Tuple, Dict]] = None, - chunksize: Optional[int] = None, - dtype: Optional[Dict[str, pa.DataType]] = None, -): - if not isinstance(con, sqlalchemy.engine.Engine): # pragma: no cover - raise exceptions.InvalidConnection( - "Invalid 'con' argument, please pass a " - "SQLAlchemy Engine. 
Use wr.db.get_engine(), " - "wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()" - ) + _validate_engine(con=con) with con.connect() as _con: args = _convert_params(sql, params) cursor = _con.execute(*args) if chunksize is None: - return fn(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) - return _iterate_cursor(fn=fn, cursor=cursor, chunksize=chunksize, index=index_col, dtype=dtype) - - -def _iterate_cursor( - fn: Callable, - cursor, - chunksize: int, - index: Optional[Union[str, List[str]]], - dtype: Optional[Dict[str, pa.DataType]] = None, -) -> Iterator[Any]: - while True: - records = cursor.fetchmany(chunksize) - if not records: - break - yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) - - -def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarray]: # pylint: disable=unused-argument - return np.array(records, dtype=float) + return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) + return _iterate_cursor( + fn=_records2df, cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype + ) def _records2df( @@ -229,6 +193,14 @@ def _records2df( return df +def _iterate_cursor(fn: Callable, cursor: Any, chunksize: int, **kwargs) -> Iterator[Any]: + while True: + records = cursor.fetchmany(chunksize) + if not records: + break + yield fn(records=records, **kwargs) + + def _convert_params(sql: str, params: Optional[Union[List, Tuple, Dict]]) -> List[Any]: args: List[Any] = [sql] if params is not None: @@ -1109,3 +1081,12 @@ def unload_redshift_to_files( paths = [x[0].replace(" ", "") for x in _con.execute(sql).fetchall()] _logger.debug(f"paths: {paths}") return paths + + +def _validate_engine(con: sqlalchemy.engine.Engine) -> None: # pragma: no cover + if not isinstance(con, sqlalchemy.engine.Engine): + raise exceptions.InvalidConnection( + "Invalid 'con' argument, please pass a " + "SQLAlchemy Engine. Use wr.db.get_engine(), " + "wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()" + ) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f2f869ac2..c083d52c5 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -121,7 +121,7 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. suffix: str, optional - Suffix for filtering S3 keys + Suffix for filtering S3 keys. Returns ------- diff --git a/awswrangler/torch.py b/awswrangler/torch.py index a5b6497d8..b27422750 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -3,16 +3,15 @@ import re import logging from io import BytesIO -from typing import Optional, Union, List +from typing import Any, Iterator, List, Optional, Tuple, Union -import sqlalchemy # type: ignore import numpy as np # type: ignore +import sqlalchemy # type: ignore import boto3 # type: ignore import torch from torch.utils.data.dataset import Dataset, IterableDataset - -from awswrangler import db, s3, _utils +from awswrangler import db, _utils, s3 _logger: logging.Logger = logging.getLogger(__name__) @@ -71,129 +70,135 @@ def label_fn(self, path): return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) -class AudioS3Dataset(_S3PartitionedDataset): - - def __init__(self): - """PyTorch S3 Audio Dataset. 
- - Assumes audio files are stored with the following structure: - - bucket - ├── class=0 - │ ├── audio0.wav - │ └── audio1.wav - └── class=1 - ├── audio2.wav - └── audio3.wav - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) - - """ - super(AudioS3Dataset, self).__init__() - import torchaudio - - def parser_fn(self, obj): - waveform, sample_rate = torchaudio.load(obj) - return waveform, sample_rate - - -class LambdaS3Dataset(_BaseS3Dataset): - """PyTorch S3 Audio Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> parse_fn = lambda x: torch.tensor(x) - >>> label_fn = lambda x: x.split('.')[-1] - >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) - - """ - def __init__(self, parse_fn, label_fn): - self._parse_fn = parse_fn - self._label_fn = label_fn - - def label_fn(self, path): - return self._label_fn(path) - - def parse_fn(self, obj): - return self._parse_fn(obj) - - -class ImageS3Dataset(_S3PartitionedDataset): - - def __init__(self): - """PyTorch Image S3 Dataset. - - Assumes Images are stored with the following structure: - - bucket - ├── class=0 - │ ├── img0.jpeg - │ └── img1.jpeg - └── class=1 - ├── img2.jpeg - └── img3.jpeg - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) - - """ - super(ImageS3Dataset, self).__init__() - from PIL import Image - from torchvision.transforms.functional import to_tensor - - def parser_fn(self, obj): - image = Image.open(obj) - tensor = to_tensor(image) - tensor.unsqueeze_(0) - return tensor +# class AudioS3Dataset(_S3PartitionedDataset): +# +# def __init__(self): +# """PyTorch S3 Audio Dataset. +# +# Assumes audio files are stored with the following structure: +# +# bucket +# ├── class=0 +# │ ├── audio0.wav +# │ └── audio1.wav +# └── class=1 +# ├── audio2.wav +# └── audio3.wav +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) +# +# """ +# super(AudioS3Dataset, self).__init__() +# import torchaudio +# +# def parser_fn(self, obj): +# waveform, sample_rate = torchaudio.load(obj) +# return waveform, sample_rate + + +# class LambdaS3Dataset(_BaseS3Dataset): +# """PyTorch S3 Audio Dataset. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> parse_fn = lambda x: torch.tensor(x) +# >>> label_fn = lambda x: x.split('.')[-1] +# >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) +# +# """ +# def __init__(self, parse_fn, label_fn): +# self._parse_fn = parse_fn +# self._label_fn = label_fn +# +# def label_fn(self, path): +# return self._label_fn(path) +# +# def parse_fn(self, obj): +# return self._parse_fn(obj) +# +# +# class ImageS3Dataset(_S3PartitionedDataset): +# +# def __init__(self): +# """PyTorch Image S3 Dataset. +# +# Assumes Images are stored with the following structure: +# +# bucket +# ├── class=0 +# │ ├── img0.jpeg +# │ └── img1.jpeg +# └── class=1 +# ├── img2.jpeg +# └── img3.jpeg +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) +# +# """ +# super(ImageS3Dataset, self).__init__() +# from PIL import Image +# from torchvision.transforms.functional import to_tensor +# +# def parser_fn(self, obj): +# image = Image.open(obj) +# tensor = to_tensor(image) +# tensor.unsqueeze_(0) +# return tensor class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" - def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[str], chunksize: Optional[int] = None,): + def __init__( + self, + sql: str, + con: sqlalchemy.engine.Engine, + label_col: Optional[Union[int, str]] = None, + chunksize: Optional[int] = None, + ): """Pytorch Iterable SQL Dataset. Support for **Redshift**, **PostgreSQL** and **MySQL**. @@ -205,6 +210,8 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[ con : sqlalchemy.engine.Engine SQLAlchemy Engine. 
Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() + label_col : int, optional + Label column number Returns ------- @@ -218,21 +225,53 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[ """ super().__init__() - self.sql = sql - self.con = con - self.label_col = label_col - self.chunksize = chunksize + self._sql = sql + self._con = con + self._label_col = label_col + self._chunksize = chunksize - def __iter__(self): + def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: """Iterate over the Dataset.""" - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: # single-process data loading, return the full iterator - pass - else: # in a worker process + if torch.utils.data.get_worker_info() is not None: # type: ignore raise NotImplementedError() - ret = db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con, chunksize=self.chunksize) - if isinstance(ret, np.ndarray): - ret = [ret] - for ds in ret: - for row in ds: - yield torch.as_tensor(row, dtype=torch.float) + db._validate_engine(con=self._con) + with self._con.connect() as con: + cursor: Any = con.execute(self._sql) + if (self._label_col is not None) and isinstance(self._label_col, str): + label_col: Optional[int] = list(cursor.keys()).index(self._label_col) + else: + label_col = self._label_col + _logger.debug(f"label_col: {label_col}") + return self._records2tensor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) + + @staticmethod + def _records2tensor( + cursor: Any, chunksize: Optional[int] = None, label_col: Optional[int] = None + ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: # pylint: disable=unused-argument + chunks: Iterator[Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]] + if chunksize is None: + chunks = iter([SQLDataset._records2numpy(records=cursor.fetchall(), label_col=label_col)]) + else: + chunks = db._iterate_cursor( # pylint: disable=protected-access + fn=SQLDataset._records2numpy, cursor=cursor, chunksize=chunksize, label_col=label_col + ) + if label_col is None: + for data in chunks: + for data_row in data: + yield torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member + for data, label in chunks: + for data_row, label_row in zip(data, label): + ts_data: torch.Tensor = torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member + ts_label: torch.Tensor = torch.as_tensor(label_row, dtype=torch.float) # pylint: disable=no-member + yield ts_data, ts_label + + @staticmethod + def _records2numpy( + records: List[Tuple[Any]], label_col: Optional[int] = None + ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # pylint: disable=unused-argument + arr: np.ndarray = np.array(records, dtype=np.float) + if label_col is None: + return arr + data: np.ndarray = np.concatenate([arr[:, :label_col], arr[:, (label_col + 1) :]], axis=1) # noqa: E203 + label: np.ndarray = arr[:, label_col] + return data, label diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index f30736c16..d39ec8ddb 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -52,14 +52,10 @@ def parameters(cloudformation_outputs): yield parameters -@pytest.mark.parametrize("db_type, chunksize", [ - ("mysql", None), - ("redshift", None), - ("postgresql", None), - ("mysql", 1), - ("redshift", 1), - ("postgresql", 1), -]) +@pytest.mark.parametrize( + 
"db_type, chunksize", + [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], +) def test_torch_sql(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] table = "test_torch_sql" @@ -73,7 +69,7 @@ def test_torch_sql(parameters, db_type, chunksize): index=False, index_label=None, chunksize=None, - method=None + method=None, ) ds = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize)) assert torch.all(ds[0].eq(torch.tensor([1.0, 4.0]))) @@ -81,14 +77,38 @@ def test_torch_sql(parameters, db_type, chunksize): assert torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) +@pytest.mark.parametrize( + "db_type, chunksize", + [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], +) +def test_torch_sql_label(parameters, db_type, chunksize): + schema = parameters[db_type]["schema"] + table = "test_torch_sql_label" + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0], "c": [7, 8, 9]}), + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + ) + ts = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize, label_col=2)) + assert torch.all(ts[0][0].eq(torch.tensor([1.0, 4.0]))) + assert torch.all(ts[0][1].eq(torch.tensor([7], dtype=torch.long))) + assert torch.all(ts[1][0].eq(torch.tensor([2.0, 5.0]))) + assert torch.all(ts[1][1].eq(torch.tensor([8], dtype=torch.long))) + assert torch.all(ts[2][0].eq(torch.tensor([3.0, 6.0]))) + assert torch.all(ts[2][1].eq(torch.tensor([9], dtype=torch.long))) + + def test_torch_image_s3(bucket): - s3 = boto3.client('s3') + s3 = boto3.client("s3") ref_label = 0 - s3.put_object( - Body=open("../../docs/source/_static/logo.png"), - Bucket=bucket, - Key=f'class={ref_label}/logo.png', - ) + s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, Key=f"class={ref_label}/logo.png") ds = wr.torch.ImageS3Dataset() for image, label in ds: assert image.shape == torch.Size([1, 28, 28]) @@ -100,4 +120,4 @@ def test_torch_image_s3(bucket): # ds = wr.torch.AudioS3Dataset() # for image, label in ds: # assert image.shape == torch.Size([1, 28, 28]) -# break \ No newline at end of file +# break From bf1be0746d0523d91db1d9181152150ff18c9919 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 09:20:41 -0300 Subject: [PATCH 07/59] Updating catersian product of pytest parameters --- testing/test_awswrangler/test_torch.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index d39ec8ddb..3c08ec319 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -52,10 +52,8 @@ def parameters(cloudformation_outputs): yield parameters -@pytest.mark.parametrize( - "db_type, chunksize", - [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], -) +@pytest.mark.parametrize("chunksize", [None, 1, 10]) +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] table = "test_torch_sql" @@ -77,10 +75,8 @@ def test_torch_sql(parameters, db_type, chunksize): assert 
torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) -@pytest.mark.parametrize( - "db_type, chunksize", - [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], -) +@pytest.mark.parametrize("chunksize", [None, 1, 10]) +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql_label(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] table = "test_torch_sql_label" From 1a41d1887217312298f2fab4f32e156fffb7e8d5 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 12:37:21 -0300 Subject: [PATCH 08/59] Pivoting SQLDataset parser strategy to avoid cast losses. --- awswrangler/db.py | 14 +- awswrangler/torch.py | 169 ++++++++++++------------- testing/test_awswrangler/test_torch.py | 18 +-- 3 files changed, 100 insertions(+), 101 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index e69739433..5d16301ad 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -2,7 +2,7 @@ import json import logging -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import quote_plus import boto3 # type: ignore @@ -162,7 +162,7 @@ def read_sql_query( if chunksize is None: return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) return _iterate_cursor( - fn=_records2df, cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype + cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype ) @@ -193,12 +193,18 @@ def _records2df( return df -def _iterate_cursor(fn: Callable, cursor: Any, chunksize: int, **kwargs) -> Iterator[Any]: +def _iterate_cursor( + cursor: Any, + chunksize: int, + cols_names: List[str], + index: Optional[Union[str, List[str]]], + dtype: Optional[Dict[str, pa.DataType]] = None, +) -> Iterator[pd.DataFrame]: while True: records = cursor.fetchmany(chunksize) if not records: break - yield fn(records=records, **kwargs) + yield _records2df(records=records, cols_names=cols_names, index=index, dtype=dtype) def _convert_params(sql: str, params: Optional[Union[List, Tuple, Dict]]) -> List[Any]: diff --git a/awswrangler/torch.py b/awswrangler/torch.py index b27422750..a73f4d198 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,79 +1,75 @@ """PyTorch Module.""" -import re import logging -from io import BytesIO +# import re +# from io import BytesIO from typing import Any, Iterator, List, Optional, Tuple, Union +# import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore -import boto3 # type: ignore import torch -from torch.utils.data.dataset import Dataset, IterableDataset +from torch.utils.data.dataset import IterableDataset -from awswrangler import db, _utils, s3 +from awswrangler import db # , s3, _utils _logger: logging.Logger = logging.getLogger(__name__) -class _BaseS3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset.""" - - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): - """PyTorch Map-Style S3 Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
- - Returns - ------- - torch.utils.data.Dataset - - """ - super().__init__() - self.session = _utils.ensure_session(session=boto3_session) - self.paths: List[str] = s3._path2list( - path=path, - suffix=suffix, - boto3_session=self.session, - ) - - def __getitem__(self, index): - path = self.paths[index] - obj = self._fetch_obj(path) - return [self.parser_fn(obj), self.label_fn(path)] - - def __len__(self): - return len(self.paths) - - def _fetch_obj(self, path): - bucket, key = _utils.parse_path(path=path) - buff = BytesIO() - client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) - client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) - return buff.seek(0) - - def parser_fn(self, obj): - pass - - def label_fn(self, path): - pass - - -class _S3PartitionedDataset(_BaseS3Dataset): - - def label_fn(self, path): - return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) +# class _BaseS3Dataset(Dataset): +# """PyTorch Map-Style S3 Dataset.""" +# +# def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): +# """Pytorch Map-Style S3 Dataset. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) +# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# """ +# super().__init__() +# self.session = _utils.ensure_session(session=boto3_session) +# self.paths: List[str] = s3._path2list(path=path, suffix=suffix, boto3_session=self.session) +# +# def __getitem__(self, index): +# path = self.paths[index] +# obj = self._fetch_obj(path) +# return [self.parser_fn(obj), self.label_fn(path)] +# +# def __len__(self): +# return len(self.paths) +# +# def _fetch_obj(self, path): +# bucket, key = _utils.parse_path(path=path) +# buff = BytesIO() +# client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) +# client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) +# return buff.seek(0) +# +# def parser_fn(self, obj): +# pass +# +# def label_fn(self, path): +# pass +# +# +# class _S3PartitionedDataset(_BaseS3Dataset): +# def label_fn(self, path): +# return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) # class AudioS3Dataset(_S3PartitionedDataset): # # def __init__(self): -# """PyTorch S3 Audio Dataset. +# """Pytorch S3 Audio Dataset. # # Assumes audio files are stored with the following structure: # @@ -88,7 +84,8 @@ def label_fn(self, path): # Parameters # ---------- # path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# S3 prefix (e.g. s3://bucket/prefix) +# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). # boto3_session : boto3.Session(), optional # Boto3 Session. The default boto3 session will be used if boto3_session receive None. # @@ -163,7 +160,8 @@ def label_fn(self, path): # Parameters # ---------- # path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# S3 prefix (e.g. s3://bucket/prefix) +# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). # boto3_session : boto3.Session(), optional # Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
# @@ -242,36 +240,31 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, else: label_col = self._label_col _logger.debug(f"label_col: {label_col}") - return self._records2tensor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) + if self._chunksize is None: + return SQLDataset._records2tensor(records=cursor.fetchall(), label_col=label_col) + return self._iterate_cursor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) @staticmethod - def _records2tensor( - cursor: Any, chunksize: Optional[int] = None, label_col: Optional[int] = None - ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: # pylint: disable=unused-argument - chunks: Iterator[Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]] - if chunksize is None: - chunks = iter([SQLDataset._records2numpy(records=cursor.fetchall(), label_col=label_col)]) - else: - chunks = db._iterate_cursor( # pylint: disable=protected-access - fn=SQLDataset._records2numpy, cursor=cursor, chunksize=chunksize, label_col=label_col - ) - if label_col is None: - for data in chunks: - for data_row in data: - yield torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member - for data, label in chunks: - for data_row, label_row in zip(data, label): - ts_data: torch.Tensor = torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member - ts_label: torch.Tensor = torch.as_tensor(label_row, dtype=torch.float) # pylint: disable=no-member - yield ts_data, ts_label + def _iterate_cursor( + cursor: Any, chunksize: int, label_col: Optional[int] = None + ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: + while True: + records = cursor.fetchmany(chunksize) + if not records: + break + yield from SQLDataset._records2tensor(records=records, label_col=label_col) @staticmethod - def _records2numpy( + def _records2tensor( records: List[Tuple[Any]], label_col: Optional[int] = None - ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # pylint: disable=unused-argument - arr: np.ndarray = np.array(records, dtype=np.float) - if label_col is None: - return arr - data: np.ndarray = np.concatenate([arr[:, :label_col], arr[:, (label_col + 1) :]], axis=1) # noqa: E203 - label: np.ndarray = arr[:, label_col] - return data, label + ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: # pylint: disable=unused-argument + for row in records: + if label_col is None: + arr_data: np.ndarray = np.array(row, dtype=np.float) + yield torch.as_tensor(arr_data, dtype=torch.float) # pylint: disable=no-member + else: + arr_data = np.array(row[:label_col] + row[label_col + 1 :], dtype=np.float) # noqa: E203 + arr_label: np.ndarray = np.array(row[label_col], dtype=np.long) + ts_data: torch.Tensor = torch.as_tensor(arr_data, dtype=torch.float) # pylint: disable=no-member + ts_label: torch.Tensor = torch.as_tensor(arr_label, dtype=torch.long) # pylint: disable=no-member + yield ts_data, ts_label diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 3c08ec319..456269244 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -101,15 +101,15 @@ def test_torch_sql_label(parameters, db_type, chunksize): assert torch.all(ts[2][1].eq(torch.tensor([9], dtype=torch.long))) -def test_torch_image_s3(bucket): - s3 = boto3.client("s3") - ref_label = 0 - s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, 
Key=f"class={ref_label}/logo.png") - ds = wr.torch.ImageS3Dataset() - for image, label in ds: - assert image.shape == torch.Size([1, 28, 28]) - assert label == torch.int(ref_label) - break +# def test_torch_image_s3(bucket): +# s3 = boto3.client("s3") +# ref_label = 0 +# s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, Key=f"class={ref_label}/logo.png") +# ds = wr.torch.ImageS3Dataset() +# for image, label in ds: +# assert image.shape == torch.Size([1, 28, 28]) +# assert label == torch.int(ref_label) +# break # def test_torch_audio_s3(bucket): From 36c15e48d6afbd9925f2f57f495c82c39ef16171 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Mon, 20 Apr 2020 15:05:09 -0300 Subject: [PATCH 09/59] tested lambda & image datasets --- awswrangler/torch.py | 359 +++++++++++++------------ testing/test_awswrangler/test_torch.py | 82 +++++- 2 files changed, 255 insertions(+), 186 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index a73f4d198..d797193e8 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,190 +1,195 @@ """PyTorch Module.""" +import re import logging -# import re -# from io import BytesIO -from typing import Any, Iterator, List, Optional, Tuple, Union -# import boto3 # type: ignore +import torch # type: ignore +import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore -import torch -from torch.utils.data.dataset import IterableDataset -from awswrangler import db # , s3, _utils +from PIL import Image +from io import BytesIO +from typing import Any, Iterator, List, Optional, Tuple, Union, Callable +from torch.utils.data.dataset import Dataset, IterableDataset +from torchvision.transforms.functional import to_tensor + +from awswrangler import db, s3, _utils _logger: logging.Logger = logging.getLogger(__name__) -# class _BaseS3Dataset(Dataset): -# """PyTorch Map-Style S3 Dataset.""" -# -# def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): -# """Pytorch Map-Style S3 Dataset. -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) -# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# """ -# super().__init__() -# self.session = _utils.ensure_session(session=boto3_session) -# self.paths: List[str] = s3._path2list(path=path, suffix=suffix, boto3_session=self.session) -# -# def __getitem__(self, index): -# path = self.paths[index] -# obj = self._fetch_obj(path) -# return [self.parser_fn(obj), self.label_fn(path)] -# -# def __len__(self): -# return len(self.paths) -# -# def _fetch_obj(self, path): -# bucket, key = _utils.parse_path(path=path) -# buff = BytesIO() -# client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) -# client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) -# return buff.seek(0) -# -# def parser_fn(self, obj): -# pass -# -# def label_fn(self, path): -# pass -# -# -# class _S3PartitionedDataset(_BaseS3Dataset): -# def label_fn(self, path): -# return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) - - -# class AudioS3Dataset(_S3PartitionedDataset): -# -# def __init__(self): -# """Pytorch S3 Audio Dataset. 
-# -# Assumes audio files are stored with the following structure: -# -# bucket -# ├── class=0 -# │ ├── audio0.wav -# │ └── audio1.wav -# └── class=1 -# ├── audio2.wav -# └── audio3.wav -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) -# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) -# -# """ -# super(AudioS3Dataset, self).__init__() -# import torchaudio -# -# def parser_fn(self, obj): -# waveform, sample_rate = torchaudio.load(obj) -# return waveform, sample_rate - - -# class LambdaS3Dataset(_BaseS3Dataset): -# """PyTorch S3 Audio Dataset. -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> parse_fn = lambda x: torch.tensor(x) -# >>> label_fn = lambda x: x.split('.')[-1] -# >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) -# -# """ -# def __init__(self, parse_fn, label_fn): -# self._parse_fn = parse_fn -# self._label_fn = label_fn -# -# def label_fn(self, path): -# return self._label_fn(path) -# -# def parse_fn(self, obj): -# return self._parse_fn(obj) -# -# -# class ImageS3Dataset(_S3PartitionedDataset): -# -# def __init__(self): -# """PyTorch Image S3 Dataset. -# -# Assumes Images are stored with the following structure: -# -# bucket -# ├── class=0 -# │ ├── img0.jpeg -# │ └── img1.jpeg -# └── class=1 -# ├── img2.jpeg -# └── img3.jpeg -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) -# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) -# -# """ -# super(ImageS3Dataset, self).__init__() -# from PIL import Image -# from torchvision.transforms.functional import to_tensor -# -# def parser_fn(self, obj): -# image = Image.open(obj) -# tensor = to_tensor(image) -# tensor.unsqueeze_(0) -# return tensor +class _BaseS3Dataset(Dataset): + """PyTorch Map-Style S3 Dataset.""" + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + """PyTorch Map-Style S3 Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + """ + super().__init__() + self.session = _utils.ensure_session(session=boto3_session) + self.paths: List[str] = s3._path2list( # pylint: disable=protected-access + path=path, + suffix=suffix, + boto3_session=self.session, + ) + + def __getitem__(self, index): + path = self.paths[index] + data = self._fetch_data(path) + return [self.data_fn(data), self.label_fn(path)] + + def __len__(self): + return len(self.paths) + + def _fetch_data(self, path): + bucket, key = _utils.parse_path(path=path) + buff = BytesIO() + client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) + client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) + buff.seek(0) + return buff + + def data_fn(self, obj): + pass + + def label_fn(self, path): + pass + + +class _S3PartitionedDataset(_BaseS3Dataset): + + def label_fn(self, path): + return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) + + +class LambdaS3Dataset(_BaseS3Dataset): + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session, data_fn: Callable, label_fn: Callable): + """PyTorch S3 Audio Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> data_fn = lambda x: torch.tensor(x) + >>> label_fn = lambda x: x.split('.')[-1] + >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), data_fn=data_fn, label_fn=label_fn) + + """ + super(LambdaS3Dataset, self).__init__(path, suffix, boto3_session) + self._data_fn = data_fn + self._label_fn = label_fn + + def label_fn(self, path): + return self._label_fn(path) + + def data_fn(self, data): + print(type(data), data) + return self._data_fn(data) + + +class AudioS3Dataset(_S3PartitionedDataset): + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + """PyTorch S3 Audio Dataset. + + Assumes audio files are stored with the following structure: + + bucket + ├── class=0 + │ ├── audio0.wav + │ └── audio1.wav + └── class=1 + ├── audio2.wav + └── audio3.wav + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(AudioS3Dataset, self).__init__(path, suffix, boto3_session) + + def data_fn(self, data): + waveform, sample_rate = torchaudio.load(data) + return waveform, sample_rate + + +class ImageS3Dataset(_S3PartitionedDataset): + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + """PyTorch Image S3 Dataset. + + Assumes Images are stored with the following structure: + + bucket + ├── class=0 + │ ├── img0.jpeg + │ └── img1.jpeg + └── class=1 + ├── img2.jpeg + └── img3.jpeg + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. 
s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) + + def data_fn(self, data): + image = Image.open(data) + tensor = to_tensor(image) + return tensor class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method @@ -232,7 +237,7 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, """Iterate over the Dataset.""" if torch.utils.data.get_worker_info() is not None: # type: ignore raise NotImplementedError() - db._validate_engine(con=self._con) + db._validate_engine(con=self._con) # pylint: disable=protected-access with self._con.connect() as con: cursor: Any = con.execute(self._sql) if (self._label_col is not None) and isinstance(self._label_col, str): diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 456269244..4f508b31c 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,10 +1,16 @@ import logging +import re import boto3 +import numpy as np import pandas as pd import pytest import torch +from PIL import Image +from torch.utils.data import DataLoader +from torchvision.transforms.functional import to_tensor + import awswrangler as wr logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") @@ -101,16 +107,74 @@ def test_torch_sql_label(parameters, db_type, chunksize): assert torch.all(ts[2][1].eq(torch.tensor([9], dtype=torch.long))) -# def test_torch_image_s3(bucket): -# s3 = boto3.client("s3") -# ref_label = 0 -# s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, Key=f"class={ref_label}/logo.png") -# ds = wr.torch.ImageS3Dataset() -# for image, label in ds: -# assert image.shape == torch.Size([1, 28, 28]) -# assert label == torch.int(ref_label) -# break +def test_torch_image_s3(bucket): + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + s3 = boto3.client("s3") + ref_label = 0 + s3.put_object( + Body=open("../../docs/source/_static/logo.png", "rb").read(), + Bucket=bucket, + Key=f"class={ref_label}/logo.png", + ContentType="image/png", + ) + ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + image, label = ds[0] + assert image.shape == torch.Size([4, 494, 1636]) + assert label == torch.tensor(ref_label, dtype=torch.int) + + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + +def test_torch_image_s3_dataloader(bucket): + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + s3 = boto3.client("s3") + labels = np.random.randint(0, 4, size=(8,)) + for i, label in enumerate(labels): + s3.put_object( + Body=open("../../docs/source/_static/logo.png", "rb").read(), + Bucket=bucket, + Key=f"class={label}/logo{i}.png", + ContentType="image/png", + ) + ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + batch_size = 2 + num_train = len(ds) + indices = list(range(num_train)) + loader = DataLoader( + ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) + ) 
+ for i, (image, label) in enumerate(loader): + assert image.shape == torch.Size([batch_size, 4, 494, 1636]) + assert label.dtype == torch.int64 + + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + +def test_torch_lambda_s3(bucket): + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + s3 = boto3.client("s3") + ref_label = 0 + s3.put_object( + Body=open("../../docs/source/_static/logo.png", "rb").read(), + Bucket=bucket, + Key=f"class={ref_label}/logo.png", + ContentType="image/png", + ) + ds = wr.torch.LambdaS3Dataset( + path=bucket, + suffix="png", + boto3_session=boto3.Session(), + data_fn=lambda x: to_tensor(Image.open(x)), + label_fn=lambda x: int(re.findall(r'/class=(.*?)/', x)[-1]), + ) + image, label = ds[0] + assert image.shape == torch.Size([4, 494, 1636]) + assert label == torch.tensor(ref_label, dtype=torch.int) + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) # def test_torch_audio_s3(bucket): # ds = wr.torch.AudioS3Dataset() From d4dcfc521f1f6cc8c0fdf1de485a7c29b8667cae Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Mon, 20 Apr 2020 15:35:11 -0300 Subject: [PATCH 10/59] add audio test --- awswrangler/torch.py | 3 ++- testing/test_awswrangler/test_torch.py | 29 +++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index d797193e8..4a6a76567 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,5 +1,4 @@ """PyTorch Module.""" - import re import logging @@ -7,6 +6,7 @@ import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore +import torchaudio from PIL import Image from io import BytesIO @@ -147,6 +147,7 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto super(AudioS3Dataset, self).__init__(path, suffix, boto3_session) def data_fn(self, data): + waveform, sample_rate = torchaudio.load(data) return waveform, sample_rate diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 4f508b31c..a54797440 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -138,12 +138,19 @@ def test_torch_image_s3_dataloader(bucket): Key=f"class={label}/logo{i}.png", ContentType="image/png", ) - ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + ds = wr.torch.ImageS3Dataset( + path=bucket, + suffix="png", + boto3_session=boto3.Session(), + ) batch_size = 2 num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) + ds, + batch_size=batch_size, + num_workers=4, + sampler=torch.utils.data.sampler.RandomSampler(indices), ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) @@ -176,8 +183,16 @@ def test_torch_lambda_s3(bucket): wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) -# def test_torch_audio_s3(bucket): -# ds = wr.torch.AudioS3Dataset() -# for image, label in ds: -# assert image.shape == torch.Size([1, 28, 28]) -# break + +def test_torch_audio_s3(bucket): + ds = wr.torch.AudioS3Dataset( + path="s3://multimedia-commons/data/videos/mp4/006/039/006039642c984a788569c7fea33ef3.mp4", + suffix="png", + boto3_session=boto3.Session(), + ) + loader = DataLoader( + ds, + batch_size=1, + ) + for image, label in loader: + assert image.shape == torch.Size([1, 28, 28]) From 
30dc2fa5b275c04b5d94dc9799e9653ab479f65e Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 18:22:50 -0300 Subject: [PATCH 11/59] Add test for torch.AudioS3Dataset --- .pylintrc | 3 +- awswrangler/torch.py | 111 +++++++++++++++---------- pytest.ini | 2 +- requirements-dev.txt | 3 +- testing/run-validations.sh | 2 +- testing/test_awswrangler/test_torch.py | 82 +++++++++--------- 6 files changed, 114 insertions(+), 89 deletions(-) diff --git a/.pylintrc b/.pylintrc index 132ce213a..4f41cb3fb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -141,7 +141,8 @@ disable=print-statement, comprehension-escape, C0330, C0103, - W1202 + W1202, + too-few-public-methods # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 4a6a76567..db09abc46 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,20 +1,21 @@ """PyTorch Module.""" -import re import logging +import os +import pathlib +import re +from io import BytesIO +from typing import Any, Callable, Iterator, List, Optional, Tuple, Union -import torch # type: ignore import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore -import torchaudio - -from PIL import Image -from io import BytesIO -from typing import Any, Iterator, List, Optional, Tuple, Union, Callable -from torch.utils.data.dataset import Dataset, IterableDataset -from torchvision.transforms.functional import to_tensor +import torch # type: ignore +import torchaudio # type: ignore +from PIL import Image # type: ignore +from torch.utils.data.dataset import Dataset, IterableDataset # type: ignore +from torchvision.transforms.functional import to_tensor # type: ignore -from awswrangler import db, s3, _utils +from awswrangler import _utils, db, s3 _logger: logging.Logger = logging.getLogger(__name__) @@ -22,7 +23,9 @@ class _BaseS3Dataset(Dataset): """PyTorch Map-Style S3 Dataset.""" - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + def __init__( + self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + ): """PyTorch Map-Style S3 Dataset. 
Parameters @@ -38,46 +41,51 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super().__init__() - self.session = _utils.ensure_session(session=boto3_session) - self.paths: List[str] = s3._path2list( # pylint: disable=protected-access - path=path, - suffix=suffix, - boto3_session=self.session, + self._session = _utils.ensure_session(session=boto3_session) + self._paths: List[str] = s3._path2list( # pylint: disable=protected-access + path=path, suffix=suffix, boto3_session=self._session ) def __getitem__(self, index): - path = self.paths[index] + path = self._paths[index] data = self._fetch_data(path) - return [self.data_fn(data), self.label_fn(path)] + return [self._data_fn(data), self._label_fn(path)] def __len__(self): - return len(self.paths) + return len(self._paths) def _fetch_data(self, path): bucket, key = _utils.parse_path(path=path) buff = BytesIO() - client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) + client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) buff.seek(0) return buff - def data_fn(self, obj): + def _data_fn(self, data): pass - def label_fn(self, path): + def _label_fn(self, path: str): pass class _S3PartitionedDataset(_BaseS3Dataset): - - def label_fn(self, path): - return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) + def _label_fn(self, path: str): + return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) class LambdaS3Dataset(_BaseS3Dataset): + """PyTorch S3 Lambda Dataset.""" - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session, data_fn: Callable, label_fn: Callable): - """PyTorch S3 Audio Dataset. + def __init__( + self, + path: Union[str, List[str]], + data_fn: Callable, + label_fn: Callable, + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + ): + """PyTorch S3 Lambda Dataset. Parameters ---------- @@ -94,26 +102,33 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto -------- >>> import awswrangler as wr >>> import boto3 - >>> data_fn = lambda x: torch.tensor(x) - >>> label_fn = lambda x: x.split('.')[-1] - >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), data_fn=data_fn, label_fn=label_fn) + >>> _data_fn = lambda x: torch.tensor(x) + >>> _label_fn = lambda x: x.split('.')[-1] + >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), _data_fn=_data_fn, _label_fn=_label_fn) """ super(LambdaS3Dataset, self).__init__(path, suffix, boto3_session) - self._data_fn = data_fn - self._label_fn = label_fn + self._data_func = data_fn + self._label_func = label_fn - def label_fn(self, path): - return self._label_fn(path) + def _label_fn(self, path: str): + return self._label_func(path) - def data_fn(self, data): - print(type(data), data) - return self._data_fn(data) + def _data_fn(self, data): + print(type(data)) + return self._data_func(data) class AudioS3Dataset(_S3PartitionedDataset): + """PyTorch S3 Audio Dataset.""" - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + def __init__( + self, + path: Union[str, List[str]], + cache_dir: str = "/tmp/", + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + ): """PyTorch S3 Audio Dataset. 
Assumes audio files are stored with the following structure: @@ -145,17 +160,27 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super(AudioS3Dataset, self).__init__(path, suffix, boto3_session) + self._cache_dir: str = cache_dir[:-1] if cache_dir.endswith("/") else cache_dir - def data_fn(self, data): - - waveform, sample_rate = torchaudio.load(data) + def _data_fn(self, filename: str) -> Tuple[Any, Any]: # pylint: disable=arguments-differ + waveform, sample_rate = torchaudio.load(filename) + os.remove(path=filename) return waveform, sample_rate + def _fetch_data(self, path: str) -> str: + bucket, key = _utils.parse_path(path=path) + filename: str = f"{self._cache_dir}/{bucket}/{key}" + pathlib.Path(filename).parent.mkdir(parents=True, exist_ok=True) + client_s3 = _utils.client(service_name="s3", session=self._session) + client_s3.download_file(Bucket=bucket, Key=key, Filename=filename) + return filename + class ImageS3Dataset(_S3PartitionedDataset): + """PyTorch S3 Image Dataset.""" def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): - """PyTorch Image S3 Dataset. + """PyTorch S3 Image Dataset. Assumes Images are stored with the following structure: @@ -187,7 +212,7 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) - def data_fn(self, data): + def _data_fn(self, data): image = Image.open(data) tensor = to_tensor(image) return tensor diff --git a/pytest.ini b/pytest.ini index d233cbf74..8e7a47ef1 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] addopts = --verbose - --capture=no + --capture=fd filterwarnings = ignore::DeprecationWarning ignore::UserWarning \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 137f57383..0491e8789 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -19,4 +19,5 @@ sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 torch~=1.4.0 -torchvision~=0.5.0 \ No newline at end of file +torchvision~=0.5.0 +torchaudio~=0.4.0 \ No newline at end of file diff --git a/testing/run-validations.sh b/testing/run-validations.sh index 966038ec9..d32fc7808 100755 --- a/testing/run-validations.sh +++ b/testing/run-validations.sh @@ -9,7 +9,7 @@ mv temp.yaml cloudformation.yaml pushd .. 
black --line-length 120 --target-version py36 awswrangler testing/test_awswrangler isort -rc --line-width 120 awswrangler testing/test_awswrangler -pydocstyle awswrangler/ --add-ignore=D204 +pydocstyle awswrangler/ --add-ignore=D204,D403 mypy awswrangler flake8 setup.py awswrangler testing/test_awswrangler pylint -j 0 awswrangler diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index a54797440..5b7a84b38 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,12 +1,12 @@ import logging - import re + import boto3 import numpy as np import pandas as pd import pytest import torch - +import torchaudio from PIL import Image from torch.utils.data import DataLoader from torchvision.transforms.functional import to_tensor @@ -108,91 +108,89 @@ def test_torch_sql_label(parameters, db_type, chunksize): def test_torch_image_s3(bucket): - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) - + path = f"s3://{bucket}/test_torch_image_s3/" + wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) s3 = boto3.client("s3") ref_label = 0 s3.put_object( - Body=open("../../docs/source/_static/logo.png", "rb").read(), + Body=open("docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"class={ref_label}/logo.png", + Key=f"test_torch_image_s3/class={ref_label}/logo.png", ContentType="image/png", ) - ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) image, label = ds[0] assert image.shape == torch.Size([4, 494, 1636]) assert label == torch.tensor(ref_label, dtype=torch.int) - - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + wr.s3.delete_objects(path=path) def test_torch_image_s3_dataloader(bucket): - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) - + path = f"s3://{bucket}/test_torch_image_s3_dataloader/" + wr.s3.delete_objects(path=path) s3 = boto3.client("s3") labels = np.random.randint(0, 4, size=(8,)) for i, label in enumerate(labels): s3.put_object( - Body=open("../../docs/source/_static/logo.png", "rb").read(), + Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"class={label}/logo{i}.png", + Key=f"test_torch_image_s3_dataloader/class={label}/logo{i}.png", ContentType="image/png", ) - ds = wr.torch.ImageS3Dataset( - path=bucket, - suffix="png", - boto3_session=boto3.Session(), - ) + ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) batch_size = 2 num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, - batch_size=batch_size, - num_workers=4, - sampler=torch.utils.data.sampler.RandomSampler(indices), + ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) assert label.dtype == torch.int64 - - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + wr.s3.delete_objects(path=path) def test_torch_lambda_s3(bucket): - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) - + path = f"s3://{bucket}/test_torch_lambda_s3/" + wr.s3.delete_objects(path=path) s3 = boto3.client("s3") ref_label = 0 s3.put_object( - Body=open("../../docs/source/_static/logo.png", "rb").read(), + Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - 
Key=f"class={ref_label}/logo.png", + Key=f"test_torch_lambda_s3/class={ref_label}/logo.png", ContentType="image/png", ) ds = wr.torch.LambdaS3Dataset( - path=bucket, + path=path, suffix="png", boto3_session=boto3.Session(), data_fn=lambda x: to_tensor(Image.open(x)), - label_fn=lambda x: int(re.findall(r'/class=(.*?)/', x)[-1]), + label_fn=lambda x: int(re.findall(r"/class=(.*?)/", x)[-1]), ) image, label = ds[0] assert image.shape == torch.Size([4, 494, 1636]) assert label == torch.tensor(ref_label, dtype=torch.int) - - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + wr.s3.delete_objects(path=path) def test_torch_audio_s3(bucket): - ds = wr.torch.AudioS3Dataset( - path="s3://multimedia-commons/data/videos/mp4/006/039/006039642c984a788569c7fea33ef3.mp4", - suffix="png", - boto3_session=boto3.Session(), - ) - loader = DataLoader( - ds, - batch_size=1, + size = (1, 8_000 * 5) + audio = torch.randint(low=-25, high=25, size=size) / 100.0 + audio_file = "/tmp/amazing_sound.wav" + torchaudio.save(audio_file, audio, 8_000) + path = f"s3://{bucket}/test_torch_audio_s3/" + wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) + s3 = boto3.client("s3") + ref_label = 0 + s3.put_object( + Body=open(audio_file, "rb").read(), + Bucket=bucket, + Key=f"test_torch_audio_s3/class={ref_label}/amazing_sound.wav", + ContentType="audio/wav", ) - for image, label in loader: - assert image.shape == torch.Size([1, 28, 28]) + s3_audio_file = f"{bucket}/test_torch_audio_s3/class={ref_label}/amazing_sound.wav" + ds = wr.torch.AudioS3Dataset(path=s3_audio_file, suffix="wav") + loader = DataLoader(ds, batch_size=1) + for (audio, rate), label in loader: + assert audio.shape == torch.Size((1, *size)) From 0376aefa25cc6ad9422e9a226a0d635bad64a63f Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 21 Apr 2020 23:26:58 -0300 Subject: [PATCH 12/59] Add chunked=INTEGER option to ensure batch number of rows #192 --- awswrangler/athena.py | 61 +++++++--- awswrangler/db.py | 29 ++++- awswrangler/s3.py | 124 +++++++++++++++++---- testing/test_awswrangler/test_data_lake.py | 36 ++++++ 4 files changed, 209 insertions(+), 41 deletions(-) diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 1933606ad..a899d405b 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -329,7 +329,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals database: str, ctas_approach: bool = True, categories: List[str] = None, - chunksize: Optional[int] = None, + chunksize: Optional[Union[int, bool]] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, @@ -353,10 +353,6 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals CONS: Slower (But stills faster than other libraries that uses the regular Athena API) and does not handle nested types at all. - Note - ---- - If `chunksize` is passed, then a Generator of DataFrames is returned. - Note ---- If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, @@ -367,6 +363,21 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) + Note + ---- + ``Batching`` (`chunksize` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. 
+ + There are two batching strategies on Wrangler: + + - If **chunksize=True**, a new DataFrame will be returned for each file in the query result. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunksize=True` if faster and uses less memory while `chunksize=INTEGER` is more precise + in number of rows for each Dataframe. + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -383,8 +394,10 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. + chunksize : Union[int, bool], optional + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. s3_output : str, optional AWS S3 path. workgroup : str, optional @@ -454,7 +467,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals catalog.delete_table_if_exists(database=database, table=name, boto3_session=session) manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) - chunked: bool = chunksize is not None + chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug(f"chunked: {chunked}") if not paths: if chunked is False: @@ -473,6 +486,8 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals path = f"{_s3_output}/{query_id}.csv" s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session) _logger.debug(f"Start CSV reading from {path}") + _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None + _logger.debug(f"_chunksize: {_chunksize}") ret = s3.read_csv( path=[path], dtype=dtype, @@ -481,7 +496,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals quoting=csv.QUOTE_ALL, keep_default_na=False, na_values=[""], - chunksize=chunksize, + chunksize=_chunksize, skip_blank_lines=False, use_threads=False, boto3_session=session, @@ -565,7 +580,7 @@ def read_sql_table( database: str, ctas_approach: bool = True, categories: List[str] = None, - chunksize: Optional[int] = None, + chunksize: Optional[Union[int, bool]] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, @@ -589,10 +604,6 @@ def read_sql_table( CONS: Slower (But stills faster than other libraries that uses the regular Athena API) and does not handle nested types at all - Note - ---- - If `chunksize` is passed, then a Generator of DataFrames is returned. - Note ---- If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, @@ -603,6 +614,21 @@ def read_sql_table( Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) + Note + ---- + ``Batching`` (`chunksize` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. 
+ + There are two batching strategies on Wrangler: + + - If **chunksize=True**, a new DataFrame will be returned for each file in the query result. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunksize=True` if faster and uses less memory while `chunksize=INTEGER` is more precise + in number of rows for each Dataframe. + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -619,8 +645,10 @@ def read_sql_table( categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. + chunksize : Union[int, bool], optional + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. s3_output : str, optional AWS S3 path. workgroup : str, optional @@ -646,6 +674,7 @@ def read_sql_table( >>> df = wr.athena.read_sql_table(table='...', database='...') """ + table = catalog.sanitize_table_name(table=table) return read_sql_query( sql=f'SELECT * FROM "{table}"', database=database, diff --git a/awswrangler/db.py b/awswrangler/db.py index 491fe7784..2c8ac2799 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -888,7 +888,7 @@ def unload_redshift( con: sqlalchemy.engine.Engine, iam_role: str, categories: List[str] = None, - chunked: bool = False, + chunked: Union[bool, int] = False, keep_files: bool = False, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, @@ -906,6 +906,22 @@ def unload_redshift( https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -926,9 +942,10 @@ def unload_redshift( Recommended for memory restricted environments. keep_files : bool Should keep the stage files? - chunked : bool - If True will break the data in smaller DataFrames (Non deterministic number of lines). - Otherwise return a single DataFrame with the whole data. + chunked : Union[int, bool] + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. 
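
As a rough usage sketch of the two `chunked` modes documented above for `wr.db.unload_redshift` (hedged: the connection name, query, staging path and IAM role are placeholders, and the `sql`/`path` parameters are assumed from the surrounding docstring rather than shown in this hunk):

import awswrangler as wr

engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")  # placeholder connection name

# chunked=True: one DataFrame per unloaded file (fastest, lowest memory overhead).
for df in wr.db.unload_redshift(
    sql="SELECT * FROM public.my_table",             # placeholder query
    path="s3://my-bucket/stage/",                    # assumed staging-path parameter
    con=engine,
    iam_role="arn:aws:iam::111111111111:role/demo",  # placeholder role
    chunked=True,
):
    print(len(df))  # non-deterministic number of rows per DataFrame

# chunked=100_000: DataFrames of 100,000 rows each (the last one may be smaller).
for df in wr.db.unload_redshift(
    sql="SELECT * FROM public.my_table",
    path="s3://my-bucket/stage/",
    con=engine,
    iam_role="arn:aws:iam::111111111111:role/demo",
    chunked=100_000,
):
    print(len(df))
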
@@ -979,6 +996,7 @@ def unload_redshift( return _read_parquet_iterator( paths=paths, categories=categories, + chunked=chunked, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, @@ -991,13 +1009,14 @@ def _read_parquet_iterator( keep_files: bool, use_threads: bool, categories: List[str] = None, + chunked: Union[bool, int] = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> Iterator[pd.DataFrame]: dfs: Iterator[pd.DataFrame] = s3.read_parquet( path=paths, categories=categories, - chunked=True, + chunked=chunked, dataset=False, use_threads=use_threads, boto3_session=boto3_session, diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..0127f8897 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1501,6 +1501,7 @@ def _read_parquet_init( filters=filters, read_dictionary=categories, validate_schema=validate_schema, + split_row_groups=False, ) return data @@ -1510,7 +1511,7 @@ def read_parquet( filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, columns: Optional[List[str]] = None, validate_schema: bool = True, - chunked: bool = False, + chunked: Union[bool, int] = False, dataset: bool = False, categories: List[str] = None, use_threads: bool = True, @@ -1522,6 +1523,22 @@ def read_parquet( The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning and catalog integration (AWS Glue Catalog). + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -1538,11 +1555,12 @@ def read_parquet( Check that individual file schemas are all the same / compatible. Schemas within a folder prefix should all be the same. Disable if you have schemas that are different and want to disable this check. - chunked : bool - If True will break the data in smaller DataFrames (Non deterministic number of lines). - Otherwise return a single DataFrame with the whole data. + chunked : Union[int, bool] + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. dataset: bool - If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. + If `True` read a parquet dataset instead of simple file(s) loading all the related partitions as columns. categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. 
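
The row-count batching described above boils down to concatenating Arrow tables and slicing fixed-size row windows. A simplified standalone sketch of that idea (hedged: helper name is illustrative, and it ignores the partitions, categories and schema promotion handled by the real `_read_parquet_chunked` below):

import pyarrow as pa

def _rows_in_chunks(tables, chunked: int):
    """Yield pandas DataFrames of `chunked` rows from a stream of Arrow tables (sketch)."""
    leftover = None
    for table in tables:
        if leftover is not None:
            table = pa.concat_tables([leftover, table])
            leftover = None
        while len(table) >= chunked:
            yield table.slice(0, chunked).to_pandas()
            table = table.slice(chunked)
        if len(table) > 0:
            leftover = table  # carry the remainder into the next file
    if leftover is not None:
        yield leftover.to_pandas()  # final chunk may hold fewer than `chunked` rows
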
@@ -1583,29 +1601,43 @@ def read_parquet( >>> import awswrangler as wr >>> df = wr.s3.read_parquet(path=['s3://bucket/filename0.parquet', 's3://bucket/filename1.parquet']) - Reading in chunks + Reading in chunks (Chunk by file) >>> import awswrangler as wr >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=True) >>> for df in dfs: >>> print(df) # Smaller Pandas DataFrame + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + """ data: pyarrow.parquet.ParquetDataset = _read_parquet_init( path=path, filters=filters, dataset=dataset, categories=categories, + validate_schema=validate_schema, use_threads=use_threads, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, - validate_schema=validate_schema, ) if chunked is False: return _read_parquet( data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema ) - return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads) + return _read_parquet_chunked( + data=data, + columns=columns, + categories=categories, + chunked=chunked, + use_threads=use_threads, + validate_schema=validate_schema, + ) def _read_parquet( @@ -1639,22 +1671,50 @@ def _read_parquet_chunked( data: pyarrow.parquet.ParquetDataset, columns: Optional[List[str]] = None, categories: List[str] = None, + validate_schema: bool = True, + chunked: Union[bool, int] = True, use_threads: bool = True, ) -> Iterator[pd.DataFrame]: + promote: bool = not validate_schema + next_slice: Optional[pa.Table] = None for piece in data.pieces: table: pa.Table = piece.read( columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False ) - yield table.to_pandas( - use_threads=use_threads, - split_blocks=True, - self_destruct=True, - integer_object_nulls=False, - date_as_object=True, - ignore_metadata=True, - categories=categories, - types_mapper=_data_types.pyarrow2pandas_extension, - ) + if chunked is True: + yield _table2df(table=table, categories=categories, use_threads=use_threads) + else: + if next_slice is not None: + table = pa.lib.concat_tables([next_slice, table], promote=promote) + length: int = len(table) + while True: + if length == chunked: + yield _table2df(table=table, categories=categories, use_threads=use_threads) + next_slice = None + break + if length < chunked: + next_slice = table + break + yield _table2df( + table=table.slice(offset=0, length=chunked), categories=categories, use_threads=use_threads + ) + table = table.slice(offset=chunked, length=None) + length = len(table) + if next_slice is not None: + yield _table2df(table=next_slice, categories=categories, use_threads=use_threads) + + +def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: + return table.to_pandas( + use_threads=use_threads, + split_blocks=True, + self_destruct=True, + integer_object_nulls=False, + date_as_object=True, + ignore_metadata=True, + categories=categories, + types_mapper=_data_types.pyarrow2pandas_extension, + ) def read_parquet_metadata( @@ -1972,13 +2032,30 @@ def read_parquet_table( filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, columns: Optional[List[str]] = None, categories: List[str] = None, - chunked: bool = False, + chunked: Union[bool, int] = 
False, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Read Apache Parquet table registered on AWS Glue Catalog. + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating + to return DataFrames with the number of row igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -2032,13 +2109,20 @@ def read_parquet_table( ... } ... ) - Reading Parquet Table in chunks + Reading Parquet Table in chunks (Chunk by file) >>> import awswrangler as wr >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True) >>> for df in dfs: >>> print(df) # Smaller Pandas DataFrame + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + """ path: str = catalog.get_table_location(database=database, table=table, boto3_session=boto3_session) return read_parquet( diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index afa2a8307..15369ee2c 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -3,6 +3,7 @@ import gzip import logging import lzma +import math from io import BytesIO, TextIOWrapper import boto3 @@ -1084,3 +1085,38 @@ def test_copy(bucket): wr.s3.delete_objects(path=path) wr.s3.delete_objects(path=path2) + + +@pytest.mark.parametrize("col2", [[1, 1, 1, 1, 1], [1, 2, 3, 4, 5], [1, 1, 1, 1, 2], [1, 2, 2, 2, 2]]) +@pytest.mark.parametrize("chunked", [True, 1, 2, 100]) +def test_parquet_chunked(bucket, database, col2, chunked): + table = f"test_parquet_chunked_{chunked}_{''.join([str(x) for x in col2])}" + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + values = list(range(5)) + df = pd.DataFrame({"col1": values, "col2": col2}) + paths = wr.s3.to_parquet( + df, path, index=False, dataset=True, database=database, table=table, partition_cols=["col2"], mode="overwrite" + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + + dfs = list(wr.s3.read_parquet(path=path, dataset=True, chunked=chunked)) + assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() + if chunked is not True: + assert len(dfs) == int(math.ceil(len(df) / chunked)) + for df2 in dfs[:-1]: + assert chunked == len(df2) + assert chunked >= len(dfs[-1]) + else: + assert len(dfs) == len(set(col2)) + + dfs = list(wr.athena.read_sql_table(database=database, table=table, chunksize=chunked)) + assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() + if chunked is not True: + assert len(dfs) == int(math.ceil(len(df) / chunked)) + for df2 in dfs[:-1]: + assert chunked == len(df2) + assert chunked >= len(dfs[-1]) + + wr.s3.delete_objects(path=paths) + assert 
wr.catalog.delete_table_if_exists(database=database, table=table) is True From 5a9a83f5dae7b6fe1cba06019c076a516b198756 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Wed, 22 Apr 2020 20:48:25 -0300 Subject: [PATCH 13/59] s3 iterable dataset --- awswrangler/torch.py | 143 ++++++++++++++++++++++--- testing/test_awswrangler/test_torch.py | 53 ++++++++- 2 files changed, 178 insertions(+), 18 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index db09abc46..c4dac13e5 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,6 +1,8 @@ """PyTorch Module.""" import logging +import io import os +import tarfile import pathlib import re from io import BytesIO @@ -20,8 +22,8 @@ _logger: logging.Logger = logging.getLogger(__name__) -class _BaseS3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset.""" +class _BaseS3Dataset: + """PyTorch Amazon S3 Map-Style Dataset.""" def __init__( self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None @@ -46,6 +48,52 @@ def __init__( path=path, suffix=suffix, boto3_session=self._session ) + def _fetch_data(self, path: str): + """Add parquet and csv support""" + bucket, key = _utils.parse_path(path=path) + buff = BytesIO() + client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) + client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) + buff.seek(0) + return buff + + @staticmethod + def _load_data(data: io.BytesIO, path: str): + if path.endswith('.tar.gz') or path.endswith('.tgz'): + pass + # tarfile.open(fileobj=data) + # tar = tarfile.open(fileobj=data) + # for member in tar.getmembers(): + # print('member', member) + elif path.endswith('.pt'): + data = torch.load(data) + return data + + +class _ListS3Dataset(_BaseS3Dataset, Dataset): + """PyTorch Amazon S3 Map-Style List Dataset.""" + + def __init__( + self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + ): + """PyTorch Map-Style List S3 Dataset. + + Each file under path would be handle as a single tensor. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + """ + super(_ListS3Dataset, self).__init__(path, suffix, boto3_session) + def __getitem__(self, index): path = self._paths[index] data = self._fetch_data(path) @@ -54,14 +102,6 @@ def __getitem__(self, index): def __len__(self): return len(self._paths) - def _fetch_data(self, path): - bucket, key = _utils.parse_path(path=path) - buff = BytesIO() - client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) - client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) - buff.seek(0) - return buff - def _data_fn(self, data): pass @@ -69,13 +109,56 @@ def _label_fn(self, path: str): pass -class _S3PartitionedDataset(_BaseS3Dataset): +class _S3PartitionedDataset(_ListS3Dataset): + """PyTorch Amazon S3 Map-Style Partitioned Dataset.""" + def _label_fn(self, path: str): return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) -class LambdaS3Dataset(_BaseS3Dataset): - """PyTorch S3 Lambda Dataset.""" +class S3FilesDataset(_BaseS3Dataset, Dataset): + """PyTorch Amazon S3 Files Map-Style Dataset.""" + + def __init__( + self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + ): + """PyTorch S3 Files Map-Style Dataset. + + Each file under Amazon S3 path would be handled as a batch of tensors. + All files will be loaded to memory since random access is needed. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + """ + super(S3FilesDataset, self).__init__(path, suffix, boto3_session) + + def _download_files(self): + self._data = [] + for path in self._paths: + data = self._fetch_data(path) + data = self._load_data(data, path) + self._data.append(data) + + self.data = torch.tensor(self._data) + + def __getitem__(self, index): + return self._data[index] + + def __len__(self): + return len(self._data) + + +class LambdaS3Dataset(_ListS3Dataset): + """PyTorch Amazon S3 Lambda Map-Style Dataset.""" def __init__( self, @@ -218,6 +301,40 @@ def _data_fn(self, data): return tensor +class S3IterableDataset(_BaseS3Dataset, IterableDataset): + """PyTorch Amazon S3 Iterable Dataset.""" + + def __init__( + self, + path: Union[str, List[str]], + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + ): + """PyTorch Amazon S3 Iterable Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + """ + super(S3IterableDataset, self).__init__(path, suffix, boto3_session) + self._paths_index = 0 + + def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: + for path in self._paths: + data = self._fetch_data(path) + data = self._load_data(data, path) + for d in data: + yield d + + class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 5b7a84b38..599976d33 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,5 +1,6 @@ -import logging +import io import re +import logging import boto3 import numpy as np @@ -125,13 +126,14 @@ def test_torch_image_s3(bucket): wr.s3.delete_objects(path=path) -def test_torch_image_s3_dataloader(bucket): +@pytest.mark.parametrize("drop_last", [True, False]) +def test_torch_image_s3_dataloader(bucket, drop_last): path = f"s3://{bucket}/test_torch_image_s3_dataloader/" wr.s3.delete_objects(path=path) - s3 = boto3.client("s3") + client_s3 = boto3.client("s3") labels = np.random.randint(0, 4, size=(8,)) for i, label in enumerate(labels): - s3.put_object( + client_s3.put_object( Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, Key=f"test_torch_image_s3_dataloader/class={label}/logo{i}.png", @@ -142,7 +144,7 @@ def test_torch_image_s3_dataloader(bucket): num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) + ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices), drop_last=drop_last ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) @@ -194,3 +196,44 @@ def test_torch_audio_s3(bucket): loader = DataLoader(ds, batch_size=1) for (audio, rate), label in loader: assert audio.shape == torch.Size((1, *size)) + + +# def test_torch_s3_file_dataset(bucket): +# cifar10 = "s3://fast-ai-imageclas/cifar10.tgz" +# batch_size = 64 +# for image, label in DataLoader( +# wr.torch.S3FilesDataset(cifar10), +# batch_size=batch_size, +# ): +# assert image.shape == torch.Size([batch_size, 3, 32, 32]) +# assert label.dtype == torch.int64 +# break + + +@pytest.mark.parametrize("drop_last", [True, False]) +def test_torch_s3_iterable_dataset(bucket, drop_last): + folder = "test_torch_s3_iterable_dataset" + batch_size = 32 + client_s3 = boto3.client("s3") + for i in range(3): + batch = torch.randn(100, 3, 32, 32) + buff = io.BytesIO() + torch.save(batch, buff) + buff.seek(0) + client_s3.put_object( + Body=buff.read(), + Bucket=bucket, + Key=f"{folder}/file{i}.pt", + ) + + for image in DataLoader( + wr.torch.S3IterableDataset( + path=f"s3://{bucket}/{folder}", + ), + batch_size=batch_size, + drop_last=drop_last, + ): + if drop_last: + assert image.shape == torch.Size([batch_size, 3, 32, 32]) + else: + assert image[0].shape == torch.Size([3, 32, 32]) From 60232f44a5663fce3cdd82b7b5dcaaf431fa2b76 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Wed, 22 Apr 2020 22:09:58 -0300 Subject: [PATCH 14/59] add tutorial draft --- tutorials/14 - PyTorch.ipynb | 249 +++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 tutorials/14 - PyTorch.ipynb diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb 
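The tutorial below drafts the same pattern the new tests exercise. As a minimal sketch (the bucket/prefix is a placeholder and the objects are assumed to be `.pt` tensors written with `torch.save`, as in `test_torch_s3_iterable_dataset`), streaming those files into a `DataLoader` looks roughly like:

import torch
from torch.utils.data import DataLoader

import awswrangler as wr

# Placeholder S3 prefix holding file0.pt, file1.pt, ... written with torch.save().
ds = wr.torch.S3IterableDataset(path="s3://my-bucket/test_torch_s3_iterable_dataset")

# Each .pt object holds a (100, 3, 32, 32) tensor; the DataLoader re-chunks the
# stream into fixed-size batches, dropping the incomplete tail batch.
for images in DataLoader(ds, batch_size=32, drop_last=True):
    assert images.shape == torch.Size([32, 3, 32, 32])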
new file mode 100644 index 000000000..757c817f9 --- /dev/null +++ b/tutorials/14 - PyTorch.ipynb @@ -0,0 +1,249 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "* [1.Defining Training Function](#1.-Defininf-Training-Function)\n", + "* [2.Traning From Amazon S3](#1.-Traning-From-Amazon-S3)\n", + "\t* [2.1 Writing PyTorch Dataset to S3](#1.1-Writing-PyTorch-Dataset-to-S3)\n", + "\t* [2.2 Training Network](#1.2-Training-Network)\n", + "* [3. Training From SQL Query](#2.-Training-From-SQL-Query)\n", + "\t* [3.1 Writing Data to SQL Database](#2.1-Writing-Data-to-SQL-Database)\n", + "\t* [3.3 Training Network From SQL](#2.2-Reading-single-JSON-file)\n", + "* [4. Creating Custom S3 Dataset](#1.-Creating-Custom-S3-Dataset)\n", + "\t* [4.1 Creating Custom PyTorch Dataset](#1.1-Creating-Custom-PyTorch-Dataset)\n", + "\t* [4.2 Writing Data to S3](#1.1-Writing-Data-to-S3)\n", + "\t* [4.3 Training Network](#1.2-Training-Network)\n", + "* [5. Delete objects](#6.-Delete-objects)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "import boto3\n", + "import torch\n", + "import torchvision\n", + "import awswrangler as wr\n", + "\n", + "accuracy = lambda o, l: 100/o.size(0) * (torch.max(o.data, 1)[1] == l).sum().item()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Defining Training Function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, dataset):\n", + " criterion = torch.nn.CrossEntropyLoss()\n", + " opt = torch.optim.SGD(model.parameters(), 0.025)\n", + "\n", + " for epoch in range(2):\n", + "\n", + " model.train()\n", + " for inputs, labels in torch.utils.data.DataLoader(\n", + " dataset,\n", + " batch_size=64,\n", + " num_workers=2,\n", + " ):\n", + "\n", + " outputs = model(inputs)\n", + " loss = criterion(outputs, labels)\n", + " loss.backward()\n", + " opt.step()s\n", + " opt.zero_grad()\n", + "\n", + " acc = accuracy(outputs, labels)\n", + " print(f'batch: {i} loss: {loss.mean().item():.4f} batch_acc: {acc:.2f}') " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. 
Traning From Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client_s3 = boto3.client(\"s3\")\n", + "folder = \"tutorial_torch_dataset\"\n", + "for i in range(3):\n", + " batch = (\n", + " torch.randn(100, 3, 32, 32),\n", + " torch.randint(1, size=(100,)),\n", + " )\n", + " buff = io.BytesIO()\n", + " torch.save(batch, buff)\n", + " buff.seek(0)\n", + " client_s3.put_object(\n", + " Body=buff.read(),\n", + " Bucket=bucket,\n", + " Key=f\"{folder}/file{i}.pt\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Training Network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train(\n", + " torchvision.models.resnet18(),\n", + " wr.torch.S3IterableDataset(path=f\"s3://{bucket}/{folder}\"),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Training Directly From SQL Query" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Writing Data to SQL Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eng = wr.catalog.get_engine(\"aws-data-wrangler-redshift\")\n", + "df = pd.DataFrame({\n", + " \"height\": [2, 1.4, 1.7, 1.8, 1.9],\n", + " \"name\": [\"foo\", \"boo\"],\n", + " \"target\": [1, 0, 0, 1, 2, 3]\n", + "})\n", + "\n", + "wr.db.to_sql(\n", + " df,\n", + " eng_redshift,\n", + " schema=\"public\",\n", + " name=\"torch\",\n", + " if_exists=\"replace\",\n", + " index=False\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Training Network From SQL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train(\n", + " model = torch.nn.Sequential(\n", + " torch.nn.Linear(, 20),\n", + " torch.nn.ReLU(),\n", + " torch.nn.Linear(20, 2), \n", + " ),\n", + " wr.torch.SQLDataset(\n", + " sql=\"SELECT * FROM public.torch\"\n", + " con=eng\n", + " label_col=\"target\",\n", + " chunksize=100\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Delete Objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wr.s3.delete_objects(f\"s3://{bucket}/\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_pytorch_p36", + "language": "python", + "name": "conda_pytorch_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 215fbd54c75f852267ced4777f9956391f4bb989 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 11:51:00 -0300 Subject: [PATCH 15/59] add torch extras_requirements to setuptools --- requirements-torch.txt | 3 +++ setup.py | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 requirements-torch.txt diff --git a/requirements-torch.txt b/requirements-torch.txt new file mode 100644 index 000000000..325196f07 --- /dev/null +++ b/requirements-torch.txt @@ -0,0 +1,3 @@ +torch~=1.4.0 +torchvision~=0.5.0 +torchaudio~=0.4.0 \ No newline at end of file diff --git a/setup.py b/setup.py index b363e6e58..f9c861a60 100644 --- a/setup.py +++ b/setup.py @@ -23,4 +23,7 @@ packages=find_packages(include=["awswrangler", "awswrangler.*"], exclude=["tests"]), python_requires=">=3.6, <3.9", install_requires=[open("requirements.txt").read().strip().split("\n")], + extras_require={ + "torch": open("requirements-torch.txt").read().strip().split("\n") + } ) From 0ad9e4bf16e015562aeaed0a635ca970335b420f Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 12:46:30 -0300 Subject: [PATCH 16/59] handle labels in S3IterableDataset --- awswrangler/torch.py | 11 ++++++++ testing/test_awswrangler/test_torch.py | 37 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index c4dac13e5..29343983b 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -5,6 +5,7 @@ import tarfile import pathlib import re +from collections import Iterable from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Tuple, Union @@ -331,10 +332,20 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, for path in self._paths: data = self._fetch_data(path) data = self._load_data(data, path) + + if isinstance(data, torch.Tensor): + pass + elif isinstance(data, Iterable) and all([isinstance(d, torch.Tensor) for d in data]): + data = zip(data) + else: + raise NotImplementedError(f"ERROR: Type: {type(data)} has not been implemented!") + for d in data: yield d + + class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 599976d33..aaac654a4 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -237,3 +237,40 @@ def test_torch_s3_iterable_dataset(bucket, drop_last): assert image.shape == torch.Size([batch_size, 3, 32, 32]) else: assert image[0].shape == torch.Size([3, 32, 32]) + + +@pytest.mark.parametrize("drop_last", [True, False]) +def test_torch_s3_iterable_with_labels(bucket, drop_last): + folder = "test_torch_s3_iterable_dataset" + batch_size = 32 + client_s3 = boto3.client("s3") + for i in range(3): + batch = ( + torch.randn(100, 3, 32, 32), + torch.randint(2, 
size=(100,)), + ) + buff = io.BytesIO() + torch.save(batch, buff) + buff.seek(0) + client_s3.put_object( + Body=buff.read(), + Bucket=bucket, + Key=f"{folder}/file{i}.pt", + ) + + for images, labels in DataLoader( + wr.torch.S3IterableDataset( + path=f"s3://{bucket}/{folder}", + ), + batch_size=batch_size, + drop_last=drop_last, + ): + if drop_last: + assert images.shape == torch.Size([batch_size, 3, 32, 32]) + assert labels.dtype == torch.int64 + assert labels.size == torch.Size([batch_size, 1]) + + else: + assert images[0].shape == torch.Size([3, 32, 32]) + assert labels.dtype == torch.int64 + assert labels.size == torch.Size([1]) \ No newline at end of file From 5e72ddf1232c8cde025080a32ebf4cac398f833c Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 12:59:01 -0300 Subject: [PATCH 17/59] clear bucket in S3Iterable Dataset test --- awswrangler/torch.py | 2 +- testing/test_awswrangler/test_torch.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 29343983b..5e4365062 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -336,7 +336,7 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, if isinstance(data, torch.Tensor): pass elif isinstance(data, Iterable) and all([isinstance(d, torch.Tensor) for d in data]): - data = zip(data) + data = zip(*data) else: raise NotImplementedError(f"ERROR: Type: {type(data)} has not been implemented!") diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index aaac654a4..83630b0e7 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -211,8 +211,10 @@ def test_torch_audio_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) -def test_torch_s3_iterable_dataset(bucket, drop_last): - folder = "test_torch_s3_iterable_dataset" +def test_torch_s3_iterable(bucket, drop_last): + folder = "test_torch_s3_iterable" + path = f"s3://{bucket}/{folder}/" + wr.s3.delete_objects(path=path) batch_size = 32 client_s3 = boto3.client("s3") for i in range(3): @@ -241,7 +243,9 @@ def test_torch_s3_iterable_dataset(bucket, drop_last): @pytest.mark.parametrize("drop_last", [True, False]) def test_torch_s3_iterable_with_labels(bucket, drop_last): - folder = "test_torch_s3_iterable_dataset" + folder = "test_torch_s3_iterable_with_labels" + path = f"s3://{bucket}/{folder}/" + wr.s3.delete_objects(path=path) batch_size = 32 client_s3 = boto3.client("s3") for i in range(3): @@ -268,9 +272,9 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): if drop_last: assert images.shape == torch.Size([batch_size, 3, 32, 32]) assert labels.dtype == torch.int64 - assert labels.size == torch.Size([batch_size, 1]) + assert labels.shape == torch.Size([batch_size]) else: assert images[0].shape == torch.Size([3, 32, 32]) - assert labels.dtype == torch.int64 - assert labels.size == torch.Size([1]) \ No newline at end of file + assert labels[0].dtype == torch.int64 + assert labels[0].shape == torch.Size([]) From 5b399ac656be04e1c4cb5cf454cad5ea474a4b10 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 13:10:07 -0300 Subject: [PATCH 18/59] update setuptools --- requirements-dev.txt | 5 +---- setup-dev-env.sh | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 0491e8789..3fdd3cdf3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,7 +17,4 @@ twine~=3.1.1 
wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 -torch~=1.4.0 -torchvision~=0.5.0 -torchaudio~=0.4.0 \ No newline at end of file +moto~=1.3.14 \ No newline at end of file diff --git a/setup-dev-env.sh b/setup-dev-env.sh index 692724ee0..c9c2e9902 100755 --- a/setup-dev-env.sh +++ b/setup-dev-env.sh @@ -3,5 +3,4 @@ set -ex pip install --upgrade pip pip install -r requirements-dev.txt -pip install -r requirements.txt -pip install -e . +pip install -e ".[torch]" From 2db15b6d09ae1d5e80d325a7daf44ab8c163eeef Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 13:23:08 -0300 Subject: [PATCH 19/59] update pytorch tutorial --- tutorials/14 - PyTorch.ipynb | 69 +++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index 757c817f9..fefb8332a 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -45,12 +45,14 @@ "import torchvision\n", "import awswrangler as wr\n", "\n", - "accuracy = lambda o, l: 100/o.size(0) * (torch.max(o.data, 1)[1] == l).sum().item()" + "from torch.optim import SGD\n", + "from torch.nn import CrossEntropyLoss\n", + "from torch.utils.data import DataLoader" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -67,31 +69,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "def train(model, dataset):\n", - " criterion = torch.nn.CrossEntropyLoss()\n", - " opt = torch.optim.SGD(model.parameters(), 0.025)\n", + "def train(model, dataset, batch_size=64, epochs=2, device='cpu'):\n", + "\n", + " criterion = CrossEntropyLoss().to(device)\n", + " opt = SGD(model.parameters(), 0.025)\n", + " loader = DataLoader(dataset, batch_size=batch_size, num_workers=1)\n", "\n", - " for epoch in range(2):\n", + " for epoch in range(epochs):\n", "\n", + " correct = 0 \n", " model.train()\n", - " for inputs, labels in torch.utils.data.DataLoader(\n", - " dataset,\n", - " batch_size=64,\n", - " num_workers=2,\n", - " ):\n", + " for i, (inputs, labels) in enumerate(loader):\n", "\n", + " # Forward Pass\n", " outputs = model(inputs)\n", + " \n", + " # Backward Pass\n", " loss = criterion(outputs, labels)\n", " loss.backward()\n", - " opt.step()s\n", + " opt.step()\n", " opt.zero_grad()\n", + " \n", + " # Accuracy\n", + " _, predicted = torch.max(outputs.data, 1)\n", + " correct += (predicted == labels).sum().item()\n", + " accuracy = 100 * correct / ((i+1) * batch_size)\n", "\n", - " acc = accuracy(outputs, labels)\n", - " print(f'batch: {i} loss: {loss.mean().item():.4f} batch_acc: {acc:.2f}') " + " print(f'batch: {i} loss: {loss.mean().item():.4f} acc: {accuracy:.2f}') " ] }, { @@ -103,16 +111,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "client_s3 = boto3.client(\"s3\")\n", "folder = \"tutorial_torch_dataset\"\n", + "\n", + "wr.s3.delete_objects(f\"s3://{bucket}/{folder}\")\n", "for i in range(3):\n", " batch = (\n", " torch.randn(100, 3, 32, 32),\n", - " torch.randint(1, size=(100,)),\n", + " torch.randint(2, size=(100,)),\n", " )\n", " buff = io.BytesIO()\n", " torch.save(batch, buff)\n", @@ -133,13 +143,30 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch: 0 loss: 6.9552 acc: 0.00\n", + "batch: 1 loss: 2.9621 acc: 23.44\n", + "batch: 2 loss: 0.9873 acc: 31.77\n", + "batch: 3 loss: 1.9760 acc: 34.38\n", + "batch: 4 loss: 3.3523 acc: 33.44\n", + "batch: 0 loss: 1.2023 acc: 59.38\n", + "batch: 1 loss: 0.8057 acc: 60.16\n", + "batch: 2 loss: 0.6782 acc: 62.50\n", + "batch: 3 loss: 0.4291 acc: 67.58\n", + "batch: 4 loss: 0.2953 acc: 66.88\n" + ] + } + ], "source": [ "train(\n", " torchvision.models.resnet18(),\n", - " wr.torch.S3IterableDataset(path=f\"s3://{bucket}/{folder}\"),\n", + " wr.torch.S3IterableDataset(path=f\"{bucket}/{folder}\")\n", ")" ] }, From 5e647c66d0f4df62ed360d73d0a3a3aa0bbda06c Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 23 Apr 2020 17:01:50 +0000 Subject: [PATCH 20/59] Update tutorial --- tutorials/14 - PyTorch.ipynb | 101 +++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 33 deletions(-) diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index fefb8332a..a3d988881 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -40,9 +40,11 @@ "outputs": [], "source": [ "import io\n", + "\n", "import boto3\n", "import torch\n", "import torchvision\n", + "import pandas as pd\n", "import awswrangler as wr\n", "\n", "from torch.optim import SGD\n", @@ -54,7 +56,15 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], "source": [ "import getpass\n", "bucket = getpass.getpass()" @@ -69,15 +79,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "def train(model, dataset, batch_size=64, epochs=2, device='cpu'):\n", + "def train(model, dataset, batch_size=64, epochs=2, device='cpu', num_workers=1):\n", "\n", " criterion = CrossEntropyLoss().to(device)\n", " opt = SGD(model.parameters(), 0.025)\n", - " loader = DataLoader(dataset, batch_size=batch_size, num_workers=1)\n", + " loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)\n", "\n", " for epoch in range(epochs):\n", "\n", @@ -111,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -143,23 +153,23 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch: 0 loss: 6.9552 acc: 0.00\n", - "batch: 1 loss: 2.9621 acc: 23.44\n", - "batch: 2 loss: 0.9873 acc: 31.77\n", - "batch: 3 loss: 1.9760 acc: 34.38\n", - "batch: 4 loss: 3.3523 acc: 33.44\n", - "batch: 0 loss: 1.2023 acc: 59.38\n", - "batch: 1 loss: 0.8057 acc: 60.16\n", - "batch: 2 loss: 0.6782 acc: 62.50\n", - "batch: 3 loss: 0.4291 acc: 67.58\n", - "batch: 4 loss: 0.2953 acc: 66.88\n" + "batch: 0 loss: 7.0221 acc: 0.00\n", + "batch: 1 loss: 2.7788 acc: 23.44\n", + "batch: 2 loss: 0.9828 acc: 32.29\n", + "batch: 3 loss: 0.9414 acc: 39.45\n", + "batch: 4 loss: 1.0737 acc: 39.38\n", + "batch: 0 loss: 1.2178 acc: 50.00\n", + "batch: 1 loss: 1.4069 acc: 51.56\n", + "batch: 2 loss: 1.0783 acc: 52.08\n", + "batch: 3 loss: 0.9926 acc: 52.34\n", + "batch: 4 loss: 1.1111 acc: 49.06\n" ] } ], @@ -186,20 +196,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
6, "metadata": {}, "outputs": [], "source": [ "eng = wr.catalog.get_engine(\"aws-data-wrangler-redshift\")\n", "df = pd.DataFrame({\n", - " \"height\": [2, 1.4, 1.7, 1.8, 1.9],\n", - " \"name\": [\"foo\", \"boo\"],\n", - " \"target\": [1, 0, 0, 1, 2, 3]\n", + " \"height\": [2, 1.4, 1.7, 1.8, 1.9, 2.2],\n", + " \"weigth\": [100.0, 50.0, 70.0, 80.0, 90.0, 160.0],\n", + " \"target\": [1, 0, 0, 1, 1, 1]\n", "})\n", "\n", "wr.db.to_sql(\n", " df,\n", - " eng_redshift,\n", + " eng,\n", " schema=\"public\",\n", " name=\"torch\",\n", " if_exists=\"replace\",\n", @@ -216,22 +226,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch: 0 loss: 5.0253 acc: 50.00\n", + "batch: 1 loss: 21.3174 acc: 50.00\n", + "batch: 2 loss: 0.5061 acc: 66.67\n", + "batch: 0 loss: 1.2222 acc: 50.00\n", + "batch: 1 loss: 0.7075 acc: 50.00\n", + "batch: 2 loss: 0.7077 acc: 50.00\n", + "batch: 0 loss: 0.9302 acc: 50.00\n", + "batch: 1 loss: 0.6960 acc: 50.00\n", + "batch: 2 loss: 0.6018 acc: 66.67\n", + "batch: 0 loss: 1.1284 acc: 50.00\n", + "batch: 1 loss: 0.7077 acc: 50.00\n", + "batch: 2 loss: 0.6791 acc: 50.00\n", + "batch: 0 loss: 1.0030 acc: 50.00\n", + "batch: 1 loss: 0.7053 acc: 50.00\n", + "batch: 2 loss: 0.6318 acc: 50.00\n" + ] + } + ], "source": [ "train(\n", - " model = torch.nn.Sequential(\n", - " torch.nn.Linear(, 20),\n", + " torch.nn.Sequential(\n", + " torch.nn.Linear(2, 10),\n", " torch.nn.ReLU(),\n", - " torch.nn.Linear(20, 2), \n", + " torch.nn.Linear(10, 2), \n", " ),\n", " wr.torch.SQLDataset(\n", - " sql=\"SELECT * FROM public.torch\"\n", - " con=eng\n", + " sql=\"SELECT * FROM public.torch\",\n", + " con=eng,\n", " label_col=\"target\",\n", - " chunksize=100\n", - " )\n", + " chunksize=2\n", + " ),\n", + " num_workers=0,\n", + " batch_size=2,\n", + " epochs=5\n", ")" ] }, @@ -244,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -254,9 +289,9 @@ ], "metadata": { "kernelspec": { - "display_name": "conda_pytorch_p36", + "display_name": "conda_python3", "language": "python", - "name": "conda_pytorch_p36" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { From b3d9fe2d9d4c1563aecb225f6a2b678414df41ab Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 18:14:55 -0300 Subject: [PATCH 21/59] parallel tests fix --- awswrangler/torch.py | 209 ++++++++++++++++--------- building/build-docs.sh | 2 +- docs/source/api.rst | 13 ++ testing/test_awswrangler/test_torch.py | 37 +++-- 4 files changed, 169 insertions(+), 92 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 5e4365062..c25e145ee 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -113,49 +113,54 @@ def _label_fn(self, path: str): class _S3PartitionedDataset(_ListS3Dataset): """PyTorch Amazon S3 Map-Style Partitioned Dataset.""" - def _label_fn(self, path: str): - return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) - - -class S3FilesDataset(_BaseS3Dataset, Dataset): - """PyTorch Amazon S3 Files Map-Style Dataset.""" - - def __init__( - self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None - ): - """PyTorch S3 Files Map-Style Dataset. - - Each file under Amazon S3 path would be handled as a batch of tensors. - All files will be loaded to memory since random access is needed. 
- - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - """ - super(S3FilesDataset, self).__init__(path, suffix, boto3_session) - - def _download_files(self): - self._data = [] - for path in self._paths: - data = self._fetch_data(path) - data = self._load_data(data, path) - self._data.append(data) - - self.data = torch.tensor(self._data) - - def __getitem__(self, index): - return self._data[index] - - def __len__(self): - return len(self._data) + def _label_fn(self, path: str) -> torch.Tensor: + label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) + return torch.tensor([label]) + + +# class S3FilesDataset(_BaseS3Dataset, Dataset): +# """PyTorch Amazon S3 Files Map-Style Dataset.""" +# +# def __init__( +# self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +# ): +# """PyTorch S3 Files Map-Style Dataset. +# +# Each file under Amazon S3 path would be handled as a tensor or batch of tensors. +# +# Note +# ---- +# All files will be loaded to memory since random access is needed. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# """ +# super(S3FilesDataset, self).__init__(path, suffix, boto3_session) +# self._download_files() +# +# def _download_files(self) -> None: +# self._data = [] +# for path in self._paths: +# data = self._fetch_data(path) +# data = self._load_data(data, path) +# self._data.append(data) +# +# self.data = torch.cat(self._data, dim=0) +# +# def __getitem__(self, index): +# return self._data[index] +# +# def __len__(self): +# return len(self._data) class LambdaS3Dataset(_ListS3Dataset): @@ -169,7 +174,7 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch S3 Lambda Dataset. + """PyTorch Amazon S3 Lambda Dataset. Parameters ---------- @@ -184,22 +189,24 @@ def __init__( Examples -------- + >>> import re + >>> import torch >>> import awswrangler as wr - >>> import boto3 - >>> _data_fn = lambda x: torch.tensor(x) - >>> _label_fn = lambda x: x.split('.')[-1] - >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), _data_fn=_data_fn, _label_fn=_label_fn) + >>> ds = wr.torch.LambdaS3Dataset( + >>> 's3://bucket/path', + >>> data_fn=lambda x: torch.load(x), + >>> label_fn=lambda x: torch.Tensor(int(re.findall(r"/class=(.*?)/", x)[-1])), + >>> ) """ super(LambdaS3Dataset, self).__init__(path, suffix, boto3_session) self._data_func = data_fn self._label_func = label_fn - def _label_fn(self, path: str): + def _label_fn(self, path: str) -> torch.Tensor: return self._label_func(path) - def _data_fn(self, data): - print(type(data)) + def _data_fn(self, data) -> torch.Tensor: return self._data_func(data) @@ -213,17 +220,26 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch S3 Audio Dataset. + """PyTorch Amazon S3 Audio Dataset. 
+
+        Read individual WAV audio files stored in Amazon S3 and return
+        them as torch tensors.
+
+        Note
+        ----
+        This dataset assumes audio files are stored with the following structure:
-        Assumes audio files are stored with the following structure:
+
+        ::
+
-        bucket
-        ├── class=0
-        │   ├── audio0.wav
-        │   └── audio1.wav
-        └── class=1
-            ├── audio2.wav
-            └── audio3.wav
+            bucket
+            ├── class=0
+            │   ├── audio0.wav
+            │   └── audio1.wav
+            └── class=1
+                ├── audio2.wav
+                └── audio3.wav

         Parameters
         ----------
@@ -238,9 +254,39 @@ def __init__(

         Examples
         --------
+        Create an Audio S3 Dataset
+
         >>> import awswrangler as wr
-        >>> import boto3
-        >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session())
+        >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path')
+
+
+        Training a Model
+
+        >>> criterion = CrossEntropyLoss().to(device)
+        >>> opt = SGD(model.parameters(), 0.025)
+        >>> loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
+        >>>
+        >>> for epoch in range(epochs):
+        >>>
+        >>>     correct = 0
+        >>>     model.train()
+        >>>     for i, (inputs, labels) in enumerate(loader):
+        >>>
+        >>>         # Forward Pass
+        >>>         outputs = model(inputs)
+        >>>
+        >>>         # Backward Pass
+        >>>         loss = criterion(outputs, labels)
+        >>>         loss.backward()
+        >>>         opt.step()
+        >>>         opt.zero_grad()
+        >>>
+        >>>         # Accuracy
+        >>>         _, predicted = torch.max(outputs.data, 1)
+        >>>         correct += (predicted == labels).sum().item()
+        >>>         accuracy = 100 * correct / ((i+1) * batch_size)
+        >>>         print(f'batch: {i} loss: {loss.mean().item():.4f} acc: {accuracy:.2f}')
         """
         super(AudioS3Dataset, self).__init__(path, suffix, boto3_session)
@@ -261,20 +307,28 @@ def _fetch_data(self, path: str) -> str:

 class ImageS3Dataset(_S3PartitionedDataset):
-    """PyTorch S3 Image Dataset."""
+    """PyTorch Amazon S3 Image Dataset."""

     def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session):
-        """PyTorch S3 Image Dataset.
+        """PyTorch Amazon S3 Image Dataset.
+
+        ImageS3Dataset assumes images are partitioned (within class= folders) in Amazon S3.
+        Each listed object will be loaded by the default Pillow library.
+ Note + ---- Assumes Images are stored with the following structure: - bucket - ├── class=0 - │ ├── img0.jpeg - │ └── img1.jpeg - └── class=1 - ├── img2.jpeg - └── img3.jpeg + + :: + + bucket + ├── class=0 + │ ├── img0.jpeg + │ └── img1.jpeg + └── class=1 + ├── img2.jpeg + └── img3.jpeg Parameters ---------- @@ -290,13 +344,12 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto Examples -------- >>> import awswrangler as wr - >>> import boto3 - >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) + >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path') """ super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) - def _data_fn(self, data): + def _data_fn(self, data: io.BytesIO) -> torch.Tensor: image = Image.open(data) tensor = to_tensor(image) return tensor @@ -324,9 +377,13 @@ def __init__( ------- torch.utils.data.Dataset + Examples + -------- + >>> import awswrangler as wr + >>> ds = wr.torch.S3IterableDataset('s3://bucket/path') + """ super(S3IterableDataset, self).__init__(path, suffix, boto3_session) - self._paths_index = 0 def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: for path in self._paths: @@ -344,8 +401,6 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, yield d - - class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" diff --git a/building/build-docs.sh b/building/build-docs.sh index c32c20aa0..8c807b485 100755 --- a/building/build-docs.sh +++ b/building/build-docs.sh @@ -4,4 +4,4 @@ set -ex pushd .. rm -rf docs/build docs/source/stubs make -C docs/ html -doc8 --ignore D005 docs/source +doc8 --ignore D005,D002 docs/source diff --git a/docs/source/api.rst b/docs/source/api.rst index 897fc7a3e..aea8bbed6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -3,6 +3,19 @@ API Reference ============= +PyTorch +------- + +.. currentmodule:: awswrangler.torch + +.. 
autosummary:: + :toctree: stubs + + AudioS3Dataset + ImageS3Dataset + S3IterableDataset + SQLDataset + Amazon S3 --------- diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 83630b0e7..40ecf7050 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -63,7 +63,7 @@ def parameters(cloudformation_outputs): @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] - table = "test_torch_sql" + table = f"test_torch_sql_{db_type}_{str(chunksize).lower()}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") wr.db.to_sql( df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), @@ -86,7 +86,7 @@ def test_torch_sql(parameters, db_type, chunksize): @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql_label(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] - table = "test_torch_sql_label" + table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") wr.db.to_sql( df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0], "c": [7, 8, 9]}), @@ -109,14 +109,15 @@ def test_torch_sql_label(parameters, db_type, chunksize): def test_torch_image_s3(bucket): - path = f"s3://{bucket}/test_torch_image_s3/" + folder = "test_torch_image_s3" + path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) s3 = boto3.client("s3") ref_label = 0 s3.put_object( Body=open("docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"test_torch_image_s3/class={ref_label}/logo.png", + Key=f"{folder}/class={ref_label}/logo.png", ContentType="image/png", ) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) @@ -127,8 +128,9 @@ def test_torch_image_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) -def test_torch_image_s3_dataloader(bucket, drop_last): - path = f"s3://{bucket}/test_torch_image_s3_dataloader/" +def test_torch_image_s3(bucket, drop_last): + folder = f"test_torch_image_s3_{str(drop_last).lower()}" + path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) client_s3 = boto3.client("s3") labels = np.random.randint(0, 4, size=(8,)) @@ -136,7 +138,7 @@ def test_torch_image_s3_dataloader(bucket, drop_last): client_s3.put_object( Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"test_torch_image_s3_dataloader/class={label}/logo{i}.png", + Key=f"{folder}/class={label}/logo{i}.png", ContentType="image/png", ) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) @@ -181,14 +183,15 @@ def test_torch_audio_s3(bucket): audio = torch.randint(low=-25, high=25, size=size) / 100.0 audio_file = "/tmp/amazing_sound.wav" torchaudio.save(audio_file, audio, 8_000) - path = f"s3://{bucket}/test_torch_audio_s3/" - wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) + folder = "test_torch_audio_s3" + path = f"s3://{bucket}/{folder}/" + wr.s3.delete_objects(path=path) s3 = boto3.client("s3") ref_label = 0 s3.put_object( Body=open(audio_file, "rb").read(), Bucket=bucket, - Key=f"test_torch_audio_s3/class={ref_label}/amazing_sound.wav", + Key=f"{folder}/class={ref_label}/amazing_sound.wav", ContentType="audio/wav", ) s3_audio_file = f"{bucket}/test_torch_audio_s3/class={ref_label}/amazing_sound.wav" @@ 
-196,6 +199,7 @@ def test_torch_audio_s3(bucket): loader = DataLoader(ds, batch_size=1) for (audio, rate), label in loader: assert audio.shape == torch.Size((1, *size)) + wr.s3.delete_objects(path=path) # def test_torch_s3_file_dataset(bucket): @@ -212,7 +216,7 @@ def test_torch_audio_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) def test_torch_s3_iterable(bucket, drop_last): - folder = "test_torch_s3_iterable" + folder = f"test_torch_s3_iterable_{str(drop_last).lower()}" path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) batch_size = 32 @@ -230,7 +234,7 @@ def test_torch_s3_iterable(bucket, drop_last): for image in DataLoader( wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}", + path=f"s3://{bucket}/{folder}/file", ), batch_size=batch_size, drop_last=drop_last, @@ -240,10 +244,12 @@ def test_torch_s3_iterable(bucket, drop_last): else: assert image[0].shape == torch.Size([3, 32, 32]) + wr.s3.delete_objects(path=path) + @pytest.mark.parametrize("drop_last", [True, False]) def test_torch_s3_iterable_with_labels(bucket, drop_last): - folder = "test_torch_s3_iterable_with_labels" + folder = f"test_torch_s3_iterable_with_labels_{str(drop_last).lower()}" path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) batch_size = 32 @@ -264,7 +270,7 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): for images, labels in DataLoader( wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}", + path=f"s3://{bucket}/{folder}/file", ), batch_size=batch_size, drop_last=drop_last, @@ -278,3 +284,6 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): assert images[0].shape == torch.Size([3, 32, 32]) assert labels[0].dtype == torch.int64 assert labels[0].shape == torch.Size([]) + + wr.s3.delete_objects(path=path) + From c091fa82e39e58375f5ea92527ed619c467ca974 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 22:23:55 -0300 Subject: [PATCH 22/59] fix lint --- awswrangler/torch.py | 102 +++++++++---------------- requirements-torch.txt | 3 +- testing/test_awswrangler/test_torch.py | 42 ++++------ 3 files changed, 52 insertions(+), 95 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index c25e145ee..a5b589386 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,11 +1,11 @@ """PyTorch Module.""" -import logging import io +import logging import os -import tarfile import pathlib import re -from collections import Iterable +import tarfile +from collections.abc import Iterable from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Tuple, Union @@ -49,8 +49,8 @@ def __init__( path=path, suffix=suffix, boto3_session=self._session ) - def _fetch_data(self, path: str): - """Add parquet and csv support""" + def _fetch_data(self, path: str) -> Any: + """Add parquet and csv support.""" bucket, key = _utils.parse_path(path=path) buff = BytesIO() client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) @@ -59,42 +59,23 @@ def _fetch_data(self, path: str): return buff @staticmethod - def _load_data(data: io.BytesIO, path: str): - if path.endswith('.tar.gz') or path.endswith('.tgz'): - pass - # tarfile.open(fileobj=data) + def _load_data(data: io.BytesIO, path: str) -> Any: + if path.endswith(".pt"): + data = torch.load(data) + elif path.endswith(".tar.gz") or path.endswith(".tgz"): + tarfile.open(fileobj=data) + raise NotImplementedError("Tar loader not implemented!") # tar = tarfile.open(fileobj=data) # for member in tar.getmembers(): - # 
print('member', member) - elif path.endswith('.pt'): - data = torch.load(data) + else: + raise NotImplementedError() + return data class _ListS3Dataset(_BaseS3Dataset, Dataset): """PyTorch Amazon S3 Map-Style List Dataset.""" - def __init__( - self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None - ): - """PyTorch Map-Style List S3 Dataset. - - Each file under path would be handle as a single tensor. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - """ - super(_ListS3Dataset, self).__init__(path, suffix, boto3_session) - def __getitem__(self, index): path = self._paths[index] data = self._fetch_data(path) @@ -103,10 +84,10 @@ def __getitem__(self, index): def __len__(self): return len(self._paths) - def _data_fn(self, data): + def _data_fn(self, data) -> Any: pass - def _label_fn(self, path: str): + def _label_fn(self, path: str) -> Any: pass @@ -115,7 +96,7 @@ class _S3PartitionedDataset(_ListS3Dataset): def _label_fn(self, path: str) -> torch.Tensor: label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) - return torch.tensor([label]) + return torch.tensor([label]) # pylint: disable=not-callable # class S3FilesDataset(_BaseS3Dataset, Dataset): @@ -135,7 +116,8 @@ def _label_fn(self, path: str) -> torch.Tensor: # Parameters # ---------- # path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# S3 prefix (e.g. s3://bucket/prefix) or +# list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). # boto3_session : boto3.Session(), optional # Boto3 Session. The default boto3 session will be used if boto3_session receive None. # @@ -227,7 +209,6 @@ def __init__( Note ---- - This dataset assumes audio files are stored with the following structure: @@ -254,7 +235,6 @@ def __init__( Examples -------- - Create a Audio S3 Dataset >>> import awswrangler as wr @@ -349,43 +329,35 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) - def _data_fn(self, data: io.BytesIO) -> torch.Tensor: + def _data_fn(self, data: io.BytesIO) -> Any: image = Image.open(data) tensor = to_tensor(image) return tensor -class S3IterableDataset(_BaseS3Dataset, IterableDataset): - """PyTorch Amazon S3 Iterable Dataset.""" - - def __init__( - self, - path: Union[str, List[str]], - suffix: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, - ): - """PyTorch Amazon S3 Iterable Dataset. +class S3IterableDataset(IterableDataset, _BaseS3Dataset): # pylint: disable=abstract-method + """PyTorch Amazon S3 Iterable Dataset. - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. 
The default boto3 session will be used if boto3_session receive None. - Returns - ------- - torch.utils.data.Dataset + Returns + ------- + torch.utils.data.Dataset - Examples - -------- - >>> import awswrangler as wr - >>> ds = wr.torch.S3IterableDataset('s3://bucket/path') + Examples + -------- + >>> import awswrangler as wr + >>> ds = wr.torch.S3IterableDataset('s3://bucket/path') - """ - super(S3IterableDataset, self).__init__(path, suffix, boto3_session) + """ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: + """Iterate over data returning tensors or expanding Iterables.""" for path in self._paths: data = self._fetch_data(path) data = self._load_data(data, path) diff --git a/requirements-torch.txt b/requirements-torch.txt index 325196f07..01d2c6e65 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,3 +1,4 @@ torch~=1.4.0 torchvision~=0.5.0 -torchaudio~=0.4.0 \ No newline at end of file +torchaudio~=0.4.0 +Pillow==7.1.1 diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 40ecf7050..19a300400 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,6 +1,6 @@ import io -import re import logging +import re import boto3 import numpy as np @@ -128,8 +128,8 @@ def test_torch_image_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) -def test_torch_image_s3(bucket, drop_last): - folder = f"test_torch_image_s3_{str(drop_last).lower()}" +def test_torch_image_s3_loader(bucket, drop_last): + folder = f"test_torch_image_s3_loader_{str(drop_last).lower()}" path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) client_s3 = boto3.client("s3") @@ -146,7 +146,11 @@ def test_torch_image_s3(bucket, drop_last): num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices), drop_last=drop_last + ds, + batch_size=batch_size, + num_workers=4, + sampler=torch.utils.data.sampler.RandomSampler(indices), + drop_last=drop_last, ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) @@ -226,18 +230,10 @@ def test_torch_s3_iterable(bucket, drop_last): buff = io.BytesIO() torch.save(batch, buff) buff.seek(0) - client_s3.put_object( - Body=buff.read(), - Bucket=bucket, - Key=f"{folder}/file{i}.pt", - ) + client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") for image in DataLoader( - wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}/file", - ), - batch_size=batch_size, - drop_last=drop_last, + wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), batch_size=batch_size, drop_last=drop_last ): if drop_last: assert image.shape == torch.Size([batch_size, 3, 32, 32]) @@ -255,25 +251,14 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): batch_size = 32 client_s3 = boto3.client("s3") for i in range(3): - batch = ( - torch.randn(100, 3, 32, 32), - torch.randint(2, size=(100,)), - ) + batch = (torch.randn(100, 3, 32, 32), torch.randint(2, size=(100,))) buff = io.BytesIO() torch.save(batch, buff) buff.seek(0) - client_s3.put_object( - Body=buff.read(), - Bucket=bucket, - Key=f"{folder}/file{i}.pt", - ) + client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") for images, labels in DataLoader( - wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}/file", - ), - batch_size=batch_size, - 
drop_last=drop_last, + wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), batch_size=batch_size, drop_last=drop_last ): if drop_last: assert images.shape == torch.Size([batch_size, 3, 32, 32]) @@ -286,4 +271,3 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): assert labels[0].shape == torch.Size([]) wr.s3.delete_objects(path=path) - From 37b7f1e7edf9aa233d07bfd06baa92db80dc7cc3 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Fri, 24 Apr 2020 13:43:47 -0300 Subject: [PATCH 23/59] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d4a8a3cad..624ebc12c 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [11 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/11%20-%20CSV%20Datasets.ipynb) - [12 - CSV Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/12%20-%20CSV%20Crawler.ipynb) - [13 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/13%20-%20Merging%20Datasets%20on%20S3.ipynb) + - [14 - PyTorch](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/14%20-%20PyTorch.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3) - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog) From 33d74c4354ebfffd357c6730b0cacbc727d33185 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Fri, 24 Apr 2020 14:00:21 -0300 Subject: [PATCH 24/59] remove captalized requirement from docstring --- .github/workflows/static-checking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index bc33d9327..9f0701146 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -30,7 +30,7 @@ jobs: - name: CloudFormation Lint run: cfn-lint -t testing/cloudformation.yaml - name: Documentation Lint - run: pydocstyle awswrangler/ --add-ignore=D204 + run: pydocstyle awswrangler/ --add-ignore=D204,D403 - name: mypy check run: mypy awswrangler - name: Flake8 Lint From 4b05b36575237da68de61335d6f5db8777e5f2cc Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Fri, 24 Apr 2020 14:12:26 -0300 Subject: [PATCH 25/59] add torch requirements --- .github/workflows/static-checking.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index 9f0701146..56f978a50 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -27,6 +27,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r requirements-dev.txt + pip install -r requirements-torch.txt - name: CloudFormation Lint run: cfn-lint -t testing/cloudformation.yaml - name: Documentation Lint From 9ce624b60fe9de55ce928283e76841ed45a76ea2 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 25 Apr 2020 17:41:16 -0300 Subject: [PATCH 26/59] Add support to EMR with Docker --- awswrangler/__init__.py | 1 + awswrangler/_utils.py | 13 + awswrangler/athena.py | 2 +- awswrangler/emr.py | 335 ++++++++++++++++---- awswrangler/s3.py | 65 +++- docs/source/api.rst | 2 + requirements-dev.txt | 3 +- testing/test_awswrangler/test_cloudwatch.py | 2 +- 
testing/test_awswrangler/test_data_lake.py | 3 + testing/test_awswrangler/test_emr.py | 33 ++ testing/test_awswrangler/test_moto.py | 27 +- tutorials/15 - EMR.ipynb | 193 +++++++++++ tutorials/16 - EMR & Docker.ipynb | 269 ++++++++++++++++ 13 files changed, 869 insertions(+), 79 deletions(-) create mode 100644 tutorials/15 - EMR.ipynb create mode 100644 tutorials/16 - EMR & Docker.ipynb diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ce11c7ad5..4413ab5f4 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -9,5 +9,6 @@ from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa +from awswrangler._utils import get_account_id # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index 21a27d37e..df168bdb9 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -166,3 +166,16 @@ def ensure_postgresql_casts(): def get_directory(path: str) -> str: """Extract directory path.""" return path.rsplit(sep="/", maxsplit=1)[0] + "/" + + +def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str: + """Get Account ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + return client(service_name="sts", session=session).get_caller_identity().get("Account") + + +def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session] = None) -> str: + """Extract region from Subnet ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + client_ec2: boto3.client = client(service_name="ec2", session=session) + return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9] diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 1933606ad..d73c41063 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -68,7 +68,7 @@ def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str: """ session: boto3.Session = _utils.ensure_session(session=boto3_session) - account_id: str = _utils.client(service_name="sts", session=session).get_caller_identity().get("Account") + account_id: str = _utils.get_account_id(boto3_session=session) region_name: str = str(session.region_name).lower() s3_output = f"s3://aws-athena-query-results-{account_id}-{region_name}/" s3_resource = session.resource("s3") diff --git a/awswrangler/emr.py b/awswrangler/emr.py index aee470621..106a57da3 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -7,12 +7,76 @@ import boto3 # type: ignore -from awswrangler import _utils +from awswrangler import _utils, exceptions _logger: logging.Logger = logging.getLogger(__name__) +def _get_default_logging_path( + subnet_id: Optional[str] = None, + account_id: Optional[str] = None, + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Get EMR default logging path. + + E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Parameters + ---------- + subnet_id : str, optional + Subnet ID. If not provided, you must pass `account_id` and `region` explicit. + account_id: str, optional + Account ID. + region: str, optional + Region e.g. 'us-east-1' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Default logging path. + E.g. 
"s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Examples + -------- + >>> import awswrangler as wr + >>> state = wr.emr._get_default_logging_path("subnet-id") + 's3://aws-logs-{account_id}-{region}/elasticmapreduce/' + + """ + if account_id is None: + boto3_session = _utils.ensure_session(session=boto3_session) + _account_id: str = _utils.get_account_id(boto3_session=boto3_session) + else: + _account_id = account_id + if (region is None) and (subnet_id is not None): + boto3_session = _utils.ensure_session(session=boto3_session) + _region: str = _utils.get_region_from_subnet(subnet_id=subnet_id, boto3_session=boto3_session) + elif (region is None) and (subnet_id is None): + raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.") + else: + _region = region # type: ignore + return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/" + + +def _get_ecr_credentials_command() -> str: + return ( + "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email) && " + "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" + ) + + def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-statements + account_id: str = _utils.get_account_id(boto3_session=pars["boto3_session"]) + region: str = _utils.get_region_from_subnet(subnet_id=pars["subnet_id"], boto3_session=pars["boto3_session"]) + + # S3 Logging path + if pars.get("logging_s3_path") is None: + pars["logging_s3_path"] = _get_default_logging_path( + subnet_id=None, account_id=account_id, region=region, boto3_session=pars["boto3_session"] + ) spark_env: Optional[Dict[str, str]] = None yarn_env: Optional[Dict[str, str]] = None @@ -20,25 +84,25 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["spark_pyarrow"] is True: if pars["spark_defaults"] is None: - pars["spark_defaults"]: Dict[str, str] = {"spark.sql.execution.arrow.enabled": "true"} + pars["spark_defaults"] = {"spark.sql.execution.arrow.enabled": "true"} else: # pragma: no cover - pars["spark_defaults"]["spark.sql.execution.arrow.enabled"]: str = "true" + pars["spark_defaults"]["spark.sql.execution.arrow.enabled"] = "true" spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} if pars["python3"] is True: if spark_env is None: - spark_env: Dict[str, str] = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover + spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover else: - spark_env["PYSPARK_PYTHON"]: str = "/usr/bin/python3" + spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3" if pars["spark_jars_path"] is not None: paths: str = ",".join(pars["spark_jars_path"]) if pars["spark_defaults"] is None: # pragma: no cover - pars["spark_defaults"]: Dict[str, str] = {"spark.jars": paths} + pars["spark_defaults"] = {"spark.jars": paths} else: - pars["spark_defaults"]["spark.jars"]: str = paths + pars["spark_defaults"]["spark.jars"] = paths args: Dict[str, Any] = { "Name": pars["cluster_name"], @@ -72,9 +136,52 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"] # Configurations - args["Configurations"]: List[Dict[str, Any]] = [ + args["Configurations"] = [ {"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}} ] + if (pars["docker"] is True) or (pars["spark_docker"] is True) or (pars["hive_docker"] 
is True): + if pars.get("extra_registries") is None: + extra_registries: List[str] = [] + else: # pragma: no cover + extra_registries = pars["extra_registries"] + registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}" + registries = registries[:-1] if registries.endswith(",") else registries + args["Configurations"].append( + { + "Classification": "container-executor", + "Properties": {}, + "Configurations": [ + { + "Classification": "docker", + "Properties": { + "docker.privileged-containers.registries": registries, + "docker.trusted.registries": registries, + }, + "Configurations": [], + } + ], + } + ) + if pars["spark_docker"] is True: + if pars.get("spark_docker_image") is None: # pragma: no cover + raise exceptions.InvalidArgumentCombination("You must pass a spark_docker_image if spark_docker is True.") + pars["spark_defaults"] = {} if pars["spark_defaults"] is None else pars["spark_defaults"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars["spark_docker_image"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS"] = "/etc/passwd:/etc/passwd:ro" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars[ + "spark_docker_image" + ] + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS" + ] = "/etc/passwd:/etc/passwd:ro" if spark_env is not None: args["Configurations"].append( { @@ -109,16 +216,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Configurations": [], } ) + + hive_conf: Optional[Dict[str, Any]] = None + if (pars["hive_glue_catalog"] is True) or (pars["hive_docker"] is True): + hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} + if pars["hive_glue_catalog"] is True: - args["Configurations"].append( - { - "Classification": "hive-site", - "Properties": { - "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" # noqa - }, - "Configurations": [], - } - ) + hive_conf["Properties"][ + "hive.metastore.client.factory.class" + ] = "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" + if pars["hive_docker"] is True: + hive_conf["Properties"]["hive.execution.mode"] = "container" + + if hive_conf is not None: + args["Configurations"].append(hive_conf) + if pars["presto_glue_catalog"] is True: args["Configurations"].append( { @@ -147,20 +259,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Properties": pars["spark_defaults"], } args["Configurations"].append(spark_defaults) + if pars.get("custom_classifications") is not None: + for c in pars["custom_classifications"]: + args["Configurations"].append(c) # Applications if pars["applications"]: - args["Applications"]: List[Dict[str, str]] = [{"Name": x} for x in pars["applications"]] + args["Applications"] = [{"Name": x} for x in pars["applications"]] # Bootstraps if pars["bootstraps_paths"]: # 
pragma: no cover - args["BootstrapActions"]: List[Dict] = [ - {"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"] - ] + args["BootstrapActions"] = [{"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"]] # Debugging and Steps if (pars["debugging"] is True) or (pars["steps"] is not None): - args["Steps"]: List[Dict[str, Any]] = [] + args["Steps"] = [] if pars["debugging"] is True: args["Steps"].append( { @@ -169,6 +282,17 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]}, } ) + if pars["ecr_credentials_step"] is True: + args["Steps"].append( + build_step( + name="ECR Credentials Setup", + command=_get_ecr_credentials_command(), + action_on_failure="TERMINATE_CLUSTER", + script=False, + region=region, + boto3_session=pars["boto3_session"], + ) + ) if pars["steps"] is not None: args["Steps"] += pars["steps"] @@ -199,7 +323,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_master"] > 0: # pragma: no cover - fleet_master["LaunchSpecifications"]: Dict = { + fleet_master["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"], "TimeoutAction": timeout_action_master, @@ -236,7 +360,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_core"] > 0: - fleet_core["LaunchSpecifications"]: Dict = { + fleet_core["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_core"], "TimeoutAction": timeout_action_core, @@ -275,7 +399,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_task"] > 0: - fleet_task["LaunchSpecifications"]: Dict = { + fleet_task["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_task"], "TimeoutAction": timeout_action_task, @@ -292,30 +416,30 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused-argument - cluster_name: str, - logging_s3_path: str, - emr_release: str, subnet_id: str, - emr_ec2_role: str, - emr_role: str, - instance_type_master: str, - instance_type_core: str, - instance_type_task: str, - instance_ebs_size_master: int, - instance_ebs_size_core: int, - instance_ebs_size_task: int, - instance_num_on_demand_master: int, - instance_num_on_demand_core: int, - instance_num_on_demand_task: int, - instance_num_spot_master: int, - instance_num_spot_core: int, - instance_num_spot_task: int, - spot_bid_percentage_of_on_demand_master: int, - spot_bid_percentage_of_on_demand_core: int, - spot_bid_percentage_of_on_demand_task: int, - spot_provisioning_timeout_master: int, - spot_provisioning_timeout_core: int, - spot_provisioning_timeout_task: int, + cluster_name: str = "my-emr-cluster", + logging_s3_path: Optional[str] = None, + emr_release: str = "emr-6.0.0", + emr_ec2_role: str = "EMR_EC2_DefaultRole", + emr_role: str = "EMR_DefaultRole", + instance_type_master: str = "r5.xlarge", + instance_type_core: str = "r5.xlarge", + instance_type_task: str = "r5.xlarge", + instance_ebs_size_master: int = 64, + instance_ebs_size_core: int = 64, + instance_ebs_size_task: int = 64, + instance_num_on_demand_master: int = 1, + instance_num_on_demand_core: 
int = 0, + instance_num_on_demand_task: int = 0, + instance_num_spot_master: int = 0, + instance_num_spot_core: int = 0, + instance_num_spot_task: int = 0, + spot_bid_percentage_of_on_demand_master: int = 100, + spot_bid_percentage_of_on_demand_core: int = 100, + spot_bid_percentage_of_on_demand_task: int = 100, + spot_provisioning_timeout_master: int = 5, + spot_provisioning_timeout_core: int = 5, + spot_provisioning_timeout_task: int = 5, spot_timeout_to_on_demand_master: bool = True, spot_timeout_to_on_demand_core: bool = True, spot_timeout_to_on_demand_task: bool = True, @@ -337,10 +461,17 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_slave: Optional[str] = None, security_groups_slave_additional: Optional[List[str]] = None, security_group_service_access: Optional[str] = None, + docker: bool = False, spark_log_level: str = "WARN", spark_jars_path: Optional[List[str]] = None, spark_defaults: Optional[Dict[str, str]] = None, spark_pyarrow: bool = False, + spark_docker: bool = False, + spark_docker_image: str = None, + hive_docker: bool = False, + ecr_credentials_step: bool = False, + extra_public_registries: Optional[List[str]] = None, + custom_classifications: Optional[List[Dict[str, Any]]] = None, maximize_resource_allocation: bool = False, steps: Optional[List[Dict[str, Any]]] = None, keep_cluster_alive_when_no_steps: bool = True, @@ -354,18 +485,19 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Parameters ---------- + subnet_id : str + VPC subnet ID. cluster_name : str Cluster name. - logging_s3_path : str + logging_s3_path : str, optional Logging s3 path (e.g. s3://BUCKET_NAME/DIRECTORY_NAME/). + If None, the default is `s3://aws-logs-{AccountId}-{RegionId}/elasticmapreduce/` emr_release : str EMR release (e.g. emr-5.28.0). emr_ec2_role : str IAM role name. emr_role : str IAM role name. - subnet_id : str - VPC subnet ID. instance_type_master : str EC2 instance type. instance_type_core : str @@ -448,6 +580,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Debugging enabled? applications : List[str], optional List of applications (e.g ["Hadoop", "Spark", "Ganglia", "Hive"]). + If None, ["Spark"] will be considered. visible_to_all_users : bool True or False. key_pair_name : str, optional @@ -465,6 +598,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_service_access : str, optional The identifier of the Amazon EC2 security group for the Amazon EMR service to access clusters in VPC private subnets. + docker : bool + Enable Docker Hub and ECR registries access. spark_log_level : str log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE). spark_jars_path : List[str], optional @@ -475,6 +610,18 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused spark_pyarrow : bool Enable PySpark to use PyArrow behind the scenes. P.S. You must install pyarrow by your self via bootstrap + spark_docker : bool = False + Add necessary Spark Defaults to run on Docker + spark_docker_image : str, optional + E.g. {ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG} + hive_docker : bool + Add necessary configurations to run on Docker + ecr_credentials_step : bool + Add a extra step during the Cluster launch to retrieve ECR auth files. + extra_public_registries: List[str], optional + Additional registries. 
+ custom_classifications: List[Dict[str, Any]], optional + Extra classifications. maximize_resource_allocation : bool Configure your executors to utilize the maximum resources possible https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#emr-spark-maximizeresourceallocation @@ -500,6 +647,21 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Examples -------- + Minimal Example + + >>> cluster_id = wr.emr.create_cluster("SUBNET_ID") + + Minimal Exmaple on Docker + + >>> cluster_id = wr.emr.create_cluster( + >>> subnet_id="SUBNET_ID", + >>> spark_docker=True, + >>> spark_docker_image="{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}", + >>> ecr_credentials_step=True + >>> ) + + Full Example + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster( ... cluster_name="wrangler_cluster", @@ -548,6 +710,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused ... }) """ + applications = ["Spark"] if applications is None else applications + boto3_session = _utils.ensure_session(session=boto3_session) args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) @@ -647,8 +811,8 @@ def submit_steps( def submit_step( cluster_id: str, - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, boto3_session: Optional[boto3.Session] = None, @@ -659,11 +823,11 @@ def submit_step( ---------- cluster_id : str Cluster ID. - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool @@ -698,26 +862,29 @@ def submit_step( def build_step( - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, + region: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, Any]: """Build the Step structure (dictionary). Parameters ---------- - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool - True for raw command or False for script runner. + False for raw command or True for script runner. https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html + region: str, optional + Region name to not get it from boto3.Session. (e.g. `us-east-1`) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
@@ -734,14 +901,17 @@ def build_step( >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps) """ - session: boto3.Session = _utils.ensure_session(session=boto3_session) jar: str = "command-runner.jar" if script is True: - if session.region_name is not None: - region: str = session.region_name - else: # pragma: no cover - region = "us-east-1" - jar = f"s3://{region}.elasticmapreduce/libs/script-runner/script-runner.jar" + if region is not None: # pragma: no cover + _region: str = region + else: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if session.region_name is not None: + _region = session.region_name + else: # pragma: no cover + _region = "us-east-1" + jar = f"s3://{_region}.elasticmapreduce/libs/script-runner/script-runner.jar" step: Dict[str, Any] = { "Name": name, "ActionOnFailure": action_on_failure, @@ -780,3 +950,40 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") return response["Step"]["Status"]["State"] + + +def update_ecr_credentials( + cluster_id: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None +) -> str: + """Update internal ECR credentials. + + Parameters + ---------- + cluster_id : str + Cluster ID. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Step ID. + + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.update_ecr_credentials("cluster_id") + + """ + name: str = "Update ECR Credentials" + command: str = _get_ecr_credentials_command() + session: boto3.Session = _utils.ensure_session(session=boto3_session) + step: Dict[str, Any] = build_step( + name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session + ) + client_emr: boto3.client = _utils.client(service_name="emr", session=session) + response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) + _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + return response["StepIds"][0] diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..527c1ae76 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -111,6 +111,40 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) raise ex # pragma: no cover + +def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 directories (prefixes) from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of directory paths.
+ + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_directories('s3://bucket/prefix/') + ['s3://bucket/prefix/dir0/', 's3://bucket/prefix/dir1/', 's3://bucket/prefix/dir2/'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_directories('s3://bucket/prefix/', boto3_session=boto3.Session()) + ['s3://bucket/prefix/dir0/', 's3://bucket/prefix/dir1/', 's3://bucket/prefix/dir2/'] + + """ + return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) + + def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: """List Amazon S3 objects from a prefix. @@ -142,20 +176,37 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] """ + return _list_objects(path=path, delimiter=None, boto3_session=boto3_session) + + +def _list_objects( + path: str, delimiter: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) paginator = client_s3.get_paginator("list_objects_v2") bucket: str prefix: str bucket, prefix = _utils.parse_path(path=path) - response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix, PaginationConfig={"PageSize": 1000}) + args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} + if delimiter is not None: + args["Delimiter"] = delimiter + response_iterator = paginator.paginate(**args) paths: List[str] = [] for page in response_iterator: - contents: Optional[List] = page.get("Contents") - if contents is not None: - for content in contents: - if (content is not None) and ("Key" in content): - key: str = content["Key"] - paths.append(f"s3://{bucket}/{key}") + if delimiter is None: + contents: Optional[List[Optional[Dict[str, str]]]] = page.get("Contents") + if contents is not None: + for content in contents: + if (content is not None) and ("Key" in content): + key: str = content["Key"] + paths.append(f"s3://{bucket}/{key}") + else: + prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") + if prefixes is not None: + for pfx in prefixes: + if (pfx is not None) and ("Prefix" in pfx): + key = pfx["Prefix"] + paths.append(f"s3://{bucket}/{key}") return paths diff --git a/docs/source/api.rst b/docs/source/api.rst index 897fc7a3e..7d2d51602 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -16,6 +16,7 @@ Amazon S3 does_object_exist get_bucket_region list_objects + list_directories read_csv read_fwf read_json @@ -115,6 +116,7 @@ EMR submit_steps build_step get_step_state + update_ecr_credentials CloudWatch Logs --------------- diff --git a/requirements-dev.txt b/requirements-dev.txt index 3fdd3cdf3..99a9b0730 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,4 +17,5 @@ twine~=3.1.1 wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 \ No newline at end of file +moto~=1.3.14 +jupyterlab~=2.1.1 \ No newline at end of file diff --git a/testing/test_awswrangler/test_cloudwatch.py b/testing/test_awswrangler/test_cloudwatch.py index f59b8b3dd..eced7a754 100644 --- a/testing/test_awswrangler/test_cloudwatch.py +++ b/testing/test_awswrangler/test_cloudwatch.py @@ -48,7 +48,7 @@ def loggroup(cloudformation_outputs): def test_query_cancelled(loggroup): client_logs = boto3.client("logs") query_id = wr.cloudwatch.start_query( -
log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc | limit 5" + log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc" ) client_logs.stop_query(queryId=query_id) with pytest.raises(exceptions.QueryCancelled): diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index afa2a8307..bd53d4bad 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -127,6 +127,9 @@ def test_athena_ctas(bucket, database, kms_key): partition_cols=["par0", "par1"], )["paths"] wr.s3.wait_objects_exist(paths=paths) + dirs = wr.s3.list_directories(path=f"s3://{bucket}/test_athena_ctas/") + for d in dirs: + assert d.startswith(f"s3://{bucket}/test_athena_ctas/par0=") df = wr.s3.read_parquet_table(table="test_athena_ctas", database=database) assert len(df.index) == 3 ensure_data_types(df=df, has_list=True) diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index e64329b33..df2dab1cb 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -146,3 +146,36 @@ def test_cluster_single_node(bucket, cloudformation_outputs): wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) wr.emr.terminate_cluster(cluster_id=cluster_id) wr.s3.delete_objects(f"s3://{bucket}/emr-logs/") + + +def test_default_logging_path(cloudformation_outputs): + path = wr.emr._get_default_logging_path(subnet_id=cloudformation_outputs["SubnetId"]) + assert path.startswith("s3://aws-logs-") + assert path.endswith("/elasticmapreduce/") + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.emr._get_default_logging_path() + + +def test_docker(cloudformation_outputs): + cluster_id = wr.emr.create_cluster( + subnet_id=cloudformation_outputs["SubnetId"], + docker=True, + spark_docker=True, + spark_docker_image="787535711150.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + hive_docker=True, + ecr_credentials_step=True, + custom_classifications=[ + { + "Classification": "livy-conf", + "Properties": { + "livy.spark.master": "yarn", + "livy.spark.deploy-mode": "cluster", + "livy.server.session.timeout": "16h", + }, + } + ], + steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")], + ) + wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://igor-tavares/emr.py") + wr.emr.update_ecr_credentials(cluster_id=cluster_id) + wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/testing/test_awswrangler/test_moto.py b/testing/test_awswrangler/test_moto.py index db12dbe1a..2adc7aec8 100644 --- a/testing/test_awswrangler/test_moto.py +++ b/testing/test_awswrangler/test_moto.py @@ -20,6 +20,21 @@ def emr(): yield True +@pytest.fixture(scope="module") +def sts(): + with moto.mock_sts(): + yield True + + +@pytest.fixture(scope="module") +def subnet(): + with moto.mock_ec2(): + ec2 = boto3.resource("ec2", region_name="us-west-1") + vpc = ec2.create_vpc(CidrBlock="10.0.0.0/16") + subnet = ec2.create_subnet(VpcId=vpc.id, CidrBlock="10.0.0.0/24", AvailabilityZone="us-west-1a") + yield subnet.id + + def test_csv(s3): path = "s3://bucket/test.csv" wr.s3.to_csv(df=get_df_csv(), path=path, index=False) @@ -37,12 +52,13 @@ def test_parquet(s3): assert len(df.columns) == 18 -def test_emr(s3, emr): +def test_emr(s3, emr, sts, subnet): + session = boto3.Session(region_name="us-west-1") cluster_id = wr.emr.create_cluster( 
cluster_name="wrangler_cluster", logging_s3_path="s3://bucket/emr-logs/", emr_release="emr-5.29.0", - subnet_id="foo", + subnet_id=subnet, emr_ec2_role="EMR_EC2_DefaultRole", emr_role="EMR_DefaultRole", instance_type_master="m5.xlarge", @@ -87,11 +103,12 @@ def test_emr(s3, emr): termination_protected=False, spark_pyarrow=False, tags={"foo": "boo", "bar": "xoo"}, + boto3_session=session, ) - wr.emr.get_cluster_state(cluster_id=cluster_id) + wr.emr.get_cluster_state(cluster_id=cluster_id, boto3_session=session) steps = [] for cmd in ['echo "Hello"', "ls -la"]: steps.append(wr.emr.build_step(name=cmd, command=cmd)) - wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) - wr.emr.terminate_cluster(cluster_id=cluster_id) + wr.emr.submit_steps(cluster_id=cluster_id, steps=steps, boto3_session=session) + wr.emr.terminate_cluster(cluster_id=cluster_id, boto3_session=session) wr.s3.delete_objects("s3://bucket/emr-logs/") diff --git a/tutorials/15 - EMR.ipynb b/tutorials/15 - EMR.ipynb new file mode 100644 index 000000000..4e1c627e6 --- /dev/null +++ b/tutorials/15 - EMR.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 15 - EMR" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_id = wr.emr.create_cluster(subnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit s3://{bucket}/test.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], 
+ "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb new file mode 100644 index 000000000..138759d8f --- /dev/null +++ b/tutorials/16 - EMR & Docker.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 16 - EMR & Docker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Build and Upload Docker Image to ECR repository\n", + "\n", + "Replace the `{ACCOUNT_ID}` placeholder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%writefile Dockerfile\n" + } + }, + "outputs": [], + "source": [ + "%%writefile Dockerfile\n", + "\n", + "FROM amazoncorretto:8\n", + "\n", + "RUN yum -y update\n", + "RUN yum -y install yum-utils\n", + "RUN yum -y groupinstall development\n", + "\n", + "RUN yum list python3*\n", + "RUN yum -y install python3 python3-dev python3-pip python3-virtualenv\n", + "\n", + "RUN python -V\n", + "RUN python3 -V\n", + "\n", + "ENV PYSPARK_DRIVER_PYTHON python3\n", + "ENV PYSPARK_PYTHON python3\n", + "\n", + "RUN pip3 install --upgrade pip\n", + "RUN pip3 install awswrangler\n", + "\n", + "RUN python3 -c \"import awswrangler as wr\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "docker build -t 'local/emr-wrangler' .\n", + "aws ecr create-repository --repository-name emr-wrangler\n", + "docker tag local/emr-wrangler {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\n", + "eval $(aws ecr get-login --region us-east-1 --no-include-email)\n", + "docker push {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "cluster_id = wr.emr.create_cluster(\n", + " subnet_id=subnet,\n", + " spark_docker=True,\n", + " spark_docker_image=DOCKER_IMAGE,\n", + " ecr_credentials_step=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\n", + "import awswrangler as wr\n", + "\n", + "print(f\"Wrangler version: {wr.__version__}\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test_docker.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From c2db8cd27bbd80da857b40df737385c4bd254eb7 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 25 Apr 2020 17:41:16 -0300 Subject: [PATCH 27/59] Add support to EMR with Docker #193 --- awswrangler/__init__.py | 1 + awswrangler/_utils.py | 13 + awswrangler/athena.py | 2 +- awswrangler/emr.py | 335 ++++++++++++++++---- awswrangler/s3.py | 65 +++- docs/source/api.rst | 2 + requirements-dev.txt | 3 +- testing/test_awswrangler/test_cloudwatch.py | 2 +- testing/test_awswrangler/test_data_lake.py | 3 + testing/test_awswrangler/test_emr.py | 33 ++ testing/test_awswrangler/test_moto.py | 27 +- tutorials/15 - EMR.ipynb | 193 +++++++++++ tutorials/16 - EMR & Docker.ipynb | 269 ++++++++++++++++ 13 files changed, 869 insertions(+), 79 deletions(-) create mode 100644 tutorials/15 - EMR.ipynb create mode 100644 tutorials/16 - EMR & Docker.ipynb diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ce11c7ad5..4413ab5f4 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -9,5 +9,6 @@ from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa +from awswrangler._utils import get_account_id # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index 21a27d37e..df168bdb9 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -166,3 +166,16 @@ def ensure_postgresql_casts(): def get_directory(path: str) -> str: """Extract directory path.""" return path.rsplit(sep="/", maxsplit=1)[0] + "/" + + +def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str: + """Get Account ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + return client(service_name="sts", session=session).get_caller_identity().get("Account") + + +def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session] = None) -> str: + """Extract region from Subnet ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + client_ec2: boto3.client = client(service_name="ec2", session=session) + return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9] diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 1933606ad..d73c41063 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -68,7 +68,7 @@ def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str: """ session: boto3.Session = _utils.ensure_session(session=boto3_session) - account_id: str = _utils.client(service_name="sts", session=session).get_caller_identity().get("Account") + account_id: str = _utils.get_account_id(boto3_session=session) region_name: str = str(session.region_name).lower() s3_output = f"s3://aws-athena-query-results-{account_id}-{region_name}/" s3_resource = session.resource("s3") diff --git a/awswrangler/emr.py b/awswrangler/emr.py index aee470621..106a57da3 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -7,12 +7,76 @@ import boto3 # type: ignore -from awswrangler import _utils +from awswrangler import _utils, exceptions 
_logger: logging.Logger = logging.getLogger(__name__) +def _get_default_logging_path( + subnet_id: Optional[str] = None, + account_id: Optional[str] = None, + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Get EMR default logging path. + + E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Parameters + ---------- + subnet_id : str, optional + Subnet ID. If not provided, you must pass `account_id` and `region` explicit. + account_id: str, optional + Account ID. + region: str, optional + Region e.g. 'us-east-1' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Default logging path. + E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Examples + -------- + >>> import awswrangler as wr + >>> state = wr.emr._get_default_logging_path("subnet-id") + 's3://aws-logs-{account_id}-{region}/elasticmapreduce/' + + """ + if account_id is None: + boto3_session = _utils.ensure_session(session=boto3_session) + _account_id: str = _utils.get_account_id(boto3_session=boto3_session) + else: + _account_id = account_id + if (region is None) and (subnet_id is not None): + boto3_session = _utils.ensure_session(session=boto3_session) + _region: str = _utils.get_region_from_subnet(subnet_id=subnet_id, boto3_session=boto3_session) + elif (region is None) and (subnet_id is None): + raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.") + else: + _region = region # type: ignore + return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/" + + +def _get_ecr_credentials_command() -> str: + return ( + "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email) && " + "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" + ) + + def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-statements + account_id: str = _utils.get_account_id(boto3_session=pars["boto3_session"]) + region: str = _utils.get_region_from_subnet(subnet_id=pars["subnet_id"], boto3_session=pars["boto3_session"]) + + # S3 Logging path + if pars.get("logging_s3_path") is None: + pars["logging_s3_path"] = _get_default_logging_path( + subnet_id=None, account_id=account_id, region=region, boto3_session=pars["boto3_session"] + ) spark_env: Optional[Dict[str, str]] = None yarn_env: Optional[Dict[str, str]] = None @@ -20,25 +84,25 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["spark_pyarrow"] is True: if pars["spark_defaults"] is None: - pars["spark_defaults"]: Dict[str, str] = {"spark.sql.execution.arrow.enabled": "true"} + pars["spark_defaults"] = {"spark.sql.execution.arrow.enabled": "true"} else: # pragma: no cover - pars["spark_defaults"]["spark.sql.execution.arrow.enabled"]: str = "true" + pars["spark_defaults"]["spark.sql.execution.arrow.enabled"] = "true" spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} if pars["python3"] is True: if spark_env is None: - spark_env: Dict[str, str] = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover + spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover else: - spark_env["PYSPARK_PYTHON"]: str = "/usr/bin/python3" + spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3" if pars["spark_jars_path"] is not None: paths: str = ",".join(pars["spark_jars_path"]) if pars["spark_defaults"] is None: # pragma: no 
cover - pars["spark_defaults"]: Dict[str, str] = {"spark.jars": paths} + pars["spark_defaults"] = {"spark.jars": paths} else: - pars["spark_defaults"]["spark.jars"]: str = paths + pars["spark_defaults"]["spark.jars"] = paths args: Dict[str, Any] = { "Name": pars["cluster_name"], @@ -72,9 +136,52 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"] # Configurations - args["Configurations"]: List[Dict[str, Any]] = [ + args["Configurations"] = [ {"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}} ] + if (pars["docker"] is True) or (pars["spark_docker"] is True) or (pars["hive_docker"] is True): + if pars.get("extra_registries") is None: + extra_registries: List[str] = [] + else: # pragma: no cover + extra_registries = pars["extra_registries"] + registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}" + registries = registries[:-1] if registries.endswith(",") else registries + args["Configurations"].append( + { + "Classification": "container-executor", + "Properties": {}, + "Configurations": [ + { + "Classification": "docker", + "Properties": { + "docker.privileged-containers.registries": registries, + "docker.trusted.registries": registries, + }, + "Configurations": [], + } + ], + } + ) + if pars["spark_docker"] is True: + if pars.get("spark_docker_image") is None: # pragma: no cover + raise exceptions.InvalidArgumentCombination("You must pass a spark_docker_image if spark_docker is True.") + pars["spark_defaults"] = {} if pars["spark_defaults"] is None else pars["spark_defaults"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars["spark_docker_image"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS"] = "/etc/passwd:/etc/passwd:ro" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars[ + "spark_docker_image" + ] + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS" + ] = "/etc/passwd:/etc/passwd:ro" if spark_env is not None: args["Configurations"].append( { @@ -109,16 +216,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Configurations": [], } ) + + hive_conf: Optional[Dict[str, Any]] = None + if (pars["hive_glue_catalog"] is True) or (pars["hive_docker"] is True): + hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} + if pars["hive_glue_catalog"] is True: - args["Configurations"].append( - { - "Classification": "hive-site", - "Properties": { - "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" # noqa - }, - "Configurations": [], - } - ) + hive_conf["Properties"][ + "hive.metastore.client.factory.class" + ] = "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" + if pars["hive_docker"] is True: + 
hive_conf["Properties"]["hive.execution.mode"] = "container" + + if hive_conf is not None: + args["Configurations"].append(hive_conf) + if pars["presto_glue_catalog"] is True: args["Configurations"].append( { @@ -147,20 +259,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Properties": pars["spark_defaults"], } args["Configurations"].append(spark_defaults) + if pars.get("custom_classifications") is not None: + for c in pars["custom_classifications"]: + args["Configurations"].append(c) # Applications if pars["applications"]: - args["Applications"]: List[Dict[str, str]] = [{"Name": x} for x in pars["applications"]] + args["Applications"] = [{"Name": x} for x in pars["applications"]] # Bootstraps if pars["bootstraps_paths"]: # pragma: no cover - args["BootstrapActions"]: List[Dict] = [ - {"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"] - ] + args["BootstrapActions"] = [{"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"]] # Debugging and Steps if (pars["debugging"] is True) or (pars["steps"] is not None): - args["Steps"]: List[Dict[str, Any]] = [] + args["Steps"] = [] if pars["debugging"] is True: args["Steps"].append( { @@ -169,6 +282,17 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]}, } ) + if pars["ecr_credentials_step"] is True: + args["Steps"].append( + build_step( + name="ECR Credentials Setup", + command=_get_ecr_credentials_command(), + action_on_failure="TERMINATE_CLUSTER", + script=False, + region=region, + boto3_session=pars["boto3_session"], + ) + ) if pars["steps"] is not None: args["Steps"] += pars["steps"] @@ -199,7 +323,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_master"] > 0: # pragma: no cover - fleet_master["LaunchSpecifications"]: Dict = { + fleet_master["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"], "TimeoutAction": timeout_action_master, @@ -236,7 +360,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_core"] > 0: - fleet_core["LaunchSpecifications"]: Dict = { + fleet_core["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_core"], "TimeoutAction": timeout_action_core, @@ -275,7 +399,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_task"] > 0: - fleet_task["LaunchSpecifications"]: Dict = { + fleet_task["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_task"], "TimeoutAction": timeout_action_task, @@ -292,30 +416,30 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused-argument - cluster_name: str, - logging_s3_path: str, - emr_release: str, subnet_id: str, - emr_ec2_role: str, - emr_role: str, - instance_type_master: str, - instance_type_core: str, - instance_type_task: str, - instance_ebs_size_master: int, - instance_ebs_size_core: int, - instance_ebs_size_task: int, - instance_num_on_demand_master: int, - instance_num_on_demand_core: int, - instance_num_on_demand_task: int, - instance_num_spot_master: int, - instance_num_spot_core: int, - 
instance_num_spot_task: int, - spot_bid_percentage_of_on_demand_master: int, - spot_bid_percentage_of_on_demand_core: int, - spot_bid_percentage_of_on_demand_task: int, - spot_provisioning_timeout_master: int, - spot_provisioning_timeout_core: int, - spot_provisioning_timeout_task: int, + cluster_name: str = "my-emr-cluster", + logging_s3_path: Optional[str] = None, + emr_release: str = "emr-6.0.0", + emr_ec2_role: str = "EMR_EC2_DefaultRole", + emr_role: str = "EMR_DefaultRole", + instance_type_master: str = "r5.xlarge", + instance_type_core: str = "r5.xlarge", + instance_type_task: str = "r5.xlarge", + instance_ebs_size_master: int = 64, + instance_ebs_size_core: int = 64, + instance_ebs_size_task: int = 64, + instance_num_on_demand_master: int = 1, + instance_num_on_demand_core: int = 0, + instance_num_on_demand_task: int = 0, + instance_num_spot_master: int = 0, + instance_num_spot_core: int = 0, + instance_num_spot_task: int = 0, + spot_bid_percentage_of_on_demand_master: int = 100, + spot_bid_percentage_of_on_demand_core: int = 100, + spot_bid_percentage_of_on_demand_task: int = 100, + spot_provisioning_timeout_master: int = 5, + spot_provisioning_timeout_core: int = 5, + spot_provisioning_timeout_task: int = 5, spot_timeout_to_on_demand_master: bool = True, spot_timeout_to_on_demand_core: bool = True, spot_timeout_to_on_demand_task: bool = True, @@ -337,10 +461,17 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_slave: Optional[str] = None, security_groups_slave_additional: Optional[List[str]] = None, security_group_service_access: Optional[str] = None, + docker: bool = False, spark_log_level: str = "WARN", spark_jars_path: Optional[List[str]] = None, spark_defaults: Optional[Dict[str, str]] = None, spark_pyarrow: bool = False, + spark_docker: bool = False, + spark_docker_image: str = None, + hive_docker: bool = False, + ecr_credentials_step: bool = False, + extra_public_registries: Optional[List[str]] = None, + custom_classifications: Optional[List[Dict[str, Any]]] = None, maximize_resource_allocation: bool = False, steps: Optional[List[Dict[str, Any]]] = None, keep_cluster_alive_when_no_steps: bool = True, @@ -354,18 +485,19 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Parameters ---------- + subnet_id : str + VPC subnet ID. cluster_name : str Cluster name. - logging_s3_path : str + logging_s3_path : str, optional Logging s3 path (e.g. s3://BUCKET_NAME/DIRECTORY_NAME/). + If None, the default is `s3://aws-logs-{AccountId}-{RegionId}/elasticmapreduce/` emr_release : str EMR release (e.g. emr-5.28.0). emr_ec2_role : str IAM role name. emr_role : str IAM role name. - subnet_id : str - VPC subnet ID. instance_type_master : str EC2 instance type. instance_type_core : str @@ -448,6 +580,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Debugging enabled? applications : List[str], optional List of applications (e.g ["Hadoop", "Spark", "Ganglia", "Hive"]). + If None, ["Spark"] will be considered. visible_to_all_users : bool True or False. key_pair_name : str, optional @@ -465,6 +598,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_service_access : str, optional The identifier of the Amazon EC2 security group for the Amazon EMR service to access clusters in VPC private subnets. + docker : bool + Enable Docker Hub and ECR registries access. 
spark_log_level : str log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE). spark_jars_path : List[str], optional @@ -475,6 +610,18 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused spark_pyarrow : bool Enable PySpark to use PyArrow behind the scenes. P.S. You must install pyarrow by your self via bootstrap + spark_docker : bool = False + Add necessary Spark Defaults to run on Docker + spark_docker_image : str, optional + E.g. {ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG} + hive_docker : bool + Add necessary configurations to run on Docker + ecr_credentials_step : bool + Add a extra step during the Cluster launch to retrieve ECR auth files. + extra_public_registries: List[str], optional + Additional registries. + custom_classifications: List[Dict[str, Any]], optional + Extra classifications. maximize_resource_allocation : bool Configure your executors to utilize the maximum resources possible https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#emr-spark-maximizeresourceallocation @@ -500,6 +647,21 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Examples -------- + Minimal Example + + >>> cluster_id = wr.emr.create_cluster("SUBNET_ID") + + Minimal Exmaple on Docker + + >>> cluster_id = wr.emr.create_cluster( + >>> subnet_id="SUBNET_ID", + >>> spark_docker=True, + >>> spark_docker_image="{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}", + >>> ecr_credentials_step=True + >>> ) + + Full Example + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster( ... cluster_name="wrangler_cluster", @@ -548,6 +710,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused ... }) """ + applications = ["Spark"] if applications is None else applications + boto3_session = _utils.ensure_session(session=boto3_session) args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) @@ -647,8 +811,8 @@ def submit_steps( def submit_step( cluster_id: str, - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, boto3_session: Optional[boto3.Session] = None, @@ -659,11 +823,11 @@ def submit_step( ---------- cluster_id : str Cluster ID. - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool @@ -698,26 +862,29 @@ def submit_step( def build_step( - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, + region: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, Any]: """Build the Step structure (dictionary). Parameters ---------- - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool - True for raw command or False for script runner. + False for raw command or True for script runner. https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html + region: str, optional + Region name to not get it from boto3.Session. (e.g. 
`us-east-1`) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -734,14 +901,17 @@ def build_step( >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps) """ - session: boto3.Session = _utils.ensure_session(session=boto3_session) jar: str = "command-runner.jar" if script is True: - if session.region_name is not None: - region: str = session.region_name - else: # pragma: no cover - region = "us-east-1" - jar = f"s3://{region}.elasticmapreduce/libs/script-runner/script-runner.jar" + if region is not None: # pragma: no cover + _region: str = region + else: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if session.region_name is not None: + _region = session.region_name + else: # pragma: no cover + _region = "us-east-1" + jar = f"s3://{_region}.elasticmapreduce/libs/script-runner/script-runner.jar" step: Dict[str, Any] = { "Name": name, "ActionOnFailure": action_on_failure, @@ -780,3 +950,40 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") return response["Step"]["Status"]["State"] + + +def update_ecr_credentials( + cluster_id: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None +) -> str: + """Update internal ECR credentials. + + Parameters + ---------- + cluster_id : str + Cluster ID. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Step ID. + + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.update_ecr_credentials("cluster_id") + + """ + name: str = "Update ECR Credentials" + command: str = _get_ecr_credentials_command() + session: boto3.Session = _utils.ensure_session(session=boto3_session) + step: Dict[str, Any] = build_step( + name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session + ) + client_emr: boto3.client = _utils.client(service_name="emr", session=session) + response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) + _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + return response["StepIds"][0] diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..527c1ae76 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -111,6 +111,40 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) raise ex # pragma: no cover +def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 objects from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of objects paths. 
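The behaviour relied on here is plain S3 `ListObjectsV2` pagination with a `/` delimiter: first-level "directories" come back under `CommonPrefixes` instead of `Contents`. A minimal boto3 sketch of that mechanism (bucket and prefix names are placeholders):

```python
import boto3

# With Delimiter="/", S3 groups keys under CommonPrefixes; without it,
# the same keys would be returned individually under Contents.
# "my-bucket" and "prefix/" are placeholder names.
client = boto3.client("s3")
paginator = client.get_paginator("list_objects_v2")
dirs = []
for page in paginator.paginate(Bucket="my-bucket", Prefix="prefix/", Delimiter="/"):
    for common_prefix in page.get("CommonPrefixes", []):
        dirs.append(f"s3://my-bucket/{common_prefix['Prefix']}")
print(dirs)  # e.g. ['s3://my-bucket/prefix/dir0/', 's3://my-bucket/prefix/dir1/']
```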
+ + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/') + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/', boto3_session=boto3.Session()) + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + """ + return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) + + def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: """List Amazon S3 objects from a prefix. @@ -142,20 +176,37 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] """ + return _list_objects(path=path, delimiter=None, boto3_session=boto3_session) + + +def _list_objects( + path: str, delimiter: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) paginator = client_s3.get_paginator("list_objects_v2") bucket: str prefix: str bucket, prefix = _utils.parse_path(path=path) - response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix, PaginationConfig={"PageSize": 1000}) + args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} + if delimiter is not None: + args["Delimiter"] = delimiter + response_iterator = paginator.paginate(**args) paths: List[str] = [] for page in response_iterator: - contents: Optional[List] = page.get("Contents") - if contents is not None: - for content in contents: - if (content is not None) and ("Key" in content): - key: str = content["Key"] - paths.append(f"s3://{bucket}/{key}") + if delimiter is None: + contents: Optional[List[Optional[Dict[str, str]]]] = page.get("Contents") + if contents is not None: + for content in contents: + if (content is not None) and ("Key" in content): + key: str = content["Key"] + paths.append(f"s3://{bucket}/{key}") + else: + prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") + if prefixes is not None: + for pfx in prefixes: + if (pfx is not None) and ("Prefix" in pfx): + key = pfx["Prefix"] + paths.append(f"s3://{bucket}/{key}") return paths diff --git a/docs/source/api.rst b/docs/source/api.rst index 897fc7a3e..7d2d51602 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -16,6 +16,7 @@ Amazon S3 does_object_exist get_bucket_region list_objects + list_directories read_csv read_fwf read_json @@ -115,6 +116,7 @@ EMR submit_steps build_step get_step_state + update_ecr_credentials CloudWatch Logs --------------- diff --git a/requirements-dev.txt b/requirements-dev.txt index 3fdd3cdf3..99a9b0730 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,4 +17,5 @@ twine~=3.1.1 wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 \ No newline at end of file +moto~=1.3.14 +jupyterlab~=2.1.1 \ No newline at end of file diff --git a/testing/test_awswrangler/test_cloudwatch.py b/testing/test_awswrangler/test_cloudwatch.py index f59b8b3dd..eced7a754 100644 --- a/testing/test_awswrangler/test_cloudwatch.py +++ b/testing/test_awswrangler/test_cloudwatch.py @@ -48,7 +48,7 @@ def loggroup(cloudformation_outputs): def test_query_cancelled(loggroup): client_logs = boto3.client("logs") query_id = wr.cloudwatch.start_query( - 
log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc | limit 5" + log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc" ) client_logs.stop_query(queryId=query_id) with pytest.raises(exceptions.QueryCancelled): diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index afa2a8307..bd53d4bad 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -127,6 +127,9 @@ def test_athena_ctas(bucket, database, kms_key): partition_cols=["par0", "par1"], )["paths"] wr.s3.wait_objects_exist(paths=paths) + dirs = wr.s3.list_directories(path=f"s3://{bucket}/test_athena_ctas/") + for d in dirs: + assert d.startswith(f"s3://{bucket}/test_athena_ctas/par0=") df = wr.s3.read_parquet_table(table="test_athena_ctas", database=database) assert len(df.index) == 3 ensure_data_types(df=df, has_list=True) diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index e64329b33..df2dab1cb 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -146,3 +146,36 @@ def test_cluster_single_node(bucket, cloudformation_outputs): wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) wr.emr.terminate_cluster(cluster_id=cluster_id) wr.s3.delete_objects(f"s3://{bucket}/emr-logs/") + + +def test_default_logging_path(cloudformation_outputs): + path = wr.emr._get_default_logging_path(subnet_id=cloudformation_outputs["SubnetId"]) + assert path.startswith("s3://aws-logs-") + assert path.endswith("/elasticmapreduce/") + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.emr._get_default_logging_path() + + +def test_docker(cloudformation_outputs): + cluster_id = wr.emr.create_cluster( + subnet_id=cloudformation_outputs["SubnetId"], + docker=True, + spark_docker=True, + spark_docker_image="787535711150.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + hive_docker=True, + ecr_credentials_step=True, + custom_classifications=[ + { + "Classification": "livy-conf", + "Properties": { + "livy.spark.master": "yarn", + "livy.spark.deploy-mode": "cluster", + "livy.server.session.timeout": "16h", + }, + } + ], + steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")], + ) + wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://igor-tavares/emr.py") + wr.emr.update_ecr_credentials(cluster_id=cluster_id) + wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/testing/test_awswrangler/test_moto.py b/testing/test_awswrangler/test_moto.py index db12dbe1a..2adc7aec8 100644 --- a/testing/test_awswrangler/test_moto.py +++ b/testing/test_awswrangler/test_moto.py @@ -20,6 +20,21 @@ def emr(): yield True +@pytest.fixture(scope="module") +def sts(): + with moto.mock_sts(): + yield True + + +@pytest.fixture(scope="module") +def subnet(): + with moto.mock_ec2(): + ec2 = boto3.resource("ec2", region_name="us-west-1") + vpc = ec2.create_vpc(CidrBlock="10.0.0.0/16") + subnet = ec2.create_subnet(VpcId=vpc.id, CidrBlock="10.0.0.0/24", AvailabilityZone="us-west-1a") + yield subnet.id + + def test_csv(s3): path = "s3://bucket/test.csv" wr.s3.to_csv(df=get_df_csv(), path=path, index=False) @@ -37,12 +52,13 @@ def test_parquet(s3): assert len(df.columns) == 18 -def test_emr(s3, emr): +def test_emr(s3, emr, sts, subnet): + session = boto3.Session(region_name="us-west-1") cluster_id = wr.emr.create_cluster( 
cluster_name="wrangler_cluster", logging_s3_path="s3://bucket/emr-logs/", emr_release="emr-5.29.0", - subnet_id="foo", + subnet_id=subnet, emr_ec2_role="EMR_EC2_DefaultRole", emr_role="EMR_DefaultRole", instance_type_master="m5.xlarge", @@ -87,11 +103,12 @@ def test_emr(s3, emr): termination_protected=False, spark_pyarrow=False, tags={"foo": "boo", "bar": "xoo"}, + boto3_session=session, ) - wr.emr.get_cluster_state(cluster_id=cluster_id) + wr.emr.get_cluster_state(cluster_id=cluster_id, boto3_session=session) steps = [] for cmd in ['echo "Hello"', "ls -la"]: steps.append(wr.emr.build_step(name=cmd, command=cmd)) - wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) - wr.emr.terminate_cluster(cluster_id=cluster_id) + wr.emr.submit_steps(cluster_id=cluster_id, steps=steps, boto3_session=session) + wr.emr.terminate_cluster(cluster_id=cluster_id, boto3_session=session) wr.s3.delete_objects("s3://bucket/emr-logs/") diff --git a/tutorials/15 - EMR.ipynb b/tutorials/15 - EMR.ipynb new file mode 100644 index 000000000..4e1c627e6 --- /dev/null +++ b/tutorials/15 - EMR.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 15 - EMR" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_id = wr.emr.create_cluster(subnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit s3://{bucket}/test.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], 
+ "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb new file mode 100644 index 000000000..138759d8f --- /dev/null +++ b/tutorials/16 - EMR & Docker.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 16 - EMR & Docker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Build and Upload Docker Image to ECR repository\n", + "\n", + "Replace the `{ACCOUNT_ID}` placeholder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%writefile Dockerfile\n" + } + }, + "outputs": [], + "source": [ + "%%writefile Dockerfile\n", + "\n", + "FROM amazoncorretto:8\n", + "\n", + "RUN yum -y update\n", + "RUN yum -y install yum-utils\n", + "RUN yum -y groupinstall development\n", + "\n", + "RUN yum list python3*\n", + "RUN yum -y install python3 python3-dev python3-pip python3-virtualenv\n", + "\n", + "RUN python -V\n", + "RUN python3 -V\n", + "\n", + "ENV PYSPARK_DRIVER_PYTHON python3\n", + "ENV PYSPARK_PYTHON python3\n", + "\n", + "RUN pip3 install --upgrade pip\n", + "RUN pip3 install awswrangler\n", + "\n", + "RUN python3 -c \"import awswrangler as wr\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "docker build -t 'local/emr-wrangler' .\n", + "aws ecr create-repository --repository-name emr-wrangler\n", + "docker tag local/emr-wrangler {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\n", + "eval $(aws ecr get-login --region us-east-1 --no-include-email)\n", + "docker push {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "cluster_id = wr.emr.create_cluster(\n", + " subnet_id=subnet,\n", + " spark_docker=True,\n", + " spark_docker_image=DOCKER_IMAGE,\n", + " ecr_credentials_step=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\n", + "import awswrangler as wr\n", + "\n", + "print(f\"Wrangler version: {wr.__version__}\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test_docker.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 9611a0ae88ebb2b44853ebd460916013383cae26 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 25 Apr 2020 18:17:36 -0300 Subject: [PATCH 28/59] Improve EMR tutorials #193 --- awswrangler/emr.py | 21 ++++++++++++++++++++- testing/test_awswrangler/test_emr.py | 6 +++--- tutorials/16 - EMR & Docker.ipynb | 7 +++++-- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 106a57da3..7490b29c9 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -649,10 +649,29 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused -------- Minimal Example + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster("SUBNET_ID") - Minimal Exmaple on Docker + Minimal Example With Custom Classification + >>> import awswrangler as wr + >>> cluster_id = wr.emr.create_cluster( + >>> subnet_id="SUBNET_ID", + >>> custom_classifications=[ + >>> { + >>> "Classification": "livy-conf", + >>> "Properties": { + >>> "livy.spark.master": "yarn", + >>> "livy.spark.deploy-mode": "cluster", + >>> "livy.server.session.timeout": "16h", + >>> }, + >>> } + >>> ], + >>> ) + + Minimal Example on Docker + + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster( >>> subnet_id="SUBNET_ID", >>> spark_docker=True, diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index df2dab1cb..66f8e139f 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -161,7 +161,7 @@ def test_docker(cloudformation_outputs): subnet_id=cloudformation_outputs["SubnetId"], docker=True, spark_docker=True, - spark_docker_image="787535711150.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + spark_docker_image="123456789123.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", hive_docker=True, ecr_credentials_step=True, custom_classifications=[ @@ -174,8 +174,8 @@ def test_docker(cloudformation_outputs): }, } ], - steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")], + steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://bucket/emr.py")], ) - wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://igor-tavares/emr.py") + wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://bucket/emr.py") wr.emr.update_ecr_credentials(cluster_id=cluster_id) wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 138759d8f..440d72066 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -201,7 +201,10 @@ "metadata": {}, "outputs": [], "source": [ - "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\")" + "step_id = wr.emr.submit_step(\n", + " cluster_id=cluster_id,\n", + " command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\"\n", + ")" ] }, { @@ -266,4 +269,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file From 3c3ca645718aedbae47eb8b4134118d178ef90a4 Mon Sep 17 00:00:00 2001 
From: igorborgest Date: Sun, 26 Apr 2020 16:30:02 -0300 Subject: [PATCH 29/59] Splitting up the ecr_credentials to a individual function #193 --- awswrangler/emr.py | 184 +++++++++++++++------------ testing/test_awswrangler/test_emr.py | 18 ++- tutorials/16 - EMR & Docker.ipynb | 132 ++++++++++++++++--- 3 files changed, 230 insertions(+), 104 deletions(-) diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 7490b29c9..3658d4573 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -61,13 +61,6 @@ def _get_default_logging_path( return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/" -def _get_ecr_credentials_command() -> str: - return ( - "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email) && " - "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" - ) - - def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-statements account_id: str = _utils.get_account_id(boto3_session=pars["boto3_session"]) region: str = _utils.get_region_from_subnet(subnet_id=pars["subnet_id"], boto3_session=pars["boto3_session"]) @@ -139,7 +132,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s args["Configurations"] = [ {"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}} ] - if (pars["docker"] is True) or (pars["spark_docker"] is True) or (pars["hive_docker"] is True): + if pars["docker"] is True: if pars.get("extra_registries") is None: extra_registries: List[str] = [] else: # pragma: no cover @@ -162,26 +155,6 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } ) - if pars["spark_docker"] is True: - if pars.get("spark_docker_image") is None: # pragma: no cover - raise exceptions.InvalidArgumentCombination("You must pass a spark_docker_image if spark_docker is True.") - pars["spark_defaults"] = {} if pars["spark_defaults"] is None else pars["spark_defaults"] - pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" - pars["spark_defaults"][ - "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" - ] = "hdfs:///user/hadoop/config.json" - pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars["spark_docker_image"] - pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS"] = "/etc/passwd:/etc/passwd:ro" - pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" - pars["spark_defaults"][ - "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" - ] = "hdfs:///user/hadoop/config.json" - pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars[ - "spark_docker_image" - ] - pars["spark_defaults"][ - "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS" - ] = "/etc/passwd:/etc/passwd:ro" if spark_env is not None: args["Configurations"].append( { @@ -216,21 +189,12 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Configurations": [], } ) - - hive_conf: Optional[Dict[str, Any]] = None - if (pars["hive_glue_catalog"] is True) or (pars["hive_docker"] is True): - hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} - if pars["hive_glue_catalog"] is True: + hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} hive_conf["Properties"][ "hive.metastore.client.factory.class" ] = 
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" - if pars["hive_docker"] is True: - hive_conf["Properties"]["hive.execution.mode"] = "container" - - if hive_conf is not None: args["Configurations"].append(hive_conf) - if pars["presto_glue_catalog"] is True: args["Configurations"].append( { @@ -282,17 +246,6 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]}, } ) - if pars["ecr_credentials_step"] is True: - args["Steps"].append( - build_step( - name="ECR Credentials Setup", - command=_get_ecr_credentials_command(), - action_on_failure="TERMINATE_CLUSTER", - script=False, - region=region, - boto3_session=pars["boto3_session"], - ) - ) if pars["steps"] is not None: args["Steps"] += pars["steps"] @@ -462,15 +415,11 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_groups_slave_additional: Optional[List[str]] = None, security_group_service_access: Optional[str] = None, docker: bool = False, + extra_public_registries: Optional[List[str]] = None, spark_log_level: str = "WARN", spark_jars_path: Optional[List[str]] = None, spark_defaults: Optional[Dict[str, str]] = None, spark_pyarrow: bool = False, - spark_docker: bool = False, - spark_docker_image: str = None, - hive_docker: bool = False, - ecr_credentials_step: bool = False, - extra_public_registries: Optional[List[str]] = None, custom_classifications: Optional[List[Dict[str, Any]]] = None, maximize_resource_allocation: bool = False, steps: Optional[List[Dict[str, Any]]] = None, @@ -600,6 +549,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused service to access clusters in VPC private subnets. docker : bool Enable Docker Hub and ECR registries access. + extra_public_registries: List[str], optional + Additional docker registries. spark_log_level : str log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE). spark_jars_path : List[str], optional @@ -610,16 +561,6 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused spark_pyarrow : bool Enable PySpark to use PyArrow behind the scenes. P.S. You must install pyarrow by your self via bootstrap - spark_docker : bool = False - Add necessary Spark Defaults to run on Docker - spark_docker_image : str, optional - E.g. {ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG} - hive_docker : bool - Add necessary configurations to run on Docker - ecr_credentials_step : bool - Add a extra step during the Cluster launch to retrieve ECR auth files. - extra_public_registries: List[str], optional - Additional registries. custom_classifications: List[Dict[str, Any]], optional Extra classifications. maximize_resource_allocation : bool @@ -669,16 +610,6 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused >>> ], >>> ) - Minimal Example on Docker - - >>> import awswrangler as wr - >>> cluster_id = wr.emr.create_cluster( - >>> subnet_id="SUBNET_ID", - >>> spark_docker=True, - >>> spark_docker_image="{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}", - >>> ecr_credentials_step=True - >>> ) - Full Example >>> import awswrangler as wr @@ -971,8 +902,8 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. 
return response["Step"]["Status"]["State"] -def update_ecr_credentials( - cluster_id: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None +def submit_ecr_credentials_refresh( + cluster_id: str, path: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None ) -> str: """Update internal ECR credentials. @@ -980,6 +911,8 @@ def update_ecr_credentials( ---------- cluster_id : str Cluster ID. + path : str + Amazon S3 path where Wrangler will stage the script ecr_credentials_refresh.py (e.g. s3://bucket/emr/) action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' boto3_session : boto3.Session(), optional @@ -993,12 +926,17 @@ def update_ecr_credentials( Examples -------- >>> import awswrangler as wr - >>> step_id = wr.emr.update_ecr_credentials("cluster_id") + >>> step_id = wr.emr.submit_ecr_credentials_refresh("cluster_id", "s3://bucket/emr/") """ - name: str = "Update ECR Credentials" - command: str = _get_ecr_credentials_command() + path = path[:-1] if path.endswith("/") else path + path_script: str = f"{path}/ecr_credentials_refresh.py" session: boto3.Session = _utils.ensure_session(session=boto3_session) + client_s3: boto3.client = _utils.client(service_name="s3", session=session) + bucket, key = _utils.parse_path(path=path_script) + client_s3.put_object(Body=_get_ecr_credentials_refresh_content().encode(encoding="utf-8"), Bucket=bucket, Key=key) + command: str = f"spark-submit --deploy-mode cluster {path_script}" + name: str = "ECR Credentials Refresh" step: Dict[str, Any] = build_step( name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session ) @@ -1006,3 +944,91 @@ def update_ecr_credentials( response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") return response["StepIds"][0] + + +def _get_ecr_credentials_refresh_content() -> str: + return """ +import subprocess +from pyspark.sql import SparkSession +spark = SparkSession.builder.appName("ECR Setup Job").getOrCreate() + +COMMANDS = [ + "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email)", + "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" +] + +for command in COMMANDS: + subprocess.run(command.split(" "), timeout=6.0, check=True) + +print("done!") + """ + + +def build_spark_step( + path: str, + deploy_mode: str = "cluster", + docker_image: Optional[str] = None, + name: str = "my-step", + action_on_failure: str = "CONTINUE", + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Build the Step structure (dictionary). + + Parameters + ---------- + path : str + Script path. (e.g. s3://bucket/app.py) + deploy_mode : str + "cluster" | "client" + docker_image : str, optional + e.g. "{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}" + name : str, optional + Step name. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + region: str, optional + Region name to not get it from boto3.Session. (e.g. `us-east-1`) + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Step structure. 
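A step built this way is typically submitted and then polled until it reaches a terminal state. A minimal sketch assuming an existing cluster and script (the cluster ID, S3 path, and polling interval are placeholders):

```python
import time
import awswrangler as wr

cluster_id = "j-XXXXXXXXXXXXX"  # placeholder: an existing EMR cluster ID
step = wr.emr.build_spark_step(path="s3://bucket/app.py")  # placeholder script path
step_id = wr.emr.submit_steps(cluster_id=cluster_id, steps=[step])[0]

# Poll until EMR reports a terminal state for the step.
while wr.emr.get_step_state(cluster_id, step_id) not in ("COMPLETED", "CANCELLED", "FAILED"):
    time.sleep(30)
```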
+ + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.submit_steps( + >>> cluster_id="cluster-id", + >>> steps=[ + >>> wr.emr.build_spark_step(path="s3://bucket/app.py") + >>> ] + >>> ) + + """ + if docker_image is None: # pragma: no cover + cmd: str = f"spark-submit --deploy-mode {deploy_mode} {path}" + else: + config: str = "hdfs:///user/hadoop/config.json" + cmd = ( + f"spark-submit --deploy-mode cluster " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE=docker " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={docker_image} " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG={config} " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=/etc/passwd:/etc/passwd:ro " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE=docker " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={docker_image} " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG={config} " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=/etc/passwd:/etc/passwd:ro " + f"{path}" + ) + return build_step( + command=cmd, + name=name, + action_on_failure=action_on_failure, + script=False, + region=region, + boto3_session=boto3_session, + ) diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index 66f8e139f..fdda2fa25 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -156,14 +156,10 @@ def test_default_logging_path(cloudformation_outputs): wr.emr._get_default_logging_path() -def test_docker(cloudformation_outputs): +def test_docker(bucket, cloudformation_outputs): cluster_id = wr.emr.create_cluster( subnet_id=cloudformation_outputs["SubnetId"], docker=True, - spark_docker=True, - spark_docker_image="123456789123.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", - hive_docker=True, - ecr_credentials_step=True, custom_classifications=[ { "Classification": "livy-conf", @@ -176,6 +172,14 @@ def test_docker(cloudformation_outputs): ], steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://bucket/emr.py")], ) - wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://bucket/emr.py") - wr.emr.update_ecr_credentials(cluster_id=cluster_id) + wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f"s3://{bucket}/emr/") + wr.emr.submit_steps( + cluster_id=cluster_id, + steps=[ + wr.emr.build_spark_step( + path=f"s3://{bucket}/emr/test_docker.py", + docker_image="123456789123.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + ) + ], + ) wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 440d72066..8a637af86 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -142,25 +142,45 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", - "\n", - "cluster_id = 
wr.emr.create_cluster(\n", - " subnet_id=subnet,\n", - " spark_docker=True,\n", - " spark_docker_image=DOCKER_IMAGE,\n", - " ecr_credentials_step=True\n", - ")" + "cluster_id = wr.emr.create_cluster(subnet, docker=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Refresh ECR credentials in the cluster (expiration time: 12h )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'s-3OPMPDCYGEGOT'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/emr/\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Uploading our PySpark script to Amazon S3" + "## Uploading application script to Amazon S3 (PySpark)" ] }, { @@ -184,7 +204,7 @@ "_ = boto3.client(\"s3\").put_object(\n", " Body=script,\n", " Bucket=bucket,\n", - " Key=\"test_docker.py\"\n", + " Key=\"emr/test_docker.py\"\n", ")" ] }, @@ -201,9 +221,13 @@ "metadata": {}, "outputs": [], "source": [ - "step_id = wr.emr.submit_step(\n", + "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "step = wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", + "\n", + "steps_ids = wr.emr.submit_steps(\n", " cluster_id=cluster_id,\n", - " command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\"\n", + " steps=[step]\n", ")" ] }, @@ -220,7 +244,7 @@ "metadata": {}, "outputs": [], "source": [ - "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", " pass" ] }, @@ -240,6 +264,78 @@ "wr.emr.terminate_cluster(cluster_id)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Another example with custom configurations" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_id = wr.emr.create_cluster(\n", + " cluster_name=\"my-demo-cluster-v2\",\n", + " logging_s3_path=f\"s3://{bucket}/emr-logs/\",\n", + " emr_release=\"emr-6.0.0\",\n", + " subnet_id=subnet,\n", + " emr_ec2_role=\"EMR_EC2_DefaultRole\",\n", + " emr_role=\"EMR_DefaultRole\",\n", + " instance_type_master=\"m5.2xlarge\",\n", + " instance_type_core=\"m5.2xlarge\",\n", + " instance_ebs_size_master=50,\n", + " instance_ebs_size_core=50,\n", + " instance_num_on_demand_master=0,\n", + " instance_num_on_demand_core=0,\n", + " instance_num_spot_master=1,\n", + " instance_num_spot_core=2,\n", + " spot_bid_percentage_of_on_demand_master=100,\n", + " spot_bid_percentage_of_on_demand_core=100,\n", + " spot_provisioning_timeout_master=5,\n", + " spot_provisioning_timeout_core=5,\n", + " spot_timeout_to_on_demand_master=False,\n", + " spot_timeout_to_on_demand_core=False,\n", + " python3=True,\n", + " docker=True,\n", + " spark_glue_catalog=True,\n", + " hive_glue_catalog=True,\n", + " presto_glue_catalog=True,\n", + " debugging=True,\n", + " applications=[\"Hadoop\", \"Spark\", \"Hive\", \"Zeppelin\", \"Livy\"],\n", + " visible_to_all_users=True,\n", + " maximize_resource_allocation=True,\n", + " keep_cluster_alive_when_no_steps=True,\n", + " termination_protected=False,\n", + " spark_pyarrow=True\n", + ")\n", + "\n", + "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/emr/\")\n", + "\n", + "DOCKER_IMAGE = 
f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "steps_ids = wr.emr.submit_steps(\n", + " cluster_id=cluster_id,\n", + " steps=[\n", + " wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", + " pass\n", + "\n", + "wr.emr.terminate_cluster(cluster_id)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -269,4 +365,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} From 2eefb3aa4ff8694fcdf58bc6e5bb943633c0de5d Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 26 Apr 2020 16:42:46 -0300 Subject: [PATCH 30/59] Small update in the EMR tutorial --- tutorials/16 - EMR & Docker.ipynb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 8a637af86..9bfa182fc 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -225,10 +225,7 @@ "\n", "step = wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", "\n", - "steps_ids = wr.emr.submit_steps(\n", - " cluster_id=cluster_id,\n", - " steps=[step]\n", - ")" + "steps_ids = wr.emr.submit_steps(cluster_id, steps=[step])" ] }, { From 86cdb307a27c7a6107627e34272da857c52573bf Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 26 Apr 2020 17:49:00 -0300 Subject: [PATCH 31/59] fix init and docs --- awswrangler/__init__.py | 11 ++++++++++- awswrangler/torch.py | 27 +++++++++++++++++++++++---- requirements-torch.txt | 4 ++-- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ff6a2bd71..b7f931a3d 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -5,9 +5,18 @@ """ +import importlib import logging -from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3, torch # noqa +from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa +if ( + importlib.util.find_spec("torch") + and importlib.util.find_spec("torchvision") + and importlib.util.find_spec("torchaudio") + and importlib.util.find_spec("PIL") +): # type: ignore + from awswrangler import torch # noqa + logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index a5b589386..e7cd4518f 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -35,6 +35,8 @@ def __init__( ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
@@ -85,10 +87,10 @@ def __len__(self): return len(self._paths) def _data_fn(self, data) -> Any: - pass + raise NotImplementedError() def _label_fn(self, path: str) -> Any: - pass + raise NotImplementedError() class _S3PartitionedDataset(_ListS3Dataset): @@ -98,6 +100,9 @@ def _label_fn(self, path: str) -> torch.Tensor: label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) return torch.tensor([label]) # pylint: disable=not-callable + def _data_fn(self, data) -> Any: + raise NotImplementedError() + # class S3FilesDataset(_BaseS3Dataset, Dataset): # """PyTorch Amazon S3 Files Map-Style Dataset.""" @@ -162,6 +167,12 @@ def __init__( ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + data_fn: Callable + Function that receives a io.BytesIO object and returns a torch.Tensor + label_fn: Callable + Function that receives object path (str) and return a torch.Tensor + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -226,6 +237,8 @@ def __init__( ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -314,6 +327,8 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -342,6 +357,8 @@ class S3IterableDataset(IterableDataset, _BaseS3Dataset): # pylint: disable=abs ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -395,7 +412,9 @@ def __init__( SQLAlchemy Engine. Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() label_col : int, optional - Label column number + Label column number. + chunksize : int, optional + The chunksize determines que number of rows to be retrived from the database at each time. 
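Because `SQLDataset` is an `IterableDataset`, it can be consumed directly or wrapped in a `DataLoader` (which only batches the stream and cannot shuffle it). A rough sketch; the exact constructor call is an assumption based on the documented parameters, and the engine alias, table, and column names are placeholders taken from the PyTorch tutorial:

```python
import awswrangler as wr
from torch.utils.data import DataLoader

# Assumptions: a Glue connection named "aws-data-wrangler-redshift" exists and
# a placeholder table exposes the height/weight/target columns from the tutorial.
con = wr.catalog.get_engine("aws-data-wrangler-redshift")
ds = wr.torch.SQLDataset(
    "SELECT height, weight, target FROM public.tutorial",
    con=con,
    label_col="target",
    chunksize=1000,
)

# IterableDataset: the DataLoader batches the yielded (features, label) pairs.
loader = DataLoader(ds, batch_size=32, num_workers=0)
for features, labels in loader:
    print(features.shape, labels.shape)
    break
```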
Returns ------- @@ -425,7 +444,7 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, label_col: Optional[int] = list(cursor.keys()).index(self._label_col) else: label_col = self._label_col - _logger.debug(f"label_col: {label_col}") + _logger.debug("label_col: %s", label_col) if self._chunksize is None: return SQLDataset._records2tensor(records=cursor.fetchall(), label_col=label_col) return self._iterate_cursor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) diff --git a/requirements-torch.txt b/requirements-torch.txt index 01d2c6e65..61de25397 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ -torch~=1.4.0 +torch~=1.5.0 torchvision~=0.5.0 torchaudio~=0.4.0 -Pillow==7.1.1 +Pillow~=7.1.1 From b3c8c811282f8d50fc3a7fa855ab7a0933e8d121 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 26 Apr 2020 20:11:44 -0300 Subject: [PATCH 32/59] update tutorial --- tutorials/14 - PyTorch.ipynb | 121 ++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 51 deletions(-) diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index a3d988881..b85596986 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -19,24 +19,28 @@ "metadata": {}, "source": [ "## Table of Contents\n", - "* [1.Defining Training Function](#1.-Defininf-Training-Function)\n", - "* [2.Traning From Amazon S3](#1.-Traning-From-Amazon-S3)\n", - "\t* [2.1 Writing PyTorch Dataset to S3](#1.1-Writing-PyTorch-Dataset-to-S3)\n", - "\t* [2.2 Training Network](#1.2-Training-Network)\n", - "* [3. Training From SQL Query](#2.-Training-From-SQL-Query)\n", - "\t* [3.1 Writing Data to SQL Database](#2.1-Writing-Data-to-SQL-Database)\n", - "\t* [3.3 Training Network From SQL](#2.2-Reading-single-JSON-file)\n", - "* [4. Creating Custom S3 Dataset](#1.-Creating-Custom-S3-Dataset)\n", - "\t* [4.1 Creating Custom PyTorch Dataset](#1.1-Creating-Custom-PyTorch-Dataset)\n", - "\t* [4.2 Writing Data to S3](#1.1-Writing-Data-to-S3)\n", - "\t* [4.3 Training Network](#1.2-Training-Network)\n", - "* [5. Delete objects](#6.-Delete-objects)" + "* [1.Defining Training Function](#1.-Defining-Training-Function)\n", + "* [2.Training From Amazon S3](#2.-Traoning-From-Amazon-S3)\n", + "\t* [2.1 Writing PyTorch Dataset to S3](#2.1-Writing-PyTorch-Dataset-to-S3)\n", + "\t* [2.2 Training Network](#2.2-Training-Network)\n", + "* [3. Training From SQL Query](#3.-Training-From-SQL-Query)\n", + "\t* [3.1 Writing Data to SQL Database](#3.1-Writing-Data-to-SQL-Database)\n", + "\t* [3.3 Training Network From SQL](#3.3-Reading-single-JSON-file)\n", + "* [4. Creating Custom S3 Dataset](#4.-Creating-Custom-S3-Dataset)\n", + "\t* [4.1 Creating Custom PyTorch Dataset](#4.1-Creating-Custom-PyTorch-Dataset)\n", + "\t* [4.2 Writing Data to S3](#4.2-Writing-Data-to-S3)\n", + "\t* [4.3 Training Network](#4.4-Training-Network)\n", + "* [5. Delete objects](#5.-Delete-objects)" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "import io\n", @@ -55,13 +59,17 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ - " ··········································\n" + "········\n" ] } ], @@ -116,13 +124,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 2. Traning From Amazon S3" + "# 2. 
Training From Amazon S3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Writing PyTorch Dataset to S3" ] }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "client_s3 = boto3.client(\"s3\")\n", @@ -153,23 +172,23 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch: 0 loss: 7.0221 acc: 0.00\n", - "batch: 1 loss: 2.7788 acc: 23.44\n", - "batch: 2 loss: 0.9828 acc: 32.29\n", - "batch: 3 loss: 0.9414 acc: 39.45\n", - "batch: 4 loss: 1.0737 acc: 39.38\n", - "batch: 0 loss: 1.2178 acc: 50.00\n", - "batch: 1 loss: 1.4069 acc: 51.56\n", - "batch: 2 loss: 1.0783 acc: 52.08\n", - "batch: 3 loss: 0.9926 acc: 52.34\n", - "batch: 4 loss: 1.1111 acc: 49.06\n" + "batch: 0 loss: 7.0132 acc: 0.00\n", + "batch: 1 loss: 2.8764 acc: 21.09\n", + "batch: 2 loss: 0.9600 acc: 32.29\n", + "batch: 3 loss: 0.8676 acc: 36.33\n", + "batch: 4 loss: 1.1386 acc: 36.88\n", + "batch: 0 loss: 1.0754 acc: 51.56\n", + "batch: 1 loss: 1.4241 acc: 51.56\n", + "batch: 2 loss: 1.3019 acc: 51.04\n", + "batch: 3 loss: 0.8631 acc: 53.52\n", + "batch: 4 loss: 0.4252 acc: 54.38\n" ] } ], @@ -196,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -226,28 +245,28 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch: 0 loss: 5.0253 acc: 50.00\n", - "batch: 1 loss: 21.3174 acc: 50.00\n", - "batch: 2 loss: 0.5061 acc: 66.67\n", - "batch: 0 loss: 1.2222 acc: 50.00\n", - "batch: 1 loss: 0.7075 acc: 50.00\n", - "batch: 2 loss: 0.7077 acc: 50.00\n", - "batch: 0 loss: 0.9302 acc: 50.00\n", - "batch: 1 loss: 0.6960 acc: 50.00\n", - "batch: 2 loss: 0.6018 acc: 66.67\n", - "batch: 0 loss: 1.1284 acc: 50.00\n", - "batch: 1 loss: 0.7077 acc: 50.00\n", - "batch: 2 loss: 0.6791 acc: 50.00\n", - "batch: 0 loss: 1.0030 acc: 50.00\n", - "batch: 1 loss: 0.7053 acc: 50.00\n", - "batch: 2 loss: 0.6318 acc: 50.00\n" + "batch: 0 loss: 8.8708 acc: 50.00\n", + "batch: 1 loss: 88.7789 acc: 50.00\n", + "batch: 2 loss: 0.8655 acc: 33.33\n", + "batch: 0 loss: 0.7036 acc: 50.00\n", + "batch: 1 loss: 0.7034 acc: 50.00\n", + "batch: 2 loss: 0.8447 acc: 33.33\n", + "batch: 0 loss: 0.7012 acc: 50.00\n", + "batch: 1 loss: 0.7010 acc: 50.00\n", + "batch: 2 loss: 0.8250 acc: 33.33\n", + "batch: 0 loss: 0.6992 acc: 50.00\n", + "batch: 1 loss: 0.6991 acc: 50.00\n", + "batch: 2 loss: 0.8063 acc: 33.33\n", + "batch: 0 loss: 0.6975 acc: 50.00\n", + "batch: 1 loss: 0.6974 acc: 50.00\n", + "batch: 2 loss: 0.7886 acc: 33.33\n" ] } ], @@ -279,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -289,9 +308,9 @@ ], "metadata": { "kernelspec": { - "display_name": "conda_python3", + "display_name": "Python 3", "language": "python", - "name": "conda_python3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -303,9 +322,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file From f6927a4e75ced46bb42192b28539bd6473cb5848 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 26 Apr 2020 21:16:10 -0300 
Subject: [PATCH 33/59] rollback pytorch==1.5.0, due to torchaudio requirement --- requirements-torch.txt | 2 +- tutorials/14 - PyTorch.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-torch.txt b/requirements-torch.txt index 61de25397..73b8aae36 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ -torch~=1.5.0 +torch~=1.4.0 torchvision~=0.5.0 torchaudio~=0.4.0 Pillow~=7.1.1 diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index b85596986..b7af04627 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -222,7 +222,7 @@ "eng = wr.catalog.get_engine(\"aws-data-wrangler-redshift\")\n", "df = pd.DataFrame({\n", " \"height\": [2, 1.4, 1.7, 1.8, 1.9, 2.2],\n", - " \"weigth\": [100.0, 50.0, 70.0, 80.0, 90.0, 160.0],\n", + " \"weight\": [100.0, 50.0, 70.0, 80.0, 90.0, 160.0],\n", " \"target\": [1, 0, 0, 1, 1, 1]\n", "})\n", "\n", @@ -302,7 +302,7 @@ "metadata": {}, "outputs": [], "source": [ - "wr.s3.delete_objects(f\"s3://{bucket}/\")" + "wr.s3.delete_objects(f\"s3://{bucket}/{folder}\")" ] } ], From f0f154bd42807066dbfa24204fcd28a66c1981ab Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 26 Apr 2020 22:37:45 -0300 Subject: [PATCH 34/59] Add wr.emr.submit_spark_step --- awswrangler/emr.py | 58 ++++++++++++++++++++++++++++ docs/source/api.rst | 4 +- testing/test_awswrangler/test_emr.py | 1 + tutorials/16 - EMR & Docker.ipynb | 47 +++++++++++----------- 4 files changed, 84 insertions(+), 26 deletions(-) diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 3658d4573..3801d340e 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -1032,3 +1032,61 @@ def build_spark_step( region=region, boto3_session=boto3_session, ) + + +def submit_spark_step( + cluster_id: str, + path: str, + deploy_mode: str = "cluster", + docker_image: Optional[str] = None, + name: str = "my-step", + action_on_failure: str = "CONTINUE", + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Submit Spark Step. + + Parameters + ---------- + cluster_id : str + Cluster ID. + path : str + Script path. (e.g. s3://bucket/app.py) + deploy_mode : str + "cluster" | "client" + docker_image : str, optional + e.g. "{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}" + name : str, optional + Step name. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + region: str, optional + Region name to not get it from boto3.Session. (e.g. `us-east-1`) + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Step ID. 
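In the Docker scenario this pairs naturally with `submit_ecr_credentials_refresh()`, mirroring the tutorial flow. A minimal sketch (subnet ID, bucket, and ECR image URI are placeholders for resources in your account):

```python
import awswrangler as wr

cluster_id = wr.emr.create_cluster("subnet-0123456789abcdef0", docker=True)

# ECR tokens expire (~12h), so refresh them inside the cluster before running
# the containerized step, then submit the PySpark app against the Docker image.
wr.emr.submit_ecr_credentials_refresh(cluster_id, path="s3://my-bucket/emr/")
step_id = wr.emr.submit_spark_step(
    cluster_id,
    "s3://my-bucket/emr/app.py",
    docker_image="111111111111.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler",
)
```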
+ + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.submit_spark_step( + >>> cluster_id="cluster-id", + >>> path="s3://bucket/emr/app.py" + >>> ) + + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + step = build_spark_step( + path=path, + deploy_mode=deploy_mode, + docker_image=docker_image, + name=name, + action_on_failure=action_on_failure, + region=region, + boto3_session=session, + ) + return submit_steps(cluster_id=cluster_id, steps=[step], boto3_session=session)[0] diff --git a/docs/source/api.rst b/docs/source/api.rst index 7d2d51602..6b841705e 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -113,10 +113,12 @@ EMR get_cluster_state terminate_cluster submit_step + submit_spark_step + submit_ecr_credentials_refresh submit_steps build_step + build_spark_step get_step_state - update_ecr_credentials CloudWatch Logs --------------- diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index fdda2fa25..0c0112bf8 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -182,4 +182,5 @@ def test_docker(bucket, cloudformation_outputs): ) ], ) + wr.emr.submit_spark_step(cluster_id=cluster_id, path=f"s3://{bucket}/emr/test_docker.py") wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 9bfa182fc..4ffb2be2b 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -11,12 +11,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import awswrangler as wr\n", - "import boto3" + "import boto3\n", + "import getpass" ] }, { @@ -40,7 +41,6 @@ } ], "source": [ - "import getpass\n", "bucket = getpass.getpass()" ] }, @@ -164,7 +164,7 @@ { "data": { "text/plain": [ - "'s-3OPMPDCYGEGOT'" + "'s-1B0O45RWJL8CL'" ] }, "execution_count": 5, @@ -173,7 +173,7 @@ } ], "source": [ - "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/emr/\")" + "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/\")" ] }, { @@ -185,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -201,11 +201,7 @@ "print(f\"Wrangler version: {wr.__version__}\")\n", "\"\"\"\n", "\n", - "_ = boto3.client(\"s3\").put_object(\n", - " Body=script,\n", - " Bucket=bucket,\n", - " Key=\"emr/test_docker.py\"\n", - ")" + "boto3.client(\"s3\").put_object(Body=script, Bucket=bucket, Key=\"test_docker.py\");" ] }, { @@ -217,15 +213,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", "\n", - "step = wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", - "\n", - "steps_ids = wr.emr.submit_steps(cluster_id, steps=[step])" + "step_id = wr.emr.submit_spark_step(\n", + " cluster_id,\n", + " f\"s3://{bucket}/test_docker.py\",\n", + " docker_image=DOCKER_IMAGE\n", + ")" ] }, { @@ -237,11 +235,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", " pass" ] }, @@ -254,7 +252,7 @@ }, { 
"cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -270,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -313,11 +311,10 @@ "\n", "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", "\n", - "steps_ids = wr.emr.submit_steps(\n", - " cluster_id=cluster_id,\n", - " steps=[\n", - " wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", - " ]\n", + "step_id = wr.emr.submit_spark_step(\n", + " cluster_id,\n", + " f\"s3://{bucket}/test_docker.py\",\n", + " docker_image=DOCKER_IMAGE\n", ")" ] }, @@ -327,7 +324,7 @@ "metadata": {}, "outputs": [], "source": [ - "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", " pass\n", "\n", "wr.emr.terminate_cluster(cluster_id)" From 602bceb5f20d31989c8eda2949e8ad106285b27c Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 26 Apr 2020 22:43:32 -0300 Subject: [PATCH 35/59] Bumping version to 1.1.0 --- README.md | 2 +- awswrangler/__metadata__.py | 2 +- testing/test_awswrangler/test_metadata.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 608297330..ce57f6b00 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ We just released a new major version `1.0` with breaking changes. Please make su ![AWS Data Wrangler](docs/source/_static/logo2.png?raw=true "AWS Data Wrangler") -[![Release](https://img.shields.io/badge/release-1.0.4-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Release](https://img.shields.io/badge/release-1.1.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py index 724b626da..cfc9336b9 100644 --- a/awswrangler/__metadata__.py +++ b/awswrangler/__metadata__.py @@ -7,5 +7,5 @@ __title__ = "awswrangler" __description__ = "Pandas on AWS." -__version__ = "1.0.4" +__version__ = "1.1.0" __license__ = "Apache License 2.0" diff --git a/testing/test_awswrangler/test_metadata.py b/testing/test_awswrangler/test_metadata.py index d076c0d94..88e71c5a3 100644 --- a/testing/test_awswrangler/test_metadata.py +++ b/testing/test_awswrangler/test_metadata.py @@ -2,7 +2,7 @@ def test_metadata(): - assert wr.__version__ == "1.0.4" + assert wr.__version__ == "1.1.0" assert wr.__title__ == "awswrangler" assert wr.__description__ == "Pandas on AWS." 
assert wr.__license__ == "Apache License 2.0" From aeb8792c939db9138660be33854027434ddf4677 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 13:01:43 -0300 Subject: [PATCH 36/59] Improving the chunksize parser slicer algorithm --- awswrangler/s3.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 0127f8897..7090c2c0f 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1684,23 +1684,15 @@ def _read_parquet_chunked( if chunked is True: yield _table2df(table=table, categories=categories, use_threads=use_threads) else: - if next_slice is not None: + if next_slice: table = pa.lib.concat_tables([next_slice, table], promote=promote) - length: int = len(table) - while True: - if length == chunked: - yield _table2df(table=table, categories=categories, use_threads=use_threads) - next_slice = None - break - if length < chunked: - next_slice = table - break + while len(table) >= chunked: yield _table2df( table=table.slice(offset=0, length=chunked), categories=categories, use_threads=use_threads ) table = table.slice(offset=chunked, length=None) - length = len(table) - if next_slice is not None: + next_slice = table + if next_slice: yield _table2df(table=next_slice, categories=categories, use_threads=use_threads) From 797fba55245e4847c777b35be1c7eb2111d456cb Mon Sep 17 00:00:00 2001 From: Igor Tavares Date: Mon, 27 Apr 2020 15:32:42 -0300 Subject: [PATCH 37/59] Update badges on README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ce57f6b00..c72692f85 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,8 @@ We just released a new major version `1.0` with breaking changes. Please make su [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) -[![Average time to resolve an issue](http://isitmaintained.com/badge/resolution/awslabs/aws-data-wrangler.svg)](http://isitmaintained.com/project/awslabs/aws-data-wrangler "Average time to resolve an issue") +[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) [![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen.svg)](https://pypi.org/project/awswrangler/) ![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest) From d9f107a75ff7b35a350ff379f614712412802765 Mon Sep 17 00:00:00 2001 From: Igor Tavares Date: Mon, 27 Apr 2020 15:35:23 -0300 Subject: [PATCH 38/59] Add EMR tutorials to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c72692f85..66095288c 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,8 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [11 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/11%20-%20CSV%20Datasets.ipynb) - [12 - CSV 
Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/12%20-%20CSV%20Crawler.ipynb) - [13 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/13%20-%20Merging%20Datasets%20on%20S3.ipynb) + - [15 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/dev/tutorials/15%20-%20EMR.ipynb) + - [16 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/dev/tutorials/16%20-%20EMR%20%26%20Docker.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3) - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog) From 7fd449e02b30668ad717e9d34a0e303fac28e059 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 15:54:53 -0300 Subject: [PATCH 39/59] Adapting to validations --- awswrangler/__init__.py | 9 ++------- awswrangler/s3.py | 9 ++++++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index f2a390a18..78299541e 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -5,19 +5,14 @@ """ -import importlib import logging +from importlib.util import find_spec from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa from awswrangler._utils import get_account_id # noqa -if ( - importlib.util.find_spec("torch") - and importlib.util.find_spec("torchvision") - and importlib.util.find_spec("torchaudio") - and importlib.util.find_spec("PIL") -): # type: ignore +if find_spec("torch") and find_spec("torchvision") and find_spec("torchaudio") and find_spec("PIL"): from awswrangler import torch # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 7661e61d0..f4be39359 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -178,11 +178,14 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] """ - return _list_objects(path=path, delimiter=None, boto3_session=boto3_session) + return _list_objects(path=path, delimiter=None, suffix=suffix, boto3_session=boto3_session) def _list_objects( - path: str, delimiter: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + path: str, + delimiter: Optional[str] = None, + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, ) -> List[str]: client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) paginator = client_s3.get_paginator("list_objects_v2") @@ -194,7 +197,7 @@ def _list_objects( args["Delimiter"] = delimiter response_iterator = paginator.paginate(**args) paths: List[str] = [] - for page in response_iterator: + for page in response_iterator: # pylint: disable=too-many-nested-blocks if delimiter is None: contents: Optional[List] = page.get("Contents") if contents is not None: From fd115d89c240a03856936fe076bc2407aa1188ac Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 19:30:30 -0300 Subject: [PATCH 40/59] Bumping dev dependencies --- requirements-dev.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 99a9b0730..bfdd15c5e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ 
black~=19.3b0 -pylint~=2.4.4 +pylint~=2.5.0 flake8~=3.7.9 mypy~=0.770 isort~=4.3.21 @@ -11,11 +11,11 @@ pytest-cov~=2.8.1 pytest-xdist~=1.31.0 scikit-learn~=0.22.1 awscli>=1.18.22 -cfn-lint~=0.29.5 -cfn-flip~=1.2.2 +cfn-lint~=0.29.6 +cfn-flip~=1.2.3 twine~=3.1.1 wheel~=0.34.2 -sphinx~=3.0.1 +sphinx~=3.0.3 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 jupyterlab~=2.1.1 \ No newline at end of file From 8fad37c30f2ec85d845da624d1f7841cf861da49 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 19:31:02 -0300 Subject: [PATCH 41/59] Bumping PyTorch libs versions --- requirements-torch.txt | 8 ++++---- tox.ini | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/requirements-torch.txt b/requirements-torch.txt index 73b8aae36..d3e36447e 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ -torch~=1.4.0 -torchvision~=0.5.0 -torchaudio~=0.4.0 -Pillow~=7.1.1 +torch~=1.5.0 +torchvision~=0.6.0 +torchaudio~=0.5.0 +Pillow~=7.1.2 diff --git a/tox.ini b/tox.ini index 9768fd204..f2bb572c2 100644 --- a/tox.ini +++ b/tox.ini @@ -6,10 +6,13 @@ deps = pytest pytest-xdist moto -commands = pytest -n 8 testing/test_awswrangler + -rrequirements-torch.txt +commands = + pytest -n 8 testing/test_awswrangler [testenv:py36] deps = {[testenv]deps} pytest-cov -commands = pytest --cov=awswrangler -n 8 testing/test_awswrangler +commands = + pytest --cov=awswrangler -n 8 testing/test_awswrangler From 85bfade15ae16108cf45ba67ad81f0d08289571a Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 19:32:01 -0300 Subject: [PATCH 42/59] Replacing all f-string on logging commands --- awswrangler/_data_types.py | 14 +++++++------- awswrangler/athena.py | 26 +++++++++++++------------- awswrangler/catalog.py | 10 +++++----- awswrangler/cloudwatch.py | 8 ++++---- awswrangler/db.py | 22 +++++++++++----------- awswrangler/emr.py | 16 ++++++++-------- awswrangler/s3.py | 36 ++++++++++++++++++------------------ 7 files changed, 66 insertions(+), 66 deletions(-) diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 62928e816..947b058b0 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -207,7 +207,7 @@ def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-sta return sqlalchemy.types.Date if pa.types.is_binary(dtype): if db_type == "redshift": - raise exceptions.UnsupportedType(f"Binary columns are not supported for Redshift.") # pragma: no cover + raise exceptions.UnsupportedType("Binary columns are not supported for Redshift.") # pragma: no cover return sqlalchemy.types.Binary if pa.types.is_decimal(dtype): return sqlalchemy.types.Numeric(precision=dtype.precision, scale=dtype.scale) @@ -257,7 +257,7 @@ def pyarrow_types_from_pandas( # Filling schema columns_types: Dict[str, pa.DataType] columns_types = {n: cols_dtypes[n] for n in sorted_cols} - _logger.debug(f"columns_types: {columns_types}") + _logger.debug("columns_types: %s", columns_types) return columns_types @@ -275,7 +275,7 @@ def athena_types_from_pandas( athena_columns_types[k] = casts[k] else: athena_columns_types[k] = pyarrow2athena(dtype=v) - _logger.debug(f"athena_columns_types: {athena_columns_types}") + _logger.debug("athena_columns_types: %s", athena_columns_types) return athena_columns_types @@ -315,7 +315,7 @@ def pyarrow_schema_from_pandas( if (k in df.columns) and (k not in ignore): columns_types[k] = athena2pyarrow(v) columns_types = {k: v for k, v in columns_types.items() if v is not None} - _logger.debug(f"columns_types: 
{columns_types}") + _logger.debug("columns_types: %s", columns_types) return pa.schema(fields=columns_types) @@ -324,11 +324,11 @@ def athena_types_from_pyarrow_schema( ) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]: """Extract the related Athena data types from any PyArrow Schema considering possible partitions.""" columns_types: Dict[str, str] = {str(f.name): pyarrow2athena(dtype=f.type) for f in schema} - _logger.debug(f"columns_types: {columns_types}") + _logger.debug("columns_types: %s", columns_types) partitions_types: Optional[Dict[str, str]] = None if partitions is not None: partitions_types = {p.name: pyarrow2athena(p.dictionary.type) for p in partitions} - _logger.debug(f"partitions_types: {partitions_types}") + _logger.debug("partitions_types: %s", partitions_types) return columns_types, partitions_types @@ -382,5 +382,5 @@ def sqlalchemy_types_from_pandas( sqlalchemy_columns_types[k] = casts[k] else: sqlalchemy_columns_types[k] = pyarrow2sqlalchemy(dtype=v, db_type=db_type) - _logger.debug(f"sqlalchemy_columns_types: {sqlalchemy_columns_types}") + _logger.debug("sqlalchemy_columns_types: %s", sqlalchemy_columns_types) return sqlalchemy_columns_types diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 4948f56dc..bd5c7cb35 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -176,8 +176,8 @@ def wait_query(query_execution_id: str, boto3_session: Optional[boto3.Session] = time.sleep(_QUERY_WAIT_POLLING_DELAY) response = client_athena.get_query_execution(QueryExecutionId=query_execution_id) state = response["QueryExecution"]["Status"]["State"] - _logger.debug(f"state: {state}") - _logger.debug(f"StateChangeReason: {response['QueryExecution']['Status'].get('StateChangeReason')}") + _logger.debug("state: %s", state) + _logger.debug("StateChangeReason: %s", response["QueryExecution"]["Status"].get("StateChangeReason")) if state == "FAILED": raise exceptions.QueryFailed(response["QueryExecution"]["Status"].get("StateChangeReason")) if state == "CANCELLED": @@ -265,7 +265,7 @@ def _get_query_metadata( cols_types: Dict[str, str] = get_query_columns_types( query_execution_id=query_execution_id, boto3_session=boto3_session ) - _logger.debug(f"cols_types: {cols_types}") + _logger.debug("cols_types: %s", cols_types) dtype: Dict[str, str] = {} parse_timestamps: List[str] = [] parse_dates: List[str] = [] @@ -298,11 +298,11 @@ def _get_query_metadata( converters[col_name] = lambda x: Decimal(str(x)) if str(x) not in ("", "none", " ", "") else None else: dtype[col_name] = pandas_type - _logger.debug(f"dtype: {dtype}") - _logger.debug(f"parse_timestamps: {parse_timestamps}") - _logger.debug(f"parse_dates: {parse_dates}") - _logger.debug(f"converters: {converters}") - _logger.debug(f"binaries: {binaries}") + _logger.debug("dtype: %s", dtype) + _logger.debug("parse_timestamps: %s", parse_timestamps) + _logger.debug("parse_dates: %s", parse_dates) + _logger.debug("converters: %s", converters) + _logger.debug("binaries: %s", binaries) return dtype, parse_timestamps, parse_dates, converters, binaries @@ -446,7 +446,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals f") AS\n" f"{sql}" ) - _logger.debug(f"sql: {sql}") + _logger.debug("sql: %s", sql) query_id: str = start_query_execution( sql=sql, database=database, @@ -456,7 +456,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals kms_key=kms_key, boto3_session=session, ) - _logger.debug(f"query_id: {query_id}") + _logger.debug("query_id: %s", query_id) query_response: 
Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session) if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]: # pragma: no cover reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"] @@ -468,7 +468,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) chunked: Union[bool, int] = False if chunksize is None else chunksize - _logger.debug(f"chunked: {chunked}") + _logger.debug("chunked: %s", chunked) if not paths: if chunked is False: dfs = pd.DataFrame() @@ -485,9 +485,9 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals ) path = f"{_s3_output}/{query_id}.csv" s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session) - _logger.debug(f"Start CSV reading from {path}") + _logger.debug("Start CSV reading from %s", path) _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None - _logger.debug(f"_chunksize: {_chunksize}") + _logger.debug("_chunksize: %s", _chunksize) ret = s3.read_csv( path=[path], dtype=dtype, diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 8a53d4370..93092626b 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -766,7 +766,7 @@ def drop_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame: duplicated_cols = df.columns.duplicated() duplicated_cols_names: List[str] = list(df.columns[duplicated_cols]) if len(duplicated_cols_names) > 0: - _logger.warning(f"Dropping repeated columns: {duplicated_cols_names}") + _logger.warning("Dropping repeated columns: %s", duplicated_cols_names) return df.loc[:, ~duplicated_cols] @@ -967,11 +967,11 @@ def _create_table( if name in columns_comments: par["Comment"] = columns_comments[name] session: boto3.Session = _utils.ensure_session(session=boto3_session) - - if mode == "overwrite": + exist: bool = does_table_exist(database=database, table=table, boto3_session=session) + if (mode == "overwrite") or (exist is False): delete_table_if_exists(database=database, table=table, boto3_session=session) - client_glue: boto3.client = _utils.client(service_name="glue", session=session) - client_glue.create_table(DatabaseName=database, TableInput=table_input) + client_glue: boto3.client = _utils.client(service_name="glue", session=session) + client_glue.create_table(DatabaseName=database, TableInput=table_input) def _csv_table_definition( diff --git a/awswrangler/cloudwatch.py b/awswrangler/cloudwatch.py index e0a01f066..c36fab70b 100644 --- a/awswrangler/cloudwatch.py +++ b/awswrangler/cloudwatch.py @@ -56,11 +56,11 @@ def start_query( ... 
) """ - _logger.debug(f"log_group_names: {log_group_names}") + _logger.debug("log_group_names: %s", log_group_names) start_timestamp: int = int(1000 * start_time.timestamp()) end_timestamp: int = int(1000 * end_time.timestamp()) - _logger.debug(f"start_timestamp: {start_timestamp}") - _logger.debug(f"end_timestamp: {end_timestamp}") + _logger.debug("start_timestamp: %s", start_timestamp) + _logger.debug("end_timestamp: %s", end_timestamp) args: Dict[str, Any] = { "logGroupNames": log_group_names, "startTime": start_timestamp, @@ -109,7 +109,7 @@ def wait_query(query_id: str, boto3_session: Optional[boto3.Session] = None) -> time.sleep(_QUERY_WAIT_POLLING_DELAY) response = client_logs.get_query_results(queryId=query_id) status = response["status"] - _logger.debug(f"status: {status}") + _logger.debug("status: %s", status) if status == "Failed": # pragma: no cover raise exceptions.QueryFailed(f"query ID: {query_id}") if status == "Cancelled": diff --git a/awswrangler/db.py b/awswrangler/db.py index c00ccf1a8..21b4789c4 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -646,7 +646,7 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument athena_types, _ = s3.read_parquet_metadata( path=paths, dataset=False, use_threads=use_threads, boto3_session=session ) - _logger.debug(f"athena_types: {athena_types}") + _logger.debug("athena_types: %s", athena_types) redshift_types: Dict[str, str] = {} for col_name, col_type in athena_types.items(): length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default @@ -680,7 +680,7 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument def _rs_upsert(con: Any, table: str, temp_table: str, schema: str, primary_keys: Optional[List[str]] = None) -> None: if not primary_keys: primary_keys = _rs_get_primary_keys(con=con, schema=schema, table=table) - _logger.debug(f"primary_keys: {primary_keys}") + _logger.debug("primary_keys: %s", primary_keys) if not primary_keys: # pragma: no cover raise exceptions.InvalidRedshiftPrimaryKeys() equals_clause: str = f"{table}.%s = {temp_table}.%s" @@ -735,7 +735,7 @@ def _rs_create_table( f"{distkey_str}" f"{sortkey_str}" ) - _logger.debug(f"Create table query:\n{sql}") + _logger.debug("Create table query:\n%s", sql) con.execute(sql) return table, schema @@ -746,7 +746,7 @@ def _rs_validate_parameters( if diststyle not in _RS_DISTSTYLES: raise exceptions.InvalidRedshiftDiststyle(f"diststyle must be in {_RS_DISTSTYLES}") cols = list(redshift_types.keys()) - _logger.debug(f"Redshift columns: {cols}") + _logger.debug("Redshift columns: %s", cols) if (diststyle == "KEY") and (not distkey): raise exceptions.InvalidRedshiftDistkey("You must pass a distkey if you intend to use KEY diststyle") if distkey and distkey not in cols: @@ -775,13 +775,13 @@ def _rs_copy( sql: str = ( f"COPY {table_name} FROM '{manifest_path}'\n" f"IAM_ROLE '{iam_role}'\n" "MANIFEST\n" "FORMAT AS PARQUET" ) - _logger.debug(f"copy query:\n{sql}") + _logger.debug("copy query:\n%s", sql) con.execute(sql) sql = "SELECT pg_last_copy_id() AS query_id" query_id: int = con.execute(sql).fetchall()[0][0] sql = f"SELECT COUNT(DISTINCT filename) as num_files_loaded " f"FROM STL_LOAD_COMMITS WHERE query = {query_id}" num_files_loaded: int = con.execute(sql).fetchall()[0][0] - _logger.debug(f"{num_files_loaded} files counted. {num_files} expected.") + _logger.debug("%s files counted. 
%s expected.", num_files_loaded, num_files) if num_files_loaded != num_files: # pragma: no cover raise exceptions.RedshiftLoadError( f"Redshift load rollbacked. {num_files_loaded} files counted. {num_files} expected." @@ -846,17 +846,17 @@ def write_redshift_copy_manifest( payload: str = json.dumps(manifest) bucket: str bucket, key = _utils.parse_path(manifest_path) - _logger.debug(f"payload: {payload}") + _logger.debug("payload: %s", payload) client_s3: boto3.client = _utils.client(service_name="s3", session=session) - _logger.debug(f"bucket: {bucket}") - _logger.debug(f"key: {key}") + _logger.debug("bucket: %s", bucket) + _logger.debug("key: %s", key) client_s3.put_object(Body=payload, Bucket=bucket, Key=key) return manifest def _rs_drop_table(con: Any, schema: str, table: str) -> None: sql = f"DROP TABLE IF EXISTS {schema}.{table}" - _logger.debug(f"Drop table query:\n{sql}") + _logger.debug("Drop table query:\n%s", sql) con.execute(sql) @@ -1104,7 +1104,7 @@ def unload_redshift_to_files( query_id: int = _con.execute(sql).fetchall()[0][0] sql = f"SELECT path FROM STL_UNLOAD_LOG WHERE query={query_id};" paths = [x[0].replace(" ", "") for x in _con.execute(sql).fetchall()] - _logger.debug(f"paths: {paths}") + _logger.debug("paths: %s", paths) return paths diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 3801d340e..f3e505b00 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -364,7 +364,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["tags"] is not None: args["Tags"] = [{"Key": k, "Value": v} for k, v in pars["tags"].items()] - _logger.info(f"args: \n{json.dumps(args, default=str, indent=4)}") + _logger.info("args: \n%s", json.dumps(args, default=str, indent=4)) return args @@ -665,7 +665,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["JobFlowId"] @@ -696,7 +696,7 @@ def get_cluster_state(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_cluster(ClusterId=cluster_id) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["Cluster"]["Status"]["State"] @@ -723,7 +723,7 @@ def terminate_cluster(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.terminate_job_flows(JobFlowIds=[cluster_id]) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) def submit_steps( @@ -755,7 +755,7 @@ def submit_steps( """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=steps) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return 
response["StepIds"] @@ -807,7 +807,7 @@ def submit_step( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["StepIds"][0] @@ -898,7 +898,7 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["Step"]["Status"]["State"] @@ -942,7 +942,7 @@ def submit_ecr_credentials_refresh( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["StepIds"][0] diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f4be39359..770f588a7 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -56,10 +56,10 @@ def get_bucket_region(bucket: str, boto3_session: Optional[boto3.Session] = None """ client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - _logger.debug(f"bucket: {bucket}") + _logger.debug("bucket: %s", bucket) region: str = client_s3.get_bucket_location(Bucket=bucket)["LocationConstraint"] region = "us-east-1" if region is None else region - _logger.debug(f"region: {region}") + _logger.debug("region: %s", region) return region @@ -286,7 +286,7 @@ def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]: def _delete_objects(bucket: str, keys: List[str], client_s3: boto3.client) -> None: - _logger.debug(f"len(keys): {len(keys)}") + _logger.debug("len(keys): %s", len(keys)) batch: List[Dict[str, str]] = [{"Key": key} for key in keys] client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}) @@ -366,7 +366,7 @@ def _describe_object( break except botocore.exceptions.ClientError as e: # pragma: no cover if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404: # Not Found - _logger.debug(f"Object not found. {i} seconds remaining to wait.") + _logger.debug("Object not found. 
%s seconds remaining to wait.", i) if i == 1: # Last try, there is no more need to sleep break time.sleep(1) @@ -680,7 +680,7 @@ def to_csv( # pylint: disable=too-many-arguments sep=sep, ) if partitions_values: - _logger.debug(f"partitions_values:\n{partitions_values}") + _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_csv_partitions( database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep ) @@ -709,7 +709,7 @@ def _to_csv_dataset( if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - _logger.debug(f"dtypes: {df.dtypes}") + _logger.debug("dtypes: %s", df.dtypes) if not partition_cols: file_path: str = f"{path}{uuid.uuid4().hex}.csv" _to_text( @@ -1094,7 +1094,7 @@ def to_parquet( # pylint: disable=too-many-arguments mode="overwrite", ) if partitions_values: - _logger.debug(f"partitions_values:\n{partitions_values}") + _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_parquet_partitions( database=database, table=table, @@ -1132,7 +1132,7 @@ def _to_parquet_dataset( schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) - _logger.debug(f"schema: {schema}") + _logger.debug("schema: %s", schema) if not partition_cols: file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" _to_parquet_file( @@ -1180,7 +1180,7 @@ def _to_parquet_file( pyarrow_dtype = _data_types.athena2pyarrow(col_type) field = pa.field(name=col_name, type=pyarrow_dtype) table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype)) - _logger.debug(f"Casting column {col_name} ({col_index}) to {col_type} ({pyarrow_dtype})") + _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype) pyarrow.parquet.write_table( table=table, where=path, @@ -1508,7 +1508,7 @@ def _read_text_chunksize( ) -> Iterator[pd.DataFrame]: fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) for path in paths: - _logger.debug(f"path: {path}") + _logger.debug("path: %s", path) if pandas_args.get("compression", "infer") == "infer": pandas_args["compression"] = infer_compression(path, compression="infer") with fs.open(path, "rb") as f: @@ -1548,7 +1548,7 @@ def _read_parquet_init( path_or_paths = path[:-1] if path.endswith("/") else path else: path_or_paths = path - _logger.debug(f"path_or_paths: {path_or_paths}") + _logger.debug("path_or_paths: %s", path_or_paths) fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset( @@ -2245,12 +2245,12 @@ def merge_datasets( session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session) - _logger.debug(f"len(paths): {len(paths)}") + _logger.debug("len(paths): %s", len(paths)) if len(paths) < 1: return [] if mode == "overwrite": - _logger.debug(f"Deleting to overwrite: {target_path}/") + _logger.debug("Deleting to overwrite: %s/", target_path) delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session) elif mode == "overwrite_partitions": paths_wo_prefix: List[str] = 
[x.replace(f"{source_path}/", "") for x in paths] @@ -2258,7 +2258,7 @@ def merge_datasets( partitions_paths: List[str] = list(set(paths_wo_filename)) target_partitions_paths = [f"{target_path}/{x}" for x in partitions_paths] for path in target_partitions_paths: - _logger.debug(f"Deleting to overwrite_partitions: {path}") + _logger.debug("Deleting to overwrite_partitions: %s", path) delete_objects(path=path, use_threads=use_threads, boto3_session=session) elif mode != "append": raise exceptions.InvalidArgumentValue(f"{mode} is a invalid mode option.") @@ -2266,7 +2266,7 @@ def merge_datasets( new_objects: List[str] = copy_objects( paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session ) - _logger.debug(f"len(new_objects): {len(new_objects)}") + _logger.debug("len(new_objects): %s", len(new_objects)) return new_objects @@ -2313,7 +2313,7 @@ def copy_objects( ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] """ - _logger.debug(f"len(paths): {len(paths)}") + _logger.debug("len(paths): %s", len(paths)) if len(paths) < 1: return [] source_path = source_path[:-1] if source_path[-1] == "/" else source_path @@ -2326,13 +2326,13 @@ def copy_objects( path_final: str = f"{target_path}/{path_wo_prefix}" new_objects.append(path_final) batch.append((path, path_final)) - _logger.debug(f"len(new_objects): {len(new_objects)}") + _logger.debug("len(new_objects): %s", len(new_objects)) _copy_objects(batch=batch, use_threads=use_threads, boto3_session=session) return new_objects def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool, boto3_session: boto3.Session) -> None: - _logger.debug(f"len(batch): {len(batch)}") + _logger.debug("len(batch): %s", len(batch)) client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session) for source, target in batch: From 910e3b69a467df64a13a23e6e6f7c5c512e8d6aa Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 20:11:08 -0300 Subject: [PATCH 43/59] 100% test coverage on wr.torch --- .github/workflows/static-checking.yml | 8 ++------ awswrangler/torch.py | 18 ++++++++---------- testing/test_awswrangler/test_data_lake.py | 2 +- testing/test_awswrangler/test_torch.py | 7 +++++-- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index 63592d182..a23a74d99 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -24,12 +24,8 @@ jobs: uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install -r requirements-dev.txt - pip install -r requirements-torch.txt + - name: Setup Environment + run: ./setup-dev-env.sh - name: CloudFormation Lint run: cfn-lint -t testing/cloudformation.yaml - name: Documentation Lint diff --git a/awswrangler/torch.py b/awswrangler/torch.py index e7cd4518f..7d3c47316 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -4,7 +4,6 @@ import os import pathlib import re -import tarfile from collections.abc import Iterable from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Tuple, Union @@ -64,12 +63,12 @@ def _fetch_data(self, path: str) -> Any: def _load_data(data: io.BytesIO, path: str) -> Any: if path.endswith(".pt"): data = torch.load(data) - elif 
path.endswith(".tar.gz") or path.endswith(".tgz"): - tarfile.open(fileobj=data) + elif path.endswith(".tar.gz") or path.endswith(".tgz"): # pragma: no cover raise NotImplementedError("Tar loader not implemented!") + # tarfile.open(fileobj=data) # tar = tarfile.open(fileobj=data) # for member in tar.getmembers(): - else: + else: # pragma: no cover raise NotImplementedError() return data @@ -86,10 +85,10 @@ def __getitem__(self, index): def __len__(self): return len(self._paths) - def _data_fn(self, data) -> Any: + def _data_fn(self, data) -> Any: # pragma: no cover raise NotImplementedError() - def _label_fn(self, path: str) -> Any: + def _label_fn(self, path: str) -> Any: # pragma: no cover raise NotImplementedError() @@ -100,7 +99,7 @@ def _label_fn(self, path: str) -> torch.Tensor: label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) return torch.tensor([label]) # pylint: disable=not-callable - def _data_fn(self, data) -> Any: + def _data_fn(self, data) -> Any: # pragma: no cover raise NotImplementedError() @@ -383,9 +382,8 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, pass elif isinstance(data, Iterable) and all([isinstance(d, torch.Tensor) for d in data]): data = zip(*data) - else: + else: # pragma: no cover raise NotImplementedError(f"ERROR: Type: {type(data)} has not been implemented!") - for d in data: yield d @@ -436,7 +434,7 @@ def __init__( def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: """Iterate over the Dataset.""" if torch.utils.data.get_worker_info() is not None: # type: ignore - raise NotImplementedError() + raise NotImplementedError() # pragma: no cover db._validate_engine(con=self._con) # pylint: disable=protected-access with self._con.connect() as con: cursor: Any = con.execute(self._sql) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index a815cd388..94541d8e6 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -708,7 +708,7 @@ def test_parquet_validate_schema(bucket, database): df2 = pd.DataFrame({"id2": [1, 2, 3], "val": ["foo", "boo", "bar"]}) path_file2 = f"s3://{bucket}/test_parquet_file_validate/1.parquet" wr.s3.to_parquet(df=df2, path=path_file2) - wr.s3.wait_objects_exist(paths=[path_file2]) + wr.s3.wait_objects_exist(paths=[path_file2], use_threads=False) df3 = wr.s3.read_parquet(path=path, validate_schema=False) assert len(df3.index) == 6 assert len(df3.columns) == 3 diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 19a300400..6e8a3427d 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -84,7 +84,8 @@ def test_torch_sql(parameters, db_type, chunksize): @pytest.mark.parametrize("chunksize", [None, 1, 10]) @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) -def test_torch_sql_label(parameters, db_type, chunksize): +@pytest.mark.parametrize("label_col", [2, "c"]) +def test_torch_sql_label(parameters, db_type, chunksize, label_col): schema = parameters[db_type]["schema"] table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") @@ -99,7 +100,9 @@ def test_torch_sql_label(parameters, db_type, chunksize): chunksize=None, method=None, ) - ts = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize, label_col=2)) + ts = list( + 
wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize, label_col=label_col) + ) assert torch.all(ts[0][0].eq(torch.tensor([1.0, 4.0]))) assert torch.all(ts[0][1].eq(torch.tensor([7], dtype=torch.long))) assert torch.all(ts[1][0].eq(torch.tensor([2.0, 5.0]))) From b4f6a36d18699e540415c31b8ecf38eb7b418aac Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 14:31:56 -0300 Subject: [PATCH 44/59] Revisiting Athena encryption and workgroup #201 --- awswrangler/athena.py | 111 +++++++++++++++------ awswrangler/emr.py | 18 ++-- testing/test_awswrangler/test_data_lake.py | 102 +++++++++++++++++-- testing/test_awswrangler/test_torch.py | 8 +- tox.ini | 4 +- 5 files changed, 192 insertions(+), 51 deletions(-) diff --git a/awswrangler/athena.py b/awswrangler/athena.py index bd5c7cb35..76cb0a108 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -2,6 +2,7 @@ import csv import logging +import pprint import time from decimal import Decimal from typing import Any, Dict, Iterator, List, Optional, Tuple, Union @@ -120,19 +121,49 @@ def start_query_execution( >>> query_exec_id = wr.athena.start_query_execution(sql='...', database='...') """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup) + return _start_query_execution( + sql=sql, + wg_config=wg_config, + database=database, + s3_output=s3_output, + workgroup=workgroup, + encryption=encryption, + kms_key=kms_key, + boto3_session=session, + ) + + +def _start_query_execution( + sql: str, + wg_config: Dict[str, Union[Optional[bool], Optional[str]]], + database: Optional[str] = None, + s3_output: Optional[str] = None, + workgroup: Optional[str] = None, + encryption: Optional[str] = None, + kms_key: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: args: Dict[str, Any] = {"QueryString": sql} session: boto3.Session = _utils.ensure_session(session=boto3_session) # s3_output - if s3_output is None: # pragma: no cover - s3_output = create_athena_bucket(boto3_session=session) - args["ResultConfiguration"] = {"OutputLocation": s3_output} + args["ResultConfiguration"] = { + "OutputLocation": _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session) + } # encryption - if encryption is not None: - args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": encryption} - if kms_key is not None: - args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = kms_key + if wg_config["enforced"] is True: + if wg_config["encryption"] is not None: + args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": wg_config["encryption"]} + if wg_config["kms_key"] is not None: + args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = wg_config["kms_key"] + else: + if encryption is not None: + args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": encryption} + if kms_key is not None: + args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = kms_key # database if database is not None: @@ -143,10 +174,25 @@ def start_query_execution( args["WorkGroup"] = workgroup client_athena: boto3.client = _utils.client(service_name="athena", session=session) + _logger.debug("args: \n%s", pprint.pformat(args)) response = client_athena.start_query_execution(**args) return response["QueryExecutionId"] +def _get_s3_output( + s3_output: Optional[str], wg_config: Dict[str, 
Union[bool, Optional[str]]], boto3_session: boto3.Session +) -> str: + if s3_output is None: + _s3_output: Optional[str] = wg_config["s3_output"] # type: ignore + if _s3_output is not None: + s3_output = _s3_output + else: + s3_output = create_athena_bucket(boto3_session=boto3_session) + elif wg_config["enforced"] is True: + s3_output = wg_config["s3_output"] # type: ignore + return s3_output + + def wait_query(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]: """Wait for the query end. @@ -355,12 +401,14 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals Note ---- - If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, - but it still useful to overcome memory limitation. + Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS']. + + `P.S. 'CSE_KMS' is not supported.` Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. + (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Note @@ -403,9 +451,9 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals workgroup : str, optional Athena workgroup. encryption : str, optional - None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'. + Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported. kms_key : str, optional - For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID. + For SSE-KMS, this is the KMS key ARN or ID. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. @@ -424,31 +472,27 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals """ session: boto3.Session = _utils.ensure_session(session=boto3_session) - wg_s3_output, _, _ = _ensure_workgroup(session=session, workgroup=workgroup) - if s3_output is None: - if wg_s3_output is None: - _s3_output: str = create_athena_bucket(boto3_session=session) - else: - _s3_output = wg_s3_output - else: - _s3_output = s3_output + wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup) + _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session) _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output name: str = "" if ctas_approach is True: name = f"temp_table_{pa.compat.guid()}" path: str = f"{_s3_output}/{name}" + ext_location: str = "\n" if wg_config["enforced"] is True else f",\n external_location = '{path}'\n" sql = ( f"CREATE TABLE {name}\n" f"WITH(\n" f" format = 'Parquet',\n" - f" parquet_compression = 'SNAPPY',\n" - f" external_location = '{path}'\n" + f" parquet_compression = 'SNAPPY'" + f"{ext_location}" f") AS\n" f"{sql}" ) _logger.debug("sql: %s", sql) - query_id: str = start_query_execution( + query_id: str = _start_query_execution( sql=sql, + wg_config=wg_config, database=database, s3_output=_s3_output, workgroup=workgroup, @@ -466,6 +510,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals if ctas_approach is True: catalog.delete_table_if_exists(database=database, table=name, boto3_session=session) manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" + _logger.debug("manifest_path: %s", manifest_path) paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug("chunked: %s", chunked) @@ -560,19 +605,27 @@ def get_work_group(workgroup: str, boto3_session: 
Optional[boto3.Session] = None return client_athena.get_work_group(WorkGroup=workgroup) -def _ensure_workgroup( +def _get_workgroup_config( session: boto3.Session, workgroup: Optional[str] = None -) -> Tuple[Optional[str], Optional[str], Optional[str]]: +) -> Dict[str, Union[bool, Optional[str]]]: if workgroup is not None: res: Dict[str, Any] = get_work_group(workgroup=workgroup, boto3_session=session) + enforced: bool = res["WorkGroup"]["Configuration"]["EnforceWorkGroupConfiguration"] config: Dict[str, Any] = res["WorkGroup"]["Configuration"]["ResultConfiguration"] wg_s3_output: Optional[str] = config.get("OutputLocation") encrypt_config: Optional[Dict[str, str]] = config.get("EncryptionConfiguration") wg_encryption: Optional[str] = None if encrypt_config is None else encrypt_config.get("EncryptionOption") wg_kms_key: Optional[str] = None if encrypt_config is None else encrypt_config.get("KmsKey") else: - wg_s3_output, wg_encryption, wg_kms_key = None, None, None - return wg_s3_output, wg_encryption, wg_kms_key + enforced, wg_s3_output, wg_encryption, wg_kms_key = False, None, None, None + wg_config: Dict[str, Union[bool, Optional[str]]] = { + "enforced": enforced, + "s3_output": wg_s3_output, + "encryption": wg_encryption, + "kms_key": wg_kms_key, + } + _logger.debug("wg_config: \n%s", pprint.pformat(wg_config)) + return wg_config def read_sql_table( @@ -606,12 +659,14 @@ def read_sql_table( Note ---- - If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, - but it still useful to overcome memory limitation. + Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS']. + + `P.S. 'CSE_KMS' is not supported.` Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. + (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Note diff --git a/awswrangler/emr.py b/awswrangler/emr.py index f3e505b00..5a93d752d 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -1,8 +1,8 @@ """EMR (Elastic Map Reduce) module.""" # pylint: disable=line-too-long -import json import logging +import pprint from typing import Any, Dict, List, Optional, Union import boto3 # type: ignore @@ -364,7 +364,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["tags"] is not None: args["Tags"] = [{"Key": k, "Value": v} for k, v in pars["tags"].items()] - _logger.info("args: \n%s", json.dumps(args, default=str, indent=4)) + _logger.debug("args: \n%s", pprint.pformat(args)) return args @@ -665,7 +665,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["JobFlowId"] @@ -696,7 +696,7 @@ def get_cluster_state(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_cluster(ClusterId=cluster_id) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["Cluster"]["Status"]["State"] @@ -723,7 +723,7 @@ def terminate_cluster(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = 
_utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.terminate_job_flows(JobFlowIds=[cluster_id]) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) def submit_steps( @@ -755,7 +755,7 @@ def submit_steps( """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=steps) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["StepIds"] @@ -807,7 +807,7 @@ def submit_step( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["StepIds"][0] @@ -898,7 +898,7 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["Step"]["Status"]["State"] @@ -942,7 +942,7 @@ def submit_ecr_credentials_refresh( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["StepIds"][0] diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 94541d8e6..b05fb0881 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -74,10 +74,7 @@ def workgroup0(bucket): client.create_work_group( Name=wkg_name, Configuration={ - "ResultConfiguration": { - "OutputLocation": f"s3://{bucket}/athena_workgroup0/", - "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"}, - }, + "ResultConfiguration": {"OutputLocation": f"s3://{bucket}/athena_workgroup0/"}, "EnforceWorkGroupConfiguration": True, "PublishCloudWatchMetricsEnabled": True, "BytesScannedCutoffPerQuery": 100_000_000, @@ -98,7 +95,10 @@ def workgroup1(bucket): client.create_work_group( Name=wkg_name, Configuration={ - "ResultConfiguration": {"OutputLocation": f"s3://{bucket}/athena_workgroup1/"}, + "ResultConfiguration": { + "OutputLocation": f"s3://{bucket}/athena_workgroup1/", + "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"}, + }, "EnforceWorkGroupConfiguration": True, "PublishCloudWatchMetricsEnabled": True, "BytesScannedCutoffPerQuery": 100_000_000, @@ -109,7 +109,57 @@ def workgroup1(bucket): yield wkg_name +@pytest.fixture(scope="module") +def workgroup2(bucket, kms_key): + wkg_name = "awswrangler_test_2" + client = boto3.client("athena") + wkgs = client.list_work_groups() + wkgs = [x["Name"] for x in wkgs["WorkGroups"]] + if wkg_name not in wkgs: + client.create_work_group( + Name=wkg_name, + Configuration={ + "ResultConfiguration": { + "OutputLocation": f"s3://{bucket}/athena_workgroup2/", + "EncryptionConfiguration": 
{"EncryptionOption": "SSE_KMS", "KmsKey": kms_key}, + }, + "EnforceWorkGroupConfiguration": False, + "PublishCloudWatchMetricsEnabled": True, + "BytesScannedCutoffPerQuery": 100_000_000, + "RequesterPaysEnabled": False, + }, + Description="AWS Data Wrangler Test WorkGroup Number 2", + ) + yield wkg_name + + +@pytest.fixture(scope="module") +def workgroup3(bucket, kms_key): + wkg_name = "awswrangler_test_3" + client = boto3.client("athena") + wkgs = client.list_work_groups() + wkgs = [x["Name"] for x in wkgs["WorkGroups"]] + if wkg_name not in wkgs: + client.create_work_group( + Name=wkg_name, + Configuration={ + "ResultConfiguration": { + "OutputLocation": f"s3://{bucket}/athena_workgroup3/", + "EncryptionConfiguration": {"EncryptionOption": "SSE_KMS", "KmsKey": kms_key}, + }, + "EnforceWorkGroupConfiguration": True, + "PublishCloudWatchMetricsEnabled": True, + "BytesScannedCutoffPerQuery": 100_000_000, + "RequesterPaysEnabled": False, + }, + Description="AWS Data Wrangler Test WorkGroup Number 3", + ) + yield wkg_name + + def test_athena_ctas(bucket, database, kms_key): + wr.s3.delete_objects(path=f"s3://{bucket}/test_athena_ctas/") + wr.s3.delete_objects(path=f"s3://{bucket}/test_athena_ctas_result/") df = get_df_list() columns_types, partitions_types = wr.catalog.extract_athena_types(df=df, partition_cols=["par0", "par1"]) assert len(columns_types) == 16 @@ -256,13 +306,12 @@ def test_fwf(bucket): def test_parquet(bucket): - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_file") - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_dataset") + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") df_file = pd.DataFrame({"id": [1, 2, 3]}) - path_file = f"s3://{bucket}/test_parquet_file.parquet" + path_file = f"s3://{bucket}/test_parquet/test_parquet_file.parquet" df_dataset = pd.DataFrame({"id": [1, 2, 3], "partition": ["A", "A", "B"]}) df_dataset["partition"] = df_dataset["partition"].astype("category") - path_dataset = f"s3://{bucket}/test_parquet_dataset" + path_dataset = f"s3://{bucket}/test_parquet/test_parquet_dataset" with pytest.raises(wr.exceptions.InvalidArgumentCombination): wr.s3.to_parquet(df=df_file, path=path_file, mode="append") with pytest.raises(wr.exceptions.InvalidCompression): @@ -292,8 +341,7 @@ def test_parquet(bucket): wr.s3.to_parquet( df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite_partitions" ) - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_file") - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_dataset") + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") def test_parquet_catalog(bucket, database): @@ -1123,3 +1171,35 @@ def test_parquet_chunked(bucket, database, col2, chunked): wr.s3.delete_objects(path=paths) assert wr.catalog.delete_table_if_exists(database=database, table=table) is True + + +@pytest.mark.parametrize("workgroup", [None, 0, 1, 2, 3]) +@pytest.mark.parametrize("encryption", [None, "SSE_S3", "SSE_KMS"]) +def test_athena_encryption( + bucket, database, kms_key, encryption, workgroup, workgroup0, workgroup1, workgroup2, workgroup3 +): + kms_key = None if (encryption == "SSE_S3") or (encryption is None) else kms_key + if workgroup == 0: + workgroup = workgroup0 + elif workgroup == 1: + workgroup = workgroup1 + elif workgroup == 2: + workgroup = workgroup2 + elif workgroup == 3: + workgroup = workgroup3 + table = f"test_athena_encryption_{str(encryption).lower()}_{str(workgroup).lower()}" + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + df = 
pd.DataFrame({"a": [1, 2], "b": ["foo", "boo"]}) + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, mode="overwrite", database=database, table=table, s3_additional_kwargs=None + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df2 = wr.athena.read_sql_table( + table=table, ctas_approach=True, database=database, encryption=encryption, workgroup=workgroup, kms_key=kms_key + ) + print(df2) + assert len(df2.index) == 2 + assert len(df2.columns) == 2 + wr.catalog.delete_table_if_exists(database=database, table=table) + wr.s3.delete_objects(path=paths) diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 6e8a3427d..a19dd64b5 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -87,7 +87,7 @@ def test_torch_sql(parameters, db_type, chunksize): @pytest.mark.parametrize("label_col", [2, "c"]) def test_torch_sql_label(parameters, db_type, chunksize, label_col): schema = parameters[db_type]["schema"] - table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}" + table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}_{label_col}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") wr.db.to_sql( df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0], "c": [7, 8, 9]}), @@ -123,6 +123,7 @@ def test_torch_image_s3(bucket): Key=f"{folder}/class={ref_label}/logo.png", ContentType="image/png", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/class={ref_label}/logo.png"]) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) image, label = ds[0] assert image.shape == torch.Size([4, 494, 1636]) @@ -144,6 +145,7 @@ def test_torch_image_s3_loader(bucket, drop_last): Key=f"{folder}/class={label}/logo{i}.png", ContentType="image/png", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/class={label}/logo{i}.png"]) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) batch_size = 2 num_train = len(ds) @@ -172,6 +174,7 @@ def test_torch_lambda_s3(bucket): Key=f"test_torch_lambda_s3/class={ref_label}/logo.png", ContentType="image/png", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/test_torch_lambda_s3/class={ref_label}/logo.png"]) ds = wr.torch.LambdaS3Dataset( path=path, suffix="png", @@ -201,6 +204,7 @@ def test_torch_audio_s3(bucket): Key=f"{folder}/class={ref_label}/amazing_sound.wav", ContentType="audio/wav", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/class={ref_label}/amazing_sound.wav"]) s3_audio_file = f"{bucket}/test_torch_audio_s3/class={ref_label}/amazing_sound.wav" ds = wr.torch.AudioS3Dataset(path=s3_audio_file, suffix="wav") loader = DataLoader(ds, batch_size=1) @@ -234,6 +238,7 @@ def test_torch_s3_iterable(bucket, drop_last): torch.save(batch, buff) buff.seek(0) client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/file{i}.pt"]) for image in DataLoader( wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), batch_size=batch_size, drop_last=drop_last @@ -259,6 +264,7 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): torch.save(batch, buff) buff.seek(0) client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/file{i}.pt"]) for images, labels in DataLoader( wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), 
batch_size=batch_size, drop_last=drop_last diff --git a/tox.ini b/tox.ini index f2bb572c2..cbede50a3 100644 --- a/tox.ini +++ b/tox.ini @@ -8,11 +8,11 @@ deps = moto -rrequirements-torch.txt commands = - pytest -n 8 testing/test_awswrangler + pytest -n 16 testing/test_awswrangler [testenv:py36] deps = {[testenv]deps} pytest-cov commands = - pytest --cov=awswrangler -n 8 testing/test_awswrangler + pytest --cov=awswrangler -n 16 testing/test_awswrangler From 2a26d4f79e77918ee9c13f5232506cc421eedb42 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:34:27 -0300 Subject: [PATCH 45/59] Decrease tox parallelism --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index cbede50a3..f2bb572c2 100644 --- a/tox.ini +++ b/tox.ini @@ -8,11 +8,11 @@ deps = moto -rrequirements-torch.txt commands = - pytest -n 16 testing/test_awswrangler + pytest -n 8 testing/test_awswrangler [testenv:py36] deps = {[testenv]deps} pytest-cov commands = - pytest --cov=awswrangler -n 16 testing/test_awswrangler + pytest --cov=awswrangler -n 8 testing/test_awswrangler From 5298aaf84a629c099aaf4a2f2af0de4a534c409f Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:35:08 -0300 Subject: [PATCH 46/59] Add kms_key_id, max_file_size and region to Redshift Unload --- awswrangler/db.py | 67 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index 21b4789c4..b695bdd17 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -2,6 +2,7 @@ import json import logging +import time from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import quote_plus @@ -91,7 +92,16 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs) -> ) pandas_kwargs["dtype"] = dtypes pandas_kwargs["con"] = con - df.to_sql(**pandas_kwargs) + max_attempts: int = 3 + for attempt in range(max_attempts): + try: + df.to_sql(**pandas_kwargs) + except sqlalchemy.exc.InternalError as ex: # pragma: no cover + if attempt == (max_attempts - 1): + raise ex + time.sleep(1) + else: + break def read_sql_query( @@ -887,6 +897,9 @@ def unload_redshift( path: str, con: sqlalchemy.engine.Engine, iam_role: str, + region: Optional[str] = None, + max_file_size: Optional[float] = None, + kms_key_id: Optional[str] = None, categories: List[str] = None, chunked: Union[bool, int] = False, keep_files: bool = False, @@ -937,6 +950,19 @@ def unload_redshift( wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() iam_role : str AWS IAM role with the related permissions. + region : str, optional + Specifies the AWS Region where the target Amazon S3 bucket is located. + REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the + same AWS Region as the Amazon Redshift cluster. By default, UNLOAD + assumes that the target Amazon S3 bucket is located in the same AWS + Region as the Amazon Redshift cluster. + max_file_size : float, optional + Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3. + Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default + maximum file size is 6200.0 MB. + kms_key_id : str, optional + Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be + used to encrypt data files on Amazon S3. categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. 
@@ -973,7 +999,15 @@ def unload_redshift( """ session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = unload_redshift_to_files( - sql=sql, path=path, con=con, iam_role=iam_role, use_threads=use_threads, boto3_session=session + sql=sql, + path=path, + con=con, + iam_role=iam_role, + region=region, + max_file_size=max_file_size, + kms_key_id=kms_key_id, + use_threads=use_threads, + boto3_session=session, ) s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) if chunked is False: @@ -1032,6 +1066,9 @@ def unload_redshift_to_files( path: str, con: sqlalchemy.engine.Engine, iam_role: str, + region: Optional[str] = None, + max_file_size: Optional[float] = None, + kms_key_id: Optional[str] = None, use_threads: bool = True, manifest: bool = False, partition_cols: Optional[List] = None, @@ -1056,6 +1093,19 @@ def unload_redshift_to_files( wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() iam_role : str AWS IAM role with the related permissions. + region : str, optional + Specifies the AWS Region where the target Amazon S3 bucket is located. + REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the + same AWS Region as the Amazon Redshift cluster. By default, UNLOAD + assumes that the target Amazon S3 bucket is located in the same AWS + Region as the Amazon Redshift cluster. + max_file_size : float, optional + Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3. + Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default + maximum file size is 6200.0 MB. + kms_key_id : str, optional + Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be + used to encrypt data files on Amazon S3. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. 
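For context, the new UNLOAD options documented above combine roughly as in the following minimal sketch; the bucket, Glue connection name, IAM role ARN, and KMS key ID below are placeholders and do not come from this patch:

    import awswrangler as wr

    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")  # placeholder connection name
    df = wr.db.unload_redshift(
        sql="SELECT * FROM public.my_table",  # placeholder query
        path="s3://my-bucket/unload/",  # placeholder S3 prefix
        con=engine,
        iam_role="arn:aws:iam::123456789012:role/my-redshift-role",  # placeholder IAM role
        region="us-east-2",  # UNLOAD to a bucket outside the cluster's Region
        max_file_size=100.0,  # cap each unloaded file at roughly 100 MB
        kms_key_id="1234abcd-12ab-34cd-56ef-1234567890ab",  # placeholder KMS key ID
        keep_files=False,  # remove the unloaded Parquet files after reading
    )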
@@ -1086,19 +1136,26 @@ def unload_redshift_to_files( session: boto3.Session = _utils.ensure_session(session=boto3_session) s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session) with con.connect() as _con: - partition_str: str = f"PARTITION BY ({','.join(partition_cols)})\n" if partition_cols else "" + partition_str: str = f"\nPARTITION BY ({','.join(partition_cols)})" if partition_cols else "" manifest_str: str = "\nmanifest" if manifest is True else "" + region_str: str = f"\nREGION AS '{region}'" if region is not None else "" + max_file_size_str: str = f"\nMAXFILESIZE AS {max_file_size} MB" if max_file_size is not None else "" + kms_key_id_str: str = f"\nKMS_KEY_ID '{kms_key_id}'" if kms_key_id is not None else "" sql = ( f"UNLOAD ('{sql}')\n" f"TO '{path}'\n" f"IAM_ROLE '{iam_role}'\n" "ALLOWOVERWRITE\n" "PARALLEL ON\n" - "ENCRYPTED\n" + "FORMAT PARQUET\n" + "ENCRYPTED" + f"{kms_key_id_str}" f"{partition_str}" - "FORMAT PARQUET" + f"{region_str}" + f"{max_file_size_str}" f"{manifest_str};" ) + _logger.debug("sql: \n%s", sql) _con.execute(sql) sql = "SELECT pg_last_query_id() AS query_id" query_id: int = _con.execute(sql).fetchall()[0][0] From d4b27c6fc950523b095d169ff6704c92933dcc97 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:35:32 -0300 Subject: [PATCH 47/59] Add KMS permission to Redshift Role --- testing/cloudformation.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/testing/cloudformation.yaml b/testing/cloudformation.yaml index d709f1861..1b4b90f08 100644 --- a/testing/cloudformation.yaml +++ b/testing/cloudformation.yaml @@ -96,6 +96,15 @@ Resources: PolicyDocument: Version: 2012-10-17 Statement: + - Effect: Allow + Action: + - kms:Encrypt + - kms:Decrypt + - kms:GenerateDataKey + Resource: + - Fn::GetAtt: + - KmsKey + - Arn - Effect: Allow Action: - s3:Get* From 924b0bb624bfef949d559f0e68d5724645ae8394 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:35:53 -0300 Subject: [PATCH 48/59] Add Redshift tests --- testing/test_awswrangler/test_db.py | 74 +++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index adcacf4a4..65f5ad15b 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -76,6 +76,11 @@ def external_schema(cloudformation_outputs, parameters, glue_database): yield "aws_data_wrangler_external" +@pytest.fixture(scope="module") +def kms_key_id(cloudformation_outputs): + yield cloudformation_outputs["KmsKeyArn"].split("/", 1)[1] + + @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_sql(parameters, db_type): df = get_df() @@ -386,3 +391,72 @@ def test_redshift_category(bucket, parameters): for df2 in dfs: ensure_data_types_category(df2) wr.s3.delete_objects(path=path) + + +def test_redshift_unload_extras(bucket, parameters, kms_key_id): + table = "test_redshift_unload_extras" + schema = parameters["redshift"]["schema"] + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-redshift") + df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]}) + wr.db.to_sql(df=df, con=engine, name=table, schema=schema, if_exists="replace", index=False) + paths = wr.db.unload_redshift_to_files( + sql=f"SELECT * FROM {schema}.{table}", + path=path, + con=engine, + iam_role=parameters["redshift"]["role"], + region=wr.s3.get_bucket_region(bucket), + max_file_size=5.0, + 
kms_key_id=kms_key_id, + partition_cols=["name"], + ) + wr.s3.wait_objects_exist(paths=paths) + df = wr.s3.read_parquet(path=path, dataset=True) + assert len(df.index) == 2 + assert len(df.columns) == 2 + wr.s3.delete_objects(path=path) + df = wr.db.unload_redshift( + sql=f"SELECT * FROM {schema}.{table}", + con=engine, + iam_role=parameters["redshift"]["role"], + path=path, + keep_files=False, + region=wr.s3.get_bucket_region(bucket), + max_file_size=5.0, + kms_key_id=kms_key_id, + ) + assert len(df.index) == 2 + assert len(df.columns) == 2 + wr.s3.delete_objects(path=path) + + +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) +def test_to_sql_cast(parameters, db_type): + table = "test_to_sql_cast" + schema = parameters[db_type]["schema"] + df = pd.DataFrame( + { + "col": [ + "".join([str(i)[-1] for i in range(1_024)]), + "".join([str(i)[-1] for i in range(1_024)]), + "".join([str(i)[-1] for i in range(1_024)]), + ] + }, + dtype="string", + ) + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + dtype={"col": sqlalchemy.types.VARCHAR(length=1_024)}, + ) + df2 = wr.db.read_sql_query(sql=f"SELECT * FROM {schema}.{table}", con=engine) + assert df.equals(df2) From ad22aea48ee721ba48bc2c31beb64de994214373 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:37:12 -0300 Subject: [PATCH 49/59] Insignificant fix in _data_types.py --- awswrangler/_data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 947b058b0..01237ea49 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -372,7 +372,7 @@ def sqlalchemy_types_from_pandas( df: pd.DataFrame, db_type: str, dtype: Optional[Dict[str, VisitableType]] = None ) -> Dict[str, VisitableType]: """Extract the related SQLAlchemy data types from any Pandas DataFrame.""" - casts: Dict[str, VisitableType] = dtype if dtype else {} + casts: Dict[str, VisitableType] = dtype if dtype is not None else {} pa_columns_types: Dict[str, Optional[pa.DataType]] = pyarrow_types_from_pandas( df=df, index=False, ignore_cols=list(casts.keys()) ) From 0e068fe76612cfa4fd72ee88befb26f6dff6dab8 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:38:02 -0300 Subject: [PATCH 50/59] Parquet chunksize now paginating on Pandas instead of PyArrow --- awswrangler/s3.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 770f588a7..c0eb71c3f 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1132,7 +1132,7 @@ def _to_parquet_dataset( schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) - _logger.debug("schema: %s", schema) + _logger.debug("schema: \n%s", schema) if not partition_cols: file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" _to_parquet_file( @@ -1733,24 +1733,32 @@ def _read_parquet_chunked( use_threads: bool = True, ) -> Iterator[pd.DataFrame]: promote: bool = not validate_schema - next_slice: Optional[pa.Table] = None + next_slice: Optional[pd.DataFrame] = None for piece in data.pieces: - table: pa.Table = piece.read( - columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False + df: pd.DataFrame = 
_table2df( + table=piece.read( + columns=columns, + use_threads=use_threads, + partitions=data.partitions, + use_pandas_metadata=False + ), + categories=categories, + use_threads=use_threads ) if chunked is True: - yield _table2df(table=table, categories=categories, use_threads=use_threads) + yield df else: - if next_slice: - table = pa.lib.concat_tables([next_slice, table], promote=promote) - while len(table) >= chunked: - yield _table2df( - table=table.slice(offset=0, length=chunked), categories=categories, use_threads=use_threads - ) - table = table.slice(offset=chunked, length=None) - next_slice = table - if next_slice: - yield _table2df(table=next_slice, categories=categories, use_threads=use_threads) + if next_slice is not None: + df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False) + while len(df.index) >= chunked: + yield df.iloc[:chunked] + df = df.iloc[chunked:] + if df.empty: + next_slice = None + else: + next_slice = df + if next_slice is not None: + yield next_slice def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: From ca133a0248c5977c9e72d6974e5c4a1f681e3ab2 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:48:45 -0300 Subject: [PATCH 51/59] Linting --- awswrangler/s3.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index c0eb71c3f..a8512f0b5 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1688,12 +1688,7 @@ def read_parquet( data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema ) return _read_parquet_chunked( - data=data, - columns=columns, - categories=categories, - chunked=chunked, - use_threads=use_threads, - validate_schema=validate_schema, + data=data, columns=columns, categories=categories, chunked=chunked, use_threads=use_threads ) @@ -1728,22 +1723,17 @@ def _read_parquet_chunked( data: pyarrow.parquet.ParquetDataset, columns: Optional[List[str]] = None, categories: List[str] = None, - validate_schema: bool = True, chunked: Union[bool, int] = True, use_threads: bool = True, ) -> Iterator[pd.DataFrame]: - promote: bool = not validate_schema next_slice: Optional[pd.DataFrame] = None for piece in data.pieces: df: pd.DataFrame = _table2df( table=piece.read( - columns=columns, - use_threads=use_threads, - partitions=data.partitions, - use_pandas_metadata=False + columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False ), categories=categories, - use_threads=use_threads + use_threads=use_threads, ) if chunked is True: yield df From e8660cb6e6414dacca6f966b450133367d9de6af Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 2 May 2020 18:03:50 -0300 Subject: [PATCH 52/59] Bumping dependencies versions --- requirements-dev.txt | 4 ++-- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index bfdd15c5e..81b576472 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,13 +5,13 @@ mypy~=0.770 isort~=4.3.21 pydocstyle~=5.0.2 doc8~=0.8.0 -tox~=3.14.6 +tox~=3.15.0 pytest~=5.4.1 pytest-cov~=2.8.1 pytest-xdist~=1.31.0 scikit-learn~=0.22.1 awscli>=1.18.22 -cfn-lint~=0.29.6 +cfn-lint~=0.30.1 cfn-flip~=1.2.3 twine~=3.1.1 wheel~=0.34.2 diff --git a/requirements.txt b/requirements.txt index ec72c05b7..9c1013d22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy~=1.18.1 pandas~=1.0.3 -pyarrow~=0.16.0 
+pyarrow~=0.17.0 boto3>=1.12.22 botocore>=1.15.22 s3fs~=0.4.2 From b484ae1c8bf54efd3122c4add35098c266512e10 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 3 May 2020 18:11:10 -0300 Subject: [PATCH 53/59] Add support for query UUID columns on PostgreSQL and full NULL columns for all databases. --- testing/test_awswrangler/test_db.py | 66 +++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index 65f5ad15b..86a57a74d 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -460,3 +460,69 @@ def test_to_sql_cast(parameters, db_type): ) df2 = wr.db.read_sql_query(sql=f"SELECT * FROM {schema}.{table}", con=engine) assert df.equals(df2) + + +def test_uuid(parameters): + table = "test_uuid" + schema = parameters["postgresql"]["schema"] + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-postgresql") + df = pd.DataFrame( + { + "id": [1, 2, 3], + "uuid": [ + "ec0f0482-8d3b-11ea-8b27-8c859043dd95", + "f56ff7c0-8d3b-11ea-be94-8c859043dd95", + "fa043e90-8d3b-11ea-b7e7-8c859043dd95", + ], + } + ) + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + dtype={"uuid": sqlalchemy.dialects.postgresql.UUID}, + ) + df2 = wr.db.read_sql_table(table=table, schema=schema, con=engine) + df["id"] = df["id"].astype("Int64") + df["uuid"] = df["uuid"].astype("string") + assert df.equals(df2) + + +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) +def test_null(parameters, db_type): + table = "test_null" + schema = parameters[db_type]["schema"] + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + df = pd.DataFrame({"id": [1, 2, 3], "nothing": [None, None, None]}) + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + dtype={"nothing": sqlalchemy.types.Integer}, + ) + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="append", + index=False, + index_label=None, + chunksize=None, + method=None, + ) + df2 = wr.db.read_sql_table(table=table, schema=schema, con=engine) + df["id"] = df["id"].astype("Int64") + assert pd.concat(objs=[df, df], ignore_index=True).equals(df2) From 08cf244090583e253d4295d7d1d2aa4d0bbb867e Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 3 May 2020 21:56:43 -0300 Subject: [PATCH 54/59] Add support to write nested types (array and struct). 
--- .gitignore | 2 + awswrangler/_data_types.py | 67 ++++++++++++++++++---- awswrangler/db.py | 5 +- testing/test_awswrangler/test_data_lake.py | 22 +++++++ 4 files changed, 85 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 1947b2e87..8a3474a30 100644 --- a/.gitignore +++ b/.gitignore @@ -138,6 +138,8 @@ testing/*parameters-*.properties testing/*requirements*.txt testing/coverage/* building/*requirements*.txt +building/arrow +building/lambda/arrow /docs/coverage/ /docs/build/ /docs/source/_build/ diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 01237ea49..fac82a37b 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -1,8 +1,9 @@ """Internal (private) Data Types Module.""" import logging +import re from decimal import Decimal -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Match, Optional, Sequence, Tuple import pandas as pd # type: ignore import pyarrow as pa # type: ignore @@ -139,8 +140,10 @@ def pyarrow2athena(dtype: pa.DataType) -> str: # pylint: disable=too-many-branc return f"decimal({dtype.precision},{dtype.scale})" if pa.types.is_list(dtype): return f"array<{pyarrow2athena(dtype=dtype.value_type)}>" - if pa.types.is_struct(dtype): # pragma: no cover - return f"struct<{', '.join([f'{f.name}: {pyarrow2athena(dtype=f.type)}' for f in dtype])}>" + if pa.types.is_struct(dtype): + return f"struct<{', '.join([f'{f.name}:{pyarrow2athena(dtype=f.type)}' for f in dtype])}>" + if pa.types.is_map(dtype): # pragma: no cover + return f"map<{pyarrow2athena(dtype=dtype.key_type)},{pyarrow2athena(dtype=dtype.item_type)}>" if dtype == pa.null(): raise exceptions.UndetectedType("We can not infer the data type from an entire null object column") raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}") # pragma: no cover @@ -167,7 +170,7 @@ def pyarrow2pandas_extension( # pylint: disable=too-many-branches,too-many-retu def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-statements dtype: pa.DataType, db_type: str -) -> VisitableType: +) -> Optional[VisitableType]: """Pyarrow to Athena data types conversion.""" if pa.types.is_int8(dtype): return sqlalchemy.types.SmallInteger @@ -214,7 +217,7 @@ def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-sta if pa.types.is_dictionary(dtype): return pyarrow2sqlalchemy(dtype=dtype.value_type, db_type=db_type) if dtype == pa.null(): # pragma: no cover - raise exceptions.UndetectedType("We can not infer the data type from an entire null object column") + return None raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}") # pragma: no cover @@ -243,12 +246,23 @@ def pyarrow_types_from_pandas( else: cols.append(name) - # Filling cols_dtypes and indexes + # Filling cols_dtypes + for col in cols: + _logger.debug("Inferring PyArrow type from column: %s", col) + try: + schema: pa.Schema = pa.Schema.from_pandas(df=df[[col]], preserve_index=False) + except pa.ArrowInvalid as ex: # pragma: no cover + cols_dtypes[col] = process_not_inferred_dtype(ex) + else: + cols_dtypes[col] = schema.field(col).type + + # Filling indexes indexes: List[str] = [] - for field in pa.Schema.from_pandas(df=df[cols], preserve_index=index): - name = str(field.name) - cols_dtypes[name] = field.type - if (name not in df.columns) and (index is True): + if index is True: + for field in pa.Schema.from_pandas(df=df[[]], preserve_index=True): + name = str(field.name) + _logger.debug("Inferring PyArrow type from index: 
%s", name) + cols_dtypes[name] = field.type indexes.append(name) # Merging Index @@ -261,6 +275,39 @@ def pyarrow_types_from_pandas( return columns_types +def process_not_inferred_dtype(ex: pa.ArrowInvalid) -> pa.DataType: + """Infer data type from PyArrow inference exception.""" + ex_str = str(ex) + _logger.debug("PyArrow was not able to infer data type:\n%s", ex_str) + match: Optional[Match] = re.search( + pattern="Could not convert (.*) with type (.*): did not recognize " + "Python value type when inferring an Arrow data type", + string=ex_str, + ) + if match is None: + raise ex # pragma: no cover + groups: Optional[Sequence[str]] = match.groups() + if groups is None: + raise ex # pragma: no cover + if len(groups) != 2: + raise ex # pragma: no cover + _logger.debug("groups: %s", groups) + type_str: str = groups[1] + if type_str == "UUID": + return pa.string() + raise ex # pragma: no cover + + +def process_not_inferred_array(ex: pa.ArrowInvalid, values: Any) -> pa.Array: + """Infer `pyarrow.array` from PyArrow inference exception.""" + dtype = process_not_inferred_dtype(ex=ex) + if dtype == pa.string(): + array: pa.Array = pa.array(obj=[str(x) for x in values], type=dtype, safe=True) + else: + raise ex # pragma: no cover + return array + + def athena_types_from_pandas( df: pd.DataFrame, index: bool, dtype: Optional[Dict[str, str]] = None, index_left: bool = False ) -> Dict[str, str]: diff --git a/awswrangler/db.py b/awswrangler/db.py index b695bdd17..f5e90c78e 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -185,7 +185,10 @@ def _records2df( arrays: List[pa.Array] = [] for col_values, col_name in zip(tuple(zip(*records)), cols_names): # Transposing if (dtype is None) or (col_name not in dtype): - array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array + try: + array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array + except pa.ArrowInvalid as ex: + array = _data_types.process_not_inferred_array(ex, values=col_values) # Creating Arrow array else: array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype arrays.append(array) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index b05fb0881..99c1df1c6 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1203,3 +1203,25 @@ def test_athena_encryption( assert len(df2.columns) == 2 wr.catalog.delete_table_if_exists(database=database, table=table) wr.s3.delete_objects(path=paths) + + +def test_athena_nested(bucket, database): + table = "test_athena_nested" + path = f"s3://{bucket}/{table}/" + df = pd.DataFrame( + { + "c0": [[1, 2, 3], [4, 5, 6]], + "c1": [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + "c2": [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]], + "c3": [[], [[[[[[[[1]]]]]]]]], + "c4": [{"a": 1}, {"a": 1}], + "c5": [{"a": {"b": {"c": [1, 2]}}}, {"a": {"b": {"c": [3, 4]}}}], + } + ) + paths = wr.s3.to_parquet( + df=df, path=path, index=False, use_threads=True, dataset=True, mode="overwrite", database=database, table=table + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) + assert len(df2.index) == 2 + assert len(df2.columns) == 4 From 458bf266f684f096cd26a729ef6eb2d3beffc02d Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 4 May 2020 19:13:39 -0300 Subject: [PATCH 55/59] Add keep_files and ctas_temp_table_name to wr.athena.read_*(). 
#203 --- awswrangler/athena.py | 76 +++++++++++++++++----- awswrangler/torch.py | 20 +++--- testing/test_awswrangler/test_data_lake.py | 62 +++++++++++++++++- 3 files changed, 130 insertions(+), 28 deletions(-) diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 76cb0a108..671dabd42 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -370,7 +370,7 @@ def _fix_csv_types(df: pd.DataFrame, parse_dates: List[str], binaries: List[str] return df -def read_sql_query( # pylint: disable=too-many-branches,too-many-locals +def read_sql_query( # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements sql: str, database: str, ctas_approach: bool = True, @@ -380,6 +380,8 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, + keep_files: bool = True, + ctas_temp_table_name: Optional[str] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: @@ -454,6 +456,12 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported. kms_key : str, optional For SSE-KMS, this is the KMS key ARN or ID. + keep_files : bool + Should Wrangler delete or keep the staging files produced by Athena? + ctas_temp_table_name : str, optional + The name of the temporary table and also the directory name on S3 where the CTAS result is stored. + If None, it will use the following random pattern: `f"temp_table_{pyarrow.compat.guid()}"`. + On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads.
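To make the two new arguments concrete, a minimal usage sketch (the database, table, bucket, and temporary table names below are placeholders, not values taken from this patch):

    import awswrangler as wr

    df = wr.athena.read_sql_query(
        sql="SELECT * FROM my_table",  # placeholder query
        database="my_database",  # placeholder Glue database
        ctas_approach=True,
        s3_output="s3://my-bucket/athena-results/",  # placeholder staging prefix
        ctas_temp_table_name="my_temp_ctas_table",  # deterministic temp table/directory name
        keep_files=False,  # staging files are removed after the DataFrame is built
    )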
@@ -477,7 +485,10 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output name: str = "" if ctas_approach is True: - name = f"temp_table_{pa.compat.guid()}" + if ctas_temp_table_name is not None: + name = catalog.sanitize_table_name(ctas_temp_table_name) + else: + name = f"temp_table_{pa.compat.guid()}" path: str = f"{_s3_output}/{name}" ext_location: str = "\n" if wg_config["enforced"] is True else f",\n external_location = '{path}'\n" sql = ( @@ -506,25 +517,34 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"] message_error: str = f"Query error: {reason}" raise exceptions.AthenaQueryError(message_error) - dfs: Union[pd.DataFrame, Iterator[pd.DataFrame]] + ret: Union[pd.DataFrame, Iterator[pd.DataFrame]] if ctas_approach is True: catalog.delete_table_if_exists(database=database, table=name, boto3_session=session) manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" + metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata" _logger.debug("manifest_path: %s", manifest_path) + _logger.debug("metadata_path: %s", metadata_path) + s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session) paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug("chunked: %s", chunked) if not paths: if chunked is False: - dfs = pd.DataFrame() - else: - dfs = _utils.empty_generator() - else: - s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) - dfs = s3.read_parquet( - path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories - ) - return dfs + return pd.DataFrame() + return _utils.empty_generator() + s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) + ret = s3.read_parquet( + path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories + ) + paths_delete: List[str] = paths + [manifest_path, metadata_path] + _logger.debug(type(ret)) + if chunked is False: + if keep_files is False: + s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session) + return ret + if keep_files is False: + return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session) + return ret dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata( query_execution_id=query_id, categories=categories, boto3_session=session ) @@ -547,10 +567,26 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals boto3_session=session, ) _logger.debug("Start type casting...") - if chunksize is None: - return _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries) _logger.debug(type(ret)) - return _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries) + if chunksize is None: + df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries) + if keep_files is False: + s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session) + return df + dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries) + if keep_files is False: + return _delete_after_iterate( + dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session + ) + 
return dfs + + +def _delete_after_iterate( + dfs: Iterator[pd.DataFrame], paths: List[str], use_threads: bool, boto3_session: boto3.Session +) -> Iterator[pd.DataFrame]: + for df in dfs: + yield df + s3.delete_objects(path=paths, use_threads=use_threads, boto3_session=boto3_session) def stop_query_execution(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> None: @@ -638,6 +674,8 @@ def read_sql_table( workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, + keep_files: bool = True, + ctas_temp_table_name: Optional[str] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: @@ -712,6 +750,12 @@ def read_sql_table( None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'. kms_key : str, optional For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID. + keep_files : bool + Should Wrangler delete or keep the staging files produced by Athena? + ctas_temp_table_name : str, optional + The name of the temporary table and also the directory name on S3 where the CTAS result is stored. + If None, it will use the following random pattern: `f"temp_table_{pyarrow.compat.guid()}"`. + On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. @@ -740,6 +784,8 @@ def read_sql_table( workgroup=workgroup, encryption=encryption, kms_key=kms_key, + keep_files=keep_files, + ctas_temp_table_name=ctas_temp_table_name, use_threads=use_threads, boto3_session=boto3_session, ) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 7d3c47316..70df93f34 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -28,14 +28,14 @@ class _BaseS3Dataset: def __init__( self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None ): - """PyTorch Map-Style S3 Dataset. + r"""PyTorch Map-Style S3 Dataset. Parameters ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -160,7 +160,7 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch Amazon S3 Lambda Dataset. + r"""PyTorch Amazon S3 Lambda Dataset. Parameters ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). label_fn: Callable Function that receives object path (str) and return a torch.Tensor suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -212,7 +212,7 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch Amazon S3 Audio Dataset. + r"""PyTorch Amazon S3 Audio Dataset. Read individual WAV audio files stores in Amazon S3 and return them as torch tensors. @@ -237,7 +237,7 @@ def __init__( path : Union[str, List[str]] S3 prefix (e.g.
s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -302,7 +302,7 @@ class ImageS3Dataset(_S3PartitionedDataset): """PyTorch Amazon S3 Image Dataset.""" def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): - """PyTorch Amazon S3 Image Dataset. + r"""PyTorch Amazon S3 Image Dataset. ImageS3Dataset assumes images are patitioned (within class= folders) in Amazon S3. Each lisited object will be loaded by default Pillow library. @@ -327,7 +327,7 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -350,14 +350,14 @@ def _data_fn(self, data: io.BytesIO) -> Any: class S3IterableDataset(IterableDataset, _BaseS3Dataset): # pylint: disable=abstract-method - """PyTorch Amazon S3 Iterable Dataset. + r"""PyTorch Amazon S3 Iterable Dataset. Parameters ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
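As a concrete illustration of the suffix filtering described in these docstrings, a small sketch in the spirit of the tests later in this series; the bucket, prefix, and batch size are placeholders:

    import boto3
    from torch.utils.data import DataLoader

    import awswrangler as wr

    # Stream serialized tensor files stored under a prefix, keeping only keys ending with ".pt".
    ds = wr.torch.S3IterableDataset(
        path="s3://my-bucket/tensors/",  # placeholder prefix
        suffix=".pt",
        boto3_session=boto3.Session(),
    )
    for batch in DataLoader(ds, batch_size=32):  # placeholder batch size
        pass  # train/evaluate on each batch here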
diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 99c1df1c6..e9c9834df 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -191,14 +191,51 @@ def test_athena_ctas(bucket, database, kms_key): encryption="SSE_KMS", kms_key=kms_key, s3_output=f"s3://{bucket}/test_athena_ctas_result", + keep_files=False, ) assert len(df.index) == 3 ensure_data_types(df=df, has_list=True) + temp_table = "test_athena_ctas2" + s3_output = f"s3://{bucket}/s3_output/" + final_destination = f"{s3_output}{temp_table}/" + + # keep_files=False + wr.s3.delete_objects(path=s3_output) dfs = wr.athena.read_sql_query( - sql=f"SELECT * FROM test_athena_ctas", database=database, ctas_approach=True, chunksize=1 + sql=f"SELECT * FROM test_athena_ctas", + database=database, + ctas_approach=True, + chunksize=1, + keep_files=False, + ctas_temp_table_name=temp_table, + s3_output=s3_output, ) + assert wr.catalog.does_table_exist(database=database, table=temp_table) is False + assert len(wr.s3.list_objects(path=s3_output)) > 2 + assert len(wr.s3.list_objects(path=final_destination)) > 0 for df in dfs: ensure_data_types(df=df, has_list=True) + assert len(wr.s3.list_objects(path=s3_output)) == 0 + + # keep_files=True + wr.s3.delete_objects(path=s3_output) + dfs = wr.athena.read_sql_query( + sql=f"SELECT * FROM test_athena_ctas", + database=database, + ctas_approach=True, + chunksize=2, + keep_files=True, + ctas_temp_table_name=temp_table, + s3_output=s3_output, + ) + assert wr.catalog.does_table_exist(database=database, table=temp_table) is False + assert len(wr.s3.list_objects(path=s3_output)) > 2 + assert len(wr.s3.list_objects(path=final_destination)) > 0 + for df in dfs: + ensure_data_types(df=df, has_list=True) + assert len(wr.s3.list_objects(path=s3_output)) > 2 + + # Cleaning Up wr.catalog.delete_table_if_exists(database=database, table="test_athena_ctas") wr.s3.delete_objects(path=paths) wr.s3.wait_objects_not_exist(paths=paths) @@ -227,12 +264,17 @@ def test_athena(bucket, database, kms_key, workgroup0, workgroup1): encryption="SSE_KMS", kms_key=kms_key, workgroup=workgroup0, + keep_files=False, ) for df2 in dfs: print(df2) ensure_data_types(df=df2) df = wr.athena.read_sql_query( - sql="SELECT * FROM __test_athena", database=database, ctas_approach=False, workgroup=workgroup1 + sql="SELECT * FROM __test_athena", + database=database, + ctas_approach=False, + workgroup=workgroup1, + keep_files=False, ) assert len(df.index) == 3 ensure_data_types(df=df) @@ -1195,9 +1237,23 @@ def test_athena_encryption( df=df, path=path, dataset=True, mode="overwrite", database=database, table=table, s3_additional_kwargs=None )["paths"] wr.s3.wait_objects_exist(paths=paths, use_threads=False) + temp_table = table + "2" + s3_output = f"s3://{bucket}/encryptio_s3_output/" + final_destination = f"{s3_output}{temp_table}/" + wr.s3.delete_objects(path=final_destination) df2 = wr.athena.read_sql_table( - table=table, ctas_approach=True, database=database, encryption=encryption, workgroup=workgroup, kms_key=kms_key + table=table, + ctas_approach=True, + database=database, + encryption=encryption, + workgroup=workgroup, + kms_key=kms_key, + keep_files=True, + ctas_temp_table_name=temp_table, + s3_output=s3_output, ) + assert wr.catalog.does_table_exist(database=database, table=temp_table) is False + assert len(wr.s3.list_objects(path=s3_output)) > 2 print(df2) assert len(df2.index) == 2 assert len(df2.columns) == 2 From 
fe6f50bb41ac42d8ce02ce39c241f0f7c90433c8 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 12:55:34 -0300 Subject: [PATCH 56/59] Removing delete_table operations from catalog._create_table() and add catalog_versioning arg. #198 --- awswrangler/catalog.py | 27 ++++++--- awswrangler/s3.py | 12 ++++ testing/test_awswrangler/test_data_lake.py | 67 ++++++++++++++++++++++ 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 93092626b..9ef55066c 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -93,6 +93,7 @@ def create_parquet_table( parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, mode: str = "overwrite", + catalog_versioning: bool = False, boto3_session: Optional[boto3.Session] = None, ) -> None: """Create a Parquet Table (Metadata Only) in the AWS Glue Catalog. @@ -121,6 +122,8 @@ def create_parquet_table( Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). mode: str 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -157,6 +160,7 @@ def create_parquet_table( parameters=parameters, columns_comments=columns_comments, mode=mode, + catalog_versioning=catalog_versioning, boto3_session=boto3_session, table_input=table_input, ) @@ -865,6 +869,7 @@ def create_csv_table( parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, mode: str = "overwrite", + catalog_versioning: bool = False, sep: str = ",", boto3_session: Optional[boto3.Session] = None, ) -> None: @@ -884,16 +889,18 @@ def create_csv_table( Dictionary with keys as column names and vales as data types (e.g. {'col0': 'bigint', 'col1': 'double'}). partitions_types: Dict[str, str], optional Dictionary with keys as partition names and values as data types (e.g. {'col2': 'date'}). - compression: str, optional + compression : str, optional Compression style (``None``, ``gzip``, etc). - description: str, optional + description : str, optional Table description - parameters: Dict[str, str], optional + parameters : Dict[str, str], optional Key/value pairs to tag the table. columns_comments: Dict[str, str], optional Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). - mode: str + mode : str 'overwrite' to recreate any possible axisting table or 'append' to keep any possible axisting table. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. sep : str String of length 1. Field delimiter for the output file. 
boto3_session : boto3.Session(), optional @@ -937,6 +944,7 @@ def create_csv_table( parameters=parameters, columns_comments=columns_comments, mode=mode, + catalog_versioning=catalog_versioning, boto3_session=boto3_session, table_input=table_input, ) @@ -949,6 +957,7 @@ def _create_table( parameters: Optional[Dict[str, str]], columns_comments: Optional[Dict[str, str]], mode: str, + catalog_versioning: bool, boto3_session: Optional[boto3.Session], table_input: Dict[str, Any], ): @@ -967,10 +976,14 @@ def _create_table( if name in columns_comments: par["Comment"] = columns_comments[name] session: boto3.Session = _utils.ensure_session(session=boto3_session) + client_glue: boto3.client = _utils.client(service_name="glue", session=session) exist: bool = does_table_exist(database=database, table=table, boto3_session=session) - if (mode == "overwrite") or (exist is False): - delete_table_if_exists(database=database, table=table, boto3_session=session) - client_glue: boto3.client = _utils.client(service_name="glue", session=session) + if mode not in ("overwrite", "append"): # pragma: no cover + raise exceptions.InvalidArgument(f"{mode} is not a valid mode. It must be 'overwrite' or 'append'.") + if (exist is True) and (mode == "overwrite"): + skip_archive: bool = not catalog_versioning + client_glue.update_table(DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive) + elif exist is False: client_glue.create_table(DatabaseName=database, TableInput=table_input) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index a8512f0b5..31c7b2ea6 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -433,6 +433,7 @@ def to_csv( # pylint: disable=too-many-arguments dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, + catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, @@ -483,6 +484,8 @@ def to_csv( # pylint: disable=too-many-arguments List of column names that will be used to create partitions. Only takes effect if dataset=True. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. table : str, optional @@ -677,6 +680,7 @@ def to_csv( # pylint: disable=too-many-arguments columns_comments=columns_comments, boto3_session=session, mode="overwrite", + catalog_versioning=catalog_versioning, sep=sep, ) if partitions_values: @@ -846,6 +850,7 @@ def to_parquet( # pylint: disable=too-many-arguments dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, + catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, @@ -893,6 +898,8 @@ def to_parquet( # pylint: disable=too-many-arguments List of column names that will be used to create partitions. Only takes effect if dataset=True. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. 
table : str, optional @@ -1092,6 +1099,7 @@ def to_parquet( # pylint: disable=too-many-arguments columns_comments=columns_comments, boto3_session=session, mode="overwrite", + catalog_versioning=catalog_versioning, ) if partitions_values: _logger.debug("partitions_values:\n%s", partitions_values) @@ -1838,6 +1846,7 @@ def store_parquet_metadata( columns_comments: Optional[Dict[str, str]] = None, compression: Optional[str] = None, mode: str = "overwrite", + catalog_versioning: bool = False, boto3_session: Optional[boto3.Session] = None, ) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: """Infer and store parquet metadata on AWS Glue Catalog. @@ -1879,6 +1888,8 @@ def store_parquet_metadata( Compression style (``None``, ``snappy``, ``gzip``, etc). mode: str 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -1924,6 +1935,7 @@ def store_parquet_metadata( parameters=parameters, columns_comments=columns_comments, mode=mode, + catalog_versioning=catalog_versioning, boto3_session=session, ) partitions_values: Dict[str, List[str]] = _data_types.athena_partitions_from_pyarrow_partitions( diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index e9c9834df..77bd5310e 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1281,3 +1281,70 @@ def test_athena_nested(bucket, database): df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) assert len(df2.index) == 2 assert len(df2.columns) == 4 + + +def test_catalog_versioning(bucket, database): + table = "test_catalog_versioning" + wr.catalog.delete_table_if_exists(database=database, table=table) + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + + # Version 0 + df = pd.DataFrame({"c0": [1, 2]}) + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c0.dtype).startswith("Int") + + # Version 1 + df = pd.DataFrame({"c1": ["foo", "boo"]}) + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", catalog_versioning=True + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype) == "string" + + # Version 2 + df = pd.DataFrame({"c1": [1.0, 2.0]}) + paths = wr.s3.to_csv( + df=df, + path=path, + dataset=True, + database=database, + table=table, + mode="overwrite", + catalog_versioning=True, + index=False, + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype).startswith("float") + + # Version 3 (removing version 2) + df = pd.DataFrame({"c1": [True, False]}) + paths = wr.s3.to_csv( + df=df, + path=path, + dataset=True, + 
database=database, + table=table, + mode="overwrite", + catalog_versioning=False, + index=False, + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype).startswith("boolean") + + # Cleaning Up + wr.catalog.delete_table_if_exists(database=database, table=table) + wr.s3.delete_objects(path=path) From a6ba86c170b3a43a12a8fe191778bf4d3871e441 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 13:46:56 -0300 Subject: [PATCH 57/59] add replace_filenames argument to wr.s3.copy_objects() #215 --- awswrangler/s3.py | 10 ++++++++++ testing/test_awswrangler/test_data_lake.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 31c7b2ea6..f4d1e5746 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -2284,6 +2284,7 @@ def copy_objects( paths: List[str], source_path: str, target_path: str, + replace_filenames: Optional[Dict[str, str]] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, ) -> List[str]: @@ -2334,6 +2335,15 @@ def copy_objects( for path in paths: path_wo_prefix: str = path.replace(f"{source_path}/", "") path_final: str = f"{target_path}/{path_wo_prefix}" + if replace_filenames is not None: + parts: List[str] = path_final.rsplit(sep="/", maxsplit=1) + if len(parts) == 2: + path_wo_filename: str = parts[0] + filename: str = parts[1] + if filename in replace_filenames: + new_filename: str = replace_filenames[filename] + _logger.debug("Replacing filename: %s -> %s", filename, new_filename) + path_final = f"{path_wo_filename}/{new_filename}" new_objects.append(path_final) batch.append((path, path_final)) _logger.debug("len(new_objects): %s", len(new_objects)) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 77bd5310e..f95c691fb 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1348,3 +1348,22 @@ def test_catalog_versioning(bucket, database): # Cleaning Up wr.catalog.delete_table_if_exists(database=database, table=table) wr.s3.delete_objects(path=path) + + +def test_copy_replacing_filename(bucket): + path = f"s3://{bucket}/test_copy_replacing_filename/" + wr.s3.delete_objects(path=path) + df = pd.DataFrame({"c0": [1, 2]}) + file_path = f"{path}myfile.parquet" + wr.s3.to_parquet(df=df, path=file_path) + wr.s3.wait_objects_exist(paths=[file_path], use_threads=False) + path2 = f"s3://{bucket}/test_copy_replacing_filename2/" + wr.s3.copy_objects( + paths=[file_path], source_path=path, target_path=path2, replace_filenames={"myfile.parquet": "myfile2.parquet"} + ) + expected_file = f"{path2}myfile2.parquet" + wr.s3.wait_objects_exist(paths=[expected_file], use_threads=False) + objs = wr.s3.list_objects(path=path2) + assert objs[0] == expected_file + wr.s3.delete_objects(path=path) + wr.s3.delete_objects(path=path2) From 5be05d3ce7342dbdfc96d5fa125f8f78235773fb Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 13:48:56 -0300 Subject: [PATCH 58/59] Update README --- README.md | 2 +- docs/source/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ccb5dc669..424808bf8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ **NOTE** -We just released a new major version `1.0` with breaking changes. 
Please make sure that all your old projects has dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). +Due to the new major version `1.*.*` with breaking changes, please make sure that all your old projects have dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). --- diff --git a/docs/source/index.rst b/docs/source/index.rst index 2528a6032..0a9059392 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,4 +1,4 @@ -.. note:: We just released a new major version `1.0` with breaking changes. Please make sure that all your old projects has dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). +.. note:: Due to the new major version `1.*.*` with breaking changes, please make sure that all your old projects have dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). Quick Start ----------- From 12d0f66faa67ac305b17a42317015a1783f578fc Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 13:55:26 -0300 Subject: [PATCH 59/59] Updating requirements --- requirements-dev.txt | 8 ++++---- requirements-torch.txt | 2 +- requirements.txt | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 81b576472..e6e788815 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ black~=19.3b0 -pylint~=2.5.0 +pylint~=2.5.2 flake8~=3.7.9 mypy~=0.770 isort~=4.3.21 @@ -8,9 +8,9 @@ doc8~=0.8.0 tox~=3.15.0 pytest~=5.4.1 pytest-cov~=2.8.1 -pytest-xdist~=1.31.0 +pytest-xdist~=1.32.0 scikit-learn~=0.22.1 -awscli>=1.18.22 +awscli>=1.18.0 cfn-lint~=0.30.1 cfn-flip~=1.2.3 twine~=3.1.1 @@ -18,4 +18,4 @@ wheel~=0.34.2 sphinx~=3.0.3 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 -jupyterlab~=2.1.1 \ No newline at end of file +jupyterlab~=2.1.2 \ No newline at end of file diff --git a/requirements-torch.txt b/requirements-torch.txt index d3e36447e..20f8cdba9 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ torch~=1.5.0 torchvision~=0.6.0 torchaudio~=0.5.0 -Pillow~=7.1.2 +Pillow~=7.1.0 diff --git a/requirements.txt b/requirements.txt index 9c1013d22..c6ff840d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -numpy~=1.18.1 -pandas~=1.0.3 +boto3>=1.12.0 +botocore>=1.15.0 +numpy~=1.18.0 +pandas~=1.0.0 pyarrow~=0.17.0 -boto3>=1.12.22 -botocore>=1.15.22 -s3fs~=0.4.2 -psycopg2-binary~=2.8.5 -pymysql~=0.9.3 -SQLAlchemy==1.3.13 -sqlalchemy-redshift~=0.7.7 \ No newline at end of file +s3fs~=0.4.0 +psycopg2-binary~=2.8.0 +pymysql~=0.9.0 +sqlalchemy-redshift~=0.7.0 +SQLAlchemy==1.3.13 \ No newline at end of file
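Closing the series, a short sketch of the catalog_versioning flag introduced in PATCH 56/59 above, mirroring the test_catalog_versioning test; the bucket, database, and table names are placeholders:

    import pandas as pd

    import awswrangler as wr

    # First write creates the Glue table.
    wr.s3.to_parquet(
        df=pd.DataFrame({"c0": [1, 2]}),
        path="s3://my-bucket/my_table/",  # placeholder path
        dataset=True,
        database="my_database",  # placeholder Glue database
        table="my_table",  # placeholder table name
        mode="overwrite",
    )
    # Overwriting with a different schema archives the previous catalog version
    # instead of discarding it.
    wr.s3.to_parquet(
        df=pd.DataFrame({"c1": ["foo", "boo"]}),
        path="s3://my-bucket/my_table/",
        dataset=True,
        database="my_database",
        table="my_table",
        mode="overwrite",
        catalog_versioning=True,
    )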