From 8fd49660d875f60cb1682cd9fe6a43426564399b Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sat, 18 Apr 2020 20:09:14 -0300 Subject: [PATCH 01/59] initial draft --- awswrangler/db.py | 36 ++++++++++---- awswrangler/torch.py | 111 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 8 deletions(-) create mode 100644 awswrangler/torch.py diff --git a/awswrangler/db.py b/awswrangler/db.py index 491fe7784..42d22fe73 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -155,6 +155,18 @@ def read_sql_query( ... ) """ + return _read_sql_query(fn=_record2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype) + + +def _read_sql_query( + sql: str, + con: sqlalchemy.engine.Engine, + index_col: Optional[Union[str, List[str]]] = None, + params: Optional[Union[List, Tuple, Dict]] = None, + chunksize: Optional[int] = None, + dtype: Optional[Dict[str, pa.DataType]] = None, + fn: Callable, +): if not isinstance(con, sqlalchemy.engine.Engine): # pragma: no cover raise exceptions.InvalidConnection( "Invalid 'con' argument, please pass a " @@ -165,19 +177,27 @@ def read_sql_query( args = _convert_params(sql, params) cursor = _con.execute(*args) if chunksize is None: - return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) - return _iterate_cursor(cursor=cursor, chunksize=chunksize, index=index_col, dtype=dtype) + return fn(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) + return _iterate_cursor(fn=fn, cursor=cursor, chunksize=chunksize, index=index_col, dtype=dtype) def _iterate_cursor( cursor, chunksize: int, index: Optional[Union[str, List[str]]], dtype: Optional[Dict[str, pa.DataType]] = None -) -> Iterator[pd.DataFrame]: +) -> Iterator[Any]: while True: records = cursor.fetchmany(chunksize) - if not records: - break - df: pd.DataFrame = _records2df(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) - yield df + if not records: break + yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) + + +def _records2numpy( + records: List[Tuple[Any]], + cols_names: List[str], + index: Optional[Union[str, List[str]]], + dtype: Optional[Dict[str, pa.DataType]] = None, +) -> Iterator[np.ndarry]: + for record in records: + yield np.array(record, float) def _records2df( @@ -191,7 +211,7 @@ def _records2df( if (dtype is None) or (col_name not in dtype): array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array else: - array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype + array: pa.Array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype arrays.append(array) table = pa.Table.from_arrays(arrays=arrays, names=cols_names) # Creating arrow Table df: pd.DataFrame = table.to_pandas( # Creating Pandas DataFrame diff --git a/awswrangler/torch.py b/awswrangler/torch.py new file mode 100644 index 000000000..00cc273f0 --- /dev/null +++ b/awswrangler/torch.py @@ -0,0 +1,111 @@ +"""PyTorch Module.""" + +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union + +import torch +import boto3 # type: ignore +import botocore.exceptions # type: ignore +import pandas as pd # type: ignore +import pandas.io.parsers # type: ignore +import pyarrow as pa # type: ignore +import pyarrow.lib # type: ignore +import pyarrow.parquet # type: ignore +import s3fs # type: ignore +from boto3.s3.transfer import TransferConfig # type: 
ignore +from pandas.io.common import infer_compression # type: ignore +from torch.utils.data import Dataset, IterableDataset + +from awswrangler import _data_types, _utils, catalog, exceptions, s3 + +_logger: logging.Logger = logging.getLogger(__name__) + + +class S3Dataset(Dataset): + """PyTorch Map-Style S3 Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> label_fn = lambda path: path.split[0][-2] + >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) + + """ + def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): + super(S3IterableDataset).__init__() + self.label_fn = label_fn + self.paths: List[str] = s3._path2list( + path=path, + boto3_session=self.boto3_session + ) + self._s3 = boto3_session.resource('s3') + + def _fetch_obj(self, path): + obj = _s3.Object(bucket_name, key).get() + return obj['Body'].read() + + def __getitem__(self, index): + path = self.paths[index]) + return [self._fetch_obj(path), label_fn(path)] + + def __len__(self): + return len(self.paths) + + +class SQLDataset(torch.utils.data.IterableDataset): + """PyTorch Iterable SQL Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> con = wr.catalog.get_engine("aws-data-wrangler-postgresql") + >>> ds = wr.torch.SQLDataset('select * from public.tutorial', con=con) + + """ + def __init__(self, ): + super(SQLDataset).__init__( + sql: str, + con: sqlalchemy.engine.Engine, + index_col: Optional[Union[str, List[str]]] = None, + ): + self.sql = sql + self.con = con + self.index_col = index_col + + def __iter__(self): + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: # single-process data loading, return the full iterator + pass + else: # in a worker process + raise NotImplemented() + + for ds in wr.db._read_sql_query( + fn=wr.db._records2numpy, + sql=self.sql, + con=self.con, + index_col=self.index_col, + ): + for row in ds: + yield row From 863ba2698fd97411edb5a82a9f22c176852f5093 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 19 Apr 2020 09:16:54 -0300 Subject: [PATCH 02/59] adding Pytorch as a development dependency --- requirements-dev.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 3fdd3cdf3..137f57383 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,4 +17,6 @@ twine~=3.1.1 wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 \ No newline at end of file +moto~=1.3.14 +torch~=1.4.0 +torchvision~=0.5.0 \ No newline at end of file From 2864dc09c2851a44661f1a875d3b6e47ec1f0017 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 19 Apr 2020 09:52:41 -0300 Subject: [PATCH 03/59] Cleaning up initial draft --- awswrangler/db.py | 27 ++++--- awswrangler/torch.py | 173 ++++++++++++++++++++----------------------- 2 files changed, 95 insertions(+), 105 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index 42d22fe73..f4508d09c 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -2,10 +2,11 @@ import json import logging -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import quote_plus import boto3 # type: ignore +import numpy as np # type: ignore import pandas as pd # type: ignore import pyarrow as pa # type: ignore import sqlalchemy # type: ignore @@ -155,17 +156,19 @@ def read_sql_query( ... 
) """ - return _read_sql_query(fn=_record2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype) + return _read_sql_query( + fn=_records2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype + ) def _read_sql_query( + fn: Callable, sql: str, con: sqlalchemy.engine.Engine, index_col: Optional[Union[str, List[str]]] = None, params: Optional[Union[List, Tuple, Dict]] = None, chunksize: Optional[int] = None, dtype: Optional[Dict[str, pa.DataType]] = None, - fn: Callable, ): if not isinstance(con, sqlalchemy.engine.Engine): # pragma: no cover raise exceptions.InvalidConnection( @@ -182,20 +185,20 @@ def _read_sql_query( def _iterate_cursor( - cursor, chunksize: int, index: Optional[Union[str, List[str]]], dtype: Optional[Dict[str, pa.DataType]] = None + fn: Callable, + cursor, + chunksize: int, + index: Optional[Union[str, List[str]]], + dtype: Optional[Dict[str, pa.DataType]] = None, ) -> Iterator[Any]: while True: records = cursor.fetchmany(chunksize) - if not records: break + if not records: + break yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) -def _records2numpy( - records: List[Tuple[Any]], - cols_names: List[str], - index: Optional[Union[str, List[str]]], - dtype: Optional[Dict[str, pa.DataType]] = None, -) -> Iterator[np.ndarry]: +def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarry]: # pylint: disable=unused-argument for record in records: yield np.array(record, float) @@ -211,7 +214,7 @@ def _records2df( if (dtype is None) or (col_name not in dtype): array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array else: - array: pa.Array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype + array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype arrays.append(array) table = pa.Table.from_arrays(arrays=arrays, names=cols_names) # Creating arrow Table df: pd.DataFrame = table.to_pandas( # Creating Pandas DataFrame diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 00cc273f0..afe85f1de 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,111 +1,98 @@ """PyTorch Module.""" -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +import logging +import sqlalchemy # type: ignore import torch -import boto3 # type: ignore -import botocore.exceptions # type: ignore -import pandas as pd # type: ignore -import pandas.io.parsers # type: ignore -import pyarrow as pa # type: ignore -import pyarrow.lib # type: ignore -import pyarrow.parquet # type: ignore -import s3fs # type: ignore -from boto3.s3.transfer import TransferConfig # type: ignore -from pandas.io.common import infer_compression # type: ignore -from torch.utils.data import Dataset, IterableDataset - -from awswrangler import _data_types, _utils, catalog, exceptions, s3 +from torch.utils.data.dataset import IterableDataset + +from awswrangler import db _logger: logging.Logger = logging.getLogger(__name__) -class S3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
- - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> label_fn = lambda path: path.split[0][-2] - >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) - - """ - def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): - super(S3IterableDataset).__init__() - self.label_fn = label_fn - self.paths: List[str] = s3._path2list( - path=path, - boto3_session=self.boto3_session - ) - self._s3 = boto3_session.resource('s3') - - def _fetch_obj(self, path): - obj = _s3.Object(bucket_name, key).get() - return obj['Body'].read() - - def __getitem__(self, index): - path = self.paths[index]) - return [self._fetch_obj(path), label_fn(path)] - - def __len__(self): - return len(self.paths) - - -class SQLDataset(torch.utils.data.IterableDataset): - """PyTorch Iterable SQL Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> con = wr.catalog.get_engine("aws-data-wrangler-postgresql") - >>> ds = wr.torch.SQLDataset('select * from public.tutorial', con=con) - - """ - def __init__(self, ): - super(SQLDataset).__init__( - sql: str, - con: sqlalchemy.engine.Engine, - index_col: Optional[Union[str, List[str]]] = None, - ): +# class S3Dataset(Dataset): +# """PyTorch Map-Style S3 Dataset. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> label_fn = lambda path: path.split[0][-2] +# >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) +# +# """ +# def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): +# super(S3IterableDataset).__init__() +# self.label_fn = label_fn +# self.paths: List[str] = s3._path2list( +# path=path, +# boto3_session=self.boto3_session +# ) +# self._s3 = boto3_session.resource('s3') +# +# def _fetch_obj(self, path): +# obj = _s3.Object(bucket_name, key).get() +# return obj['Body'].read() +# +# def __getitem__(self, index): +# path = self.paths[index]) +# return [self._fetch_obj(path), label_fn(path)] +# +# def __len__(self): +# return len(self.paths) + + +class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method + """Pytorch Iterable SQL Dataset.""" + + def __init__(self, sql: str, con: sqlalchemy.engine.Engine): + """Pytorch Iterable SQL Dataset. + + Support for **Redshift**, **PostgreSQL** and **MySQL**. + + Parameters + ---------- + sql : str + Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + con : sqlalchemy.engine.Engine + SQLAlchemy Engine. 
Please use, + wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() + + Returns + ------- + torch.utils.data.dataset.IterableDataset + + Examples + -------- + >>> import awswrangler as wr + >>> con = wr.catalog.get_engine("aws-data-wrangler-postgresql") + >>> ds = wr.torch.SQLDataset('select * from public.tutorial', con=con) + + """ + super().__init__() self.sql = sql self.con = con - self.index_col = index_col def __iter__(self): + """Iterate over the Dataset.""" worker_info = torch.utils.data.get_worker_info() if worker_info is None: # single-process data loading, return the full iterator pass else: # in a worker process - raise NotImplemented() - - for ds in wr.db._read_sql_query( - fn=wr.db._records2numpy, - sql=self.sql, - con=self.con, - index_col=self.index_col, - ): + raise NotImplementedError() + + for ds in db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con): for row in ds: yield row From 4fed4c7f5f743e90dbb16b8678b5cd9a104ae3ed Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 19 Apr 2020 13:53:07 -0300 Subject: [PATCH 04/59] Add first test --- awswrangler/__init__.py | 2 +- awswrangler/db.py | 5 +- awswrangler/s3.py | 9 +- awswrangler/torch.py | 127 ++++++++++++++++--------- pytest.ini | 2 +- testing/test_awswrangler/test_torch.py | 99 +++++++++++++++++++ 6 files changed, 188 insertions(+), 56 deletions(-) create mode 100644 testing/test_awswrangler/test_torch.py diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ce11c7ad5..ff6a2bd71 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -7,7 +7,7 @@ import logging -from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa +from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3, torch # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/db.py b/awswrangler/db.py index f4508d09c..78979787c 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -198,9 +198,8 @@ def _iterate_cursor( yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) -def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarry]: # pylint: disable=unused-argument - for record in records: - yield np.array(record, float) +def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarray]: # pylint: disable=unused-argument + return np.array(records, dtype=float) def _records2df( diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..157607d8c 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -111,7 +111,7 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) raise ex # pragma: no cover -def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: +def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]: """List Amazon S3 objects from a prefix. 
Parameters @@ -155,15 +155,16 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li for content in contents: if (content is not None) and ("Key" in content): key: str = content["Key"] - paths.append(f"s3://{bucket}/{key}") + if (suffix is None) or key.endswith(suffix): + paths.append(f"s3://{bucket}/{key}") return paths -def _path2list(path: Union[str, List[str]], boto3_session: Optional[boto3.Session]) -> List[str]: +def _path2list(path: Union[str, List[str]], boto3_session: Optional[boto3.Session], suffix: Optional[str] = None) -> List[str]: if isinstance(path, str): # prefix paths: List[str] = list_objects(path=path, boto3_session=boto3_session) elif isinstance(path, list): - paths = path + paths = path if suffix is None else [x for x in path if x.endswith(suffix)] else: raise exceptions.InvalidArgumentType(f"{type(path)} is not a valid path type. Please, use str or List[str].") return paths diff --git a/awswrangler/torch.py b/awswrangler/torch.py index afe85f1de..7d84be981 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,63 +1,93 @@ """PyTorch Module.""" import logging +from io import BytesIO +from typing import Optional, Union, List import sqlalchemy # type: ignore +import numpy as np # type: ignore +import boto3 # type: ignore import torch -from torch.utils.data.dataset import IterableDataset +from torch.utils.data.dataset import Dataset, IterableDataset +from PIL import Image +from torchvision.transforms.functional import to_tensor -from awswrangler import db +from awswrangler import db, s3, _utils _logger: logging.Logger = logging.getLogger(__name__) -# class S3Dataset(Dataset): -# """PyTorch Map-Style S3 Dataset. -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> label_fn = lambda path: path.split[0][-2] -# >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) -# -# """ -# def __init__(self, path: Union[str, List[str]], label_fn, boto3_session): -# super(S3IterableDataset).__init__() -# self.label_fn = label_fn -# self.paths: List[str] = s3._path2list( -# path=path, -# boto3_session=self.boto3_session -# ) -# self._s3 = boto3_session.resource('s3') -# -# def _fetch_obj(self, path): -# obj = _s3.Object(bucket_name, key).get() -# return obj['Body'].read() -# -# def __getitem__(self, index): -# path = self.paths[index]) -# return [self._fetch_obj(path), label_fn(path)] -# -# def __len__(self): -# return len(self.paths) +class _BaseS3Dataset(Dataset): + """PyTorch Map-Style S3 Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> label_fn = lambda path: path.split[0][-2] + >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) + + """ + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): + super().__init__() + self.session = _utils.ensure_session(session=boto3_session) + self.paths: List[str] = s3._path2list( + path=path, + suffix=suffix, + boto3_session=self.session + ) + + def __getitem__(self, index): + path = self.paths[index] + obj = self._fetch_obj(path) + return [self.parser_fn(obj), self.label_fn(path)] + + def __len__(self): + return len(self.paths) + + def _fetch_obj(self, path): + bucket, key = _utils.parse_path(path=path) + buff = BytesIO() + client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) + client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) + return buff.seek(0) + + def parser_fn(self, obj): + pass + + def label_fn(self, obj): + pass + + +class ImageS3Dataset(Dataset): + + @staticmethod + def parser_fn(obj): + image = Image.open('YOUR_PATH') + tensor = to_tensor(image) + tensor.unsqueeze_(0) + return tensor + + @staticmethod + def label_fn(obj): + pass class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" - def __init__(self, sql: str, con: sqlalchemy.engine.Engine): + def __init__(self, sql: str, con: sqlalchemy.engine.Engine, chunksize: Optional[int] = None,): """Pytorch Iterable SQL Dataset. Support for **Redshift**, **PostgreSQL** and **MySQL**. @@ -84,6 +114,7 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine): super().__init__() self.sql = sql self.con = con + self.chunksize = chunksize def __iter__(self): """Iterate over the Dataset.""" @@ -92,7 +123,9 @@ def __iter__(self): pass else: # in a worker process raise NotImplementedError() - - for ds in db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con): + ret = db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con, chunksize=self.chunksize) + if isinstance(ret, np.ndarray): + ret = [ret] + for ds in ret: for row in ds: - yield row + yield torch.as_tensor(row, dtype=torch.float) diff --git a/pytest.ini b/pytest.ini index 8e7a47ef1..d233cbf74 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] addopts = --verbose - --capture=fd + --capture=no filterwarnings = ignore::DeprecationWarning ignore::UserWarning \ No newline at end of file diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py new file mode 100644 index 000000000..4725f1ea4 --- /dev/null +++ b/testing/test_awswrangler/test_torch.py @@ -0,0 +1,99 @@ +import logging + +import boto3 +import pandas as pd +import pytest +import torch + +import awswrangler as wr + +logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") +logging.getLogger("awswrangler").setLevel(logging.DEBUG) +logging.getLogger("botocore.credentials").setLevel(logging.CRITICAL) + + +@pytest.fixture(scope="module") +def cloudformation_outputs(): + response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler-test") + outputs = {} + for output in response.get("Stacks")[0].get("Outputs"): + outputs[output.get("OutputKey")] = output.get("OutputValue") + yield outputs + + +@pytest.fixture(scope="module") +def bucket(cloudformation_outputs): + if "BucketName" in 
cloudformation_outputs: + bucket = cloudformation_outputs["BucketName"] + else: + raise Exception("You must deploy/update the test infrastructure (CloudFormation)") + yield bucket + + +@pytest.fixture(scope="module") +def parameters(cloudformation_outputs): + parameters = dict(postgresql={}, mysql={}, redshift={}) + parameters["postgresql"]["host"] = cloudformation_outputs["PostgresqlAddress"] + parameters["postgresql"]["port"] = 3306 + parameters["postgresql"]["schema"] = "public" + parameters["postgresql"]["database"] = "postgres" + parameters["mysql"]["host"] = cloudformation_outputs["MysqlAddress"] + parameters["mysql"]["port"] = 3306 + parameters["mysql"]["schema"] = "test" + parameters["mysql"]["database"] = "test" + parameters["redshift"]["host"] = cloudformation_outputs["RedshiftAddress"] + parameters["redshift"]["port"] = cloudformation_outputs["RedshiftPort"] + parameters["redshift"]["identifier"] = cloudformation_outputs["RedshiftIdentifier"] + parameters["redshift"]["schema"] = "public" + parameters["redshift"]["database"] = "test" + parameters["redshift"]["role"] = cloudformation_outputs["RedshiftRole"] + parameters["password"] = cloudformation_outputs["DatabasesPassword"] + parameters["user"] = "test" + yield parameters + + +@pytest.mark.parametrize("db_type, chunksize", [ + ("mysql", None), + ("redshift", None), + ("postgresql", None), + ("mysql", 1), + ("redshift", 1), + ("postgresql", 1), +]) +def test_torch_sql(parameters, db_type, chunksize): + schema = parameters[db_type]["schema"] + table = "test_torch_sql" + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None + ) + ds = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize)) + assert torch.all(ds[0].eq(torch.tensor([1.0, 4.0]))) + assert torch.all(ds[1].eq(torch.tensor([2.0, 5.0]))) + assert torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) + + +def test_torch_sql(parameters, db_type, chunksize): + schema = parameters[db_type]["schema"] + table = "test_torch_sql" + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None + ) + From 72c739c905f3a34545ffc71da7693ff4baf029c1 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 19 Apr 2020 18:58:13 -0300 Subject: [PATCH 05/59] add audio and image dataset --- awswrangler/s3.py | 4 +- awswrangler/torch.py | 169 ++++++++++++++++++++----- testing/test_awswrangler/test_torch.py | 32 +++-- 3 files changed, 159 insertions(+), 46 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 157607d8c..f2f869ac2 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -120,6 +120,8 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona S3 path (e.g. s3://bucket/prefix). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ suffix: str, optional + Suffix for filtering S3 keys Returns ------- @@ -160,7 +162,7 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona return paths -def _path2list(path: Union[str, List[str]], boto3_session: Optional[boto3.Session], suffix: Optional[str] = None) -> List[str]: +def _path2list(path: object, boto3_session: boto3.Session, suffix: str = None) -> List[str]: if isinstance(path, str): # prefix paths: List[str] = list_objects(path=path, boto3_session=boto3_session) elif isinstance(path, list): diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 7d84be981..a5b6497d8 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,5 +1,6 @@ """PyTorch Module.""" +import re import logging from io import BytesIO from typing import Optional, Union, List @@ -9,8 +10,7 @@ import boto3 # type: ignore import torch from torch.utils.data.dataset import Dataset, IterableDataset -from PIL import Image -from torchvision.transforms.functional import to_tensor + from awswrangler import db, s3, _utils @@ -18,34 +18,29 @@ class _BaseS3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset. + """PyTorch Map-Style S3 Dataset.""" - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): + """PyTorch Map-Style S3 Dataset. - Returns - ------- - torch.utils.data.Dataset + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> label_fn = lambda path: path.split[0][-2] - >>> ds = wr.torch.S3Dataset('s3://bucket/path', label_fn, boto3.Session()) + Returns + ------- + torch.utils.data.Dataset - """ - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): + """ super().__init__() self.session = _utils.ensure_session(session=boto3_session) self.paths: List[str] = s3._path2list( path=path, suffix=suffix, - boto3_session=self.session + boto3_session=self.session, ) def __getitem__(self, index): @@ -66,28 +61,139 @@ def _fetch_obj(self, path): def parser_fn(self, obj): pass - def label_fn(self, obj): + def label_fn(self, path): pass -class ImageS3Dataset(Dataset): +class _S3PartitionedDataset(_BaseS3Dataset): + + def label_fn(self, path): + return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) + + +class AudioS3Dataset(_S3PartitionedDataset): + + def __init__(self): + """PyTorch S3 Audio Dataset. + + Assumes audio files are stored with the following structure: + + bucket + ├── class=0 + │ ├── audio0.wav + │ └── audio1.wav + └── class=1 + ├── audio2.wav + └── audio3.wav + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(AudioS3Dataset, self).__init__() + import torchaudio + + def parser_fn(self, obj): + waveform, sample_rate = torchaudio.load(obj) + return waveform, sample_rate + + +class LambdaS3Dataset(_BaseS3Dataset): + """PyTorch S3 Audio Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> parse_fn = lambda x: torch.tensor(x) + >>> label_fn = lambda x: x.split('.')[-1] + >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) + + """ + def __init__(self, parse_fn, label_fn): + self._parse_fn = parse_fn + self._label_fn = label_fn + + def label_fn(self, path): + return self._label_fn(path) - @staticmethod - def parser_fn(obj): - image = Image.open('YOUR_PATH') + def parse_fn(self, obj): + return self._parse_fn(obj) + + +class ImageS3Dataset(_S3PartitionedDataset): + + def __init__(self): + """PyTorch Image S3 Dataset. + + Assumes Images are stored with the following structure: + + bucket + ├── class=0 + │ ├── img0.jpeg + │ └── img1.jpeg + └── class=1 + ├── img2.jpeg + └── img3.jpeg + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(ImageS3Dataset, self).__init__() + from PIL import Image + from torchvision.transforms.functional import to_tensor + + def parser_fn(self, obj): + image = Image.open(obj) tensor = to_tensor(image) tensor.unsqueeze_(0) return tensor - @staticmethod - def label_fn(obj): - pass - class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" - def __init__(self, sql: str, con: sqlalchemy.engine.Engine, chunksize: Optional[int] = None,): + def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[str], chunksize: Optional[int] = None,): """Pytorch Iterable SQL Dataset. Support for **Redshift**, **PostgreSQL** and **MySQL**. 
@@ -114,6 +220,7 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine, chunksize: Optional[ super().__init__() self.sql = sql self.con = con + self.label_col = label_col self.chunksize = chunksize def __iter__(self): diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 4725f1ea4..f30736c16 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -81,19 +81,23 @@ def test_torch_sql(parameters, db_type, chunksize): assert torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) -def test_torch_sql(parameters, db_type, chunksize): - schema = parameters[db_type]["schema"] - table = "test_torch_sql" - engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") - wr.db.to_sql( - df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), - con=engine, - name=table, - schema=schema, - if_exists="replace", - index=False, - index_label=None, - chunksize=None, - method=None +def test_torch_image_s3(bucket): + s3 = boto3.client('s3') + ref_label = 0 + s3.put_object( + Body=open("../../docs/source/_static/logo.png"), + Bucket=bucket, + Key=f'class={ref_label}/logo.png', ) + ds = wr.torch.ImageS3Dataset() + for image, label in ds: + assert image.shape == torch.Size([1, 28, 28]) + assert label == torch.int(ref_label) + break + +# def test_torch_audio_s3(bucket): +# ds = wr.torch.AudioS3Dataset() +# for image, label in ds: +# assert image.shape == torch.Size([1, 28, 28]) +# break \ No newline at end of file From f72810ec53fb33a60df0b5c97fb5ab8059317f81 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 01:07:55 -0300 Subject: [PATCH 06/59] Add label_col to torch.SQLDataset --- awswrangler/db.py | 63 ++--- awswrangler/s3.py | 2 +- awswrangler/torch.py | 313 ++++++++++++++----------- testing/test_awswrangler/test_torch.py | 52 ++-- 4 files changed, 235 insertions(+), 195 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index 78979787c..e69739433 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -6,7 +6,6 @@ from urllib.parse import quote_plus import boto3 # type: ignore -import numpy as np # type: ignore import pandas as pd # type: ignore import pyarrow as pa # type: ignore import sqlalchemy # type: ignore @@ -156,50 +155,15 @@ def read_sql_query( ... ) """ - return _read_sql_query( - fn=_records2df, sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype - ) - - -def _read_sql_query( - fn: Callable, - sql: str, - con: sqlalchemy.engine.Engine, - index_col: Optional[Union[str, List[str]]] = None, - params: Optional[Union[List, Tuple, Dict]] = None, - chunksize: Optional[int] = None, - dtype: Optional[Dict[str, pa.DataType]] = None, -): - if not isinstance(con, sqlalchemy.engine.Engine): # pragma: no cover - raise exceptions.InvalidConnection( - "Invalid 'con' argument, please pass a " - "SQLAlchemy Engine. 
Use wr.db.get_engine(), " - "wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()" - ) + _validate_engine(con=con) with con.connect() as _con: args = _convert_params(sql, params) cursor = _con.execute(*args) if chunksize is None: - return fn(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) - return _iterate_cursor(fn=fn, cursor=cursor, chunksize=chunksize, index=index_col, dtype=dtype) - - -def _iterate_cursor( - fn: Callable, - cursor, - chunksize: int, - index: Optional[Union[str, List[str]]], - dtype: Optional[Dict[str, pa.DataType]] = None, -) -> Iterator[Any]: - while True: - records = cursor.fetchmany(chunksize) - if not records: - break - yield fn(records=records, cols_names=cursor.keys(), index=index, dtype=dtype) - - -def _records2numpy(records: List[Tuple[Any]], **kwargs) -> Iterator[np.ndarray]: # pylint: disable=unused-argument - return np.array(records, dtype=float) + return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) + return _iterate_cursor( + fn=_records2df, cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype + ) def _records2df( @@ -229,6 +193,14 @@ def _records2df( return df +def _iterate_cursor(fn: Callable, cursor: Any, chunksize: int, **kwargs) -> Iterator[Any]: + while True: + records = cursor.fetchmany(chunksize) + if not records: + break + yield fn(records=records, **kwargs) + + def _convert_params(sql: str, params: Optional[Union[List, Tuple, Dict]]) -> List[Any]: args: List[Any] = [sql] if params is not None: @@ -1109,3 +1081,12 @@ def unload_redshift_to_files( paths = [x[0].replace(" ", "") for x in _con.execute(sql).fetchall()] _logger.debug(f"paths: {paths}") return paths + + +def _validate_engine(con: sqlalchemy.engine.Engine) -> None: # pragma: no cover + if not isinstance(con, sqlalchemy.engine.Engine): + raise exceptions.InvalidConnection( + "Invalid 'con' argument, please pass a " + "SQLAlchemy Engine. Use wr.db.get_engine(), " + "wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()" + ) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f2f869ac2..c083d52c5 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -121,7 +121,7 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. suffix: str, optional - Suffix for filtering S3 keys + Suffix for filtering S3 keys. Returns ------- diff --git a/awswrangler/torch.py b/awswrangler/torch.py index a5b6497d8..b27422750 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -3,16 +3,15 @@ import re import logging from io import BytesIO -from typing import Optional, Union, List +from typing import Any, Iterator, List, Optional, Tuple, Union -import sqlalchemy # type: ignore import numpy as np # type: ignore +import sqlalchemy # type: ignore import boto3 # type: ignore import torch from torch.utils.data.dataset import Dataset, IterableDataset - -from awswrangler import db, s3, _utils +from awswrangler import db, _utils, s3 _logger: logging.Logger = logging.getLogger(__name__) @@ -71,129 +70,135 @@ def label_fn(self, path): return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) -class AudioS3Dataset(_S3PartitionedDataset): - - def __init__(self): - """PyTorch S3 Audio Dataset. 
- - Assumes audio files are stored with the following structure: - - bucket - ├── class=0 - │ ├── audio0.wav - │ └── audio1.wav - └── class=1 - ├── audio2.wav - └── audio3.wav - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) - - """ - super(AudioS3Dataset, self).__init__() - import torchaudio - - def parser_fn(self, obj): - waveform, sample_rate = torchaudio.load(obj) - return waveform, sample_rate - - -class LambdaS3Dataset(_BaseS3Dataset): - """PyTorch S3 Audio Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> parse_fn = lambda x: torch.tensor(x) - >>> label_fn = lambda x: x.split('.')[-1] - >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) - - """ - def __init__(self, parse_fn, label_fn): - self._parse_fn = parse_fn - self._label_fn = label_fn - - def label_fn(self, path): - return self._label_fn(path) - - def parse_fn(self, obj): - return self._parse_fn(obj) - - -class ImageS3Dataset(_S3PartitionedDataset): - - def __init__(self): - """PyTorch Image S3 Dataset. - - Assumes Images are stored with the following structure: - - bucket - ├── class=0 - │ ├── img0.jpeg - │ └── img1.jpeg - └── class=1 - ├── img2.jpeg - └── img3.jpeg - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - Examples - -------- - >>> import awswrangler as wr - >>> import boto3 - >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) - - """ - super(ImageS3Dataset, self).__init__() - from PIL import Image - from torchvision.transforms.functional import to_tensor - - def parser_fn(self, obj): - image = Image.open(obj) - tensor = to_tensor(image) - tensor.unsqueeze_(0) - return tensor +# class AudioS3Dataset(_S3PartitionedDataset): +# +# def __init__(self): +# """PyTorch S3 Audio Dataset. +# +# Assumes audio files are stored with the following structure: +# +# bucket +# ├── class=0 +# │ ├── audio0.wav +# │ └── audio1.wav +# └── class=1 +# ├── audio2.wav +# └── audio3.wav +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) +# +# """ +# super(AudioS3Dataset, self).__init__() +# import torchaudio +# +# def parser_fn(self, obj): +# waveform, sample_rate = torchaudio.load(obj) +# return waveform, sample_rate + + +# class LambdaS3Dataset(_BaseS3Dataset): +# """PyTorch S3 Audio Dataset. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> parse_fn = lambda x: torch.tensor(x) +# >>> label_fn = lambda x: x.split('.')[-1] +# >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) +# +# """ +# def __init__(self, parse_fn, label_fn): +# self._parse_fn = parse_fn +# self._label_fn = label_fn +# +# def label_fn(self, path): +# return self._label_fn(path) +# +# def parse_fn(self, obj): +# return self._parse_fn(obj) +# +# +# class ImageS3Dataset(_S3PartitionedDataset): +# +# def __init__(self): +# """PyTorch Image S3 Dataset. +# +# Assumes Images are stored with the following structure: +# +# bucket +# ├── class=0 +# │ ├── img0.jpeg +# │ └── img1.jpeg +# └── class=1 +# ├── img2.jpeg +# └── img3.jpeg +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# Examples +# -------- +# >>> import awswrangler as wr +# >>> import boto3 +# >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) +# +# """ +# super(ImageS3Dataset, self).__init__() +# from PIL import Image +# from torchvision.transforms.functional import to_tensor +# +# def parser_fn(self, obj): +# image = Image.open(obj) +# tensor = to_tensor(image) +# tensor.unsqueeze_(0) +# return tensor class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" - def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[str], chunksize: Optional[int] = None,): + def __init__( + self, + sql: str, + con: sqlalchemy.engine.Engine, + label_col: Optional[Union[int, str]] = None, + chunksize: Optional[int] = None, + ): """Pytorch Iterable SQL Dataset. Support for **Redshift**, **PostgreSQL** and **MySQL**. @@ -205,6 +210,8 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[ con : sqlalchemy.engine.Engine SQLAlchemy Engine. 
Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() + label_col : int, optional + Label column number Returns ------- @@ -218,21 +225,53 @@ def __init__(self, sql: str, con: sqlalchemy.engine.Engine, label_col: Optional[ """ super().__init__() - self.sql = sql - self.con = con - self.label_col = label_col - self.chunksize = chunksize + self._sql = sql + self._con = con + self._label_col = label_col + self._chunksize = chunksize - def __iter__(self): + def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: """Iterate over the Dataset.""" - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: # single-process data loading, return the full iterator - pass - else: # in a worker process + if torch.utils.data.get_worker_info() is not None: # type: ignore raise NotImplementedError() - ret = db._read_sql_query(fn=db._records2numpy, sql=self.sql, con=self.con, chunksize=self.chunksize) - if isinstance(ret, np.ndarray): - ret = [ret] - for ds in ret: - for row in ds: - yield torch.as_tensor(row, dtype=torch.float) + db._validate_engine(con=self._con) + with self._con.connect() as con: + cursor: Any = con.execute(self._sql) + if (self._label_col is not None) and isinstance(self._label_col, str): + label_col: Optional[int] = list(cursor.keys()).index(self._label_col) + else: + label_col = self._label_col + _logger.debug(f"label_col: {label_col}") + return self._records2tensor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) + + @staticmethod + def _records2tensor( + cursor: Any, chunksize: Optional[int] = None, label_col: Optional[int] = None + ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: # pylint: disable=unused-argument + chunks: Iterator[Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]] + if chunksize is None: + chunks = iter([SQLDataset._records2numpy(records=cursor.fetchall(), label_col=label_col)]) + else: + chunks = db._iterate_cursor( # pylint: disable=protected-access + fn=SQLDataset._records2numpy, cursor=cursor, chunksize=chunksize, label_col=label_col + ) + if label_col is None: + for data in chunks: + for data_row in data: + yield torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member + for data, label in chunks: + for data_row, label_row in zip(data, label): + ts_data: torch.Tensor = torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member + ts_label: torch.Tensor = torch.as_tensor(label_row, dtype=torch.float) # pylint: disable=no-member + yield ts_data, ts_label + + @staticmethod + def _records2numpy( + records: List[Tuple[Any]], label_col: Optional[int] = None + ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # pylint: disable=unused-argument + arr: np.ndarray = np.array(records, dtype=np.float) + if label_col is None: + return arr + data: np.ndarray = np.concatenate([arr[:, :label_col], arr[:, (label_col + 1) :]], axis=1) # noqa: E203 + label: np.ndarray = arr[:, label_col] + return data, label diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index f30736c16..d39ec8ddb 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -52,14 +52,10 @@ def parameters(cloudformation_outputs): yield parameters -@pytest.mark.parametrize("db_type, chunksize", [ - ("mysql", None), - ("redshift", None), - ("postgresql", None), - ("mysql", 1), - ("redshift", 1), - ("postgresql", 1), -]) +@pytest.mark.parametrize( + 
"db_type, chunksize", + [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], +) def test_torch_sql(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] table = "test_torch_sql" @@ -73,7 +69,7 @@ def test_torch_sql(parameters, db_type, chunksize): index=False, index_label=None, chunksize=None, - method=None + method=None, ) ds = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize)) assert torch.all(ds[0].eq(torch.tensor([1.0, 4.0]))) @@ -81,14 +77,38 @@ def test_torch_sql(parameters, db_type, chunksize): assert torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) +@pytest.mark.parametrize( + "db_type, chunksize", + [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], +) +def test_torch_sql_label(parameters, db_type, chunksize): + schema = parameters[db_type]["schema"] + table = "test_torch_sql_label" + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0], "c": [7, 8, 9]}), + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + ) + ts = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize, label_col=2)) + assert torch.all(ts[0][0].eq(torch.tensor([1.0, 4.0]))) + assert torch.all(ts[0][1].eq(torch.tensor([7], dtype=torch.long))) + assert torch.all(ts[1][0].eq(torch.tensor([2.0, 5.0]))) + assert torch.all(ts[1][1].eq(torch.tensor([8], dtype=torch.long))) + assert torch.all(ts[2][0].eq(torch.tensor([3.0, 6.0]))) + assert torch.all(ts[2][1].eq(torch.tensor([9], dtype=torch.long))) + + def test_torch_image_s3(bucket): - s3 = boto3.client('s3') + s3 = boto3.client("s3") ref_label = 0 - s3.put_object( - Body=open("../../docs/source/_static/logo.png"), - Bucket=bucket, - Key=f'class={ref_label}/logo.png', - ) + s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, Key=f"class={ref_label}/logo.png") ds = wr.torch.ImageS3Dataset() for image, label in ds: assert image.shape == torch.Size([1, 28, 28]) @@ -100,4 +120,4 @@ def test_torch_image_s3(bucket): # ds = wr.torch.AudioS3Dataset() # for image, label in ds: # assert image.shape == torch.Size([1, 28, 28]) -# break \ No newline at end of file +# break From bf1be0746d0523d91db1d9181152150ff18c9919 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 09:20:41 -0300 Subject: [PATCH 07/59] Updating catersian product of pytest parameters --- testing/test_awswrangler/test_torch.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index d39ec8ddb..3c08ec319 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -52,10 +52,8 @@ def parameters(cloudformation_outputs): yield parameters -@pytest.mark.parametrize( - "db_type, chunksize", - [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], -) +@pytest.mark.parametrize("chunksize", [None, 1, 10]) +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] table = "test_torch_sql" @@ -77,10 +75,8 @@ def test_torch_sql(parameters, db_type, chunksize): assert 
torch.all(ds[2].eq(torch.tensor([3.0, 6.0]))) -@pytest.mark.parametrize( - "db_type, chunksize", - [("mysql", None), ("redshift", None), ("postgresql", None), ("mysql", 1), ("redshift", 1), ("postgresql", 1)], -) +@pytest.mark.parametrize("chunksize", [None, 1, 10]) +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql_label(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] table = "test_torch_sql_label" From 1a41d1887217312298f2fab4f32e156fffb7e8d5 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 12:37:21 -0300 Subject: [PATCH 08/59] Pivoting SQLDataset parser strategy to avoid cast losses. --- awswrangler/db.py | 14 +- awswrangler/torch.py | 169 ++++++++++++------------- testing/test_awswrangler/test_torch.py | 18 +-- 3 files changed, 100 insertions(+), 101 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index e69739433..5d16301ad 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -2,7 +2,7 @@ import json import logging -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import quote_plus import boto3 # type: ignore @@ -162,7 +162,7 @@ def read_sql_query( if chunksize is None: return _records2df(records=cursor.fetchall(), cols_names=cursor.keys(), index=index_col, dtype=dtype) return _iterate_cursor( - fn=_records2df, cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype + cursor=cursor, chunksize=chunksize, cols_names=cursor.keys(), index=index_col, dtype=dtype ) @@ -193,12 +193,18 @@ def _records2df( return df -def _iterate_cursor(fn: Callable, cursor: Any, chunksize: int, **kwargs) -> Iterator[Any]: +def _iterate_cursor( + cursor: Any, + chunksize: int, + cols_names: List[str], + index: Optional[Union[str, List[str]]], + dtype: Optional[Dict[str, pa.DataType]] = None, +) -> Iterator[pd.DataFrame]: while True: records = cursor.fetchmany(chunksize) if not records: break - yield fn(records=records, **kwargs) + yield _records2df(records=records, cols_names=cols_names, index=index, dtype=dtype) def _convert_params(sql: str, params: Optional[Union[List, Tuple, Dict]]) -> List[Any]: diff --git a/awswrangler/torch.py b/awswrangler/torch.py index b27422750..a73f4d198 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,79 +1,75 @@ """PyTorch Module.""" -import re import logging -from io import BytesIO +# import re +# from io import BytesIO from typing import Any, Iterator, List, Optional, Tuple, Union +# import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore -import boto3 # type: ignore import torch -from torch.utils.data.dataset import Dataset, IterableDataset +from torch.utils.data.dataset import IterableDataset -from awswrangler import db, _utils, s3 +from awswrangler import db # , s3, _utils _logger: logging.Logger = logging.getLogger(__name__) -class _BaseS3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset.""" - - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): - """PyTorch Map-Style S3 Dataset. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
- - Returns - ------- - torch.utils.data.Dataset - - """ - super().__init__() - self.session = _utils.ensure_session(session=boto3_session) - self.paths: List[str] = s3._path2list( - path=path, - suffix=suffix, - boto3_session=self.session, - ) - - def __getitem__(self, index): - path = self.paths[index] - obj = self._fetch_obj(path) - return [self.parser_fn(obj), self.label_fn(path)] - - def __len__(self): - return len(self.paths) - - def _fetch_obj(self, path): - bucket, key = _utils.parse_path(path=path) - buff = BytesIO() - client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) - client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) - return buff.seek(0) - - def parser_fn(self, obj): - pass - - def label_fn(self, path): - pass - - -class _S3PartitionedDataset(_BaseS3Dataset): - - def label_fn(self, path): - return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) +# class _BaseS3Dataset(Dataset): +# """PyTorch Map-Style S3 Dataset.""" +# +# def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): +# """Pytorch Map-Style S3 Dataset. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) +# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# """ +# super().__init__() +# self.session = _utils.ensure_session(session=boto3_session) +# self.paths: List[str] = s3._path2list(path=path, suffix=suffix, boto3_session=self.session) +# +# def __getitem__(self, index): +# path = self.paths[index] +# obj = self._fetch_obj(path) +# return [self.parser_fn(obj), self.label_fn(path)] +# +# def __len__(self): +# return len(self.paths) +# +# def _fetch_obj(self, path): +# bucket, key = _utils.parse_path(path=path) +# buff = BytesIO() +# client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) +# client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) +# return buff.seek(0) +# +# def parser_fn(self, obj): +# pass +# +# def label_fn(self, path): +# pass +# +# +# class _S3PartitionedDataset(_BaseS3Dataset): +# def label_fn(self, path): +# return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) # class AudioS3Dataset(_S3PartitionedDataset): # # def __init__(self): -# """PyTorch S3 Audio Dataset. +# """Pytorch S3 Audio Dataset. # # Assumes audio files are stored with the following structure: # @@ -88,7 +84,8 @@ def label_fn(self, path): # Parameters # ---------- # path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# S3 prefix (e.g. s3://bucket/prefix) +# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). # boto3_session : boto3.Session(), optional # Boto3 Session. The default boto3 session will be used if boto3_session receive None. # @@ -163,7 +160,8 @@ def label_fn(self, path): # Parameters # ---------- # path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# S3 prefix (e.g. s3://bucket/prefix) +# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). # boto3_session : boto3.Session(), optional # Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
# @@ -242,36 +240,31 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, else: label_col = self._label_col _logger.debug(f"label_col: {label_col}") - return self._records2tensor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) + if self._chunksize is None: + return SQLDataset._records2tensor(records=cursor.fetchall(), label_col=label_col) + return self._iterate_cursor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) @staticmethod - def _records2tensor( - cursor: Any, chunksize: Optional[int] = None, label_col: Optional[int] = None - ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: # pylint: disable=unused-argument - chunks: Iterator[Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]] - if chunksize is None: - chunks = iter([SQLDataset._records2numpy(records=cursor.fetchall(), label_col=label_col)]) - else: - chunks = db._iterate_cursor( # pylint: disable=protected-access - fn=SQLDataset._records2numpy, cursor=cursor, chunksize=chunksize, label_col=label_col - ) - if label_col is None: - for data in chunks: - for data_row in data: - yield torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member - for data, label in chunks: - for data_row, label_row in zip(data, label): - ts_data: torch.Tensor = torch.as_tensor(data_row, dtype=torch.float) # pylint: disable=no-member - ts_label: torch.Tensor = torch.as_tensor(label_row, dtype=torch.float) # pylint: disable=no-member - yield ts_data, ts_label + def _iterate_cursor( + cursor: Any, chunksize: int, label_col: Optional[int] = None + ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: + while True: + records = cursor.fetchmany(chunksize) + if not records: + break + yield from SQLDataset._records2tensor(records=records, label_col=label_col) @staticmethod - def _records2numpy( + def _records2tensor( records: List[Tuple[Any]], label_col: Optional[int] = None - ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: # pylint: disable=unused-argument - arr: np.ndarray = np.array(records, dtype=np.float) - if label_col is None: - return arr - data: np.ndarray = np.concatenate([arr[:, :label_col], arr[:, (label_col + 1) :]], axis=1) # noqa: E203 - label: np.ndarray = arr[:, label_col] - return data, label + ) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: # pylint: disable=unused-argument + for row in records: + if label_col is None: + arr_data: np.ndarray = np.array(row, dtype=np.float) + yield torch.as_tensor(arr_data, dtype=torch.float) # pylint: disable=no-member + else: + arr_data = np.array(row[:label_col] + row[label_col + 1 :], dtype=np.float) # noqa: E203 + arr_label: np.ndarray = np.array(row[label_col], dtype=np.long) + ts_data: torch.Tensor = torch.as_tensor(arr_data, dtype=torch.float) # pylint: disable=no-member + ts_label: torch.Tensor = torch.as_tensor(arr_label, dtype=torch.long) # pylint: disable=no-member + yield ts_data, ts_label diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 3c08ec319..456269244 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -101,15 +101,15 @@ def test_torch_sql_label(parameters, db_type, chunksize): assert torch.all(ts[2][1].eq(torch.tensor([9], dtype=torch.long))) -def test_torch_image_s3(bucket): - s3 = boto3.client("s3") - ref_label = 0 - s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, 
Key=f"class={ref_label}/logo.png") - ds = wr.torch.ImageS3Dataset() - for image, label in ds: - assert image.shape == torch.Size([1, 28, 28]) - assert label == torch.int(ref_label) - break +# def test_torch_image_s3(bucket): +# s3 = boto3.client("s3") +# ref_label = 0 +# s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, Key=f"class={ref_label}/logo.png") +# ds = wr.torch.ImageS3Dataset() +# for image, label in ds: +# assert image.shape == torch.Size([1, 28, 28]) +# assert label == torch.int(ref_label) +# break # def test_torch_audio_s3(bucket): From 36c15e48d6afbd9925f2f57f495c82c39ef16171 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Mon, 20 Apr 2020 15:05:09 -0300 Subject: [PATCH 09/59] tested lambda & image datasets --- awswrangler/torch.py | 359 +++++++++++++------------ testing/test_awswrangler/test_torch.py | 82 +++++- 2 files changed, 255 insertions(+), 186 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index a73f4d198..d797193e8 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,190 +1,195 @@ """PyTorch Module.""" +import re import logging -# import re -# from io import BytesIO -from typing import Any, Iterator, List, Optional, Tuple, Union -# import boto3 # type: ignore +import torch # type: ignore +import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore -import torch -from torch.utils.data.dataset import IterableDataset -from awswrangler import db # , s3, _utils +from PIL import Image +from io import BytesIO +from typing import Any, Iterator, List, Optional, Tuple, Union, Callable +from torch.utils.data.dataset import Dataset, IterableDataset +from torchvision.transforms.functional import to_tensor + +from awswrangler import db, s3, _utils _logger: logging.Logger = logging.getLogger(__name__) -# class _BaseS3Dataset(Dataset): -# """PyTorch Map-Style S3 Dataset.""" -# -# def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session): -# """Pytorch Map-Style S3 Dataset. -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) -# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# """ -# super().__init__() -# self.session = _utils.ensure_session(session=boto3_session) -# self.paths: List[str] = s3._path2list(path=path, suffix=suffix, boto3_session=self.session) -# -# def __getitem__(self, index): -# path = self.paths[index] -# obj = self._fetch_obj(path) -# return [self.parser_fn(obj), self.label_fn(path)] -# -# def __len__(self): -# return len(self.paths) -# -# def _fetch_obj(self, path): -# bucket, key = _utils.parse_path(path=path) -# buff = BytesIO() -# client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) -# client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) -# return buff.seek(0) -# -# def parser_fn(self, obj): -# pass -# -# def label_fn(self, path): -# pass -# -# -# class _S3PartitionedDataset(_BaseS3Dataset): -# def label_fn(self, path): -# return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) - - -# class AudioS3Dataset(_S3PartitionedDataset): -# -# def __init__(self): -# """Pytorch S3 Audio Dataset. 
-# -# Assumes audio files are stored with the following structure: -# -# bucket -# ├── class=0 -# │ ├── audio0.wav -# │ └── audio1.wav -# └── class=1 -# ├── audio2.wav -# └── audio3.wav -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) -# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) -# -# """ -# super(AudioS3Dataset, self).__init__() -# import torchaudio -# -# def parser_fn(self, obj): -# waveform, sample_rate = torchaudio.load(obj) -# return waveform, sample_rate - - -# class LambdaS3Dataset(_BaseS3Dataset): -# """PyTorch S3 Audio Dataset. -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> parse_fn = lambda x: torch.tensor(x) -# >>> label_fn = lambda x: x.split('.')[-1] -# >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), parse_fn=parse_fn, label_fn=label_fn) -# -# """ -# def __init__(self, parse_fn, label_fn): -# self._parse_fn = parse_fn -# self._label_fn = label_fn -# -# def label_fn(self, path): -# return self._label_fn(path) -# -# def parse_fn(self, obj): -# return self._parse_fn(obj) -# -# -# class ImageS3Dataset(_S3PartitionedDataset): -# -# def __init__(self): -# """PyTorch Image S3 Dataset. -# -# Assumes Images are stored with the following structure: -# -# bucket -# ├── class=0 -# │ ├── img0.jpeg -# │ └── img1.jpeg -# └── class=1 -# ├── img2.jpeg -# └── img3.jpeg -# -# Parameters -# ---------- -# path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) -# or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). -# boto3_session : boto3.Session(), optional -# Boto3 Session. The default boto3 session will be used if boto3_session receive None. -# -# Returns -# ------- -# torch.utils.data.Dataset -# -# Examples -# -------- -# >>> import awswrangler as wr -# >>> import boto3 -# >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) -# -# """ -# super(ImageS3Dataset, self).__init__() -# from PIL import Image -# from torchvision.transforms.functional import to_tensor -# -# def parser_fn(self, obj): -# image = Image.open(obj) -# tensor = to_tensor(image) -# tensor.unsqueeze_(0) -# return tensor +class _BaseS3Dataset(Dataset): + """PyTorch Map-Style S3 Dataset.""" + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + """PyTorch Map-Style S3 Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + """ + super().__init__() + self.session = _utils.ensure_session(session=boto3_session) + self.paths: List[str] = s3._path2list( # pylint: disable=protected-access + path=path, + suffix=suffix, + boto3_session=self.session, + ) + + def __getitem__(self, index): + path = self.paths[index] + data = self._fetch_data(path) + return [self.data_fn(data), self.label_fn(path)] + + def __len__(self): + return len(self.paths) + + def _fetch_data(self, path): + bucket, key = _utils.parse_path(path=path) + buff = BytesIO() + client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) + client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) + buff.seek(0) + return buff + + def data_fn(self, obj): + pass + + def label_fn(self, path): + pass + + +class _S3PartitionedDataset(_BaseS3Dataset): + + def label_fn(self, path): + return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) + + +class LambdaS3Dataset(_BaseS3Dataset): + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session, data_fn: Callable, label_fn: Callable): + """PyTorch S3 Audio Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> data_fn = lambda x: torch.tensor(x) + >>> label_fn = lambda x: x.split('.')[-1] + >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), data_fn=data_fn, label_fn=label_fn) + + """ + super(LambdaS3Dataset, self).__init__(path, suffix, boto3_session) + self._data_fn = data_fn + self._label_fn = label_fn + + def label_fn(self, path): + return self._label_fn(path) + + def data_fn(self, data): + print(type(data), data) + return self._data_fn(data) + + +class AudioS3Dataset(_S3PartitionedDataset): + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + """PyTorch S3 Audio Dataset. + + Assumes audio files are stored with the following structure: + + bucket + ├── class=0 + │ ├── audio0.wav + │ └── audio1.wav + └── class=1 + ├── audio2.wav + └── audio3.wav + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(AudioS3Dataset, self).__init__(path, suffix, boto3_session) + + def data_fn(self, data): + waveform, sample_rate = torchaudio.load(data) + return waveform, sample_rate + + +class ImageS3Dataset(_S3PartitionedDataset): + + def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + """PyTorch Image S3 Dataset. + + Assumes Images are stored with the following structure: + + bucket + ├── class=0 + │ ├── img0.jpeg + │ └── img1.jpeg + └── class=1 + ├── img2.jpeg + └── img3.jpeg + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. 
s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + Examples + -------- + >>> import awswrangler as wr + >>> import boto3 + >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) + + """ + super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) + + def data_fn(self, data): + image = Image.open(data) + tensor = to_tensor(image) + return tensor class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method @@ -232,7 +237,7 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, """Iterate over the Dataset.""" if torch.utils.data.get_worker_info() is not None: # type: ignore raise NotImplementedError() - db._validate_engine(con=self._con) + db._validate_engine(con=self._con) # pylint: disable=protected-access with self._con.connect() as con: cursor: Any = con.execute(self._sql) if (self._label_col is not None) and isinstance(self._label_col, str): diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 456269244..4f508b31c 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,10 +1,16 @@ import logging +import re import boto3 +import numpy as np import pandas as pd import pytest import torch +from PIL import Image +from torch.utils.data import DataLoader +from torchvision.transforms.functional import to_tensor + import awswrangler as wr logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") @@ -101,16 +107,74 @@ def test_torch_sql_label(parameters, db_type, chunksize): assert torch.all(ts[2][1].eq(torch.tensor([9], dtype=torch.long))) -# def test_torch_image_s3(bucket): -# s3 = boto3.client("s3") -# ref_label = 0 -# s3.put_object(Body=open("../../docs/source/_static/logo.png"), Bucket=bucket, Key=f"class={ref_label}/logo.png") -# ds = wr.torch.ImageS3Dataset() -# for image, label in ds: -# assert image.shape == torch.Size([1, 28, 28]) -# assert label == torch.int(ref_label) -# break +def test_torch_image_s3(bucket): + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + s3 = boto3.client("s3") + ref_label = 0 + s3.put_object( + Body=open("../../docs/source/_static/logo.png", "rb").read(), + Bucket=bucket, + Key=f"class={ref_label}/logo.png", + ContentType="image/png", + ) + ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + image, label = ds[0] + assert image.shape == torch.Size([4, 494, 1636]) + assert label == torch.tensor(ref_label, dtype=torch.int) + + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + +def test_torch_image_s3_dataloader(bucket): + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + s3 = boto3.client("s3") + labels = np.random.randint(0, 4, size=(8,)) + for i, label in enumerate(labels): + s3.put_object( + Body=open("../../docs/source/_static/logo.png", "rb").read(), + Bucket=bucket, + Key=f"class={label}/logo{i}.png", + ContentType="image/png", + ) + ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + batch_size = 2 + num_train = len(ds) + indices = list(range(num_train)) + loader = DataLoader( + ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) + ) 
+ for i, (image, label) in enumerate(loader): + assert image.shape == torch.Size([batch_size, 4, 494, 1636]) + assert label.dtype == torch.int64 + + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + +def test_torch_lambda_s3(bucket): + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + + s3 = boto3.client("s3") + ref_label = 0 + s3.put_object( + Body=open("../../docs/source/_static/logo.png", "rb").read(), + Bucket=bucket, + Key=f"class={ref_label}/logo.png", + ContentType="image/png", + ) + ds = wr.torch.LambdaS3Dataset( + path=bucket, + suffix="png", + boto3_session=boto3.Session(), + data_fn=lambda x: to_tensor(Image.open(x)), + label_fn=lambda x: int(re.findall(r'/class=(.*?)/', x)[-1]), + ) + image, label = ds[0] + assert image.shape == torch.Size([4, 494, 1636]) + assert label == torch.tensor(ref_label, dtype=torch.int) + wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) # def test_torch_audio_s3(bucket): # ds = wr.torch.AudioS3Dataset() From d4dcfc521f1f6cc8c0fdf1de485a7c29b8667cae Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Mon, 20 Apr 2020 15:35:11 -0300 Subject: [PATCH 10/59] add audio test --- awswrangler/torch.py | 3 ++- testing/test_awswrangler/test_torch.py | 29 +++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index d797193e8..4a6a76567 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,5 +1,4 @@ """PyTorch Module.""" - import re import logging @@ -7,6 +6,7 @@ import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore +import torchaudio from PIL import Image from io import BytesIO @@ -147,6 +147,7 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto super(AudioS3Dataset, self).__init__(path, suffix, boto3_session) def data_fn(self, data): + waveform, sample_rate = torchaudio.load(data) return waveform, sample_rate diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 4f508b31c..a54797440 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -138,12 +138,19 @@ def test_torch_image_s3_dataloader(bucket): Key=f"class={label}/logo{i}.png", ContentType="image/png", ) - ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + ds = wr.torch.ImageS3Dataset( + path=bucket, + suffix="png", + boto3_session=boto3.Session(), + ) batch_size = 2 num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) + ds, + batch_size=batch_size, + num_workers=4, + sampler=torch.utils.data.sampler.RandomSampler(indices), ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) @@ -176,8 +183,16 @@ def test_torch_lambda_s3(bucket): wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) -# def test_torch_audio_s3(bucket): -# ds = wr.torch.AudioS3Dataset() -# for image, label in ds: -# assert image.shape == torch.Size([1, 28, 28]) -# break + +def test_torch_audio_s3(bucket): + ds = wr.torch.AudioS3Dataset( + path="s3://multimedia-commons/data/videos/mp4/006/039/006039642c984a788569c7fea33ef3.mp4", + suffix="png", + boto3_session=boto3.Session(), + ) + loader = DataLoader( + ds, + batch_size=1, + ) + for image, label in loader: + assert image.shape == torch.Size([1, 28, 28]) From 
30dc2fa5b275c04b5d94dc9799e9653ab479f65e Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 20 Apr 2020 18:22:50 -0300 Subject: [PATCH 11/59] Add test for torch.AudioS3Dataset --- .pylintrc | 3 +- awswrangler/torch.py | 111 +++++++++++++++---------- pytest.ini | 2 +- requirements-dev.txt | 3 +- testing/run-validations.sh | 2 +- testing/test_awswrangler/test_torch.py | 82 +++++++++--------- 6 files changed, 114 insertions(+), 89 deletions(-) diff --git a/.pylintrc b/.pylintrc index 132ce213a..4f41cb3fb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -141,7 +141,8 @@ disable=print-statement, comprehension-escape, C0330, C0103, - W1202 + W1202, + too-few-public-methods # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 4a6a76567..db09abc46 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,20 +1,21 @@ """PyTorch Module.""" -import re import logging +import os +import pathlib +import re +from io import BytesIO +from typing import Any, Callable, Iterator, List, Optional, Tuple, Union -import torch # type: ignore import boto3 # type: ignore import numpy as np # type: ignore import sqlalchemy # type: ignore -import torchaudio - -from PIL import Image -from io import BytesIO -from typing import Any, Iterator, List, Optional, Tuple, Union, Callable -from torch.utils.data.dataset import Dataset, IterableDataset -from torchvision.transforms.functional import to_tensor +import torch # type: ignore +import torchaudio # type: ignore +from PIL import Image # type: ignore +from torch.utils.data.dataset import Dataset, IterableDataset # type: ignore +from torchvision.transforms.functional import to_tensor # type: ignore -from awswrangler import db, s3, _utils +from awswrangler import _utils, db, s3 _logger: logging.Logger = logging.getLogger(__name__) @@ -22,7 +23,9 @@ class _BaseS3Dataset(Dataset): """PyTorch Map-Style S3 Dataset.""" - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + def __init__( + self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + ): """PyTorch Map-Style S3 Dataset. 
Parameters @@ -38,46 +41,51 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super().__init__() - self.session = _utils.ensure_session(session=boto3_session) - self.paths: List[str] = s3._path2list( # pylint: disable=protected-access - path=path, - suffix=suffix, - boto3_session=self.session, + self._session = _utils.ensure_session(session=boto3_session) + self._paths: List[str] = s3._path2list( # pylint: disable=protected-access + path=path, suffix=suffix, boto3_session=self._session ) def __getitem__(self, index): - path = self.paths[index] + path = self._paths[index] data = self._fetch_data(path) - return [self.data_fn(data), self.label_fn(path)] + return [self._data_fn(data), self._label_fn(path)] def __len__(self): - return len(self.paths) + return len(self._paths) def _fetch_data(self, path): bucket, key = _utils.parse_path(path=path) buff = BytesIO() - client_s3: boto3.client = _utils.client(service_name="s3", session=self.session) + client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) buff.seek(0) return buff - def data_fn(self, obj): + def _data_fn(self, data): pass - def label_fn(self, path): + def _label_fn(self, path: str): pass class _S3PartitionedDataset(_BaseS3Dataset): - - def label_fn(self, path): - return int(re.findall(r'/(.*?)=(.*?)/', path)[-1][1]) + def _label_fn(self, path: str): + return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) class LambdaS3Dataset(_BaseS3Dataset): + """PyTorch S3 Lambda Dataset.""" - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session, data_fn: Callable, label_fn: Callable): - """PyTorch S3 Audio Dataset. + def __init__( + self, + path: Union[str, List[str]], + data_fn: Callable, + label_fn: Callable, + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + ): + """PyTorch S3 Lambda Dataset. Parameters ---------- @@ -94,26 +102,33 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto -------- >>> import awswrangler as wr >>> import boto3 - >>> data_fn = lambda x: torch.tensor(x) - >>> label_fn = lambda x: x.split('.')[-1] - >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), data_fn=data_fn, label_fn=label_fn) + >>> _data_fn = lambda x: torch.tensor(x) + >>> _label_fn = lambda x: x.split('.')[-1] + >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), _data_fn=_data_fn, _label_fn=_label_fn) """ super(LambdaS3Dataset, self).__init__(path, suffix, boto3_session) - self._data_fn = data_fn - self._label_fn = label_fn + self._data_func = data_fn + self._label_func = label_fn - def label_fn(self, path): - return self._label_fn(path) + def _label_fn(self, path: str): + return self._label_func(path) - def data_fn(self, data): - print(type(data), data) - return self._data_fn(data) + def _data_fn(self, data): + print(type(data)) + return self._data_func(data) class AudioS3Dataset(_S3PartitionedDataset): + """PyTorch S3 Audio Dataset.""" - def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): + def __init__( + self, + path: Union[str, List[str]], + cache_dir: str = "/tmp/", + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + ): """PyTorch S3 Audio Dataset. 
Assumes audio files are stored with the following structure: @@ -145,17 +160,27 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super(AudioS3Dataset, self).__init__(path, suffix, boto3_session) + self._cache_dir: str = cache_dir[:-1] if cache_dir.endswith("/") else cache_dir - def data_fn(self, data): - - waveform, sample_rate = torchaudio.load(data) + def _data_fn(self, filename: str) -> Tuple[Any, Any]: # pylint: disable=arguments-differ + waveform, sample_rate = torchaudio.load(filename) + os.remove(path=filename) return waveform, sample_rate + def _fetch_data(self, path: str) -> str: + bucket, key = _utils.parse_path(path=path) + filename: str = f"{self._cache_dir}/{bucket}/{key}" + pathlib.Path(filename).parent.mkdir(parents=True, exist_ok=True) + client_s3 = _utils.client(service_name="s3", session=self._session) + client_s3.download_file(Bucket=bucket, Key=key, Filename=filename) + return filename + class ImageS3Dataset(_S3PartitionedDataset): + """PyTorch S3 Image Dataset.""" def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): - """PyTorch Image S3 Dataset. + """PyTorch S3 Image Dataset. Assumes Images are stored with the following structure: @@ -187,7 +212,7 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) - def data_fn(self, data): + def _data_fn(self, data): image = Image.open(data) tensor = to_tensor(image) return tensor diff --git a/pytest.ini b/pytest.ini index d233cbf74..8e7a47ef1 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] addopts = --verbose - --capture=no + --capture=fd filterwarnings = ignore::DeprecationWarning ignore::UserWarning \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 137f57383..0491e8789 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -19,4 +19,5 @@ sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 torch~=1.4.0 -torchvision~=0.5.0 \ No newline at end of file +torchvision~=0.5.0 +torchaudio~=0.4.0 \ No newline at end of file diff --git a/testing/run-validations.sh b/testing/run-validations.sh index 966038ec9..d32fc7808 100755 --- a/testing/run-validations.sh +++ b/testing/run-validations.sh @@ -9,7 +9,7 @@ mv temp.yaml cloudformation.yaml pushd .. 
black --line-length 120 --target-version py36 awswrangler testing/test_awswrangler isort -rc --line-width 120 awswrangler testing/test_awswrangler -pydocstyle awswrangler/ --add-ignore=D204 +pydocstyle awswrangler/ --add-ignore=D204,D403 mypy awswrangler flake8 setup.py awswrangler testing/test_awswrangler pylint -j 0 awswrangler diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index a54797440..5b7a84b38 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,12 +1,12 @@ import logging - import re + import boto3 import numpy as np import pandas as pd import pytest import torch - +import torchaudio from PIL import Image from torch.utils.data import DataLoader from torchvision.transforms.functional import to_tensor @@ -108,91 +108,89 @@ def test_torch_sql_label(parameters, db_type, chunksize): def test_torch_image_s3(bucket): - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) - + path = f"s3://{bucket}/test_torch_image_s3/" + wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) s3 = boto3.client("s3") ref_label = 0 s3.put_object( - Body=open("../../docs/source/_static/logo.png", "rb").read(), + Body=open("docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"class={ref_label}/logo.png", + Key=f"test_torch_image_s3/class={ref_label}/logo.png", ContentType="image/png", ) - ds = wr.torch.ImageS3Dataset(path=bucket, suffix="png", boto3_session=boto3.Session()) + ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) image, label = ds[0] assert image.shape == torch.Size([4, 494, 1636]) assert label == torch.tensor(ref_label, dtype=torch.int) - - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + wr.s3.delete_objects(path=path) def test_torch_image_s3_dataloader(bucket): - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) - + path = f"s3://{bucket}/test_torch_image_s3_dataloader/" + wr.s3.delete_objects(path=path) s3 = boto3.client("s3") labels = np.random.randint(0, 4, size=(8,)) for i, label in enumerate(labels): s3.put_object( - Body=open("../../docs/source/_static/logo.png", "rb").read(), + Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"class={label}/logo{i}.png", + Key=f"test_torch_image_s3_dataloader/class={label}/logo{i}.png", ContentType="image/png", ) - ds = wr.torch.ImageS3Dataset( - path=bucket, - suffix="png", - boto3_session=boto3.Session(), - ) + ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) batch_size = 2 num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, - batch_size=batch_size, - num_workers=4, - sampler=torch.utils.data.sampler.RandomSampler(indices), + ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) assert label.dtype == torch.int64 - - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + wr.s3.delete_objects(path=path) def test_torch_lambda_s3(bucket): - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) - + path = f"s3://{bucket}/test_torch_lambda_s3/" + wr.s3.delete_objects(path=path) s3 = boto3.client("s3") ref_label = 0 s3.put_object( - Body=open("../../docs/source/_static/logo.png", "rb").read(), + Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - 
Key=f"class={ref_label}/logo.png", + Key=f"test_torch_lambda_s3/class={ref_label}/logo.png", ContentType="image/png", ) ds = wr.torch.LambdaS3Dataset( - path=bucket, + path=path, suffix="png", boto3_session=boto3.Session(), data_fn=lambda x: to_tensor(Image.open(x)), - label_fn=lambda x: int(re.findall(r'/class=(.*?)/', x)[-1]), + label_fn=lambda x: int(re.findall(r"/class=(.*?)/", x)[-1]), ) image, label = ds[0] assert image.shape == torch.Size([4, 494, 1636]) assert label == torch.tensor(ref_label, dtype=torch.int) - - wr.s3.delete_objects(path=bucket, boto3_session=boto3.Session()) + wr.s3.delete_objects(path=path) def test_torch_audio_s3(bucket): - ds = wr.torch.AudioS3Dataset( - path="s3://multimedia-commons/data/videos/mp4/006/039/006039642c984a788569c7fea33ef3.mp4", - suffix="png", - boto3_session=boto3.Session(), - ) - loader = DataLoader( - ds, - batch_size=1, + size = (1, 8_000 * 5) + audio = torch.randint(low=-25, high=25, size=size) / 100.0 + audio_file = "/tmp/amazing_sound.wav" + torchaudio.save(audio_file, audio, 8_000) + path = f"s3://{bucket}/test_torch_audio_s3/" + wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) + s3 = boto3.client("s3") + ref_label = 0 + s3.put_object( + Body=open(audio_file, "rb").read(), + Bucket=bucket, + Key=f"test_torch_audio_s3/class={ref_label}/amazing_sound.wav", + ContentType="audio/wav", ) - for image, label in loader: - assert image.shape == torch.Size([1, 28, 28]) + s3_audio_file = f"{bucket}/test_torch_audio_s3/class={ref_label}/amazing_sound.wav" + ds = wr.torch.AudioS3Dataset(path=s3_audio_file, suffix="wav") + loader = DataLoader(ds, batch_size=1) + for (audio, rate), label in loader: + assert audio.shape == torch.Size((1, *size)) From 0376aefa25cc6ad9422e9a226a0d635bad64a63f Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 21 Apr 2020 23:26:58 -0300 Subject: [PATCH 12/59] Add chunked=INTEGER option to ensure batch number of rows #192 --- awswrangler/athena.py | 61 +++++++--- awswrangler/db.py | 29 ++++- awswrangler/s3.py | 124 +++++++++++++++++---- testing/test_awswrangler/test_data_lake.py | 36 ++++++ 4 files changed, 209 insertions(+), 41 deletions(-) diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 1933606ad..a899d405b 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -329,7 +329,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals database: str, ctas_approach: bool = True, categories: List[str] = None, - chunksize: Optional[int] = None, + chunksize: Optional[Union[int, bool]] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, @@ -353,10 +353,6 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals CONS: Slower (But stills faster than other libraries that uses the regular Athena API) and does not handle nested types at all. - Note - ---- - If `chunksize` is passed, then a Generator of DataFrames is returned. - Note ---- If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, @@ -367,6 +363,21 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) + Note + ---- + ``Batching`` (`chunksize` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. 
+ + There are two batching strategies on Wrangler: + + - If **chunksize=True**, a new DataFrame will be returned for each file in the query result. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunksize=True` if faster and uses less memory while `chunksize=INTEGER` is more precise + in number of rows for each Dataframe. + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -383,8 +394,10 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. + chunksize : Union[int, bool], optional + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. s3_output : str, optional AWS S3 path. workgroup : str, optional @@ -454,7 +467,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals catalog.delete_table_if_exists(database=database, table=name, boto3_session=session) manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) - chunked: bool = chunksize is not None + chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug(f"chunked: {chunked}") if not paths: if chunked is False: @@ -473,6 +486,8 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals path = f"{_s3_output}/{query_id}.csv" s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session) _logger.debug(f"Start CSV reading from {path}") + _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None + _logger.debug(f"_chunksize: {_chunksize}") ret = s3.read_csv( path=[path], dtype=dtype, @@ -481,7 +496,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals quoting=csv.QUOTE_ALL, keep_default_na=False, na_values=[""], - chunksize=chunksize, + chunksize=_chunksize, skip_blank_lines=False, use_threads=False, boto3_session=session, @@ -565,7 +580,7 @@ def read_sql_table( database: str, ctas_approach: bool = True, categories: List[str] = None, - chunksize: Optional[int] = None, + chunksize: Optional[Union[int, bool]] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, @@ -589,10 +604,6 @@ def read_sql_table( CONS: Slower (But stills faster than other libraries that uses the regular Athena API) and does not handle nested types at all - Note - ---- - If `chunksize` is passed, then a Generator of DataFrames is returned. - Note ---- If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, @@ -603,6 +614,21 @@ def read_sql_table( Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) + Note + ---- + ``Batching`` (`chunksize` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. 
+ + There are two batching strategies on Wrangler: + + - If **chunksize=True**, a new DataFrame will be returned for each file in the query result. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunksize=True` if faster and uses less memory while `chunksize=INTEGER` is more precise + in number of rows for each Dataframe. + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -619,8 +645,10 @@ def read_sql_table( categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. + chunksize : Union[int, bool], optional + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. s3_output : str, optional AWS S3 path. workgroup : str, optional @@ -646,6 +674,7 @@ def read_sql_table( >>> df = wr.athena.read_sql_table(table='...', database='...') """ + table = catalog.sanitize_table_name(table=table) return read_sql_query( sql=f'SELECT * FROM "{table}"', database=database, diff --git a/awswrangler/db.py b/awswrangler/db.py index 491fe7784..2c8ac2799 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -888,7 +888,7 @@ def unload_redshift( con: sqlalchemy.engine.Engine, iam_role: str, categories: List[str] = None, - chunked: bool = False, + chunked: Union[bool, int] = False, keep_files: bool = False, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, @@ -906,6 +906,22 @@ def unload_redshift( https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -926,9 +942,10 @@ def unload_redshift( Recommended for memory restricted environments. keep_files : bool Should keep the stage files? - chunked : bool - If True will break the data in smaller DataFrames (Non deterministic number of lines). - Otherwise return a single DataFrame with the whole data. + chunked : Union[int, bool] + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. 
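
As a rough usage sketch of the two `chunked` modes documented above for `wr.db.unload_redshift` (hedged: the connection name, query, staging path and IAM role are placeholders, and the `sql`/`path` parameters are assumed from the surrounding docstring rather than shown in this hunk):

import awswrangler as wr

engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")  # placeholder connection name

# chunked=True: one DataFrame per unloaded file (fastest, lowest memory overhead).
for df in wr.db.unload_redshift(
    sql="SELECT * FROM public.my_table",             # placeholder query
    path="s3://my-bucket/stage/",                    # assumed staging-path parameter
    con=engine,
    iam_role="arn:aws:iam::111111111111:role/demo",  # placeholder role
    chunked=True,
):
    print(len(df))  # non-deterministic number of rows per DataFrame

# chunked=100_000: DataFrames of 100,000 rows each (the last one may be smaller).
for df in wr.db.unload_redshift(
    sql="SELECT * FROM public.my_table",
    path="s3://my-bucket/stage/",
    con=engine,
    iam_role="arn:aws:iam::111111111111:role/demo",
    chunked=100_000,
):
    print(len(df))
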
@@ -979,6 +996,7 @@ def unload_redshift( return _read_parquet_iterator( paths=paths, categories=categories, + chunked=chunked, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, @@ -991,13 +1009,14 @@ def _read_parquet_iterator( keep_files: bool, use_threads: bool, categories: List[str] = None, + chunked: Union[bool, int] = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> Iterator[pd.DataFrame]: dfs: Iterator[pd.DataFrame] = s3.read_parquet( path=paths, categories=categories, - chunked=True, + chunked=chunked, dataset=False, use_threads=use_threads, boto3_session=boto3_session, diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..0127f8897 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1501,6 +1501,7 @@ def _read_parquet_init( filters=filters, read_dictionary=categories, validate_schema=validate_schema, + split_row_groups=False, ) return data @@ -1510,7 +1511,7 @@ def read_parquet( filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, columns: Optional[List[str]] = None, validate_schema: bool = True, - chunked: bool = False, + chunked: Union[bool, int] = False, dataset: bool = False, categories: List[str] = None, use_threads: bool = True, @@ -1522,6 +1523,22 @@ def read_parquet( The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning and catalog integration (AWS Glue Catalog). + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -1538,11 +1555,12 @@ def read_parquet( Check that individual file schemas are all the same / compatible. Schemas within a folder prefix should all be the same. Disable if you have schemas that are different and want to disable this check. - chunked : bool - If True will break the data in smaller DataFrames (Non deterministic number of lines). - Otherwise return a single DataFrame with the whole data. + chunked : Union[int, bool] + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. dataset: bool - If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. + If `True` read a parquet dataset instead of simple file(s) loading all the related partitions as columns. categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. 
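
The row-count batching described above boils down to concatenating Arrow tables and slicing fixed-size row windows. A simplified standalone sketch of that idea (hedged: helper name is illustrative, and it ignores the partitions, categories and schema promotion handled by the real `_read_parquet_chunked` below):

import pyarrow as pa

def _rows_in_chunks(tables, chunked: int):
    """Yield pandas DataFrames of `chunked` rows from a stream of Arrow tables (sketch)."""
    leftover = None
    for table in tables:
        if leftover is not None:
            table = pa.concat_tables([leftover, table])
            leftover = None
        while len(table) >= chunked:
            yield table.slice(0, chunked).to_pandas()
            table = table.slice(chunked)
        if len(table) > 0:
            leftover = table  # carry the remainder into the next file
    if leftover is not None:
        yield leftover.to_pandas()  # final chunk may hold fewer than `chunked` rows
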
@@ -1583,29 +1601,43 @@ def read_parquet( >>> import awswrangler as wr >>> df = wr.s3.read_parquet(path=['s3://bucket/filename0.parquet', 's3://bucket/filename1.parquet']) - Reading in chunks + Reading in chunks (Chunk by file) >>> import awswrangler as wr >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=True) >>> for df in dfs: >>> print(df) # Smaller Pandas DataFrame + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + """ data: pyarrow.parquet.ParquetDataset = _read_parquet_init( path=path, filters=filters, dataset=dataset, categories=categories, + validate_schema=validate_schema, use_threads=use_threads, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, - validate_schema=validate_schema, ) if chunked is False: return _read_parquet( data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema ) - return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads) + return _read_parquet_chunked( + data=data, + columns=columns, + categories=categories, + chunked=chunked, + use_threads=use_threads, + validate_schema=validate_schema, + ) def _read_parquet( @@ -1639,22 +1671,50 @@ def _read_parquet_chunked( data: pyarrow.parquet.ParquetDataset, columns: Optional[List[str]] = None, categories: List[str] = None, + validate_schema: bool = True, + chunked: Union[bool, int] = True, use_threads: bool = True, ) -> Iterator[pd.DataFrame]: + promote: bool = not validate_schema + next_slice: Optional[pa.Table] = None for piece in data.pieces: table: pa.Table = piece.read( columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False ) - yield table.to_pandas( - use_threads=use_threads, - split_blocks=True, - self_destruct=True, - integer_object_nulls=False, - date_as_object=True, - ignore_metadata=True, - categories=categories, - types_mapper=_data_types.pyarrow2pandas_extension, - ) + if chunked is True: + yield _table2df(table=table, categories=categories, use_threads=use_threads) + else: + if next_slice is not None: + table = pa.lib.concat_tables([next_slice, table], promote=promote) + length: int = len(table) + while True: + if length == chunked: + yield _table2df(table=table, categories=categories, use_threads=use_threads) + next_slice = None + break + if length < chunked: + next_slice = table + break + yield _table2df( + table=table.slice(offset=0, length=chunked), categories=categories, use_threads=use_threads + ) + table = table.slice(offset=chunked, length=None) + length = len(table) + if next_slice is not None: + yield _table2df(table=next_slice, categories=categories, use_threads=use_threads) + + +def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: + return table.to_pandas( + use_threads=use_threads, + split_blocks=True, + self_destruct=True, + integer_object_nulls=False, + date_as_object=True, + ignore_metadata=True, + categories=categories, + types_mapper=_data_types.pyarrow2pandas_extension, + ) def read_parquet_metadata( @@ -1972,13 +2032,30 @@ def read_parquet_table( filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, columns: Optional[List[str]] = None, categories: List[str] = None, - chunked: bool = False, + chunked: Union[bool, int] = 
False, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Read Apache Parquet table registered on AWS Glue Catalog. + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating + to return DataFrames with the number of row igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). @@ -2032,13 +2109,20 @@ def read_parquet_table( ... } ... ) - Reading Parquet Table in chunks + Reading Parquet Table in chunks (Chunk by file) >>> import awswrangler as wr >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True) >>> for df in dfs: >>> print(df) # Smaller Pandas DataFrame + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + """ path: str = catalog.get_table_location(database=database, table=table, boto3_session=boto3_session) return read_parquet( diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index afa2a8307..15369ee2c 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -3,6 +3,7 @@ import gzip import logging import lzma +import math from io import BytesIO, TextIOWrapper import boto3 @@ -1084,3 +1085,38 @@ def test_copy(bucket): wr.s3.delete_objects(path=path) wr.s3.delete_objects(path=path2) + + +@pytest.mark.parametrize("col2", [[1, 1, 1, 1, 1], [1, 2, 3, 4, 5], [1, 1, 1, 1, 2], [1, 2, 2, 2, 2]]) +@pytest.mark.parametrize("chunked", [True, 1, 2, 100]) +def test_parquet_chunked(bucket, database, col2, chunked): + table = f"test_parquet_chunked_{chunked}_{''.join([str(x) for x in col2])}" + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + values = list(range(5)) + df = pd.DataFrame({"col1": values, "col2": col2}) + paths = wr.s3.to_parquet( + df, path, index=False, dataset=True, database=database, table=table, partition_cols=["col2"], mode="overwrite" + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + + dfs = list(wr.s3.read_parquet(path=path, dataset=True, chunked=chunked)) + assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() + if chunked is not True: + assert len(dfs) == int(math.ceil(len(df) / chunked)) + for df2 in dfs[:-1]: + assert chunked == len(df2) + assert chunked >= len(dfs[-1]) + else: + assert len(dfs) == len(set(col2)) + + dfs = list(wr.athena.read_sql_table(database=database, table=table, chunksize=chunked)) + assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() + if chunked is not True: + assert len(dfs) == int(math.ceil(len(df) / chunked)) + for df2 in dfs[:-1]: + assert chunked == len(df2) + assert chunked >= len(dfs[-1]) + + wr.s3.delete_objects(path=paths) + assert 
wr.catalog.delete_table_if_exists(database=database, table=table) is True From 5a9a83f5dae7b6fe1cba06019c076a516b198756 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Wed, 22 Apr 2020 20:48:25 -0300 Subject: [PATCH 13/59] s3 iterable dataset --- awswrangler/torch.py | 143 ++++++++++++++++++++++--- testing/test_awswrangler/test_torch.py | 53 ++++++++- 2 files changed, 178 insertions(+), 18 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index db09abc46..c4dac13e5 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,6 +1,8 @@ """PyTorch Module.""" import logging +import io import os +import tarfile import pathlib import re from io import BytesIO @@ -20,8 +22,8 @@ _logger: logging.Logger = logging.getLogger(__name__) -class _BaseS3Dataset(Dataset): - """PyTorch Map-Style S3 Dataset.""" +class _BaseS3Dataset: + """PyTorch Amazon S3 Map-Style Dataset.""" def __init__( self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None @@ -46,6 +48,52 @@ def __init__( path=path, suffix=suffix, boto3_session=self._session ) + def _fetch_data(self, path: str): + """Add parquet and csv support""" + bucket, key = _utils.parse_path(path=path) + buff = BytesIO() + client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) + client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) + buff.seek(0) + return buff + + @staticmethod + def _load_data(data: io.BytesIO, path: str): + if path.endswith('.tar.gz') or path.endswith('.tgz'): + pass + # tarfile.open(fileobj=data) + # tar = tarfile.open(fileobj=data) + # for member in tar.getmembers(): + # print('member', member) + elif path.endswith('.pt'): + data = torch.load(data) + return data + + +class _ListS3Dataset(_BaseS3Dataset, Dataset): + """PyTorch Amazon S3 Map-Style List Dataset.""" + + def __init__( + self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + ): + """PyTorch Map-Style List S3 Dataset. + + Each file under path would be handle as a single tensor. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + """ + super(_ListS3Dataset, self).__init__(path, suffix, boto3_session) + def __getitem__(self, index): path = self._paths[index] data = self._fetch_data(path) @@ -54,14 +102,6 @@ def __getitem__(self, index): def __len__(self): return len(self._paths) - def _fetch_data(self, path): - bucket, key = _utils.parse_path(path=path) - buff = BytesIO() - client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) - client_s3.download_fileobj(Bucket=bucket, Key=key, Fileobj=buff) - buff.seek(0) - return buff - def _data_fn(self, data): pass @@ -69,13 +109,56 @@ def _label_fn(self, path: str): pass -class _S3PartitionedDataset(_BaseS3Dataset): +class _S3PartitionedDataset(_ListS3Dataset): + """PyTorch Amazon S3 Map-Style Partitioned Dataset.""" + def _label_fn(self, path: str): return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) -class LambdaS3Dataset(_BaseS3Dataset): - """PyTorch S3 Lambda Dataset.""" +class S3FilesDataset(_BaseS3Dataset, Dataset): + """PyTorch Amazon S3 Files Map-Style Dataset.""" + + def __init__( + self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + ): + """PyTorch S3 Files Map-Style Dataset. + + Each file under Amazon S3 path would be handled as a batch of tensors. + All files will be loaded to memory since random access is needed. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + torch.utils.data.Dataset + + """ + super(S3FilesDataset, self).__init__(path, suffix, boto3_session) + + def _download_files(self): + self._data = [] + for path in self._paths: + data = self._fetch_data(path) + data = self._load_data(data, path) + self._data.append(data) + + self.data = torch.tensor(self._data) + + def __getitem__(self, index): + return self._data[index] + + def __len__(self): + return len(self._data) + + +class LambdaS3Dataset(_ListS3Dataset): + """PyTorch Amazon S3 Lambda Map-Style Dataset.""" def __init__( self, @@ -218,6 +301,40 @@ def _data_fn(self, data): return tensor +class S3IterableDataset(_BaseS3Dataset, IterableDataset): + """PyTorch Amazon S3 Iterable Dataset.""" + + def __init__( + self, + path: Union[str, List[str]], + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + ): + """PyTorch Amazon S3 Iterable Dataset. + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + torch.utils.data.Dataset + + """ + super(S3IterableDataset, self).__init__(path, suffix, boto3_session) + self._paths_index = 0 + + def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: + for path in self._paths: + data = self._fetch_data(path) + data = self._load_data(data, path) + for d in data: + yield d + + class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 5b7a84b38..599976d33 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,5 +1,6 @@ -import logging +import io import re +import logging import boto3 import numpy as np @@ -125,13 +126,14 @@ def test_torch_image_s3(bucket): wr.s3.delete_objects(path=path) -def test_torch_image_s3_dataloader(bucket): +@pytest.mark.parametrize("drop_last", [True, False]) +def test_torch_image_s3_dataloader(bucket, drop_last): path = f"s3://{bucket}/test_torch_image_s3_dataloader/" wr.s3.delete_objects(path=path) - s3 = boto3.client("s3") + client_s3 = boto3.client("s3") labels = np.random.randint(0, 4, size=(8,)) for i, label in enumerate(labels): - s3.put_object( + client_s3.put_object( Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, Key=f"test_torch_image_s3_dataloader/class={label}/logo{i}.png", @@ -142,7 +144,7 @@ def test_torch_image_s3_dataloader(bucket): num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices) + ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices), drop_last=drop_last ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) @@ -194,3 +196,44 @@ def test_torch_audio_s3(bucket): loader = DataLoader(ds, batch_size=1) for (audio, rate), label in loader: assert audio.shape == torch.Size((1, *size)) + + +# def test_torch_s3_file_dataset(bucket): +# cifar10 = "s3://fast-ai-imageclas/cifar10.tgz" +# batch_size = 64 +# for image, label in DataLoader( +# wr.torch.S3FilesDataset(cifar10), +# batch_size=batch_size, +# ): +# assert image.shape == torch.Size([batch_size, 3, 32, 32]) +# assert label.dtype == torch.int64 +# break + + +@pytest.mark.parametrize("drop_last", [True, False]) +def test_torch_s3_iterable_dataset(bucket, drop_last): + folder = "test_torch_s3_iterable_dataset" + batch_size = 32 + client_s3 = boto3.client("s3") + for i in range(3): + batch = torch.randn(100, 3, 32, 32) + buff = io.BytesIO() + torch.save(batch, buff) + buff.seek(0) + client_s3.put_object( + Body=buff.read(), + Bucket=bucket, + Key=f"{folder}/file{i}.pt", + ) + + for image in DataLoader( + wr.torch.S3IterableDataset( + path=f"s3://{bucket}/{folder}", + ), + batch_size=batch_size, + drop_last=drop_last, + ): + if drop_last: + assert image.shape == torch.Size([batch_size, 3, 32, 32]) + else: + assert image[0].shape == torch.Size([3, 32, 32]) From 60232f44a5663fce3cdd82b7b5dcaaf431fa2b76 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Wed, 22 Apr 2020 22:09:58 -0300 Subject: [PATCH 14/59] add tutorial draft --- tutorials/14 - PyTorch.ipynb | 249 +++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 tutorials/14 - PyTorch.ipynb diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb 
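The tutorial below drafts the same pattern the new tests exercise. As a minimal sketch (the bucket/prefix is a placeholder and the objects are assumed to be `.pt` tensors written with `torch.save`, as in `test_torch_s3_iterable_dataset`), streaming those files into a `DataLoader` looks roughly like:

import torch
from torch.utils.data import DataLoader

import awswrangler as wr

# Placeholder S3 prefix holding file0.pt, file1.pt, ... written with torch.save().
ds = wr.torch.S3IterableDataset(path="s3://my-bucket/test_torch_s3_iterable_dataset")

# Each .pt object holds a (100, 3, 32, 32) tensor; the DataLoader re-chunks the
# stream into fixed-size batches, dropping the incomplete tail batch.
for images in DataLoader(ds, batch_size=32, drop_last=True):
    assert images.shape == torch.Size([32, 3, 32, 32])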
new file mode 100644 index 000000000..757c817f9 --- /dev/null +++ b/tutorials/14 - PyTorch.ipynb @@ -0,0 +1,249 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "* [1.Defining Training Function](#1.-Defininf-Training-Function)\n", + "* [2.Traning From Amazon S3](#1.-Traning-From-Amazon-S3)\n", + "\t* [2.1 Writing PyTorch Dataset to S3](#1.1-Writing-PyTorch-Dataset-to-S3)\n", + "\t* [2.2 Training Network](#1.2-Training-Network)\n", + "* [3. Training From SQL Query](#2.-Training-From-SQL-Query)\n", + "\t* [3.1 Writing Data to SQL Database](#2.1-Writing-Data-to-SQL-Database)\n", + "\t* [3.3 Training Network From SQL](#2.2-Reading-single-JSON-file)\n", + "* [4. Creating Custom S3 Dataset](#1.-Creating-Custom-S3-Dataset)\n", + "\t* [4.1 Creating Custom PyTorch Dataset](#1.1-Creating-Custom-PyTorch-Dataset)\n", + "\t* [4.2 Writing Data to S3](#1.1-Writing-Data-to-S3)\n", + "\t* [4.3 Training Network](#1.2-Training-Network)\n", + "* [5. Delete objects](#6.-Delete-objects)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "import boto3\n", + "import torch\n", + "import torchvision\n", + "import awswrangler as wr\n", + "\n", + "accuracy = lambda o, l: 100/o.size(0) * (torch.max(o.data, 1)[1] == l).sum().item()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Defining Training Function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, dataset):\n", + " criterion = torch.nn.CrossEntropyLoss()\n", + " opt = torch.optim.SGD(model.parameters(), 0.025)\n", + "\n", + " for epoch in range(2):\n", + "\n", + " model.train()\n", + " for inputs, labels in torch.utils.data.DataLoader(\n", + " dataset,\n", + " batch_size=64,\n", + " num_workers=2,\n", + " ):\n", + "\n", + " outputs = model(inputs)\n", + " loss = criterion(outputs, labels)\n", + " loss.backward()\n", + " opt.step()s\n", + " opt.zero_grad()\n", + "\n", + " acc = accuracy(outputs, labels)\n", + " print(f'batch: {i} loss: {loss.mean().item():.4f} batch_acc: {acc:.2f}') " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. 
Traning From Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client_s3 = boto3.client(\"s3\")\n", + "folder = \"tutorial_torch_dataset\"\n", + "for i in range(3):\n", + " batch = (\n", + " torch.randn(100, 3, 32, 32),\n", + " torch.randint(1, size=(100,)),\n", + " )\n", + " buff = io.BytesIO()\n", + " torch.save(batch, buff)\n", + " buff.seek(0)\n", + " client_s3.put_object(\n", + " Body=buff.read(),\n", + " Bucket=bucket,\n", + " Key=f\"{folder}/file{i}.pt\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Training Network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train(\n", + " torchvision.models.resnet18(),\n", + " wr.torch.S3IterableDataset(path=f\"s3://{bucket}/{folder}\"),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Training Directly From SQL Query" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Writing Data to SQL Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eng = wr.catalog.get_engine(\"aws-data-wrangler-redshift\")\n", + "df = pd.DataFrame({\n", + " \"height\": [2, 1.4, 1.7, 1.8, 1.9],\n", + " \"name\": [\"foo\", \"boo\"],\n", + " \"target\": [1, 0, 0, 1, 2, 3]\n", + "})\n", + "\n", + "wr.db.to_sql(\n", + " df,\n", + " eng_redshift,\n", + " schema=\"public\",\n", + " name=\"torch\",\n", + " if_exists=\"replace\",\n", + " index=False\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Training Network From SQL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train(\n", + " model = torch.nn.Sequential(\n", + " torch.nn.Linear(, 20),\n", + " torch.nn.ReLU(),\n", + " torch.nn.Linear(20, 2), \n", + " ),\n", + " wr.torch.SQLDataset(\n", + " sql=\"SELECT * FROM public.torch\"\n", + " con=eng\n", + " label_col=\"target\",\n", + " chunksize=100\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Delete Objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wr.s3.delete_objects(f\"s3://{bucket}/\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_pytorch_p36", + "language": "python", + "name": "conda_pytorch_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 215fbd54c75f852267ced4777f9956391f4bb989 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 11:51:00 -0300 Subject: [PATCH 15/59] add torch extras_requirements to setuptools --- requirements-torch.txt | 3 +++ setup.py | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 requirements-torch.txt diff --git a/requirements-torch.txt b/requirements-torch.txt new file mode 100644 index 000000000..325196f07 --- /dev/null +++ b/requirements-torch.txt @@ -0,0 +1,3 @@ +torch~=1.4.0 +torchvision~=0.5.0 +torchaudio~=0.4.0 \ No newline at end of file diff --git a/setup.py b/setup.py index b363e6e58..f9c861a60 100644 --- a/setup.py +++ b/setup.py @@ -23,4 +23,7 @@ packages=find_packages(include=["awswrangler", "awswrangler.*"], exclude=["tests"]), python_requires=">=3.6, <3.9", install_requires=[open("requirements.txt").read().strip().split("\n")], + extras_require={ + "torch": open("requirements-torch.txt").read().strip().split("\n") + } ) From 0ad9e4bf16e015562aeaed0a635ca970335b420f Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 12:46:30 -0300 Subject: [PATCH 16/59] handle labels in S3IterableDataset --- awswrangler/torch.py | 11 ++++++++ testing/test_awswrangler/test_torch.py | 37 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index c4dac13e5..29343983b 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -5,6 +5,7 @@ import tarfile import pathlib import re +from collections import Iterable from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Tuple, Union @@ -331,10 +332,20 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, for path in self._paths: data = self._fetch_data(path) data = self._load_data(data, path) + + if isinstance(data, torch.Tensor): + pass + elif isinstance(data, Iterable) and all([isinstance(d, torch.Tensor) for d in data]): + data = zip(data) + else: + raise NotImplementedError(f"ERROR: Type: {type(data)} has not been implemented!") + for d in data: yield d + + class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 599976d33..aaac654a4 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -237,3 +237,40 @@ def test_torch_s3_iterable_dataset(bucket, drop_last): assert image.shape == torch.Size([batch_size, 3, 32, 32]) else: assert image[0].shape == torch.Size([3, 32, 32]) + + +@pytest.mark.parametrize("drop_last", [True, False]) +def test_torch_s3_iterable_with_labels(bucket, drop_last): + folder = "test_torch_s3_iterable_dataset" + batch_size = 32 + client_s3 = boto3.client("s3") + for i in range(3): + batch = ( + torch.randn(100, 3, 32, 32), + torch.randint(2, 
size=(100,)), + ) + buff = io.BytesIO() + torch.save(batch, buff) + buff.seek(0) + client_s3.put_object( + Body=buff.read(), + Bucket=bucket, + Key=f"{folder}/file{i}.pt", + ) + + for images, labels in DataLoader( + wr.torch.S3IterableDataset( + path=f"s3://{bucket}/{folder}", + ), + batch_size=batch_size, + drop_last=drop_last, + ): + if drop_last: + assert images.shape == torch.Size([batch_size, 3, 32, 32]) + assert labels.dtype == torch.int64 + assert labels.size == torch.Size([batch_size, 1]) + + else: + assert images[0].shape == torch.Size([3, 32, 32]) + assert labels.dtype == torch.int64 + assert labels.size == torch.Size([1]) \ No newline at end of file From 5e72ddf1232c8cde025080a32ebf4cac398f833c Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 12:59:01 -0300 Subject: [PATCH 17/59] clear bucket in S3Iterable Dataset test --- awswrangler/torch.py | 2 +- testing/test_awswrangler/test_torch.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 29343983b..5e4365062 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -336,7 +336,7 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, if isinstance(data, torch.Tensor): pass elif isinstance(data, Iterable) and all([isinstance(d, torch.Tensor) for d in data]): - data = zip(data) + data = zip(*data) else: raise NotImplementedError(f"ERROR: Type: {type(data)} has not been implemented!") diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index aaac654a4..83630b0e7 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -211,8 +211,10 @@ def test_torch_audio_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) -def test_torch_s3_iterable_dataset(bucket, drop_last): - folder = "test_torch_s3_iterable_dataset" +def test_torch_s3_iterable(bucket, drop_last): + folder = "test_torch_s3_iterable" + path = f"s3://{bucket}/{folder}/" + wr.s3.delete_objects(path=path) batch_size = 32 client_s3 = boto3.client("s3") for i in range(3): @@ -241,7 +243,9 @@ def test_torch_s3_iterable_dataset(bucket, drop_last): @pytest.mark.parametrize("drop_last", [True, False]) def test_torch_s3_iterable_with_labels(bucket, drop_last): - folder = "test_torch_s3_iterable_dataset" + folder = "test_torch_s3_iterable_with_labels" + path = f"s3://{bucket}/{folder}/" + wr.s3.delete_objects(path=path) batch_size = 32 client_s3 = boto3.client("s3") for i in range(3): @@ -268,9 +272,9 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): if drop_last: assert images.shape == torch.Size([batch_size, 3, 32, 32]) assert labels.dtype == torch.int64 - assert labels.size == torch.Size([batch_size, 1]) + assert labels.shape == torch.Size([batch_size]) else: assert images[0].shape == torch.Size([3, 32, 32]) - assert labels.dtype == torch.int64 - assert labels.size == torch.Size([1]) \ No newline at end of file + assert labels[0].dtype == torch.int64 + assert labels[0].shape == torch.Size([]) From 5b399ac656be04e1c4cb5cf454cad5ea474a4b10 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 13:10:07 -0300 Subject: [PATCH 18/59] update setuptools --- requirements-dev.txt | 5 +---- setup-dev-env.sh | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 0491e8789..3fdd3cdf3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,7 +17,4 @@ twine~=3.1.1 
wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 -torch~=1.4.0 -torchvision~=0.5.0 -torchaudio~=0.4.0 \ No newline at end of file +moto~=1.3.14 \ No newline at end of file diff --git a/setup-dev-env.sh b/setup-dev-env.sh index 692724ee0..c9c2e9902 100755 --- a/setup-dev-env.sh +++ b/setup-dev-env.sh @@ -3,5 +3,4 @@ set -ex pip install --upgrade pip pip install -r requirements-dev.txt -pip install -r requirements.txt -pip install -e . +pip install -e ".[torch]" From 2db15b6d09ae1d5e80d325a7daf44ab8c163eeef Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 13:23:08 -0300 Subject: [PATCH 19/59] update pytorch tutorial --- tutorials/14 - PyTorch.ipynb | 69 +++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index 757c817f9..fefb8332a 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -45,12 +45,14 @@ "import torchvision\n", "import awswrangler as wr\n", "\n", - "accuracy = lambda o, l: 100/o.size(0) * (torch.max(o.data, 1)[1] == l).sum().item()" + "from torch.optim import SGD\n", + "from torch.nn import CrossEntropyLoss\n", + "from torch.utils.data import DataLoader" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -67,31 +69,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "def train(model, dataset):\n", - " criterion = torch.nn.CrossEntropyLoss()\n", - " opt = torch.optim.SGD(model.parameters(), 0.025)\n", + "def train(model, dataset, batch_size=64, epochs=2, device='cpu'):\n", + "\n", + " criterion = CrossEntropyLoss().to(device)\n", + " opt = SGD(model.parameters(), 0.025)\n", + " loader = DataLoader(dataset, batch_size=batch_size, num_workers=1)\n", "\n", - " for epoch in range(2):\n", + " for epoch in range(epochs):\n", "\n", + " correct = 0 \n", " model.train()\n", - " for inputs, labels in torch.utils.data.DataLoader(\n", - " dataset,\n", - " batch_size=64,\n", - " num_workers=2,\n", - " ):\n", + " for i, (inputs, labels) in enumerate(loader):\n", "\n", + " # Forward Pass\n", " outputs = model(inputs)\n", + " \n", + " # Backward Pass\n", " loss = criterion(outputs, labels)\n", " loss.backward()\n", - " opt.step()s\n", + " opt.step()\n", " opt.zero_grad()\n", + " \n", + " # Accuracy\n", + " _, predicted = torch.max(outputs.data, 1)\n", + " correct += (predicted == labels).sum().item()\n", + " accuracy = 100 * correct / ((i+1) * batch_size)\n", "\n", - " acc = accuracy(outputs, labels)\n", - " print(f'batch: {i} loss: {loss.mean().item():.4f} batch_acc: {acc:.2f}') " + " print(f'batch: {i} loss: {loss.mean().item():.4f} acc: {accuracy:.2f}') " ] }, { @@ -103,16 +111,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "client_s3 = boto3.client(\"s3\")\n", "folder = \"tutorial_torch_dataset\"\n", + "\n", + "wr.s3.delete_objects(f\"s3://{bucket}/{folder}\")\n", "for i in range(3):\n", " batch = (\n", " torch.randn(100, 3, 32, 32),\n", - " torch.randint(1, size=(100,)),\n", + " torch.randint(2, size=(100,)),\n", " )\n", " buff = io.BytesIO()\n", " torch.save(batch, buff)\n", @@ -133,13 +143,30 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch: 0 loss: 6.9552 acc: 0.00\n", + "batch: 1 loss: 2.9621 acc: 23.44\n", + "batch: 2 loss: 0.9873 acc: 31.77\n", + "batch: 3 loss: 1.9760 acc: 34.38\n", + "batch: 4 loss: 3.3523 acc: 33.44\n", + "batch: 0 loss: 1.2023 acc: 59.38\n", + "batch: 1 loss: 0.8057 acc: 60.16\n", + "batch: 2 loss: 0.6782 acc: 62.50\n", + "batch: 3 loss: 0.4291 acc: 67.58\n", + "batch: 4 loss: 0.2953 acc: 66.88\n" + ] + } + ], "source": [ "train(\n", " torchvision.models.resnet18(),\n", - " wr.torch.S3IterableDataset(path=f\"s3://{bucket}/{folder}\"),\n", + " wr.torch.S3IterableDataset(path=f\"{bucket}/{folder}\")\n", ")" ] }, From 5e647c66d0f4df62ed360d73d0a3a3aa0bbda06c Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 23 Apr 2020 17:01:50 +0000 Subject: [PATCH 20/59] Update tutorial --- tutorials/14 - PyTorch.ipynb | 101 +++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 33 deletions(-) diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index fefb8332a..a3d988881 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -40,9 +40,11 @@ "outputs": [], "source": [ "import io\n", + "\n", "import boto3\n", "import torch\n", "import torchvision\n", + "import pandas as pd\n", "import awswrangler as wr\n", "\n", "from torch.optim import SGD\n", @@ -54,7 +56,15 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], "source": [ "import getpass\n", "bucket = getpass.getpass()" @@ -69,15 +79,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "def train(model, dataset, batch_size=64, epochs=2, device='cpu'):\n", + "def train(model, dataset, batch_size=64, epochs=2, device='cpu', num_workers=1):\n", "\n", " criterion = CrossEntropyLoss().to(device)\n", " opt = SGD(model.parameters(), 0.025)\n", - " loader = DataLoader(dataset, batch_size=batch_size, num_workers=1)\n", + " loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)\n", "\n", " for epoch in range(epochs):\n", "\n", @@ -111,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -143,23 +153,23 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch: 0 loss: 6.9552 acc: 0.00\n", - "batch: 1 loss: 2.9621 acc: 23.44\n", - "batch: 2 loss: 0.9873 acc: 31.77\n", - "batch: 3 loss: 1.9760 acc: 34.38\n", - "batch: 4 loss: 3.3523 acc: 33.44\n", - "batch: 0 loss: 1.2023 acc: 59.38\n", - "batch: 1 loss: 0.8057 acc: 60.16\n", - "batch: 2 loss: 0.6782 acc: 62.50\n", - "batch: 3 loss: 0.4291 acc: 67.58\n", - "batch: 4 loss: 0.2953 acc: 66.88\n" + "batch: 0 loss: 7.0221 acc: 0.00\n", + "batch: 1 loss: 2.7788 acc: 23.44\n", + "batch: 2 loss: 0.9828 acc: 32.29\n", + "batch: 3 loss: 0.9414 acc: 39.45\n", + "batch: 4 loss: 1.0737 acc: 39.38\n", + "batch: 0 loss: 1.2178 acc: 50.00\n", + "batch: 1 loss: 1.4069 acc: 51.56\n", + "batch: 2 loss: 1.0783 acc: 52.08\n", + "batch: 3 loss: 0.9926 acc: 52.34\n", + "batch: 4 loss: 1.1111 acc: 49.06\n" ] } ], @@ -186,20 +196,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
6, "metadata": {}, "outputs": [], "source": [ "eng = wr.catalog.get_engine(\"aws-data-wrangler-redshift\")\n", "df = pd.DataFrame({\n", - " \"height\": [2, 1.4, 1.7, 1.8, 1.9],\n", - " \"name\": [\"foo\", \"boo\"],\n", - " \"target\": [1, 0, 0, 1, 2, 3]\n", + " \"height\": [2, 1.4, 1.7, 1.8, 1.9, 2.2],\n", + " \"weigth\": [100.0, 50.0, 70.0, 80.0, 90.0, 160.0],\n", + " \"target\": [1, 0, 0, 1, 1, 1]\n", "})\n", "\n", "wr.db.to_sql(\n", " df,\n", - " eng_redshift,\n", + " eng,\n", " schema=\"public\",\n", " name=\"torch\",\n", " if_exists=\"replace\",\n", @@ -216,22 +226,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "batch: 0 loss: 5.0253 acc: 50.00\n", + "batch: 1 loss: 21.3174 acc: 50.00\n", + "batch: 2 loss: 0.5061 acc: 66.67\n", + "batch: 0 loss: 1.2222 acc: 50.00\n", + "batch: 1 loss: 0.7075 acc: 50.00\n", + "batch: 2 loss: 0.7077 acc: 50.00\n", + "batch: 0 loss: 0.9302 acc: 50.00\n", + "batch: 1 loss: 0.6960 acc: 50.00\n", + "batch: 2 loss: 0.6018 acc: 66.67\n", + "batch: 0 loss: 1.1284 acc: 50.00\n", + "batch: 1 loss: 0.7077 acc: 50.00\n", + "batch: 2 loss: 0.6791 acc: 50.00\n", + "batch: 0 loss: 1.0030 acc: 50.00\n", + "batch: 1 loss: 0.7053 acc: 50.00\n", + "batch: 2 loss: 0.6318 acc: 50.00\n" + ] + } + ], "source": [ "train(\n", - " model = torch.nn.Sequential(\n", - " torch.nn.Linear(, 20),\n", + " torch.nn.Sequential(\n", + " torch.nn.Linear(2, 10),\n", " torch.nn.ReLU(),\n", - " torch.nn.Linear(20, 2), \n", + " torch.nn.Linear(10, 2), \n", " ),\n", " wr.torch.SQLDataset(\n", - " sql=\"SELECT * FROM public.torch\"\n", - " con=eng\n", + " sql=\"SELECT * FROM public.torch\",\n", + " con=eng,\n", " label_col=\"target\",\n", - " chunksize=100\n", - " )\n", + " chunksize=2\n", + " ),\n", + " num_workers=0,\n", + " batch_size=2,\n", + " epochs=5\n", ")" ] }, @@ -244,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -254,9 +289,9 @@ ], "metadata": { "kernelspec": { - "display_name": "conda_pytorch_p36", + "display_name": "conda_python3", "language": "python", - "name": "conda_pytorch_p36" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { From b3d9fe2d9d4c1563aecb225f6a2b678414df41ab Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 18:14:55 -0300 Subject: [PATCH 21/59] parallel tests fix --- awswrangler/torch.py | 209 ++++++++++++++++--------- building/build-docs.sh | 2 +- docs/source/api.rst | 13 ++ testing/test_awswrangler/test_torch.py | 37 +++-- 4 files changed, 169 insertions(+), 92 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 5e4365062..c25e145ee 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -113,49 +113,54 @@ def _label_fn(self, path: str): class _S3PartitionedDataset(_ListS3Dataset): """PyTorch Amazon S3 Map-Style Partitioned Dataset.""" - def _label_fn(self, path: str): - return int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) - - -class S3FilesDataset(_BaseS3Dataset, Dataset): - """PyTorch Amazon S3 Files Map-Style Dataset.""" - - def __init__( - self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None - ): - """PyTorch S3 Files Map-Style Dataset. - - Each file under Amazon S3 path would be handled as a batch of tensors. - All files will be loaded to memory since random access is needed. 
- - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - """ - super(S3FilesDataset, self).__init__(path, suffix, boto3_session) - - def _download_files(self): - self._data = [] - for path in self._paths: - data = self._fetch_data(path) - data = self._load_data(data, path) - self._data.append(data) - - self.data = torch.tensor(self._data) - - def __getitem__(self, index): - return self._data[index] - - def __len__(self): - return len(self._data) + def _label_fn(self, path: str) -> torch.Tensor: + label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) + return torch.tensor([label]) + + +# class S3FilesDataset(_BaseS3Dataset, Dataset): +# """PyTorch Amazon S3 Files Map-Style Dataset.""" +# +# def __init__( +# self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +# ): +# """PyTorch S3 Files Map-Style Dataset. +# +# Each file under Amazon S3 path would be handled as a tensor or batch of tensors. +# +# Note +# ---- +# All files will be loaded to memory since random access is needed. +# +# Parameters +# ---------- +# path : Union[str, List[str]] +# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# boto3_session : boto3.Session(), optional +# Boto3 Session. The default boto3 session will be used if boto3_session receive None. +# +# Returns +# ------- +# torch.utils.data.Dataset +# +# """ +# super(S3FilesDataset, self).__init__(path, suffix, boto3_session) +# self._download_files() +# +# def _download_files(self) -> None: +# self._data = [] +# for path in self._paths: +# data = self._fetch_data(path) +# data = self._load_data(data, path) +# self._data.append(data) +# +# self.data = torch.cat(self._data, dim=0) +# +# def __getitem__(self, index): +# return self._data[index] +# +# def __len__(self): +# return len(self._data) class LambdaS3Dataset(_ListS3Dataset): @@ -169,7 +174,7 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch S3 Lambda Dataset. + """PyTorch Amazon S3 Lambda Dataset. Parameters ---------- @@ -184,22 +189,24 @@ def __init__( Examples -------- + >>> import re + >>> import torch >>> import awswrangler as wr - >>> import boto3 - >>> _data_fn = lambda x: torch.tensor(x) - >>> _label_fn = lambda x: x.split('.')[-1] - >>> ds = wr.torch.LambdaS3Dataset('s3://bucket/path', boto3.Session(), _data_fn=_data_fn, _label_fn=_label_fn) + >>> ds = wr.torch.LambdaS3Dataset( + >>> 's3://bucket/path', + >>> data_fn=lambda x: torch.load(x), + >>> label_fn=lambda x: torch.Tensor(int(re.findall(r"/class=(.*?)/", x)[-1])), + >>> ) """ super(LambdaS3Dataset, self).__init__(path, suffix, boto3_session) self._data_func = data_fn self._label_func = label_fn - def _label_fn(self, path: str): + def _label_fn(self, path: str) -> torch.Tensor: return self._label_func(path) - def _data_fn(self, data): - print(type(data)) + def _data_fn(self, data) -> torch.Tensor: return self._data_func(data) @@ -213,17 +220,26 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch S3 Audio Dataset. + """PyTorch Amazon S3 Audio Dataset. 
+
+        Read individual WAV audio files stored in Amazon S3 and return
+        them as torch tensors.
+
+        Note
+        ----
+        This dataset assumes audio files are stored with the following structure:
-        Assumes audio files are stored with the following structure:
+
+        ::
+
-        bucket
-        ├── class=0
-        │   ├── audio0.wav
-        │   └── audio1.wav
-        └── class=1
-            ├── audio2.wav
-            └── audio3.wav
+            bucket
+            ├── class=0
+            │   ├── audio0.wav
+            │   └── audio1.wav
+            └── class=1
+                ├── audio2.wav
+                └── audio3.wav

         Parameters
         ----------
@@ -238,9 +254,39 @@ def __init__(

         Examples
         --------
+        Create an Audio S3 Dataset
+
         >>> import awswrangler as wr
-        >>> import boto3
-        >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path', boto3.Session())
+        >>> ds = wr.torch.AudioS3Dataset('s3://bucket/path')
+
+
+        Training a Model
+
+        >>> criterion = CrossEntropyLoss().to(device)
+        >>> opt = SGD(model.parameters(), 0.025)
+        >>> loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
+        >>>
+        >>> for epoch in range(epochs):
+        >>>
+        >>>     correct = 0
+        >>>     model.train()
+        >>>     for i, (inputs, labels) in enumerate(loader):
+        >>>
+        >>>         # Forward Pass
+        >>>         outputs = model(inputs)
+        >>>
+        >>>         # Backward Pass
+        >>>         loss = criterion(outputs, labels)
+        >>>         loss.backward()
+        >>>         opt.step()
+        >>>         opt.zero_grad()
+        >>>
+        >>>         # Accuracy
+        >>>         _, predicted = torch.max(outputs.data, 1)
+        >>>         correct += (predicted == labels).sum().item()
+        >>>         accuracy = 100 * correct / ((i+1) * batch_size)
+        >>>         print(f'batch: {i} loss: {loss.mean().item():.4f} acc: {accuracy:.2f}')
         """
         super(AudioS3Dataset, self).__init__(path, suffix, boto3_session)
@@ -261,20 +307,28 @@ def _fetch_data(self, path: str) -> str:

 class ImageS3Dataset(_S3PartitionedDataset):
-    """PyTorch S3 Image Dataset."""
+    """PyTorch Amazon S3 Image Dataset."""

     def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session):
-        """PyTorch S3 Image Dataset.
+        """PyTorch Amazon S3 Image Dataset.
+
+        ImageS3Dataset assumes images are partitioned (within class= folders) in Amazon S3.
+        Each listed object will be loaded by the default Pillow library.
+ Note + ---- Assumes Images are stored with the following structure: - bucket - ├── class=0 - │ ├── img0.jpeg - │ └── img1.jpeg - └── class=1 - ├── img2.jpeg - └── img3.jpeg + + :: + + bucket + ├── class=0 + │ ├── img0.jpeg + │ └── img1.jpeg + └── class=1 + ├── img2.jpeg + └── img3.jpeg Parameters ---------- @@ -290,13 +344,12 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto Examples -------- >>> import awswrangler as wr - >>> import boto3 - >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path', boto3.Session()) + >>> ds = wr.torch.ImageS3Dataset('s3://bucket/path') """ super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) - def _data_fn(self, data): + def _data_fn(self, data: io.BytesIO) -> torch.Tensor: image = Image.open(data) tensor = to_tensor(image) return tensor @@ -324,9 +377,13 @@ def __init__( ------- torch.utils.data.Dataset + Examples + -------- + >>> import awswrangler as wr + >>> ds = wr.torch.S3IterableDataset('s3://bucket/path') + """ super(S3IterableDataset, self).__init__(path, suffix, boto3_session) - self._paths_index = 0 def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: for path in self._paths: @@ -344,8 +401,6 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, yield d - - class SQLDataset(IterableDataset): # pylint: disable=too-few-public-methods,abstract-method """Pytorch Iterable SQL Dataset.""" diff --git a/building/build-docs.sh b/building/build-docs.sh index c32c20aa0..8c807b485 100755 --- a/building/build-docs.sh +++ b/building/build-docs.sh @@ -4,4 +4,4 @@ set -ex pushd .. rm -rf docs/build docs/source/stubs make -C docs/ html -doc8 --ignore D005 docs/source +doc8 --ignore D005,D002 docs/source diff --git a/docs/source/api.rst b/docs/source/api.rst index 897fc7a3e..aea8bbed6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -3,6 +3,19 @@ API Reference ============= +PyTorch +------- + +.. currentmodule:: awswrangler.torch + +.. 
autosummary:: + :toctree: stubs + + AudioS3Dataset + ImageS3Dataset + S3IterableDataset + SQLDataset + Amazon S3 --------- diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 83630b0e7..40ecf7050 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -63,7 +63,7 @@ def parameters(cloudformation_outputs): @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] - table = "test_torch_sql" + table = f"test_torch_sql_{db_type}_{str(chunksize).lower()}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") wr.db.to_sql( df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), @@ -86,7 +86,7 @@ def test_torch_sql(parameters, db_type, chunksize): @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_torch_sql_label(parameters, db_type, chunksize): schema = parameters[db_type]["schema"] - table = "test_torch_sql_label" + table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") wr.db.to_sql( df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0], "c": [7, 8, 9]}), @@ -109,14 +109,15 @@ def test_torch_sql_label(parameters, db_type, chunksize): def test_torch_image_s3(bucket): - path = f"s3://{bucket}/test_torch_image_s3/" + folder = "test_torch_image_s3" + path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) s3 = boto3.client("s3") ref_label = 0 s3.put_object( Body=open("docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"test_torch_image_s3/class={ref_label}/logo.png", + Key=f"{folder}/class={ref_label}/logo.png", ContentType="image/png", ) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) @@ -127,8 +128,9 @@ def test_torch_image_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) -def test_torch_image_s3_dataloader(bucket, drop_last): - path = f"s3://{bucket}/test_torch_image_s3_dataloader/" +def test_torch_image_s3(bucket, drop_last): + folder = f"test_torch_image_s3_{str(drop_last).lower()}" + path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) client_s3 = boto3.client("s3") labels = np.random.randint(0, 4, size=(8,)) @@ -136,7 +138,7 @@ def test_torch_image_s3_dataloader(bucket, drop_last): client_s3.put_object( Body=open("./docs/source/_static/logo.png", "rb").read(), Bucket=bucket, - Key=f"test_torch_image_s3_dataloader/class={label}/logo{i}.png", + Key=f"{folder}/class={label}/logo{i}.png", ContentType="image/png", ) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) @@ -181,14 +183,15 @@ def test_torch_audio_s3(bucket): audio = torch.randint(low=-25, high=25, size=size) / 100.0 audio_file = "/tmp/amazing_sound.wav" torchaudio.save(audio_file, audio, 8_000) - path = f"s3://{bucket}/test_torch_audio_s3/" - wr.s3.delete_objects(path=path, boto3_session=boto3.Session()) + folder = "test_torch_audio_s3" + path = f"s3://{bucket}/{folder}/" + wr.s3.delete_objects(path=path) s3 = boto3.client("s3") ref_label = 0 s3.put_object( Body=open(audio_file, "rb").read(), Bucket=bucket, - Key=f"test_torch_audio_s3/class={ref_label}/amazing_sound.wav", + Key=f"{folder}/class={ref_label}/amazing_sound.wav", ContentType="audio/wav", ) s3_audio_file = f"{bucket}/test_torch_audio_s3/class={ref_label}/amazing_sound.wav" @@ 
-196,6 +199,7 @@ def test_torch_audio_s3(bucket): loader = DataLoader(ds, batch_size=1) for (audio, rate), label in loader: assert audio.shape == torch.Size((1, *size)) + wr.s3.delete_objects(path=path) # def test_torch_s3_file_dataset(bucket): @@ -212,7 +216,7 @@ def test_torch_audio_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) def test_torch_s3_iterable(bucket, drop_last): - folder = "test_torch_s3_iterable" + folder = f"test_torch_s3_iterable_{str(drop_last).lower()}" path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) batch_size = 32 @@ -230,7 +234,7 @@ def test_torch_s3_iterable(bucket, drop_last): for image in DataLoader( wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}", + path=f"s3://{bucket}/{folder}/file", ), batch_size=batch_size, drop_last=drop_last, @@ -240,10 +244,12 @@ def test_torch_s3_iterable(bucket, drop_last): else: assert image[0].shape == torch.Size([3, 32, 32]) + wr.s3.delete_objects(path=path) + @pytest.mark.parametrize("drop_last", [True, False]) def test_torch_s3_iterable_with_labels(bucket, drop_last): - folder = "test_torch_s3_iterable_with_labels" + folder = f"test_torch_s3_iterable_with_labels_{str(drop_last).lower()}" path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) batch_size = 32 @@ -264,7 +270,7 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): for images, labels in DataLoader( wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}", + path=f"s3://{bucket}/{folder}/file", ), batch_size=batch_size, drop_last=drop_last, @@ -278,3 +284,6 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): assert images[0].shape == torch.Size([3, 32, 32]) assert labels[0].dtype == torch.int64 assert labels[0].shape == torch.Size([]) + + wr.s3.delete_objects(path=path) + From c091fa82e39e58375f5ea92527ed619c467ca974 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Thu, 23 Apr 2020 22:23:55 -0300 Subject: [PATCH 22/59] fix lint --- awswrangler/torch.py | 102 +++++++++---------------- requirements-torch.txt | 3 +- testing/test_awswrangler/test_torch.py | 42 ++++------ 3 files changed, 52 insertions(+), 95 deletions(-) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index c25e145ee..a5b589386 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -1,11 +1,11 @@ """PyTorch Module.""" -import logging import io +import logging import os -import tarfile import pathlib import re -from collections import Iterable +import tarfile +from collections.abc import Iterable from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Tuple, Union @@ -49,8 +49,8 @@ def __init__( path=path, suffix=suffix, boto3_session=self._session ) - def _fetch_data(self, path: str): - """Add parquet and csv support""" + def _fetch_data(self, path: str) -> Any: + """Add parquet and csv support.""" bucket, key = _utils.parse_path(path=path) buff = BytesIO() client_s3: boto3.client = _utils.client(service_name="s3", session=self._session) @@ -59,42 +59,23 @@ def _fetch_data(self, path: str): return buff @staticmethod - def _load_data(data: io.BytesIO, path: str): - if path.endswith('.tar.gz') or path.endswith('.tgz'): - pass - # tarfile.open(fileobj=data) + def _load_data(data: io.BytesIO, path: str) -> Any: + if path.endswith(".pt"): + data = torch.load(data) + elif path.endswith(".tar.gz") or path.endswith(".tgz"): + tarfile.open(fileobj=data) + raise NotImplementedError("Tar loader not implemented!") # tar = tarfile.open(fileobj=data) # for member in tar.getmembers(): - # 
print('member', member) - elif path.endswith('.pt'): - data = torch.load(data) + else: + raise NotImplementedError() + return data class _ListS3Dataset(_BaseS3Dataset, Dataset): """PyTorch Amazon S3 Map-Style List Dataset.""" - def __init__( - self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None - ): - """PyTorch Map-Style List S3 Dataset. - - Each file under path would be handle as a single tensor. - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - torch.utils.data.Dataset - - """ - super(_ListS3Dataset, self).__init__(path, suffix, boto3_session) - def __getitem__(self, index): path = self._paths[index] data = self._fetch_data(path) @@ -103,10 +84,10 @@ def __getitem__(self, index): def __len__(self): return len(self._paths) - def _data_fn(self, data): + def _data_fn(self, data) -> Any: pass - def _label_fn(self, path: str): + def _label_fn(self, path: str) -> Any: pass @@ -115,7 +96,7 @@ class _S3PartitionedDataset(_ListS3Dataset): def _label_fn(self, path: str) -> torch.Tensor: label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) - return torch.tensor([label]) + return torch.tensor([label]) # pylint: disable=not-callable # class S3FilesDataset(_BaseS3Dataset, Dataset): @@ -135,7 +116,8 @@ def _label_fn(self, path: str) -> torch.Tensor: # Parameters # ---------- # path : Union[str, List[str]] -# S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). +# S3 prefix (e.g. s3://bucket/prefix) or +# list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). # boto3_session : boto3.Session(), optional # Boto3 Session. The default boto3 session will be used if boto3_session receive None. # @@ -227,7 +209,6 @@ def __init__( Note ---- - This dataset assumes audio files are stored with the following structure: @@ -254,7 +235,6 @@ def __init__( Examples -------- - Create a Audio S3 Dataset >>> import awswrangler as wr @@ -349,43 +329,35 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto """ super(ImageS3Dataset, self).__init__(path, suffix, boto3_session) - def _data_fn(self, data: io.BytesIO) -> torch.Tensor: + def _data_fn(self, data: io.BytesIO) -> Any: image = Image.open(data) tensor = to_tensor(image) return tensor -class S3IterableDataset(_BaseS3Dataset, IterableDataset): - """PyTorch Amazon S3 Iterable Dataset.""" - - def __init__( - self, - path: Union[str, List[str]], - suffix: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, - ): - """PyTorch Amazon S3 Iterable Dataset. +class S3IterableDataset(IterableDataset, _BaseS3Dataset): # pylint: disable=abstract-method + """PyTorch Amazon S3 Iterable Dataset. - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + boto3_session : boto3.Session(), optional + Boto3 Session. 
The default boto3 session will be used if boto3_session receive None. - Returns - ------- - torch.utils.data.Dataset + Returns + ------- + torch.utils.data.Dataset - Examples - -------- - >>> import awswrangler as wr - >>> ds = wr.torch.S3IterableDataset('s3://bucket/path') + Examples + -------- + >>> import awswrangler as wr + >>> ds = wr.torch.S3IterableDataset('s3://bucket/path') - """ - super(S3IterableDataset, self).__init__(path, suffix, boto3_session) + """ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: + """Iterate over data returning tensors or expanding Iterables.""" for path in self._paths: data = self._fetch_data(path) data = self._load_data(data, path) diff --git a/requirements-torch.txt b/requirements-torch.txt index 325196f07..01d2c6e65 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,3 +1,4 @@ torch~=1.4.0 torchvision~=0.5.0 -torchaudio~=0.4.0 \ No newline at end of file +torchaudio~=0.4.0 +Pillow==7.1.1 diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 40ecf7050..19a300400 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -1,6 +1,6 @@ import io -import re import logging +import re import boto3 import numpy as np @@ -128,8 +128,8 @@ def test_torch_image_s3(bucket): @pytest.mark.parametrize("drop_last", [True, False]) -def test_torch_image_s3(bucket, drop_last): - folder = f"test_torch_image_s3_{str(drop_last).lower()}" +def test_torch_image_s3_loader(bucket, drop_last): + folder = f"test_torch_image_s3_loader_{str(drop_last).lower()}" path = f"s3://{bucket}/{folder}/" wr.s3.delete_objects(path=path) client_s3 = boto3.client("s3") @@ -146,7 +146,11 @@ def test_torch_image_s3(bucket, drop_last): num_train = len(ds) indices = list(range(num_train)) loader = DataLoader( - ds, batch_size=batch_size, num_workers=4, sampler=torch.utils.data.sampler.RandomSampler(indices), drop_last=drop_last + ds, + batch_size=batch_size, + num_workers=4, + sampler=torch.utils.data.sampler.RandomSampler(indices), + drop_last=drop_last, ) for i, (image, label) in enumerate(loader): assert image.shape == torch.Size([batch_size, 4, 494, 1636]) @@ -226,18 +230,10 @@ def test_torch_s3_iterable(bucket, drop_last): buff = io.BytesIO() torch.save(batch, buff) buff.seek(0) - client_s3.put_object( - Body=buff.read(), - Bucket=bucket, - Key=f"{folder}/file{i}.pt", - ) + client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") for image in DataLoader( - wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}/file", - ), - batch_size=batch_size, - drop_last=drop_last, + wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), batch_size=batch_size, drop_last=drop_last ): if drop_last: assert image.shape == torch.Size([batch_size, 3, 32, 32]) @@ -255,25 +251,14 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): batch_size = 32 client_s3 = boto3.client("s3") for i in range(3): - batch = ( - torch.randn(100, 3, 32, 32), - torch.randint(2, size=(100,)), - ) + batch = (torch.randn(100, 3, 32, 32), torch.randint(2, size=(100,))) buff = io.BytesIO() torch.save(batch, buff) buff.seek(0) - client_s3.put_object( - Body=buff.read(), - Bucket=bucket, - Key=f"{folder}/file{i}.pt", - ) + client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") for images, labels in DataLoader( - wr.torch.S3IterableDataset( - path=f"s3://{bucket}/{folder}/file", - ), - batch_size=batch_size, - 
drop_last=drop_last, + wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), batch_size=batch_size, drop_last=drop_last ): if drop_last: assert images.shape == torch.Size([batch_size, 3, 32, 32]) @@ -286,4 +271,3 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): assert labels[0].shape == torch.Size([]) wr.s3.delete_objects(path=path) - From 37b7f1e7edf9aa233d07bfd06baa92db80dc7cc3 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Fri, 24 Apr 2020 13:43:47 -0300 Subject: [PATCH 23/59] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d4a8a3cad..624ebc12c 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [11 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/11%20-%20CSV%20Datasets.ipynb) - [12 - CSV Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/12%20-%20CSV%20Crawler.ipynb) - [13 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/13%20-%20Merging%20Datasets%20on%20S3.ipynb) + - [14 - PyTorch](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/14%20-%20PyTorch.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3) - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog) From 33d74c4354ebfffd357c6730b0cacbc727d33185 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Fri, 24 Apr 2020 14:00:21 -0300 Subject: [PATCH 24/59] remove captalized requirement from docstring --- .github/workflows/static-checking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index bc33d9327..9f0701146 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -30,7 +30,7 @@ jobs: - name: CloudFormation Lint run: cfn-lint -t testing/cloudformation.yaml - name: Documentation Lint - run: pydocstyle awswrangler/ --add-ignore=D204 + run: pydocstyle awswrangler/ --add-ignore=D204,D403 - name: mypy check run: mypy awswrangler - name: Flake8 Lint From 4b05b36575237da68de61335d6f5db8777e5f2cc Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Fri, 24 Apr 2020 14:12:26 -0300 Subject: [PATCH 25/59] add torch requirements --- .github/workflows/static-checking.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index 9f0701146..56f978a50 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -27,6 +27,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r requirements-dev.txt + pip install -r requirements-torch.txt - name: CloudFormation Lint run: cfn-lint -t testing/cloudformation.yaml - name: Documentation Lint From 9ce624b60fe9de55ce928283e76841ed45a76ea2 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 25 Apr 2020 17:41:16 -0300 Subject: [PATCH 26/59] Add support to EMR with Docker --- awswrangler/__init__.py | 1 + awswrangler/_utils.py | 13 + awswrangler/athena.py | 2 +- awswrangler/emr.py | 335 ++++++++++++++++---- awswrangler/s3.py | 65 +++- docs/source/api.rst | 2 + requirements-dev.txt | 3 +- testing/test_awswrangler/test_cloudwatch.py | 2 +- 
testing/test_awswrangler/test_data_lake.py | 3 + testing/test_awswrangler/test_emr.py | 33 ++ testing/test_awswrangler/test_moto.py | 27 +- tutorials/15 - EMR.ipynb | 193 +++++++++++ tutorials/16 - EMR & Docker.ipynb | 269 ++++++++++++++++ 13 files changed, 869 insertions(+), 79 deletions(-) create mode 100644 tutorials/15 - EMR.ipynb create mode 100644 tutorials/16 - EMR & Docker.ipynb diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ce11c7ad5..4413ab5f4 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -9,5 +9,6 @@ from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa +from awswrangler._utils import get_account_id # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index 21a27d37e..df168bdb9 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -166,3 +166,16 @@ def ensure_postgresql_casts(): def get_directory(path: str) -> str: """Extract directory path.""" return path.rsplit(sep="/", maxsplit=1)[0] + "/" + + +def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str: + """Get Account ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + return client(service_name="sts", session=session).get_caller_identity().get("Account") + + +def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session] = None) -> str: + """Extract region from Subnet ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + client_ec2: boto3.client = client(service_name="ec2", session=session) + return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9] diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 1933606ad..d73c41063 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -68,7 +68,7 @@ def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str: """ session: boto3.Session = _utils.ensure_session(session=boto3_session) - account_id: str = _utils.client(service_name="sts", session=session).get_caller_identity().get("Account") + account_id: str = _utils.get_account_id(boto3_session=session) region_name: str = str(session.region_name).lower() s3_output = f"s3://aws-athena-query-results-{account_id}-{region_name}/" s3_resource = session.resource("s3") diff --git a/awswrangler/emr.py b/awswrangler/emr.py index aee470621..106a57da3 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -7,12 +7,76 @@ import boto3 # type: ignore -from awswrangler import _utils +from awswrangler import _utils, exceptions _logger: logging.Logger = logging.getLogger(__name__) +def _get_default_logging_path( + subnet_id: Optional[str] = None, + account_id: Optional[str] = None, + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Get EMR default logging path. + + E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Parameters + ---------- + subnet_id : str, optional + Subnet ID. If not provided, you must pass `account_id` and `region` explicit. + account_id: str, optional + Account ID. + region: str, optional + Region e.g. 'us-east-1' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Default logging path. + E.g. 
"s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Examples + -------- + >>> import awswrangler as wr + >>> state = wr.emr._get_default_logging_path("subnet-id") + 's3://aws-logs-{account_id}-{region}/elasticmapreduce/' + + """ + if account_id is None: + boto3_session = _utils.ensure_session(session=boto3_session) + _account_id: str = _utils.get_account_id(boto3_session=boto3_session) + else: + _account_id = account_id + if (region is None) and (subnet_id is not None): + boto3_session = _utils.ensure_session(session=boto3_session) + _region: str = _utils.get_region_from_subnet(subnet_id=subnet_id, boto3_session=boto3_session) + elif (region is None) and (subnet_id is None): + raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.") + else: + _region = region # type: ignore + return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/" + + +def _get_ecr_credentials_command() -> str: + return ( + "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email) && " + "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" + ) + + def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-statements + account_id: str = _utils.get_account_id(boto3_session=pars["boto3_session"]) + region: str = _utils.get_region_from_subnet(subnet_id=pars["subnet_id"], boto3_session=pars["boto3_session"]) + + # S3 Logging path + if pars.get("logging_s3_path") is None: + pars["logging_s3_path"] = _get_default_logging_path( + subnet_id=None, account_id=account_id, region=region, boto3_session=pars["boto3_session"] + ) spark_env: Optional[Dict[str, str]] = None yarn_env: Optional[Dict[str, str]] = None @@ -20,25 +84,25 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["spark_pyarrow"] is True: if pars["spark_defaults"] is None: - pars["spark_defaults"]: Dict[str, str] = {"spark.sql.execution.arrow.enabled": "true"} + pars["spark_defaults"] = {"spark.sql.execution.arrow.enabled": "true"} else: # pragma: no cover - pars["spark_defaults"]["spark.sql.execution.arrow.enabled"]: str = "true" + pars["spark_defaults"]["spark.sql.execution.arrow.enabled"] = "true" spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} if pars["python3"] is True: if spark_env is None: - spark_env: Dict[str, str] = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover + spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover else: - spark_env["PYSPARK_PYTHON"]: str = "/usr/bin/python3" + spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3" if pars["spark_jars_path"] is not None: paths: str = ",".join(pars["spark_jars_path"]) if pars["spark_defaults"] is None: # pragma: no cover - pars["spark_defaults"]: Dict[str, str] = {"spark.jars": paths} + pars["spark_defaults"] = {"spark.jars": paths} else: - pars["spark_defaults"]["spark.jars"]: str = paths + pars["spark_defaults"]["spark.jars"] = paths args: Dict[str, Any] = { "Name": pars["cluster_name"], @@ -72,9 +136,52 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"] # Configurations - args["Configurations"]: List[Dict[str, Any]] = [ + args["Configurations"] = [ {"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}} ] + if (pars["docker"] is True) or (pars["spark_docker"] is True) or (pars["hive_docker"] 
is True): + if pars.get("extra_registries") is None: + extra_registries: List[str] = [] + else: # pragma: no cover + extra_registries = pars["extra_registries"] + registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}" + registries = registries[:-1] if registries.endswith(",") else registries + args["Configurations"].append( + { + "Classification": "container-executor", + "Properties": {}, + "Configurations": [ + { + "Classification": "docker", + "Properties": { + "docker.privileged-containers.registries": registries, + "docker.trusted.registries": registries, + }, + "Configurations": [], + } + ], + } + ) + if pars["spark_docker"] is True: + if pars.get("spark_docker_image") is None: # pragma: no cover + raise exceptions.InvalidArgumentCombination("You must pass a spark_docker_image if spark_docker is True.") + pars["spark_defaults"] = {} if pars["spark_defaults"] is None else pars["spark_defaults"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars["spark_docker_image"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS"] = "/etc/passwd:/etc/passwd:ro" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars[ + "spark_docker_image" + ] + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS" + ] = "/etc/passwd:/etc/passwd:ro" if spark_env is not None: args["Configurations"].append( { @@ -109,16 +216,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Configurations": [], } ) + + hive_conf: Optional[Dict[str, Any]] = None + if (pars["hive_glue_catalog"] is True) or (pars["hive_docker"] is True): + hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} + if pars["hive_glue_catalog"] is True: - args["Configurations"].append( - { - "Classification": "hive-site", - "Properties": { - "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" # noqa - }, - "Configurations": [], - } - ) + hive_conf["Properties"][ + "hive.metastore.client.factory.class" + ] = "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" + if pars["hive_docker"] is True: + hive_conf["Properties"]["hive.execution.mode"] = "container" + + if hive_conf is not None: + args["Configurations"].append(hive_conf) + if pars["presto_glue_catalog"] is True: args["Configurations"].append( { @@ -147,20 +259,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Properties": pars["spark_defaults"], } args["Configurations"].append(spark_defaults) + if pars.get("custom_classifications") is not None: + for c in pars["custom_classifications"]: + args["Configurations"].append(c) # Applications if pars["applications"]: - args["Applications"]: List[Dict[str, str]] = [{"Name": x} for x in pars["applications"]] + args["Applications"] = [{"Name": x} for x in pars["applications"]] # Bootstraps if pars["bootstraps_paths"]: # 
pragma: no cover - args["BootstrapActions"]: List[Dict] = [ - {"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"] - ] + args["BootstrapActions"] = [{"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"]] # Debugging and Steps if (pars["debugging"] is True) or (pars["steps"] is not None): - args["Steps"]: List[Dict[str, Any]] = [] + args["Steps"] = [] if pars["debugging"] is True: args["Steps"].append( { @@ -169,6 +282,17 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]}, } ) + if pars["ecr_credentials_step"] is True: + args["Steps"].append( + build_step( + name="ECR Credentials Setup", + command=_get_ecr_credentials_command(), + action_on_failure="TERMINATE_CLUSTER", + script=False, + region=region, + boto3_session=pars["boto3_session"], + ) + ) if pars["steps"] is not None: args["Steps"] += pars["steps"] @@ -199,7 +323,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_master"] > 0: # pragma: no cover - fleet_master["LaunchSpecifications"]: Dict = { + fleet_master["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"], "TimeoutAction": timeout_action_master, @@ -236,7 +360,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_core"] > 0: - fleet_core["LaunchSpecifications"]: Dict = { + fleet_core["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_core"], "TimeoutAction": timeout_action_core, @@ -275,7 +399,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_task"] > 0: - fleet_task["LaunchSpecifications"]: Dict = { + fleet_task["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_task"], "TimeoutAction": timeout_action_task, @@ -292,30 +416,30 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused-argument - cluster_name: str, - logging_s3_path: str, - emr_release: str, subnet_id: str, - emr_ec2_role: str, - emr_role: str, - instance_type_master: str, - instance_type_core: str, - instance_type_task: str, - instance_ebs_size_master: int, - instance_ebs_size_core: int, - instance_ebs_size_task: int, - instance_num_on_demand_master: int, - instance_num_on_demand_core: int, - instance_num_on_demand_task: int, - instance_num_spot_master: int, - instance_num_spot_core: int, - instance_num_spot_task: int, - spot_bid_percentage_of_on_demand_master: int, - spot_bid_percentage_of_on_demand_core: int, - spot_bid_percentage_of_on_demand_task: int, - spot_provisioning_timeout_master: int, - spot_provisioning_timeout_core: int, - spot_provisioning_timeout_task: int, + cluster_name: str = "my-emr-cluster", + logging_s3_path: Optional[str] = None, + emr_release: str = "emr-6.0.0", + emr_ec2_role: str = "EMR_EC2_DefaultRole", + emr_role: str = "EMR_DefaultRole", + instance_type_master: str = "r5.xlarge", + instance_type_core: str = "r5.xlarge", + instance_type_task: str = "r5.xlarge", + instance_ebs_size_master: int = 64, + instance_ebs_size_core: int = 64, + instance_ebs_size_task: int = 64, + instance_num_on_demand_master: int = 1, + instance_num_on_demand_core: 
int = 0, + instance_num_on_demand_task: int = 0, + instance_num_spot_master: int = 0, + instance_num_spot_core: int = 0, + instance_num_spot_task: int = 0, + spot_bid_percentage_of_on_demand_master: int = 100, + spot_bid_percentage_of_on_demand_core: int = 100, + spot_bid_percentage_of_on_demand_task: int = 100, + spot_provisioning_timeout_master: int = 5, + spot_provisioning_timeout_core: int = 5, + spot_provisioning_timeout_task: int = 5, spot_timeout_to_on_demand_master: bool = True, spot_timeout_to_on_demand_core: bool = True, spot_timeout_to_on_demand_task: bool = True, @@ -337,10 +461,17 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_slave: Optional[str] = None, security_groups_slave_additional: Optional[List[str]] = None, security_group_service_access: Optional[str] = None, + docker: bool = False, spark_log_level: str = "WARN", spark_jars_path: Optional[List[str]] = None, spark_defaults: Optional[Dict[str, str]] = None, spark_pyarrow: bool = False, + spark_docker: bool = False, + spark_docker_image: str = None, + hive_docker: bool = False, + ecr_credentials_step: bool = False, + extra_public_registries: Optional[List[str]] = None, + custom_classifications: Optional[List[Dict[str, Any]]] = None, maximize_resource_allocation: bool = False, steps: Optional[List[Dict[str, Any]]] = None, keep_cluster_alive_when_no_steps: bool = True, @@ -354,18 +485,19 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Parameters ---------- + subnet_id : str + VPC subnet ID. cluster_name : str Cluster name. - logging_s3_path : str + logging_s3_path : str, optional Logging s3 path (e.g. s3://BUCKET_NAME/DIRECTORY_NAME/). + If None, the default is `s3://aws-logs-{AccountId}-{RegionId}/elasticmapreduce/` emr_release : str EMR release (e.g. emr-5.28.0). emr_ec2_role : str IAM role name. emr_role : str IAM role name. - subnet_id : str - VPC subnet ID. instance_type_master : str EC2 instance type. instance_type_core : str @@ -448,6 +580,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Debugging enabled? applications : List[str], optional List of applications (e.g ["Hadoop", "Spark", "Ganglia", "Hive"]). + If None, ["Spark"] will be considered. visible_to_all_users : bool True or False. key_pair_name : str, optional @@ -465,6 +598,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_service_access : str, optional The identifier of the Amazon EC2 security group for the Amazon EMR service to access clusters in VPC private subnets. + docker : bool + Enable Docker Hub and ECR registries access. spark_log_level : str log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE). spark_jars_path : List[str], optional @@ -475,6 +610,18 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused spark_pyarrow : bool Enable PySpark to use PyArrow behind the scenes. P.S. You must install pyarrow by your self via bootstrap + spark_docker : bool = False + Add necessary Spark Defaults to run on Docker + spark_docker_image : str, optional + E.g. {ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG} + hive_docker : bool + Add necessary configurations to run on Docker + ecr_credentials_step : bool + Add a extra step during the Cluster launch to retrieve ECR auth files. + extra_public_registries: List[str], optional + Additional registries. 
+ custom_classifications: List[Dict[str, Any]], optional + Extra classifications. maximize_resource_allocation : bool Configure your executors to utilize the maximum resources possible https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#emr-spark-maximizeresourceallocation @@ -500,6 +647,21 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Examples -------- + Minimal Example + + >>> cluster_id = wr.emr.create_cluster("SUBNET_ID") + + Minimal Exmaple on Docker + + >>> cluster_id = wr.emr.create_cluster( + >>> subnet_id="SUBNET_ID", + >>> spark_docker=True, + >>> spark_docker_image="{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}", + >>> ecr_credentials_step=True + >>> ) + + Full Example + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster( ... cluster_name="wrangler_cluster", @@ -548,6 +710,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused ... }) """ + applications = ["Spark"] if applications is None else applications + boto3_session = _utils.ensure_session(session=boto3_session) args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) @@ -647,8 +811,8 @@ def submit_steps( def submit_step( cluster_id: str, - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, boto3_session: Optional[boto3.Session] = None, @@ -659,11 +823,11 @@ def submit_step( ---------- cluster_id : str Cluster ID. - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool @@ -698,26 +862,29 @@ def submit_step( def build_step( - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, + region: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, Any]: """Build the Step structure (dictionary). Parameters ---------- - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool - True for raw command or False for script runner. + False for raw command or True for script runner. https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html + region: str, optional + Region name to not get it from boto3.Session. (e.g. `us-east-1`) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
@@ -734,14 +901,17 @@ def build_step( >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps) """ - session: boto3.Session = _utils.ensure_session(session=boto3_session) jar: str = "command-runner.jar" if script is True: - if session.region_name is not None: - region: str = session.region_name - else: # pragma: no cover - region = "us-east-1" - jar = f"s3://{region}.elasticmapreduce/libs/script-runner/script-runner.jar" + if region is not None: # pragma: no cover + _region: str = region + else: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if session.region_name is not None: + _region = session.region_name + else: # pragma: no cover + _region = "us-east-1" + jar = f"s3://{_region}.elasticmapreduce/libs/script-runner/script-runner.jar" step: Dict[str, Any] = { "Name": name, "ActionOnFailure": action_on_failure, @@ -780,3 +950,40 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") return response["Step"]["Status"]["State"] + + +def update_ecr_credentials( + cluster_id: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None +) -> str: + """Update internal ECR credentials. + + Parameters + ---------- + cluster_id : str + Cluster ID. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Step ID. + + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.update_ecr_credentials("cluster_id") + + """ + name: str = "Update ECR Credentials" + command: str = _get_ecr_credentials_command() + session: boto3.Session = _utils.ensure_session(session=boto3_session) + step: Dict[str, Any] = build_step( + name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session + ) + client_emr: boto3.client = _utils.client(service_name="emr", session=session) + response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) + _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + return response["StepIds"][0] diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..527c1ae76 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -111,6 +111,40 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) raise ex # pragma: no cover + +def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 directories (prefixes) from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of directory paths.
+ + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_directories('s3://bucket/prefix/') + ['s3://bucket/prefix/dir0/', 's3://bucket/prefix/dir1/', 's3://bucket/prefix/dir2/'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_directories('s3://bucket/prefix/', boto3_session=boto3.Session()) + ['s3://bucket/prefix/dir0/', 's3://bucket/prefix/dir1/', 's3://bucket/prefix/dir2/'] + + """ + return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) + + def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: """List Amazon S3 objects from a prefix. @@ -142,20 +176,37 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] """ + return _list_objects(path=path, delimiter=None, boto3_session=boto3_session) + + +def _list_objects( + path: str, delimiter: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) paginator = client_s3.get_paginator("list_objects_v2") bucket: str prefix: str bucket, prefix = _utils.parse_path(path=path) - response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix, PaginationConfig={"PageSize": 1000}) + args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} + if delimiter is not None: + args["Delimiter"] = delimiter + response_iterator = paginator.paginate(**args) paths: List[str] = [] for page in response_iterator: - contents: Optional[List] = page.get("Contents") - if contents is not None: - for content in contents: - if (content is not None) and ("Key" in content): - key: str = content["Key"] - paths.append(f"s3://{bucket}/{key}") + if delimiter is None: + contents: Optional[List[Optional[Dict[str, str]]]] = page.get("Contents") + if contents is not None: + for content in contents: + if (content is not None) and ("Key" in content): + key: str = content["Key"] + paths.append(f"s3://{bucket}/{key}") + else: + prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") + if prefixes is not None: + for pfx in prefixes: + if (pfx is not None) and ("Prefix" in pfx): + key = pfx["Prefix"] + paths.append(f"s3://{bucket}/{key}") return paths diff --git a/docs/source/api.rst b/docs/source/api.rst index 897fc7a3e..7d2d51602 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -16,6 +16,7 @@ Amazon S3 does_object_exist get_bucket_region list_objects + list_directories read_csv read_fwf read_json @@ -115,6 +116,7 @@ EMR submit_steps build_step get_step_state + update_ecr_credentials CloudWatch Logs --------------- diff --git a/requirements-dev.txt b/requirements-dev.txt index 3fdd3cdf3..99a9b0730 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,4 +17,5 @@ twine~=3.1.1 wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 \ No newline at end of file +moto~=1.3.14 +jupyterlab~=2.1.1 \ No newline at end of file diff --git a/testing/test_awswrangler/test_cloudwatch.py b/testing/test_awswrangler/test_cloudwatch.py index f59b8b3dd..eced7a754 100644 --- a/testing/test_awswrangler/test_cloudwatch.py +++ b/testing/test_awswrangler/test_cloudwatch.py @@ -48,7 +48,7 @@ def loggroup(cloudformation_outputs): def test_query_cancelled(loggroup): client_logs = boto3.client("logs") query_id = wr.cloudwatch.start_query( -
log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc | limit 5" + log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc" ) client_logs.stop_query(queryId=query_id) with pytest.raises(exceptions.QueryCancelled): diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index afa2a8307..bd53d4bad 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -127,6 +127,9 @@ def test_athena_ctas(bucket, database, kms_key): partition_cols=["par0", "par1"], )["paths"] wr.s3.wait_objects_exist(paths=paths) + dirs = wr.s3.list_directories(path=f"s3://{bucket}/test_athena_ctas/") + for d in dirs: + assert d.startswith(f"s3://{bucket}/test_athena_ctas/par0=") df = wr.s3.read_parquet_table(table="test_athena_ctas", database=database) assert len(df.index) == 3 ensure_data_types(df=df, has_list=True) diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index e64329b33..df2dab1cb 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -146,3 +146,36 @@ def test_cluster_single_node(bucket, cloudformation_outputs): wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) wr.emr.terminate_cluster(cluster_id=cluster_id) wr.s3.delete_objects(f"s3://{bucket}/emr-logs/") + + +def test_default_logging_path(cloudformation_outputs): + path = wr.emr._get_default_logging_path(subnet_id=cloudformation_outputs["SubnetId"]) + assert path.startswith("s3://aws-logs-") + assert path.endswith("/elasticmapreduce/") + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.emr._get_default_logging_path() + + +def test_docker(cloudformation_outputs): + cluster_id = wr.emr.create_cluster( + subnet_id=cloudformation_outputs["SubnetId"], + docker=True, + spark_docker=True, + spark_docker_image="787535711150.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + hive_docker=True, + ecr_credentials_step=True, + custom_classifications=[ + { + "Classification": "livy-conf", + "Properties": { + "livy.spark.master": "yarn", + "livy.spark.deploy-mode": "cluster", + "livy.server.session.timeout": "16h", + }, + } + ], + steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")], + ) + wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://igor-tavares/emr.py") + wr.emr.update_ecr_credentials(cluster_id=cluster_id) + wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/testing/test_awswrangler/test_moto.py b/testing/test_awswrangler/test_moto.py index db12dbe1a..2adc7aec8 100644 --- a/testing/test_awswrangler/test_moto.py +++ b/testing/test_awswrangler/test_moto.py @@ -20,6 +20,21 @@ def emr(): yield True +@pytest.fixture(scope="module") +def sts(): + with moto.mock_sts(): + yield True + + +@pytest.fixture(scope="module") +def subnet(): + with moto.mock_ec2(): + ec2 = boto3.resource("ec2", region_name="us-west-1") + vpc = ec2.create_vpc(CidrBlock="10.0.0.0/16") + subnet = ec2.create_subnet(VpcId=vpc.id, CidrBlock="10.0.0.0/24", AvailabilityZone="us-west-1a") + yield subnet.id + + def test_csv(s3): path = "s3://bucket/test.csv" wr.s3.to_csv(df=get_df_csv(), path=path, index=False) @@ -37,12 +52,13 @@ def test_parquet(s3): assert len(df.columns) == 18 -def test_emr(s3, emr): +def test_emr(s3, emr, sts, subnet): + session = boto3.Session(region_name="us-west-1") cluster_id = wr.emr.create_cluster( 
cluster_name="wrangler_cluster", logging_s3_path="s3://bucket/emr-logs/", emr_release="emr-5.29.0", - subnet_id="foo", + subnet_id=subnet, emr_ec2_role="EMR_EC2_DefaultRole", emr_role="EMR_DefaultRole", instance_type_master="m5.xlarge", @@ -87,11 +103,12 @@ def test_emr(s3, emr): termination_protected=False, spark_pyarrow=False, tags={"foo": "boo", "bar": "xoo"}, + boto3_session=session, ) - wr.emr.get_cluster_state(cluster_id=cluster_id) + wr.emr.get_cluster_state(cluster_id=cluster_id, boto3_session=session) steps = [] for cmd in ['echo "Hello"', "ls -la"]: steps.append(wr.emr.build_step(name=cmd, command=cmd)) - wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) - wr.emr.terminate_cluster(cluster_id=cluster_id) + wr.emr.submit_steps(cluster_id=cluster_id, steps=steps, boto3_session=session) + wr.emr.terminate_cluster(cluster_id=cluster_id, boto3_session=session) wr.s3.delete_objects("s3://bucket/emr-logs/") diff --git a/tutorials/15 - EMR.ipynb b/tutorials/15 - EMR.ipynb new file mode 100644 index 000000000..4e1c627e6 --- /dev/null +++ b/tutorials/15 - EMR.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 15 - EMR" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_id = wr.emr.create_cluster(subnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit s3://{bucket}/test.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], 
+ "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb new file mode 100644 index 000000000..138759d8f --- /dev/null +++ b/tutorials/16 - EMR & Docker.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 16 - EMR & Docker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Build and Upload Docker Image to ECR repository\n", + "\n", + "Replace the `{ACCOUNT_ID}` placeholder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%writefile Dockerfile\n" + } + }, + "outputs": [], + "source": [ + "%%writefile Dockerfile\n", + "\n", + "FROM amazoncorretto:8\n", + "\n", + "RUN yum -y update\n", + "RUN yum -y install yum-utils\n", + "RUN yum -y groupinstall development\n", + "\n", + "RUN yum list python3*\n", + "RUN yum -y install python3 python3-dev python3-pip python3-virtualenv\n", + "\n", + "RUN python -V\n", + "RUN python3 -V\n", + "\n", + "ENV PYSPARK_DRIVER_PYTHON python3\n", + "ENV PYSPARK_PYTHON python3\n", + "\n", + "RUN pip3 install --upgrade pip\n", + "RUN pip3 install awswrangler\n", + "\n", + "RUN python3 -c \"import awswrangler as wr\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "docker build -t 'local/emr-wrangler' .\n", + "aws ecr create-repository --repository-name emr-wrangler\n", + "docker tag local/emr-wrangler {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\n", + "eval $(aws ecr get-login --region us-east-1 --no-include-email)\n", + "docker push {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "cluster_id = wr.emr.create_cluster(\n", + " subnet_id=subnet,\n", + " spark_docker=True,\n", + " spark_docker_image=DOCKER_IMAGE,\n", + " ecr_credentials_step=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\n", + "import awswrangler as wr\n", + "\n", + "print(f\"Wrangler version: {wr.__version__}\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test_docker.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From c2db8cd27bbd80da857b40df737385c4bd254eb7 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 25 Apr 2020 17:41:16 -0300 Subject: [PATCH 27/59] Add support to EMR with Docker #193 --- awswrangler/__init__.py | 1 + awswrangler/_utils.py | 13 + awswrangler/athena.py | 2 +- awswrangler/emr.py | 335 ++++++++++++++++---- awswrangler/s3.py | 65 +++- docs/source/api.rst | 2 + requirements-dev.txt | 3 +- testing/test_awswrangler/test_cloudwatch.py | 2 +- testing/test_awswrangler/test_data_lake.py | 3 + testing/test_awswrangler/test_emr.py | 33 ++ testing/test_awswrangler/test_moto.py | 27 +- tutorials/15 - EMR.ipynb | 193 +++++++++++ tutorials/16 - EMR & Docker.ipynb | 269 ++++++++++++++++ 13 files changed, 869 insertions(+), 79 deletions(-) create mode 100644 tutorials/15 - EMR.ipynb create mode 100644 tutorials/16 - EMR & Docker.ipynb diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ce11c7ad5..4413ab5f4 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -9,5 +9,6 @@ from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa +from awswrangler._utils import get_account_id # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index 21a27d37e..df168bdb9 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -166,3 +166,16 @@ def ensure_postgresql_casts(): def get_directory(path: str) -> str: """Extract directory path.""" return path.rsplit(sep="/", maxsplit=1)[0] + "/" + + +def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str: + """Get Account ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + return client(service_name="sts", session=session).get_caller_identity().get("Account") + + +def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session] = None) -> str: + """Extract region from Subnet ID.""" + session: boto3.Session = ensure_session(session=boto3_session) + client_ec2: boto3.client = client(service_name="ec2", session=session) + return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9] diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 1933606ad..d73c41063 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -68,7 +68,7 @@ def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str: """ session: boto3.Session = _utils.ensure_session(session=boto3_session) - account_id: str = _utils.client(service_name="sts", session=session).get_caller_identity().get("Account") + account_id: str = _utils.get_account_id(boto3_session=session) region_name: str = str(session.region_name).lower() s3_output = f"s3://aws-athena-query-results-{account_id}-{region_name}/" s3_resource = session.resource("s3") diff --git a/awswrangler/emr.py b/awswrangler/emr.py index aee470621..106a57da3 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -7,12 +7,76 @@ import boto3 # type: ignore -from awswrangler import _utils +from awswrangler import _utils, exceptions 
_logger: logging.Logger = logging.getLogger(__name__) +def _get_default_logging_path( + subnet_id: Optional[str] = None, + account_id: Optional[str] = None, + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Get EMR default logging path. + + E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Parameters + ---------- + subnet_id : str, optional + Subnet ID. If not provided, you must pass `account_id` and `region` explicit. + account_id: str, optional + Account ID. + region: str, optional + Region e.g. 'us-east-1' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Default logging path. + E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/" + + Examples + -------- + >>> import awswrangler as wr + >>> state = wr.emr._get_default_logging_path("subnet-id") + 's3://aws-logs-{account_id}-{region}/elasticmapreduce/' + + """ + if account_id is None: + boto3_session = _utils.ensure_session(session=boto3_session) + _account_id: str = _utils.get_account_id(boto3_session=boto3_session) + else: + _account_id = account_id + if (region is None) and (subnet_id is not None): + boto3_session = _utils.ensure_session(session=boto3_session) + _region: str = _utils.get_region_from_subnet(subnet_id=subnet_id, boto3_session=boto3_session) + elif (region is None) and (subnet_id is None): + raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.") + else: + _region = region # type: ignore + return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/" + + +def _get_ecr_credentials_command() -> str: + return ( + "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email) && " + "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" + ) + + def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-statements + account_id: str = _utils.get_account_id(boto3_session=pars["boto3_session"]) + region: str = _utils.get_region_from_subnet(subnet_id=pars["subnet_id"], boto3_session=pars["boto3_session"]) + + # S3 Logging path + if pars.get("logging_s3_path") is None: + pars["logging_s3_path"] = _get_default_logging_path( + subnet_id=None, account_id=account_id, region=region, boto3_session=pars["boto3_session"] + ) spark_env: Optional[Dict[str, str]] = None yarn_env: Optional[Dict[str, str]] = None @@ -20,25 +84,25 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["spark_pyarrow"] is True: if pars["spark_defaults"] is None: - pars["spark_defaults"]: Dict[str, str] = {"spark.sql.execution.arrow.enabled": "true"} + pars["spark_defaults"] = {"spark.sql.execution.arrow.enabled": "true"} else: # pragma: no cover - pars["spark_defaults"]["spark.sql.execution.arrow.enabled"]: str = "true" + pars["spark_defaults"]["spark.sql.execution.arrow.enabled"] = "true" spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"} if pars["python3"] is True: if spark_env is None: - spark_env: Dict[str, str] = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover + spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"} # pragma: no cover else: - spark_env["PYSPARK_PYTHON"]: str = "/usr/bin/python3" + spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3" if pars["spark_jars_path"] is not None: paths: str = ",".join(pars["spark_jars_path"]) if pars["spark_defaults"] is None: # pragma: no 
cover - pars["spark_defaults"]: Dict[str, str] = {"spark.jars": paths} + pars["spark_defaults"] = {"spark.jars": paths} else: - pars["spark_defaults"]["spark.jars"]: str = paths + pars["spark_defaults"]["spark.jars"] = paths args: Dict[str, Any] = { "Name": pars["cluster_name"], @@ -72,9 +136,52 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"] # Configurations - args["Configurations"]: List[Dict[str, Any]] = [ + args["Configurations"] = [ {"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}} ] + if (pars["docker"] is True) or (pars["spark_docker"] is True) or (pars["hive_docker"] is True): + if pars.get("extra_registries") is None: + extra_registries: List[str] = [] + else: # pragma: no cover + extra_registries = pars["extra_registries"] + registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}" + registries = registries[:-1] if registries.endswith(",") else registries + args["Configurations"].append( + { + "Classification": "container-executor", + "Properties": {}, + "Configurations": [ + { + "Classification": "docker", + "Properties": { + "docker.privileged-containers.registries": registries, + "docker.trusted.registries": registries, + }, + "Configurations": [], + } + ], + } + ) + if pars["spark_docker"] is True: + if pars.get("spark_docker_image") is None: # pragma: no cover + raise exceptions.InvalidArgumentCombination("You must pass a spark_docker_image if spark_docker is True.") + pars["spark_defaults"] = {} if pars["spark_defaults"] is None else pars["spark_defaults"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars["spark_docker_image"] + pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS"] = "/etc/passwd:/etc/passwd:ro" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" + ] = "hdfs:///user/hadoop/config.json" + pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars[ + "spark_docker_image" + ] + pars["spark_defaults"][ + "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS" + ] = "/etc/passwd:/etc/passwd:ro" if spark_env is not None: args["Configurations"].append( { @@ -109,16 +216,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Configurations": [], } ) + + hive_conf: Optional[Dict[str, Any]] = None + if (pars["hive_glue_catalog"] is True) or (pars["hive_docker"] is True): + hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} + if pars["hive_glue_catalog"] is True: - args["Configurations"].append( - { - "Classification": "hive-site", - "Properties": { - "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" # noqa - }, - "Configurations": [], - } - ) + hive_conf["Properties"][ + "hive.metastore.client.factory.class" + ] = "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" + if pars["hive_docker"] is True: + 
hive_conf["Properties"]["hive.execution.mode"] = "container" + + if hive_conf is not None: + args["Configurations"].append(hive_conf) + if pars["presto_glue_catalog"] is True: args["Configurations"].append( { @@ -147,20 +259,21 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Properties": pars["spark_defaults"], } args["Configurations"].append(spark_defaults) + if pars.get("custom_classifications") is not None: + for c in pars["custom_classifications"]: + args["Configurations"].append(c) # Applications if pars["applications"]: - args["Applications"]: List[Dict[str, str]] = [{"Name": x} for x in pars["applications"]] + args["Applications"] = [{"Name": x} for x in pars["applications"]] # Bootstraps if pars["bootstraps_paths"]: # pragma: no cover - args["BootstrapActions"]: List[Dict] = [ - {"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"] - ] + args["BootstrapActions"] = [{"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"]] # Debugging and Steps if (pars["debugging"] is True) or (pars["steps"] is not None): - args["Steps"]: List[Dict[str, Any]] = [] + args["Steps"] = [] if pars["debugging"] is True: args["Steps"].append( { @@ -169,6 +282,17 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]}, } ) + if pars["ecr_credentials_step"] is True: + args["Steps"].append( + build_step( + name="ECR Credentials Setup", + command=_get_ecr_credentials_command(), + action_on_failure="TERMINATE_CLUSTER", + script=False, + region=region, + boto3_session=pars["boto3_session"], + ) + ) if pars["steps"] is not None: args["Steps"] += pars["steps"] @@ -199,7 +323,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_master"] > 0: # pragma: no cover - fleet_master["LaunchSpecifications"]: Dict = { + fleet_master["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"], "TimeoutAction": timeout_action_master, @@ -236,7 +360,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_core"] > 0: - fleet_core["LaunchSpecifications"]: Dict = { + fleet_core["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_core"], "TimeoutAction": timeout_action_core, @@ -275,7 +399,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } if pars["instance_num_spot_task"] > 0: - fleet_task["LaunchSpecifications"]: Dict = { + fleet_task["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_task"], "TimeoutAction": timeout_action_task, @@ -292,30 +416,30 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused-argument - cluster_name: str, - logging_s3_path: str, - emr_release: str, subnet_id: str, - emr_ec2_role: str, - emr_role: str, - instance_type_master: str, - instance_type_core: str, - instance_type_task: str, - instance_ebs_size_master: int, - instance_ebs_size_core: int, - instance_ebs_size_task: int, - instance_num_on_demand_master: int, - instance_num_on_demand_core: int, - instance_num_on_demand_task: int, - instance_num_spot_master: int, - instance_num_spot_core: int, - 
instance_num_spot_task: int, - spot_bid_percentage_of_on_demand_master: int, - spot_bid_percentage_of_on_demand_core: int, - spot_bid_percentage_of_on_demand_task: int, - spot_provisioning_timeout_master: int, - spot_provisioning_timeout_core: int, - spot_provisioning_timeout_task: int, + cluster_name: str = "my-emr-cluster", + logging_s3_path: Optional[str] = None, + emr_release: str = "emr-6.0.0", + emr_ec2_role: str = "EMR_EC2_DefaultRole", + emr_role: str = "EMR_DefaultRole", + instance_type_master: str = "r5.xlarge", + instance_type_core: str = "r5.xlarge", + instance_type_task: str = "r5.xlarge", + instance_ebs_size_master: int = 64, + instance_ebs_size_core: int = 64, + instance_ebs_size_task: int = 64, + instance_num_on_demand_master: int = 1, + instance_num_on_demand_core: int = 0, + instance_num_on_demand_task: int = 0, + instance_num_spot_master: int = 0, + instance_num_spot_core: int = 0, + instance_num_spot_task: int = 0, + spot_bid_percentage_of_on_demand_master: int = 100, + spot_bid_percentage_of_on_demand_core: int = 100, + spot_bid_percentage_of_on_demand_task: int = 100, + spot_provisioning_timeout_master: int = 5, + spot_provisioning_timeout_core: int = 5, + spot_provisioning_timeout_task: int = 5, spot_timeout_to_on_demand_master: bool = True, spot_timeout_to_on_demand_core: bool = True, spot_timeout_to_on_demand_task: bool = True, @@ -337,10 +461,17 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_slave: Optional[str] = None, security_groups_slave_additional: Optional[List[str]] = None, security_group_service_access: Optional[str] = None, + docker: bool = False, spark_log_level: str = "WARN", spark_jars_path: Optional[List[str]] = None, spark_defaults: Optional[Dict[str, str]] = None, spark_pyarrow: bool = False, + spark_docker: bool = False, + spark_docker_image: str = None, + hive_docker: bool = False, + ecr_credentials_step: bool = False, + extra_public_registries: Optional[List[str]] = None, + custom_classifications: Optional[List[Dict[str, Any]]] = None, maximize_resource_allocation: bool = False, steps: Optional[List[Dict[str, Any]]] = None, keep_cluster_alive_when_no_steps: bool = True, @@ -354,18 +485,19 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Parameters ---------- + subnet_id : str + VPC subnet ID. cluster_name : str Cluster name. - logging_s3_path : str + logging_s3_path : str, optional Logging s3 path (e.g. s3://BUCKET_NAME/DIRECTORY_NAME/). + If None, the default is `s3://aws-logs-{AccountId}-{RegionId}/elasticmapreduce/` emr_release : str EMR release (e.g. emr-5.28.0). emr_ec2_role : str IAM role name. emr_role : str IAM role name. - subnet_id : str - VPC subnet ID. instance_type_master : str EC2 instance type. instance_type_core : str @@ -448,6 +580,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Debugging enabled? applications : List[str], optional List of applications (e.g ["Hadoop", "Spark", "Ganglia", "Hive"]). + If None, ["Spark"] will be considered. visible_to_all_users : bool True or False. key_pair_name : str, optional @@ -465,6 +598,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_group_service_access : str, optional The identifier of the Amazon EC2 security group for the Amazon EMR service to access clusters in VPC private subnets. + docker : bool + Enable Docker Hub and ECR registries access. 
spark_log_level : str log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE). spark_jars_path : List[str], optional @@ -475,6 +610,18 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused spark_pyarrow : bool Enable PySpark to use PyArrow behind the scenes. P.S. You must install pyarrow by your self via bootstrap + spark_docker : bool = False + Add necessary Spark Defaults to run on Docker + spark_docker_image : str, optional + E.g. {ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG} + hive_docker : bool + Add necessary configurations to run on Docker + ecr_credentials_step : bool + Add a extra step during the Cluster launch to retrieve ECR auth files. + extra_public_registries: List[str], optional + Additional registries. + custom_classifications: List[Dict[str, Any]], optional + Extra classifications. maximize_resource_allocation : bool Configure your executors to utilize the maximum resources possible https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#emr-spark-maximizeresourceallocation @@ -500,6 +647,21 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused Examples -------- + Minimal Example + + >>> cluster_id = wr.emr.create_cluster("SUBNET_ID") + + Minimal Exmaple on Docker + + >>> cluster_id = wr.emr.create_cluster( + >>> subnet_id="SUBNET_ID", + >>> spark_docker=True, + >>> spark_docker_image="{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}", + >>> ecr_credentials_step=True + >>> ) + + Full Example + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster( ... cluster_name="wrangler_cluster", @@ -548,6 +710,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused ... }) """ + applications = ["Spark"] if applications is None else applications + boto3_session = _utils.ensure_session(session=boto3_session) args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) @@ -647,8 +811,8 @@ def submit_steps( def submit_step( cluster_id: str, - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, boto3_session: Optional[boto3.Session] = None, @@ -659,11 +823,11 @@ def submit_step( ---------- cluster_id : str Cluster ID. - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool @@ -698,26 +862,29 @@ def submit_step( def build_step( - name: str, command: str, + name: str = "my-step", action_on_failure: str = "CONTINUE", script: bool = False, + region: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, Any]: """Build the Step structure (dictionary). Parameters ---------- - name : str - Step name. command : str e.g. 'echo "Hello!"' e.g. for script 's3://.../script.sh arg1 arg2' + name : str, optional + Step name. action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' script : bool - True for raw command or False for script runner. + False for raw command or True for script runner. https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html + region: str, optional + Region name to not get it from boto3.Session. (e.g. 
`us-east-1`) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -734,14 +901,17 @@ def build_step( >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps) """ - session: boto3.Session = _utils.ensure_session(session=boto3_session) jar: str = "command-runner.jar" if script is True: - if session.region_name is not None: - region: str = session.region_name - else: # pragma: no cover - region = "us-east-1" - jar = f"s3://{region}.elasticmapreduce/libs/script-runner/script-runner.jar" + if region is not None: # pragma: no cover + _region: str = region + else: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if session.region_name is not None: + _region = session.region_name + else: # pragma: no cover + _region = "us-east-1" + jar = f"s3://{_region}.elasticmapreduce/libs/script-runner/script-runner.jar" step: Dict[str, Any] = { "Name": name, "ActionOnFailure": action_on_failure, @@ -780,3 +950,40 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") return response["Step"]["Status"]["State"] + + +def update_ecr_credentials( + cluster_id: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None +) -> str: + """Update internal ECR credentials. + + Parameters + ---------- + cluster_id : str + Cluster ID. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Step ID. + + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.update_ecr_credentials("cluster_id") + + """ + name: str = "Update ECR Credentials" + command: str = _get_ecr_credentials_command() + session: boto3.Session = _utils.ensure_session(session=boto3_session) + step: Dict[str, Any] = build_step( + name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session + ) + client_emr: boto3.client = _utils.client(service_name="emr", session=session) + response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) + _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + return response["StepIds"][0] diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f728937db..527c1ae76 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -111,6 +111,40 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) raise ex # pragma: no cover +def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 objects from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of objects paths. 
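The behaviour relied on here is plain S3 `ListObjectsV2` pagination with a `/` delimiter: first-level "directories" come back under `CommonPrefixes` instead of `Contents`. A minimal boto3 sketch of that mechanism (bucket and prefix names are placeholders):

```python
import boto3

# With Delimiter="/", S3 groups keys under CommonPrefixes; without it,
# the same keys would be returned individually under Contents.
# "my-bucket" and "prefix/" are placeholder names.
client = boto3.client("s3")
paginator = client.get_paginator("list_objects_v2")
dirs = []
for page in paginator.paginate(Bucket="my-bucket", Prefix="prefix/", Delimiter="/"):
    for common_prefix in page.get("CommonPrefixes", []):
        dirs.append(f"s3://my-bucket/{common_prefix['Prefix']}")
print(dirs)  # e.g. ['s3://my-bucket/prefix/dir0/', 's3://my-bucket/prefix/dir1/']
```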
+ + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/') + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/', boto3_session=boto3.Session()) + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + """ + return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) + + def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: """List Amazon S3 objects from a prefix. @@ -142,20 +176,37 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> Li ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] """ + return _list_objects(path=path, delimiter=None, boto3_session=boto3_session) + + +def _list_objects( + path: str, delimiter: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) paginator = client_s3.get_paginator("list_objects_v2") bucket: str prefix: str bucket, prefix = _utils.parse_path(path=path) - response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix, PaginationConfig={"PageSize": 1000}) + args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} + if delimiter is not None: + args["Delimiter"] = delimiter + response_iterator = paginator.paginate(**args) paths: List[str] = [] for page in response_iterator: - contents: Optional[List] = page.get("Contents") - if contents is not None: - for content in contents: - if (content is not None) and ("Key" in content): - key: str = content["Key"] - paths.append(f"s3://{bucket}/{key}") + if delimiter is None: + contents: Optional[List[Optional[Dict[str, str]]]] = page.get("Contents") + if contents is not None: + for content in contents: + if (content is not None) and ("Key" in content): + key: str = content["Key"] + paths.append(f"s3://{bucket}/{key}") + else: + prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") + if prefixes is not None: + for pfx in prefixes: + if (pfx is not None) and ("Prefix" in pfx): + key = pfx["Prefix"] + paths.append(f"s3://{bucket}/{key}") return paths diff --git a/docs/source/api.rst b/docs/source/api.rst index 897fc7a3e..7d2d51602 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -16,6 +16,7 @@ Amazon S3 does_object_exist get_bucket_region list_objects + list_directories read_csv read_fwf read_json @@ -115,6 +116,7 @@ EMR submit_steps build_step get_step_state + update_ecr_credentials CloudWatch Logs --------------- diff --git a/requirements-dev.txt b/requirements-dev.txt index 3fdd3cdf3..99a9b0730 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,4 +17,5 @@ twine~=3.1.1 wheel~=0.34.2 sphinx~=3.0.1 sphinx_bootstrap_theme~=0.7.1 -moto~=1.3.14 \ No newline at end of file +moto~=1.3.14 +jupyterlab~=2.1.1 \ No newline at end of file diff --git a/testing/test_awswrangler/test_cloudwatch.py b/testing/test_awswrangler/test_cloudwatch.py index f59b8b3dd..eced7a754 100644 --- a/testing/test_awswrangler/test_cloudwatch.py +++ b/testing/test_awswrangler/test_cloudwatch.py @@ -48,7 +48,7 @@ def loggroup(cloudformation_outputs): def test_query_cancelled(loggroup): client_logs = boto3.client("logs") query_id = wr.cloudwatch.start_query( - 
log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc | limit 5" + log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc" ) client_logs.stop_query(queryId=query_id) with pytest.raises(exceptions.QueryCancelled): diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index afa2a8307..bd53d4bad 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -127,6 +127,9 @@ def test_athena_ctas(bucket, database, kms_key): partition_cols=["par0", "par1"], )["paths"] wr.s3.wait_objects_exist(paths=paths) + dirs = wr.s3.list_directories(path=f"s3://{bucket}/test_athena_ctas/") + for d in dirs: + assert d.startswith(f"s3://{bucket}/test_athena_ctas/par0=") df = wr.s3.read_parquet_table(table="test_athena_ctas", database=database) assert len(df.index) == 3 ensure_data_types(df=df, has_list=True) diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index e64329b33..df2dab1cb 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -146,3 +146,36 @@ def test_cluster_single_node(bucket, cloudformation_outputs): wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) wr.emr.terminate_cluster(cluster_id=cluster_id) wr.s3.delete_objects(f"s3://{bucket}/emr-logs/") + + +def test_default_logging_path(cloudformation_outputs): + path = wr.emr._get_default_logging_path(subnet_id=cloudformation_outputs["SubnetId"]) + assert path.startswith("s3://aws-logs-") + assert path.endswith("/elasticmapreduce/") + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.emr._get_default_logging_path() + + +def test_docker(cloudformation_outputs): + cluster_id = wr.emr.create_cluster( + subnet_id=cloudformation_outputs["SubnetId"], + docker=True, + spark_docker=True, + spark_docker_image="787535711150.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + hive_docker=True, + ecr_credentials_step=True, + custom_classifications=[ + { + "Classification": "livy-conf", + "Properties": { + "livy.spark.master": "yarn", + "livy.spark.deploy-mode": "cluster", + "livy.server.session.timeout": "16h", + }, + } + ], + steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")], + ) + wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://igor-tavares/emr.py") + wr.emr.update_ecr_credentials(cluster_id=cluster_id) + wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/testing/test_awswrangler/test_moto.py b/testing/test_awswrangler/test_moto.py index db12dbe1a..2adc7aec8 100644 --- a/testing/test_awswrangler/test_moto.py +++ b/testing/test_awswrangler/test_moto.py @@ -20,6 +20,21 @@ def emr(): yield True +@pytest.fixture(scope="module") +def sts(): + with moto.mock_sts(): + yield True + + +@pytest.fixture(scope="module") +def subnet(): + with moto.mock_ec2(): + ec2 = boto3.resource("ec2", region_name="us-west-1") + vpc = ec2.create_vpc(CidrBlock="10.0.0.0/16") + subnet = ec2.create_subnet(VpcId=vpc.id, CidrBlock="10.0.0.0/24", AvailabilityZone="us-west-1a") + yield subnet.id + + def test_csv(s3): path = "s3://bucket/test.csv" wr.s3.to_csv(df=get_df_csv(), path=path, index=False) @@ -37,12 +52,13 @@ def test_parquet(s3): assert len(df.columns) == 18 -def test_emr(s3, emr): +def test_emr(s3, emr, sts, subnet): + session = boto3.Session(region_name="us-west-1") cluster_id = wr.emr.create_cluster( 
cluster_name="wrangler_cluster", logging_s3_path="s3://bucket/emr-logs/", emr_release="emr-5.29.0", - subnet_id="foo", + subnet_id=subnet, emr_ec2_role="EMR_EC2_DefaultRole", emr_role="EMR_DefaultRole", instance_type_master="m5.xlarge", @@ -87,11 +103,12 @@ def test_emr(s3, emr): termination_protected=False, spark_pyarrow=False, tags={"foo": "boo", "bar": "xoo"}, + boto3_session=session, ) - wr.emr.get_cluster_state(cluster_id=cluster_id) + wr.emr.get_cluster_state(cluster_id=cluster_id, boto3_session=session) steps = [] for cmd in ['echo "Hello"', "ls -la"]: steps.append(wr.emr.build_step(name=cmd, command=cmd)) - wr.emr.submit_steps(cluster_id=cluster_id, steps=steps) - wr.emr.terminate_cluster(cluster_id=cluster_id) + wr.emr.submit_steps(cluster_id=cluster_id, steps=steps, boto3_session=session) + wr.emr.terminate_cluster(cluster_id=cluster_id, boto3_session=session) wr.s3.delete_objects("s3://bucket/emr-logs/") diff --git a/tutorials/15 - EMR.ipynb b/tutorials/15 - EMR.ipynb new file mode 100644 index 000000000..4e1c627e6 --- /dev/null +++ b/tutorials/15 - EMR.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 15 - EMR" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_id = wr.emr.create_cluster(subnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit s3://{bucket}/test.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], 
+ "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb new file mode 100644 index 000000000..138759d8f --- /dev/null +++ b/tutorials/16 - EMR & Docker.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 16 - EMR & Docker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "import boto3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your bucket name:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ··········································\n" + ] + } + ], + "source": [ + "import getpass\n", + "bucket = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your Subnet ID:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ························\n" + ] + } + ], + "source": [ + "subnet = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Build and Upload Docker Image to ECR repository\n", + "\n", + "Replace the `{ACCOUNT_ID}` placeholder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%writefile Dockerfile\n" + } + }, + "outputs": [], + "source": [ + "%%writefile Dockerfile\n", + "\n", + "FROM amazoncorretto:8\n", + "\n", + "RUN yum -y update\n", + "RUN yum -y install yum-utils\n", + "RUN yum -y groupinstall development\n", + "\n", + "RUN yum list python3*\n", + "RUN yum -y install python3 python3-dev python3-pip python3-virtualenv\n", + "\n", + "RUN python -V\n", + "RUN python3 -V\n", + "\n", + "ENV PYSPARK_DRIVER_PYTHON python3\n", + "ENV PYSPARK_PYTHON python3\n", + "\n", + "RUN pip3 install --upgrade pip\n", + "RUN pip3 install awswrangler\n", + "\n", + "RUN python3 -c \"import awswrangler as wr\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "docker build -t 'local/emr-wrangler' .\n", + "aws ecr create-repository --repository-name emr-wrangler\n", + "docker tag local/emr-wrangler {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\n", + "eval $(aws ecr get-login --region us-east-1 --no-include-email)\n", + "docker push {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating EMR Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "cluster_id = wr.emr.create_cluster(\n", + " subnet_id=subnet,\n", + " spark_docker=True,\n", + " spark_docker_image=DOCKER_IMAGE,\n", + " ecr_credentials_step=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading our PySpark script to Amazon S3" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"docker-awswrangler\").getOrCreate()\n", + "sc = spark.sparkContext\n", + "\n", + "print(\"Spark Initialized\")\n", + "\n", + "import awswrangler as wr\n", + "\n", + "print(f\"Wrangler version: {wr.__version__}\")\n", + "\"\"\"\n", + "\n", + "_ = boto3.client(\"s3\").put_object(\n", + " Body=script,\n", + " Bucket=bucket,\n", + " Key=\"test_docker.py\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit PySpark step" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Step" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Terminate Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "wr.emr.terminate_cluster(cluster_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 9611a0ae88ebb2b44853ebd460916013383cae26 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 25 Apr 2020 18:17:36 -0300 Subject: [PATCH 28/59] Improve EMR tutorials #193 --- awswrangler/emr.py | 21 ++++++++++++++++++++- testing/test_awswrangler/test_emr.py | 6 +++--- tutorials/16 - EMR & Docker.ipynb | 7 +++++-- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 106a57da3..7490b29c9 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -649,10 +649,29 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused -------- Minimal Example + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster("SUBNET_ID") - Minimal Exmaple on Docker + Minimal Example With Custom Classification + >>> import awswrangler as wr + >>> cluster_id = wr.emr.create_cluster( + >>> subnet_id="SUBNET_ID", + >>> custom_classifications=[ + >>> { + >>> "Classification": "livy-conf", + >>> "Properties": { + >>> "livy.spark.master": "yarn", + >>> "livy.spark.deploy-mode": "cluster", + >>> "livy.server.session.timeout": "16h", + >>> }, + >>> } + >>> ], + >>> ) + + Minimal Example on Docker + + >>> import awswrangler as wr >>> cluster_id = wr.emr.create_cluster( >>> subnet_id="SUBNET_ID", >>> spark_docker=True, diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index df2dab1cb..66f8e139f 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -161,7 +161,7 @@ def test_docker(cloudformation_outputs): subnet_id=cloudformation_outputs["SubnetId"], docker=True, spark_docker=True, - spark_docker_image="787535711150.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + spark_docker_image="123456789123.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", hive_docker=True, ecr_credentials_step=True, custom_classifications=[ @@ -174,8 +174,8 @@ def test_docker(cloudformation_outputs): }, } ], - steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")], + steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://bucket/emr.py")], ) - wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://igor-tavares/emr.py") + wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://bucket/emr.py") wr.emr.update_ecr_credentials(cluster_id=cluster_id) wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 138759d8f..440d72066 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -201,7 +201,10 @@ "metadata": {}, "outputs": [], "source": [ - "step_id = wr.emr.submit_step(cluster_id, command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\")" + "step_id = wr.emr.submit_step(\n", + " cluster_id=cluster_id,\n", + " command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\"\n", + ")" ] }, { @@ -266,4 +269,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file From 3c3ca645718aedbae47eb8b4134118d178ef90a4 Mon Sep 17 00:00:00 2001 
From: igorborgest Date: Sun, 26 Apr 2020 16:30:02 -0300 Subject: [PATCH 29/59] Splitting up the ecr_credentials to a individual function #193 --- awswrangler/emr.py | 184 +++++++++++++++------------ testing/test_awswrangler/test_emr.py | 18 ++- tutorials/16 - EMR & Docker.ipynb | 132 ++++++++++++++++--- 3 files changed, 230 insertions(+), 104 deletions(-) diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 7490b29c9..3658d4573 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -61,13 +61,6 @@ def _get_default_logging_path( return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/" -def _get_ecr_credentials_command() -> str: - return ( - "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email) && " - "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" - ) - - def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-statements account_id: str = _utils.get_account_id(boto3_session=pars["boto3_session"]) region: str = _utils.get_region_from_subnet(subnet_id=pars["subnet_id"], boto3_session=pars["boto3_session"]) @@ -139,7 +132,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s args["Configurations"] = [ {"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}} ] - if (pars["docker"] is True) or (pars["spark_docker"] is True) or (pars["hive_docker"] is True): + if pars["docker"] is True: if pars.get("extra_registries") is None: extra_registries: List[str] = [] else: # pragma: no cover @@ -162,26 +155,6 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } ) - if pars["spark_docker"] is True: - if pars.get("spark_docker_image") is None: # pragma: no cover - raise exceptions.InvalidArgumentCombination("You must pass a spark_docker_image if spark_docker is True.") - pars["spark_defaults"] = {} if pars["spark_defaults"] is None else pars["spark_defaults"] - pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" - pars["spark_defaults"][ - "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" - ] = "hdfs:///user/hadoop/config.json" - pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars["spark_docker_image"] - pars["spark_defaults"]["spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS"] = "/etc/passwd:/etc/passwd:ro" - pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE"] = "docker" - pars["spark_defaults"][ - "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG" - ] = "hdfs:///user/hadoop/config.json" - pars["spark_defaults"]["spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE"] = pars[ - "spark_docker_image" - ] - pars["spark_defaults"][ - "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS" - ] = "/etc/passwd:/etc/passwd:ro" if spark_env is not None: args["Configurations"].append( { @@ -216,21 +189,12 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "Configurations": [], } ) - - hive_conf: Optional[Dict[str, Any]] = None - if (pars["hive_glue_catalog"] is True) or (pars["hive_docker"] is True): - hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} - if pars["hive_glue_catalog"] is True: + hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []} hive_conf["Properties"][ "hive.metastore.client.factory.class" ] = 
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" - if pars["hive_docker"] is True: - hive_conf["Properties"]["hive.execution.mode"] = "container" - - if hive_conf is not None: args["Configurations"].append(hive_conf) - if pars["presto_glue_catalog"] is True: args["Configurations"].append( { @@ -282,17 +246,6 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]}, } ) - if pars["ecr_credentials_step"] is True: - args["Steps"].append( - build_step( - name="ECR Credentials Setup", - command=_get_ecr_credentials_command(), - action_on_failure="TERMINATE_CLUSTER", - script=False, - region=region, - boto3_session=pars["boto3_session"], - ) - ) if pars["steps"] is not None: args["Steps"] += pars["steps"] @@ -462,15 +415,11 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused security_groups_slave_additional: Optional[List[str]] = None, security_group_service_access: Optional[str] = None, docker: bool = False, + extra_public_registries: Optional[List[str]] = None, spark_log_level: str = "WARN", spark_jars_path: Optional[List[str]] = None, spark_defaults: Optional[Dict[str, str]] = None, spark_pyarrow: bool = False, - spark_docker: bool = False, - spark_docker_image: str = None, - hive_docker: bool = False, - ecr_credentials_step: bool = False, - extra_public_registries: Optional[List[str]] = None, custom_classifications: Optional[List[Dict[str, Any]]] = None, maximize_resource_allocation: bool = False, steps: Optional[List[Dict[str, Any]]] = None, @@ -600,6 +549,8 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused service to access clusters in VPC private subnets. docker : bool Enable Docker Hub and ECR registries access. + extra_public_registries: List[str], optional + Additional docker registries. spark_log_level : str log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE). spark_jars_path : List[str], optional @@ -610,16 +561,6 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused spark_pyarrow : bool Enable PySpark to use PyArrow behind the scenes. P.S. You must install pyarrow by your self via bootstrap - spark_docker : bool = False - Add necessary Spark Defaults to run on Docker - spark_docker_image : str, optional - E.g. {ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG} - hive_docker : bool - Add necessary configurations to run on Docker - ecr_credentials_step : bool - Add a extra step during the Cluster launch to retrieve ECR auth files. - extra_public_registries: List[str], optional - Additional registries. custom_classifications: List[Dict[str, Any]], optional Extra classifications. maximize_resource_allocation : bool @@ -669,16 +610,6 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused >>> ], >>> ) - Minimal Example on Docker - - >>> import awswrangler as wr - >>> cluster_id = wr.emr.create_cluster( - >>> subnet_id="SUBNET_ID", - >>> spark_docker=True, - >>> spark_docker_image="{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}", - >>> ecr_credentials_step=True - >>> ) - Full Example >>> import awswrangler as wr @@ -971,8 +902,8 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. 
return response["Step"]["Status"]["State"] -def update_ecr_credentials( - cluster_id: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None +def submit_ecr_credentials_refresh( + cluster_id: str, path: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None ) -> str: """Update internal ECR credentials. @@ -980,6 +911,8 @@ def update_ecr_credentials( ---------- cluster_id : str Cluster ID. + path : str + Amazon S3 path where Wrangler will stage the script ecr_credentials_refresh.py (e.g. s3://bucket/emr/) action_on_failure : str 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' boto3_session : boto3.Session(), optional @@ -993,12 +926,17 @@ def update_ecr_credentials( Examples -------- >>> import awswrangler as wr - >>> step_id = wr.emr.update_ecr_credentials("cluster_id") + >>> step_id = wr.emr.submit_ecr_credentials_refresh("cluster_id", "s3://bucket/emr/") """ - name: str = "Update ECR Credentials" - command: str = _get_ecr_credentials_command() + path = path[:-1] if path.endswith("/") else path + path_script: str = f"{path}/ecr_credentials_refresh.py" session: boto3.Session = _utils.ensure_session(session=boto3_session) + client_s3: boto3.client = _utils.client(service_name="s3", session=session) + bucket, key = _utils.parse_path(path=path_script) + client_s3.put_object(Body=_get_ecr_credentials_refresh_content().encode(encoding="utf-8"), Bucket=bucket, Key=key) + command: str = f"spark-submit --deploy-mode cluster {path_script}" + name: str = "ECR Credentials Refresh" step: Dict[str, Any] = build_step( name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session ) @@ -1006,3 +944,91 @@ def update_ecr_credentials( response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") return response["StepIds"][0] + + +def _get_ecr_credentials_refresh_content() -> str: + return """ +import subprocess +from pyspark.sql import SparkSession +spark = SparkSession.builder.appName("ECR Setup Job").getOrCreate() + +COMMANDS = [ + "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email)", + "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" +] + +for command in COMMANDS: + subprocess.run(command.split(" "), timeout=6.0, check=True) + +print("done!") + """ + + +def build_spark_step( + path: str, + deploy_mode: str = "cluster", + docker_image: Optional[str] = None, + name: str = "my-step", + action_on_failure: str = "CONTINUE", + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Build the Step structure (dictionary). + + Parameters + ---------- + path : str + Script path. (e.g. s3://bucket/app.py) + deploy_mode : str + "cluster" | "client" + docker_image : str, optional + e.g. "{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}" + name : str, optional + Step name. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + region: str, optional + Region name to not get it from boto3.Session. (e.g. `us-east-1`) + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Step structure. 
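A step built this way is typically submitted and then polled until it reaches a terminal state. A minimal sketch assuming an existing cluster and script (the cluster ID, S3 path, and polling interval are placeholders):

```python
import time
import awswrangler as wr

cluster_id = "j-XXXXXXXXXXXXX"  # placeholder: an existing EMR cluster ID
step = wr.emr.build_spark_step(path="s3://bucket/app.py")  # placeholder script path
step_id = wr.emr.submit_steps(cluster_id=cluster_id, steps=[step])[0]

# Poll until EMR reports a terminal state for the step.
while wr.emr.get_step_state(cluster_id, step_id) not in ("COMPLETED", "CANCELLED", "FAILED"):
    time.sleep(30)
```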
+ + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.submit_steps( + >>> cluster_id="cluster-id", + >>> steps=[ + >>> wr.emr.build_spark_step(path="s3://bucket/app.py") + >>> ] + >>> ) + + """ + if docker_image is None: # pragma: no cover + cmd: str = f"spark-submit --deploy-mode {deploy_mode} {path}" + else: + config: str = "hdfs:///user/hadoop/config.json" + cmd = ( + f"spark-submit --deploy-mode cluster " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE=docker " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={docker_image} " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG={config} " + f"--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=/etc/passwd:/etc/passwd:ro " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE=docker " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={docker_image} " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG={config} " + f"--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=/etc/passwd:/etc/passwd:ro " + f"{path}" + ) + return build_step( + command=cmd, + name=name, + action_on_failure=action_on_failure, + script=False, + region=region, + boto3_session=boto3_session, + ) diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index 66f8e139f..fdda2fa25 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -156,14 +156,10 @@ def test_default_logging_path(cloudformation_outputs): wr.emr._get_default_logging_path() -def test_docker(cloudformation_outputs): +def test_docker(bucket, cloudformation_outputs): cluster_id = wr.emr.create_cluster( subnet_id=cloudformation_outputs["SubnetId"], docker=True, - spark_docker=True, - spark_docker_image="123456789123.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", - hive_docker=True, - ecr_credentials_step=True, custom_classifications=[ { "Classification": "livy-conf", @@ -176,6 +172,14 @@ def test_docker(cloudformation_outputs): ], steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://bucket/emr.py")], ) - wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://bucket/emr.py") - wr.emr.update_ecr_credentials(cluster_id=cluster_id) + wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f"s3://{bucket}/emr/") + wr.emr.submit_steps( + cluster_id=cluster_id, + steps=[ + wr.emr.build_spark_step( + path=f"s3://{bucket}/emr/test_docker.py", + docker_image="123456789123.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr", + ) + ], + ) wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 440d72066..8a637af86 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -142,25 +142,45 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", - "\n", - "cluster_id = 
wr.emr.create_cluster(\n", - " subnet_id=subnet,\n", - " spark_docker=True,\n", - " spark_docker_image=DOCKER_IMAGE,\n", - " ecr_credentials_step=True\n", - ")" + "cluster_id = wr.emr.create_cluster(subnet, docker=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Refresh ECR credentials in the cluster (expiration time: 12h )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'s-3OPMPDCYGEGOT'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/emr/\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Uploading our PySpark script to Amazon S3" + "## Uploading application script to Amazon S3 (PySpark)" ] }, { @@ -184,7 +204,7 @@ "_ = boto3.client(\"s3\").put_object(\n", " Body=script,\n", " Bucket=bucket,\n", - " Key=\"test_docker.py\"\n", + " Key=\"emr/test_docker.py\"\n", ")" ] }, @@ -201,9 +221,13 @@ "metadata": {}, "outputs": [], "source": [ - "step_id = wr.emr.submit_step(\n", + "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "step = wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", + "\n", + "steps_ids = wr.emr.submit_steps(\n", " cluster_id=cluster_id,\n", - " command=f\"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py\"\n", + " steps=[step]\n", ")" ] }, @@ -220,7 +244,7 @@ "metadata": {}, "outputs": [], "source": [ - "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", + "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", " pass" ] }, @@ -240,6 +264,78 @@ "wr.emr.terminate_cluster(cluster_id)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Another example with custom configurations" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_id = wr.emr.create_cluster(\n", + " cluster_name=\"my-demo-cluster-v2\",\n", + " logging_s3_path=f\"s3://{bucket}/emr-logs/\",\n", + " emr_release=\"emr-6.0.0\",\n", + " subnet_id=subnet,\n", + " emr_ec2_role=\"EMR_EC2_DefaultRole\",\n", + " emr_role=\"EMR_DefaultRole\",\n", + " instance_type_master=\"m5.2xlarge\",\n", + " instance_type_core=\"m5.2xlarge\",\n", + " instance_ebs_size_master=50,\n", + " instance_ebs_size_core=50,\n", + " instance_num_on_demand_master=0,\n", + " instance_num_on_demand_core=0,\n", + " instance_num_spot_master=1,\n", + " instance_num_spot_core=2,\n", + " spot_bid_percentage_of_on_demand_master=100,\n", + " spot_bid_percentage_of_on_demand_core=100,\n", + " spot_provisioning_timeout_master=5,\n", + " spot_provisioning_timeout_core=5,\n", + " spot_timeout_to_on_demand_master=False,\n", + " spot_timeout_to_on_demand_core=False,\n", + " python3=True,\n", + " docker=True,\n", + " spark_glue_catalog=True,\n", + " hive_glue_catalog=True,\n", + " presto_glue_catalog=True,\n", + " debugging=True,\n", + " applications=[\"Hadoop\", \"Spark\", \"Hive\", \"Zeppelin\", \"Livy\"],\n", + " visible_to_all_users=True,\n", + " maximize_resource_allocation=True,\n", + " keep_cluster_alive_when_no_steps=True,\n", + " termination_protected=False,\n", + " spark_pyarrow=True\n", + ")\n", + "\n", + "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/emr/\")\n", + "\n", + "DOCKER_IMAGE = 
f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", + "\n", + "steps_ids = wr.emr.submit_steps(\n", + " cluster_id=cluster_id,\n", + " steps=[\n", + " wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", + " pass\n", + "\n", + "wr.emr.terminate_cluster(cluster_id)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -269,4 +365,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} From 2eefb3aa4ff8694fcdf58bc6e5bb943633c0de5d Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 26 Apr 2020 16:42:46 -0300 Subject: [PATCH 30/59] Small update in the EMR tutorial --- tutorials/16 - EMR & Docker.ipynb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 8a637af86..9bfa182fc 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -225,10 +225,7 @@ "\n", "step = wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", "\n", - "steps_ids = wr.emr.submit_steps(\n", - " cluster_id=cluster_id,\n", - " steps=[step]\n", - ")" + "steps_ids = wr.emr.submit_steps(cluster_id, steps=[step])" ] }, { From 86cdb307a27c7a6107627e34272da857c52573bf Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 26 Apr 2020 17:49:00 -0300 Subject: [PATCH 31/59] fix init and docs --- awswrangler/__init__.py | 11 ++++++++++- awswrangler/torch.py | 27 +++++++++++++++++++++++---- requirements-torch.txt | 4 ++-- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ff6a2bd71..b7f931a3d 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -5,9 +5,18 @@ """ +import importlib import logging -from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3, torch # noqa +from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa +if ( + importlib.util.find_spec("torch") + and importlib.util.find_spec("torchvision") + and importlib.util.find_spec("torchaudio") + and importlib.util.find_spec("PIL") +): # type: ignore + from awswrangler import torch # noqa + logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index a5b589386..e7cd4518f 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -35,6 +35,8 @@ def __init__( ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
@@ -85,10 +87,10 @@ def __len__(self): return len(self._paths) def _data_fn(self, data) -> Any: - pass + raise NotImplementedError() def _label_fn(self, path: str) -> Any: - pass + raise NotImplementedError() class _S3PartitionedDataset(_ListS3Dataset): @@ -98,6 +100,9 @@ def _label_fn(self, path: str) -> torch.Tensor: label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) return torch.tensor([label]) # pylint: disable=not-callable + def _data_fn(self, data) -> Any: + raise NotImplementedError() + # class S3FilesDataset(_BaseS3Dataset, Dataset): # """PyTorch Amazon S3 Files Map-Style Dataset.""" @@ -162,6 +167,12 @@ def __init__( ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + data_fn: Callable + Function that receives a io.BytesIO object and returns a torch.Tensor + label_fn: Callable + Function that receives object path (str) and return a torch.Tensor + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -226,6 +237,8 @@ def __init__( ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -314,6 +327,8 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -342,6 +357,8 @@ class S3IterableDataset(IterableDataset, _BaseS3Dataset): # pylint: disable=abs ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + suffix: str, optional + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -395,7 +412,9 @@ def __init__( SQLAlchemy Engine. Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() label_col : int, optional - Label column number + Label column number. + chunksize : int, optional + The chunksize determines que number of rows to be retrived from the database at each time. 
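Because `SQLDataset` is an `IterableDataset`, it can be consumed directly or wrapped in a `DataLoader` (which only batches the stream and cannot shuffle it). A rough sketch; the exact constructor call is an assumption based on the documented parameters, and the engine alias, table, and column names are placeholders taken from the PyTorch tutorial:

```python
import awswrangler as wr
from torch.utils.data import DataLoader

# Assumptions: a Glue connection named "aws-data-wrangler-redshift" exists and
# a placeholder table exposes the height/weight/target columns from the tutorial.
con = wr.catalog.get_engine("aws-data-wrangler-redshift")
ds = wr.torch.SQLDataset(
    "SELECT height, weight, target FROM public.tutorial",
    con=con,
    label_col="target",
    chunksize=1000,
)

# IterableDataset: the DataLoader batches the yielded (features, label) pairs.
loader = DataLoader(ds, batch_size=32, num_workers=0)
for features, labels in loader:
    print(features.shape, labels.shape)
    break
```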
Returns ------- @@ -425,7 +444,7 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, label_col: Optional[int] = list(cursor.keys()).index(self._label_col) else: label_col = self._label_col - _logger.debug(f"label_col: {label_col}") + _logger.debug("label_col: %s", label_col) if self._chunksize is None: return SQLDataset._records2tensor(records=cursor.fetchall(), label_col=label_col) return self._iterate_cursor(cursor=cursor, chunksize=self._chunksize, label_col=label_col) diff --git a/requirements-torch.txt b/requirements-torch.txt index 01d2c6e65..61de25397 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ -torch~=1.4.0 +torch~=1.5.0 torchvision~=0.5.0 torchaudio~=0.4.0 -Pillow==7.1.1 +Pillow~=7.1.1 From b3c8c811282f8d50fc3a7fa855ab7a0933e8d121 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 26 Apr 2020 20:11:44 -0300 Subject: [PATCH 32/59] update tutorial --- tutorials/14 - PyTorch.ipynb | 121 ++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 51 deletions(-) diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index a3d988881..b85596986 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -19,24 +19,28 @@ "metadata": {}, "source": [ "## Table of Contents\n", - "* [1.Defining Training Function](#1.-Defininf-Training-Function)\n", - "* [2.Traning From Amazon S3](#1.-Traning-From-Amazon-S3)\n", - "\t* [2.1 Writing PyTorch Dataset to S3](#1.1-Writing-PyTorch-Dataset-to-S3)\n", - "\t* [2.2 Training Network](#1.2-Training-Network)\n", - "* [3. Training From SQL Query](#2.-Training-From-SQL-Query)\n", - "\t* [3.1 Writing Data to SQL Database](#2.1-Writing-Data-to-SQL-Database)\n", - "\t* [3.3 Training Network From SQL](#2.2-Reading-single-JSON-file)\n", - "* [4. Creating Custom S3 Dataset](#1.-Creating-Custom-S3-Dataset)\n", - "\t* [4.1 Creating Custom PyTorch Dataset](#1.1-Creating-Custom-PyTorch-Dataset)\n", - "\t* [4.2 Writing Data to S3](#1.1-Writing-Data-to-S3)\n", - "\t* [4.3 Training Network](#1.2-Training-Network)\n", - "* [5. Delete objects](#6.-Delete-objects)" + "* [1.Defining Training Function](#1.-Defining-Training-Function)\n", + "* [2.Training From Amazon S3](#2.-Traoning-From-Amazon-S3)\n", + "\t* [2.1 Writing PyTorch Dataset to S3](#2.1-Writing-PyTorch-Dataset-to-S3)\n", + "\t* [2.2 Training Network](#2.2-Training-Network)\n", + "* [3. Training From SQL Query](#3.-Training-From-SQL-Query)\n", + "\t* [3.1 Writing Data to SQL Database](#3.1-Writing-Data-to-SQL-Database)\n", + "\t* [3.3 Training Network From SQL](#3.3-Reading-single-JSON-file)\n", + "* [4. Creating Custom S3 Dataset](#4.-Creating-Custom-S3-Dataset)\n", + "\t* [4.1 Creating Custom PyTorch Dataset](#4.1-Creating-Custom-PyTorch-Dataset)\n", + "\t* [4.2 Writing Data to S3](#4.2-Writing-Data-to-S3)\n", + "\t* [4.3 Training Network](#4.4-Training-Network)\n", + "* [5. Delete objects](#5.-Delete-objects)" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "import io\n", @@ -55,13 +59,17 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ - " ··········································\n" + "········\n" ] } ], @@ -116,13 +124,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 2. Traning From Amazon S3" + "# 2. 
Training From Amazon S3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Writing PyTorch Dataset to S3" ] }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "client_s3 = boto3.client(\"s3\")\n", @@ -153,23 +172,23 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch: 0 loss: 7.0221 acc: 0.00\n", - "batch: 1 loss: 2.7788 acc: 23.44\n", - "batch: 2 loss: 0.9828 acc: 32.29\n", - "batch: 3 loss: 0.9414 acc: 39.45\n", - "batch: 4 loss: 1.0737 acc: 39.38\n", - "batch: 0 loss: 1.2178 acc: 50.00\n", - "batch: 1 loss: 1.4069 acc: 51.56\n", - "batch: 2 loss: 1.0783 acc: 52.08\n", - "batch: 3 loss: 0.9926 acc: 52.34\n", - "batch: 4 loss: 1.1111 acc: 49.06\n" + "batch: 0 loss: 7.0132 acc: 0.00\n", + "batch: 1 loss: 2.8764 acc: 21.09\n", + "batch: 2 loss: 0.9600 acc: 32.29\n", + "batch: 3 loss: 0.8676 acc: 36.33\n", + "batch: 4 loss: 1.1386 acc: 36.88\n", + "batch: 0 loss: 1.0754 acc: 51.56\n", + "batch: 1 loss: 1.4241 acc: 51.56\n", + "batch: 2 loss: 1.3019 acc: 51.04\n", + "batch: 3 loss: 0.8631 acc: 53.52\n", + "batch: 4 loss: 0.4252 acc: 54.38\n" ] } ], @@ -196,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -226,28 +245,28 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "batch: 0 loss: 5.0253 acc: 50.00\n", - "batch: 1 loss: 21.3174 acc: 50.00\n", - "batch: 2 loss: 0.5061 acc: 66.67\n", - "batch: 0 loss: 1.2222 acc: 50.00\n", - "batch: 1 loss: 0.7075 acc: 50.00\n", - "batch: 2 loss: 0.7077 acc: 50.00\n", - "batch: 0 loss: 0.9302 acc: 50.00\n", - "batch: 1 loss: 0.6960 acc: 50.00\n", - "batch: 2 loss: 0.6018 acc: 66.67\n", - "batch: 0 loss: 1.1284 acc: 50.00\n", - "batch: 1 loss: 0.7077 acc: 50.00\n", - "batch: 2 loss: 0.6791 acc: 50.00\n", - "batch: 0 loss: 1.0030 acc: 50.00\n", - "batch: 1 loss: 0.7053 acc: 50.00\n", - "batch: 2 loss: 0.6318 acc: 50.00\n" + "batch: 0 loss: 8.8708 acc: 50.00\n", + "batch: 1 loss: 88.7789 acc: 50.00\n", + "batch: 2 loss: 0.8655 acc: 33.33\n", + "batch: 0 loss: 0.7036 acc: 50.00\n", + "batch: 1 loss: 0.7034 acc: 50.00\n", + "batch: 2 loss: 0.8447 acc: 33.33\n", + "batch: 0 loss: 0.7012 acc: 50.00\n", + "batch: 1 loss: 0.7010 acc: 50.00\n", + "batch: 2 loss: 0.8250 acc: 33.33\n", + "batch: 0 loss: 0.6992 acc: 50.00\n", + "batch: 1 loss: 0.6991 acc: 50.00\n", + "batch: 2 loss: 0.8063 acc: 33.33\n", + "batch: 0 loss: 0.6975 acc: 50.00\n", + "batch: 1 loss: 0.6974 acc: 50.00\n", + "batch: 2 loss: 0.7886 acc: 33.33\n" ] } ], @@ -279,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -289,9 +308,9 @@ ], "metadata": { "kernelspec": { - "display_name": "conda_python3", + "display_name": "Python 3", "language": "python", - "name": "conda_python3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -303,9 +322,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file From f6927a4e75ced46bb42192b28539bd6473cb5848 Mon Sep 17 00:00:00 2001 From: Luigi Tedesco Date: Sun, 26 Apr 2020 21:16:10 -0300 
Subject: [PATCH 33/59] rollback pytorch==1.5.0, due to torchaudio requirement --- requirements-torch.txt | 2 +- tutorials/14 - PyTorch.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-torch.txt b/requirements-torch.txt index 61de25397..73b8aae36 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ -torch~=1.5.0 +torch~=1.4.0 torchvision~=0.5.0 torchaudio~=0.4.0 Pillow~=7.1.1 diff --git a/tutorials/14 - PyTorch.ipynb b/tutorials/14 - PyTorch.ipynb index b85596986..b7af04627 100644 --- a/tutorials/14 - PyTorch.ipynb +++ b/tutorials/14 - PyTorch.ipynb @@ -222,7 +222,7 @@ "eng = wr.catalog.get_engine(\"aws-data-wrangler-redshift\")\n", "df = pd.DataFrame({\n", " \"height\": [2, 1.4, 1.7, 1.8, 1.9, 2.2],\n", - " \"weigth\": [100.0, 50.0, 70.0, 80.0, 90.0, 160.0],\n", + " \"weight\": [100.0, 50.0, 70.0, 80.0, 90.0, 160.0],\n", " \"target\": [1, 0, 0, 1, 1, 1]\n", "})\n", "\n", @@ -302,7 +302,7 @@ "metadata": {}, "outputs": [], "source": [ - "wr.s3.delete_objects(f\"s3://{bucket}/\")" + "wr.s3.delete_objects(f\"s3://{bucket}/{folder}\")" ] } ], From f0f154bd42807066dbfa24204fcd28a66c1981ab Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 26 Apr 2020 22:37:45 -0300 Subject: [PATCH 34/59] Add wr.emr.submit_spark_step --- awswrangler/emr.py | 58 ++++++++++++++++++++++++++++ docs/source/api.rst | 4 +- testing/test_awswrangler/test_emr.py | 1 + tutorials/16 - EMR & Docker.ipynb | 47 +++++++++++----------- 4 files changed, 84 insertions(+), 26 deletions(-) diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 3658d4573..3801d340e 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -1032,3 +1032,61 @@ def build_spark_step( region=region, boto3_session=boto3_session, ) + + +def submit_spark_step( + cluster_id: str, + path: str, + deploy_mode: str = "cluster", + docker_image: Optional[str] = None, + name: str = "my-step", + action_on_failure: str = "CONTINUE", + region: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Submit Spark Step. + + Parameters + ---------- + cluster_id : str + Cluster ID. + path : str + Script path. (e.g. s3://bucket/app.py) + deploy_mode : str + "cluster" | "client" + docker_image : str, optional + e.g. "{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{IMAGE_NAME}:{TAG}" + name : str, optional + Step name. + action_on_failure : str + 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE' + region: str, optional + Region name to not get it from boto3.Session. (e.g. `us-east-1`) + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Step ID. 
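In the Docker scenario this pairs naturally with `submit_ecr_credentials_refresh()`, mirroring the tutorial flow. A minimal sketch (subnet ID, bucket, and ECR image URI are placeholders for resources in your account):

```python
import awswrangler as wr

cluster_id = wr.emr.create_cluster("subnet-0123456789abcdef0", docker=True)

# ECR tokens expire (~12h), so refresh them inside the cluster before running
# the containerized step, then submit the PySpark app against the Docker image.
wr.emr.submit_ecr_credentials_refresh(cluster_id, path="s3://my-bucket/emr/")
step_id = wr.emr.submit_spark_step(
    cluster_id,
    "s3://my-bucket/emr/app.py",
    docker_image="111111111111.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler",
)
```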
+ + Examples + -------- + >>> import awswrangler as wr + >>> step_id = wr.emr.submit_spark_step( + >>> cluster_id="cluster-id", + >>> path="s3://bucket/emr/app.py" + >>> ) + + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + step = build_spark_step( + path=path, + deploy_mode=deploy_mode, + docker_image=docker_image, + name=name, + action_on_failure=action_on_failure, + region=region, + boto3_session=session, + ) + return submit_steps(cluster_id=cluster_id, steps=[step], boto3_session=session)[0] diff --git a/docs/source/api.rst b/docs/source/api.rst index 7d2d51602..6b841705e 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -113,10 +113,12 @@ EMR get_cluster_state terminate_cluster submit_step + submit_spark_step + submit_ecr_credentials_refresh submit_steps build_step + build_spark_step get_step_state - update_ecr_credentials CloudWatch Logs --------------- diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py index fdda2fa25..0c0112bf8 100644 --- a/testing/test_awswrangler/test_emr.py +++ b/testing/test_awswrangler/test_emr.py @@ -182,4 +182,5 @@ def test_docker(bucket, cloudformation_outputs): ) ], ) + wr.emr.submit_spark_step(cluster_id=cluster_id, path=f"s3://{bucket}/emr/test_docker.py") wr.emr.terminate_cluster(cluster_id=cluster_id) diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/16 - EMR & Docker.ipynb index 9bfa182fc..4ffb2be2b 100644 --- a/tutorials/16 - EMR & Docker.ipynb +++ b/tutorials/16 - EMR & Docker.ipynb @@ -11,12 +11,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import awswrangler as wr\n", - "import boto3" + "import boto3\n", + "import getpass" ] }, { @@ -40,7 +41,6 @@ } ], "source": [ - "import getpass\n", "bucket = getpass.getpass()" ] }, @@ -164,7 +164,7 @@ { "data": { "text/plain": [ - "'s-3OPMPDCYGEGOT'" + "'s-1B0O45RWJL8CL'" ] }, "execution_count": 5, @@ -173,7 +173,7 @@ } ], "source": [ - "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/emr/\")" + "wr.emr.submit_ecr_credentials_refresh(cluster_id, path=f\"s3://{bucket}/\")" ] }, { @@ -185,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -201,11 +201,7 @@ "print(f\"Wrangler version: {wr.__version__}\")\n", "\"\"\"\n", "\n", - "_ = boto3.client(\"s3\").put_object(\n", - " Body=script,\n", - " Bucket=bucket,\n", - " Key=\"emr/test_docker.py\"\n", - ")" + "boto3.client(\"s3\").put_object(Body=script, Bucket=bucket, Key=\"test_docker.py\");" ] }, { @@ -217,15 +213,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", "\n", - "step = wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", - "\n", - "steps_ids = wr.emr.submit_steps(cluster_id, steps=[step])" + "step_id = wr.emr.submit_spark_step(\n", + " cluster_id,\n", + " f\"s3://{bucket}/test_docker.py\",\n", + " docker_image=DOCKER_IMAGE\n", + ")" ] }, { @@ -237,11 +235,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", " pass" ] }, @@ -254,7 +252,7 @@ }, { 
"cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -270,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -313,11 +311,10 @@ "\n", "DOCKER_IMAGE = f\"{wr.get_account_id()}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler\"\n", "\n", - "steps_ids = wr.emr.submit_steps(\n", - " cluster_id=cluster_id,\n", - " steps=[\n", - " wr.emr.build_spark_step(f\"s3://{bucket}/emr/test_docker.py\", docker_image=DOCKER_IMAGE)\n", - " ]\n", + "step_id = wr.emr.submit_spark_step(\n", + " cluster_id,\n", + " f\"s3://{bucket}/test_docker.py\",\n", + " docker_image=DOCKER_IMAGE\n", ")" ] }, @@ -327,7 +324,7 @@ "metadata": {}, "outputs": [], "source": [ - "while wr.emr.get_step_state(cluster_id, steps_ids[0]) != \"COMPLETED\":\n", + "while wr.emr.get_step_state(cluster_id, step_id) != \"COMPLETED\":\n", " pass\n", "\n", "wr.emr.terminate_cluster(cluster_id)" From 602bceb5f20d31989c8eda2949e8ad106285b27c Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 26 Apr 2020 22:43:32 -0300 Subject: [PATCH 35/59] Bumping version to 1.1.0 --- README.md | 2 +- awswrangler/__metadata__.py | 2 +- testing/test_awswrangler/test_metadata.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 608297330..ce57f6b00 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ We just released a new major version `1.0` with breaking changes. Please make su ![AWS Data Wrangler](docs/source/_static/logo2.png?raw=true "AWS Data Wrangler") -[![Release](https://img.shields.io/badge/release-1.0.4-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Release](https://img.shields.io/badge/release-1.1.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py index 724b626da..cfc9336b9 100644 --- a/awswrangler/__metadata__.py +++ b/awswrangler/__metadata__.py @@ -7,5 +7,5 @@ __title__ = "awswrangler" __description__ = "Pandas on AWS." -__version__ = "1.0.4" +__version__ = "1.1.0" __license__ = "Apache License 2.0" diff --git a/testing/test_awswrangler/test_metadata.py b/testing/test_awswrangler/test_metadata.py index d076c0d94..88e71c5a3 100644 --- a/testing/test_awswrangler/test_metadata.py +++ b/testing/test_awswrangler/test_metadata.py @@ -2,7 +2,7 @@ def test_metadata(): - assert wr.__version__ == "1.0.4" + assert wr.__version__ == "1.1.0" assert wr.__title__ == "awswrangler" assert wr.__description__ == "Pandas on AWS." 
assert wr.__license__ == "Apache License 2.0" From aeb8792c939db9138660be33854027434ddf4677 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 13:01:43 -0300 Subject: [PATCH 36/59] Improving the chunksize parser slicer algorithm --- awswrangler/s3.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 0127f8897..7090c2c0f 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1684,23 +1684,15 @@ def _read_parquet_chunked( if chunked is True: yield _table2df(table=table, categories=categories, use_threads=use_threads) else: - if next_slice is not None: + if next_slice: table = pa.lib.concat_tables([next_slice, table], promote=promote) - length: int = len(table) - while True: - if length == chunked: - yield _table2df(table=table, categories=categories, use_threads=use_threads) - next_slice = None - break - if length < chunked: - next_slice = table - break + while len(table) >= chunked: yield _table2df( table=table.slice(offset=0, length=chunked), categories=categories, use_threads=use_threads ) table = table.slice(offset=chunked, length=None) - length = len(table) - if next_slice is not None: + next_slice = table + if next_slice: yield _table2df(table=next_slice, categories=categories, use_threads=use_threads) From 797fba55245e4847c777b35be1c7eb2111d456cb Mon Sep 17 00:00:00 2001 From: Igor Tavares Date: Mon, 27 Apr 2020 15:32:42 -0300 Subject: [PATCH 37/59] Update badges on README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ce57f6b00..c72692f85 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,8 @@ We just released a new major version `1.0` with breaking changes. Please make su [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) -[![Average time to resolve an issue](http://isitmaintained.com/badge/resolution/awslabs/aws-data-wrangler.svg)](http://isitmaintained.com/project/awslabs/aws-data-wrangler "Average time to resolve an issue") +[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) [![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen.svg)](https://pypi.org/project/awswrangler/) ![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest) From d9f107a75ff7b35a350ff379f614712412802765 Mon Sep 17 00:00:00 2001 From: Igor Tavares Date: Mon, 27 Apr 2020 15:35:23 -0300 Subject: [PATCH 38/59] Add EMR tutorials to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c72692f85..66095288c 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,8 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [11 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/11%20-%20CSV%20Datasets.ipynb) - [12 - CSV 
Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/12%20-%20CSV%20Crawler.ipynb) - [13 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/13%20-%20Merging%20Datasets%20on%20S3.ipynb) + - [15 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/dev/tutorials/15%20-%20EMR.ipynb) + - [16 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/dev/tutorials/16%20-%20EMR%20%26%20Docker.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3) - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog) From 7fd449e02b30668ad717e9d34a0e303fac28e059 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 15:54:53 -0300 Subject: [PATCH 39/59] Adapting to validations --- awswrangler/__init__.py | 9 ++------- awswrangler/s3.py | 9 ++++++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index f2a390a18..78299541e 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -5,19 +5,14 @@ """ -import importlib import logging +from importlib.util import find_spec from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa from awswrangler._utils import get_account_id # noqa -if ( - importlib.util.find_spec("torch") - and importlib.util.find_spec("torchvision") - and importlib.util.find_spec("torchaudio") - and importlib.util.find_spec("PIL") -): # type: ignore +if find_spec("torch") and find_spec("torchvision") and find_spec("torchaudio") and find_spec("PIL"): from awswrangler import torch # noqa logging.getLogger("awswrangler").addHandler(logging.NullHandler()) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 7661e61d0..f4be39359 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -178,11 +178,14 @@ def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optiona ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] """ - return _list_objects(path=path, delimiter=None, boto3_session=boto3_session) + return _list_objects(path=path, delimiter=None, suffix=suffix, boto3_session=boto3_session) def _list_objects( - path: str, delimiter: Optional[str] = None, boto3_session: Optional[boto3.Session] = None + path: str, + delimiter: Optional[str] = None, + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, ) -> List[str]: client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) paginator = client_s3.get_paginator("list_objects_v2") @@ -194,7 +197,7 @@ def _list_objects( args["Delimiter"] = delimiter response_iterator = paginator.paginate(**args) paths: List[str] = [] - for page in response_iterator: + for page in response_iterator: # pylint: disable=too-many-nested-blocks if delimiter is None: contents: Optional[List] = page.get("Contents") if contents is not None: From fd115d89c240a03856936fe076bc2407aa1188ac Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 19:30:30 -0300 Subject: [PATCH 40/59] Bumping dev dependencies --- requirements-dev.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 99a9b0730..bfdd15c5e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ 
black~=19.3b0 -pylint~=2.4.4 +pylint~=2.5.0 flake8~=3.7.9 mypy~=0.770 isort~=4.3.21 @@ -11,11 +11,11 @@ pytest-cov~=2.8.1 pytest-xdist~=1.31.0 scikit-learn~=0.22.1 awscli>=1.18.22 -cfn-lint~=0.29.5 -cfn-flip~=1.2.2 +cfn-lint~=0.29.6 +cfn-flip~=1.2.3 twine~=3.1.1 wheel~=0.34.2 -sphinx~=3.0.1 +sphinx~=3.0.3 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 jupyterlab~=2.1.1 \ No newline at end of file From 8fad37c30f2ec85d845da624d1f7841cf861da49 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 19:31:02 -0300 Subject: [PATCH 41/59] Bumping PyTorch libs versions --- requirements-torch.txt | 8 ++++---- tox.ini | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/requirements-torch.txt b/requirements-torch.txt index 73b8aae36..d3e36447e 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ -torch~=1.4.0 -torchvision~=0.5.0 -torchaudio~=0.4.0 -Pillow~=7.1.1 +torch~=1.5.0 +torchvision~=0.6.0 +torchaudio~=0.5.0 +Pillow~=7.1.2 diff --git a/tox.ini b/tox.ini index 9768fd204..f2bb572c2 100644 --- a/tox.ini +++ b/tox.ini @@ -6,10 +6,13 @@ deps = pytest pytest-xdist moto -commands = pytest -n 8 testing/test_awswrangler + -rrequirements-torch.txt +commands = + pytest -n 8 testing/test_awswrangler [testenv:py36] deps = {[testenv]deps} pytest-cov -commands = pytest --cov=awswrangler -n 8 testing/test_awswrangler +commands = + pytest --cov=awswrangler -n 8 testing/test_awswrangler From 85bfade15ae16108cf45ba67ad81f0d08289571a Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 19:32:01 -0300 Subject: [PATCH 42/59] Replacing all f-string on logging commands --- awswrangler/_data_types.py | 14 +++++++------- awswrangler/athena.py | 26 +++++++++++++------------- awswrangler/catalog.py | 10 +++++----- awswrangler/cloudwatch.py | 8 ++++---- awswrangler/db.py | 22 +++++++++++----------- awswrangler/emr.py | 16 ++++++++-------- awswrangler/s3.py | 36 ++++++++++++++++++------------------ 7 files changed, 66 insertions(+), 66 deletions(-) diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 62928e816..947b058b0 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -207,7 +207,7 @@ def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-sta return sqlalchemy.types.Date if pa.types.is_binary(dtype): if db_type == "redshift": - raise exceptions.UnsupportedType(f"Binary columns are not supported for Redshift.") # pragma: no cover + raise exceptions.UnsupportedType("Binary columns are not supported for Redshift.") # pragma: no cover return sqlalchemy.types.Binary if pa.types.is_decimal(dtype): return sqlalchemy.types.Numeric(precision=dtype.precision, scale=dtype.scale) @@ -257,7 +257,7 @@ def pyarrow_types_from_pandas( # Filling schema columns_types: Dict[str, pa.DataType] columns_types = {n: cols_dtypes[n] for n in sorted_cols} - _logger.debug(f"columns_types: {columns_types}") + _logger.debug("columns_types: %s", columns_types) return columns_types @@ -275,7 +275,7 @@ def athena_types_from_pandas( athena_columns_types[k] = casts[k] else: athena_columns_types[k] = pyarrow2athena(dtype=v) - _logger.debug(f"athena_columns_types: {athena_columns_types}") + _logger.debug("athena_columns_types: %s", athena_columns_types) return athena_columns_types @@ -315,7 +315,7 @@ def pyarrow_schema_from_pandas( if (k in df.columns) and (k not in ignore): columns_types[k] = athena2pyarrow(v) columns_types = {k: v for k, v in columns_types.items() if v is not None} - _logger.debug(f"columns_types: 
{columns_types}") + _logger.debug("columns_types: %s", columns_types) return pa.schema(fields=columns_types) @@ -324,11 +324,11 @@ def athena_types_from_pyarrow_schema( ) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]: """Extract the related Athena data types from any PyArrow Schema considering possible partitions.""" columns_types: Dict[str, str] = {str(f.name): pyarrow2athena(dtype=f.type) for f in schema} - _logger.debug(f"columns_types: {columns_types}") + _logger.debug("columns_types: %s", columns_types) partitions_types: Optional[Dict[str, str]] = None if partitions is not None: partitions_types = {p.name: pyarrow2athena(p.dictionary.type) for p in partitions} - _logger.debug(f"partitions_types: {partitions_types}") + _logger.debug("partitions_types: %s", partitions_types) return columns_types, partitions_types @@ -382,5 +382,5 @@ def sqlalchemy_types_from_pandas( sqlalchemy_columns_types[k] = casts[k] else: sqlalchemy_columns_types[k] = pyarrow2sqlalchemy(dtype=v, db_type=db_type) - _logger.debug(f"sqlalchemy_columns_types: {sqlalchemy_columns_types}") + _logger.debug("sqlalchemy_columns_types: %s", sqlalchemy_columns_types) return sqlalchemy_columns_types diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 4948f56dc..bd5c7cb35 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -176,8 +176,8 @@ def wait_query(query_execution_id: str, boto3_session: Optional[boto3.Session] = time.sleep(_QUERY_WAIT_POLLING_DELAY) response = client_athena.get_query_execution(QueryExecutionId=query_execution_id) state = response["QueryExecution"]["Status"]["State"] - _logger.debug(f"state: {state}") - _logger.debug(f"StateChangeReason: {response['QueryExecution']['Status'].get('StateChangeReason')}") + _logger.debug("state: %s", state) + _logger.debug("StateChangeReason: %s", response["QueryExecution"]["Status"].get("StateChangeReason")) if state == "FAILED": raise exceptions.QueryFailed(response["QueryExecution"]["Status"].get("StateChangeReason")) if state == "CANCELLED": @@ -265,7 +265,7 @@ def _get_query_metadata( cols_types: Dict[str, str] = get_query_columns_types( query_execution_id=query_execution_id, boto3_session=boto3_session ) - _logger.debug(f"cols_types: {cols_types}") + _logger.debug("cols_types: %s", cols_types) dtype: Dict[str, str] = {} parse_timestamps: List[str] = [] parse_dates: List[str] = [] @@ -298,11 +298,11 @@ def _get_query_metadata( converters[col_name] = lambda x: Decimal(str(x)) if str(x) not in ("", "none", " ", "") else None else: dtype[col_name] = pandas_type - _logger.debug(f"dtype: {dtype}") - _logger.debug(f"parse_timestamps: {parse_timestamps}") - _logger.debug(f"parse_dates: {parse_dates}") - _logger.debug(f"converters: {converters}") - _logger.debug(f"binaries: {binaries}") + _logger.debug("dtype: %s", dtype) + _logger.debug("parse_timestamps: %s", parse_timestamps) + _logger.debug("parse_dates: %s", parse_dates) + _logger.debug("converters: %s", converters) + _logger.debug("binaries: %s", binaries) return dtype, parse_timestamps, parse_dates, converters, binaries @@ -446,7 +446,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals f") AS\n" f"{sql}" ) - _logger.debug(f"sql: {sql}") + _logger.debug("sql: %s", sql) query_id: str = start_query_execution( sql=sql, database=database, @@ -456,7 +456,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals kms_key=kms_key, boto3_session=session, ) - _logger.debug(f"query_id: {query_id}") + _logger.debug("query_id: %s", query_id) query_response: 
Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session) if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]: # pragma: no cover reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"] @@ -468,7 +468,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) chunked: Union[bool, int] = False if chunksize is None else chunksize - _logger.debug(f"chunked: {chunked}") + _logger.debug("chunked: %s", chunked) if not paths: if chunked is False: dfs = pd.DataFrame() @@ -485,9 +485,9 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals ) path = f"{_s3_output}/{query_id}.csv" s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session) - _logger.debug(f"Start CSV reading from {path}") + _logger.debug("Start CSV reading from %s", path) _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None - _logger.debug(f"_chunksize: {_chunksize}") + _logger.debug("_chunksize: %s", _chunksize) ret = s3.read_csv( path=[path], dtype=dtype, diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 8a53d4370..93092626b 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -766,7 +766,7 @@ def drop_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame: duplicated_cols = df.columns.duplicated() duplicated_cols_names: List[str] = list(df.columns[duplicated_cols]) if len(duplicated_cols_names) > 0: - _logger.warning(f"Dropping repeated columns: {duplicated_cols_names}") + _logger.warning("Dropping repeated columns: %s", duplicated_cols_names) return df.loc[:, ~duplicated_cols] @@ -967,11 +967,11 @@ def _create_table( if name in columns_comments: par["Comment"] = columns_comments[name] session: boto3.Session = _utils.ensure_session(session=boto3_session) - - if mode == "overwrite": + exist: bool = does_table_exist(database=database, table=table, boto3_session=session) + if (mode == "overwrite") or (exist is False): delete_table_if_exists(database=database, table=table, boto3_session=session) - client_glue: boto3.client = _utils.client(service_name="glue", session=session) - client_glue.create_table(DatabaseName=database, TableInput=table_input) + client_glue: boto3.client = _utils.client(service_name="glue", session=session) + client_glue.create_table(DatabaseName=database, TableInput=table_input) def _csv_table_definition( diff --git a/awswrangler/cloudwatch.py b/awswrangler/cloudwatch.py index e0a01f066..c36fab70b 100644 --- a/awswrangler/cloudwatch.py +++ b/awswrangler/cloudwatch.py @@ -56,11 +56,11 @@ def start_query( ... 
) """ - _logger.debug(f"log_group_names: {log_group_names}") + _logger.debug("log_group_names: %s", log_group_names) start_timestamp: int = int(1000 * start_time.timestamp()) end_timestamp: int = int(1000 * end_time.timestamp()) - _logger.debug(f"start_timestamp: {start_timestamp}") - _logger.debug(f"end_timestamp: {end_timestamp}") + _logger.debug("start_timestamp: %s", start_timestamp) + _logger.debug("end_timestamp: %s", end_timestamp) args: Dict[str, Any] = { "logGroupNames": log_group_names, "startTime": start_timestamp, @@ -109,7 +109,7 @@ def wait_query(query_id: str, boto3_session: Optional[boto3.Session] = None) -> time.sleep(_QUERY_WAIT_POLLING_DELAY) response = client_logs.get_query_results(queryId=query_id) status = response["status"] - _logger.debug(f"status: {status}") + _logger.debug("status: %s", status) if status == "Failed": # pragma: no cover raise exceptions.QueryFailed(f"query ID: {query_id}") if status == "Cancelled": diff --git a/awswrangler/db.py b/awswrangler/db.py index c00ccf1a8..21b4789c4 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -646,7 +646,7 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument athena_types, _ = s3.read_parquet_metadata( path=paths, dataset=False, use_threads=use_threads, boto3_session=session ) - _logger.debug(f"athena_types: {athena_types}") + _logger.debug("athena_types: %s", athena_types) redshift_types: Dict[str, str] = {} for col_name, col_type in athena_types.items(): length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default @@ -680,7 +680,7 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument def _rs_upsert(con: Any, table: str, temp_table: str, schema: str, primary_keys: Optional[List[str]] = None) -> None: if not primary_keys: primary_keys = _rs_get_primary_keys(con=con, schema=schema, table=table) - _logger.debug(f"primary_keys: {primary_keys}") + _logger.debug("primary_keys: %s", primary_keys) if not primary_keys: # pragma: no cover raise exceptions.InvalidRedshiftPrimaryKeys() equals_clause: str = f"{table}.%s = {temp_table}.%s" @@ -735,7 +735,7 @@ def _rs_create_table( f"{distkey_str}" f"{sortkey_str}" ) - _logger.debug(f"Create table query:\n{sql}") + _logger.debug("Create table query:\n%s", sql) con.execute(sql) return table, schema @@ -746,7 +746,7 @@ def _rs_validate_parameters( if diststyle not in _RS_DISTSTYLES: raise exceptions.InvalidRedshiftDiststyle(f"diststyle must be in {_RS_DISTSTYLES}") cols = list(redshift_types.keys()) - _logger.debug(f"Redshift columns: {cols}") + _logger.debug("Redshift columns: %s", cols) if (diststyle == "KEY") and (not distkey): raise exceptions.InvalidRedshiftDistkey("You must pass a distkey if you intend to use KEY diststyle") if distkey and distkey not in cols: @@ -775,13 +775,13 @@ def _rs_copy( sql: str = ( f"COPY {table_name} FROM '{manifest_path}'\n" f"IAM_ROLE '{iam_role}'\n" "MANIFEST\n" "FORMAT AS PARQUET" ) - _logger.debug(f"copy query:\n{sql}") + _logger.debug("copy query:\n%s", sql) con.execute(sql) sql = "SELECT pg_last_copy_id() AS query_id" query_id: int = con.execute(sql).fetchall()[0][0] sql = f"SELECT COUNT(DISTINCT filename) as num_files_loaded " f"FROM STL_LOAD_COMMITS WHERE query = {query_id}" num_files_loaded: int = con.execute(sql).fetchall()[0][0] - _logger.debug(f"{num_files_loaded} files counted. {num_files} expected.") + _logger.debug("%s files counted. 
%s expected.", num_files_loaded, num_files) if num_files_loaded != num_files: # pragma: no cover raise exceptions.RedshiftLoadError( f"Redshift load rollbacked. {num_files_loaded} files counted. {num_files} expected." @@ -846,17 +846,17 @@ def write_redshift_copy_manifest( payload: str = json.dumps(manifest) bucket: str bucket, key = _utils.parse_path(manifest_path) - _logger.debug(f"payload: {payload}") + _logger.debug("payload: %s", payload) client_s3: boto3.client = _utils.client(service_name="s3", session=session) - _logger.debug(f"bucket: {bucket}") - _logger.debug(f"key: {key}") + _logger.debug("bucket: %s", bucket) + _logger.debug("key: %s", key) client_s3.put_object(Body=payload, Bucket=bucket, Key=key) return manifest def _rs_drop_table(con: Any, schema: str, table: str) -> None: sql = f"DROP TABLE IF EXISTS {schema}.{table}" - _logger.debug(f"Drop table query:\n{sql}") + _logger.debug("Drop table query:\n%s", sql) con.execute(sql) @@ -1104,7 +1104,7 @@ def unload_redshift_to_files( query_id: int = _con.execute(sql).fetchall()[0][0] sql = f"SELECT path FROM STL_UNLOAD_LOG WHERE query={query_id};" paths = [x[0].replace(" ", "") for x in _con.execute(sql).fetchall()] - _logger.debug(f"paths: {paths}") + _logger.debug("paths: %s", paths) return paths diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 3801d340e..f3e505b00 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -364,7 +364,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["tags"] is not None: args["Tags"] = [{"Key": k, "Value": v} for k, v in pars["tags"].items()] - _logger.info(f"args: \n{json.dumps(args, default=str, indent=4)}") + _logger.info("args: \n%s", json.dumps(args, default=str, indent=4)) return args @@ -665,7 +665,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["JobFlowId"] @@ -696,7 +696,7 @@ def get_cluster_state(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_cluster(ClusterId=cluster_id) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["Cluster"]["Status"]["State"] @@ -723,7 +723,7 @@ def terminate_cluster(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.terminate_job_flows(JobFlowIds=[cluster_id]) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) def submit_steps( @@ -755,7 +755,7 @@ def submit_steps( """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=steps) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return 
response["StepIds"] @@ -807,7 +807,7 @@ def submit_step( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["StepIds"][0] @@ -898,7 +898,7 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["Step"]["Status"]["State"] @@ -942,7 +942,7 @@ def submit_ecr_credentials_refresh( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug(f"response: \n{json.dumps(response, default=str, indent=4)}") + _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) return response["StepIds"][0] diff --git a/awswrangler/s3.py b/awswrangler/s3.py index f4be39359..770f588a7 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -56,10 +56,10 @@ def get_bucket_region(bucket: str, boto3_session: Optional[boto3.Session] = None """ client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - _logger.debug(f"bucket: {bucket}") + _logger.debug("bucket: %s", bucket) region: str = client_s3.get_bucket_location(Bucket=bucket)["LocationConstraint"] region = "us-east-1" if region is None else region - _logger.debug(f"region: {region}") + _logger.debug("region: %s", region) return region @@ -286,7 +286,7 @@ def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]: def _delete_objects(bucket: str, keys: List[str], client_s3: boto3.client) -> None: - _logger.debug(f"len(keys): {len(keys)}") + _logger.debug("len(keys): %s", len(keys)) batch: List[Dict[str, str]] = [{"Key": key} for key in keys] client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}) @@ -366,7 +366,7 @@ def _describe_object( break except botocore.exceptions.ClientError as e: # pragma: no cover if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404: # Not Found - _logger.debug(f"Object not found. {i} seconds remaining to wait.") + _logger.debug("Object not found. 
%s seconds remaining to wait.", i) if i == 1: # Last try, there is no more need to sleep break time.sleep(1) @@ -680,7 +680,7 @@ def to_csv( # pylint: disable=too-many-arguments sep=sep, ) if partitions_values: - _logger.debug(f"partitions_values:\n{partitions_values}") + _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_csv_partitions( database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep ) @@ -709,7 +709,7 @@ def _to_csv_dataset( if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - _logger.debug(f"dtypes: {df.dtypes}") + _logger.debug("dtypes: %s", df.dtypes) if not partition_cols: file_path: str = f"{path}{uuid.uuid4().hex}.csv" _to_text( @@ -1094,7 +1094,7 @@ def to_parquet( # pylint: disable=too-many-arguments mode="overwrite", ) if partitions_values: - _logger.debug(f"partitions_values:\n{partitions_values}") + _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_parquet_partitions( database=database, table=table, @@ -1132,7 +1132,7 @@ def _to_parquet_dataset( schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) - _logger.debug(f"schema: {schema}") + _logger.debug("schema: %s", schema) if not partition_cols: file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" _to_parquet_file( @@ -1180,7 +1180,7 @@ def _to_parquet_file( pyarrow_dtype = _data_types.athena2pyarrow(col_type) field = pa.field(name=col_name, type=pyarrow_dtype) table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype)) - _logger.debug(f"Casting column {col_name} ({col_index}) to {col_type} ({pyarrow_dtype})") + _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype) pyarrow.parquet.write_table( table=table, where=path, @@ -1508,7 +1508,7 @@ def _read_text_chunksize( ) -> Iterator[pd.DataFrame]: fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) for path in paths: - _logger.debug(f"path: {path}") + _logger.debug("path: %s", path) if pandas_args.get("compression", "infer") == "infer": pandas_args["compression"] = infer_compression(path, compression="infer") with fs.open(path, "rb") as f: @@ -1548,7 +1548,7 @@ def _read_parquet_init( path_or_paths = path[:-1] if path.endswith("/") else path else: path_or_paths = path - _logger.debug(f"path_or_paths: {path_or_paths}") + _logger.debug("path_or_paths: %s", path_or_paths) fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset( @@ -2245,12 +2245,12 @@ def merge_datasets( session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session) - _logger.debug(f"len(paths): {len(paths)}") + _logger.debug("len(paths): %s", len(paths)) if len(paths) < 1: return [] if mode == "overwrite": - _logger.debug(f"Deleting to overwrite: {target_path}/") + _logger.debug("Deleting to overwrite: %s/", target_path) delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session) elif mode == "overwrite_partitions": paths_wo_prefix: List[str] = 
[x.replace(f"{source_path}/", "") for x in paths] @@ -2258,7 +2258,7 @@ def merge_datasets( partitions_paths: List[str] = list(set(paths_wo_filename)) target_partitions_paths = [f"{target_path}/{x}" for x in partitions_paths] for path in target_partitions_paths: - _logger.debug(f"Deleting to overwrite_partitions: {path}") + _logger.debug("Deleting to overwrite_partitions: %s", path) delete_objects(path=path, use_threads=use_threads, boto3_session=session) elif mode != "append": raise exceptions.InvalidArgumentValue(f"{mode} is a invalid mode option.") @@ -2266,7 +2266,7 @@ def merge_datasets( new_objects: List[str] = copy_objects( paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session ) - _logger.debug(f"len(new_objects): {len(new_objects)}") + _logger.debug("len(new_objects): %s", len(new_objects)) return new_objects @@ -2313,7 +2313,7 @@ def copy_objects( ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] """ - _logger.debug(f"len(paths): {len(paths)}") + _logger.debug("len(paths): %s", len(paths)) if len(paths) < 1: return [] source_path = source_path[:-1] if source_path[-1] == "/" else source_path @@ -2326,13 +2326,13 @@ def copy_objects( path_final: str = f"{target_path}/{path_wo_prefix}" new_objects.append(path_final) batch.append((path, path_final)) - _logger.debug(f"len(new_objects): {len(new_objects)}") + _logger.debug("len(new_objects): %s", len(new_objects)) _copy_objects(batch=batch, use_threads=use_threads, boto3_session=session) return new_objects def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool, boto3_session: boto3.Session) -> None: - _logger.debug(f"len(batch): {len(batch)}") + _logger.debug("len(batch): %s", len(batch)) client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session) for source, target in batch: From 910e3b69a467df64a13a23e6e6f7c5c512e8d6aa Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 27 Apr 2020 20:11:08 -0300 Subject: [PATCH 43/59] 100% test coverage on wr.torch --- .github/workflows/static-checking.yml | 8 ++------ awswrangler/torch.py | 18 ++++++++---------- testing/test_awswrangler/test_data_lake.py | 2 +- testing/test_awswrangler/test_torch.py | 7 +++++-- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index 63592d182..a23a74d99 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -24,12 +24,8 @@ jobs: uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install -r requirements-dev.txt - pip install -r requirements-torch.txt + - name: Setup Environment + run: ./setup-dev-env.sh - name: CloudFormation Lint run: cfn-lint -t testing/cloudformation.yaml - name: Documentation Lint diff --git a/awswrangler/torch.py b/awswrangler/torch.py index e7cd4518f..7d3c47316 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -4,7 +4,6 @@ import os import pathlib import re -import tarfile from collections.abc import Iterable from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Tuple, Union @@ -64,12 +63,12 @@ def _fetch_data(self, path: str) -> Any: def _load_data(data: io.BytesIO, path: str) -> Any: if path.endswith(".pt"): data = torch.load(data) - elif 
path.endswith(".tar.gz") or path.endswith(".tgz"): - tarfile.open(fileobj=data) + elif path.endswith(".tar.gz") or path.endswith(".tgz"): # pragma: no cover raise NotImplementedError("Tar loader not implemented!") + # tarfile.open(fileobj=data) # tar = tarfile.open(fileobj=data) # for member in tar.getmembers(): - else: + else: # pragma: no cover raise NotImplementedError() return data @@ -86,10 +85,10 @@ def __getitem__(self, index): def __len__(self): return len(self._paths) - def _data_fn(self, data) -> Any: + def _data_fn(self, data) -> Any: # pragma: no cover raise NotImplementedError() - def _label_fn(self, path: str) -> Any: + def _label_fn(self, path: str) -> Any: # pragma: no cover raise NotImplementedError() @@ -100,7 +99,7 @@ def _label_fn(self, path: str) -> torch.Tensor: label = int(re.findall(r"/(.*?)=(.*?)/", path)[-1][1]) return torch.tensor([label]) # pylint: disable=not-callable - def _data_fn(self, data) -> Any: + def _data_fn(self, data) -> Any: # pragma: no cover raise NotImplementedError() @@ -383,9 +382,8 @@ def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, pass elif isinstance(data, Iterable) and all([isinstance(d, torch.Tensor) for d in data]): data = zip(*data) - else: + else: # pragma: no cover raise NotImplementedError(f"ERROR: Type: {type(data)} has not been implemented!") - for d in data: yield d @@ -436,7 +434,7 @@ def __init__( def __iter__(self) -> Union[Iterator[torch.Tensor], Iterator[Tuple[torch.Tensor, torch.Tensor]]]: """Iterate over the Dataset.""" if torch.utils.data.get_worker_info() is not None: # type: ignore - raise NotImplementedError() + raise NotImplementedError() # pragma: no cover db._validate_engine(con=self._con) # pylint: disable=protected-access with self._con.connect() as con: cursor: Any = con.execute(self._sql) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index a815cd388..94541d8e6 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -708,7 +708,7 @@ def test_parquet_validate_schema(bucket, database): df2 = pd.DataFrame({"id2": [1, 2, 3], "val": ["foo", "boo", "bar"]}) path_file2 = f"s3://{bucket}/test_parquet_file_validate/1.parquet" wr.s3.to_parquet(df=df2, path=path_file2) - wr.s3.wait_objects_exist(paths=[path_file2]) + wr.s3.wait_objects_exist(paths=[path_file2], use_threads=False) df3 = wr.s3.read_parquet(path=path, validate_schema=False) assert len(df3.index) == 6 assert len(df3.columns) == 3 diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 19a300400..6e8a3427d 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -84,7 +84,8 @@ def test_torch_sql(parameters, db_type, chunksize): @pytest.mark.parametrize("chunksize", [None, 1, 10]) @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) -def test_torch_sql_label(parameters, db_type, chunksize): +@pytest.mark.parametrize("label_col", [2, "c"]) +def test_torch_sql_label(parameters, db_type, chunksize, label_col): schema = parameters[db_type]["schema"] table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") @@ -99,7 +100,9 @@ def test_torch_sql_label(parameters, db_type, chunksize): chunksize=None, method=None, ) - ts = list(wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize, label_col=2)) + ts = list( + 
wr.torch.SQLDataset(f"SELECT * FROM {schema}.{table}", con=engine, chunksize=chunksize, label_col=label_col) + ) assert torch.all(ts[0][0].eq(torch.tensor([1.0, 4.0]))) assert torch.all(ts[0][1].eq(torch.tensor([7], dtype=torch.long))) assert torch.all(ts[1][0].eq(torch.tensor([2.0, 5.0]))) From b4f6a36d18699e540415c31b8ecf38eb7b418aac Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 14:31:56 -0300 Subject: [PATCH 44/59] Revisiting Athena encryption and workgroup #201 --- awswrangler/athena.py | 111 +++++++++++++++------ awswrangler/emr.py | 18 ++-- testing/test_awswrangler/test_data_lake.py | 102 +++++++++++++++++-- testing/test_awswrangler/test_torch.py | 8 +- tox.ini | 4 +- 5 files changed, 192 insertions(+), 51 deletions(-) diff --git a/awswrangler/athena.py b/awswrangler/athena.py index bd5c7cb35..76cb0a108 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -2,6 +2,7 @@ import csv import logging +import pprint import time from decimal import Decimal from typing import Any, Dict, Iterator, List, Optional, Tuple, Union @@ -120,19 +121,49 @@ def start_query_execution( >>> query_exec_id = wr.athena.start_query_execution(sql='...', database='...') """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup) + return _start_query_execution( + sql=sql, + wg_config=wg_config, + database=database, + s3_output=s3_output, + workgroup=workgroup, + encryption=encryption, + kms_key=kms_key, + boto3_session=session, + ) + + +def _start_query_execution( + sql: str, + wg_config: Dict[str, Union[Optional[bool], Optional[str]]], + database: Optional[str] = None, + s3_output: Optional[str] = None, + workgroup: Optional[str] = None, + encryption: Optional[str] = None, + kms_key: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: args: Dict[str, Any] = {"QueryString": sql} session: boto3.Session = _utils.ensure_session(session=boto3_session) # s3_output - if s3_output is None: # pragma: no cover - s3_output = create_athena_bucket(boto3_session=session) - args["ResultConfiguration"] = {"OutputLocation": s3_output} + args["ResultConfiguration"] = { + "OutputLocation": _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session) + } # encryption - if encryption is not None: - args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": encryption} - if kms_key is not None: - args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = kms_key + if wg_config["enforced"] is True: + if wg_config["encryption"] is not None: + args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": wg_config["encryption"]} + if wg_config["kms_key"] is not None: + args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = wg_config["kms_key"] + else: + if encryption is not None: + args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": encryption} + if kms_key is not None: + args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = kms_key # database if database is not None: @@ -143,10 +174,25 @@ def start_query_execution( args["WorkGroup"] = workgroup client_athena: boto3.client = _utils.client(service_name="athena", session=session) + _logger.debug("args: \n%s", pprint.pformat(args)) response = client_athena.start_query_execution(**args) return response["QueryExecutionId"] +def _get_s3_output( + s3_output: Optional[str], wg_config: Dict[str, 
Union[bool, Optional[str]]], boto3_session: boto3.Session +) -> str: + if s3_output is None: + _s3_output: Optional[str] = wg_config["s3_output"] # type: ignore + if _s3_output is not None: + s3_output = _s3_output + else: + s3_output = create_athena_bucket(boto3_session=boto3_session) + elif wg_config["enforced"] is True: + s3_output = wg_config["s3_output"] # type: ignore + return s3_output + + def wait_query(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]: """Wait for the query end. @@ -355,12 +401,14 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals Note ---- - If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, - but it still useful to overcome memory limitation. + Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS']. + + `P.S. 'CSE_KMS' is not supported.` Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. + (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Note @@ -403,9 +451,9 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals workgroup : str, optional Athena workgroup. encryption : str, optional - None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'. + Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported. kms_key : str, optional - For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID. + For SSE-KMS, this is the KMS key ARN or ID. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. @@ -424,31 +472,27 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals """ session: boto3.Session = _utils.ensure_session(session=boto3_session) - wg_s3_output, _, _ = _ensure_workgroup(session=session, workgroup=workgroup) - if s3_output is None: - if wg_s3_output is None: - _s3_output: str = create_athena_bucket(boto3_session=session) - else: - _s3_output = wg_s3_output - else: - _s3_output = s3_output + wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup) + _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session) _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output name: str = "" if ctas_approach is True: name = f"temp_table_{pa.compat.guid()}" path: str = f"{_s3_output}/{name}" + ext_location: str = "\n" if wg_config["enforced"] is True else f",\n external_location = '{path}'\n" sql = ( f"CREATE TABLE {name}\n" f"WITH(\n" f" format = 'Parquet',\n" - f" parquet_compression = 'SNAPPY',\n" - f" external_location = '{path}'\n" + f" parquet_compression = 'SNAPPY'" + f"{ext_location}" f") AS\n" f"{sql}" ) _logger.debug("sql: %s", sql) - query_id: str = start_query_execution( + query_id: str = _start_query_execution( sql=sql, + wg_config=wg_config, database=database, s3_output=_s3_output, workgroup=workgroup, @@ -466,6 +510,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals if ctas_approach is True: catalog.delete_table_if_exists(database=database, table=name, boto3_session=session) manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" + _logger.debug("manifest_path: %s", manifest_path) paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug("chunked: %s", chunked) @@ -560,19 +605,27 @@ def get_work_group(workgroup: str, boto3_session: 
Optional[boto3.Session] = None return client_athena.get_work_group(WorkGroup=workgroup) -def _ensure_workgroup( +def _get_workgroup_config( session: boto3.Session, workgroup: Optional[str] = None -) -> Tuple[Optional[str], Optional[str], Optional[str]]: +) -> Dict[str, Union[bool, Optional[str]]]: if workgroup is not None: res: Dict[str, Any] = get_work_group(workgroup=workgroup, boto3_session=session) + enforced: bool = res["WorkGroup"]["Configuration"]["EnforceWorkGroupConfiguration"] config: Dict[str, Any] = res["WorkGroup"]["Configuration"]["ResultConfiguration"] wg_s3_output: Optional[str] = config.get("OutputLocation") encrypt_config: Optional[Dict[str, str]] = config.get("EncryptionConfiguration") wg_encryption: Optional[str] = None if encrypt_config is None else encrypt_config.get("EncryptionOption") wg_kms_key: Optional[str] = None if encrypt_config is None else encrypt_config.get("KmsKey") else: - wg_s3_output, wg_encryption, wg_kms_key = None, None, None - return wg_s3_output, wg_encryption, wg_kms_key + enforced, wg_s3_output, wg_encryption, wg_kms_key = False, None, None, None + wg_config: Dict[str, Union[bool, Optional[str]]] = { + "enforced": enforced, + "s3_output": wg_s3_output, + "encryption": wg_encryption, + "kms_key": wg_kms_key, + } + _logger.debug("wg_config: \n%s", pprint.pformat(wg_config)) + return wg_config def read_sql_table( @@ -606,12 +659,14 @@ def read_sql_table( Note ---- - If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes, - but it still useful to overcome memory limitation. + Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS']. + + `P.S. 'CSE_KMS' is not supported.` Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. + (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Note diff --git a/awswrangler/emr.py b/awswrangler/emr.py index f3e505b00..5a93d752d 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -1,8 +1,8 @@ """EMR (Elastic Map Reduce) module.""" # pylint: disable=line-too-long -import json import logging +import pprint from typing import Any, Dict, List, Optional, Union import boto3 # type: ignore @@ -364,7 +364,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s if pars["tags"] is not None: args["Tags"] = [{"Key": k, "Value": v} for k, v in pars["tags"].items()] - _logger.info("args: \n%s", json.dumps(args, default=str, indent=4)) + _logger.debug("args: \n%s", pprint.pformat(args)) return args @@ -665,7 +665,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused args: Dict[str, Any] = _build_cluster_args(**locals()) client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.run_job_flow(**args) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["JobFlowId"] @@ -696,7 +696,7 @@ def get_cluster_state(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_cluster(ClusterId=cluster_id) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["Cluster"]["Status"]["State"] @@ -723,7 +723,7 @@ def terminate_cluster(cluster_id: str, boto3_session: Optional[boto3.Session] = """ client_emr: boto3.client = 
_utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.terminate_job_flows(JobFlowIds=[cluster_id]) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) def submit_steps( @@ -755,7 +755,7 @@ def submit_steps( """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=steps) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["StepIds"] @@ -807,7 +807,7 @@ def submit_step( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["StepIds"][0] @@ -898,7 +898,7 @@ def get_step_state(cluster_id: str, step_id: str, boto3_session: Optional[boto3. """ client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session) response: Dict[str, Any] = client_emr.describe_step(ClusterId=cluster_id, StepId=step_id) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["Step"]["Status"]["State"] @@ -942,7 +942,7 @@ def submit_ecr_credentials_refresh( ) client_emr: boto3.client = _utils.client(service_name="emr", session=session) response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step]) - _logger.debug("response: \n%s", json.dumps(response, default=str, indent=4)) + _logger.debug("response: \n%s", pprint.pformat(response)) return response["StepIds"][0] diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 94541d8e6..b05fb0881 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -74,10 +74,7 @@ def workgroup0(bucket): client.create_work_group( Name=wkg_name, Configuration={ - "ResultConfiguration": { - "OutputLocation": f"s3://{bucket}/athena_workgroup0/", - "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"}, - }, + "ResultConfiguration": {"OutputLocation": f"s3://{bucket}/athena_workgroup0/"}, "EnforceWorkGroupConfiguration": True, "PublishCloudWatchMetricsEnabled": True, "BytesScannedCutoffPerQuery": 100_000_000, @@ -98,7 +95,10 @@ def workgroup1(bucket): client.create_work_group( Name=wkg_name, Configuration={ - "ResultConfiguration": {"OutputLocation": f"s3://{bucket}/athena_workgroup1/"}, + "ResultConfiguration": { + "OutputLocation": f"s3://{bucket}/athena_workgroup1/", + "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"}, + }, "EnforceWorkGroupConfiguration": True, "PublishCloudWatchMetricsEnabled": True, "BytesScannedCutoffPerQuery": 100_000_000, @@ -109,7 +109,57 @@ def workgroup1(bucket): yield wkg_name +@pytest.fixture(scope="module") +def workgroup2(bucket, kms_key): + wkg_name = "awswrangler_test_2" + client = boto3.client("athena") + wkgs = client.list_work_groups() + wkgs = [x["Name"] for x in wkgs["WorkGroups"]] + if wkg_name not in wkgs: + client.create_work_group( + Name=wkg_name, + Configuration={ + "ResultConfiguration": { + "OutputLocation": f"s3://{bucket}/athena_workgroup2/", + "EncryptionConfiguration": 
{"EncryptionOption": "SSE_KMS", "KmsKey": kms_key}, + }, + "EnforceWorkGroupConfiguration": False, + "PublishCloudWatchMetricsEnabled": True, + "BytesScannedCutoffPerQuery": 100_000_000, + "RequesterPaysEnabled": False, + }, + Description="AWS Data Wrangler Test WorkGroup Number 2", + ) + yield wkg_name + + +@pytest.fixture(scope="module") +def workgroup3(bucket, kms_key): + wkg_name = "awswrangler_test_3" + client = boto3.client("athena") + wkgs = client.list_work_groups() + wkgs = [x["Name"] for x in wkgs["WorkGroups"]] + if wkg_name not in wkgs: + client.create_work_group( + Name=wkg_name, + Configuration={ + "ResultConfiguration": { + "OutputLocation": f"s3://{bucket}/athena_workgroup3/", + "EncryptionConfiguration": {"EncryptionOption": "SSE_KMS", "KmsKey": kms_key}, + }, + "EnforceWorkGroupConfiguration": True, + "PublishCloudWatchMetricsEnabled": True, + "BytesScannedCutoffPerQuery": 100_000_000, + "RequesterPaysEnabled": False, + }, + Description="AWS Data Wrangler Test WorkGroup Number 3", + ) + yield wkg_name + + def test_athena_ctas(bucket, database, kms_key): + wr.s3.delete_objects(path=f"s3://{bucket}/test_athena_ctas/") + wr.s3.delete_objects(path=f"s3://{bucket}/test_athena_ctas_result/") df = get_df_list() columns_types, partitions_types = wr.catalog.extract_athena_types(df=df, partition_cols=["par0", "par1"]) assert len(columns_types) == 16 @@ -256,13 +306,12 @@ def test_fwf(bucket): def test_parquet(bucket): - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_file") - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_dataset") + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") df_file = pd.DataFrame({"id": [1, 2, 3]}) - path_file = f"s3://{bucket}/test_parquet_file.parquet" + path_file = f"s3://{bucket}/test_parquet/test_parquet_file.parquet" df_dataset = pd.DataFrame({"id": [1, 2, 3], "partition": ["A", "A", "B"]}) df_dataset["partition"] = df_dataset["partition"].astype("category") - path_dataset = f"s3://{bucket}/test_parquet_dataset" + path_dataset = f"s3://{bucket}/test_parquet/test_parquet_dataset" with pytest.raises(wr.exceptions.InvalidArgumentCombination): wr.s3.to_parquet(df=df_file, path=path_file, mode="append") with pytest.raises(wr.exceptions.InvalidCompression): @@ -292,8 +341,7 @@ def test_parquet(bucket): wr.s3.to_parquet( df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite_partitions" ) - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_file") - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_dataset") + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") def test_parquet_catalog(bucket, database): @@ -1123,3 +1171,35 @@ def test_parquet_chunked(bucket, database, col2, chunked): wr.s3.delete_objects(path=paths) assert wr.catalog.delete_table_if_exists(database=database, table=table) is True + + +@pytest.mark.parametrize("workgroup", [None, 0, 1, 2, 3]) +@pytest.mark.parametrize("encryption", [None, "SSE_S3", "SSE_KMS"]) +def test_athena_encryption( + bucket, database, kms_key, encryption, workgroup, workgroup0, workgroup1, workgroup2, workgroup3 +): + kms_key = None if (encryption == "SSE_S3") or (encryption is None) else kms_key + if workgroup == 0: + workgroup = workgroup0 + elif workgroup == 1: + workgroup = workgroup1 + elif workgroup == 2: + workgroup = workgroup2 + elif workgroup == 3: + workgroup = workgroup3 + table = f"test_athena_encryption_{str(encryption).lower()}_{str(workgroup).lower()}" + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + df = 
pd.DataFrame({"a": [1, 2], "b": ["foo", "boo"]}) + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, mode="overwrite", database=database, table=table, s3_additional_kwargs=None + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df2 = wr.athena.read_sql_table( + table=table, ctas_approach=True, database=database, encryption=encryption, workgroup=workgroup, kms_key=kms_key + ) + print(df2) + assert len(df2.index) == 2 + assert len(df2.columns) == 2 + wr.catalog.delete_table_if_exists(database=database, table=table) + wr.s3.delete_objects(path=paths) diff --git a/testing/test_awswrangler/test_torch.py b/testing/test_awswrangler/test_torch.py index 6e8a3427d..a19dd64b5 100644 --- a/testing/test_awswrangler/test_torch.py +++ b/testing/test_awswrangler/test_torch.py @@ -87,7 +87,7 @@ def test_torch_sql(parameters, db_type, chunksize): @pytest.mark.parametrize("label_col", [2, "c"]) def test_torch_sql_label(parameters, db_type, chunksize, label_col): schema = parameters[db_type]["schema"] - table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}" + table = f"test_torch_sql_label_{db_type}_{str(chunksize).lower()}_{label_col}" engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") wr.db.to_sql( df=pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0], "c": [7, 8, 9]}), @@ -123,6 +123,7 @@ def test_torch_image_s3(bucket): Key=f"{folder}/class={ref_label}/logo.png", ContentType="image/png", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/class={ref_label}/logo.png"]) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) image, label = ds[0] assert image.shape == torch.Size([4, 494, 1636]) @@ -144,6 +145,7 @@ def test_torch_image_s3_loader(bucket, drop_last): Key=f"{folder}/class={label}/logo{i}.png", ContentType="image/png", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/class={label}/logo{i}.png"]) ds = wr.torch.ImageS3Dataset(path=path, suffix="png", boto3_session=boto3.Session()) batch_size = 2 num_train = len(ds) @@ -172,6 +174,7 @@ def test_torch_lambda_s3(bucket): Key=f"test_torch_lambda_s3/class={ref_label}/logo.png", ContentType="image/png", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/test_torch_lambda_s3/class={ref_label}/logo.png"]) ds = wr.torch.LambdaS3Dataset( path=path, suffix="png", @@ -201,6 +204,7 @@ def test_torch_audio_s3(bucket): Key=f"{folder}/class={ref_label}/amazing_sound.wav", ContentType="audio/wav", ) + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/class={ref_label}/amazing_sound.wav"]) s3_audio_file = f"{bucket}/test_torch_audio_s3/class={ref_label}/amazing_sound.wav" ds = wr.torch.AudioS3Dataset(path=s3_audio_file, suffix="wav") loader = DataLoader(ds, batch_size=1) @@ -234,6 +238,7 @@ def test_torch_s3_iterable(bucket, drop_last): torch.save(batch, buff) buff.seek(0) client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/file{i}.pt"]) for image in DataLoader( wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), batch_size=batch_size, drop_last=drop_last @@ -259,6 +264,7 @@ def test_torch_s3_iterable_with_labels(bucket, drop_last): torch.save(batch, buff) buff.seek(0) client_s3.put_object(Body=buff.read(), Bucket=bucket, Key=f"{folder}/file{i}.pt") + wr.s3.wait_objects_exist(paths=[f"s3://{bucket}/{folder}/file{i}.pt"]) for images, labels in DataLoader( wr.torch.S3IterableDataset(path=f"s3://{bucket}/{folder}/file"), 
batch_size=batch_size, drop_last=drop_last diff --git a/tox.ini b/tox.ini index f2bb572c2..cbede50a3 100644 --- a/tox.ini +++ b/tox.ini @@ -8,11 +8,11 @@ deps = moto -rrequirements-torch.txt commands = - pytest -n 8 testing/test_awswrangler + pytest -n 16 testing/test_awswrangler [testenv:py36] deps = {[testenv]deps} pytest-cov commands = - pytest --cov=awswrangler -n 8 testing/test_awswrangler + pytest --cov=awswrangler -n 16 testing/test_awswrangler From 2a26d4f79e77918ee9c13f5232506cc421eedb42 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:34:27 -0300 Subject: [PATCH 45/59] Decrease tox parallelism --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index cbede50a3..f2bb572c2 100644 --- a/tox.ini +++ b/tox.ini @@ -8,11 +8,11 @@ deps = moto -rrequirements-torch.txt commands = - pytest -n 16 testing/test_awswrangler + pytest -n 8 testing/test_awswrangler [testenv:py36] deps = {[testenv]deps} pytest-cov commands = - pytest --cov=awswrangler -n 16 testing/test_awswrangler + pytest --cov=awswrangler -n 8 testing/test_awswrangler From 5298aaf84a629c099aaf4a2f2af0de4a534c409f Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:35:08 -0300 Subject: [PATCH 46/59] Add kms_key_id, max_file_size and region to Redshift Unload --- awswrangler/db.py | 67 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index 21b4789c4..b695bdd17 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -2,6 +2,7 @@ import json import logging +import time from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import quote_plus @@ -91,7 +92,16 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs) -> ) pandas_kwargs["dtype"] = dtypes pandas_kwargs["con"] = con - df.to_sql(**pandas_kwargs) + max_attempts: int = 3 + for attempt in range(max_attempts): + try: + df.to_sql(**pandas_kwargs) + except sqlalchemy.exc.InternalError as ex: # pragma: no cover + if attempt == (max_attempts - 1): + raise ex + time.sleep(1) + else: + break def read_sql_query( @@ -887,6 +897,9 @@ def unload_redshift( path: str, con: sqlalchemy.engine.Engine, iam_role: str, + region: Optional[str] = None, + max_file_size: Optional[float] = None, + kms_key_id: Optional[str] = None, categories: List[str] = None, chunked: Union[bool, int] = False, keep_files: bool = False, @@ -937,6 +950,19 @@ def unload_redshift( wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() iam_role : str AWS IAM role with the related permissions. + region : str, optional + Specifies the AWS Region where the target Amazon S3 bucket is located. + REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the + same AWS Region as the Amazon Redshift cluster. By default, UNLOAD + assumes that the target Amazon S3 bucket is located in the same AWS + Region as the Amazon Redshift cluster. + max_file_size : float, optional + Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3. + Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default + maximum file size is 6200.0 MB. + kms_key_id : str, optional + Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be + used to encrypt data files on Amazon S3. categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. 
@@ -973,7 +999,15 @@ def unload_redshift( """ session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = unload_redshift_to_files( - sql=sql, path=path, con=con, iam_role=iam_role, use_threads=use_threads, boto3_session=session + sql=sql, + path=path, + con=con, + iam_role=iam_role, + region=region, + max_file_size=max_file_size, + kms_key_id=kms_key_id, + use_threads=use_threads, + boto3_session=session, ) s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) if chunked is False: @@ -1032,6 +1066,9 @@ def unload_redshift_to_files( path: str, con: sqlalchemy.engine.Engine, iam_role: str, + region: Optional[str] = None, + max_file_size: Optional[float] = None, + kms_key_id: Optional[str] = None, use_threads: bool = True, manifest: bool = False, partition_cols: Optional[List] = None, @@ -1056,6 +1093,19 @@ def unload_redshift_to_files( wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() iam_role : str AWS IAM role with the related permissions. + region : str, optional + Specifies the AWS Region where the target Amazon S3 bucket is located. + REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the + same AWS Region as the Amazon Redshift cluster. By default, UNLOAD + assumes that the target Amazon S3 bucket is located in the same AWS + Region as the Amazon Redshift cluster. + max_file_size : float, optional + Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3. + Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default + maximum file size is 6200.0 MB. + kms_key_id : str, optional + Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be + used to encrypt data files on Amazon S3. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. 
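For context, the new UNLOAD options documented above combine roughly as in the following minimal sketch; the bucket, Glue connection name, IAM role ARN, and KMS key ID below are placeholders and do not come from this patch:

    import awswrangler as wr

    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")  # placeholder connection name
    df = wr.db.unload_redshift(
        sql="SELECT * FROM public.my_table",  # placeholder query
        path="s3://my-bucket/unload/",  # placeholder S3 prefix
        con=engine,
        iam_role="arn:aws:iam::123456789012:role/my-redshift-role",  # placeholder IAM role
        region="us-east-2",  # UNLOAD to a bucket outside the cluster's Region
        max_file_size=100.0,  # cap each unloaded file at roughly 100 MB
        kms_key_id="1234abcd-12ab-34cd-56ef-1234567890ab",  # placeholder KMS key ID
        keep_files=False,  # remove the unloaded Parquet files after reading
    )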
@@ -1086,19 +1136,26 @@ def unload_redshift_to_files( session: boto3.Session = _utils.ensure_session(session=boto3_session) s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session) with con.connect() as _con: - partition_str: str = f"PARTITION BY ({','.join(partition_cols)})\n" if partition_cols else "" + partition_str: str = f"\nPARTITION BY ({','.join(partition_cols)})" if partition_cols else "" manifest_str: str = "\nmanifest" if manifest is True else "" + region_str: str = f"\nREGION AS '{region}'" if region is not None else "" + max_file_size_str: str = f"\nMAXFILESIZE AS {max_file_size} MB" if max_file_size is not None else "" + kms_key_id_str: str = f"\nKMS_KEY_ID '{kms_key_id}'" if kms_key_id is not None else "" sql = ( f"UNLOAD ('{sql}')\n" f"TO '{path}'\n" f"IAM_ROLE '{iam_role}'\n" "ALLOWOVERWRITE\n" "PARALLEL ON\n" - "ENCRYPTED\n" + "FORMAT PARQUET\n" + "ENCRYPTED" + f"{kms_key_id_str}" f"{partition_str}" - "FORMAT PARQUET" + f"{region_str}" + f"{max_file_size_str}" f"{manifest_str};" ) + _logger.debug("sql: \n%s", sql) _con.execute(sql) sql = "SELECT pg_last_query_id() AS query_id" query_id: int = _con.execute(sql).fetchall()[0][0] From d4b27c6fc950523b095d169ff6704c92933dcc97 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:35:32 -0300 Subject: [PATCH 47/59] Add KMS permission to Redshift Role --- testing/cloudformation.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/testing/cloudformation.yaml b/testing/cloudformation.yaml index d709f1861..1b4b90f08 100644 --- a/testing/cloudformation.yaml +++ b/testing/cloudformation.yaml @@ -96,6 +96,15 @@ Resources: PolicyDocument: Version: 2012-10-17 Statement: + - Effect: Allow + Action: + - kms:Encrypt + - kms:Decrypt + - kms:GenerateDataKey + Resource: + - Fn::GetAtt: + - KmsKey + - Arn - Effect: Allow Action: - s3:Get* From 924b0bb624bfef949d559f0e68d5724645ae8394 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:35:53 -0300 Subject: [PATCH 48/59] Add Redshift tests --- testing/test_awswrangler/test_db.py | 74 +++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index adcacf4a4..65f5ad15b 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -76,6 +76,11 @@ def external_schema(cloudformation_outputs, parameters, glue_database): yield "aws_data_wrangler_external" +@pytest.fixture(scope="module") +def kms_key_id(cloudformation_outputs): + yield cloudformation_outputs["KmsKeyArn"].split("/", 1)[1] + + @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) def test_sql(parameters, db_type): df = get_df() @@ -386,3 +391,72 @@ def test_redshift_category(bucket, parameters): for df2 in dfs: ensure_data_types_category(df2) wr.s3.delete_objects(path=path) + + +def test_redshift_unload_extras(bucket, parameters, kms_key_id): + table = "test_redshift_unload_extras" + schema = parameters["redshift"]["schema"] + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-redshift") + df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]}) + wr.db.to_sql(df=df, con=engine, name=table, schema=schema, if_exists="replace", index=False) + paths = wr.db.unload_redshift_to_files( + sql=f"SELECT * FROM {schema}.{table}", + path=path, + con=engine, + iam_role=parameters["redshift"]["role"], + region=wr.s3.get_bucket_region(bucket), + max_file_size=5.0, + 
kms_key_id=kms_key_id, + partition_cols=["name"], + ) + wr.s3.wait_objects_exist(paths=paths) + df = wr.s3.read_parquet(path=path, dataset=True) + assert len(df.index) == 2 + assert len(df.columns) == 2 + wr.s3.delete_objects(path=path) + df = wr.db.unload_redshift( + sql=f"SELECT * FROM {schema}.{table}", + con=engine, + iam_role=parameters["redshift"]["role"], + path=path, + keep_files=False, + region=wr.s3.get_bucket_region(bucket), + max_file_size=5.0, + kms_key_id=kms_key_id, + ) + assert len(df.index) == 2 + assert len(df.columns) == 2 + wr.s3.delete_objects(path=path) + + +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) +def test_to_sql_cast(parameters, db_type): + table = "test_to_sql_cast" + schema = parameters[db_type]["schema"] + df = pd.DataFrame( + { + "col": [ + "".join([str(i)[-1] for i in range(1_024)]), + "".join([str(i)[-1] for i in range(1_024)]), + "".join([str(i)[-1] for i in range(1_024)]), + ] + }, + dtype="string", + ) + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + dtype={"col": sqlalchemy.types.VARCHAR(length=1_024)}, + ) + df2 = wr.db.read_sql_query(sql=f"SELECT * FROM {schema}.{table}", con=engine) + assert df.equals(df2) From ad22aea48ee721ba48bc2c31beb64de994214373 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:37:12 -0300 Subject: [PATCH 49/59] Insignificant fix in _data_types.py --- awswrangler/_data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 947b058b0..01237ea49 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -372,7 +372,7 @@ def sqlalchemy_types_from_pandas( df: pd.DataFrame, db_type: str, dtype: Optional[Dict[str, VisitableType]] = None ) -> Dict[str, VisitableType]: """Extract the related SQLAlchemy data types from any Pandas DataFrame.""" - casts: Dict[str, VisitableType] = dtype if dtype else {} + casts: Dict[str, VisitableType] = dtype if dtype is not None else {} pa_columns_types: Dict[str, Optional[pa.DataType]] = pyarrow_types_from_pandas( df=df, index=False, ignore_cols=list(casts.keys()) ) From 0e068fe76612cfa4fd72ee88befb26f6dff6dab8 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:38:02 -0300 Subject: [PATCH 50/59] Parquet chunksize now paginating on Pandas instead of PyArrow --- awswrangler/s3.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 770f588a7..c0eb71c3f 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1132,7 +1132,7 @@ def _to_parquet_dataset( schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) - _logger.debug("schema: %s", schema) + _logger.debug("schema: \n%s", schema) if not partition_cols: file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" _to_parquet_file( @@ -1733,24 +1733,32 @@ def _read_parquet_chunked( use_threads: bool = True, ) -> Iterator[pd.DataFrame]: promote: bool = not validate_schema - next_slice: Optional[pa.Table] = None + next_slice: Optional[pd.DataFrame] = None for piece in data.pieces: - table: pa.Table = piece.read( - columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False + df: pd.DataFrame = 
_table2df( + table=piece.read( + columns=columns, + use_threads=use_threads, + partitions=data.partitions, + use_pandas_metadata=False + ), + categories=categories, + use_threads=use_threads ) if chunked is True: - yield _table2df(table=table, categories=categories, use_threads=use_threads) + yield df else: - if next_slice: - table = pa.lib.concat_tables([next_slice, table], promote=promote) - while len(table) >= chunked: - yield _table2df( - table=table.slice(offset=0, length=chunked), categories=categories, use_threads=use_threads - ) - table = table.slice(offset=chunked, length=None) - next_slice = table - if next_slice: - yield _table2df(table=next_slice, categories=categories, use_threads=use_threads) + if next_slice is not None: + df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False) + while len(df.index) >= chunked: + yield df.iloc[:chunked] + df = df.iloc[chunked:] + if df.empty: + next_slice = None + else: + next_slice = df + if next_slice is not None: + yield next_slice def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: From ca133a0248c5977c9e72d6974e5c4a1f681e3ab2 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 28 Apr 2020 23:48:45 -0300 Subject: [PATCH 51/59] Linting --- awswrangler/s3.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index c0eb71c3f..a8512f0b5 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1688,12 +1688,7 @@ def read_parquet( data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema ) return _read_parquet_chunked( - data=data, - columns=columns, - categories=categories, - chunked=chunked, - use_threads=use_threads, - validate_schema=validate_schema, + data=data, columns=columns, categories=categories, chunked=chunked, use_threads=use_threads ) @@ -1728,22 +1723,17 @@ def _read_parquet_chunked( data: pyarrow.parquet.ParquetDataset, columns: Optional[List[str]] = None, categories: List[str] = None, - validate_schema: bool = True, chunked: Union[bool, int] = True, use_threads: bool = True, ) -> Iterator[pd.DataFrame]: - promote: bool = not validate_schema next_slice: Optional[pd.DataFrame] = None for piece in data.pieces: df: pd.DataFrame = _table2df( table=piece.read( - columns=columns, - use_threads=use_threads, - partitions=data.partitions, - use_pandas_metadata=False + columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False ), categories=categories, - use_threads=use_threads + use_threads=use_threads, ) if chunked is True: yield df From e8660cb6e6414dacca6f966b450133367d9de6af Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 2 May 2020 18:03:50 -0300 Subject: [PATCH 52/59] Bumping dependencies versions --- requirements-dev.txt | 4 ++-- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index bfdd15c5e..81b576472 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,13 +5,13 @@ mypy~=0.770 isort~=4.3.21 pydocstyle~=5.0.2 doc8~=0.8.0 -tox~=3.14.6 +tox~=3.15.0 pytest~=5.4.1 pytest-cov~=2.8.1 pytest-xdist~=1.31.0 scikit-learn~=0.22.1 awscli>=1.18.22 -cfn-lint~=0.29.6 +cfn-lint~=0.30.1 cfn-flip~=1.2.3 twine~=3.1.1 wheel~=0.34.2 diff --git a/requirements.txt b/requirements.txt index ec72c05b7..9c1013d22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy~=1.18.1 pandas~=1.0.3 -pyarrow~=0.16.0 
+pyarrow~=0.17.0 boto3>=1.12.22 botocore>=1.15.22 s3fs~=0.4.2 From b484ae1c8bf54efd3122c4add35098c266512e10 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 3 May 2020 18:11:10 -0300 Subject: [PATCH 53/59] Add support for query UUID columns on PostgreSQL and full NULL columns for all databases. --- testing/test_awswrangler/test_db.py | 66 +++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index 65f5ad15b..86a57a74d 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -460,3 +460,69 @@ def test_to_sql_cast(parameters, db_type): ) df2 = wr.db.read_sql_query(sql=f"SELECT * FROM {schema}.{table}", con=engine) assert df.equals(df2) + + +def test_uuid(parameters): + table = "test_uuid" + schema = parameters["postgresql"]["schema"] + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-postgresql") + df = pd.DataFrame( + { + "id": [1, 2, 3], + "uuid": [ + "ec0f0482-8d3b-11ea-8b27-8c859043dd95", + "f56ff7c0-8d3b-11ea-be94-8c859043dd95", + "fa043e90-8d3b-11ea-b7e7-8c859043dd95", + ], + } + ) + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + dtype={"uuid": sqlalchemy.dialects.postgresql.UUID}, + ) + df2 = wr.db.read_sql_table(table=table, schema=schema, con=engine) + df["id"] = df["id"].astype("Int64") + df["uuid"] = df["uuid"].astype("string") + assert df.equals(df2) + + +@pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"]) +def test_null(parameters, db_type): + table = "test_null" + schema = parameters[db_type]["schema"] + engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + df = pd.DataFrame({"id": [1, 2, 3], "nothing": [None, None, None]}) + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="replace", + index=False, + index_label=None, + chunksize=None, + method=None, + dtype={"nothing": sqlalchemy.types.Integer}, + ) + wr.db.to_sql( + df=df, + con=engine, + name=table, + schema=schema, + if_exists="append", + index=False, + index_label=None, + chunksize=None, + method=None, + ) + df2 = wr.db.read_sql_table(table=table, schema=schema, con=engine) + df["id"] = df["id"].astype("Int64") + assert pd.concat(objs=[df, df], ignore_index=True).equals(df2) From 08cf244090583e253d4295d7d1d2aa4d0bbb867e Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 3 May 2020 21:56:43 -0300 Subject: [PATCH 54/59] Add support to write nested types (array and struct). 
--- .gitignore | 2 + awswrangler/_data_types.py | 67 ++++++++++++++++++---- awswrangler/db.py | 5 +- testing/test_awswrangler/test_data_lake.py | 22 +++++++ 4 files changed, 85 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 1947b2e87..8a3474a30 100644 --- a/.gitignore +++ b/.gitignore @@ -138,6 +138,8 @@ testing/*parameters-*.properties testing/*requirements*.txt testing/coverage/* building/*requirements*.txt +building/arrow +building/lambda/arrow /docs/coverage/ /docs/build/ /docs/source/_build/ diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 01237ea49..fac82a37b 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -1,8 +1,9 @@ """Internal (private) Data Types Module.""" import logging +import re from decimal import Decimal -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Match, Optional, Sequence, Tuple import pandas as pd # type: ignore import pyarrow as pa # type: ignore @@ -139,8 +140,10 @@ def pyarrow2athena(dtype: pa.DataType) -> str: # pylint: disable=too-many-branc return f"decimal({dtype.precision},{dtype.scale})" if pa.types.is_list(dtype): return f"array<{pyarrow2athena(dtype=dtype.value_type)}>" - if pa.types.is_struct(dtype): # pragma: no cover - return f"struct<{', '.join([f'{f.name}: {pyarrow2athena(dtype=f.type)}' for f in dtype])}>" + if pa.types.is_struct(dtype): + return f"struct<{', '.join([f'{f.name}:{pyarrow2athena(dtype=f.type)}' for f in dtype])}>" + if pa.types.is_map(dtype): # pragma: no cover + return f"map<{pyarrow2athena(dtype=dtype.key_type)},{pyarrow2athena(dtype=dtype.item_type)}>" if dtype == pa.null(): raise exceptions.UndetectedType("We can not infer the data type from an entire null object column") raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}") # pragma: no cover @@ -167,7 +170,7 @@ def pyarrow2pandas_extension( # pylint: disable=too-many-branches,too-many-retu def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-statements dtype: pa.DataType, db_type: str -) -> VisitableType: +) -> Optional[VisitableType]: """Pyarrow to Athena data types conversion.""" if pa.types.is_int8(dtype): return sqlalchemy.types.SmallInteger @@ -214,7 +217,7 @@ def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-sta if pa.types.is_dictionary(dtype): return pyarrow2sqlalchemy(dtype=dtype.value_type, db_type=db_type) if dtype == pa.null(): # pragma: no cover - raise exceptions.UndetectedType("We can not infer the data type from an entire null object column") + return None raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}") # pragma: no cover @@ -243,12 +246,23 @@ def pyarrow_types_from_pandas( else: cols.append(name) - # Filling cols_dtypes and indexes + # Filling cols_dtypes + for col in cols: + _logger.debug("Inferring PyArrow type from column: %s", col) + try: + schema: pa.Schema = pa.Schema.from_pandas(df=df[[col]], preserve_index=False) + except pa.ArrowInvalid as ex: # pragma: no cover + cols_dtypes[col] = process_not_inferred_dtype(ex) + else: + cols_dtypes[col] = schema.field(col).type + + # Filling indexes indexes: List[str] = [] - for field in pa.Schema.from_pandas(df=df[cols], preserve_index=index): - name = str(field.name) - cols_dtypes[name] = field.type - if (name not in df.columns) and (index is True): + if index is True: + for field in pa.Schema.from_pandas(df=df[[]], preserve_index=True): + name = str(field.name) + _logger.debug("Inferring PyArrow type from index: 
%s", name) + cols_dtypes[name] = field.type indexes.append(name) # Merging Index @@ -261,6 +275,39 @@ def pyarrow_types_from_pandas( return columns_types +def process_not_inferred_dtype(ex: pa.ArrowInvalid) -> pa.DataType: + """Infer data type from PyArrow inference exception.""" + ex_str = str(ex) + _logger.debug("PyArrow was not able to infer data type:\n%s", ex_str) + match: Optional[Match] = re.search( + pattern="Could not convert (.*) with type (.*): did not recognize " + "Python value type when inferring an Arrow data type", + string=ex_str, + ) + if match is None: + raise ex # pragma: no cover + groups: Optional[Sequence[str]] = match.groups() + if groups is None: + raise ex # pragma: no cover + if len(groups) != 2: + raise ex # pragma: no cover + _logger.debug("groups: %s", groups) + type_str: str = groups[1] + if type_str == "UUID": + return pa.string() + raise ex # pragma: no cover + + +def process_not_inferred_array(ex: pa.ArrowInvalid, values: Any) -> pa.Array: + """Infer `pyarrow.array` from PyArrow inference exception.""" + dtype = process_not_inferred_dtype(ex=ex) + if dtype == pa.string(): + array: pa.Array = pa.array(obj=[str(x) for x in values], type=dtype, safe=True) + else: + raise ex # pragma: no cover + return array + + def athena_types_from_pandas( df: pd.DataFrame, index: bool, dtype: Optional[Dict[str, str]] = None, index_left: bool = False ) -> Dict[str, str]: diff --git a/awswrangler/db.py b/awswrangler/db.py index b695bdd17..f5e90c78e 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -185,7 +185,10 @@ def _records2df( arrays: List[pa.Array] = [] for col_values, col_name in zip(tuple(zip(*records)), cols_names): # Transposing if (dtype is None) or (col_name not in dtype): - array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array + try: + array: pa.Array = pa.array(obj=col_values, safe=True) # Creating Arrow array + except pa.ArrowInvalid as ex: + array = _data_types.process_not_inferred_array(ex, values=col_values) # Creating Arrow array else: array = pa.array(obj=col_values, type=dtype[col_name], safe=True) # Creating Arrow array with dtype arrays.append(array) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index b05fb0881..99c1df1c6 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1203,3 +1203,25 @@ def test_athena_encryption( assert len(df2.columns) == 2 wr.catalog.delete_table_if_exists(database=database, table=table) wr.s3.delete_objects(path=paths) + + +def test_athena_nested(bucket, database): + table = "test_athena_nested" + path = f"s3://{bucket}/{table}/" + df = pd.DataFrame( + { + "c0": [[1, 2, 3], [4, 5, 6]], + "c1": [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + "c2": [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]], + "c3": [[], [[[[[[[[1]]]]]]]]], + "c4": [{"a": 1}, {"a": 1}], + "c5": [{"a": {"b": {"c": [1, 2]}}}, {"a": {"b": {"c": [3, 4]}}}], + } + ) + paths = wr.s3.to_parquet( + df=df, path=path, index=False, use_threads=True, dataset=True, mode="overwrite", database=database, table=table + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) + assert len(df2.index) == 2 + assert len(df2.columns) == 4 From 458bf266f684f096cd26a729ef6eb2d3beffc02d Mon Sep 17 00:00:00 2001 From: igorborgest Date: Mon, 4 May 2020 19:13:39 -0300 Subject: [PATCH 55/59] Add keep_files and ctas_temp_table_name to wr.athena.read_*(). 
#203 --- awswrangler/athena.py | 76 +++++++++++++++++----- awswrangler/torch.py | 20 +++--- testing/test_awswrangler/test_data_lake.py | 62 +++++++++++++++++- 3 files changed, 130 insertions(+), 28 deletions(-) diff --git a/awswrangler/athena.py b/awswrangler/athena.py index 76cb0a108..671dabd42 100644 --- a/awswrangler/athena.py +++ b/awswrangler/athena.py @@ -370,7 +370,7 @@ def _fix_csv_types(df: pd.DataFrame, parse_dates: List[str], binaries: List[str] return df -def read_sql_query( # pylint: disable=too-many-branches,too-many-locals +def read_sql_query( # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements sql: str, database: str, ctas_approach: bool = True, @@ -380,6 +380,8 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, + keep_files: bool = True, + ctas_temp_table_name: Optional[str] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: @@ -454,6 +456,12 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported. kms_key : str, optional For SSE-KMS, this is the KMS key ARN or ID. + keep_files : bool + Should Wrangler delete or keep the staging files produced by Athena? + ctas_temp_table_name : str, optional + The name of the temporary table and also the directory name on S3 where the CTAS result is stored. + If None, it will use the following random pattern: `f"temp_table_{pyarrow.compat.guid()}"`. + On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads.
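To make the two new arguments concrete, a minimal usage sketch (the database, table, bucket, and temporary table names below are placeholders, not values taken from this patch):

    import awswrangler as wr

    df = wr.athena.read_sql_query(
        sql="SELECT * FROM my_table",  # placeholder query
        database="my_database",  # placeholder Glue database
        ctas_approach=True,
        s3_output="s3://my-bucket/athena-results/",  # placeholder staging prefix
        ctas_temp_table_name="my_temp_ctas_table",  # deterministic temp table/directory name
        keep_files=False,  # staging files are removed after the DataFrame is built
    )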
@@ -477,7 +485,10 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output name: str = "" if ctas_approach is True: - name = f"temp_table_{pa.compat.guid()}" + if ctas_temp_table_name is not None: + name = catalog.sanitize_table_name(ctas_temp_table_name) + else: + name = f"temp_table_{pa.compat.guid()}" path: str = f"{_s3_output}/{name}" ext_location: str = "\n" if wg_config["enforced"] is True else f",\n external_location = '{path}'\n" sql = ( @@ -506,25 +517,34 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"] message_error: str = f"Query error: {reason}" raise exceptions.AthenaQueryError(message_error) - dfs: Union[pd.DataFrame, Iterator[pd.DataFrame]] + ret: Union[pd.DataFrame, Iterator[pd.DataFrame]] if ctas_approach is True: catalog.delete_table_if_exists(database=database, table=name, boto3_session=session) manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" + metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata" _logger.debug("manifest_path: %s", manifest_path) + _logger.debug("metadata_path: %s", metadata_path) + s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session) paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug("chunked: %s", chunked) if not paths: if chunked is False: - dfs = pd.DataFrame() - else: - dfs = _utils.empty_generator() - else: - s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) - dfs = s3.read_parquet( - path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories - ) - return dfs + return pd.DataFrame() + return _utils.empty_generator() + s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) + ret = s3.read_parquet( + path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories + ) + paths_delete: List[str] = paths + [manifest_path, metadata_path] + _logger.debug(type(ret)) + if chunked is False: + if keep_files is False: + s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session) + return ret + if keep_files is False: + return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session) + return ret dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata( query_execution_id=query_id, categories=categories, boto3_session=session ) @@ -547,10 +567,26 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals boto3_session=session, ) _logger.debug("Start type casting...") - if chunksize is None: - return _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries) _logger.debug(type(ret)) - return _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries) + if chunksize is None: + df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries) + if keep_files is False: + s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session) + return df + dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries) + if keep_files is False: + return _delete_after_iterate( + dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session + ) + 
return dfs + + +def _delete_after_iterate( + dfs: Iterator[pd.DataFrame], paths: List[str], use_threads: bool, boto3_session: boto3.Session +) -> Iterator[pd.DataFrame]: + for df in dfs: + yield df + s3.delete_objects(path=paths, use_threads=use_threads, boto3_session=boto3_session) def stop_query_execution(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> None: @@ -638,6 +674,8 @@ def read_sql_table( workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, + keep_files: bool = True, + ctas_temp_table_name: Optional[str] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: @@ -712,6 +750,12 @@ def read_sql_table( None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'. kms_key : str, optional For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID. + keep_files : bool + Should Wrangler delete or keep the staging files produced by Athena? + ctas_temp_table_name : str, optional + The name of the temporary table and also the directory name on S3 where the CTAS result is stored. + If None, it will use the following random pattern: `f"temp_table_{pyarrow.compat.guid()}"`. + On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. @@ -740,6 +784,8 @@ def read_sql_table( workgroup=workgroup, encryption=encryption, kms_key=kms_key, + keep_files=keep_files, + ctas_temp_table_name=ctas_temp_table_name, use_threads=use_threads, boto3_session=boto3_session, ) diff --git a/awswrangler/torch.py b/awswrangler/torch.py index 7d3c47316..70df93f34 100644 --- a/awswrangler/torch.py +++ b/awswrangler/torch.py @@ -28,14 +28,14 @@ class _BaseS3Dataset: def __init__( self, path: Union[str, List[str]], suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None ): - """PyTorch Map-Style S3 Dataset. + r"""PyTorch Map-Style S3 Dataset. Parameters ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -160,7 +160,7 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch Amazon S3 Lambda Dataset. + r"""PyTorch Amazon S3 Lambda Dataset. Parameters ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). label_fn: Callable Function that receives object path (str) and return a torch.Tensor suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -212,7 +212,7 @@ def __init__( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ): - """PyTorch Amazon S3 Audio Dataset. + r"""PyTorch Amazon S3 Audio Dataset. Read individual WAV audio files stores in Amazon S3 and return them as torch tensors. @@ -237,7 +237,7 @@ def __init__( path : Union[str, List[str]] S3 prefix (e.g.
s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -302,7 +302,7 @@ class ImageS3Dataset(_S3PartitionedDataset): """PyTorch Amazon S3 Image Dataset.""" def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto3.Session): - """PyTorch Amazon S3 Image Dataset. + r"""PyTorch Amazon S3 Image Dataset. ImageS3Dataset assumes images are patitioned (within class= folders) in Amazon S3. Each lisited object will be loaded by default Pillow library. @@ -327,7 +327,7 @@ def __init__(self, path: Union[str, List[str]], suffix: str, boto3_session: boto path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -350,14 +350,14 @@ def _data_fn(self, data: io.BytesIO) -> Any: class S3IterableDataset(IterableDataset, _BaseS3Dataset): # pylint: disable=abstract-method - """PyTorch Amazon S3 Iterable Dataset. + r"""PyTorch Amazon S3 Iterable Dataset. Parameters ---------- path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). suffix: str, optional - S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://*.png). + S3 suffix filtering of object keys (i.e. suffix=".png" -> s3://\*.png). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
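As a concrete illustration of the suffix filtering described in these docstrings, a small sketch in the spirit of the tests later in this series; the bucket, prefix, and batch size are placeholders:

    import boto3
    from torch.utils.data import DataLoader

    import awswrangler as wr

    # Stream serialized tensor files stored under a prefix, keeping only keys ending with ".pt".
    ds = wr.torch.S3IterableDataset(
        path="s3://my-bucket/tensors/",  # placeholder prefix
        suffix=".pt",
        boto3_session=boto3.Session(),
    )
    for batch in DataLoader(ds, batch_size=32):  # placeholder batch size
        pass  # train/evaluate on each batch here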
diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 99c1df1c6..e9c9834df 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -191,14 +191,51 @@ def test_athena_ctas(bucket, database, kms_key): encryption="SSE_KMS", kms_key=kms_key, s3_output=f"s3://{bucket}/test_athena_ctas_result", + keep_files=False, ) assert len(df.index) == 3 ensure_data_types(df=df, has_list=True) + temp_table = "test_athena_ctas2" + s3_output = f"s3://{bucket}/s3_output/" + final_destination = f"{s3_output}{temp_table}/" + + # keep_files=False + wr.s3.delete_objects(path=s3_output) dfs = wr.athena.read_sql_query( - sql=f"SELECT * FROM test_athena_ctas", database=database, ctas_approach=True, chunksize=1 + sql=f"SELECT * FROM test_athena_ctas", + database=database, + ctas_approach=True, + chunksize=1, + keep_files=False, + ctas_temp_table_name=temp_table, + s3_output=s3_output, ) + assert wr.catalog.does_table_exist(database=database, table=temp_table) is False + assert len(wr.s3.list_objects(path=s3_output)) > 2 + assert len(wr.s3.list_objects(path=final_destination)) > 0 for df in dfs: ensure_data_types(df=df, has_list=True) + assert len(wr.s3.list_objects(path=s3_output)) == 0 + + # keep_files=True + wr.s3.delete_objects(path=s3_output) + dfs = wr.athena.read_sql_query( + sql=f"SELECT * FROM test_athena_ctas", + database=database, + ctas_approach=True, + chunksize=2, + keep_files=True, + ctas_temp_table_name=temp_table, + s3_output=s3_output, + ) + assert wr.catalog.does_table_exist(database=database, table=temp_table) is False + assert len(wr.s3.list_objects(path=s3_output)) > 2 + assert len(wr.s3.list_objects(path=final_destination)) > 0 + for df in dfs: + ensure_data_types(df=df, has_list=True) + assert len(wr.s3.list_objects(path=s3_output)) > 2 + + # Cleaning Up wr.catalog.delete_table_if_exists(database=database, table="test_athena_ctas") wr.s3.delete_objects(path=paths) wr.s3.wait_objects_not_exist(paths=paths) @@ -227,12 +264,17 @@ def test_athena(bucket, database, kms_key, workgroup0, workgroup1): encryption="SSE_KMS", kms_key=kms_key, workgroup=workgroup0, + keep_files=False, ) for df2 in dfs: print(df2) ensure_data_types(df=df2) df = wr.athena.read_sql_query( - sql="SELECT * FROM __test_athena", database=database, ctas_approach=False, workgroup=workgroup1 + sql="SELECT * FROM __test_athena", + database=database, + ctas_approach=False, + workgroup=workgroup1, + keep_files=False, ) assert len(df.index) == 3 ensure_data_types(df=df) @@ -1195,9 +1237,23 @@ def test_athena_encryption( df=df, path=path, dataset=True, mode="overwrite", database=database, table=table, s3_additional_kwargs=None )["paths"] wr.s3.wait_objects_exist(paths=paths, use_threads=False) + temp_table = table + "2" + s3_output = f"s3://{bucket}/encryptio_s3_output/" + final_destination = f"{s3_output}{temp_table}/" + wr.s3.delete_objects(path=final_destination) df2 = wr.athena.read_sql_table( - table=table, ctas_approach=True, database=database, encryption=encryption, workgroup=workgroup, kms_key=kms_key + table=table, + ctas_approach=True, + database=database, + encryption=encryption, + workgroup=workgroup, + kms_key=kms_key, + keep_files=True, + ctas_temp_table_name=temp_table, + s3_output=s3_output, ) + assert wr.catalog.does_table_exist(database=database, table=temp_table) is False + assert len(wr.s3.list_objects(path=s3_output)) > 2 print(df2) assert len(df2.index) == 2 assert len(df2.columns) == 2 From 
fe6f50bb41ac42d8ce02ce39c241f0f7c90433c8 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 12:55:34 -0300 Subject: [PATCH 56/59] Removing delete_table operations from catalog._create_table() and add catalog_versioning arg. #198 --- awswrangler/catalog.py | 27 ++++++--- awswrangler/s3.py | 12 ++++ testing/test_awswrangler/test_data_lake.py | 67 ++++++++++++++++++++++ 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 93092626b..9ef55066c 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -93,6 +93,7 @@ def create_parquet_table( parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, mode: str = "overwrite", + catalog_versioning: bool = False, boto3_session: Optional[boto3.Session] = None, ) -> None: """Create a Parquet Table (Metadata Only) in the AWS Glue Catalog. @@ -121,6 +122,8 @@ def create_parquet_table( Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). mode: str 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -157,6 +160,7 @@ def create_parquet_table( parameters=parameters, columns_comments=columns_comments, mode=mode, + catalog_versioning=catalog_versioning, boto3_session=boto3_session, table_input=table_input, ) @@ -865,6 +869,7 @@ def create_csv_table( parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, mode: str = "overwrite", + catalog_versioning: bool = False, sep: str = ",", boto3_session: Optional[boto3.Session] = None, ) -> None: @@ -884,16 +889,18 @@ def create_csv_table( Dictionary with keys as column names and vales as data types (e.g. {'col0': 'bigint', 'col1': 'double'}). partitions_types: Dict[str, str], optional Dictionary with keys as partition names and values as data types (e.g. {'col2': 'date'}). - compression: str, optional + compression : str, optional Compression style (``None``, ``gzip``, etc). - description: str, optional + description : str, optional Table description - parameters: Dict[str, str], optional + parameters : Dict[str, str], optional Key/value pairs to tag the table. columns_comments: Dict[str, str], optional Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). - mode: str + mode : str 'overwrite' to recreate any possible axisting table or 'append' to keep any possible axisting table. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. sep : str String of length 1. Field delimiter for the output file. 
boto3_session : boto3.Session(), optional @@ -937,6 +944,7 @@ def create_csv_table( parameters=parameters, columns_comments=columns_comments, mode=mode, + catalog_versioning=catalog_versioning, boto3_session=boto3_session, table_input=table_input, ) @@ -949,6 +957,7 @@ def _create_table( parameters: Optional[Dict[str, str]], columns_comments: Optional[Dict[str, str]], mode: str, + catalog_versioning: bool, boto3_session: Optional[boto3.Session], table_input: Dict[str, Any], ): @@ -967,10 +976,14 @@ def _create_table( if name in columns_comments: par["Comment"] = columns_comments[name] session: boto3.Session = _utils.ensure_session(session=boto3_session) + client_glue: boto3.client = _utils.client(service_name="glue", session=session) exist: bool = does_table_exist(database=database, table=table, boto3_session=session) - if (mode == "overwrite") or (exist is False): - delete_table_if_exists(database=database, table=table, boto3_session=session) - client_glue: boto3.client = _utils.client(service_name="glue", session=session) + if mode not in ("overwrite", "append"): # pragma: no cover + raise exceptions.InvalidArgument(f"{mode} is not a valid mode. It must be 'overwrite' or 'append'.") + if (exist is True) and (mode == "overwrite"): + skip_archive: bool = not catalog_versioning + client_glue.update_table(DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive) + elif exist is False: client_glue.create_table(DatabaseName=database, TableInput=table_input) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index a8512f0b5..31c7b2ea6 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -433,6 +433,7 @@ def to_csv( # pylint: disable=too-many-arguments dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, + catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, @@ -483,6 +484,8 @@ def to_csv( # pylint: disable=too-many-arguments List of column names that will be used to create partitions. Only takes effect if dataset=True. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. table : str, optional @@ -677,6 +680,7 @@ def to_csv( # pylint: disable=too-many-arguments columns_comments=columns_comments, boto3_session=session, mode="overwrite", + catalog_versioning=catalog_versioning, sep=sep, ) if partitions_values: @@ -846,6 +850,7 @@ def to_parquet( # pylint: disable=too-many-arguments dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, + catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, @@ -893,6 +898,8 @@ def to_parquet( # pylint: disable=too-many-arguments List of column names that will be used to create partitions. Only takes effect if dataset=True. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. 
table : str, optional @@ -1092,6 +1099,7 @@ def to_parquet( # pylint: disable=too-many-arguments columns_comments=columns_comments, boto3_session=session, mode="overwrite", + catalog_versioning=catalog_versioning, ) if partitions_values: _logger.debug("partitions_values:\n%s", partitions_values) @@ -1838,6 +1846,7 @@ def store_parquet_metadata( columns_comments: Optional[Dict[str, str]] = None, compression: Optional[str] = None, mode: str = "overwrite", + catalog_versioning: bool = False, boto3_session: Optional[boto3.Session] = None, ) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: """Infer and store parquet metadata on AWS Glue Catalog. @@ -1879,6 +1888,8 @@ def store_parquet_metadata( Compression style (``None``, ``snappy``, ``gzip``, etc). mode: str 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -1924,6 +1935,7 @@ def store_parquet_metadata( parameters=parameters, columns_comments=columns_comments, mode=mode, + catalog_versioning=catalog_versioning, boto3_session=session, ) partitions_values: Dict[str, List[str]] = _data_types.athena_partitions_from_pyarrow_partitions( diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index e9c9834df..77bd5310e 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1281,3 +1281,70 @@ def test_athena_nested(bucket, database): df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) assert len(df2.index) == 2 assert len(df2.columns) == 4 + + +def test_catalog_versioning(bucket, database): + table = "test_catalog_versioning" + wr.catalog.delete_table_if_exists(database=database, table=table) + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + + # Version 0 + df = pd.DataFrame({"c0": [1, 2]}) + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c0.dtype).startswith("Int") + + # Version 1 + df = pd.DataFrame({"c1": ["foo", "boo"]}) + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", catalog_versioning=True + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype) == "string" + + # Version 2 + df = pd.DataFrame({"c1": [1.0, 2.0]}) + paths = wr.s3.to_csv( + df=df, + path=path, + dataset=True, + database=database, + table=table, + mode="overwrite", + catalog_versioning=True, + index=False, + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype).startswith("float") + + # Version 3 (removing version 2) + df = pd.DataFrame({"c1": [True, False]}) + paths = wr.s3.to_csv( + df=df, + path=path, + dataset=True, + 
database=database, + table=table, + mode="overwrite", + catalog_versioning=False, + index=False, + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype).startswith("boolean") + + # Cleaning Up + wr.catalog.delete_table_if_exists(database=database, table=table) + wr.s3.delete_objects(path=path) From a6ba86c170b3a43a12a8fe191778bf4d3871e441 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 13:46:56 -0300 Subject: [PATCH 57/59] add replace_filenames argument to wr.s3.copy_objects() #215 --- awswrangler/s3.py | 10 ++++++++++ testing/test_awswrangler/test_data_lake.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 31c7b2ea6..f4d1e5746 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -2284,6 +2284,7 @@ def copy_objects( paths: List[str], source_path: str, target_path: str, + replace_filenames: Optional[Dict[str, str]] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, ) -> List[str]: @@ -2334,6 +2335,15 @@ def copy_objects( for path in paths: path_wo_prefix: str = path.replace(f"{source_path}/", "") path_final: str = f"{target_path}/{path_wo_prefix}" + if replace_filenames is not None: + parts: List[str] = path_final.rsplit(sep="/", maxsplit=1) + if len(parts) == 2: + path_wo_filename: str = parts[0] + filename: str = parts[1] + if filename in replace_filenames: + new_filename: str = replace_filenames[filename] + _logger.debug("Replacing filename: %s -> %s", filename, new_filename) + path_final = f"{path_wo_filename}/{new_filename}" new_objects.append(path_final) batch.append((path, path_final)) _logger.debug("len(new_objects): %s", len(new_objects)) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 77bd5310e..f95c691fb 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1348,3 +1348,22 @@ def test_catalog_versioning(bucket, database): # Cleaning Up wr.catalog.delete_table_if_exists(database=database, table=table) wr.s3.delete_objects(path=path) + + +def test_copy_replacing_filename(bucket): + path = f"s3://{bucket}/test_copy_replacing_filename/" + wr.s3.delete_objects(path=path) + df = pd.DataFrame({"c0": [1, 2]}) + file_path = f"{path}myfile.parquet" + wr.s3.to_parquet(df=df, path=file_path) + wr.s3.wait_objects_exist(paths=[file_path], use_threads=False) + path2 = f"s3://{bucket}/test_copy_replacing_filename2/" + wr.s3.copy_objects( + paths=[file_path], source_path=path, target_path=path2, replace_filenames={"myfile.parquet": "myfile2.parquet"} + ) + expected_file = f"{path2}myfile2.parquet" + wr.s3.wait_objects_exist(paths=[expected_file], use_threads=False) + objs = wr.s3.list_objects(path=path2) + assert objs[0] == expected_file + wr.s3.delete_objects(path=path) + wr.s3.delete_objects(path=path2) From 5be05d3ce7342dbdfc96d5fa125f8f78235773fb Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 13:48:56 -0300 Subject: [PATCH 58/59] Update README --- README.md | 2 +- docs/source/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ccb5dc669..424808bf8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ **NOTE** -We just released a new major version `1.0` with breaking changes. 
Please make sure that all your old projects has dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). +Due to the new major version `1.*.*` with breaking changes, please make sure that all your old projects have dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). --- diff --git a/docs/source/index.rst b/docs/source/index.rst index 2528a6032..0a9059392 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,4 +1,4 @@ -.. note:: We just released a new major version `1.0` with breaking changes. Please make sure that all your old projects has dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). +.. note:: Due to the new major version `1.*.*` with breaking changes, please make sure that all your old projects have dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`). Quick Start ----------- From 12d0f66faa67ac305b17a42317015a1783f578fc Mon Sep 17 00:00:00 2001 From: igorborgest Date: Tue, 5 May 2020 13:55:26 -0300 Subject: [PATCH 59/59] Updating requirements --- requirements-dev.txt | 8 ++++---- requirements-torch.txt | 2 +- requirements.txt | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 81b576472..e6e788815 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ black~=19.3b0 -pylint~=2.5.0 +pylint~=2.5.2 flake8~=3.7.9 mypy~=0.770 isort~=4.3.21 @@ -8,9 +8,9 @@ doc8~=0.8.0 tox~=3.15.0 pytest~=5.4.1 pytest-cov~=2.8.1 -pytest-xdist~=1.31.0 +pytest-xdist~=1.32.0 scikit-learn~=0.22.1 -awscli>=1.18.22 +awscli>=1.18.0 cfn-lint~=0.30.1 cfn-flip~=1.2.3 twine~=3.1.1 @@ -18,4 +18,4 @@ wheel~=0.34.2 sphinx~=3.0.3 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 -jupyterlab~=2.1.1 \ No newline at end of file +jupyterlab~=2.1.2 \ No newline at end of file diff --git a/requirements-torch.txt b/requirements-torch.txt index d3e36447e..20f8cdba9 100644 --- a/requirements-torch.txt +++ b/requirements-torch.txt @@ -1,4 +1,4 @@ torch~=1.5.0 torchvision~=0.6.0 torchaudio~=0.5.0 -Pillow~=7.1.2 +Pillow~=7.1.0 diff --git a/requirements.txt b/requirements.txt index 9c1013d22..c6ff840d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -numpy~=1.18.1 -pandas~=1.0.3 +boto3>=1.12.0 +botocore>=1.15.0 +numpy~=1.18.0 +pandas~=1.0.0 pyarrow~=0.17.0 -boto3>=1.12.22 -botocore>=1.15.22 -s3fs~=0.4.2 -psycopg2-binary~=2.8.5 -pymysql~=0.9.3 -SQLAlchemy==1.3.13 -sqlalchemy-redshift~=0.7.7 \ No newline at end of file +s3fs~=0.4.0 +psycopg2-binary~=2.8.0 +pymysql~=0.9.0 +sqlalchemy-redshift~=0.7.0 +SQLAlchemy==1.3.13 \ No newline at end of file
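Closing the series, a short sketch of the catalog_versioning flag introduced in PATCH 56/59 above, mirroring the test_catalog_versioning test; the bucket, database, and table names are placeholders:

    import pandas as pd

    import awswrangler as wr

    # First write creates the Glue table.
    wr.s3.to_parquet(
        df=pd.DataFrame({"c0": [1, 2]}),
        path="s3://my-bucket/my_table/",  # placeholder path
        dataset=True,
        database="my_database",  # placeholder Glue database
        table="my_table",  # placeholder table name
        mode="overwrite",
    )
    # Overwriting with a different schema archives the previous catalog version
    # instead of discarding it.
    wr.s3.to_parquet(
        df=pd.DataFrame({"c1": ["foo", "boo"]}),
        path="s3://my-bucket/my_table/",
        dataset=True,
        database="my_database",
        table="my_table",
        mode="overwrite",
        catalog_versioning=True,
    )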