From 55b122654d1c1fb1f285ff84bae1ccf39b0bc498 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Wed, 8 Jan 2020 08:47:11 -0300 Subject: [PATCH 1/2] Support for empty dataframe for Pandas.read_sql_athena(ctas_approach=True) --- awswrangler/pandas.py | 2 ++ testing/test_awswrangler/test_pandas.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/awswrangler/pandas.py b/awswrangler/pandas.py index 5166da2a1..1df97c0f1 100644 --- a/awswrangler/pandas.py +++ b/awswrangler/pandas.py @@ -582,6 +582,8 @@ def _read_sql_athena_ctas(self, self._session.glue.delete_table_if_exists(database=database, table=name) manifest_path: str = f"{s3_output}/tables/{query_id}-manifest.csv" paths: List[str] = self._session.athena.extract_manifest_paths(path=manifest_path) + if not paths: + return pd.DataFrame() logger.debug(f"paths: {paths}") return self.read_parquet(path=paths, procs_cpu_bound=procs_cpu_bound, diff --git a/testing/test_awswrangler/test_pandas.py b/testing/test_awswrangler/test_pandas.py index c010ac295..1c9864d31 100644 --- a/testing/test_awswrangler/test_pandas.py +++ b/testing/test_awswrangler/test_pandas.py @@ -2020,3 +2020,17 @@ def test_read_csv_prefix_iterator(bucket, sample, row_num): total_count += count wr.s3.delete_listed_objects(objects_paths=paths) assert total_count == row_num * n + + +@pytest.mark.parametrize("ctas_approach", [False, True]) +def test_read_sql_athena_empty(ctas_approach): + sql = """ + WITH dataset AS ( + SELECT 0 AS id + ) + SELECT id + FROM dataset + WHERE id != 0 + """ + df = wr.pandas.read_sql_athena(sql=sql, ctas_approach=ctas_approach) + print(df) From 220015a231301adbfcc67a78be4b44e636677b54 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Wed, 8 Jan 2020 08:58:06 -0300 Subject: [PATCH 2/2] Cleaning temp S3 files for Pandas.read_sql_athena(ctas_approach=True) --- awswrangler/pandas.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/awswrangler/pandas.py b/awswrangler/pandas.py index 1df97c0f1..a6fa2622f 100644 --- a/awswrangler/pandas.py +++ b/awswrangler/pandas.py @@ -582,13 +582,16 @@ def _read_sql_athena_ctas(self, self._session.glue.delete_table_if_exists(database=database, table=name) manifest_path: str = f"{s3_output}/tables/{query_id}-manifest.csv" paths: List[str] = self._session.athena.extract_manifest_paths(path=manifest_path) - if not paths: - return pd.DataFrame() logger.debug(f"paths: {paths}") - return self.read_parquet(path=paths, - procs_cpu_bound=procs_cpu_bound, - wait_objects=True, - wait_objects_timeout=15.0) + if not paths: + df: pd.DataFrame = pd.DataFrame() + else: + df = self.read_parquet(path=paths, + procs_cpu_bound=procs_cpu_bound, + wait_objects=True, + wait_objects_timeout=15.0) + self._session.s3.delete_listed_objects(objects_paths=[manifest_path] + paths) + return df def _read_sql_athena_regular(self, sql: str,