From 4b03fd01002e6bc486715a0ae693dfaa8fcdcd58 Mon Sep 17 00:00:00 2001 From: kukushking <3997468+kukushking@users.noreply.github.com> Date: Wed, 30 Jun 2021 15:30:09 +0100 Subject: [PATCH 1/2] Add dataset headers in wr.s3.to_csv --- awswrangler/s3/_write_text.py | 4 ++-- tests/test_s3_text.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/awswrangler/s3/_write_text.py b/awswrangler/s3/_write_text.py index 6061798b5..37e2db248 100644 --- a/awswrangler/s3/_write_text.py +++ b/awswrangler/s3/_write_text.py @@ -456,7 +456,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state if database and table: quoting: Optional[int] = csv.QUOTE_NONE escapechar: Optional[str] = "\\" - header: Union[bool, List[str]] = False + header: Union[bool, List[str]] = pandas_kwargs.get("header", False) date_format: Optional[str] = "%Y-%m-%d %H:%M:%S.%f" pd_kwargs: Dict[str, Any] = {} compression: Optional[str] = pandas_kwargs.get("compression", None) @@ -529,7 +529,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), - skip_header_line_count=None, + skip_header_line_count=True if header else None, serde_library=serde_library, serde_parameters=serde_parameters, ) diff --git a/tests/test_s3_text.py b/tests/test_s3_text.py index 185d48d0d..370ce2f63 100644 --- a/tests/test_s3_text.py +++ b/tests/test_s3_text.py @@ -119,6 +119,17 @@ def test_csv(path): wr.s3.read_csv(path=paths, iterator=True) +@pytest.mark.parametrize("header", [True, ["identifier"]]) +def test_csv_dataset_header(path, header): + df0 = pd.DataFrame({"id": [1, 2, 3]}) + path0 = f"{path}test_csv_dataset0.csv" + wr.s3.to_csv(df=df0, path=path0, dataset=True, index=False, header=header) + df1 = wr.s3.read_csv(path=path0) + if isinstance(header, list): + df0.columns = header + assert df0.equals(df1) + + def test_json(path): df0 = pd.DataFrame({"id": [1, 2, 3]}) path0 = f"{path}test_json0.json" From c0424971788a0c986be59b6c2c80911d9f06b223 Mon Sep 17 00:00:00 2001 From: kukushking <3997468+kukushking@users.noreply.github.com> Date: Thu, 1 Jul 2021 11:46:53 +0100 Subject: [PATCH 2/2] Add test case --- tests/test_s3_text.py | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/tests/test_s3_text.py b/tests/test_s3_text.py index 370ce2f63..e730caa93 100644 --- a/tests/test_s3_text.py +++ b/tests/test_s3_text.py @@ -120,16 +120,51 @@ def test_csv(path): @pytest.mark.parametrize("header", [True, ["identifier"]]) -def test_csv_dataset_header(path, header): - df0 = pd.DataFrame({"id": [1, 2, 3]}) +def test_csv_dataset_header(path, header, glue_database, glue_table): path0 = f"{path}test_csv_dataset0.csv" - wr.s3.to_csv(df=df0, path=path0, dataset=True, index=False, header=header) + df0 = pd.DataFrame({"id": [1, 2, 3]}) + wr.s3.to_csv( + df=df0, + path=path0, + dataset=True, + database=glue_database, + table=glue_table, + index=False, + header=header, + ) df1 = wr.s3.read_csv(path=path0) if isinstance(header, list): df0.columns = header assert df0.equals(df1) +@pytest.mark.parametrize("mode", ["append", "overwrite"]) +def test_csv_dataset_header_modes(path, mode, glue_database, glue_table): + path0 = f"{path}test_csv_dataset0.csv" + dfs = [ + pd.DataFrame({"id": [1, 2, 3]}), + pd.DataFrame({"id": [4, 5, 6]}), + ] + for df in dfs: + wr.s3.to_csv( + df=df, + path=path0, + dataset=True, + database=glue_database, + table=glue_table, + mode=mode, + index=False, + header=True, + ) + dfs_conc = pd.concat(dfs) + df_res = wr.s3.read_csv(path=path0) + + if mode == "append": + assert len(df_res) == len(dfs_conc) + else: + assert df_res.equals(dfs[-1]) + + def test_json(path): df0 = pd.DataFrame({"id": [1, 2, 3]}) path0 = f"{path}test_json0.json"