Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions awswrangler/s3/_write_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state
if database and table:
quoting: Optional[int] = csv.QUOTE_NONE
escapechar: Optional[str] = "\\"
header: Union[bool, List[str]] = False
header: Union[bool, List[str]] = pandas_kwargs.get("header", False)
date_format: Optional[str] = "%Y-%m-%d %H:%M:%S.%f"
pd_kwargs: Dict[str, Any] = {}
compression: Optional[str] = pandas_kwargs.get("compression", None)
Expand Down Expand Up @@ -529,7 +529,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state
catalog_table_input=catalog_table_input,
catalog_id=catalog_id,
compression=pandas_kwargs.get("compression"),
skip_header_line_count=None,
skip_header_line_count=True if header else None,
serde_library=serde_library,
serde_parameters=serde_parameters,
)
Expand Down
46 changes: 46 additions & 0 deletions tests/test_s3_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,52 @@ def test_csv(path):
wr.s3.read_csv(path=paths, iterator=True)


@pytest.mark.parametrize("header", [True, ["identifier"]])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is that the code change you made only applies when a database and a table are specified as parameters in the call, yet the `to_csv` call in this test does not reference a Glue table. Should we test that instead? Also, can we check that, if we make more than one `to_csv` call to the same Glue table and read the data back, there isn't a duplicate header line in the resulting DataFrame?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, I completely forgot about that in the test. I changed the test case and added a new one that tests multiple write modes.

def test_csv_dataset_header(path, header, glue_database, glue_table):
    """Check that a CSV dataset written with a header reads back unchanged.

    A single-column frame is written with ``dataset=True`` against the
    database/table provided by the ``glue_database`` and ``glue_table``
    fixtures, using the parametrized ``header`` value.  The object is then
    read back with ``wr.s3.read_csv`` and compared with the source frame.

    When ``header`` is a list, the source frame's columns are replaced by
    that list before the comparison, since those are the names that were
    written to the file.
    """
    target = f"{path}test_csv_dataset0.csv"
    expected = pd.DataFrame({"id": [1, 2, 3]})

    write_options = {
        "df": expected,
        "path": target,
        "dataset": True,
        "database": glue_database,
        "table": glue_table,
        "index": False,
        "header": header,
    }
    wr.s3.to_csv(**write_options)

    actual = wr.s3.read_csv(path=target)

    # A list-valued header renames the columns on write, so mirror that
    # renaming on the expected frame before comparing.
    if isinstance(header, list):
        expected.columns = header

    assert expected.equals(actual)


@pytest.mark.parametrize("mode", ["append", "overwrite"])
def test_csv_dataset_header_modes(path, mode, glue_database, glue_table):
    """Write two frames with ``header=True`` to one dataset and read it back.

    Both frames are written to the same path and the same database/table
    (from the ``glue_database`` / ``glue_table`` fixtures) using the
    parametrized ``mode``.

    * ``append``: the result must have as many rows as both frames combined.
    * otherwise (``overwrite``): the result must equal the last frame written.
    """
    target = f"{path}test_csv_dataset0.csv"
    first_batch = pd.DataFrame({"id": [1, 2, 3]})
    second_batch = pd.DataFrame({"id": [4, 5, 6]})
    batches = [first_batch, second_batch]

    for batch in batches:
        wr.s3.to_csv(
            df=batch,
            path=target,
            dataset=True,
            database=glue_database,
            table=glue_table,
            mode=mode,
            index=False,
            header=True,
        )

    combined = pd.concat(batches)
    result = wr.s3.read_csv(path=target)

    if mode != "append":
        # Overwrite keeps only the most recent write.
        assert result.equals(batches[-1])
    else:
        # Only the row count is compared for appended data.
        assert len(result) == len(combined)


def test_json(path):
df0 = pd.DataFrame({"id": [1, 2, 3]})
path0 = f"{path}test_json0.json"
Expand Down