](https://arrow.apache.org/powered_by/)
## Table of contents
@@ -121,10 +123,13 @@ Knowing which companies are using this library is important to help prioritize t
Please send a PR with your company name and @githubhandle if you may.
-1. [Digio](https://www.digio.com.br/) [[@afonsomy](https://github.com/afonsomy)]
-2. [Pier](https://www.pier.digital/) [[@flaviomax](https://github.com/flaviomax)]
-3. [M4U](https://www.m4u.com.br/) [[@Thiago-Dantas](https://github.com/Thiago-Dantas)]
-4. [Serasa Experian](https://www.serasaexperian.com.br/) [[@andre-marcos-perez](https://github.com/andre-marcos-perez)]
-5. [LINE TV](https://www.linetv.tw/) [[@bryanyang0528](https://github.com/bryanyang0528)]
-6. [OKRA Technologies](https://okra.ai) [[@JPFrancoia](https://github.com/JPFrancoia), [@schot](https://github.com/schot)]
-7. [DNX](https://www.dnx.solutions/) [[@DNXLabs](https://github.com/DNXLabs)]
+* [Amazon](https://www.amazon.com/)
+* [AWS](https://aws.amazon.com/)
+* [Cepsa](https://cepsa.com) [[@alvaropc](https://github.com/alvaropc)]
+* [Digio](https://www.digio.com.br/) [[@afonsomy](https://github.com/afonsomy)]
+* [DNX](https://www.dnx.solutions/) [[@DNXLabs](https://github.com/DNXLabs)]
+* [LINE TV](https://www.linetv.tw/) [[@bryanyang0528](https://github.com/bryanyang0528)]
+* [M4U](https://www.m4u.com.br/) [[@Thiago-Dantas](https://github.com/Thiago-Dantas)]
+* [OKRA Technologies](https://okra.ai) [[@JPFrancoia](https://github.com/JPFrancoia), [@schot](https://github.com/schot)]
+* [Pier](https://www.pier.digital/) [[@flaviomax](https://github.com/flaviomax)]
+* [Serasa Experian](https://www.serasaexperian.com.br/) [[@andre-marcos-perez](https://github.com/andre-marcos-perez)]
\ No newline at end of file
diff --git a/THIRD_PARTY.txt b/THIRD_PARTY.txt
index 978276f04..1108c912a 100644
--- a/THIRD_PARTY.txt
+++ b/THIRD_PARTY.txt
@@ -296,9 +296,6 @@ Copyright 2013-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
** pandas; version 1.1.0 -- https://pandas.pydata.org/
Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.
-** s3fs; version 4.2.0 -- https://s3fs.readthedocs.io/en/latest/
-Copyright (c) 2016, Continuum Analytics, Inc. and contributors
-All rights reserved.
** numpy; version 1.19.1 -- https://numpy.org/
Copyright (c) 2005-2020, NumPy Developers.
All rights reserved.
diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py
index b92119688..b4b469757 100644
--- a/awswrangler/__metadata__.py
+++ b/awswrangler/__metadata__.py
@@ -7,5 +7,5 @@
__title__: str = "awswrangler"
__description__: str = "Pandas on AWS."
-__version__: str = "1.8.1"
+__version__: str = "1.9.0"
__license__: str = "Apache License 2.0"
diff --git a/awswrangler/_config.py b/awswrangler/_config.py
index 7ebaa7ae6..eb5bb1506 100644
--- a/awswrangler/_config.py
+++ b/awswrangler/_config.py
@@ -5,7 +5,7 @@
import os
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, cast
-import pandas as pd # type: ignore
+import pandas as pd
from awswrangler import _utils, exceptions
@@ -29,7 +29,7 @@ class _ConfigArg(NamedTuple):
"database": _ConfigArg(dtype=str, nullable=True),
"max_cache_query_inspections": _ConfigArg(dtype=int, nullable=False),
"max_cache_seconds": _ConfigArg(dtype=int, nullable=False),
- "s3fs_block_size": _ConfigArg(dtype=int, nullable=False, enforced=True),
+ "s3_block_size": _ConfigArg(dtype=int, nullable=False, enforced=True),
}
@@ -138,8 +138,8 @@ def _apply_type(name: str, value: Any, dtype: Type[Union[str, bool, int]], nulla
exceptions.InvalidArgumentValue(f"{name} configuration does not accept a null value. Please pass {dtype}.")
try:
return dtype(value) if isinstance(value, dtype) is False else value
- except ValueError:
- raise exceptions.InvalidConfiguration(f"Config {name} must receive a {dtype} value.")
+ except ValueError as ex:
+ raise exceptions.InvalidConfiguration(f"Config {name} must receive a {dtype} value.") from ex
@staticmethod
def _is_null(value: _ConfigValueType) -> bool:
@@ -206,13 +206,13 @@ def max_cache_seconds(self, value: int) -> None:
self._set_config_value(key="max_cache_seconds", value=value)
@property
- def s3fs_block_size(self) -> int:
- """Property s3fs_block_size."""
- return cast(int, self["s3fs_block_size"])
+ def s3_block_size(self) -> int:
+ """Property s3_block_size."""
+ return cast(int, self["s3_block_size"])
- @s3fs_block_size.setter
- def s3fs_block_size(self, value: int) -> None:
- self._set_config_value(key="s3fs_block_size", value=value)
+ @s3_block_size.setter
+ def s3_block_size(self, value: int) -> None:
+ self._set_config_value(key="s3_block_size", value=value)
def _inject_config_doc(doc: Optional[str], available_configs: Tuple[str, ...]) -> str:
diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py
index fa36292f0..84e750ccd 100644
--- a/awswrangler/_data_types.py
+++ b/awswrangler/_data_types.py
@@ -1,18 +1,20 @@
"""Internal (private) Data Types Module."""
+import datetime
import logging
import re
from decimal import Decimal
from typing import Any, Dict, List, Match, Optional, Sequence, Tuple
-import pandas as pd # type: ignore
-import pyarrow as pa # type: ignore
-import pyarrow.parquet # type: ignore
-import sqlalchemy # type: ignore
-import sqlalchemy.dialects.mysql # type: ignore
-import sqlalchemy.dialects.postgresql # type: ignore
-import sqlalchemy_redshift.dialect # type: ignore
-from sqlalchemy.sql.visitors import VisitableType # type: ignore
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet
+import sqlalchemy
+import sqlalchemy.dialects.mysql
+import sqlalchemy.dialects.postgresql
+import sqlalchemy_redshift.dialect
+from sqlalchemy.sql.visitors import VisitableType
from awswrangler import _utils, exceptions
@@ -444,11 +446,21 @@ def _normalize_pandas_dtype_name(dtype: str) -> str:
return dtype
+def _cast2date(value: Any) -> Any:
+ if isinstance(value, float) and (np.isnan(value) or np.isinf(value)):
+ return None
+ if pd.isna(value) or value is None:
+ return None
+ if isinstance(value, datetime.date):
+ return value
+ return pd.to_datetime(value).date()
+
+
def _cast_pandas_column(df: pd.DataFrame, col: str, current_type: str, desired_type: str) -> pd.DataFrame:
if desired_type == "datetime64":
df[col] = pd.to_datetime(df[col])
elif desired_type == "date":
- df[col] = pd.to_datetime(df[col]).dt.date.replace(to_replace={pd.NaT: None})
+ df[col] = df[col].apply(lambda x: _cast2date(value=x)).replace(to_replace={pd.NaT: None})
elif desired_type == "bytes":
df[col] = df[col].astype("string").str.encode(encoding="utf-8").replace(to_replace={pd.NA: None})
elif desired_type == "decimal":
@@ -456,15 +468,6 @@ def _cast_pandas_column(df: pd.DataFrame, col: str, current_type: str, desired_t
df = _cast_pandas_column(df=df, col=col, current_type=current_type, desired_type="string")
# Then cast to decimal
df[col] = df[col].apply(lambda x: Decimal(str(x)) if str(x) not in ("", "none", "None", " ", "| \n", - " | id | \n", - "name | \n", - "
|---|---|---|
| 0 | \n", - "1 | \n", - "foo | \n", - "
| 1 | \n", - "2 | \n", - "boo | \n", - "
| 2 | \n", - "3 | \n", - "bar | \n", - "
3898091 rows × 8 columns
\n", + "3899520 rows × 8 columns
\n", "" ], "text/plain": [ @@ -258,13 +258,13 @@ "3 AGE00135039 1897-01-01 TMAX 140 NaN NaN E NaN\n", "4 AGE00135039 1897-01-01 TMIN 40 NaN NaN E NaN\n", "... ... ... ... ... ... ... ... ...\n", - "3898086 UZM00038457 1897-12-31 TMIN -145 NaN NaN r NaN\n", - "3898087 UZM00038457 1897-12-31 PRCP 4 NaN NaN r NaN\n", - "3898088 UZM00038457 1897-12-31 TAVG -95 NaN NaN r NaN\n", - "3898089 UZM00038618 1897-12-31 PRCP 66 NaN NaN r NaN\n", - "3898090 UZM00038618 1897-12-31 TAVG -45 NaN NaN r NaN\n", + "3899515 UZM00038457 1897-12-31 TMIN -145 NaN NaN r NaN\n", + "3899516 UZM00038457 1897-12-31 PRCP 4 NaN NaN r NaN\n", + "3899517 UZM00038457 1897-12-31 TAVG -95 NaN NaN r NaN\n", + "3899518 UZM00038618 1897-12-31 PRCP 66 NaN NaN r NaN\n", + "3899519 UZM00038618 1897-12-31 TAVG -45 NaN NaN r NaN\n", "\n", - "[3898091 rows x 8 columns]" + "[3899520 rows x 8 columns]" ] }, "execution_count": 4, @@ -299,7 +299,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1min 5s, sys: 2.62 s, total: 1min 8s\n", + "CPU times: user 1min 7s, sys: 2.45 s, total: 1min 9s\n", "Wall time: 4min 29s\n" ] } @@ -326,8 +326,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 15.3 s, sys: 2.01 s, total: 17.3 s\n", - "Wall time: 27.3 s\n" + "CPU times: user 15.8 s, sys: 1.93 s, total: 17.7 s\n", + "Wall time: 28.9 s\n" ] }, { @@ -408,10 +408,10 @@ " \n", "3898091 rows × 8 columns
\n", + "3899520 rows × 8 columns
\n", "" ], "text/plain": [ @@ -494,15 +494,15 @@ "1 AGE00135039 1897-01-01 TMAX 1403898091 rows × 8 columns
\n", + "3899520 rows × 8 columns
\n", "" ], "text/plain": [ " id dt element value m_flag q_flag s_flag obs_time\n", - "0 AG000060590 1897-01-01 TMAX 17029240019 rows × 8 columns
\n", + "29249758 rows × 8 columns
\n", "" ], "text/plain": [ @@ -226,13 +226,13 @@ "3 AGE00147705 1890-01-01 TMAX 140 NaN NaN E NaN\n", "4 AGE00147705 1890-01-01 TMIN 74 NaN NaN E NaN\n", "... ... ... ... ... ... ... ... ...\n", - "29240014 UZM00038457 1899-12-31 PRCP 16 NaN NaN r NaN\n", - "29240015 UZM00038457 1899-12-31 TAVG -73 NaN NaN r NaN\n", - "29240016 UZM00038618 1899-12-31 TMIN -76 NaN NaN r NaN\n", - "29240017 UZM00038618 1899-12-31 PRCP 0 NaN NaN r NaN\n", - "29240018 UZM00038618 1899-12-31 TAVG -60 NaN NaN r NaN\n", + "29249753 UZM00038457 1899-12-31 PRCP 16 NaN NaN r NaN\n", + "29249754 UZM00038457 1899-12-31 TAVG -73 NaN NaN r NaN\n", + "29249755 UZM00038618 1899-12-31 TMIN -76 NaN NaN r NaN\n", + "29249756 UZM00038618 1899-12-31 PRCP 0 NaN NaN r NaN\n", + "29249757 UZM00038618 1899-12-31 TAVG -60 NaN NaN r NaN\n", "\n", - "[29240019 rows x 8 columns]" + "[29249758 rows x 8 columns]" ] }, "execution_count": 3, @@ -370,16 +370,16 @@ { "data": { "text/plain": [ - "['year=1890/f66834ded9314208908667b40ccb5b54.snappy.parquet',\n", - " 'year=1891/73ee737ebb9144929ee63f6cd2725b8b.snappy.parquet',\n", - " 'year=1892/aee80df68614404d957d54f8b36a6143.snappy.parquet',\n", - " 'year=1893/159ae23b89b14de499b0312f03aca345.snappy.parquet',\n", - " 'year=1894/1694a1fe48194862803d8494c5405ad1.snappy.parquet',\n", - " 'year=1895/ba4d698250364922971a7b7dce96dc67.snappy.parquet',\n", - " 'year=1896/c2e422d32b2e4cb4a9d38b398845a976.snappy.parquet',\n", - " 'year=1897/2ec3223d6f284bfe9b604abbac225996.snappy.parquet',\n", - " 'year=1898/ffc78ab36f954d4ba6890892767a3cfb.snappy.parquet',\n", - " 'year=1899/c05cd01236a94b158b2b49e924e71431.snappy.parquet']" + "['year=1890/06a519afcf8e48c9b08c8908f30adcfe.snappy.parquet',\n", + " 'year=1891/5a99c28dbef54008bfc770c946099e02.snappy.parquet',\n", + " 'year=1892/9b1ea5d1cfad40f78c920f93540ca8ec.snappy.parquet',\n", + " 'year=1893/92259b49c134401eaf772506ee802af6.snappy.parquet',\n", + " 'year=1894/c734469ffff944f69dc277c630064a16.snappy.parquet',\n", + " 'year=1895/cf7ccde86aaf4d138f86c379c0817aa6.snappy.parquet',\n", + " 'year=1896/ce02f4c2c554438786b766b33db451b6.snappy.parquet',\n", + " 'year=1897/e04de04ad3c444deadcc9c410ab97ca1.snappy.parquet',\n", + " 'year=1898/acb0e02878f04b56a6200f4b5a97be0e.snappy.parquet',\n", + " 'year=1899/a269bdbb0f6a48faac55f3bcfef7df7a.snappy.parquet']" ] }, "execution_count": 6, @@ -407,8 +407,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 862 ms, sys: 382 ms, total: 1.24 s\n", - "Wall time: 1.45 s\n" + "CPU times: user 1.81 s, sys: 528 ms, total: 2.33 s\n", + "Wall time: 3.21 s\n" ] } ], @@ -563,8 +563,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.08 s, sys: 423 ms, total: 2.5 s\n", - "Wall time: 7.23 s\n" + "CPU times: user 3.52 s, sys: 811 ms, total: 4.33 s\n", + "Wall time: 9.6 s\n" ] }, { @@ -602,61 +602,61 @@ " \n", "