Merged

Changes from all commits (70 commits)
8fd4966
initial draft
luigift Apr 18, 2020
863ba26
adding Pytorch as a development dependency
igorborgest Apr 19, 2020
2864dc0
Cleaning up initial draft
igorborgest Apr 19, 2020
4fed4c7
Add first test
igorborgest Apr 19, 2020
72c739c
add audio and image dataset
luigift Apr 19, 2020
f72810e
Add label_col to torch.SQLDataset
igorborgest Apr 20, 2020
bf1be07
Updating cartesian product of pytest parameters
igorborgest Apr 20, 2020
1a41d18
Pivoting SQLDataset parser strategy to avoid cast losses.
igorborgest Apr 20, 2020
36c15e4
tested lambda & image datasets
luigift Apr 20, 2020
d4dcfc5
add audio test
luigift Apr 20, 2020
30dc2fa
Add test for torch.AudioS3Dataset
igorborgest Apr 20, 2020
0376aef
Add chunked=INTEGER option to ensure the number of rows per batch #192
igorborgest Apr 22, 2020
5a9a83f
s3 iterable dataset
luigift Apr 22, 2020
60232f4
add tutorial draft
luigift Apr 23, 2020
215fbd5
add torch extras_requirements to setuptools
luigift Apr 23, 2020
0ad9e4b
handle labels in S3IterableDataset
luigift Apr 23, 2020
5e72ddf
clear bucket in S3IterableDataset test
luigift Apr 23, 2020
5b399ac
update setuptools
luigift Apr 23, 2020
2db15b6
update pytorch tutorial
luigift Apr 23, 2020
5e647c6
Update tutorial
igorborgest Apr 23, 2020
b3d9fe2
parallel tests fix
luigift Apr 23, 2020
c091fa8
fix lint
luigift Apr 24, 2020
37b7f1e
update readme
luigift Apr 24, 2020
33d74c4
remove capitalized requirement from docstring
luigift Apr 24, 2020
4b05b36
add torch requirements
luigift Apr 24, 2020
9ce624b
Add support to EMR with Docker
igorborgest Apr 25, 2020
c2db8cd
Add support to EMR with Docker #193
igorborgest Apr 25, 2020
487214f
Merge remote-tracking branch 'origin/emr-6' into emr-6
igorborgest Apr 25, 2020
9611a0a
Improve EMR tutorials #193
igorborgest Apr 25, 2020
3c3ca64
Splitting up the ecr_credentials into an individual function #193
igorborgest Apr 26, 2020
2eefb3a
Small update in the EMR tutorial
igorborgest Apr 26, 2020
86cdb30
fix init and docs
luigift Apr 26, 2020
b3c8c81
update tutorial
luigift Apr 26, 2020
f6927a4
rollback pytorch==1.5.0, due to torchaudio requirement
luigift Apr 27, 2020
f0f154b
Add wr.emr.submit_spark_step
igorborgest Apr 27, 2020
602bceb
Bumping version to 1.1.0
igorborgest Apr 27, 2020
a837658
Merge remote-tracking branch 'origin/master' into dev
igorborgest Apr 27, 2020
aeb8792
Improving the chunksize parser slicer algorithm
igorborgest Apr 27, 2020
5821c9a
Merge pull request #208 from awslabs/emr-6
igorborgest Apr 27, 2020
b34f648
Merge pull request #209 from awslabs/parquet-chunked
igorborgest Apr 27, 2020
797fba5
Update badges on README
igorborgest Apr 27, 2020
d9f107a
Add EMR tutorials to README
igorborgest Apr 27, 2020
3103e34
Merge branch 'dev' into pytorch
igorborgest Apr 27, 2020
7fd449e
Adapting to validations
igorborgest Apr 27, 2020
fd115d8
Bumping dev dependencies
igorborgest Apr 27, 2020
8fad37c
Bumping PyTorch libs versions
igorborgest Apr 27, 2020
85bfade
Replacing all f-string on logging commands
igorborgest Apr 27, 2020
910e3b6
100% test coverage on wr.torch
igorborgest Apr 27, 2020
be0d89e
Merge pull request #210 from awslabs/pytorch
igorborgest Apr 27, 2020
b4f6a36
Revisiting Athena encryption and workgroup #201
igorborgest Apr 28, 2020
a54a578
Merge pull request #212 from awslabs/athena-encryption
igorborgest Apr 28, 2020
2a26d4f
Decrease tox parallelism
igorborgest Apr 29, 2020
5298aaf
Add kms_key_id, max_file_size and region to Redshift Unload
igorborgest Apr 29, 2020
d4b27c6
Add KMS permission to Redshift Role
igorborgest Apr 29, 2020
924b0bb
Add Redshift tests
igorborgest Apr 29, 2020
ad22aea
Insignificant fix in _data_types.py
igorborgest Apr 29, 2020
0e068fe
Parquet chunksize now paginating on Pandas instead of PyArrow
igorborgest Apr 29, 2020
ca133a0
Linting
igorborgest Apr 29, 2020
e93fbfa
Merge pull request #214 from awslabs/redshift-unload
igorborgest Apr 29, 2020
e8660cb
Bumping dependencies versions
igorborgest May 2, 2020
b484ae1
Add support for query UUID columns on PostgreSQL and full NULL column…
igorborgest May 3, 2020
b748a35
Merge pull request #219 from awslabs/uuid-and-null
igorborgest May 3, 2020
08cf244
Add support to write nested types (array and struct).
igorborgest May 4, 2020
97f8763
Merge pull request #220 from awslabs/write-nested-types
igorborgest May 4, 2020
458bf26
Add keep_files and ctas_temp_table_name to wr.athena.read_*(). #203
igorborgest May 4, 2020
10ea9e8
Merge pull request #221 from awslabs/athena-args
igorborgest May 4, 2020
fe6f50b
Removing delete_table operations from catalog._create_table() and add…
igorborgest May 5, 2020
a6ba86c
add replace_filenames argument to wr.s3.copy_objects() #215
igorborgest May 5, 2020
5be05d3
Update README
igorborgest May 5, 2020
12d0f66
Updating requirements
igorborgest May 5, 2020
9 changes: 3 additions & 6 deletions .github/workflows/static-checking.yml
@@ -24,15 +24,12 @@ jobs:
uses: actions/setup-python@v1
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements-dev.txt
- name: Setup Environment
run: ./setup-dev-env.sh
- name: CloudFormation Lint
run: cfn-lint -t testing/cloudformation.yaml
- name: Documentation Lint
run: pydocstyle awswrangler/ --add-ignore=D204
run: pydocstyle awswrangler/ --add-ignore=D204,D403
- name: mypy check
run: mypy awswrangler
- name: Flake8 Lint
2 changes: 2 additions & 0 deletions .gitignore
@@ -138,6 +138,8 @@ testing/*parameters-*.properties
testing/*requirements*.txt
testing/coverage/*
building/*requirements*.txt
building/arrow
building/lambda/arrow
/docs/coverage/
/docs/build/
/docs/source/_build/
3 changes: 2 additions & 1 deletion .pylintrc
@@ -141,7 +141,8 @@ disable=print-statement,
comprehension-escape,
C0330,
C0103,
W1202
W1202,
too-few-public-methods

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
10 changes: 6 additions & 4 deletions README.md
@@ -5,19 +5,18 @@

**NOTE**

We just released a new major version `1.0` with breaking changes. Please make sure that all your old projects has dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`).
Due the new major version `1.*.*` with breaking changes, please make sure that all your old projects has dependencies frozen on the desired version (e.g. `pip install awswrangler==0.3.2`).

---

![AWS Data Wrangler](docs/source/_static/logo2.png?raw=true "AWS Data Wrangler")

[![Release](https://img.shields.io/badge/release-1.0.4-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Release](https://img.shields.io/badge/release-1.1.0-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
[![Average time to resolve an issue](http://isitmaintained.com/badge/resolution/awslabs/aws-data-wrangler.svg)](http://isitmaintained.com/project/awslabs/aws-data-wrangler "Average time to resolve an issue")

[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
[![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest)
@@ -85,6 +84,9 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine)
- [11 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/11%20-%20CSV%20Datasets.ipynb)
- [12 - CSV Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/12%20-%20CSV%20Crawler.ipynb)
- [13 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/13%20-%20Merging%20Datasets%20on%20S3.ipynb)
- [14 - PyTorch](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/14%20-%20PyTorch.ipynb)
- [15 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/dev/tutorials/15%20-%20EMR.ipynb)
- [16 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/dev/tutorials/16%20-%20EMR%20%26%20Docker.ipynb)
- [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html)
- [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3)
- [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog)
5 changes: 5 additions & 0 deletions awswrangler/__init__.py
@@ -6,8 +6,13 @@
"""

import logging
from importlib.util import find_spec

from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa
from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa
from awswrangler._utils import get_account_id # noqa

if find_spec("torch") and find_spec("torchvision") and find_spec("torchaudio") and find_spec("PIL"):
from awswrangler import torch # noqa

logging.getLogger("awswrangler").addHandler(logging.NullHandler())
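
Note: the guard above imports awswrangler.torch only when torch, torchvision, torchaudio, and PIL can all be resolved, so a base install keeps working without the optional PyTorch stack. A minimal sketch of the same pattern, for illustration only (the tuple of module names simply mirrors the guard; nothing here is new API):

from importlib.util import find_spec

# Probe the import machinery without actually importing the packages.
_OPTIONAL_DEPS = ("torch", "torchvision", "torchaudio", "PIL")
if all(find_spec(name) is not None for name in _OPTIONAL_DEPS):
    from awswrangler import torch  # noqa

find_spec() only consults the module finders, so the check is cheap and never triggers a heavyweight import on its own.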
2 changes: 1 addition & 1 deletion awswrangler/__metadata__.py
@@ -7,5 +7,5 @@

__title__ = "awswrangler"
__description__ = "Pandas on AWS."
__version__ = "1.0.4"
__version__ = "1.1.0"
__license__ = "Apache License 2.0"
83 changes: 65 additions & 18 deletions awswrangler/_data_types.py
@@ -1,8 +1,9 @@
"""Internal (private) Data Types Module."""

import logging
import re
from decimal import Decimal
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Match, Optional, Sequence, Tuple

import pandas as pd # type: ignore
import pyarrow as pa # type: ignore
@@ -139,8 +140,10 @@ def pyarrow2athena(dtype: pa.DataType) -> str: # pylint: disable=too-many-branc
return f"decimal({dtype.precision},{dtype.scale})"
if pa.types.is_list(dtype):
return f"array<{pyarrow2athena(dtype=dtype.value_type)}>"
if pa.types.is_struct(dtype): # pragma: no cover
return f"struct<{', '.join([f'{f.name}: {pyarrow2athena(dtype=f.type)}' for f in dtype])}>"
if pa.types.is_struct(dtype):
return f"struct<{', '.join([f'{f.name}:{pyarrow2athena(dtype=f.type)}' for f in dtype])}>"
if pa.types.is_map(dtype): # pragma: no cover
return f"map<{pyarrow2athena(dtype=dtype.key_type)},{pyarrow2athena(dtype=dtype.item_type)}>"
if dtype == pa.null():
raise exceptions.UndetectedType("We can not infer the data type from an entire null object column")
raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}") # pragma: no cover
@@ -167,7 +170,7 @@ def pyarrow2pandas_extension( # pylint: disable=too-many-branches,too-many-retu

def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-statements
dtype: pa.DataType, db_type: str
) -> VisitableType:
) -> Optional[VisitableType]:
"""Pyarrow to Athena data types conversion."""
if pa.types.is_int8(dtype):
return sqlalchemy.types.SmallInteger
@@ -207,14 +210,14 @@ def pyarrow2sqlalchemy( # pylint: disable=too-many-branches,too-many-return-sta
return sqlalchemy.types.Date
if pa.types.is_binary(dtype):
if db_type == "redshift":
raise exceptions.UnsupportedType(f"Binary columns are not supported for Redshift.") # pragma: no cover
raise exceptions.UnsupportedType("Binary columns are not supported for Redshift.") # pragma: no cover
return sqlalchemy.types.Binary
if pa.types.is_decimal(dtype):
return sqlalchemy.types.Numeric(precision=dtype.precision, scale=dtype.scale)
if pa.types.is_dictionary(dtype):
return pyarrow2sqlalchemy(dtype=dtype.value_type, db_type=db_type)
if dtype == pa.null(): # pragma: no cover
raise exceptions.UndetectedType("We can not infer the data type from an entire null object column")
return None
raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}") # pragma: no cover


@@ -243,12 +246,23 @@ def pyarrow_types_from_pandas(
else:
cols.append(name)

# Filling cols_dtypes and indexes
# Filling cols_dtypes
for col in cols:
_logger.debug("Inferring PyArrow type from column: %s", col)
try:
schema: pa.Schema = pa.Schema.from_pandas(df=df[[col]], preserve_index=False)
except pa.ArrowInvalid as ex: # pragma: no cover
cols_dtypes[col] = process_not_inferred_dtype(ex)
else:
cols_dtypes[col] = schema.field(col).type

# Filling indexes
indexes: List[str] = []
for field in pa.Schema.from_pandas(df=df[cols], preserve_index=index):
name = str(field.name)
cols_dtypes[name] = field.type
if (name not in df.columns) and (index is True):
if index is True:
for field in pa.Schema.from_pandas(df=df[[]], preserve_index=True):
name = str(field.name)
_logger.debug("Inferring PyArrow type from index: %s", name)
cols_dtypes[name] = field.type
indexes.append(name)

# Merging Index
@@ -257,10 +271,43 @@
# Filling schema
columns_types: Dict[str, pa.DataType]
columns_types = {n: cols_dtypes[n] for n in sorted_cols}
_logger.debug(f"columns_types: {columns_types}")
_logger.debug("columns_types: %s", columns_types)
return columns_types


def process_not_inferred_dtype(ex: pa.ArrowInvalid) -> pa.DataType:
"""Infer data type from PyArrow inference exception."""
ex_str = str(ex)
_logger.debug("PyArrow was not able to infer data type:\n%s", ex_str)
match: Optional[Match] = re.search(
pattern="Could not convert (.*) with type (.*): did not recognize "
"Python value type when inferring an Arrow data type",
string=ex_str,
)
if match is None:
raise ex # pragma: no cover
groups: Optional[Sequence[str]] = match.groups()
if groups is None:
raise ex # pragma: no cover
if len(groups) != 2:
raise ex # pragma: no cover
_logger.debug("groups: %s", groups)
type_str: str = groups[1]
if type_str == "UUID":
return pa.string()
raise ex # pragma: no cover


def process_not_inferred_array(ex: pa.ArrowInvalid, values: Any) -> pa.Array:
"""Infer `pyarrow.array` from PyArrow inference exception."""
dtype = process_not_inferred_dtype(ex=ex)
if dtype == pa.string():
array: pa.Array = pa.array(obj=[str(x) for x in values], type=dtype, safe=True)
else:
raise ex # pragma: no cover
return array


def athena_types_from_pandas(
df: pd.DataFrame, index: bool, dtype: Optional[Dict[str, str]] = None, index_left: bool = False
) -> Dict[str, str]:
@@ -275,7 +322,7 @@ def athena_types_from_pandas(
athena_columns_types[k] = casts[k]
else:
athena_columns_types[k] = pyarrow2athena(dtype=v)
_logger.debug(f"athena_columns_types: {athena_columns_types}")
_logger.debug("athena_columns_types: %s", athena_columns_types)
return athena_columns_types


@@ -315,7 +362,7 @@ def pyarrow_schema_from_pandas(
if (k in df.columns) and (k not in ignore):
columns_types[k] = athena2pyarrow(v)
columns_types = {k: v for k, v in columns_types.items() if v is not None}
_logger.debug(f"columns_types: {columns_types}")
_logger.debug("columns_types: %s", columns_types)
return pa.schema(fields=columns_types)


@@ -324,11 +371,11 @@ def athena_types_from_pyarrow_schema(
) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]:
"""Extract the related Athena data types from any PyArrow Schema considering possible partitions."""
columns_types: Dict[str, str] = {str(f.name): pyarrow2athena(dtype=f.type) for f in schema}
_logger.debug(f"columns_types: {columns_types}")
_logger.debug("columns_types: %s", columns_types)
partitions_types: Optional[Dict[str, str]] = None
if partitions is not None:
partitions_types = {p.name: pyarrow2athena(p.dictionary.type) for p in partitions}
_logger.debug(f"partitions_types: {partitions_types}")
_logger.debug("partitions_types: %s", partitions_types)
return columns_types, partitions_types


@@ -372,7 +419,7 @@ def sqlalchemy_types_from_pandas(
df: pd.DataFrame, db_type: str, dtype: Optional[Dict[str, VisitableType]] = None
) -> Dict[str, VisitableType]:
"""Extract the related SQLAlchemy data types from any Pandas DataFrame."""
casts: Dict[str, VisitableType] = dtype if dtype else {}
casts: Dict[str, VisitableType] = dtype if dtype is not None else {}
pa_columns_types: Dict[str, Optional[pa.DataType]] = pyarrow_types_from_pandas(
df=df, index=False, ignore_cols=list(casts.keys())
)
@@ -382,5 +429,5 @@ def sqlalchemy_types_from_pandas(
sqlalchemy_columns_types[k] = casts[k]
else:
sqlalchemy_columns_types[k] = pyarrow2sqlalchemy(dtype=v, db_type=db_type)
_logger.debug(f"sqlalchemy_columns_types: {sqlalchemy_columns_types}")
_logger.debug("sqlalchemy_columns_types: %s", sqlalchemy_columns_types)
return sqlalchemy_columns_types
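
The new process_not_inferred_dtype()/process_not_inferred_array() pair gives pyarrow_types_from_pandas() a recovery path when PyArrow cannot infer a column type; today it only recognizes Python uuid.UUID values and re-encodes them as strings. A short reproduction sketch (assuming PyArrow raises ArrowInvalid with the exact message the regex above matches, which can vary across PyArrow versions):

import uuid
import pyarrow as pa

values = [uuid.uuid4(), uuid.uuid4()]
try:
    arr = pa.array(values)  # "did not recognize Python value type..."
except pa.ArrowInvalid:
    # Same recovery as process_not_inferred_array(): serialize UUIDs as strings.
    arr = pa.array([str(v) for v in values], type=pa.string(), safe=True)
assert arr.type == pa.string()

Any other unrecognized type re-raises the original ArrowInvalid, so the fallback cannot silently coerce data it does not understand.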
13 changes: 13 additions & 0 deletions awswrangler/_utils.py
@@ -166,3 +166,16 @@ def ensure_postgresql_casts():
def get_directory(path: str) -> str:
"""Extract directory path."""
return path.rsplit(sep="/", maxsplit=1)[0] + "/"


def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str:
"""Get Account ID."""
session: boto3.Session = ensure_session(session=boto3_session)
return client(service_name="sts", session=session).get_caller_identity().get("Account")


def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session] = None) -> str:
"""Extract region from Subnet ID."""
session: boto3.Session = ensure_session(session=boto3_session)
client_ec2: boto3.client = client(service_name="ec2", session=session)
return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9]
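
get_account_id() is re-exported at the package root (see the awswrangler/__init__.py change above), while get_region_from_subnet() stays internal. A brief usage sketch (region name and credentials are placeholders):

import boto3
import awswrangler as wr

session = boto3.Session(region_name="us-east-1")
print(wr.get_account_id(boto3_session=session))  # resolved via STS GetCallerIdentity

One caveat on the design: get_region_from_subnet() derives the region by slicing the first nine characters of the subnet's AvailabilityZone (e.g. "us-east-1a" -> "us-east-1"), which assumes a nine-character region name; longer names such as "ap-southeast-1" would be truncated.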