Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1823dd8
adding a feature to create and delete databases in glue datacatalog
patrick-muller Jun 6, 2020
ae30785
Small updates in the catalog database functions.
igorborgest Jun 6, 2020
a8eff42
Fix typo in tutorial 8.
igorborgest Jun 3, 2020
762655a
Bumping dependencies versions.
igorborgest Jun 3, 2020
f029a3c
Passing AWS environment variables to tox environment.
igorborgest Jun 4, 2020
87d6396
Add boto3 session serializer/deserializer on _utils.py.
igorborgest Jun 6, 2020
01cc4ee
Adding the create and delete database in the notebook
patrick-muller Jun 7, 2020
7450b88
Bumping tox version.
igorborgest Jun 8, 2020
c474900
Add S3 path check.
igorborgest Jun 11, 2020
91a96c5
Force index=False for wr.db.to_sql() with redshift.
igorborgest Jun 11, 2020
bcef16f
Improve redshift tests.
igorborgest Jun 11, 2020
d0c8614
Improve redshift tests.
igorborgest Jun 11, 2020
2164b68
Add sanitize_columns arg in to_parquet and to_csv. #278 #279
igorborgest Jun 11, 2020
10749e2
Bumping dev dependencies versions.
igorborgest Jun 11, 2020
bd9ab94
Breaking up s3 module in multiple files.
igorborgest Jun 11, 2020
b3837c6
Remove in memory copy of DataFrame for to_parquet and to_csv.
igorborgest Jun 12, 2020
7c4880d
First quicksight codes. :rocket:
igorborgest Jun 10, 2020
d74d79d
Organizing imports in the quicksight module.
igorborgest Jun 11, 2020
b67cf9f
Remove duplicated paragraph from the sessions tutorial.
igorborgest Jun 11, 2020
526a830
Fixing bug on catalog tables w/o PartitionKeys.
igorborgest Jun 11, 2020
8953bc2
Rollback SQLAlchemy version. #281
igorborgest Jun 12, 2020
a1a0ca3
First quicksight codes. :rocket:
igorborgest Jun 10, 2020
431bf99
Organizing imports in the quicksight module.
igorborgest Jun 11, 2020
9d970c8
Fixed a bug of user name.
ywang103 Jun 12, 2020
7f84c9e
QuickSight general clean up.
ywang103 Jun 12, 2020
e75dfa9
Merge pull request #282 from awslabs/quicksight
igorborgest Jun 12, 2020
7757810
Bumping version to 1.5.0
igorborgest Jun 14, 2020
b9ac28f
Updating README.md
igorborgest Jun 14, 2020
35f1675
Updating README.md
igorborgest Jun 14, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@

![AWS Data Wrangler](docs/source/_static/logo2.png?raw=true "AWS Data Wrangler")

[![Release](https://img.shields.io/badge/release-1.4.0-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Release](https://img.shields.io/badge/release-1.5.0-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
[![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Coverage](https://img.shields.io/badge/coverage-90%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest)

Expand Down Expand Up @@ -43,11 +43,27 @@ df = wr.s3.read_parquet("s3://bucket/dataset/", dataset=True)
# Retrieving the data from Amazon Athena
df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db")

# Getting Redshift connection (SQLAlchemy) from Glue Catalog Connections
# Get Redshift connection (SQLAlchemy) from Glue and retrieving data from Redshift Spectrum
engine = wr.catalog.get_engine("my-redshift-connection")

# Retrieving the data from Amazon Redshift Spectrum
df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine)

# Creating QuickSight Data Source and Dataset to reflect our new table
wr.quicksight.create_athena_data_source("athena-source", allowed_to_manage=["username"])
wr.quicksight.create_athena_dataset(
name="my-dataset",
database="my_db",
table="my_table",
data_source_name="athena-source",
allowed_to_manage=["username"]
)

# Get MySQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into MySQL
engine = wr.catalog.get_engine("my-mysql-connection")
wr.db.to_sql(df, engine, schema="test", name="my_table")

# Get PostgreSQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into PostgreSQL
engine = wr.catalog.get_engine("my-postgresql-connection")
wr.db.to_sql(df, engine, schema="test", name="my_table")
```

## [Read The Docs](https://aws-data-wrangler.readthedocs.io/)
Expand Down Expand Up @@ -80,13 +96,15 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine)
- [015 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/015%20-%20EMR.ipynb)
- [016 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/016%20-%20EMR%20%26%20Docker.ipynb)
- [017 - Partition Projection](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/017%20-%20Partition%20Projection.ipynb)
- [018 - QuickSight](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/018%20-%20QuickSight.ipynb)
- [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html)
- [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3)
- [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog)
- [Amazon Athena](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-athena)
- [Databases (Redshift, PostgreSQL, MySQL)](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#databases-redshift-postgresql-mysql)
- [EMR Cluster](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#emr-cluster)
- [CloudWatch Logs](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#cloudwatch-logs)
- [QuickSight](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#quicksight)
- [**License**](https://github.com/awslabs/aws-data-wrangler/blob/master/LICENSE)
- [**Contributing**](https://github.com/awslabs/aws-data-wrangler/blob/master/CONTRIBUTING.md)
- [**Legacy Docs** (pre-1.0.0)](https://aws-data-wrangler.readthedocs.io/en/legacy/)
6 changes: 3 additions & 3 deletions awswrangler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

"""

import logging
import logging as _logging

from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa
from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, quicksight, s3 # noqa
from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa
from awswrangler._utils import get_account_id # noqa

logging.getLogger("awswrangler").addHandler(logging.NullHandler())
_logging.getLogger("awswrangler").addHandler(_logging.NullHandler())
2 changes: 1 addition & 1 deletion awswrangler/__metadata__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@

__title__ = "awswrangler"
__description__ = "Pandas on AWS."
__version__ = "1.4.0"
__version__ = "1.5.0"
__license__ = "Apache License 2.0"
28 changes: 28 additions & 0 deletions awswrangler/_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,34 @@ def athena2redshift( # pylint: disable=too-many-branches,too-many-return-statem
raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}") # pragma: no cover


def athena2quicksight(dtype: str) -> str:
    """Convert an Athena data type name into its QuickSight equivalent.

    Parameters
    ----------
    dtype : str
        Athena data type name (case-insensitive), e.g. "bigint",
        "varchar", "decimal(10,2)".

    Returns
    -------
    str
        QuickSight data type: one of "INTEGER", "DECIMAL", "BOOLEAN",
        "STRING", "DATETIME" or "BIT".

    Raises
    ------
    exceptions.UnsupportedType
        If the Athena type has no QuickSight equivalent.
    """
    dtype = dtype.lower()
    if dtype in ("smallint", "int", "integer", "bigint"):
        return "INTEGER"
    if dtype in ("float", "double"):
        return "DECIMAL"
    if dtype in ("boolean", "bool"):
        return "BOOLEAN"
    if dtype in ("string", "char", "varchar"):
        return "STRING"
    if dtype in ("timestamp", "date"):
        return "DATETIME"
    # Athena decimals carry precision/scale, e.g. "decimal(10,2)".
    if dtype.startswith("decimal"):
        return "DECIMAL"
    # Bug fix: the original wrote `dtype in ("binary" or "varbinary")`, which
    # evaluates to `dtype in "binary"` — a substring test. "varbinary" never
    # matched and any substring of "binary" (even "") wrongly returned "BIT".
    if dtype in ("binary", "varbinary"):
        return "BIT"
    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")  # pragma: no cover


def pyarrow2athena(dtype: pa.DataType) -> str: # pylint: disable=too-many-branches,too-many-return-statements
"""Pyarrow to Athena data types conversion."""
if pa.types.is_int8(dtype):
Expand Down
36 changes: 33 additions & 3 deletions awswrangler/_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""Internal (private) Utilities Module."""

import copy
import logging
import math
import os
import random
from typing import Any, Dict, Generator, List, Optional, Tuple
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import boto3 # type: ignore
import botocore.config # type: ignore
Expand All @@ -17,8 +18,10 @@
_logger: logging.Logger = logging.getLogger(__name__)


def ensure_session(session: Optional[boto3.Session] = None) -> boto3.Session:
def ensure_session(session: Optional[Union[boto3.Session, Dict[str, Optional[str]]]] = None) -> boto3.Session:
"""Ensure that a valid boto3.Session will be returned."""
if isinstance(session, dict): # Primitives received
return boto3_from_primitives(primitives=session)
if session is not None:
return session
# Ensure the boto3's default session is used so that its parameters can be
Expand All @@ -28,6 +31,30 @@ def ensure_session(session: Optional[boto3.Session] = None) -> boto3.Session:
return boto3.Session() # pragma: no cover


def boto3_to_primitives(boto3_session: Optional[boto3.Session] = None) -> Dict[str, Optional[str]]:
    """Serialize a Boto3 Session into a dict of Python primitives.

    The returned mapping can be fed back to ``boto3_from_primitives`` (or to
    ``ensure_session``) to rebuild an equivalent session.
    """
    session: boto3.Session = ensure_session(session=boto3_session)
    creds = session.get_credentials()
    primitives: Dict[str, Optional[str]] = {
        "region_name": session.region_name,
        "profile_name": session.profile_name,
    }
    # get_credentials() may yield None; getattr with a default keeps this safe.
    for out_key, cred_attr in (
        ("aws_access_key_id", "access_key"),
        ("aws_secret_access_key", "secret_key"),
        ("aws_session_token", "token"),
    ):
        primitives[out_key] = getattr(creds, cred_attr, None)
    return primitives


def boto3_from_primitives(primitives: Optional[Dict[str, Optional[str]]] = None) -> boto3.Session:
    """Convert Python primitives to Boto3 Session.

    Parameters
    ----------
    primitives : Dict[str, Optional[str]], optional
        Mapping produced by ``boto3_to_primitives`` (keys: aws_access_key_id,
        aws_secret_access_key, aws_session_token, region_name, profile_name).
        If None, the boto3 default session is returned.

    Returns
    -------
    boto3.Session
        Reconstructed session.
    """
    # Fix: the parameter was annotated `Dict[str, Optional[str]] = None`,
    # an implicit-Optional signature rejected by PEP 484 / mypy.
    if primitives is None:
        return boto3.DEFAULT_SESSION  # pragma: no cover
    _primitives: Dict[str, Optional[str]] = copy.deepcopy(primitives)
    profile_name: Optional[str] = _primitives.get("profile_name", None)
    # Treat the "default" profile the same as passing no profile at all.
    _primitives["profile_name"] = None if profile_name in (None, "default") else profile_name
    # Drop None values so boto3.Session() falls back to its own defaults.
    args: Dict[str, str] = {k: v for k, v in _primitives.items() if v is not None}
    return boto3.Session(**args)


def client(service_name: str, session: Optional[boto3.Session] = None) -> boto3.client:
"""Create a valid boto3.client."""
return ensure_session(session=session).client(
Expand Down Expand Up @@ -63,6 +90,8 @@ def parse_path(path: str) -> Tuple[str, str]:
>>> bucket, key = parse_path('s3://bucket/key')

"""
if path.startswith("s3://") is False:
raise exceptions.InvalidArgumentValue(f"'{path}' is not a valid path. It MUST start with 's3://'")
parts = path.replace("s3://", "").split("/", 1)
bucket: str = parts[0]
key: str = ""
Expand Down Expand Up @@ -139,7 +168,8 @@ def chunkify(lst: List[Any], num_chunks: int = 1, max_length: Optional[int] = No


def get_fs(
session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None
session: Optional[Union[boto3.Session, Dict[str, Optional[str]]]] = None,
s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> s3fs.S3FileSystem:
"""Build a S3FileSystem from a given boto3 session."""
fs: s3fs.S3FileSystem = s3fs.S3FileSystem(
Expand Down
Loading