2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -224,7 +224,7 @@ To run all database MySQL tests (Using 8 parallel processes):

``pytest -n 8 tests/unit/test_mysql.py``

-To run all tests for all python versions (assuming Amazon QuickSight is activated and the optional stack deployed):
+To run all tests for all python versions (assuming Amazon QuickSight is activated and the optional stacks deployed):

``./test.sh``

2 changes: 2 additions & 0 deletions README.md
@@ -150,6 +150,7 @@ The quickest way to get started is to use AWS Glue with Ray. Read our [docs](htt
- [029 - S3 Select](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/029%20-%20S3%20Select.ipynb)
- [030 - Data Api](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/030%20-%20Data%20Api.ipynb)
- [031 - OpenSearch](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/031%20-%20OpenSearch.ipynb)
+- [032 - Lake Formation Governed Tables](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/032%20-%20Lake%20Formation%20Governed%20Tables.ipynb)
- [033 - Amazon Neptune](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/033%20-%20Amazon%20Neptune.ipynb)
- [034 - Distributing Calls Using Ray](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/034%20-%20Distributing%20Calls%20using%20Ray.ipynb)
- [035 - Distributing Calls on Ray Remote Cluster](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/035%20-%20Distributing%20Calls%20on%20Ray%20Remote%20Cluster.ipynb)
@@ -161,6 +162,7 @@ The quickest way to get started is to use AWS Glue with Ray. Read our [docs](htt
- [Amazon S3](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#amazon-s3)
- [AWS Glue Catalog](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#aws-glue-catalog)
- [Amazon Athena](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#amazon-athena)
+- [AWS Lake Formation](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#aws-lake-formation)
- [Amazon Redshift](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#amazon-redshift)
- [PostgreSQL](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#postgresql)
- [MySQL](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#mysql)
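The tutorial and API entries added above point at the new `wr.lakeformation` module. For orientation, a minimal sketch of the kind of call it enables; the exact `read_sql_query` signature here is an assumption modeled on the module's Athena counterpart, and the table and database names are placeholders:

```python
import awswrangler as wr

# Read a GOVERNED table through Lake Formation's transactional query API
# (assumed signature, mirroring wr.athena.read_sql_query).
df = wr.lakeformation.read_sql_query(
    sql="SELECT * FROM my_governed_table LIMIT 10;",
    database="my_database",
)
```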
2 changes: 2 additions & 0 deletions awswrangler/__init__.py
@@ -17,6 +17,7 @@
dynamodb,
emr,
exceptions,
+    lakeformation,
mysql,
neptune,
opensearch,
@@ -53,6 +54,7 @@
"s3",
"sts",
"redshift",
"lakeformation",
"mysql",
"neptune",
"postgresql",
20 changes: 20 additions & 0 deletions awswrangler/_config.py
@@ -41,6 +41,7 @@ class _ConfigArg(NamedTuple):
"max_local_cache_entries": _ConfigArg(dtype=int, nullable=False, parent_parameter_key="athena_cache_settings"),
"athena_query_wait_polling_delay": _ConfigArg(dtype=float, nullable=False),
"cloudwatch_query_wait_polling_delay": _ConfigArg(dtype=float, nullable=False),
"lakeformation_query_wait_polling_delay": _ConfigArg(dtype=float, nullable=False),
"s3_block_size": _ConfigArg(dtype=int, nullable=False, enforced=True),
"workgroup": _ConfigArg(dtype=str, nullable=False, enforced=True),
"chunksize": _ConfigArg(dtype=int, nullable=False, enforced=True),
@@ -53,6 +54,7 @@ class _ConfigArg(NamedTuple):
"redshift_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"kms_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"emr_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"lakeformation_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"dynamodb_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"secretsmanager_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"timestream_query_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
@@ -344,6 +346,15 @@ def cloudwatch_query_wait_polling_delay(self) -> float:
def cloudwatch_query_wait_polling_delay(self, value: float) -> None:
self._set_config_value(key="cloudwatch_query_wait_polling_delay", value=value)

+    @property
+    def lakeformation_query_wait_polling_delay(self) -> float:
+        """Property lakeformation_query_wait_polling_delay."""
+        return cast(float, self["lakeformation_query_wait_polling_delay"])
+
+    @lakeformation_query_wait_polling_delay.setter
+    def lakeformation_query_wait_polling_delay(self, value: float) -> None:
+        self._set_config_value(key="lakeformation_query_wait_polling_delay", value=value)
+
@property
def s3_block_size(self) -> int:
"""Property s3_block_size."""
@@ -443,6 +454,15 @@ def emr_endpoint_url(self) -> Optional[str]:
def emr_endpoint_url(self, value: Optional[str]) -> None:
self._set_config_value(key="emr_endpoint_url", value=value)

+    @property
+    def lakeformation_endpoint_url(self) -> Optional[str]:
+        """Property lakeformation_endpoint_url."""
+        return cast(Optional[str], self["lakeformation_endpoint_url"])
+
+    @lakeformation_endpoint_url.setter
+    def lakeformation_endpoint_url(self, value: Optional[str]) -> None:
+        self._set_config_value(key="lakeformation_endpoint_url", value=value)
+
@property
def dynamodb_endpoint_url(self) -> Optional[str]:
"""Property dynamodb_endpoint_url."""
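Both new keys follow the library's global-config pattern: a `_ConfigArg` registration plus a property/setter pair backed by `_set_config_value`. They should therefore be settable the same way as the existing Athena and CloudWatch equivalents. A sketch; the `WR_`-prefixed environment variable name is an assumption based on how the other `loaded=True` config keys work:

```python
import awswrangler as wr

# Poll Lake Formation query status every 2 seconds instead of the default delay.
wr.config.lakeformation_query_wait_polling_delay = 2.0

# Route Lake Formation API calls to a specific regional (or VPC) endpoint.
wr.config.lakeformation_endpoint_url = "https://lakeformation.us-east-1.amazonaws.com"

# Assumed equivalent environment variable: WR_LAKEFORMATION_ENDPOINT_URL
```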
36 changes: 35 additions & 1 deletion awswrangler/_utils.py
@@ -50,10 +50,10 @@
from mypy_boto3_emr.client import EMRClient
from mypy_boto3_glue import GlueClient
from mypy_boto3_kms.client import KMSClient
+    from mypy_boto3_lakeformation.client import LakeFormationClient
from mypy_boto3_logs.client import CloudWatchLogsClient
from mypy_boto3_opensearch.client import OpenSearchServiceClient
from mypy_boto3_opensearchserverless.client import OpenSearchServiceServerlessClient
-    from mypy_boto3_opensearchserverless.literals import ServiceName
from mypy_boto3_quicksight.client import QuickSightClient
from mypy_boto3_rds_data.client import RDSDataServiceClient
from mypy_boto3_redshift.client import RedshiftClient
@@ -65,6 +65,28 @@
from mypy_boto3_timestream_write.client import TimestreamWriteClient
from typing_extensions import Literal

+    ServiceName = Literal[
+        "athena",
+        "dynamodb",
+        "ec2",
+        "emr",
+        "glue",
+        "kms",
+        "lakeformation",
+        "logs",
+        "opensearch",
+        "opensearchserverless",
+        "quicksight",
+        "rds-data",
+        "redshift-data",
+        "redshift",
+        "s3",
+        "secretsmanager",
+        "sts",
+        "timestream-query",
+        "timestream-write",
+    ]
+
_logger: logging.Logger = logging.getLogger(__name__)

Boto3PrimitivesType = Dict[str, Optional[str]]
@@ -233,6 +255,8 @@ def _get_endpoint_url(service_name: str) -> Optional[str]:
endpoint_url = _config.config.kms_endpoint_url
elif service_name == "emr" and _config.config.emr_endpoint_url is not None:
endpoint_url = _config.config.emr_endpoint_url
elif service_name == "lakeformation" and _config.config.lakeformation_endpoint_url is not None:
endpoint_url = _config.config.lakeformation_endpoint_url
elif service_name == "dynamodb" and _config.config.dynamodb_endpoint_url is not None:
endpoint_url = _config.config.dynamodb_endpoint_url
elif service_name == "secretsmanager" and _config.config.secretsmanager_endpoint_url is not None:
@@ -255,6 +279,16 @@ def client(
...


+@overload
+def client(
+    service_name: 'Literal["lakeformation"]',
+    session: Optional[boto3.Session] = None,
+    botocore_config: Optional[Config] = None,
+    verify: Optional[Union[str, bool]] = None,
+) -> "LakeFormationClient":
+    ...
+
+
@overload
def client(
service_name: 'Literal["logs"]',
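Defining `ServiceName` in-module, rather than importing the incidental literal from the `opensearchserverless` stubs, lets the `client` factory enumerate exactly the services the library wraps. Each `@overload` then pairs one service literal with its `mypy-boto3` stub type, so call sites get a fully typed client. A self-contained sketch of the pattern in isolation (names and the reduced signature are illustrative, not the library's):

```python
from typing import TYPE_CHECKING, Any, Literal, overload

import boto3

if TYPE_CHECKING:
    # Stub-only imports, mirroring awswrangler/_utils.py (requires mypy-boto3-* packages).
    from mypy_boto3_lakeformation.client import LakeFormationClient
    from mypy_boto3_logs.client import CloudWatchLogsClient


@overload
def client(service_name: Literal["lakeformation"]) -> "LakeFormationClient":
    ...


@overload
def client(service_name: Literal["logs"]) -> "CloudWatchLogsClient":
    ...


def client(service_name: str) -> Any:
    # At runtime every branch is plain boto3; the overloads only guide type checkers,
    # so client("lakeformation").start_transaction(...) type-checks against the stub.
    return boto3.client(service_name)
```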
20 changes: 16 additions & 4 deletions awswrangler/athena/_utils.py
@@ -31,7 +31,7 @@
from awswrangler import _data_types, _utils, catalog, exceptions, s3, sts, typing
from awswrangler._config import apply_configs
from awswrangler._sql_formatter import _process_sql_params
-from awswrangler.catalog._utils import _catalog_id
+from awswrangler.catalog._utils import _catalog_id, _transaction_id

from ._cache import _cache_manager, _CacheInfo, _check_for_cached_results, _LocalMetadataCacheManager

@@ -1034,6 +1036,8 @@ def show_create_table(
def generate_create_query(
table: str,
database: Optional[str] = None,
+    transaction_id: Optional[str] = None,
+    query_as_of_time: Optional[str] = None,
catalog_id: Optional[str] = None,
boto3_session: Optional[boto3.Session] = None,
) -> str:
@@ -1047,6 +1049,11 @@ def generate_create_query(
Table name.
database : str
Database name.
+    transaction_id: str, optional
+        The ID of the transaction.
+    query_as_of_time: str, optional
+        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
+        Cannot be specified alongside transaction_id.
catalog_id : str, optional
The ID of the Data Catalog from which to retrieve Databases.
If none is provided, the AWS account ID is used by default.
@@ -1084,9 +1091,14 @@ def parse_properties(parameters: Dict[str, str]) -> str:
return ", \n".join(properties_str)

client_glue = _utils.client(service_name="glue", session=boto3_session)
-    table_detail = client_glue.get_table(**_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table))[
-        "Table"
-    ]
+    table_detail = client_glue.get_table(
+        **_catalog_id(
+            catalog_id=catalog_id,
+            **_transaction_id(
+                transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
+            ),
+        )
+    )["Table"]
if table_detail["TableType"] == "VIRTUAL_VIEW":
glue_base64_query: str = table_detail["ViewOriginalText"].replace("/* Presto View: ", "").replace(" */", "")
glue_query: str = json.loads(base64.b64decode(glue_base64_query))["originalSql"]
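With the two parameters threaded through `_transaction_id` into Glue's `GetTable` call, the generated DDL can target a Governed table snapshot. A usage sketch based on the signature above (table and database names, and the epoch timestamp, are placeholders):

```python
import awswrangler as wr

# DDL for the table as of a point in time (Unix epoch seconds, passed as a string).
ddl = wr.athena.generate_create_query(
    table="my_governed_table",
    database="my_database",
    query_as_of_time="1680000000",
)
print(ddl)

# Alternatively, pin the read to an open Lake Formation transaction instead;
# transaction_id and query_as_of_time cannot be combined.
# ddl = wr.athena.generate_create_query(
#     table="my_governed_table", database="my_database", transaction_id=txn_id
# )
```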
17 changes: 14 additions & 3 deletions awswrangler/catalog/_add.py
@@ -14,7 +14,7 @@
_parquet_partition_definition,
_update_table_definition,
)
-from awswrangler.catalog._utils import _catalog_id, sanitize_table_name
+from awswrangler.catalog._utils import _catalog_id, _transaction_id, sanitize_table_name

_logger: logging.Logger = logging.getLogger(__name__)

@@ -301,6 +301,7 @@ def add_column(
column_name: str,
column_type: str = "string",
column_comment: Optional[str] = None,
+    transaction_id: Optional[str] = None,
boto3_session: Optional[boto3.Session] = None,
catalog_id: Optional[str] = None,
) -> None:
@@ -318,6 +319,8 @@ def add_column(
Column type.
column_comment : str
Column Comment
+    transaction_id: str, optional
+        The ID of the transaction (i.e. used with GOVERNED tables).
boto3_session : boto3.Session(), optional
Boto3 Session. The default boto3 session will be used if boto3_session receive None.
catalog_id : str, optional
@@ -341,13 +344,21 @@
"""
if _check_column_type(column_type):
client_glue = _utils.client(service_name="glue", session=boto3_session)
-        table_res = client_glue.get_table(**_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table))
+        table_res = client_glue.get_table(
+            **_catalog_id(
+                catalog_id=catalog_id,
+                **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
+            )
+        )
table_input: Dict[str, Any] = _update_table_definition(table_res)
table_input["StorageDescriptor"]["Columns"].append(
{"Name": column_name, "Type": column_type, "Comment": column_comment}
)
res: Dict[str, Any] = client_glue.update_table(
-            **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
+            **_catalog_id(
+                catalog_id=catalog_id,
+                **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableInput=table_input),
+            )
)
if ("Errors" in res) and res["Errors"]:
for error in res["Errors"]:
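For Governed tables, the new `transaction_id` parameter lets the schema change participate in a Lake Formation transaction alongside other reads and writes. A sketch of the intended call pattern; the `start_transaction`, `commit_transaction`, and `cancel_transaction` helpers are assumed to come from the new `wr.lakeformation` module:

```python
import awswrangler as wr

# Assumed API: begin a read/write Lake Formation transaction.
txn_id = wr.lakeformation.start_transaction(read_only=False)
try:
    wr.catalog.add_column(
        database="my_database",
        table="my_governed_table",
        column_name="ingest_ts",
        column_type="timestamp",
        column_comment="Ingestion timestamp",
        transaction_id=txn_id,
    )
    wr.lakeformation.commit_transaction(transaction_id=txn_id)
except Exception:
    # Roll back the transaction so the Governed table is left untouched.
    wr.lakeformation.cancel_transaction(transaction_id=txn_id)
    raise
```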