2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -224,7 +224,7 @@ To run all database MySQL tests (Using 8 parallel processes):

``pytest -n 8 tests/unit/test_mysql.py``

-To run all tests for all python versions (assuming Amazon QuickSight is activated and the optional stack deployed):
+To run all tests for all python versions (assuming Amazon QuickSight is activated and the optional stacks deployed):

``./test.sh``

2 changes: 2 additions & 0 deletions README.md
@@ -150,6 +150,7 @@ The quickest way to get started is to use AWS Glue with Ray. Read our [docs](htt
- [029 - S3 Select](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/029%20-%20S3%20Select.ipynb)
- [030 - Data Api](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/030%20-%20Data%20Api.ipynb)
- [031 - OpenSearch](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/031%20-%20OpenSearch.ipynb)
+- [032 - Lake Formation Governed Tables](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/032%20-%20Lake%20Formation%20Governed%20Tables.ipynb)
- [033 - Amazon Neptune](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/033%20-%20Amazon%20Neptune.ipynb)
- [034 - Distributing Calls Using Ray](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/034%20-%20Distributing%20Calls%20using%20Ray.ipynb)
- [035 - Distributing Calls on Ray Remote Cluster](https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/035%20-%20Distributing%20Calls%20on%20Ray%20Remote%20Cluster.ipynb)
@@ -161,6 +162,7 @@ The quickest way to get started is to use AWS Glue with Ray. Read our [docs](htt
- [Amazon S3](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#amazon-s3)
- [AWS Glue Catalog](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#aws-glue-catalog)
- [Amazon Athena](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#amazon-athena)
+- [AWS Lake Formation](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#aws-lake-formation)
- [Amazon Redshift](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#amazon-redshift)
- [PostgreSQL](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#postgresql)
- [MySQL](https://aws-sdk-pandas.readthedocs.io/en/3.0.0/api.html#mysql)
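The tutorial and API entries added above point at the new `wr.lakeformation` module. For orientation, a minimal sketch of the kind of call it enables; the exact `read_sql_query` signature here is an assumption modeled on the module's Athena counterpart, and the table and database names are placeholders:

```python
import awswrangler as wr

# Read a GOVERNED table through Lake Formation's transactional query API
# (assumed signature, mirroring wr.athena.read_sql_query).
df = wr.lakeformation.read_sql_query(
    sql="SELECT * FROM my_governed_table LIMIT 10;",
    database="my_database",
)
```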
2 changes: 2 additions & 0 deletions awswrangler/__init__.py
@@ -17,6 +17,7 @@
dynamodb,
emr,
exceptions,
+    lakeformation,
mysql,
neptune,
opensearch,
@@ -53,6 +54,7 @@
"s3",
"sts",
"redshift",
"lakeformation",
"mysql",
"neptune",
"postgresql",
20 changes: 20 additions & 0 deletions awswrangler/_config.py
@@ -41,6 +41,7 @@ class _ConfigArg(NamedTuple):
"max_local_cache_entries": _ConfigArg(dtype=int, nullable=False, parent_parameter_key="athena_cache_settings"),
"athena_query_wait_polling_delay": _ConfigArg(dtype=float, nullable=False),
"cloudwatch_query_wait_polling_delay": _ConfigArg(dtype=float, nullable=False),
"lakeformation_query_wait_polling_delay": _ConfigArg(dtype=float, nullable=False),
"s3_block_size": _ConfigArg(dtype=int, nullable=False, enforced=True),
"workgroup": _ConfigArg(dtype=str, nullable=False, enforced=True),
"chunksize": _ConfigArg(dtype=int, nullable=False, enforced=True),
@@ -53,6 +54,7 @@ class _ConfigArg(NamedTuple):
"redshift_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"kms_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"emr_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"lakeformation_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"dynamodb_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"secretsmanager_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
"timestream_query_endpoint_url": _ConfigArg(dtype=str, nullable=True, enforced=True, loaded=True),
@@ -344,6 +346,15 @@ def cloudwatch_query_wait_polling_delay(self) -> float:
def cloudwatch_query_wait_polling_delay(self, value: float) -> None:
self._set_config_value(key="cloudwatch_query_wait_polling_delay", value=value)

+    @property
+    def lakeformation_query_wait_polling_delay(self) -> float:
+        """Property lakeformation_query_wait_polling_delay."""
+        return cast(float, self["lakeformation_query_wait_polling_delay"])
+
+    @lakeformation_query_wait_polling_delay.setter
+    def lakeformation_query_wait_polling_delay(self, value: float) -> None:
+        self._set_config_value(key="lakeformation_query_wait_polling_delay", value=value)
+
@property
def s3_block_size(self) -> int:
"""Property s3_block_size."""
@@ -443,6 +454,15 @@ def emr_endpoint_url(self) -> Optional[str]:
def emr_endpoint_url(self, value: Optional[str]) -> None:
self._set_config_value(key="emr_endpoint_url", value=value)

+    @property
+    def lakeformation_endpoint_url(self) -> Optional[str]:
+        """Property lakeformation_endpoint_url."""
+        return cast(Optional[str], self["lakeformation_endpoint_url"])
+
+    @lakeformation_endpoint_url.setter
+    def lakeformation_endpoint_url(self, value: Optional[str]) -> None:
+        self._set_config_value(key="lakeformation_endpoint_url", value=value)
+
@property
def dynamodb_endpoint_url(self) -> Optional[str]:
"""Property dynamodb_endpoint_url."""
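Both new keys follow the library's global-config pattern: a `_ConfigArg` registration plus a property/setter pair backed by `_set_config_value`. They should therefore be settable the same way as the existing Athena and CloudWatch equivalents. A sketch; the `WR_`-prefixed environment variable name is an assumption based on how the other `loaded=True` config keys work:

```python
import awswrangler as wr

# Poll Lake Formation query status every 2 seconds instead of the default delay.
wr.config.lakeformation_query_wait_polling_delay = 2.0

# Route Lake Formation API calls to a specific regional (or VPC) endpoint.
wr.config.lakeformation_endpoint_url = "https://lakeformation.us-east-1.amazonaws.com"

# Assumed equivalent environment variable: WR_LAKEFORMATION_ENDPOINT_URL
```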
36 changes: 35 additions & 1 deletion awswrangler/_utils.py
@@ -50,10 +50,10 @@
from mypy_boto3_emr.client import EMRClient
from mypy_boto3_glue import GlueClient
from mypy_boto3_kms.client import KMSClient
+    from mypy_boto3_lakeformation.client import LakeFormationClient
from mypy_boto3_logs.client import CloudWatchLogsClient
from mypy_boto3_opensearch.client import OpenSearchServiceClient
from mypy_boto3_opensearchserverless.client import OpenSearchServiceServerlessClient
-    from mypy_boto3_opensearchserverless.literals import ServiceName
from mypy_boto3_quicksight.client import QuickSightClient
from mypy_boto3_rds_data.client import RDSDataServiceClient
from mypy_boto3_redshift.client import RedshiftClient
@@ -65,6 +65,28 @@
from mypy_boto3_timestream_write.client import TimestreamWriteClient
from typing_extensions import Literal

+    ServiceName = Literal[
+        "athena",
+        "dynamodb",
+        "ec2",
+        "emr",
+        "glue",
+        "kms",
+        "lakeformation",
+        "logs",
+        "opensearch",
+        "opensearchserverless",
+        "quicksight",
+        "rds-data",
+        "redshift-data",
+        "redshift",
+        "s3",
+        "secretsmanager",
+        "sts",
+        "timestream-query",
+        "timestream-write",
+    ]
+
_logger: logging.Logger = logging.getLogger(__name__)

Boto3PrimitivesType = Dict[str, Optional[str]]
@@ -233,6 +255,8 @@ def _get_endpoint_url(service_name: str) -> Optional[str]:
endpoint_url = _config.config.kms_endpoint_url
elif service_name == "emr" and _config.config.emr_endpoint_url is not None:
endpoint_url = _config.config.emr_endpoint_url
elif service_name == "lakeformation" and _config.config.lakeformation_endpoint_url is not None:
endpoint_url = _config.config.lakeformation_endpoint_url
elif service_name == "dynamodb" and _config.config.dynamodb_endpoint_url is not None:
endpoint_url = _config.config.dynamodb_endpoint_url
elif service_name == "secretsmanager" and _config.config.secretsmanager_endpoint_url is not None:
@@ -255,6 +279,16 @@ def client(
...


+@overload
+def client(
+    service_name: 'Literal["lakeformation"]',
+    session: Optional[boto3.Session] = None,
+    botocore_config: Optional[Config] = None,
+    verify: Optional[Union[str, bool]] = None,
+) -> "LakeFormationClient":
+    ...
+
+
@overload
def client(
service_name: 'Literal["logs"]',
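Defining `ServiceName` in-module, rather than importing the incidental literal from the `opensearchserverless` stubs, lets the `client` factory enumerate exactly the services the library wraps. Each `@overload` then pairs one service literal with its `mypy-boto3` stub type, so call sites get a fully typed client. A self-contained sketch of the pattern in isolation (names and the reduced signature are illustrative, not the library's):

```python
from typing import TYPE_CHECKING, Any, Literal, overload

import boto3

if TYPE_CHECKING:
    # Stub-only imports, mirroring awswrangler/_utils.py (requires mypy-boto3-* packages).
    from mypy_boto3_lakeformation.client import LakeFormationClient
    from mypy_boto3_logs.client import CloudWatchLogsClient


@overload
def client(service_name: Literal["lakeformation"]) -> "LakeFormationClient":
    ...


@overload
def client(service_name: Literal["logs"]) -> "CloudWatchLogsClient":
    ...


def client(service_name: str) -> Any:
    # At runtime every branch is plain boto3; the overloads only guide type checkers,
    # so client("lakeformation").start_transaction(...) type-checks against the stub.
    return boto3.client(service_name)
```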
20 changes: 16 additions & 4 deletions awswrangler/athena/_utils.py
@@ -31,7 +31,7 @@
from awswrangler import _data_types, _utils, catalog, exceptions, s3, sts, typing
from awswrangler._config import apply_configs
from awswrangler._sql_formatter import _process_sql_params
-from awswrangler.catalog._utils import _catalog_id
+from awswrangler.catalog._utils import _catalog_id, _transaction_id

from ._cache import _cache_manager, _CacheInfo, _check_for_cached_results, _LocalMetadataCacheManager

@@ -1034,6 +1036,8 @@ def show_create_table(
def generate_create_query(
table: str,
database: Optional[str] = None,
+    transaction_id: Optional[str] = None,
+    query_as_of_time: Optional[str] = None,
catalog_id: Optional[str] = None,
boto3_session: Optional[boto3.Session] = None,
) -> str:
@@ -1047,6 +1049,11 @@ def generate_create_query(
Table name.
database : str
Database name.
+    transaction_id: str, optional
+        The ID of the transaction.
+    query_as_of_time: str, optional
+        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
+        Cannot be specified alongside transaction_id.
catalog_id : str, optional
The ID of the Data Catalog from which to retrieve Databases.
If none is provided, the AWS account ID is used by default.
@@ -1084,9 +1091,14 @@ def parse_properties(parameters: Dict[str, str]) -> str:
return ", \n".join(properties_str)

client_glue = _utils.client(service_name="glue", session=boto3_session)
-    table_detail = client_glue.get_table(**_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table))[
-        "Table"
-    ]
+    table_detail = client_glue.get_table(
+        **_catalog_id(
+            catalog_id=catalog_id,
+            **_transaction_id(
+                transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
+            ),
+        )
+    )["Table"]
if table_detail["TableType"] == "VIRTUAL_VIEW":
glue_base64_query: str = table_detail["ViewOriginalText"].replace("/* Presto View: ", "").replace(" */", "")
glue_query: str = json.loads(base64.b64decode(glue_base64_query))["originalSql"]
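With the two parameters threaded through `_transaction_id` into Glue's `GetTable` call, the generated DDL can target a Governed table snapshot. A usage sketch based on the signature above (table and database names, and the epoch timestamp, are placeholders):

```python
import awswrangler as wr

# DDL for the table as of a point in time (Unix epoch seconds, passed as a string).
ddl = wr.athena.generate_create_query(
    table="my_governed_table",
    database="my_database",
    query_as_of_time="1680000000",
)
print(ddl)

# Alternatively, pin the read to an open Lake Formation transaction instead;
# transaction_id and query_as_of_time cannot be combined.
# ddl = wr.athena.generate_create_query(
#     table="my_governed_table", database="my_database", transaction_id=txn_id
# )
```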
17 changes: 14 additions & 3 deletions awswrangler/catalog/_add.py
@@ -14,7 +14,7 @@
_parquet_partition_definition,
_update_table_definition,
)
-from awswrangler.catalog._utils import _catalog_id, sanitize_table_name
+from awswrangler.catalog._utils import _catalog_id, _transaction_id, sanitize_table_name

_logger: logging.Logger = logging.getLogger(__name__)

@@ -301,6 +301,7 @@ def add_column(
column_name: str,
column_type: str = "string",
column_comment: Optional[str] = None,
+    transaction_id: Optional[str] = None,
boto3_session: Optional[boto3.Session] = None,
catalog_id: Optional[str] = None,
) -> None:
@@ -318,6 +319,8 @@ def add_column(
Column type.
column_comment : str
Column Comment
+    transaction_id: str, optional
+        The ID of the transaction (i.e. used with GOVERNED tables).
boto3_session : boto3.Session(), optional
Boto3 Session. The default boto3 session will be used if boto3_session receive None.
catalog_id : str, optional
@@ -341,13 +344,21 @@
"""
if _check_column_type(column_type):
client_glue = _utils.client(service_name="glue", session=boto3_session)
-        table_res = client_glue.get_table(**_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table))
+        table_res = client_glue.get_table(
+            **_catalog_id(
+                catalog_id=catalog_id,
+                **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
+            )
+        )
table_input: Dict[str, Any] = _update_table_definition(table_res)
table_input["StorageDescriptor"]["Columns"].append(
{"Name": column_name, "Type": column_type, "Comment": column_comment}
)
res: Dict[str, Any] = client_glue.update_table(
-            **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
+            **_catalog_id(
+                catalog_id=catalog_id,
+                **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableInput=table_input),
+            )
)
if ("Errors" in res) and res["Errors"]:
for error in res["Errors"]:
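For Governed tables, the new `transaction_id` parameter lets the schema change participate in a Lake Formation transaction alongside other reads and writes. A sketch of the intended call pattern; the `start_transaction`, `commit_transaction`, and `cancel_transaction` helpers are assumed to come from the new `wr.lakeformation` module:

```python
import awswrangler as wr

# Assumed API: begin a read/write Lake Formation transaction.
txn_id = wr.lakeformation.start_transaction(read_only=False)
try:
    wr.catalog.add_column(
        database="my_database",
        table="my_governed_table",
        column_name="ingest_ts",
        column_type="timestamp",
        column_comment="Ingestion timestamp",
        transaction_id=txn_id,
    )
    wr.lakeformation.commit_transaction(transaction_id=txn_id)
except Exception:
    # Roll back the transaction so the Governed table is left untouched.
    wr.lakeformation.cancel_transaction(transaction_id=txn_id)
    raise
```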