From 0ecf533b0f82e7e7d09ef761bab72f691ab0dc88 Mon Sep 17 00:00:00 2001 From: jiajie Date: Thu, 21 May 2020 20:35:27 +0800 Subject: [PATCH 1/2] Fix get_region_from_subnet bug When using a region like 'ap-south-1', get_region_from_subnet(***) in _utils.py returns 'ap-south-' as the region name. Add get_region_from_session to fix it. --- awswrangler/_utils.py | 7 +++++++ awswrangler/emr.py | 14 ++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index e4a6a16dd..7aae75b26 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -187,9 +187,16 @@ def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session """Extract region from Subnet ID.""" session: boto3.Session = ensure_session(session=boto3_session) client_ec2: boto3.client = client(service_name="ec2", session=session) + # This is wrong, when using region ap-south-1 return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9] +def get_region_from_session(boto3_session: Optional[boto3.Session] = None) -> str: + """Extract region from session.""" + session: boto3.Session = ensure_session(session=boto3_session) + return session.region_name + + def extract_partitions_from_paths( path: str, paths: List[str] ) -> Tuple[Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 5a93d752d..bf20193cc 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -53,7 +53,7 @@ def _get_default_logging_path( _account_id = account_id if (region is None) and (subnet_id is not None): boto3_session = _utils.ensure_session(session=boto3_session) - _region: str = _utils.get_region_from_subnet(subnet_id=subnet_id, boto3_session=boto3_session) + _region: str = _utils.get_region_from_session(boto3_session=boto3_session) elif (region is None) and (subnet_id is None): raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.") 
else: @@ -63,7 +63,7 @@ def _get_default_logging_path( def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-statements account_id: str = _utils.get_account_id(boto3_session=pars["boto3_session"]) - region: str = _utils.get_region_from_subnet(subnet_id=pars["subnet_id"], boto3_session=pars["boto3_session"]) + region: str = _utils.get_region_from_session(boto3_session=pars["boto3_session"]) # S3 Logging path if pars.get("logging_s3_path") is None: @@ -155,6 +155,7 @@ def _build_cluster_args(**pars): # pylint: disable=too-many-branches,too-many-s ], } ) + if spark_env is not None: args["Configurations"].append( { @@ -934,7 +935,8 @@ def submit_ecr_credentials_refresh( session: boto3.Session = _utils.ensure_session(session=boto3_session) client_s3: boto3.client = _utils.client(service_name="s3", session=session) bucket, key = _utils.parse_path(path=path_script) - client_s3.put_object(Body=_get_ecr_credentials_refresh_content().encode(encoding="utf-8"), Bucket=bucket, Key=key) + region: str = _utils.get_region_from_session(boto3_session=boto3_session) + client_s3.put_object(Body=_get_ecr_credentials_refresh_content(region).encode(encoding="utf-8"), Bucket=bucket, Key=key) command: str = f"spark-submit --deploy-mode cluster {path_script}" name: str = "ECR Credentials Refresh" step: Dict[str, Any] = build_step( @@ -946,14 +948,14 @@ def submit_ecr_credentials_refresh( return response["StepIds"][0] -def _get_ecr_credentials_refresh_content() -> str: - return """ +def _get_ecr_credentials_refresh_content(region) -> str: + return f""" import subprocess from pyspark.sql import SparkSession spark = SparkSession.builder.appName("ECR Setup Job").getOrCreate() COMMANDS = [ - "sudo -s eval $(aws ecr get-login --region us-east-1 --no-include-email)", + "sudo -s eval $(aws ecr get-login --region {region} --no-include-email)", "sudo hdfs dfs -put -f /root/.docker/config.json /user/hadoop/" ] From fa0e0b18888a840dc60d02d88771089e0fdfec00 Mon Sep 17 00:00:00 
2001 From: jiajie Date: Thu, 21 May 2020 21:12:15 +0800 Subject: [PATCH 2/2] Fix awswrangler/emr.py:939:121: E501 line too long (124 > 120 characters) --- awswrangler/emr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/awswrangler/emr.py b/awswrangler/emr.py index bf20193cc..e8114c698 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -936,7 +936,8 @@ def submit_ecr_credentials_refresh( client_s3: boto3.client = _utils.client(service_name="s3", session=session) bucket, key = _utils.parse_path(path=path_script) region: str = _utils.get_region_from_session(boto3_session=boto3_session) - client_s3.put_object(Body=_get_ecr_credentials_refresh_content(region).encode(encoding="utf-8"), Bucket=bucket, Key=key) + client_s3.put_object( + Body=_get_ecr_credentials_refresh_content(region).encode(encoding="utf-8"), Bucket=bucket, Key=key) command: str = f"spark-submit --deploy-mode cluster {path_script}" name: str = "ECR Credentials Refresh" step: Dict[str, Any] = build_step(