-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Description
I'm using Spark Streaming on EKS (launched via spark-submit). I am able to read from Kinesis, but when syncing with the Glue Data Catalog I get an "AWS credentials not found" error. I'm using a web identity token; the EKS cluster has an IAM role with access to S3 and Glue, and, as suggested, I am also packaging the STS module.
Spark submit command:
$SPARK_HOME/bin/spark-submit
--class $MAIN_PATH
--master $MASTER
--conf spark.kubernetes.container.image=$REPO
--conf spark.driver.memory=4g
--conf spark.executor.memory=4g
--deploy-mode cluster
--conf spark.kubernetes.namespace=eternal-spark
--conf spark.kubernetes.authenticate.driver.serviceAccountName=eternal-admin
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
--conf spark.hadoop.fs.s3a.endpoint=s3.us-east-1.amazonaws.com
--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.15.0,org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.12.773,org.apache.hudi:hudi-aws-bundle:0.15.0,software.amazon.awssdk:sts:2.17.42
local:///opt/river-1.0.jar 20
The exception I'm getting is:
24/10/10 19:00:29 WARN HiveSyncTool: Unable to create database
org.apache.hudi.aws.sync.HoodieGlueSyncException: Fail to check if database exists test_akshay
at org.apache.hudi.aws.sync.AWSGlueCatalogSyncClient.databaseExists(AWSGlueCatalogSyncClient.java:756) ~[org.apache.hudi_hudi-aws-bundle-0.15.0.jar:0.15.0]
at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:229) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.hive.HiveSyncTool.doSync(HiveSyncTool.java:179) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:167) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.sync.common.util.SyncUtilHelpers.runHoodieMetaSync(SyncUtilHelpers.java:79) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieSparkSqlWriterInternal.$anonfun$metaSync$2(HoodieSparkSqlWriter.scala:1015) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at scala.collection.mutable.HashSet.foreach(HashSet.scala:79) ~[scala-library-2.12.17.jar:?]
at org.apache.hudi.HoodieSparkSqlWriterInternal.metaSync(HoodieSparkSqlWriter.scala:1013) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieSparkSqlWriterInternal.commitAndPerformPostOperations(HoodieSparkSqlWriter.scala:1112) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieSparkSqlWriterInternal.writeInternal(HoodieSparkSqlWriter.scala:508) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieSparkSqlWriterInternal.write(HoodieSparkSqlWriter.scala:187) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:125) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieStreamingSink.$anonfun$addBatch$3(HoodieStreamingSink.scala:141) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at scala.util.Try$.apply(Try.scala:213) ~[scala-library-2.12.17.jar:?]
at org.apache.hudi.HoodieStreamingSink.$anonfun$addBatch$2(HoodieStreamingSink.scala:133) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieStreamingSink.retry(HoodieStreamingSink.scala:237) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.hudi.HoodieStreamingSink.addBatch(HoodieStreamingSink.scala:132) ~[org.apache.hudi_hudi-spark3-bundle_2.12-0.15.0.jar:0.15.0]
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$17(MicroBatchExecution.scala:732) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:108) ~[spark-catalyst_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:264) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:138) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$9(SQLExecution.scala:174) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:108) ~[spark-catalyst_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:264) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$8(SQLExecution.scala:174) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:285) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:173) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:901) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:70) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:729) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:427) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:425) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:67) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:729) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:286) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) ~[scala-library-2.12.17.jar:?]
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:427) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:425) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:67) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:249) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:67) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:239) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:311) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) ~[scala-library-2.12.17.jar:?]
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:901) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:289) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.$anonfun$run$1(StreamExecution.scala:211) ~[spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) [scala-library-2.12.17.jar:?]
at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94) [spark-core_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:211) [spark-sql_2.12-3.5.1-amzn-0.jar:3.5.1-amzn-0]
Caused by: java.util.concurrent.ExecutionException: org.apache.hudi.software.amazon.awssdk.core.exception.SdkClientException: Unable to load credentials from any of the providers in the chain AwsCredentialsProviderChain(credentialsProviders=[SystemPropertyCredentialsProvider(), EnvironmentVariableCredentialsProvider(), WebIdentityTokenCredentialsProvider(), ProfileCredentialsProvider(profileName=default, profileFile=ProfileFile(profilesAndSectionsMap=[])), ContainerCredentialsProvider(), InstanceProfileCredentialsProvider()]) : [SystemPropertyCredentialsProvider(): Unable to load credentials from system settings. Access key must be specified either via environment variable (AWS_ACCESS_KEY_ID) or system property (aws.accessKeyId)., EnvironmentVariableCredentialsProvider(): Unable to load credentials from system settings. Access key must be specified either via environment variable (AWS_ACCESS_KEY_ID) or system property (aws.accessKeyId)., WebIdentityTokenCredentialsProvider(): To use web identity tokens, the 'sts' service module must be on the class path., ProfileCredentialsProvider(profileName=default, profileFile=ProfileFile(profilesAndSectionsMap=[])): Profile file contained no credentials for profile 'default': ProfileFile(profilesAndSectionsMap=[]), ContainerCredentialsProvider(): Cannot fetch credentials from container - neither AWS_CONTAINER_CREDENTIALS_FULL_URI or AWS_CONTAINER_CREDENTIALS_RELATIVE_URI environment variables are set., InstanceProfileCredentialsProvider(): Failed to load credentials from IMDS.]
at java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:396) ~[?:?]
at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:2073) ~[?:?]
at org.apache.hudi.aws.sync.AWSGlueCatalogSyncClient.databaseExists(AWSGlueCatalogSyncClient.java:750) ~[org.apache.hudi_hudi-aws-bundle-0.15.0.jar:0.15.0]
... 49 more
Caused by: org.apache.hudi.software.amazon.awssdk.core.exception.SdkClientException: Unable to load credentials from any of the providers in the chain AwsCredentialsProviderChain(credentialsProviders=[SystemPropertyCredentialsProvider(), EnvironmentVariableCredentialsProvider(), WebIdentityTokenCredentialsProvider(), ProfileCredentialsProvider(profileName=default, profileFile=ProfileFile(profilesAndSectionsMap=[])), ContainerCredentialsProvider(), InstanceProfileCredentialsProvider()]) : [SystemPropertyCredentialsProvider(): Unable to load credentials from system settings. Access key must be specified either via environment variable (AWS_ACCESS_KEY_ID) or system property (aws.accessKeyId)., EnvironmentVariableCredentialsProvider(): Unable to load credentials from system settings. Access key must be specified either via environment variable (AWS_ACCESS_KEY_ID) or system property (aws.accessKeyId)., WebIdentityTokenCredentialsProvider(): To use web identity tokens, the 'sts' service module must be on the class path., ProfileCredentialsProvider(profileName=default, profileFile=ProfileFile(profilesAndSectionsMap=[])): Profile file contained no credentials for profile 'default': ProfileFile(profilesAndSectionsMap=[]), ContainerCredentialsProvider(): Cannot fetch credentials from container - neither AWS_CONTAINER_CREDENTIALS_FULL_URI or AWS_CONTAINER_CREDENTIALS_RELATIVE_URI environment variables are set., InstanceProfileCredentialsProvider(): Failed to load credentials from IMDS.]
I also tried with deliberately wrong credentials and got a 403, which is expected — so at least it is reaching S3. The issue therefore seems to be that the STS module is not being recognized. The major concern is my Gradle dependency setup, shown below:
implementation("org.apache.spark:spark-core_2.12:3.5.1")
implementation("org.apache.spark:spark-streaming_2.12:3.5.1")
implementation("org.apache.hudi:hudi-spark3-bundle_2.12:0.15.0")
compileOnly("org.apache.spark:spark-sql_2.12:3.5.1")
implementation("org.apache.hudi:hudi-aws-bundle:0.15.0")
implementation("org.apache.hadoop:hadoop-aws:2.10.2")
implementation("com.amazonaws:aws-java-sdk-bundle:1.12.773")
implementation("org.apache.hadoop:hadoop-common:2.10.2")
implementation("org.apache.hadoop:hadoop-client:2.10.2")
testImplementation(platform("org.junit:junit-bom:5.10.0"))
implementation("software.amazon.awssdk:secretsmanager:2.10.2")
implementation("software.amazon.awssdk:sts:2.10.2")
hudi-aws-bundle uses Hadoop 2.x together with AWS SDK v2, while hadoop-aws uses AWS SDK v1. The AWS SDK v2 integration for hadoop-aws comes only with Hadoop 3.x, which Hudi doesn't ship with — could this version mismatch be causing the issue?
I tried importing both STS 1.x and 2.x; both give the same error. If you have a working Gradle or POM configuration for EKS, it would be very helpful.
(The IAM roles and service accounts have the required access — I am able to read from Kinesis, but the Hive sync to Glue fails with the exception above.)
Metadata
Metadata
Assignees
Labels
Type
Projects
Status