-
Notifications
You must be signed in to change notification settings - Fork 29.2k
Why didn't the Executor fail after the Shuffle connection timeout? #55092
Copy link
Copy link
Open
Description
Spark Version: 3.1.4
JDK Version: 1.8.0_462-462
OS: Rocky 8.10
One of the Executors for the tasks is always stuck in a running state. The shuffle fetch was retried three times, but the Executor did not terminate its execution.
The shuffle config is:
- spark.shuffle.io.maxRetries:3
- spark.shuffle.io.connectionTimeout:300s
- spark.shuffle.io.retryWait:5s
- spark.network.timeout:600s
Error message:
2026-03-30 08:06:45,328 ERROR [Executor task launch worker for task 13.0 in stage 49120.0 (TID 2467856)] shuffle.RetryingBlockFetcher:155 - Exception while beginning fetch of 1 outstanding blocks
java.io.IOException: Failed to connect to node12/172.15.122.12:44199
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:287) ~[spark-network-common_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218) ~[spark-network-common_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:123) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:153) ~[spark-network-shuffle_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.shuffle.RetryingBlockFetcher.start(RetryingBlockFetcher.java:133) ~[spark-network-shuffle_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.netty.NettyBlockTransferService.fetchBlocks(NettyBlockTransferService.scala:143) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:102) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.storage.BlockManager.fetchRemoteManagedBuffer(BlockManager.scala:1061) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.storage.BlockManager.$anonfun$getRemoteBlock$8(BlockManager.scala:1005) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at scala.Option.orElse(Option.scala:447) ~[scala-library-2.12.10.jar:?]
at org.apache.spark.storage.BlockManager.getRemoteBlock(BlockManager.scala:1005) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:1143) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBlocks$1(TorrentBroadcast.scala:180) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23) ~[scala-library-2.12.10.jar:?]
at scala.collection.immutable.List.foreach(List.scala:392) ~[scala-library-2.12.10.jar:?]
at org.apache.spark.broadcast.TorrentBroadcast.readBlocks(TorrentBroadcast.scala:169) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$4(TorrentBroadcast.scala:253) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at scala.Option.getOrElse(Option.scala:189) ~[scala-library-2.12.10.jar:?]
at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$2(TorrentBroadcast.scala:231) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.util.KeyLock.withLock(KeyLock.scala:64) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$1(TorrentBroadcast.scala:226) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1405) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:226) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:103) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.iceberg.spark.source.SparkWrite$WriterFactory.createWriter(SparkWrite.java:599) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at org.apache.iceberg.spark.source.SparkWrite$WriterFactory.createWriter(SparkWrite.java:594) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:408) ~[spark-sql_2.12-3.1.2.jar:?]
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:360) ~[spark-sql_2.12-3.1.2.jar:?]
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.scheduler.Task.run(Task.scala:131) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439) [spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500) [spark-core_2.12-3.1.2.jar:3.1.2]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_462-462]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_462-462]
at java.lang.Thread.run(Thread.java:750) [?:1.8.0_462-462]
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: node12/172.15.122.12:44199
Caused by: java.net.ConnectException: Connection refused
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) ~[?:1.8.0_462-462]
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:716) ~[?:1.8.0_462-462]
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:330) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:334) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:702) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
... 1 more
2026-03-30 08:06:45,780 INFO [Executor task launch worker for task 13.0 in stage 49120.0 (TID 2467856)] shuffle.RetryingBlockFetcher:176 - Retrying fetch (1/3) for 1 outstanding blocks after 5000 ms
2026-03-30 08:06:50,782 INFO [Block Fetch Retry-8-1] client.TransportClientFactory:206 - Found inactive connection to node12/172.15.122.12:44199, creating a new one.
2026-03-30 08:06:50,785 ERROR [Block Fetch Retry-8-1] shuffle.RetryingBlockFetcher:155 - Exception while beginning fetch of 1 outstanding blocks (after 1 retries)
java.io.IOException: Failed to connect to node12/172.15.122.12:44199
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:287) ~[spark-network-common_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218) ~[spark-network-common_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:123) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:153) ~[spark-network-shuffle_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.shuffle.RetryingBlockFetcher.lambda$initiateRetry$0(RetryingBlockFetcher.java:181) ~[spark-network-shuffle_2.12-3.1.2.jar:3.1.2]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [?:1.8.0_462-462]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_462-462]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_462-462]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_462-462]
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) [iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at java.lang.Thread.run(Thread.java:750) [?:1.8.0_462-462]
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: node12/172.15.122.12:44199
Caused by: java.net.ConnectException: Connection refused
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) ~[?:1.8.0_462-462]
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:716) ~[?:1.8.0_462-462]
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:330) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:334) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:702) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
... 2 more
2026-03-30 08:06:50,786 INFO [Block Fetch Retry-8-1] shuffle.RetryingBlockFetcher:176 - Retrying fetch (2/3) for 1 outstanding blocks after 5000 ms
2026-03-30 08:06:51,645 INFO [Executor task launch worker for task 77.0 in stage 49120.0 (TID 2467923)] sort.UnsafeExternalSorter:216 - Thread 19722 spilling sort data of 4.1 GiB to disk (2 times so far)
2026-03-30 08:06:55,787 INFO [Block Fetch Retry-8-2] client.TransportClientFactory:206 - Found inactive connection to node12/172.15.122.12:44199, creating a new one.
2026-03-30 08:06:55,791 ERROR [Block Fetch Retry-8-2] shuffle.RetryingBlockFetcher:155 - Exception while beginning fetch of 1 outstanding blocks (after 2 retries)
java.io.IOException: Failed to connect to node12/172.15.122.12:44199
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:287) ~[spark-network-common_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218) ~[spark-network-common_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:123) ~[spark-core_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:153) ~[spark-network-shuffle_2.12-3.1.2.jar:3.1.2]
at org.apache.spark.network.shuffle.RetryingBlockFetcher.lambda$initiateRetry$0(RetryingBlockFetcher.java:181) ~[spark-network-shuffle_2.12-3.1.2.jar:3.1.2]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [?:1.8.0_462-462]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_462-462]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_462-462]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_462-462]
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) [iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at java.lang.Thread.run(Thread.java:750) [?:1.8.0_462-462]
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: node12/172.15.122.12:44199
Caused by: java.net.ConnectException: Connection refused
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) ~[?:1.8.0_462-462]
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:716) ~[?:1.8.0_462-462]
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:330) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:334) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:702) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) ~[netty-all-4.1.51.Final.jar:4.1.51.Final]
at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) ~[iceberg-spark-runtime-3.1_2.12-1.3.1.jar:?]
... 2 more
2026-03-30 08:06:55,794 INFO [Block Fetch Retry-8-2] shuffle.RetryingBlockFetcher:176 - Retrying fetch (3/3) for 1 outstanding blocks after 5000 ms
2026-03-30 08:06:58,283 INFO [Executor task launch worker for task 77.0 in stage 49120.0 (TID 2467923)] sort.UnsafeExternalSorter:216 - Thread 19722 spilling sort data of 4.1 GiB to disk (3 times so far)
2026-03-30 08:07:00,794 INFO [Block Fetch Retry-8-1] client.TransportClientFactory:206 - Found inactive connection to node12/172.15.122.12:44199, creating a new one.
2026-03-30 14:51:21,259 INFO [dispatcher-Executor] executor.YarnCoarseGrainedExecutorBackend:57 - Received tokens of 133bytes
2026-03-30 14:51:21,657 INFO [dispatcher-Executor] deploy.SparkHadoopUtil:57 - Updating delegation tokens for current user.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels
Type
Fields
Give feedback. No fields configured for issues without a type.