apache · bowenliang123 · Aug 11, 2024
diff --git a/README.md b/README.md
@@ -150,7 +150,7 @@ celeborn.worker.storage.dirs /mnt/disk1:disktype=SSD,/mnt/disk2:disktype=SSD
 
 # If Celeborn workers don't have local disks. You can use HDFS.
 # Do not set `celeborn.worker.storage.dirs` and use following configs.
-celeborn.storage.activeTypes HDFS
+celeborn.storage.availableTypes HDFS
 celeborn.worker.sortPartition.threads 64
 celeborn.worker.commitFiles.timeout 240s
 celeborn.worker.commitFiles.threads 128
@@ -199,7 +199,7 @@ celeborn.worker.storage.dirs /mnt/disk1:disktype=SSD,/mnt/disk2:disktype=SSD
 
 # If Celeborn workers don't have local disks. You can use HDFS.
 # Do not set `celeborn.worker.storage.dirs` and use following configs.
-celeborn.storage.activeTypes HDFS
+celeborn.storage.availableTypes HDFS
 celeborn.worker.sortPartition.threads 64
 celeborn.worker.commitFiles.timeout 240s
 celeborn.worker.commitFiles.threads 128

diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
@@ -1766,7 +1766,7 @@ object CelebornConf extends Logging {
       .categories("network")
       .version("0.2.0")
       .doc("Timeout for RPC ask operations. " +
-        "It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.activeTypes`")
+        "It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("60s")
 
@@ -2627,7 +2627,7 @@ object CelebornConf extends Logging {
       .categories("master")
       .version("0.3.0")
       .doc("Policy for master to assign slots, Celeborn supports two types of policy: roundrobin and loadaware. " +
-        "Loadaware policy will be ignored when `HDFS` is enabled in `celeborn.storage.activeTypes`")
+        "Loadaware policy will be ignored when `HDFS` is enabled in `celeborn.storage.availableTypes`")
       .stringConf
       .transform(_.toUpperCase(Locale.ROOT))
       .checkValues(Set(
@@ -3167,7 +3167,7 @@ object CelebornConf extends Logging {
     buildConf("celeborn.worker.replicate.fastFail.duration")
       .categories("worker")
       .doc("If a replicate request not replied during the duration, worker will mark the replicate data request as failed." +
-        "It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.activeTypes`.")
+        "It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`.")
       .version("0.2.0")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("60s")
@@ -3197,7 +3197,7 @@ object CelebornConf extends Logging {
       .categories("worker")
       .version("0.3.0")
       .doc("Thread number of worker to commit shuffle data files asynchronously. " +
-        "It's recommended to set at least `128` when `HDFS` is enabled in `celeborn.storage.activeTypes`.")
+        "It's recommended to set at least `128` when `HDFS` is enabled in `celeborn.storage.availableTypes`.")
       .intConf
       .createWithDefault(32)
 
@@ -3222,7 +3222,7 @@ object CelebornConf extends Logging {
       .withAlternative("celeborn.worker.shuffle.commit.timeout")
       .categories("worker")
       .doc("Timeout for a Celeborn worker to commit files of a shuffle. " +
-        "It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.activeTypes`.")
+        "It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`.")
       .version("0.3.0")
       .timeConf(TimeUnit.MILLISECONDS)
       .createWithDefaultString("120s")
@@ -3241,7 +3241,7 @@ object CelebornConf extends Logging {
       .withAlternative("celeborn.worker.partitionSorter.threads")
       .categories("worker")
       .doc("PartitionSorter's thread counts. " +
-        "It's recommended to set at least `64` when `HDFS` is enabled in `celeborn.storage.activeTypes`.")
+        "It's recommended to set at least `64` when `HDFS` is enabled in `celeborn.storage.availableTypes`.")
       .version("0.3.0")
       .intConf
       .createOptional
@@ -4010,7 +4010,7 @@ object CelebornConf extends Logging {
       .categories("client")
       .doc("When true, Celeborn worker will replicate shuffle data to another Celeborn worker " +
         "asynchronously to ensure the pushed shuffle data won't be lost after the node failure. " +
-        "It's recommended to set `false` when `HDFS` is enabled in `celeborn.storage.activeTypes`.")
+        "It's recommended to set `false` when `HDFS` is enabled in `celeborn.storage.availableTypes`.")
       .version("0.3.0")
       .booleanConf
       .createWithDefault(false)

diff --git a/common/src/test/scala/org/apache/celeborn/common/CelebornConfSuite.scala b/common/src/test/scala/org/apache/celeborn/common/CelebornConfSuite.scala
@@ -195,25 +195,25 @@ class CelebornConfSuite extends CelebornFunSuite {
 
   test("Test empty working dir") {
     val conf = new CelebornConf()
-    conf.set("celeborn.storage.activeTypes", "HDFS")
+    conf.set("celeborn.storage.availableTypes", "HDFS")
     conf.set("celeborn.storage.hdfs.dir", "hdfs:///xxx")
     assert(conf.workerBaseDirs.isEmpty)
 
-    conf.set("celeborn.storage.activeTypes", "SSD,HDD,HDFS")
+    conf.set("celeborn.storage.availableTypes", "SSD,HDD,HDFS")
     conf.set("celeborn.storage.hdfs.dir", "hdfs:///xxx")
     assert(conf.workerBaseDirs.isEmpty)
 
-    conf.set("celeborn.storage.activeTypes", "SSD,HDD")
+    conf.set("celeborn.storage.availableTypes", "SSD,HDD")
     assert(!conf.workerBaseDirs.isEmpty)
   }
 
   test("Test commit file threads") {
     val conf = new CelebornConf()
-    conf.set("celeborn.storage.activeTypes", "HDFS")
+    conf.set("celeborn.storage.availableTypes", "HDFS")
     conf.set("celeborn.storage.hdfs.dir", "hdfs:///xxx")
     assert(conf.workerCommitThreads === 128)
 
-    conf.set("celeborn.storage.activeTypes", "SSD,HDD")
+    conf.set("celeborn.storage.availableTypes", "SSD,HDD")
     assert(conf.workerCommitThreads === 32)
   }
 

diff --git a/docs/configuration/client.md b/docs/configuration/client.md
@@ -51,7 +51,7 @@ license: |
 | celeborn.client.push.maxReqsInFlight.perWorker | 32 | false | Amount of Netty in-flight requests per worker. Default max memory of in flight requests  per worker is `celeborn.client.push.maxReqsInFlight.perWorker` * `celeborn.client.push.buffer.max.size` * compression ratio(1 in worst case): 64KiB * 32 = 2MiB. The maximum memory will not exceed `celeborn.client.push.maxReqsInFlight.total`. | 0.3.0 |  | 
 | celeborn.client.push.maxReqsInFlight.total | 256 | false | Amount of total Netty in-flight requests. The maximum memory is `celeborn.client.push.maxReqsInFlight.total` * `celeborn.client.push.buffer.max.size` * compression ratio(1 in worst case): 64KiB * 256 = 16MiB | 0.3.0 | celeborn.push.maxReqsInFlight | 
 | celeborn.client.push.queue.capacity | 512 | false | Push buffer queue size for a task. The maximum memory is `celeborn.client.push.buffer.max.size` * `celeborn.client.push.queue.capacity`, default: 64KiB * 512 = 32MiB | 0.3.0 | celeborn.push.queue.capacity | 
-| celeborn.client.push.replicate.enabled | false | false | When true, Celeborn worker will replicate shuffle data to another Celeborn worker asynchronously to ensure the pushed shuffle data won't be lost after the node failure. It's recommended to set `false` when `HDFS` is enabled in `celeborn.storage.activeTypes`. | 0.3.0 | celeborn.push.replicate.enabled | 
+| celeborn.client.push.replicate.enabled | false | false | When true, Celeborn worker will replicate shuffle data to another Celeborn worker asynchronously to ensure the pushed shuffle data won't be lost after the node failure. It's recommended to set `false` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.3.0 | celeborn.push.replicate.enabled | 
 | celeborn.client.push.retry.threads | 8 | false | Thread number to process shuffle re-send push data requests. | 0.3.0 | celeborn.push.retry.threads | 
 | celeborn.client.push.revive.batchSize | 2048 | false | Max number of partitions in one Revive request. | 0.3.0 |  | 
 | celeborn.client.push.revive.interval | 100ms | false | Interval for client to trigger Revive to LifecycleManager. The number of partitions in one Revive request is `celeborn.client.push.revive.batchSize`. | 0.3.0 |  | 

diff --git a/docs/configuration/master.md b/docs/configuration/master.md
@@ -68,7 +68,7 @@ license: |
 | celeborn.master.slot.assign.loadAware.flushTimeWeight | 0.0 | false | Weight of average flush time when calculating ordering in load-aware assignment strategy | 0.3.0 | celeborn.slots.assign.loadAware.flushTimeWeight | 
 | celeborn.master.slot.assign.loadAware.numDiskGroups | 5 | false | This configuration is a guidance for load-aware slot allocation algorithm. This value is control how many disk groups will be created. | 0.3.0 | celeborn.slots.assign.loadAware.numDiskGroups | 
 | celeborn.master.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.client.slot.assign.maxWorkers`. | 0.3.1 |  | 
-| celeborn.master.slot.assign.policy | ROUNDROBIN | false | Policy for master to assign slots, Celeborn supports two types of policy: roundrobin and loadaware. Loadaware policy will be ignored when `HDFS` is enabled in `celeborn.storage.activeTypes` | 0.3.0 | celeborn.slots.assign.policy | 
+| celeborn.master.slot.assign.policy | ROUNDROBIN | false | Policy for master to assign slots, Celeborn supports two types of policy: roundrobin and loadaware. Loadaware policy will be ignored when `HDFS` is enabled in `celeborn.storage.availableTypes` | 0.3.0 | celeborn.slots.assign.policy | 
 | celeborn.master.userResourceConsumption.update.interval | 30s | false | Time length for a window about compute user resource consumption. | 0.3.0 |  | 
 | celeborn.master.workerUnavailableInfo.expireTimeout | 1800s | false | Worker unavailable info would be cleared when the retention period is expired. Set -1 to disable the expiration. | 0.3.1 |  | 
 | celeborn.quota.enabled | true | false | When Master side sets to true, the master will enable to check the quota via QuotaManager. When Client side sets to true, LifecycleManager will request Master side to check whether the current user has enough quota before registration of shuffle. Fallback to the default shuffle service of Spark when Master side checks that there is no enough quota for current user. | 0.2.0 |  | 

diff --git a/docs/configuration/network.md b/docs/configuration/network.md
@@ -47,7 +47,7 @@ license: |
 | celeborn.network.memory.allocator.verbose.metric | false | false | Whether to enable verbose metric for pooled allocator. | 0.3.0 |  | 
 | celeborn.network.timeout | 240s | false | Default timeout for network operations. | 0.2.0 |  | 
 | celeborn.port.maxRetries | 1 | false | When port is occupied, we will retry for max retry times. | 0.2.0 |  | 
-| celeborn.rpc.askTimeout | 60s | false | Timeout for RPC ask operations. It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.activeTypes` | 0.2.0 |  | 
+| celeborn.rpc.askTimeout | 60s | false | Timeout for RPC ask operations. It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes` | 0.2.0 |  | 
 | celeborn.rpc.connect.threads | 64 | false |  | 0.2.0 |  | 
 | celeborn.rpc.dispatcher.threads | 0 | false | Threads number of message dispatcher event loop. Default to 0, which is availableCore. | 0.3.0 | celeborn.rpc.dispatcher.numThreads | 
 | celeborn.rpc.inbox.capacity | 0 | false | Specifies size of the in memory bounded capacity. | 0.5.0 |  | 

diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md
@@ -54,8 +54,8 @@ license: |
 | celeborn.worker.bufferStream.threadsPerMountpoint | 8 | false | Threads count for read buffer per mount point. | 0.3.0 |  | 
 | celeborn.worker.clean.threads | 64 | false | Thread number of worker to clean up expired shuffle keys. | 0.3.2 |  | 
 | celeborn.worker.closeIdleConnections | false | false | Whether worker will close idle connections. | 0.2.0 |  | 
-| celeborn.worker.commitFiles.threads | 32 | false | Thread number of worker to commit shuffle data files asynchronously. It's recommended to set at least `128` when `HDFS` is enabled in `celeborn.storage.activeTypes`. | 0.3.0 | celeborn.worker.commit.threads | 
-| celeborn.worker.commitFiles.timeout | 120s | false | Timeout for a Celeborn worker to commit files of a shuffle. It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.activeTypes`. | 0.3.0 | celeborn.worker.shuffle.commit.timeout | 
+| celeborn.worker.commitFiles.threads | 32 | false | Thread number of worker to commit shuffle data files asynchronously. It's recommended to set at least `128` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.3.0 | celeborn.worker.commit.threads | 
+| celeborn.worker.commitFiles.timeout | 120s | false | Timeout for a Celeborn worker to commit files of a shuffle. It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.3.0 | celeborn.worker.shuffle.commit.timeout | 
 | celeborn.worker.commitFiles.wait.threads | 32 | false | Thread number of worker to wait for commit shuffle data files to finish. | 0.5.0 |  | 
 | celeborn.worker.congestionControl.check.interval | 10ms | false | Interval of worker checks congestion if celeborn.worker.congestionControl.enabled is true. | 0.3.2 |  | 
 | celeborn.worker.congestionControl.enabled | false | false | Whether to enable congestion control or not. | 0.3.0 |  | 
@@ -143,7 +143,7 @@ license: |
 | celeborn.worker.readBuffer.target.updateInterval | 100ms | false | The interval for memory manager to calculate new read buffer's target memory. | 0.3.0 |  | 
 | celeborn.worker.readBuffer.toTriggerReadMin | 32 | false | Min buffers count for map data partition to trigger read. | 0.3.0 |  | 
 | celeborn.worker.register.timeout | 180s | false | Worker register timeout. | 0.2.0 |  | 
-| celeborn.worker.replicate.fastFail.duration | 60s | false | If a replicate request not replied during the duration, worker will mark the replicate data request as failed.It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.activeTypes`. | 0.2.0 |  | 
+| celeborn.worker.replicate.fastFail.duration | 60s | false | If a replicate request not replied during the duration, worker will mark the replicate data request as failed.It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.2.0 |  | 
 | celeborn.worker.replicate.io.threads | &lt;undefined&gt; | false | Netty IO thread number of worker to replicate shuffle data. The default threads number is the number of flush thread. | 0.2.0 |  | 
 | celeborn.worker.replicate.port | 0 | false | Server port for Worker to receive replicate data request from other Workers. | 0.2.0 |  | 
 | celeborn.worker.replicate.randomConnection.enabled | true | false | Whether worker will create random connection to peer when replicate data. When false, worker tend to reuse the same cached TransportClient to a specific replicate worker; when true, worker tend to use different cached TransportClient. Netty will use the same thread to serve the same connection, so with more connections replicate server can leverage more netty threads | 0.2.1 |  | 
@@ -156,7 +156,7 @@ license: |
 | celeborn.worker.sortPartition.indexCache.maxWeight | 100000 | false | PartitionSorter's cache max weight for index buffer. | 0.4.0 |  | 
 | celeborn.worker.sortPartition.prefetch.enabled | true | false | When true, partition sorter will prefetch the original partition files to page cache and reserve memory configured by `celeborn.worker.sortPartition.reservedMemoryPerPartition` to allocate a block of memory for prefetching while sorting a shuffle file off-heap with page cache for non-hdfs files. Otherwise, partition sorter seeks to position of each block and does not prefetch for non-hdfs files. | 0.5.0 |  | 
 | celeborn.worker.sortPartition.reservedMemoryPerPartition | 1mb | false | Reserved memory when sorting a shuffle file off-heap. | 0.3.0 | celeborn.worker.partitionSorter.reservedMemoryPerPartition | 
-| celeborn.worker.sortPartition.threads | &lt;undefined&gt; | false | PartitionSorter's thread counts. It's recommended to set at least `64` when `HDFS` is enabled in `celeborn.storage.activeTypes`. | 0.3.0 | celeborn.worker.partitionSorter.threads | 
+| celeborn.worker.sortPartition.threads | &lt;undefined&gt; | false | PartitionSorter's thread counts. It's recommended to set at least `64` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.3.0 | celeborn.worker.partitionSorter.threads | 
 | celeborn.worker.sortPartition.timeout | 220s | false | Timeout for a shuffle file to sort. | 0.3.0 | celeborn.worker.partitionSorter.sort.timeout | 
 | celeborn.worker.storage.checkDirsEmpty.maxRetries | 3 | false | The number of retries for a worker to check if the working directory is cleaned up before registering with the master. | 0.3.0 | celeborn.worker.disk.checkFileClean.maxRetries | 
 | celeborn.worker.storage.checkDirsEmpty.timeout | 1000ms | false | The wait time per retry for a worker to check if the working directory is cleaned up before registering with the master. | 0.3.0 | celeborn.worker.disk.checkFileClean.timeout | 

diff --git a/docs/deploy.md b/docs/deploy.md
@@ -49,7 +49,7 @@ celeborn.worker.storage.dirs /mnt/disk1:disktype=SSD,/mnt/disk2:disktype=SSD
 
 # If Celeborn workers don't have local disks. You can use HDFS.
 # Do not set `celeborn.worker.storage.dirs` and use following configs.
-celeborn.storage.activeTypes HDFS
+celeborn.storage.availableTypes HDFS
 celeborn.worker.sortPartition.threads 64
 celeborn.worker.commitFiles.timeout 240s
 celeborn.worker.commitFiles.threads 128
@@ -98,7 +98,7 @@ celeborn.worker.storage.dirs /mnt/disk1:disktype=SSD,/mnt/disk2:disktype=SSD
 
 # If Celeborn workers don't have local disks. You can use HDFS.
 # Do not set `celeborn.worker.storage.dirs` and use following configs.
-celeborn.storage.activeTypes HDFS
+celeborn.storage.availableTypes HDFS
 celeborn.worker.sortPartition.threads 64
 celeborn.worker.commitFiles.timeout 240s
 celeborn.worker.commitFiles.threads 128