Skip to content

Commit

Permalink
addressed comments
Browse files Browse the repository at this point in the history
  • Loading branch information
stream2000 committed Aug 31, 2023
1 parent ab46e34 commit bb592b7
Show file tree
Hide file tree
Showing 11 changed files with 204 additions and 150 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ public HoodieClusteringJob(JavaSparkContext jsc, Config cfg, TypedProperties pro
this.jsc = jsc;
this.props = props;
this.metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true);
// Disable async cleaning, will trigger synchronous cleaning manually.
this.props.put(HoodieCleanConfig.ASYNC_CLEAN.key(), false);
this.metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true);
if (this.metaClient.getTableConfig().isMetadataTableAvailable()) {
// add default lock config options if MDT is enabled.
UtilHelpers.addLockOptions(cfg.basePath, this.props);
}
}

private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ public HoodieCompactor(JavaSparkContext jsc, Config cfg, TypedProperties props)
this.jsc = jsc;
this.props = props;
this.metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true);
// Disable async cleaning, will trigger synchronous cleaning manually.
this.props.put(HoodieCleanConfig.ASYNC_CLEAN.key(), false);
if (this.metaClient.getTableConfig().isMetadataTableAvailable()) {
// add default lock config options if MDT is enabled.
UtilHelpers.addLockOptions(cfg.basePath, this.props);
}
}

private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* * this work for additional information regarding copyright ownership.
* * The ASF licenses this file to You under the Apache License, Version 2.0
* * (the "License"); you may not use this file except in compliance with
* * the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.hudi.utilities.multitable;
Expand All @@ -29,6 +29,10 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Archive task to run in TableServicePipeline.
 *
 * @see HoodieMultiTableServicesMain
 */
class ArchiveTask extends TableServiceTask {
private static final Logger LOG = LoggerFactory.getLogger(ArchiveTask.class);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* * this work for additional information regarding copyright ownership.
* * The ASF licenses this file to You under the Apache License, Version 2.0
* * (the "License"); you may not use this file except in compliance with
* * the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.hudi.utilities.multitable;
Expand All @@ -25,6 +25,10 @@

import org.apache.spark.api.java.JavaSparkContext;

/**
 * Clean task to run in TableServicePipeline.
 *
 * @see HoodieMultiTableServicesMain
 */
class CleanTask extends TableServiceTask {

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* * this work for additional information regarding copyright ownership.
* * The ASF licenses this file to You under the Apache License, Version 2.0
* * (the "License"); you may not use this file except in compliance with
* * the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.hudi.utilities.multitable;
Expand All @@ -24,6 +24,10 @@

import org.apache.spark.api.java.JavaSparkContext;

/**
 * Clustering task to run in TableServicePipeline.
 *
 * @see HoodieMultiTableServicesMain
 */
class ClusteringTask extends TableServiceTask {

private int parallelism;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* * this work for additional information regarding copyright ownership.
* * The ASF licenses this file to You under the Apache License, Version 2.0
* * (the "License"); you may not use this file except in compliance with
* * the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.hudi.utilities.multitable;
Expand All @@ -25,6 +25,10 @@

import org.apache.spark.api.java.JavaSparkContext;

/**
 * Compaction task to run in TableServicePipeline.
 *
 * @see HoodieMultiTableServicesMain
 */
class CompactionTask extends TableServiceTask {

public String compactionRunningMode = HoodieCompactor.EXECUTE;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* * Licensed to the Apache Software Foundation (ASF) under one or more
* * contributor license agreements. See the NOTICE file distributed with
* * this work for additional information regarding copyright ownership.
* * The ASF licenses this file to You under the Apache License, Version 2.0
* * (the "License"); you may not use this file except in compliance with
* * the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.hudi.utilities.multitable;
Expand All @@ -38,9 +38,8 @@
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringJoiner;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
Expand All @@ -49,8 +48,9 @@
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import static org.apache.hudi.utilities.multitable.MultiTableServiceUtils.Constants.LOCAL_SPARK_MASTER;

/**
 * Main entry point for executing multiple table services across Hudi tables.
 */
public class HoodieMultiTableServicesMain {
private static final Logger LOG = LoggerFactory.getLogger(HoodieStreamer.class);
final Config cfg;
Expand Down Expand Up @@ -109,14 +109,14 @@ public HoodieMultiTableServicesMain(JavaSparkContext jsc, Config cfg) {
public void startServices() throws ExecutionException, InterruptedException {
LOG.info("StartServices Config: " + cfg);
List<String> tablePaths;
if (cfg.autoDiscovering) {
if (cfg.autoDiscovery) {
// We support defining multi base paths
tablePaths = cfg.basePath.stream()
.filter(this::pathExists)
.flatMap(p -> MultiTableServiceUtils.findHoodieTablesUnderPath(jsc, p).stream())
.collect(Collectors.toList());
} else {
tablePaths = MultiTableServiceUtils.getTablesToBeIngestedFromProps(props);
tablePaths = MultiTableServiceUtils.getTablesToBeServedFromProps(props);
}
LOG.info("All table paths: " + String.join(",", tablePaths));
if (cfg.batch) {
Expand Down Expand Up @@ -148,8 +148,8 @@ public static class Config implements Serializable {
required = true, splitter = IdentitySplitter.class)
public List<String> basePath = Collections.emptyList();

@Parameter(names = {"--auto", "-a"}, description = "Whether to discover hudi tables in the base path")
public boolean autoDiscovering = false;
@Parameter(names = {"--auto-discovery", "-a"}, description = "Whether to discover hudi tables in the base path")
public boolean autoDiscovery = false;

@Parameter(names = {"--parallelism"}, description = "Parallelism for hoodie table service")
public int parallelism = 200;
Expand Down Expand Up @@ -194,6 +194,12 @@ public static class Config implements Serializable {
+ "Set \"scheduleAndExecute\" means make a clustering plan first and execute that plan immediately")
public String clusteringRunningMode = HoodieCompactor.SCHEDULE_AND_EXECUTE;

@Parameter(names = {"--spark-master", "-ms"}, description = "Spark master")
public String sparkMaster;

@Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use")
public String sparkMemory = null;

@Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
+ "hoodie client for table service")
public String propsFilePath = null;
Expand All @@ -205,27 +211,28 @@ public static class Config implements Serializable {

@Override
public String toString() {
return "Config{"
+ "basePath=" + basePath
+ ", autoDiscovering=" + autoDiscovering
+ ", parallelism=" + parallelism
+ ", batch=" + batch
+ ", scheduleDelay=" + scheduleDelay
+ ", retry=" + retry
+ ", poolSize=" + poolSize
+ ", appName='" + appName + '\''
+ ", help=" + help
+ ", enableCompaction=" + enableCompaction
+ ", enableClustering=" + enableClustering
+ ", enableClean=" + enableClean
+ ", enableArchive=" + enableArchive
+ ", compactionRunningMode='" + compactionRunningMode + '\''
+ ", clusteringRunningMode='" + clusteringRunningMode + '\''
+ ", propsFilePath='" + propsFilePath + '\''
+ ", configs=" + configs
+ '}';
return new StringJoiner(", ", Config.class.getSimpleName() + "[", "]")
.add("basePath=" + basePath)
.add("autoDiscovery=" + autoDiscovery)
.add("parallelism=" + parallelism)
.add("batch=" + batch)
.add("scheduleDelay=" + scheduleDelay)
.add("retry=" + retry)
.add("poolSize=" + poolSize)
.add("appName='" + appName + "'")
.add("help=" + help)
.add("enableCompaction=" + enableCompaction)
.add("enableClustering=" + enableClustering)
.add("enableClean=" + enableClean)
.add("enableArchive=" + enableArchive)
.add("compactionRunningMode='" + compactionRunningMode + "'")
.add("clusteringRunningMode='" + clusteringRunningMode + "'")
.add("sparkMaster='" + sparkMaster + "'")
.add("sparkMemory='" + sparkMemory + "'")
.add("propsFilePath='" + propsFilePath + "'")
.add("configs=" + configs)
.toString();
}

}

public static void main(String[] args) {
Expand All @@ -235,8 +242,7 @@ public static void main(String[] args) {
cmd.usage();
System.exit(1);
}
Map<String, String> config = new HashMap<>();
JavaSparkContext jsc = UtilHelpers.buildSparkContext(cfg.appName, LOCAL_SPARK_MASTER, config);
JavaSparkContext jsc = UtilHelpers.buildSparkContext(cfg.appName, cfg.sparkMaster, cfg.sparkMemory);
try {
new HoodieMultiTableServicesMain(jsc, cfg).startServices();
} catch (Throwable throwable) {
Expand Down

0 comments on commit bb592b7

Please sign in to comment.