
[HUDI-2850] Clustering CLI - schedule and run command fixes to avoid NumberFormatException #4101

Merged
ClusteringCommand.java
@@ -40,14 +40,21 @@ public class ClusteringCommand implements CommandMarker {

private static final Logger LOG = LogManager.getLogger(ClusteringCommand.class);

+ /**
+  * Schedule clustering table service.
+  * <p>
+  * Example:
+  * > connect --path {path to hudi table}
+  * > clustering schedule --sparkMaster local --sparkMemory 2g
+  */
@CliCommand(value = "clustering schedule", help = "Schedule Clustering")
public String scheduleClustering(
@CliOption(key = "sparkMemory", help = "Spark executor memory",
unspecifiedDefaultValue = "1G") final String sparkMemory,
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for clustering",
unspecifiedDefaultValue = "") final String propsFilePath,
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
unspecifiedDefaultValue = "") final String[] configs) throws Exception {
@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1g", help = "Spark executor memory") final String sparkMemory,
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations "
+ "for hoodie client for clustering", unspecifiedDefaultValue = "") final String propsFilePath,
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can "
+ "be passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception {
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
@@ -59,8 +66,8 @@ public String scheduleClustering(
// First get a clustering instant time and pass it to spark launcher for scheduling clustering
String clusteringInstantTime = HoodieActiveTimeline.createNewInstantTime();

- sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE.toString(), client.getBasePath(),
-     client.getTableConfig().getTableName(), clusteringInstantTime, sparkMemory, propsFilePath);
+ sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE.toString(), master, sparkMemory,
+     client.getBasePath(), client.getTableConfig().getTableName(), clusteringInstantTime, propsFilePath);
UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
@@ -71,30 +78,35 @@ public String scheduleClustering(
return "Succeeded to schedule clustering for " + clusteringInstantTime;
}

+ /**
+  * Run clustering table service.
+  * <p>
+  * Example:
+  * > connect --path {path to hudi table}
+  * > clustering schedule --sparkMaster local --sparkMemory 2g
+  * > clustering run --sparkMaster local --sparkMemory 2g --clusteringInstant 20211124005208
+  */
@CliCommand(value = "clustering run", help = "Run Clustering")
public String runClustering(
@CliOption(key = "parallelism", help = "Parallelism for hoodie clustering",
unspecifiedDefaultValue = "1") final String parallelism,
@CliOption(key = "sparkMemory", help = "Spark executor memory",
unspecifiedDefaultValue = "4G") final String sparkMemory,
@CliOption(key = "retry", help = "Number of retries",
unspecifiedDefaultValue = "1") final String retry,
@CliOption(key = "clusteringInstant", help = "Clustering instant time",
mandatory = true) final String clusteringInstantTime,
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
unspecifiedDefaultValue = "") final String propsFilePath,
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
unspecifiedDefaultValue = "") final String[] configs
) throws Exception {
@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master,
@CliOption(key = "sparkMemory", help = "Spark executor memory", unspecifiedDefaultValue = "4g") final String sparkMemory,
@CliOption(key = "parallelism", help = "Parallelism for hoodie clustering", unspecifiedDefaultValue = "1") final String parallelism,
@CliOption(key = "retry", help = "Number of retries", unspecifiedDefaultValue = "1") final String retry,
@CliOption(key = "clusteringInstant", help = "Clustering instant time", mandatory = true) final String clusteringInstantTime,
@CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for "
+ "hoodie client for compacting", unspecifiedDefaultValue = "") final String propsFilePath,
@CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be "
+ "passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception {
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);

String sparkPropertiesPath =
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
- sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_RUN.toString(), client.getBasePath(),
-     client.getTableConfig().getTableName(), clusteringInstantTime, parallelism, sparkMemory, retry, propsFilePath);
+ sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_RUN.toString(), master, sparkMemory,
+     client.getBasePath(), client.getTableConfig().getTableName(), clusteringInstantTime,
+     parallelism, retry, propsFilePath);
UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
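The heart of the fix is visible in the addAppArgs calls above: sparkMaster and sparkMemory now lead the application arguments for both commands, matching the positions SparkMain reads when it initializes the Spark context. Before this change the clustering commands placed basePath and tableName in those slots, so the table name was presumably handed to Spark as an executor memory size, a plausible source of the NumberFormatException named in the PR title. A sketch of the resulting positional contract (the bracketed names are descriptive, not actual variables):

// clustering schedule:
//   [CLUSTERING_SCHEDULE, master, sparkMemory, basePath, tableName, instantTime, propsFilePath, configs...]
// clustering run:
//   [CLUSTERING_RUN, master, sparkMemory, basePath, tableName, instantTime, parallelism, retry, propsFilePath, configs...]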
46 changes: 25 additions & 21 deletions hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java
@@ -31,6 +31,7 @@
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
+ import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieBootstrapConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
@@ -79,12 +80,14 @@ enum SparkCommand {
}

public static void main(String[] args) throws Exception {
- String command = args[0];
- LOG.info("Invoking SparkMain:" + command);
+ ValidationUtils.checkArgument(args.length >= 4);
+ final String commandString = args[0];
+ LOG.info("Invoking SparkMain: " + commandString);
+ final SparkCommand cmd = SparkCommand.valueOf(commandString);

- SparkCommand cmd = SparkCommand.valueOf(command);
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + commandString,
Option.of(args[1]), Option.of(args[2]));

JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command, Option.of(args[1]), Option.of(args[2]));
int returnCode = 0;
try {
switch (cmd) {
@@ -111,8 +114,8 @@ public static void main(String[] args) throws Exception {
if (args.length > 13) {
configs.addAll(Arrays.asList(args).subList(13, args.length));
}
- returnCode = dataLoad(jsc, command, args[3], args[4], args[5], args[6], args[7], args[8],
-     Integer.parseInt(args[9]), args[10], Integer.parseInt(args[11]), propsFilePath, configs);
+ returnCode = dataLoad(jsc, commandString, args[3], args[4], args[5], args[6], args[7], args[8],
+     Integer.parseInt(args[9]), args[10], Integer.parseInt(args[11]), propsFilePath, configs);
break;
case COMPACT_RUN:
assert (args.length >= 10);
@@ -159,33 +162,34 @@
case COMPACT_UNSCHEDULE_PLAN:
assert (args.length == 9);
doCompactUnschedule(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]),
- Boolean.parseBoolean(args[7]), Boolean.parseBoolean(args[8]));
+     Boolean.parseBoolean(args[7]), Boolean.parseBoolean(args[8]));
returnCode = 0;
break;
case CLUSTERING_RUN:
- assert (args.length >= 8);
+ assert (args.length >= 9);
propsFilePath = null;
- if (!StringUtils.isNullOrEmpty(args[7])) {
-   propsFilePath = args[7];
+ if (!StringUtils.isNullOrEmpty(args[8])) {
+   propsFilePath = args[8];
}
configs = new ArrayList<>();
- if (args.length > 8) {
-   configs.addAll(Arrays.asList(args).subList(8, args.length));
+ if (args.length > 9) {
+   configs.addAll(Arrays.asList(args).subList(9, args.length));
}
- returnCode = cluster(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), args[5],
-     Integer.parseInt(args[6]), false, propsFilePath, configs);
+ returnCode = cluster(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[2],
+     Integer.parseInt(args[7]), false, propsFilePath, configs);
Comment on lines -175 to +179

Member:
do you want to log a ticket for later improvement: these positional arguments are pretty hard to maintain; can we adopt jcommander here?

Contributor Author:
sure, will file a ticket.

break;
case CLUSTERING_SCHEDULE:
- assert (args.length >= 6);
+ assert (args.length >= 7);
propsFilePath = null;
- if (!StringUtils.isNullOrEmpty(args[5])) {
-   propsFilePath = args[5];
+ if (!StringUtils.isNullOrEmpty(args[6])) {
+   propsFilePath = args[6];
}
configs = new ArrayList<>();
- if (args.length > 6) {
-   configs.addAll(Arrays.asList(args).subList(6, args.length));
+ if (args.length > 7) {
+   configs.addAll(Arrays.asList(args).subList(7, args.length));
}
- returnCode = cluster(jsc, args[1], args[2], args[3], 1, args[4], 0, true, propsFilePath, configs);
+ returnCode = cluster(jsc, args[3], args[4], args[5], 1, args[2],
+     0, true, propsFilePath, configs);
break;
case CLEAN:
assert (args.length >= 5);
@@ -229,7 +233,7 @@ public static void main(String[] args) throws Exception {
break;
}
} catch (Throwable throwable) {
LOG.error("Fail to execute command", throwable);
LOG.error("Fail to execute commandString", throwable);
returnCode = -1;
} finally {
jsc.stop();
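On the reviewer's jcommander suggestion above, a minimal hypothetical sketch of what a JCommander-based argument class could look like for the clustering path; the class name and flag names are illustrative only and not part of this PR:

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;

// Hypothetical replacement for the positional contract consumed by SparkMain.
class ClusteringArgs {

  @Parameter(names = "--base-path", description = "Base path of the hoodie table", required = true)
  String basePath;

  @Parameter(names = "--table-name", description = "Table name", required = true)
  String tableName;

  @Parameter(names = "--instant-time", description = "Clustering instant time", required = true)
  String instantTime;

  @Parameter(names = "--parallelism", description = "Parallelism for hoodie clustering")
  int parallelism = 1;

  @Parameter(names = "--retry", description = "Number of retries")
  int retry = 1;

  @Parameter(names = "--props-file", description = "Path to a properties file with hoodie client configs")
  String propsFilePath = "";
}

// Usage sketch:
//   ClusteringArgs parsed = new ClusteringArgs();
//   JCommander.newBuilder().addObject(parsed).build().parse(args);

Named flags would remove the index bookkeeping above and make reorderings like the one in this PR mechanical rather than error-prone.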
SparkUtil.java
@@ -40,7 +40,7 @@
*/
public class SparkUtil {

- private static final String DEFAULT_SPARK_MASTER = "yarn";
+ public static final String DEFAULT_SPARK_MASTER = "yarn";

/**
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro.
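Making DEFAULT_SPARK_MASTER public lets ClusteringCommand reference it directly as an annotation default, as seen earlier in the diff:

@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master

This compiles because annotation attribute values must be compile-time constants, which a public static final String satisfies.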
HoodieClusteringJob.java
@@ -59,7 +59,7 @@ public class HoodieClusteringJob {
public HoodieClusteringJob(JavaSparkContext jsc, Config cfg) {
this.cfg = cfg;
this.jsc = jsc;
- this.props = cfg.propsFilePath == null
+ this.props = StringUtils.isNullOrEmpty(cfg.propsFilePath)
? UtilHelpers.buildProperties(cfg.configs)
: readConfigFromFileSystem(jsc, cfg);
}
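A likely motivation for this last change: the CLI declares unspecifiedDefaultValue = "" for --propsFilePath, so an unset option reaches HoodieClusteringJob as an empty string rather than null. A brief illustration of the difference (an inference from the diff, not a documented rationale):

// Before: ("" == null) is false, so an empty path fell through to
// readConfigFromFileSystem(jsc, cfg) and was treated as a real file path.
// After: StringUtils.isNullOrEmpty("") is true, so the job correctly falls
// back to UtilHelpers.buildProperties(cfg.configs).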