From d3f8d280098f42615c4d63d64d8797c8c76a8970 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Mon, 26 Oct 2015 22:07:49 -0700 Subject: [PATCH 1/9] Support setting spark.driver.memory from sparkEnvir when launching JVM backend --- R/pkg/R/sparkR.R | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 043b0057bd04a..54716d5b2eb2d 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -123,16 +123,30 @@ sparkR.init <- function( uriSep <- "////" } + sparkEnvirMap <- convertNamedListToEnv(sparkEnvir) + existingPort <- Sys.getenv("EXISTING_SPARKR_BACKEND_PORT", "") if (existingPort != "") { backendPort <- existingPort } else { path <- tempfile(pattern = "backend_port") + submitOps <- Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell") + # spark.driver.memory cannot be set in env: + # http://spark.apache.org/docs/latest/configuration.html#application-properties + # Add spark.driver.memory if set in sparkEnvir and not already set in SPARKR_SUBMIT_ARGS + if (!grepl("--spark.driver.memory", submitOps)) { + driverMemory <- sparkEnvirMap[["spark.driver.memory"]] + # format for memory properties is 2 characters + if (!is.null(driverMemory) && nchar(driverMemory) > 1) { + # --option must be before the application class "sparkr-shell" + submitOps <- paste("--driver-memory", driverMemory, submitOps, sep = " ") + } + } launchBackend( args = path, sparkHome = sparkHome, jars = jars, - sparkSubmitOpts = Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), + sparkSubmitOpts = submitOps, packages = sparkPackages) # wait atmost 100 seconds for JVM to launch wait <- 0.1 @@ -171,8 +185,6 @@ sparkR.init <- function( sparkHome <- suppressWarnings(normalizePath(sparkHome)) } - sparkEnvirMap <- convertNamedListToEnv(sparkEnvir) - sparkExecutorEnvMap <- convertNamedListToEnv(sparkExecutorEnv) if(is.null(sparkExecutorEnvMap$LD_LIBRARY_PATH)) { sparkExecutorEnvMap[["LD_LIBRARY_PATH"]] <- From 5ecc9e09874c1bd63d4712412777aef4f2eb5afb Mon Sep 17 00:00:00 2001 From: felixcheung Date: Mon, 26 Oct 2015 22:36:39 -0700 Subject: [PATCH 2/9] oops, missed one check --- R/pkg/R/sparkR.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 54716d5b2eb2d..3b7701b8b63e0 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -134,7 +134,7 @@ sparkR.init <- function( # spark.driver.memory cannot be set in env: # http://spark.apache.org/docs/latest/configuration.html#application-properties # Add spark.driver.memory if set in sparkEnvir and not already set in SPARKR_SUBMIT_ARGS - if (!grepl("--spark.driver.memory", submitOps)) { + if (!grepl("--driver-memory", submitOps)) { driverMemory <- sparkEnvirMap[["spark.driver.memory"]] # format for memory properties is 2 characters if (!is.null(driverMemory) && nchar(driverMemory) > 1) { From db8f7fd9eb556ce258e0dde0a1f1dc9451173591 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 27 Oct 2015 21:35:15 -0700 Subject: [PATCH 3/9] Update to include spark.driver.extraClassPath, extraJavaOptions, extraLibraryPath from feedback add quote " around parameter values --- R/pkg/R/sparkR.R | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 3b7701b8b63e0..d8fb4b6d4c1ef 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -130,18 +130,12 @@ sparkR.init <- function( backendPort <- existingPort } else { path <- tempfile(pattern = "backend_port") - submitOps <- 
Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell") - # spark.driver.memory cannot be set in env: + # A few Spark config cannot be set in env: # http://spark.apache.org/docs/latest/configuration.html#application-properties - # Add spark.driver.memory if set in sparkEnvir and not already set in SPARKR_SUBMIT_ARGS - if (!grepl("--driver-memory", submitOps)) { - driverMemory <- sparkEnvirMap[["spark.driver.memory"]] - # format for memory properties is 2 characters - if (!is.null(driverMemory) && nchar(driverMemory) > 1) { - # --option must be before the application class "sparkr-shell" - submitOps <- paste("--driver-memory", driverMemory, submitOps, sep = " ") - } - } + # Add them to spark-submit commandline if not already set in SPARKR_SUBMIT_ARGS + submitOps <- getClientModeSparkSubmitOpts( + Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), + sparkEnvirMap) launchBackend( args = path, sparkHome = sparkHome, @@ -332,3 +326,27 @@ clearJobGroup <- function(sc) { cancelJobGroup <- function(sc, groupId) { callJMethod(sc, "cancelJobGroup", groupId) } + +sparkConfToSubmitOps <- new.env() +sparkConfToSubmitOps[["spark.driver.memory"]] <- "--driver-memory" +sparkConfToSubmitOps[["spark.driver.extraClassPath"]] <- "--driver-class-path" +sparkConfToSubmitOps[["spark.driver.extraJavaOptions"]] <- "--driver-java-options" +sparkConfToSubmitOps[["spark.driver.extraLibraryPath"]] <- "--driver-library-path" + +# Utility function that returns Spark Submit arguments as a string +getClientModeSparkSubmitOpts <- function(submitOps, sparkEnvirMap) { + envirToOps <- lapply(ls(sparkConfToSubmitOps), function(conf) { + opsValue <- sparkEnvirMap[[conf]] + # process only if --option is not already specified + if (!is.null(opsValue) && + nchar(opsValue) > 1 && + !grepl(sparkConfToSubmitOps[[conf]], submitOps)) { + # put "" around value in case it has spaces + paste0(sparkConfToSubmitOps[[conf]], " \"", opsValue, "\" ") + } else { + "" + } + }) + # --option must be before the application class "sparkr-shell" in submitOps + paste0(paste0(envirToOps, collapse = ""), submitOps) +} From 9075eb68520b63a9b6b86aa9c0a56f651b140242 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 27 Oct 2015 22:04:36 -0700 Subject: [PATCH 4/9] add test --- R/pkg/inst/tests/test_sparkR.R | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 R/pkg/inst/tests/test_sparkR.R diff --git a/R/pkg/inst/tests/test_sparkR.R b/R/pkg/inst/tests/test_sparkR.R new file mode 100644 index 0000000000000..af5a23c436fae --- /dev/null +++ b/R/pkg/inst/tests/test_sparkR.R @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +context("functions in sparkR.R") + +test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", { + e <- new.env() + e[["spark.driver.memory"]] <- "512m" + ops <- getClientModeSparkSubmitOpts("sparkrmain", e) + expect_equal("--driver-memory \"512m\" sparkrmain", ops) + + e[["spark.driver.memory"]] <- "5g" + e[["spark.driver.extraClassPath"]] <- "/opt/class_path" + e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings" + e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" + e[["random"]] <- "skipthis" + ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e) + expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"", + "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"", + "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell")) + + e[["spark.driver.extraClassPath"]] <- "/" # too short + ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e) + expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ", + "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"", + " --driver-memory 4g sparkr-shell2")) +}) From 2d4ffb081cc45e72014594506e64b3d37c62da2e Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 27 Oct 2015 22:09:37 -0700 Subject: [PATCH 5/9] Didn't realize the filename is different for test, fixing that --- R/pkg/inst/tests/test_context.R | 23 ++++++++++++++++++ R/pkg/inst/tests/test_sparkR.R | 41 --------------------------------- 2 files changed, 23 insertions(+), 41 deletions(-) delete mode 100644 R/pkg/inst/tests/test_sparkR.R diff --git a/R/pkg/inst/tests/test_context.R b/R/pkg/inst/tests/test_context.R index e99815ed1562c..e645d17cc1ff9 100644 --- a/R/pkg/inst/tests/test_context.R +++ b/R/pkg/inst/tests/test_context.R @@ -65,3 +65,26 @@ test_that("job group functions can be called", { cancelJobGroup(sc, "groupId") clearJobGroup(sc) }) + +test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", { + e <- new.env() + e[["spark.driver.memory"]] <- "512m" + ops <- getClientModeSparkSubmitOpts("sparkrmain", e) + expect_equal("--driver-memory \"512m\" sparkrmain", ops) + + e[["spark.driver.memory"]] <- "5g" + e[["spark.driver.extraClassPath"]] <- "/opt/class_path" + e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings" + e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" + e[["random"]] <- "skipthis" + ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e) + expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"", + "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"", + "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell")) + + e[["spark.driver.extraClassPath"]] <- "/" # too short + ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e) + expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ", + "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"", + " --driver-memory 4g sparkr-shell2")) +}) diff --git a/R/pkg/inst/tests/test_sparkR.R b/R/pkg/inst/tests/test_sparkR.R deleted file mode 100644 index af5a23c436fae..0000000000000 --- a/R/pkg/inst/tests/test_sparkR.R +++ /dev/null @@ -1,41 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -context("functions in sparkR.R") - -test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", { - e <- new.env() - e[["spark.driver.memory"]] <- "512m" - ops <- getClientModeSparkSubmitOpts("sparkrmain", e) - expect_equal("--driver-memory \"512m\" sparkrmain", ops) - - e[["spark.driver.memory"]] <- "5g" - e[["spark.driver.extraClassPath"]] <- "/opt/class_path" - e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings" - e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" - e[["random"]] <- "skipthis" - ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e) - expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"", - "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"", - "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell")) - - e[["spark.driver.extraClassPath"]] <- "/" # too short - ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e) - expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ", - "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"", - " --driver-memory 4g sparkr-shell2")) -}) From ceeca816c9c96b22887202c73c6a24d15d05fa2b Mon Sep 17 00:00:00 2001 From: felixcheung Date: Wed, 28 Oct 2015 00:05:08 -0700 Subject: [PATCH 6/9] update for lint-r --- R/pkg/inst/tests/test_context.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/test_context.R b/R/pkg/inst/tests/test_context.R index e645d17cc1ff9..80c1b89a4c627 100644 --- a/R/pkg/inst/tests/test_context.R +++ b/R/pkg/inst/tests/test_context.R @@ -73,18 +73,22 @@ test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whiteli expect_equal("--driver-memory \"512m\" sparkrmain", ops) e[["spark.driver.memory"]] <- "5g" - e[["spark.driver.extraClassPath"]] <- "/opt/class_path" + e[["spark.driver.extraClassPath"]] <- "/opt/class_path" # nolint e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings" - e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" + e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" # nolint e[["random"]] <- "skipthis" ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e) + # nolint start expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"", "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"", "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell")) + # nolint end e[["spark.driver.extraClassPath"]] <- "/" # too short ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e) + # nolint start expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ", "-XX:+UseCompressedStrings\" --driver-library-path 
\"/usr/local/hadoop/lib\"", " --driver-memory 4g sparkr-shell2")) + # nolint end }) From c21713ed32fd0bc68444ed410cce892d70a494fc Mon Sep 17 00:00:00 2001 From: felixcheung Date: Wed, 28 Oct 2015 16:17:35 -0700 Subject: [PATCH 7/9] Comment and doc updates --- R/pkg/R/sparkR.R | 12 ++++++++---- docs/sparkr.md | 24 +++++++++++++++++------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d8fb4b6d4c1ef..f6ccf09dc09ff 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -93,7 +93,7 @@ sparkR.stop <- function() { #' sc <- sparkR.init("local[2]", "SparkR", "/home/spark", #' list(spark.executor.memory="1g")) #' sc <- sparkR.init("yarn-client", "SparkR", "/home/spark", -#' list(spark.executor.memory="1g"), +#' list(spark.executor.memory="4g", spark.driver.memory="2g"), #' list(LD_LIBRARY_PATH="/directory of JVM libraries (libjvm.so) on workers/"), #' c("jarfile1.jar","jarfile2.jar")) #'} @@ -130,9 +130,6 @@ sparkR.init <- function( backendPort <- existingPort } else { path <- tempfile(pattern = "backend_port") - # A few Spark config cannot be set in env: - # http://spark.apache.org/docs/latest/configuration.html#application-properties - # Add them to spark-submit commandline if not already set in SPARKR_SUBMIT_ARGS submitOps <- getClientModeSparkSubmitOpts( Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), sparkEnvirMap) @@ -334,6 +331,13 @@ sparkConfToSubmitOps[["spark.driver.extraJavaOptions"]] <- "--driver-java-option sparkConfToSubmitOps[["spark.driver.extraLibraryPath"]] <- "--driver-library-path" # Utility function that returns Spark Submit arguments as a string +# +# A few Spark Application and Runtime environment properties cannot take effort after driver +# JVM has started, as documented in: +# http://spark.apache.org/docs/latest/configuration.html#application-properties +# When starting SparkR without using spark-submit, for example, in Rstudio, add them to +# spark-submit commandline if not already set in SPARKR_SUBMIT_ARGS so that they can be +# effective. getClientModeSparkSubmitOpts <- function(submitOps, sparkEnvirMap) { envirToOps <- lapply(ls(sparkConfToSubmitOps), function(conf) { opsValue <- sparkEnvirMap[[conf]] diff --git a/docs/sparkr.md b/docs/sparkr.md index 7139d16b4a068..88985e6d7c516 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -37,17 +37,27 @@ sc <- sparkR.init() sqlContext <- sparkRSQL.init(sc) {% endhighlight %} +In the event you are creating `SparkContext` instead of using `sparkR` shell or `spark-submit`, you +could also specify certain Spark driver properties. Normally these +[Application properties](configuration.html#application-properties) and [Runtime Environment](configuration.html#runtime-environment) cannot be set programmatically, as the +driver JVM process would have been started, in this case SparkR takes care of this for you. To set +them, pass them as you would other configuration properties in the `sparkEnvir` argument. + +{% highlight r %} +sc <- sparkR.init("local[*]", "SparkR", "/home/spark", list(spark.driver.memory="2g")) +{% endhighlight %} + ## Creating DataFrames With a `SQLContext`, applications can create `DataFrame`s from a local R data frame, from a [Hive table](sql-programming-guide.html#hive-tables), or from other [data sources](sql-programming-guide.html#data-sources). ### From local data frames -The simplest way to create a data frame is to convert a local R data frame into a SparkR DataFrame. 
Specifically we can use `createDataFrame` and pass in the local R data frame to create a SparkR DataFrame. As an example, the following creates a `DataFrame` based using the `faithful` dataset from R.
+The simplest way to create a data frame is to convert a local R data frame into a SparkR DataFrame. Specifically we can use `createDataFrame` and pass in the local R data frame to create a SparkR DataFrame. As an example, the following creates a `DataFrame` based on the `faithful` dataset from R.
 
{% highlight r %} -df <- createDataFrame(sqlContext, faithful) +df <- createDataFrame(sqlContext, faithful) # Displays the content of the DataFrame to stdout head(df) @@ -96,7 +106,7 @@ printSchema(people)
The data sources API can also be used to save out DataFrames into multiple file formats. For example we can save the DataFrame from the previous example -to a Parquet file using `write.df` +to a Parquet file using `write.df`
{% highlight r %} @@ -139,7 +149,7 @@ Here we include some basic examples and a complete list can be found in the [API
{% highlight r %} # Create the DataFrame -df <- createDataFrame(sqlContext, faithful) +df <- createDataFrame(sqlContext, faithful) # Get basic information about the DataFrame df @@ -152,7 +162,7 @@ head(select(df, df$eruptions)) ##2 1.800 ##3 3.333 -# You can also pass in column name as strings +# You can also pass in column name as strings head(select(df, "eruptions")) # Filter the DataFrame to only retain rows with wait times shorter than 50 mins @@ -166,7 +176,7 @@ head(filter(df, df$waiting < 50))
-### Grouping, Aggregation +### Grouping, Aggregation SparkR data frames support a number of commonly used functions to aggregate data after grouping. For example we can compute a histogram of the `waiting` time in the `faithful` dataset as shown below @@ -194,7 +204,7 @@ head(arrange(waiting_counts, desc(waiting_counts$count))) ### Operating on Columns -SparkR also provides a number of functions that can directly applied to columns for data processing and during aggregation. The example below shows the use of basic arithmetic functions. +SparkR also provides a number of functions that can directly applied to columns for data processing and during aggregation. The example below shows the use of basic arithmetic functions.
{% highlight r %} From 557bbc14335a5957c9e87038bedd24222b9bda3f Mon Sep 17 00:00:00 2001 From: felixcheung Date: Wed, 28 Oct 2015 16:52:00 -0700 Subject: [PATCH 8/9] more text clean up --- R/pkg/R/sparkR.R | 7 +++---- docs/sparkr.md | 10 ++++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index f6ccf09dc09ff..b2acd52d91b71 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -332,12 +332,11 @@ sparkConfToSubmitOps[["spark.driver.extraLibraryPath"]] <- "--driver-library-pat # Utility function that returns Spark Submit arguments as a string # -# A few Spark Application and Runtime environment properties cannot take effort after driver +# A few Spark Application and Runtime environment properties cannot take effect after driver # JVM has started, as documented in: # http://spark.apache.org/docs/latest/configuration.html#application-properties -# When starting SparkR without using spark-submit, for example, in Rstudio, add them to -# spark-submit commandline if not already set in SPARKR_SUBMIT_ARGS so that they can be -# effective. +# When starting SparkR without using spark-submit, for example, from Rstudio, add them to +# spark-submit commandline if not already set in SPARKR_SUBMIT_ARGS so that they can be effective. getClientModeSparkSubmitOpts <- function(submitOps, sparkEnvirMap) { envirToOps <- lapply(ls(sparkConfToSubmitOps), function(conf) { opsValue <- sparkEnvirMap[[conf]] diff --git a/docs/sparkr.md b/docs/sparkr.md index 88985e6d7c516..497a276679f3b 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -29,7 +29,7 @@ All of the examples on this page use sample data included in R or the Spark dist The entry point into SparkR is the `SparkContext` which connects your R program to a Spark cluster. You can create a `SparkContext` using `sparkR.init` and pass in options such as the application name , any spark packages depended on, etc. Further, to work with DataFrames we will need a `SQLContext`, -which can be created from the SparkContext. If you are working from the SparkR shell, the +which can be created from the SparkContext. If you are working from the `sparkR` shell, the `SQLContext` and `SparkContext` should already be created for you. {% highlight r %} @@ -37,11 +37,13 @@ sc <- sparkR.init() sqlContext <- sparkRSQL.init(sc) {% endhighlight %} -In the event you are creating `SparkContext` instead of using `sparkR` shell or `spark-submit`, you +In the event you are creating `SparkContext` instead of using `sparkR` shell or `spark-submit`, you could also specify certain Spark driver properties. Normally these -[Application properties](configuration.html#application-properties) and [Runtime Environment](configuration.html#runtime-environment) cannot be set programmatically, as the +[Application properties](configuration.html#application-properties) and +[Runtime Environment](configuration.html#runtime-environment) cannot be set programmatically, as the driver JVM process would have been started, in this case SparkR takes care of this for you. To set -them, pass them as you would other configuration properties in the `sparkEnvir` argument. +them, pass them as you would other configuration properties in the `sparkEnvir` argument to +`sparkR.init()`. 
{% highlight r %} sc <- sparkR.init("local[*]", "SparkR", "/home/spark", list(spark.driver.memory="2g")) From 871e971247976e086958b1d2de730bd636be5193 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Thu, 29 Oct 2015 11:57:00 -0700 Subject: [PATCH 9/9] update comment from feedback --- R/pkg/R/sparkR.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index b2acd52d91b71..004d08e74e1cd 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -77,7 +77,9 @@ sparkR.stop <- function() { #' Initialize a new Spark Context. #' -#' This function initializes a new SparkContext. +#' This function initializes a new SparkContext. For details on how to initialize +#' and use SparkR, refer to SparkR programming guide at +#' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparkcontext-sqlcontext}. #' #' @param master The Spark master URL. #' @param appName Application name to register with cluster manager @@ -93,7 +95,7 @@ sparkR.stop <- function() { #' sc <- sparkR.init("local[2]", "SparkR", "/home/spark", #' list(spark.executor.memory="1g")) #' sc <- sparkR.init("yarn-client", "SparkR", "/home/spark", -#' list(spark.executor.memory="4g", spark.driver.memory="2g"), +#' list(spark.executor.memory="4g"), #' list(LD_LIBRARY_PATH="/directory of JVM libraries (libjvm.so) on workers/"), #' c("jarfile1.jar","jarfile2.jar")) #'}
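
Taken together, these patches let a client-mode SparkR session (for example, one started from RStudio rather than through `spark-submit`) set driver-side properties via `sparkEnvir`, since options such as `spark.driver.memory` cannot take effect once the driver JVM is already running. A minimal usage sketch follows; the values mirror those used in the docs and tests above, and the resulting command line is approximate.

{% highlight r %}
# Minimal sketch of the user-facing behavior introduced in this series.
# In client mode, the whitelisted driver properties in sparkEnvir are
# translated into spark-submit options before the backend JVM starts.
library(SparkR)

sc <- sparkR.init(
  master     = "local[*]",
  appName    = "SparkR",
  sparkHome  = "/home/spark",                        # placeholder path
  sparkEnvir = list(
    spark.driver.memory = "2g",                      # becomes --driver-memory "2g"
    spark.driver.extraClassPath = "/opt/class_path"  # becomes --driver-class-path "/opt/class_path"
  ))

# With SPARKR_SUBMIT_ARGS unset, the backend would be launched with roughly:
#   --driver-class-path "/opt/class_path" --driver-memory "2g" sparkr-shell
{% endhighlight %}

The generated options are prepended ahead of the application class (`sparkr-shell`) and skipped when the corresponding `--option` is already present in `SPARKR_SUBMIT_ARGS`, matching the expectations in `test_context.R`.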