Commit

Merge branch 'master' of github.com:apache/spark into viz_codegen
Davies Liu committed Jan 21, 2016
2 parents f5c9087 + d741599 commit 163a014
Showing 4 changed files with 185 additions and 101 deletions.
10 changes: 10 additions & 0 deletions R/README.md
@@ -1,6 +1,16 @@
# R on Spark

SparkR is an R package that provides a light-weight frontend to use Spark from R.
### Installing SparkR

SparkR libraries need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`.
By default the above script uses the system-wide installation of R. However, this can be changed to any user-installed location of R by setting the environment variable `R_HOME` to the full path of the base directory where R is installed, before running the install-dev.sh script.
Example:
```
# /home/username/R is the directory where R is installed and /home/username/R/bin contains the R and Rscript binaries
export R_HOME=/home/username/R
./install-dev.sh
```
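
After `install-dev.sh` completes, the built package should appear under `$SPARK_HOME/R/lib`. A quick optional check (a minimal sketch; the listed contents are typical of an installed R package, not a guaranteed layout):
```
# Assumes SPARK_HOME points at the root of your Spark checkout.
ls "$SPARK_HOME/R/lib"         # should now contain the installed SparkR package
ls "$SPARK_HOME/R/lib/SparkR"  # e.g. DESCRIPTION, NAMESPACE, R/ if the build succeeded
```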

### SparkR development

11 changes: 9 additions & 2 deletions R/install-dev.sh
@@ -35,12 +35,19 @@ LIB_DIR="$FWDIR/lib"
mkdir -p $LIB_DIR

pushd $FWDIR > /dev/null
if [ ! -z "$R_HOME" ]
then
R_SCRIPT_PATH="$R_HOME/bin"
else
R_SCRIPT_PATH="$(dirname $(which R))"
fi
echo "USING R_HOME = $R_HOME"

# Generate Rd files if devtools is installed
Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
"$R_SCRIPT_PATH/"Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'

# Install SparkR to $LIB_DIR
R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
"$R_SCRIPT_PATH/"R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/

# Zip the SparkR package so that it can be distributed to worker nodes on YARN
cd $LIB_DIR
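
The change above makes install-dev.sh honor `R_HOME` when deciding which R to invoke. Below is a standalone sketch of the same fallback; the trailing version check is purely illustrative and not part of the script:
```
# Illustrative sketch of the R_SCRIPT_PATH fallback used by the script.
if [ ! -z "$R_HOME" ]; then
  R_SCRIPT_PATH="$R_HOME/bin"               # use the user-specified R installation
else
  R_SCRIPT_PATH="$(dirname "$(which R)")"   # otherwise use the R found on PATH
fi
"$R_SCRIPT_PATH/R" --version                # show which R binary will be used
```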
sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -24,20 +24,16 @@ import scala.collection.JavaConverters._
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hadoop.hive.ql.{Context, ErrorMsg}
import org.apache.hadoop.hive.ql.plan.TableDesc
import org.apache.hadoop.hive.serde2.Serializer
import org.apache.hadoop.hive.serde2.objectinspector._
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption
import org.apache.hadoop.mapred.{FileOutputFormat, JobConf}

import org.apache.spark.{SparkException, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, FromUnsafeProjection}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.{SparkPlan, UnaryNode}
import org.apache.spark.sql.hive._
import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
import org.apache.spark.sql.types.DataType
import org.apache.spark.SparkException
import org.apache.spark.util.SerializableJobConf

private[hive]
@@ -46,19 +42,12 @@ case class InsertIntoHiveTable(
partition: Map[String, Option[String]],
child: SparkPlan,
overwrite: Boolean,
ifNotExists: Boolean) extends UnaryNode with HiveInspectors {
ifNotExists: Boolean) extends UnaryNode {

@transient val sc: HiveContext = sqlContext.asInstanceOf[HiveContext]
@transient lazy val outputClass = newSerializer(table.tableDesc).getSerializedClass
@transient private lazy val hiveContext = new Context(sc.hiveconf)
@transient private lazy val catalog = sc.catalog

private def newSerializer(tableDesc: TableDesc): Serializer = {
val serializer = tableDesc.getDeserializerClass.newInstance().asInstanceOf[Serializer]
serializer.initialize(null, tableDesc.getProperties)
serializer
}

def output: Seq[Attribute] = Seq.empty

private def saveAsHiveFile(
@@ -78,44 +67,10 @@ case class InsertIntoHiveTable(
conf.value,
SparkHiveWriterContainer.createPathFromString(fileSinkConf.getDirName, conf.value))
log.debug("Saving as hadoop file of type " + valueClass.getSimpleName)

writerContainer.driverSideSetup()
sc.sparkContext.runJob(rdd, writeToFile _)
sc.sparkContext.runJob(rdd, writerContainer.writeToFile _)
writerContainer.commitJob()

// Note that this function is executed on executor side
def writeToFile(context: TaskContext, iterator: Iterator[InternalRow]): Unit = {
val serializer = newSerializer(fileSinkConf.getTableInfo)
val standardOI = ObjectInspectorUtils
.getStandardObjectInspector(
fileSinkConf.getTableInfo.getDeserializer.getObjectInspector,
ObjectInspectorCopyOption.JAVA)
.asInstanceOf[StructObjectInspector]

val fieldOIs = standardOI.getAllStructFieldRefs.asScala
.map(_.getFieldObjectInspector).toArray
val dataTypes: Array[DataType] = child.output.map(_.dataType).toArray
val wrappers = fieldOIs.zip(dataTypes).map { case (f, dt) => wrapperFor(f, dt)}
val outputData = new Array[Any](fieldOIs.length)

writerContainer.executorSideSetup(context.stageId, context.partitionId, context.attemptNumber)

val proj = FromUnsafeProjection(child.schema)
iterator.foreach { row =>
var i = 0
val safeRow = proj(row)
while (i < fieldOIs.length) {
outputData(i) = if (row.isNullAt(i)) null else wrappers(i)(safeRow.get(i, dataTypes(i)))
i += 1
}

writerContainer
.getLocalFileWriter(safeRow, table.schema)
.write(serializer.serialize(outputData, standardOI))
}

writerContainer.close()
}
}

/**
@@ -194,11 +149,21 @@ case class InsertIntoHiveTable(

val writerContainer = if (numDynamicPartitions > 0) {
val dynamicPartColNames = partitionColumnNames.takeRight(numDynamicPartitions)
new SparkHiveDynamicPartitionWriterContainer(jobConf, fileSinkConf, dynamicPartColNames)
new SparkHiveDynamicPartitionWriterContainer(
jobConf,
fileSinkConf,
dynamicPartColNames,
child.output,
table)
} else {
new SparkHiveWriterContainer(jobConf, fileSinkConf)
new SparkHiveWriterContainer(
jobConf,
fileSinkConf,
child.output,
table)
}

@transient val outputClass = writerContainer.newSerializer(table.tableDesc).getSerializedClass
saveAsHiveFile(child.execute(), outputClass, fileSinkConf, jobConfSer, writerContainer)

val outputPath = FileOutputFormat.getOutputPath(jobConf)